Python tensor2tensor.data_generators.generator_utils.get_or_generate_vocab() Examples
The following are 15 code examples of tensor2tensor.data_generators.generator_utils.get_or_generate_vocab(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensor2tensor.data_generators.generator_utils, or try the search function.
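As the examples below show, get_or_generate_vocab() is called with a data directory, a temporary download directory, a vocabulary file name, an approximate vocabulary size, and a list of data sources, where each source pairs a download URL with a list of file names; it builds a subword vocabulary from those files (or reuses an existing vocabulary file in data_dir) and returns the resulting encoder. The sketch below is a minimal, hypothetical call following that pattern: the URL, archive file names, vocab file name, and sizes are placeholders and do not come from any of the listed source files; file_byte_budget is the optional keyword argument used in Examples #9, #11, #13 and #15.

# Minimal sketch of the typical call pattern seen in the examples below.
# The URL, file names, and sizes are hypothetical placeholders.
from tensor2tensor.data_generators import generator_utils

data_dir = "/tmp/t2t_data"  # where the vocab file is written or reused
tmp_dir = "/tmp/t2t_tmp"    # where the source corpora are downloaded

# Each source is [download URL, [file names inside the downloaded archive]].
sources = [
    ["http://example.com/corpus.tar.gz",
     ["corpus/train.src", "corpus/train.tgt"]],
]

vocab = generator_utils.get_or_generate_vocab(
    data_dir, tmp_dir,
    "vocab.example.32768.subwords",  # vocabulary file name
    2**15,                           # approximate vocabulary size
    sources,
    file_byte_budget=1e8)  # optional cap on bytes read per source file

If the named vocabulary file already exists in data_dir, it is loaded rather than regenerated, which is why the generate_samples() examples below can call the function unconditionally before iterating over the data.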
Example #1
Source File: style_transfer.py From fine-lm with MIT License | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"

  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)

  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  source_file = os.path.join(tmp_dir, tag + ".modern")
  target_file = os.path.join(tmp_dir, tag + ".original")
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example #2
Source File: style_transfer.py From tensor2tensor with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)

  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)

  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example #3
Source File: style_transfer.py From BERT with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)

  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)

  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example #4
Source File: style_transfer.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)

  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)

  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example #5
Source File: base.py From t2t_wmt_zhen with MIT License | 6 votes |
def generator(self, data_dir, tmp_dir, train):
  datasets = self.get_datasets(train)

  # build vocab from training datasets
  source_datasets = [[item[0], [item[1][0]]]
                     for item in self.get_datasets(train=True)]
  target_datasets = [[item[0], [item[1][1]]]
                     for item in self.get_datasets(train=True)]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)

  tag = "train" if train else "dev"
  data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)
Example #6
Source File: style_transfer.py From fine-lm with MIT License | 5 votes |
def vocab_data_files(self):
  """Files to be passed to get_or_generate_vocab."""
  return self.dataset_url(problem.DatasetSplit.TRAIN)
Example #7
Source File: translate.py From fine-lm with MIT License | 5 votes |
def vocab_data_files(self):
  """Files to be passed to get_or_generate_vocab."""
  return self.source_data_files(problem.DatasetSplit.TRAIN)
Example #8
Source File: translate.py From fine-lm with MIT License | 5 votes |
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  data_path = compile_data(tmp_dir, datasets,
                           "%s-compiled-%s" % (self.name, tag))

  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())

  return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                               data_path + ".lang2")
Example #9
Source File: translate_enzh.py From fine-lm with MIT License | 5 votes |
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
Example #10
Source File: style_transfer.py From tensor2tensor with Apache License 2.0 | 5 votes |
def vocab_data_files(self):
  """Files to be passed to get_or_generate_vocab."""
  return self.dataset_url(problem.DatasetSplit.TRAIN)
Example #11
Source File: translate_enzh.py From tensor2tensor with Apache License 2.0 | 5 votes |
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
Example #12
Source File: style_transfer.py From BERT with Apache License 2.0 | 5 votes |
def vocab_data_files(self):
  """Files to be passed to get_or_generate_vocab."""
  return self.dataset_url(problem.DatasetSplit.TRAIN)
Example #13
Source File: translate_enzh.py From BERT with Apache License 2.0 | 5 votes |
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
Example #14
Source File: style_transfer.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def vocab_data_files(self):
  """Files to be passed to get_or_generate_vocab."""
  return self.dataset_url(problem.DatasetSplit.TRAIN)
Example #15
Source File: translate_enzh.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)