Python tensor2tensor.data_generators.generator_utils.get_or_generate_vocab() Examples

The following are 15 code examples of tensor2tensor.data_generators.generator_utils.get_or_generate_vocab(), taken from open-source projects. Each example is labeled with its source file, the project it comes from, and that project's license. You may also want to check out all the other available functions and classes of the module tensor2tensor.data_generators.generator_utils.
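For orientation, here is a minimal sketch of calling get_or_generate_vocab directly, following the signature visible in the examples below. The paths, vocab filename, and dataset URL are hypothetical placeholders; the sources argument is a list of [url, [filename, ...]] pairs, matching how the examples construct their dataset lists.

from tensor2tensor.data_generators import generator_utils

# Hypothetical dataset spec: one downloadable archive plus the text file
# inside it to read. The [url, [filename, ...]] shape mirrors the
# source/target dataset lists built in the examples below.
sources = [["http://example.com/corpus.tgz", ["corpus.en"]]]

# Returns a subword text encoder. The vocab file is generated and written
# to data_dir on the first call and simply reloaded on subsequent calls.
encoder = generator_utils.get_or_generate_vocab(
    "/tmp/t2t_data",                 # data_dir: where the vocab file lives
    "/tmp/t2t_tmp",                  # tmp_dir: scratch space for downloads
    "vocab.example.8192.subwords",   # vocab_filename (hypothetical)
    2**13,                           # approx_vocab_size
    sources)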
Example #1
Source File: style_transfer.py    From fine-lm with MIT License
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"

    # Download the corpus archive and extract it into tmp_dir.
    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    # Generate the subword vocab once; later runs reload the cached file.
    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file = os.path.join(tmp_dir, tag + ".modern")
    target_file = os.path.join(tmp_dir, tag + ".original")
    return text_problems.text2text_txt_iterator(source_file, target_file)
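As a point of reference, text2text_txt_iterator pairs the two text files line by line and yields sample dicts. A minimal re-implementation sketch, assuming aligned files with one example per line:

def txt_pair_iterator(source_path, target_path):
  # Yields {"inputs": ..., "targets": ...} dicts, one per aligned line pair,
  # approximating what text_problems.text2text_txt_iterator produces.
  with open(source_path) as source_file, open(target_path) as target_file:
    for inputs, targets in zip(source_file, target_file):
      yield {"inputs": inputs.strip(), "targets": targets.strip()}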
Example #2
Source File: style_transfer.py    From tensor2tensor with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file) 
Example #3
Source File: style_transfer.py    From BERT with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file) 
Example #4
Source File: style_transfer.py    From training_results_v0.5 with Apache License 2.0
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    dataset = self.dataset_url(dataset_split)

    url = dataset[0][0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    generator_utils.maybe_download(tmp_dir, compressed_filename, url)

    mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
    with tarfile.open(compressed_filepath, mode) as corpus_tar:
      corpus_tar.extractall(tmp_dir)

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
    return text_problems.text2text_txt_iterator(source_file,
                                                target_file) 
Example #5
Source File: base.py    From t2t_wmt_zhen with MIT License
def generator(self, data_dir, tmp_dir, train):
    datasets = self.get_datasets(train)

    # build vocab from training datasets
    source_datasets = [[item[0], [item[1][0]]]
                       for item in self.get_datasets(train=True)]
    target_datasets = [[item[0], [item[1][1]]]
                       for item in self.get_datasets(train=True)]
    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
        source_datasets)
    target_vocab = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
        target_datasets)

    tag = "train" if train else "dev"
    data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
    return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                     source_vocab, target_vocab, EOS)
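bi_vocabs_token_generator is defined elsewhere in this project; the essential idea is that each side is encoded with its own vocabulary. A rough sketch under that assumption, with EOS appended to both sides:

def bi_vocabs_token_generator_sketch(source_path, target_path,
                                     source_vocab, target_vocab, eos_id):
  # Encode each language with its own subword encoder and append EOS,
  # approximating the bi_vocabs_token_generator called above.
  with open(source_path) as source_file, open(target_path) as target_file:
    for src_line, tgt_line in zip(source_file, target_file):
      yield {"inputs": source_vocab.encode(src_line.strip()) + [eos_id],
             "targets": target_vocab.encode(tgt_line.strip()) + [eos_id]}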
Example #6
Source File: style_transfer.py    From fine-lm with MIT License
def vocab_data_files(self):
    """Files to be passed to get_or_generate_vocab."""
    return self.dataset_url(problem.DatasetSplit.TRAIN) 
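The return value here feeds the sources argument of get_or_generate_vocab, so it is expected to be a list of [url, [filename, ...]] entries. A hypothetical concrete override, with placeholder URL and filenames:

def vocab_data_files(self):
    """Files to be passed to get_or_generate_vocab."""
    # Hypothetical: one archive URL plus the training files it contains.
    return [["http://example.com/train.tgz",
             ["train.modern", "train.original"]]]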
Example #7
Source File: translate.py    From fine-lm with MIT License
def vocab_data_files(self):
    """Files to be passed to get_or_generate_vocab."""
    return self.source_data_files(problem.DatasetSplit.TRAIN) 
Example #8
Source File: translate.py    From fine-lm with MIT License
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    datasets = self.source_data_files(dataset_split)
    tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
    data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name,
                                                                    tag))

    if self.vocab_type == text_problems.VocabType.SUBWORD:
      generator_utils.get_or_generate_vocab(
          data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
          self.vocab_data_files())

    return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                                data_path + ".lang2") 
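compile_data, imported from tensor2tensor's translate module, concatenates the parallel corpora into a pair of <base>.lang1 / <base>.lang2 files and returns the base path. A rough sketch of that behavior, assuming the dataset files have already been downloaded and extracted into tmp_dir (the real implementation also handles downloading and decompression):

import os

def compile_data_sketch(tmp_dir, datasets, filename):
  # Concatenate each dataset's parallel files into <filename>.lang1/.lang2,
  # approximating the contract the caller above relies on.
  base = os.path.join(tmp_dir, filename)
  with open(base + ".lang1", "w") as lang1, open(base + ".lang2", "w") as lang2:
    for _url, (lang1_file, lang2_file) in datasets:
      with open(os.path.join(tmp_dir, lang1_file)) as f:
        lang1.write(f.read())
      with open(os.path.join(tmp_dir, lang2_file)) as f:
        lang2.write(f.read())
  return base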
Example #9
Source File: translate_enzh.py    From fine-lm with MIT License
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
    train = dataset_split == problem.DatasetSplit.TRAIN
    train_dataset = self.get_training_dataset(tmp_dir)
    datasets = train_dataset if train else _NC_TEST_DATASETS
    # Split each [url, [source_file, target_file]] entry into per-language
    # dataset lists so each side gets its own subword vocabulary.
    source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
    target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.source_vocab_name,
        self.approx_vocab_size,
        source_datasets,
        file_byte_budget=1e8)
    target_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.target_vocab_name,
        self.approx_vocab_size,
        target_datasets,
        file_byte_budget=1e8)
    tag = "train" if train else "dev"
    filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
    data_path = translate.compile_data(tmp_dir, datasets, filename_base)
    return text_problems.text2text_generate_encoded(
        text_problems.text2text_txt_iterator(data_path + ".lang1",
                                             data_path + ".lang2"),
        source_vocab, target_vocab)
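The file_byte_budget=1e8 argument caps how many bytes of each corpus file are sampled when counting token frequencies for vocabulary generation, which keeps vocab building tractable on very large corpora. A simplified sketch of that kind of capped read (the real implementation also spreads the sample across the file):

def read_with_byte_budget(path, byte_budget=int(1e8)):
  # Stop reading once roughly byte_budget bytes have been consumed; a
  # simplified stand-in for how file_byte_budget limits vocab sampling.
  remaining = byte_budget
  with open(path, "rb") as f:
    for line in f:
      if remaining <= 0:
        break
      remaining -= len(line)
      yield line.decode("utf-8", errors="ignore")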
Example #10
Source File: style_transfer.py    From tensor2tensor with Apache License 2.0
def vocab_data_files(self):
    """Files to be passed to get_or_generate_vocab."""
    return self.dataset_url(problem.DatasetSplit.TRAIN) 
Example #11
Source File: translate_enzh.py    From tensor2tensor with Apache License 2.0
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
    train = dataset_split == problem.DatasetSplit.TRAIN
    train_dataset = self.get_training_dataset(tmp_dir)
    datasets = train_dataset if train else _NC_TEST_DATASETS
    source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
    target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.source_vocab_name,
        self.approx_vocab_size,
        source_datasets,
        file_byte_budget=1e8,
        max_subtoken_length=self.max_subtoken_length)
    target_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.target_vocab_name,
        self.approx_vocab_size,
        target_datasets,
        file_byte_budget=1e8,
        max_subtoken_length=self.max_subtoken_length)
    tag = "train" if train else "dev"
    filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
    data_path = translate.compile_data(tmp_dir, datasets, filename_base)
    return text_problems.text2text_generate_encoded(
        text_problems.text2text_txt_iterator(data_path + ".lang1",
                                             data_path + ".lang2"),
        source_vocab, target_vocab) 
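Relative to Example #9, this variant also passes max_subtoken_length, which bounds the length of candidate subtokens considered during vocabulary generation; long candidates make vocab building much slower, which matters for large character sets such as Chinese. A hypothetical override:

@property
def max_subtoken_length(self):
    # Hypothetical: cap candidate subtokens at 4 characters to speed up
    # vocabulary generation; None would leave the length unbounded.
    return 4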
Example #12
Source File: style_transfer.py    From BERT with Apache License 2.0
def vocab_data_files(self):
    """Files to be passed to get_or_generate_vocab."""
    return self.dataset_url(problem.DatasetSplit.TRAIN) 
Example #13
Source File: translate_enzh.py    From BERT with Apache License 2.0
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
    train = dataset_split == problem.DatasetSplit.TRAIN
    train_dataset = self.get_training_dataset(tmp_dir)
    datasets = train_dataset if train else _NC_TEST_DATASETS
    source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
    target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.source_vocab_name,
        self.approx_vocab_size,
        source_datasets,
        file_byte_budget=1e8,
        max_subtoken_length=self.max_subtoken_length)
    target_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.target_vocab_name,
        self.approx_vocab_size,
        target_datasets,
        file_byte_budget=1e8,
        max_subtoken_length=self.max_subtoken_length)
    tag = "train" if train else "dev"
    filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
    data_path = translate.compile_data(tmp_dir, datasets, filename_base)
    return text_problems.text2text_generate_encoded(
        text_problems.text2text_txt_iterator(data_path + ".lang1",
                                             data_path + ".lang2"),
        source_vocab, target_vocab) 
Example #14
Source File: style_transfer.py    From training_results_v0.5 with Apache License 2.0
def vocab_data_files(self):
    """Files to be passed to get_or_generate_vocab."""
    return self.dataset_url(problem.DatasetSplit.TRAIN) 
Example #15
Source File: translate_enzh.py    From training_results_v0.5 with Apache License 2.0
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
    train = dataset_split == problem.DatasetSplit.TRAIN
    train_dataset = self.get_training_dataset(tmp_dir)
    datasets = train_dataset if train else _NC_TEST_DATASETS
    source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
    target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.source_vocab_name,
        self.approx_vocab_size,
        source_datasets,
        file_byte_budget=1e8)
    target_vocab = generator_utils.get_or_generate_vocab(
        data_dir,
        tmp_dir,
        self.target_vocab_name,
        self.approx_vocab_size,
        target_datasets,
        file_byte_budget=1e8)
    tag = "train" if train else "dev"
    filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
    data_path = translate.compile_data(tmp_dir, datasets, filename_base)
    return text_problems.text2text_generate_encoded(
        text_problems.text2text_txt_iterator(data_path + ".lang1",
                                             data_path + ".lang2"),
        source_vocab, target_vocab)