Python tensor2tensor.data_generators.generator_utils.shuffle_dataset() Examples
The following are 30 code examples of tensor2tensor.data_generators.generator_utils.shuffle_dataset(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensor2tensor.data_generators.generator_utils, or try the search function.
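Before the examples, here is a minimal, self-contained sketch of the pattern they all share; the toy problem name and output path are hypothetical, not taken from any project below. generate_files() writes a generator's examples into unshuffled TFRecord shards, and shuffle_dataset() then rewrites each shard with its records in random order, dropping the "-unshuffled" marker (generator_utils.UNSHUFFLED_SUFFIX) from the filename.

from tensor2tensor.data_generators import generator_utils

def toy_generator():
  # generate_files() consumes dicts mapping feature names to lists of ints,
  # floats, or strings, and serializes each dict as a tf.train.Example.
  for i in range(100):
    yield {"inputs": [i], "targets": [i + 1]}

# Hypothetical shard name; real problems build these via helpers such as
# training_filepaths(..., shuffled=False).
out_files = ["/tmp/toy_problem-unshuffled-train-00000-of-00001"]
generator_utils.generate_files(toy_generator(), out_files)

# Rewrites the shard, shuffled, as /tmp/toy_problem-train-00000-of-00001.
generator_utils.shuffle_dataset(out_files)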
Example #1
Source File: text_problems.py From tensor2tensor with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  """Generates training/dev data.

  Args:
    data_dir: a string
    tmp_dir: a string
    task_id: an optional integer
  Returns:
    shard or shards for which data was generated.
  """
  tf.logging.info("generate_data task_id=%s" % task_id)
  encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  assert task_id >= 0 and task_id < self.num_generate_tasks
  if task_id < self.num_train_shards:
    out_file = self.training_filepaths(
        data_dir, self.num_train_shards, shuffled=False)[task_id]
  else:
    out_file = self.dev_filepaths(
        data_dir, self.num_dev_shards,
        shuffled=False)[task_id - self.num_train_shards]
  generator_utils.generate_files(
      self.example_generator(encoder, tmp_dir, task_id), [out_file])
  generator_utils.shuffle_dataset([out_file])
Example #2
Source File: common_voice.py From training_results_v0.5 with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #3
Source File: librispeech.py From training_results_v0.5 with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #4
Source File: text_problems.py From BERT with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # task_id should be in [0, self.num_output_shards).
  assert (0 <= task_id) and (task_id < self.num_output_shards)

  # A task_id is only supposed to write one output shard; it can operate
  # over multiple *input* shards.
  input_files = self._task_id_to_input_files(task_id)
  output_file = self._task_id_to_output_file(data_dir, task_id)

  # Which output split is this task writing to?
  split, _, _ = self._task_id_to_output_split(task_id)

  # Actually generate examples.
  generator_utils.generate_files(
      self.generate_encoded_samples(
          data_dir, tmp_dir, split, input_files),
      [output_file])

  # Shuffle the output.
  generator_utils.shuffle_dataset([output_file], extra_fn=self._pack_fn())
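A note on the extra_fn argument seen here and in several later examples: judging from these call sites, shuffle_dataset() accepts an optional hook that is applied to a shard's list of serialized records as part of the shuffle, before the shuffled shard is written back; the text problems appear to use it (via self._pack_fn()) to pack short examples together. A hypothetical stand-in, reusing out_files from the sketch above, just to show the expected shape:

def keep_every_other(records):
  # Hypothetical extra_fn: receives one shard's serialized records and
  # returns the list that should actually be written out.
  return records[::2]

generator_utils.shuffle_dataset(out_files, extra_fn=keep_every_other)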
Example #5
Source File: text_problems.py From BERT with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  """Generates training/dev data.

  Args:
    data_dir: a string
    tmp_dir: a string
    task_id: an optional integer
  Returns:
    shard or shards for which data was generated.
  """
  tf.logging.info("generate_data task_id=%s" % task_id)
  encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  assert task_id >= 0 and task_id < self.num_generate_tasks
  if task_id < self.num_train_shards:
    out_file = self.training_filepaths(
        data_dir, self.num_train_shards, shuffled=False)[task_id]
  else:
    out_file = self.dev_filepaths(
        data_dir, self.num_dev_shards,
        shuffled=False)[task_id - self.num_train_shards]
  generator_utils.generate_files(
      self.example_generator(encoder, tmp_dir, task_id), [out_file])
  generator_utils.shuffle_dataset([out_file])
Example #6
Source File: timeseries.py From BERT with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
        all_paths)

  generator_utils.shuffle_dataset(all_paths)
Example #7
Source File: timeseries.py From training_results_v0.5 with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
        all_paths)

  generator_utils.shuffle_dataset(all_paths)
Example #8
Source File: librispeech.py From BERT with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #9
Source File: common_voice.py From BERT with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #10
Source File: text_problems.py From training_results_v0.5 with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  """Generates training/dev data.

  Args:
    data_dir: a string
    tmp_dir: a string
    task_id: an optional integer
  Returns:
    shard or shards for which data was generated.
  """
  tf.logging.info("generate_data task_id=%s" % task_id)
  encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  assert task_id >= 0 and task_id < self.num_generate_tasks
  if task_id < self.num_train_shards:
    out_file = self.training_filepaths(
        data_dir, self.num_train_shards, shuffled=False)[task_id]
  else:
    out_file = self.dev_filepaths(
        data_dir, self.num_dev_shards,
        shuffled=False)[task_id - self.num_train_shards]
  generator_utils.generate_files(
      self.example_generator(encoder, tmp_dir, task_id), [out_file])
  generator_utils.shuffle_dataset([out_file])
Example #11
Source File: text_problems.py From training_results_v0.5 with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # task_id should be in [0, self.num_output_shards).
  assert (0 <= task_id) and (task_id < self.num_output_shards)

  # A task_id is only supposed to write one output shard; it can operate
  # over multiple *input* shards.
  input_files = self._task_id_to_input_files(task_id)
  output_file = self._task_id_to_output_file(data_dir, task_id)

  # Which output split is this task writing to?
  split, _, _ = self._task_id_to_output_split(task_id)

  # Actually generate examples.
  generator_utils.generate_files(
      self._maybe_pack_examples(
          self.generate_encoded_samples(
              data_dir, tmp_dir, split, input_files)),
      [output_file])

  # Shuffle the output.
  generator_utils.shuffle_dataset([output_file])
Example #12
Source File: text_problems.py From tensor2tensor with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # task_id should be in [0, self.num_output_shards).
  assert (0 <= task_id) and (task_id < self.num_output_shards)

  # A task_id is only supposed to write one output shard; it can operate
  # over multiple *input* shards.
  input_files = self._task_id_to_input_files(task_id)
  output_file = self._task_id_to_output_file(data_dir, task_id)

  # Which output split is this task writing to?
  split, _, _ = self._task_id_to_output_split(task_id)

  # Actually generate examples.
  generator_utils.generate_files(
      self.generate_encoded_samples(
          data_dir, tmp_dir, split, input_files),
      [output_file])

  # Shuffle the output.
  generator_utils.shuffle_dataset([output_file], extra_fn=self._pack_fn())
Example #13
Source File: text_problems.py From tensor2tensor with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
Example #14
Source File: librispeech.py From tensor2tensor with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #15
Source File: librispeech_specaugment.py From specAugment with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #16
Source File: common_voice.py From tensor2tensor with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #17
Source File: glyphazzn.py From magenta with Apache License 2.0 | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths)
Example #18
Source File: t2t_datagen.py From fine-lm with MIT License | 6 votes

def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)

  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, 1)
  generator_utils.generate_files(dev_gen(), dev_output_files)

  all_output_files = train_output_files + dev_output_files
  generator_utils.shuffle_dataset(all_output_files)
Example #19
Source File: timeseries.py From fine-lm with MIT License | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  filepath_fns = {
      problem.DatasetSplit.TRAIN: self.training_filepaths,
      problem.DatasetSplit.EVAL: self.dev_filepaths,
      problem.DatasetSplit.TEST: self.test_filepaths,
  }

  split_paths = [(split["split"], filepath_fns[split["split"]](
      data_dir, split["shards"], shuffled=False))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      generator_utils.generate_files(
          self.generate_samples(data_dir, tmp_dir, split), paths)
  else:
    generator_utils.generate_files(
        self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
        all_paths)

  generator_utils.shuffle_dataset(all_paths)
Example #20
Source File: librispeech.py From fine-lm with MIT License | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #21
Source File: common_voice.py From fine-lm with MIT License | 6 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  train_paths = self.training_filepaths(
      data_dir, self.num_shards, shuffled=False)
  dev_paths = self.dev_filepaths(
      data_dir, self.num_dev_shards, shuffled=False)
  test_paths = self.test_filepaths(
      data_dir, self.num_test_shards, shuffled=True)

  generator_utils.generate_files(
      self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

  if self.use_train_shards_for_dev:
    all_paths = train_paths + dev_paths
    generator_utils.generate_files(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), all_paths)
    generator_utils.shuffle_dataset(all_paths)
  else:
    generator_utils.generate_dataset_and_shuffle(
        self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS), train_paths,
        self.generator(data_dir, tmp_dir, self.DEV_DATASETS), dev_paths)
Example #22
Source File: t2t_datagen.py From language with Apache License 2.0 | 5 votes

def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)

  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)

  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)

  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
Example #23
Source File: t2t_datagen.py From training_results_v0.5 with Apache License 2.0 | 5 votes

def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)

  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)

  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)

  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
Example #24
Source File: word_chatbot.py From Seq2seqChatbots with MIT License | 5 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  self.data_dir = data_dir
  # Determine whether we are in training or validation mode.
  self.mode = {problem.DatasetSplit.TRAIN: 'train',
               problem.DatasetSplit.EVAL: 'dev',
               problem.DatasetSplit.TEST: 'test'}
  filepath_fns = {problem.DatasetSplit.TRAIN: self.training_filepaths,
                  problem.DatasetSplit.EVAL: self.dev_filepaths,
                  problem.DatasetSplit.TEST: self.test_filepaths}

  split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      # Create the source and target txt files from the raw data.
      self.preprocess_data(self.mode[split])
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    self.preprocess_data(self.mode[problem.DatasetSplit.TRAIN])
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())

# This function generates train and validation pairs in t2t-datagen style.
Example #25
Source File: celeba.py From training_results_v0.5 with Apache License 2.0 | 5 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # CelebA's standard partition: 162,770 train, 19,867 dev, 19,962 test images.
  train_gen = self.generator(tmp_dir, 162770)
  train_paths = self.training_filepaths(
      data_dir, self.train_shards, shuffled=False)
  generator_utils.generate_files(train_gen, train_paths)

  dev_gen = self.generator(tmp_dir, 19867, 162770)
  dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
  generator_utils.generate_files(dev_gen, dev_paths)

  test_gen = self.generator(tmp_dir, 19962, 162770 + 19867)
  test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
  generator_utils.generate_files(test_gen, test_paths)

  generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
Example #26
Source File: t2t_datagen.py From training_results_v0.5 with Apache License 2.0 | 5 votes

def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)

  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)

  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)

  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
Example #27
Source File: celeba.py From BERT with Apache License 2.0 | 5 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # CelebA's standard partition: 162,770 train, 19,867 dev, 19,962 test images.
  train_gen = self.generator(tmp_dir, 162770)
  train_paths = self.training_filepaths(
      data_dir, self.train_shards, shuffled=False)
  generator_utils.generate_files(train_gen, train_paths)

  dev_gen = self.generator(tmp_dir, 19867, 162770)
  dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
  generator_utils.generate_files(dev_gen, dev_paths)

  test_gen = self.generator(tmp_dir, 19962, 162770 + 19867)
  test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
  generator_utils.generate_files(test_gen, test_paths)

  generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
Example #28
Source File: t2t_datagen.py From BERT with Apache License 2.0 | 5 votes

def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)

  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)

  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)

  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
Example #29
Source File: celeba.py From tensor2tensor with Apache License 2.0 | 5 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  # CelebA's standard partition: 162,770 train, 19,867 dev, 19,962 test images.
  train_gen = self.generator(tmp_dir, 162770)
  train_paths = self.training_filepaths(
      data_dir, self.train_shards, shuffled=False)
  generator_utils.generate_files(train_gen, train_paths)

  dev_gen = self.generator(tmp_dir, 19867, 162770)
  dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
  generator_utils.generate_files(dev_gen, dev_paths)

  test_gen = self.generator(tmp_dir, 19962, 162770 + 19867)
  test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
  generator_utils.generate_files(test_gen, test_paths)

  generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
Example #30
Source File: dialog_abstract.py From tensor2tensor with Apache License 2.0 | 5 votes

def generate_data(self, data_dir, tmp_dir, task_id=-1):
  self.data_dir = data_dir
  # Determine whether we are in training or validation mode.
  self.mode = {problem.DatasetSplit.TRAIN: 'train',
               problem.DatasetSplit.EVAL: 'dev',
               problem.DatasetSplit.TEST: 'test'}
  filepath_fns = {problem.DatasetSplit.TRAIN: self.training_filepaths,
                  problem.DatasetSplit.EVAL: self.dev_filepaths,
                  problem.DatasetSplit.TEST: self.test_filepaths}

  split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=self.already_shuffled))
                 for split in self.dataset_splits]
  all_paths = []
  for _, paths in split_paths:
    all_paths.extend(paths)

  if self.is_generate_per_split:
    for split, paths in split_paths:
      # Create the source and target txt files from the raw data.
      self.preprocess_data(self.mode[split])
      generator_utils.generate_files(
          self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
  else:
    self.preprocess_data(self.mode[problem.DatasetSplit.TRAIN])
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

  generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())