Python tensor2tensor.data_generators.generator_utils.train_data_filenames() Examples

The following are 14 code examples of tensor2tensor.data_generators.generator_utils.train_data_filenames(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensor2tensor.data_generators.generator_utils , or try the search function .
Example #1
Source File: t2t_datagen.py    From fine-lm with MIT License 6 votes vote down vote up
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, 1)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  all_output_files = train_output_files + dev_output_files
  generator_utils.shuffle_dataset(all_output_files) 
Example #2
Source File: generator_utils_test.py    From fine-lm with MIT License 6 votes vote down vote up
def testGenerateFiles(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a trivial file and assert the file exists.
    def test_generator():
      yield {"inputs": [1], "target": [1]}

    filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
    generator_utils.generate_files(test_generator(), filenames)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

    # Clean up.
    os.remove(tmp_file_path + "-train-00000-of-00001")
    os.remove(tmp_file_path) 
Example #3
Source File: generator_utils_test.py    From tensor2tensor with Apache License 2.0 6 votes vote down vote up
def testGenerateFiles(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a trivial file and assert the file exists.
    def test_generator():
      yield {"inputs": [1], "target": [1]}

    filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
    generator_utils.generate_files(test_generator(), filenames)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

    # Clean up.
    os.remove(tmp_file_path + "-train-00000-of-00001")
    os.remove(tmp_file_path) 
Example #4
Source File: generator_utils_test.py    From BERT with Apache License 2.0 6 votes vote down vote up
def testGenerateFiles(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a trivial file and assert the file exists.
    def test_generator():
      yield {"inputs": [1], "target": [1]}

    filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
    generator_utils.generate_files(test_generator(), filenames)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

    # Clean up.
    os.remove(tmp_file_path + "-train-00000-of-00001")
    os.remove(tmp_file_path) 
Example #5
Source File: generator_utils_test.py    From training_results_v0.5 with Apache License 2.0 6 votes vote down vote up
def testGenerateFiles(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a trivial file and assert the file exists.
    def test_generator():
      yield {"inputs": [1], "target": [1]}

    filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
    generator_utils.generate_files(test_generator(), filenames)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

    # Clean up.
    os.remove(tmp_file_path + "-train-00000-of-00001")
    os.remove(tmp_file_path) 
Example #6
Source File: problem.py    From fine-lm with MIT License 5 votes vote down vote up
def training_filepaths(self, data_dir, num_shards, shuffled):
    file_basename = self.dataset_filename()
    if not shuffled:
      file_basename += generator_utils.UNSHUFFLED_SUFFIX
    return generator_utils.train_data_filenames(file_basename, data_dir,
                                                num_shards) 
Example #7
Source File: t2t_datagen.py    From tensor2tensor with Apache License 2.0 5 votes vote down vote up
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files) 
Example #8
Source File: problem.py    From tensor2tensor with Apache License 2.0 5 votes vote down vote up
def training_filepaths(self, data_dir, num_shards, shuffled):
    file_basename = self.dataset_filename()
    if not shuffled:
      file_basename += generator_utils.UNSHUFFLED_SUFFIX
    return generator_utils.train_data_filenames(file_basename, data_dir,
                                                num_shards) 
Example #9
Source File: t2t_datagen.py    From BERT with Apache License 2.0 5 votes vote down vote up
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files) 
Example #10
Source File: problem.py    From BERT with Apache License 2.0 5 votes vote down vote up
def training_filepaths(self, data_dir, num_shards, shuffled):
    file_basename = self.dataset_filename()
    if not shuffled:
      file_basename += generator_utils.UNSHUFFLED_SUFFIX
    return generator_utils.train_data_filenames(file_basename, data_dir,
                                                num_shards) 
Example #11
Source File: t2t_datagen.py    From training_results_v0.5 with Apache License 2.0 5 votes vote down vote up
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files) 
Example #12
Source File: problem.py    From training_results_v0.5 with Apache License 2.0 5 votes vote down vote up
def training_filepaths(self, data_dir, num_shards, shuffled):
    file_basename = self.dataset_filename()
    if not shuffled:
      file_basename += generator_utils.UNSHUFFLED_SUFFIX
    return generator_utils.train_data_filenames(file_basename, data_dir,
                                                num_shards) 
Example #13
Source File: t2t_datagen.py    From training_results_v0.5 with Apache License 2.0 5 votes vote down vote up
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files) 
Example #14
Source File: t2t_datagen.py    From language with Apache License 2.0 5 votes vote down vote up
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)