Python apache_beam.Reshuffle() Examples

The following are 14 code examples of apache_beam.Reshuffle(), drawn from open-source projects. The source file, project, and license are noted above each example.
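beam.Reshuffle() redistributes the elements of a PCollection across workers and prevents the runner from fusing the steps before and after it into a single stage. That is why, in the examples below, it usually sits between a cheap fan-out step (reading or parsing) and an expensive ParDo or a sharded write. Here is a minimal, self-contained sketch; the element values and the cheap/expensive steps are made up for illustration:

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create(['a', 'b', 'c'])
      # A cheap step that fans out into many elements on few workers.
      | 'FanOut' >> beam.FlatMap(lambda x: [x] * 1000)
      # Break fusion so the downstream work is rebalanced across workers.
      | 'Reshuffle' >> beam.Reshuffle()
      # Stand-in for a costly per-element step.
      | 'Expensive' >> beam.Map(lambda x: x.upper())
      | 'Count' >> beam.combiners.Count.Globally()
      | 'Print' >> beam.Map(print))

Without the Reshuffle, most runners would fuse 'FanOut' and 'Expensive' into one stage, so the expensive work would run only on the workers that produced the fan-out.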
Example #1
Source File: create_kitti_crop_dataset.py    From lingvo with Apache License 2.0
def main(_):
  beam_utils.BeamInit()

  if not FLAGS.output_file_pattern:
    raise ValueError('Must provide an output_file_pattern')

  reader = beam.io.ReadFromTFRecord(
      FLAGS.input_file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example))

  model_name = FLAGS.model_name
  split = FLAGS.split
  run_preprocessors = FLAGS.run_preprocessors

  with beam_utils.GetPipelineRoot() as root:
    _ = (
        root
        | 'Read' >> reader
        | 'ToTFExample' >> beam.ParDo(
            _ProcessShard(model_name, split, run_preprocessors))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Write' >> beam.io.WriteToTFRecord(
            FLAGS.output_file_pattern,
            coder=beam.coders.ProtoCoder(tf.train.Example))) 
Example #2
Source File: generate_detection_data.py    From models with Apache License 2.0
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       confidence_threshold, num_shards):
  """Returns a Beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    confidence_threshold: Threshold to use when keeping detection results.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'RunInference' >> beam.ParDo(
      GenerateDetectionDataFn(model_dir, confidence_threshold))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example)) 
Example #3
Source File: vcfio.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
    return (pcoll
            | 'InputFiles' >> beam.Create(self._input_files)
            | 'SplitSource' >> beam.FlatMap(bgzf_io.split_bgzf)
            | 'Reshuffle' >> beam.Reshuffle()
            | 'ReadBlock' >> beam.ParDo(self._read_records)) 
Example #4
Source File: beam_reshuffle.py    From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
          file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  # Build and run the pipeline; the default runner is assumed here.
  with beam.Pipeline() as root:
    pipeline(root)
  logging.info("Processing complete.")
Example #5
Source File: wiki_preproc_pipeline.py    From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""

  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline 
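wiki_pipeline returns the inner pipeline-constructing callable rather than running it. A minimal sketch of how a caller might execute the returned function, assuming the default runner (the real script may configure options differently):

import apache_beam as beam

pipeline_fn = wiki_pipeline()
with beam.Pipeline() as root:
  pipeline_fn(root)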
Example #6
Source File: ccnews_preproc_pipeline.py    From language with Apache License 2.0
def ccnews_pipeline():
  """Read CCNews filenames and create Beam pipeline."""

  if FLAGS.dataset == "ccnews":
    data_filename = "ccnews.txt-%05d-of-01000"
    datasize = 1000
    testsize = 100
  else:
    data_filename = "wikipedia.txt-%05d-of-00500"
    datasize = 500
    testsize = 50
  train_files = [
      FLAGS.input_file + data_filename % i for i in range(datasize - testsize)
  ]
  test_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize, datasize)
  ]

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=testsize))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord",
            num_shards=datasize - testsize))
    return

  return pipeline 
Example #7
Source File: wiki_preproc_pipeline.py    From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""

  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline 
Example #8
Source File: cache_tasks_main.py    From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(self._emit_tokenized_examples)
        | beam.Reshuffle())  # Allows for additional parallelization. 
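An expand method like this one lives on a beam.PTransform subclass, which is what lets the transform be applied with the | operator. A hedged reconstruction of that surrounding structure; the class name and tokenization logic are assumptions, not the project's actual code:

import apache_beam as beam

class TokenizeFiles(beam.PTransform):  # hypothetical class name
  def __init__(self, files):
    self.files = files

  def _emit_tokenized_examples(self, path):
    yield {'path': path}  # placeholder for the real tokenization logic

  def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(self._emit_tokenized_examples)
        | beam.Reshuffle())  # Allows for additional parallelization.

with beam.Pipeline() as p:
  _ = p | TokenizeFiles(['a.txt', 'b.txt']) | beam.Map(print)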
Example #9
Source File: cache_tasks_main.py    From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pcoll):
    return (
        pcoll
        | beam.Map(t5.data.dict_to_tfexample)
        | beam.Reshuffle()
        | beam.io.tfrecordio.WriteToTFRecord(
            self._output_path,
            num_shards=self._num_shards,
            coder=beam.coders.ProtoCoder(tf.train.Example))) 
Example #10
Source File: create_cococameratraps_tfexample_main.py    From models with Apache License 2.0
def create_pipeline(pipeline,
                    image_directory,
                    input_annotations_file,
                    output_tfrecord_prefix=None,
                    num_images_per_shard=200,
                    keep_bboxes=True):
  """Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.

  Args:
    pipeline: Initialized beam pipeline.
    image_directory: Path to the image directory.
    input_annotations_file: Path to a COCO-CameraTraps annotation file.
    output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files will
      be named {output_tfrecord_prefix}@N.
    num_images_per_shard: The number of images to store in each shard.
    keep_bboxes: Whether to keep any bounding boxes that exist in the JSON file.
  """

  logging.info('Reading data from COCO-CameraTraps Dataset.')

  data = load_json_data(input_annotations_file)

  num_shards = int(np.ceil(float(len(data['images']))/num_images_per_shard))

  image_examples = (
      pipeline | ('CreateCollections') >> beam.Create(
          [im['id'] for im in data['images']])
      | ('ParseImage') >> beam.ParDo(ParseImage(
          image_directory, data['images'], data['annotations'],
          data['categories'], keep_bboxes=keep_bboxes)))
  _ = (image_examples
       | ('Reshuffle') >> beam.Reshuffle()
       | ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
           output_tfrecord_prefix,
           num_shards=num_shards,
           coder=beam.coders.ProtoCoder(tf.train.Example))) 
Example #11
Source File: generate_embedding_data.py    From models with Apache License 2.0
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       top_k_embedding_count, bottom_k_embedding_count,
                       num_shards):
  """Returns a beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    top_k_embedding_count: The number of high-confidence embeddings to store.
    bottom_k_embedding_count: The number of low-confidence embeddings to store.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
      GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
                              bottom_k_embedding_count))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example)) 
Example #12
Source File: beam_prepare_embedding_inputs.py    From exoplanet-ml with Apache License 2.0
def main(argv):
  del argv  # Unused.
  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)
      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))
      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  # Build and run the pipeline; the default runner is assumed here.
  with beam.Pipeline() as root:
    pipeline(root)
  logging.info("Preprocessing complete.")
Example #13
Source File: beam_sample_tfrecord.py    From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_pattern
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    assert FLAGS.kepid_whitelist

    # Read label whitelist.
    kepid_whitelist = [int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")]
    logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist))

    # Initialize DoFn.
    process_example = ProcessExampleDoFn(kepid_whitelist)

    # Create Pipeline.
    # pylint: disable=expression-not-assigned
    (root
     | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
         FLAGS.input_file_pattern,
         coder=beam.coders.ProtoCoder(tf.train.Example))
     | "process_examples" >> beam.ParDo(process_example)
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  # Build and run the pipeline; the default runner is assumed here.
  with beam.Pipeline() as root:
    pipeline(root)
  logging.info("Processing complete.")
Example #14
Source File: raw_books_preproc_pipeline.py    From language with Apache License 2.0
def ccnews_pipeline():
  """Read Books Corpus filenames and create Beam pipeline."""

  # Set a random seed for reproducibility.
  rng = random.Random(FLAGS.random_seed)

  # BooksCorpus is organized into directories by genre, with one file per book.
  # adventure-all.txt seems to contain all the adventure books in one file,
  # and romance-all.txt is the same. None of the other directories have this,
  # so we skip these two files to avoid double-counting those books.
  file_name_set = set()
  input_files_by_genre = collections.defaultdict(list)
  for path, _, fnames in tf.gfile.Walk(FLAGS.input_file):
    genre = path.split("/")[-1]
    for fname in fnames:
      if fname == "adventure-all.txt" or fname == "romance-all.txt":
        continue
      if fname in file_name_set:
        continue
      file_name_set.add(fname)
      input_files_by_genre[genre].append(path + "/" + fname)

  # Sort genres and iterate in order for reproducibility.
  train_files, test_files = [], []
  for genre, file_list in sorted(input_files_by_genre.items()):
    rng.shuffle(file_list)
    genre_size = len(file_list)
    test_size = int(FLAGS.test_size * genre_size)
    test_files.extend(file_list[:test_size])
    train_files.extend(file_list[test_size:])
    assert (len(file_list[:test_size]) +
            len(file_list[test_size:]) == len(file_list))

  # Make sure there is no test/train overlap.
  for filename in train_files:
    assert filename not in test_files

  rng.shuffle(train_files)
  rng.shuffle(test_files)

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=50))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord", num_shards=450))
    return

  return pipeline