Python apache_beam.Reshuffle() Examples
The following are 14 code examples of apache_beam.Reshuffle(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam.
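Before the project examples, here is a minimal, self-contained sketch of what beam.Reshuffle() is typically used for (the stage names and dummy data are illustrative, not taken from any project below): it inserts a shuffle barrier that prevents the runner from fusing the stages before and after it, so downstream work can be redistributed across workers.

import apache_beam as beam


def fan_out(name):
  """Illustrative expansion step: emit several records per input name."""
  return [(name, i) for i in range(3)]


with beam.Pipeline() as root:  # Uses the DirectRunner by default.
  _ = (
      root
      | 'Create' >> beam.Create(['a.txt', 'b.txt', 'c.txt'])
      | 'FanOut' >> beam.FlatMap(fan_out)
      # Without this barrier, a runner may fuse FanOut with the steps after
      # it, so all records derived from one input stay on a single worker.
      | 'Reshuffle' >> beam.Reshuffle()
      | 'Print' >> beam.Map(print))

The same pattern recurs throughout the examples below: a Reshuffle is placed after an expensive or fan-out step (parsing, inference, tokenization) and before the next stage, most often a TFRecord write.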
Example #1
Source File: create_kitti_crop_dataset.py From lingvo with Apache License 2.0
def main(_):
  beam_utils.BeamInit()

  if not FLAGS.output_file_pattern:
    raise ValueError('Must provide an output_file_pattern')

  reader = beam.io.ReadFromTFRecord(
      FLAGS.input_file_pattern,
      coder=beam.coders.ProtoCoder(tf.train.Example))
  model_name = FLAGS.model_name
  split = FLAGS.split
  run_preprocessors = FLAGS.run_preprocessors

  with beam_utils.GetPipelineRoot() as root:
    _ = (
        root
        | 'Read' >> reader
        | 'ToTFExample' >> beam.ParDo(
            _ProcessShard(model_name, split, run_preprocessors))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Write' >> beam.io.WriteToTFRecord(
            FLAGS.output_file_pattern,
            coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #2
Source File: generate_detection_data.py From models with Apache License 2.0
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       confidence_threshold, num_shards):
  """Returns a Beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    confidence_threshold: Threshold to use when keeping detection results.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'RunInference' >> beam.ParDo(
      GenerateDetectionDataFn(model_dir, confidence_threshold))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example))
Example #3
Source File: vcfio.py From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
  return (pcoll
          | 'InputFiles' >> beam.Create(self._input_files)
          | 'SplitSource' >> beam.FlatMap(bgzf_io.split_bgzf)
          | 'Reshuffle' >> beam.Reshuffle()
          | 'ReadBlock' >> beam.ParDo(self._read_records))
Example #4
Source File: beam_reshuffle.py From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(
          root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
              file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example #5
Source File: wiki_preproc_pipeline.py From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""
  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline
Example #6
Source File: ccnews_preproc_pipeline.py From language with Apache License 2.0
def ccnews_pipeline():
  """Read CCNews filenames and create Beam pipeline."""
  if FLAGS.dataset == "ccnews":
    data_filename = "ccnews.txt-%05d-of-01000"
    datasize = 1000
    testsize = 100
  else:
    data_filename = "wikipedia.txt-%05d-of-00500"
    datasize = 500
    testsize = 50
  train_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize)
  ]
  # Note: the upper bound must be datasize (not testsize, as in the scraped
  # copy), otherwise the range is empty; the last `testsize` shards are test.
  test_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize, datasize)
  ]

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=testsize))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord",
            num_shards=datasize - testsize))
    return

  return pipeline
Example #7
Source File: wiki_preproc_pipeline.py From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""
  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline
Example #8
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pipeline):
  return (
      pipeline
      | beam.Create(self.files)
      | beam.FlatMap(self._emit_tokenized_examples)
      | beam.Reshuffle())  # Allows for additional parallelization.
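This example (and Example #9 below) is an expand() method, which only makes sense on a beam.PTransform subclass. As a minimal sketch of how such a method is wrapped and applied (the class name and fan-out logic here are illustrative, not from the original cache_tasks_main.py):

import apache_beam as beam


class EmitAndReshuffle(beam.PTransform):
  """Illustrative composite transform: fan out inputs, then reshuffle."""

  def __init__(self, files):
    super().__init__()
    self.files = files

  def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(lambda f: [(f, i) for i in range(4)])
        | beam.Reshuffle())  # Allows for additional parallelization.


with beam.Pipeline() as root:
  _ = root | EmitAndReshuffle(['a.txt', 'b.txt']) | beam.Map(print)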
Example #9
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pcoll):
  return (
      pcoll
      | beam.Map(t5.data.dict_to_tfexample)
      | beam.Reshuffle()
      | beam.io.tfrecordio.WriteToTFRecord(
          self._output_path,
          num_shards=self._num_shards,
          coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #10
Source File: create_cococameratraps_tfexample_main.py From models with Apache License 2.0
def create_pipeline(pipeline,
                    image_directory,
                    input_annotations_file,
                    output_tfrecord_prefix=None,
                    num_images_per_shard=200,
                    keep_bboxes=True):
  """Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.

  Args:
    pipeline: Initialized beam pipeline.
    image_directory: Path to image directory.
    input_annotations_file: Path to a coco-cameratraps annotation file.
    output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files
      will be named {output_tfrecord_prefix}@N.
    num_images_per_shard: The number of images to store in each shard.
    keep_bboxes: Whether to keep any bounding boxes that exist in the json
      file.
  """
  logging.info('Reading data from COCO-CameraTraps Dataset.')
  data = load_json_data(input_annotations_file)
  num_shards = int(np.ceil(float(len(data['images'])) / num_images_per_shard))
  image_examples = (
      pipeline
      | ('CreateCollections') >> beam.Create(
          [im['id'] for im in data['images']])
      | ('ParseImage') >> beam.ParDo(ParseImage(
          image_directory, data['images'], data['annotations'],
          data['categories'], keep_bboxes=keep_bboxes)))
  _ = (image_examples
       | ('Reshuffle') >> beam.Reshuffle()
       | ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
           output_tfrecord_prefix,
           num_shards=num_shards,
           coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #11
Source File: generate_embedding_data.py From models with Apache License 2.0
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       top_k_embedding_count, bottom_k_embedding_count,
                       num_shards):
  """Returns a beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    top_k_embedding_count: The number of high-confidence embeddings to store.
    bottom_k_embedding_count: The number of low-confidence embeddings to
      store.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
      GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
                              bottom_k_embedding_count))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example))
Example #12
Source File: beam_prepare_embedding_inputs.py From exoplanet-ml with Apache License 2.0
def main(argv):
  del argv  # Unused.

  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the
    # results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)

      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))

      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  pipeline.run()
  logging.info("Preprocessing complete.")
Example #13
Source File: beam_sample_tfrecord.py From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_pattern
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    assert FLAGS.kepid_whitelist

    # Read label whitelist.
    kepid_whitelist = [
        int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")
    ]
    logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist))

    # Initialize DoFn.
    process_example = ProcessExampleDoFn(kepid_whitelist)

    # Create Pipeline.
    # pylint: disable=expression-not-assigned
    (root
     | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
         FLAGS.input_file_pattern,
         coder=beam.coders.ProtoCoder(tf.train.Example))
     | "process_examples" >> beam.ParDo(process_example)
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example #14
Source File: raw_books_preproc_pipeline.py From language with Apache License 2.0
def ccnews_pipeline():
  """Read Books Corpus filenames and create Beam pipeline."""
  # Set a random seed for reproducibility.
  rng = random.Random(FLAGS.random_seed)

  # BooksCorpus is organized into directories of genre and files of books.
  # adventure-all.txt seems to contain all the adventure books in 1 file.
  # romance-all.txt is the same. None of the other directories have this,
  # so we will skip it to not double count those books.
  file_name_set = set()
  input_files_by_genre = collections.defaultdict(list)
  for path, _, fnames in tf.gfile.Walk(FLAGS.input_file):
    genre = path.split("/")[-1]
    for fname in fnames:
      if fname == "adventure-all.txt" or fname == "romance-all.txt":
        continue
      if fname in file_name_set:
        continue
      file_name_set.add(fname)
      input_files_by_genre[genre].append(path + "/" + fname)

  # Sort genres and iterate in order for reproducibility.
  train_files, test_files = [], []
  for genre, file_list in sorted(input_files_by_genre.items()):
    rng.shuffle(file_list)
    genre_size = len(file_list)
    test_size = int(FLAGS.test_size * genre_size)
    test_files.extend(file_list[:test_size])
    train_files.extend(file_list[test_size:])
    assert len(file_list[:test_size]) + \
        len(file_list[test_size:]) == len(file_list)

  # Make sure there is no test/train overlap.
  for filename in train_files:
    assert filename not in test_files

  rng.shuffle(train_files)
  rng.shuffle(test_files)

  def pipeline(root):
    """Beam pipeline for converting Books Corpus files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=50))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord", num_shards=450))
    return

  return pipeline