Python apache_beam.Reshuffle() Examples
The following are 14 code examples of apache_beam.Reshuffle(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam.
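Before the project examples, here is a minimal, self-contained sketch of what beam.Reshuffle() is typically used for (the stage names and dummy data are illustrative, not taken from any project below): it inserts a shuffle barrier that prevents the runner from fusing the stages before and after it, so downstream work can be redistributed across workers.

import apache_beam as beam


def fan_out(name):
  """Illustrative expansion step: emit several records per input name."""
  return [(name, i) for i in range(3)]


with beam.Pipeline() as root:  # Uses the DirectRunner by default.
  _ = (
      root
      | 'Create' >> beam.Create(['a.txt', 'b.txt', 'c.txt'])
      | 'FanOut' >> beam.FlatMap(fan_out)
      # Without this barrier, a runner may fuse FanOut with the steps after
      # it, so all records derived from one input stay on a single worker.
      | 'Reshuffle' >> beam.Reshuffle()
      | 'Print' >> beam.Map(print))

The same pattern recurs throughout the examples below: a Reshuffle is placed after an expensive or fan-out step (parsing, inference, tokenization) and before the next stage, most often a TFRecord write.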
Example #1
Source File: create_kitti_crop_dataset.py From lingvo with Apache License 2.0
def main(_):
  beam_utils.BeamInit()

  if not FLAGS.output_file_pattern:
    raise ValueError('Must provide an output_file_pattern')

  reader = beam.io.ReadFromTFRecord(
      FLAGS.input_file_pattern,
      coder=beam.coders.ProtoCoder(tf.train.Example))
  model_name = FLAGS.model_name
  split = FLAGS.split
  run_preprocessors = FLAGS.run_preprocessors

  with beam_utils.GetPipelineRoot() as root:
    _ = (
        root
        | 'Read' >> reader
        | 'ToTFExample' >> beam.ParDo(
            _ProcessShard(model_name, split, run_preprocessors))
        | 'Reshuffle' >> beam.Reshuffle()
        | 'Write' >> beam.io.WriteToTFRecord(
            FLAGS.output_file_pattern,
            coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #2
Source File: generate_detection_data.py From models with Apache License 2.0
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       confidence_threshold, num_shards):
  """Returns a Beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    confidence_threshold: Threshold to use when keeping detection results.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'RunInference' >> beam.ParDo(
      GenerateDetectionDataFn(model_dir, confidence_threshold))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example))
Example #3
Source File: vcfio.py From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
  return (pcoll
          | 'InputFiles' >> beam.Create(self._input_files)
          | 'SplitSource' >> beam.FlatMap(bgzf_io.split_bgzf)
          | 'Reshuffle' >> beam.Reshuffle()
          | 'ReadBlock' >> beam.ParDo(self._read_records))
Example #4
Source File: beam_reshuffle.py From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(
          root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
              file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example #5
Source File: wiki_preproc_pipeline.py From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""
  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline
Example #6
Source File: ccnews_preproc_pipeline.py From language with Apache License 2.0
def ccnews_pipeline():
  """Read CCNews filenames and create Beam pipeline."""
  if FLAGS.dataset == "ccnews":
    data_filename = "ccnews.txt-%05d-of-01000"
    datasize = 1000
    testsize = 100
  else:
    data_filename = "wikipedia.txt-%05d-of-00500"
    datasize = 500
    testsize = 50
  train_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize)
  ]
  # Note: the upper bound must be datasize (not testsize, as in the scraped
  # copy), otherwise the range is empty; the last `testsize` shards are test.
  test_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize, datasize)
  ]

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=testsize))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord",
            num_shards=datasize - testsize))
    return

  return pipeline
Example #7
Source File: wiki_preproc_pipeline.py From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""
  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline
Example #8
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pipeline):
  return (
      pipeline
      | beam.Create(self.files)
      | beam.FlatMap(self._emit_tokenized_examples)
      | beam.Reshuffle())  # Allows for additional parallelization.
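This example (and Example #9 below) is an expand() method, which only makes sense on a beam.PTransform subclass. As a minimal sketch of how such a method is wrapped and applied (the class name and fan-out logic here are illustrative, not from the original cache_tasks_main.py):

import apache_beam as beam


class EmitAndReshuffle(beam.PTransform):
  """Illustrative composite transform: fan out inputs, then reshuffle."""

  def __init__(self, files):
    super().__init__()
    self.files = files

  def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(lambda f: [(f, i) for i in range(4)])
        | beam.Reshuffle())  # Allows for additional parallelization.


with beam.Pipeline() as root:
  _ = root | EmitAndReshuffle(['a.txt', 'b.txt']) | beam.Map(print)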
Example #9
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pcoll):
  return (
      pcoll
      | beam.Map(t5.data.dict_to_tfexample)
      | beam.Reshuffle()
      | beam.io.tfrecordio.WriteToTFRecord(
          self._output_path,
          num_shards=self._num_shards,
          coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #10
Source File: create_cococameratraps_tfexample_main.py From models with Apache License 2.0
def create_pipeline(pipeline,
                    image_directory,
                    input_annotations_file,
                    output_tfrecord_prefix=None,
                    num_images_per_shard=200,
                    keep_bboxes=True):
  """Creates a beam pipeline for producing a COCO-CameraTraps Image dataset.

  Args:
    pipeline: Initialized beam pipeline.
    image_directory: Path to image directory.
    input_annotations_file: Path to a coco-cameratraps annotation file.
    output_tfrecord_prefix: Absolute path for tfrecord outputs. Final files
      will be named {output_tfrecord_prefix}@N.
    num_images_per_shard: The number of images to store in each shard.
    keep_bboxes: Whether to keep any bounding boxes that exist in the json
      file.
  """
  logging.info('Reading data from COCO-CameraTraps Dataset.')
  data = load_json_data(input_annotations_file)
  num_shards = int(np.ceil(float(len(data['images'])) / num_images_per_shard))
  image_examples = (
      pipeline
      | ('CreateCollections') >> beam.Create(
          [im['id'] for im in data['images']])
      | ('ParseImage') >> beam.ParDo(ParseImage(
          image_directory, data['images'], data['annotations'],
          data['categories'], keep_bboxes=keep_bboxes)))
  _ = (image_examples
       | ('Reshuffle') >> beam.Reshuffle()
       | ('WriteTfImageExample') >> beam.io.tfrecordio.WriteToTFRecord(
           output_tfrecord_prefix,
           num_shards=num_shards,
           coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #11
Source File: generate_embedding_data.py From models with Apache License 2.0
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
                       top_k_embedding_count, bottom_k_embedding_count,
                       num_shards):
  """Returns a beam pipeline to run object detection inference.

  Args:
    pipeline: Initialized beam pipeline.
    input_tfrecord: A TFRecord of tf.train.Example protos containing images.
    output_tfrecord: A TFRecord of tf.train.Example protos that contain images
      in the input TFRecord and the detections from the model.
    model_dir: Path to `saved_model` to use for inference.
    top_k_embedding_count: The number of high-confidence embeddings to store.
    bottom_k_embedding_count: The number of low-confidence embeddings to
      store.
    num_shards: The number of output shards.
  """
  input_collection = (
      pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
          input_tfrecord,
          coder=beam.coders.BytesCoder()))
  output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
      GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
                              bottom_k_embedding_count))
  output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
  _ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
      output_tfrecord,
      num_shards=num_shards,
      coder=beam.coders.ProtoCoder(tf.train.Example))
Example #12
Source File: beam_prepare_embedding_inputs.py From exoplanet-ml with Apache License 2.0
def main(argv):
  del argv  # Unused.

  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the
    # results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)

      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))

      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  pipeline.run()
  logging.info("Preprocessing complete.")
Example #13
Source File: beam_sample_tfrecord.py From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_pattern
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    assert FLAGS.kepid_whitelist

    # Read label whitelist.
    kepid_whitelist = [
        int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")
    ]
    logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist))

    # Initialize DoFn.
    process_example = ProcessExampleDoFn(kepid_whitelist)

    # Create Pipeline.
    # pylint: disable=expression-not-assigned
    (root
     | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
         FLAGS.input_file_pattern,
         coder=beam.coders.ProtoCoder(tf.train.Example))
     | "process_examples" >> beam.ParDo(process_example)
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
Example #14
Source File: raw_books_preproc_pipeline.py From language with Apache License 2.0
def ccnews_pipeline():
  """Read Books Corpus filenames and create Beam pipeline."""
  # Set a random seed for reproducibility.
  rng = random.Random(FLAGS.random_seed)

  # BooksCorpus is organized into directories of genre and files of books.
  # adventure-all.txt seems to contain all the adventure books in 1 file.
  # romance-all.txt is the same. None of the other directories have this,
  # so we will skip it to not double count those books.
  file_name_set = set()
  input_files_by_genre = collections.defaultdict(list)
  for path, _, fnames in tf.gfile.Walk(FLAGS.input_file):
    genre = path.split("/")[-1]
    for fname in fnames:
      if fname == "adventure-all.txt" or fname == "romance-all.txt":
        continue
      if fname in file_name_set:
        continue
      file_name_set.add(fname)
      input_files_by_genre[genre].append(path + "/" + fname)

  # Sort genres and iterate in order for reproducibility.
  train_files, test_files = [], []
  for genre, file_list in sorted(input_files_by_genre.items()):
    rng.shuffle(file_list)
    genre_size = len(file_list)
    test_size = int(FLAGS.test_size * genre_size)
    test_files.extend(file_list[:test_size])
    train_files.extend(file_list[test_size:])
    assert len(file_list[:test_size]) + \
        len(file_list[test_size:]) == len(file_list)

  # Make sure there is no test/train overlap.
  for filename in train_files:
    assert filename not in test_files

  rng.shuffle(train_files)
  rng.shuffle(test_files)

  def pipeline(root):
    """Beam pipeline for converting Books Corpus files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=50))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord", num_shards=450))
    return

  return pipeline