Python apache_beam.FlatMap() Examples
The following are 30 code examples of apache_beam.FlatMap().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module apache_beam, or try the search function.
Example #1
Source File: preprocess.py From professional-services with Apache License 2.0 | 7 votes |
def shuffle(p):
    """Shuffles data from PCollection.

    Every element is paired with a key drawn from `random.random()`, the
    pairs are grouped by that key, and the grouped values are flattened
    back into individual elements.

    Args:
      p: PCollection.

    Returns:
      PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):
        """Emits a (random key, element) pair for each input element."""

        def process(self, element):
            yield random.random(), element

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # `lambda (k, vs): vs` relied on tuple-parameter unpacking, which is
        # a SyntaxError on Python 3; index the (key, values) pair instead.
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example #2
Source File: linters.py From data-linter with Apache License 2.0 | 6 votes |
def _lint(self, examples):
    """Counts feature values in `examples` and turns them into a result.

    Values are tuplized, flattened and counted per element. If the instance
    defines `_count_transformer`, it is applied to the counts. The counts
    are then regrouped per feature, checked, filtered for truthy entries,
    and collected into a single list that is passed to `_to_result`.

    Args:
      examples: Input to the 'Tuplize' step.

    Returns:
      The output of the final 'ToResult' step.
    """
    tuplize_fn = utils.example_tuplizer(self._counted_features)
    counts = examples | 'Tuplize' >> beam.FlatMap(tuplize_fn)
    counts = counts | 'FlattenFeatureVals' >> beam.FlatMap(
        self._flatten_feature_vals)
    counts = counts | 'CountFeatureVals' >> beam.combiners.Count.PerElement()

    # Optional hook: only instances that define it post-process the counts.
    if hasattr(self, '_count_transformer'):
        counts = counts | 'TransformCounts' >> self._count_transformer

    per_feature = (
        counts
        | 'PairValWithCount' >> beam.Map(self._shift_key)
        | 'GroupByFeature' >> beam.GroupByKey()
        | 'ValCountsToDict' >> beam.Map(self._val_counts_as_dict))
    warnings = (
        per_feature
        | 'GenResults' >> beam.Map(self._check_feature)
        | 'DropUnwarned' >> beam.Filter(bool))
    return (
        warnings
        | 'AsList' >> beam.combiners.ToList()
        | 'ToResult' >> beam.Map(self._to_result))
Example #3
Source File: impl.py From transform with Apache License 2.0 | 6 votes |
def _clear_shared_state_after_barrier(pipeline, input_barrier):
    """Clears shared state from within a pipeline context.

    The clearing step takes a side input derived from `input_barrier`, so it
    only runs once `input_barrier` becomes available.

    Args:
      pipeline: A `beam.Pipeline` object.
      input_barrier: A `PCollection` which the pipeline should wait for.

    Returns:
      The `PCollection` produced by the 'WaitAndClearSharedKeepAlives' step.
    """
    # The callable returns None for every element, so this step only exists
    # to create a dependency on `input_barrier`.
    barrier_step = 'MakeCheapBarrier' >> beam.FlatMap(lambda x: None)
    empty_pcoll = input_barrier | barrier_step

    seed = pipeline | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
    clear_step = 'WaitAndClearSharedKeepAlives' >> beam.Map(
        lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
        beam.pvalue.AsIter(empty_pcoll))
    return seed | clear_step
Example #4
Source File: analyzer_impls.py From transform with Apache License 2.0 | 6 votes |
def expand(self, inputs):
    """Splits each element of the single input into tagged outputs.

    Args:
      inputs: A one-element sequence holding the input PCollection.

    Returns:
      A tuple with one tagged output per index in
      `range(self._num_outputs)`, in index order.

    Raises:
      ValueError: Raised by the per-element function when an element does
        not contain exactly `self._num_outputs` values.
    """
    (pcoll,) = inputs
    tags = [str(index) for index in range(self._num_outputs)]

    def extract_outputs(outputs, num_outputs):
        """Yields each value of `outputs` tagged with its position."""
        produced = len(outputs)
        if produced != num_outputs:
            raise ValueError(
                'Analyzer has {} outputs but its implementation produced {} '
                'values'.format(num_outputs, produced))
        for position, value in enumerate(outputs):
            yield beam.pvalue.TaggedOutput(str(position), value)

    splitter = beam.FlatMap(extract_outputs, self._num_outputs)
    tagged = pcoll | 'ExtractOutputs' >> splitter.with_outputs(*tags)
    return tuple(tagged[tag] for tag in tags)
Example #5
Source File: obspyio.py From SeisNN with MIT License | 6 votes |
def expand(self, pcollection):
    """Lists the files under `self.file_patterns` and yields their events.

    Args:
      pcollection: Starting point the 'Create file directory' step is
        applied to.

    Returns:
      The output of the 'Get event' step.
    """

    def get_dir_list(file_dir, suffix=""):
        """Returns the joined paths of entries in `file_dir` ending in `suffix`."""
        return [
            os.path.join(file_dir, entry)
            for entry in os.listdir(file_dir)
            if entry.endswith(suffix)
        ]

    def get_events(filename):
        """Yields each event of the catalog read from `filename`."""
        catalog, wavename = read_nordic(filename, return_wavnames=True)
        for event in catalog.events:
            # Record the waveform name on every pick before emitting.
            for pick in event.picks:
                pick.waveform_id.wavename = wavename
            yield event

    directories = pcollection | 'Create file directory' >> beam.Create(
        self.file_patterns)
    files = directories | 'List all files' >> beam.FlatMap(get_dir_list)
    return files | 'Get event' >> beam.FlatMap(get_events)
Example #6
Source File: preprocess.py From professional-services with Apache License 2.0 | 6 votes |
def shuffle_data(p):
    """Shuffles data from PCollection.

    Every element is paired with a key drawn from `random.random()`, the
    pairs are grouped by that key, and the grouped values are flattened
    back into individual elements.

    Args:
      p: PCollection.

    Returns:
      PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):
        """Emits a (random key, element) pair for each input element."""

        def process(self, element):
            yield (random.random(), element)

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # `lambda (k, vs): vs` relied on tuple-parameter unpacking, which is
        # a SyntaxError on Python 3; index the (key, values) pair instead.
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example #7
Source File: preprocess.py From professional-services with Apache License 2.0 | 6 votes |
def run(p, args):
    """Creates a pipeline to build and write train/val/test datasets.

    Args:
      p: Pipeline object the 'ReadBQ' step is applied to.
      args: Options object; this function reads `cloud`, `tft_dir`,
        `user_min_count`, `item_min_count`, `output_dir` and `plain_text`.
    """
    # pylint: disable=no-value-for-parameter
    if args.cloud:
        query = bq_query.query
    else:
        # When not running with `args.cloud`, cap the query at 10 rows.
        query = "{} LIMIT 10".format(bq_query.query)

    rows = p | "ReadBQ" >> ReadBQ(query)
    rows = rows | "HandleNullUserTags" >> beam.Map(_handle_null_user_tags)
    raw_data = rows | "NormalizeUserTags" >> beam.Map(_normalize_user_tags)

    transformed = _run_tft_fn(raw_data, _preprocess_tft, args.tft_dir,
                              args.user_min_count, args.item_min_count)
    cleaned = (
        transformed
        | "FilterData" >> beam.FlatMap(_filter_data)
        | "CleanTags" >> beam.Map(_clean_tags))

    for name, dataset in _split_data(cleaned):
        label = "Write{}Output".format(name)
        writer = WriteOutput(name, args.output_dir, constants.TRAIN_SPEC,
                             args.plain_text)
        _ = dataset | label >> writer
Example #8
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes |
def shuffle(p):
    """Shuffles the given pCollection.

    Elements are keyed with `random.random()`, grouped by that key, and the
    grouped values are flattened back out.
    """
    keyed = p | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
    grouped = keyed | 'GroupByRandom' >> beam.GroupByKey()
    return grouped | 'DropRandom' >> beam.FlatMap(lambda x: x[1])


# pylint: disable=expression-not-assigned
# pylint: disable=no-value-for-parameter
Example #9
Source File: fusion_break.py From gcp-variant-transforms with Apache License 2.0 | 5 votes |
def expand(self, pcoll):
    """Passes `pcoll` through a Map that takes a side input derived from it.

    Args:
      pcoll: Input PCollection.

    Returns:
      The output of a Map that returns each element unchanged.
    """
    # Create an empty PCollection that depends on pcoll.
    empty = pcoll | beam.FlatMap(lambda x: ())
    side_input = beam.pvalue.AsIter(empty)
    return pcoll | beam.Map(lambda x, unused: x, side_input)
Example #10
Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_table, pipeline_args):
    """Read the records from GCS and write them to BigQuery.

    Args:
      input_pattern: Passed to `f2pn.match_files` to produce the input.
      output_table: Table handed to `beam.io.BigQuerySink`.
      pipeline_args: Arguments used to build the `PipelineOptions`.
    """
    options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=options)

    files = p | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
    records = files | 'to_records' >> beam.FlatMap(map_file_to_records)
    bq_inputs = records | 'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs)
    # WRITE_TRUNCATE is the write disposition passed to the sink.
    _ = bq_inputs | 'write' >> beam.io.Write(
        beam.io.BigQuerySink(
            output_table,
            schema='patient_id:INTEGER, note:STRING',
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    result = p.run().wait_until_finish()
    logging.info('GCS to BigQuery result: %s', result)
Example #11
Source File: obspyio.py From SeisNN with MIT License | 5 votes |
def expand(self, pcollection):
    """Attaches matching picks and a location to every stream.

    Args:
      pcollection: PCollection whose elements unpack into a key and a dict
        with 'pick', 'stream' and 'location' entries.

    Returns:
      The output of the 'Stream search picks' step.
    """

    def search_pick(pick_list, stream):
        """Groups by phase hint the picks timed inside the first trace's span."""
        first_stats = stream.traces[0].stats
        window_start = first_stats.starttime
        window_end = first_stats.endtime

        picks_by_phase = {}
        for candidate in pick_list:
            phase = candidate.phase_hint
            # Strict comparison: picks exactly on a boundary are skipped.
            if not window_start < candidate.time < window_end:
                continue
            picks_by_phase.setdefault(phase, []).append(candidate)
        return picks_by_phase

    def stream_get_pick(data):
        """Yields each stream after setting its `picks` and `location`."""
        _, grouped = data
        all_picks = grouped['pick']
        streams = grouped['stream']
        # Only the first 'location' entry is used.
        location = grouped['location'][0]
        for stream in streams:
            stream.picks = search_pick(all_picks, stream)
            stream.location = location
            yield stream

    return pcollection | 'Stream search picks' >> beam.FlatMap(stream_get_pick)
Example #12
Source File: wiki_preproc_pipeline.py From language with Apache License 2.0 | 5 votes |
def wiki_pipeline():
    """Read WikiText103 filenames and create Beam pipeline.

    Returns:
      A function taking the pipeline root and attaching the test, dev and
      train branches to it.
    """
    input_dir = FLAGS.input_file
    train_files = input_dir + "/wiki.train.raw"
    dev_files = input_dir + "/wiki.valid.raw"
    test_files = input_dir + "/wiki.test.raw"

    def pipeline(root):
        """Beam pipeline for converting WikiText103 files to TF Examples."""
        output_prefix = FLAGS.output_file + ".cpc." + FLAGS.format

        test_docs = (
            root
            | "Create test files" >> beam.Create([test_files])
            | "Read test files" >> beam.FlatMap(read_file)
            | "test Shuffle" >> beam.Reshuffle())
        test_records = (
            test_docs
            | "Preproc test docs" >> beam.FlatMap(preproc_doc)
            | "record test Shuffle" >> beam.Reshuffle())
        _ = test_records | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            output_prefix + ".test.tfrecord", num_shards=10)

        dev_docs = (
            root
            | "Create dev files" >> beam.Create([dev_files])
            | "Read dev files" >> beam.FlatMap(read_file)
            | "dev Shuffle" >> beam.Reshuffle())
        dev_records = (
            dev_docs
            | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
            | "record dev Shuffle" >> beam.Reshuffle())
        _ = dev_records | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            output_prefix + ".dev.tfrecord", num_shards=10)

        train_docs = (
            root
            | "Create train files" >> beam.Create([train_files])
            | "Read train files" >> beam.FlatMap(read_file)
            | "train Shuffle" >> beam.Reshuffle())
        train_records = (
            train_docs
            | "Preproc train docs" >> beam.FlatMap(preproc_doc)
            | "record train Shuffle" >> beam.Reshuffle())
        _ = train_records | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            output_prefix + ".train.tfrecord", num_shards=100)

    return pipeline
Example #13
Source File: ccnews_preproc_pipeline.py From language with Apache License 2.0 | 5 votes |
def ccnews_pipeline():
    """Read CCNews filenames and create Beam pipeline.

    The shard count and filename template depend on `FLAGS.dataset`. The
    first `datasize - testsize` shards form the train split and the
    remaining `testsize` shards form the test split.

    Returns:
      A function taking the pipeline root and attaching the test and train
      branches to it.
    """
    if FLAGS.dataset == "ccnews":
        data_filename = "ccnews.txt-%05d-of-01000"
        datasize = 1000
        testsize = 100
    else:
        data_filename = "wikipedia.txt-%05d-of-00500"
        datasize = 500
        testsize = 50

    num_train = datasize - testsize
    train_files = [
        FLAGS.input_file + data_filename % i for i in range(num_train)
    ]
    # The upper bound was previously `testsize`, which made the range empty
    # (e.g. range(900, 100)), so no test files were ever read. The test
    # split is the last `testsize` shards.
    test_files = [
        FLAGS.input_file + data_filename % i
        for i in range(num_train, datasize)
    ]

    def pipeline(root):
        """Beam pipeline for converting CCNews files to TF Examples."""
        _ = (
            root
            | "Create test files" >> beam.Create(test_files)
            | "Read test files" >> beam.FlatMap(read_file)
            | "test Shuffle" >> beam.Reshuffle()
            | "Preproc test docs" >> beam.FlatMap(preproc_doc)
            | "record test Shuffle" >> beam.Reshuffle()
            | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
                FLAGS.output_file + ".cc_cpc.test.tfrecord",
                num_shards=testsize))
        _ = (
            root
            | "Create train files" >> beam.Create(train_files)
            | "Read train files" >> beam.FlatMap(read_file)
            | "train Shuffle" >> beam.Reshuffle()
            | "Preproc train docs" >> beam.FlatMap(preproc_doc)
            | "record train Shuffle" >> beam.Reshuffle()
            | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
                FLAGS.output_file + ".cc_cpc.train.tfrecord",
                num_shards=num_train))
        return

    return pipeline
Example #14
Source File: wiki_preproc_pipeline.py From language with Apache License 2.0 | 5 votes |
def wiki_pipeline():
    """Read WikiText103 filenames and create Beam pipeline.

    Returns:
      A function taking the pipeline root and attaching the test, dev and
      train branches to it.
    """
    base_dir = FLAGS.input_file
    train_files = base_dir + "/wiki.train.raw"
    dev_files = base_dir + "/wiki.valid.raw"
    test_files = base_dir + "/wiki.test.raw"

    def pipeline(root):
        """Beam pipeline for converting WikiText103 files to TF Examples."""
        record_prefix = FLAGS.output_file + "." + FLAGS.format

        test_source = root | "Create test files" >> beam.Create([test_files])
        test_examples = (
            test_source
            | "Read test files" >> beam.FlatMap(read_file)
            | "test Shuffle" >> beam.Reshuffle()
            | "Preproc test docs" >> beam.FlatMap(preproc_doc)
            | "record test Shuffle" >> beam.Reshuffle())
        _ = test_examples | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            record_prefix + ".test.tfrecord", num_shards=10)

        dev_source = root | "Create dev files" >> beam.Create([dev_files])
        dev_examples = (
            dev_source
            | "Read dev files" >> beam.FlatMap(read_file)
            | "dev Shuffle" >> beam.Reshuffle()
            | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
            | "record dev Shuffle" >> beam.Reshuffle())
        _ = dev_examples | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            record_prefix + ".dev.tfrecord", num_shards=10)

        train_source = root | "Create train files" >> beam.Create([train_files])
        train_examples = (
            train_source
            | "Read train files" >> beam.FlatMap(read_file)
            | "train Shuffle" >> beam.Reshuffle()
            | "Preproc train docs" >> beam.FlatMap(preproc_doc)
            | "record train Shuffle" >> beam.Reshuffle())
        _ = train_examples | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            record_prefix + ".train.tfrecord", num_shards=100)

    return pipeline
Example #15
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes |
def Shuffle(p):
    """Shuffles the given pCollection.

    Elements are keyed with `np.random.random()`, grouped by that key, and
    the grouped values are flattened back out.
    """
    with_keys = p | "PairWithRandom" >> beam.Map(
        lambda x: (np.random.random(), x))
    by_key = with_keys | "GroupByRandom" >> beam.GroupByKey()
    return by_key | "DropRandom" >> beam.FlatMap(lambda x: x[1])
Example #16
Source File: physionet_to_mae_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
    """Read the physionet records from GCS and write them out as MAE.

    Args:
      input_pattern: Passed to `f2pn.match_files` to produce the input.
      output_dir: Forwarded to `write_mae`.
      mae_task_name: Forwarded to `mae.generate_mae`.
      project: Forwarded to `write_mae`.
      pipeline_args: Arguments used to build the `PipelineOptions`.
    """
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
         | 'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings)
         | 'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                      ['patient_id', 'record_number'])
         | 'write_mae' >> beam.Map(write_mae, project, output_dir))
    result = p.run().wait_until_finish()
    # The message previously read "GCS to BigQuery", copied from another
    # pipeline; this one writes MAE, so name it accordingly.
    logging.info('Physionet to MAE result: %s', result)
Example #17
Source File: test_pipeline.py From professional-services with Apache License 2.0 | 5 votes |
def test_enrichment_allevents(self):
    """Tests the pipeline by creating mock data and invoking respective transforms"""
    with TestPipeline() as pipeline:
        side_path = "gs://bucketname/input/sideinput/20200503".encode()
        sideinput_filepath = (
            pipeline
            | 'Read side Input file path' >> beam.Create([side_path])
            | "Add Timestamp" >> beam.Map(
                lambda e: beam.window.TimestampedValue(e, int(time.time()))))
        sideinput_collections = main.get_sideinput_collections(
            sideinput_filepath, static_data.ReadStaticSideInput())

        # Main input: timestamped mock events in 10-unit fixed windows.
        raw_events = pipeline | 'Read primary events' >> beam.Create(
            static_data.get_maininput_events())
        stamped_events = raw_events | 'Attach timestamp' >> beam.Map(
            lambda e: beam.window.TimestampedValue(e, int(time.time())))
        sales_events = stamped_events | "Add Fixed window" >> beam.WindowInto(
            beam.window.FixedWindows(10))

        enriched_events = main.get_enriched_events(sales_events,
                                                   sideinput_collections)
        results = enriched_events | "UnPack the events" >> beam.FlatMap(
            lambda x: x)

        expected = static_data.get_expected_enriched_events()
        assert_that(results, equal_to(expected))
Example #18
Source File: static_data.py From professional-services with Apache License 2.0 | 5 votes |
def expand(self, pcol):
    """Fetch files based on the file pattern and read the matching files.

    Args:
      pcol: PCollection containing list of file patterns

    Returns:
      PCollection of type Json objects that represents data read from the
      files
    """
    # yapf: disable
    lookup_step = beam.FlatMap(self._get_lookup_values)
    return pcol | "Static Side Input Values" >> lookup_step
    # yapf: enable
Example #19
Source File: enforce_primary_keys.py From professional-services with Apache License 2.0 | 5 votes |
def expand(self, pcoll):
    """Keys rows by `self.primary_key`, samples one per key, drops the keys.

    Args:
      pcoll: PCollection of rows indexable by `self.primary_key`.

    Returns:
      The output of the 'Drop keys' step.
    """
    keyed_rows = pcoll | 'Extract Primary Key' >> beam.FlatMap(
        lambda row: [(row[self.primary_key], row)])
    sampled = keyed_rows | 'Sample n=1 by Primary Key' >> CombinePerKey(
        SampleCombineFn(1))
    # Flatten the second item of each pair back into individual elements.
    return sampled | 'Drop keys' >> beam.FlatMap(lambda kv: kv[1])
Example #20
Source File: create_data.py From conversational-datasets with Apache License 2.0 | 5 votes |
def _shuffle_examples(examples):
    """Shuffles `examples` by grouping them under random UUID keys.

    Args:
      examples: Input PCollection.

    Returns:
      The grouped values flattened back into individual elements.
    """
    return (
        examples
        | "add random key" >> beam.Map(lambda example: (uuid.uuid4(), example))
        | "group by key" >> beam.GroupByKey()
        | "get shuffled values" >> beam.FlatMap(lambda t: t[1]))
Example #21
Source File: create_data.py From conversational-datasets with Apache License 2.0 | 5 votes |
def _shuffle_examples(examples):
    """Shuffles `examples` by grouping them under random UUID keys.

    Args:
      examples: Input PCollection.

    Returns:
      The grouped values flattened back into individual elements.
    """
    keyed = examples | "add random key" >> beam.Map(
        lambda example: (uuid.uuid4(), example))
    grouped = keyed | "group by key" >> beam.GroupByKey()
    return grouped | "get shuffled values" >> beam.FlatMap(lambda t: t[1])
Example #22
Source File: create_data.py From conversational-datasets with Apache License 2.0 | 5 votes |
def _shuffle(pcollection):
    """Shuffles the input pcollection.

    Each value is paired with a `uuid.uuid4()` key, the pairs are grouped by
    key, and the grouped values are flattened back out.
    """
    add_key = "add random key" >> beam.Map(lambda value: (uuid.uuid4(), value))
    with_keys = pcollection | add_key
    grouped = with_keys | "group by key" >> beam.GroupByKey()
    drop_key = "get shuffled values" >> beam.FlatMap(lambda t: t[1])
    return grouped | drop_key
Example #23
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
def expand(self, pipeline):
    """Creates `self.files`, emits tokenized examples, and reshuffles.

    Args:
      pipeline: Object the `beam.Create` step is applied to.

    Returns:
      The reshuffled output of `self._emit_tokenized_examples`.
    """
    files = pipeline | beam.Create(self.files)
    examples = files | beam.FlatMap(self._emit_tokenized_examples)
    # Allows for additional parallelization.
    return examples | beam.Reshuffle()
Example #24
Source File: jackknife.py From model-analysis with Apache License 2.0 | 5 votes |
def expand(self, sliced_extracts):
    """Partitions the input and builds jackknife samples per slice.

    The input is split into `self._num_jackknife_samples` partitions. Each
    partition is combined per key and tagged with its partition index; the
    partitions are then flattened, grouped by key, and passed to
    `_make_jackknife_samples`.

    Args:
      sliced_extracts: Input PCollection to partition.

    Returns:
      The output of the 'MakeJackknifeSamples' step.
    """

    def partition_fn(_, num_partitions):
        # Partition index comes from the instance's random state.
        return self._random_state.randint(num_partitions)

    def add_partition_index(slice_key,
                            accumulator_and_size,
                            partition_index=None):
        accumulator, size = accumulator_and_size
        info = _PartitionInfo(accumulator, size, partition_index)
        return slice_key, info

    def accumulate_partition(index, partition):
        # Within each partition, partially combine per slice key to get
        # accumulators and partition sizes; add partition_id for determinism.
        combine_fn = beam.transforms.combiners.SingleInputTupleCombineFn(
            _AccumulateOnlyCombiner(combiner=self._combiner),
            beam.transforms.combiners.CountCombineFn())
        combined = partition | 'CombinePartition[{}]'.format(
            index) >> beam.CombinePerKey(combine_fn)
        return combined | 'AddPartitionId[{}]'.format(index) >> beam.MapTuple(
            add_partition_index, index)

    # Partition the data
    # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
    partitions = sliced_extracts | 'Partition' >> beam.Partition(
        partition_fn, self._num_jackknife_samples)

    # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
    partition_accumulators = [
        accumulate_partition(index, partition)
        for index, partition in enumerate(partitions)
    ]

    # Group partitions for the same slice, compute LOO metrics, and flatten
    # back into per-partition LOO metrics.
    # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
    flattened = (
        partition_accumulators
        | 'FlattenPartitionAccumulators' >> beam.Flatten())
    per_slice = flattened | 'CollectPerSlicePartitions' >> beam.GroupByKey()
    return per_slice | 'MakeJackknifeSamples' >> beam.FlatMap(
        _make_jackknife_samples, combiner=self._combiner)
Example #25
Source File: unbatch_extractor.py From model-analysis with Apache License 2.0 | 5 votes |
def _UnbatchInputs(
    extracts: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    """Extracts unbatched inputs from batched extracts.

    Args:
      extracts: PCollection containing batched extracts.

    Returns:
      PCollection of per-example extracts.
    """
    unbatch = beam.FlatMap(_ExtractUnbatchedInputs)
    return extracts | 'UnbatchInputs' >> unbatch
Example #26
Source File: embedding_generator.py From hub with Apache License 2.0 | 5 votes |
def run(args):
    """Runs the embedding generation Beam pipeline.

    Args:
      args: Options object; this function reads `embed_output_dir`,
        `module_url`, `projected_dim` and `data_file_pattern`, and passes
        all of its attributes to `PipelineOptions`.
    """
    output_dir = args.embed_output_dir

    # Start from an empty output directory.
    if tf.io.gfile.exists(output_dir):
        print('Removing embedding output directory...')
        tf.io.gfile.rmtree(output_dir)
    print('Creating empty output directory...')
    tf.io.gfile.makedirs(output_dir)

    options = beam.options.pipeline_options.PipelineOptions(**vars(args))

    # Embed one empty string to read the module's output width.
    original_dim = hub.load(args.module_url)(['']).shape[1]
    random_projection_matrix = generate_random_projection_weights(
        original_dim, args.projected_dim, output_dir)

    print('Starting the Beam pipeline...')
    with beam.Pipeline(runner=_RUNNER, options=options) as pipeline:
        sentences = pipeline | 'Read sentences from files' >> beam.io.ReadFromText(
            file_pattern=args.data_file_pattern)
        batches = sentences | 'Batch elements' >> util.BatchElements(
            min_batch_size=_BATCH_SIZE / 2, max_batch_size=_BATCH_SIZE)
        embeddings = batches | 'Generate embeddings' >> beam.Map(
            generate_embeddings, args.module_url, random_projection_matrix)
        tf_examples = embeddings | 'Encode to tf example' >> beam.FlatMap(
            to_tf_example)
        _ = tf_examples | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
            file_path_prefix='{}/emb'.format(args.embed_output_dir),
            file_name_suffix='.tfrecords')
    print('Beam pipeline completed.')
Example #27
Source File: linters.py From data-linter with Apache License 2.0 | 5 votes |
def _lint(self, examples):
    """Returns the result of the TailedDistributionDetector linter.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [feature names]
        lint_samples: [
          [stats: {min: feature_min if outlying,
                   max: feature_max if outlying}]
          for each warning
        ]
    """
    flatten_fn = self._flatten_feature_vals(self.numeric_features)
    feature_values = examples | 'FlattenFeatureValue' >> beam.FlatMap(
        flatten_fn)

    # One trimmed averager per extreme, both fed from the same values.
    feature_min_trimmed_mean = feature_values | self._make_trimmed_averager(
        self._MIN)
    feature_max_trimmed_mean = feature_values | self._make_trimmed_averager(
        self._MAX)

    merged = (
        (feature_min_trimmed_mean, feature_max_trimmed_mean)
        | 'MergeTrimmedMeans' >> beam.CoGroupByKey())
    as_list = merged | 'AsList' >> beam.combiners.ToList()
    return as_list | 'ToResult' >> beam.Map(self._to_result)
Example #28
Source File: transform.py From pydatalab with Apache License 2.0 | 5 votes |
def shuffle(pcoll):  # pylint: disable=invalid-name
    """Shuffles `pcoll` by grouping its elements under random keys.

    Args:
      pcoll: Input PCollection.

    Returns:
      The grouped values flattened back into individual elements.
    """
    import random
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            # `lambda (k, vs): vs` relied on tuple-parameter unpacking, which
            # is a SyntaxError on Python 3; index the pair instead.
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Example #29
Source File: preprocess.py From cloudml-samples with Apache License 2.0 | 5 votes |
def _Shuffle(pcoll):  # pylint: disable=invalid-name
    """Shuffles `pcoll` by grouping its elements under random keys.

    Args:
      pcoll: Input PCollection.

    Returns:
      The grouped values flattened back into individual elements.
    """
    import random
    return (pcoll
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            # `lambda (k, vs): vs` relied on tuple-parameter unpacking, which
            # is a SyntaxError on Python 3; index the pair instead.
            | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Example #30
Source File: preprocess.py From cloudml-samples with Apache License 2.0 | 5 votes |
def _Shuffle(pcoll):  # pylint: disable=invalid-name
    """Shuffles a PCollection.

    Elements are keyed with `random.random()`, grouped by that key, and the
    grouped values are flattened back out.

    Args:
      pcoll: Input PCollection.

    Returns:
      The grouped values flattened back into individual elements.
    """
    import random
    return (pcoll
            | 'PairWithRand' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRand' >> beam.GroupByKey()
            # `lambda (k, vs): vs` relied on tuple-parameter unpacking, which
            # is a SyntaxError on Python 3; index the pair instead.
            | 'DropRand' >> beam.FlatMap(lambda kv: kv[1]))