Python apache_beam.ParDo() Examples
The following are 30 code examples of apache_beam.ParDo(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam, or try the search function.
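Before the project examples below, here is a minimal, self-contained sketch of the basic ParDo pattern (written for this page, not taken from any of the listed projects): a beam.DoFn whose process method yields zero or more output elements per input element, applied with beam.ParDo.

import apache_beam as beam

class SplitWords(beam.DoFn):
    """Emits one output element per whitespace-separated word."""

    def process(self, element):
        for word in element.split():
            yield word

with beam.Pipeline() as p:
    _ = (p
         | 'CreateLines' >> beam.Create(['hello beam', 'par do example'])
         | 'SplitWords' >> beam.ParDo(SplitWords())
         | 'Print' >> beam.Map(print))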
Example #1
Source File: preprocess.py From professional-services with Apache License 2.0 | 7 votes |
def shuffle(p):
    """Shuffles data from PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):

        def process(self, element):
            yield random.random(), element

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # Note: the original used the Python 2-only tuple-parameter form
        # `lambda (k, vs): vs`; the subscripted lambda below is the
        # Python 3-compatible equivalent.
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example #2
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 6 votes |
def expand(self, pcoll):
    return (
        pcoll
        # Assigns window info to each Pub/Sub message based on its
        # publish timestamp.
        | "Window into Fixed Intervals"
        >> beam.WindowInto(window.FixedWindows(self.window_size))
        | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
        # Use a dummy key to group the elements in the same window.
        # Note that all the elements in one window must fit into memory
        # for this. If the windowed elements do not fit into memory,
        # please consider using `beam.util.BatchElements`.
        # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
        | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
        | "Groupby" >> beam.GroupByKey()
        | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
    )
Example #3
Source File: tf_example_record.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                    ) -> beam.PTransform:

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
        return (raw_records_pcoll
                | "Batch" >> beam.BatchElements(
                    **batch_util.GetBatchElementsKwargs(batch_size))
                | "Decode" >> beam.ParDo(
                    _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                             self.raw_record_column_name,
                                             self._can_produce_large_types)))

    return beam.ptransform_fn(_PTransformFn)()
Example #4
Source File: variant_to_bigquery_test.py From gcp-variant-transforms with Apache License 2.0 | 6 votes |
def test_convert_variant_to_bigquery_row(self):
    variant_1, row_1, header_num_dict_1 = self._get_sample_variant_1()
    variant_2, row_2, header_num_dict_2 = self._get_sample_variant_2()
    variant_3, row_3, header_num_dict_3 = self._get_sample_variant_3()
    header_num_dict = header_num_dict_1.copy()
    header_num_dict.update(header_num_dict_2)
    header_num_dict.update(header_num_dict_3)
    header_fields = vcf_header_util.make_header(header_num_dict)
    proc_var_1 = processed_variant.ProcessedVariantFactory(
        header_fields).create_processed_variant(variant_1)
    proc_var_2 = processed_variant.ProcessedVariantFactory(
        header_fields).create_processed_variant(variant_2)
    proc_var_3 = processed_variant.ProcessedVariantFactory(
        header_fields).create_processed_variant(variant_3)
    pipeline = TestPipeline(blocking=True)
    bigquery_rows = (
        pipeline
        | Create([proc_var_1, proc_var_2, proc_var_3])
        | 'ConvertToRow' >> beam.ParDo(ConvertVariantToRow(
            self._row_generator)))
    assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
    pipeline.run()
Example #5
Source File: infer_headers.py From gcp-variant-transforms with Apache License 2.0 | 6 votes |
def expand(self, pcoll):
    return (pcoll
            | 'InferHeaderFields' >> beam.ParDo(
                _InferHeaderFields(self._infer_headers,
                                   self._annotation_fields_to_infer),
                self._defined_headers)
            # TODO(nmousavi): Modify the MergeHeaders to resolve 1 vs '.'
            # mismatch for headers extracted from variants.
            #
            # Note: argument `split_alternate_allele_info_fields` is not
            # relevant here since no fields with `Number=A` will be extracted
            # from variants, therefore we let the default value (True) for it
            # be used. Should this change, we should modify the default value.
            | 'MergeHeaders' >> merge_headers.MergeHeaders(
                split_alternate_allele_info_fields=True,
                allow_incompatible_records=(
                    self._allow_incompatible_records
                    or bool(self._annotation_fields_to_infer))))
Example #6
Source File: run_inference.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _Predict(pcoll: beam.pvalue.PCollection,  # pylint: disable=invalid-name
             inference_spec_type: model_spec_pb2.InferenceSpecType):
    """Performs predict PTransform."""
    if _using_in_process_inference(inference_spec_type):
        predictions = (
            pcoll
            | 'Predict' >> beam.ParDo(
                _BatchPredictDoFn(inference_spec_type, shared.Shared())))
    else:
        predictions = (
            pcoll
            | 'RemotePredict' >> beam.ParDo(
                _RemotePredictDoFn(inference_spec_type,
                                   pcoll.pipeline.options)))
    return (predictions
            | 'BuildPredictionLogForPredictions' >> beam.ParDo(
                _BuildPredictionLogForPredictionsDoFn()))
Example #7
Source File: preprocess.py From cloudml-samples with Apache License 2.0 | 6 votes |
def run(p, input_path, output_directory, train_fraction=0.8):
    """Runs the pipeline."""
    raw_data = (p | "ReadTrainData" >> beam.io.Read(CsvFileSource(
        input_path, column_names=constants.CSV_COLUMNS)))
    train_data, eval_data = split_data(raw_data, train_fraction)

    (train_data
     | "PrepareCSV_train" >> beam.ParDo(
         ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
     | "Write_train" >> beam.io.WriteToText(
         os.path.join(output_directory, "output_data", "train"),
         file_name_suffix=".csv"))
    (eval_data
     | "PrepareCSV_eval" >> beam.ParDo(
         ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
     | "Write_eval" >> beam.io.WriteToText(
         os.path.join(output_directory, "output_data", "eval"),
         file_name_suffix=".csv"))
Example #8
Source File: preprocess.py From professional-services with Apache License 2.0 | 6 votes |
def shuffle_data(p):
    """Shuffles data from PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):

        def process(self, element):
            yield (random.random(), element)

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # Note: the original used the Python 2-only `lambda (k, vs): vs`;
        # this is the Python 3-compatible equivalent.
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example #9
Source File: create_kitti_crop_dataset.py From lingvo with Apache License 2.0 | 6 votes |
def main(_):
    beam_utils.BeamInit()

    if not FLAGS.output_file_pattern:
        raise ValueError('Must provide an output_file_pattern')

    reader = beam.io.ReadFromTFRecord(
        FLAGS.input_file_pattern,
        coder=beam.coders.ProtoCoder(tf.train.Example))

    model_name = FLAGS.model_name
    split = FLAGS.split
    run_preprocessors = FLAGS.run_preprocessors

    with beam_utils.GetPipelineRoot() as root:
        _ = (
            root
            | 'Read' >> reader
            | 'ToTFExample' >> beam.ParDo(
                _ProcessShard(model_name, split, run_preprocessors))
            | 'Reshuffle' >> beam.Reshuffle()
            | 'Write' >> beam.io.WriteToTFRecord(
                FLAGS.output_file_pattern,
                coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #10
Source File: beam_utils.py From lingvo with Apache License 2.0 | 6 votes |
def GetPipelineRoot(options=None):
    """Return the root of the beam pipeline.

    Typical usage looks like:

        with GetPipelineRoot() as root:
            _ = (root | beam.ParDo() | ...)

    In this example, the pipeline is automatically executed when the context
    is exited, though one can manually run the pipeline built from the root
    object as well.

    Args:
        options: A beam.options.pipeline_options.PipelineOptions object.

    Returns:
        A beam.Pipeline root object.
    """
    return beam.Pipeline(options=options)
Example #11
Source File: executor.py From tfx with Apache License 2.0 | 6 votes |
def _ToArrowRecordBatches(
        pcoll: beam.pvalue.PCollection,
        schema: Optional[schema_pb2.Schema]) -> beam.pvalue.PCollection:
    """Converts serialized examples to Arrow RecordBatches.

    Args:
        pcoll: PCollection of Transformed data.
        schema: schema.

    Returns:
        PCollection of `DatasetFeatureStatisticsList`.
    """
    kwargs = tfdv.utils.batch_util.GetBeamBatchKwargs(
        tft_beam.Context.get_desired_batch_size())
    return (
        pcoll
        | 'Values' >> beam.Values()
        | 'BatchElements' >> beam.BatchElements(**kwargs)
        | 'ToArrowRecordBatches' >> beam.ParDo(
            Executor._ToArrowRecordBatchesFn(schema)))
Example #12
Source File: executor.py From tfx with Apache License 2.0 | 6 votes |
def _PrestoToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read from Presto and transform to TF examples.

    Args:
        pipeline: beam pipeline.
        exec_properties: A dict of execution properties.
        split_pattern: Split.pattern in Input config, a Presto sql string.

    Returns:
        PCollection of TF examples.
    """
    conn_config = example_gen_pb2.CustomConfig()
    json_format.Parse(exec_properties['custom_config'], conn_config)
    presto_config = presto_config_pb2.PrestoConnConfig()
    conn_config.custom_config.Unpack(presto_config)

    client = _deserialize_conn_config(presto_config)
    return (pipeline
            | 'Query' >> beam.Create([split_pattern])
            | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
            | 'ToTFExample' >> beam.Map(_row_to_example))
Example #13
Source File: predict.py From pydatalab with Apache License 2.0 | 6 votes |
def make_prediction_pipeline(pipeline, args):
    """Builds the prediction pipeline.

    Reads the csv files, prepends a ',' if the target column is missing, runs
    prediction, and then prints the formatted results to a file.

    Args:
        pipeline: the pipeline
        args: command line args
    """
    # DF bug: DF does not work with unicode strings
    predicted_values, errors = (
        pipeline
        | 'Read CSV Files' >> beam.io.ReadFromText(
            str(args.predict_data), strip_trailing_newlines=True)
        | 'Batch Input' >> beam.ParDo(EmitAsBatchDoFn(args.batch_size))
        | 'Run TF Graph on Batches' >> beam.ParDo(
            RunGraphDoFn(args.trained_model_dir)).with_outputs(
                'errors', main='main'))

    ((predicted_values, errors)
     | 'Format and Save' >> FormatAndSave(args))
Example #14
Source File: utils_test.py From text with Apache License 2.0 | 6 votes |
def testTwoLangs(self):
    with TestPipeline() as p:
        tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
        assert_that(result, equal_to([{
            'lang': 'en',
            'count': 1,
            'num_preserved_chars': 13,
            'num_dropped_chars': 2,
            'num_non_unk_wordpieces': 4,
            'preserved_ratio': [13/4],
            'dropped_ratio': [2/15],
            'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
        }, {
            'lang': 'fr',
            'count': 1,
            'num_preserved_chars': 14,
            'num_dropped_chars': 0,
            'num_non_unk_wordpieces': 5,
            'preserved_ratio': [14/5],
            'dropped_ratio': [0],
            'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
        }]))
Example #15
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 6 votes |
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFn's rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        )
Example #16
Source File: impl.py From transform with Apache License 2.0 | 6 votes |
def expand(self, inputs):
    saved_model_dir_pcol, input_values_pcol = inputs
    # We don't deep_copy pcollections used for the first phase, or when
    # the user defined `Context` disables it.
    if self._phase > 0 and Context.get_use_deep_copy_optimization():
        # Obviates unnecessary data materialization when the input data source
        # is safe to read more than once.
        tf.compat.v1.logging.info('Deep copying inputs for phase: %d',
                                  self._phase)
        input_values_pcol = deep_copy.deep_copy(input_values_pcol)

    if not self._use_tfxio:
        input_values_pcol |= 'BatchInputs' >> _BatchElements()

    return (input_values_pcol
            | 'ApplySavedModel' >> beam.ParDo(
                _RunMetaGraphDoFn(
                    self._tf_config,
                    use_tfxio=self._use_tfxio,
                    input_schema=self._input_schema,
                    input_tensor_adapter_config=self._input_tensor_adapter_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir_pcol)))
Example #17
Source File: preprocess.py From cloudml-samples with Apache License 2.0 | 6 votes |
def configure_pipeline(p, opt):
    """Specify PCollection and transformations in pipeline."""
    read_input_source = beam.io.ReadFromText(
        opt.input_path, strip_trailing_newlines=True)
    read_label_source = beam.io.ReadFromText(
        opt.input_dict, strip_trailing_newlines=True)
    labels = (p | 'Read dictionary' >> read_label_source)
    _ = (p
         | 'Read input' >> read_input_source
         # Note: the original sample used the Python 2-only
         # `csv.reader([line]).next()`; the built-in `next(...)` below is the
         # Python 3-compatible equivalent.
         | 'Parse input' >> beam.Map(lambda line: next(csv.reader([line])))
         | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                             beam.pvalue.AsIter(labels))
         | 'Read and convert to JPEG' >> beam.ParDo(
             ReadImageAndConvertToJpegDoFn())
         | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
         # TODO(b/35133536): Get rid of this Map and instead use
         # coder=beam.coders.ProtoCoder(tf.train.Example) in WriteToTFRecord
         # below.
         | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
         | 'Save to disk' >> beam.io.WriteToTFRecord(
             opt.output_path, file_name_suffix='.tfrecord.gz'))
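The example above passes beam.pvalue.AsIter(labels) as an extra argument to beam.ParDo, which makes the labels PCollection available inside the DoFn as a side input. Here is a minimal, hypothetical sketch of that side-input pattern (the DoFn and data below are illustrative, not from the sample):

import apache_beam as beam

class TagWithVocabSize(beam.DoFn):
    """Pairs each element with the size of a vocabulary passed as a side input."""

    def process(self, element, vocab):
        # `vocab` is the materialized side input; AsIter delivers an iterable.
        yield (element, len(list(vocab)))

with beam.Pipeline() as p:
    vocab = p | 'CreateVocab' >> beam.Create(['a', 'b', 'c'])
    words = p | 'CreateWords' >> beam.Create(['x', 'y'])
    _ = (words
         | 'Tag' >> beam.ParDo(TagWithVocabSize(), beam.pvalue.AsIter(vocab))
         | 'Print' >> beam.Map(print))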
Example #18
Source File: batch_util.py From data-validation with Apache License 2.0 | 6 votes |
def BatchSerializedExamplesToArrowRecordBatches(
        examples: beam.pvalue.PCollection,
        desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
    """Batches serialized examples into Arrow record batches.

    Args:
        examples: A PCollection of serialized tf.Examples.
        desired_batch_size: Batch size. The output Arrow record batches will
            have as many rows as the `desired_batch_size`.

    Returns:
        A PCollection of Arrow record batches.
    """
    return (examples
            | "BatchSerializedExamples" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(desired_batch_size))
            | "BatchDecodeExamples" >> beam.ParDo(_BatchDecodeExamplesDoFn()))
Example #19
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testLangNotInLangSet(self):
    with TestPipeline() as p:
        tokens = p | beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.FilterTokensByLang({'fr'}))
        assert_that(result, equal_to([]))
Example #20
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testLangInLangSet(self):
    with TestPipeline() as p:
        tokens = p | beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.FilterTokensByLang({'en'}))
        assert_that(result, equal_to([('I', 'en'), ('like', 'en'),
                                      ('pie', 'en'), ('.', 'en')]))
Example #21
Source File: beam_testutil.py From healthcare-deid with Apache License 2.0 | 5 votes |
def expand(self, pcoll):
    return pcoll | 'DummyWriteForTesting' >> beam.ParDo(
        DummyWriteTransform.WriteDoFn(self.filename))
Example #22
Source File: telemetry.py From tfx-bsl with Apache License 2.0 | 5 votes |
def ProfileRawRecords(
        pcoll: beam.pvalue.PCollection,
        telemetry_descriptors: Optional[List[Text]],
        logical_format: Text,
        physical_format: Text) -> beam.PTransform:
    """An identity transform to profile raw records for record based TFXIO."""
    return pcoll | "ProfileRawRecords" >> beam.ParDo(_ProfileRawRecordDoFn(
        telemetry_descriptors, logical_format, physical_format))
Example #23
Source File: main.py From professional-services with Apache License 2.0 | 5 votes |
def get_sideinput_collections(
        sideinput_filepath: beam.pvalue.PCollection,
        readTransform: beam.PTransform) -> Dict[str, beam.pvalue.PCollection]:
    """Load Side Input data from respective file paths.

    Args:
        sideinput_filepath: File path representing base path for side inputs
            to be loaded.
        readTransform: Transform responsible for loading the side input data.

    Returns:
        Dictionary containing Side Input name as key and corresponding
        PCollection as value.
    """
    sideinput_types = get_sideinput_types()
    # yapf: disable
    filepaths = (sideinput_filepath
                 | "Get side input paths from base path" >> beam.ParDo(
                     dofns.SplitToMultiple(sideinput_types)
                 ).with_outputs(*sideinput_types)
                 )

    sideinput_collections = {}
    for sideinput_type in sideinput_types:
        sideinput_collections[sideinput_type] = (
            filepaths[sideinput_type]
            | f"Read {sideinput_type}" >> readTransform
            | f"{sideinput_type}:Extract KV" >> beam.Map(
                transforms.kv_of, "productname", sideinput_type)
        )
    # yapf: enable
    return sideinput_collections
Example #24
Source File: transforms.py From professional-services with Apache License 2.0 | 5 votes |
def LogEvents(pcol):
    pcol | 'Output Events' >> beam.ParDo(LogFn())
Example #25
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testNotEqual(self):
    with TestPipeline() as p:
        sample_input = [('I', 'en'), ('kind', 'en'), ('of', 'en'),
                        ('like', 'en'), ('to', 'en'), ('eat', 'en'),
                        ('pie', 'en'), ('!', 'en'), ('Je', 'fr'),
                        ('suis', 'fr'), ('une', 'fr'), ('fille', 'fr'),
                        ('.', 'fr')]
        tokens = p | beam.Create(sample_input)
        result = (tokens
                  | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
                  | beam.ParDo(CompareValues()))
        assert_that(result, equal_to([True]))
Example #26
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testBasic(self):
    with TestPipeline() as p:
        tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
        coeffs = p | 'CreateCoeffs' >> beam.Create(self.coeffs)
        result = tokens | beam.ParDo(utils.ExponentialSmoothing(),
                                     beam.pvalue.AsSingleton(coeffs))
        assert_that(result, equal_to([('Hello', 0.75), (',', 0.75),
                                      ('world', 0.75), ('!', 0.75),
                                      ('Bonjour', 1.5), ('.', 1.5)]))
Example #27
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testTokenTooLong(self):
    sample_input = [('one', 1), ('two', 2), ('three', 3), ('four', 4),
                    ('qwertyuiopasdfghjklzxcvbnmqwertyuiopasdfghjklzxcvbnm', 5),
                    ('blah', 20)]
    with TestPipeline() as p:
        tokens = p | 'CreateInput' >> beam.Create(sample_input)
        result = tokens | beam.ParDo(
            utils.FilterByCount(self.max_token_length, min_token_frequency=2))
        assert_that(result, equal_to([('three', 3), ('four', 4),
                                      ('blah', 20)]))
Example #28
Source File: telemetry.py From tfx-bsl with Apache License 2.0 | 5 votes |
def ProfileRecordBatches(
        pcoll: beam.pvalue.PCollection,
        telemetry_descriptors: Optional[List[Text]],
        logical_format: Text,
        physical_format: Text,
        distribution_update_probability: float = 0.1) -> beam.PTransform:
    """An identity transform to profile RecordBatches and update Beam metrics.

    Args:
        pcoll: a PCollection[pa.RecordBatch]
        telemetry_descriptors: a set of descriptors that identify the
            component that invokes this PTransform. These will be used to
            construct the namespace to contain the beam metrics created within
            this PTransform. All such namespaces will be prefixed by "tfxio.".
            If None, a default "unknown" descriptor will be used.
        logical_format: the logical format of the data (before parsed into
            RecordBatches). Used to construct metric names.
        physical_format: the physical format in which the data is stored on
            disk. Used to construct metric names.
        distribution_update_probability: probability to update the expensive,
            per-row distributions.

    Returns:
        `pcoll` (identity function).
    """
    assert 0 < distribution_update_probability <= 1.0, (
        "Invalid probability: {}".format(distribution_update_probability))
    return pcoll | "ProfileRecordBatches" >> beam.ParDo(
        _ProfileRecordBatchDoFn(telemetry_descriptors, logical_format,
                                physical_format,
                                distribution_update_probability))
Example #29
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes |
def randomly_split(p, train_size, validation_size, test_size):
    """Randomly splits input pipeline in three sets based on input ratio.

    Args:
        p: PCollection, input pipeline.
        train_size: float, ratio of data going to train set.
        validation_size: float, ratio of data going to validation set.
        test_size: float, ratio of data going to test set.

    Returns:
        Tuple of PCollection.

    Raises:
        ValueError: Train, validation, and test sizes don't add up to 1.0.
    """
    if train_size + validation_size + test_size != 1.0:
        raise ValueError(
            "Train, validation, and test sizes don't add up to 1.0.")

    class SplitData(beam.DoFn):

        def process(self, element):
            r = random.random()
            if r < test_size:
                yield beam.pvalue.TaggedOutput('Test', element)
            elif r < 1 - train_size:
                yield beam.pvalue.TaggedOutput('Val', element)
            else:
                yield element

    split_data = (
        p | 'SplitData' >> beam.ParDo(SplitData()).with_outputs(
            'Test', 'Val', main='Train'))
    return split_data['Train'], split_data['Val'], split_data['Test']
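The example above routes elements to named outputs with beam.pvalue.TaggedOutput and retrieves them through .with_outputs(). Here is a small standalone sketch of that multi-output ParDo pattern (illustrative names, not from the project):

import apache_beam as beam

class SplitOddEven(beam.DoFn):
    """Routes odd numbers to a tagged side output and evens to the main output."""

    def process(self, element):
        if element % 2:
            yield beam.pvalue.TaggedOutput('odd', element)
        else:
            yield element  # main output

with beam.Pipeline() as p:
    results = (p
               | 'CreateNumbers' >> beam.Create([1, 2, 3, 4])
               | 'Split' >> beam.ParDo(SplitOddEven()).with_outputs(
                   'odd', main='even'))
    _ = results.odd | 'PrintOdd' >> beam.Map(print)
    _ = results.even | 'PrintEven' >> beam.Map(print)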
Example #30
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testLangNotInLangSetIncludeOthers(self):
    with TestPipeline() as p:
        tokens = p | beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.FilterTokensByLang({'fr'}, True))
        assert_that(result, equal_to([('I', 'other'), ('like', 'other'),
                                      ('pie', 'other'), ('.', 'other')]))