Python apache_beam.ptransform_fn() Examples
The following are 11 code examples of apache_beam.ptransform_fn(). beam.ptransform_fn is a decorator (also usable as a plain function) that turns a function whose first argument is a PCollection into a composite PTransform; any remaining arguments are bound when the transform is constructed. Each example below names its source file and project.
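Before the project examples, here is a minimal, self-contained sketch of defining and applying a transform with beam.ptransform_fn. The transform name FilterAndDouble and its threshold parameter are illustrative, not taken from any of the projects below:

import apache_beam as beam

@beam.ptransform_fn
def FilterAndDouble(pcoll, threshold):
  # The decorated function receives the input PCollection as its first
  # argument; the remaining arguments are supplied when the transform is
  # constructed below.
  return (pcoll
          | 'Filter' >> beam.Filter(lambda x: x > threshold)
          | 'Double' >> beam.Map(lambda x: x * 2))

with beam.Pipeline() as p:
  _ = (p
       | beam.Create([1, 5, 10])
       # pylint: disable=no-value-for-parameter
       | 'FilterAndDouble' >> FilterAndDouble(threshold=2))

Note that FilterAndDouble is applied without passing the PCollection explicitly; the pipe operator supplies it, which is why pylint's no-value-for-parameter warning is disabled here and in several examples below.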
Example #1
Source File: base_example_gen_executor.py From tfx with Apache License 2.0
def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
  """Returns PTransform for converting input source to records.

  The record is by default assumed to be tf.train.Example protos; subclasses
  can serialize any protocol buffer into bytes as the output PCollection, so
  long as the downstream component can consume it.

  Note that each input split will be transformed by this function separately.
  For complex use cases, consider overriding 'GenerateExamplesByBeam' instead.

  Here is an example PTransform:
    @beam.ptransform_fn
    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(Union[tf.train.Example,
                                            tf.train.SequenceExample,
                                            bytes])
    def ExamplePTransform(
        pipeline: beam.Pipeline,
        exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection
  """
  pass
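As a rough sketch of what a concrete override could look like, the following reads newline-delimited text and wraps each line in a tf.train.Example. The 'input_base' key follows the docstring in Example #11; the feature name 'raw_line' and the file-pattern handling are assumptions made for illustration:

import os
from typing import Any, Dict, Text, Union

import apache_beam as beam
import tensorflow as tf

@beam.ptransform_fn
@beam.typehints.with_input_types(beam.Pipeline)
@beam.typehints.with_output_types(Union[tf.train.Example,
                                        tf.train.SequenceExample, bytes])
def _ExampleTextPTransform(pipeline: beam.Pipeline,
                           exec_properties: Dict[Text, Any],
                           split_pattern: Text) -> beam.pvalue.PCollection:
  """Reads text lines and wraps each one in a tf.train.Example."""
  input_base = exec_properties['input_base']  # Key assumed per Example #11.

  def _to_example(line: str) -> tf.train.Example:
    # 'raw_line' is a hypothetical feature name chosen for this sketch.
    return tf.train.Example(features=tf.train.Features(feature={
        'raw_line': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[line.encode('utf-8')]))
    }))

  return (pipeline
          | 'ReadText' >> beam.io.ReadFromText(
              os.path.join(input_base, split_pattern))
          | 'ToTFExample' >> beam.Map(_to_example))

# A GetInputSourceToExamplePTransform override would then return
# _ExampleTextPTransform, as Example #11 below returns ImportRecord.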
Example #2
Source File: tf_example_record.py From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    return (raw_records_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "Decode" >> beam.ParDo(
                _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                         self.raw_record_column_name,
                                         self._can_produce_large_types)))

  return beam.ptransform_fn(_PTransformFn)()
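This example (and most of the tfx-bsl examples below) applies beam.ptransform_fn as a plain function rather than a decorator: beam.ptransform_fn(_PTransformFn) wraps the closure into a transform factory, and the trailing () constructs the PTransform immediately, with configuration such as batch_size captured by the closure instead of passed at application time. A minimal sketch of the same pattern, with an illustrative scale parameter:

import apache_beam as beam

def make_scaler(scale: float) -> beam.PTransform:
  # 'scale' is captured by the closure rather than being passed to the
  # transform when it is applied.
  def _PTransformFn(pcoll: beam.pvalue.PCollection):
    return pcoll | 'Scale' >> beam.Map(lambda x: x * scale)

  # Wrap the closure and construct the transform in one step.
  return beam.ptransform_fn(_PTransformFn)()

with beam.Pipeline() as p:
  _ = p | beam.Create([1.0, 2.0]) | 'ScaleBy3' >> make_scaler(3.0)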
Example #3
Source File: tf_sequence_example_record.py From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    return (raw_records_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "Decode" >> beam.ParDo(
                _DecodeBatchExamplesDoFn(self._schema,
                                         self.raw_record_column_name,
                                         self._can_produce_large_types)))

  return beam.ptransform_fn(_PTransformFn)()
Example #4
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0
def RawRecordToRecordBatch(self,
                           batch_size: Optional[int] = None
                          ) -> beam.PTransform:
  """Returns a PTransform that converts raw records to Arrow RecordBatches.

  The input PCollection must be from self.RawRecordBeamSource() (also see
  the documentation for that method).

  Args:
    batch_size: if not None, the `pa.RecordBatch` produced will be of the
      specified size. Otherwise it's automatically tuned by Beam.
  """

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(pcoll: beam.pvalue.PCollection):
    return (pcoll
            | "RawRecordToRecordBatch" >>
            self._RawRecordToRecordBatchInternal(batch_size)
            | "CollectRecordBatchTelemetry" >>
            telemetry.ProfileRecordBatches(self._telemetry_descriptors,
                                           self._logical_format,
                                           self._physical_format))

  return beam.ptransform_fn(_PTransformFn)()
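The batch_size behavior described in the docstring ultimately rests on beam.BatchElements, which auto-tunes batch sizes when given no bounds and can be pinned by passing equal minimum and maximum sizes. A standalone illustration; the helper below mirrors what batch_util.GetBatchElementsKwargs plausibly produces, which is an assumption:

import apache_beam as beam

def _batch_elements_kwargs(batch_size):
  # Assumed behavior of batch_util.GetBatchElementsKwargs: pin the batch
  # size when one is given, otherwise let Beam auto-tune.
  if batch_size is None:
    return {}
  return {'min_batch_size': batch_size, 'max_batch_size': batch_size}

with beam.Pipeline() as p:
  _ = (p
       | beam.Create(list(range(10)))
       # Emits lists of elements, here pinned to 4 elements per batch.
       | 'Batch' >> beam.BatchElements(**_batch_elements_kwargs(4)))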
Example #5
Source File: tft_unit.py From transform with Apache License 2.0
def _RawRecordBeamSourceInternal(self):
  """A PTransform that maps batched instances to RecordBatches."""

  @beam.ptransform_fn
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _ptransform_fn(instances):
    return (instances
            | 'EncodeToTfExamples' >> beam.Map(
                example_proto_coder.ExampleProtoCoder(self._schema).encode))

  return _ptransform_fn()  # pylint: disable=no-value-for-parameter

# TODO(b/156761358): deprecated; remove after tfx-bsl 0.23 release.
Example #6
Source File: test_util.py From tfx-bsl with Apache License 2.0
def _RawRecordBeamSourceInternal(self) -> beam.PTransform:

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(bytes)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    return raw_records_pcoll

  return beam.ptransform_fn(_PTransformFn)()
Example #7
Source File: raw_tf_record.py From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  # The input here is a PCollection of raw record bytes (not the pipeline
  # root), so the input typehint is bytes.
  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
    return (raw_record_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "ToRecordBatch" >> beam.Map(_BatchedRecordsToArrow,
                                          self.raw_record_column_name,
                                          self._can_produce_large_types))

  return beam.ptransform_fn(_PTransformFn)()
Example #8
Source File: raw_tf_record.py From tfx-bsl with Apache License 2.0
def _RawRecordBeamSourceInternal(self) -> beam.PTransform:

  @beam.typehints.with_input_types(beam.Pipeline)
  @beam.typehints.with_output_types(bytes)
  def _PTransformFn(pipeline: beam.pvalue.PCollection):
    return pipeline | "ReadFromTFRecord" >> beam.io.ReadFromTFRecord(
        self._file_pattern,
        coder=beam.coders.BytesCoder(),
        # TODO(b/114938612): Eventually remove this override.
        validate=False)

  return beam.ptransform_fn(_PTransformFn)()
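beam.io.ReadFromTFRecord is standard Beam API; stripped of the class context, the read in Example #8 amounts to the following, where the file pattern is a placeholder:

import apache_beam as beam

with beam.Pipeline() as p:
  raw_records = (
      p
      # '/path/to/data*' is a placeholder pattern. coder=BytesCoder() yields
      # the raw serialized bytes of each record, as in Example #8 above.
      | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
          '/path/to/data*', coder=beam.coders.BytesCoder(), validate=False))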
Example #9
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:

  @beam.typehints.with_input_types(beam.Pipeline)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(pipeline: beam.pvalue.PCollection):
    """Converts raw records to RecordBatches."""
    return (
        pipeline
        | "RawRecordBeamSource" >> self.RawRecordBeamSource()
        | "RawRecordToRecordBatch" >> self.RawRecordToRecordBatch(batch_size))

  return beam.ptransform_fn(_PTransformFn)()
Example #10
Source File: csv_tfxio.py From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(List[bytes])
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    """Returns RecordBatch of csv lines."""
    # Decode raw csv lines to record batches.
    record_batches = (
        raw_records_pcoll
        | "CSVToRecordBatch" >> csv_decoder.CSVToRecordBatch(
            column_names=self._column_names,
            delimiter=self._delimiter,
            skip_blank_lines=self._skip_blank_lines,
            schema=self._schema,
            desired_batch_size=batch_size,
            multivalent_columns=self._multivalent_columns,
            secondary_delimiter=self._secondary_delimiter,
            produce_large_types=self._can_produce_large_types,
            raw_record_column_name=self._raw_record_column_name))
    return record_batches

  return beam.ptransform_fn(_PTransformFn)()
Example #11
Source File: executor.py From tfx with Apache License 2.0
def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
  """Returns PTransform for importing records."""

  @beam.ptransform_fn
  @beam.typehints.with_input_types(beam.Pipeline)
  @beam.typehints.with_output_types(Union[tf.train.Example,
                                          tf.train.SequenceExample,
                                          bytes])
  def ImportRecord(pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
                   split_pattern: Text) -> beam.pvalue.PCollection:
    """PTransform to import records.

    The records are tf.train.Example, tf.train.SequenceExample,
    or serialized proto.

    Args:
      pipeline: Beam pipeline.
      exec_properties: A dict of execution properties.
        - input_base: input dir that contains input data.
      split_pattern: Split.pattern in Input config, glob relative file pattern
        that maps to input files with root directory given by input_base.

    Returns:
      PCollection of records (tf.Example, tf.SequenceExample, or bytes).
    """
    output_payload_format = exec_properties.get(utils.OUTPUT_DATA_FORMAT_KEY)

    serialized_records = (
        pipeline
        # pylint: disable=no-value-for-parameter
        | _ImportSerializedRecord(exec_properties, split_pattern))
    if output_payload_format == example_gen_pb2.PayloadFormat.FORMAT_PROTO:
      return serialized_records
    elif (output_payload_format ==
          example_gen_pb2.PayloadFormat.FORMAT_TF_EXAMPLE):
      return (serialized_records
              | 'ToTFExample' >> beam.Map(tf.train.Example.FromString))
    elif (output_payload_format ==
          example_gen_pb2.PayloadFormat.FORMAT_TF_SEQUENCE_EXAMPLE):
      return (serialized_records
              | 'ToTFSequenceExample' >> beam.Map(
                  tf.train.SequenceExample.FromString))
    raise ValueError('output_payload_format must be one of FORMAT_TF_EXAMPLE,'
                     ' FORMAT_TF_SEQUENCE_EXAMPLE or FORMAT_PROTO')

  return ImportRecord
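The FromString conversions above are the standard protobuf parse path. A small round trip showing what beam.Map(tf.train.Example.FromString) does per element; the feature name 'value' is arbitrary:

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'value': tf.train.Feature(int64_list=tf.train.Int64List(value=[42]))
}))
serialized = example.SerializeToString()  # bytes, as read from disk
# The per-element operation behind 'ToTFExample' >> beam.Map(...).
parsed = tf.train.Example.FromString(serialized)
assert parsed == example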