Python apache_beam.ParDo() Examples
The following are 30 code examples of apache_beam.ParDo(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam, or try the search function.
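Before the project examples below, here is a minimal, self-contained sketch of the basic ParDo pattern (written for this page, not taken from any of the listed projects): a beam.DoFn whose process method yields zero or more output elements per input element, applied with beam.ParDo.

import apache_beam as beam

class SplitWords(beam.DoFn):
    """Emits one output element per whitespace-separated word."""

    def process(self, element):
        for word in element.split():
            yield word

with beam.Pipeline() as p:
    _ = (p
         | 'CreateLines' >> beam.Create(['hello beam', 'par do example'])
         | 'SplitWords' >> beam.ParDo(SplitWords())
         | 'Print' >> beam.Map(print))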
Example #1
Source File: preprocess.py From professional-services with Apache License 2.0 | 7 votes |
def shuffle(p):
    """Shuffles data from PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):

        def process(self, element):
            yield random.random(), element

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # Note: the original used the Python 2-only tuple-parameter form
        # `lambda (k, vs): vs`; the subscripted lambda below is the
        # Python 3-compatible equivalent.
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example #2
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 6 votes |
def expand(self, pcoll):
    return (
        pcoll
        # Assigns window info to each Pub/Sub message based on its
        # publish timestamp.
        | "Window into Fixed Intervals"
        >> beam.WindowInto(window.FixedWindows(self.window_size))
        | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
        # Use a dummy key to group the elements in the same window.
        # Note that all the elements in one window must fit into memory
        # for this. If the windowed elements do not fit into memory,
        # please consider using `beam.util.BatchElements`.
        # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
        | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
        | "Groupby" >> beam.GroupByKey()
        | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
    )
Example #3
Source File: tf_example_record.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                    ) -> beam.PTransform:

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
        return (raw_records_pcoll
                | "Batch" >> beam.BatchElements(
                    **batch_util.GetBatchElementsKwargs(batch_size))
                | "Decode" >> beam.ParDo(
                    _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                             self.raw_record_column_name,
                                             self._can_produce_large_types)))

    return beam.ptransform_fn(_PTransformFn)()
Example #4
Source File: variant_to_bigquery_test.py From gcp-variant-transforms with Apache License 2.0 | 6 votes |
def test_convert_variant_to_bigquery_row(self):
    variant_1, row_1, header_num_dict_1 = self._get_sample_variant_1()
    variant_2, row_2, header_num_dict_2 = self._get_sample_variant_2()
    variant_3, row_3, header_num_dict_3 = self._get_sample_variant_3()
    header_num_dict = header_num_dict_1.copy()
    header_num_dict.update(header_num_dict_2)
    header_num_dict.update(header_num_dict_3)
    header_fields = vcf_header_util.make_header(header_num_dict)
    proc_var_1 = processed_variant.ProcessedVariantFactory(
        header_fields).create_processed_variant(variant_1)
    proc_var_2 = processed_variant.ProcessedVariantFactory(
        header_fields).create_processed_variant(variant_2)
    proc_var_3 = processed_variant.ProcessedVariantFactory(
        header_fields).create_processed_variant(variant_3)
    pipeline = TestPipeline(blocking=True)
    bigquery_rows = (
        pipeline
        | Create([proc_var_1, proc_var_2, proc_var_3])
        | 'ConvertToRow' >> beam.ParDo(ConvertVariantToRow(
            self._row_generator)))
    assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
    pipeline.run()
Example #5
Source File: infer_headers.py From gcp-variant-transforms with Apache License 2.0 | 6 votes |
def expand(self, pcoll):
    return (pcoll
            | 'InferHeaderFields' >> beam.ParDo(
                _InferHeaderFields(self._infer_headers,
                                   self._annotation_fields_to_infer),
                self._defined_headers)
            # TODO(nmousavi): Modify the MergeHeaders to resolve 1 vs '.'
            # mismatch for headers extracted from variants.
            #
            # Note: argument `split_alternate_allele_info_fields` is not
            # relevant here since no fields with `Number=A` will be extracted
            # from variants, therefore we let the default value (True) for it
            # be used. Should this change, we should modify the default value.
            | 'MergeHeaders' >> merge_headers.MergeHeaders(
                split_alternate_allele_info_fields=True,
                allow_incompatible_records=(
                    self._allow_incompatible_records
                    or bool(self._annotation_fields_to_infer))))
Example #6
Source File: run_inference.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _Predict(pcoll: beam.pvalue.PCollection,  # pylint: disable=invalid-name
             inference_spec_type: model_spec_pb2.InferenceSpecType):
    """Performs predict PTransform."""
    if _using_in_process_inference(inference_spec_type):
        predictions = (
            pcoll
            | 'Predict' >> beam.ParDo(
                _BatchPredictDoFn(inference_spec_type, shared.Shared())))
    else:
        predictions = (
            pcoll
            | 'RemotePredict' >> beam.ParDo(
                _RemotePredictDoFn(inference_spec_type,
                                   pcoll.pipeline.options)))
    return (predictions
            | 'BuildPredictionLogForPredictions' >> beam.ParDo(
                _BuildPredictionLogForPredictionsDoFn()))
Example #7
Source File: preprocess.py From cloudml-samples with Apache License 2.0 | 6 votes |
def run(p, input_path, output_directory, train_fraction=0.8):
    """Runs the pipeline."""
    raw_data = (p | "ReadTrainData" >> beam.io.Read(CsvFileSource(
        input_path, column_names=constants.CSV_COLUMNS)))
    train_data, eval_data = split_data(raw_data, train_fraction)

    (train_data
     | "PrepareCSV_train" >> beam.ParDo(
         ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
     | "Write_train" >> beam.io.WriteToText(
         os.path.join(output_directory, "output_data", "train"),
         file_name_suffix=".csv"))
    (eval_data
     | "PrepareCSV_eval" >> beam.ParDo(
         ConvertDictToCSV(ordered_fieldnames=constants.CSV_COLUMNS))
     | "Write_eval" >> beam.io.WriteToText(
         os.path.join(output_directory, "output_data", "eval"),
         file_name_suffix=".csv"))
Example #8
Source File: preprocess.py From professional-services with Apache License 2.0 | 6 votes |
def shuffle_data(p):
    """Shuffles data from PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):

        def process(self, element):
            yield (random.random(), element)

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # Note: the original used the Python 2-only `lambda (k, vs): vs`;
        # this is the Python 3-compatible equivalent.
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example #9
Source File: create_kitti_crop_dataset.py From lingvo with Apache License 2.0 | 6 votes |
def main(_):
    beam_utils.BeamInit()

    if not FLAGS.output_file_pattern:
        raise ValueError('Must provide an output_file_pattern')

    reader = beam.io.ReadFromTFRecord(
        FLAGS.input_file_pattern,
        coder=beam.coders.ProtoCoder(tf.train.Example))

    model_name = FLAGS.model_name
    split = FLAGS.split
    run_preprocessors = FLAGS.run_preprocessors

    with beam_utils.GetPipelineRoot() as root:
        _ = (
            root
            | 'Read' >> reader
            | 'ToTFExample' >> beam.ParDo(
                _ProcessShard(model_name, split, run_preprocessors))
            | 'Reshuffle' >> beam.Reshuffle()
            | 'Write' >> beam.io.WriteToTFRecord(
                FLAGS.output_file_pattern,
                coder=beam.coders.ProtoCoder(tf.train.Example)))
Example #10
Source File: beam_utils.py From lingvo with Apache License 2.0 | 6 votes |
def GetPipelineRoot(options=None):
    """Return the root of the beam pipeline.

    Typical usage looks like:

        with GetPipelineRoot() as root:
            _ = (root | beam.ParDo() | ...)

    In this example, the pipeline is automatically executed when the context
    is exited, though one can manually run the pipeline built from the root
    object as well.

    Args:
        options: A beam.options.pipeline_options.PipelineOptions object.

    Returns:
        A beam.Pipeline root object.
    """
    return beam.Pipeline(options=options)
Example #11
Source File: executor.py From tfx with Apache License 2.0 | 6 votes |
def _ToArrowRecordBatches(
        pcoll: beam.pvalue.PCollection,
        schema: Optional[schema_pb2.Schema]) -> beam.pvalue.PCollection:
    """Converts serialized examples to Arrow RecordBatches.

    Args:
        pcoll: PCollection of Transformed data.
        schema: schema.

    Returns:
        PCollection of `DatasetFeatureStatisticsList`.
    """
    kwargs = tfdv.utils.batch_util.GetBeamBatchKwargs(
        tft_beam.Context.get_desired_batch_size())
    return (
        pcoll
        | 'Values' >> beam.Values()
        | 'BatchElements' >> beam.BatchElements(**kwargs)
        | 'ToArrowRecordBatches' >> beam.ParDo(
            Executor._ToArrowRecordBatchesFn(schema)))
Example #12
Source File: executor.py From tfx with Apache License 2.0 | 6 votes |
def _PrestoToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline,
        exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read from Presto and transform to TF examples.

    Args:
        pipeline: beam pipeline.
        exec_properties: A dict of execution properties.
        split_pattern: Split.pattern in Input config, a Presto sql string.

    Returns:
        PCollection of TF examples.
    """
    conn_config = example_gen_pb2.CustomConfig()
    json_format.Parse(exec_properties['custom_config'], conn_config)
    presto_config = presto_config_pb2.PrestoConnConfig()
    conn_config.custom_config.Unpack(presto_config)

    client = _deserialize_conn_config(presto_config)
    return (pipeline
            | 'Query' >> beam.Create([split_pattern])
            | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
            | 'ToTFExample' >> beam.Map(_row_to_example))
Example #13
Source File: predict.py From pydatalab with Apache License 2.0 | 6 votes |
def make_prediction_pipeline(pipeline, args):
    """Builds the prediction pipeline.

    Reads the csv files, prepends a ',' if the target column is missing, runs
    prediction, and then prints the formatted results to a file.

    Args:
        pipeline: the pipeline
        args: command line args
    """
    # DF bug: DF does not work with unicode strings
    predicted_values, errors = (
        pipeline
        | 'Read CSV Files' >> beam.io.ReadFromText(
            str(args.predict_data), strip_trailing_newlines=True)
        | 'Batch Input' >> beam.ParDo(EmitAsBatchDoFn(args.batch_size))
        | 'Run TF Graph on Batches' >> beam.ParDo(
            RunGraphDoFn(args.trained_model_dir)).with_outputs(
                'errors', main='main'))

    ((predicted_values, errors)
     | 'Format and Save' >> FormatAndSave(args))
Example #14
Source File: utils_test.py From text with Apache License 2.0 | 6 votes |
def testTwoLangs(self):
    with TestPipeline() as p:
        tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
        assert_that(result, equal_to([{
            'lang': 'en',
            'count': 1,
            'num_preserved_chars': 13,
            'num_dropped_chars': 2,
            'num_non_unk_wordpieces': 4,
            'preserved_ratio': [13/4],
            'dropped_ratio': [2/15],
            'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
        }, {
            'lang': 'fr',
            'count': 1,
            'num_preserved_chars': 14,
            'num_dropped_chars': 0,
            'num_non_unk_wordpieces': 5,
            'preserved_ratio': [14/5],
            'dropped_ratio': [0],
            'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
        }]))
Example #15
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 6 votes |
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFn's rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        )
Example #16
Source File: impl.py From transform with Apache License 2.0 | 6 votes |
def expand(self, inputs):
    saved_model_dir_pcol, input_values_pcol = inputs
    # We don't deep_copy pcollections used for the first phase, or when
    # the user defined `Context` disables it.
    if self._phase > 0 and Context.get_use_deep_copy_optimization():
        # Obviates unnecessary data materialization when the input data source
        # is safe to read more than once.
        tf.compat.v1.logging.info('Deep copying inputs for phase: %d',
                                  self._phase)
        input_values_pcol = deep_copy.deep_copy(input_values_pcol)

    if not self._use_tfxio:
        input_values_pcol |= 'BatchInputs' >> _BatchElements()

    return (input_values_pcol
            | 'ApplySavedModel' >> beam.ParDo(
                _RunMetaGraphDoFn(
                    self._tf_config,
                    use_tfxio=self._use_tfxio,
                    input_schema=self._input_schema,
                    input_tensor_adapter_config=self._input_tensor_adapter_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir_pcol)))
Example #17
Source File: preprocess.py From cloudml-samples with Apache License 2.0 | 6 votes |
def configure_pipeline(p, opt):
    """Specify PCollection and transformations in pipeline."""
    read_input_source = beam.io.ReadFromText(
        opt.input_path, strip_trailing_newlines=True)
    read_label_source = beam.io.ReadFromText(
        opt.input_dict, strip_trailing_newlines=True)
    labels = (p | 'Read dictionary' >> read_label_source)
    _ = (p
         | 'Read input' >> read_input_source
         # Note: the original sample used the Python 2-only
         # `csv.reader([line]).next()`; the built-in `next(...)` below is the
         # Python 3-compatible equivalent.
         | 'Parse input' >> beam.Map(lambda line: next(csv.reader([line])))
         | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                             beam.pvalue.AsIter(labels))
         | 'Read and convert to JPEG' >> beam.ParDo(
             ReadImageAndConvertToJpegDoFn())
         | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
         # TODO(b/35133536): Get rid of this Map and instead use
         # coder=beam.coders.ProtoCoder(tf.train.Example) in WriteToTFRecord
         # below.
         | 'SerializeToString' >> beam.Map(lambda x: x.SerializeToString())
         | 'Save to disk' >> beam.io.WriteToTFRecord(
             opt.output_path, file_name_suffix='.tfrecord.gz'))
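The example above passes beam.pvalue.AsIter(labels) as an extra argument to beam.ParDo, which makes the labels PCollection available inside the DoFn as a side input. Here is a minimal, hypothetical sketch of that side-input pattern (the DoFn and data below are illustrative, not from the sample):

import apache_beam as beam

class TagWithVocabSize(beam.DoFn):
    """Pairs each element with the size of a vocabulary passed as a side input."""

    def process(self, element, vocab):
        # `vocab` is the materialized side input; AsIter delivers an iterable.
        yield (element, len(list(vocab)))

with beam.Pipeline() as p:
    vocab = p | 'CreateVocab' >> beam.Create(['a', 'b', 'c'])
    words = p | 'CreateWords' >> beam.Create(['x', 'y'])
    _ = (words
         | 'Tag' >> beam.ParDo(TagWithVocabSize(), beam.pvalue.AsIter(vocab))
         | 'Print' >> beam.Map(print))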
Example #18
Source File: batch_util.py From data-validation with Apache License 2.0 | 6 votes |
def BatchSerializedExamplesToArrowRecordBatches(
        examples: beam.pvalue.PCollection,
        desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
    """Batches serialized examples into Arrow record batches.

    Args:
        examples: A PCollection of serialized tf.Examples.
        desired_batch_size: Batch size. The output Arrow record batches will
            have as many rows as the `desired_batch_size`.

    Returns:
        A PCollection of Arrow record batches.
    """
    return (examples
            | "BatchSerializedExamples" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(desired_batch_size))
            | "BatchDecodeExamples" >> beam.ParDo(_BatchDecodeExamplesDoFn()))
Example #19
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testLangNotInLangSet(self):
    with TestPipeline() as p:
        tokens = p | beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.FilterTokensByLang({'fr'}))
        assert_that(result, equal_to([]))
Example #20
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testLangInLangSet(self):
    with TestPipeline() as p:
        tokens = p | beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.FilterTokensByLang({'en'}))
        assert_that(result, equal_to([('I', 'en'), ('like', 'en'),
                                      ('pie', 'en'), ('.', 'en')]))
Example #21
Source File: beam_testutil.py From healthcare-deid with Apache License 2.0 | 5 votes |
def expand(self, pcoll):
    return pcoll | 'DummyWriteForTesting' >> beam.ParDo(
        DummyWriteTransform.WriteDoFn(self.filename))
Example #22
Source File: telemetry.py From tfx-bsl with Apache License 2.0 | 5 votes |
def ProfileRawRecords(
        pcoll: beam.pvalue.PCollection,
        telemetry_descriptors: Optional[List[Text]],
        logical_format: Text,
        physical_format: Text) -> beam.PTransform:
    """An identity transform to profile raw records for record based TFXIO."""
    return pcoll | "ProfileRawRecords" >> beam.ParDo(_ProfileRawRecordDoFn(
        telemetry_descriptors, logical_format, physical_format))
Example #23
Source File: main.py From professional-services with Apache License 2.0 | 5 votes |
def get_sideinput_collections(
        sideinput_filepath: beam.pvalue.PCollection,
        readTransform: beam.PTransform) -> Dict[str, beam.pvalue.PCollection]:
    """Load Side Input data from respective file paths.

    Args:
        sideinput_filepath: File path representing base path for side inputs
            to be loaded.
        readTransform: Transform responsible for loading the side input data.

    Returns:
        Dictionary containing Side Input name as key and corresponding
        PCollection as value.
    """
    sideinput_types = get_sideinput_types()
    # yapf: disable
    filepaths = (sideinput_filepath
                 | "Get side input paths from base path" >> beam.ParDo(
                     dofns.SplitToMultiple(sideinput_types)
                 ).with_outputs(*sideinput_types)
                 )

    sideinput_collections = {}
    for sideinput_type in sideinput_types:
        sideinput_collections[sideinput_type] = (
            filepaths[sideinput_type]
            | f"Read {sideinput_type}" >> readTransform
            | f"{sideinput_type}:Extract KV" >> beam.Map(
                transforms.kv_of, "productname", sideinput_type)
        )
    # yapf: enable
    return sideinput_collections
Example #24
Source File: transforms.py From professional-services with Apache License 2.0 | 5 votes |
def LogEvents(pcol):
    pcol | 'Output Events' >> beam.ParDo(LogFn())
Example #25
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testNotEqual(self):
    with TestPipeline() as p:
        sample_input = [('I', 'en'), ('kind', 'en'), ('of', 'en'),
                        ('like', 'en'), ('to', 'en'), ('eat', 'en'),
                        ('pie', 'en'), ('!', 'en'), ('Je', 'fr'),
                        ('suis', 'fr'), ('une', 'fr'), ('fille', 'fr'),
                        ('.', 'fr')]
        tokens = p | beam.Create(sample_input)
        result = (tokens
                  | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
                  | beam.ParDo(CompareValues()))
        assert_that(result, equal_to([True]))
Example #26
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testBasic(self):
    with TestPipeline() as p:
        tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
        coeffs = p | 'CreateCoeffs' >> beam.Create(self.coeffs)
        result = tokens | beam.ParDo(utils.ExponentialSmoothing(),
                                     beam.pvalue.AsSingleton(coeffs))
        assert_that(result, equal_to([('Hello', 0.75), (',', 0.75),
                                      ('world', 0.75), ('!', 0.75),
                                      ('Bonjour', 1.5), ('.', 1.5)]))
Example #27
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testTokenTooLong(self):
    sample_input = [('one', 1), ('two', 2), ('three', 3), ('four', 4),
                    ('qwertyuiopasdfghjklzxcvbnmqwertyuiopasdfghjklzxcvbnm', 5),
                    ('blah', 20)]
    with TestPipeline() as p:
        tokens = p | 'CreateInput' >> beam.Create(sample_input)
        result = tokens | beam.ParDo(
            utils.FilterByCount(self.max_token_length, min_token_frequency=2))
        assert_that(result, equal_to([('three', 3), ('four', 4),
                                      ('blah', 20)]))
Example #28
Source File: telemetry.py From tfx-bsl with Apache License 2.0 | 5 votes |
def ProfileRecordBatches(
        pcoll: beam.pvalue.PCollection,
        telemetry_descriptors: Optional[List[Text]],
        logical_format: Text,
        physical_format: Text,
        distribution_update_probability: float = 0.1) -> beam.PTransform:
    """An identity transform to profile RecordBatches and update Beam metrics.

    Args:
        pcoll: a PCollection[pa.RecordBatch]
        telemetry_descriptors: a set of descriptors that identify the
            component that invokes this PTransform. These will be used to
            construct the namespace to contain the beam metrics created within
            this PTransform. All such namespaces will be prefixed by "tfxio.".
            If None, a default "unknown" descriptor will be used.
        logical_format: the logical format of the data (before parsed into
            RecordBatches). Used to construct metric names.
        physical_format: the physical format in which the data is stored on
            disk. Used to construct metric names.
        distribution_update_probability: probability to update the expensive,
            per-row distributions.

    Returns:
        `pcoll` (identity function).
    """
    assert 0 < distribution_update_probability <= 1.0, (
        "Invalid probability: {}".format(distribution_update_probability))
    return pcoll | "ProfileRecordBatches" >> beam.ParDo(
        _ProfileRecordBatchDoFn(telemetry_descriptors, logical_format,
                                physical_format,
                                distribution_update_probability))
Example #29
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes |
def randomly_split(p, train_size, validation_size, test_size):
    """Randomly splits input pipeline in three sets based on input ratio.

    Args:
        p: PCollection, input pipeline.
        train_size: float, ratio of data going to train set.
        validation_size: float, ratio of data going to validation set.
        test_size: float, ratio of data going to test set.

    Returns:
        Tuple of PCollection.

    Raises:
        ValueError: Train, validation, and test sizes don't add up to 1.0.
    """
    if train_size + validation_size + test_size != 1.0:
        raise ValueError(
            "Train, validation, and test sizes don't add up to 1.0.")

    class SplitData(beam.DoFn):

        def process(self, element):
            r = random.random()
            if r < test_size:
                yield beam.pvalue.TaggedOutput('Test', element)
            elif r < 1 - train_size:
                yield beam.pvalue.TaggedOutput('Val', element)
            else:
                yield element

    split_data = (
        p | 'SplitData' >> beam.ParDo(SplitData()).with_outputs(
            'Test', 'Val', main='Train'))
    return split_data['Train'], split_data['Val'], split_data['Test']
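The example above routes elements to named outputs with beam.pvalue.TaggedOutput and retrieves them through .with_outputs(). Here is a small standalone sketch of that multi-output ParDo pattern (illustrative names, not from the project):

import apache_beam as beam

class SplitOddEven(beam.DoFn):
    """Routes odd numbers to a tagged side output and evens to the main output."""

    def process(self, element):
        if element % 2:
            yield beam.pvalue.TaggedOutput('odd', element)
        else:
            yield element  # main output

with beam.Pipeline() as p:
    results = (p
               | 'CreateNumbers' >> beam.Create([1, 2, 3, 4])
               | 'Split' >> beam.ParDo(SplitOddEven()).with_outputs(
                   'odd', main='even'))
    _ = results.odd | 'PrintOdd' >> beam.Map(print)
    _ = results.even | 'PrintEven' >> beam.Map(print)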
Example #30
Source File: utils_test.py From text with Apache License 2.0 | 5 votes |
def testLangNotInLangSetIncludeOthers(self):
    with TestPipeline() as p:
        tokens = p | beam.Create(self.sample_input)
        result = tokens | beam.ParDo(utils.FilterTokensByLang({'fr'}, True))
        assert_that(result, equal_to([('I', 'other'), ('like', 'other'),
                                      ('pie', 'other'), ('.', 'other')]))