Python Examples of apache_beam.FlatMap

Source File: preprocess.py From professional-services with Apache License 2.0

7 votes

def shuffle(p):
  """Shuffles data from PCollection.

  Every element is paired with a key drawn from `random.random()`, the pairs
  are grouped by that key, and the keys are dropped again so that only the
  grouped values are emitted.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):
    """Emits `(random key, element)` for each input element."""

    def process(self, element):
      yield random.random(), element

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # Tuple-parameter lambdas (`lambda (k, vs): vs`) are a syntax error on
      # Python 3, so index into the (key, values) pair instead.
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data

Source File: linters.py From data-linter with Apache License 2.0

6 votes

def _lint(self, examples):
    """Counts feature values and folds the per-feature counts into a result.

    Args:
      examples: Input `PCollection`; its elements are passed to the callable
        returned by `utils.example_tuplizer(self._counted_features)`.

    Returns:
      The `PCollection` produced by the final 'ToResult' step.
    """
    tuplize = utils.example_tuplizer(self._counted_features)
    counted = examples | 'Tuplize' >> beam.FlatMap(tuplize)
    counted = counted | 'FlattenFeatureVals' >> beam.FlatMap(
        self._flatten_feature_vals)
    counted = counted | 'CountFeatureVals' >> beam.combiners.Count.PerElement()

    # Optional hook: applied only when the instance defines the attribute.
    if hasattr(self, '_count_transformer'):
      counted = counted | 'TransformCounts' >> self._count_transformer

    by_feature = counted | 'PairValWithCount' >> beam.Map(self._shift_key)
    by_feature = by_feature | 'GroupByFeature' >> beam.GroupByKey()
    checked = by_feature | 'ValCountsToDict' >> beam.Map(
        self._val_counts_as_dict)
    checked = checked | 'GenResults' >> beam.Map(self._check_feature)
    # Keep only truthy check results.
    warned = checked | 'DropUnwarned' >> beam.Filter(bool)
    collected = warned | 'AsList' >> beam.combiners.ToList()
    return collected | 'ToResult' >> beam.Map(self._to_result)

Source File: impl.py From transform with Apache License 2.0

6 votes

def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  The clearing step takes a PCollection derived from `input_barrier` as a
  side input, so it only runs once `input_barrier` is available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    The `PCollection` produced by the 'WaitAndClearSharedKeepAlives' step.
  """
  # The mapped callable always returns None; only the dependency on
  # `input_barrier` matters here.
  barrier_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda unused_element: None)
  barrier_side_input = beam.pvalue.AsIter(barrier_pcoll)

  trigger = pipeline | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
  return trigger | 'WaitAndClearSharedKeepAlives' >> beam.Map(
      lambda unused_element, unused_barrier: shared.Shared().acquire(
          lambda: None),
      barrier_side_input)

Source File: analyzer_impls.py From transform with Apache License 2.0

6 votes

def expand(self, inputs):
    """Fans each element of the single input out into tagged outputs.

    Args:
      inputs: A one-element sequence holding the input `PCollection`.

    Returns:
      A tuple of `self._num_outputs` PCollections, one per output tag
      '0', '1', ... in order.
    """
    (pcoll,) = inputs

    def extract_outputs(outputs, num_outputs):
      # Each element must carry exactly one value per declared output.
      produced = len(outputs)
      if produced != num_outputs:
        raise ValueError(
            'Analyzer has {} outputs but its implementation produced {} '
            'values'.format(num_outputs, produced))
      for index, value in enumerate(outputs):
        yield beam.pvalue.TaggedOutput(str(index), value)

    output_keys = list(map(str, range(self._num_outputs)))
    splitter = beam.FlatMap(extract_outputs, self._num_outputs)
    outputs_tuple = (
        pcoll | 'ExtractOutputs' >> splitter.with_outputs(*output_keys))
    return tuple(outputs_tuple[tag] for tag in output_keys)

Source File: obspyio.py From SeisNN with MIT License

6 votes

def expand(self, pcollection):
        """Lists files under `self.file_patterns` and emits their events.

        Each entry of `self.file_patterns` is handed to `os.listdir`, every
        listed file is parsed with `read_nordic`, and the events of the
        resulting catalog are emitted one by one.
        """
        def get_dir_list(file_dir, suffix=""):
            # Full paths of the directory entries whose name ends with suffix.
            return [
                os.path.join(file_dir, entry)
                for entry in os.listdir(file_dir)
                if entry.endswith(suffix)
            ]

        def get_events(filename):
            catalog, wavename = read_nordic(filename, return_wavnames=True)
            for event in catalog.events:
                # Record the waveform name on every pick of the event.
                for pick in event.picks:
                    pick.waveform_id.wavename = wavename
                yield event

        directories = (
            pcollection
            | 'Create file directory' >> beam.Create(self.file_patterns))
        files = directories | 'List all files' >> beam.FlatMap(get_dir_list)
        return files | 'Get event' >> beam.FlatMap(get_events)

Source File: preprocess.py From professional-services with Apache License 2.0

6 votes

def shuffle_data(p):
  """Shuffles data from PCollection.

  Every element is paired with a key drawn from `random.random()`, the pairs
  are grouped by that key, and the keys are dropped again so that only the
  grouped values are emitted.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):
    """Emits `(random key, element)` for each input element."""

    def process(self, element):
      yield (random.random(), element)

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      # Tuple-parameter lambdas (`lambda (k, vs): vs`) are a syntax error on
      # Python 3, so index into the (key, values) pair instead.
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data

Source File: preprocess.py From professional-services with Apache License 2.0

6 votes

def run(p, args):
  """Creates a pipeline to build and write train/val/test datasets.

  Args:
    p: Pipeline (or pipeline root) the steps are attached to.
    args: Options object; reads `cloud`, `tft_dir`, `user_min_count`,
      `item_min_count`, `output_dir` and `plain_text`.
  """
  # pylint: disable=no-value-for-parameter
  base_query = bq_query.query
  # Outside of cloud runs the query is capped at 10 rows.
  query = base_query if args.cloud else "{} LIMIT 10".format(base_query)

  raw_data = p | "ReadBQ" >> ReadBQ(query)
  raw_data = raw_data | "HandleNullUserTags" >> beam.Map(
      _handle_null_user_tags)
  raw_data = raw_data | "NormalizeUserTags" >> beam.Map(_normalize_user_tags)

  transformed = _run_tft_fn(raw_data, _preprocess_tft, args.tft_dir,
                            args.user_min_count, args.item_min_count)
  cleaned = transformed | "FilterData" >> beam.FlatMap(_filter_data)
  cleaned = cleaned | "CleanTags" >> beam.Map(_clean_tags)

  # `_split_data` yields (name, dataset) pairs; each dataset gets its own
  # write step labelled after its name.
  for name, dataset in _split_data(cleaned):
    writer = WriteOutput(
        name, args.output_dir, constants.TRAIN_SPEC, args.plain_text)
    _ = dataset | "Write{}Output".format(name) >> writer

Source File: preprocess.py From professional-services with Apache License 2.0

5 votes

def shuffle(p):
    """Shuffles the given pCollection.

    Elements are keyed with `random.random()`, grouped by that key, and the
    grouped values are emitted without their keys.
    """
    keyed = p | 'PairWithRandom' >> beam.Map(
        lambda item: (random.random(), item))
    grouped = keyed | 'GroupByRandom' >> beam.GroupByKey()
    return grouped | 'DropRandom' >> beam.FlatMap(lambda pair: pair[1])


# pylint: disable=expression-not-assigned
# pylint: disable=no-value-for-parameter

Source File: fusion_break.py From gcp-variant-transforms with Apache License 2.0

5 votes

def expand(self, pcoll):
    """Re-emits `pcoll` unchanged, with a side input derived from `pcoll`."""
    # An empty PCollection that depends on pcoll: every element maps to ().
    empty = pcoll | beam.FlatMap(lambda element: ())
    empty_side_input = beam.pvalue.AsIter(empty)
    # The side input is ignored; each element passes straight through.
    return pcoll | beam.Map(
        lambda element, unused_side: element, empty_side_input)

Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0

5 votes

def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery.

  Args:
    input_pattern: Pattern handed to `f2pn.match_files` to find input files.
    output_table: Destination table passed to `beam.io.BigQuerySink`.
    pipeline_args: Arguments used to build the `PipelineOptions`.
  """
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))

  files = p | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
  records = files | 'to_records' >> beam.FlatMap(map_file_to_records)
  bq_rows = records | 'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs)
  # WRITE_TRUNCATE is the disposition passed to the sink for the table.
  _ = bq_rows | 'write' >> beam.io.Write(
      beam.io.BigQuerySink(
          output_table,
          schema='patient_id:INTEGER, note:STRING',
          write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result)

Source File: obspyio.py From SeisNN with MIT License

5 votes

def expand(self, pcollection):
        """Attaches matching picks and a location to every stream.

        Each input element is a `(key, dictionary)` pair whose dictionary
        holds 'pick', 'stream' and 'location' entries; the streams are
        emitted one by one after being annotated.
        """
        def search_pick(pick_list, stream):
            # The time window is taken from the first trace of the stream.
            first_stats = stream.traces[0].stats
            starttime = first_stats.starttime
            endtime = first_stats.endtime

            picks_by_phase = {}
            for pick in pick_list:
                phase = pick.phase_hint
                # Keep only picks strictly inside the window, by phase.
                if starttime < pick.time < endtime:
                    picks_by_phase.setdefault(phase, []).append(pick)
            return picks_by_phase

        def stream_get_pick(data):
            _, grouped = data
            pick_list = grouped['pick']
            stream_list = grouped['stream']
            location = grouped['location'][0]

            for stream in stream_list:
                stream.picks = search_pick(pick_list, stream)
                stream.location = location
                yield stream

        return pcollection | 'Stream search picks' >> beam.FlatMap(
            stream_get_pick)

Source File: wiki_preproc_pipeline.py From language with Apache License 2.0

5 votes

def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline.

  Returns:
    A callable taking the pipeline root; it attaches one
    read / preprocess / write branch per split (test, dev, train).
  """
  input_dir = FLAGS.input_file
  # (split name used in step labels and output names, input path, shards),
  # listed in the order the branches are attached to the pipeline.
  splits = (
      ("test", input_dir + "/wiki.test.raw", 10),
      ("dev", input_dir + "/wiki.valid.raw", 10),
      ("train", input_dir + "/wiki.train.raw", 100),
  )

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    for split, path, shard_count in splits:
      output_prefix = (
          FLAGS.output_file + ".cpc." + FLAGS.format + "." + split +
          ".tfrecord")
      _ = (
          root
          | "Create {} files".format(split) >> beam.Create([path])
          | "Read {} files".format(split) >> beam.FlatMap(read_file)
          | "{} Shuffle".format(split) >> beam.Reshuffle()
          | "Preproc {} docs".format(split) >> beam.FlatMap(preproc_doc)
          | "record {} Shuffle".format(split) >> beam.Reshuffle()
          | "Write to {} tfrecord".format(split) >> beam.io.WriteToTFRecord(
              output_prefix, num_shards=shard_count))

  return pipeline

Source File: ccnews_preproc_pipeline.py From language with Apache License 2.0

5 votes

def ccnews_pipeline():
  """Read CCNews filenames and create Beam pipeline.

  The numbered input shards are split into a training part (the first
  `datasize - testsize` shards) and a test part (the remaining `testsize`
  shards). Shard counts and the file name template depend on
  `FLAGS.dataset`.

  Returns:
    A callable taking the pipeline root; it attaches the test and train
    read / preprocess / write branches.
  """

  if FLAGS.dataset == "ccnews":
    data_filename = "ccnews.txt-%05d-of-01000"
    datasize = 1000
    testsize = 100
  else:
    data_filename = "wikipedia.txt-%05d-of-00500"
    datasize = 500
    testsize = 50

  train_size = datasize - testsize
  train_files = [
      FLAGS.input_file + data_filename % i for i in range(train_size)
  ]
  # The test shards are the last `testsize` ones. The previous upper bound
  # of `testsize` made this range empty (e.g. range(900, 100)), so no test
  # files were ever read.
  test_files = [
      FLAGS.input_file + data_filename % i
      for i in range(train_size, datasize)
  ]

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=testsize))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord",
            num_shards=train_size))
    return

  return pipeline

Source File: wiki_preproc_pipeline.py From language with Apache License 2.0

5 votes

def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline.

  Returns:
    A callable taking the pipeline root; it attaches one
    read / preprocess / write branch per split (test, dev, train).
  """
  input_dir = FLAGS.input_file
  # (split name used in step labels and output names, input path, shards),
  # listed in the order the branches are attached to the pipeline.
  splits = (
      ("test", input_dir + "/wiki.test.raw", 10),
      ("dev", input_dir + "/wiki.valid.raw", 10),
      ("train", input_dir + "/wiki.train.raw", 100),
  )

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    for split, path, shard_count in splits:
      output_prefix = (
          FLAGS.output_file + "." + FLAGS.format + "." + split + ".tfrecord")
      _ = (
          root
          | "Create {} files".format(split) >> beam.Create([path])
          | "Read {} files".format(split) >> beam.FlatMap(read_file)
          | "{} Shuffle".format(split) >> beam.Reshuffle()
          | "Preproc {} docs".format(split) >> beam.FlatMap(preproc_doc)
          | "record {} Shuffle".format(split) >> beam.Reshuffle()
          | "Write to {} tfrecord".format(split) >> beam.io.WriteToTFRecord(
              output_prefix, num_shards=shard_count))

  return pipeline

Source File: preprocess.py From professional-services with Apache License 2.0

5 votes

def Shuffle(p):
  """Shuffles the given pCollection.

  Elements are keyed with `np.random.random()`, grouped by that key, and the
  grouped values are emitted without their keys.
  """
  keyed = p | "PairWithRandom" >> beam.Map(
      lambda item: (np.random.random(), item))
  grouped = keyed | "GroupByRandom" >> beam.GroupByKey()
  return grouped | "DropRandom" >> beam.FlatMap(lambda pair: pair[1])

Source File: physionet_to_mae_lib.py From healthcare-deid with Apache License 2.0

5 votes

def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
  """Read the physionet records from GCS and write them out as MAE.

  Args:
    input_pattern: Pattern handed to `f2pn.match_files` to find input files.
    output_dir: Output location forwarded to `write_mae`.
    mae_task_name: Task name forwarded to `mae.generate_mae`.
    project: Project forwarded to `write_mae`.
    pipeline_args: Arguments used to build the `PipelineOptions`.
  """
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings) |
       'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                  ['patient_id', 'record_number']) |
       'write_mae' >> beam.Map(write_mae, project, output_dir)
      )
  result = p.run().wait_until_finish()
  # This pipeline writes MAE output, not BigQuery rows; the previous message
  # ('GCS to BigQuery result') mislabelled the run in the logs.
  logging.info('Physionet to MAE result: %s', result)

Source File: test_pipeline.py From professional-services with Apache License 2.0

5 votes

def test_enrichment_allevents(self):
        """Runs the enrichment transforms on mock data and checks the output.

        A side-input file path and the mock main-input events are both
        timestamped with the current time; the enriched events are then
        flattened and compared with the expected events from `static_data`.
        """
        with TestPipeline() as pipeline:
            sideinput_path = "gs://bucketname/input/sideinput/20200503".encode()
            sideinput_filepath = (
                pipeline
                | 'Read side Input file path' >> beam.Create([sideinput_path])
                | "Add Timestamp" >> beam.Map(
                    lambda e: beam.window.TimestampedValue(
                        e, int(time.time()))))

            sideinput_collections = main.get_sideinput_collections(
                sideinput_filepath, static_data.ReadStaticSideInput())

            raw_events = pipeline | 'Read primary events' >> beam.Create(
                static_data.get_maininput_events())
            stamped_events = raw_events | 'Attach timestamp' >> beam.Map(
                lambda e: beam.window.TimestampedValue(e, int(time.time())))
            sales_events = stamped_events | "Add Fixed window" >> beam.WindowInto(
                beam.window.FixedWindows(10))

            enriched_events = main.get_enriched_events(
                sales_events, sideinput_collections)

            # Each enriched element is iterable; emit its items individually.
            results = enriched_events | "UnPack the events" >> beam.FlatMap(
                lambda x: x)

            expected = static_data.get_expected_enriched_events()
            assert_that(results, equal_to(expected))

Source File: static_data.py From professional-services with Apache License 2.0

5 votes

def expand(self, pcol):
        """Expands every input element through `self._get_lookup_values`.

        Args:
         pcol: PCollection containing list of file patterns

        Returns:
         PCollection holding everything `self._get_lookup_values` yields for
         the input elements
        """
        lookup = beam.FlatMap(self._get_lookup_values)
        return pcol | "Static Side Input Values" >> lookup

Source File: enforce_primary_keys.py From professional-services with Apache License 2.0

5 votes

def expand(self, pcoll):
        """Keeps a single sampled row per value of `self.primary_key`."""
        # Key every row by its primary-key field.
        keyed_rows = pcoll | 'Extract Primary Key' >> beam.FlatMap(
            lambda row: [(row[self.primary_key], row)])
        sampled = keyed_rows | 'Sample n=1 by Primary Key' >> CombinePerKey(
            SampleCombineFn(1))
        # Drop the key and emit the items of the sampled value.
        return sampled | 'Drop keys' >> beam.FlatMap(lambda kv: kv[1])

Source File: create_data.py From conversational-datasets with Apache License 2.0

5 votes

def _shuffle_examples(examples):
    """Shuffles `examples` by grouping them under fresh `uuid.uuid4()` keys."""
    return (
        examples
        | "add random key" >> beam.Map(lambda item: (uuid.uuid4(), item))
        | "group by key" >> beam.GroupByKey()
        | "get shuffled values" >> beam.FlatMap(lambda pair: pair[1]))

Source File: create_data.py From conversational-datasets with Apache License 2.0

5 votes

def _shuffle_examples(examples):
    """Shuffles `examples` by grouping them under fresh `uuid.uuid4()` keys."""
    keyed = examples | "add random key" >> beam.Map(
        lambda item: (uuid.uuid4(), item))
    grouped = keyed | "group by key" >> beam.GroupByKey()
    # Drop the keys and emit the grouped values.
    return grouped | "get shuffled values" >> beam.FlatMap(
        lambda pair: pair[1])

Source File: create_data.py From conversational-datasets with Apache License 2.0

5 votes

def _shuffle(pcollection):
    """Shuffles the input pcollection.

    Values are keyed with `uuid.uuid4()`, grouped by that key, and emitted
    again without the key.
    """
    return (
        pcollection
        | "add random key" >> beam.Map(lambda item: (uuid.uuid4(), item))
        | "group by key" >> beam.GroupByKey()
        | "get shuffled values" >> beam.FlatMap(lambda pair: pair[1]))

Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0

5 votes

def expand(self, pipeline):
    """Emits tokenized examples for every entry of `self.files`."""
    file_names = pipeline | beam.Create(self.files)
    examples = file_names | beam.FlatMap(self._emit_tokenized_examples)
    # Allows for additional parallelization.
    return examples | beam.Reshuffle()

Source File: jackknife.py From model-analysis with Apache License 2.0

5 votes

def expand(self, sliced_extracts):
    """Partitions the input and builds jackknife samples per slice key.

    The input is split into `self._num_jackknife_samples` partitions using
    `self._random_state`; each partition is combined per key, tagged with
    its partition index, and the partitions are then regrouped per key and
    passed to `_make_jackknife_samples`.
    """

    def partition_fn(_, num_partitions):
      return self._random_state.randint(num_partitions)

    def add_partition_index(slice_key,
                            accumulator_and_size,
                            partition_index=None):
      accumulator, size = accumulator_and_size
      return slice_key, _PartitionInfo(accumulator, size, partition_index)

    # Partition the data
    # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
    partitions = sliced_extracts | 'Partition' >> beam.Partition(
        partition_fn, self._num_jackknife_samples)

    # Within each partition, partially combine per slice key to get accumulators
    # and partition sizes; add partition_id for determinism.
    # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
    partition_accumulators = [
        partition
        | 'CombinePartition[{}]'.format(index) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(index) >> beam.MapTuple(
            add_partition_index, index)
        for index, partition in enumerate(partitions)
    ]

    # Group partitions for the same slice, compute LOO metrics, and flatten back
    # into per-partition LOO metrics.
    # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
    flattened = (
        partition_accumulators
        | 'FlattenPartitionAccumulators' >> beam.Flatten())
    per_slice = flattened | 'CollectPerSlicePartitions' >> beam.GroupByKey()
    return per_slice | 'MakeJackknifeSamples' >> beam.FlatMap(
        _make_jackknife_samples, combiner=self._combiner)

Source File: unbatch_extractor.py From model-analysis with Apache License 2.0

5 votes

def _UnbatchInputs(
    extracts: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Expands batched extracts through `_ExtractUnbatchedInputs`.

  Args:
    extracts: PCollection containing batched extracts.

  Returns:
    PCollection of per-example extracts.
  """
  unbatch = beam.FlatMap(_ExtractUnbatchedInputs)
  return extracts | 'UnbatchInputs' >> unbatch

Source File: embedding_generator.py From hub with Apache License 2.0

5 votes

def run(args):
  """Runs the embedding generation Beam pipeline.

  Args:
    args: Parsed options; reads `embed_output_dir`, `module_url`,
      `projected_dim` and `data_file_pattern`, and forwards all attributes
      to `PipelineOptions`.
  """
  output_dir = args.embed_output_dir

  # Start from a clean output directory.
  if tf.io.gfile.exists(output_dir):
    print('Removing embedding output directory...')
    tf.io.gfile.rmtree(output_dir)
  print('Creating empty output directory...')
  tf.io.gfile.makedirs(output_dir)

  options = beam.options.pipeline_options.PipelineOptions(**vars(args))

  # Embed a single empty string and read the second dimension of the result.
  original_dim = hub.load(args.module_url)(['']).shape[1]

  random_projection_matrix = generate_random_projection_weights(
      original_dim, args.projected_dim, output_dir)

  print('Starting the Beam pipeline...')
  with beam.Pipeline(runner=_RUNNER, options=options) as pipeline:
    sentences = pipeline | 'Read sentences from files' >> beam.io.ReadFromText(
        file_pattern=args.data_file_pattern)
    batches = sentences | 'Batch elements' >> util.BatchElements(
        min_batch_size=_BATCH_SIZE / 2, max_batch_size=_BATCH_SIZE)
    embeddings = batches | 'Generate embeddings' >> beam.Map(
        generate_embeddings, args.module_url, random_projection_matrix)
    tf_examples = embeddings | 'Encode to tf example' >> beam.FlatMap(
        to_tf_example)
    _ = tf_examples | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
        file_path_prefix='{}/emb'.format(args.embed_output_dir),
        file_name_suffix='.tfrecords')

  print('Beam pipeline completed.')

Source File: linters.py From data-linter with Apache License 2.0

5 votes

def _lint(self, examples):
    """Returns the result of the TailedDistributionDetector linter.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [feature names]
        lint_samples: [
          [stats: {min: feature_min if outlying, max: feature_max if outlying}]
          for each warning
        ]
    """
    flatten_fn = self._flatten_feature_vals(self.numeric_features)
    feature_values = examples | 'FlattenFeatureValue' >> beam.FlatMap(
        flatten_fn)

    # One trimmed averager per extreme, both fed from the same values.
    feature_min_trimmed_mean = (
        feature_values | self._make_trimmed_averager(self._MIN))
    feature_max_trimmed_mean = (
        feature_values | self._make_trimmed_averager(self._MAX))

    merged = (
        (feature_min_trimmed_mean, feature_max_trimmed_mean)
        | 'MergeTrimmedMeans' >> beam.CoGroupByKey())
    collected = merged | 'AsList' >> beam.combiners.ToList()
    return collected | 'ToResult' >> beam.Map(self._to_result)

Source File: transform.py From pydatalab with Apache License 2.0

5 votes

def shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection.

  Elements are keyed with `random.random()`, grouped by that key, and the
  grouped values are emitted without their keys.

  Args:
    pcoll: PCollection to shuffle.

  Returns:
    PCollection of the grouped values.
  """
  import random
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          # Tuple-parameter lambdas (`lambda (k, vs): vs`) are a syntax error
          # on Python 3, so index into the (key, values) pair instead.
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))

Source File: preprocess.py From cloudml-samples with Apache License 2.0

5 votes

def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection.

  Elements are keyed with `random.random()`, grouped by that key, and the
  grouped values are emitted without their keys.

  Args:
    pcoll: PCollection to shuffle.

  Returns:
    PCollection of the grouped values.
  """
  import random
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          # Tuple-parameter lambdas (`lambda (k, vs): vs`) are a syntax error
          # on Python 3, so index into the (key, values) pair instead.
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))

Source File: preprocess.py From cloudml-samples with Apache License 2.0

5 votes

def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection.

  Elements are keyed with `random.random()`, grouped by that key, and the
  grouped values are emitted without their keys.

  Args:
    pcoll: PCollection to shuffle.

  Returns:
    PCollection of the grouped values.
  """
  import random
  return (pcoll
          | 'PairWithRand' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRand' >> beam.GroupByKey()
          # Tuple-parameter lambdas (`lambda (k, vs): vs`) are a syntax error
          # on Python 3, so index into the (key, values) pair instead.
          | 'DropRand' >> beam.FlatMap(lambda kv: kv[1]))

Python apache_beam.FlatMap() Examples