Python apache_beam.FlatMap() Examples

The following are 30 code examples of apache_beam.FlatMap(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the apache_beam module.
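Before the project examples, here is a minimal, self-contained sketch (an illustration written for this page, not taken from any of the projects below) of what beam.FlatMap does: the callable returns an iterable, and each item of that iterable becomes a separate element of the output PCollection.

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (p
       | beam.Create(['to be', 'or not to be'])
       | 'SplitWords' >> beam.FlatMap(lambda line: line.split())  # one output element per word
       | beam.Map(print))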
Example #1
Source File: preprocess.py    From professional-services with Apache License 2.0
def shuffle(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield random.random(), element

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data 
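A hedged usage sketch for the shuffle helper above (illustrative only; it assumes random and apache_beam are imported as in the surrounding module):

import random
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (shuffle(p | 'Create' >> beam.Create(range(10)))
       | 'Print' >> beam.Map(print))  # 0..9, emitted in a randomized order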
Example #2
Source File: linters.py    From data-linter with Apache License 2.0
def _lint(self, examples):
    feature_val_w_counts = (
        examples
        | 'Tuplize' >> beam.FlatMap(
            utils.example_tuplizer(self._counted_features))
        | 'FlattenFeatureVals' >> beam.FlatMap(self._flatten_feature_vals)
        | 'CountFeatureVals' >> beam.combiners.Count.PerElement())

    if hasattr(self, '_count_transformer'):
      feature_val_w_counts |= 'TransformCounts' >> self._count_transformer

    return (
        feature_val_w_counts
        | 'PairValWithCount' >> beam.Map(self._shift_key)
        | 'GroupByFeature' >> beam.GroupByKey()
        | 'ValCountsToDict' >> beam.Map(self._val_counts_as_dict)
        | 'GenResults' >> beam.Map(self._check_feature)
        | 'DropUnwarned' >> beam.Filter(bool)
        | 'AsList' >> beam.combiners.ToList()
        | 'ToResult' >> beam.Map(self._to_result)) 
Example #3
Source File: impl.py    From transform with Apache License 2.0
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    An empty `PCollection`.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
              beam.pvalue.AsIter(empty_pcoll))) 
Example #4
Source File: analyzer_impls.py    From transform with Apache License 2.0
def expand(self, inputs):
    pcoll, = inputs
    def extract_outputs(outputs, num_outputs):
      if len(outputs) != num_outputs:
        raise ValueError(
            'Analyzer has {} outputs but its implementation produced {} '
            'values'.format(num_outputs, len(outputs)))
      for i, output in enumerate(outputs):
        yield beam.pvalue.TaggedOutput(str(i), output)

    output_keys = [str(i) for i in range(self._num_outputs)]
    outputs_tuple = (
        pcoll |
        'ExtractOutputs' >> beam.FlatMap(
            extract_outputs, self._num_outputs).with_outputs(*output_keys))
    return tuple(outputs_tuple[key] for key in output_keys) 
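For context, a minimal standalone sketch (an illustration, not from the transform library) of the FlatMap(...).with_outputs() pattern used above: tagged outputs from a single FlatMap become separately addressable PCollections.

import apache_beam as beam

def route(x):
  # Tag each element; with_outputs() exposes each tag as its own PCollection.
  yield beam.pvalue.TaggedOutput('odd' if x % 2 else 'even', x)

with beam.Pipeline() as p:
  routed = (p
            | beam.Create([1, 2, 3, 4])
            | 'Route' >> beam.FlatMap(route).with_outputs('even', 'odd'))
  _ = routed['even'] | 'PrintEven' >> beam.Map(print)
  _ = routed['odd'] | 'PrintOdd' >> beam.Map(print)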
Example #5
Source File: obspyio.py    From SeisNN with MIT License
def expand(self, pcollection):
        def get_dir_list(file_dir, suffix=""):
            file_list = []
            for file_name in os.listdir(file_dir):
                f = os.path.join(file_dir, file_name)
                if file_name.endswith(suffix):
                    file_list.append(f)

            return file_list

        def get_events(filename):
            catalog, wavename = read_nordic(filename, return_wavnames=True)
            for event in catalog.events:
                for pick in event.picks:
                    pick.waveform_id.wavename = wavename
                yield event

        return (
                pcollection
                | 'Create file directory' >> beam.Create(self.file_patterns)
                | 'List all files' >> beam.FlatMap(get_dir_list)
                | 'Get event' >> beam.FlatMap(get_events)
        ) 
Example #6
Source File: preprocess.py    From professional-services with Apache License 2.0
def shuffle_data(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield (random.random(), element)

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data 
Example #7
Source File: preprocess.py    From professional-services with Apache License 2.0
def run(p, args):
  """Creates a pipeline to build and write train/val/test datasets."""
  # pylint: disable=no-value-for-parameter
  query = bq_query.query
  if not args.cloud:
    query = "{} LIMIT 10".format(query)

  raw_data = (p
              | "ReadBQ" >> ReadBQ(query)
              | "HandleNullUserTags" >> beam.Map(_handle_null_user_tags)
              | "NormalizeUserTags" >> beam.Map(_normalize_user_tags))
  data = _run_tft_fn(raw_data, _preprocess_tft, args.tft_dir,
                     args.user_min_count, args.item_min_count)
  data = (data
          | "FilterData" >> beam.FlatMap(_filter_data)
          | "CleanTags" >> beam.Map(_clean_tags))
  data = _split_data(data)
  for name, dataset in data:
    dataset | "Write{}Output".format(name) >> WriteOutput(
        name, args.output_dir, constants.TRAIN_SPEC, args.plain_text) 
Example #8
Source File: preprocess.py    From professional-services with Apache License 2.0
def shuffle(p):
    """Shuffles the given pCollection."""

    return (p
            | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
            | 'GroupByRandom' >> beam.GroupByKey()
            | 'DropRandom' >> beam.FlatMap(lambda x: x[1]))


# pylint: disable=expression-not-assigned
# pylint: disable=no-value-for-parameter 
Example #9
Source File: fusion_break.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
    # Create an empty PCollection that depends on pcoll.
    empty = pcoll | beam.FlatMap(lambda x: ())
    return pcoll | beam.Map(lambda x, unused: x, beam.pvalue.AsIter(empty)) 
Example #10
Source File: gcs_to_bigquery_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(map_file_to_records) |
       'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result) 
Example #11
Source File: obspyio.py    From SeisNN with MIT License
def expand(self, pcollection):
        def search_pick(pick_list, stream):
            tmp_pick = {}
            starttime = stream.traces[0].stats.starttime
            endtime = stream.traces[0].stats.endtime
            for pick in pick_list:
                phase = pick.phase_hint
                if starttime < pick.time < endtime:
                    if not tmp_pick.get(phase):
                        tmp_pick[phase] = [pick]
                    else:
                        tmp_pick[phase].append(pick)

            return tmp_pick

        def stream_get_pick(data):
            key, dictionary = data
            pick_list = dictionary['pick']
            stream_list = dictionary['stream']
            location = dictionary['location'][0]

            for stream in stream_list:
                picks = search_pick(pick_list, stream)
                stream.picks = picks
                stream.location = location
                yield stream

        return (
                pcollection
                | 'Stream search picks' >> beam.FlatMap(stream_get_pick)
        ) 
Example #12
Source File: wiki_preproc_pipeline.py    From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""

  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cpc." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline 
Example #13
Source File: ccnews_preproc_pipeline.py    From language with Apache License 2.0
def ccnews_pipeline():
  """Read CCNews filenames and create Beam pipeline."""

  if FLAGS.dataset == "ccnews":
    data_filename = "ccnews.txt-%05d-of-01000"
    datasize = 1000
    testsize = 100
  else:
    data_filename = "wikipedia.txt-%05d-of-00500"
    datasize = 500
    testsize = 50
  train_files = [
      FLAGS.input_file + data_filename % i for i in range(datasize - testsize)
  ]
  test_files = [
      FLAGS.input_file + data_filename % i
      for i in range(datasize - testsize, datasize)
  ]

  def pipeline(root):
    """Beam pipeline for converting CCNews files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=testsize))
    _ = (
        root | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord",
            num_shards=datasize - testsize))
    return

  return pipeline 
Example #14
Source File: wiki_preproc_pipeline.py    From language with Apache License 2.0
def wiki_pipeline():
  """Read WikiText103 filenames and create Beam pipeline."""

  train_files = FLAGS.input_file + "/wiki.train.raw"
  dev_files = FLAGS.input_file + "/wiki.valid.raw"
  test_files = FLAGS.input_file + "/wiki.test.raw"

  def pipeline(root):
    """Beam pipeline for converting WikiText103 files to TF Examples."""
    _ = (
        root | "Create test files" >> beam.Create([test_files])
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".test.tfrecord",
            num_shards=10))
    _ = (
        root | "Create dev files" >> beam.Create([dev_files])
        | "Read dev files" >> beam.FlatMap(read_file)
        | "dev Shuffle" >> beam.Reshuffle()
        | "Preproc dev docs" >> beam.FlatMap(preproc_doc)
        | "record dev Shuffle" >> beam.Reshuffle()
        | "Write to dev tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".dev.tfrecord",
            num_shards=10))
    _ = (
        root | "Create train files" >> beam.Create([train_files])
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + "." + FLAGS.format + ".train.tfrecord",
            num_shards=100))
    return

  return pipeline 
Example #15
Source File: preprocess.py    From professional-services with Apache License 2.0
def Shuffle(p):
  """Shuffles the given pCollection."""
  return (p
          | "PairWithRandom" >> beam.Map(lambda x: (np.random.random(), x))
          | "GroupByRandom" >> beam.GroupByKey()
          | "DropRandom" >> beam.FlatMap(lambda x: x[1])) 
Example #16
Source File: physionet_to_mae_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
  """Read the physionet records from GCS and write them out as MAE."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings) |
       'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                  ['patient_id', 'record_number']) |
       'write_mae' >> beam.Map(write_mae, project, output_dir)
      )
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result) 
Example #17
Source File: test_pipeline.py    From professional-services with Apache License 2.0
def test_enrichment_allevents(self):
        """Tests the pipeline by creating mock data and invoking respective transforms"""

        with TestPipeline() as pipeline:
            sideinput_filepath = (
                pipeline | 'Read side Input file path' >> beam.Create(
                    ["gs://bucketname/input/sideinput/20200503".encode()]) |
                "Add Timestamp" >> beam.Map(
                    lambda e: beam.window.TimestampedValue(e, int(time.time())))
            )

            sideinput_collections = main.get_sideinput_collections(
                sideinput_filepath, static_data.ReadStaticSideInput())

            sales_events = (
                pipeline | 'Read primary events' >> beam.Create(
                    static_data.get_maininput_events()) |
                'Attach timestamp' >> beam.Map(
                    lambda e: beam.window.TimestampedValue(e, int(time.time())))
                | "Add Fixed window" >> beam.WindowInto(
                    beam.window.FixedWindows(10)))

            enriched_events = main.get_enriched_events(sales_events,
                                                       sideinput_collections)

            results = enriched_events | "UnPack the events" >> beam.FlatMap(
                lambda x: x)

            assert_that(results,
                        equal_to(static_data.get_expected_enriched_events())) 
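A minimal, self-contained sketch of the same testing pattern (illustrative; it does not depend on the project's main or static_data modules): assert_that checks the contents of a PCollection produced by a FlatMap inside a TestPipeline.

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as pipeline:
  unpacked = (pipeline
              | beam.Create([[1, 2], [3]])
              | 'UnPack the events' >> beam.FlatMap(lambda x: x))
  assert_that(unpacked, equal_to([1, 2, 3]))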
Example #18
Source File: static_data.py    From professional-services with Apache License 2.0
def expand(self, pcol):
        """Fetch files based on the file pattern, read the contents from matching files

        Args:
         pcol: PCollection containing list of file patterns

        Returns:
         PCollection of type Json objects that represents data read from the files
        """
        # yapf: disable
        return (pcol
                | "Static Side Input Values" >> beam.FlatMap(self._get_lookup_values)
               )
        # yapf: enable 
Example #19
Source File: enforce_primary_keys.py    From professional-services with Apache License 2.0
def expand(self, pcoll):
        return (
            pcoll
            | 'Extract Primary Key' >>
            beam.FlatMap(lambda row: [(row[self.primary_key], row)])
            | 'Sample n=1 by Primary Key' >> CombinePerKey(SampleCombineFn(1))
            | 'Drop keys' >> beam.FlatMap(lambda kv: kv[1])) 
Example #20
Source File: create_data.py    From conversational-datasets with Apache License 2.0
def _shuffle_examples(examples):
    examples |= ("add random key" >> beam.Map(
        lambda example: (uuid.uuid4(), example)))
    examples |= ("group by key" >> beam.GroupByKey())
    examples |= ("get shuffled values" >> beam.FlatMap(lambda t: t[1]))
    return examples 
Example #21
Source File: create_data.py    From conversational-datasets with Apache License 2.0
def _shuffle_examples(examples):
    examples |= "add random key" >> beam.Map(
        lambda example: (uuid.uuid4(), example)
    )
    examples |= "group by key" >> beam.GroupByKey()
    examples |= "get shuffled values" >> beam.FlatMap(lambda t: t[1])
    return examples 
Example #22
Source File: create_data.py    From conversational-datasets with Apache License 2.0
def _shuffle(pcollection):
    """Shuffles the input pcollection."""
    pcollection |= "add random key" >> beam.Map(
        lambda value: (uuid.uuid4(), value))
    pcollection |= "group by key" >> beam.GroupByKey()
    pcollection |= "get shuffled values" >> beam.FlatMap(lambda t: t[1])
    return pcollection 
Example #23
Source File: cache_tasks_main.py    From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pipeline):
    return (
        pipeline
        | beam.Create(self.files)
        | beam.FlatMap(self._emit_tokenized_examples)
        | beam.Reshuffle())  # Allows for additional parallelization. 
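A hedged, self-contained sketch of the fan-out pattern above (emit_records is a hypothetical stand-in for _emit_tokenized_examples): FlatMap expands each file name into many records, and the trailing Reshuffle breaks fusion so downstream stages can be rebalanced across the expanded data.

import apache_beam as beam

def emit_records(filename):
  # Hypothetical stand-in: yield several records per input file.
  for i in range(3):
    yield '{}:record-{}'.format(filename, i)

with beam.Pipeline() as p:
  _ = (p
       | beam.Create(['a.txt', 'b.txt'])
       | beam.FlatMap(emit_records)
       | beam.Reshuffle()  # lets the runner redistribute the expanded elements
       | beam.Map(print))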
Example #24
Source File: jackknife.py    From model-analysis with Apache License 2.0
def expand(self, sliced_extracts):

    def partition_fn(_, num_partitions):
      return self._random_state.randint(num_partitions)

    # Partition the data
    # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
    partitions = (
        sliced_extracts
        | 'Partition' >> beam.Partition(partition_fn,
                                        self._num_jackknife_samples))

    def add_partition_index(slice_key,
                            accumulator_and_size,
                            partition_index=None):
      accumulator, size = accumulator_and_size
      return slice_key, _PartitionInfo(accumulator, size, partition_index)

    # Within each partition, partially combine per slice key to get accumulators
    # and partition sizes; add partition_id for determinism.
    # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
    partition_accumulators = []
    for i, partition in enumerate(partitions):
      partition_accumulators.append(
          partition
          | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
              beam.transforms.combiners.SingleInputTupleCombineFn(
                  _AccumulateOnlyCombiner(combiner=self._combiner),
                  beam.transforms.combiners.CountCombineFn()))
          | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
              add_partition_index, i))

    # Group partitions for the same slice, compute LOO metrics, and flatten back
    # into per-partition LOO metrics.
    # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
    return (partition_accumulators
            | 'FlattenPartitionAccumulators' >> beam.Flatten()
            | 'CollectPerSlicePartitions' >> beam.GroupByKey()
            | 'MakeJackknifeSamples' >> beam.FlatMap(
                _make_jackknife_samples, combiner=self._combiner)) 
Example #25
Source File: unbatch_extractor.py    From model-analysis with Apache License 2.0
def _UnbatchInputs(
    extracts: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Extracts unbatched inputs from batched extracts.

  Args:
    extracts: PCollection containing batched extracts.

  Returns:
    PCollection of per-example extracts.
  """
  return extracts | 'UnbatchInputs' >> beam.FlatMap(_ExtractUnbatchedInputs) 
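For orientation, a minimal sketch of the unbatching idea (an illustration only; _split_batch is a hypothetical stand-in, not TFMA's _ExtractUnbatchedInputs): a FlatMap that turns one batched dict of columns into one dict per row.

import apache_beam as beam

def _split_batch(batch):
  # Hypothetical helper: emit one per-example dict for each row of the batch.
  size = len(next(iter(batch.values())))
  for i in range(size):
    yield {key: values[i] for key, values in batch.items()}

with beam.Pipeline() as p:
  _ = (p
       | beam.Create([{'f': [1, 2, 3], 'g': ['a', 'b', 'c']}])
       | 'UnbatchInputs' >> beam.FlatMap(_split_batch)
       | beam.Map(print))  # {'f': 1, 'g': 'a'}, {'f': 2, 'g': 'b'}, {'f': 3, 'g': 'c'}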
Example #26
Source File: embedding_generator.py    From hub with Apache License 2.0
def run(args):
  """Runs the embedding generation Beam pipeline."""

  if tf.io.gfile.exists(args.embed_output_dir):
    print('Removing embedding output directory...')
    tf.io.gfile.rmtree(args.embed_output_dir)
  print('Creating empty output directory...')
  tf.io.gfile.makedirs(args.embed_output_dir)

  options = beam.options.pipeline_options.PipelineOptions(**vars(args))

  original_dim = hub.load(args.module_url)(['']).shape[1]

  random_projection_matrix = generate_random_projection_weights(
      original_dim, args.projected_dim, args.embed_output_dir)

  print('Starting the Beam pipeline...')
  with beam.Pipeline(runner=_RUNNER, options=options) as pipeline:
    _ = (
        pipeline
        | 'Read sentences from files' >>
        beam.io.ReadFromText(file_pattern=args.data_file_pattern)
        | 'Batch elements' >> util.BatchElements(
            min_batch_size=_BATCH_SIZE // 2, max_batch_size=_BATCH_SIZE)
        | 'Generate embeddings' >> beam.Map(
            generate_embeddings, args.module_url, random_projection_matrix)
        | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
        | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
            file_path_prefix='{}/emb'.format(args.embed_output_dir),
            file_name_suffix='.tfrecords')
    )

  print('Beam pipeline completed.') 
Example #27
Source File: linters.py    From data-linter with Apache License 2.0
def _lint(self, examples):
    """Returns the result of the TailedDistributionDetector linter.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [feature names]
        lint_samples: [
          [stats: {min: feature_min if outlying, max: feature_max if outlying}]
          for each warning
        ]
    """

    feature_values = (
        examples
        | 'FlattenFeatureValue' >> beam.FlatMap(
            self._flatten_feature_vals(self.numeric_features)))

    feature_min_trimmed_mean = (
        feature_values | self._make_trimmed_averager(self._MIN))
    feature_max_trimmed_mean = (
        feature_values | self._make_trimmed_averager(self._MAX))

    return (
        (feature_min_trimmed_mean, feature_max_trimmed_mean)
        | 'MergeTrimmedMeans' >> beam.CoGroupByKey()
        | 'AsList' >> beam.combiners.ToList()
        | 'ToResult' >> beam.Map(self._to_result)) 
Example #28
Source File: transform.py    From pydatalab with Apache License 2.0
def shuffle(pcoll):  # pylint: disable=invalid-name
  import random
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Example #29
Source File: preprocess.py    From cloudml-samples with Apache License 2.0
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  import random
  return (pcoll
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
Example #30
Source File: preprocess.py    From cloudml-samples with Apache License 2.0
def _Shuffle(pcoll):  # pylint: disable=invalid-name
  """Shuffles a PCollection."""
  import random
  return (pcoll
          | 'PairWithRand' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRand' >> beam.GroupByKey()
          | 'DropRand' >> beam.FlatMap(lambda kv: kv[1]))