Python apache_beam.Partition() Examples

The following are 9 code examples of apache_beam.Partition(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the apache_beam module.
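As a quick orientation before the examples: beam.Partition takes a partition function with the signature (element, num_partitions) -> int plus the number of partitions, and returns a tuple of PCollections, one per partition, which can be indexed or unpacked. The minimal sketch below is illustrative and not taken from any of the projects that follow; the element values and the parity-based split are assumptions made for demonstration.

import apache_beam as beam

def by_parity(element, num_partitions):
    # Must return an integer in [0, num_partitions).
    return element % num_partitions

with beam.Pipeline() as p:
    numbers = p | 'CreateNumbers' >> beam.Create([1, 2, 3, 4, 5, 6])
    # beam.Partition returns a tuple of PCollections, one per partition.
    evens, odds = numbers | 'SplitByParity' >> beam.Partition(by_parity, 2)
    evens | 'PrintEvens' >> beam.Map(lambda x: print('even:', x))
    odds | 'PrintOdds' >> beam.Map(lambda x: print('odd:', x))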
Example #1
Source File: shard_variants_test.py    From gcp-variant-transforms with Apache License 2.0
def test_shard_variants(self):
    expected_shards = self._get_expected_variant_shards()
    variants = [variant
                for variant_list in expected_shards.values()
                for variant in variant_list]

    sharding = variant_sharding.VariantSharding(
        'gcp_variant_transforms/data/sharding_configs/'
        'homo_sapiens_default.yaml')
    pipeline = TestPipeline()
    shards = (
        pipeline
        | Create(variants, reshuffle=False)
        | 'ShardVariants' >> beam.Partition(
            shard_variants.ShardVariants(sharding),
            sharding.get_num_shards()))
    for i in range(sharding.get_num_shards()):
      assert_that(shards[i], equal_to(expected_shards.get(i, [])),
                  label=str(i))
    pipeline.run() 
Example #2
Source File: preprocess.py    From cloudml-samples with Apache License 2.0
def split_data(examples, train_fraction):
    """Splits the data into train/eval.

    Args:
      examples: A PCollection.
      train_fraction: fraction of examples to keep in the train set (float).
    """

    def partition_fn(data, n_partition):
        random_value = random.random()
        if random_value < train_fraction:
            return 0
        return 1

    examples_split = (examples
                      | "SplitData" >> beam.Partition(partition_fn, 2))
    return examples_split 
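A brief usage sketch for split_data above, assuming a hypothetical caller; the input PCollection and the 0.8 fraction are illustrative. Because beam.Partition returns a tuple, the two resulting PCollections can be unpacked directly.

import apache_beam as beam

with beam.Pipeline() as p:
    examples = p | 'CreateExamples' >> beam.Create(range(100))
    # split_data returns a 2-tuple of PCollections: (train, eval).
    train_examples, eval_examples = split_data(examples, train_fraction=0.8)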
Example #3
Source File: base_example_gen_executor.py    From tfx with Apache License 2.0
def _PartitionFn(
    record: Union[tf.train.Example, tf.train.SequenceExample, bytes],
    num_partitions: int,
    buckets: List[int],
    split_config: example_gen_pb2.SplitConfig,
) -> int:
  """Partition function for the ExampleGen's output splits."""
  assert num_partitions == len(
      buckets), 'Partitions do not match bucket number.'
  partition_str = _GeneratePartitionKey(record, split_config)
  bucket = int(hashlib.sha256(partition_str).hexdigest(), 16) % buckets[-1]
  # For example, if buckets is [10,50,80], there will be 3 splits:
  #   bucket >=0 && < 10, returns 0
  #   bucket >=10 && < 50, returns 1
  #   bucket >=50 && < 80, returns 2
  return bisect.bisect(buckets, bucket) 
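To make the comment about buckets concrete, here is a small standalone sketch of the same bisect arithmetic; the bucket boundaries come from the comment above and the sample bucket values are illustrative.

import bisect

buckets = [10, 50, 80]  # cumulative boundaries for three splits
for bucket in (7, 37, 64):
    # 7 -> split 0, 37 -> split 1, 64 -> split 2
    print(bucket, '->', bisect.bisect(buckets, bucket))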
Example #4
Source File: variant_to_bigquery.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
    bq_rows = pcoll | 'ConvertToBigQueryTableRow' >> beam.ParDo(
        ConvertVariantToRow(
            self._bigquery_row_generator,
            self._allow_incompatible_records,
            self._omit_empty_sample_calls))
    if self._num_bigquery_write_shards > 1:
      # We split data into self._num_bigquery_write_shards random partitions
      # and then write each part to final BQ by appending them together.
      # Combined with LimitWrite transform, this will avoid the BQ failure.
      bq_row_partitions = bq_rows | beam.Partition(
          lambda _, n: random.randint(0, n - 1),
          self._num_bigquery_write_shards)
      bq_writes = []
      for i in range(self._num_bigquery_write_shards):
        bq_rows = (bq_row_partitions[i] | 'LimitWrite' + str(i) >>
                   limit_write.LimitWrite(_WRITE_SHARDS_LIMIT))
        bq_writes.append(
            bq_rows | 'WriteToBigQuery' + str(i) >>
            beam.io.Write(beam.io.BigQuerySink(
                self._output_table,
                schema=self._schema,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_NEVER),
                write_disposition=(
                    beam.io.BigQueryDisposition.WRITE_APPEND))))
      return bq_writes
    else:
      return (bq_rows
              | 'WriteToBigQuery' >> beam.io.Write(beam.io.BigQuerySink(
                  self._output_table,
                  schema=self._schema,
                  create_disposition=(
                      beam.io.BigQueryDisposition.CREATE_NEVER),
                  write_disposition=(
                      beam.io.BigQueryDisposition.WRITE_APPEND
                      if self._append
                      else beam.io.BigQueryDisposition.WRITE_EMPTY)))) 
Example #5
Source File: _preprocess.py    From pydatalab with Apache License 2.0
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (train_preprocessed |
                                             'Random Partition' >>
                                             beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (labels |
                 'Write labels' >>
                 beam.io.textio.WriteToText(labels_file, shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)
  # Make sure we write "latest" file after train and eval data are successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() |
      'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) |
      beam.Map(lambda path: job_id) |
      'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template='')) 
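The body of TrainEvalSplitPartitionFn is not shown here. For reference, a class-based partition function can be passed to beam.Partition by subclassing beam.PartitionFn and implementing partition_for; the sketch below is a hypothetical stand-in, not the pydatalab implementation, and the 70/30 split is an assumed ratio.

import random
import apache_beam as beam

class RandomTrainEvalPartitionFn(beam.PartitionFn):
    """Hypothetical partition function: ~70% of elements to train, rest to eval."""

    def partition_for(self, element, num_partitions):
        # Partition 0 is treated as train, partition 1 as eval.
        return 0 if random.random() < 0.7 else 1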
Example #6
Source File: deep_copy_test.py    From transform with Apache License 2.0
def testEachPTransformCopiedOnce(self):
    with beam.Pipeline() as p:
      created = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
      modified1 = (created
                   | 'Transform1' >> beam.Map(
                       lambda x: DeepCopyTest._CountingIdentityFn(
                           'Transform1', x)))
      partition_fn = lambda element, partitions: element[0] % partitions
      p1, p2 = (modified1
                | 'Partition' >> beam.Partition(partition_fn, 2))
      merged = (p1, p2) | 'Flatten1' >> beam.Flatten()
      modified2 = (merged
                   | 'Transform2' >> beam.Map(
                       lambda x: DeepCopyTest._CountingIdentityFn(
                           'Transform2', x)))

      copied = deep_copy.deep_copy(modified2)

      # Check that deep copy was performed.
      self.assertIsNot(copied.producer.inputs[0], modified2.producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                       modified2.producer.inputs[0].producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                       modified2.producer.inputs[0].producer.inputs[1])

    # Check counts of processed items.
    self.assertEqual(DeepCopyTest._counts['Transform1'], 4)
    self.assertEqual(DeepCopyTest._counts['Transform2'], 4) 
Example #7
Source File: base_example_gen_executor.py    From tfx with Apache License 2.0
def _GeneratePartitionKey(record: Union[tf.train.Example,
                                        tf.train.SequenceExample, bytes],
                          split_config: example_gen_pb2.SplitConfig) -> bytes:
  """Generates key for partition."""

  if not split_config.HasField('partition_feature_name'):
    if isinstance(record, bytes):
      return record
    return record.SerializeToString(deterministic=True)

  if isinstance(record, tf.train.Example):
    features = record.features.feature  # pytype: disable=attribute-error
  elif isinstance(record, tf.train.SequenceExample):
    features = record.context.feature  # pytype: disable=attribute-error
  else:
    raise RuntimeError('Split by `partition_feature_name` is only supported '
                       'for FORMAT_TF_EXAMPLE and FORMAT_TF_SEQUENCE_EXAMPLE '
                       'payload format.')

  # Use a feature for partitioning the examples.
  feature_name = split_config.partition_feature_name
  if feature_name not in features:
    raise RuntimeError('Feature name `{}` does not exist.'.format(feature_name))
  feature = features[feature_name]
  if not feature.HasField('kind'):
    raise RuntimeError('Partition feature does not contain any value.')
  if (not feature.HasField('bytes_list') and
      not feature.HasField('int64_list')):
    raise RuntimeError('Only `bytes_list` and `int64_list` features are '
                       'supported for partition.')
  return feature.SerializeToString(deterministic=True) 
Example #8
Source File: preprocess.py    From professional-services with Apache License 2.0
def _split_data(examples, train_fraction=constants.TRAIN_SIZE,
                val_fraction=constants.VAL_SIZE):
  """Splits the data into train/validation/test."""

  def partition_fn(*_):
    random_value = np.random.random()
    if random_value < train_fraction:
      return 0
    if random_value < train_fraction + val_fraction:
      return 1
    return 2

  examples_split = examples | "SplitData" >> beam.Partition(partition_fn, 3)
  return zip([constants.TRAIN, constants.VAL, constants.TEST], examples_split) 
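Because _split_data zips the splits with their names, a caller can iterate over (name, PCollection) pairs. The sketch below is a hypothetical usage, not part of the original project; the output path and WriteToText sink are illustrative.

import apache_beam as beam

# Hypothetical downstream use of _split_data: write each split under its name.
for name, split in _split_data(examples):
    _ = split | 'Write_{}'.format(name) >> beam.io.WriteToText(
        '/tmp/output/{}'.format(name))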
Example #9
Source File: jackknife.py    From model-analysis with Apache License 2.0
def expand(self, sliced_extracts):

    def partition_fn(_, num_partitions):
      return self._random_state.randint(num_partitions)

    # Partition the data
    # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
    partitions = (
        sliced_extracts
        | 'Partition' >> beam.Partition(partition_fn,
                                        self._num_jackknife_samples))

    def add_partition_index(slice_key,
                            accumulator_and_size,
                            partition_index=None):
      accumulator, size = accumulator_and_size
      return slice_key, _PartitionInfo(accumulator, size, partition_index)

    # Within each partition, partially combine per slice key to get accumulators
    # and partition sizes; add partition_id for determinism.
    # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
    partition_accumulators = []
    for i, partition in enumerate(partitions):
      partition_accumulators.append(
          partition
          | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
              beam.transforms.combiners.SingleInputTupleCombineFn(
                  _AccumulateOnlyCombiner(combiner=self._combiner),
                  beam.transforms.combiners.CountCombineFn()))
          | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
              add_partition_index, i))

    # Group partitions for the same slice, compute LOO metrics, and flatten back
    # into per-partition LOO metrics.
    # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
    return (partition_accumulators
            | 'FlattenPartitionAccumulators' >> beam.Flatten()
            | 'CollectPerSlicePartitions' >> beam.GroupByKey()
            | 'MakeJackknifeSamples' >> beam.FlatMap(
                _make_jackknife_samples, combiner=self._combiner))