Python apache_beam.Partition() Examples

The following are 9 code examples of apache_beam.Partition(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the apache_beam module.
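As a quick orientation before the examples: beam.Partition takes a partition function with the signature (element, num_partitions) -> int plus the number of partitions, and returns a tuple of PCollections, one per partition, which can be indexed or unpacked. The minimal sketch below is illustrative and not taken from any of the projects that follow; the element values and the parity-based split are assumptions made for demonstration.

import apache_beam as beam

def by_parity(element, num_partitions):
    # Must return an integer in [0, num_partitions).
    return element % num_partitions

with beam.Pipeline() as p:
    numbers = p | 'CreateNumbers' >> beam.Create([1, 2, 3, 4, 5, 6])
    # beam.Partition returns a tuple of PCollections, one per partition.
    evens, odds = numbers | 'SplitByParity' >> beam.Partition(by_parity, 2)
    evens | 'PrintEvens' >> beam.Map(lambda x: print('even:', x))
    odds | 'PrintOdds' >> beam.Map(lambda x: print('odd:', x))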
Example #1
Source File: shard_variants_test.py    From gcp-variant-transforms with Apache License 2.0
def test_shard_variants(self):
    expected_shards = self._get_expected_variant_shards()
    variants = [variant
                for variant_list in expected_shards.values()
                for variant in variant_list]

    sharding = variant_sharding.VariantSharding(
        'gcp_variant_transforms/data/sharding_configs/'
        'homo_sapiens_default.yaml')
    pipeline = TestPipeline()
    shards = (
        pipeline
        | Create(variants, reshuffle=False)
        | 'ShardVariants' >> beam.Partition(
            shard_variants.ShardVariants(sharding),
            sharding.get_num_shards()))
    for i in range(sharding.get_num_shards()):
      assert_that(shards[i], equal_to(expected_shards.get(i, [])),
                  label=str(i))
    pipeline.run() 
Example #2
Source File: preprocess.py    From cloudml-samples with Apache License 2.0
def split_data(examples, train_fraction):
    """Splits the data into train/eval.

    Args:
      examples: A PCollection.
      train_fraction: fraction of examples to keep in the train set (float).
    """

    def partition_fn(data, n_partition):
        random_value = random.random()
        if random_value < train_fraction:
            return 0
        return 1

    examples_split = (examples
                      | "SplitData" >> beam.Partition(partition_fn, 2))
    return examples_split 
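A brief usage sketch for split_data above, assuming a hypothetical caller; the input PCollection and the 0.8 fraction are illustrative. Because beam.Partition returns a tuple, the two resulting PCollections can be unpacked directly.

import apache_beam as beam

with beam.Pipeline() as p:
    examples = p | 'CreateExamples' >> beam.Create(range(100))
    # split_data returns a 2-tuple of PCollections: (train, eval).
    train_examples, eval_examples = split_data(examples, train_fraction=0.8)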
Example #3
Source File: base_example_gen_executor.py    From tfx with Apache License 2.0
def _PartitionFn(
    record: Union[tf.train.Example, tf.train.SequenceExample, bytes],
    num_partitions: int,
    buckets: List[int],
    split_config: example_gen_pb2.SplitConfig,
) -> int:
  """Partition function for the ExampleGen's output splits."""
  assert num_partitions == len(
      buckets), 'Partitions do not match bucket number.'
  partition_str = _GeneratePartitionKey(record, split_config)
  bucket = int(hashlib.sha256(partition_str).hexdigest(), 16) % buckets[-1]
  # For example, if buckets is [10,50,80], there will be 3 splits:
  #   bucket >=0 && < 10, returns 0
  #   bucket >=10 && < 50, returns 1
  #   bucket >=50 && < 80, returns 2
  return bisect.bisect(buckets, bucket) 
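To make the comment about buckets concrete, here is a small standalone sketch of the same bisect arithmetic; the bucket boundaries come from the comment above and the sample bucket values are illustrative.

import bisect

buckets = [10, 50, 80]  # cumulative boundaries for three splits
for bucket in (7, 37, 64):
    # 7 -> split 0, 37 -> split 1, 64 -> split 2
    print(bucket, '->', bisect.bisect(buckets, bucket))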
Example #4
Source File: variant_to_bigquery.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
    bq_rows = pcoll | 'ConvertToBigQueryTableRow' >> beam.ParDo(
        ConvertVariantToRow(
            self._bigquery_row_generator,
            self._allow_incompatible_records,
            self._omit_empty_sample_calls))
    if self._num_bigquery_write_shards > 1:
      # We split data into self._num_bigquery_write_shards random partitions
      # and then write each part to final BQ by appending them together.
      # Combined with LimitWrite transform, this will avoid the BQ failure.
      bq_row_partitions = bq_rows | beam.Partition(
          lambda _, n: random.randint(0, n - 1),
          self._num_bigquery_write_shards)
      bq_writes = []
      for i in range(self._num_bigquery_write_shards):
        bq_rows = (bq_row_partitions[i] | 'LimitWrite' + str(i) >>
                   limit_write.LimitWrite(_WRITE_SHARDS_LIMIT))
        bq_writes.append(
            bq_rows | 'WriteToBigQuery' + str(i) >>
            beam.io.Write(beam.io.BigQuerySink(
                self._output_table,
                schema=self._schema,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_NEVER),
                write_disposition=(
                    beam.io.BigQueryDisposition.WRITE_APPEND))))
      return bq_writes
    else:
      return (bq_rows
              | 'WriteToBigQuery' >> beam.io.Write(beam.io.BigQuerySink(
                  self._output_table,
                  schema=self._schema,
                  create_disposition=(
                      beam.io.BigQueryDisposition.CREATE_NEVER),
                  write_disposition=(
                      beam.io.BigQueryDisposition.WRITE_APPEND
                      if self._append
                      else beam.io.BigQueryDisposition.WRITE_EMPTY)))) 
Example #5
Source File: _preprocess.py    From pydatalab with Apache License 2.0
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (train_preprocessed |
                                             'Random Partition' >>
                                             beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (labels |
                 'Write labels' >>
                 beam.io.textio.WriteToText(labels_file, shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)
  # Make sure we write "latest" file after train and eval data are successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() |
      'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) |
      beam.Map(lambda path: job_id) |
      'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template='')) 
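The body of TrainEvalSplitPartitionFn is not shown here. For reference, a class-based partition function can be passed to beam.Partition by subclassing beam.PartitionFn and implementing partition_for; the sketch below is a hypothetical stand-in, not the pydatalab implementation, and the 70/30 split is an assumed ratio.

import random
import apache_beam as beam

class RandomTrainEvalPartitionFn(beam.PartitionFn):
    """Hypothetical partition function: ~70% of elements to train, rest to eval."""

    def partition_for(self, element, num_partitions):
        # Partition 0 is treated as train, partition 1 as eval.
        return 0 if random.random() < 0.7 else 1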
Example #6
Source File: deep_copy_test.py    From transform with Apache License 2.0
def testEachPTransformCopiedOnce(self):
    with beam.Pipeline() as p:
      created = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
      modified1 = (created
                   | 'Transform1' >> beam.Map(
                       lambda x: DeepCopyTest._CountingIdentityFn(
                           'Transform1', x)))
      partition_fn = lambda element, partitions: element[0] % partitions
      p1, p2 = (modified1
                | 'Partition' >> beam.Partition(partition_fn, 2))
      merged = (p1, p2) | 'Flatten1' >> beam.Flatten()
      modified2 = (merged
                   | 'Transform2' >> beam.Map(
                       lambda x: DeepCopyTest._CountingIdentityFn(
                           'Transform2', x)))

      copied = deep_copy.deep_copy(modified2)

      # Check that deep copy was performed.
      self.assertIsNot(copied.producer.inputs[0], modified2.producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                       modified2.producer.inputs[0].producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                       modified2.producer.inputs[0].producer.inputs[1])

    # Check counts of processed items.
    self.assertEqual(DeepCopyTest._counts['Transform1'], 4)
    self.assertEqual(DeepCopyTest._counts['Transform2'], 4) 
Example #7
Source File: base_example_gen_executor.py    From tfx with Apache License 2.0
def _GeneratePartitionKey(record: Union[tf.train.Example,
                                        tf.train.SequenceExample, bytes],
                          split_config: example_gen_pb2.SplitConfig) -> bytes:
  """Generates key for partition."""

  if not split_config.HasField('partition_feature_name'):
    if isinstance(record, bytes):
      return record
    return record.SerializeToString(deterministic=True)

  if isinstance(record, tf.train.Example):
    features = record.features.feature  # pytype: disable=attribute-error
  elif isinstance(record, tf.train.SequenceExample):
    features = record.context.feature  # pytype: disable=attribute-error
  else:
    raise RuntimeError('Split by `partition_feature_name` is only supported '
                       'for FORMAT_TF_EXAMPLE and FORMAT_TF_SEQUENCE_EXAMPLE '
                       'payload format.')

  # Use a feature for partitioning the examples.
  feature_name = split_config.partition_feature_name
  if feature_name not in features:
    raise RuntimeError('Feature name `{}` does not exist.'.format(feature_name))
  feature = features[feature_name]
  if not feature.HasField('kind'):
    raise RuntimeError('Partition feature does not contain any value.')
  if (not feature.HasField('bytes_list') and
      not feature.HasField('int64_list')):
    raise RuntimeError('Only `bytes_list` and `int64_list` features are '
                       'supported for partition.')
  return feature.SerializeToString(deterministic=True) 
Example #8
Source File: preprocess.py    From professional-services with Apache License 2.0
def _split_data(examples, train_fraction=constants.TRAIN_SIZE,
                val_fraction=constants.VAL_SIZE):
  """Splits the data into train/validation/test."""

  def partition_fn(*_):
    random_value = np.random.random()
    if random_value < train_fraction:
      return 0
    if random_value < train_fraction + val_fraction:
      return 1
    return 2

  examples_split = examples | "SplitData" >> beam.Partition(partition_fn, 3)
  return zip([constants.TRAIN, constants.VAL, constants.TEST], examples_split) 
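Because _split_data zips the splits with their names, a caller can iterate over (name, PCollection) pairs. The sketch below is a hypothetical usage, not part of the original project; the output path and WriteToText sink are illustrative.

import apache_beam as beam

# Hypothetical downstream use of _split_data: write each split under its name.
for name, split in _split_data(examples):
    _ = split | 'Write_{}'.format(name) >> beam.io.WriteToText(
        '/tmp/output/{}'.format(name))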
Example #9
Source File: jackknife.py    From model-analysis with Apache License 2.0
def expand(self, sliced_extracts):

    def partition_fn(_, num_partitions):
      return self._random_state.randint(num_partitions)

    # Partition the data
    # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
    partitions = (
        sliced_extracts
        | 'Partition' >> beam.Partition(partition_fn,
                                        self._num_jackknife_samples))

    def add_partition_index(slice_key,
                            accumulator_and_size,
                            partition_index=None):
      accumulator, size = accumulator_and_size
      return slice_key, _PartitionInfo(accumulator, size, partition_index)

    # Within each partition, partially combine per slice key to get accumulators
    # and partition sizes; add partition_id for determinism.
    # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
    partition_accumulators = []
    for i, partition in enumerate(partitions):
      partition_accumulators.append(
          partition
          | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
              beam.transforms.combiners.SingleInputTupleCombineFn(
                  _AccumulateOnlyCombiner(combiner=self._combiner),
                  beam.transforms.combiners.CountCombineFn()))
          | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
              add_partition_index, i))

    # Group partitions for the same slice, compute LOO metrics, and flatten back
    # into per-partition LOO metrics.
    # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
    return (partition_accumulators
            | 'FlattenPartitionAccumulators' >> beam.Flatten()
            | 'CollectPerSlicePartitions' >> beam.GroupByKey()
            | 'MakeJackknifeSamples' >> beam.FlatMap(
                _make_jackknife_samples, combiner=self._combiner))