Python apache_beam.Partition() Examples
The following are 9 code examples of apache_beam.Partition(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam, or try the search function.
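Before the project samples below, here is a minimal standalone sketch of the transform itself (the element values and label names are invented for illustration): the partition function receives each element together with the number of partitions and must return an index in the range [0, num_partitions), and beam.Partition returns one output PCollection per index, which can be unpacked as a tuple.

import apache_beam as beam

def by_parity(element, num_partitions):
  # Must return an integer in [0, num_partitions); here 0 for even, 1 for odd.
  return element % num_partitions

with beam.Pipeline() as pipeline:
  numbers = pipeline | 'CreateNumbers' >> beam.Create([1, 2, 3, 4, 5, 6])
  # beam.Partition yields one PCollection per partition index.
  evens, odds = numbers | 'SplitByParity' >> beam.Partition(by_parity, 2)
  _ = evens | 'PrintEvens' >> beam.Map(print)
  _ = odds | 'PrintOdds' >> beam.Map(print)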
Example #1
Source File: shard_variants_test.py From gcp-variant-transforms with Apache License 2.0 | 6 votes |
def test_shard_variants(self):
  expected_shards = self._get_expected_variant_shards()
  variants = [variant
              for variant_list in expected_shards.values()
              for variant in variant_list]

  sharding = variant_sharding.VariantSharding(
      'gcp_variant_transforms/data/sharding_configs/'
      'homo_sapiens_default.yaml')
  pipeline = TestPipeline()
  shards = (
      pipeline
      | Create(variants, reshuffle=False)
      | 'ShardVariants' >> beam.Partition(
          shard_variants.ShardVariants(sharding),
          sharding.get_num_shards()))
  for i in range(sharding.get_num_shards()):
    assert_that(shards[i], equal_to(expected_shards.get(i, [])), label=str(i))
  pipeline.run()
Example #2
Source File: preprocess.py From cloudml-samples with Apache License 2.0 | 6 votes |
def split_data(examples, train_fraction):
  """Splits the data into train/eval.

  Args:
    examples: A PCollection.
    train_fraction: fraction of examples to keep in the train set (float).
  """

  def partition_fn(data, n_partition):
    random_value = random.random()
    if random_value < train_fraction:
      return 0
    return 1

  examples_split = (examples |
                    "SplitData" >> beam.Partition(partition_fn, 2))
  return examples_split
Example #3
Source File: base_example_gen_executor.py From tfx with Apache License 2.0 | 6 votes |
def _PartitionFn(
    record: Union[tf.train.Example, tf.train.SequenceExample, bytes],
    num_partitions: int,
    buckets: List[int],
    split_config: example_gen_pb2.SplitConfig,
) -> int:
  """Partition function for the ExampleGen's output splits."""
  assert num_partitions == len(
      buckets), 'Partitions do not match bucket number.'
  partition_str = _GeneratePartitionKey(record, split_config)
  bucket = int(hashlib.sha256(partition_str).hexdigest(), 16) % buckets[-1]
  # For example, if buckets is [10,50,80], there will be 3 splits:
  #   bucket >=0 && < 10, returns 0
  #   bucket >=10 && < 50, returns 1
  #   bucket >=50 && < 80, returns 2
  return bisect.bisect(buckets, bucket)
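As a quick standalone check of the bucket-to-split mapping described in the comment above (this snippet is illustrative and not part of the TFX source):

import bisect

buckets = [10, 50, 80]
assert bisect.bisect(buckets, 5) == 0    # 0 <= bucket < 10  -> split 0
assert bisect.bisect(buckets, 30) == 1   # 10 <= bucket < 50 -> split 1
assert bisect.bisect(buckets, 79) == 2   # 50 <= bucket < 80 -> split 2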
Example #4
Source File: variant_to_bigquery.py From gcp-variant-transforms with Apache License 2.0 | 5 votes |
def expand(self, pcoll):
  bq_rows = pcoll | 'ConvertToBigQueryTableRow' >> beam.ParDo(
      ConvertVariantToRow(
          self._bigquery_row_generator,
          self._allow_incompatible_records,
          self._omit_empty_sample_calls))
  if self._num_bigquery_write_shards > 1:
    # We split data into self._num_bigquery_write_shards random partitions
    # and then write each part to final BQ by appending them together.
    # Combined with LimitWrite transform, this will avoid the BQ failure.
    bq_row_partitions = bq_rows | beam.Partition(
        lambda _, n: random.randint(0, n - 1),
        self._num_bigquery_write_shards)
    bq_writes = []
    for i in range(self._num_bigquery_write_shards):
      bq_rows = (bq_row_partitions[i] | 'LimitWrite' + str(i) >>
                 limit_write.LimitWrite(_WRITE_SHARDS_LIMIT))
      bq_writes.append(
          bq_rows | 'WriteToBigQuery' + str(i) >>
          beam.io.Write(beam.io.BigQuerySink(
              self._output_table,
              schema=self._schema,
              create_disposition=(
                  beam.io.BigQueryDisposition.CREATE_NEVER),
              write_disposition=(
                  beam.io.BigQueryDisposition.WRITE_APPEND))))
    return bq_writes
  else:
    return (bq_rows
            | 'WriteToBigQuery' >> beam.io.Write(beam.io.BigQuerySink(
                self._output_table,
                schema=self._schema,
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_NEVER),
                write_disposition=(
                    beam.io.BigQueryDisposition.WRITE_APPEND
                    if self._append
                    else beam.io.BigQueryDisposition.WRITE_EMPTY))))
Example #5
Source File: _preprocess.py From pydatalab with Apache License 2.0 | 5 votes |
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (
        train_preprocessed |
        'Random Partition' >> beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (labels |
                 'Write labels' >> beam.io.textio.WriteToText(labels_file, shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)
  # Make sure we write "latest" file after train and eval data are successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save] |
   'Wait for train eval saving' >> beam.Flatten() |
   'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) |
   beam.Map(lambda path: job_id) |
   'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template=''))
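The TrainEvalSplitPartitionFn used above is defined elsewhere in pydatalab and is not shown in this sample. As a rough sketch of the pattern only (the class body and the eval_fraction parameter below are assumptions, not the pydatalab implementation), a class-based partition function subclasses beam.PartitionFn and implements partition_for, returning 0 for the train output and 1 for the eval output to match the unpacking above:

import random
import apache_beam as beam

class TrainEvalSplitPartitionFn(beam.PartitionFn):
  """Hypothetical sketch: routes a random fraction of elements to eval."""

  def __init__(self, eval_fraction=0.3):  # assumed parameter, not from pydatalab
    self._eval_fraction = eval_fraction

  def partition_for(self, element, num_partitions):
    # 0 -> train, 1 -> eval, matching the (train, eval) unpacking above.
    return 1 if random.random() < self._eval_fraction else 0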
Example #6
Source File: deep_copy_test.py From transform with Apache License 2.0 | 5 votes |
def testEachPTransformCopiedOnce(self):
  with beam.Pipeline() as p:
    created = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
    modified1 = (created
                 | 'Transform1' >> beam.Map(
                     lambda x: DeepCopyTest._CountingIdentityFn(
                         'Transform1', x)))
    partition_fn = lambda element, partitions: element[0] % partitions
    p1, p2 = (modified1
              | 'Partition' >> beam.Partition(partition_fn, 2))
    merged = (p1, p2) | 'Flatten1' >> beam.Flatten()
    modified2 = (merged
                 | 'Transform2' >> beam.Map(
                     lambda x: DeepCopyTest._CountingIdentityFn(
                         'Transform2', x)))
    copied = deep_copy.deep_copy(modified2)

    # Check that deep copy was performed.
    self.assertIsNot(copied.producer.inputs[0], modified2.producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                     modified2.producer.inputs[0].producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                     modified2.producer.inputs[0].producer.inputs[1])

  # Check counts of processed items.
  self.assertEqual(DeepCopyTest._counts['Transform1'], 4)
  self.assertEqual(DeepCopyTest._counts['Transform2'], 4)
Example #7
Source File: base_example_gen_executor.py From tfx with Apache License 2.0 | 5 votes |
def _GeneratePartitionKey(record: Union[tf.train.Example,
                                        tf.train.SequenceExample, bytes],
                          split_config: example_gen_pb2.SplitConfig) -> bytes:
  """Generates key for partition."""

  if not split_config.HasField('partition_feature_name'):
    if isinstance(record, bytes):
      return record
    return record.SerializeToString(deterministic=True)

  if isinstance(record, tf.train.Example):
    features = record.features.feature  # pytype: disable=attribute-error
  elif isinstance(record, tf.train.SequenceExample):
    features = record.context.feature  # pytype: disable=attribute-error
  else:
    raise RuntimeError('Split by `partition_feature_name` is only supported '
                       'for FORMAT_TF_EXAMPLE and FORMAT_TF_SEQUENCE_EXAMPLE '
                       'payload format.')

  # Use a feature for partitioning the examples.
  feature_name = split_config.partition_feature_name
  if feature_name not in features:
    raise RuntimeError('Feature name `{}` does not exist.'.format(feature_name))
  feature = features[feature_name]
  if not feature.HasField('kind'):
    raise RuntimeError('Partition feature does not contain any value.')
  if (not feature.HasField('bytes_list') and
      not feature.HasField('int64_list')):
    raise RuntimeError('Only `bytes_list` and `int64_list` features are '
                       'supported for partition.')
  return feature.SerializeToString(deterministic=True)
Example #8
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes |
def _split_data(examples, train_fraction=constants.TRAIN_SIZE,
                val_fraction=constants.VAL_SIZE):
  """Splits the data into train/validation/test."""

  def partition_fn(*_):
    random_value = np.random.random()
    if random_value < train_fraction:
      return 0
    if random_value < train_fraction + val_fraction:
      return 1
    return 2

  examples_split = examples | "SplitData" >> beam.Partition(partition_fn, 3)
  return zip([constants.TRAIN, constants.VAL, constants.TEST], examples_split)
Example #9
Source File: jackknife.py From model-analysis with Apache License 2.0 | 5 votes |
def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))