Python apache_beam.Flatten() Examples
The following are 23 code examples of apache_beam.Flatten(), collected from open-source projects. The source file, project, and license are noted above each example.
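Before the project examples, a minimal self-contained sketch may help: beam.Flatten() merges several PCollections of the same element type into one. Everything below (pipeline, labels, data) is illustrative rather than taken from any project on this page.

import apache_beam as beam

with beam.Pipeline() as p:
  evens = p | 'CreateEvens' >> beam.Create([2, 4, 6])
  odds = p | 'CreateOdds' >> beam.Create([1, 3, 5])
  # Flatten accepts a tuple (or list) of PCollections and emits their union.
  merged = (evens, odds) | 'MergeNumbers' >> beam.Flatten()
  merged | 'Print' >> beam.Map(print)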
Example #1
Source File: pipeline_common.py From gcp-variant-transforms with Apache License 2.0
def add_annotation_headers(pipeline, known_args, pipeline_mode, merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (
        pipeline
        | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
        | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders' >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example #2
Source File: data_linter.py From data-linter with Apache License 2.0
def expand(self, examples):
  """Runs the linters on the data and writes out the results.

  The order in which the linters run is unspecified.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Examples`.

  Returns:
    A pipeline containing the `DataLinter` `PTransform`s.
  """
  coders = (beam.coders.coders.StrUtf8Coder(),
            beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
  # NOTE: the tuple-unpacking lambda in 'DropEmpty' is Python 2 only syntax.
  return (
      [examples | linter for linter in self._linters if linter.should_run()]
      | 'MergeResults' >> beam.Flatten()
      | 'DropEmpty' >> beam.Filter(lambda (_, r): r and len(r.warnings))
      | 'ToDict' >> beam.combiners.ToDict()
      | 'WriteResults' >> beam.io.textio.WriteToText(
          self._results_path,
          coder=beam.coders.coders.PickleCoder(),
          shard_name_template=''))
Example #3
Source File: _util.py From pydatalab with Apache License 2.0
def get_sources_from_dataset(p, dataset, mode):
  """get pcollection from dataset."""
  import apache_beam as beam
  import csv
  from google.datalab.ml import CsvDataSet, BigQueryDataSet

  check_dataset(dataset, mode)
  if type(dataset) is CsvDataSet:
    source_list = []
    for ii, input_path in enumerate(dataset.files):
      source_list.append(
          p | 'Read from Csv %d (%s)' % (ii, mode) >>
          beam.io.ReadFromText(input_path, strip_trailing_newlines=True))
    # NOTE: `.next()` is Python 2; Python 3 would use `next(...)`.
    return (source_list
            | 'Flatten Sources (%s)' % mode >> beam.Flatten()
            | 'Create Dict from Csv (%s)' % mode >> beam.Map(
                lambda line: csv.DictReader(
                    [line], fieldnames=['image_url', 'label']).next()))
  elif type(dataset) is BigQueryDataSet:
    bq_source = (beam.io.BigQuerySource(table=dataset.table)
                 if dataset.table is not None
                 else beam.io.BigQuerySource(query=dataset.query))
    return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source)
  else:
    raise ValueError('Invalid DataSet. Expect CsvDataSet or BigQueryDataSet')
Example #4
Source File: lift_stats_generator.py From data-validation with Apache License 2.0
def expand(self, sliced_record_batchs: beam.pvalue.PCollection
          ) -> beam.pvalue.PCollection:
  unweighted_protos = (
      sliced_record_batchs
      | 'ComputeUnweightedLift' >> self._unweighted_generator)
  if not self._weight_column_name:
    # If no weight column name is given, only compute unweighted lift.
    return unweighted_protos

  weighted_protos = (
      sliced_record_batchs
      | 'ComputeWeightedLift' >> self._weighted_generator)
  return ((unweighted_protos, weighted_protos)
          | 'MergeUnweightedAndWeightedProtos' >> beam.Flatten())
Example #5
Source File: vcf_to_bq_preprocess.py From gcp-variant-transforms with Apache License 2.0
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  inferred_headers = (
      variants
      | 'FilterVariants' >> filter_variants.FilterVariants()
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          allow_incompatible_records=True,
          infer_headers=True))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header
Example #6
Source File: impl.py From transform with Apache License 2.0
def expand(self, pbegin):
  # TODO(b/151921205): we have to do an identity map for unmodified
  # PCollections below because otherwise we get an error from beam.
  identity_map = 'Identity' >> beam.Map(lambda x: x)
  if self._dataset_key.is_flattened_dataset_key():
    if self._flat_pcollection:
      return self._flat_pcollection | identity_map
    else:
      return (
          list(self._pcollection_dict.values())
          | 'FlattenAnalysisInputs' >> beam.Flatten(pipeline=pbegin.pipeline))
  else:
    return self._pcollection_dict[self._dataset_key] | identity_map
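A note on the pipeline= argument used above: when Flatten's inputs come from a Python list that could in principle be empty, Beam cannot infer which pipeline the transform belongs to, so it must be passed explicitly. A minimal sketch of the same idea, with illustrative names:

import apache_beam as beam

def flatten_dict_values(p, pcollection_dict):
  # With an empty dict there would be no input PCollections to infer the
  # pipeline from, hence the explicit pipeline argument.
  return (list(pcollection_dict.values())
          | 'FlattenInputs' >> beam.Flatten(pipeline=p))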
Example #7
Source File: evaluator.py From model-analysis with Apache License 2.0
def combine_dict_based_evaluations(
    evaluations: Dict[Text, List[beam.pvalue.PCollection]]) -> Evaluation:
  """Combines multiple evaluation outputs together when the outputs are dicts.

  Note that the dict here refers to the output in the PCollection. The
  evaluations themselves are dicts of PCollections keyed by category
  ('metrics', 'plots', 'analysis', etc). This util is used to group the
  outputs of one or more of these evaluations where the PCollections
  themselves must be dicts. For example, a 'metrics' evaluation might store
  its output in a PCollection of dicts containing metric keys and metric
  values. This util would be used to group the outputs from running two or
  more independent metrics evaluations together into a single PCollection.

  Args:
    evaluations: Dict of lists of PCollections of outputs from different
      evaluators keyed by type of output ('metrics', 'plots', 'analysis',
      etc).

  Returns:
    Dict of consolidated PCollections of outputs keyed by type of output.
  """
  result = {}
  for k, v in evaluations.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))
  return result
Example #8
Source File: jackknife.py From model-analysis with Apache License 2.0
def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))
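The partition-then-flatten round trip above is a reusable shape: beam.Partition splits one PCollection into several, each partition is processed under a unique stage label, and beam.Flatten merges the results back together. A stripped-down sketch, with invented data and labels:

import apache_beam as beam

with beam.Pipeline() as p:
  nums = p | beam.Create(range(10))
  # Split into three partitions by value.
  parts = nums | beam.Partition(lambda x, n: x % n, 3)
  # Process each partition under a unique label, then merge back.
  squared = [part | 'Square%d' % i >> beam.Map(lambda x: x * x)
             for i, part in enumerate(parts)]
  merged = squared | beam.Flatten()
  merged | beam.Map(print)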
Example #9
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pcoll):
  to_dict = lambda x: {x[0]: x[1]}
  example_counts = (
      pcoll
      | "count_examples" >> beam.combiners.Count.Globally()
      | "key_example_counts" >> beam.Map(lambda x: ("examples", x))
      | "example_count_dict" >> beam.Map(to_dict))

  def _count_tokens(pcoll, feat):
    return (
        pcoll
        | "key_%s_toks" % feat >> beam.Map(
            lambda x:  # pylint:disable=g-long-lambda
            ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))

  token_counts = (
      [_count_tokens(pcoll, feat) for feat in self._output_features]
      | "flatten_tokens" >> beam.Flatten()
      | "count_tokens" >> beam.CombinePerKey(sum)
      | "token_count_dict" >> beam.Map(to_dict))

  def _merge_dicts(dicts):
    merged_dict = {}
    for d in dicts:
      assert not set(merged_dict).intersection(d)
      merged_dict.update(d)
    return merged_dict

  return (
      [example_counts, token_counts]
      | "flatten_counts" >> beam.Flatten()
      | "merge_stats" >> beam.CombineGlobally(_merge_dicts))
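The core move in this example, emitting (key, count) pairs from several sources, flattening them, and reducing with beam.CombinePerKey(sum), also works in isolation. A minimal sketch with made-up keys and counts:

import apache_beam as beam

with beam.Pipeline() as p:
  a = p | 'A' >> beam.Create([('inputs_tokens', 4), ('inputs_tokens', 7)])
  b = p | 'B' >> beam.Create([('targets_tokens', 3)])
  totals = ([a, b]
            | 'FlattenCounts' >> beam.Flatten()
            | 'SumPerKey' >> beam.CombinePerKey(sum))
  totals | beam.Map(print)  # ('inputs_tokens', 11), ('targets_tokens', 3)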
Example #10
Source File: impl.py From transform with Apache License 2.0
def expand(self, inputs):
  return inputs | beam.Flatten()
Example #11
Source File: analyzer_impls.py From transform with Apache License 2.0
def expand(self, inputs):
  if self._top_k is not None and self._top_k < 0:
    raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                     '{}.'.format(self._top_k))
  if self._frequency_threshold is not None and self._frequency_threshold < 0:
    raise ValueError(
        'frequency_threshold for VocabularyImpl should be >= 0 or None, '
        'got {}.'.format(self._frequency_threshold))
  if self._coverage_top_k is not None and self._coverage_top_k < 0:
    raise ValueError('coverage_top_k for VocabularyImpl should be >= 0 or '
                     'None, got {}.'.format(self._coverage_top_k))
  if (self._coverage_frequency_threshold is not None and
      self._coverage_frequency_threshold < 0):
    raise ValueError(
        'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
        'None, got {}.'.format(self._coverage_frequency_threshold))

  pcoll, = inputs

  result = (
      pcoll
      | 'ApplyThresholdsAndTopK' >> (
          _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
              self._frequency_threshold, self._top_k,
              self._informativeness_threshold, None)))

  if self._key_fn:
    # Note: current APIs do not allow for specifying a coverage
    # informativeness threshold.
    coverage_counts = (
        pcoll
        | 'ApplyCoverageThresholdAndTopK' >> (
            _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
                self._coverage_frequency_threshold, self._coverage_top_k,
                self._coverage_informativeness_threshold, self._key_fn)))

    result = ((result, coverage_counts)
              | 'MergeStandardAndCoverageArms' >> beam.Flatten()
              | 'RemoveDuplicates' >> beam.RemoveDuplicates())

  return result
Example #12
Source File: sentiment_example.py From transform with Apache License 2.0
def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of beam assumes that if the first arg to a ptransform_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> beam.io.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> beam.io.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = (
      [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

  # Shuffle the data. Note that the data does in fact contain duplicate
  # reviews for reasons that are unclear. This means that NUM_TRAIN_INSTANCES
  # and NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'Distinct' >> beam.Distinct()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_KEY: p[0], LABEL_KEY: p[1]})
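Merging labeled datasets and deduplicating, as done above, reduces to Flatten followed by Distinct. A minimal sketch with invented reviews:

import apache_beam as beam

with beam.Pipeline() as p:
  neg = p | 'Neg' >> beam.Create([('bad movie', 0), ('bad movie', 0)])
  pos = p | 'Pos' >> beam.Create([('great movie', 1)])
  unique = ([neg, pos]
            | 'Merge' >> beam.Flatten()
            | 'Distinct' >> beam.Distinct())
  unique | beam.Map(print)  # the duplicate ('bad movie', 0) appears once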
Example #13
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0
def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  annotation_fields_to_infer = (known_args.annotation_fields
                                if known_args.infer_annotation_types else [])
  inferred_headers = (
      _read_variants(all_patterns,
                     pipeline,
                     known_args,
                     pipeline_mode,
                     pre_infer_headers=known_args.infer_headers)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          annotation_fields_to_infer))
  merged_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example #14
Source File: _preprocess.py From pydatalab with Apache License 2.0
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path,
                       output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path,
                                                labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path,
                                                 labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (
        train_preprocessed
        | 'Random Partition' >> beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (
      labels
      | 'Write labels' >> beam.io.textio.WriteToText(labels_file,
                                                     shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(
      output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(
      output_eval_path)

  # Make sure we write "latest" file after train and eval data are
  # successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save]
   | 'Wait for train eval saving' >> beam.Flatten()
   | 'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1)
   | beam.Map(lambda path: job_id)
   | 'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file,
                                                 shard_name_template=''))
Example #15
Source File: _preprocess.py From pydatalab with Apache License 2.0
def _labels_pipeline(sources):
  labels = (
      sources
      | 'Flatten Sources for labels' >> beam.Flatten()
      | 'Parse input for labels' >> beam.Map(lambda x: str(x['label']))
      | 'Combine labels' >> beam.transforms.combiners.Count.PerElement()
      | 'Get labels' >> beam.Map(lambda label_count: label_count[0]))
  return labels
Example #16
Source File: beam_reshuffle.py From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(
          root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
              file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
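Flatten itself does not redistribute data; following it with beam.Reshuffle(), as above, breaks fusion and rebalances the merged elements across workers before the expensive write. A minimal sketch of the same flatten-reshuffle-write shape (the output path is illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
  a = p | 'ReadA' >> beam.Create(['a1', 'a2'])
  b = p | 'ReadB' >> beam.Create(['b1'])
  merged = ([a, b]
            | 'Flatten' >> beam.Flatten()
            | 'Reshuffle' >> beam.Reshuffle())  # rebalance after merging
  merged | 'Write' >> beam.io.WriteToText('/tmp/merged')  # illustrative path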
Example #17
Source File: lift_stats_generator.py From data-validation with Apache License 2.0
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch,
                                                    _SlicedYKey]):
  sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

  # _SlicedXYKey(slice, x_path, x, y), xy_count
  partial_copresence_counts = (
      sliced_record_batchs
      | 'ToPartialCopresenceCounts' >> beam.FlatMap(
          _to_partial_copresence_counts, self._y_path, self._x_paths,
          self._y_boundaries, self._weight_column_name))

  # Compute placeholder copresence counts.
  # partial_copresence_counts will only include x-y pairs that are present,
  # but we would also like to keep track of x-y pairs that never appear, as
  # long as x and y independently occur in the slice.

  # _SlicedXKey(slice, x_path, x), x_count
  x_counts = (
      sliced_record_batchs
      | 'ToPartialXCounts' >> beam.FlatMap(
          _to_partial_x_counts, self._x_paths, self._weight_column_name)
      | 'SumXCounts' >> beam.CombinePerKey(sum))
  if self._min_x_count:
    x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
        lambda kv: kv[1] > self._min_x_count)

  # _SlicedXYKey(slice, x_path, x, y), 0
  placeholder_copresence_counts = (
      (x_counts, y_keys)
      | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
          self._x_paths, self._min_x_count))

  def move_y_to_value(key, xy_count):
    return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

  # _SlicedXKey(slice, x_path, x), (y, xy_count)
  copresence_counts = (
      (placeholder_copresence_counts, partial_copresence_counts)
      | 'FlattenCopresenceCounts' >> beam.Flatten()
      | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
      | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

  # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
  return ({'x_count': x_counts, 'xy_counts': copresence_counts}
          | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey()
          | 'JoinXCounts' >> beam.FlatMap(_join_x_counts))
Example #18
Source File: deep_copy_test.py From transform with Apache License 2.0
def testFlatten(self):
  with beam.Pipeline() as p:
    create_1 = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
    create_2 = p | 'Create2' >> beam.Create([(3, 'c')])
    created = (create_1, create_2) | 'Flatten1' >> beam.Flatten()
    grouped1 = (created
                | 'PreGroup1' >> beam.Map(
                    lambda x: DeepCopyTest._CountingIdentityFn('PreGroup1', x))
                | 'GBK1' >> beam.GroupByKey())
    grouped2 = (p
                | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
                | 'PreGroup2' >> beam.Map(
                    lambda x: DeepCopyTest._CountingIdentityFn('PreGroup2', x))
                | 'GBK2' >> beam.GroupByKey())
    modified1 = (
        grouped1
        | 'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1')))
    modified2 = (
        grouped2
        | 'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))
    flattened = (modified1, modified2) | 'Flatten2' >> beam.Flatten()
    modified3 = (
        flattened
        | 'Add3' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add3')))

    copied = deep_copy.deep_copy(modified3)

    # Check that deep copy was performed.
    self.assertIsNot(copied.producer.inputs[0], modified3.producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                     modified3.producer.inputs[0].producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                     modified3.producer.inputs[0].producer.inputs[1])

    # Check that copy stops at materialization boundary.
    self.assertIs(
        copied.producer.inputs[0].producer.inputs[0].producer.inputs[0],
        modified3.producer.inputs[0].producer.inputs[0].producer.inputs[0])
    self.assertIs(
        copied.producer.inputs[0].producer.inputs[1].producer.inputs[0],
        modified3.producer.inputs[0].producer.inputs[1].producer.inputs[0])

  # Check counts of processed items.
  self.assertEqual(DeepCopyTest._counts['PreGroup1'], 3)
  self.assertEqual(DeepCopyTest._counts['PreGroup2'], 3)
  self.assertEqual(DeepCopyTest._counts['Add1'], 6)
  self.assertEqual(DeepCopyTest._counts['Add2'], 6)
  self.assertEqual(DeepCopyTest._counts['Add3'], 12)
Example #19
Source File: executor.py From tfx with Apache License 2.0
def _run_model_inference(self, model_path: Text,
                         example_uris: Mapping[Text, Text],
                         output_path: Text,
                         model_spec: bulk_inferrer_pb2.ModelSpec) -> None:
  """Runs model inference on given example data.

  Args:
    model_path: Path to model.
    example_uris: Mapping of example split name to example uri.
    output_path: Path to output generated prediction logs.
    model_spec: bulk_inferrer_pb2.ModelSpec instance.

  Returns:
    None
  """
  try:
    from tfx_bsl.public.beam import run_inference
    from tfx_bsl.public.proto import model_spec_pb2
  except ImportError:
    # TODO(b/151468119): Remove this branch after next release.
    run_inference = importlib.import_module('tfx_bsl.beam.run_inference')
    model_spec_pb2 = importlib.import_module('tfx_bsl.proto.model_spec_pb2')
  saved_model_spec = model_spec_pb2.SavedModelSpec(
      model_path=model_path,
      tag=model_spec.tag,
      signature_name=model_spec.model_signature_name)
  # TODO(b/151468119): Remove this branch after next release.
  if getattr(model_spec_pb2, 'InferenceEndpoint', False):
    inference_endpoint = getattr(model_spec_pb2, 'InferenceEndpoint')()
  else:
    inference_endpoint = model_spec_pb2.InferenceSpecType()
  inference_endpoint.saved_model_spec.CopyFrom(saved_model_spec)
  with self._make_beam_pipeline() as pipeline:
    data_list = []
    for split, example_uri in example_uris.items():
      data = (
          pipeline
          | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
              file_pattern=io_utils.all_files_pattern(example_uri)))
      data_list.append(data)
    _ = (
        [data for data in data_list]
        | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
        | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
        | 'RunInference' >> run_inference.RunInference(inference_endpoint)
        | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
            output_path,
            file_name_suffix='.gz',
            coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))
  logging.info('Inference result written to %s.', output_path)
Example #20
Source File: model_eval_lib.py From model-analysis with Apache License 2.0
def ExtractAndEvaluate(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection, extractors: List[extractor.Extractor],
    evaluators: List[evaluator.Evaluator]) -> evaluator.Evaluation:
  """Performs Extractions and Evaluations in provided order."""
  # evaluation[k] = list of values for k
  evaluation = {}

  def update(evaluation: Dict[Text, Any], new_evaluation: Dict[Text, Any]):
    for k, v in new_evaluation.items():
      if k not in evaluation:
        evaluation[k] = []
      evaluation[k].append(v)
    return evaluation

  # Run evaluators that run before extraction (i.e. that only require
  # the incoming input extract added by ReadInputs)
  for v in evaluators:
    if not v.run_after:
      update(evaluation, extracts | v.stage_name >> v.ptransform)
  for x in extractors:
    extracts = (extracts | x.stage_name >> x.ptransform)
    for v in evaluators:
      if v.run_after == x.stage_name:
        update(evaluation, extracts | v.stage_name >> v.ptransform)
  for v in evaluators:
    if v.run_after == extractor.LAST_EXTRACTOR_STAGE_NAME:
      update(evaluation, extracts | v.stage_name >> v.ptransform)

  # Merge multi-valued keys if necessary.
  result = {}
  for k, v in evaluation.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    # Note that we assume that if a key is multivalued, its values are
    # dictionaries with disjoint keys. The combined value will simply be the
    # disjoint union of all the dictionaries.
    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))
  return result
Example #21
Source File: beam_prepare_embedding_inputs.py From exoplanet-ml with Apache License 2.0
def main(argv):
  del argv  # Unused.
  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the
    # results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)

      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))

      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  pipeline.run()
  logging.info("Preprocessing complete.")
Example #22
Source File: poisson_bootstrap.py From model-analysis with Apache License 2.0
def ComputeWithConfidenceIntervals(  # pylint: disable=invalid-name
    sliced_extracts: beam.pvalue.PCollection,
    compute_per_slice_metrics_cls: Type[beam.PTransform],
    num_bootstrap_samples: Optional[int] = DEFAULT_NUM_BOOTSTRAP_SAMPLES,
    random_seed_for_testing: Optional[int] = None,
    **kwargs) -> beam.pvalue.PCollection:
  """PTransform for computing metrics using T-Distribution values.

  Args:
    sliced_extracts: Incoming PCollection consisting of slice key and
      extracts.
    compute_per_slice_metrics_cls: PTransform class that takes a PCollection
      of (slice key, extracts) as input and returns (slice key, dict of
      metrics) as output. The class will be instantiated multiple times to
      compute metrics both with and without sampling. The class will be
      initialized using kwargs 'compute_with_sampling' and
      'random_seed_for_testing' along with any kwargs passed in **kwargs.
    num_bootstrap_samples: Number of replicas to use in calculating
      uncertainty using bootstrapping. If 1 is provided (default), aggregate
      metrics will be calculated with no uncertainty. If num_bootstrap_samples
      is > 0, multiple samples of each slice will be calculated using the
      Poisson bootstrap method. To calculate standard errors,
      num_bootstrap_samples should be 20 or more in order to provide useful
      data. More is better, but you pay a performance cost.
    random_seed_for_testing: Seed to use for unit testing, because
      nondeterministic tests stink. Each partition will use this value + i.
    **kwargs: Additional args to pass to compute_per_slice_metrics_cls init.

  Returns:
    PCollection of (slice key, dict of metrics)
  """
  if not num_bootstrap_samples:
    num_bootstrap_samples = 1
  # TODO(ckuhn): Cap the number of bootstrap samples at 20.
  if num_bootstrap_samples < 1:
    raise ValueError('num_bootstrap_samples should be > 0, got %d' %
                     num_bootstrap_samples)

  output_results = (
      sliced_extracts
      | 'ComputeUnsampledMetrics' >> compute_per_slice_metrics_cls(
          compute_with_sampling=False,
          random_seed_for_testing=None,
          **kwargs))
  if num_bootstrap_samples > 1:
    multicombine = []
    for i in range(num_bootstrap_samples):
      seed = (None if random_seed_for_testing is None else
              random_seed_for_testing + i)
      multicombine.append(
          sliced_extracts
          | 'ComputeSampledMetrics%d' % i >> compute_per_slice_metrics_cls(
              compute_with_sampling=True,
              random_seed_for_testing=seed,
              **kwargs))
    output_results = (
        multicombine
        | 'FlattenBootstrapPartitions' >> beam.Flatten()
        | 'GroupBySlice' >> beam.GroupByKey()
        | 'MergeBootstrap' >> beam.ParDo(_MergeBootstrap(),
                                         beam.pvalue.AsDict(output_results)))
  return output_results
Example #23
Source File: stats_impl.py From data-validation with Apache License 2.0
def expand(self, dataset: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  # Handles generators by their type:
  #   - CombinerStatsGenerators will be wrapped in a single CombinePerKey by
  #     _CombinerStatsGeneratorsCombineFn.
  #   - TransformStatsGenerator will be invoked separately with `dataset`.
  combiner_stats_generators = []
  result_protos = []
  for generator in get_generators(self._options):
    if isinstance(generator, stats_generator.CombinerStatsGenerator):
      combiner_stats_generators.append(generator)
    elif isinstance(generator, stats_generator.TransformStatsGenerator):
      result_protos.append(dataset | generator.name >> generator.ptransform)
    else:
      raise TypeError('Statistics generator must extend one of '
                      'CombinerStatsGenerator or TransformStatsGenerator, '
                      'found object of type %s' %
                      generator.__class__.__name__)
  if combiner_stats_generators:
    # TODO(b/115685296): Obviate the need for explicit fanout.
    fanout = 5 * int(math.ceil(math.sqrt(len(combiner_stats_generators))))
    result_protos.append(
        dataset
        | 'RunCombinerStatsGenerators' >> beam.CombinePerKey(
            _CombinerStatsGeneratorsCombineFn(
                combiner_stats_generators,
                self._options.desired_batch_size)).with_hot_key_fanout(fanout))

  # result_protos is a list of PCollections of (slice key,
  # DatasetFeatureStatistics proto) pairs. We now flatten the list into a
  # single PCollection, combine the DatasetFeatureStatistics protos by key,
  # and then merge the DatasetFeatureStatistics protos in the PCollection
  # into a single DatasetFeatureStatisticsList proto.
  return (result_protos
          | 'FlattenFeatureStatistics' >> beam.Flatten()
          | 'MergeDatasetFeatureStatisticsProtos' >> beam.CombinePerKey(
              _merge_dataset_feature_stats_protos)
          | 'AddSliceKeyToStatsProto' >> beam.Map(
              _add_slice_key, self._is_slicing_enabled)
          | 'ToList' >> beam.combiners.ToList()
          | 'MakeDatasetFeatureStatisticsListProto' >> beam.Map(
              _make_dataset_feature_statistics_list_proto))
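The tail of this pipeline, Flatten followed by a per-key merge and beam.combiners.ToList(), collapses many sliced results into a single-element PCollection. A minimal sketch with invented slice keys:

import apache_beam as beam

with beam.Pipeline() as p:
  stats_a = p | 'A' >> beam.Create([('slice1', 1)])
  stats_b = p | 'B' >> beam.Create([('slice1', 2), ('slice2', 5)])
  result = ([stats_a, stats_b]
            | 'Flatten' >> beam.Flatten()
            | 'MergePerKey' >> beam.CombinePerKey(sum)
            | 'ToList' >> beam.combiners.ToList())  # single-element PCollection
  result | beam.Map(print)  # e.g. [('slice1', 3), ('slice2', 5)] (order not guaranteed)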