Python apache_beam.CombinePerKey() Examples
The following are 30 code examples of apache_beam.CombinePerKey(), collected from open-source projects. The project and source file for each example are listed above it.
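beam.CombinePerKey applies a combining function (a CombineFn or a plain callable such as sum) to the values of a keyed PCollection, producing one combined output per key. As a minimal, self-contained sketch (not taken from any of the projects below; the keys and values are illustrative):

import apache_beam as beam

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'Create' >> beam.Create([('a', 1), ('a', 2), ('b', 5)])
      # Sum all values that share the same key: yields ('a', 3) and ('b', 5).
      | 'SumPerKey' >> beam.CombinePerKey(sum)
      | 'Print' >> beam.Map(print))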
Example #1
Source File: partitioned_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Estimates the user defined statistic."""
  return (
      pcoll
      | 'AssignBatchToPartition' >> beam.Map(
          _assign_to_partition, num_partitions=self._num_partitions)
      | 'GroupPartitionsIntoList' >> beam.CombinePerKey(
          beam.combiners.SampleCombineFn(self._max_batches_per_partition))
      | 'ProcessPartition' >> beam.Map(_process_partition,
                                       stats_fn=self._stats_fn)
      | 'ComputeMetaStats' >> beam.CombinePerKey(
          PartitionedStatisticsAnalyzer(min_partitions_stat_presence=self
                                        ._min_partitions_stat_presence)))
Example #2
Source File: linters.py From data-linter with Apache License 2.0 | 5 votes |
def _count_transformer(self):
  return (
      'DropNaN' >> beam.Filter(lambda (f_v, _): not np.isnan(f_v[1]))
      | 'IsIntegral' >> beam.Map(
          lambda (f_v, c): ((f_v[0], f_v[1] % 1 == 0), c))
      | 'Count' >> beam.CombinePerKey(sum))
Example #3
Source File: evaluator.py From model-analysis with Apache License 2.0 | 5 votes |
def combine_dict_based_evaluations(
    evaluations: Dict[Text, List[beam.pvalue.PCollection]]) -> Evaluation:
  """Combines multiple evaluation outputs together when the outputs are dicts.

  Note that the dict here refers to the output in the PCollection. The
  evaluations themselves are dicts of PCollections keyed by category
  ('metrics', 'plots', 'analysis', etc). This util is used to group the
  outputs of one or more of these evaluations where the PCollections
  themselves must be dicts. For example, a 'metrics' evaluation might store
  its output in PCollection of dicts containing metric keys and metric
  values. This util would be used to group the outputs from running two or
  more independent metrics evaluations together into a single PCollection.

  Args:
    evaluations: Dict of lists of PCollections of outputs from different
      evaluators keyed by type of output ('metrics', 'plots', 'analysis',
      etc).

  Returns:
    Dict of consolidated PCollections of outputs keyed by type of output.
  """
  result = {}
  for k, v in evaluations.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))
  return result
Example #4
Source File: aggregate.py From model-analysis with Apache License 2.0 | 5 votes |
def ComputePerSliceMetrics(  # pylint: disable=invalid-name
    slice_result: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    compute_with_sampling: Optional[bool] = False,
    random_seed_for_testing: Optional[int] = None) -> beam.pvalue.PCollection:
  """PTransform for computing, aggregating and combining metrics.

  Args:
    slice_result: Incoming PCollection consisting of slice key and extracts.
    eval_shared_model: Shared model parameters for EvalSavedModel.
    desired_batch_size: Optional batch size for batching in Aggregate.
    compute_with_sampling: True to compute with sampling.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    PCollection of (slice key, dict of metrics).
  """
  # TODO(b/123516222): Remove this workaround per discussions in CL/227944001
  slice_result.element_type = beam.typehints.Any

  return (
      slice_result
      # _ModelLoadingIdentityFn loads the EvalSavedModel into memory
      # under a shared handle that can be used by subsequent steps.
      # Combiner lifting and producer-consumer fusion should ensure
      # that these steps run in the same process and memory space.
      # TODO(b/69566045): Remove _ModelLoadingIdentityFn and move model
      # loading to CombineFn.setup after it is available in Beam.
      | 'LoadModel' >> beam.ParDo(
          _ModelLoadingIdentityFn(eval_shared_model=eval_shared_model))
      | 'CombinePerSlice' >> beam.CombinePerKey(
          _AggregateCombineFn(
              eval_shared_model=eval_shared_model,
              desired_batch_size=desired_batch_size,
              compute_with_sampling=compute_with_sampling,
              seed_for_testing=random_seed_for_testing))
      | 'InterpretOutput' >> beam.ParDo(
          _ExtractOutputDoFn(eval_shared_model=eval_shared_model)))
Example #5
Source File: jackknife.py From model-analysis with Apache License 2.0 | 5 votes |
def expand(self, sliced_derived_values_and_diffs):
  return (sliced_derived_values_and_diffs
          | 'MoveJackknifeSampleIdToValue' >> beam.MapTuple(
              _move_jackknife_sample_id_to_value)
          | 'CombineJackknifeSamplesPerSlice' >> beam.CombinePerKey(
              _JackknifeSampleCombiner(
                  num_jackknife_samples=self._num_jackknife_samples,
                  skip_ci_metric_keys=self._skip_ci_metric_keys)))
Example #6
Source File: jackknife.py From model-analysis with Apache License 2.0 | 5 votes |
def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))
Example #7
Source File: metrics_and_plots_evaluator_v2.py From model-analysis with Apache License 2.0 | 5 votes |
def _GroupByQueryKey(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    query_key: Text,
) -> beam.pvalue.PCollection:
  """PTransform for grouping extracts by a query key.

  Args:
    extracts: Incoming PCollection consisting of extracts.
    query_key: Query key to group extracts by. Must be a member of the dict of
      features stored under tfma.FEATURES_KEY.

  Returns:
    PCollection of lists of extracts where each list is associated with same
    query key.
  """
  missing_query_key_counter = beam.metrics.Metrics.counter(
      constants.METRICS_NAMESPACE, 'missing_query_key')

  def key_by_query_key(extracts: types.Extracts,
                       query_key: Text) -> Tuple[Text, types.Extracts]:
    """Extract the query key from the extract and key by that."""
    value = metric_util.to_scalar(
        util.get_by_keys(
            extracts, [constants.FEATURES_KEY, query_key], optional=True),
        tensor_name=query_key)
    if value is None:
      missing_query_key_counter.inc()
      return ('', extracts)
    return ('{}'.format(value), extracts)

  # pylint: disable=no-value-for-parameter
  return (extracts
          | 'KeyByQueryId' >> beam.Map(key_by_query_key, query_key)
          | 'GroupByKey' >> beam.CombinePerKey(beam.combiners.ToListCombineFn())
          | 'DropQueryId' >> beam.Map(lambda kv: kv[1]))
Example #8
Source File: squared_pearson_correlation_test.py From model-analysis with Apache License 2.0 | 5 votes |
def testSquaredPearsonCorrelationMetricsWithNan(self):
  computations = (
      squared_pearson_correlation.SquaredPearsonCorrelation().computations())
  metric = computations[0]

  example = {
      'labels': np.array([0.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertIn(key, got_metrics)
        self.assertTrue(math.isnan(got_metrics[key]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #9
Source File: example_count_test.py From model-analysis with Apache License 2.0 | 5 votes |
def testExampleCount(self):
  metric = example_count.ExampleCount().computations()[0]

  example1 = {}
  example2 = {}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2])
        | 'Process' >> beam.ParDo(metric.preprocessor)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        self.assertDictElementsAlmostEqual(got_metrics,
                                           {example_count_key: 2})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #10
Source File: tf_metric_wrapper_test.py From model-analysis with Apache License 2.0 | 5 votes |
def testCustomTFMetric(self):
  metric = tf_metric_wrapper.tf_metric_computations([_CustomMetric()])[0]

  example1 = {'labels': [0.0], 'predictions': [0.2], 'example_weights': [1.0]}
  example2 = {'labels': [0.0], 'predictions': [0.8], 'example_weights': [1.0]}
  example3 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [2.0]}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(metric.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        custom_key = metric_types.MetricKey(name='custom')
        self.assertDictElementsAlmostEqual(
            got_metrics,
            {custom_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0)})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #11
Source File: tf_metric_wrapper_test.py From model-analysis with Apache License 2.0 | 5 votes |
def testSparseMetric(self):
  computation = tf_metric_wrapper.tf_metric_computations([
      tf.keras.metrics.SparseCategoricalCrossentropy(
          name='sparse_categorical_crossentropy')
  ])[0]

  # Simulate a multi-class problem with 3 labels.
  example = {
      'labels': [1],
      'predictions': [0.3, 0.6, 0.1],
      'example_weights': [1.0]
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric_types.MetricKey(name='sparse_categorical_crossentropy')
        # 0*log(0.3) - 1*log(0.6) - 0*log(0.1) = 0.51
        self.assertDictElementsAlmostEqual(got_metrics, {key: 0.51083})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #12
Source File: tf_metric_wrapper_test.py From model-analysis with Apache License 2.0 | 5 votes |
def testSimpleMetric(self):
  computation = tf_metric_wrapper.tf_metric_computations(
      [tf.keras.metrics.MeanSquaredError(name='mse')])[0]

  example = {
      'labels': [0, 0, 1, 1],
      'predictions': [0, 0.5, 0.3, 0.9],
      'example_weights': [1.0]
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        mse_key = metric_types.MetricKey(name='mse')
        self.assertDictElementsAlmostEqual(got_metrics, {mse_key: 0.1875})
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #13
Source File: min_label_position_test.py From model-analysis with Apache License 2.0 | 5 votes |
def testRaisesErrorWhenExampleWeightsDiffer(self):
  with self.assertRaises(ValueError):
    metric = min_label_position.MinLabelPosition().computations(
        query_key='query')[0]

    query1_example1 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.2]),
        'example_weights': np.array([1.0]),
        'features': {
            'query': np.array(['query1'])
        }
    }
    query1_example2 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.8]),
        'example_weights': np.array([0.5]),
        'features': {
            'query': np.array(['query1'])
        }
    }

    def to_standard_metric_inputs_list(list_of_extracts):
      return [
          metric_util.to_standard_metric_inputs(e, True)
          for e in list_of_extracts
      ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([[query1_example1, query1_example2]])
          | 'Process' >> beam.Map(to_standard_metric_inputs_list)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'Combine' >> beam.CombinePerKey(metric.combiner))
Example #14
Source File: tjur_discrimination_test.py From model-analysis with Apache License 2.0 | 5 votes |
def testTjurDiscriminationMetricsWithNan(self, metric):
  computations = metric.computations()
  shared_metrics = computations[0]
  metric = computations[1]

  example = {
      'labels': np.array([0.0]),
      'predictions': np.array([1.0]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeWeightedTotals' >> beam.CombinePerKey(
            shared_metrics.combiner)
        | 'ComputeMetric' >> beam.Map(lambda x: (x[0], metric.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertIn(key, got_metrics)
        self.assertTrue(math.isnan(got_metrics[key]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #15
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def expand(self, sliced_record_batchs):
  # slice, example_count
  example_counts = (
      sliced_record_batchs
      | 'ToExampleCounts' >> beam.MapTuple(lambda k, v: (k, v.num_rows))
      | 'SumExampleCounts' >> beam.CombinePerKey(sum))

  def move_y_to_value(slice_and_y, y_count):
    slice_key, y = slice_and_y
    return slice_key, (y, y_count)

  # slice, (y, y_count)
  y_counts = (
      sliced_record_batchs
      | 'ToPartialYCounts' >> beam.FlatMap(_to_partial_counts, self._y_path,
                                           self._y_boundaries,
                                           self._weight_column_name)
      | 'SumYCounts' >> beam.CombinePerKey(sum)
      | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

  # _SlicedYKey(slice, y), _YRate(y_count, example_count)
  return ({
      'y_counts': y_counts,
      'example_count': example_counts
  }
          | 'CoGroupByForYRates' >> beam.CoGroupByKey()
          | 'JoinExampleCounts' >> beam.FlatMap(_join_example_counts))
Example #16
Source File: analyzer_impls.py From transform with Apache License 2.0 | 5 votes |
def expand(self, inputs):
  pcoll, = inputs
  # Create a PCollection of (count, element) pairs, then iterates over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  # TODO(b/112916494): Unify the graph in both cases once possible.
  if (self._vocab_ordering_type ==
      _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = _flatten_to_key_and_means_accumulator_list
    combine_transform = _MutualInformationTransformAccumulate()  # pylint: disable=no-value-for-parameter
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_FREQUENCY:
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_LABELS:
    flatten_map_fn = _flatten_value_and_labeled_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum_labeled_weights)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  result = (
      pcoll
      | 'FlattenTokensAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerToken' >> combine_transform)

  if self._input_dtype == tf.string:
    # TODO(b/62379925) Filter empty strings or strings containing the \n or \r
    # tokens since index_table_from_file doesn't allow empty rows.
    def is_problematic_string(kv):
      string, _ = kv  # Ignore counts.
      return string and b'\n' not in string and b'\r' not in string

    result |= 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)

  return result
Example #17
Source File: analyzer_impls.py From transform with Apache License 2.0 | 5 votes |
def expand(self, inputs):
  if (self._vocab_ordering_type ==
      _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    combine_transform = _MutualInformationTransformMerge(  # pylint: disable=no-value-for-parameter
        self._use_adjusted_mutual_info, self._min_diff_from_avg)
  elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_LABELS:
    combine_transform = beam.CombinePerKey(sum_labeled_weights)
  else:
    combine_transform = beam.CombinePerKey(sum)

  pcoll, = inputs

  return (pcoll
          | 'CountPerToken' >> combine_transform
          | 'SwapTokensAndCounts' >> beam.KvSwap())
Example #18
Source File: analyzer_impls.py From transform with Apache License 2.0 | 5 votes |
def _MutualInformationTransformAccumulate(pcol):  # pylint: disable=invalid-name
  """Accumulates information needed for mutual information computation."""
  return (pcol
          | 'VocabCountPerLabelPerTokenAccumulate' >> beam.CombinePerKey(
              _WeightedMeanCombineFn(output_shape=(None,))))
Example #19
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
def expand(self, pcoll):
  to_dict = lambda x: {x[0]: x[1]}
  example_counts = (
      pcoll
      | "count_examples" >> beam.combiners.Count.Globally()
      | "key_example_counts" >> beam.Map(lambda x: ("examples", x))
      | "example_count_dict" >> beam.Map(to_dict))

  def _count_tokens(pcoll, feat):
    return (
        pcoll
        | "key_%s_toks" % feat >> beam.Map(
            lambda x:  # pylint:disable=g-long-lambda
            ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))

  token_counts = (
      [_count_tokens(pcoll, feat) for feat in self._output_features]
      | "flatten_tokens" >> beam.Flatten()
      | "count_tokens" >> beam.CombinePerKey(sum)
      | "token_count_dict" >> beam.Map(to_dict))

  def _merge_dicts(dicts):
    merged_dict = {}
    for d in dicts:
      assert not set(merged_dict).intersection(d)
      merged_dict.update(d)
    return merged_dict

  return (
      [example_counts, token_counts]
      | "flatten_counts" >> beam.Flatten()
      | "merge_stats" >> beam.CombineGlobally(_merge_dicts))
Example #20
Source File: analyzer_impls.py From transform with Apache License 2.0 | 5 votes |
def _MutualInformationTransformMerge(  # pylint: disable=invalid-name
    pcol, use_adjusted_mutual_info, min_diff_from_avg):
  """Computes mutual information for each key using the given accumulators."""
  feature_accumulator_pcol = (
      pcol
      | 'VocabCountPerLabelPerTokenMerge' >> beam.CombinePerKey(
          _WeightedMeanCombineFn(output_shape=(None,))))

  accumulators_by_feature, global_accumulator = (
      feature_accumulator_pcol
      | 'ExtractSentinels' >> beam.FlatMap(_extract_sentinels).with_outputs(
          'feature', 'global'))
  if min_diff_from_avg is None:
    min_diff_from_avg = (
        global_accumulator
        | 'AutoMinDiffFromAvg' >> beam.Map(
            lambda acc: analyzers.calculate_recommended_min_diff_from_avg(  # pylint: disable=g-long-lambda
                acc.count * acc.weight)))
    min_diff_from_avg = beam.pvalue.AsSingleton(min_diff_from_avg)

  def _extract_merged_values(term, results):
    """Returns the key and tuple of (mutual information, frequency)."""
    # Ignore the second value, which is the Expected Mutual Info.
    (mi, _, frequency) = results
    return term, (mi, frequency)

  return (accumulators_by_feature
          | 'CalculateMutualInformationPerToken' >> beam.Map(
              _calculate_mutual_information_for_feature_value,
              beam.pvalue.AsSingleton(global_accumulator),
              use_adjusted_mutual_info=use_adjusted_mutual_info,
              min_diff_from_avg=min_diff_from_avg)
          | beam.MapTuple(_extract_merged_values))
Example #21
Source File: analyzer_impls.py From transform with Apache License 2.0 | 5 votes |
def expand(self, inputs):
  pcoll, = inputs

  return (
      pcoll
      | 'MergeCombinePerKey' >> beam.CombinePerKey(
          _CombinerWrapper(
              self._combiner,
              self._tf_config,
              is_combining_accumulators=True)))
Example #22
Source File: tf_metric_wrapper_test.py From model-analysis with Apache License 2.0 | 4 votes |
def testTFMetricWithClassID(self):
  computation = tf_metric_wrapper.tf_metric_computations(
      [tf.keras.metrics.MeanSquaredError(name='mse')],
      sub_key=metric_types.SubKey(class_id=1))[0]

  example1 = {
      'labels': [2],
      'predictions': [0.5, 0.0, 0.5],
      'example_weights': [1.0]
  }
  example2 = {
      'labels': [0],
      'predictions': [0.2, 0.5, 0.3],
      'example_weights': [1.0]
  }
  example3 = {
      'labels': [1],
      'predictions': [0.2, 0.3, 0.5],
      'example_weights': [1.0]
  }
  example4 = {
      'labels': [1],
      'predictions': [0.0, 0.9, 0.1],
      'example_weights': [1.0]
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        mse_key = metric_types.MetricKey(
            name='mse', sub_key=metric_types.SubKey(class_id=1))
        self.assertDictElementsAlmostEqual(got_metrics, {
            mse_key: 0.1875,
        })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #23
Source File: stats_impl.py From data-validation with Apache License 2.0 | 4 votes |
def expand(self, dataset: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  # Handles generators by their type:
  #   - CombinerStatsGenerators will be wrapped in a single CombinePerKey by
  #     _CombinerStatsGeneratorsCombineFn.
  #   - TransformStatsGenerator will be invoked separately with `dataset`.
  combiner_stats_generators = []
  result_protos = []
  for generator in get_generators(self._options):
    if isinstance(generator, stats_generator.CombinerStatsGenerator):
      combiner_stats_generators.append(generator)
    elif isinstance(generator, stats_generator.TransformStatsGenerator):
      result_protos.append(
          dataset
          | generator.name >> generator.ptransform)
    else:
      raise TypeError('Statistics generator must extend one of '
                      'CombinerStatsGenerator or TransformStatsGenerator, '
                      'found object of type %s' %
                      generator.__class__.__name__)
  if combiner_stats_generators:
    # TODO(b/115685296): Obviate the need for explicit fanout.
    fanout = 5 * int(math.ceil(math.sqrt(len(combiner_stats_generators))))
    result_protos.append(
        dataset
        | 'RunCombinerStatsGenerators' >> beam.CombinePerKey(
            _CombinerStatsGeneratorsCombineFn(
                combiner_stats_generators,
                self._options.desired_batch_size)).with_hot_key_fanout(fanout))

  # result_protos is a list of PCollections of (slice key,
  # DatasetFeatureStatistics proto) pairs. We now flatten the list into a
  # single PCollection, combine the DatasetFeatureStatistics protos by key,
  # and then merge the DatasetFeatureStatistics protos in the PCollection
  # into a single DatasetFeatureStatisticsList proto.
  return (result_protos
          | 'FlattenFeatureStatistics' >> beam.Flatten()
          | 'MergeDatasetFeatureStatisticsProtos' >>
          beam.CombinePerKey(_merge_dataset_feature_stats_protos)
          | 'AddSliceKeyToStatsProto' >> beam.Map(_add_slice_key,
                                                  self._is_slicing_enabled)
          | 'ToList' >> beam.combiners.ToList()
          | 'MakeDatasetFeatureStatisticsListProto' >>
          beam.Map(_make_dataset_feature_statistics_list_proto))
Example #24
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 4 votes |
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch,
                                                    _SlicedYKey]):
  sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

  # _SlicedXYKey(slice, x_path, x, y), xy_count
  partial_copresence_counts = (
      sliced_record_batchs
      | 'ToPartialCopresenceCounts' >> beam.FlatMap(
          _to_partial_copresence_counts, self._y_path, self._x_paths,
          self._y_boundaries, self._weight_column_name))

  # Compute placeholder copresence counts.
  # partial_copresence_counts will only include x-y pairs that are present,
  # but we would also like to keep track of x-y pairs that never appear, as
  # long as x and y independently occur in the slice.

  # _SlicedXKey(slice, x_path, x), x_count
  x_counts = (
      sliced_record_batchs
      | 'ToPartialXCounts' >> beam.FlatMap(
          _to_partial_x_counts, self._x_paths, self._weight_column_name)
      | 'SumXCounts' >> beam.CombinePerKey(sum))
  if self._min_x_count:
    x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
        lambda kv: kv[1] > self._min_x_count)

  # _SlicedXYKey(slice, x_path, x, y), 0
  placeholder_copresence_counts = (
      (x_counts, y_keys)
      | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
          self._x_paths, self._min_x_count))

  def move_y_to_value(key, xy_count):
    return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

  # _SlicedXKey(slice, x_path, x), (y, xy_count)
  copresence_counts = (
      (placeholder_copresence_counts, partial_copresence_counts)
      | 'FlattenCopresenceCounts' >> beam.Flatten()
      | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
      | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

  # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
  return ({
      'x_count': x_counts,
      'xy_counts': copresence_counts
  }
          | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey()
          | 'JoinXCounts' >> beam.FlatMap(_join_x_counts))
Example #25
Source File: fairness_indicators_test.py From model-analysis with Apache License 2.0 | 4 votes |
def testFairessIndicatorsMetricsWithInput(self, input_examples,
                                          computations_kwargs,
                                          expected_result):
  # This is a parameterized test with following parameters.
  #   - input examples to be used in the test
  #   - parameters like model name etc.
  #   - expected result to assert on
  computations = fairness_indicators.FairnessIndicators(
      thresholds=[0.5]).computations(**computations_kwargs)
  histogram = computations[0]
  matrices = computations[1]
  metrics = computations[2]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(input_examples)
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
        | 'ComputeMetrics' >> beam.Map(
            lambda x: (x[0], metrics.result(x[1]))))  # pyformat: ignore
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, 8)  # 1 threshold * 8 metrics
        for metrics_key in expected_result:
          self.assertEqual(got_metrics[metrics_key],
                           expected_result[metrics_key])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

  # Todo(b/147497357): Add counter test once we have counter setup.
Example #26
Source File: fairness_indicators_test.py From model-analysis with Apache License 2.0 | 4 votes |
def testFairessIndicatorsMetricsWithThresholds(self, kwargs,
                                               expected_metrics_nums,
                                               expected_metrics_keys):
  # This is a parameterized test with following parameters.
  #   - metric parameters like thresholds.
  #   - expected number of metrics computed
  #   - expected list of metrics keys
  computations = fairness_indicators.FairnessIndicators(
      **kwargs).computations()
  histogram = computations[0]
  matrices = computations[1]
  metrics = computations[2]

  examples = [{
      'labels': np.array([0.0]),
      'predictions': np.array([0.1]),
      'example_weights': np.array([1.0]),
  }, {
      'labels': np.array([0.0]),
      'predictions': np.array([0.7]),
      'example_weights': np.array([3.0]),
  }]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(examples)
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
        | 'ComputeMetrics' >> beam.Map(
            lambda x: (x[0], metrics.result(x[1]))))  # pyformat: ignore
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, expected_metrics_nums)
        for metrics_key in expected_metrics_keys:
          self.assertIn(metrics_key, got_metrics)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #27
Source File: model_eval_lib.py From model-analysis with Apache License 2.0 | 4 votes |
def ExtractAndEvaluate(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection, extractors: List[extractor.Extractor],
    evaluators: List[evaluator.Evaluator]) -> evaluator.Evaluation:
  """Performs Extractions and Evaluations in provided order."""
  # evaluation[k] = list of values for k
  evaluation = {}

  def update(evaluation: Dict[Text, Any], new_evaluation: Dict[Text, Any]):
    for k, v in new_evaluation.items():
      if k not in evaluation:
        evaluation[k] = []
      evaluation[k].append(v)
    return evaluation

  # Run evaluators that run before extraction (i.e. that only require
  # the incoming input extract added by ReadInputs)
  for v in evaluators:
    if not v.run_after:
      update(evaluation, extracts | v.stage_name >> v.ptransform)
  for x in extractors:
    extracts = (extracts | x.stage_name >> x.ptransform)
    for v in evaluators:
      if v.run_after == x.stage_name:
        update(evaluation, extracts | v.stage_name >> v.ptransform)
  for v in evaluators:
    if v.run_after == extractor.LAST_EXTRACTOR_STAGE_NAME:
      update(evaluation, extracts | v.stage_name >> v.ptransform)

  # Merge multi-valued keys if necessary.
  result = {}
  for k, v in evaluation.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    # Note that we assume that if a key is multivalued, its values are
    # dictionaries with disjoint keys. The combined value will simply be the
    # disjoint union of all the dictionaries.
    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))

  return result
Example #28
Source File: analyzer_impls.py From transform with Apache License 2.0 | 4 votes |
def _split_inputs_by_key(batch_values):
  """Takes inputs where first input is a key, and returns (key, value) pairs.

  Takes inputs of the form (key, arg0, ..., arg{N-1}) where `key` is a vector
  and arg0, ..., arg{N-1} have dimension >1 with size in the first dimension
  matching `key`.

  It yields pairs of the form (key[i], [arg0[i], ..., arg{N-1}[i]]) for
  0 < i < len(key).

  Args:
    batch_values: A list of ndarrays representing the input from a batch.

  Yields:
    (key, args) pairs where args is a list of ndarrays.

  Raises:
    ValueError: if inputs do not have correct sizes.
  """
  # TODO(b/77873002): Raise these errors in the graph where more informative
  # errors can be generated. Keep these as a fallback for user-defined
  # `Combiner`s.
  keys = batch_values[0]
  if keys.ndim != 1:
    raise ValueError(
        'keys for CombinePerKey should have rank 1, got shape {}'.format(
            keys.shape))
  for arg_index, arg_values in enumerate(batch_values[1:]):
    if arg_values.ndim < 1:
      raise ValueError(
          'Argument {} for CombinePerKey should have rank >=1, '
          'got shape {}'.format(arg_index, arg_values.shape))
    if arg_values.shape[0] != keys.shape[0]:
      raise ValueError(
          'Argument {} had shape {} whose first dimension was not equal to '
          'the size of the keys vector ({})'.format(
              arg_index, arg_values.shape, keys.shape[0]))

  for instance_index, key in enumerate(keys):
    instance_args = [arg_values[instance_index]
                     for arg_values in batch_values[1:]]
    yield (key, instance_args)
Example #29
Source File: tf_metric_wrapper_test.py From model-analysis with Apache License 2.0 | 4 votes |
def testBatching(self):
  computation = tf_metric_wrapper.tf_metric_computations(
      [_CustomMetric(), tf.keras.metrics.MeanSquaredError(name='mse')],
      batch_size=2)[0]

  example1 = {'labels': [0.0], 'predictions': [0.0], 'example_weights': [1.0]}
  example2 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [1.0]}
  example3 = {'labels': [1.0], 'predictions': [0.3], 'example_weights': [1.0]}
  example4 = {'labels': [1.0], 'predictions': [0.9], 'example_weights': [1.0]}
  example5 = {'labels': [1.0], 'predictions': [0.5], 'example_weights': [0.0]}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(
            [example1, example2, example3, example4, example5])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        custom_key = metric_types.MetricKey(name='custom')
        mse_key = metric_types.MetricKey(name='mse')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                custom_key: (0.0 + 0.5 + 0.3 + 0.9 + 0.0) /
                            (1.0 + 1.0 + 1.0 + 1.0 + 0.0),
                mse_key: 0.1875,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
Example #30
Source File: tjur_discrimination_test.py From model-analysis with Apache License 2.0 | 4 votes |
def testTjuDicriminationMetricsWithWeights(self, metric, expected_value):
  computations = metric.computations()
  shared_metrics = computations[0]
  metric = computations[1]

  # Positive labels: 1.0 * 0.0 + 2.0 * 1.0 + 3.0 * 1.0 + 4.0 * 0.0 = 5.0
  # Negative labels: 1.0 * 1.0 + 2.0 * 0.0 + 3.0 * 0.0 + 4.0 * 1.0 = 5.0
  # Positive predictions: 1.0 * 0.0 * 0.8 + 2.0 * 1.0 * 0.3 + 3.0 * 1.0 * 0.9
  #                       + 4.0 * 0.0 * 0.2 = 3.3
  # Negative predictions: 1.0 * 1.0 * 0.8 + 2.0 * 0.0 * 0.7 + 3.0 * 0.0 * 0.1
  #                       + 4.0 * 1.0 * 0.2 = 1.6
  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([2.0]),
  }
  example3 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([3.0]),
  }
  example4 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([4.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeWeightedTotals' >> beam.CombinePerKey(
            shared_metrics.combiner)
        | 'ComputeMetric' >> beam.Map(lambda x: (x[0], metric.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: expected_value}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')