Python apache_beam.CoGroupByKey() Examples
The following are 7 code examples of apache_beam.CoGroupByKey().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
apache_beam, or try the search function.
Example #1
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def _join_x_counts(
    join_info: Tuple[_SlicedXKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _ConditionalYRate]]:
  """Emits a conditional y-rate for each (x, y) pair grouped under one x.

  The input is one element of a CoGroupByKey result keyed by
  (slice_key, x_path, x). The 'x_count' stream holds at most one element —
  the number of examples in the slice where x appears in x_path — while the
  'xy_counts' stream holds (y, xy_count) pairs for that same x. If the
  'x_count' stream is empty (e.g. x was dropped by an upstream min_x_count
  filter), nothing is emitted.

  Args:
    join_info: A single CoGroupByKey result of the form
      ((slice, x_path, x), {'x_count': [...], 'xy_counts': [...]}).

  Yields:
    Tuples of the form (_SlicedYKey(slice, y),
    _ConditionalYRate(x_path, x, xy_count, x_count)).
  """
  key, grouped = join_info
  x_count_values = grouped['x_count']
  if not x_count_values:
    # x did not survive the upstream count filter; emit nothing for it.
    return
  x_count = x_count_values[0]
  for y, xy_count in grouped['xy_counts']:
    y_key = _SlicedYKey(key.slice_key, y)
    cond_rate = _ConditionalYRate(
        x_path=key.x_path, x=key.x, xy_count=xy_count, x_count=x_count)
    yield y_key, cond_rate
Example #2
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def _join_example_counts(
    join_info: Tuple[types.SliceKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _YRate]]:
  """Pairs every y-count within a slice with the slice's example count.

  The input is one element of a CoGroupByKey result keyed by slice_key.
  The 'example_count' stream holds a single element, the total number of
  examples in the slice; the 'y_counts' stream holds (y, y_count) pairs
  giving how often each y value appears in that slice.

  Args:
    join_info: A single CoGroupByKey result of the form
      (slice_key, {'example_count': [...], 'y_counts': [...]}).

  Yields:
    Tuples of the form (_SlicedYKey(slice, y),
    _YRate(y_count, example_count)).
  """
  slice_key, grouped = join_info
  total_examples = grouped['example_count'][0]
  for y_value, y_count in grouped['y_counts']:
    yield _SlicedYKey(slice_key, y_value), _YRate(y_count, total_examples)
Example #3
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def _compute_lifts(
    join_info: Tuple[_SlicedYKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Sequence[Union[_YRate, _ConditionalYRate]]
) -> Iterator[Tuple[_SlicedFeatureKey, _LiftInfo]]:
  """Computes lift = P(y|x) / P(y) for every x paired with a given y.

  The input is one element of a CoGroupByKey result keyed by
  (slice_key, y). The 'y_rate' stream holds a single _YRate for that y,
  and the 'conditional_y_rate' stream holds every _ConditionalYRate
  (one per x across all x_paths) for the same y.

  Args:
    join_info: A single CoGroupByKey result of the form
      ((slice_key, y), {'y_rate': [...], 'conditional_y_rate': [...]}).

  Yields:
    Tuples of the form (_SlicedFeatureKey(slice_key, x_path),
    _LiftInfo(x, y, lift, xy_count, x_count, y_count)).
  """
  (slice_key, y), grouped = join_info
  y_rate = grouped['y_rate'][0]
  for cond in grouped['conditional_y_rate']:
    # lift = P(y | x) / P(y), both estimated from counts within the slice.
    p_y_given_x = float(cond.xy_count) / cond.x_count
    p_y = float(y_rate.y_count) / y_rate.example_count
    lift = p_y_given_x / p_y
    feature_key = _SlicedFeatureKey(slice_key, cond.x_path)
    yield feature_key, _LiftInfo(
        x=cond.x,
        y=y,
        lift=lift,
        xy_count=cond.xy_count,
        x_count=cond.x_count,
        y_count=y_rate.y_count)
Example #4
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def expand(self, x_counts_and_ys: Tuple[Tuple[_SlicedXKey, _CountType], _SlicedYKey]): x_counts, y_keys = x_counts_and_ys # slice, y y_keys_by_slice = ( y_keys | 'MoveYToValue_YKey' >> beam.Map(lambda k: (k.slice_key, k.y))) # slice, (x_path, x, x_count) x_counts_by_slice = ( x_counts | 'MoveXToValue_XCountsKey' >> beam.MapTuple( lambda k, v: (k.slice_key, (k.x_path, k.x, v)))) # _SlicedXYKey(slice, x_path, x, y), 0 return ( { 'y_keys': y_keys_by_slice, 'x_counts': x_counts_by_slice } | 'CoGroupByForPlaceholderYRates' >> beam.CoGroupByKey() | 'CrossXYValues' >> beam.FlatMap(_cross_join_y_keys)) # No typehint for input, since it's a multi-input PTransform for which Beam # doesn't yet support typehints (BEAM-3280).
Example #5
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def expand(
    self,
    sliced_record_batchs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Builds the lift-statistics pipeline over sliced record batches.

  Computes per-slice rates P(Y=y) and conditional rates P(Y=y | X=x),
  joins the two streams on their shared _SlicedYKey(slice, y) key with a
  CoGroupByKey, computes lift values, keeps the top/bottom k lifts per y,
  regroups by (slice, x_path), and serializes the result to feature-stats
  protos.

  Args:
    sliced_record_batchs: A PCollection of sliced record batches.

  Returns:
    A PCollection of dataset feature statistics protos (one per slice,
    as produced by _make_dataset_feature_stats_proto).
  """
  # Compute P(Y=y)
  # _SlicedYKey(slice, y), _YRate(y_count, example_count)
  y_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
      self._y_path, self._y_boundaries, self._weight_column_name)
  y_keys = y_rates | 'ExtractYKeys' >> beam.Keys()

  # Compute P(Y=y | X=x)
  # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
  conditional_y_rates = ((sliced_record_batchs, y_keys)
                         | 'GetConditionalYRates' >> _GetConditionalYRates(
                             self._y_path, self._y_boundaries, self._x_paths,
                             self._min_x_count, self._weight_column_name))

  return (
      {
          'conditional_y_rate': conditional_y_rates,
          'y_rate': y_rates
      }
      | 'CoGroupByForLift' >> beam.CoGroupByKey()
      | 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
      | 'FilterLifts' >> _FilterLifts(self._top_k_per_y, self._bottom_k_per_y)
      | 'GroupLiftsForOutput' >> beam.GroupByKey()
      | 'MakeProtos' >> beam.Map(_make_dataset_feature_stats_proto,
                                 self._y_path, self._y_boundaries,
                                 self._weight_column_name is not None,
                                 self._output_custom_stats))
Example #6
Source File: linters.py From data-linter with Apache License 2.0 | 5 votes |
def _lint(self, examples):
  """Returns the result of the TailedDistributionDetector linter.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `PTransform` that yields a `LintResult` of the format
      warnings: [feature names]
      lint_samples: [
        [stats: {min: feature_min if outlying, max: feature_max if outlying}]
        for each warning
      ]
  """
  # One (feature, value) element per numeric feature value in each example.
  feature_values = (
      examples
      | 'FlattenFeatureValue' >> beam.FlatMap(
          self._flatten_feature_vals(self.numeric_features)))
  # Trimmed means of the per-example minima and maxima respectively.
  # NOTE(review): assumes _make_trimmed_averager keys its output by feature
  # name so CoGroupByKey can align the two streams -- confirm in its impl.
  feature_min_trimmed_mean = (
      feature_values | self._make_trimmed_averager(self._MIN))
  feature_max_trimmed_mean = (
      feature_values | self._make_trimmed_averager(self._MAX))
  return (
      (feature_min_trimmed_mean, feature_max_trimmed_mean)
      | 'MergeTrimmedMeans' >> beam.CoGroupByKey()
      | 'AsList' >> beam.combiners.ToList()
      | 'ToResult' >> beam.Map(self._to_result))
Example #7
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 4 votes |
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch, _SlicedYKey]): sliced_record_batchs, y_keys = sliced_record_batchs_and_ys # _SlicedXYKey(slice, x_path, x, y), xy_count partial_copresence_counts = ( sliced_record_batchs | 'ToPartialCopresenceCounts' >> beam.FlatMap( _to_partial_copresence_counts, self._y_path, self._x_paths, self._y_boundaries, self._weight_column_name)) # Compute placerholder copresence counts. # partial_copresence_counts will only include x-y pairs that are present, # but we would also like to keep track of x-y pairs that never appear, as # long as x and y independently occur in the slice. # _SlicedXKey(slice, x_path, x), x_count x_counts = ( sliced_record_batchs | 'ToPartialXCounts' >> beam.FlatMap( _to_partial_x_counts, self._x_paths, self._weight_column_name) | 'SumXCounts' >> beam.CombinePerKey(sum)) if self._min_x_count: x_counts = x_counts | 'FilterXCounts' >> beam.Filter( lambda kv: kv[1] > self._min_x_count) # _SlicedXYKey(slice, x_path, x, y), 0 placeholder_copresence_counts = ( (x_counts, y_keys) | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts( self._x_paths, self._min_x_count)) def move_y_to_value(key, xy_count): return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count) # _SlicedXKey(slice, x_path, x), (y, xy_count) copresence_counts = ( (placeholder_copresence_counts, partial_copresence_counts) | 'FlattenCopresenceCounts' >> beam.Flatten() | 'SumCopresencePairs' >> beam.CombinePerKey(sum) | 'MoveYToValue' >> beam.MapTuple(move_y_to_value)) # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count) return ({ 'x_count': x_counts, 'xy_counts': copresence_counts } | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey() | 'JoinXCounts' >> beam.FlatMap(_join_x_counts))