Python apache_beam.CoGroupByKey() Examples

The following are 7 code examples of apache_beam.CoGroupByKey(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam, or try the search function.
Example #1
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def _join_x_counts(
    join_info: Tuple[_SlicedXKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _ConditionalYRate]]:
  """Pairs the count for one x value with every (y, xy_count) seen for it.

  The input is a single CoGroupByKey result. Its key exposes `slice_key`,
  `x_path` and `x`; its value is a dict holding two grouped streams:

    {'x_count': [x_count],
     'xy_counts': [(y_1, xy_1_count), ..., (y_k, xy_k_count)]}

  Only the first element of 'x_count' is used. When the 'x_count' stream is
  empty nothing is yielded (presumably because the x value was dropped
  upstream, e.g. by a minimum-count filter -- confirm against the caller).

  Args:
    join_info: A (key, grouped streams) pair produced by CoGroupByKey.

  Yields:
    One (_SlicedYKey(slice_key, y),
    _ConditionalYRate(x_path, x, xy_count, x_count)) pair per element of the
    'xy_counts' stream.
  """
  sliced_x_key, grouped = join_info
  x_count_stream = grouped['x_count']
  if x_count_stream:
    total_for_x = x_count_stream[0]
    for y_value, joint_count in grouped['xy_counts']:
      sliced_y_key = _SlicedYKey(sliced_x_key.slice_key, y_value)
      conditional_rate = _ConditionalYRate(
          x_path=sliced_x_key.x_path,
          x=sliced_x_key.x,
          xy_count=joint_count,
          x_count=total_for_x)
      yield sliced_y_key, conditional_rate
Example #2
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def _join_example_counts(
    join_info: Tuple[types.SliceKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _YRate]]:
  """Attaches a slice's example count to each y count in that slice.

  The input is a single CoGroupByKey result keyed by slice_key, whose value
  is a dict holding two grouped streams:

    {'example_count': [example_count],
     'y_counts': [(y_1, y_1_count), ..., (y_k, y_k_count)]}

  Only the first element of 'example_count' is used; the stream is indexed
  without an emptiness check, so it must contain at least one element.

  Args:
    join_info: A (slice_key, grouped streams) pair produced by CoGroupByKey.

  Yields:
    One (_SlicedYKey(slice_key, y), _YRate(y_count, example_count)) pair per
    element of the 'y_counts' stream.
  """
  slice_key, grouped = join_info
  slice_total = grouped['example_count'][0]
  yield from (
      (_SlicedYKey(slice_key, y_value), _YRate(count_for_y, slice_total))
      for y_value, count_for_y in grouped['y_counts'])
Example #3
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def _compute_lifts(
    join_info: Tuple[_SlicedYKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Sequence[Union[_YRate, _ConditionalYRate]]
) -> Iterator[Tuple[_SlicedFeatureKey, _LiftInfo]]:
  """Computes lift for every conditional rate grouped under one (slice, y).

  The input is a single CoGroupByKey result keyed by (slice_key, y), whose
  value is a dict holding two grouped streams:

    {'y_rate': [y_rate],
     'conditional_y_rate': [conditional_y_rate_1, ..., conditional_y_rate_n]}

  The first element of 'y_rate' supplies `y_count` and `example_count`; each
  element of 'conditional_y_rate' supplies `x_path`, `x`, `xy_count` and
  `x_count`. For each conditional rate the lift is

    (xy_count / x_count) / (y_count / example_count)

  No zero checks are performed, so a zero `x_count`, `example_count` or
  `y_count` raises ZeroDivisionError.

  Args:
    join_info: A ((slice_key, y), grouped streams) pair produced by
      CoGroupByKey.

  Yields:
    One (_SlicedFeatureKey(slice_key, x_path),
    _LiftInfo(x, y, lift, xy_count, x_count, y_count)) pair per element of
    the 'conditional_y_rate' stream.
  """
  (slice_key, y), grouped = join_info
  marginal = grouped['y_rate'][0]
  for conditional in grouped['conditional_y_rate']:
    # Both ratios are evaluated per element (not hoisted) so that an empty
    # 'conditional_y_rate' stream never triggers a division.
    rate_given_x = float(conditional.xy_count) / conditional.x_count
    overall_rate = float(marginal.y_count) / marginal.example_count
    lift = rate_given_x / overall_rate
    feature_key = _SlicedFeatureKey(slice_key, conditional.x_path)
    lift_info = _LiftInfo(
        x=conditional.x,
        y=y,
        lift=lift,
        xy_count=conditional.xy_count,
        x_count=conditional.x_count,
        y_count=marginal.y_count)
    yield feature_key, lift_info
Example #4
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def expand(
    self,
    x_counts_and_ys: Tuple[Tuple[_SlicedXKey, _CountType], _SlicedYKey]):
    """Co-groups x counts and y keys by slice and cross-joins them.

    Both inputs are re-keyed by `slice_key`, joined with CoGroupByKey under
    the tags 'y_keys' and 'x_counts', and each join result is expanded by
    `_cross_join_y_keys`.

    Args:
      x_counts_and_ys: A pair (x_counts, y_keys). Elements of x_counts are
        (key, count) pairs whose key exposes `slice_key`, `x_path` and `x`;
        elements of y_keys expose `slice_key` and `y`.

    Returns:
      The output of flat-mapping `_cross_join_y_keys` over the join results.
    """
    counts_per_x, sliced_y_keys = x_counts_and_ys

    def y_key_to_slice_pair(k):
      # -> slice, y
      return (k.slice_key, k.y)

    def x_count_to_slice_pair(k, v):
      # -> slice, (x_path, x, x_count)
      return (k.slice_key, (k.x_path, k.x, v))

    ys_keyed_by_slice = (
        sliced_y_keys | 'MoveYToValue_YKey' >> beam.Map(y_key_to_slice_pair))
    xs_keyed_by_slice = (
        counts_per_x
        | 'MoveXToValue_XCountsKey' >> beam.MapTuple(x_count_to_slice_pair))

    join_inputs = {
        'y_keys': ys_keyed_by_slice,
        'x_counts': xs_keyed_by_slice
    }
    joined_by_slice = (
        join_inputs | 'CoGroupByForPlaceholderYRates' >> beam.CoGroupByKey())

    # Per the original annotation the result is shaped as
    # _SlicedXYKey(slice, x_path, x, y), 0 -- this is determined by
    # _cross_join_y_keys, which is defined elsewhere.
    return joined_by_slice | 'CrossXYValues' >> beam.FlatMap(_cross_join_y_keys)

# No typehint for input, since it's a multi-input PTransform for which Beam
# doesn't yet support typehints (BEAM-3280). 
Example #5
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def expand(
    self, sliced_record_batchs: beam.pvalue.PCollection
) -> beam.pvalue.PCollection:
    """Builds the lift pipeline from y rates and conditional y rates.

    The y rates and the conditional y rates are computed from the input,
    joined with CoGroupByKey under the tags 'conditional_y_rate' and
    'y_rate', turned into lifts by `_compute_lifts`, filtered by
    `_FilterLifts`, grouped by key and finally mapped through
    `_make_dataset_feature_stats_proto`.

    Args:
      sliced_record_batchs: The input PCollection, passed to `_GetYRates`
        and (together with the y keys) to `_GetConditionalYRates`.

    Returns:
      The PCollection produced by the final 'MakeProtos' step.
    """
    # Compute P(Y=y)
    # _SlicedYKey(slice, y), _YRate(y_count, example_count)
    marginal_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
        self._y_path, self._y_boundaries, self._weight_column_name)
    sliced_y_keys = marginal_rates | 'ExtractYKeys' >> beam.Keys()

    # Compute P(Y=y | X=x)
    # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
    conditional_rates = (
        (sliced_record_batchs, sliced_y_keys)
        | 'GetConditionalYRates' >> _GetConditionalYRates(
            self._y_path, self._y_boundaries, self._x_paths,
            self._min_x_count, self._weight_column_name))

    join_inputs = {
        'conditional_y_rate': conditional_rates,
        'y_rate': marginal_rates
    }
    joined = join_inputs | 'CoGroupByForLift' >> beam.CoGroupByKey()
    lifts = joined | 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
    kept_lifts = lifts | 'FilterLifts' >> _FilterLifts(
        self._top_k_per_y, self._bottom_k_per_y)
    grouped_lifts = kept_lifts | 'GroupLiftsForOutput' >> beam.GroupByKey()

    # The third extra argument tells the proto builder whether a weight
    # column was configured.
    return grouped_lifts | 'MakeProtos' >> beam.Map(
        _make_dataset_feature_stats_proto,
        self._y_path,
        self._y_boundaries,
        self._weight_column_name is not None,
        self._output_custom_stats)
Example #6
Source File: linters.py    From data-linter with Apache License 2.0 5 votes vote down vote up
def _lint(self, examples):
    """Runs the TailedDistributionDetector linter over `examples`.

    Feature values are flattened out of the examples, a trimmed average is
    computed for the `_MIN` and the `_MAX` side, the two are joined with
    CoGroupByKey, collected into one list and handed to `_to_result`.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [feature names]
        lint_samples: [
          [stats: {min: feature_min if outlying, max: feature_max if outlying}]
          for each warning
        ]
    """
    flatten_numeric = self._flatten_feature_vals(self.numeric_features)
    values_per_feature = (
        examples | 'FlattenFeatureValue' >> beam.FlatMap(flatten_numeric))

    # Each averager is created right before it is applied, matching the
    # original construction order.
    trimmed_mean_of_min = (
        values_per_feature | self._make_trimmed_averager(self._MIN))
    trimmed_mean_of_max = (
        values_per_feature | self._make_trimmed_averager(self._MAX))

    merged_means = (
        (trimmed_mean_of_min, trimmed_mean_of_max)
        | 'MergeTrimmedMeans' >> beam.CoGroupByKey())
    merged_as_list = merged_means | 'AsList' >> beam.combiners.ToList()
    return merged_as_list | 'ToResult' >> beam.Map(self._to_result)
Example #7
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 4 votes vote down vote up
def expand(
    self,
    sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch, _SlicedYKey]):
    """Builds conditional y rates from record batches and known y keys.

    Observed x-y copresence counts are flattened together with placeholder
    counts, summed per key, re-keyed by x, and joined with the per-x counts
    under the tags 'x_count' and 'xy_counts'. `_join_x_counts` turns each
    join result into conditional y rates.

    Args:
      sliced_record_batchs_and_ys: A pair (sliced_record_batchs, y_keys).

    Returns:
      The output of flat-mapping `_join_x_counts` over the join results; per
      the original annotation, elements shaped as
      _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count).
    """
    sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

    def exceeds_min_x_count(kv):
      # NOTE(review): the comparison is strict, so an x whose count equals
      # _min_x_count is dropped. Confirm this is the intended meaning of a
      # "minimum" count.
      return kv[1] > self._min_x_count

    def rekey_by_x(key, xy_count):
      # Moves y out of the key and into the value.
      return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

    # _SlicedXYKey(slice, x_path, x, y), xy_count
    observed_pair_counts = (
        sliced_record_batchs
        | 'ToPartialCopresenceCounts' >> beam.FlatMap(
            _to_partial_copresence_counts, self._y_path, self._x_paths,
            self._y_boundaries, self._weight_column_name))

    # Compute placeholder copresence counts.
    # observed_pair_counts only includes x-y pairs that are present, but we
    # would also like to keep track of x-y pairs that never appear, as long
    # as x and y independently occur in the slice.

    # _SlicedXKey(slice, x_path, x), x_count
    partial_x_counts = sliced_record_batchs | 'ToPartialXCounts' >> beam.FlatMap(
        _to_partial_x_counts, self._x_paths, self._weight_column_name)
    x_counts = partial_x_counts | 'SumXCounts' >> beam.CombinePerKey(sum)
    if self._min_x_count:
      # A falsy _min_x_count (None or 0) skips the filter entirely.
      x_counts = x_counts | 'FilterXCounts' >> beam.Filter(exceeds_min_x_count)

    # _SlicedXYKey(slice, x_path, x, y), 0
    placeholder_pair_counts = (
        (x_counts, y_keys)
        | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
            self._x_paths, self._min_x_count))

    # _SlicedXKey(slice, x_path, x), (y, xy_count)
    all_pair_counts = (
        (placeholder_pair_counts, observed_pair_counts)
        | 'FlattenCopresenceCounts' >> beam.Flatten())
    summed_pair_counts = (
        all_pair_counts | 'SumCopresencePairs' >> beam.CombinePerKey(sum))
    pair_counts_by_x = (
        summed_pair_counts | 'MoveYToValue' >> beam.MapTuple(rekey_by_x))

    join_inputs = {
        'x_count': x_counts,
        'xy_counts': pair_counts_by_x
    }
    joined_by_x = (
        join_inputs | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey())

    # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
    return joined_by_x | 'JoinXCounts' >> beam.FlatMap(_join_x_counts)