Python apache_beam.CoGroupByKey() Examples
The following are 7 code examples of apache_beam.CoGroupByKey().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
apache_beam, or try the search function.
Example #1
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def _join_x_counts(
    join_info: Tuple[_SlicedXKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _ConditionalYRate]]:
  """Emits a conditional y-rate for each (x, y) pair grouped under one x.

  The input is one element of a CoGroupByKey result keyed by
  (slice_key, x_path, x). The 'x_count' stream holds at most one element —
  the number of examples in the slice where x appears in x_path — while the
  'xy_counts' stream holds (y, xy_count) pairs for that same x. If the
  'x_count' stream is empty (e.g. x was dropped by an upstream min_x_count
  filter), nothing is emitted.

  Args:
    join_info: A single CoGroupByKey result of the form
      ((slice, x_path, x), {'x_count': [...], 'xy_counts': [...]}).

  Yields:
    Tuples of the form (_SlicedYKey(slice, y),
    _ConditionalYRate(x_path, x, xy_count, x_count)).
  """
  key, grouped = join_info
  x_count_values = grouped['x_count']
  if not x_count_values:
    # x did not survive the upstream count filter; emit nothing for it.
    return
  x_count = x_count_values[0]
  for y, xy_count in grouped['xy_counts']:
    y_key = _SlicedYKey(key.slice_key, y)
    cond_rate = _ConditionalYRate(
        x_path=key.x_path, x=key.x, xy_count=xy_count, x_count=x_count)
    yield y_key, cond_rate
Example #2
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def _join_example_counts(
    join_info: Tuple[types.SliceKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Union[_CountType, Tuple[_YType, _CountType]]
) -> Iterator[Tuple[_SlicedYKey, _YRate]]:
  """Pairs every y-count within a slice with the slice's example count.

  The input is one element of a CoGroupByKey result keyed by slice_key.
  The 'example_count' stream holds a single element, the total number of
  examples in the slice; the 'y_counts' stream holds (y, y_count) pairs
  giving how often each y value appears in that slice.

  Args:
    join_info: A single CoGroupByKey result of the form
      (slice_key, {'example_count': [...], 'y_counts': [...]}).

  Yields:
    Tuples of the form (_SlicedYKey(slice, y),
    _YRate(y_count, example_count)).
  """
  slice_key, grouped = join_info
  total_examples = grouped['example_count'][0]
  for y_value, y_count in grouped['y_counts']:
    yield _SlicedYKey(slice_key, y_value), _YRate(y_count, total_examples)
Example #3
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def _compute_lifts(
    join_info: Tuple[_SlicedYKey, Dict[Text, Sequence[Any]]]
    # TODO(b/147153346) update dict value list element type annotation to:
    # Sequence[Union[_YRate, _ConditionalYRate]]
) -> Iterator[Tuple[_SlicedFeatureKey, _LiftInfo]]:
  """Computes lift = P(y|x) / P(y) for every x paired with a given y.

  The input is one element of a CoGroupByKey result keyed by
  (slice_key, y). The 'y_rate' stream holds a single _YRate for that y,
  and the 'conditional_y_rate' stream holds every _ConditionalYRate
  (one per x across all x_paths) for the same y.

  Args:
    join_info: A single CoGroupByKey result of the form
      ((slice_key, y), {'y_rate': [...], 'conditional_y_rate': [...]}).

  Yields:
    Tuples of the form (_SlicedFeatureKey(slice_key, x_path),
    _LiftInfo(x, y, lift, xy_count, x_count, y_count)).
  """
  (slice_key, y), grouped = join_info
  y_rate = grouped['y_rate'][0]
  for cond in grouped['conditional_y_rate']:
    # lift = P(y | x) / P(y), both estimated from counts within the slice.
    p_y_given_x = float(cond.xy_count) / cond.x_count
    p_y = float(y_rate.y_count) / y_rate.example_count
    lift = p_y_given_x / p_y
    feature_key = _SlicedFeatureKey(slice_key, cond.x_path)
    yield feature_key, _LiftInfo(
        x=cond.x,
        y=y,
        lift=lift,
        xy_count=cond.xy_count,
        x_count=cond.x_count,
        y_count=y_rate.y_count)
Example #4
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def expand(self, x_counts_and_ys: Tuple[Tuple[_SlicedXKey, _CountType], _SlicedYKey]): x_counts, y_keys = x_counts_and_ys # slice, y y_keys_by_slice = ( y_keys | 'MoveYToValue_YKey' >> beam.Map(lambda k: (k.slice_key, k.y))) # slice, (x_path, x, x_count) x_counts_by_slice = ( x_counts | 'MoveXToValue_XCountsKey' >> beam.MapTuple( lambda k, v: (k.slice_key, (k.x_path, k.x, v)))) # _SlicedXYKey(slice, x_path, x, y), 0 return ( { 'y_keys': y_keys_by_slice, 'x_counts': x_counts_by_slice } | 'CoGroupByForPlaceholderYRates' >> beam.CoGroupByKey() | 'CrossXYValues' >> beam.FlatMap(_cross_join_y_keys)) # No typehint for input, since it's a multi-input PTransform for which Beam # doesn't yet support typehints (BEAM-3280).
Example #5
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def expand(
    self,
    sliced_record_batchs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Builds the lift-statistics pipeline over sliced record batches.

  Computes per-slice rates P(Y=y) and conditional rates P(Y=y | X=x),
  joins the two streams on their shared _SlicedYKey(slice, y) key with a
  CoGroupByKey, computes lift values, keeps the top/bottom k lifts per y,
  regroups by (slice, x_path), and serializes the result to feature-stats
  protos.

  Args:
    sliced_record_batchs: A PCollection of sliced record batches.

  Returns:
    A PCollection of dataset feature statistics protos (one per slice,
    as produced by _make_dataset_feature_stats_proto).
  """
  # Compute P(Y=y)
  # _SlicedYKey(slice, y), _YRate(y_count, example_count)
  y_rates = sliced_record_batchs | 'GetYRates' >> _GetYRates(
      self._y_path, self._y_boundaries, self._weight_column_name)
  y_keys = y_rates | 'ExtractYKeys' >> beam.Keys()

  # Compute P(Y=y | X=x)
  # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
  conditional_y_rates = ((sliced_record_batchs, y_keys)
                         | 'GetConditionalYRates' >> _GetConditionalYRates(
                             self._y_path, self._y_boundaries, self._x_paths,
                             self._min_x_count, self._weight_column_name))

  return (
      {
          'conditional_y_rate': conditional_y_rates,
          'y_rate': y_rates
      }
      | 'CoGroupByForLift' >> beam.CoGroupByKey()
      | 'ComputeLifts' >> beam.FlatMap(_compute_lifts)
      | 'FilterLifts' >> _FilterLifts(self._top_k_per_y, self._bottom_k_per_y)
      | 'GroupLiftsForOutput' >> beam.GroupByKey()
      | 'MakeProtos' >> beam.Map(_make_dataset_feature_stats_proto,
                                 self._y_path, self._y_boundaries,
                                 self._weight_column_name is not None,
                                 self._output_custom_stats))
Example #6
Source File: linters.py From data-linter with Apache License 2.0 | 5 votes |
def _lint(self, examples):
  """Returns the result of the TailedDistributionDetector linter.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

  Returns:
    A `PTransform` that yields a `LintResult` of the format
      warnings: [feature names]
      lint_samples: [
        [stats: {min: feature_min if outlying, max: feature_max if outlying}]
        for each warning
      ]
  """
  # One (feature, value) element per numeric feature value in each example.
  feature_values = (
      examples
      | 'FlattenFeatureValue' >> beam.FlatMap(
          self._flatten_feature_vals(self.numeric_features)))
  # Trimmed means of the per-example minima and maxima respectively.
  # NOTE(review): assumes _make_trimmed_averager keys its output by feature
  # name so CoGroupByKey can align the two streams -- confirm in its impl.
  feature_min_trimmed_mean = (
      feature_values | self._make_trimmed_averager(self._MIN))
  feature_max_trimmed_mean = (
      feature_values | self._make_trimmed_averager(self._MAX))
  return (
      (feature_min_trimmed_mean, feature_max_trimmed_mean)
      | 'MergeTrimmedMeans' >> beam.CoGroupByKey()
      | 'AsList' >> beam.combiners.ToList()
      | 'ToResult' >> beam.Map(self._to_result))
Example #7
Source File: lift_stats_generator.py From data-validation with Apache License 2.0 | 4 votes |
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch, _SlicedYKey]): sliced_record_batchs, y_keys = sliced_record_batchs_and_ys # _SlicedXYKey(slice, x_path, x, y), xy_count partial_copresence_counts = ( sliced_record_batchs | 'ToPartialCopresenceCounts' >> beam.FlatMap( _to_partial_copresence_counts, self._y_path, self._x_paths, self._y_boundaries, self._weight_column_name)) # Compute placerholder copresence counts. # partial_copresence_counts will only include x-y pairs that are present, # but we would also like to keep track of x-y pairs that never appear, as # long as x and y independently occur in the slice. # _SlicedXKey(slice, x_path, x), x_count x_counts = ( sliced_record_batchs | 'ToPartialXCounts' >> beam.FlatMap( _to_partial_x_counts, self._x_paths, self._weight_column_name) | 'SumXCounts' >> beam.CombinePerKey(sum)) if self._min_x_count: x_counts = x_counts | 'FilterXCounts' >> beam.Filter( lambda kv: kv[1] > self._min_x_count) # _SlicedXYKey(slice, x_path, x, y), 0 placeholder_copresence_counts = ( (x_counts, y_keys) | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts( self._x_paths, self._min_x_count)) def move_y_to_value(key, xy_count): return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count) # _SlicedXKey(slice, x_path, x), (y, xy_count) copresence_counts = ( (placeholder_copresence_counts, partial_copresence_counts) | 'FlattenCopresenceCounts' >> beam.Flatten() | 'SumCopresencePairs' >> beam.CombinePerKey(sum) | 'MoveYToValue' >> beam.MapTuple(move_y_to_value)) # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count) return ({ 'x_count': x_counts, 'xy_counts': copresence_counts } | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey() | 'JoinXCounts' >> beam.FlatMap(_join_x_counts))