Python apache_beam.CombineFn() Examples

The following are 10 code examples of apache_beam.CombineFn(). Each example notes its original project and source file above the code. You may also want to check out all available functions and classes of the apache_beam module.
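
Before the examples, here is a minimal sketch of the interface they all build on. A CombineFn defines four methods: create_accumulator, add_input, merge_accumulators, and extract_output. The mean combiner below is illustrative only and is not taken from any of the projects featured here.

import apache_beam as beam

class MeanFn(beam.CombineFn):
  """Computes the mean of a collection of numbers."""

  def create_accumulator(self):
    # The accumulator is a (running sum, element count) pair.
    return (0.0, 0)

  def add_input(self, accumulator, element):
    total, count = accumulator
    return total + element, count + 1

  def merge_accumulators(self, accumulators):
    totals, counts = zip(*accumulators)
    return sum(totals), sum(counts)

  def extract_output(self, accumulator):
    total, count = accumulator
    return total / count if count else float('NaN')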
Example #1
Source File: jackknife.py    From model-analysis with Apache License 2.0
def __init__(
      self,
      num_jackknife_samples: int,
      skip_ci_metric_keys: Optional[Set[metric_types.MetricKey]] = None):
    """Initializes a _MergeJackknifeSamples CombineFn.

    Args:
      num_jackknife_samples: The number of samples computed per slice.
      skip_ci_metric_keys: Set of metric keys for which to skip confidence
        interval computation. For metric keys in this set, just the unsampled
        value will be returned.
    """
    self._num_jackknife_samples = num_jackknife_samples
    self._skip_ci_metric_keys = skip_ci_metric_keys
    self._num_slices_counter = beam.metrics.Metrics.counter(
        constants.METRICS_NAMESPACE, 'num_slices')
    self._missing_samples_counter = beam.metrics.Metrics.counter(
        constants.METRICS_NAMESPACE, 'num_slices_missing_jackknife_samples')
    self._small_samples_counter = beam.metrics.Metrics.counter(
        constants.METRICS_NAMESPACE, 'num_slices_with_small_jackknife_samples')
    self._sample_id_key = metric_types.MetricKey(_JACKKNIFE_SAMPLE_ID_KEY) 
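
The counters created in this __init__ are standard Beam metrics. As a hedged aside (hypothetical names, not part of the original file), this is how such a counter is typically incremented inside a CombineFn and queried after the pipeline runs:

import apache_beam as beam
from apache_beam.metrics.metric import MetricsFilter

class CountingFn(beam.CombineFn):
  """Toy combiner that also reports a counter metric."""

  def __init__(self):
    self._seen = beam.metrics.Metrics.counter('example', 'elements_seen')

  def create_accumulator(self):
    return 0

  def add_input(self, count, element):
    self._seen.inc()  # Increment the distributed counter once per input.
    return count + 1

  def merge_accumulators(self, counts):
    return sum(counts)

  def extract_output(self, count):
    return count

with beam.Pipeline() as p:
  _ = p | beam.Create([1, 2, 3]) | beam.CombineGlobally(CountingFn())

# After the `with` block the pipeline has run; counters can be queried.
counters = p.result.metrics().query(
    MetricsFilter().with_name('elements_seen'))['counters']
print(counters[0].committed)  # 3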
Example #2
Source File: analyzer_impls.py    From transform with Apache License 2.0
def __init__(self,
               combiner,
               tf_config,
               is_combining_accumulators,
               should_extract_output=None):
    """Init method for _CombinerWrapper.

    Args:
      combiner: A `analyzer_nodes.Combiner` object used to combine.
      tf_config: A `tf.ConfigProto`.
      is_combining_accumulators: A bool which indicates whether this is
        combining single or batched inputs, or already accumulated objects.
      should_extract_output: A bool which indicates whether this should call the
        combiner's extract_output method in extract_output. If not specified,
        it defaults to the value of `is_combining_accumulators`.
    """
    self._combiner = combiner
    self._tf_config = tf_config
    self._is_combining_accumulators = is_combining_accumulators
    if should_extract_output is None:
      should_extract_output = is_combining_accumulators
    self._should_extract_output = should_extract_output

    # TODO(b/135541366): Move this to CombineFn.setup once it exists.
    # That should help simplify several aspects of Quantiles state management.
    if isinstance(combiner, analyzers.QuantilesCombiner):
      combiner.initialize_local_state(tf_config) 
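
Only the __init__ of _CombinerWrapper is shown above. As a rough, hypothetical sketch (the real class differs), a wrapper CombineFn like this typically delegates each lifecycle method to the wrapped combiner object, switching behavior on the stored flags:

import apache_beam as beam

class CombinerWrapperSketch(beam.CombineFn):
  """Hypothetical wrapper that delegates to an inner combiner object."""

  def __init__(self, combiner, is_combining_accumulators):
    self._combiner = combiner
    self._is_combining_accumulators = is_combining_accumulators

  def create_accumulator(self):
    return self._combiner.create_accumulator()

  def add_input(self, accumulator, element):
    if self._is_combining_accumulators:
      # Inputs are themselves accumulators, so merge rather than add.
      return self._combiner.merge_accumulators([accumulator, element])
    return self._combiner.add_input(accumulator, element)

  def merge_accumulators(self, accumulators):
    return self._combiner.merge_accumulators(accumulators)

  def extract_output(self, accumulator):
    return self._combiner.extract_output(accumulator)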
Example #3
Source File: preprocess.py    From professional-services with Apache License 2.0
def check_size(p, name, path):
  """Performs checks on the input pipeline and stores stats in specfied path.

  Checks performed: counts rows and derives class distribution.

  Args:
    p: PCollection, input pipeline.
    name: string, unique identifier for the beam step.
    path: string, path to store stats.

  Returns:
    PCollection
  """

  class _Combine(beam.CombineFn):
    """Counts and take the average of positive classes in the pipeline."""

    def create_accumulator(self):
      return (0.0, 0.0)

    def add_input(self, sum_count, inputs):
      (s, count) = sum_count
      return s + inputs, count + 1

    def merge_accumulators(self, accumulators):
      sums, counts = zip(*accumulators)
      return sum(sums), sum(counts)

    # The case count == 0 is not an error: the accumulator is initialized with
    # count == 0, so extract_output returns NaN rather than raising.
    def extract_output(self, sum_count):
      (s, count) = sum_count
      return count, (1.0 * s / count) if count else float('NaN')

  return (p
          | 'CheckMapTo_1_{}'.format(name) >>
          beam.Map(lambda x: x[constants.LABEL_COLUMN])
          | 'CheckSum_{}'.format(name) >> beam.CombineGlobally(_Combine())
          | 'CheckRecord_{}'.format(name) >> beam.io.WriteToText(
              '{}.txt'.format(path))) 
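
Example #3 is a complete CombineFn together with the pipeline that applies it. As a quick, self-contained usage sketch (hypothetical data, with Beam's built-in mean combiner standing in for the nested _Combine class), the same pipeline shape can be run in memory:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (p
       | 'Create' >> beam.Create([{'label': 1}, {'label': 0}, {'label': 1}])
       | 'GetLabel' >> beam.Map(lambda x: x['label'])
       | 'Mean' >> beam.CombineGlobally(beam.combiners.MeanCombineFn())
       | 'Print' >> beam.Map(print))  # prints 0.666...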
Example #4
Source File: model_util.py    From model-analysis with Apache License 2.0
def __init__(self, model_loaders: Dict[Text, types.ModelLoader]):
    """Initializes CombineFn using dict of loaders keyed by model location."""
    self._model_loaders = model_loaders
    self._loaded_models = None
    self._model_load_seconds = None
    self._model_load_seconds_distribution = beam.metrics.Metrics.distribution(
        constants.METRICS_NAMESPACE, 'model_load_seconds') 
Example #5
Source File: metric_types.py    From model-analysis with Apache License 2.0
def __new__(cls, keys: List[MetricKey], preprocessor: beam.DoFn,
              combiner: beam.CombineFn):
    return super(MetricComputation, cls).__new__(cls, keys, preprocessor,
                                                 combiner) 
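
The __new__ override suggests MetricComputation is a NamedTuple subclass; overriding __new__ is how such immutable classes apply defaults or validation at construction time. A generic, hypothetical sketch of the same idiom (not tfma's actual class):

from typing import Any, List, NamedTuple

class Computation(
    NamedTuple('Computation', [('keys', List[str]), ('combiner', Any)])):
  """Hypothetical NamedTuple with a customized constructor."""

  def __new__(cls, keys, combiner=None):
    # Defaults and checks run here, before the immutable tuple is built.
    return super(Computation, cls).__new__(cls, keys, combiner)

c = Computation(keys=['accuracy'])
print(c.combiner)  # None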
Example #6
Source File: jackknife.py    From model-analysis with Apache License 2.0
def __init__(self, combiner: beam.CombineFn):
    self._combiner = combiner 
Example #7
Source File: query_based_metrics_evaluator.py    From model-analysis with Apache License 2.0
def QueryBasedMetricsEvaluator(  # pylint: disable=invalid-name
    query_id: Text,
    prediction_key: Text,
    combine_fns: List[beam.CombineFn],
    metrics_key: Text = constants.METRICS_KEY,
    run_after: Text = slice_key_extractor.SLICE_KEY_EXTRACTOR_STAGE_NAME,
) -> evaluator.Evaluator:
  """Creates an Evaluator for evaluating metrics and plots.

  Args:
    query_id: Key of query ID column in the features dictionary.
    prediction_key: Key in predictions dictionary to use as the prediction (for
      sorting examples within the query). Use the empty string if the Estimator
      returns a predictions Tensor (not a dictionary).
    combine_fns: List of query based metrics combine functions.
    metrics_key: Name to use for metrics key in Evaluation output.
    run_after: Extractor to run after (None means before any extractors).

  Returns:
    Evaluator for computing query-based metrics. The output will be stored under
    'metrics' and 'plots' keys.
  """
  # pylint: disable=no-value-for-parameter
  return evaluator.Evaluator(
      stage_name='EvaluateQueryBasedMetrics',
      run_after=run_after,
      ptransform=EvaluateQueryBasedMetrics(
          query_id=query_id,
          prediction_key=prediction_key,
          combine_fns=combine_fns,
          metrics_key=metrics_key))
  # pylint: enable=no-value-for-parameter 
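
As a hedged usage sketch, constructing the evaluator looks like the call below; `my_query_combine_fn` is a hypothetical placeholder, not a real tfma symbol:

# `my_query_combine_fn` is hypothetical; substitute a real query-based
# beam.CombineFn, e.g. one that computes NDCG per query.
evaluator = QueryBasedMetricsEvaluator(
    query_id='query_id',
    prediction_key='scores',
    combine_fns=[my_query_combine_fn])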
Example #8
Source File: query_based_metrics_evaluator.py    From model-analysis with Apache License 2.0
def EvaluateQueryBasedMetrics(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    prediction_key: Text,
    query_id: Text,
    combine_fns: List[beam.CombineFn],
    metrics_key: Text = constants.METRICS_KEY,
) -> evaluator.Evaluation:
  """Evaluates query-based metrics.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTION_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    prediction_key: Key in predictions dictionary to use as the prediction (for
      sorting examples within the query). Use the empty string if the Estimator
      returns a predictions Tensor (not a dictionary).
    query_id: Key of query ID column in the features dictionary.
    combine_fns: List of query based metrics combine functions.
    metrics_key: Name to use for metrics key in Evaluation output.

  Returns:
    Evaluation containing metrics dictionaries keyed by 'metrics'.
  """

  # pylint: disable=no-value-for-parameter
  metrics = (
      extracts
      | 'Filter' >> extractor.Filter(include=[
          constants.FEATURES_PREDICTIONS_LABELS_KEY,
          constants.SLICE_KEY_TYPES_KEY
      ])
      | 'ComputeQueryBasedMetrics' >> ComputeQueryBasedMetrics(
          query_id=query_id,
          combine_fns=combine_fns,
          prediction_key=prediction_key))
  # pylint: enable=no-value-for-parameter

  return {metrics_key: metrics} 
Example #9
Source File: jackknife.py    From model-analysis with Apache License 2.0
def _make_loo_accumulators(
    accumulators: List[_AccumulatorType],
    combiner: beam.CombineFn) -> Iterator[_AccumulatorType]:
  """Yields accumulators which each leave out one value in accumulators.

  Args:
    accumulators: List of accumulators for which to compute complements.
    combiner: A combiner to use for creating and merging accumulators.

  Yields:
    Leave-one-out accumulators for each element in `accumulators`. The ith
    accumulator will be the result of merging all accumulators but the ith,
    along with the accumulator passed as `complement`.
  """

  def make_loo_accumulators_rec(
      accumulators: List[_AccumulatorType], complement: _AccumulatorType,
      combiner: beam.CombineFn) -> Iterator[_AccumulatorType]:
    """Recursive helper to compute leave one out accumulators."""
    if len(accumulators) == 1:
      yield complement
    else:
      split_idx = int(len(accumulators) / 2)
      left, right = accumulators[:split_idx], accumulators[split_idx:]
      left_c = copy.deepcopy(complement)
      left_c = combiner.merge_accumulators([left_c] + right)
      for c in make_loo_accumulators_rec(left, left_c, combiner):
        yield c
      # reuse the complement accumulator on the right recursion.
      right_c = combiner.merge_accumulators([complement] + left)
      for c in make_loo_accumulators_rec(right, right_c, combiner):
        yield c

  # TODO(b/151445942) use `yield from` when we stop supporting python < 3.3
  for acc in make_loo_accumulators_rec(accumulators,
                                       combiner.create_accumulator(), combiner):
    yield acc


# TODO(b/152812821): Disable Beam annotations support due to failure in:
# //third_party/py/tensorflow_model_analysis/evaluators:jackknife_test.python3
# Output type hint violation at JackknifeCombinePerKey: expected Tuple[Union[
# Tuple[Tuple[str, Union[float, int, str]], ...], Tuple[]], Tuple[Dict[
# MetricKey, Any], ...]], got Tuple[Union[Tuple[Tuple[str, Union[float, int,
# str]], ...], Tuple[]], Dict[MetricKey, Any]]
#
# Since @beam.typehints.no_annotations is not available yet, part of the output
# type is put in quotes, which currently makes Beam ignore the hint. 
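
The recursion in _make_loo_accumulators halves the accumulator list and reuses partially merged complements, so producing n leave-one-out results costs O(n log n) merges rather than the naive O(n^2). Below is a self-contained sketch of the same divide-and-conquer technique using a trivial sum combiner (hypothetical names, not from the source file):

import copy

class SumCombiner(object):
  """Minimal stand-in exposing the combiner methods the recursion needs."""

  def create_accumulator(self):
    return 0

  def merge_accumulators(self, accumulators):
    return sum(accumulators)

def make_loo_accumulators(accumulators, complement, combiner):
  if len(accumulators) == 1:
    yield complement
  else:
    split_idx = len(accumulators) // 2
    left, right = accumulators[:split_idx], accumulators[split_idx:]
    left_c = combiner.merge_accumulators([copy.deepcopy(complement)] + right)
    yield from make_loo_accumulators(left, left_c, combiner)
    # Reuse the complement accumulator on the right recursion.
    right_c = combiner.merge_accumulators([complement] + left)
    yield from make_loo_accumulators(right, right_c, combiner)

combiner = SumCombiner()
accs = [1, 2, 3, 4]
print(list(make_loo_accumulators(accs, combiner.create_accumulator(), combiner)))
# [9, 8, 7, 6]: each result is the total (10) minus the left-out accumulator.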
Example #10
Source File: jackknife.py    From model-analysis with Apache License 2.0
def _make_jackknife_samples(
    slice_partitions: Tuple[slicer.SliceKeyType,
                            Sequence[_PartitionInfo]], combiner: beam.CombineFn
) -> Iterator[Tuple[slicer.SliceKeyType, 'metric_types.MetricsDict']]:
  """Computes leave-one-out and unsampled ouputs for the combiner.

  This function creates leave-one-out combiner outputs by combining all but one
  accumulator and extracting the output. It then creates an unsampled output
  by merging all of the accumulators and extracting the result. The keys
  yielded by this function are augmented versions of the input slice key in
  which the sample ID (or a special placeholder ID for the unsampled value) has
  been added.

  Args:
    slice_partitions: The result of GroupByKey in which the key is a slice_key,
      and the grouped stream consists of per-partition _PartitionInfo tuples in
      which the first element is an accumulator for that partition, the second
      element is the size of that partition, and the third element is the
      partition ID.
    combiner: The combiner to be used for converting accumulators to outputs.

  Yields:
    Tuples of the form (slice_key, metrics), for each jackknife sample and for
    the unsampled value.
  """
  slice_key, accumulators_sizes_and_ids = slice_partitions
  accumulators, sizes, partition_ids = zip(*accumulators_sizes_and_ids)
  unsampled_accumulator = None
  for i, loo_accumulator in enumerate(
      _make_loo_accumulators(list(accumulators), combiner)):
    # Yield sampled output keyed by the sample ID of the left-out partition.
    sample_id_key = (_JACKKNIFE_SAMPLE_ID_KEY, partition_ids[i])
    yield slice_key + (sample_id_key,), combiner.extract_output(loo_accumulator)
    if i == 0:
      # Create the unsampled accumulator from sample 0 and its complement.
      unsampled_accumulator = combiner.merge_accumulators(
          [loo_accumulator, accumulators[0]])

  # yield unsampled output along with total count as a special metric
  count_dict = {_JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: sum(sizes)}
  sample_id_key = ((_JACKKNIFE_SAMPLE_ID_KEY, _JACKKNIFE_FULL_SAMPLE_ID),)
  unsampled_output = combiner.extract_output(unsampled_accumulator)
  unsampled_key = slice_key + sample_id_key
  unsampled_val = unsampled_output + (count_dict,)
  yield unsampled_key, unsampled_val
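
Downstream, these per-sample outputs keyed by sample ID are what make confidence intervals possible. As a hedged illustration of the standard delete-one jackknife arithmetic (the textbook formula, not necessarily tfma's exact implementation), a metric's standard error can be estimated from its leave-one-out values like this:

import math

def jackknife_std_error(loo_values):
  """Standard error from n leave-one-out metric values (delete-1 jackknife)."""
  n = len(loo_values)
  mean = sum(loo_values) / n
  # Jackknife variance: (n - 1) / n times the sum of squared deviations.
  variance = (n - 1) / n * sum((v - mean) ** 2 for v in loo_values)
  return math.sqrt(variance)

print(jackknife_std_error([0.70, 0.72, 0.68, 0.71]))  # ~0.0256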