Python apache_beam.Flatten() Examples
The following are 23 code examples of apache_beam.Flatten(), collected from open-source projects. The source file, project, and license are noted above each example.
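Before the project examples, a minimal self-contained sketch may help: beam.Flatten() merges several PCollections of the same element type into one. Everything below (pipeline, labels, data) is illustrative rather than taken from any project on this page.

import apache_beam as beam

with beam.Pipeline() as p:
  evens = p | 'CreateEvens' >> beam.Create([2, 4, 6])
  odds = p | 'CreateOdds' >> beam.Create([1, 3, 5])
  # Flatten accepts a tuple (or list) of PCollections and emits their union.
  merged = (evens, odds) | 'MergeNumbers' >> beam.Flatten()
  merged | 'Print' >> beam.Map(print)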
Example #1
Source File: pipeline_common.py From gcp-variant-transforms with Apache License 2.0
def add_annotation_headers(pipeline, known_args, pipeline_mode, merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (
        pipeline
        | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
        | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders' >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example #2
Source File: data_linter.py From data-linter with Apache License 2.0
def expand(self, examples):
  """Runs the linters on the data and writes out the results.

  The order in which the linters run is unspecified.

  Args:
    examples: A `PTransform` that yields a `PCollection` of `tf.Examples`.

  Returns:
    A pipeline containing the `DataLinter` `PTransform`s.
  """
  coders = (beam.coders.coders.StrUtf8Coder(),
            beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
  # NOTE: the tuple-unpacking lambda in 'DropEmpty' is Python 2 only syntax.
  return (
      [examples | linter for linter in self._linters if linter.should_run()]
      | 'MergeResults' >> beam.Flatten()
      | 'DropEmpty' >> beam.Filter(lambda (_, r): r and len(r.warnings))
      | 'ToDict' >> beam.combiners.ToDict()
      | 'WriteResults' >> beam.io.textio.WriteToText(
          self._results_path,
          coder=beam.coders.coders.PickleCoder(),
          shard_name_template=''))
Example #3
Source File: _util.py From pydatalab with Apache License 2.0
def get_sources_from_dataset(p, dataset, mode):
  """get pcollection from dataset."""
  import apache_beam as beam
  import csv
  from google.datalab.ml import CsvDataSet, BigQueryDataSet

  check_dataset(dataset, mode)
  if type(dataset) is CsvDataSet:
    source_list = []
    for ii, input_path in enumerate(dataset.files):
      source_list.append(
          p | 'Read from Csv %d (%s)' % (ii, mode) >>
          beam.io.ReadFromText(input_path, strip_trailing_newlines=True))
    # NOTE: `.next()` is Python 2; Python 3 would use `next(...)`.
    return (source_list
            | 'Flatten Sources (%s)' % mode >> beam.Flatten()
            | 'Create Dict from Csv (%s)' % mode >> beam.Map(
                lambda line: csv.DictReader(
                    [line], fieldnames=['image_url', 'label']).next()))
  elif type(dataset) is BigQueryDataSet:
    bq_source = (beam.io.BigQuerySource(table=dataset.table)
                 if dataset.table is not None
                 else beam.io.BigQuerySource(query=dataset.query))
    return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source)
  else:
    raise ValueError('Invalid DataSet. Expect CsvDataSet or BigQueryDataSet')
Example #4
Source File: lift_stats_generator.py From data-validation with Apache License 2.0
def expand(self, sliced_record_batchs: beam.pvalue.PCollection
          ) -> beam.pvalue.PCollection:
  unweighted_protos = (
      sliced_record_batchs
      | 'ComputeUnweightedLift' >> self._unweighted_generator)
  if not self._weight_column_name:
    # If no weight column name is given, only compute unweighted lift.
    return unweighted_protos

  weighted_protos = (
      sliced_record_batchs
      | 'ComputeWeightedLift' >> self._weighted_generator)
  return ((unweighted_protos, weighted_protos)
          | 'MergeUnweightedAndWeightedProtos' >> beam.Flatten())
Example #5
Source File: vcf_to_bq_preprocess.py From gcp-variant-transforms with Apache License 2.0
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  inferred_headers = (
      variants
      | 'FilterVariants' >> filter_variants.FilterVariants()
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          allow_incompatible_records=True,
          infer_headers=True))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header
Example #6
Source File: impl.py From transform with Apache License 2.0
def expand(self, pbegin):
  # TODO(b/151921205): we have to do an identity map for unmodified
  # PCollections below because otherwise we get an error from beam.
  identity_map = 'Identity' >> beam.Map(lambda x: x)
  if self._dataset_key.is_flattened_dataset_key():
    if self._flat_pcollection:
      return self._flat_pcollection | identity_map
    else:
      return (
          list(self._pcollection_dict.values())
          | 'FlattenAnalysisInputs' >> beam.Flatten(pipeline=pbegin.pipeline))
  else:
    return self._pcollection_dict[self._dataset_key] | identity_map
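A note on the pipeline= argument used above: when Flatten's inputs come from a Python list that could in principle be empty, Beam cannot infer which pipeline the transform belongs to, so it must be passed explicitly. A minimal sketch of the same idea, with illustrative names:

import apache_beam as beam

def flatten_dict_values(p, pcollection_dict):
  # With an empty dict there would be no input PCollections to infer the
  # pipeline from, hence the explicit pipeline argument.
  return (list(pcollection_dict.values())
          | 'FlattenInputs' >> beam.Flatten(pipeline=p))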
Example #7
Source File: evaluator.py From model-analysis with Apache License 2.0
def combine_dict_based_evaluations(
    evaluations: Dict[Text, List[beam.pvalue.PCollection]]) -> Evaluation:
  """Combines multiple evaluation outputs together when the outputs are dicts.

  Note that the dict here refers to the output in the PCollection. The
  evaluations themselves are dicts of PCollections keyed by category
  ('metrics', 'plots', 'analysis', etc). This util is used to group the
  outputs of one or more of these evaluations where the PCollections
  themselves must be dicts. For example, a 'metrics' evaluation might store
  its output in a PCollection of dicts containing metric keys and metric
  values. This util would be used to group the outputs from running two or
  more independent metrics evaluations together into a single PCollection.

  Args:
    evaluations: Dict of lists of PCollections of outputs from different
      evaluators keyed by type of output ('metrics', 'plots', 'analysis',
      etc).

  Returns:
    Dict of consolidated PCollections of outputs keyed by type of output.
  """
  result = {}
  for k, v in evaluations.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))
  return result
Example #8
Source File: jackknife.py From model-analysis with Apache License 2.0
def expand(self, sliced_extracts):

  def partition_fn(_, num_partitions):
    return self._random_state.randint(num_partitions)

  # Partition the data
  # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
  partitions = (
      sliced_extracts
      | 'Partition' >> beam.Partition(partition_fn,
                                      self._num_jackknife_samples))

  def add_partition_index(slice_key,
                          accumulator_and_size,
                          partition_index=None):
    accumulator, size = accumulator_and_size
    return slice_key, _PartitionInfo(accumulator, size, partition_index)

  # Within each partition, partially combine per slice key to get accumulators
  # and partition sizes; add partition_id for determinism.
  # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
  partition_accumulators = []
  for i, partition in enumerate(partitions):
    partition_accumulators.append(
        partition
        | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
            beam.transforms.combiners.SingleInputTupleCombineFn(
                _AccumulateOnlyCombiner(combiner=self._combiner),
                beam.transforms.combiners.CountCombineFn()))
        | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
            add_partition_index, i))

  # Group partitions for the same slice, compute LOO metrics, and flatten back
  # into per-partition LOO metrics.
  # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
  return (partition_accumulators
          | 'FlattenPartitionAccumulators' >> beam.Flatten()
          | 'CollectPerSlicePartitions' >> beam.GroupByKey()
          | 'MakeJackknifeSamples' >> beam.FlatMap(
              _make_jackknife_samples, combiner=self._combiner))
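The partition-then-flatten round trip above is a reusable shape: beam.Partition splits one PCollection into several, each partition is processed under a unique stage label, and beam.Flatten merges the results back together. A stripped-down sketch, with invented data and labels:

import apache_beam as beam

with beam.Pipeline() as p:
  nums = p | beam.Create(range(10))
  # Split into three partitions by value.
  parts = nums | beam.Partition(lambda x, n: x % n, 3)
  # Process each partition under a unique label, then merge back.
  squared = [part | 'Square%d' % i >> beam.Map(lambda x: x * x)
             for i, part in enumerate(parts)]
  merged = squared | beam.Flatten()
  merged | beam.Map(print)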
Example #9
Source File: cache_tasks_main.py From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pcoll):
  to_dict = lambda x: {x[0]: x[1]}
  example_counts = (
      pcoll
      | "count_examples" >> beam.combiners.Count.Globally()
      | "key_example_counts" >> beam.Map(lambda x: ("examples", x))
      | "example_count_dict" >> beam.Map(to_dict))

  def _count_tokens(pcoll, feat):
    return (
        pcoll
        | "key_%s_toks" % feat >> beam.Map(
            lambda x:  # pylint:disable=g-long-lambda
            ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))

  token_counts = (
      [_count_tokens(pcoll, feat) for feat in self._output_features]
      | "flatten_tokens" >> beam.Flatten()
      | "count_tokens" >> beam.CombinePerKey(sum)
      | "token_count_dict" >> beam.Map(to_dict))

  def _merge_dicts(dicts):
    merged_dict = {}
    for d in dicts:
      assert not set(merged_dict).intersection(d)
      merged_dict.update(d)
    return merged_dict

  return (
      [example_counts, token_counts]
      | "flatten_counts" >> beam.Flatten()
      | "merge_stats" >> beam.CombineGlobally(_merge_dicts))
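The core move in this example, emitting (key, count) pairs from several sources, flattening them, and reducing with beam.CombinePerKey(sum), also works in isolation. A minimal sketch with made-up keys and counts:

import apache_beam as beam

with beam.Pipeline() as p:
  a = p | 'A' >> beam.Create([('inputs_tokens', 4), ('inputs_tokens', 7)])
  b = p | 'B' >> beam.Create([('targets_tokens', 3)])
  totals = ([a, b]
            | 'FlattenCounts' >> beam.Flatten()
            | 'SumPerKey' >> beam.CombinePerKey(sum))
  totals | beam.Map(print)  # ('inputs_tokens', 11), ('targets_tokens', 3)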
Example #10
Source File: impl.py From transform with Apache License 2.0
def expand(self, inputs):
  return inputs | beam.Flatten()
Example #11
Source File: analyzer_impls.py From transform with Apache License 2.0
def expand(self, inputs):
  if self._top_k is not None and self._top_k < 0:
    raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                     '{}.'.format(self._top_k))
  if self._frequency_threshold is not None and self._frequency_threshold < 0:
    raise ValueError(
        'frequency_threshold for VocabularyImpl should be >= 0 or None, '
        'got {}.'.format(self._frequency_threshold))
  if self._coverage_top_k is not None and self._coverage_top_k < 0:
    raise ValueError('coverage_top_k for VocabularyImpl should be >= 0 or '
                     'None, got {}.'.format(self._coverage_top_k))
  if (self._coverage_frequency_threshold is not None and
      self._coverage_frequency_threshold < 0):
    raise ValueError(
        'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
        'None, got {}.'.format(self._coverage_frequency_threshold))

  pcoll, = inputs

  result = (
      pcoll
      | 'ApplyThresholdsAndTopK' >> (
          _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
              self._frequency_threshold, self._top_k,
              self._informativeness_threshold, None)))

  if self._key_fn:
    # Note: current APIs do not allow for specifying a coverage
    # informativeness threshold.
    coverage_counts = (
        pcoll
        | 'ApplyCoverageThresholdAndTopK' >> (
            _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
                self._coverage_frequency_threshold, self._coverage_top_k,
                self._coverage_informativeness_threshold, self._key_fn)))

    result = ((result, coverage_counts)
              | 'MergeStandardAndCoverageArms' >> beam.Flatten()
              | 'RemoveDuplicates' >> beam.RemoveDuplicates())

  return result
Example #12
Source File: sentiment_example.py From transform with Apache License 2.0
def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of beam assumes that if the first arg to a ptransform_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> beam.io.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> beam.io.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = (
      [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

  # Shuffle the data. Note that the data does in fact contain duplicate
  # reviews for reasons that are unclear. This means that NUM_TRAIN_INSTANCES
  # and NUM_TEST_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'Distinct' >> beam.Distinct()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_KEY: p[0], LABEL_KEY: p[1]})
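Merging labeled datasets and deduplicating, as done above, reduces to Flatten followed by Distinct. A minimal sketch with invented reviews:

import apache_beam as beam

with beam.Pipeline() as p:
  neg = p | 'Neg' >> beam.Create([('bad movie', 0), ('bad movie', 0)])
  pos = p | 'Pos' >> beam.Create([('great movie', 1)])
  unique = ([neg, pos]
            | 'Merge' >> beam.Flatten()
            | 'Distinct' >> beam.Distinct())
  unique | beam.Map(print)  # the duplicate ('bad movie', 0) appears once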
Example #13
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0
def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  annotation_fields_to_infer = (known_args.annotation_fields
                                if known_args.infer_annotation_types else [])
  inferred_headers = (
      _read_variants(all_patterns,
                     pipeline,
                     known_args,
                     pipeline_mode,
                     pre_infer_headers=known_args.infer_headers)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          annotation_fields_to_infer))
  merged_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example #14
Source File: _preprocess.py From pydatalab with Apache License 2.0
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path,
                       output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path,
                                                labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path,
                                                 labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (
        train_preprocessed
        | 'Random Partition' >> beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (
      labels
      | 'Write labels' >> beam.io.textio.WriteToText(labels_file,
                                                     shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(
      output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(
      output_eval_path)

  # Make sure we write "latest" file after train and eval data are
  # successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save]
   | 'Wait for train eval saving' >> beam.Flatten()
   | 'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1)
   | beam.Map(lambda path: job_id)
   | 'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file,
                                                 shard_name_template=''))
Example #15
Source File: _preprocess.py From pydatalab with Apache License 2.0
def _labels_pipeline(sources):
  labels = (
      sources
      | 'Flatten Sources for labels' >> beam.Flatten()
      | 'Parse input for labels' >> beam.Map(lambda x: str(x['label']))
      | 'Combine labels' >> beam.transforms.combiners.Count.PerElement()
      | 'Get labels' >> beam.Map(lambda label_count: label_count[0]))
  return labels
Example #16
Source File: beam_reshuffle.py From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(
          root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
              file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.")
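Flatten itself does not redistribute data; following it with beam.Reshuffle(), as above, breaks fusion and rebalances the merged elements across workers before the expensive write. A minimal sketch of the same flatten-reshuffle-write shape (the output path is illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
  a = p | 'ReadA' >> beam.Create(['a1', 'a2'])
  b = p | 'ReadB' >> beam.Create(['b1'])
  merged = ([a, b]
            | 'Flatten' >> beam.Flatten()
            | 'Reshuffle' >> beam.Reshuffle())  # rebalance after merging
  merged | 'Write' >> beam.io.WriteToText('/tmp/merged')  # illustrative path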
Example #17
Source File: lift_stats_generator.py From data-validation with Apache License 2.0
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch,
                                                    _SlicedYKey]):
  sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

  # _SlicedXYKey(slice, x_path, x, y), xy_count
  partial_copresence_counts = (
      sliced_record_batchs
      | 'ToPartialCopresenceCounts' >> beam.FlatMap(
          _to_partial_copresence_counts, self._y_path, self._x_paths,
          self._y_boundaries, self._weight_column_name))

  # Compute placeholder copresence counts.
  # partial_copresence_counts will only include x-y pairs that are present,
  # but we would also like to keep track of x-y pairs that never appear, as
  # long as x and y independently occur in the slice.

  # _SlicedXKey(slice, x_path, x), x_count
  x_counts = (
      sliced_record_batchs
      | 'ToPartialXCounts' >> beam.FlatMap(
          _to_partial_x_counts, self._x_paths, self._weight_column_name)
      | 'SumXCounts' >> beam.CombinePerKey(sum))
  if self._min_x_count:
    x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
        lambda kv: kv[1] > self._min_x_count)

  # _SlicedXYKey(slice, x_path, x, y), 0
  placeholder_copresence_counts = (
      (x_counts, y_keys)
      | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
          self._x_paths, self._min_x_count))

  def move_y_to_value(key, xy_count):
    return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

  # _SlicedXKey(slice, x_path, x), (y, xy_count)
  copresence_counts = (
      (placeholder_copresence_counts, partial_copresence_counts)
      | 'FlattenCopresenceCounts' >> beam.Flatten()
      | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
      | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

  # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
  return ({'x_count': x_counts, 'xy_counts': copresence_counts}
          | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey()
          | 'JoinXCounts' >> beam.FlatMap(_join_x_counts))
Example #18
Source File: deep_copy_test.py From transform with Apache License 2.0
def testFlatten(self):
  with beam.Pipeline() as p:
    create_1 = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
    create_2 = p | 'Create2' >> beam.Create([(3, 'c')])
    created = (create_1, create_2) | 'Flatten1' >> beam.Flatten()
    grouped1 = (created
                | 'PreGroup1' >> beam.Map(
                    lambda x: DeepCopyTest._CountingIdentityFn('PreGroup1', x))
                | 'GBK1' >> beam.GroupByKey())
    grouped2 = (p
                | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
                | 'PreGroup2' >> beam.Map(
                    lambda x: DeepCopyTest._CountingIdentityFn('PreGroup2', x))
                | 'GBK2' >> beam.GroupByKey())
    modified1 = (
        grouped1
        | 'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1')))
    modified2 = (
        grouped2
        | 'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))
    flattened = (modified1, modified2) | 'Flatten2' >> beam.Flatten()
    modified3 = (
        flattened
        | 'Add3' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add3')))

    copied = deep_copy.deep_copy(modified3)

    # Check that deep copy was performed.
    self.assertIsNot(copied.producer.inputs[0], modified3.producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                     modified3.producer.inputs[0].producer.inputs[0])
    self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                     modified3.producer.inputs[0].producer.inputs[1])

    # Check that copy stops at materialization boundary.
    self.assertIs(
        copied.producer.inputs[0].producer.inputs[0].producer.inputs[0],
        modified3.producer.inputs[0].producer.inputs[0].producer.inputs[0])
    self.assertIs(
        copied.producer.inputs[0].producer.inputs[1].producer.inputs[0],
        modified3.producer.inputs[0].producer.inputs[1].producer.inputs[0])

  # Check counts of processed items.
  self.assertEqual(DeepCopyTest._counts['PreGroup1'], 3)
  self.assertEqual(DeepCopyTest._counts['PreGroup2'], 3)
  self.assertEqual(DeepCopyTest._counts['Add1'], 6)
  self.assertEqual(DeepCopyTest._counts['Add2'], 6)
  self.assertEqual(DeepCopyTest._counts['Add3'], 12)
Example #19
Source File: executor.py From tfx with Apache License 2.0
def _run_model_inference(self, model_path: Text,
                         example_uris: Mapping[Text, Text],
                         output_path: Text,
                         model_spec: bulk_inferrer_pb2.ModelSpec) -> None:
  """Runs model inference on given example data.

  Args:
    model_path: Path to model.
    example_uris: Mapping of example split name to example uri.
    output_path: Path to output generated prediction logs.
    model_spec: bulk_inferrer_pb2.ModelSpec instance.

  Returns:
    None
  """
  try:
    from tfx_bsl.public.beam import run_inference
    from tfx_bsl.public.proto import model_spec_pb2
  except ImportError:
    # TODO(b/151468119): Remove this branch after next release.
    run_inference = importlib.import_module('tfx_bsl.beam.run_inference')
    model_spec_pb2 = importlib.import_module('tfx_bsl.proto.model_spec_pb2')
  saved_model_spec = model_spec_pb2.SavedModelSpec(
      model_path=model_path,
      tag=model_spec.tag,
      signature_name=model_spec.model_signature_name)
  # TODO(b/151468119): Remove this branch after next release.
  if getattr(model_spec_pb2, 'InferenceEndpoint', False):
    inference_endpoint = getattr(model_spec_pb2, 'InferenceEndpoint')()
  else:
    inference_endpoint = model_spec_pb2.InferenceSpecType()
  inference_endpoint.saved_model_spec.CopyFrom(saved_model_spec)
  with self._make_beam_pipeline() as pipeline:
    data_list = []
    for split, example_uri in example_uris.items():
      data = (
          pipeline
          | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
              file_pattern=io_utils.all_files_pattern(example_uri)))
      data_list.append(data)
    _ = (
        [data for data in data_list]
        | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
        | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
        | 'RunInference' >> run_inference.RunInference(inference_endpoint)
        | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
            output_path,
            file_name_suffix='.gz',
            coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))
  logging.info('Inference result written to %s.', output_path)
Example #20
Source File: model_eval_lib.py From model-analysis with Apache License 2.0
def ExtractAndEvaluate(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection, extractors: List[extractor.Extractor],
    evaluators: List[evaluator.Evaluator]) -> evaluator.Evaluation:
  """Performs Extractions and Evaluations in provided order."""
  # evaluation[k] = list of values for k
  evaluation = {}

  def update(evaluation: Dict[Text, Any], new_evaluation: Dict[Text, Any]):
    for k, v in new_evaluation.items():
      if k not in evaluation:
        evaluation[k] = []
      evaluation[k].append(v)
    return evaluation

  # Run evaluators that run before extraction (i.e. that only require
  # the incoming input extract added by ReadInputs)
  for v in evaluators:
    if not v.run_after:
      update(evaluation, extracts | v.stage_name >> v.ptransform)
  for x in extractors:
    extracts = (extracts | x.stage_name >> x.ptransform)
    for v in evaluators:
      if v.run_after == x.stage_name:
        update(evaluation, extracts | v.stage_name >> v.ptransform)
  for v in evaluators:
    if v.run_after == extractor.LAST_EXTRACTOR_STAGE_NAME:
      update(evaluation, extracts | v.stage_name >> v.ptransform)

  # Merge multi-valued keys if necessary.
  result = {}
  for k, v in evaluation.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    # Note that we assume that if a key is multivalued, its values are
    # dictionaries with disjoint keys. The combined value will simply be the
    # disjoint union of all the dictionaries.
    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))
  return result
Example #21
Source File: beam_prepare_embedding_inputs.py From exoplanet-ml with Apache License 2.0
def main(argv):
  del argv  # Unused.
  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the
    # results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)

      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))

      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  pipeline.run()
  logging.info("Preprocessing complete.")
Example #22
Source File: poisson_bootstrap.py From model-analysis with Apache License 2.0
def ComputeWithConfidenceIntervals(  # pylint: disable=invalid-name
    sliced_extracts: beam.pvalue.PCollection,
    compute_per_slice_metrics_cls: Type[beam.PTransform],
    num_bootstrap_samples: Optional[int] = DEFAULT_NUM_BOOTSTRAP_SAMPLES,
    random_seed_for_testing: Optional[int] = None,
    **kwargs) -> beam.pvalue.PCollection:
  """PTransform for computing metrics using T-Distribution values.

  Args:
    sliced_extracts: Incoming PCollection consisting of slice key and
      extracts.
    compute_per_slice_metrics_cls: PTransform class that takes a PCollection
      of (slice key, extracts) as input and returns (slice key, dict of
      metrics) as output. The class will be instantiated multiple times to
      compute metrics both with and without sampling. The class will be
      initialized using kwargs 'compute_with_sampling' and
      'random_seed_for_testing' along with any kwargs passed in **kwargs.
    num_bootstrap_samples: Number of replicas to use in calculating
      uncertainty using bootstrapping. If 1 is provided (default), aggregate
      metrics will be calculated with no uncertainty. If num_bootstrap_samples
      is > 0, multiple samples of each slice will be calculated using the
      Poisson bootstrap method. To calculate standard errors,
      num_bootstrap_samples should be 20 or more in order to provide useful
      data. More is better, but you pay a performance cost.
    random_seed_for_testing: Seed to use for unit testing, because
      nondeterministic tests stink. Each partition will use this value + i.
    **kwargs: Additional args to pass to compute_per_slice_metrics_cls init.

  Returns:
    PCollection of (slice key, dict of metrics)
  """
  if not num_bootstrap_samples:
    num_bootstrap_samples = 1
  # TODO(ckuhn): Cap the number of bootstrap samples at 20.
  if num_bootstrap_samples < 1:
    raise ValueError('num_bootstrap_samples should be > 0, got %d' %
                     num_bootstrap_samples)

  output_results = (
      sliced_extracts
      | 'ComputeUnsampledMetrics' >> compute_per_slice_metrics_cls(
          compute_with_sampling=False,
          random_seed_for_testing=None,
          **kwargs))
  if num_bootstrap_samples > 1:
    multicombine = []
    for i in range(num_bootstrap_samples):
      seed = (None if random_seed_for_testing is None else
              random_seed_for_testing + i)
      multicombine.append(
          sliced_extracts
          | 'ComputeSampledMetrics%d' % i >> compute_per_slice_metrics_cls(
              compute_with_sampling=True,
              random_seed_for_testing=seed,
              **kwargs))
    output_results = (
        multicombine
        | 'FlattenBootstrapPartitions' >> beam.Flatten()
        | 'GroupBySlice' >> beam.GroupByKey()
        | 'MergeBootstrap' >> beam.ParDo(_MergeBootstrap(),
                                         beam.pvalue.AsDict(output_results)))
  return output_results
Example #23
Source File: stats_impl.py From data-validation with Apache License 2.0
def expand(self, dataset: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  # Handles generators by their type:
  #   - CombinerStatsGenerators will be wrapped in a single CombinePerKey by
  #     _CombinerStatsGeneratorsCombineFn.
  #   - TransformStatsGenerator will be invoked separately with `dataset`.
  combiner_stats_generators = []
  result_protos = []
  for generator in get_generators(self._options):
    if isinstance(generator, stats_generator.CombinerStatsGenerator):
      combiner_stats_generators.append(generator)
    elif isinstance(generator, stats_generator.TransformStatsGenerator):
      result_protos.append(dataset | generator.name >> generator.ptransform)
    else:
      raise TypeError('Statistics generator must extend one of '
                      'CombinerStatsGenerator or TransformStatsGenerator, '
                      'found object of type %s' %
                      generator.__class__.__name__)
  if combiner_stats_generators:
    # TODO(b/115685296): Obviate the need for explicit fanout.
    fanout = 5 * int(math.ceil(math.sqrt(len(combiner_stats_generators))))
    result_protos.append(
        dataset
        | 'RunCombinerStatsGenerators' >> beam.CombinePerKey(
            _CombinerStatsGeneratorsCombineFn(
                combiner_stats_generators,
                self._options.desired_batch_size)).with_hot_key_fanout(fanout))

  # result_protos is a list of PCollections of (slice key,
  # DatasetFeatureStatistics proto) pairs. We now flatten the list into a
  # single PCollection, combine the DatasetFeatureStatistics protos by key,
  # and then merge the DatasetFeatureStatistics protos in the PCollection
  # into a single DatasetFeatureStatisticsList proto.
  return (result_protos
          | 'FlattenFeatureStatistics' >> beam.Flatten()
          | 'MergeDatasetFeatureStatisticsProtos' >> beam.CombinePerKey(
              _merge_dataset_feature_stats_protos)
          | 'AddSliceKeyToStatsProto' >> beam.Map(
              _add_slice_key, self._is_slicing_enabled)
          | 'ToList' >> beam.combiners.ToList()
          | 'MakeDatasetFeatureStatisticsListProto' >> beam.Map(
              _make_dataset_feature_statistics_list_proto))
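The tail of this pipeline, Flatten followed by a per-key merge and beam.combiners.ToList(), collapses many sliced results into a single-element PCollection. A minimal sketch with invented slice keys:

import apache_beam as beam

with beam.Pipeline() as p:
  stats_a = p | 'A' >> beam.Create([('slice1', 1)])
  stats_b = p | 'B' >> beam.Create([('slice1', 2), ('slice2', 5)])
  result = ([stats_a, stats_b]
            | 'Flatten' >> beam.Flatten()
            | 'MergePerKey' >> beam.CombinePerKey(sum)
            | 'ToList' >> beam.combiners.ToList())  # single-element PCollection
  result | beam.Map(print)  # e.g. [('slice1', 3), ('slice2', 5)] (order not guaranteed)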