Python apache_beam.Flatten() Examples

The following are 23 code examples of apache_beam.Flatten(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam , or try the search function .
Example #1
Source File: pipeline_common.py    From gcp-variant-transforms with Apache License 2.0 6 votes vote down vote up
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (pipeline
                          | 'ReadAnnotatedVCF'
                          >> beam.Create([annotated_vcf_pattern])
                          | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders'
        >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header 
Example #2
Source File: data_linter.py    From data-linter with Apache License 2.0 6 votes vote down vote up
def expand(self, examples):
    """Runs the linters on the data and writes out the results.

    The order in which the linters run is unspecified.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Examples`.

    Returns:
      A pipeline containing the `DataLinter` `PTransform`s.
    """
    coders = (beam.coders.coders.StrUtf8Coder(),
              beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
    return (
        [examples | linter for linter in self._linters if linter.should_run()]
        | 'MergeResults' >> beam.Flatten()
        | 'DropEmpty' >> beam.Filter(lambda (_, r): r and len(r.warnings))
        | 'ToDict' >> beam.combiners.ToDict()
        | 'WriteResults' >> beam.io.textio.WriteToText(
            self._results_path,
            coder=beam.coders.coders.PickleCoder(),
            shard_name_template='')) 
Example #3
Source File: _util.py    From pydatalab with Apache License 2.0 6 votes vote down vote up
def get_sources_from_dataset(p, dataset, mode):
  """get pcollection from dataset."""

  import apache_beam as beam
  import csv
  from google.datalab.ml import CsvDataSet, BigQueryDataSet

  check_dataset(dataset, mode)
  if type(dataset) is CsvDataSet:
    source_list = []
    for ii, input_path in enumerate(dataset.files):
      source_list.append(p | 'Read from Csv %d (%s)' % (ii, mode) >>
                         beam.io.ReadFromText(input_path, strip_trailing_newlines=True))
    return (source_list |
            'Flatten Sources (%s)' % mode >>
            beam.Flatten() |
            'Create Dict from Csv (%s)' % mode >>
            beam.Map(lambda line: csv.DictReader([line], fieldnames=['image_url',
                                                                     'label']).next()))
  elif type(dataset) is BigQueryDataSet:
    bq_source = (beam.io.BigQuerySource(table=dataset.table) if dataset.table is not None else
                 beam.io.BigQuerySource(query=dataset.query))
    return p | 'Read source from BigQuery (%s)' % mode >> beam.io.Read(bq_source)
  else:
    raise ValueError('Invalid DataSet. Expect CsvDataSet or BigQueryDataSet') 
Example #4
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 6 votes vote down vote up
def expand(
      self,
      sliced_record_batchs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    unweighted_protos = (
        sliced_record_batchs
        | 'ComputeUnweightedLift' >> self._unweighted_generator)
    if not self._weight_column_name:
      # If no weight column name is given, only compute unweighted lift.
      return unweighted_protos

    weighted_protos = (
        sliced_record_batchs
        | 'ComputeWeightedLift' >> self._weighted_generator)

    return ((unweighted_protos, weighted_protos)
            | 'MergeUnweightedAndWeightedProtos' >> beam.Flatten()) 
Example #5
Source File: vcf_to_bq_preprocess.py    From gcp-variant-transforms with Apache License 2.0 6 votes vote down vote up
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  inferred_headers = (variants
                      | 'FilterVariants' >> filter_variants.FilterVariants()
                      | ' InferHeaderFields' >>
                      infer_headers.InferHeaderFields(
                          pvalue.AsSingleton(merged_header),
                          allow_incompatible_records=True,
                          infer_headers=True))

  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header 
Example #6
Source File: impl.py    From transform with Apache License 2.0 5 votes vote down vote up
def expand(self, pbegin):
    # TODO(b/151921205): we have to do an identity map for unmodified
    # PCollections below because otherwise we get an error from beam.
    identity_map = 'Identity' >> beam.Map(lambda x: x)
    if self._dataset_key.is_flattened_dataset_key():
      if self._flat_pcollection:
        return self._flat_pcollection | identity_map
      else:
        return (
            list(self._pcollection_dict.values())
            | 'FlattenAnalysisInputs' >> beam.Flatten(pipeline=pbegin.pipeline))
    else:
      return self._pcollection_dict[self._dataset_key] | identity_map 
Example #7
Source File: evaluator.py    From model-analysis with Apache License 2.0 5 votes vote down vote up
def combine_dict_based_evaluations(
    evaluations: Dict[Text, List[beam.pvalue.PCollection]]) -> Evaluation:
  """Combines multiple evaluation outputs together when the outputs are dicts.

  Note that the dict here refers to the output in the PCollection. The
  evaluations themselves are dicts of PCollections keyed by category ('metrics',
  'plots', 'analysis', etc). This util is used to group the outputs of one or
  more of these evaluations where the PCollections themselves must be dicts. For
  example, a 'metrics' evaluation might store its output in PCollection of dicts
  containing metric keys and metric values. This util would be used to group the
  outputs from running two or more independent metrics evaluations together into
  a single PCollection.

  Args:
    evaluations: Dict of lists of PCollections of outputs from different
      evaluators keyed by type of output ('metrics', 'plots', 'analysis', etc).

  Returns:
    Dict of consolidated PCollections of outputs keyed by type of output.
  """
  result = {}
  for k, v in evaluations.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))
  return result 
Example #8
Source File: jackknife.py    From model-analysis with Apache License 2.0 5 votes vote down vote up
def expand(self, sliced_extracts):

    def partition_fn(_, num_partitions):
      return self._random_state.randint(num_partitions)

    # Partition the data
    # List[PCollection[Tuple[slicer.SliceKeyType, types.Extracts]]]
    partitions = (
        sliced_extracts
        | 'Partition' >> beam.Partition(partition_fn,
                                        self._num_jackknife_samples))

    def add_partition_index(slice_key,
                            accumulator_and_size,
                            partition_index=None):
      accumulator, size = accumulator_and_size
      return slice_key, _PartitionInfo(accumulator, size, partition_index)

    # Within each partition, partially combine per slice key to get accumulators
    # and partition sizes; add partition_id for determinism.
    # List[PCollection[slicer.SliceKeyType, _PartitionInfo]]
    partition_accumulators = []
    for i, partition in enumerate(partitions):
      partition_accumulators.append(
          partition
          | 'CombinePartition[{}]'.format(i) >> beam.CombinePerKey(
              beam.transforms.combiners.SingleInputTupleCombineFn(
                  _AccumulateOnlyCombiner(combiner=self._combiner),
                  beam.transforms.combiners.CountCombineFn()))
          | 'AddPartitionId[{}]'.format(i) >> beam.MapTuple(
              add_partition_index, i))

    # Group partitions for the same slice, compute LOO metrics, and flatten back
    # into per-partition LOO metrics.
    # (slicer.SliceKeyType, Tuple[metric_types.MetricsDict])
    return (partition_accumulators
            | 'FlattenPartitionAccumulators' >> beam.Flatten()
            | 'CollectPerSlicePartitions' >> beam.GroupByKey()
            | 'MakeJackknifeSamples' >> beam.FlatMap(
                _make_jackknife_samples, combiner=self._combiner)) 
Example #9
Source File: cache_tasks_main.py    From text-to-text-transfer-transformer with Apache License 2.0 5 votes vote down vote up
def expand(self, pcoll):
    to_dict = lambda x: {x[0]: x[1]}
    example_counts = (
        pcoll
        | "count_examples" >> beam.combiners.Count.Globally()
        | "key_example_counts" >> beam.Map(
            lambda x: ("examples", x))
        | "example_count_dict" >> beam.Map(to_dict))
    def _count_tokens(pcoll, feat):
      return (
          pcoll
          | "key_%s_toks" % feat >> beam.Map(
              lambda x:  # pylint:disable=g-long-lambda
              ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))
    token_counts = (
        [_count_tokens(pcoll, feat)
         for feat in self._output_features]
        | "flatten_tokens" >> beam.Flatten()
        | "count_tokens" >> beam.CombinePerKey(sum)
        | "token_count_dict" >> beam.Map(to_dict))

    def _merge_dicts(dicts):
      merged_dict = {}
      for d in dicts:
        assert not set(merged_dict).intersection(d)
        merged_dict.update(d)
      return merged_dict
    return (
        [example_counts, token_counts]
        | "flatten_counts" >> beam.Flatten()
        | "merge_stats" >> beam.CombineGlobally(_merge_dicts)) 
Example #10
Source File: impl.py    From transform with Apache License 2.0 5 votes vote down vote up
def expand(self, inputs):
    return inputs | beam.Flatten() 
Example #11
Source File: analyzer_impls.py    From transform with Apache License 2.0 5 votes vote down vote up
def expand(self, inputs):
    if self._top_k is not None and self._top_k < 0:
      raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                       '{}.'.format(self._top_k))
    if self._frequency_threshold is not None and self._frequency_threshold < 0:
      raise ValueError(
          'frequency_threshold for VocabularyImpl should be >= 0 or None, '
          'got {}.'.format(self._frequency_threshold))
    if self._coverage_top_k is not None and self._coverage_top_k < 0:
      raise ValueError('coverage_top_k for VocabularyImpl should be >= 0 or '
                       'None, got {}.'.format(self._coverage_top_k))
    if (self._coverage_frequency_threshold is not None and
        self._coverage_frequency_threshold < 0):
      raise ValueError(
          'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
          'None, got {}.'.format(self._coverage_frequency_threshold))
    pcoll, = inputs

    result = (
        pcoll | 'ApplyThresholdsAndTopK' >> (
            _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
                self._frequency_threshold, self._top_k,
                self._informativeness_threshold, None)))

    if self._key_fn:
      # Note: current APIs do not allow for specifying a coverage
      # informativeness threshold.
      coverage_counts = (
          pcoll | 'ApplyCoverageThresholdAndTopK' >> (
              _ApplyThresholdsAndTopK(  # pylint: disable=no-value-for-parameter
                  self._coverage_frequency_threshold, self._coverage_top_k,
                  self._coverage_informativeness_threshold, self._key_fn)))

      result = ((result, coverage_counts)
                | 'MergeStandardAndCoverageArms' >> beam.Flatten()
                | 'RemoveDuplicates' >> beam.RemoveDuplicates())

    return result 
Example #12
Source File: sentiment_example.py    From transform with Apache License 2.0 5 votes vote down vote up
def ReadAndShuffleData(pcoll, filepatterns):
  """Read a train or test dataset from disk and shuffle it."""
  # NOTE: we pass filepatterns as a tuple instead of two args, as the current
  # version of beam assumes that if the first arg to a ptransfrom_fn is a
  # string, then that string is the label.
  neg_filepattern, pos_filepattern = filepatterns

  # Read from each file pattern and create a tuple of the review text and the
  # correct label.
  negative_examples = (
      pcoll
      | 'ReadNegativeExamples' >> beam.io.ReadFromText(neg_filepattern)
      | 'PairWithZero' >> beam.Map(lambda review: (review, 0)))
  positive_examples = (
      pcoll
      | 'ReadPositiveExamples' >> beam.io.ReadFromText(pos_filepattern)
      | 'PairWithOne' >> beam.Map(lambda review: (review, 1)))
  all_examples = (
      [negative_examples, positive_examples] | 'Merge' >> beam.Flatten())

  # Shuffle the data.  Note that the data does in fact contain duplicate reviews
  # for reasons that are unclear.  This means that NUM_TRAIN_INSTANCES and
  # NUM_TRAIN_INSTANCES are slightly wrong for the preprocessed data.
  # pylint: disable=no-value-for-parameter
  shuffled_examples = (
      all_examples
      | 'Distinct' >> beam.Distinct()
      | 'Shuffle' >> Shuffle())

  # Put the data in the format that can be accepted directly by tf.Transform.
  return shuffled_examples | 'MakeInstances' >> beam.Map(
      lambda p: {REVIEW_KEY: p[0], LABEL_KEY: p[1]}) 
Example #13
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0 5 votes vote down vote up
def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  annotation_fields_to_infer = (known_args.annotation_fields if
                                known_args.infer_annotation_types else [])
  inferred_headers = (
      _read_variants(all_patterns,
                     pipeline,
                     known_args,
                     pipeline_mode,
                     pre_infer_headers=known_args.infer_headers)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          annotation_fields_to_infer))
  merged_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header 
Example #14
Source File: _preprocess.py    From pydatalab with Apache License 2.0 5 votes vote down vote up
def configure_pipeline(p, dataset_train, dataset_eval, checkpoint_path, output_dir, job_id):
  source_train = _util.get_sources_from_dataset(p, dataset_train, 'train')
  labels_source = [source_train]
  if dataset_eval is not None:
    source_eval = _util.get_sources_from_dataset(p, dataset_eval, 'eval')
    labels_source.append(source_eval)

  labels = _labels_pipeline(labels_source)
  train_preprocessed = _transformation_pipeline(source_train, checkpoint_path, labels, 'train')
  if dataset_eval is not None:
    # explicit eval data.
    eval_preprocessed = _transformation_pipeline(source_eval, checkpoint_path, labels, 'eval')
  else:
    # Split train/eval.
    train_preprocessed, eval_preprocessed = (train_preprocessed |
                                             'Random Partition' >>
                                             beam.Partition(TrainEvalSplitPartitionFn(), 2))

  output_train_path = os.path.join(output_dir, job_id, 'train')
  output_eval_path = os.path.join(output_dir, job_id, 'eval')
  labels_file = os.path.join(output_dir, job_id, 'labels')
  labels_save = (labels |
                 'Write labels' >>
                 beam.io.textio.WriteToText(labels_file, shard_name_template=''))
  train_save = train_preprocessed | 'Save train to disk' >> SaveFeatures(output_train_path)
  eval_save = eval_preprocessed | 'Save eval to disk' >> SaveFeatures(output_eval_path)
  # Make sure we write "latest" file after train and eval data are successfully written.
  output_latest_file = os.path.join(output_dir, 'latest')
  ([eval_save, train_save, labels_save] | 'Wait for train eval saving' >> beam.Flatten() |
      'Fixed One' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) |
      beam.Map(lambda path: job_id) |
      'WriteLatest' >> beam.io.textio.WriteToText(output_latest_file, shard_name_template='')) 
Example #15
Source File: _preprocess.py    From pydatalab with Apache License 2.0 5 votes vote down vote up
def _labels_pipeline(sources):
  labels = (sources |
            'Flatten Sources for labels' >> beam.Flatten() |
            'Parse input for labels' >> beam.Map(lambda x: str(x['label'])) |
            'Combine labels' >> beam.transforms.combiners.Count.PerElement() |
            'Get labels' >> beam.Map(lambda label_count: label_count[0]))
  return labels 
Example #16
Source File: beam_reshuffle.py    From exoplanet-ml with Apache License 2.0 5 votes vote down vote up
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_patterns
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards

    # Create Pipeline.
    tfrecords = []
    for i, file_pattern in enumerate(FLAGS.input_file_patterns.split(",")):
      logging.info("Reading TFRecords from %s", file_pattern)
      stage_name = "read_tfrecords_{}".format(i)
      tfrecords.append(root | stage_name >> beam.io.tfrecordio.ReadFromTFRecord(
          file_pattern, coder=beam.coders.ProtoCoder(tf.train.Example)))

    # pylint: disable=expression-not-assigned
    (tfrecords
     | "flatten" >> beam.Flatten()
     | "count_labels" >> beam.ParDo(CountLabelsDoFn())
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  pipeline.run()
  logging.info("Processing complete.") 
Example #17
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0 4 votes vote down vote up
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch,
                                                      _SlicedYKey]):
    sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

    # _SlicedXYKey(slice, x_path, x, y), xy_count
    partial_copresence_counts = (
        sliced_record_batchs
        | 'ToPartialCopresenceCounts' >> beam.FlatMap(
            _to_partial_copresence_counts, self._y_path, self._x_paths,
            self._y_boundaries, self._weight_column_name))

    # Compute placerholder copresence counts.
    # partial_copresence_counts will only include x-y pairs that are present,
    # but we would also like to keep track of x-y pairs that never appear, as
    # long as x and y independently occur in the slice.

    # _SlicedXKey(slice, x_path, x), x_count
    x_counts = (
        sliced_record_batchs
        | 'ToPartialXCounts' >> beam.FlatMap(
            _to_partial_x_counts, self._x_paths, self._weight_column_name)
        | 'SumXCounts' >> beam.CombinePerKey(sum))
    if self._min_x_count:
      x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
          lambda kv: kv[1] > self._min_x_count)

    # _SlicedXYKey(slice, x_path, x, y), 0
    placeholder_copresence_counts = (
        (x_counts, y_keys)
        | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
            self._x_paths, self._min_x_count))

    def move_y_to_value(key, xy_count):
      return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

    # _SlicedXKey(slice, x_path, x), (y, xy_count)
    copresence_counts = (
        (placeholder_copresence_counts, partial_copresence_counts)
        | 'FlattenCopresenceCounts' >> beam.Flatten()
        | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
        | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

    # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
    return ({
        'x_count': x_counts,
        'xy_counts': copresence_counts
    }
            | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey()
            | 'JoinXCounts' >> beam.FlatMap(_join_x_counts)) 
Example #18
Source File: deep_copy_test.py    From transform with Apache License 2.0 4 votes vote down vote up
def testFlatten(self):
    with beam.Pipeline() as p:
      create_1 = p | 'Create1' >> beam.Create([(1, 'a'), (2, 'b')])
      create_2 = p | 'Create2' >> beam.Create([(3, 'c')])
      created = (create_1, create_2) | 'Flatten1' >> beam.Flatten()
      grouped1 = (created
                  | 'PreGroup1' >> beam.Map(
                      lambda x: DeepCopyTest._CountingIdentityFn(
                          'PreGroup1', x))
                  | 'GBK1' >> beam.GroupByKey())
      grouped2 = (p
                  | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
                  | 'PreGroup2' >> beam.Map(
                      lambda x: DeepCopyTest._CountingIdentityFn(
                          'PreGroup2', x))
                  | 'GBK2' >> beam.GroupByKey())
      modified1 = (
          grouped1
          |
          'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1')))
      modified2 = (
          grouped2
          |
          'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))
      flattened = (modified1, modified2) | 'Flatten2' >> beam.Flatten()
      modified3 = (
          flattened
          |
          'Add3' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add3')))

      copied = deep_copy.deep_copy(modified3)

      # Check that deep copy was performed.
      self.assertIsNot(copied.producer.inputs[0], modified3.producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[0],
                       modified3.producer.inputs[0].producer.inputs[0])
      self.assertIsNot(copied.producer.inputs[0].producer.inputs[1],
                       modified3.producer.inputs[0].producer.inputs[1])

      # Check that copy stops at materialization boundary.
      self.assertIs(
          copied.producer.inputs[0].producer.inputs[0].producer.inputs[0],
          modified3.producer.inputs[0].producer.inputs[0].producer.inputs[0])
      self.assertIs(
          copied.producer.inputs[0].producer.inputs[1].producer.inputs[0],
          modified3.producer.inputs[0].producer.inputs[1].producer.inputs[0])

    # Check counts of processed items.
    self.assertEqual(DeepCopyTest._counts['PreGroup1'], 3)
    self.assertEqual(DeepCopyTest._counts['PreGroup2'], 3)
    self.assertEqual(DeepCopyTest._counts['Add1'], 6)
    self.assertEqual(DeepCopyTest._counts['Add2'], 6)
    self.assertEqual(DeepCopyTest._counts['Add3'], 12) 
Example #19
Source File: executor.py    From tfx with Apache License 2.0 4 votes vote down vote up
def _run_model_inference(self, model_path: Text,
                           example_uris: Mapping[Text, Text],
                           output_path: Text,
                           model_spec: bulk_inferrer_pb2.ModelSpec) -> None:
    """Runs model inference on given example data.

    Args:
      model_path: Path to model.
      example_uris: Mapping of example split name to example uri.
      output_path: Path to output generated prediction logs.
      model_spec: bulk_inferrer_pb2.ModelSpec instance.

    Returns:
      None
    """

    try:
      from tfx_bsl.public.beam import run_inference
      from tfx_bsl.public.proto import model_spec_pb2
    except ImportError:
      # TODO(b/151468119): Remove this branch after next release.
      run_inference = importlib.import_module('tfx_bsl.beam.run_inference')
      model_spec_pb2 = importlib.import_module('tfx_bsl.proto.model_spec_pb2')
    saved_model_spec = model_spec_pb2.SavedModelSpec(
        model_path=model_path,
        tag=model_spec.tag,
        signature_name=model_spec.model_signature_name)
    # TODO(b/151468119): Remove this branch after next release.
    if getattr(model_spec_pb2, 'InferenceEndpoint', False):
      inference_endpoint = getattr(model_spec_pb2, 'InferenceEndpoint')()
    else:
      inference_endpoint = model_spec_pb2.InferenceSpecType()
    inference_endpoint.saved_model_spec.CopyFrom(saved_model_spec)
    with self._make_beam_pipeline() as pipeline:
      data_list = []
      for split, example_uri in example_uris.items():
        data = (
            pipeline | 'ReadData[{}]'.format(split) >> beam.io.ReadFromTFRecord(
                file_pattern=io_utils.all_files_pattern(example_uri)))
        data_list.append(data)
      _ = (
          [data for data in data_list]
          | 'FlattenExamples' >> beam.Flatten(pipeline=pipeline)
          | 'ParseExamples' >> beam.Map(tf.train.Example.FromString)
          | 'RunInference' >> run_inference.RunInference(inference_endpoint)
          | 'WritePredictionLogs' >> beam.io.WriteToTFRecord(
              output_path,
              file_name_suffix='.gz',
              coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog)))
    logging.info('Inference result written to %s.', output_path) 
Example #20
Source File: model_eval_lib.py    From model-analysis with Apache License 2.0 4 votes vote down vote up
def ExtractAndEvaluate(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection, extractors: List[extractor.Extractor],
    evaluators: List[evaluator.Evaluator]) -> evaluator.Evaluation:
  """Performs Extractions and Evaluations in provided order."""
  # evaluation[k] = list of values for k
  evaluation = {}

  def update(evaluation: Dict[Text, Any], new_evaluation: Dict[Text, Any]):
    for k, v in new_evaluation.items():
      if k not in evaluation:
        evaluation[k] = []
      evaluation[k].append(v)
    return evaluation

  # Run evaluators that run before extraction (i.e. that only require
  # the incoming input extract added by ReadInputs)
  for v in evaluators:
    if not v.run_after:
      update(evaluation, extracts | v.stage_name >> v.ptransform)
  for x in extractors:
    extracts = (extracts | x.stage_name >> x.ptransform)
    for v in evaluators:
      if v.run_after == x.stage_name:
        update(evaluation, extracts | v.stage_name >> v.ptransform)
  for v in evaluators:
    if v.run_after == extractor.LAST_EXTRACTOR_STAGE_NAME:
      update(evaluation, extracts | v.stage_name >> v.ptransform)

  # Merge multi-valued keys if necessary.
  result = {}
  for k, v in evaluation.items():
    if len(v) == 1:
      result[k] = v[0]
      continue

    # Note that we assume that if a key is multivalued, its values are
    # dictionaries with disjoint keys. The combined value will simply be the
    # disjoint union of all the dictionaries.
    result[k] = (
        v
        | 'FlattenEvaluationOutput(%s)' % k >> beam.Flatten()
        | 'CombineEvaluationOutput(%s)' % k >> beam.CombinePerKey(
            _CombineEvaluationDictionariesFn()))

  return result 
Example #21
Source File: beam_prepare_embedding_inputs.py    From exoplanet-ml with Apache License 2.0 4 votes vote down vote up
def main(argv):
  del argv  # Unused.
  logging.set_verbosity(logging.INFO)

  def pipeline(root):
    """Beam pipeline for preprocessing Kepler events."""
    # Separately process and write each TCE dataset, and gather all the results.
    configs = _parse_configs()
    subsets = {
        "train": [],
        "val": [],
        "test": [],
    }
    for config in configs:
      output_dir = os.path.join(FLAGS.output_dir, config.name)
      # Write the config.
      config_json = json.dumps(config, indent=2)
      logging.info(config_json)
      (root
       | "{}-create-config".format(config.name) >> beam.Create([config_json])
       | "{}-write_config".format(config.name) >> beam.io.WriteToText(
           os.path.join(output_dir, "config.json"),
           num_shards=1,
           shard_name_template=""))
      # Process TCEs and write each subset.
      results = _process_tces(root, config)
      for subset_name, subset_values in results:
        _write_subset(config.name, subset_name, subset_values)
        subsets[subset_name].append(subset_values)

    # Create one dataset comprising all TCE datasets.
    for subset_name, subset_values in subsets.items():
      combined_subset_values = (
          subset_values
          | "combined-{}-flatten".format(subset_name) >> beam.Flatten()
          | "combined-{}-count_labels".format(subset_name) >> beam.ParDo(
              _CountLabelsDoFn(prefix="combined-{}".format(subset_name)))
          | "combined-{}-reshuffle".format(subset_name) >> beam.Reshuffle())
      _write_subset("combined", subset_name, combined_subset_values)

  pipeline.run()
  logging.info("Preprocessing complete.") 
Example #22
Source File: poisson_bootstrap.py    From model-analysis with Apache License 2.0 4 votes vote down vote up
def ComputeWithConfidenceIntervals(  # pylint: disable=invalid-name
    sliced_extracts: beam.pvalue.PCollection,
    compute_per_slice_metrics_cls: Type[beam.PTransform],
    num_bootstrap_samples: Optional[int] = DEFAULT_NUM_BOOTSTRAP_SAMPLES,
    random_seed_for_testing: Optional[int] = None,
    **kwargs) -> beam.pvalue.PCollection:
  """PTransform for computing metrics using T-Distribution values.

  Args:
    sliced_extracts: Incoming PCollection consisting of slice key and extracts.
    compute_per_slice_metrics_cls: PTransform class that takes a PCollection of
      (slice key, extracts) as input and returns (slice key, dict of metrics) as
      output. The class will be instantiated multiple times to compute metrics
      both with and without sampling. The class will be initialized using kwargs
      'compute_with_sampling' and 'random_seed_for_testing' along with any
      kwargs passed in **kwargs.
    num_bootstrap_samples: Number of replicas to use in calculating uncertainty
      using bootstrapping. If 1 is provided (default), aggregate metrics will be
      calculated with no uncertainty. If num_bootstrap_samples is > 0, multiple
      samples of each slice will be calculated using the Poisson bootstrap
      method. To calculate standard errors, num_bootstrap_samples should be 20
      or more in order to provide useful data. More is better, but you pay a
      performance cost.
    random_seed_for_testing: Seed to use for unit testing, because
      nondeterministic tests stink. Each partition will use this value + i.
    **kwargs: Additional args to pass to compute_per_slice_metrics_cls init.

  Returns:
    PCollection of (slice key, dict of metrics)
  """
  if not num_bootstrap_samples:
    num_bootstrap_samples = 1
  # TODO(ckuhn): Cap the number of bootstrap samples at 20.
  if num_bootstrap_samples < 1:
    raise ValueError('num_bootstrap_samples should be > 0, got %d' %
                     num_bootstrap_samples)

  output_results = (
      sliced_extracts
      | 'ComputeUnsampledMetrics' >> compute_per_slice_metrics_cls(
          compute_with_sampling=False, random_seed_for_testing=None, **kwargs))

  if num_bootstrap_samples > 1:
    multicombine = []
    for i in range(num_bootstrap_samples):
      seed = (None if random_seed_for_testing is None else
              random_seed_for_testing + i)
      multicombine.append(
          sliced_extracts
          | 'ComputeSampledMetrics%d' % i >> compute_per_slice_metrics_cls(
              compute_with_sampling=True,
              random_seed_for_testing=seed,
              **kwargs))
    output_results = (
        multicombine
        | 'FlattenBootstrapPartitions' >> beam.Flatten()
        | 'GroupBySlice' >> beam.GroupByKey()
        | 'MergeBootstrap' >> beam.ParDo(_MergeBootstrap(),
                                         beam.pvalue.AsDict(output_results)))
  return output_results 
Example #23
Source File: stats_impl.py    From data-validation with Apache License 2.0 4 votes vote down vote up
def expand(self, dataset: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    # Handles generators by their type:
    #   - CombinerStatsGenerators will be wrapped in a single CombinePerKey by
    #     _CombinerStatsGeneratorsCombineFn.
    #   - TransformStatsGenerator will be invoked separately with `dataset`.
    combiner_stats_generators = []
    result_protos = []
    for generator in get_generators(self._options):
      if isinstance(generator, stats_generator.CombinerStatsGenerator):
        combiner_stats_generators.append(generator)
      elif isinstance(generator, stats_generator.TransformStatsGenerator):
        result_protos.append(
            dataset
            | generator.name >> generator.ptransform)
      else:
        raise TypeError('Statistics generator must extend one of '
                        'CombinerStatsGenerator or TransformStatsGenerator, '
                        'found object of type %s' %
                        generator.__class__.__name__)
    if combiner_stats_generators:
      # TODO(b/115685296): Obviate the need for explicit fanout.
      fanout = 5 * int(math.ceil(math.sqrt(len(combiner_stats_generators))))
      result_protos.append(dataset
                           | 'RunCombinerStatsGenerators'
                           >> beam.CombinePerKey(
                               _CombinerStatsGeneratorsCombineFn(
                                   combiner_stats_generators,
                                   self._options.desired_batch_size
                                   )).with_hot_key_fanout(fanout))

    # result_protos is a list of PCollections of (slice key,
    # DatasetFeatureStatistics proto) pairs. We now flatten the list into a
    # single PCollection, combine the DatasetFeatureStatistics protos by key,
    # and then merge the DatasetFeatureStatistics protos in the PCollection into
    # a single DatasetFeatureStatisticsList proto.
    return (result_protos
            | 'FlattenFeatureStatistics' >> beam.Flatten()
            | 'MergeDatasetFeatureStatisticsProtos' >>
            beam.CombinePerKey(_merge_dataset_feature_stats_protos)
            | 'AddSliceKeyToStatsProto' >> beam.Map(
                _add_slice_key,
                self._is_slicing_enabled)
            | 'ToList' >> beam.combiners.ToList()
            | 'MakeDatasetFeatureStatisticsListProto' >>
            beam.Map(_make_dataset_feature_statistics_list_proto))