Python apache_beam.CombineGlobally() Examples

The following are 26 code examples of apache_beam.CombineGlobally(), taken from open source projects. The source file, project, and license for each example are listed above it.
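As a quick orientation before the project examples, here is a minimal sketch (not taken from any of the projects below) showing the basic shape of the transform: CombineGlobally reduces an entire PCollection to a single output element and accepts either a plain callable such as sum or a beam.CombineFn.

import apache_beam as beam

# Minimal sketch: reduce a whole PCollection to one element.
with beam.Pipeline() as p:
  _ = (p
       | 'Create' >> beam.Create([1, 2, 3, 4])
       | 'SumAll' >> beam.CombineGlobally(sum)   # plain callable as the combiner
       | 'Print' >> beam.Map(print))             # single output element: 10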
Example #1
Source File: linters.py    From data-linter with Apache License 2.0
def _lint(self, examples):
    """Returns the `PTransform` for the EmptyExampleDetector linter.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [num empties]
        lint_sample: None
    """
    n_empties = (
        examples
        | 'DetectEmpties' >> beam.Map(self._example_is_empty)
        | 'Count' >> beam.CombineGlobally(sum)
        | 'NoZero' >> beam.Filter(bool)
        | 'ToResult' >> beam.Map(
            lambda w: self._make_result(warnings=[str(w)])))
    return n_empties 
Example #2
Source File: analyzer_impls.py    From transform with Apache License 2.0
def expand(self, inputs):
    pcoll, = inputs
    # We specify a fanout so that the packed combiner doesn't exhibit stragglers
    # during the 'reduce' phase when we have a lot of combine analyzers packed.
    fanout = int(math.ceil(math.sqrt(len(self._combiners))))
    # TODO(b/34792459): Don't set with_defaults.
    return (
        pcoll
        | 'InitialPackedCombineGlobally' >> beam.CombineGlobally(
            _PackedCombinerWrapper(
                self._combiners,
                self._tf_config,
                is_combining_accumulators=False
            )
        ).with_fanout(fanout).with_defaults(False)
        | 'Count' >>
        common.IncrementCounter('num_packed_accumulate_combiners')) 
Example #3
Source File: utils_test.py    From text with Apache License 2.0
def testNotEqual(self):
    with TestPipeline() as p:
      sample_input = [('I', 'en'), ('kind', 'en'), ('of', 'en'), ('like', 'en'),
                      ('to', 'en'), ('eat', 'en'), ('pie', 'en'), ('!', 'en'),
                      ('Je', 'fr'), ('suis', 'fr'), ('une', 'fr'),
                      ('fille', 'fr'), ('.', 'fr')]
      tokens = p | beam.Create(sample_input)
      result = (tokens
                | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
                | beam.ParDo(CompareValues()))
      assert_that(result, equal_to([True])) 
Example #4
Source File: csv_decoder_test.py    From tfx-bsl with Apache License 2.0
def test_invalid_row(self):
    input_lines = ['1,2.0,hello', '5,12.34']
    column_names = ['int_feature', 'float_feature', 'str_feature']
    with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
        ValueError, '.*Columns do not match specified csv headers.*'):
      with beam.Pipeline() as p:
        result = (
            p | beam.Create(input_lines, reshuffle=False)
            | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
            | beam.Keys()
            | beam.CombineGlobally(
                csv_decoder.ColumnTypeInferrer(
                    column_names, skip_blank_lines=False)))
        beam_test_util.assert_that(result, lambda _: None) 
Example #5
Source File: cache_tasks_main.py    From text-to-text-transfer-transformer with Apache License 2.0
def expand(self, pcoll):
    to_dict = lambda x: {x[0]: x[1]}
    example_counts = (
        pcoll
        | "count_examples" >> beam.combiners.Count.Globally()
        | "key_example_counts" >> beam.Map(
            lambda x: ("examples", x))
        | "example_count_dict" >> beam.Map(to_dict))
    def _count_tokens(pcoll, feat):
      return (
          pcoll
          | "key_%s_toks" % feat >> beam.Map(
              lambda x:  # pylint:disable=g-long-lambda
              ("%s_tokens" % feat, int(sum(x[feat] > 1)) if feat in x else 0)))
    token_counts = (
        [_count_tokens(pcoll, feat)
         for feat in self._output_features]
        | "flatten_tokens" >> beam.Flatten()
        | "count_tokens" >> beam.CombinePerKey(sum)
        | "token_count_dict" >> beam.Map(to_dict))

    def _merge_dicts(dicts):
      merged_dict = {}
      for d in dicts:
        assert not set(merged_dict).intersection(d)
        merged_dict.update(d)
      return merged_dict
    return (
        [example_counts, token_counts]
        | "flatten_counts" >> beam.Flatten()
        | "merge_stats" >> beam.CombineGlobally(_merge_dicts)) 
Example #6
Source File: preprocess.py    From professional-services with Apache License 2.0
def check_size(p, name, path):
  """Performs checks on the input pipeline and stores stats in specfied path.

  Checks performed: counts rows and derives class distribution.

  Args:
    p: PCollection, input pipeline.
    name: string, unique identifier for the beam step.
    path: string, path to store stats.

  Returns:
    PCollection
  """

  class _Combine(beam.CombineFn):
    """Counts and take the average of positive classes in the pipeline."""

    def create_accumulator(self):
      return (0.0, 0.0)

    def add_input(self, sum_count, inputs):
      (s, count) = sum_count
      return s + inputs, count + 1

    def merge_accumulators(self, accumulators):
      sums, counts = zip(*accumulators)
      return sum(sums), sum(counts)

    # We should not consider the case count == 0 as an error (class initialized
    # with count == 0).
    def extract_output(self, sum_count):
      (s, count) = sum_count
      return count, (1.0 * s / count) if count else float('NaN')

  return (p
          | 'CheckMapTo_1_{}'.format(name) >>
          beam.Map(lambda x: x[constants.LABEL_COLUMN])
          | 'CheckSum_{}'.format(name) >> beam.CombineGlobally(_Combine())
          | 'CheckRecord_{}'.format(name) >> beam.io.WriteToText(
              '{}.txt'.format(path))) 
Example #7
Source File: utils_test.py    From text with Apache License 2.0
def testUnsorted(self):
    with TestPipeline() as p:
      tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
      result = tokens | beam.CombineGlobally(utils.SortByCount())
      assert_that(result, equal_to([[('c', 9), ('a', 5), ('d', 4), ('b', 2)]])) 
Example #8
Source File: merge_headers.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
    return pcoll | 'MergeHeaders' >> beam.CombineGlobally(
        _MergeHeadersFn(self._header_merger)).without_defaults() 
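Several of the transform examples on this page chain .without_defaults() or .with_defaults(False) onto CombineGlobally, as above. By default the transform emits a default value (the combiner applied to an empty input) even when the input PCollection is empty; disabling that is required under non-global windowing. The following is a small illustrative sketch, not taken from gcp-variant-transforms:

import apache_beam as beam

with beam.Pipeline() as p:
  empty = (p
           | 'Create' >> beam.Create([1, 2, 3])
           | 'DropAll' >> beam.Filter(lambda x: x > 100))  # empty PCollection

  # Default behaviour: an empty input still yields one element, the combiner
  # applied to nothing -- for sum, that is 0.
  _ = (empty
       | 'SumWithDefault' >> beam.CombineGlobally(sum)
       | 'PrintDefault' >> beam.Map(print))                # prints 0

  # without_defaults() (equivalent to with_defaults(False)): an empty input
  # yields an empty output PCollection; this form is also required when the
  # input uses non-global windowing.
  _ = (empty
       | 'SumNoDefault' >> beam.CombineGlobally(sum).without_defaults()
       | 'PrintNothing' >> beam.Map(print))                # prints nothing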
Example #9
Source File: utils_test.py    From text with Apache License 2.0
def testEqual(self):
    with TestPipeline() as p:
      tokens = p | beam.Create(self.sample_input)
      result = tokens | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
      assert_that(result, equal_to([{'en': 1.0, 'fr': 1.0}])) 
Example #10
Source File: analyzer_impls.py    From transform with Apache License 2.0
def expand(self, inputs):
    pcoll, = inputs

    return (
        pcoll
        | 'MergeCombinesGlobally' >> beam.CombineGlobally(
            _CombinerWrapper(
                self._combiner,
                self._tf_config,
                # TODO(b/34792459): Don't set with_defaults. We set it to False
                # for all combiners (even though QuantilesCombiner doesn't need
                # it to be set) as after combiner packing we will have a single
                # combiner and want a consistent behavior.
                is_combining_accumulators=True)).with_defaults(False)) 
Example #11
Source File: analyzer_impls.py    From transform with Apache License 2.0
def expand(self, inputs):
    pcoll, = inputs

    return (
        pcoll
        | 'InitialCombineGlobally' >> beam.CombineGlobally(
            _CombinerWrapper(
                self._combiner,
                self._tf_config,
                # TODO(b/34792459): Don't set with_defaults. We set it to False
                # for all combiners (even though QuantilesCombiner doesn't need
                # it to be set) as after combiner packing we will have a single
                # combiner and want a consistent behavior.
                is_combining_accumulators=False)).with_defaults(False)) 
Example #12
Source File: analyzer_impls.py    From transform with Apache License 2.0
def expand(self, inputs):
    pcoll, = inputs

    # TODO(b/34792459): Don't set with_defaults.
    return (
        pcoll
        | 'MergePackedCombinesGlobally' >> beam.CombineGlobally(
            _PackedCombinerWrapper(
                self._combiners,
                self._tf_config,
                is_combining_accumulators=True)).with_defaults(False)
        | 'Count' >>
        common.IncrementCounter('num_packed_merge_combiners')) 
Example #13
Source File: impl_test.py    From transform with Apache License 2.0
def expand(self, pcoll):
      output_tuple = (
          pcoll
          | beam.FlatMap(self._flatten_fn)
          | beam.CombineGlobally(self._sum_fn)
          | beam.FlatMap(self._extract_outputs).with_outputs('0', '1'))
      return (output_tuple['0'], output_tuple['1']) 
Example #14
Source File: datagen_beam.py    From magenta with Apache License 2.0
def get_stats_of_glyphazzn(filepattern, output_path):
  """Computes the Mean and Std across examples in glyphazzn dataset."""
  def pipeline(root):
    """Pipeline for computing means/std from dataset."""
    examples = root | 'Read' >> beam.io.tfrecordio.ReadFromTFRecord(filepattern)
    examples = examples | 'Deserialize' >> beam.Map(_decode_tfexample)
    examples = examples | 'GetMeanStdev' >> beam.CombineGlobally(MeanStddev())
    examples = examples | 'MeanStdevToSerializedTFRecord' >> beam.Map(
        _mean_to_example)
    (examples | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
        output_path, coder=beam.coders.ProtoCoder(tf.train.Example)))
  return pipeline 
Example #15
Source File: mlengine_prediction_summary.py    From airflow with Apache License 2.0
def MakeSummary(pcoll, metric_fn, metric_keys):  # pylint: disable=invalid-name
    """
    Summary PTransform used in Dataflow.
    """
    return (
        pcoll |
        "ApplyMetricFnPerInstance" >> beam.Map(metric_fn) |
        "PairWith1" >> beam.Map(lambda tup: tup + (1,)) |
        "SumTuple" >> beam.CombineGlobally(beam.combiners.TupleCombineFn(
            *([sum] * (len(metric_keys) + 1)))) |
        "AverageAndMakeDict" >> beam.Map(
            lambda tup: dict(
                [(name, tup[i] / tup[-1]) for i, name in enumerate(metric_keys)] +
                [("count", tup[-1])]))) 
Example #16
Source File: merge_header_definitions.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, pcoll):
    return pcoll | beam.CombineGlobally(
        _MergeDefinitionsFn(self._definitions_merger)).without_defaults() 
Example #17
Source File: extract_input_size.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, sample_map):
    return (sample_map
            | 'GetListsOfValueCounts' >> beam.Values()
            | 'SumValueCountsPerSample' >> beam.Map(sum)
            | 'SumTotalValueCounts' >> beam.CombineGlobally(sum)) 
Example #18
Source File: extract_input_size.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, estimates):
    return (estimates
            | 'ExtractVariantCount' >> beam.Map(
                lambda estimate: estimate.estimated_variant_count)
            | 'SumVariantCounts' >> beam.CombineGlobally(sum)) 
Example #19
Source File: extract_input_size.py    From gcp-variant-transforms with Apache License 2.0
def expand(self, estimates):
    return (estimates
            | 'ExtractFileSize' >> beam.Map(
                lambda estimate: estimate.size_in_bytes)
            | 'SumFileSizes' >> beam.CombineGlobally(sum)) 
Example #20
Source File: generate_word_counts.py    From text with Apache License 2.0
def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
  """Returns a pipeline counting words and writing the output.

  Args:
    input_path: recordio file to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """

  lang_set = set(FLAGS.lang_set.split(','))

  # Create pipeline.
  pipeline = beam.Pipeline()

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)

    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(converter.decode))

    # Apply TF Transform.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'FilterLangAndExtractToken' >> tft_beam.AnalyzeAndTransformDataset(
            utils.count_preprocessing_fn(FLAGS.text_key,
                                         FLAGS.language_code_key)))

    # Filter by languages.
    tokens = (
        transformed_data
        | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

    # Calculate smoothing coefficients.
    coeffs = (
        tokens
        | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
            utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

    # Apply smoothing, aggregate counts, and sort words by count.
    _ = (
        tokens
        | 'ApplyExponentialSmoothing' >> beam.ParDo(
            utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
        | 'SumCounts' >> beam.CombinePerKey(sum)
        | 'FilterLowCounts' >> beam.ParDo(utils.FilterByCount(
            FLAGS.max_word_length, min_token_frequency))
        | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
        | 'Flatten' >> beam.FlatMap(lambda x: x)
        | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
        | 'WriteSortedCount' >> beam.io.WriteToText(
            output_path, shard_name_template=''))

  return pipeline 
Example #21
Source File: dataset.py    From tfx with Apache License 2.0
def convert_csv_to_tf_examples(self, csv_path, tfrecords_output_path):
    """Runs a Beam pipeline to convert the CSV file into a TFRecords file.

    This is needed because the conversion is orders of magnitude more
    time-consuming than the functions we want to benchmark, so instead of
    doing the conversion each time, we do it once to generate a converted
    dataset and use that for the benchmark instead.

    Args:
      csv_path: Path to CSV file containing examples.
      tfrecords_output_path: Path to output TFRecords file containing parsed
        examples.
    """
    # Copied from CSV example gen.
    fp = open(csv_path, "r")
    column_names = next(fp).strip().split(",")
    fp.close()

    with beam.Pipeline() as p:
      parsed_csv_lines = (
          p
          | "ReadFromText" >> beam.io.ReadFromText(
              file_pattern=csv_path, skip_header_lines=1)
          |
          "ParseCSVLine" >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=",")))
      # TODO(b/155997704) clean this up once tfx_bsl makes a release.
      if getattr(csv_decoder, "PARSE_CSV_LINE_YIELDS_RAW_RECORDS", False):
        # parsed_csv_lines is the following tuple (parsed_lines, raw_records)
        # we only want the parsed_lines.
        parsed_csv_lines |= "ExtractParsedCSVLines" >> beam.Keys()

      column_infos = beam.pvalue.AsSingleton(
          parsed_csv_lines
          | "InferColumnTypes" >> beam.CombineGlobally(
              csv_decoder.ColumnTypeInferrer(
                  column_names, skip_blank_lines=True)))
      _ = (
          parsed_csv_lines
          | "ToTFExample" >> beam.ParDo(
              csv_exgen._ParsedCsvToTfExample(),  # pylint: disable=protected-access
              column_infos)
          | "Serialize" >> beam.Map(lambda x: x.SerializeToString())
          | "WriteToTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
              file_path_prefix=tfrecords_output_path,
              shard_name_template="",
              compression_type=beam.io.filesystem.CompressionTypes.GZIP)) 
Example #22
Source File: executor.py    From tfx with Apache License 2.0
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV must have header line.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
  csv_pattern = os.path.join(input_base_uri, split_pattern)
  logging.info('Processing input csv data %s to TFExample.', csv_pattern)

  csv_files = tf.io.gfile.glob(csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(csv_pattern))

  column_names = io_utils.load_csv_column_names(csv_files[0])
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(csv_pattern))

  parsed_csv_lines = (
      pipeline
      | 'ReadFromText' >> beam.io.ReadFromText(
          file_pattern=csv_pattern, skip_header_lines=1)
      | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
  # TODO(b/155997704) clean this up once tfx_bsl makes a release.
  if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
    # parsed_csv_lines is the following tuple (parsed_lines, raw_records)
    # we only want the parsed_lines.
    parsed_csv_lines |= 'ExtractParsedCSVLines' >> beam.Keys()
  column_infos = beam.pvalue.AsSingleton(
      parsed_csv_lines
      | 'InferColumnTypes' >> beam.CombineGlobally(
          csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=True)))

  return (parsed_csv_lines
          | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos)) 
Example #23
Source File: preprocess.py    From professional-services with Apache License 2.0
def oversampling(p):
  """Oversamples the positive class elements contained in the input pipeline.

  Computes the current class distribution and re-samples the positive class to
  ensure a class distribution close to 50% / 50%. Samples each positive class
  item with a Bernoulli distribution approximated with a normal distribution
  (mean=ratio, var=ratio, where ratio is the factor by which we want to increase
  the number of positive samples).

  Args:
    p: PCollection.

  Returns:
    PCollection of re-balanced elements.

  Raises:
    ValueError: No positive class items found in pipeline.
  """

  # Computes percentage of positive class to use as side input in main pipeline.
  percentage = (
      p
      | 'ReduceToClass' >> beam.Map(lambda x: 1.0 * x[constants.LABEL_COLUMN])
      | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

  class _Sample(beam.DoFn):
    """DoFn that performs resampling element by element.

    Attributes:
      process: Function performing the resampling at element level.
    """

    def process(self, element, percent_positive):
      if not percent_positive:
        raise ValueError('No positive class items found in pipeline.')
      ratio = 1.0 / percent_positive
      n = (
          max(int(random.gauss(mu=ratio, sigma=ratio**0.5)), 0)
          if element[constants.LABEL_COLUMN] else 1)
      for _ in range(n):
        yield element

  proc = (
      p | 'DuplicateItemAndFlatten' >> beam.ParDo(
          _Sample(), percent_positive=beam.pvalue.AsSingleton(percentage)))

  return proc 
Example #24
Source File: revise_preprocessed_data.py    From cloudml-examples with Apache License 2.0
def run(argv=None):
  """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  revise_options = pipeline_options.view_as(ReviseOptions)
  cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  output_dir = os.path.join(revise_options.output,
                            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
  cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
  cloud_options.temp_location = os.path.join(output_dir, 'tmp')
  cloud_options.job_name = 'relabel-examples-%s' % (
      datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  metadata_query = str(
      Template(open(revise_options.metadata, 'r').read()).render(
          METADATA_QUERY_REPLACEMENTS))
  logging.info('metadata query : %s', metadata_query)

  with beam.Pipeline(options=pipeline_options) as p:
    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'ReadSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True))
        | 'TableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

    # Read the tf.Example protos into a PCollection.
    examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
        file_pattern=revise_options.input,
        compression_type=CompressionTypes.GZIP)

    # Filter the TensorFlow Example Protocol Buffers.
    filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
        lambda example, samples_metadata:
        filter_and_revise_example(example, samples_metadata),
        beam.pvalue.AsSingleton(samples_metadata)))

    # Write the subset of tf.Example protos to Cloud Storage.
    _ = (filtered_examples
         | 'SerializeExamples' >>
         beam.Map(lambda example: example.SerializeToString())
         | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(output_dir, 'examples'),
             compression_type=CompressionTypes.GZIP,
             file_name_suffix='.tfrecord.gz')) 
Example #25
Source File: metrics_plots_and_validations_writer.py    From model-analysis with Apache License 2.0
def _WriteMetricsPlotsAndValidations(  # pylint: disable=invalid-name
    evaluation: evaluator.Evaluation, output_paths: Dict[Text, Text],
    add_metrics_callbacks: List[types.AddMetricsCallbackType],
    metrics_key: Text, plots_key: Text, validations_key: Text,
    output_file_format: Text) -> beam.pvalue.PDone:
  """PTransform to write metrics and plots."""

  if output_file_format and output_file_format != 'tfrecord':
    raise ValueError(
        'only "{}" format is currently supported: output_file_format={}'.format(
            'tfrecord', output_file_format))

  if metrics_key in evaluation:
    metrics = (
        evaluation[metrics_key] | 'ConvertSliceMetricsToProto' >> beam.Map(
            convert_slice_metrics_to_proto,
            add_metrics_callbacks=add_metrics_callbacks))

    if constants.METRICS_KEY in output_paths:
      _ = metrics | 'WriteMetrics' >> beam.io.WriteToTFRecord(
          file_path_prefix=output_paths[constants.METRICS_KEY],
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' +
                            output_file_format if output_file_format else ''),
          coder=beam.coders.ProtoCoder(metrics_for_slice_pb2.MetricsForSlice))

  if plots_key in evaluation:
    plots = (
        evaluation[plots_key] | 'ConvertSlicePlotsToProto' >> beam.Map(
            convert_slice_plots_to_proto,
            add_metrics_callbacks=add_metrics_callbacks))

    if constants.PLOTS_KEY in output_paths:
      _ = plots | 'WritePlots' >> beam.io.WriteToTFRecord(
          file_path_prefix=output_paths[constants.PLOTS_KEY],
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' +
                            output_file_format if output_file_format else ''),
          coder=beam.coders.ProtoCoder(metrics_for_slice_pb2.PlotsForSlice))

  if validations_key in evaluation:
    validations = (
        evaluation[validations_key]
        |
        'MergeValidationResults' >> beam.CombineGlobally(_CombineValidations()))

    if constants.VALIDATIONS_KEY in output_paths:
      # We only use a single shard here because validations are usually single
      # values.
      _ = validations | 'WriteValidations' >> beam.io.WriteToTFRecord(
          file_path_prefix=output_paths[constants.VALIDATIONS_KEY],
          shard_name_template='',
          file_name_suffix=('.' +
                            output_file_format if output_file_format else ''),
          coder=beam.coders.ProtoCoder(validation_result_pb2.ValidationResult))

  return beam.pvalue.PDone(list(evaluation.values())[0].pipeline) 
Example #26
Source File: linters.py    From data-linter with Apache License 2.0
def _lint(self, examples):
    """Returns the `PTransform` for the DuplicateExampleDetector linter.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [num_duplicates]
        lint_sample: [ features: [sample duplicates...] ]
    """
    feature_names = sorted(f.name for f in self._stats.features)
    tuplize = utils.example_tuplizer(feature_names, denan=True)

    duplicates = (
        examples
        | 'Tuplize' >> beam.Map(lambda x: (tuplize(x), x))
        | 'CollectDuplicates' >> beam.GroupByKey()
        | 'ExamplesToList' >> beam.Map(
            lambda kv: (kv[0], list(kv[1])))  # kv == (example_tuple, examples)
        | 'FilterDuplicates' >> beam.Filter(
            lambda kv: len(kv[1]) > 1))

    samples = (
        duplicates
        | 'TakeExamples' >> beam.Map(lambda kv: kv[1][0])  # first duplicate
        | 'Sample' >> beam.combiners.Sample.FixedSizeGlobally(
            self.N_LINT_SAMPLES)
        | 'ToSample' >> beam.Map(
            lambda x: lint_result_pb2.LintSample(examples=x)))

    n_duplicates = (
        duplicates
        | 'CountDuplicates' >> beam.Map(lambda kv: len(kv[1]))
        | 'ExcessCounts' >> beam.Map(lambda x: x - 1)
        | 'Total' >> beam.CombineGlobally(sum))

    return (
        # this is effectively a `Flatten` but with deterministic argument order
        examples.pipeline
        | 'SyncSideInputs' >> beam.Create([None])
        | 'ToResult' >> beam.Map(self._to_result,
                                 beam.pvalue.AsSingleton(n_duplicates),
                                 beam.pvalue.AsSingleton(samples)))