Python apache_beam.Filter() Examples

The following are 8 code examples of apache_beam.Filter(), collected from open-source projects. beam.Filter(fn) produces a PCollection containing only the elements for which the predicate fn returns a truthy value. The source file and originating project are noted above each example. You may also want to check out all available functions/classes of the module apache_beam.
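As a quick orientation before the examples: beam.Filter takes a predicate and keeps only the elements that satisfy it. A minimal, self-contained sketch (the input values are illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | 'Create' >> beam.Create([1, 2, 3, 4, 5])
      | 'KeepEvens' >> beam.Filter(lambda x: x % 2 == 0)  # keeps 2 and 4
      | 'Print' >> beam.Map(print))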
Example #1
Source File: datagen_beam.py    From magenta with Apache License 2.0
def create_glyphazzn_dataset(filepattern, output_path):
  """Creates a glyphazzn dataset, from raw Parquetio to TFRecords."""
  def pipeline(root):
    """Pipeline for creating glyphazzn dataset."""
    attrs = ['uni', 'width', 'vwidth', 'sfd', 'id', 'binary_fp']

    examples = root | 'Read' >> beam.io.parquetio.ReadFromParquet(
        file_pattern=filepattern, columns=attrs)

    examples = examples | 'FilterBadIcons' >> beam.Filter(_is_valid_glyph)
    examples = examples | 'ConvertToPath' >> beam.Map(_convert_to_path)
    examples = examples | 'FilterBadPathLengths' >> beam.Filter(_is_valid_path)
    examples = examples | 'ProcessAndConvert' >> beam.Map(_create_example)
    (examples | 'WriteToTFRecord' >> beam.io.tfrecordio.WriteToTFRecord(
        output_path, num_shards=90))
  return pipeline 
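Note that create_glyphazzn_dataset returns a pipeline-building callable rather than running anything itself. A sketch of how it might be driven (the paths here are placeholders, not from the original source):

import apache_beam as beam

pipeline_fn = create_glyphazzn_dataset('/tmp/glyphs*.parquet',
                                       '/tmp/glyphazzn.tfrecord')
with beam.Pipeline() as root:
  pipeline_fn(root)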
Example #2
Source File: data_linter.py    From data-linter with Apache License 2.0
def expand(self, examples):
    """Runs the linters on the data and writes out the results.

    The order in which the linters run is unspecified.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A pipeline containing the `DataLinter` `PTransform`s.
    """
    coders = (beam.coders.coders.StrUtf8Coder(),
              beam.coders.coders.ProtoCoder(lint_result_pb2.LintResult))
    return (
        [examples | linter for linter in self._linters if linter.should_run()]
        | 'MergeResults' >> beam.Flatten()
        | 'DropEmpty' >> beam.Filter(lambda kv: kv[1] and len(kv[1].warnings))
        | 'ToDict' >> beam.combiners.ToDict()
        | 'WriteResults' >> beam.io.textio.WriteToText(
            self._results_path,
            coder=beam.coders.coders.PickleCoder(),
            shard_name_template='')) 
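The 'DropEmpty' step above filters keyed (name, result) pairs. In Python 3 the predicate receives each pair as a single argument (tuple parameter unpacking was removed by PEP 3113), so the pattern in isolation looks like this (data is illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('a', []), ('b', [1, 2])])
      | 'DropEmpty' >> beam.Filter(lambda kv: bool(kv[1]))  # keeps ('b', [1, 2])
      | beam.Map(print))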
Example #3
Source File: linters.py    From data-linter with Apache License 2.0
def _lint(self, examples):
    feature_val_w_counts = (
        examples
        | 'Tuplize' >> beam.FlatMap(
            utils.example_tuplizer(self._counted_features))
        | 'FlattenFeatureVals' >> beam.FlatMap(self._flatten_feature_vals)
        | 'CountFeatureVals' >> beam.combiners.Count.PerElement())

    if hasattr(self, '_count_transformer'):
      feature_val_w_counts |= 'TransformCounts' >> self._count_transformer

    return (
        feature_val_w_counts
        | 'PairValWithCount' >> beam.Map(self._shift_key)
        | 'GroupByFeature' >> beam.GroupByKey()
        | 'ValCountsToDict' >> beam.Map(self._val_counts_as_dict)
        | 'GenResults' >> beam.Map(self._check_feature)
        | 'DropUnwarned' >> beam.Filter(bool)
        | 'AsList' >> beam.combiners.ToList()
        | 'ToResult' >> beam.Map(self._to_result)) 
Example #4
Source File: linters.py    From data-linter with Apache License 2.0
def _lint(self, examples):
    """Returns the `PTransform` for the EmptyExampleDetector linter.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [num empties]
        lint_sample: None
    """
    n_empties = (
        examples
        | 'DetectEmpties' >> beam.Map(self._example_is_empty)
        | 'Count' >> beam.CombineGlobally(sum)
        | 'NoZero' >> beam.Filter(bool)
        | 'ToResult' >> beam.Map(
            lambda w: self._make_result(warnings=[str(w)])))
    return n_empties 
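Examples #3 and #4 both rely on beam.Filter(bool) to drop falsy elements, so a zero count or an empty result simply vanishes from the PCollection rather than producing a spurious output. The trick in isolation:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([0, 3, 0, 5])
      | 'NoZero' >> beam.Filter(bool)  # bool(0) is False, so zeros are dropped
      | beam.Map(print))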
Example #5
Source File: analyzer_impls.py    From transform with Apache License 2.0
def expand(self, inputs):
    pcoll, = inputs

    # Create a PCollection of (count, element) pairs, then iterate over
    # this to create a single element PCollection containing this list of
    # pairs in sorted order by decreasing counts (and by values for equal
    # counts).

    # TODO(b/112916494): Unify the graph in both cases once possible.
    if (self._vocab_ordering_type ==
        _VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
      flatten_map_fn = _flatten_to_key_and_means_accumulator_list
      combine_transform = _MutualInformationTransformAccumulate()  # pylint: disable=no-value-for-parameter
    elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_FREQUENCY:
      flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
      combine_transform = beam.CombinePerKey(sum)
    elif self._vocab_ordering_type == _VocabOrderingType.WEIGHTED_LABELS:
      flatten_map_fn = _flatten_value_and_labeled_weights_to_list_of_tuples
      combine_transform = beam.CombinePerKey(sum_labeled_weights)
    else:
      flatten_map_fn = _flatten_value_to_list
      combine_transform = beam.combiners.Count.PerElement()

    result = (
        pcoll
        | 'FlattenTokensAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
        | 'CountPerToken' >> combine_transform)

    if self._input_dtype == tf.string:
      # TODO(b/62379925) Filter out empty strings and strings containing \n or
      # \r, since index_table_from_file doesn't allow empty rows.
      def is_problematic_string(kv):
        string, _ = kv  # Ignore counts.
        return string and b'\n' not in string and b'\r' not in string

      result |= 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)

    return result 
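Example #5 conditionally extends the pipeline in place with |= and a named predicate function. The same pattern in isolation (the data and predicate are illustrative):

import apache_beam as beam

def is_clean(s):
  # Drop empty strings and strings containing newline characters.
  return bool(s) and '\n' not in s and '\r' not in s

with beam.Pipeline() as p:
  tokens = p | beam.Create(['ok', '', 'bad\n', 'fine'])
  tokens |= 'FilterProblematic' >> beam.Filter(is_clean)
  _ = tokens | beam.Map(print)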
Example #6
Source File: linters.py    From data-linter with Apache License 2.0
def _count_transformer(self):
    return (
        'DropNaN' >> beam.Filter(lambda kv: not np.isnan(kv[0][1]))
        | 'IsIntegral' >> beam.Map(
            lambda kv: ((kv[0][0], kv[0][1] % 1 == 0), kv[1]))
        | 'Count' >> beam.CombinePerKey(sum)) 
Example #7
Source File: lift_stats_generator.py    From data-validation with Apache License 2.0
def expand(self, sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch,
                                                      _SlicedYKey]):
    sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

    # _SlicedXYKey(slice, x_path, x, y), xy_count
    partial_copresence_counts = (
        sliced_record_batchs
        | 'ToPartialCopresenceCounts' >> beam.FlatMap(
            _to_partial_copresence_counts, self._y_path, self._x_paths,
            self._y_boundaries, self._weight_column_name))

    # Compute placeholder copresence counts.
    # partial_copresence_counts will only include x-y pairs that are present,
    # but we would also like to keep track of x-y pairs that never appear, as
    # long as x and y independently occur in the slice.

    # _SlicedXKey(slice, x_path, x), x_count
    x_counts = (
        sliced_record_batchs
        | 'ToPartialXCounts' >> beam.FlatMap(
            _to_partial_x_counts, self._x_paths, self._weight_column_name)
        | 'SumXCounts' >> beam.CombinePerKey(sum))
    if self._min_x_count:
      x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
          lambda kv: kv[1] > self._min_x_count)

    # _SlicedXYKey(slice, x_path, x, y), 0
    placeholder_copresence_counts = (
        (x_counts, y_keys)
        | 'GetPlaceholderCopresenceCounts' >> _GetPlaceholderCopresenceCounts(
            self._x_paths, self._min_x_count))

    def move_y_to_value(key, xy_count):
      return _SlicedXKey(key.slice_key, key.x_path, key.x), (key.y, xy_count)

    # _SlicedXKey(slice, x_path, x), (y, xy_count)
    copresence_counts = (
        (placeholder_copresence_counts, partial_copresence_counts)
        | 'FlattenCopresenceCounts' >> beam.Flatten()
        | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
        | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

    # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
    return ({
        'x_count': x_counts,
        'xy_counts': copresence_counts
    }
            | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey()
            | 'JoinXCounts' >> beam.FlatMap(_join_x_counts)) 
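The conditional 'FilterXCounts' step above is a common shape: thresholding keyed counts produced by a combiner. A standalone sketch (the threshold is illustrative):

import apache_beam as beam

MIN_COUNT = 1  # illustrative threshold

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['a', 'b', 'b', 'c', 'c', 'c'])
      | beam.combiners.Count.PerElement()
      | 'FilterRare' >> beam.Filter(lambda kv: kv[1] > MIN_COUNT)  # keeps ('b', 2), ('c', 3)
      | beam.Map(print))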
Example #8
Source File: linters.py    From data-linter with Apache License 2.0
def _lint(self, examples):
    """Returns the `PTransform` for the DuplicateExampleDetector linter.

    Args:
      examples: A `PTransform` that yields a `PCollection` of `tf.Example`s.

    Returns:
      A `PTransform` that yields a `LintResult` of the format
        warnings: [num_duplicates]
        lint_sample: [ features: [sample duplicates...] ]
    """
    feature_names = sorted(f.name for f in self._stats.features)
    tuplize = utils.example_tuplizer(feature_names, denan=True)

    duplicates = (
        examples
        | 'Tuplize' >> beam.Map(lambda x: (tuplize(x), x))
        | 'CollectDuplicates' >> beam.GroupByKey()
        | 'ExamplesToList' >> beam.MapTuple(
            lambda example_tuple, examples: (example_tuple, list(examples)))
        | 'FilterDuplicates' >> beam.Filter(
            lambda kv: len(kv[1]) > 1))

    samples = (
        duplicates
        | 'TakeExamples' >> beam.MapTuple(lambda _, examples: examples[0])
        | 'Sample' >> beam.combiners.Sample.FixedSizeGlobally(
            self.N_LINT_SAMPLES)
        | 'ToSample' >> beam.Map(
            lambda x: lint_result_pb2.LintSample(examples=x)))

    n_duplicates = (
        duplicates
        | 'CountDuplicates' >> beam.MapTuple(lambda _, examples: len(examples))
        | 'ExcessCounts' >> beam.Map(lambda x: x - 1)
        | 'Total' >> beam.CombineGlobally(sum))

    return (
        # this is effectively a `Flatten` but with deterministic argument order
        examples.pipeline
        | 'SyncSideInputs' >> beam.Create([None])
        | 'ToResult' >> beam.Map(self._to_result,
                                 beam.pvalue.AsSingleton(n_duplicates),
                                 beam.pvalue.AsSingleton(samples)))