Python apache_beam.DoFn() Examples

The following are 26 code examples of apache_beam.DoFn(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the apache_beam module.
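Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the pattern they all share: subclass beam.DoFn, implement process() as a generator, and apply it with beam.ParDo.

import apache_beam as beam

class SplitWords(beam.DoFn):
  """Splits each input line into individual words."""

  def process(self, element):
    for word in element.split():
      yield word

with beam.Pipeline() as p:
  (p
   | 'Create' >> beam.Create(['hello world', 'apache beam'])
   | 'SplitWords' >> beam.ParDo(SplitWords())
   | 'Print' >> beam.Map(print))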
Example #1
Source File: preprocess.py    From professional-services with Apache License 2.0
def shuffle(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield random.random(), element

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data 
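A hedged usage sketch for the shuffle helper above (the input rows are made up for illustration; it assumes random and apache_beam are imported as in preprocess.py):

with beam.Pipeline() as p:
  rows = p | 'CreateRows' >> beam.Create([{'id': i} for i in range(10)])
  shuffled = shuffle(rows)
  shuffled | 'Print' >> beam.Map(print)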
Example #2
Source File: PrettyDataGenerator.py    From professional-services with Apache License 2.0
def process(self, element, *args, **kwargs):
        """This function creates a random record based on the properties
        of the passed DataGenerator object for each element in the prior
        PCollection.

        Args:
            element: A single element of the PCollection
        """

        faker_schema = self.data_gen.get_faker_schema()
        try:
            # Here the element is treated as the dictionary representing a single row
            # of the histogram table.
            frequency = element.get('frequency')

            # TODO: Make this a splittable DoFn to avoid hanging on large
            # frequency values.
            for i in range(int(frequency)):
                row = self.generate_fake(fschema=faker_schema,
                                         key_dict=element)
                yield row
        except AttributeError:
            # The contents of this element are ignored if they are a string.
            row = self.generate_fake(fschema=faker_schema, key_dict=element)
            yield row 
Example #3
Source File: dofns.py    From professional-services with Apache License 2.0
def process(self,
                element,
                timestamp=beam.DoFn.TimestampParam,
                window=beam.DoFn.WindowParam,
                pane_info=beam.DoFn.PaneInfoParam):

        # Log to audit triggering of the side input refresh process. The statement is logged
        # only when the Pub/Sub notification triggers the refresh (i.e. normally once every x hours).
        if isinstance(window, beam.transforms.window.GlobalWindow):
            logging.info(
                f"(Re)loading side input data from basepath {element.decode()} for global window: {timestamp} - {window}"
            )
        else:
            logging.info(
                f"(Re)loading side input data from basepath {element.decode()} for window: {util.get_formatted_time(window.start)} - {util.get_formatted_time(window.end)}"
            )

        for sideinput_type in self.sideinput_types:
            yield beam.pvalue.TaggedOutput(
                sideinput_type,
                FileSystems.join(element.decode(), sideinput_type,
                                 self.file_prefix)) 
Example #4
Source File: preprocess.py    From professional-services with Apache License 2.0
def shuffle_data(p):
  """Shuffles data from PCollection.

  Args:
    p: PCollection.

  Returns:
    PCollection of shuffled data.
  """

  class _AddRandomKey(beam.DoFn):

    def process(self, element):
      yield (random.random(), element)

  shuffled_data = (
      p
      | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
      | 'GroupByRandom' >> beam.GroupByKey()
      | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
  return shuffled_data 
Example #5
Source File: PubSubToGCS.py    From python-docs-samples with Apache License 2.0
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFns rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        ) 
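A possible invocation of run (a sketch: the topic path, bucket, and pipeline arguments are placeholders, and GroupWindowsIntoBatches / WriteBatchesToGCS are defined elsewhere in the same sample file):

run(
    input_topic='projects/my-project/topics/my-topic',   # placeholder
    output_path='gs://my-bucket/samples/output',         # placeholder
    window_size=1.0,
    pipeline_args=['--project=my-project', '--region=us-central1'],
)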
Example #6
Source File: metric_types.py    From model-analysis with Apache License 2.0
def __new__(cls, keys: List[MetricKey], preprocessor: beam.DoFn,
              combiner: beam.CombineFn):
    return super(MetricComputation, cls).__new__(cls, keys, preprocessor,
                                                 combiner) 
Example #7
Source File: predict.py    From cloudml-samples with Apache License 2.0
def process(self, inputs):
    # Create a session for every worker only once. The session is not
    # pickleable, so it can't be created in the DoFn constructor.
    if not self.session:
      self.graph = ops.Graph()
      with self.graph.as_default():
        self.session = tf.Session()
        metagraph_def = tf.compat.v1.saved_model.load(
            self.session, {self.meta_tag}, self.model_dir)
      signature_def = metagraph_def.signature_def[self.meta_signature]

      # inputs
      self.feed_tensors = {
          k: self.graph.get_tensor_by_name(v.name)
          for k, v in signature_def.inputs.items()
      }

      # outputs/predictions
      self.fetch_tensors = {
          k: self.graph.get_tensor_by_name(v.name)
          for k, v in signature_def.outputs.items()
      }

    # Create a feed_dict for a single element.
    feed_dict = {
        tensor: [inputs[key]]
        for key, tensor in self.feed_tensors.items()
        if key in inputs
    }
    results = self.session.run(self.fetch_tensors, feed_dict)

    yield {
        'id': inputs[self.id_key],
        'predictions': results[self.meta_predictions][0].tolist()
    }


# [START dataflow_molecules_run_definition] 
Example #8
Source File: model_util.py    From model-analysis with Apache License 2.0
def __init__(self, model_loaders: Dict[Text, types.ModelLoader]):
    """Initializes DoFn using dict of model loaders keyed by model location."""
    self._model_loaders = model_loaders
    self._loaded_models = None
    self._model_load_seconds = None
    self._model_load_seconds_distribution = beam.metrics.Metrics.distribution(
        constants.METRICS_NAMESPACE, 'model_load_seconds') 
Example #9
Source File: PrettyDataGenerator.py    From professional-services with Apache License 2.0
def __init__(self, data_gen):
        """
        This initializes some properties of the FakeRowGen DoFn, including an
        instance of the DataGenerator class and the number of records that
        should be generated for each element in the prior PCollection.

        Attributes:
            data_gen (DataGenerator): defines the shape of the data to be
            generated by this DoFn.
        """
        self.data_gen = data_gen

    # Helper function to get a single field dictionary from the schema for
    # checking type and mode. 
Example #10
Source File: PerformantDataGenerator.py    From professional-services with Apache License 2.0
def __init__(self, data_gen):
        """
        This initializes some properties of the FakeRowGen DoFn, including an
        instance of the DataGenerator class and the number of records that
        should be generated for each element in the prior PCollection.

        Attributes:
            data_gen (DataGenerator): defines the shape of the data to be
            generated by this DoFn.
        """
        self.data_gen = data_gen

    # Helper function to get a single field dictionary from the schema for
    # checking type and mode. 
Example #11
Source File: transforms.py    From professional-services with Apache License 2.0
def process(self,
                element,
                timestamp=beam.DoFn.TimestampParam,
                window=beam.DoFn.WindowParam,
                pane_info=beam.DoFn.PaneInfoParam):

        # Logs one message per window trigger indicating the window and pane information
        if not isinstance(window, beam.transforms.window.GlobalWindow):
            logging.info(
                f"Timestamp:{timestamp};Window Start:{util.get_formatted_time(window.start)}; "
                f"Window end:{util.get_formatted_time(window.end)}; Pane Info:{pane_info}"
            )

        for event in element:
            logging.info(event) 
Example #12
Source File: preprocess.py    From professional-services with Apache License 2.0
def split_features_labels(data, label_column, key_column):
  """Separates features from true labels in input pipeline for future inference.

  Args:
    data: PCollection, input pipeline.
    label_column: string, name of column containing labels.
    key_column: string, name of column containing keys.

  Returns:
    Dictionary mapping the strings 'labels' and 'features' to PCollection
    objects.
  """

  label_pipeline, features_pipeline = 'labels', 'features'

  class _SplitFeaturesLabels(beam.DoFn):

    def process(self, element, label_column, key_column):
      yield beam.pvalue.TaggedOutput(label_pipeline, {
          key_column: element[key_column],
          label_column: element.pop(label_column)
      })
      yield element

  data |= 'SplitFeaturesLabels' >> beam.ParDo(
      _SplitFeaturesLabels(), label_column=label_column,
      key_column=key_column).with_outputs(
          label_pipeline, main=features_pipeline)
  return {k: data[k] for k in (label_pipeline, features_pipeline)} 
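A hedged usage sketch for split_features_labels, with made-up column names, showing how the returned dictionary of PCollections is consumed:

with beam.Pipeline() as p:
  rows = p | 'CreateRows' >> beam.Create([
      {'key': 1, 'label': 0, 'feature_a': 0.5},
      {'key': 2, 'label': 1, 'feature_a': 0.7},
  ])
  split = split_features_labels(rows, label_column='label', key_column='key')
  split['labels'] | 'PrintLabels' >> beam.Map(print)
  split['features'] | 'PrintFeatures' >> beam.Map(print)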
Example #13
Source File: preprocess.py    From professional-services with Apache License 2.0
def randomly_split(p, train_size, validation_size, test_size):
  """Randomly splits input pipeline in three sets based on input ratio.

  Args:
    p: PCollection, input pipeline.
    train_size: float, ratio of data going to train set.
    validation_size: float, ratio of data going to validation set.
    test_size: float, ratio of data going to test set.

  Returns:
    Tuple of PCollection.

  Raises:
    ValueError: Train, validation, and test sizes do not add up to 1.0.
  """

  if train_size + validation_size + test_size != 1.0:
    raise ValueError('Train, validation, and test sizes do not add up to 1.0.')

  class _SplitData(beam.DoFn):

    def process(self, element):
      r = random.random()
      if r < test_size:
        yield beam.pvalue.TaggedOutput(DatasetType.TEST.name, element)
      elif r < 1 - train_size:
        yield beam.pvalue.TaggedOutput(DatasetType.VAL.name, element)
      else:
        yield element

  split_data = (
      p | 'SplitData' >> beam.ParDo(_SplitData()).with_outputs(
          DatasetType.VAL.name,
          DatasetType.TEST.name,
          main=DatasetType.TRAIN.name))

  split_data_id = {}
  for k in [DatasetType.TRAIN, DatasetType.VAL, DatasetType.TEST]:
    split_data_id[k] = split_data[k.name]

  return split_data_id 
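A hedged usage sketch for randomly_split, assuming DatasetType is an enum with TRAIN, VAL, and TEST members defined in the same module:

with beam.Pipeline() as p:
  rows = p | 'CreateRows' >> beam.Create([{'x': i} for i in range(100)])
  splits = randomly_split(rows, train_size=0.8, validation_size=0.1, test_size=0.1)
  for dataset_type, pcoll in splits.items():
    (pcoll
     | 'Count %s' % dataset_type.name >> beam.combiners.Count.Globally()
     | 'Print %s' % dataset_type.name >> beam.Map(print))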
Example #14
Source File: PubSubToGCS.py    From python-docs-samples with Apache License 2.0
def process(self, batch, window=beam.DoFn.WindowParam):
        """Write one batch per file to a Google Cloud Storage bucket. """

        ts_format = "%H:%M"
        window_start = window.start.to_utc_datetime().strftime(ts_format)
        window_end = window.end.to_utc_datetime().strftime(ts_format)
        filename = "-".join([self.output_path, window_start, window_end])

        with beam.io.gcp.gcsio.GcsIO().open(filename=filename, mode="w") as f:
            for element in batch:
                f.write("{}\n".format(json.dumps(element)).encode("utf-8")) 
Example #15
Source File: PubSubToGCS.py    From python-docs-samples with Apache License 2.0
def process(self, element, publish_time=beam.DoFn.TimestampParam):
        """Processes each incoming windowed element by extracting the Pub/Sub
        message and its publish timestamp into a dictionary. `publish_time`
        defaults to the publish timestamp returned by the Pub/Sub server. It
        is bound to each element by Beam at runtime.
        """

        yield {
            "message_body": element.decode("utf-8"),
            "publish_time": datetime.datetime.utcfromtimestamp(
                float(publish_time)
            ).strftime("%Y-%m-%d %H:%M:%S.%f"),
        } 
Example #16
Source File: impl.py    From transform with Apache License 2.0
def process(self, batch, saved_model_dir):
    """Runs the given graph to realize the output `Tensor` or `SparseTensor`s.

    Runs the graph in a TF session for computing the output values of the
    `Tensor` or `SparseTensor`s, given an input row of data (input `Tensor` or
    `SparseTensor`s).

    Args:
      batch: the batch of elements being processed by the DoFn
      saved_model_dir: Directory containing saved model.

    Yields:
      A representation of output features as a dict mapping keys (logical column
      names) to values.
    """
    if self._graph_state is None:
      # If available, acquire will return a cached _GraphState, since calling
      # _make_graph_state is expensive.
      self._graph_state = self._shared_graph_state_handle.acquire(
          lambda: self._make_graph_state(saved_model_dir))

    # This should remain true throughout the lifetime of this DoFn, regardless
    # of whether or not self._graph_state was cached.
    assert self._graph_state.saved_model_dir == saved_model_dir

    yield self._handle_batch(batch) 
Example #17
Source File: transform.py    From pydatalab with Apache License 2.0
def __init__(self, batch_size):
    """Constructor of EmitAsBatchDoFn beam.DoFn class.

    Args:
      batch_size: the max size we want to buffer the records before emitting.
    """
    self._batch_size = batch_size
    self._cached = [] 
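Only the constructor is shown above. For context, here is a minimal batching sketch (an assumption, not the original pydatalab implementation) of how such a DoFn typically completes the pattern: buffer in process() and flush the remainder in finish_bundle():

import apache_beam as beam
from apache_beam.transforms import window
from apache_beam.utils.windowed_value import WindowedValue

class EmitAsBatchDoFn(beam.DoFn):
  """Buffers elements and emits them as lists of up to batch_size items."""

  def __init__(self, batch_size):
    self._batch_size = batch_size
    self._cached = []

  def process(self, element):
    self._cached.append(element)
    if len(self._cached) >= self._batch_size:
      emit = self._cached
      self._cached = []
      yield emit

  def finish_bundle(self):
    # Leftover elements must be wrapped in WindowedValue when emitted here.
    if self._cached:
      yield WindowedValue(self._cached, window.GlobalWindow().max_timestamp(),
                          [window.GlobalWindow()])
      self._cached = []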
Example #18
Source File: transform.py    From pydatalab with Apache License 2.0
def __init__(self, batch_size):
    """Constructor of EmitAsBatchDoFn beam.DoFn class.

    Args:
      batch_size: the max size we want to buffer the records before emitting.
    """
    self._batch_size = batch_size
    self._cached = [] 
Example #19
Source File: predict.py    From pydatalab with Apache License 2.0
def __init__(self, batch_size):
    """Constructor of EmitAsBatchDoFn beam.DoFn class.

    Args:
      batch_size: the max size we want to buffer the records before emitting.
    """
    self._batch_size = batch_size
    self._cached = [] 
Example #20
Source File: prediction_fns.py    From exoplanet-ml with Apache License 2.0
def __init__(self, hparams, dataset_overrides):
    """Initializes the DoFn."""
    self.hparams = hparams
    self.dataset_overrides = dataset_overrides 
Example #21
Source File: process_light_curve.py    From exoplanet-ml with Apache License 2.0
def __init__(self,
               kepler_data_dir,
               flux_column="PDCSAP_FLUX",
               injected_group=None,
               scramble_type=None,
               invert_light_curves=False,
               upward_outlier_clipping=None,
               downward_outlier_clipping=None,
               clip_lowest_n_values=None,
               normalize_stddev=False):
    """Initializes the DoFn.

    Args:
      kepler_data_dir: Base directory containing Kepler data.
      flux_column: Name of the flux column to extract.
      injected_group: Optional string specifying the injected group. One of
        {'inj1', 'inj2', 'inj3'}.
      scramble_type: Optional string specifying the scramble order. One of
        {'SCR1', 'SCR2', 'SCR3'}.
      invert_light_curves: Whether to reflect light curves around the median
        flux value.
      upward_outlier_clipping: If specified, clip upward flux values to this
        number of multiples of the standard deviation.
      downward_outlier_clipping: If specified, clip downward flux values to this
        number of multiples of the standard deviation.
      clip_lowest_n_values: If specified, clip lowest flux values to the value
        of the nth lowest value.
      normalize_stddev: Whether to divide the flux by the standard deviation.
    """
    self.kepler_data_dir = kepler_data_dir
    self.flux_column = flux_column
    self.injected_group = injected_group
    self.extension = "INJECTED LIGHTCURVE" if injected_group else "LIGHTCURVE"
    self.scramble_type = scramble_type
    self.invert_light_curves = invert_light_curves
    self.upward_outlier_clipping = upward_outlier_clipping
    self.downward_outlier_clipping = downward_outlier_clipping
    self.clip_lowest_n_values = clip_lowest_n_values
    self.normalize_stddev = normalize_stddev 
Example #22
Source File: light_curve_fns.py    From exoplanet-ml with Apache License 2.0
def __init__(self,
               gap_width,
               normalize_method,
               normalize_args,
               upward_outlier_sigma_cut=None,
               downward_outlier_sigma_cut=None,
               remove_events_width_factor=1.5,
               output_name="light_curve"):
    """Initializes the DoFn.

    Args:
      gap_width: Minimum gap size (in time units) to split the light curve
        before fitting the normalization curve.
      normalize_method: Method for fitting the normalization curve.
      normalize_args: Arguments passed to the function that computes the
        normalization curve.
      upward_outlier_sigma_cut: Number of standard deviations from the median
        flux value above which upward outliers are removed.
      downward_outlier_sigma_cut: Number of standard deviations from the median
        flux value above which downward outliers are removed.
      remove_events_width_factor: Fraction of the duration to remove when
        removing periodic events.
      output_name: Name of the processed light curve in the output dict.
    """
    self.remove_events_width_factor = remove_events_width_factor
    self.gap_width = gap_width
    self.normalize_method = normalize_method
    self.normalize_args = normalize_args
    self.upward_outlier_sigma_cut = upward_outlier_sigma_cut
    self.downward_outlier_sigma_cut = downward_outlier_sigma_cut
    self.output_name = output_name 
Example #23
Source File: prediction_fns.py    From exoplanet-ml with Apache License 2.0
def __init__(self, model_name, model_dir, config_name=None):
    """Initializes the DoFn.

    Args:
      model_name: Name of the model class.
      model_dir: Directory containing a model checkpoint.
      config_name: Optional name of the model configuration. If not specified,
        the file 'config.json' in model_dir is used.
    """
    # Look up the model class.
    model_class = models.get_model_class(model_name)

    # Find the latest checkpoint.
    checkpoint_file = tf.train.latest_checkpoint(model_dir)
    if not checkpoint_file:
      raise ValueError("No checkpoint file found in: {}".format(model_dir))

    # Get the model configuration.
    if config_name:
      config = models.get_model_config(model_name, config_name)
    else:
      with tf.gfile.Open(os.path.join(model_dir, "config.json")) as f:
        config = json.load(f)
    config = configdict.ConfigDict(config)

    self.model_class = model_class
    self.checkpoint_file = checkpoint_file
    self.config = config 
Example #24
Source File: bls_fns.py    From exoplanet-ml with Apache License 2.0
def __init__(self, all_periods, all_nbins, weight_min_factor,
               duration_density_min, duration_min_days, duration_density_max,
               duration_min_fraction):
    """Initializes the DoFn."""
    self.all_periods = all_periods
    self.all_nbins = all_nbins
    self.max_nbins = max(self.all_nbins)
    self.weight_min_factor = weight_min_factor
    self.duration_density_min = duration_density_min
    self.duration_min_days = duration_min_days
    self.duration_density_max = duration_density_max
    self.duration_min_fraction = duration_min_fraction 
Example #25
Source File: preprocess.py    From professional-services with Apache License 2.0
def oversampling(p):
  """Oversamples the positive class elements contained in the input pipeline.

  Computes the current class distribution and re-samples the positive class to
  ensure a class distribution close to 50% / 50%. Samples each positive class
  item with a Bernoulli distribution approximated by a normal distribution
  (mean=ratio, var=ratio, where ratio is the factor by which we want to
  increase the number of positive samples).

  Args:
    p: PCollection.

  Returns:
    PCollection of re-balanced elements.

  Raises:
    ValueError: No positive class items found in pipeline.
  """

  # Computes percentage of positive class to use as side input in main pipeline.
  percentage = (
      p
      | 'ReduceToClass' >> beam.Map(lambda x: 1.0 * x[constants.LABEL_COLUMN])
      | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

  class _Sample(beam.DoFn):
    """DoFn that performs resampling element by element.

    Attributes:
      process: Function performing the resampling at element level.
    """

    def process(self, element, percent_positive):
      if not percent_positive:
        raise ValueError('No positive class items found in pipeline.')
      ratio = 1.0 / percent_positive
      n = (
          max(int(random.gauss(mu=ratio, sigma=ratio**0.5)), 0)
          if element[constants.LABEL_COLUMN] else 1)
      for _ in range(n):
        yield element

  proc = (
      p | 'DuplicateItemAndFlatten' >> beam.ParDo(
          _Sample(), percent_positive=beam.pvalue.AsSingleton(percentage)))

  return proc 
Example #26
Source File: beam_sample_tfrecord.py    From exoplanet-ml with Apache License 2.0
def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  def pipeline(root):
    """Beam pipeline for preprocessing open images."""
    assert FLAGS.input_file_pattern
    assert FLAGS.output_dir
    assert FLAGS.output_name
    assert FLAGS.num_shards
    assert FLAGS.kepid_whitelist

    # Read label whitelist.
    kepid_whitelist = [int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")]
    logging.info("Read Kepid whitelist with %d labels", len(kepid_whitelist))

    # Initialize DoFn.
    process_example = ProcessExampleDoFn(kepid_whitelist)

    # Create Pipeline.
    # pylint: disable=expression-not-assigned
    (root
     | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
         FLAGS.input_file_pattern,
         coder=beam.coders.ProtoCoder(tf.train.Example))
     | "process_examples" >> beam.ParDo(process_example)
     | "reshuffle" >> beam.Reshuffle()
     | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
         os.path.join(FLAGS.output_dir, FLAGS.output_name),
         coder=beam.coders.ProtoCoder(tf.train.Example),
         num_shards=FLAGS.num_shards))
    # pylint: enable=expression-not-assigned

  # Run the pipeline. The runner setup here is a sketch; the original script
  # wires `pipeline` into its own runner.
  with beam.Pipeline() as root:
    pipeline(root)
  logging.info("Processing complete.")