Python apache_beam.DoFn() Examples
The following are 26 code examples of apache_beam.DoFn(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam, or try the search function.
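Before the project samples below, here is a minimal, self-contained sketch of the pattern they all share: subclass beam.DoFn, override process, and apply it with beam.ParDo. The SplitWords class and the pipeline contents are illustrative, not taken from any of the projects.

import apache_beam as beam

class SplitWords(beam.DoFn):
    """Illustrative DoFn: splits each input line into words."""

    def process(self, element):
        # process() may yield zero or more output elements per input element.
        for word in element.split():
            yield word

# Hypothetical wiring; the default DirectRunner executes this locally.
with beam.Pipeline() as p:
    (p
     | beam.Create(['hello beam', 'do fns emit elements'])
     | beam.ParDo(SplitWords())
     | beam.Map(print))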
Example #1
Source File: preprocess.py From professional-services with Apache License 2.0 | 7 votes |
def shuffle(p):
    """Shuffles data from PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):

        def process(self, element):
            yield random.random(), element

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # Drop the random key. (The original used the Python 2-only
        # tuple-parameter form `lambda (k, vs): vs`.)
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
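A hypothetical caller could wire shuffle into a pipeline as follows; the read step and path are placeholders, not part of the project:

with beam.Pipeline() as p:
    rows = p | 'ReadRows' >> beam.io.ReadFromText('gs://my-bucket/rows.txt')  # placeholder source
    shuffled = shuffle(rows)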
Example #2
Source File: PrettyDataGenerator.py From professional-services with Apache License 2.0 | 6 votes |
def process(self, element, *args, **kwargs):
    """Creates a random record, based on the properties of the passed
    DataGenerator object, for each element in the prior PCollection.

    Args:
        element: A single element of the PCollection.
    """
    faker_schema = self.data_gen.get_faker_schema()

    try:
        # Here the element is treated as the dictionary representing a single
        # row of the histogram table.
        frequency = element.get('frequency')
        # TODO: make this a splittable DoFn to avoid the scenario where we
        # hang for large frequency values.
        for i in range(int(frequency)):
            row = self.generate_fake(fschema=faker_schema, key_dict=element)
            yield row
    except AttributeError:
        # The contents of this element are ignored if they are a string.
        row = self.generate_fake(fschema=faker_schema, key_dict=element)
        yield row
Example #3
Source File: dofns.py From professional-services with Apache License 2.0 | 6 votes |
def process(self,
            element,
            timestamp=beam.DoFn.TimestampParam,
            window=beam.DoFn.WindowParam,
            pane_info=beam.DoFn.PaneInfoParam):
    # Log to audit triggering of the side input refresh process. The statement
    # is logged only when the Pub/Sub notification triggers the side input
    # refresh process (i.e. normally once every x hours).
    if isinstance(window, beam.transforms.window.GlobalWindow):
        logging.info(
            f"(Re)loading side input data from basepath {element.decode()} "
            f"for global window: {timestamp} - {window}")
    else:
        logging.info(
            f"(Re)loading side input data from basepath {element.decode()} "
            f"for window: {util.get_formatted_time(window.start)} - "
            f"{util.get_formatted_time(window.end)}")
    for sideinput_type in self.sideinput_types:
        yield beam.pvalue.TaggedOutput(
            sideinput_type,
            FileSystems.join(element.decode(), sideinput_type,
                             self.file_prefix))
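A note on the parameter defaults used above: beam.DoFn.TimestampParam, beam.DoFn.WindowParam, and beam.DoFn.PaneInfoParam are sentinel values that Beam replaces at runtime with the element's event timestamp, the window it belongs to, and the pane-firing metadata, so a DoFn can inspect windowing state without extra plumbing. The same pattern appears in Examples #11, #14, and #15 below.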
Example #4
Source File: preprocess.py From professional-services with Apache License 2.0 | 6 votes |
def shuffle_data(p):
    """Shuffles data from PCollection.

    Args:
        p: PCollection.

    Returns:
        PCollection of shuffled data.
    """

    class _AddRandomKey(beam.DoFn):

        def process(self, element):
            yield (random.random(), element)

    shuffled_data = (
        p
        | 'PairWithRandom' >> beam.ParDo(_AddRandomKey())
        | 'GroupByRandom' >> beam.GroupByKey()
        # Drop the random key. (The original used the Python 2-only
        # tuple-parameter form `lambda (k, vs): vs`.)
        | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
    return shuffled_data
Example #5
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 6 votes |
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFns rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        )
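GroupWindowsIntoBatches and WriteBatchesToGCS are defined elsewhere in the same source file; Examples #14 and #15 below show process methods of DoFns from that file. A hypothetical invocation, with placeholder topic and bucket names, might look like:

run(
    input_topic='projects/my-project/topics/my-topic',  # placeholder
    output_path='gs://my-bucket/samples/output',        # placeholder
    window_size=2.0,
    pipeline_args=['--runner=DirectRunner'],
)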
Example #6
Source File: metric_types.py From model-analysis with Apache License 2.0 | 5 votes |
def __new__(cls, keys: List[MetricKey], preprocessor: beam.DoFn,
            combiner: beam.CombineFn):
    return super(MetricComputation, cls).__new__(cls, keys, preprocessor,
                                                 combiner)
Example #7
Source File: predict.py From cloudml-samples with Apache License 2.0 | 5 votes |
def process(self, inputs):
    # Create a session for every worker only once. The session is not
    # pickleable, so it can't be created at the DoFn constructor.
    if not self.session:
        self.graph = ops.Graph()
        with self.graph.as_default():
            self.session = tf.Session()
            metagraph_def = tf.compat.v1.saved_model.load(
                self.session, {self.meta_tag}, self.model_dir)
        signature_def = metagraph_def.signature_def[self.meta_signature]

        # inputs
        self.feed_tensors = {
            k: self.graph.get_tensor_by_name(v.name)
            for k, v in signature_def.inputs.items()
        }

        # outputs/predictions
        self.fetch_tensors = {
            k: self.graph.get_tensor_by_name(v.name)
            for k, v in signature_def.outputs.items()
        }

    # Create a feed_dict for a single element.
    feed_dict = {
        tensor: [inputs[key]]
        for key, tensor in self.feed_tensors.items()
        if key in inputs
    }
    results = self.session.run(self.fetch_tensors, feed_dict)

    yield {
        'id': inputs[self.id_key],
        'predictions': results[self.meta_predictions][0].tolist()
    }

# [START dataflow_molecules_run_definition]
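The lazy initialization above is a deliberate design choice: Beam pickles DoFn instances when distributing work, and TensorFlow sessions are not picklable, so the session must be created on the worker rather than in the constructor. In recent Beam releases, the DoFn.setup method is the idiomatic place for this kind of per-worker initialization.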
Example #8
Source File: model_util.py From model-analysis with Apache License 2.0 | 5 votes |
def __init__(self, model_loaders: Dict[Text, types.ModelLoader]):
    """Initializes DoFn using dict of model loaders keyed by model location."""
    self._model_loaders = model_loaders
    self._loaded_models = None
    self._model_load_seconds = None
    self._model_load_seconds_distribution = beam.metrics.Metrics.distribution(
        constants.METRICS_NAMESPACE, 'model_load_seconds')
Example #9
Source File: PrettyDataGenerator.py From professional-services with Apache License 2.0 | 5 votes |
def __init__(self, data_gen):
    """Initializes some properties of the FakeRowGen DoFn, including an
    instance of the DataGenerator class and the number of records that should
    be generated for each element in the prior PCollection.

    Attributes:
        data_gen (DataGenerator): Defines the shape of the data that should be
            generated by this DoFn.
    """
    self.data_gen = data_gen

# Helper function to get a single field dictionary from the schema for
# checking type and mode.
Example #10
Source File: PerformantDataGenerator.py From professional-services with Apache License 2.0 | 5 votes |
def __init__(self, data_gen):
    """Initializes some properties of the FakeRowGen DoFn, including an
    instance of the DataGenerator class and the number of records that should
    be generated for each element in the prior PCollection.

    Attributes:
        data_gen (DataGenerator): Defines the shape of the data that should be
            generated by this DoFn.
    """
    self.data_gen = data_gen

# Helper function to get a single field dictionary from the schema for
# checking type and mode.
Example #11
Source File: transforms.py From professional-services with Apache License 2.0 | 5 votes |
def process(self,
            element,
            timestamp=beam.DoFn.TimestampParam,
            window=beam.DoFn.WindowParam,
            pane_info=beam.DoFn.PaneInfoParam):
    # Logs one message per window trigger indicating the window and pane
    # information.
    if not isinstance(window, beam.transforms.window.GlobalWindow):
        logging.info(
            f"Timestamp:{timestamp};Window Start:{util.get_formatted_time(window.start)}; "
            f"Window end:{util.get_formatted_time(window.end)}; Pane Info:{pane_info}")
    for event in element:
        logging.info(event)
Example #12
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes |
def split_features_labels(data, label_column, key_column):
    """Separates features from true labels in input pipeline for future
    inference.

    Args:
        data: PCollection, input pipeline.
        label_column: string, name of column containing labels.
        key_column: string, name of column containing keys.

    Returns:
        Dictionary mapping the strings 'labels' and 'features' to PCollection
        objects.
    """
    label_pipeline, features_pipeline = 'labels', 'features'

    class _SplitFeaturesLabels(beam.DoFn):

        def process(self, element, label_column, key_column):
            yield beam.pvalue.TaggedOutput(label_pipeline, {
                key_column: element[key_column],
                label_column: element.pop(label_column)
            })
            yield element

    data |= 'SplitFeaturesLabels' >> beam.ParDo(
        _SplitFeaturesLabels(),
        label_column=label_column,
        key_column=key_column).with_outputs(
            label_pipeline, main=features_pipeline)
    return {k: data[k] for k in (label_pipeline, features_pipeline)}
Example #13
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes |
def randomly_split(p, train_size, validation_size, test_size):
    """Randomly splits the input pipeline into three sets based on the input
    ratios.

    Args:
        p: PCollection, input pipeline.
        train_size: float, ratio of data going to the train set.
        validation_size: float, ratio of data going to the validation set.
        test_size: float, ratio of data going to the test set.

    Returns:
        Dictionary mapping DatasetType to PCollection.

    Raises:
        ValueError: If train, validation and test sizes don't add up to 1.0.
    """
    if train_size + validation_size + test_size != 1.0:
        raise ValueError("Train, validation and test sizes don't add up to 1.0.")

    class _SplitData(beam.DoFn):

        def process(self, element):
            r = random.random()
            if r < test_size:
                yield beam.pvalue.TaggedOutput(DatasetType.TEST.name, element)
            elif r < 1 - train_size:
                yield beam.pvalue.TaggedOutput(DatasetType.VAL.name, element)
            else:
                yield element

    split_data = (
        p | 'SplitData' >> beam.ParDo(_SplitData()).with_outputs(
            DatasetType.VAL.name,
            DatasetType.TEST.name,
            main=DatasetType.TRAIN.name))

    split_data_id = {}
    for k in [DatasetType.TRAIN, DatasetType.VAL, DatasetType.TEST]:
        split_data_id[k] = split_data[k.name]

    return split_data_id
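A sketch of how a caller might consume the returned dictionary, assuming a placeholder read step:

with beam.Pipeline() as p:
    rows = p | 'ReadRows' >> beam.io.ReadFromText('gs://my-bucket/rows.txt')  # placeholder
    splits = randomly_split(rows, train_size=0.8, validation_size=0.1,
                            test_size=0.1)
    train, val, test = (splits[DatasetType.TRAIN], splits[DatasetType.VAL],
                        splits[DatasetType.TEST])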
Example #14
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 5 votes |
def process(self, batch, window=beam.DoFn.WindowParam):
    """Write one batch per file to a Google Cloud Storage bucket."""
    ts_format = "%H:%M"
    window_start = window.start.to_utc_datetime().strftime(ts_format)
    window_end = window.end.to_utc_datetime().strftime(ts_format)
    filename = "-".join([self.output_path, window_start, window_end])

    with beam.io.gcp.gcsio.GcsIO().open(filename=filename, mode="w") as f:
        for element in batch:
            f.write("{}\n".format(json.dumps(element)).encode("utf-8"))
Example #15
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 5 votes |
def process(self, element, publish_time=beam.DoFn.TimestampParam):
    """Processes each incoming windowed element by extracting the Pub/Sub
    message and its publish timestamp into a dictionary. `publish_time`
    defaults to the publish timestamp returned by the Pub/Sub server. It
    is bound to each element by Beam at runtime.
    """
    yield {
        "message_body": element.decode("utf-8"),
        "publish_time": datetime.datetime.utcfromtimestamp(
            float(publish_time)
        ).strftime("%Y-%m-%d %H:%M:%S.%f"),
    }
Example #16
Source File: impl.py From transform with Apache License 2.0 | 5 votes |
def process(self, batch, saved_model_dir):
    """Runs the given graph to realize the output `Tensor` or `SparseTensor`s.

    Runs the graph in a TF session for computing the output values of the
    `Tensor` or `SparseTensor`s, given an input row of data (input `Tensor` or
    `SparseTensor`s).

    Args:
        batch: the batch of elements being processed by the DoFn.
        saved_model_dir: Directory containing saved model.

    Yields:
        A representation of output features as a dict mapping keys (logical
        column names) to values.
    """
    if self._graph_state is None:
        # If available, acquire will return a cached _GraphState, since
        # calling _make_graph_state is expensive.
        self._graph_state = self._shared_graph_state_handle.acquire(
            lambda: self._make_graph_state(saved_model_dir))

    # This should remain true throughout the lifetime of this DoFn, regardless
    # of whether or not self._graph_state was cached.
    assert self._graph_state.saved_model_dir == saved_model_dir

    yield self._handle_batch(batch)
Example #17
Source File: transform.py From pydatalab with Apache License 2.0 | 5 votes |
def __init__(self, batch_size):
    """Constructor of EmitAsBatchDoFn beam.DoFn class.

    Args:
        batch_size: the max size we want to buffer the records before emitting.
    """
    self._batch_size = batch_size
    self._cached = []
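Only the constructor of EmitAsBatchDoFn is shown here. A minimal sketch of the process/finish_bundle pair such a buffering DoFn typically needs — an assumption about the shape of the rest of the class, not the file's actual code — could be:

from apache_beam.transforms.window import GlobalWindow
from apache_beam.utils.windowed_value import WindowedValue

def process(self, element):
    # Buffer elements and emit them as one list once the batch is full.
    self._cached.append(element)
    if len(self._cached) >= self._batch_size:
        yield self._cached
        self._cached = []

def finish_bundle(self):
    # Flush any remainder when the bundle ends; finish_bundle must wrap
    # outputs in WindowedValue rather than yielding raw elements.
    if self._cached:
        yield WindowedValue(self._cached, GlobalWindow().max_timestamp(),
                            [GlobalWindow()])
        self._cached = []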
Example #18
Source File: transform.py From pydatalab with Apache License 2.0 | 5 votes |
def __init__(self, batch_size):
    """Constructor of EmitAsBatchDoFn beam.DoFn class.

    Args:
        batch_size: the max size we want to buffer the records before emitting.
    """
    self._batch_size = batch_size
    self._cached = []
Example #19
Source File: predict.py From pydatalab with Apache License 2.0 | 5 votes |
def __init__(self, batch_size):
    """Constructor of EmitAsBatchDoFn beam.DoFn class.

    Args:
        batch_size: the max size we want to buffer the records before emitting.
    """
    self._batch_size = batch_size
    self._cached = []
Example #20
Source File: prediction_fns.py From exoplanet-ml with Apache License 2.0 | 5 votes |
def __init__(self, hparams, dataset_overrides):
    """Initializes the DoFn."""
    self.hparams = hparams
    self.dataset_overrides = dataset_overrides
Example #21
Source File: process_light_curve.py From exoplanet-ml with Apache License 2.0 | 5 votes |
def __init__(self,
             kepler_data_dir,
             flux_column="PDCSAP_FLUX",
             injected_group=None,
             scramble_type=None,
             invert_light_curves=False,
             upward_outlier_clipping=None,
             downward_outlier_clipping=None,
             clip_lowest_n_values=None,
             normalize_stddev=False):
    """Initializes the DoFn.

    Args:
        kepler_data_dir: Base directory containing Kepler data.
        flux_column: Name of the flux column to extract.
        injected_group: Optional string specifying the injected group. One of
            {'inj1', 'inj2', 'inj3'}.
        scramble_type: Optional string specifying the scramble order. One of
            {'SCR1', 'SCR2', 'SCR3'}.
        invert_light_curves: Whether to reflect light curves around the median
            flux value.
        upward_outlier_clipping: If specified, clip upward flux values to this
            number of multiples of the standard deviation.
        downward_outlier_clipping: If specified, clip downward flux values to
            this number of multiples of the standard deviation.
        clip_lowest_n_values: If specified, clip the lowest flux values to the
            value of the nth lowest value.
        normalize_stddev: Whether to divide the flux by the standard deviation.
    """
    self.kepler_data_dir = kepler_data_dir
    self.flux_column = flux_column
    self.injected_group = injected_group
    self.extension = "INJECTED LIGHTCURVE" if injected_group else "LIGHTCURVE"
    self.scramble_type = scramble_type
    self.invert_light_curves = invert_light_curves
    self.upward_outlier_clipping = upward_outlier_clipping
    self.downward_outlier_clipping = downward_outlier_clipping
    self.clip_lowest_n_values = clip_lowest_n_values
    self.normalize_stddev = normalize_stddev
Example #22
Source File: light_curve_fns.py From exoplanet-ml with Apache License 2.0 | 5 votes |
def __init__(self,
             gap_width,
             normalize_method,
             normalize_args,
             upward_outlier_sigma_cut=None,
             downward_outlier_sigma_cut=None,
             remove_events_width_factor=1.5,
             output_name="light_curve"):
    """Initializes the DoFn.

    Args:
        gap_width: Minimum gap size (in time units) to split the light curve
            before fitting the normalization curve.
        normalize_method: Method for fitting the normalization curve.
        normalize_args: Arguments passed to the function that computes the
            normalization curve.
        upward_outlier_sigma_cut: Number of standard deviations from the
            median flux value above which upward outliers are removed.
        downward_outlier_sigma_cut: Number of standard deviations from the
            median flux value below which downward outliers are removed.
        remove_events_width_factor: Fraction of the duration to remove when
            removing periodic events.
        output_name: Name of the processed light curve in the output dict.
    """
    self.remove_events_width_factor = remove_events_width_factor
    self.gap_width = gap_width
    self.normalize_method = normalize_method
    self.normalize_args = normalize_args
    self.upward_outlier_sigma_cut = upward_outlier_sigma_cut
    self.downward_outlier_sigma_cut = downward_outlier_sigma_cut
    self.output_name = output_name
Example #23
Source File: prediction_fns.py From exoplanet-ml with Apache License 2.0 | 5 votes |
def __init__(self, model_name, model_dir, config_name=None):
    """Initializes the DoFn.

    Args:
        model_name: Name of the model class.
        model_dir: Directory containing a model checkpoint.
        config_name: Optional name of the model configuration. If not
            specified, the file 'config.json' in model_dir is used.
    """
    # Look up the model class.
    model_class = models.get_model_class(model_name)

    # Find the latest checkpoint.
    checkpoint_file = tf.train.latest_checkpoint(model_dir)
    if not checkpoint_file:
        raise ValueError("No checkpoint file found in: {}".format(model_dir))

    # Get the model configuration.
    if config_name:
        config = models.get_model_config(model_name, config_name)
    else:
        with tf.gfile.Open(os.path.join(model_dir, "config.json")) as f:
            config = json.load(f)
    config = configdict.ConfigDict(config)

    self.model_class = model_class
    self.checkpoint_file = checkpoint_file
    self.config = config
Example #24
Source File: bls_fns.py From exoplanet-ml with Apache License 2.0 | 5 votes |
def __init__(self, all_periods, all_nbins, weight_min_factor,
             duration_density_min, duration_min_days, duration_density_max,
             duration_min_fraction):
    """Initializes the DoFn."""
    self.all_periods = all_periods
    self.all_nbins = all_nbins
    self.max_nbins = max(self.all_nbins)
    self.weight_min_factor = weight_min_factor
    self.duration_density_min = duration_density_min
    self.duration_min_days = duration_min_days
    self.duration_density_max = duration_density_max
    self.duration_min_fraction = duration_min_fraction
Example #25
Source File: preprocess.py From professional-services with Apache License 2.0 | 4 votes |
def oversampling(p):
    """Oversamples the positive class elements contained in the input pipeline.

    Computes the current class distribution and re-samples the positive class
    to ensure a class distribution close to 50% / 50%. Samples each positive
    class item with a Bernoulli distribution approximated with a normal
    distribution (mean=ratio, var=ratio, where ratio is the factor by which we
    want to increase the number of positive samples).

    Args:
        p: PCollection.

    Returns:
        PCollection of re-balanced elements.

    Raises:
        ValueError: No positive class items found in pipeline.
    """
    # Computes the percentage of positive class to use as a side input in the
    # main pipeline.
    percentage = (
        p
        | 'ReduceToClass' >> beam.Map(lambda x: 1.0 * x[constants.LABEL_COLUMN])
        | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

    class _Sample(beam.DoFn):
        """DoFn that performs resampling element by element.

        Attributes:
            process: Function performing the resampling at element level.
        """

        def process(self, element, percent_positive):
            if not percent_positive:
                raise ValueError('No positive class items found in pipeline.')
            ratio = 1.0 / percent_positive
            n = (max(int(random.gauss(mu=ratio, sigma=ratio**0.5)), 0)
                 if element[constants.LABEL_COLUMN] else 1)
            for _ in range(n):
                yield element

    proc = (
        p
        | 'DuplicateItemAndFlatten' >> beam.ParDo(
            _Sample(), percent_positive=beam.pvalue.AsSingleton(percentage)))
    return proc
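Note on the side input: beam.pvalue.AsSingleton(percentage) converts the single-element PCollection produced by CombineGlobally into a side input, so each call to _Sample.process receives the computed mean as a plain Python float alongside the main-input element.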
Example #26
Source File: beam_sample_tfrecord.py From exoplanet-ml with Apache License 2.0 | 4 votes |
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    def pipeline(root):
        """Beam pipeline for preprocessing open images."""
        assert FLAGS.input_file_pattern
        assert FLAGS.output_dir
        assert FLAGS.output_name
        assert FLAGS.num_shards
        assert FLAGS.kepid_whitelist

        # Read label whitelist.
        kepid_whitelist = [
            int(kepid) for kepid in FLAGS.kepid_whitelist.split(",")
        ]
        logging.info("Read Kepid whitelist with %d labels",
                     len(kepid_whitelist))

        # Initialize DoFn.
        process_example = ProcessExampleDoFn(kepid_whitelist)

        # Create pipeline.
        # pylint: disable=expression-not-assigned
        (root
         | "read_tfrecord" >> beam.io.tfrecordio.ReadFromTFRecord(
             FLAGS.input_file_pattern,
             coder=beam.coders.ProtoCoder(tf.train.Example))
         | "process_examples" >> beam.ParDo(process_example)
         | "reshuffle" >> beam.Reshuffle()
         | "write_tfrecord" >> beam.io.tfrecordio.WriteToTFRecord(
             os.path.join(FLAGS.output_dir, FLAGS.output_name),
             coder=beam.coders.ProtoCoder(tf.train.Example),
             num_shards=FLAGS.num_shards))
        # pylint: enable=expression-not-assigned

    pipeline.run()
    logging.info("Processing complete.")