Python apache_beam.options.pipeline_options.SetupOptions() Examples

The following are 7 code examples of apache_beam.options.pipeline_options.SetupOptions(). Each example is taken from an open-source project; the source file and license are noted above each snippet. You may also want to browse the other functions and classes available in the apache_beam.options.pipeline_options module.
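
Before reading the examples, note the pattern they all share: view the parsed PipelineOptions as SetupOptions and set attributes such as save_main_session (pickle the state of __main__ so module-level imports and globals are available on remote workers), setup_file, or requirements_file. The snippet below is a minimal, self-contained sketch of that pattern; the flag values and the trivial pipeline are illustrative and not taken from any of the projects below.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

options = PipelineOptions(["--runner=DirectRunner"])  # illustrative flags
setup_options = options.view_as(SetupOptions)
setup_options.save_main_session = True          # ship __main__ state to the workers
# setup_options.setup_file = "./setup.py"       # package local code for the workers
# setup_options.requirements_file = "requirements.txt"  # extra pip dependencies

with beam.Pipeline(options=options) as p:
    p | beam.Create([1, 2, 3]) | "Print" >> beam.Map(print)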
Example #1
Source File: read_from_relational_db.py    From beam-nuggets with MIT License
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print) 
Example #2
Source File: tfdv.py    From spotify-tensorflow with Apache License 2.0
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) ->  statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate a stats file from a TFRecord dataset using TFDV.

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options) 
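In the example above, create_setup_file() is a helper from spotify-tensorflow that writes a setup.py for the Dataflow workers; its output is not reproduced here. For orientation only, a file that setup_options.setup_file points at is typically a minimal setuptools script along the following lines (an illustrative sketch, not the helper's actual output; the package name is hypothetical).

# setup.py -- illustrative sketch, not generated by create_setup_file()
import setuptools

setuptools.setup(
    name="my_beam_job",                    # hypothetical package name
    version="0.0.1",
    packages=setuptools.find_packages(),   # include local modules for the workers
    install_requires=[],                   # extra PyPI deps to install on workers
)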
Example #3
Source File: run.py    From realtime-embeddings-matching with Apache License 2.0
def main(argv=None):
  known_args, pipeline_args = get_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  setup_options = pipeline_options.view_as(SetupOptions)
  setup_options.save_main_session = True
  pipeline.run(pipeline_options, known_args) 
Example #4
Source File: write_to_relational_db.py    From beam-nuggets with MIT License
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing
    )

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        ) 
Example #5
Source File: create_data.py    From conversational-datasets with Apache License 2.0
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    sentence_files_match = FileSystems.match([args.sentence_files])[0]
    sentence_files = [
        file_metadata.path
        for file_metadata in sentence_files_match.metadata_list]
    logging.info("Reading %i files from %s.",
                 len(sentence_files), args.sentence_files)
    assert len(sentence_files) > 0
    sentence_files = p | beam.Create(sentence_files)
    examples = sentence_files | "create examples" >> beam.FlatMap(
        partial(_create_examples_from_file,
                min_length=args.min_length,
                max_length=args.max_length,
                num_extra_contexts=args.num_extra_contexts)
    )

    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:

        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name)
            >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish() 
Example #6
Source File: create_data.py    From conversational-datasets with Apache License 2.0
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but string representations of Python
    # dictionaries. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: _create_example(*args)
    )
    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:

        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name)
            >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish() 
Example #7
Source File: revise_preprocessed_data.py    From cloudml-examples with Apache License 2.0
def run(argv=None):
  """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  revise_options = pipeline_options.view_as(ReviseOptions)
  cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  output_dir = os.path.join(revise_options.output,
                            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
  cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
  cloud_options.temp_location = os.path.join(output_dir, 'tmp')
  cloud_options.job_name = 'relabel-examples-%s' % (
      datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  metadata_query = str(
      Template(open(revise_options.metadata, 'r').read()).render(
          METADATA_QUERY_REPLACEMENTS))
  logging.info('metadata query : %s', metadata_query)

  with beam.Pipeline(options=pipeline_options) as p:
    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'ReadSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True))
        | 'TableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

    # Read the tf.Example protos into a PCollection.
    examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
        file_pattern=revise_options.input,
        compression_type=CompressionTypes.GZIP)

    # Filter the TensorFlow Example Protocol Buffers.
    filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
        lambda example, samples_metadata:
        filter_and_revise_example(example, samples_metadata),
        beam.pvalue.AsSingleton(samples_metadata)))

    # Write the subset of tf.Example protos to Cloud Storage.
    _ = (filtered_examples
         | 'SerializeExamples' >>
         beam.Map(lambda example: example.SerializeToString())
         | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(output_dir, 'examples'),
             compression_type=CompressionTypes.GZIP,
             file_name_suffix='.tfrecord.gz'))