Python apache_beam.options.pipeline_options.SetupOptions() Examples
The following are 7 code examples of apache_beam.options.pipeline_options.SetupOptions(). Each example lists the original project and source file it was taken from, so you can follow it back to its full context. You may also want to check out the other functions and classes available in the apache_beam.options.pipeline_options module.
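All of the examples share the same basic pattern: build a PipelineOptions object from the command-line flags, obtain a SetupOptions view via view_as(), and set save_main_session (so that globals defined in the main module are pickled and shipped to workers) or setup_file (so that a local package is installed on workers). The sketch below is a minimal illustration of that pattern, not taken from any of the projects listed; the trivial Create/Map pipeline and the commented-out './setup.py' path are placeholder assumptions.

    # Minimal sketch of the common SetupOptions pattern (illustrative only).
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions


    def main(argv=None):
        # Parse the command-line flags into pipeline options.
        options = PipelineOptions(argv)

        # SetupOptions controls how the worker environment is prepared.
        setup_options = options.view_as(SetupOptions)
        # Pickle the __main__ session so module-level imports and globals
        # are available on remote workers.
        setup_options.save_main_session = True
        # Optionally ship a local package to workers (path is a placeholder).
        # setup_options.setup_file = './setup.py'

        with beam.Pipeline(options=options) as p:
            (p
             | 'Create' >> beam.Create(['hello', 'world'])
             | 'Print' >> beam.Map(print))


    if __name__ == '__main__':
        main()

Setting save_main_session matters mainly on remote runners such as Dataflow, where worker processes do not share the launching interpreter's state; on the default DirectRunner the flag is harmless but has little effect.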
Example #1
Source File: read_from_relational_db.py From beam-nuggets with MIT License
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )
        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print)
Example #2
Source File: tfdv.py From spotify-tensorflow with Apache License 2.0
def generate_statistics_from_tfrecord(pipeline_args,   # type: List[str]
                                      data_location,   # type: str
                                      output_path,     # type: str
                                      stats_options    # type: StatsOptions
                                      ):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(
        data_location=input_files,
        output_path=output_path,
        stats_options=stats_options,
        pipeline_options=pipeline_options)
Example #3
Source File: run.py From realtime-embeddings-matching with Apache License 2.0
def main(argv=None):
    known_args, pipeline_args = get_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    pipeline.run(pipeline_options, known_args)
Example #4
Source File: write_to_relational_db.py From beam-nuggets with MIT License
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing
    )

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,       # automatically create the table if not there
        primary_key_columns=['num']   # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
Example #5
Source File: create_data.py From conversational-datasets with Apache License 2.0
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    sentence_files_match = FileSystems.match([args.sentence_files])[0]
    sentence_files = [
        file_metadata.path
        for file_metadata in sentence_files_match.metadata_list]
    logging.info("Reading %i files from %s.",
                 len(sentence_files), args.sentence_files)
    assert len(sentence_files) > 0

    sentence_files = p | beam.Create(sentence_files)
    examples = sentence_files | "create examples" >> beam.FlatMap(
        partial(_create_examples_from_file,
                min_length=args.min_length,
                max_length=args.max_length,
                num_extra_contexts=args.num_extra_contexts)
    )

    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:
        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name) >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish()
Example #6
Source File: create_data.py From conversational-datasets with Apache License 2.0
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but the string representation of python
    # dictionary objects. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: _create_example(*args)
    )

    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:
        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name) >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish()
Example #7
Source File: revise_preprocessed_data.py From cloudml-examples with Apache License 2.0
def run(argv=None):
    """Runs the revise preprocessed data pipeline.

    Args:
      argv: Pipeline options as a list of arguments.
    """
    pipeline_options = PipelineOptions(flags=argv)
    revise_options = pipeline_options.view_as(ReviseOptions)
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    output_dir = os.path.join(
        revise_options.output,
        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(
        WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
    cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
    cloud_options.temp_location = os.path.join(output_dir, 'tmp')
    cloud_options.job_name = 'relabel-examples-%s' % (
        datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    metadata_query = str(
        Template(open(revise_options.metadata, 'r').read()).render(
            METADATA_QUERY_REPLACEMENTS))
    logging.info('metadata query : %s', metadata_query)

    with beam.Pipeline(options=pipeline_options) as p:
        # Gather our sample metadata into a python dictionary.
        samples_metadata = (
            p
            | 'ReadSampleMetadata' >> beam.io.Read(
                beam.io.BigQuerySource(query=metadata_query,
                                       use_standard_sql=True))
            | 'TableToDictionary' >> beam.CombineGlobally(
                util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

        # Read the tf.Example protos into a PCollection.
        examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
            file_pattern=revise_options.input,
            compression_type=CompressionTypes.GZIP)

        # Filter the TensorFlow Example Protocol Buffers.
        filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
            lambda example, samples_metadata: filter_and_revise_example(
                example, samples_metadata),
            beam.pvalue.AsSingleton(samples_metadata)))

        # Write the subset of tf.Example protos to Cloud Storage.
        _ = (filtered_examples
             | 'SerializeExamples' >>
             beam.Map(lambda example: example.SerializeToString())
             | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
                 file_path_prefix=os.path.join(output_dir, 'examples'),
                 compression_type=CompressionTypes.GZIP,
                 file_name_suffix='.tfrecord.gz'))