Python apache_beam.options.pipeline_options.PipelineOptions() Examples
The following are 27 code examples of apache_beam.options.pipeline_options.PipelineOptions(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the apache_beam.options.pipeline_options module.
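Before the examples, here is the common pattern they all share: PipelineOptions is built from unparsed command-line flags (or keyword arguments), then narrowed to specific option groups with view_as(). The snippet below is a minimal sketch of that pattern; the pipeline steps are placeholders for illustration only and do not come from any example on this page.

import apache_beam as beam
from apache_beam.options.pipeline_options import (
    PipelineOptions, SetupOptions, StandardOptions)

def run(argv=None):
    # Flags that PipelineOptions does not recognize are kept as custom options.
    options = PipelineOptions(argv)
    # Pickle the main session so DoFns can use globally imported modules.
    options.view_as(SetupOptions).save_main_session = True
    # Inspect a standard option, e.g. the runner ('DirectRunner' by default).
    runner = options.view_as(StandardOptions).runner

    with beam.Pipeline(options=options) as p:
        _ = (p
             | 'Create' >> beam.Create(['hello', 'world'])
             | 'Print' >> beam.Map(print))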
Example #1
Source File: streaming_beam.py From python-docs-samples with Apache License 2.0 | 6 votes |
def run(args, input_subscription, output_table, window_interval):
    """Build and run the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:
        # Read the messages from PubSub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results into BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA)
Example #2
Source File: preprocess.py From professional-services with Apache License 2.0 | 6 votes |
def run(): """Run Apache Beam pipeline to generate TFRecords for Survival Analysis.""" flags = parse_arguments(sys.argv[1:]) pipeline_args = get_pipeline_args(flags) options = pipeline_options.PipelineOptions(flags=[], **pipeline_args) options.view_as(pipeline_options.WorkerOptions).machine_type = ( flags.machine_type) temp_dir = os.path.join(flags.output_dir, 'tmp') runner = 'DataflowRunner' if flags.cloud else 'DirectRunner' with beam.Pipeline(runner, options=options) as p: with tft_beam.Context(temp_dir=temp_dir): build_pipeline(p, flags)
Example #3
Source File: read_from_relational_db.py From beam-nuggets with MIT License | 6 votes |
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print)
Example #4
Source File: base_executor.py From tfx with Apache License 2.0 | 6 votes |
def _make_beam_pipeline(self) -> beam.Pipeline:
    """Makes beam pipeline."""
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    if pipeline_options.view_as(StandardOptions).runner:
        return beam.Pipeline(argv=self._beam_pipeline_args)

    # TODO(b/159468583): move this warning to Beam.
    direct_running_mode = pipeline_options.view_as(
        DirectOptions).direct_running_mode
    direct_num_workers = pipeline_options.view_as(
        DirectOptions).direct_num_workers
    if direct_running_mode == 'in_memory' and direct_num_workers != 1:
        absl.logging.warning(
            'If direct_num_workers is not equal to 1, direct_running_mode should '
            'be `multi_processing` or `multi_threading` instead of `in_memory` '
            'in order for it to have the desired worker parallelism effect.')

    return beam.Pipeline(
        options=pipeline_options, runner=fn_api_runner.FnApiRunner())
Example #5
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 6 votes |
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFn's rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages" >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        )
Example #6
Source File: vep_runner.py From gcp-variant-transforms with Apache License 2.0 | 6 votes |
def _process_pipeline_args(self, pipeline_args):
    # type: (List[str]) -> None
    flags_dict = pipeline_options.PipelineOptions(
        pipeline_args).get_all_options()
    self._project = self._get_flag(flags_dict, 'project')
    self._region = self._get_flag(flags_dict, 'region')
    # TODO(bahsir2): Fix the error messages of _check_flag since
    # --worker_machine_type has dest='machine_type'.
    try:
        self._machine_type = self._get_flag(flags_dict, 'machine_type')
    except ValueError:
        self._machine_type = self._get_machine_type_from_fork()
    self._max_num_workers = self._get_flag(
        flags_dict, 'max_num_workers', 'num_workers')
    if self._max_num_workers <= 0:
        raise ValueError(
            '--max_num_workers and --num_workers should be positive numbers, '
            'got: {}'.format(self._max_num_workers))
Example #7
Source File: write_to_relational_db.py From beam-nuggets with MIT License | 5 votes |
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing
    )

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
Example #8
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 5 votes |
def _validate_annotation_pipeline_args(known_args, pipeline_args):
    match_results = filesystems.FileSystems.match(['{}*'.format(
        vep_runner_util.format_dir_path(known_args.annotation_output_dir))])
    if match_results and match_results[0].metadata_list:
        raise ValueError('Output directory {} already exists.'.format(
            known_args.annotation_output_dir))

    flags_dict = pipeline_options.PipelineOptions(pipeline_args).get_all_options()
    expected_flags = ['max_num_workers', 'num_workers']
    for flag in expected_flags:
        if flag in flags_dict and flags_dict[flag] > 0:
            return
    raise ValueError('Could not find any of {} with a valid value among pipeline '
                     'flags {}'.format(expected_flags, flags_dict))
Example #9
Source File: tfdv.py From spotify-tensorflow with Apache License 2.0 | 5 votes |
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options)
Example #10
Source File: run_inference.py From tfx-bsl with Apache License 2.0 | 5 votes |
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
             pipeline_options: PipelineOptions):
    super(_RemotePredictDoFn, self).__init__(inference_spec_type)
    self._api_client = None

    project_id = (
        inference_spec_type.ai_platform_prediction_model_spec.project_id or
        pipeline_options.view_as(GoogleCloudOptions).project)
    if not project_id:
        raise ValueError('Either a non-empty project id or project flag in '
                         ' beam pipeline options needs be provided.')

    model_name = (
        inference_spec_type.ai_platform_prediction_model_spec.model_name)
    if not model_name:
        raise ValueError('A non-empty model name must be provided.')

    version_name = (
        inference_spec_type.ai_platform_prediction_model_spec.version_name)
    name_spec = 'projects/{}/models/{}'
    # If version is not specified, the default version for a model is used.
    if version_name:
        name_spec += '/versions/{}'
    self._full_model_name = name_spec.format(project_id, model_name,
                                             version_name)
Example #11
Source File: run.py From realtime-embeddings-matching with Apache License 2.0 | 5 votes |
def main(argv=None):
    known_args, pipeline_args = get_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True
    pipeline.run(pipeline_options, known_args)
Example #12
Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_table, pipeline_args):
    """Read the records from GCS and write them to BigQuery."""
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
         | 'to_records' >> beam.FlatMap(f2pn.map_file_to_records)
         | 'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record)
         | 'write' >> beam.io.Write(beam.io.BigQuerySink(
             output_table,
             schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    result = p.run().wait_until_finish()
    logging.info('GCS to BigQuery result: %s', result)
Example #13
Source File: bigquery_to_gcs_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_query, output_file, pipeline_args):
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
         | 'to_physionet' >> beam.Map(map_to_physionet_record)
         | 'write' >> beam.io.WriteToText(output_file))
    result = p.run().wait_until_finish()
    logging.info('BigQuery to GCS result: %s', result)
Example #14
Source File: physionet_to_mae_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
    """Read the physionet records from GCS and write them out as MAE."""
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
         | 'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings)
         | 'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                      ['patient_id', 'record_number'])
         | 'write_mae' >> beam.Map(write_mae, project, output_dir)
         )
    result = p.run().wait_until_finish()
    logging.info('GCS to BigQuery result: %s', result)
Example #15
Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_table, pipeline_args):
    """Read the records from GCS and write them to BigQuery."""
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
         | 'to_records' >> beam.FlatMap(map_file_to_records)
         | 'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs)
         | 'write' >> beam.io.Write(beam.io.BigQuerySink(
             output_table,
             schema='patient_id:INTEGER, note:STRING',
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    result = p.run().wait_until_finish()
    logging.info('GCS to BigQuery result: %s', result)
Example #16
Source File: bigquery_to_gcs_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_query, output_path, pipeline_args):
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
         | 'to_mist' >> beam.Map(map_to_mist_record)
         | 'write' >> beam.io.WriteToText(output_path))
    result = p.run().wait_until_finish()
    logging.info('BigQuery to GCS result: %s', result)
Example #17
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 5 votes |
def _shard_variants(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> List[str]
    """Reads the variants and writes them to VCF shards.

    Returns:
        The VCF shards directory.
    """
    options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    shard_files_job_name = pipeline_common.generate_unique_name(
        _SHARD_VCF_FILES_JOB_NAME)
    _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
    vcf_shards_output_dir = filesystems.FileSystems.join(
        known_args.annotation_output_dir, _SHARDS_FOLDER)
    with beam.Pipeline(options=options) as p:
        variants = _read_variants(known_args.all_patterns, p, known_args,
                                  pipeline_mode,
                                  pre_infer_headers=False,
                                  keep_raw_sample_names=True)
        sample_ids = (variants
                      | 'CombineSampleIds' >>
                      combine_sample_ids.SampleIdsCombiner()
                      | 'CombineToList' >> beam.combiners.ToList())
        # TODO(tneymanov): Annotation pipeline currently stores sample IDs
        # instead of sample names in the sharded VCF files, which would lead
        # to double hashing of samples. Needs to be fixed ASAP.
        _ = (variants
             | 'DensifyVariants' >> densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                 vcf_shards_output_dir,
                 beam.pvalue.AsSingleton(sample_ids),
                 known_args.number_of_variants_per_shard))

    return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
            _GCS_RECURSIVE_WILDCARD]
Example #18
Source File: create_data.py From conversational-datasets with Apache License 2.0 | 4 votes |
def run(argv=None): """Run the beam pipeline.""" args, pipeline_args = _parse_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) sentence_files_match = FileSystems.match([args.sentence_files])[0] sentence_files = [ file_metadata.path for file_metadata in sentence_files_match.metadata_list] logging.info("Reading %i files from %s.", len(sentence_files), args.sentence_files) assert len(sentence_files) > 0 sentence_files = p | beam.Create(sentence_files) examples = sentence_files | "create examples" >> beam.FlatMap( partial(_create_examples_from_file, min_length=args.min_length, max_length=args.max_length, num_extra_contexts=args.num_extra_contexts) ) examples = _shuffle_examples(examples) examples |= "split train and test" >> beam.ParDo( _TrainTestSplitFn(args.train_split)).with_outputs( _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG) if args.dataset_format == _JSON_FORMAT: write_sink = WriteToText file_name_suffix = ".json" serialize_fn = json.dumps else: assert args.dataset_format == _TF_FORMAT write_sink = WriteToTFRecord file_name_suffix = ".tfrecord" serialize_fn = _features_to_serialized_tf_example for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG), ("test", _TrainTestSplitFn.TEST_TAG)]: serialized_examples = examples[tag] | ( "serialize {} examples".format(name) >> beam.Map(serialize_fn)) ( serialized_examples | ("write " + name) >> write_sink( os.path.join(args.output_dir, name), file_name_suffix=file_name_suffix, num_shards=args.num_shards_train, ) ) result = p.run() result.wait_until_finish()
Example #19
Source File: revise_preprocessed_data.py From cloudml-examples with Apache License 2.0 | 4 votes |
def run(argv=None): """Runs the revise preprocessed data pipeline. Args: argv: Pipeline options as a list of arguments. """ pipeline_options = PipelineOptions(flags=argv) revise_options = pipeline_options.view_as(ReviseOptions) cloud_options = pipeline_options.view_as(GoogleCloudOptions) output_dir = os.path.join(revise_options.output, datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as( WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED' cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging') cloud_options.temp_location = os.path.join(output_dir, 'tmp') cloud_options.job_name = 'relabel-examples-%s' % ( datetime.datetime.now().strftime('%y%m%d-%H%M%S')) metadata_query = str( Template(open(revise_options.metadata, 'r').read()).render( METADATA_QUERY_REPLACEMENTS)) logging.info('metadata query : %s', metadata_query) with beam.Pipeline(options=pipeline_options) as p: # Gather our sample metadata into a python dictionary. samples_metadata = ( p | 'ReadSampleMetadata' >> beam.io.Read( beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True)) | 'TableToDictionary' >> beam.CombineGlobally( util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN))) # Read the tf.Example protos into a PCollection. examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord( file_pattern=revise_options.input, compression_type=CompressionTypes.GZIP) # Filter the TensorFlow Example Protocol Buffers. filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap( lambda example, samples_metadata: filter_and_revise_example(example, samples_metadata), beam.pvalue.AsSingleton(samples_metadata))) # Write the subset of tf.Example protos to Cloud Storage. _ = (filtered_examples | 'SerializeExamples' >> beam.Map(lambda example: example.SerializeToString()) | 'WriteExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(output_dir, 'examples'), compression_type=CompressionTypes.GZIP, file_name_suffix='.tfrecord.gz'))
Example #20
Source File: create_data.py From conversational-datasets with Apache License 2.0 | 4 votes |
def run(argv=None): """Run the beam pipeline.""" args, pipeline_args = _parse_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) lines = p | "read qa files" >> ReadFromText(args.file_pattern) # The lines are not JSON, but the string representation of python # dictionary objects. Parse them with ast.literal_eval. json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval) qa_tuples = json_objects | "create tuples" >> beam.FlatMap( partial( _create_tuples, min_words=args.min_words, max_words=args.max_words) ) # Remove duplicate examples. qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v)) qa_tuples |= "group duplicates" >> beam.GroupByKey() qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0]) # Create the examples. examples = qa_tuples | "create examples" >> beam.Map( lambda args: _create_example(*args) ) examples = _shuffle_examples(examples) examples |= "split train and test" >> beam.ParDo( _TrainTestSplitFn(args.train_split) ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG) if args.dataset_format == _JSON_FORMAT: write_sink = WriteToText file_name_suffix = ".json" serialize_fn = json.dumps else: assert args.dataset_format == _TF_FORMAT write_sink = WriteToTFRecord file_name_suffix = ".tfrecord" serialize_fn = _features_to_serialized_tf_example for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG), ("test", _TrainTestSplitFn.TEST_TAG)]: serialized_examples = examples[tag] | ( "serialize {} examples".format(name) >> beam.Map(serialize_fn)) ( serialized_examples | ("write " + name) >> write_sink( os.path.join(args.output_dir, name), file_name_suffix=file_name_suffix, num_shards=args.num_shards_train, ) ) result = p.run() result.wait_until_finish()
Example #21
Source File: vcf_to_bq_preprocess.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def run(argv=None):
    # type: (List[str]) -> (str, str)
    """Runs preprocess pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    all_patterns = known_args.all_patterns
    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
        if known_args.report_all_conflicts:
            variants = pipeline_common.read_variants(
                p, all_patterns, pipeline_mode,
                allow_malformed_records=True,
                pre_infer_headers=True)
            malformed_records = variants | filter_variants.ExtractMalformedVariants()
            inferred_headers, merged_headers = (
                _get_inferred_headers(variants, merged_headers))
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers),
                     beam.pvalue.AsSingleton(inferred_headers),
                     beam.pvalue.AsIter(malformed_records)))
        else:
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers)))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
Example #22
Source File: process_delimited.py From professional-services with Apache License 2.0 | 4 votes |
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() # Add the arguments needed for this specific Dataflow job. parser.add_argument( '--input', dest='input', required=True, help='Input file to read. This can be a local file or ' 'a file in a Google Storage Bucket.') parser.add_argument('--output', dest='output', required=True, help='Output BQ table to write results to.') parser.add_argument('--delimiter', dest='delimiter', required=False, help='Delimiter to split input records.', default=',') parser.add_argument('--fields', dest='fields', required=True, help='Comma separated list of field names.') parser.add_argument('--load_dt', dest='load_dt', required=True, help='Load date in YYYY-MM-DD format.') known_args, pipeline_args = parser.parse_known_args(argv) row_transformer = RowTransformer(delimiter=known_args.delimiter, header=known_args.fields, filename=ntpath.basename(known_args.input), load_dt=known_args.load_dt) p_opts = pipeline_options.PipelineOptions(pipeline_args) # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information including where Dataflow should # store temp files, and what the project id is. with beam.Pipeline(options=p_opts) as pipeline: # Read the file. This is the source of the pipeline. All further # processing starts with lines read from the file. We use the input # argument from the command line. rows = pipeline | "Read from text file" >> beam.io.ReadFromText(known_args.input) # This stage of the pipeline translates from a delimited single row # input to a dictionary object consumable by BigQuery. # It refers to a function we have written. This function will # be run in parallel on different workers using input from the # previous stage of the pipeline. dict_records = rows | "Convert to BigQuery row" >> beam.Map( lambda r: row_transformer.parse(r)) # This stage of the pipeline writes the dictionary records into # an existing BigQuery table. The sink is also configured to truncate # the table if it contains any existing records. dict_records | "Write to BigQuery" >> beam.io.Write( beam.io.BigQuerySink(known_args.output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
Example #23
Source File: data_transformation.py From professional-services with Apache License 2.0 | 4 votes |
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() # Here we add some specific command line arguments we expect. Specifically # we have the input file to load and the output table to write to. parser.add_argument( '--input', dest='input', required=False, help='Input file to read. This can be a local file or ' 'a file in a Google Storage Bucket.', # This example file contains a total of only 10 lines. # It is useful for developing on a small set of data default='gs://python-dataflow-example/data_files/head_usa_names.csv') # This defaults to the temp dataset in your BigQuery project. You'll have # to create the temp dataset yourself using bq mk temp parser.add_argument('--output', dest='output', required=False, help='Output BQ table to write results to.', default='lake.usa_names_transformed') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # DataTransformation is a class we built in this script to hold the logic for # transforming the file into a BigQuery table. data_ingestion = DataTransformation() # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information like where Dataflow should # store temp files, and what the project id is. p = beam.Pipeline(options=PipelineOptions(pipeline_args)) schema = parse_table_schema_from_json(data_ingestion.schema_str) (p # Read the file. This is the source of the pipeline. All further # processing starts with lines read from the file. We use the input # argument from the command line. We also skip the first line which is a # header row. | 'Read From Text' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1) # This stage of the pipeline translates from a CSV file single row # input as a string, to a dictionary object consumable by BigQuery. # It refers to a function we have written. This function will # be run in parallel on different workers using input from the # previous stage of the pipeline. | 'String to BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s)) | 'Write to BigQuery' >> beam.io.Write( beam.io.BigQuerySink( # The table name is a required argument for the BigQuery sink. # In this case we use the value passed in from the command line. known_args.output, # Here we use the JSON schema read in from a JSON file. # Specifying the schema allows the API to create the table correctly if it does not yet exist. schema=schema, # Creates the table in BigQuery if it does not yet exist. create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, # Deletes all data in the BigQuery table before writing. write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))) p.run().wait_until_finish()
Example #24
Source File: load_file_generator.py From professional-services with Apache License 2.0 | 4 votes |
def _create_parquet_file(self, blob_name, staging_table_util,
                         destination_prefix):
    """Creates a parquet file from a staging table and stores in GCS.

    The parquet file is generated using Dataflow, since BigQuery Extract
    Jobs do not support the parquet file type as a destination format.

    Args:
        blob_name(str): Name of the file (or blob) to be generated. Starts
            with 'fileType=' and ends with the file extension.
            Ex: fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876.csv  # pylint: disable=line-too-long
        staging_table_util(load_benchmark_tools.table_util.TableUtil): Util
            object for interacting with the staging table that the parquet
            file will be generated from.
        destination_prefix(str): String containing the 'gs://' prefix, the
            bucket name, and the path of the file, without the extension.
            This is needed by the WriteToParquet class.
            Ex: gs://annarudy_test_files/fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876  # pylint: disable=line-too-long
    """
    logging.info('Attempting to create file '
                 '{0:s}'.format(blob_name))
    pipeline_args = ['--project', self.project_id,
                     '--staging_location', self.dataflow_staging_location,
                     '--temp_location', self.dataflow_temp_location,
                     '--save_main_session',
                     '--worker_machine_type', 'n1-highcpu-32',
                     '--runner', 'DataflowRunner',
                     '--setup_file', './setup.py']
    options = pipeline_options.PipelineOptions(pipeline_args)
    table_spec = beam_bigquery.TableReference(
        projectId=self.project_id,
        datasetId=self.primitive_staging_dataset_id,
        tableId=staging_table_util.table_id)
    bq_schema = staging_table_util.table.schema
    pa_schema = parquet_util.ParquetUtil(
        bq_schema).get_pa_translated_schema()
    p = beam.Pipeline(options=options)
    table = (p
             | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(table_spec)))
    (table
     | beam.io.WriteToParquet(
         file_path_prefix=destination_prefix,
         schema=pa_schema,
         file_name_suffix='.parquet',
         num_shards=1,
         shard_name_template='',
     ))
    p.run().wait_until_finish()
    logging.info('Created file: {0:s}'.format(blob_name))
Example #25
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern=None):
    # type: (argparse.Namespace, List[str], int, str, str) -> None
    """Merges VCF headers using beam based on pipeline_mode."""
    options = pipeline_options.PipelineOptions(pipeline_args)

    # Always run pipeline locally if data is small.
    if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
            not known_args.infer_headers and
            not known_args.infer_annotation_types):
        options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    merge_headers_job_name = pipeline_common.generate_unique_name(
        _MERGE_HEADERS_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + merge_headers_job_name
    else:
        google_cloud_options.job_name = merge_headers_job_name

    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_merged_headers_file_name = '-'.join([google_cloud_options.job_name,
                                              _MERGE_HEADERS_FILE_NAME])
    temp_merged_headers_file_path = filesystems.FileSystems.join(
        temp_directory, temp_merged_headers_file_name)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(
            p, pipeline_mode, known_args.all_patterns)
        _ = (headers
             | 'SampleInfoToAvro' >> sample_info_to_avro.SampleInfoToAvro(
                 avro_root_path +
                 sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX,
                 SampleNameEncoding[known_args.sample_name_encoding]))
        if known_args.representative_header_file:
            return
        merged_header = pipeline_common.get_merged_headers(
            headers,
            known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records)
        if annotated_vcf_pattern:
            merged_header = pipeline_common.add_annotation_headers(
                p, known_args, pipeline_mode, merged_header,
                annotated_vcf_pattern)
        if known_args.infer_headers or known_args.infer_annotation_types:
            infer_headers_input_pattern = (
                [annotated_vcf_pattern] if annotated_vcf_pattern
                else known_args.all_patterns)
            merged_header = _add_inferred_headers(infer_headers_input_pattern,
                                                  p, known_args, merged_header,
                                                  pipeline_mode)

        pipeline_common.write_headers(merged_header,
                                      temp_merged_headers_file_path)
        known_args.representative_header_file = temp_merged_headers_file_path
Example #26
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def _get_input_dimensions(known_args, pipeline_args):
    pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = beam_pipeline_options.view_as(
        pipeline_options.GoogleCloudOptions)

    estimate_sizes_job_name = pipeline_common.generate_unique_name(
        _ESTIMATE_SIZES_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + estimate_sizes_job_name
    else:
        google_cloud_options.job_name = estimate_sizes_job_name
    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_estimated_input_size_file_name = '-'.join(
        [google_cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME])
    temp_estimated_input_size_file_path = filesystems.FileSystems.join(
        temp_directory, temp_estimated_input_size_file_name)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        estimates = pipeline_common.get_estimates(
            p, pipeline_mode, known_args.all_patterns)

        files_size = (estimates
                      | 'GetFilesSize' >> extract_input_size.GetFilesSize())
        file_count = (estimates
                      | 'CountAllFiles' >> beam.combiners.Count.Globally())
        sample_map = (estimates
                      | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
        estimated_value_count = (sample_map
                                 | extract_input_size.GetEstimatedValueCount())
        estimated_sample_count = (sample_map
                                  | extract_input_size.GetEstimatedSampleCount())
        estimated_variant_count = (estimates
                                   | 'GetEstimatedVariantCount' >>
                                   extract_input_size.GetEstimatedVariantCount())
        _ = (estimated_variant_count
             | beam.ParDo(extract_input_size.print_estimates_to_file,
                          beam.pvalue.AsSingleton(estimated_sample_count),
                          beam.pvalue.AsSingleton(estimated_value_count),
                          beam.pvalue.AsSingleton(files_size),
                          beam.pvalue.AsSingleton(file_count),
                          temp_estimated_input_size_file_path))

    with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
        estimates = f.readlines()
    if len(estimates) != 5:
        raise ValueError('Exactly 5 estimates were expected in {}.'.format(
            temp_estimated_input_size_file_path))

    known_args.estimated_variant_count = int(estimates[0].strip())
    known_args.estimated_sample_count = int(estimates[1].strip())
    known_args.estimated_value_count = int(estimates[2].strip())
    known_args.files_size = int(estimates[3].strip())
    known_args.file_count = int(estimates[4].strip())
Example #27
Source File: bq_to_vcf.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def run(argv=None):
    # type: (List[str]) -> None
    """Runs BigQuery to VCF pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    is_direct_runner = pipeline_common.is_pipeline_direct_runner(
        beam.Pipeline(options=options))
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    if not google_cloud_options.project:
        raise ValueError('project must be set.')
    if not is_direct_runner and not known_args.output_file.startswith('gs://'):
        raise ValueError('Please set the output file {} to GCS when running with '
                         'DataflowRunner.'.format(known_args.output_file))
    if is_direct_runner:
        known_args.number_of_bases_per_shard = sys.maxsize

    temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
    unique_temp_id = pipeline_common.generate_unique_name(
        google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
    vcf_data_temp_folder = filesystems.FileSystems.join(
        temp_folder,
        '{}_data_temp_files'.format(unique_temp_id))
    # Create the directory manually. FileSystems cannot create a file if the
    # directory does not exist when using Direct Runner.
    filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
    vcf_header_file_path = filesystems.FileSystems.join(
        temp_folder,
        '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

    if not known_args.representative_header_file:
        known_args.representative_header_file = filesystems.FileSystems.join(
            temp_folder, '{}_meta_info.vcf'.format(unique_temp_id))
        _write_vcf_meta_info(known_args.input_table,
                             known_args.representative_header_file,
                             known_args.allow_incompatible_schema)

    _bigquery_to_vcf_shards(known_args, options, vcf_data_temp_folder,
                            vcf_header_file_path)
    if is_direct_runner:
        vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                                   vcf_data_temp_folder,
                                                   known_args.output_file)
    else:
        vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                                 vcf_header_file_path,
                                                 vcf_data_temp_folder,
                                                 known_args.output_file)