Python apache_beam.options.pipeline_options.GoogleCloudOptions() Examples
The following are 8 code examples of apache_beam.options.pipeline_options.GoogleCloudOptions(), each taken from an open-source project; the source file and project are noted above each example. You may also want to check out the other available functions and classes of the module apache_beam.options.pipeline_options.
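Before the project examples, here is a minimal sketch of the pattern they all share: build a PipelineOptions from command-line flags, call view_as(GoogleCloudOptions) to get the Google Cloud-specific view of the same options object, and then read or set fields such as project, temp_location, and job_name. The flag values and the job-name format below are placeholders for illustration, not values from any of the projects listed here.

import time

from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions

# Placeholder flags; in the examples below these come from real command lines.
flags = [
    '--project=my-project',          # hypothetical project id
    '--temp_location=gs://my-bucket/temp',
    '--region=us-central1',
]
options = PipelineOptions(flags)

# view_as() exposes the Google Cloud-specific subset of the same options.
gcloud_options = options.view_as(GoogleCloudOptions)
if not gcloud_options.job_name:
    # Derive a job name if none was passed on the command line.
    gcloud_options.job_name = 'example-job-%d' % int(time.time())

print(gcloud_options.project,
      gcloud_options.temp_location,
      gcloud_options.job_name)

Because view_as() returns a view rather than a copy, any field set through GoogleCloudOptions is visible to the pipeline built from the original PipelineOptions object; several of the examples below rely on this to rename jobs or redirect temp locations before running.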
Example #1
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    The VCF shards directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True)
    sample_ids = (variants
                  | 'CombineSampleIds' >>
                  combine_sample_ids.SampleIdsCombiner()
                  | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): Annotation pipeline currently stores sample IDs instead
    # of sample names in the sharded VCF files, which would lead to double
    # hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))
  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
Example #2
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0
def _annotate_vcf_files(all_patterns, known_args, pipeline_args):
  # type: (List[str], argparse.Namespace, List[str]) -> str
  """Annotates the VCF files using VEP.

  Returns:
    The annotated VCF files directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  annotate_files_job_name = pipeline_common.generate_unique_name(
      _ANNOTATE_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, annotate_files_job_name)

  with beam.Pipeline(options=options) as p:
    _ = (p
         | beam.Create(all_patterns)
         | 'AnnotateShards' >> beam.ParDo(
             annotate_files.AnnotateFile(known_args, pipeline_args)))
  if known_args.annotation_fields:
    known_args.annotation_fields.append(known_args.vep_info_field)
  else:
    known_args.annotation_fields = [known_args.vep_info_field]
  # TODO(bashir2): The VEP runner by default runs VEP with --allele_number
  # hence we turn on this feature here. However, this might be inconsistent
  # with other annotation fields that are originally present in input files,
  # if they do not have ALLELE_NUM annotation. The fix is to make annotation
  # ALT matching smarter to fall back on other matching methods if ALLELE_NUM
  # is not present. When this is implemented, we may even consider removing
  # use_allele_num flag and always start by checking if ALLELE_NUM is present.
  known_args.use_allele_num = True
  return vep_runner_util.get_output_pattern(known_args.annotation_output_dir)
Example #3
Source File: tfdv.py From spotify-tensorflow with Apache License 2.0
def generate_statistics_from_tfrecord(pipeline_args,   # type: List[str]
                                      data_location,   # type: str
                                      output_path,     # type: str
                                      stats_options    # type: StatsOptions
                                      ):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(
        data_location=input_files,
        output_path=output_path,
        stats_options=stats_options,
        pipeline_options=pipeline_options)
Example #4
Source File: run_inference.py From tfx-bsl with Apache License 2.0
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
             pipeline_options: PipelineOptions):
  super(_RemotePredictDoFn, self).__init__(inference_spec_type)
  self._api_client = None

  project_id = (
      inference_spec_type.ai_platform_prediction_model_spec.project_id or
      pipeline_options.view_as(GoogleCloudOptions).project)
  if not project_id:
    raise ValueError('Either a non-empty project id or project flag in '
                     ' beam pipeline options needs be provided.')

  model_name = (
      inference_spec_type.ai_platform_prediction_model_spec.model_name)
  if not model_name:
    raise ValueError('A non-empty model name must be provided.')

  version_name = (
      inference_spec_type.ai_platform_prediction_model_spec.version_name)
  name_spec = 'projects/{}/models/{}'
  # If version is not specified, the default version for a model is used.
  if version_name:
    name_spec += '/versions/{}'
  self._full_model_name = name_spec.format(project_id, model_name,
                                           version_name)
Example #5
Source File: bq_to_vcf.py From gcp-variant-transforms with Apache License 2.0
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  is_direct_runner = pipeline_common.is_pipeline_direct_runner(
      beam.Pipeline(options=options))
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError('project must be set.')
  if not is_direct_runner and not known_args.output_file.startswith('gs://'):
    raise ValueError('Please set the output file {} to GCS when running with '
                     'DataflowRunner.'.format(known_args.output_file))
  if is_direct_runner:
    known_args.number_of_bases_per_shard = sys.maxsize

  temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
  unique_temp_id = pipeline_common.generate_unique_name(
      google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
  vcf_data_temp_folder = filesystems.FileSystems.join(
      temp_folder,
      '{}_data_temp_files'.format(unique_temp_id))
  # Create the directory manually. FileSystems cannot create a file if the
  # directory does not exist when using Direct Runner.
  filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
  vcf_header_file_path = filesystems.FileSystems.join(
      temp_folder,
      '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

  if not known_args.representative_header_file:
    known_args.representative_header_file = filesystems.FileSystems.join(
        temp_folder,
        '{}_meta_info.vcf'.format(unique_temp_id))
    _write_vcf_meta_info(known_args.input_table,
                         known_args.representative_header_file,
                         known_args.allow_incompatible_schema)

  _bigquery_to_vcf_shards(known_args,
                          options,
                          vcf_data_temp_folder,
                          vcf_header_file_path)
  if is_direct_runner:
    vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                               vcf_data_temp_folder,
                                               known_args.output_file)
  else:
    vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                             vcf_header_file_path,
                                             vcf_data_temp_folder,
                                             known_args.output_file)
Example #6
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0
def _get_input_dimensions(known_args, pipeline_args):
  pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = beam_pipeline_options.view_as(
      pipeline_options.GoogleCloudOptions)

  estimate_sizes_job_name = pipeline_common.generate_unique_name(
      _ESTIMATE_SIZES_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + estimate_sizes_job_name
  else:
    google_cloud_options.job_name = estimate_sizes_job_name
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_estimated_input_size_file_name = '-'.join(
      [google_cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME])
  temp_estimated_input_size_file_path = filesystems.FileSystems.join(
      temp_directory, temp_estimated_input_size_file_name)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    estimates = pipeline_common.get_estimates(
        p, pipeline_mode, known_args.all_patterns)

    files_size = (estimates
                  | 'GetFilesSize' >> extract_input_size.GetFilesSize())
    file_count = (estimates
                  | 'CountAllFiles' >> beam.combiners.Count.Globally())
    sample_map = (estimates
                  | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
    estimated_value_count = (sample_map
                             | extract_input_size.GetEstimatedValueCount())
    estimated_sample_count = (sample_map
                              | extract_input_size.GetEstimatedSampleCount())
    estimated_variant_count = (estimates
                               | 'GetEstimatedVariantCount' >>
                               extract_input_size.GetEstimatedVariantCount())
    _ = (estimated_variant_count
         | beam.ParDo(extract_input_size.print_estimates_to_file,
                      beam.pvalue.AsSingleton(estimated_sample_count),
                      beam.pvalue.AsSingleton(estimated_value_count),
                      beam.pvalue.AsSingleton(files_size),
                      beam.pvalue.AsSingleton(file_count),
                      temp_estimated_input_size_file_path))

  with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
    estimates = f.readlines()
  if len(estimates) != 5:
    raise ValueError('Exactly 5 estimates were expected in {}.'.format(
        temp_estimated_input_size_file_path))

  known_args.estimated_variant_count = int(estimates[0].strip())
  known_args.estimated_sample_count = int(estimates[1].strip())
  known_args.estimated_value_count = int(estimates[2].strip())
  known_args.files_size = int(estimates[3].strip())
  known_args.file_count = int(estimates[4].strip())
Example #7
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0
def _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern=None):
  # type: (argparse.Namespace, List[str], int, str, str) -> None
  """Merges VCF headers using beam based on pipeline_mode."""
  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join([google_cloud_options.job_name,
                                            _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(
        p, pipeline_mode, known_args.all_patterns)
    _ = (headers
         | 'SampleInfoToAvro' >> sample_info_to_avro.SampleInfoToAvro(
             avro_root_path +
             sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX,
             SampleNameEncoding[known_args.sample_name_encoding]))
    if known_args.representative_header_file:
      return
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if annotated_vcf_pattern:
      merged_header = pipeline_common.add_annotation_headers(
          p, known_args, pipeline_mode, merged_header, annotated_vcf_pattern)
    if known_args.infer_headers or known_args.infer_annotation_types:
      infer_headers_input_pattern = (
          [annotated_vcf_pattern] if annotated_vcf_pattern
          else known_args.all_patterns)
      merged_header = _add_inferred_headers(infer_headers_input_pattern, p,
                                            known_args, merged_header,
                                            pipeline_mode)

    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path
Example #8
Source File: revise_preprocessed_data.py From cloudml-examples with Apache License 2.0
def run(argv=None):
  """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  revise_options = pipeline_options.view_as(ReviseOptions)
  cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  output_dir = os.path.join(revise_options.output,
                            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
  cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
  cloud_options.temp_location = os.path.join(output_dir, 'tmp')
  cloud_options.job_name = 'relabel-examples-%s' % (
      datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  metadata_query = str(
      Template(open(revise_options.metadata, 'r').read()).render(
          METADATA_QUERY_REPLACEMENTS))
  logging.info('metadata query : %s', metadata_query)

  with beam.Pipeline(options=pipeline_options) as p:
    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'ReadSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query,
                                   use_standard_sql=True))
        | 'TableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

    # Read the tf.Example protos into a PCollection.
    examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
        file_pattern=revise_options.input,
        compression_type=CompressionTypes.GZIP)

    # Filter the TensorFlow Example Protocol Buffers.
    filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
        lambda example, samples_metadata: filter_and_revise_example(
            example, samples_metadata),
        beam.pvalue.AsSingleton(samples_metadata)))

    # Write the subset of tf.Example protos to Cloud Storage.
    _ = (filtered_examples
         | 'SerializeExamples' >>
         beam.Map(lambda example: example.SerializeToString())
         | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(output_dir, 'examples'),
             compression_type=CompressionTypes.GZIP,
             file_name_suffix='.tfrecord.gz'))