Python apache_beam.options.pipeline_options.GoogleCloudOptions() Examples

The following are 8 code examples of apache_beam.options.pipeline_options.GoogleCloudOptions(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module apache_beam.options.pipeline_options.
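All of the examples below follow the same basic pattern: build a PipelineOptions object from a list of flag strings and call view_as(pipeline_options.GoogleCloudOptions) to read or set Google Cloud specific settings such as project, job_name, temp_location and staging_location. As a minimal, self-contained sketch of that pattern (the project id, bucket and job name are placeholders, not taken from any of the projects below):

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

# Build options from command-line style flags (placeholder values).
options = PipelineOptions([
    '--project=my-project',
    '--temp_location=gs://my-bucket/tmp',
])

# View the same options object as GoogleCloudOptions to access GCP settings.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.job_name = 'my-job-name'  # placeholder job name
print(google_cloud_options.project, google_cloud_options.temp_location)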
Example #1
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
   The VCF shards directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True)
    sample_ids = (variants
                  | 'CombineSampleIds' >>
                  combine_sample_ids.SampleIdsCombiner()
                  | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): Annotation pipeline currently stores sample IDs instead
    # of sample names in the sharded VCF files, which would lead to double
    # hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD] 
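Note: the helper _update_google_cloud_job_name is not part of this excerpt. Judging from the equivalent inline logic in Examples #6 and #7 below, it most likely appends the generated suffix to any existing Dataflow job name, roughly as follows (a sketch, not the project's actual code):

def _update_google_cloud_job_name(google_cloud_options, job_name):
  # Append the generated unique suffix to an existing job name, or use it as-is.
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + job_name
  else:
    google_cloud_options.job_name = job_name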
Example #2
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _annotate_vcf_files(all_patterns, known_args, pipeline_args):
  # type: (List[str], argparse.Namespace, List[str]) -> str
  """Annotates the VCF files using VEP.

  Returns:
    The annotated VCF files directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  annotate_files_job_name = pipeline_common.generate_unique_name(
      _ANNOTATE_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, annotate_files_job_name)

  with beam.Pipeline(options=options) as p:
    _ = (p
         | beam.Create(all_patterns)
         | 'AnnotateShards' >> beam.ParDo(
             annotate_files.AnnotateFile(known_args, pipeline_args)))
  if known_args.annotation_fields:
    known_args.annotation_fields.append(known_args.vep_info_field)
  else:
    known_args.annotation_fields = [known_args.vep_info_field]
  # TODO(bashir2): The VEP runner runs VEP with --allele_number by default,
  # hence we turn on this feature here. However, this might be inconsistent
  # with other annotation fields that are originally present in input files if
  # they do not have the ALLELE_NUM annotation. The fix is to make annotation
  # ALT matching smarter, falling back on other matching methods if ALLELE_NUM
  # is not present. Once that is implemented, we may even consider removing the
  # use_allele_num flag and always start by checking whether ALLELE_NUM is
  # present.
  known_args.use_allele_num = True
  return vep_runner_util.get_output_pattern(known_args.annotation_output_dir) 
Example #3
Source File: tfdv.py    From spotify-tensorflow with Apache License 2.0
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) ->  statistics_pb2.DatasetFeatureStatisticsList
    """
    Generates a stats file from a TFRecord dataset using TFDV.

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data directory containing TFRecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options) 
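A hypothetical invocation of the wrapper above (the Dataflow flags, bucket names and paths are placeholders):

import tensorflow_data_validation as tfdv

stats = generate_statistics_from_tfrecord(
    pipeline_args=["--runner=DataflowRunner",
                   "--project=my-project",
                   "--temp_location=gs://my-bucket/tmp"],
    data_location="gs://my-bucket/data/train",    # directory of *.tfrecords* files
    output_path="gs://my-bucket/stats/train.pb",
    stats_options=tfdv.StatsOptions())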
Example #4
Source File: run_inference.py    From tfx-bsl with Apache License 2.0
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
               pipeline_options: PipelineOptions):
    super(_RemotePredictDoFn, self).__init__(inference_spec_type)
    self._api_client = None

    project_id = (
        inference_spec_type.ai_platform_prediction_model_spec.project_id or
        pipeline_options.view_as(GoogleCloudOptions).project)
    if not project_id:
      raise ValueError('Either a non-empty project id or a project flag in '
                       'the Beam pipeline options needs to be provided.')

    model_name = (
        inference_spec_type.ai_platform_prediction_model_spec.model_name)
    if not model_name:
      raise ValueError('A non-empty model name must be provided.')

    version_name = (
        inference_spec_type.ai_platform_prediction_model_spec.version_name)
    name_spec = 'projects/{}/models/{}'
    # If version is not specified, the default version for a model is used.
    if version_name:
      name_spec += '/versions/{}'
    self._full_model_name = name_spec.format(project_id, model_name,
                                             version_name) 
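Because of the fallback to pipeline_options.view_as(GoogleCloudOptions).project, the DoFn can pick up the project from the standard Beam --project flag when the inference spec leaves project_id empty; a hypothetical setup (the flag value is a placeholder):

from apache_beam.options.pipeline_options import PipelineOptions

# When ai_platform_prediction_model_spec.project_id is empty, the constructor
# above falls back to the --project flag carried in the Beam pipeline options.
pipeline_options = PipelineOptions(['--project=my-project'])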
Example #5
Source File: bq_to_vcf.py    From gcp-variant-transforms with Apache License 2.0
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  is_direct_runner = pipeline_common.is_pipeline_direct_runner(
      beam.Pipeline(options=options))
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError('project must be set.')
  if not is_direct_runner and not known_args.output_file.startswith('gs://'):
    raise ValueError('Please set the output file {} to GCS when running with '
                     'DataflowRunner.'.format(known_args.output_file))
  if is_direct_runner:
    known_args.number_of_bases_per_shard = sys.maxsize

  temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
  unique_temp_id = pipeline_common.generate_unique_name(
      google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
  vcf_data_temp_folder = filesystems.FileSystems.join(
      temp_folder,
      '{}_data_temp_files'.format(unique_temp_id))
  # Create the directory manually. FileSystems cannot create a file if the
  # directory does not exist when using Direct Runner.
  filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
  vcf_header_file_path = filesystems.FileSystems.join(
      temp_folder,
      '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

  if not known_args.representative_header_file:
    known_args.representative_header_file = filesystems.FileSystems.join(
        temp_folder,
        '{}_meta_info.vcf'.format(unique_temp_id))
    _write_vcf_meta_info(known_args.input_table,
                         known_args.representative_header_file,
                         known_args.allow_incompatible_schema)

  _bigquery_to_vcf_shards(known_args,
                          options,
                          vcf_data_temp_folder,
                          vcf_header_file_path)
  if is_direct_runner:
    vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                               vcf_data_temp_folder,
                                               known_args.output_file)
  else:
    vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                             vcf_header_file_path,
                                             vcf_data_temp_folder,
                                             known_args.output_file) 
Example #6
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _get_input_dimensions(known_args, pipeline_args):
  pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = beam_pipeline_options.view_as(
      pipeline_options.GoogleCloudOptions)

  estimate_sizes_job_name = pipeline_common.generate_unique_name(
      _ESTIMATE_SIZES_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + estimate_sizes_job_name
  else:
    google_cloud_options.job_name = estimate_sizes_job_name
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_estimated_input_size_file_name = '-'.join(
      [google_cloud_options.job_name,
       _ESTIMATE_SIZES_FILE_NAME])
  temp_estimated_input_size_file_path = filesystems.FileSystems.join(
      temp_directory, temp_estimated_input_size_file_name)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    estimates = pipeline_common.get_estimates(
        p, pipeline_mode, known_args.all_patterns)

    files_size = (estimates
                  | 'GetFilesSize' >> extract_input_size.GetFilesSize())
    file_count = (estimates
                  | 'CountAllFiles' >> beam.combiners.Count.Globally())
    sample_map = (estimates
                  | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
    estimated_value_count = (sample_map
                             | extract_input_size.GetEstimatedValueCount())
    estimated_sample_count = (sample_map
                              | extract_input_size.GetEstimatedSampleCount())
    estimated_variant_count = (estimates
                               | 'GetEstimatedVariantCount'
                               >> extract_input_size.GetEstimatedVariantCount())
    _ = (estimated_variant_count
         | beam.ParDo(extract_input_size.print_estimates_to_file,
                      beam.pvalue.AsSingleton(estimated_sample_count),
                      beam.pvalue.AsSingleton(estimated_value_count),
                      beam.pvalue.AsSingleton(files_size),
                      beam.pvalue.AsSingleton(file_count),
                      temp_estimated_input_size_file_path))

  with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
    estimates = f.readlines()
  if len(estimates) != 5:
    raise ValueError('Exactly 5 estimates were expected in {}.'.format(
        temp_estimated_input_size_file_path))

  known_args.estimated_variant_count = int(estimates[0].strip())
  known_args.estimated_sample_count = int(estimates[1].strip())
  known_args.estimated_value_count = int(estimates[2].strip())
  known_args.files_size = int(estimates[3].strip())
  known_args.file_count = int(estimates[4].strip()) 
Example #7
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _merge_headers(known_args, pipeline_args,
                   pipeline_mode, avro_root_path, annotated_vcf_pattern=None):
  # type: (argparse.Namespace, List[str], int, str, str) -> None
  """Merges VCF headers using beam based on pipeline_mode."""
  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join([google_cloud_options.job_name,
                                            _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(
        p, pipeline_mode,
        known_args.all_patterns)
    _ = (headers
         | 'SampleInfoToAvro'
         >> sample_info_to_avro.SampleInfoToAvro(
             avro_root_path +
             sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX,
             SampleNameEncoding[known_args.sample_name_encoding]))
    if known_args.representative_header_file:
      return
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if annotated_vcf_pattern:
      merged_header = pipeline_common.add_annotation_headers(
          p, known_args, pipeline_mode, merged_header,
          annotated_vcf_pattern)
    if known_args.infer_headers or known_args.infer_annotation_types:
      infer_headers_input_pattern = (
          [annotated_vcf_pattern] if
          annotated_vcf_pattern else known_args.all_patterns)
      merged_header = _add_inferred_headers(infer_headers_input_pattern, p,
                                            known_args, merged_header,
                                            pipeline_mode)

    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path 
Example #8
Source File: revise_preprocessed_data.py    From cloudml-examples with Apache License 2.0
def run(argv=None):
  """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  revise_options = pipeline_options.view_as(ReviseOptions)
  cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  output_dir = os.path.join(revise_options.output,
                            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
  cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
  cloud_options.temp_location = os.path.join(output_dir, 'tmp')
  cloud_options.job_name = 'relabel-examples-%s' % (
      datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  metadata_query = str(
      Template(open(revise_options.metadata, 'r').read()).render(
          METADATA_QUERY_REPLACEMENTS))
  logging.info('metadata query : %s', metadata_query)

  with beam.Pipeline(options=pipeline_options) as p:
    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'ReadSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True))
        | 'TableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

    # Read the tf.Example protos into a PCollection.
    examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
        file_pattern=revise_options.input,
        compression_type=CompressionTypes.GZIP)

    # Filter the TensorFlow Example Protocol Buffers.
    filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
        lambda example, samples_metadata:
        filter_and_revise_example(example, samples_metadata),
        beam.pvalue.AsSingleton(samples_metadata)))

    # Write the subset of tf.Example protos to Cloud Storage.
    _ = (filtered_examples
         | 'SerializeExamples' >>
         beam.Map(lambda example: example.SerializeToString())
         | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(output_dir, 'examples'),
             compression_type=CompressionTypes.GZIP,
             file_name_suffix='.tfrecord.gz'))