Python apache_beam.options.pipeline_options.PipelineOptions() Examples

The following are 27 code examples of apache_beam.options.pipeline_options.PipelineOptions(), each taken from an open-source project and listed with its source file and license. You may also want to check out the other available functions and classes of the apache_beam.options.pipeline_options module.
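Before working through the examples, here is a minimal sketch (not taken from any of the projects below) of the two construction styles that recur throughout: passing raw command-line flags and passing keyword arguments, plus reading typed sub-options through view_as(). The flag and project values are placeholders.

from apache_beam.options.pipeline_options import (
    PipelineOptions, SetupOptions, StandardOptions)

# Style 1: parse a list of command-line flags (placeholder values).
options = PipelineOptions(['--runner=DirectRunner', '--project=my-project'])

# Style 2: pass the same settings as keyword arguments.
options = PipelineOptions(
    flags=[], runner='DirectRunner', project='my-project',
    streaming=False, save_main_session=True)

# Typed sub-options are read and written through view_as(); all views
# share the same underlying options.
options.view_as(SetupOptions).save_main_session = True
print(options.view_as(StandardOptions).runner)  # DirectRunner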
Example #1
Source File: streaming_beam.py    From python-docs-samples with Apache License 2.0
def run(args, input_subscription, output_table, window_interval):
    """Build and run the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:

        # Read the messages from PubSub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results into a BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA) 
Example #2
Source File: preprocess.py    From professional-services with Apache License 2.0
def run():
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis."""

    flags = parse_arguments(sys.argv[1:])
    pipeline_args = get_pipeline_args(flags)

    options = pipeline_options.PipelineOptions(flags=[], **pipeline_args)
    options.view_as(pipeline_options.WorkerOptions).machine_type = (
        flags.machine_type)

    temp_dir = os.path.join(flags.output_dir, 'tmp')

    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):
            build_pipeline(p, flags) 
Example #3
Source File: read_from_relational_db.py    From beam-nuggets with MIT License
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print) 
Example #4
Source File: base_executor.py    From tfx with Apache License 2.0
def _make_beam_pipeline(self) -> beam.Pipeline:
    """Makes beam pipeline."""
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    if pipeline_options.view_as(StandardOptions).runner:
      return beam.Pipeline(argv=self._beam_pipeline_args)

    # TODO(b/159468583): move this warning to Beam.
    direct_running_mode = pipeline_options.view_as(
        DirectOptions).direct_running_mode
    direct_num_workers = pipeline_options.view_as(
        DirectOptions).direct_num_workers
    if direct_running_mode == 'in_memory' and direct_num_workers != 1:
      absl.logging.warning(
          'If direct_num_workers is not equal to 1, direct_running_mode should '
          'be `multi_processing` or `multi_threading` instead of `in_memory` '
          'in order for it to have the desired worker parallelism effect.')

    return beam.Pipeline(
        options=pipeline_options, runner=fn_api_runner.FnApiRunner()) 
Example #5
Source File: PubSubToGCS.py    From python-docs-samples with Apache License 2.0
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to True because some DoFns rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        ) 
Example #6
Source File: vep_runner.py    From gcp-variant-transforms with Apache License 2.0
def _process_pipeline_args(self, pipeline_args):
    # type: (List[str]) -> None
    flags_dict = pipeline_options.PipelineOptions(
        pipeline_args).get_all_options()
    self._project = self._get_flag(flags_dict, 'project')
    self._region = self._get_flag(flags_dict, 'region')
    # TODO(bahsir2): Fix the error messages of _check_flag since
    # --worker_machine_type has dest='machine_type'.
    try:
      self._machine_type = self._get_flag(flags_dict, 'machine_type')
    except ValueError:
      self._machine_type = self._get_machine_type_from_fork()
    self._max_num_workers = self._get_flag(
        flags_dict, 'max_num_workers', 'num_workers')
    if self._max_num_workers <= 0:
      raise ValueError(
          '--max_num_workers and --num_workers should be positive numbers, '
          'got: {}'.format(self._max_num_workers)) 
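For context, get_all_options() used above returns a plain dict keyed by option name, so flags such as --max_num_workers can be inspected without going through view_as(). A small sketch with placeholder values (not part of the gcp-variant-transforms code):

from apache_beam.options.pipeline_options import PipelineOptions

# Placeholder flags; options that are not supplied appear with their defaults.
flags_dict = PipelineOptions(
    ['--project=my-project', '--max_num_workers=16']).get_all_options()
print(flags_dict['project'])          # my-project
print(flags_dict['max_num_workers'])  # 16
print(flags_dict['num_workers'])      # None (not set)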
Example #7
Source File: write_to_relational_db.py    From beam-nuggets with MIT License
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing
    )

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        ) 
Example #8
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _validate_annotation_pipeline_args(known_args, pipeline_args):
  match_results = filesystems.FileSystems.match(['{}*'.format(
      vep_runner_util.format_dir_path(known_args.annotation_output_dir))])
  if match_results and match_results[0].metadata_list:
    raise ValueError('Output directory {} already exists.'.format(
        known_args.annotation_output_dir))

  flags_dict = pipeline_options.PipelineOptions(pipeline_args).get_all_options()
  expected_flags = ['max_num_workers', 'num_workers']
  for flag in expected_flags:
    if flag in flags_dict and flags_dict[flag] > 0:
      return
  raise ValueError('Could not find any of {} with a valid value among pipeline '
                   'flags {}'.format(expected_flags, flags_dict)) 
Example #9
Source File: tfdv.py    From spotify-tensorflow with Apache License 2.0
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) ->  statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return: a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options) 
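The job_name and setup_file handling above works because view_as() returns views over one shared set of options, so an assignment made through a view is visible everywhere, including get_all_options(). A short sketch with a placeholder job name (not from the spotify-tensorflow code):

from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions

options = PipelineOptions([])
options.view_as(GoogleCloudOptions).job_name = 'generatestats-1234567890'  # placeholder
# The assignment made on the view is reflected in the shared options.
assert options.get_all_options()['job_name'] == 'generatestats-1234567890'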
Example #10
Source File: run_inference.py    From tfx-bsl with Apache License 2.0
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
               pipeline_options: PipelineOptions):
    super(_RemotePredictDoFn, self).__init__(inference_spec_type)
    self._api_client = None

    project_id = (
        inference_spec_type.ai_platform_prediction_model_spec.project_id or
        pipeline_options.view_as(GoogleCloudOptions).project)
    if not project_id:
      raise ValueError('Either a non-empty project id or a project flag in '
                       'beam pipeline options needs to be provided.')

    model_name = (
        inference_spec_type.ai_platform_prediction_model_spec.model_name)
    if not model_name:
      raise ValueError('A non-empty model name must be provided.')

    version_name = (
        inference_spec_type.ai_platform_prediction_model_spec.version_name)
    name_spec = 'projects/{}/models/{}'
    # If version is not specified, the default version for a model is used.
    if version_name:
      name_spec += '/versions/{}'
    self._full_model_name = name_spec.format(project_id, model_name,
                                             version_name) 
Example #11
Source File: run.py    From realtime-embeddings-matching with Apache License 2.0
def main(argv=None):
  known_args, pipeline_args = get_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  setup_options = pipeline_options.view_as(SetupOptions)
  setup_options.save_main_session = True
  pipeline.run(pipeline_options, known_args) 
Example #12
Source File: gcs_to_bigquery_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_file_to_records) |
       'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result) 
Example #13
Source File: bigquery_to_gcs_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_query, output_file, pipeline_args):
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p
       | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
       | 'to_physionet' >> beam.Map(map_to_physionet_record)
       | 'write' >> beam.io.WriteToText(output_file))
  result = p.run().wait_until_finish()

  logging.info('BigQuery to GCS result: %s', result) 
Example #14
Source File: physionet_to_mae_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
  """Read the physionet records from GCS and write them out as MAE."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings) |
       'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                  ['patient_id', 'record_number']) |
       'write_mae' >> beam.Map(write_mae, project, output_dir)
      )
  result = p.run().wait_until_finish()
  logging.info('GCS to MAE result: %s', result)
Example #15
Source File: gcs_to_bigquery_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(map_file_to_records) |
       'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result) 
Example #16
Source File: bigquery_to_gcs_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_query, output_path, pipeline_args):
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p
       | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
       | 'to_mist' >> beam.Map(map_to_mist_record)
       | 'write' >> beam.io.WriteToText(output_path))
  result = p.run().wait_until_finish()

  logging.info('BigQuery to GCS result: %s', result) 
Example #17
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    A list containing the VCF shards directory pattern.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True)
    sample_ids = (variants
                  | 'CombineSampleIds' >>
                  combine_sample_ids.SampleIdsCombiner()
                  | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): Annotation pipeline currently stores sample IDs instead
    # of sample names in the sharded VCF files, which would lead to double
    # hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD] 
Example #18
Source File: create_data.py    From conversational-datasets with Apache License 2.0
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    sentence_files_match = FileSystems.match([args.sentence_files])[0]
    sentence_files = [
        file_metadata.path
        for file_metadata in sentence_files_match.metadata_list]
    logging.info("Reading %i files from %s.",
                 len(sentence_files), args.sentence_files)
    assert len(sentence_files) > 0
    sentence_files = p | beam.Create(sentence_files)
    examples = sentence_files | "create examples" >> beam.FlatMap(
        partial(_create_examples_from_file,
                min_length=args.min_length,
                max_length=args.max_length,
                num_extra_contexts=args.num_extra_contexts)
    )

    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:

        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name)
            >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish() 
Example #19
Source File: revise_preprocessed_data.py    From cloudml-examples with Apache License 2.0
def run(argv=None):
  """Runs the revise preprocessed data pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  pipeline_options = PipelineOptions(flags=argv)
  revise_options = pipeline_options.view_as(ReviseOptions)
  cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  output_dir = os.path.join(revise_options.output,
                            datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(
      WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED'
  cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging')
  cloud_options.temp_location = os.path.join(output_dir, 'tmp')
  cloud_options.job_name = 'relabel-examples-%s' % (
      datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

  metadata_query = str(
      Template(open(revise_options.metadata, 'r').read()).render(
          METADATA_QUERY_REPLACEMENTS))
  logging.info('metadata query : %s', metadata_query)

  with beam.Pipeline(options=pipeline_options) as p:
    # Gather our sample metadata into a python dictionary.
    samples_metadata = (
        p
        | 'ReadSampleMetadata' >> beam.io.Read(
            beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True))
        | 'TableToDictionary' >> beam.CombineGlobally(
            util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN)))

    # Read the tf.Example protos into a PCollection.
    examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord(
        file_pattern=revise_options.input,
        compression_type=CompressionTypes.GZIP)

    # Filter the TensorFlow Example Protocol Buffers.
    filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap(
        lambda example, samples_metadata:
        filter_and_revise_example(example, samples_metadata),
        beam.pvalue.AsSingleton(samples_metadata)))

    # Write the subset of tf.Example protos to Cloud Storage.
    _ = (filtered_examples
         | 'SerializeExamples' >>
         beam.Map(lambda example: example.SerializeToString())
         | 'WriteExamples' >> tfrecordio.WriteToTFRecord(
             file_path_prefix=os.path.join(output_dir, 'examples'),
             compression_type=CompressionTypes.GZIP,
             file_name_suffix='.tfrecord.gz')) 
Example #20
Source File: create_data.py    From conversational-datasets with Apache License 2.0
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but the string representation of python
    # dictionary objects. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: _create_example(*args)
    )
    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:

        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples | ("write " + name)
            >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=args.num_shards_train,
            )
        )

    result = p.run()
    result.wait_until_finish() 
Example #21
Source File: vcf_to_bq_preprocess.py    From gcp-variant-transforms with Apache License 2.0
def run(argv=None):
  # type: (List[str]) -> (str, str)
  """Runs preprocess pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  all_patterns = known_args.all_patterns
  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
    merged_headers = pipeline_common.get_merged_headers(headers)
    merged_definitions = (headers
                          | 'MergeDefinitions' >>
                          merge_header_definitions.MergeDefinitions())
    if known_args.report_all_conflicts:
      variants = pipeline_common.read_variants(p,
                                               all_patterns,
                                               pipeline_mode,
                                               allow_malformed_records=True,
                                               pre_infer_headers=True)
      malformed_records = variants | filter_variants.ExtractMalformedVariants()
      inferred_headers, merged_headers = (_get_inferred_headers(variants,
                                                                merged_headers))
      _ = (merged_definitions
           | 'GenerateConflictsReport' >>
           beam.ParDo(preprocess_reporter.generate_report,
                      known_args.report_path,
                      beam.pvalue.AsSingleton(merged_headers),
                      beam.pvalue.AsSingleton(inferred_headers),
                      beam.pvalue.AsIter(malformed_records)))
    else:
      _ = (merged_definitions
           | 'GenerateConflictsReport' >>
           beam.ParDo(preprocess_reporter.generate_report,
                      known_args.report_path,
                      beam.pvalue.AsSingleton(merged_headers)))

    if known_args.resolved_headers_path:
      pipeline_common.write_headers(merged_headers,
                                    known_args.resolved_headers_path) 
Example #22
Source File: process_delimited.py    From professional-services with Apache License 2.0
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()

    # Add the arguments needed for this specific Dataflow job.
    parser.add_argument(
        '--input', dest='input', required=True,
        help='Input file to read.  This can be a local file or '
             'a file in a Google Storage Bucket.')

    parser.add_argument('--output', dest='output', required=True,
                        help='Output BQ table to write results to.')

    parser.add_argument('--delimiter', dest='delimiter', required=False,
                        help='Delimiter to split input records.',
                        default=',')

    parser.add_argument('--fields', dest='fields', required=True,
                        help='Comma separated list of field names.')

    parser.add_argument('--load_dt', dest='load_dt', required=True,
                        help='Load date in YYYY-MM-DD format.')

    known_args, pipeline_args = parser.parse_known_args(argv)
    row_transformer = RowTransformer(delimiter=known_args.delimiter,
                                     header=known_args.fields,
                                     filename=ntpath.basename(known_args.input),
                                     load_dt=known_args.load_dt)

    p_opts = pipeline_options.PipelineOptions(pipeline_args)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information including where Dataflow should
    # store temp files, and what the project id is.
    with beam.Pipeline(options=p_opts) as pipeline:
        # Read the file.  This is the source of the pipeline.  All further
        # processing starts with lines read from the file.  We use the input
        # argument from the command line.
        rows = pipeline | "Read from text file" >> beam.io.ReadFromText(known_args.input)

        # This stage of the pipeline translates from a delimited single row
        # input to a dictionary object consumable by BigQuery.
        # It refers to a function we have written.  This function will
        # be run in parallel on different workers using input from the
        # previous stage of the pipeline.
        dict_records = rows | "Convert to BigQuery row" >> beam.Map(
            lambda r: row_transformer.parse(r))

        # This stage of the pipeline writes the dictionary records into
        # an existing BigQuery table. The sink is also configured to truncate
        # the table if it contains any existing records.
        dict_records | "Write to BigQuery" >> beam.io.Write(
            beam.io.BigQuerySink(known_args.output,
                                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                                 write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)) 
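Note that beam.io.BigQuerySink is the legacy native sink and has since been deprecated in newer Beam releases in favor of WriteToBigQuery (used in Example #1). A hedged sketch of how the final stage above might be written with that transform, reusing the names from this example:

        # Assumes the same dict_records collection and known_args as above.
        dict_records | "Write to BigQuery" >> beam.io.WriteToBigQuery(
            known_args.output,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)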
Example #23
Source File: data_transformation.py    From professional-services with Apache License 2.0
def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    parser = argparse.ArgumentParser()
    # Here we add some specific command line arguments we expect.   Specifically
    # we have the input file to load and the output table to write to.
    parser.add_argument(
        '--input',
        dest='input',
        required=False,
        help='Input file to read.  This can be a local file or '
        'a file in a Google Storage Bucket.',
        # This example file contains a total of only 10 lines.
        # It is useful for developing on a small set of data
        default='gs://python-dataflow-example/data_files/head_usa_names.csv')
    # This defaults to the temp dataset in your BigQuery project.  You'll have
    # to create the temp dataset yourself using bq mk temp
    parser.add_argument('--output',
                        dest='output',
                        required=False,
                        help='Output BQ table to write results to.',
                        default='lake.usa_names_transformed')

    # Parse arguments from the command line.
    known_args, pipeline_args = parser.parse_known_args(argv)
    # DataTransformation is a class we built in this script to hold the logic for
    # transforming the file into a BigQuery table.
    data_ingestion = DataTransformation()

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should
    # store temp files, and what the project id is.
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    schema = parse_table_schema_from_json(data_ingestion.schema_str)

    (p
     # Read the file.  This is the source of the pipeline.  All further
     # processing starts with lines read from the file.  We use the input
     # argument from the command line.  We also skip the first line which is a
     # header row.
     | 'Read From Text' >> beam.io.ReadFromText(known_args.input,
                                                skip_header_lines=1)
     # This stage of the pipeline translates from a CSV file single row
     # input as a string, to a dictionary object consumable by BigQuery.
     # It refers to a function we have written.  This function will
     # be run in parallel on different workers using input from the
     # previous stage of the pipeline.
     | 'String to BigQuery Row' >>
     beam.Map(lambda s: data_ingestion.parse_method(s)) |
     'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             known_args.output,
             # Here we use the JSON schema read in from a JSON file.
             # Specifying the schema allows the API to create the table correctly if it does not yet exist.
             schema=schema,
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             # Deletes all data in the BigQuery table before writing.
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run().wait_until_finish() 
Example #24
Source File: load_file_generator.py    From professional-services with Apache License 2.0
def _create_parquet_file(self, blob_name, staging_table_util,
                             destination_prefix):
        """Creates a parquet file from a staging table and stores in GCS.

        The parquet file is generated using Dataflow, since BigQuery Extract
        Jobs do not support the parquet file type as a destination format.

        Args:
            blob_name(str): Name of the file (or blob) to be generated. Starts
                with 'fileType=' and ends with the file extension.
                Ex: fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876.csv  # pylint: disable=line-too-long
            staging_table_util(load_benchmark_tools.table_util.TableUtil): Util
                object for interacting with the staging table that the parquet
                file will be generated from.
            destination_prefix(str): String containing the 'gs://' prefix, the
                bucket name, and the path of the file, without the extension.
                This is needed by the WriteToParquet class.
                Ex: gs://annarudy_test_files/fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876 # pylint: disable=line-too-long
        """
        logging.info('Attempting to create file ' '{0:s}'.format(blob_name))
        pipeline_args = [
            '--project', self.project_id, '--staging_location',
            self.dataflow_staging_location, '--temp_location',
            self.dataflow_temp_location, '--save_main_session',
            '--worker_machine_type', 'n1-highcpu-32', '--runner',
            'DataflowRunner', '--setup_file', './setup.py'
        ]
        options = pipeline_options.PipelineOptions(pipeline_args)
        table_spec = beam_bigquery.TableReference(
            projectId=self.project_id,
            datasetId=self.primitive_staging_dataset_id,
            tableId=staging_table_util.table_id)
        bq_schema = staging_table_util.table.schema
        pa_schema = parquet_util.ParquetUtil(
            bq_schema).get_pa_translated_schema()
        p = beam.Pipeline(options=options)
        table = (
            p | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(table_spec)))
        (table | beam.io.WriteToParquet(
            file_path_prefix=destination_prefix,
            schema=pa_schema,
            file_name_suffix='.parquet',
            num_shards=1,
            shard_name_template='',
        ))
        p.run().wait_until_finish()
        logging.info('Created file: {0:s}'.format(blob_name)) 
Example #25
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _merge_headers(known_args, pipeline_args,
                   pipeline_mode, avro_root_path, annotated_vcf_pattern=None):
  # type: (argparse.Namespace, List[str], int, str, str) -> None
  """Merges VCF headers using Beam based on pipeline_mode."""
  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join([google_cloud_options.job_name,
                                            _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(
        p, pipeline_mode,
        known_args.all_patterns)
    _ = (headers
         | 'SampleInfoToAvro'
         >> sample_info_to_avro.SampleInfoToAvro(
             avro_root_path +
             sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX,
             SampleNameEncoding[known_args.sample_name_encoding]))
    if known_args.representative_header_file:
      return
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if annotated_vcf_pattern:
      merged_header = pipeline_common.add_annotation_headers(
          p, known_args, pipeline_mode, merged_header,
          annotated_vcf_pattern)
    if known_args.infer_headers or known_args.infer_annotation_types:
      infer_headers_input_pattern = (
          [annotated_vcf_pattern] if
          annotated_vcf_pattern else known_args.all_patterns)
      merged_header = _add_inferred_headers(infer_headers_input_pattern, p,
                                            known_args, merged_header,
                                            pipeline_mode)

    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path 
Example #26
Source File: vcf_to_bq.py    From gcp-variant-transforms with Apache License 2.0
def _get_input_dimensions(known_args, pipeline_args):
  pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = beam_pipeline_options.view_as(
      pipeline_options.GoogleCloudOptions)

  estimate_sizes_job_name = pipeline_common.generate_unique_name(
      _ESTIMATE_SIZES_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + estimate_sizes_job_name
  else:
    google_cloud_options.job_name = estimate_sizes_job_name
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_estimated_input_size_file_name = '-'.join(
      [google_cloud_options.job_name,
       _ESTIMATE_SIZES_FILE_NAME])
  temp_estimated_input_size_file_path = filesystems.FileSystems.join(
      temp_directory, temp_estimated_input_size_file_name)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    estimates = pipeline_common.get_estimates(
        p, pipeline_mode, known_args.all_patterns)

    files_size = (estimates
                  | 'GetFilesSize' >> extract_input_size.GetFilesSize())
    file_count = (estimates
                  | 'CountAllFiles' >> beam.combiners.Count.Globally())
    sample_map = (estimates
                  | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
    estimated_value_count = (sample_map
                             | extract_input_size.GetEstimatedValueCount())
    estimated_sample_count = (sample_map
                              | extract_input_size.GetEstimatedSampleCount())
    estimated_variant_count = (estimates
                               | 'GetEstimatedVariantCount'
                               >> extract_input_size.GetEstimatedVariantCount())
    _ = (estimated_variant_count
         | beam.ParDo(extract_input_size.print_estimates_to_file,
                      beam.pvalue.AsSingleton(estimated_sample_count),
                      beam.pvalue.AsSingleton(estimated_value_count),
                      beam.pvalue.AsSingleton(files_size),
                      beam.pvalue.AsSingleton(file_count),
                      temp_estimated_input_size_file_path))

  with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
    estimates = f.readlines()
  if len(estimates) != 5:
    raise ValueError('Exactly 5 estimates were expected in {}.'.format(
        temp_estimated_input_size_file_path))

  known_args.estimated_variant_count = int(estimates[0].strip())
  known_args.estimated_sample_count = int(estimates[1].strip())
  known_args.estimated_value_count = int(estimates[2].strip())
  known_args.files_size = int(estimates[3].strip())
  known_args.file_count = int(estimates[4].strip()) 
Example #27
Source File: bq_to_vcf.py    From gcp-variant-transforms with Apache License 2.0
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                         _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  is_direct_runner = pipeline_common.is_pipeline_direct_runner(
      beam.Pipeline(options=options))
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError('project must be set.')
  if not is_direct_runner and not known_args.output_file.startswith('gs://'):
    raise ValueError('Please set the output file {} to GCS when running with '
                     'DataflowRunner.'.format(known_args.output_file))
  if is_direct_runner:
    known_args.number_of_bases_per_shard = sys.maxsize

  temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
  unique_temp_id = pipeline_common.generate_unique_name(
      google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
  vcf_data_temp_folder = filesystems.FileSystems.join(
      temp_folder,
      '{}_data_temp_files'.format(unique_temp_id))
  # Create the directory manually. FileSystems cannot create a file if the
  # directory does not exist when using Direct Runner.
  filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
  vcf_header_file_path = filesystems.FileSystems.join(
      temp_folder,
      '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

  if not known_args.representative_header_file:
    known_args.representative_header_file = filesystems.FileSystems.join(
        temp_folder,
        '{}_meta_info.vcf'.format(unique_temp_id))
    _write_vcf_meta_info(known_args.input_table,
                         known_args.representative_header_file,
                         known_args.allow_incompatible_schema)

  _bigquery_to_vcf_shards(known_args,
                          options,
                          vcf_data_temp_folder,
                          vcf_header_file_path)
  if is_direct_runner:
    vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                               vcf_data_temp_folder,
                                               known_args.output_file)
  else:
    vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                             vcf_header_file_path,
                                             vcf_data_temp_folder,
                                             known_args.output_file)