Python apache_beam.options.pipeline_options.PipelineOptions() Examples
The following are 27 code examples of apache_beam.options.pipeline_options.PipelineOptions(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the apache_beam.options.pipeline_options module.
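Before the examples, here is the common pattern they all share: PipelineOptions is built from unparsed command-line flags (or keyword arguments), then narrowed to specific option groups with view_as(). The snippet below is a minimal sketch of that pattern; the pipeline steps are placeholders for illustration only and do not come from any example on this page.

import apache_beam as beam
from apache_beam.options.pipeline_options import (
    PipelineOptions, SetupOptions, StandardOptions)

def run(argv=None):
    # Flags that PipelineOptions does not recognize are kept as custom options.
    options = PipelineOptions(argv)
    # Pickle the main session so DoFns can use globally imported modules.
    options.view_as(SetupOptions).save_main_session = True
    # Inspect a standard option, e.g. the runner ('DirectRunner' by default).
    runner = options.view_as(StandardOptions).runner

    with beam.Pipeline(options=options) as p:
        _ = (p
             | 'Create' >> beam.Create(['hello', 'world'])
             | 'Print' >> beam.Map(print))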
Example #1
Source File: streaming_beam.py From python-docs-samples with Apache License 2.0 | 6 votes |
def run(args, input_subscription, output_table, window_interval):
    """Build and run the pipeline."""
    options = PipelineOptions(args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:
        # Read the messages from PubSub and process them.
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message)
            | 'Fixed-size windows' >> beam.WindowInto(
                window.FixedWindows(int(window_interval), 0))
            | 'Add URL keys' >> beam.Map(lambda msg: (msg['url'], msg))
            | 'Group by URLs' >> beam.GroupByKey()
            | 'Get statistics' >> beam.Map(get_statistics))

        # Output the results into BigQuery table.
        _ = messages | 'Write to Big Query' >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA)
Example #2
Source File: preprocess.py From professional-services with Apache License 2.0 | 6 votes |
def run(): """Run Apache Beam pipeline to generate TFRecords for Survival Analysis.""" flags = parse_arguments(sys.argv[1:]) pipeline_args = get_pipeline_args(flags) options = pipeline_options.PipelineOptions(flags=[], **pipeline_args) options.view_as(pipeline_options.WorkerOptions).machine_type = ( flags.machine_type) temp_dir = os.path.join(flags.output_dir, 'tmp') runner = 'DataflowRunner' if flags.cloud else 'DirectRunner' with beam.Pipeline(runner, options=options) as p: with tft_beam.Context(temp_dir=temp_dir): build_pipeline(p, flags)
Example #3
Source File: read_from_relational_db.py From beam-nuggets with MIT License | 6 votes |
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config,
            table_name=db_args.table
        )
        months | 'Writing to stdout' >> beam.Map(print)
Example #4
Source File: base_executor.py From tfx with Apache License 2.0 | 6 votes |
def _make_beam_pipeline(self) -> beam.Pipeline:
    """Makes beam pipeline."""
    pipeline_options = PipelineOptions(self._beam_pipeline_args)
    if pipeline_options.view_as(StandardOptions).runner:
        return beam.Pipeline(argv=self._beam_pipeline_args)

    # TODO(b/159468583): move this warning to Beam.
    direct_running_mode = pipeline_options.view_as(
        DirectOptions).direct_running_mode
    direct_num_workers = pipeline_options.view_as(
        DirectOptions).direct_num_workers
    if direct_running_mode == 'in_memory' and direct_num_workers != 1:
        absl.logging.warning(
            'If direct_num_workers is not equal to 1, direct_running_mode should '
            'be `multi_processing` or `multi_threading` instead of `in_memory` '
            'in order for it to have the desired worker parallelism effect.')

    return beam.Pipeline(
        options=pipeline_options, runner=fn_api_runner.FnApiRunner())
Example #5
Source File: PubSubToGCS.py From python-docs-samples with Apache License 2.0 | 6 votes |
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFn's rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages" >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Window into" >> GroupWindowsIntoBatches(window_size)
            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
        )
Example #6
Source File: vep_runner.py From gcp-variant-transforms with Apache License 2.0 | 6 votes |
def _process_pipeline_args(self, pipeline_args):
    # type: (List[str]) -> None
    flags_dict = pipeline_options.PipelineOptions(
        pipeline_args).get_all_options()
    self._project = self._get_flag(flags_dict, 'project')
    self._region = self._get_flag(flags_dict, 'region')
    # TODO(bahsir2): Fix the error messages of _check_flag since
    # --worker_machine_type has dest='machine_type'.
    try:
        self._machine_type = self._get_flag(flags_dict, 'machine_type')
    except ValueError:
        self._machine_type = self._get_machine_type_from_fork()
    self._max_num_workers = self._get_flag(
        flags_dict, 'max_num_workers', 'num_workers')
    if self._max_num_workers <= 0:
        raise ValueError(
            '--max_num_workers and --num_workers should be positive numbers, '
            'got: {}'.format(self._max_num_workers))
Example #7
Source File: write_to_relational_db.py From beam-nuggets with MIT License | 5 votes |
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing
    )

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
Example #8
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 5 votes |
def _validate_annotation_pipeline_args(known_args, pipeline_args):
    match_results = filesystems.FileSystems.match(['{}*'.format(
        vep_runner_util.format_dir_path(known_args.annotation_output_dir))])
    if match_results and match_results[0].metadata_list:
        raise ValueError('Output directory {} already exists.'.format(
            known_args.annotation_output_dir))

    flags_dict = pipeline_options.PipelineOptions(pipeline_args).get_all_options()
    expected_flags = ['max_num_workers', 'num_workers']
    for flag in expected_flags:
        if flag in flags_dict and flags_dict[flag] > 0:
            return
    raise ValueError('Could not find any of {} with a valid value among pipeline '
                     'flags {}'.format(expected_flags, flags_dict))
Example #9
Source File: tfdv.py From spotify-tensorflow with Apache License 2.0 | 5 votes |
def generate_statistics_from_tfrecord(pipeline_args,  # type: List[str]
                                      data_location,  # type: str
                                      output_path,    # type: str
                                      stats_options   # type: StatsOptions
                                      ):
    # type: (...) -> statistics_pb2.DatasetFeatureStatisticsList
    """
    Generate stats file from a tfrecord dataset using TFDV

    :param pipeline_args: un-parsed Dataflow arguments
    :param data_location: input data dir containing tfrecord files
    :param output_path: output path for the stats file
    :param stats_options: tfdv.StatsOptions for statistics generation settings
    :return a DatasetFeatureStatisticsList proto.
    """
    assert_not_empty_string(data_location)
    assert_not_empty_string(output_path)

    args_in_snake_case = clean_up_pipeline_args(pipeline_args)
    pipeline_options = PipelineOptions(flags=args_in_snake_case)

    all_options = pipeline_options.get_all_options()

    if all_options["job_name"] is None:
        gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
        gcloud_options.job_name = "generatestats-%s" % str(int(time.time()))

    if all_options["setup_file"] is None:
        setup_file_path = create_setup_file()
        setup_options = pipeline_options.view_as(SetupOptions)
        setup_options.setup_file = setup_file_path

    input_files = os.path.join(data_location, "*.tfrecords*")
    return tfdv.generate_statistics_from_tfrecord(data_location=input_files,
                                                  output_path=output_path,
                                                  stats_options=stats_options,
                                                  pipeline_options=pipeline_options)
Example #10
Source File: run_inference.py From tfx-bsl with Apache License 2.0 | 5 votes |
def __init__(self, inference_spec_type: model_spec_pb2.InferenceSpecType,
             pipeline_options: PipelineOptions):
    super(_RemotePredictDoFn, self).__init__(inference_spec_type)
    self._api_client = None

    project_id = (
        inference_spec_type.ai_platform_prediction_model_spec.project_id or
        pipeline_options.view_as(GoogleCloudOptions).project)
    if not project_id:
        raise ValueError('Either a non-empty project id or project flag in '
                         ' beam pipeline options needs be provided.')

    model_name = (
        inference_spec_type.ai_platform_prediction_model_spec.model_name)
    if not model_name:
        raise ValueError('A non-empty model name must be provided.')

    version_name = (
        inference_spec_type.ai_platform_prediction_model_spec.version_name)
    name_spec = 'projects/{}/models/{}'
    # If version is not specified, the default version for a model is used.
    if version_name:
        name_spec += '/versions/{}'
    self._full_model_name = name_spec.format(project_id, model_name,
                                             version_name)
Example #11
Source File: run.py From realtime-embeddings-matching with Apache License 2.0 | 5 votes |
def main(argv=None):
    known_args, pipeline_args = get_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True
    pipeline.run(pipeline_options, known_args)
Example #12
Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_table, pipeline_args):
    """Read the records from GCS and write them to BigQuery."""
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
         | 'to_records' >> beam.FlatMap(f2pn.map_file_to_records)
         | 'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record)
         | 'write' >> beam.io.Write(beam.io.BigQuerySink(
             output_table,
             schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    result = p.run().wait_until_finish()
    logging.info('GCS to BigQuery result: %s', result)
Example #13
Source File: bigquery_to_gcs_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_query, output_file, pipeline_args):
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
         | 'to_physionet' >> beam.Map(map_to_physionet_record)
         | 'write' >> beam.io.WriteToText(output_file))
    result = p.run().wait_until_finish()
    logging.info('BigQuery to GCS result: %s', result)
Example #14
Source File: physionet_to_mae_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_dir, mae_task_name, project,
                 pipeline_args):
    """Read the physionet records from GCS and write them out as MAE."""
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
         | 'to_records' >> beam.FlatMap(f2pn.map_phi_to_findings)
         | 'generate_mae' >> beam.Map(mae.generate_mae, mae_task_name, {},
                                      ['patient_id', 'record_number'])
         | 'write_mae' >> beam.Map(write_mae, project, output_dir)
         )
    result = p.run().wait_until_finish()
    logging.info('GCS to BigQuery result: %s', result)
Example #15
Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_pattern, output_table, pipeline_args):
    """Read the records from GCS and write them to BigQuery."""
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
         | 'to_records' >> beam.FlatMap(map_file_to_records)
         | 'map_to_bq_inputs' >> beam.Map(map_to_bq_inputs)
         | 'write' >> beam.io.Write(beam.io.BigQuerySink(
             output_table,
             schema='patient_id:INTEGER, note:STRING',
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    result = p.run().wait_until_finish()
    logging.info('GCS to BigQuery result: %s', result)
Example #16
Source File: bigquery_to_gcs_lib.py From healthcare-deid with Apache License 2.0 | 5 votes |
def run_pipeline(input_query, output_path, pipeline_args):
    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = (p
         | 'read' >> beam.io.Read(beam.io.BigQuerySource(query=input_query))
         | 'to_mist' >> beam.Map(map_to_mist_record)
         | 'write' >> beam.io.WriteToText(output_path))
    result = p.run().wait_until_finish()
    logging.info('BigQuery to GCS result: %s', result)
Example #17
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 5 votes |
def _shard_variants(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> List[str]
    """Reads the variants and writes them to VCF shards.

    Returns:
        The VCF shards directory.
    """
    options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    shard_files_job_name = pipeline_common.generate_unique_name(
        _SHARD_VCF_FILES_JOB_NAME)
    _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
    vcf_shards_output_dir = filesystems.FileSystems.join(
        known_args.annotation_output_dir, _SHARDS_FOLDER)
    with beam.Pipeline(options=options) as p:
        variants = _read_variants(known_args.all_patterns, p, known_args,
                                  pipeline_mode,
                                  pre_infer_headers=False,
                                  keep_raw_sample_names=True)
        sample_ids = (variants
                      | 'CombineSampleIds' >>
                      combine_sample_ids.SampleIdsCombiner()
                      | 'CombineToList' >> beam.combiners.ToList())
        # TODO(tneymanov): Annotation pipeline currently stores sample IDs
        # instead of sample names in the sharded VCF files, which would lead
        # to double hashing of samples. Needs to be fixed ASAP.
        _ = (variants
             | 'DensifyVariants' >> densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                 vcf_shards_output_dir,
                 beam.pvalue.AsSingleton(sample_ids),
                 known_args.number_of_variants_per_shard))

    return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
            _GCS_RECURSIVE_WILDCARD]
Example #18
Source File: create_data.py From conversational-datasets with Apache License 2.0 | 4 votes |
def run(argv=None): """Run the beam pipeline.""" args, pipeline_args = _parse_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) sentence_files_match = FileSystems.match([args.sentence_files])[0] sentence_files = [ file_metadata.path for file_metadata in sentence_files_match.metadata_list] logging.info("Reading %i files from %s.", len(sentence_files), args.sentence_files) assert len(sentence_files) > 0 sentence_files = p | beam.Create(sentence_files) examples = sentence_files | "create examples" >> beam.FlatMap( partial(_create_examples_from_file, min_length=args.min_length, max_length=args.max_length, num_extra_contexts=args.num_extra_contexts) ) examples = _shuffle_examples(examples) examples |= "split train and test" >> beam.ParDo( _TrainTestSplitFn(args.train_split)).with_outputs( _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG) if args.dataset_format == _JSON_FORMAT: write_sink = WriteToText file_name_suffix = ".json" serialize_fn = json.dumps else: assert args.dataset_format == _TF_FORMAT write_sink = WriteToTFRecord file_name_suffix = ".tfrecord" serialize_fn = _features_to_serialized_tf_example for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG), ("test", _TrainTestSplitFn.TEST_TAG)]: serialized_examples = examples[tag] | ( "serialize {} examples".format(name) >> beam.Map(serialize_fn)) ( serialized_examples | ("write " + name) >> write_sink( os.path.join(args.output_dir, name), file_name_suffix=file_name_suffix, num_shards=args.num_shards_train, ) ) result = p.run() result.wait_until_finish()
Example #19
Source File: revise_preprocessed_data.py From cloudml-examples with Apache License 2.0 | 4 votes |
def run(argv=None): """Runs the revise preprocessed data pipeline. Args: argv: Pipeline options as a list of arguments. """ pipeline_options = PipelineOptions(flags=argv) revise_options = pipeline_options.view_as(ReviseOptions) cloud_options = pipeline_options.view_as(GoogleCloudOptions) output_dir = os.path.join(revise_options.output, datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as( WorkerOptions).autoscaling_algorithm = 'THROUGHPUT_BASED' cloud_options.staging_location = os.path.join(output_dir, 'tmp', 'staging') cloud_options.temp_location = os.path.join(output_dir, 'tmp') cloud_options.job_name = 'relabel-examples-%s' % ( datetime.datetime.now().strftime('%y%m%d-%H%M%S')) metadata_query = str( Template(open(revise_options.metadata, 'r').read()).render( METADATA_QUERY_REPLACEMENTS)) logging.info('metadata query : %s', metadata_query) with beam.Pipeline(options=pipeline_options) as p: # Gather our sample metadata into a python dictionary. samples_metadata = ( p | 'ReadSampleMetadata' >> beam.io.Read( beam.io.BigQuerySource(query=metadata_query, use_standard_sql=True)) | 'TableToDictionary' >> beam.CombineGlobally( util.TableToDictCombineFn(key_column=encoder.KEY_COLUMN))) # Read the tf.Example protos into a PCollection. examples = p | 'ReadExamples' >> tfrecordio.ReadFromTFRecord( file_pattern=revise_options.input, compression_type=CompressionTypes.GZIP) # Filter the TensorFlow Example Protocol Buffers. filtered_examples = (examples | 'ReviseExamples' >> beam.FlatMap( lambda example, samples_metadata: filter_and_revise_example(example, samples_metadata), beam.pvalue.AsSingleton(samples_metadata))) # Write the subset of tf.Example protos to Cloud Storage. _ = (filtered_examples | 'SerializeExamples' >> beam.Map(lambda example: example.SerializeToString()) | 'WriteExamples' >> tfrecordio.WriteToTFRecord( file_path_prefix=os.path.join(output_dir, 'examples'), compression_type=CompressionTypes.GZIP, file_name_suffix='.tfrecord.gz'))
Example #20
Source File: create_data.py From conversational-datasets with Apache License 2.0 | 4 votes |
def run(argv=None): """Run the beam pipeline.""" args, pipeline_args = _parse_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) lines = p | "read qa files" >> ReadFromText(args.file_pattern) # The lines are not JSON, but the string representation of python # dictionary objects. Parse them with ast.literal_eval. json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval) qa_tuples = json_objects | "create tuples" >> beam.FlatMap( partial( _create_tuples, min_words=args.min_words, max_words=args.max_words) ) # Remove duplicate examples. qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v)) qa_tuples |= "group duplicates" >> beam.GroupByKey() qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0]) # Create the examples. examples = qa_tuples | "create examples" >> beam.Map( lambda args: _create_example(*args) ) examples = _shuffle_examples(examples) examples |= "split train and test" >> beam.ParDo( _TrainTestSplitFn(args.train_split) ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG) if args.dataset_format == _JSON_FORMAT: write_sink = WriteToText file_name_suffix = ".json" serialize_fn = json.dumps else: assert args.dataset_format == _TF_FORMAT write_sink = WriteToTFRecord file_name_suffix = ".tfrecord" serialize_fn = _features_to_serialized_tf_example for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG), ("test", _TrainTestSplitFn.TEST_TAG)]: serialized_examples = examples[tag] | ( "serialize {} examples".format(name) >> beam.Map(serialize_fn)) ( serialized_examples | ("write " + name) >> write_sink( os.path.join(args.output_dir, name), file_name_suffix=file_name_suffix, num_shards=args.num_shards_train, ) ) result = p.run() result.wait_until_finish()
Example #21
Source File: vcf_to_bq_preprocess.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def run(argv=None):
    # type: (List[str]) -> (str, str)
    """Runs preprocess pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    all_patterns = known_args.all_patterns
    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
        if known_args.report_all_conflicts:
            variants = pipeline_common.read_variants(
                p, all_patterns, pipeline_mode,
                allow_malformed_records=True,
                pre_infer_headers=True)
            malformed_records = variants | filter_variants.ExtractMalformedVariants()
            inferred_headers, merged_headers = (
                _get_inferred_headers(variants, merged_headers))
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers),
                     beam.pvalue.AsSingleton(inferred_headers),
                     beam.pvalue.AsIter(malformed_records)))
        else:
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers)))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
Example #22
Source File: process_delimited.py From professional-services with Apache License 2.0 | 4 votes |
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() # Add the arguments needed for this specific Dataflow job. parser.add_argument( '--input', dest='input', required=True, help='Input file to read. This can be a local file or ' 'a file in a Google Storage Bucket.') parser.add_argument('--output', dest='output', required=True, help='Output BQ table to write results to.') parser.add_argument('--delimiter', dest='delimiter', required=False, help='Delimiter to split input records.', default=',') parser.add_argument('--fields', dest='fields', required=True, help='Comma separated list of field names.') parser.add_argument('--load_dt', dest='load_dt', required=True, help='Load date in YYYY-MM-DD format.') known_args, pipeline_args = parser.parse_known_args(argv) row_transformer = RowTransformer(delimiter=known_args.delimiter, header=known_args.fields, filename=ntpath.basename(known_args.input), load_dt=known_args.load_dt) p_opts = pipeline_options.PipelineOptions(pipeline_args) # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information including where Dataflow should # store temp files, and what the project id is. with beam.Pipeline(options=p_opts) as pipeline: # Read the file. This is the source of the pipeline. All further # processing starts with lines read from the file. We use the input # argument from the command line. rows = pipeline | "Read from text file" >> beam.io.ReadFromText(known_args.input) # This stage of the pipeline translates from a delimited single row # input to a dictionary object consumable by BigQuery. # It refers to a function we have written. This function will # be run in parallel on different workers using input from the # previous stage of the pipeline. dict_records = rows | "Convert to BigQuery row" >> beam.Map( lambda r: row_transformer.parse(r)) # This stage of the pipeline writes the dictionary records into # an existing BigQuery table. The sink is also configured to truncate # the table if it contains any existing records. dict_records | "Write to BigQuery" >> beam.io.Write( beam.io.BigQuerySink(known_args.output, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
Example #23
Source File: data_transformation.py From professional-services with Apache License 2.0 | 4 votes |
def run(argv=None): """The main function which creates the pipeline and runs it.""" parser = argparse.ArgumentParser() # Here we add some specific command line arguments we expect. Specifically # we have the input file to load and the output table to write to. parser.add_argument( '--input', dest='input', required=False, help='Input file to read. This can be a local file or ' 'a file in a Google Storage Bucket.', # This example file contains a total of only 10 lines. # It is useful for developing on a small set of data default='gs://python-dataflow-example/data_files/head_usa_names.csv') # This defaults to the temp dataset in your BigQuery project. You'll have # to create the temp dataset yourself using bq mk temp parser.add_argument('--output', dest='output', required=False, help='Output BQ table to write results to.', default='lake.usa_names_transformed') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # DataTransformation is a class we built in this script to hold the logic for # transforming the file into a BigQuery table. data_ingestion = DataTransformation() # Initiate the pipeline using the pipeline arguments passed in from the # command line. This includes information like where Dataflow should # store temp files, and what the project id is. p = beam.Pipeline(options=PipelineOptions(pipeline_args)) schema = parse_table_schema_from_json(data_ingestion.schema_str) (p # Read the file. This is the source of the pipeline. All further # processing starts with lines read from the file. We use the input # argument from the command line. We also skip the first line which is a # header row. | 'Read From Text' >> beam.io.ReadFromText(known_args.input, skip_header_lines=1) # This stage of the pipeline translates from a CSV file single row # input as a string, to a dictionary object consumable by BigQuery. # It refers to a function we have written. This function will # be run in parallel on different workers using input from the # previous stage of the pipeline. | 'String to BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s)) | 'Write to BigQuery' >> beam.io.Write( beam.io.BigQuerySink( # The table name is a required argument for the BigQuery sink. # In this case we use the value passed in from the command line. known_args.output, # Here we use the JSON schema read in from a JSON file. # Specifying the schema allows the API to create the table correctly if it does not yet exist. schema=schema, # Creates the table in BigQuery if it does not yet exist. create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, # Deletes all data in the BigQuery table before writing. write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))) p.run().wait_until_finish()
Example #24
Source File: load_file_generator.py From professional-services with Apache License 2.0 | 4 votes |
def _create_parquet_file(self, blob_name, staging_table_util,
                         destination_prefix):
    """Creates a parquet file from a staging table and stores in GCS.

    The parquet file is generated using Dataflow, since BigQuery Extract
    Jobs do not support the parquet file type as a destination format.

    Args:
        blob_name(str): Name of the file (or blob) to be generated. Starts
            with 'fileType=' and ends with the file extension.
            Ex: fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876.csv  # pylint: disable=line-too-long
        staging_table_util(load_benchmark_tools.table_util.TableUtil): Util
            object for interacting with the staging table that the parquet
            file will be generated from.
        destination_prefix(str): String containing the 'gs://' prefix, the
            bucket name, and the path of the file, without the extension.
            This is needed by the WriteToParquet class.
            Ex: gs://annarudy_test_files/fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=10000/tableSize=2147MB/file3876  # pylint: disable=line-too-long
    """
    logging.info('Attempting to create file '
                 '{0:s}'.format(blob_name))
    pipeline_args = ['--project', self.project_id,
                     '--staging_location', self.dataflow_staging_location,
                     '--temp_location', self.dataflow_temp_location,
                     '--save_main_session',
                     '--worker_machine_type', 'n1-highcpu-32',
                     '--runner', 'DataflowRunner',
                     '--setup_file', './setup.py']
    options = pipeline_options.PipelineOptions(pipeline_args)
    table_spec = beam_bigquery.TableReference(
        projectId=self.project_id,
        datasetId=self.primitive_staging_dataset_id,
        tableId=staging_table_util.table_id)
    bq_schema = staging_table_util.table.schema
    pa_schema = parquet_util.ParquetUtil(
        bq_schema).get_pa_translated_schema()
    p = beam.Pipeline(options=options)
    table = (p
             | 'ReadTable' >> beam.io.Read(beam.io.BigQuerySource(table_spec)))
    (table
     | beam.io.WriteToParquet(
         file_path_prefix=destination_prefix,
         schema=pa_schema,
         file_name_suffix='.parquet',
         num_shards=1,
         shard_name_template='',
     ))
    p.run().wait_until_finish()
    logging.info('Created file: {0:s}'.format(blob_name))
Example #25
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern=None):
    # type: (argparse.Namespace, List[str], int, str, str) -> None
    """Merges VCF headers using beam based on pipeline_mode."""
    options = pipeline_options.PipelineOptions(pipeline_args)

    # Always run pipeline locally if data is small.
    if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
            not known_args.infer_headers and
            not known_args.infer_annotation_types):
        options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    merge_headers_job_name = pipeline_common.generate_unique_name(
        _MERGE_HEADERS_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + merge_headers_job_name
    else:
        google_cloud_options.job_name = merge_headers_job_name

    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_merged_headers_file_name = '-'.join([google_cloud_options.job_name,
                                              _MERGE_HEADERS_FILE_NAME])
    temp_merged_headers_file_path = filesystems.FileSystems.join(
        temp_directory, temp_merged_headers_file_name)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(
            p, pipeline_mode, known_args.all_patterns)
        _ = (headers
             | 'SampleInfoToAvro' >> sample_info_to_avro.SampleInfoToAvro(
                 avro_root_path +
                 sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX,
                 SampleNameEncoding[known_args.sample_name_encoding]))
        if known_args.representative_header_file:
            return
        merged_header = pipeline_common.get_merged_headers(
            headers,
            known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records)
        if annotated_vcf_pattern:
            merged_header = pipeline_common.add_annotation_headers(
                p, known_args, pipeline_mode, merged_header,
                annotated_vcf_pattern)
        if known_args.infer_headers or known_args.infer_annotation_types:
            infer_headers_input_pattern = (
                [annotated_vcf_pattern] if annotated_vcf_pattern
                else known_args.all_patterns)
            merged_header = _add_inferred_headers(infer_headers_input_pattern,
                                                  p, known_args, merged_header,
                                                  pipeline_mode)

        pipeline_common.write_headers(merged_header,
                                      temp_merged_headers_file_path)
        known_args.representative_header_file = temp_merged_headers_file_path
Example #26
Source File: vcf_to_bq.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def _get_input_dimensions(known_args, pipeline_args):
    pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = beam_pipeline_options.view_as(
        pipeline_options.GoogleCloudOptions)

    estimate_sizes_job_name = pipeline_common.generate_unique_name(
        _ESTIMATE_SIZES_JOB_NAME)
    if google_cloud_options.job_name:
        google_cloud_options.job_name += '-' + estimate_sizes_job_name
    else:
        google_cloud_options.job_name = estimate_sizes_job_name
    temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
    temp_estimated_input_size_file_name = '-'.join(
        [google_cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME])
    temp_estimated_input_size_file_path = filesystems.FileSystems.join(
        temp_directory, temp_estimated_input_size_file_name)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        estimates = pipeline_common.get_estimates(
            p, pipeline_mode, known_args.all_patterns)

        files_size = (estimates
                      | 'GetFilesSize' >> extract_input_size.GetFilesSize())
        file_count = (estimates
                      | 'CountAllFiles' >> beam.combiners.Count.Globally())
        sample_map = (estimates
                      | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
        estimated_value_count = (sample_map
                                 | extract_input_size.GetEstimatedValueCount())
        estimated_sample_count = (sample_map
                                  | extract_input_size.GetEstimatedSampleCount())
        estimated_variant_count = (estimates
                                   | 'GetEstimatedVariantCount' >>
                                   extract_input_size.GetEstimatedVariantCount())
        _ = (estimated_variant_count
             | beam.ParDo(extract_input_size.print_estimates_to_file,
                          beam.pvalue.AsSingleton(estimated_sample_count),
                          beam.pvalue.AsSingleton(estimated_value_count),
                          beam.pvalue.AsSingleton(files_size),
                          beam.pvalue.AsSingleton(file_count),
                          temp_estimated_input_size_file_path))

    with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
        estimates = f.readlines()
    if len(estimates) != 5:
        raise ValueError('Exactly 5 estimates were expected in {}.'.format(
            temp_estimated_input_size_file_path))

    known_args.estimated_variant_count = int(estimates[0].strip())
    known_args.estimated_sample_count = int(estimates[1].strip())
    known_args.estimated_value_count = int(estimates[2].strip())
    known_args.files_size = int(estimates[3].strip())
    known_args.file_count = int(estimates[4].strip())
Example #27
Source File: bq_to_vcf.py From gcp-variant-transforms with Apache License 2.0 | 4 votes |
def run(argv=None):
    # type: (List[str]) -> None
    """Runs BigQuery to VCF pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    is_direct_runner = pipeline_common.is_pipeline_direct_runner(
        beam.Pipeline(options=options))
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    if not google_cloud_options.project:
        raise ValueError('project must be set.')
    if not is_direct_runner and not known_args.output_file.startswith('gs://'):
        raise ValueError('Please set the output file {} to GCS when running with '
                         'DataflowRunner.'.format(known_args.output_file))
    if is_direct_runner:
        known_args.number_of_bases_per_shard = sys.maxsize

    temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
    unique_temp_id = pipeline_common.generate_unique_name(
        google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
    vcf_data_temp_folder = filesystems.FileSystems.join(
        temp_folder,
        '{}_data_temp_files'.format(unique_temp_id))
    # Create the directory manually. FileSystems cannot create a file if the
    # directory does not exist when using Direct Runner.
    filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
    vcf_header_file_path = filesystems.FileSystems.join(
        temp_folder,
        '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

    if not known_args.representative_header_file:
        known_args.representative_header_file = filesystems.FileSystems.join(
            temp_folder, '{}_meta_info.vcf'.format(unique_temp_id))
        _write_vcf_meta_info(known_args.input_table,
                             known_args.representative_header_file,
                             known_args.allow_incompatible_schema)

    _bigquery_to_vcf_shards(known_args, options, vcf_data_temp_folder,
                            vcf_header_file_path)
    if is_direct_runner:
        vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                                   vcf_data_temp_folder,
                                                   known_args.output_file)
    else:
        vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                                 vcf_header_file_path,
                                                 vcf_data_temp_folder,
                                                 known_args.output_file)