Python apache_beam.Create() Examples
The following are 30 code examples of apache_beam.Create(), collected from open-source projects.
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the apache_beam module.
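Before the examples, here is a minimal sketch of the pattern they all share: beam.Create turns an in-memory iterable into a PCollection, which the rest of the pipeline then transforms. This sketch is not taken from any of the projects below; the step labels and element values are illustrative.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# beam.Create materializes the given list as a PCollection; the Map steps
# then run over each element, and the final step prints the results.
with beam.Pipeline(options=PipelineOptions()) as p:
  _ = (p
       | 'CreateWords' >> beam.Create(['apache', 'beam', 'create'])
       | 'Uppercase' >> beam.Map(str.upper)
       | 'Print' >> beam.Map(print))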
Example #1
Source File: vcf_header_io_test.py From gcp-variant-transforms with Apache License 2.0
def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    headers_1 = [self.lines[1], self.lines[-1]]
    headers_2 = [self.lines[2], self.lines[3], self.lines[-1]]
    headers_3 = [self.lines[4], self.lines[-1]]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=headers_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=headers_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=headers_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'ReadHeaders' >> ReadAllVcfHeaders())

    expected = [_get_vcf_header_from_lines(h, file_name=file_name)
                for h, file_name in [(headers_1, file_name_1),
                                     (headers_2, file_name_2),
                                     (headers_3, file_name_3)]]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()
Example #2
Source File: utils_test.py From text with Apache License 2.0
def testTwoLangs(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
    assert_that(result, equal_to([{
        'lang': 'en',
        'count': 1,
        'num_preserved_chars': 13,
        'num_dropped_chars': 2,
        'num_non_unk_wordpieces': 4,
        'preserved_ratio': [13/4],
        'dropped_ratio': [2/15],
        'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
    }, {
        'lang': 'fr',
        'count': 1,
        'num_preserved_chars': 14,
        'num_dropped_chars': 0,
        'num_non_unk_wordpieces': 5,
        'preserved_ratio': [14/5],
        'dropped_ratio': [0],
        'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
    }]))
Example #3
Source File: stats_api_test.py From data-validation with Apache License 2.0
def test_stats_pipeline_with_sample_rate(self):
  record_batches = [
      pa.RecordBatch.from_arrays(
          [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
  ]

  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        sample_rate=1.0,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001)
    result = (
        p | beam.Create(record_batches)
        | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result))
Example #4
Source File: transform_fn_io_test.py From transform with Apache License 2.0
def testWriteTransformFnIsIdempotent(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  def mock_write_metadata_expand(unused_self, unused_metadata):
    raise ArithmeticError('Some error')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

    with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                           'expand', mock_write_metadata_expand):
      with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
        _ = ((saved_model_dir_pcoll, object())
             | transform_fn_io.WriteTransformFn(transform_output_dir))

  self.assertFalse(file_io.file_exists(transform_output_dir))
Example #5
Source File: stats_api_test.py From data-validation with Apache License 2.0
def test_stats_pipeline_with_zero_examples(self):
  expected_result = text_format.Parse(
      """
      datasets {
        num_examples: 0
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        num_top_values=1,
        num_rank_histogram_buckets=1,
        num_values_histogram_buckets=2,
        num_histogram_buckets=1,
        num_quantiles_histogram_buckets=1,
        epsilon=0.001)
    result = (p | beam.Create([]) | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
Example #6
Source File: stats_api_test.py From data-validation with Apache License 2.0
def test_write_stats_to_text(self):
  stats = text_format.Parse(
      """
      datasets {
        name: 'x'
        num_examples: 100
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  output_path = os.path.join(self._get_temp_dir(), 'stats')
  with beam.Pipeline() as p:
    _ = (p | beam.Create([stats])
         | stats_api.WriteStatisticsToText(output_path))
  stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
  serialized_stats = io_util.read_file_to_string(
      output_path, binary_mode=True)
  stats_from_file.ParseFromString(serialized_stats)
  self.assertLen(stats_from_file.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_file.datasets[0], stats.datasets[0])
Example #7
Source File: csv_decoder_test.py From data-validation with Apache License 2.0
def test_csv_decoder(self,
                     input_lines,
                     expected_result,
                     column_names,
                     delimiter=',',
                     skip_blank_lines=True,
                     schema=None,
                     multivalent_columns=None,
                     secondary_delimiter=None):
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(input_lines, reshuffle=False)
        | csv_decoder.DecodeCSV(
            column_names=column_names,
            delimiter=delimiter,
            skip_blank_lines=skip_blank_lines,
            schema=schema,
            multivalent_columns=multivalent_columns,
            secondary_delimiter=secondary_delimiter))
    util.assert_that(
        result,
        test_util.make_arrow_record_batches_equal_fn(self, expected_result))
Example #8
Source File: pipeline_common.py From gcp-variant-transforms with Apache License 2.0
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (
        pipeline
        | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
        | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders' >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example #9
Source File: pipeline_common.py From gcp-variant-transforms with Apache License 2.0
def read_headers(
    pipeline,  #type: beam.Pipeline
    pipeline_mode,  #type: int
    all_patterns  #type: List[str]
    ):
  # type: (...) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0],
        compression_type=compression_type)
  return headers
Example #10
Source File: stats_impl_test.py From data-validation with Apache License 2.0
def test_stats_impl(self,
                    record_batches,
                    options,
                    expected_result_proto_text,
                    schema=None):
  expected_result = text_format.Parse(
      expected_result_proto_text,
      statistics_pb2.DatasetFeatureStatisticsList())
  if schema is not None:
    options.schema = schema
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(record_batches, reshuffle=False)
        | stats_impl.GenerateStatisticsImpl(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
Example #11
Source File: vcf_estimate_io_test.py From gcp-variant-transforms with Apache License 2.0
def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    lines_1 = self.headers[1:2] + self.headers[-1:] + self.records[:2]
    lines_2 = self.headers[2:4] + self.headers[-1:] + self.records[2:4]
    lines_3 = self.headers[4:5] + self.headers[-1:] + self.records[4:]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=lines_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=lines_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=lines_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'GetAllEstimates' >> GetAllEstimates())

    expected = [_get_estimate_from_lines(lines, file_name=file_name)
                for lines, file_name in [(lines_1, file_name_1),
                                         (lines_2, file_name_2),
                                         (lines_3, file_name_3)]]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()
Example #12
Source File: impl.py From transform with Apache License 2.0
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    An empty `PCollection`.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(
                  lambda: None),
              beam.pvalue.AsIter(empty_pcoll)))
Example #13
Source File: impl_test.py From transform with Apache License 2.0
def testHandleBatchError(self):
  if self._UseTFXIO():
    return

  def preprocessing_fn(inputs):
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

  metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.FixedLenFeature([], tf.float32),
  })
  pipeline = self._makeTestPipeline()
  input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
      'x': 1
  }, {
      'x': [4, 1]
  }])
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    _ = ((input_data, metadata)
         | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
  # The exception type depends on the runner being used.
  with self.assertRaisesRegexp(
      (RuntimeError, ValueError),
      'An error occured while trying to apply the transformation:'):
    pipeline.run()
Example #14
Source File: run_pipeline_lib.py From healthcare-deid with Apache License 2.0
def _create_row(stats, now, extra_columns=tuple()):
  """Create a BigQuery row from the given stats."""
  row = {'true_positives': stats.true_positives,
         'false_positives': stats.false_positives,
         'false_negatives': stats.false_negatives}
  if not math.isnan(stats.precision):
    row['precision'] = stats.precision
  if not math.isnan(stats.recall):
    row['recall'] = stats.recall
  if not math.isnan(stats.f_score):
    row['f_score'] = stats.f_score
  row['timestamp'] = now
  for column_name, val in extra_columns:
    row[column_name] = val
  return row
Example #15
Source File: run_deid_lib.py From healthcare-deid with Apache License 2.0
def read_csv(p, csv_filename):
  """Read csv file to the row format expected by deid()."""
  rows = []
  with open(csv_filename) as f:
    spamreader = unicodecsv.UnicodeReader(f)
    headers = []
    for row in spamreader:
      if not headers:
        headers = row
        continue
      rowmap = {}
      for i in range(len(headers)):
        val = ''
        if i < len(row):
          val = row[i]
        rowmap[headers[i]] = val
      rows.append([rowmap])
  return p | beam.Create(rows)
Example #16
Source File: executor.py From tfx with Apache License 2.0
def _PrestoToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read from Presto and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a Presto sql string.

  Returns:
    PCollection of TF examples.
  """
  conn_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], conn_config)
  presto_config = presto_config_pb2.PrestoConnConfig()
  conn_config.custom_config.Unpack(presto_config)

  client = _deserialize_conn_config(presto_config)
  return (pipeline
          | 'Query' >> beam.Create([split_pattern])
          | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
          | 'ToTFExample' >> beam.Map(_row_to_example))
Example #17
Source File: vcfio_test.py From gcp-variant-transforms with Apache License 2.0
def _assert_pipeline_read_files_record_count_equal(
    self, input_pattern, expected_count, use_read_all=False):
  """Helper method for verifying total records read.

  Args:
    input_pattern (str): Input file pattern to read.
    expected_count (int): Expected number of records that were read.
    use_read_all (bool): Whether to use the scalable ReadAllFromVcf transform
      instead of ReadFromVcf.
  """
  pipeline = TestPipeline()
  if use_read_all:
    pcoll = (pipeline
             | 'Create' >> beam.Create([input_pattern])
             | 'Read' >> ReadAllFromVcf())
  else:
    pcoll = pipeline | 'Read' >> ReadFromVcf(input_pattern)
  assert_that(pcoll, asserts.count_equals_to(expected_count))
  pipeline.run()
Example #18
Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p
       | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
       | 'to_records' >> beam.FlatMap(f2pn.map_file_to_records)
       | 'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record)
       | 'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result)
Example #19
Source File: deep_copy_test.py From transform with Apache License 2.0
def testMultipleCopies(self):
  with beam.Pipeline() as p:
    grouped = (p
               | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
               | beam.Map(lambda x: DeepCopyTest._CountingIdentityFn(
                   'PreGroup', x))
               | beam.GroupByKey())
    modified = (
        grouped
        | 'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1'))
        | 'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))

    num_copies = 6

    first_copy = deep_copy.deep_copy(modified)
    self.assertEqual(first_copy.producer.full_label, 'Add2.Copy')
    self.assertEqual(first_copy.producer.inputs[0].producer.full_label,
                     'Add1.Copy')

    for i in range(num_copies - 1):
      copied = deep_copy.deep_copy(modified)
      self.assertEqual(copied.producer.full_label, 'Add2.Copy%d' % i)
      self.assertEqual(copied.producer.inputs[0].producer.full_label,
                       'Add1.Copy%d' % i)

  self.assertEqual(DeepCopyTest._counts['PreGroup'], 3)
  self.assertEqual(DeepCopyTest._counts['Add1'], 3 * (num_copies + 1))
  self.assertEqual(DeepCopyTest._counts['Add2'], 3 * (num_copies + 1))
Example #20
Source File: deep_copy_test.py From transform with Apache License 2.0
def testBasicDeepCopy(self):
  with beam.Pipeline() as p:
    grouped = (p
               | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
               | beam.Map(
                   lambda x: DeepCopyTest._CountingIdentityFn(
                       'PreGroup', x))
               | beam.GroupByKey())
    modified = (
        grouped
        | 'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1'))
        | 'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))
    copied = deep_copy.deep_copy(modified)

    # pylint: disable=expression-not-assigned
    modified | 'Add3' >> beam.Map(
        DeepCopyTest._MakeAdd1CountingIdentityFn('Add3'))
    # pylint: enable=expression-not-assigned

    # Check labels.
    self.assertEqual(copied.producer.full_label, 'Add2.Copy')
    self.assertEqual(copied.producer.inputs[0].producer.full_label,
                     'Add1.Copy')

    # Check that deep copy was performed.
    self.assertIsNot(copied.producer.inputs[0], modified.producer.inputs[0])

    # Check that copy stops at materialization boundary.
    self.assertIs(copied.producer.inputs[0].producer.inputs[0],
                  modified.producer.inputs[0].producer.inputs[0])

  # Check counts of processed items.
  self.assertEqual(DeepCopyTest._counts['PreGroup'], 3)
  self.assertEqual(DeepCopyTest._counts['Add1'], 6)
  self.assertEqual(DeepCopyTest._counts['Add2'], 6)
  self.assertEqual(DeepCopyTest._counts['Add3'], 3)
Example #21
Source File: utils_test.py From text with Apache License 2.0
def testUnsorted(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.CombineGlobally(utils.SortByCount())
    assert_that(result, equal_to([[('c', 9), ('a', 5), ('d', 4), ('b', 2)]]))
Example #22
Source File: utils_test.py From text with Apache License 2.0
def testLangNotInLangSetIncludeOthers(self):
  with TestPipeline() as p:
    tokens = p | beam.Create(self.sample_input)
    result = tokens | beam.ParDo(utils.FilterTokensByLang({'fr'}, True))
    assert_that(result, equal_to([('I', 'other'), ('like', 'other'),
                                  ('pie', 'other'), ('.', 'other')]))
Example #23
Source File: transform_fn_io.py From transform with Apache License 2.0
def expand(self, pvalue):
  transform_fn_path = os.path.join(self._path,
                                   tft.TFTransformOutput.TRANSFORM_FN_DIR)
  saved_model_dir_pcoll = (
      pvalue.pipeline
      | 'CreateTransformFnPath' >> beam.Create([transform_fn_path]))

  metadata = metadata_io.read_metadata(
      os.path.join(self._path,
                   tft.TFTransformOutput.TRANSFORMED_METADATA_DIR))

  return saved_model_dir_pcoll, metadata
Example #24
Source File: transform_fn_io_test.py From transform with Apache License 2.0
def testWriteTransformFn(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Combine test metadata with a dict of PCollections resolving futures.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Test reading with TFTransformOutput
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  metadata = tf_transform_output.transformed_metadata
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

  transform_fn_dir = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
Example #25
Source File: beam_metadata_io_test.py From transform with Apache License 2.0
def testWriteMetadataDeferred(self):
  # Write metadata to disk using WriteMetadata PTransform, combining
  # incomplete metadata with (deferred) complete metadata.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)
    _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
Example #26
Source File: beam_metadata_io.py From transform with Apache License 2.0
def expand(self, metadata):
  if hasattr(metadata, 'deferred_metadata'):
    metadata_pcoll = metadata.deferred_metadata
  else:
    metadata_pcoll = self.pipeline | beam.Create([metadata])

  def write_metadata_output(metadata):
    output_path = self._path
    if self._write_to_unique_subdirectory:
      output_path = common.get_unique_temp_path(self._path)
    metadata_io.write_metadata(metadata, output_path)
    return output_path

  return metadata_pcoll | 'WriteMetadata' >> beam.Map(write_metadata_output)
Example #27
Source File: impl.py From transform with Apache License 2.0
def expand(self, inputs):
  pipeline = (inputs[0] if isinstance(inputs, tuple) else inputs).pipeline
  saved_model_dir_pcoll = pipeline | 'CreateSavedModel' >> beam.Create(
      [self._unbound_saved_model_dir])
  if isinstance(inputs, beam.pvalue.PBegin):
    return saved_model_dir_pcoll

  return saved_model_dir_pcoll | 'ReplaceWithConstants' >> beam.Map(
      _replace_tensors_with_constant_values, self._base_temp_dir,
      *[beam.pvalue.AsSingleton(pcoll) for pcoll in inputs])
Example #28
Source File: common.py From transform with Apache License 2.0
def expand(self, pcoll):
  _ = (
      pcoll.pipeline
      | 'CreateSole' >> beam.Create([None])
      | 'Count' >> beam.Map(self._make_and_increment_counter))
  return pcoll
Example #29
Source File: analyzer_impls.py From transform with Apache License 2.0
def create_accumulator(self):
  """Create an accumulator with all zero entries."""
  return self._combiner.create_accumulator()
Example #30
Source File: analyzer_impls.py From transform with Apache License 2.0
def expand(self, inputs):
  counts, = inputs
  vocabulary_file = os.path.join(self._base_temp_dir, self._vocab_filename)
  vocab_is_written = (
      counts.pipeline
      | 'Prepare' >> beam.Create([None])
      | 'OrderElements' >> beam.ParDo(
          _OrderElementsFn(self._store_frequency, self._fingerprint_shuffle,
                           self._input_dtype),
          counts_iter=beam.pvalue.AsIter(counts))
      # TODO(b/62379925) For now force a single file. Should
      # `InitializeTableFromTextFile` operate on a @N set of files?
      # TODO(b/67863471) Here we are relying on fusion (an implementation
      # detail) for the ordering to be maintained when the results are written
      # to disk. Perform the write within the body of `OrderElements` maybe
      # `OrderElementsAndWrite`. This would mean using TF IO instead of Beam
      # IO so it's perhaps not great.
      | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                             shard_name_template=''))
  # Return the vocabulary path.
  wait_for_vocabulary_transform = (
      counts.pipeline
      | 'CreatePath' >> beam.Create([np.array(vocabulary_file)])
      # Ensure that the analysis returns only after the file is written.
      | 'WaitForVocabularyFile' >> beam.Map(
          lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
  return (wait_for_vocabulary_transform,)