Python apache_beam.Create() Examples

The following are 30 code examples of apache_beam.Create(), taken from open-source projects. The source file, project, and license for each example are listed above the snippet. You may also want to check out the other available functions and classes of the apache_beam module.
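Before the project-specific examples, here is a minimal, self-contained sketch of the typical pattern: beam.Create turns an in-memory Python list into a PCollection, which downstream transforms then consume. The labels, element values, and function name below are illustrative only and are not taken from any of the projects referenced on this page.

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


def minimal_create_example():
  # TestPipeline runs the pipeline when the `with` block exits.
  with TestPipeline() as p:
    numbers = p | 'CreateNumbers' >> beam.Create([1, 2, 3, 4])
    squared = numbers | 'Square' >> beam.Map(lambda x: x * x)
    # Assert on the pipeline output instead of writing to a sink.
    assert_that(squared, equal_to([1, 4, 9, 16]))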
Example #1
Source File: vcf_header_io_test.py    From gcp-variant-transforms with Apache License 2.0
def test_pipeline_read_all_file_pattern(self):
    with temp_dir.TempDir() as tempdir:
      headers_1 = [self.lines[1], self.lines[-1]]
      headers_2 = [self.lines[2], self.lines[3], self.lines[-1]]
      headers_3 = [self.lines[4], self.lines[-1]]

      file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=headers_1)
      file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=headers_2)
      file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=headers_3)

      pipeline = TestPipeline()
      pcoll = (pipeline
               | 'Create' >> beam.Create(
                   [os.path.join(tempdir.get_path(), '*.vcf')])
               | 'ReadHeaders' >> ReadAllVcfHeaders())

      expected = [_get_vcf_header_from_lines(h, file_name=file_name)
                  for h, file_name in [(headers_1, file_name_1),
                                       (headers_2, file_name_2),
                                       (headers_3, file_name_3)]]
      assert_that(pcoll, asserts.header_vars_equal(expected))
      pipeline.run() 
Example #2
Source File: utils_test.py    From text with Apache License 2.0
def testTwoLangs(self):
    with TestPipeline() as p:
      tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
      result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
      assert_that(result, equal_to([{
          'lang': 'en',
          'count': 1,
          'num_preserved_chars': 13,
          'num_dropped_chars': 2,
          'num_non_unk_wordpieces': 4,
          'preserved_ratio': [13/4],
          'dropped_ratio': [2/15],
          'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
      }, {
          'lang': 'fr',
          'count': 1,
          'num_preserved_chars': 14,
          'num_dropped_chars': 0,
          'num_non_unk_wordpieces': 5,
          'preserved_ratio': [14/5],
          'dropped_ratio': [0],
          'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
      }])) 
Example #3
Source File: stats_api_test.py    From data-validation with Apache License 2.0
def test_stats_pipeline_with_sample_rate(self):
    record_batches = [
        pa.RecordBatch.from_arrays(
            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
    ]

    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          sample_rate=1.0,
          num_top_values=2,
          num_rank_histogram_buckets=2,
          num_values_histogram_buckets=2,
          num_histogram_buckets=2,
          num_quantiles_histogram_buckets=2,
          epsilon=0.001)
      result = (
          p | beam.Create(record_batches)
          | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, self._sampling_test_expected_result)) 
Example #4
Source File: transform_fn_io_test.py    From transform with Apache License 2.0
def testWriteTransformFnIsIdempotent(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    def mock_write_metadata_expand(unused_self, unused_metadata):
      raise ArithmeticError('Some error')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

      with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                             'expand', mock_write_metadata_expand):
        with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
          _ = ((saved_model_dir_pcoll, object())
               | transform_fn_io.WriteTransformFn(transform_output_dir))

    self.assertFalse(file_io.file_exists(transform_output_dir)) 
Example #5
Source File: stats_api_test.py    From data-validation with Apache License 2.0
def test_stats_pipeline_with_zero_examples(self):
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 0
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          num_top_values=1,
          num_rank_histogram_buckets=1,
          num_values_histogram_buckets=2,
          num_histogram_buckets=1,
          num_quantiles_histogram_buckets=1,
          epsilon=0.001)
      result = (p | beam.Create([]) | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result)) 
Example #6
Source File: stats_api_test.py    From data-validation with Apache License 2.0
def test_write_stats_to_text(self):
    stats = text_format.Parse(
        """
        datasets {
          name: 'x'
          num_examples: 100
        }
        """, statistics_pb2.DatasetFeatureStatisticsList())
    output_path = os.path.join(self._get_temp_dir(), 'stats')
    with beam.Pipeline() as p:
      _ = (p | beam.Create([stats]) | stats_api.WriteStatisticsToText(
          output_path))
    stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
    serialized_stats = io_util.read_file_to_string(
        output_path, binary_mode=True)
    stats_from_file.ParseFromString(serialized_stats)
    self.assertLen(stats_from_file.datasets, 1)
    test_util.assert_dataset_feature_stats_proto_equal(
        self, stats_from_file.datasets[0], stats.datasets[0]) 
Example #7
Source File: csv_decoder_test.py    From data-validation with Apache License 2.0
def test_csv_decoder(self,
                       input_lines,
                       expected_result,
                       column_names,
                       delimiter=',',
                       skip_blank_lines=True,
                       schema=None,
                       multivalent_columns=None,
                       secondary_delimiter=None):
    with beam.Pipeline() as p:
      result = (
          p | beam.Create(input_lines, reshuffle=False)
          | csv_decoder.DecodeCSV(
              column_names=column_names,
              delimiter=delimiter,
              skip_blank_lines=skip_blank_lines,
              schema=schema,
              multivalent_columns=multivalent_columns,
              secondary_delimiter=secondary_delimiter))
      util.assert_that(
          result,
          test_util.make_arrow_record_batches_equal_fn(self, expected_result)) 
Example #8
Source File: pipeline_common.py    From gcp-variant-transforms with Apache License 2.0
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (pipeline
                          | 'ReadAnnotatedVCF'
                          >> beam.Create([annotated_vcf_pattern])
                          | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders'
        >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header 
Example #9
Source File: pipeline_common.py    From gcp-variant-transforms with Apache License 2.0
def read_headers(
    pipeline,  #type: beam.Pipeline
    pipeline_mode,  #type: int
    all_patterns  #type: List[str]
    ):
  # type: (...) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0],
        compression_type=compression_type)

  return headers 
Example #10
Source File: stats_impl_test.py    From data-validation with Apache License 2.0
def test_stats_impl(self,
                      record_batches,
                      options,
                      expected_result_proto_text,
                      schema=None):
    expected_result = text_format.Parse(
        expected_result_proto_text,
        statistics_pb2.DatasetFeatureStatisticsList())
    if schema is not None:
      options.schema = schema
    with beam.Pipeline() as p:
      result = (
          p | beam.Create(record_batches, reshuffle=False)
          | stats_impl.GenerateStatisticsImpl(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, expected_result)) 
Example #11
Source File: vcf_estimate_io_test.py    From gcp-variant-transforms with Apache License 2.0
def test_pipeline_read_all_file_pattern(self):
    with temp_dir.TempDir() as tempdir:
      lines_1 = self.headers[1:2] + self.headers[-1:] + self.records[:2]
      lines_2 = self.headers[2:4] + self.headers[-1:] + self.records[2:4]
      lines_3 = self.headers[4:5] + self.headers[-1:] + self.records[4:]
      file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=lines_1)
      file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=lines_2)
      file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=lines_3)

      pipeline = TestPipeline()
      pcoll = (pipeline
               | 'Create' >> beam.Create(
                   [os.path.join(tempdir.get_path(), '*.vcf')])
               | 'GetAllEstimates' >> GetAllEstimates())

      expected = [_get_estimate_from_lines(lines, file_name=file_name)
                  for lines, file_name in [(lines_1, file_name_1),
                                           (lines_2, file_name_2),
                                           (lines_3, file_name_3)]]
      assert_that(pcoll, asserts.header_vars_equal(expected))
      pipeline.run() 
Example #12
Source File: impl.py    From transform with Apache License 2.0
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    An empty `PCollection`.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(lambda: None),
              beam.pvalue.AsIter(empty_pcoll))) 
Example #13
Source File: impl_test.py    From transform with Apache License 2.0
def testHandleBatchError(self):
    if self._UseTFXIO():
      return

    def preprocessing_fn(inputs):
      return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

    metadata = tft_unit.metadata_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.float32),
    })
    pipeline = self._makeTestPipeline()
    input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
        'x': 1
    }, {
        'x': [4, 1]
    }])
    with beam_impl.Context(temp_dir=self.get_temp_dir()):
      _ = ((input_data, metadata)
           | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
    # Exception type depends on the runner being used.
    with self.assertRaisesRegexp(
        (RuntimeError, ValueError),
        'An error occured while trying to apply the transformation:'):
      pipeline.run() 
Example #14
Source File: run_pipeline_lib.py    From healthcare-deid with Apache License 2.0
def _create_row(stats, now, extra_columns=tuple()):
  """Create a BigQuery row from the given stats."""
  row = {'true_positives': stats.true_positives,
         'false_positives': stats.false_positives,
         'false_negatives': stats.false_negatives}
  if not math.isnan(stats.precision):
    row['precision'] = stats.precision
  if not math.isnan(stats.recall):
    row['recall'] = stats.recall
  if not math.isnan(stats.f_score):
    row['f_score'] = stats.f_score

  row['timestamp'] = now

  for column_name, val in extra_columns:
    row[column_name] = val

  return row 
Example #15
Source File: run_deid_lib.py    From healthcare-deid with Apache License 2.0
def read_csv(p, csv_filename):
  """Read csv file to the row format expected by deid()."""
  rows = []
  with open(csv_filename) as f:
    spamreader = unicodecsv.UnicodeReader(f)
    headers = []
    for row in spamreader:
      if not headers:
        headers = row
        continue
      rowmap = {}
      for i in range(len(headers)):
        val = ''
        if i < len(row):
          val = row[i]
        rowmap[headers[i]] = val
      rows.append([rowmap])
  return p | beam.Create(rows) 
Example #16
Source File: executor.py    From tfx with Apache License 2.0
def _PrestoToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read from Presto and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a Presto sql string.

  Returns:
    PCollection of TF examples.
  """
  conn_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], conn_config)
  presto_config = presto_config_pb2.PrestoConnConfig()
  conn_config.custom_config.Unpack(presto_config)

  client = _deserialize_conn_config(presto_config)
  return (pipeline
          | 'Query' >> beam.Create([split_pattern])
          | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
          | 'ToTFExample' >> beam.Map(_row_to_example)) 
Example #17
Source File: vcfio_test.py    From gcp-variant-transforms with Apache License 2.0
def _assert_pipeline_read_files_record_count_equal(
      self, input_pattern, expected_count, use_read_all=False):
    """Helper method for verifying total records read.

    Args:
      input_pattern (str): Input file pattern to read.
      expected_count (int): Expected number of records to be read.
      use_read_all (bool): Whether to use the scalable ReadAllFromVcf transform
        instead of ReadFromVcf.
    """
    pipeline = TestPipeline()
    if use_read_all:
      pcoll = (pipeline
               | 'Create' >> beam.Create([input_pattern])
               | 'Read' >> ReadAllFromVcf())
    else:
      pcoll = pipeline | 'Read' >> ReadFromVcf(input_pattern)
    assert_that(pcoll, asserts.count_equals_to(expected_count))
    pipeline.run() 
Example #18
Source File: gcs_to_bigquery_lib.py    From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p |
       'match_files' >> beam.Create(f2pn.match_files(input_pattern)) |
       'to_records' >> beam.FlatMap(f2pn.map_file_to_records) |
       'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record) |
       'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result) 
Example #19
Source File: deep_copy_test.py    From transform with Apache License 2.0
def testMultipleCopies(self):
    with beam.Pipeline() as p:
      grouped = (p
                 | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
                 | beam.Map(lambda x: DeepCopyTest._CountingIdentityFn(
                     'PreGroup', x))
                 | beam.GroupByKey())
      modified = (
          grouped
          |
          'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1'))
          |
          'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))

      num_copies = 6

      first_copy = deep_copy.deep_copy(modified)
      self.assertEqual(first_copy.producer.full_label, 'Add2.Copy')
      self.assertEqual(first_copy.producer.inputs[0].producer.full_label,
                       'Add1.Copy')

      for i in range(num_copies - 1):
        copied = deep_copy.deep_copy(modified)
        self.assertEqual(copied.producer.full_label, 'Add2.Copy%d' % i)
        self.assertEqual(copied.producer.inputs[0].producer.full_label,
                         'Add1.Copy%d' % i)

    self.assertEqual(DeepCopyTest._counts['PreGroup'], 3)
    self.assertEqual(DeepCopyTest._counts['Add1'], 3 * (num_copies + 1))
    self.assertEqual(DeepCopyTest._counts['Add2'], 3 * (num_copies + 1)) 
Example #20
Source File: deep_copy_test.py    From transform with Apache License 2.0
def testBasicDeepCopy(self):
    with beam.Pipeline() as p:
      grouped = (p
                 | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
                 | beam.Map(
                     lambda x: DeepCopyTest._CountingIdentityFn(
                         'PreGroup', x))
                 | beam.GroupByKey())
      modified = (
          grouped
          |
          'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1'))
          |
          'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))
      copied = deep_copy.deep_copy(modified)

      # pylint: disable=expression-not-assigned
      modified | 'Add3' >> beam.Map(
          DeepCopyTest._MakeAdd1CountingIdentityFn('Add3'))
      # pylint: enable=expression-not-assigned

      # Check labels.
      self.assertEqual(copied.producer.full_label, 'Add2.Copy')
      self.assertEqual(copied.producer.inputs[0].producer.full_label,
                       'Add1.Copy')

      # Check that deep copy was performed.
      self.assertIsNot(copied.producer.inputs[0], modified.producer.inputs[0])

      # Check that copy stops at materialization boundary.
      self.assertIs(copied.producer.inputs[0].producer.inputs[0],
                    modified.producer.inputs[0].producer.inputs[0])

    # Check counts of processed items.
    self.assertEqual(DeepCopyTest._counts['PreGroup'], 3)
    self.assertEqual(DeepCopyTest._counts['Add1'], 6)
    self.assertEqual(DeepCopyTest._counts['Add2'], 6)
    self.assertEqual(DeepCopyTest._counts['Add3'], 3) 
Example #21
Source File: utils_test.py    From text with Apache License 2.0
def testUnsorted(self):
    with TestPipeline() as p:
      tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
      result = tokens | beam.CombineGlobally(utils.SortByCount())
      assert_that(result, equal_to([[('c', 9), ('a', 5), ('d', 4), ('b', 2)]])) 
Example #22
Source File: utils_test.py    From text with Apache License 2.0
def testLangNotInLangSetIncludeOthers(self):
    with TestPipeline() as p:
      tokens = p | beam.Create(self.sample_input)
      result = tokens | beam.ParDo(utils.FilterTokensByLang({'fr'}, True))
      assert_that(result, equal_to([('I', 'other'),
                                    ('like', 'other'),
                                    ('pie', 'other'),
                                    ('.', 'other')])) 
Example #23
Source File: transform_fn_io.py    From transform with Apache License 2.0
def expand(self, pvalue):
    transform_fn_path = os.path.join(self._path,
                                     tft.TFTransformOutput.TRANSFORM_FN_DIR)
    saved_model_dir_pcoll = (
        pvalue.pipeline
        | 'CreateTransformFnPath' >> beam.Create([transform_fn_path]))

    metadata = metadata_io.read_metadata(
        os.path.join(self._path,
                     tft.TFTransformOutput.TRANSFORMED_METADATA_DIR))

    return saved_model_dir_pcoll, metadata 
Example #24
Source File: transform_fn_io_test.py    From transform with Apache License 2.0
def testWriteTransformFn(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      file_io.recursive_create_dir(saved_model_dir)
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
      # Combine test metadata with a dict of PCollections resolving futures.
      deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
          [test_metadata.COMPLETE_METADATA])
      metadata = beam_metadata_io.BeamDatasetMetadata(
          test_metadata.INCOMPLETE_METADATA, deferred_metadata)

      _ = ((saved_model_dir_pcoll, metadata)
           | transform_fn_io.WriteTransformFn(transform_output_dir))

    # Test reading with TFTransformOutput
    tf_transform_output = tft.TFTransformOutput(transform_output_dir)
    metadata = tf_transform_output.transformed_metadata
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

    transform_fn_dir = tf_transform_output.transform_savedmodel_dir
    self.assertTrue(file_io.file_exists(transform_fn_dir))
    self.assertTrue(file_io.is_directory(transform_fn_dir)) 
Example #25
Source File: beam_metadata_io_test.py    From transform with Apache License 2.0
def testWriteMetadataDeferred(self):
    # Write metadata to disk using WriteMetadata PTransform, combining
    # incomplete metadata with (deferred) complete metadata.
    with beam.Pipeline() as pipeline:
      path = self.get_temp_dir()
      deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
          [test_metadata.COMPLETE_METADATA])
      metadata = beam_metadata_io.BeamDatasetMetadata(
          test_metadata.INCOMPLETE_METADATA, deferred_metadata)
      _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

    # Load from disk and check that it is as expected.
    metadata = metadata_io.read_metadata(path)
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA) 
Example #26
Source File: beam_metadata_io.py    From transform with Apache License 2.0
def expand(self, metadata):
    if hasattr(metadata, 'deferred_metadata'):
      metadata_pcoll = metadata.deferred_metadata
    else:
      metadata_pcoll = self.pipeline | beam.Create([metadata])

    def write_metadata_output(metadata):
      output_path = self._path
      if self._write_to_unique_subdirectory:
        output_path = common.get_unique_temp_path(self._path)
      metadata_io.write_metadata(metadata, output_path)
      return output_path

    return metadata_pcoll | 'WriteMetadata' >> beam.Map(write_metadata_output) 
Example #27
Source File: impl.py    From transform with Apache License 2.0
def expand(self, inputs):
    pipeline = (inputs[0] if isinstance(inputs, tuple) else inputs).pipeline
    saved_model_dir_pcoll = pipeline | 'CreateSavedModel' >> beam.Create(
        [self._unbound_saved_model_dir])

    if isinstance(inputs, beam.pvalue.PBegin):
      return saved_model_dir_pcoll

    return saved_model_dir_pcoll | 'ReplaceWithConstants' >> beam.Map(
        _replace_tensors_with_constant_values, self._base_temp_dir,
        *[beam.pvalue.AsSingleton(pcoll) for pcoll in inputs]) 
Example #28
Source File: common.py    From transform with Apache License 2.0
def expand(self, pcoll):
    _ = (
        pcoll.pipeline
        | 'CreateSole' >> beam.Create([None])
        | 'Count' >> beam.Map(self._make_and_increment_counter))
    return pcoll 
Example #29
Source File: analyzer_impls.py    From transform with Apache License 2.0
def create_accumulator(self):
    """Create an accumulator with all zero entries."""
    return self._combiner.create_accumulator() 
Example #30
Source File: analyzer_impls.py    From transform with Apache License 2.0
def expand(self, inputs):
    counts, = inputs
    vocabulary_file = os.path.join(self._base_temp_dir, self._vocab_filename)
    vocab_is_written = (
        counts.pipeline
        | 'Prepare' >> beam.Create([None])
        | 'OrderElements' >> beam.ParDo(
            _OrderElementsFn(self._store_frequency, self._fingerprint_shuffle,
                             self._input_dtype),
            counts_iter=beam.pvalue.AsIter(counts))
        # TODO(b/62379925) For now force a single file. Should
        # `InitializeTableFromTextFile` operate on a @N set of files?
        # TODO(b/67863471) Here we are relying on fusion (an implementation
        # detail) for the ordering to be maintained when the results are written
        # to disk. Perform the write within the body of `OrderElements` maybe
        # `OrderElementsAndWrite`. This would mean using TF IO instead of Beam
        # IO so it's perhaps not great.
        | 'WriteToFile' >> beam.io.WriteToText(
            vocabulary_file, shard_name_template=''))
    # Return the vocabulary path.
    wait_for_vocabulary_transform = (
        counts.pipeline
        | 'CreatePath' >> beam.Create([np.array(vocabulary_file)])
        # Ensure that the analysis returns only after the file is written.
        | 'WaitForVocabularyFile' >> beam.Map(
            lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
    return (wait_for_vocabulary_transform,)