Python apache_beam.Create() Examples
The following are 30 code examples of apache_beam.Create(), collected from open-source projects.
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the apache_beam module.
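Before the examples, here is a minimal sketch of the pattern they all share: beam.Create turns an in-memory iterable into a PCollection, which the rest of the pipeline then transforms. This sketch is not taken from any of the projects below; the step labels and element values are illustrative.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# beam.Create materializes the given list as a PCollection; the Map steps
# then run over each element, and the final step prints the results.
with beam.Pipeline(options=PipelineOptions()) as p:
  _ = (p
       | 'CreateWords' >> beam.Create(['apache', 'beam', 'create'])
       | 'Uppercase' >> beam.Map(str.upper)
       | 'Print' >> beam.Map(print))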
Example #1
Source File: vcf_header_io_test.py From gcp-variant-transforms with Apache License 2.0
def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    headers_1 = [self.lines[1], self.lines[-1]]
    headers_2 = [self.lines[2], self.lines[3], self.lines[-1]]
    headers_3 = [self.lines[4], self.lines[-1]]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=headers_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=headers_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=headers_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'ReadHeaders' >> ReadAllVcfHeaders())

    expected = [_get_vcf_header_from_lines(h, file_name=file_name)
                for h, file_name in [(headers_1, file_name_1),
                                     (headers_2, file_name_2),
                                     (headers_3, file_name_3)]]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()
Example #2
Source File: utils_test.py From text with Apache License 2.0
def testTwoLangs(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.ParDo(utils.CompileTokenizationInfo())
    assert_that(result, equal_to([{
        'lang': 'en',
        'count': 1,
        'num_preserved_chars': 13,
        'num_dropped_chars': 2,
        'num_non_unk_wordpieces': 4,
        'preserved_ratio': [13/4],
        'dropped_ratio': [2/15],
        'wordpieces': collections.Counter(['the', 'app', '##le', 'sauce'])
    }, {
        'lang': 'fr',
        'count': 1,
        'num_preserved_chars': 14,
        'num_dropped_chars': 0,
        'num_non_unk_wordpieces': 5,
        'preserved_ratio': [14/5],
        'dropped_ratio': [0],
        'wordpieces': collections.Counter(['bon', '##jour', 'bon', '##soir'])
    }]))
Example #3
Source File: stats_api_test.py From data-validation with Apache License 2.0
def test_stats_pipeline_with_sample_rate(self):
  record_batches = [
      pa.RecordBatch.from_arrays(
          [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
  ]

  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        sample_rate=1.0,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001)
    result = (
        p | beam.Create(record_batches)
        | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result))
Example #4
Source File: transform_fn_io_test.py From transform with Apache License 2.0
def testWriteTransformFnIsIdempotent(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  def mock_write_metadata_expand(unused_self, unused_metadata):
    raise ArithmeticError('Some error')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))

    with mock.patch.object(transform_fn_io.beam_metadata_io.WriteMetadata,
                           'expand', mock_write_metadata_expand):
      with self.assertRaisesRegexp(ArithmeticError, 'Some error'):
        _ = ((saved_model_dir_pcoll, object())
             | transform_fn_io.WriteTransformFn(transform_output_dir))

  self.assertFalse(file_io.file_exists(transform_output_dir))
Example #5
Source File: stats_api_test.py From data-validation with Apache License 2.0
def test_stats_pipeline_with_zero_examples(self):
  expected_result = text_format.Parse(
      """
      datasets {
        num_examples: 0
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        num_top_values=1,
        num_rank_histogram_buckets=1,
        num_values_histogram_buckets=2,
        num_histogram_buckets=1,
        num_quantiles_histogram_buckets=1,
        epsilon=0.001)
    result = (p | beam.Create([]) | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
Example #6
Source File: stats_api_test.py From data-validation with Apache License 2.0
def test_write_stats_to_text(self):
  stats = text_format.Parse(
      """
      datasets {
        name: 'x'
        num_examples: 100
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  output_path = os.path.join(self._get_temp_dir(), 'stats')
  with beam.Pipeline() as p:
    _ = (p | beam.Create([stats])
         | stats_api.WriteStatisticsToText(output_path))
  stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
  serialized_stats = io_util.read_file_to_string(
      output_path, binary_mode=True)
  stats_from_file.ParseFromString(serialized_stats)
  self.assertLen(stats_from_file.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_file.datasets[0], stats.datasets[0])
Example #7
Source File: csv_decoder_test.py From data-validation with Apache License 2.0
def test_csv_decoder(self,
                     input_lines,
                     expected_result,
                     column_names,
                     delimiter=',',
                     skip_blank_lines=True,
                     schema=None,
                     multivalent_columns=None,
                     secondary_delimiter=None):
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(input_lines, reshuffle=False)
        | csv_decoder.DecodeCSV(
            column_names=column_names,
            delimiter=delimiter,
            skip_blank_lines=skip_blank_lines,
            schema=schema,
            multivalent_columns=multivalent_columns,
            secondary_delimiter=secondary_delimiter))
    util.assert_that(
        result,
        test_util.make_arrow_record_batches_equal_fn(self, expected_result))
Example #8
Source File: pipeline_common.py From gcp-variant-transforms with Apache License 2.0
def add_annotation_headers(pipeline, known_args, pipeline_mode,
                           merged_header,
                           annotated_vcf_pattern):
  if pipeline_mode == PipelineModes.LARGE:
    annotation_headers = (
        pipeline
        | 'ReadAnnotatedVCF' >> beam.Create([annotated_vcf_pattern])
        | 'ReadHeaders' >> vcf_header_io.ReadAllVcfHeaders())
  else:
    annotation_headers = (
        pipeline
        | 'ReadHeaders' >> vcf_header_io.ReadVcfHeaders(annotated_vcf_pattern))
  merged_header = (
      (merged_header, annotation_headers)
      | beam.Flatten()
      | 'MergeWithOriginalHeaders' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example #9
Source File: pipeline_common.py From gcp-variant-transforms with Apache License 2.0
def read_headers(
    pipeline,  #type: beam.Pipeline
    pipeline_mode,  #type: int
    all_patterns  #type: List[str]
    ):
  # type: (...) -> pvalue.PCollection
  """Creates an initial PCollection by reading the VCF file headers."""
  compression_type = get_compression_type(all_patterns)
  if pipeline_mode == PipelineModes.LARGE:
    headers = (pipeline
               | beam.Create(all_patterns)
               | vcf_header_io.ReadAllVcfHeaders(
                   compression_type=compression_type))
  else:
    headers = pipeline | vcf_header_io.ReadVcfHeaders(
        all_patterns[0],
        compression_type=compression_type)
  return headers
Example #10
Source File: stats_impl_test.py From data-validation with Apache License 2.0
def test_stats_impl(self,
                    record_batches,
                    options,
                    expected_result_proto_text,
                    schema=None):
  expected_result = text_format.Parse(
      expected_result_proto_text,
      statistics_pb2.DatasetFeatureStatisticsList())
  if schema is not None:
    options.schema = schema
  with beam.Pipeline() as p:
    result = (
        p | beam.Create(record_batches, reshuffle=False)
        | stats_impl.GenerateStatisticsImpl(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
Example #11
Source File: vcf_estimate_io_test.py From gcp-variant-transforms with Apache License 2.0
def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    lines_1 = self.headers[1:2] + self.headers[-1:] + self.records[:2]
    lines_2 = self.headers[2:4] + self.headers[-1:] + self.records[2:4]
    lines_3 = self.headers[4:5] + self.headers[-1:] + self.records[4:]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=lines_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=lines_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=lines_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'GetAllEstimates' >> GetAllEstimates())

    expected = [_get_estimate_from_lines(lines, file_name=file_name)
                for lines, file_name in [(lines_1, file_name_1),
                                         (lines_2, file_name_2),
                                         (lines_3, file_name_3)]]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()
Example #12
Source File: impl.py From transform with Apache License 2.0
def _clear_shared_state_after_barrier(pipeline, input_barrier):
  """Clears any shared state from within a pipeline context.

  This will only be cleared once input_barrier becomes available.

  Args:
    pipeline: A `beam.Pipeline` object.
    input_barrier: A `PCollection` which the pipeline should wait for.

  Returns:
    An empty `PCollection`.
  """
  empty_pcoll = input_barrier | 'MakeCheapBarrier' >> beam.FlatMap(
      lambda x: None)
  return (pipeline
          | 'PrepareToClearSharedKeepAlives' >> beam.Create([None])
          | 'WaitAndClearSharedKeepAlives' >> beam.Map(
              lambda x, empty_side_input: shared.Shared().acquire(
                  lambda: None),
              beam.pvalue.AsIter(empty_pcoll)))
Example #13
Source File: impl_test.py From transform with Apache License 2.0
def testHandleBatchError(self):
  if self._UseTFXIO():
    return

  def preprocessing_fn(inputs):
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

  metadata = tft_unit.metadata_from_feature_spec({
      'x': tf.io.FixedLenFeature([], tf.float32),
  })
  pipeline = self._makeTestPipeline()
  input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
      'x': 1
  }, {
      'x': [4, 1]
  }])
  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    _ = ((input_data, metadata)
         | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))
  # The exception type depends on the runner being used.
  with self.assertRaisesRegexp(
      (RuntimeError, ValueError),
      'An error occured while trying to apply the transformation:'):
    pipeline.run()
Example #14
Source File: run_pipeline_lib.py From healthcare-deid with Apache License 2.0
def _create_row(stats, now, extra_columns=tuple()):
  """Create a BigQuery row from the given stats."""
  row = {'true_positives': stats.true_positives,
         'false_positives': stats.false_positives,
         'false_negatives': stats.false_negatives}
  if not math.isnan(stats.precision):
    row['precision'] = stats.precision
  if not math.isnan(stats.recall):
    row['recall'] = stats.recall
  if not math.isnan(stats.f_score):
    row['f_score'] = stats.f_score
  row['timestamp'] = now
  for column_name, val in extra_columns:
    row[column_name] = val
  return row
Example #15
Source File: run_deid_lib.py From healthcare-deid with Apache License 2.0
def read_csv(p, csv_filename):
  """Read csv file to the row format expected by deid()."""
  rows = []
  with open(csv_filename) as f:
    spamreader = unicodecsv.UnicodeReader(f)
    headers = []
    for row in spamreader:
      if not headers:
        headers = row
        continue
      rowmap = {}
      for i in range(len(headers)):
        val = ''
        if i < len(row):
          val = row[i]
        rowmap[headers[i]] = val
      rows.append([rowmap])
  return p | beam.Create(rows)
Example #16
Source File: executor.py From tfx with Apache License 2.0
def _PrestoToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read from Presto and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, a Presto sql string.

  Returns:
    PCollection of TF examples.
  """
  conn_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], conn_config)
  presto_config = presto_config_pb2.PrestoConnConfig()
  conn_config.custom_config.Unpack(presto_config)

  client = _deserialize_conn_config(presto_config)
  return (pipeline
          | 'Query' >> beam.Create([split_pattern])
          | 'QueryTable' >> beam.ParDo(_ReadPrestoDoFn(client))
          | 'ToTFExample' >> beam.Map(_row_to_example))
Example #17
Source File: vcfio_test.py From gcp-variant-transforms with Apache License 2.0
def _assert_pipeline_read_files_record_count_equal(
    self, input_pattern, expected_count, use_read_all=False):
  """Helper method for verifying total records read.

  Args:
    input_pattern (str): Input file pattern to read.
    expected_count (int): Expected number of records that were read.
    use_read_all (bool): Whether to use the scalable ReadAllFromVcf transform
      instead of ReadFromVcf.
  """
  pipeline = TestPipeline()
  if use_read_all:
    pcoll = (pipeline
             | 'Create' >> beam.Create([input_pattern])
             | 'Read' >> ReadAllFromVcf())
  else:
    pcoll = pipeline | 'Read' >> ReadFromVcf(input_pattern)
  assert_that(pcoll, asserts.count_equals_to(expected_count))
  pipeline.run()
Example #18
Source File: gcs_to_bigquery_lib.py From healthcare-deid with Apache License 2.0
def run_pipeline(input_pattern, output_table, pipeline_args):
  """Read the records from GCS and write them to BigQuery."""
  p = beam.Pipeline(options=PipelineOptions(pipeline_args))
  _ = (p
       | 'match_files' >> beam.Create(f2pn.match_files(input_pattern))
       | 'to_records' >> beam.FlatMap(f2pn.map_file_to_records)
       | 'parse_physionet_record' >> beam.Map(f2pn.parse_physionet_record)
       | 'write' >> beam.io.Write(beam.io.BigQuerySink(
           output_table,
           schema='patient_id:INTEGER, record_number:INTEGER, note:STRING',
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
  result = p.run().wait_until_finish()
  logging.info('GCS to BigQuery result: %s', result)
Example #19
Source File: deep_copy_test.py From transform with Apache License 2.0
def testMultipleCopies(self):
  with beam.Pipeline() as p:
    grouped = (p
               | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
               | beam.Map(lambda x: DeepCopyTest._CountingIdentityFn(
                   'PreGroup', x))
               | beam.GroupByKey())
    modified = (
        grouped
        | 'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1'))
        | 'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))

    num_copies = 6

    first_copy = deep_copy.deep_copy(modified)
    self.assertEqual(first_copy.producer.full_label, 'Add2.Copy')
    self.assertEqual(first_copy.producer.inputs[0].producer.full_label,
                     'Add1.Copy')

    for i in range(num_copies - 1):
      copied = deep_copy.deep_copy(modified)
      self.assertEqual(copied.producer.full_label, 'Add2.Copy%d' % i)
      self.assertEqual(copied.producer.inputs[0].producer.full_label,
                       'Add1.Copy%d' % i)

  self.assertEqual(DeepCopyTest._counts['PreGroup'], 3)
  self.assertEqual(DeepCopyTest._counts['Add1'], 3 * (num_copies + 1))
  self.assertEqual(DeepCopyTest._counts['Add2'], 3 * (num_copies + 1))
Example #20
Source File: deep_copy_test.py From transform with Apache License 2.0
def testBasicDeepCopy(self):
  with beam.Pipeline() as p:
    grouped = (p
               | beam.Create([(1, 'a'), (2, 'b'), (3, 'c')])
               | beam.Map(
                   lambda x: DeepCopyTest._CountingIdentityFn(
                       'PreGroup', x))
               | beam.GroupByKey())
    modified = (
        grouped
        | 'Add1' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add1'))
        | 'Add2' >> beam.Map(DeepCopyTest._MakeAdd1CountingIdentityFn('Add2')))
    copied = deep_copy.deep_copy(modified)

    # pylint: disable=expression-not-assigned
    modified | 'Add3' >> beam.Map(
        DeepCopyTest._MakeAdd1CountingIdentityFn('Add3'))
    # pylint: enable=expression-not-assigned

    # Check labels.
    self.assertEqual(copied.producer.full_label, 'Add2.Copy')
    self.assertEqual(copied.producer.inputs[0].producer.full_label,
                     'Add1.Copy')

    # Check that deep copy was performed.
    self.assertIsNot(copied.producer.inputs[0], modified.producer.inputs[0])

    # Check that copy stops at materialization boundary.
    self.assertIs(copied.producer.inputs[0].producer.inputs[0],
                  modified.producer.inputs[0].producer.inputs[0])

  # Check counts of processed items.
  self.assertEqual(DeepCopyTest._counts['PreGroup'], 3)
  self.assertEqual(DeepCopyTest._counts['Add1'], 6)
  self.assertEqual(DeepCopyTest._counts['Add2'], 6)
  self.assertEqual(DeepCopyTest._counts['Add3'], 3)
Example #21
Source File: utils_test.py From text with Apache License 2.0
def testUnsorted(self):
  with TestPipeline() as p:
    tokens = p | 'CreateInput' >> beam.Create(self.sample_input)
    result = tokens | beam.CombineGlobally(utils.SortByCount())
    assert_that(result, equal_to([[('c', 9), ('a', 5), ('d', 4), ('b', 2)]]))
Example #22
Source File: utils_test.py From text with Apache License 2.0
def testLangNotInLangSetIncludeOthers(self):
  with TestPipeline() as p:
    tokens = p | beam.Create(self.sample_input)
    result = tokens | beam.ParDo(utils.FilterTokensByLang({'fr'}, True))
    assert_that(result, equal_to([('I', 'other'), ('like', 'other'),
                                  ('pie', 'other'), ('.', 'other')]))
Example #23
Source File: transform_fn_io.py From transform with Apache License 2.0
def expand(self, pvalue):
  transform_fn_path = os.path.join(self._path,
                                   tft.TFTransformOutput.TRANSFORM_FN_DIR)
  saved_model_dir_pcoll = (
      pvalue.pipeline
      | 'CreateTransformFnPath' >> beam.Create([transform_fn_path]))

  metadata = metadata_io.read_metadata(
      os.path.join(self._path,
                   tft.TFTransformOutput.TRANSFORMED_METADATA_DIR))

  return saved_model_dir_pcoll, metadata
Example #24
Source File: transform_fn_io_test.py From transform with Apache License 2.0
def testWriteTransformFn(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Combine test metadata with a dict of PCollections resolving futures.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Test reading with TFTransformOutput
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  metadata = tf_transform_output.transformed_metadata
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

  transform_fn_dir = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
Example #25
Source File: beam_metadata_io_test.py From transform with Apache License 2.0
def testWriteMetadataDeferred(self):
  # Write metadata to disk using WriteMetadata PTransform, combining
  # incomplete metadata with (deferred) complete metadata.
  with beam.Pipeline() as pipeline:
    path = self.get_temp_dir()
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)
    _ = metadata | beam_metadata_io.WriteMetadata(path, pipeline)

  # Load from disk and check that it is as expected.
  metadata = metadata_io.read_metadata(path)
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
Example #26
Source File: beam_metadata_io.py From transform with Apache License 2.0
def expand(self, metadata):
  if hasattr(metadata, 'deferred_metadata'):
    metadata_pcoll = metadata.deferred_metadata
  else:
    metadata_pcoll = self.pipeline | beam.Create([metadata])

  def write_metadata_output(metadata):
    output_path = self._path
    if self._write_to_unique_subdirectory:
      output_path = common.get_unique_temp_path(self._path)
    metadata_io.write_metadata(metadata, output_path)
    return output_path

  return metadata_pcoll | 'WriteMetadata' >> beam.Map(write_metadata_output)
Example #27
Source File: impl.py From transform with Apache License 2.0
def expand(self, inputs):
  pipeline = (inputs[0] if isinstance(inputs, tuple) else inputs).pipeline
  saved_model_dir_pcoll = pipeline | 'CreateSavedModel' >> beam.Create(
      [self._unbound_saved_model_dir])
  if isinstance(inputs, beam.pvalue.PBegin):
    return saved_model_dir_pcoll

  return saved_model_dir_pcoll | 'ReplaceWithConstants' >> beam.Map(
      _replace_tensors_with_constant_values, self._base_temp_dir,
      *[beam.pvalue.AsSingleton(pcoll) for pcoll in inputs])
Example #28
Source File: common.py From transform with Apache License 2.0
def expand(self, pcoll):
  _ = (
      pcoll.pipeline
      | 'CreateSole' >> beam.Create([None])
      | 'Count' >> beam.Map(self._make_and_increment_counter))
  return pcoll
Example #29
Source File: analyzer_impls.py From transform with Apache License 2.0
def create_accumulator(self):
  """Create an accumulator with all zero entries."""
  return self._combiner.create_accumulator()
Example #30
Source File: analyzer_impls.py From transform with Apache License 2.0
def expand(self, inputs):
  counts, = inputs
  vocabulary_file = os.path.join(self._base_temp_dir, self._vocab_filename)
  vocab_is_written = (
      counts.pipeline
      | 'Prepare' >> beam.Create([None])
      | 'OrderElements' >> beam.ParDo(
          _OrderElementsFn(self._store_frequency, self._fingerprint_shuffle,
                           self._input_dtype),
          counts_iter=beam.pvalue.AsIter(counts))
      # TODO(b/62379925) For now force a single file. Should
      # `InitializeTableFromTextFile` operate on a @N set of files?
      # TODO(b/67863471) Here we are relying on fusion (an implementation
      # detail) for the ordering to be maintained when the results are written
      # to disk. Perform the write within the body of `OrderElements` maybe
      # `OrderElementsAndWrite`. This would mean using TF IO instead of Beam
      # IO so it's perhaps not great.
      | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                             shard_name_template=''))
  # Return the vocabulary path.
  wait_for_vocabulary_transform = (
      counts.pipeline
      | 'CreatePath' >> beam.Create([np.array(vocabulary_file)])
      # Ensure that the analysis returns only after the file is written.
      | 'WaitForVocabularyFile' >> beam.Map(
          lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
  return (wait_for_vocabulary_transform,)