Python tensorflow_transform.tf_metadata.dataset_metadata.DatasetMetadata() Examples
The following are 20 code examples of tensorflow_transform.tf_metadata.dataset_metadata.DatasetMetadata(), collected from open-source projects. You can go to the original project or source file via the attribution above each example. You may also want to check out the other available functions and classes of the module tensorflow_transform.tf_metadata.dataset_metadata.
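In current tensorflow_transform releases, the usual way to build a DatasetMetadata is from an ordinary tf.Example feature spec via schema_utils; a minimal sketch (the feature names here are illustrative):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

# Describe the raw data with a plain tf.Example feature spec.
RAW_FEATURE_SPEC = {
    'age': tf.io.FixedLenFeature([], tf.float32),
    'occupation': tf.io.FixedLenFeature([], tf.string),
}

# Wrap the derived Schema proto in a DatasetMetadata.
raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_FEATURE_SPEC))

Several of the examples below use the older dataset_schema.ColumnSchema API instead; both styles construct the same kind of object.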
Example #1
Source File: preprocess.py From professional-services with Apache License 2.0
def store_transformed_data(data, schema, path, name=''):
  """Stores data from input pipeline into TFRecord in the specified path.

  Args:
    data: `PCollection`, input pipeline.
    schema: `DatasetMetadata` object, describes schema of the input pipeline.
    path: string, where to write output.
    name: string, name describing pipeline to be written.

  Returns:
    PCollection
  """
  p = (data
       | 'WriteData{}'.format(name) >> tfrecordio.WriteToTFRecord(
           path, coder=example_proto_coder.ExampleProtoCoder(schema.schema)))
  return p
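A hedged usage sketch (the pipeline wiring and feature spec below are illustrative assumptions, not part of preprocess.py): the helper is applied to a PCollection of instance dicts whose schema matches the metadata passed in.

import tempfile

import apache_beam as beam
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

# Metadata describing the instances we are about to serialize.
metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(
        {'x': tf.io.FixedLenFeature([], tf.float32)}))

with beam.Pipeline() as pipeline:
  examples = pipeline | 'Create' >> beam.Create([{'x': 1.0}, {'x': 2.0}])
  # store_transformed_data is the helper defined above.
  store_transformed_data(examples, metadata,
                         path=tempfile.mkdtemp() + '/examples', name='Demo')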
Example #2
Source File: tft_unit.py From transform with Apache License 2.0
def convert_to_tfxio_api_inputs(
    self, legacy_input_data, legacy_input_metadata, label='input_data'):
  """Converts from the legacy TFT API inputs to TFXIO-based inputs.

  Args:
    legacy_input_data: a PCollection of instance dicts.
    legacy_input_metadata: a tft.DatasetMetadata.
    label: label for the PTransform that translates `legacy_input_data` into
      the TFXIO input data. Set to different values if this method is called
      multiple times in a beam Pipeline.

  Returns:
    A tuple of a PCollection of `pyarrow.RecordBatch` and a
    `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
    TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
  """
  tfxio_impl = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
  input_data = (
      legacy_input_data
      | ('LegacyFormatToTfxio[%s]' % label >> tfxio_impl.BeamSource(
          beam_impl.Context.get_desired_batch_size())))
  return input_data, tfxio_impl.TensorAdapterConfig()
Example #3
Source File: executor.py From tfx with Apache License 2.0
def _ReadMetadata(self, data_format: Text,
                  schema_path: Text) -> dataset_metadata.DatasetMetadata:
  """Returns a dataset_metadata.DatasetMetadata for the input data.

  Args:
    data_format: name of the input data format.
    schema_path: path to schema file.

  Returns:
    A dataset_metadata.DatasetMetadata representing the provided set of
    columns.
  """
  if self._ShouldDecodeAsRawExample(data_format):
    return dataset_metadata.DatasetMetadata(_RAW_EXAMPLE_SCHEMA)
  schema_proto = self._GetSchema(schema_path)
  # For compatibility with tensorflow_transform 0.13 and 0.14, we create and
  # then update a DatasetMetadata.
  result = dataset_metadata.DatasetMetadata(dataset_schema.Schema({}))
  _GetSchemaProto(result).CopyFrom(schema_proto)
  return result
Example #4
Source File: executor.py From tfx with Apache License 2.0
def _ReadExamples(
    pipeline: beam.Pipeline, dataset: _Dataset,
    input_dataset_metadata: dataset_metadata.DatasetMetadata
) -> beam.pvalue.PCollection:
  """Reads examples from the given `dataset`.

  Args:
    pipeline: beam pipeline.
    dataset: A `_Dataset` object that represents the data to read.
    input_dataset_metadata: A `dataset_metadata.DatasetMetadata`. Not used.

  Returns:
    A PCollection containing KV pairs of bytes.
  """
  del input_dataset_metadata
  assert dataset.file_format == labels.FORMAT_TFRECORD, dataset.file_format
  return (
      pipeline
      | 'Read' >> beam.io.ReadFromTFRecord(
          dataset.file_pattern,
          coder=beam.coders.BytesCoder(),
          # TODO(b/114938612): Eventually remove this override.
          validate=False)
      | 'AddKey' >> beam.Map(lambda x: (None, x)))
Example #5
Source File: metadata_io.py From transform with Apache License 2.0
def read_metadata(path):
  """Load metadata from a path into a new DatasetMetadata.

  Prefers the schema.pbtxt text proto; falls back to the legacy JSON format.
  """
  schema_file = os.path.join(path, 'schema.pbtxt')
  legacy_schema_file = os.path.join(path, 'v1-json', 'schema.json')
  if file_io.file_exists(schema_file):
    text_proto = file_io.FileIO(schema_file, 'r').read()
    schema_proto = text_format.Parse(text_proto, schema_pb2.Schema(),
                                     allow_unknown_extension=True)
  elif file_io.file_exists(legacy_schema_file):
    schema_json = file_io.FileIO(legacy_schema_file, 'r').read()
    schema_proto = _parse_schema_json(schema_json)
  else:
    raise IOError(
        'Schema file {} does not exist and neither does legacy format file '
        '{}'.format(schema_file, legacy_schema_file))
  return dataset_metadata.DatasetMetadata(schema_proto)
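A brief usage note (the directory path is illustrative); read_metadata pairs with write_metadata from the same module, as exercised by the round-trip test in Example #15 below.

from tensorflow_transform.tf_metadata import metadata_io

# Reload metadata written earlier; prefers schema.pbtxt, with a fallback to
# the legacy v1-json layout.
metadata = metadata_io.read_metadata('/tmp/working_dir/metadata')
print(metadata.schema)  # a schema_pb2.Schema proto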
Example #6
Source File: executor.py From tfx with Apache License 2.0
def _GetSchemaProto(
    metadata: dataset_metadata.DatasetMetadata) -> schema_pb2.Schema:
  """Gets the schema proto associated with a DatasetMetadata.

  This is needed because tensorflow_transform 0.13 and tensorflow_transform
  0.14 have a different API for DatasetMetadata.

  Args:
    metadata: A dataset_metadata.DatasetMetadata.

  Returns:
    A schema_pb2.Schema.
  """
  # `schema` is either a Schema proto or dataset_schema.Schema.
  schema = metadata.schema
  # In the case where it's a dataset_schema.Schema, fetch the schema proto.
  return getattr(schema, '_schema_proto', schema)
Example #7
Source File: tft_benchmark_base.py From tfx with Apache License 2.0
def _get_common_variables(dataset):
  """Returns metadata schema, preprocessing fn, input dataset metadata."""
  tf_metadata_schema = benchmark_utils.read_schema(
      dataset.tf_metadata_schema_path())
  preprocessing_fn = dataset.tft_preprocessing_fn()
  feature_spec = schema_utils.schema_as_feature_spec(
      tf_metadata_schema).feature_spec
  transform_input_columns = (
      tft.get_transform_input_columns(preprocessing_fn, feature_spec))
  transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          feature: feature_spec[feature]
          for feature in transform_input_columns
      }))
  return CommonVariablesTuple(
      tf_metadata_schema=tf_metadata_schema,
      preprocessing_fn=preprocessing_fn,
      transform_input_dataset_metadata=transform_input_dataset_metadata)
Example #8
Source File: tft_benchmark_base.py From tfx with Apache License 2.0
def __init__(self,
             dataset,
             tf_metadata_schema,
             preprocessing_fn,
             transform_input_dataset_metadata,
             generate_dataset=False):
  """Constructor.

  Args:
    dataset: BenchmarkDataset object.
    tf_metadata_schema: tf.Metadata schema.
    preprocessing_fn: preprocessing_fn.
    transform_input_dataset_metadata: dataset_metadata.DatasetMetadata.
    generate_dataset: If True, generates the raw dataset and appropriate
      intermediate outputs (just the TFT SavedModel for now) necessary for
      other benchmarks.
  """
  self._dataset = dataset
  self._tf_metadata_schema = tf_metadata_schema
  self._preprocessing_fn = preprocessing_fn
  self._transform_input_dataset_metadata = transform_input_dataset_metadata
  self._generate_dataset = generate_dataset
Example #9
Source File: input_metadata.py From cloudml-samples with Apache License 2.0
def _create_raw_metadata():
  """Create a DatasetMetadata for the raw data."""
  column_schemas = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_FEATURE_KEYS
  }
  column_schemas.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_FEATURE_KEYS
  })
  column_schemas[LABEL_KEY] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  raw_data_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.Schema(column_schemas))
  return raw_data_metadata
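Note that dataset_schema.ColumnSchema and FixedColumnRepresentation belong to the legacy tf.Transform metadata API, which was removed in later releases. A rough modern equivalent, assuming the same key constants (the placeholder values below are illustrative, not from input_metadata.py):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

# Placeholder key lists; input_metadata.py defines the real ones.
CATEGORICAL_FEATURE_KEYS = ['workclass', 'education']
NUMERIC_FEATURE_KEYS = ['age', 'hours_per_week']
LABEL_KEY = 'income_bracket'

def _create_raw_metadata():
  """Feature-spec based equivalent of the legacy construction above."""
  feature_spec = {
      key: tf.io.FixedLenFeature([], tf.string)
      for key in CATEGORICAL_FEATURE_KEYS
  }
  feature_spec.update({
      key: tf.io.FixedLenFeature([], tf.float32)
      for key in NUMERIC_FEATURE_KEYS
  })
  feature_spec[LABEL_KEY] = tf.io.FixedLenFeature([], tf.string)
  return dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(feature_spec))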
Example #10
Source File: generate_vocab.py From text with Apache License 2.0
def main(_):
  # Define schema.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  # Add in padding tokens.
  reserved_tokens = FLAGS.reserved_tokens
  if FLAGS.num_pad_tokens:
    padded_tokens = ['<pad>']
    padded_tokens += ['<pad%d>' % i for i in range(1, FLAGS.num_pad_tokens)]
    reserved_tokens = padded_tokens + reserved_tokens

  params = learner.Params(FLAGS.upper_thresh, FLAGS.lower_thresh,
                          FLAGS.num_iterations, FLAGS.max_input_tokens,
                          FLAGS.max_token_length, FLAGS.max_unique_chars,
                          FLAGS.vocab_size, FLAGS.slack_ratio,
                          FLAGS.include_joiner_token, FLAGS.joiner,
                          reserved_tokens)

  generate_vocab(FLAGS.data_file, FLAGS.vocab_file, FLAGS.metrics_file,
                 raw_metadata, params)
Example #11
Source File: generate_word_counts.py From text with Apache License 2.0
def main(_):
  # Generate schema of input data.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  pipeline = word_count(FLAGS.input_path, FLAGS.output_path, raw_metadata)
  pipeline.run().wait_until_finish()
Example #12
Source File: features.py From professional-services with Apache License 2.0
def get_raw_dataset_metadata():
  return dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(RAW_FEATURE_SPEC))
Example #13
Source File: task.py From pipelines with Apache License 2.0
def make_tft_input_metadata(schema):
  """Create tf-transform metadata from given schema."""
  tft_schema = {}
  for col_schema in schema:
    col_type = col_schema['type']
    col_name = col_schema['name']
    if col_type == 'NUMBER':
      tft_schema[col_name] = dataset_schema.ColumnSchema(
          tf.float32, [],
          dataset_schema.FixedColumnRepresentation(default_value=0.0))
    elif col_type in ['CATEGORY', 'TEXT', 'IMAGE_URL', 'KEY']:
      tft_schema[col_name] = dataset_schema.ColumnSchema(
          tf.string, [],
          dataset_schema.FixedColumnRepresentation(default_value=''))
  return dataset_metadata.DatasetMetadata(dataset_schema.Schema(tft_schema))
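A hedged usage sketch: as the loop implies, the schema argument is a list of column descriptors with 'name' and 'type' keys (the values below are illustrative):

example_schema = [
    {'name': 'price', 'type': 'NUMBER'},
    {'name': 'category', 'type': 'CATEGORY'},
    {'name': 'description', 'type': 'TEXT'},
]
metadata = make_tft_input_metadata(example_schema)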
Example #14
Source File: pipeline.py From realtime-embeddings-matching with Apache License 2.0
def get_metadata():
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.tf_metadata import dataset_metadata

  metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
      'id': dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation()),
      'text': dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
  }))
  return metadata
Example #15
Source File: metadata_io_test.py From transform with Apache License 2.0
def test_write_and_read(self):
  # TODO(b/123241798): use TEST_TMPDIR
  basedir = tempfile.mkdtemp()
  original = dataset_metadata.DatasetMetadata(
      schema=test_common.get_test_schema())

  metadata_io.write_metadata(original, basedir)
  reloaded = metadata_io.read_metadata(basedir)

  self.assertEqual(original, reloaded)
Example #16
Source File: metadata_io.py From transform with Apache License 2.0
def write_metadata(metadata, path):
  """Write metadata to given path, as a schema.pbtxt text proto.

  Args:
    metadata: A `DatasetMetadata` to write.
    path: a path to a directory where metadata should be written.
  """
  if not file_io.file_exists(path):
    file_io.recursive_create_dir(path)
  schema_file = os.path.join(path, 'schema.pbtxt')
  ascii_proto = text_format.MessageToString(metadata.schema)
  file_io.atomic_write_string_to_file(schema_file, ascii_proto,
                                      overwrite=True)
Example #17
Source File: impl.py From transform with Apache License 2.0
def _remove_columns_from_metadata(metadata, excluded_columns):
  """Remove columns from metadata without mutating original metadata."""
  feature_spec, domains = schema_utils.schema_as_feature_spec(metadata.schema)
  new_feature_spec = {name: spec for name, spec in feature_spec.items()
                      if name not in excluded_columns}
  new_domains = {name: spec for name, spec in domains.items()
                 if name not in excluded_columns}
  return dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(new_feature_spec, new_domains))
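Although _remove_columns_from_metadata is private to impl.py, its effect is easy to sketch (feature names are illustrative):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.float32),
        'label': tf.io.FixedLenFeature([], tf.string),
    }))
# Returns new metadata containing only 'x'; the input object is not mutated.
slim = _remove_columns_from_metadata(metadata, excluded_columns={'label'})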
Example #18
Source File: impl.py From transform with Apache License 2.0
def _infer_metadata_from_saved_model(saved_model_dir):
  """Infers a DatasetMetadata for outputs of a SavedModel."""
  with tf.compat.v1.Graph().as_default() as graph:
    with tf.compat.v1.Session(graph=graph) as session:
      _, outputs = (
          saved_transform_io.partially_apply_saved_transform_internal(
              saved_model_dir, {}))

      session.run(tf.compat.v1.global_variables_initializer())
      session.run(tf.compat.v1.tables_initializer())
      return dataset_metadata.DatasetMetadata(
          schema=schema_inference.infer_feature_schema(outputs, graph,
                                                       session))
Example #19
Source File: tft_unit.py From transform with Apache License 2.0
def metadata_from_feature_spec(feature_spec, domains=None):
  """Construct a DatasetMetadata from a feature spec.

  Args:
    feature_spec: A feature spec
    domains: A dict containing domains of features

  Returns:
    A `tft.tf_metadata.dataset_metadata.DatasetMetadata` object.
  """
  return dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(feature_spec, domains))
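For instance, the optional domains argument accepts tensorflow_metadata domain protos keyed by feature name (a hedged sketch; the feature and range are illustrative):

import tensorflow as tf
from tensorflow_metadata.proto.v0 import schema_pb2

# Attach an integer range domain to the 'age' feature.
metadata = metadata_from_feature_spec(
    {'age': tf.io.FixedLenFeature([], tf.int64)},
    domains={'age': schema_pb2.IntDomain(min=0, max=150)})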
Example #20
Source File: simple_example.py From transform with Apache License 2.0
def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

    transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)
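For the three raw instances above, the printed output should show x_centered values of -1, 0 and 1 (the mean of x is 2), y_normalized values of 0.0, 0.5 and 1.0, and s_integerized ids of 0 for 'hello' and 1 for 'world' (vocabulary indices are assigned by descending frequency).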