Python tensorflow_transform.tf_metadata.dataset_metadata.DatasetMetadata() Examples

The following are 20 code examples of tensorflow_transform.tf_metadata.dataset_metadata.DatasetMetadata(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow_transform.tf_metadata.dataset_metadata, or try the search function.
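Most of the examples below build a DatasetMetadata by converting a tf.train.Example feature spec into a Schema proto. As a quick orientation, here is a minimal sketch assuming a recent tensorflow_transform release where schema_utils.schema_from_feature_spec is available; the feature names are illustrative.

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils

# Describe the raw inputs as a feature spec, convert it to a Schema proto,
# and wrap the proto in a DatasetMetadata (feature names are illustrative).
RAW_FEATURE_SPEC = {
    'text': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64),
}

raw_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(RAW_FEATURE_SPEC))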
Example #1
Source File: preprocess.py    From professional-services with Apache License 2.0
def store_transformed_data(data, schema, path, name=''):
  """Stores data from input pipeline into TFRecord in the specified path.

  Args:
    data: `PCollection`, input pipeline.
    schema: `DatasetMetadata` object, describes schema of the input pipeline.
    path: string, where to write output.
    name: string, name describing the pipeline to be written.

  Returns:
    PCollection
  """

  p = (
      data
      | 'WriteData{}'.format(name) >> tfrecordio.WriteToTFRecord(
          path, coder=example_proto_coder.ExampleProtoCoder(schema.schema)))
  return p 
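A hypothetical call site for the helper above; transformed_data and transformed_metadata stand in for the outputs of tft_beam.AnalyzeAndTransformDataset, and the output path is illustrative.

# Hypothetical usage: write the transformed PCollection to TFRecord files.
# `transformed_data` and `transformed_metadata` are assumed to come from
# tft_beam.AnalyzeAndTransformDataset; the path is illustrative.
_ = store_transformed_data(
    data=transformed_data,
    schema=transformed_metadata,
    path='gs://my-bucket/output/train',
    name='Train')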
Example #2
Source File: tft_unit.py    From transform with Apache License 2.0
def convert_to_tfxio_api_inputs(
      self, legacy_input_data, legacy_input_metadata, label='input_data'):
    """Converts from the legacy TFT API inputs to TFXIO-based inputs.

    Args:
      legacy_input_data: a PCollection of instance dicts.
      legacy_input_metadata: a tft.DatasetMetadata.
      label: label for the PTransform that translates `legacy_input_data` into
        the TFXIO input data. Set to different values if this method is called
        multiple times in a beam Pipeline.
    Returns:
      A tuple of a PCollection of `pyarrow.RecordBatch` and a
      `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
      TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
    """
    tfxio_impl = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
    input_data = (
        legacy_input_data |
        ('LegacyFormatToTfxio[%s]' % label >> tfxio_impl.BeamSource(
            beam_impl.Context.get_desired_batch_size())))
    return input_data, tfxio_impl.TensorAdapterConfig() 
Example #3
Source File: executor.py    From tfx with Apache License 2.0
def _ReadMetadata(self, data_format: Text,
                    schema_path: Text) -> dataset_metadata.DatasetMetadata:
    """Returns a dataset_metadata.DatasetMetadata for the input data.

    Args:
      data_format: name of the input data format.
      schema_path: path to schema file.

    Returns:
      A dataset_metadata.DatasetMetadata representing the provided set of
          columns.
    """

    if self._ShouldDecodeAsRawExample(data_format):
      return dataset_metadata.DatasetMetadata(_RAW_EXAMPLE_SCHEMA)
    schema_proto = self._GetSchema(schema_path)
    # For compatibility with tensorflow_transform 0.13 and 0.14, we create and
    # then update a DatasetMetadata.
    result = dataset_metadata.DatasetMetadata(dataset_schema.Schema({}))
    _GetSchemaProto(result).CopyFrom(schema_proto)
    return result 
Example #4
Source File: executor.py    From tfx with Apache License 2.0
def _ReadExamples(
      pipeline: beam.Pipeline, dataset: _Dataset,
      input_dataset_metadata: dataset_metadata.DatasetMetadata
  ) -> beam.pvalue.PCollection:
    """Reads examples from the given `dataset`.

    Args:
      pipeline: beam pipeline.
      dataset: A `_Dataset` object that represents the data to read.
      input_dataset_metadata: A `dataset_metadata.DatasetMetadata`. Not used.

    Returns:
      A PCollection containing KV pairs of bytes.
    """
    del input_dataset_metadata
    assert dataset.file_format == labels.FORMAT_TFRECORD, dataset.file_format

    return (
        pipeline
        | 'Read' >> beam.io.ReadFromTFRecord(
            dataset.file_pattern,
            coder=beam.coders.BytesCoder(),
            # TODO(b/114938612): Eventually remove this override.
            validate=False)
        | 'AddKey' >> beam.Map(lambda x: (None, x))) 
Example #5
Source File: metadata_io.py    From transform with Apache License 2.0
def read_metadata(path):
  """Load metadata in JSON format from a path into a new DatasetMetadata."""
  schema_file = os.path.join(path, 'schema.pbtxt')
  legacy_schema_file = os.path.join(path, 'v1-json', 'schema.json')
  if file_io.file_exists(schema_file):
    text_proto = file_io.FileIO(schema_file, 'r').read()
    schema_proto = text_format.Parse(text_proto, schema_pb2.Schema(),
                                     allow_unknown_extension=True)
  elif file_io.file_exists(legacy_schema_file):
    schema_json = file_io.FileIO(legacy_schema_file, 'r').read()
    schema_proto = _parse_schema_json(schema_json)
  else:
    raise IOError(
        'Schema file {} does not exist and neither did legacy format file '
        '{}'.format(schema_file, legacy_schema_file))
  return dataset_metadata.DatasetMetadata(schema_proto) 
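A hypothetical usage, assuming a preprocessing pipeline previously wrote its metadata with write_metadata (see Example #16); the directory is illustrative.

# Hypothetical usage: reload metadata written by write_metadata (Example #16).
raw_metadata = read_metadata('/tmp/tft_output/metadata')
print(raw_metadata.schema)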
Example #6
Source File: executor.py    From tfx with Apache License 2.0
def _GetSchemaProto(
    metadata: dataset_metadata.DatasetMetadata) -> schema_pb2.Schema:
  """Gets the schema proto associated with a DatasetMetadata.

  This is needed because tensorflow_transform 0.13 and tensorflow_transform
  0.14 have different APIs for DatasetMetadata.

  Args:
    metadata: A dataset_metadata.DatasetMetadata.

  Returns:
    A schema_pb2.Schema.
  """
  # `schema` is either a Schema proto or dataset_schema.Schema.
  schema = metadata.schema
  # In the case where it's a dataset_schema.Schema, fetch the schema proto.
  return getattr(schema, '_schema_proto', schema) 
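A hypothetical usage sketch: the helper hides whether `metadata.schema` is already a Schema proto (tensorflow_transform 0.14+) or a legacy dataset_schema.Schema (0.13). The metadata variable name is illustrative.

# Hypothetical usage: works for metadata created under either TFT API.
schema_proto = _GetSchemaProto(transform_input_dataset_metadata)
feature_names = [feature.name for feature in schema_proto.feature]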
Example #7
Source File: tft_benchmark_base.py    From tfx with Apache License 2.0
def _get_common_variables(dataset):
  """Returns metadata schema, preprocessing fn, input dataset metadata."""

  tf_metadata_schema = benchmark_utils.read_schema(
      dataset.tf_metadata_schema_path())

  preprocessing_fn = dataset.tft_preprocessing_fn()

  feature_spec = schema_utils.schema_as_feature_spec(
      tf_metadata_schema).feature_spec
  transform_input_columns = (
      tft.get_transform_input_columns(preprocessing_fn, feature_spec))
  transform_input_dataset_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          feature: feature_spec[feature] for feature in transform_input_columns
      }))

  return CommonVariablesTuple(
      tf_metadata_schema=tf_metadata_schema,
      preprocessing_fn=preprocessing_fn,
      transform_input_dataset_metadata=transform_input_dataset_metadata) 
Example #8
Source File: tft_benchmark_base.py    From tfx with Apache License 2.0
def __init__(self,
               dataset,
               tf_metadata_schema,
               preprocessing_fn,
               transform_input_dataset_metadata,
               generate_dataset=False):
    """Constructor.

    Args:
      dataset: BenchmarkDataset object.
      tf_metadata_schema: tf.Metadata schema.
      preprocessing_fn: preprocessing_fn.
      transform_input_dataset_metadata: dataset_metadata.DatasetMetadata.
      generate_dataset: If True, generates the raw dataset and appropriate
        intermediate outputs (just the TFT SavedModel for now) necessary for
        other benchmarks.
    """
    self._dataset = dataset
    self._tf_metadata_schema = tf_metadata_schema
    self._preprocessing_fn = preprocessing_fn
    self._transform_input_dataset_metadata = transform_input_dataset_metadata
    self._generate_dataset = generate_dataset 
Example #9
Source File: input_metadata.py    From cloudml-samples with Apache License 2.0
def _create_raw_metadata():
  """Create a DatasetMetadata for the raw data."""
  column_schemas = {
      key: dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation())
      for key in CATEGORICAL_FEATURE_KEYS
  }
  column_schemas.update({
      key: dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation())
      for key in NUMERIC_FEATURE_KEYS
  })
  column_schemas[LABEL_KEY] = dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())

  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(
      column_schemas))
  return raw_data_metadata 
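The ColumnSchema/FixedColumnRepresentation classes used above belong to the legacy (pre-0.14) tf_metadata API. A sketch of roughly equivalent metadata with the newer feature-spec API, assuming the same key constants, might look like this:

# Sketch of the same raw metadata with the feature-spec API (TFT 0.14+),
# assuming CATEGORICAL_FEATURE_KEYS, NUMERIC_FEATURE_KEYS and LABEL_KEY are
# the same module-level constants used above.
from tensorflow_transform.tf_metadata import schema_utils

raw_feature_spec = {
    key: tf.io.FixedLenFeature([], tf.string)
    for key in CATEGORICAL_FEATURE_KEYS
}
raw_feature_spec.update({
    key: tf.io.FixedLenFeature([], tf.float32)
    for key in NUMERIC_FEATURE_KEYS
})
raw_feature_spec[LABEL_KEY] = tf.io.FixedLenFeature([], tf.string)

raw_data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(raw_feature_spec))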
Example #10
Source File: generate_vocab.py    From text with Apache License 2.0
def main(_):
  # Define schema.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  # Add in padding tokens.
  reserved_tokens = FLAGS.reserved_tokens
  if FLAGS.num_pad_tokens:
    padded_tokens = ['<pad>']
    padded_tokens += ['<pad%d>' % i for i in range(1, FLAGS.num_pad_tokens)]
    reserved_tokens = padded_tokens + reserved_tokens

  params = learner.Params(FLAGS.upper_thresh, FLAGS.lower_thresh,
                          FLAGS.num_iterations, FLAGS.max_input_tokens,
                          FLAGS.max_token_length, FLAGS.max_unique_chars,
                          FLAGS.vocab_size, FLAGS.slack_ratio,
                          FLAGS.include_joiner_token, FLAGS.joiner,
                          reserved_tokens)

  generate_vocab(FLAGS.data_file, FLAGS.vocab_file, FLAGS.metrics_file,
                 raw_metadata, params) 
Example #11
Source File: generate_word_counts.py    From text with Apache License 2.0
def main(_):
  # Generate schema of input data.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  pipeline = word_count(FLAGS.input_path, FLAGS.output_path, raw_metadata)
  pipeline.run().wait_until_finish() 
Example #12
Source File: features.py    From professional-services with Apache License 2.0
def get_raw_dataset_metadata():
    return dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(RAW_FEATURE_SPEC)) 
Example #13
Source File: task.py    From pipelines with Apache License 2.0
def make_tft_input_metadata(schema):
  """Create tf-transform metadata from given schema."""
  tft_schema = {}

  for col_schema in schema:
    col_type = col_schema['type']
    col_name = col_schema['name']
    if col_type == 'NUMBER':
      tft_schema[col_name] = dataset_schema.ColumnSchema(
          tf.float32, [], dataset_schema.FixedColumnRepresentation(default_value=0.0))
    elif col_type in ['CATEGORY', 'TEXT', 'IMAGE_URL', 'KEY']:
      tft_schema[col_name] = dataset_schema.ColumnSchema(
          tf.string, [], dataset_schema.FixedColumnRepresentation(default_value=''))
  return dataset_metadata.DatasetMetadata(dataset_schema.Schema(tft_schema)) 
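A hypothetical input for the helper above; the column list mirrors the schema format the function expects, with illustrative names and types.

# Hypothetical usage: the schema is a list of column descriptors with
# 'name' and 'type' entries (values here are illustrative).
schema = [
    {'name': 'age', 'type': 'NUMBER'},
    {'name': 'occupation', 'type': 'CATEGORY'},
    {'name': 'row_key', 'type': 'KEY'},
]
input_metadata = make_tft_input_metadata(schema)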
Example #14
Source File: pipeline.py    From realtime-embeddings-matching with Apache License 2.0
def get_metadata():
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.tf_metadata import dataset_metadata

  metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
    'id': dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation()),
    'text': dataset_schema.ColumnSchema(
      tf.string, [], dataset_schema.FixedColumnRepresentation())
  }))
  return metadata 
Example #15
Source File: metadata_io_test.py    From transform with Apache License 2.0
def test_write_and_read(self):
    # TODO(b/123241798): use TEST_TMPDIR
    basedir = tempfile.mkdtemp()
    original = dataset_metadata.DatasetMetadata(
        schema=test_common.get_test_schema())

    metadata_io.write_metadata(original, basedir)
    reloaded = metadata_io.read_metadata(basedir)

    self.assertEqual(original, reloaded) 
Example #16
Source File: metadata_io.py    From transform with Apache License 2.0
def write_metadata(metadata, path):
  """Write metadata to given path, in JSON format.

  Args:
    metadata: A `DatasetMetadata` to write.
    path: a path to a directory where metadata should be written.
  """
  if not file_io.file_exists(path):
    file_io.recursive_create_dir(path)
  schema_file = os.path.join(path, 'schema.pbtxt')
  ascii_proto = text_format.MessageToString(metadata.schema)
  file_io.atomic_write_string_to_file(schema_file, ascii_proto, overwrite=True) 
Example #17
Source File: impl.py    From transform with Apache License 2.0
def _remove_columns_from_metadata(metadata, excluded_columns):
  """Remove columns from metadata without mutating original metadata."""
  feature_spec, domains = schema_utils.schema_as_feature_spec(metadata.schema)
  new_feature_spec = {name: spec for name, spec in feature_spec.items()
                      if name not in excluded_columns}
  new_domains = {name: spec for name, spec in domains.items()
                 if name not in excluded_columns}
  return dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(new_feature_spec, new_domains)) 
Example #18
Source File: impl.py    From transform with Apache License 2.0
def _infer_metadata_from_saved_model(saved_model_dir):
  """Infers a DatasetMetadata for outputs of a SavedModel."""
  with tf.compat.v1.Graph().as_default() as graph:
    with tf.compat.v1.Session(graph=graph) as session:
      _, outputs = (
          saved_transform_io.partially_apply_saved_transform_internal(
              saved_model_dir, {}))

      session.run(tf.compat.v1.global_variables_initializer())
      session.run(tf.compat.v1.tables_initializer())
      return dataset_metadata.DatasetMetadata(
          schema=schema_inference.infer_feature_schema(outputs, graph, session)) 
Example #19
Source File: tft_unit.py    From transform with Apache License 2.0
def metadata_from_feature_spec(feature_spec, domains=None):
  """Construct a DatasetMetadata from a feature spec.

  Args:
    feature_spec: A feature spec
    domains: A dict containing domains of features

  Returns:
    A `tft.tf_metadata.dataset_metadata.DatasetMetadata` object.
  """
  return dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec(feature_spec, domains)) 
Example #20
Source File: simple_example.py    From transform with Apache License 2.0
def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)