Python tensorflow_transform.tf_metadata.dataset_schema.from_feature_spec() Examples

The following are 18 code examples of tensorflow_transform.tf_metadata.dataset_schema.from_feature_spec(), drawn from open-source projects. Each example is taken from the project and source file named above it. You may also want to check out all available functions/classes of the module tensorflow_transform.tf_metadata.dataset_schema.
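As a quick orientation before the project examples, here is a minimal, self-contained sketch of the call itself. from_feature_spec() takes a dict mapping feature names to tf.FixedLenFeature/tf.VarLenFeature parsing specs and returns a `Schema` object; the feature names and spec below are hypothetical.

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema

# Hypothetical feature spec: a scalar int64 label and a variable-length
# string feature.
feature_spec = {
    'label': tf.FixedLenFeature(shape=[], dtype=tf.int64),
    'tokens': tf.VarLenFeature(dtype=tf.string),
}

# Convert the parsing spec into a tf.Transform Schema object.
schema = dataset_schema.from_feature_spec(feature_spec)

The resulting `Schema` is what the ExampleProtoCoder, CsvCoder, and DatasetMetadata wrappers in the examples below consume.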
Example #1
Source File: criteo.py    From cloudml-samples with Apache License 2.0
def make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Input schema definition.

  Args:
    mode: tf.contrib.learn.ModeKeys specifying if the schema is being used for
      train/eval or prediction.
  Returns:
    A `Schema` object.
  """
  result = ({} if mode == tf.contrib.learn.ModeKeys.INFER
            else {'clicked': tf.FixedLenFeature(shape=[], dtype=tf.int64)})
  for name in INTEGER_COLUMN_NAMES:
    result[name] = tf.FixedLenFeature(
        shape=[], dtype=tf.int64, default_value=-1)
  for name in CATEGORICAL_COLUMN_NAMES:
    result[name] = tf.FixedLenFeature(shape=[], dtype=tf.string,
                                      default_value='')

  return dataset_schema.from_feature_spec(result) 
Example #2
Source File: preprocess.py    From professional-services with Apache License 2.0
def WriteOutput(p, prefix, output_dir, feature_spec, plain_text=False):
  """Writes the given pCollection as a TF-Record.

  Args:
    p: a pCollection.
    prefix: prefix for location tf-record will be written to.
    output_dir: the directory or bucket to write the json data.
    feature_spec: the feature spec of the tf-record to be written.
    plain_text: if true, write the output as plain text instead.
  """
  path = os.path.join(output_dir, prefix)
  shuffled = p | "ShuffleData" >> Shuffle()  # pylint: disable=no-value-for-parameter

  if plain_text:
    shuffled | "WriteToText" >> beam.io.WriteToText(
        path, file_name_suffix=".txt")
    return

  schema = dataset_schema.from_feature_spec(feature_spec)
  coder = coders.ExampleProtoCoder(schema)
  shuffled | "WriteTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
      path,
      coder=coder,
      file_name_suffix=".tfrecord") 
Example #3
Source File: preprocess.py    From professional-services with Apache License 2.0
def _run_tft_fn(raw_data, tft_fn, transform_fn_path, user_freq, item_freq):
  """Applys the TensorFlow Transform function to the given data.

  Args:
    raw_data: a dict of shape {$user_key: $user_id, $item_key: ...}.
    tft_fn: a TensorFlow Transform function.
    transform_fn_path: the location to save transformation outputs to.
    user_freq: minimum frequency of a user to include it in the user vocab.
    item_freq: minimum frequency of an item to include it in the item vocab.

  Returns:
    A pCollection of dicts, where each dict is an element of raw_data with the
      preprocess_fn applied to it:
      {$user_key: $user_id, $item_key: $item_id, $count_key: $count}.
  """
  raw_data_metadata = tft.tf_metadata.dataset_metadata.DatasetMetadata(
      tft.tf_metadata.dataset_schema.from_feature_spec(constants.TRAIN_SPEC))
  transformed_dataset, transform_fn = (
      (raw_data, raw_data_metadata)
      | beam_impl.AnalyzeAndTransformDataset(
          lambda x: tft_fn(x, user_freq, item_freq)))
  (transform_fn | "WriteTransformFn" >>
   tft.beam.tft_beam_io.transform_fn_io.WriteTransformFn(
       os.path.join(transform_fn_path, "transform_fn")))
  return transformed_dataset[0] 
Example #4
Source File: preprocess.py    From professional-services with Apache License 2.0
def make_input_schema():
  """Builds the schema of the data read from BigQuery.

  Appends key column to schema for inference.

  Returns:
    A `Schema` object built from a feature spec mapping column names to
      `tf.FixedLenFeature` instances.
  """

  feature_spec = {}
  for c in constants.FEATURE_COLUMNS:
    feature_spec[c] = tf.FixedLenFeature(shape=[], dtype=tf.float32)
  feature_spec[constants.LABEL_COLUMN] = tf.FixedLenFeature(
      shape=[], dtype=tf.int64)
  feature_spec[constants.KEY_COLUMN] = tf.FixedLenFeature(
      shape=[], dtype=tf.int64)

  return dataset_schema.from_feature_spec(feature_spec) 
Example #5
Source File: generate_vocab.py    From text with Apache License 2.0
def main(_):
  # Define schema.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  # Add in padding tokens.
  reserved_tokens = FLAGS.reserved_tokens
  if FLAGS.num_pad_tokens:
    padded_tokens = ['<pad>']
    padded_tokens += ['<pad%d>' % i for i in range(1, FLAGS.num_pad_tokens)]
    reserved_tokens = padded_tokens + reserved_tokens

  params = learner.Params(FLAGS.upper_thresh, FLAGS.lower_thresh,
                          FLAGS.num_iterations, FLAGS.max_input_tokens,
                          FLAGS.max_token_length, FLAGS.max_unique_chars,
                          FLAGS.vocab_size, FLAGS.slack_ratio,
                          FLAGS.include_joiner_token, FLAGS.joiner,
                          reserved_tokens)

  generate_vocab(FLAGS.data_file, FLAGS.vocab_file, FLAGS.metrics_file,
                 raw_metadata, params) 
Example #6
Source File: reddit.py    From cloudml-samples with Apache License 2.0
def make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Input schema definition.

  Args:
    mode: tf.contrib.learn.ModeKeys specifying if the schema is being used for
      train/eval or prediction.
  Returns:
    A `Schema` object.
  """
  result = ({} if mode == tf.contrib.learn.ModeKeys.INFER else {
      'score': tf.FixedLenFeature(shape=[], dtype=tf.float32)
  })
  result.update({
      'subreddit': tf.FixedLenFeature(shape=[], dtype=tf.string),
      'author': tf.FixedLenFeature(shape=[], dtype=tf.string),
      'comment_body': tf.FixedLenFeature(shape=[], dtype=tf.string,
                                         default_value=''),
      'comment_parent_body': tf.FixedLenFeature(shape=[], dtype=tf.string,
                                                default_value=''),
      'toplevel': tf.FixedLenFeature(shape=[], dtype=tf.int64),
  })
  return dataset_schema.from_feature_spec(result) 
Example #7
Source File: movielens.py    From cloudml-samples with Apache License 2.0
def _make_schema(columns, types, default_values):
  """Input schema definition.

  Args:
    columns: column names for fields appearing in input.
    types: column types for fields appearing in input.
    default_values: default values for fields appearing in input.
  Returns:
    A `Schema` object built from the feature spec derived from columns,
      types, and default_values.
  """
  result = {}
  assert len(columns) == len(types)
  assert len(columns) == len(default_values)
  for c, t, v in zip(columns, types, default_values):
    if isinstance(t, list):
      result[c] = tf.VarLenFeature(dtype=t[0])
    else:
      result[c] = tf.FixedLenFeature(shape=[], dtype=t, default_value=v)
  return dataset_schema.from_feature_spec(result) 
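To make the list-type convention concrete, here is a hypothetical call: wrapping a dtype in a list marks that column as variable-length, so it becomes a tf.VarLenFeature and its default value is ignored.

# Hypothetical columns: a scalar id and a variable-length string column.
schema = _make_schema(
    columns=['user_id', 'genres'],
    types=[tf.int64, [tf.string]],
    default_values=[-1, None])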
Example #8
Source File: generate_word_counts.py    From text with Apache License 2.0
def main(_):
  # Generate schema of input data.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  pipeline = word_count(FLAGS.input_path, FLAGS.output_path, raw_metadata)
  pipeline.run().wait_until_finish() 
Example #9
Source File: chicago_taxi_client.py    From tfx with Apache License 2.0
def _make_proto_coder(schema):
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.ExampleProtoCoder(raw_schema) 
Example #10
Source File: chicago_taxi_client.py    From tfx with Apache License 2.0
def _make_csv_coder(schema, column_names):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = _get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(column_names, parsing_schema) 
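For reference, a CsvCoder built this way decodes individual CSV lines into instance dicts keyed by column name. A minimal sketch with hypothetical columns:

import tensorflow as tf
from tensorflow_transform import coders as tft_coders
from tensorflow_transform.tf_metadata import dataset_schema

# Hypothetical two-column CSV layout.
parsing_schema = dataset_schema.from_feature_spec({
    'age': tf.FixedLenFeature(shape=[], dtype=tf.int64),
    'name': tf.FixedLenFeature(shape=[], dtype=tf.string),
})
csv_coder = tft_coders.CsvCoder(['age', 'name'], parsing_schema)

# decode() parses one CSV line into a dict keyed by column name.
instance = csv_coder.decode('42,alice')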
Example #11
Source File: preprocess.py    From professional-services with Apache License 2.0
def run(p, params):
  """Defines Beam preprocessing pipeline.

  Performs the following:
    - Reads text files from pattern.
    - Splits text files into train and validation sets.

  Args:
    p: PCollection, initial pipeline.
    params: Object holding a set of parameters as name-value pairs.
  """

  path_pattern = os.path.join(params.input_dir, '*', '*{}'.format(
      constants.FILE_EXTENSION))
  data = (
      p
      | 'ListFiles' >> beam.Create(gfile.Glob(path_pattern))
      | 'ReadFiles' >> beam.ParDo(ReadFile())
      | 'SplitData' >> beam.ParDo(
          _SplitData(),
          train_size=params.train_size,
          val_label=_DatasetType.VAL.name).with_outputs(
              _DatasetType.VAL.name, main=_DatasetType.TRAIN.name))

  schema = dataset_schema.from_feature_spec(utils.get_processed_data_schema())
  for dataset in _DatasetType:
    if not dataset.value:
      continue
    _ = (
        data[dataset.name]
        | 'Shuffle{}'.format(dataset.name) >> shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteFiles{}'.format(dataset.name) >> tfrecordio.WriteToTFRecord(
            os.path.join(params.output_dir, dataset.name + constants.TFRECORD),
            coder=example_proto_coder.ExampleProtoCoder(schema))) 
Example #12
Source File: example_decoders.py    From spotify-tensorflow with Apache License 2.0
def __init__(self, feature_spec):
    super(ExampleWithFeatureSpecDecoder, self).__init__()
    schema = dataset_schema.from_feature_spec(feature_spec)
    self._coder = example_proto_coder.ExampleProtoCoder(schema)
Example #13
Source File: features.py    From professional-services with Apache License 2.0
def get_raw_dataset_metadata():
    return dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(RAW_FEATURE_SPEC)) 
Example #14
Source File: taxi.py    From code-snippets with Apache License 2.0
def make_proto_coder(schema):
  raw_feature_spec = get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.ExampleProtoCoder(raw_schema) 
Example #15
Source File: taxi.py    From code-snippets with Apache License 2.0
def make_csv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(CSV_COLUMN_NAMES, parsing_schema) 
Example #16
Source File: taxi_schema.py    From code-snippets with Apache License 2.0
def make_proto_coder(schema):
  raw_feature_spec = get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.ExampleProtoCoder(raw_schema) 
Example #17
Source File: taxi_schema.py    From code-snippets with Apache License 2.0
def make_csv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(CSV_COLUMN_NAMES, parsing_schema) 
Example #18
Source File: taxi_preprocess_bq.py    From code-snippets with Apache License 2.0
def make_mcsv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return mcsv_coder.CsvCoder(taxi.CSV_COLUMN_NAMES, parsing_schema)