Python tensorflow_transform.TFTransformOutput() Examples

The following are 30 code examples of tensorflow_transform.TFTransformOutput(), drawn from open-source projects. The source file, project, and license for each snippet are noted above it. You may also want to check out all available functions and classes of the tensorflow_transform module.
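Before the examples, a quick orientation: a TFTransformOutput wraps the output directory written by a tf.Transform pipeline and exposes the transformed feature spec, the transform graph, and any vocabularies computed during preprocessing. A minimal sketch (the path and vocabulary name are placeholders):

import tensorflow_transform as tft

# Wrap the output directory produced by a tf.Transform pipeline.
tf_transform_output = tft.TFTransformOutput('/path/to/transform_output')

# Feature spec for parsing the materialized (already-transformed) examples.
feature_spec = tf_transform_output.transformed_feature_spec()

# Apply the transform graph to raw feature tensors, e.g. inside a serving
# input receiver:
#   transformed = tf_transform_output.transform_raw_features(raw_features)

# Look up vocabularies computed during preprocessing:
#   vocab_path = tf_transform_output.vocabulary_file_by_name(vocab_filename='terms')
#   vocab_size = tf_transform_output.vocabulary_size_by_name('terms')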
Example #1
Source File: census_example.py    From transform with Apache License 2.0
def get_feature_columns(tf_transform_output):
  """Returns the FeatureColumns for the model.

  Args:
    tf_transform_output: A `TFTransformOutput` object.

  Returns:
    A list of FeatureColumns.
  """
  # Wrap scalars as real valued columns.
  real_valued_columns = [tf.feature_column.numeric_column(key, shape=())
                         for key in NUMERIC_FEATURE_KEYS]

  # Wrap categorical columns.
  one_hot_columns = [
      tf.feature_column.indicator_column(  # pylint: disable=g-complex-comprehension
          tf.feature_column.categorical_column_with_vocabulary_file(
              key=key,
              vocabulary_file=tf_transform_output.vocabulary_file_by_name(
                  vocab_filename=key)))
      for key in CATEGORICAL_FEATURE_KEYS]

  return real_valued_columns + one_hot_columns 
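How these columns are consumed is not shown in this snippet; a hedged sketch of typical wiring (the estimator choice, hidden_units, and train_input_fn are assumptions, not taken from census_example.py):

# Illustrative sketch only; the estimator and input_fn are assumptions.
estimator = tf.estimator.DNNClassifier(
    feature_columns=get_feature_columns(tf_transform_output),
    hidden_units=[100, 50])
# estimator.train(input_fn=train_input_fn, max_steps=1000)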
Example #2
Source File: sentiment_example.py    From transform with Apache License 2.0
def get_feature_columns(tf_transform_output):
  """Returns the FeatureColumns for the model.

  Args:
    tf_transform_output: A `TFTransformOutput` object.

  Returns:
    A list of FeatureColumns.
  """
  del tf_transform_output  # unused
  # Unrecognized tokens are represented by -1, but
  # categorical_column_with_identity uses the mod operator to map integers
  # to the range [0, bucket_size).  By choosing bucket_size=VOCAB_SIZE + 1, we
  # represent unrecognized tokens as VOCAB_SIZE.
  review_column = tf.feature_column.categorical_column_with_identity(
      REVIEW_KEY, num_buckets=VOCAB_SIZE + 1)
  weighted_reviews = tf.feature_column.weighted_categorical_column(
      review_column, REVIEW_WEIGHT_KEY)

  return [weighted_reviews] 
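A quick check of the arithmetic behind the comment above (the VOCAB_SIZE value is illustrative):

# -1 % (VOCAB_SIZE + 1) == VOCAB_SIZE, so the out-of-vocabulary id -1
# lands in the extra bucket.
VOCAB_SIZE = 10000
assert -1 % (VOCAB_SIZE + 1) == VOCAB_SIZE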
Example #3
Source File: model.py    From professional-services with Apache License 2.0
def _make_embedding_col(feature_name, vocab_name, tft_output, mult=1):
  """Creates an embedding column.

  Args:
    feature_name: an attribute of features to get embedding vectors for.
    vocab_name: the name of the embedding vocabulary made with tft.
    tft_output: a TFTransformOutput object.
    mult: a multiplier on the embedding size.

  Returns:
    A tuple of (embedding_col, embedding_size):
      embedding_col: an n x d tensor, where n is the batch size and d is the
        length of all the features concatenated together.
      embedding_size: the embedding dimension.
  """
  vocab_size = tft_output.vocabulary_size_by_name(vocab_name)
  embedding_size = int(_default_embedding_size(vocab_size) * mult)
  cat_col = tf.feature_column.categorical_column_with_identity(
      key=feature_name, num_buckets=vocab_size + 1, default_value=vocab_size)
  embedding_col = tf.feature_column.embedding_column(cat_col, embedding_size)
  return embedding_col, embedding_size 
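_default_embedding_size is defined elsewhere in model.py and not shown here; a hypothetical stand-in, assuming the common fourth-root heuristic (a guess at the helper, not the project's actual definition):

def _default_embedding_size(vocab_size):
  # Hypothetical: scale the embedding dimension with the fourth root of
  # the vocabulary size, a common rule of thumb.
  return int(round(vocab_size ** 0.25))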
Example #4
Source File: transform_fn_io_test.py    From transform with Apache License 2.0
def testReadTransformFn(self):
    path = self.get_temp_dir()
    # NOTE: we don't need to create or write to the transform_fn directory since
    # ReadTransformFn never inspects this directory.
    transform_fn_dir = os.path.join(
        path, tft.TFTransformOutput.TRANSFORM_FN_DIR)
    transformed_metadata_dir = os.path.join(
        path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
    metadata_io.write_metadata(test_metadata.COMPLETE_METADATA,
                               transformed_metadata_dir)

    with beam.Pipeline() as pipeline:
      saved_model_dir_pcoll, metadata = (
          pipeline | transform_fn_io.ReadTransformFn(path))
      beam_test_util.assert_that(
          saved_model_dir_pcoll,
          beam_test_util.equal_to([transform_fn_dir]),
          label='AssertSavedModelDir')
      # NOTE: metadata is currently read in a non-deferred manner.
      self.assertEqual(metadata, test_metadata.COMPLETE_METADATA) 
Example #5
Source File: taxi_utils.py    From tfx with Apache License 2.0
def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors) 
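At export time this receiver function is bound to the transform output and handed to the estimator; a hedged sketch of that wiring (estimator, schema, and the export path stand in for objects built elsewhere in the pipeline):

# Illustrative wiring only.
serving_receiver_fn = lambda: _example_serving_receiver_fn(
    tf_transform_output, schema)
# estimator.export_saved_model('/path/to/serving_model_dir',
#                              serving_receiver_fn)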
Example #6
Source File: utils.py    From professional-services with Apache License 2.0
def _sample_vocab(tft_output, vocab_name, label, k):
  """Samples the given vocab and returns the indices and samples.

  Args:
    tft_output: a TFTransformOutput object.
    vocab_name: the name of the embedding vocabulary made with tft.
    label: a label to assign each sample of the vocab.
    k: the maximum number of samples to take.

  Returns:
    A tuple of (indices, metadata):
      indices: a list of indices for the vocab sample.
      metadata: a list of lists of data corresponding to the indices.
  """
  vocab = tft_output.vocabulary_by_name(vocab_name)
  num_indices = min(k, len(vocab))
  indices = random.sample(range(len(vocab)), num_indices)
  return indices, [[label, vocab[i]] for i in indices] 
Example #7
Source File: taxi_utils.py    From pipelines with Apache License 2.0
def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors) 
Example #8
Source File: taxi_utils_bqml.py    From tfx with Apache License 2.0
def _flat_input_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving function for flat list of Dense tensors as input.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  # We construct a receiver function that receives a flat list of Dense tensors as
  # features. This is as per BigQuery ML serving requirements.
  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.features) 
Example #9
Source File: trainer_module.py    From tfx with Apache License 2.0
def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors) 
Example #10
Source File: model.py    From tfx with Apache License 2.0
def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = tf.compat.v1.data.make_one_shot_iterator(
      dataset).get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      features.transformed_name(features.LABEL_KEY)) 
Example #11
Source File: model.py    From tfx with Apache License 2.0
def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(features.LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors) 
Example #12
Source File: model.py    From tfx with Apache License 2.0
def _input_fn(file_pattern, tf_transform_output, batch_size=200):
  """Generates features and label for tuning/training.

  Args:
    file_pattern: input tfrecord file pattern.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset
      to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=features.transformed_name(features.LABEL_KEY))

  return dataset 
Example #13
Source File: mnist_utils_native_keras.py    From tfx with Apache License 2.0
def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  train_dataset = base.input_fn(fn_args.train_files, tf_transform_output, 40)
  eval_dataset = base.input_fn(fn_args.eval_files, tf_transform_output, 40)

  mirrored_strategy = tf.distribute.MirroredStrategy()
  with mirrored_strategy.scope():
    model = base.build_keras_model()

  try:
    log_dir = fn_args.model_run_dir
  except KeyError:
    # TODO(b/158106209): use ModelRun instead of Model artifact for logging.
    log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')

  # Write logs to path
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir, update_freq='batch')

  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps,
      callbacks=[tensorboard_callback])

  signatures = {
      'serving_default':
          _get_serve_tf_examples_fn(
              model, tf_transform_output).get_concrete_function(
                  tf.TensorSpec(shape=[None], dtype=tf.string, name='examples'))
  }
  model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures) 
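_get_serve_tf_examples_fn comes from the shared base module and is not shown in this snippet; a minimal sketch of the usual TFX Keras serving pattern it follows (the body and the _LABEL_KEY constant are assumptions):

def _get_serve_tf_examples_fn(model, tf_transform_output):
  """Sketch: parse serialized tf.Examples, apply the transform graph,
  then run the Keras model."""
  model.tft_layer = tf_transform_output.transform_features_layer()

  @tf.function
  def serve_tf_examples_fn(serialized_tf_examples):
    feature_spec = tf_transform_output.raw_feature_spec()
    feature_spec.pop(_LABEL_KEY)  # assumed label-key constant
    parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
    transformed_features = model.tft_layer(parsed_features)
    return model(transformed_features)

  return serve_tf_examples_fn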
Example #14
Source File: input_util.py    From professional-services with Apache License 2.0
def input_fn(input_dir, mode, batch_size, num_epochs, label_name=None,
             shuffle_buffer_size=10000, feature_spec=None):
    """Reads TFRecords and returns the features and labels."""
    if feature_spec is None:
        tf_transform_output = tft.TFTransformOutput(
            os.path.join(input_dir, 'transformed_metadata'))
        feature_spec = tf_transform_output.transformed_feature_spec()
    prefix = str(mode).lower()
    suffix = '.tfrecord'
    num_cpus = multiprocessing.cpu_count()

    file_pattern = os.path.join(input_dir, 'data', prefix, prefix+'*'+suffix)
    filenames = tf.matching_files(file_pattern)
    dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=None,
                                      num_parallel_reads=num_cpus)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(shuffle_buffer_size)

    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(
        lambda examples: tf.parse_example(examples, feature_spec))
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    if mode == tf.estimator.ModeKeys.PREDICT:
        return features

    label = features.pop(label_name)
    return features, label 
Example #15
Source File: taxi_utils_solution.py    From tfx with Apache License 2.0
def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset
      to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(_LABEL_KEY))

  return dataset 
Example #16
Source File: taxi_utils.py    From tfx with Apache License 2.0
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(
      features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
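Downstream, this receiver is typically handed to TensorFlow Model Analysis when exporting the eval graph; a hedged sketch (estimator and the export path are placeholders):

# Illustrative only; `estimator` and the path stand in for objects built
# elsewhere.
eval_receiver_fn = lambda: _eval_input_receiver_fn(tf_transform_output, schema)
# tfma.export.export_eval_savedmodel(
#     estimator=estimator,
#     export_dir_base='/path/to/eval_model_dir',
#     eval_input_receiver_fn=eval_receiver_fn)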
Example #17
Source File: taxi_utils_native_keras.py    From tfx with Apache License 2.0
def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset
      to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(_LABEL_KEY))

  return dataset 
Example #18
Source File: trainer_module.py    From tfx with Apache License 2.0
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(
      features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
Example #19
Source File: trainer_module.py    From tfx with Apache License 2.0
def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = tf.compat.v1.data.make_one_shot_iterator(
      dataset).get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))


# TFX will call this function 
Example #20
Source File: utils.py    From professional-services with Apache License 2.0
def write_projector_metadata(metadata_dir, tft_dir):
  """Write a metadata file to use in tensorboard to visualize embeddings.

  Tensorboard expects a .tsv (tab-separated values) file encoding information
  about each sample. A header is required if there is more than one column.

  Args:
    metadata_dir: the directory where the projector config protobuf is written.
    tft_dir: the directory where tft outputs are written.

  Returns:
    A tuple of user and item indices:
      user_indices: indices of users that were sampled.
      item_indices: indices of items that were sampled.
  """
  tft_output = tft.TFTransformOutput(tft_dir)
  user_indices, user_metadata = _sample_vocab(tft_output,
                                              constants.USER_VOCAB_NAME,
                                              "user",
                                              constants.NUM_PROJECTOR_USERS)
  item_indices, item_metadata = _sample_vocab(tft_output,
                                              constants.ITEM_VOCAB_NAME,
                                              "item",
                                              constants.NUM_PROJECTOR_ITEMS)
  metadata = user_metadata + item_metadata
  metadata_path = os.path.join(metadata_dir, constants.PROJECTOR_PATH)
  tf.io.gfile.makedirs(metadata_dir)
  with tf.io.gfile.GFile(metadata_path, "w+") as f:
    f.write("label\tname\n")
    f.write("\n".join(["{}\t{}".format(label, name) for label, name in metadata]))
  return user_indices, item_indices 
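Given the header and rows written above, the resulting metadata file looks like this (values illustrative):

label	name
user	some_user_id
item	some_item_id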
Example #21
Source File: model.py    From professional-services with Apache License 2.0
def _get_net_features(features, tft_output, n_feats, n_lens, c_feats, vocabs):
  """Creates an input layer of features.

  Args:
    features: a batch of features.
    tft_output: a TFTransformOutput object.
    n_feats: a list of numerical feature names.
    n_lens: the lengths of each numerical feature.
    c_feats: a list of categorical feature names.
    vocabs: a list of vocabulary names corresponding to the features in
      c_feats.

  Returns:
    A tuple of (net_features, size):
      net_features: an n x d tensor, where n is the batch size and d is the
        length of all the features concatenated together.
      size: the size of the feature layer.
  """
  numerical_cols = [tf.feature_column.numeric_column(col, shape=length)
                    for col, length in zip(n_feats, n_lens)]
  categorical_cols = [_make_embedding_col(col, vocab_name, tft_output)
                      for col, vocab_name in zip(c_feats, vocabs)]
  cols = [x[0] for x in categorical_cols] + numerical_cols
  size = sum([x[1] for x in categorical_cols]
             + [x.shape[0] for x in numerical_cols])
  feature_names = {x: features[x] for x in n_feats + c_feats}
  net_features = tf.feature_column.input_layer(feature_names, cols)
  return net_features, size 
Example #22
Source File: model.py    From professional-services with Apache License 2.0
def _make_input_layer(features, tft_output, feature_name, vocab_name, n_feats,
                      n_lens, c_feats, vocabs, mult=1):
  """Creates an input layer containing embeddings and features.

  Args:
    features: a batch of features.
    tft_output: a TFTransformOutput object.
    feature_name: an attribute of features to get embedding vectors for.
    vocab_name: the name of the embedding vocabulary made with tft.
    n_feats: a list of numerical feature names.
    n_lens: the lengths of each numerical feature.
    c_feats: a list of categorical feature names.
    vocabs: a list of vocabulary names corresponding to the features in
      c_feats.
    mult: a multiplier on the embedding size.

  Returns:
    A tuple of (net, size):
      net: an n x d tensor, where n is the batch size and d is the embedding
        size.
      size: the size of the layer.
  """
  col, embedding_size = _make_embedding_col(feature_name, vocab_name,
                                            tft_output, mult)
  embedding_feature = tf.feature_column.input_layer(
      {feature_name: features[feature_name]}, [col])
  net_features, size = _get_net_features(features, tft_output, n_feats,
                                         n_lens, c_feats, vocabs)
  net = tf.concat([embedding_feature, net_features], 1)
  return net, embedding_size + size 
Example #23
Source File: taxi_utils_bqml.py    From tfx with Apache License 2.0
def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = tf.compat.v1.data.make_one_shot_iterator(
      dataset).get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))


# TFX will call this function 
Example #24
Source File: transform_fn_io_test.py    From transform with Apache License 2.0
def testWriteTransformFn(self):
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

    with beam.Pipeline() as pipeline:
      # Create an empty directory for the source saved model dir.
      saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
      file_io.recursive_create_dir(saved_model_dir)
      saved_model_dir_pcoll = (
          pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
      # Combine test metadata with a dict of PCollections resolving futures.
      deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
          [test_metadata.COMPLETE_METADATA])
      metadata = beam_metadata_io.BeamDatasetMetadata(
          test_metadata.INCOMPLETE_METADATA, deferred_metadata)

      _ = ((saved_model_dir_pcoll, metadata)
           | transform_fn_io.WriteTransformFn(transform_output_dir))

    # Test reading with TFTransformOutput
    tf_transform_output = tft.TFTransformOutput(transform_output_dir)
    metadata = tf_transform_output.transformed_metadata
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

    transform_fn_dir = tf_transform_output.transform_savedmodel_dir
    self.assertTrue(file_io.file_exists(transform_fn_dir))
    self.assertTrue(file_io.is_directory(transform_fn_dir)) 
Example #25
Source File: taxi_utils.py    From pipelines with Apache License 2.0
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.parse_example(serialized_tf_example, raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(
      features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
Example #26
Source File: taxi_utils.py    From pipelines with Apache License 2.0
def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of CSV files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = dataset.make_one_shot_iterator().get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))


# TFX will call this function 
Example #27
Source File: iris_utils_sklearn.py    From tfx with Apache License 2.0
def _input_fn(file_pattern: Text, tf_transform_output: tft.TFTransformOutput,
              ) -> Tuple[np.ndarray, np.ndarray]:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: input tfrecord file pattern.
    tf_transform_output: A TFTransformOutput.

  Returns:
    A (features, indices) tuple where features is a matrix of features, and
      indices is a single vector of label indices.
  """
  def _parse_example(example):
    """Parses a tfrecord into a (features, indices) tuple of Tensors."""
    parsed_example = tf.io.parse_single_example(
        serialized=example,
        features=tf_transform_output.transformed_feature_spec())
    label = parsed_example.pop(_transformed_name(_LABEL_KEY))
    return parsed_example, label

  filenames = tf.data.Dataset.list_files(file_pattern)
  dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
  # TODO(b/157598676): Make AUTOTUNE the default.
  dataset = dataset.map(
      _parse_example,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.shuffle(_SHUFFLE_BUFFER)
  return _tf_dataset_to_numpy(dataset)


# TFX Transform will call this function. 
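_tf_dataset_to_numpy is defined elsewhere in iris_utils_sklearn.py; a hypothetical sketch of such a helper, materializing the (features, label) dataset into dense arrays:

import numpy as np

def _tf_dataset_to_numpy(dataset):
  # Hypothetical stand-in: collect every (features_dict, label) pair and
  # flatten the per-example features into one matrix.
  rows, labels = [], []
  for feature_dict, label in dataset.as_numpy_iterator():
    rows.append(np.concatenate(
        [np.ravel(v) for _, v in sorted(feature_dict.items())]))
    labels.append(label)
  return np.stack(rows), np.array(labels)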
Example #28
Source File: iris_utils_sklearn.py    From tfx with Apache License 2.0
def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  x_train, y_train = _input_fn(fn_args.train_files, tf_transform_output)
  x_eval, y_eval = _input_fn(fn_args.eval_files, tf_transform_output)

  steps_per_epoch = _TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE

  model = MLPClassifier(
      hidden_layer_sizes=[8, 8, 8],
      activation='relu',
      solver='adam',
      batch_size=_TRAIN_BATCH_SIZE,
      learning_rate_init=0.0005,
      max_iter=int(fn_args.train_steps / steps_per_epoch),
      verbose=True)
  model.fit(x_train, y_train)
  absl.logging.info(model)

  score = model.score(x_eval, y_eval)
  absl.logging.info('Accuracy: %f', score)

  os.makedirs(fn_args.serving_model_dir)

  # TODO(humichael): Export TFT graph for serving once a solution for serving is
  # determined.
  model_path = os.path.join(fn_args.serving_model_dir, 'model.joblib')
  with tf.io.gfile.GFile(model_path, 'wb+') as f:
    joblib.dump(model, f) 
Example #29
Source File: iris_utils_native_keras.py    From tfx with Apache License 2.0
def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset
      to combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuple where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(_LABEL_KEY))

  return dataset 
Example #30
Source File: imdb_utils_native_keras.py    From tfx with Apache License 2.0 5 votes vote down vote up
def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  train_dataset = _input_fn(
      fn_args.train_files, tf_transform_output, batch_size=_TRAIN_BATCH_SIZE)

  eval_dataset = _input_fn(
      fn_args.eval_files, tf_transform_output, batch_size=_EVAL_BATCH_SIZE)

  mirrored_strategy = tf.distribute.MirroredStrategy()
  with mirrored_strategy.scope():
    model = _build_keras_model()

  # In distributed training, it is common to use num_steps instead of num_epochs
  # to control training.
  # Reference: https://stackoverflow.com/questions/45989971/
  # /distributed-training-with-tf-estimator-resulting-in-more-training-steps

  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps)

  signatures = {
      'serving_default':
          _get_serve_tf_examples_fn(model,
                                    tf_transform_output).get_concrete_function(
                                        tf.TensorSpec(
                                            shape=[None],
                                            dtype=tf.string,
                                            name='examples')),
  }

  model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)