Python tensorflow_transform.TFTransformOutput() Examples
The following are 30 code examples of tensorflow_transform.TFTransformOutput(), collected from open-source projects. The source-file reference above each example links back to the original project. You may also want to check out all the other available functions and classes of the tensorflow_transform module.
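Before the examples, here is a minimal sketch of how a TFTransformOutput is typically constructed and queried. The output path is a placeholder and the vocabulary name is illustrative; the methods shown are the ones the examples below rely on.

import tensorflow_transform as tft

# Point TFTransformOutput at the directory written by a tf.Transform
# pipeline (e.g. by WriteTransformFn); the path is a placeholder.
tf_transform_output = tft.TFTransformOutput('/path/to/transform_output')

# Feature spec for parsing transformed tf.Examples.
transformed_spec = tf_transform_output.transformed_feature_spec()

# Apply the learned transformations to a dict of raw feature tensors.
# transformed = tf_transform_output.transform_raw_features(raw_features)

# Look up vocabulary artifacts produced during preprocessing.
# vocab_path = tf_transform_output.vocabulary_file_by_name('my_vocab')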
Example #1
Source File: census_example.py From transform with Apache License 2.0

def get_feature_columns(tf_transform_output):
  """Returns the FeatureColumns for the model.

  Args:
    tf_transform_output: A `TFTransformOutput` object.

  Returns:
    A list of FeatureColumns.
  """
  # Wrap scalars as real valued columns.
  real_valued_columns = [tf.feature_column.numeric_column(key, shape=())
                         for key in NUMERIC_FEATURE_KEYS]

  # Wrap categorical columns.
  one_hot_columns = [
      tf.feature_column.indicator_column(  # pylint: disable=g-complex-comprehension
          tf.feature_column.categorical_column_with_vocabulary_file(
              key=key,
              vocabulary_file=tf_transform_output.vocabulary_file_by_name(
                  vocab_filename=key)))
      for key in CATEGORICAL_FEATURE_KEYS]

  return real_valued_columns + one_hot_columns
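In the census example these columns feed an estimator. A hedged sketch of that wiring (the estimator type and model directory are assumptions, not taken from this snippet):

# Sketch only: assumes tf_transform_output was built from the pipeline's
# transform output directory, as elsewhere in census_example.py.
estimator = tf.estimator.LinearClassifier(
    feature_columns=get_feature_columns(tf_transform_output),
    config=tf.estimator.RunConfig(model_dir='/tmp/census_model'))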
Example #2
Source File: sentiment_example.py From transform with Apache License 2.0

def get_feature_columns(tf_transform_output):
  """Returns the FeatureColumns for the model.

  Args:
    tf_transform_output: A `TFTransformOutput` object.

  Returns:
    A list of FeatureColumns.
  """
  del tf_transform_output  # unused
  # Unrecognized tokens are represented by -1, but
  # categorical_column_with_identity uses the mod operator to map integers
  # to the range [0, bucket_size). By choosing bucket_size=VOCAB_SIZE + 1, we
  # represent unrecognized tokens as VOCAB_SIZE.
  review_column = tf.feature_column.categorical_column_with_identity(
      REVIEW_KEY, num_buckets=VOCAB_SIZE + 1)
  weighted_reviews = tf.feature_column.weighted_categorical_column(
      review_column, REVIEW_WEIGHT_KEY)

  return [weighted_reviews]
Example #3
Source File: model.py From professional-services with Apache License 2.0

def _make_embedding_col(feature_name, vocab_name, tft_output, mult=1):
  """Creates an embedding column.

  Args:
    feature_name: an attribute of features to get embedding vectors for.
    vocab_name: the name of the embedding vocabulary made with tft.
    tft_output: a TFTransformOutput object.
    mult: a multiplier on the embedding size.

  Returns:
    A tuple of (embedding_col, embedding_size):
      embedding_col: an n x d tensor, where n is the batch size and d is the
        length of all the features concatenated together.
      embedding_size: the embedding dimension.
  """
  vocab_size = tft_output.vocabulary_size_by_name(vocab_name)
  embedding_size = int(_default_embedding_size(vocab_size) * mult)
  cat_col = tf.feature_column.categorical_column_with_identity(
      key=feature_name, num_buckets=vocab_size + 1, default_value=vocab_size)
  embedding_col = tf.feature_column.embedding_column(cat_col, embedding_size)
  return embedding_col, embedding_size
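_default_embedding_size is a project helper not shown in this snippet. A hypothetical body, assuming the widely used rule of thumb that sizes embeddings near the fourth root of the vocabulary size (this is an illustration, not the repo's actual code):

def _default_embedding_size(vocab_size):
  # Hypothetical stand-in for the helper used above: a common heuristic
  # sizes embeddings at roughly the fourth root of the vocabulary size.
  return vocab_size ** 0.25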
Example #4
Source File: transform_fn_io_test.py From transform with Apache License 2.0

def testReadTransformFn(self):
  path = self.get_temp_dir()
  # NOTE: we don't need to create or write to the transform_fn directory since
  # ReadTransformFn never inspects this directory.
  transform_fn_dir = os.path.join(
      path, tft.TFTransformOutput.TRANSFORM_FN_DIR)
  transformed_metadata_dir = os.path.join(
      path, tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)
  metadata_io.write_metadata(test_metadata.COMPLETE_METADATA,
                             transformed_metadata_dir)

  with beam.Pipeline() as pipeline:
    saved_model_dir_pcoll, metadata = (
        pipeline | transform_fn_io.ReadTransformFn(path))
    beam_test_util.assert_that(
        saved_model_dir_pcoll,
        beam_test_util.equal_to([transform_fn_dir]),
        label='AssertSavedModelDir')
    # NOTE: metadata is currently read in a non-deferred manner.
    self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)
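The two class constants used above name the fixed subdirectories of a transform output. A quick sketch of what they resolve to (values as of current tensorflow_transform releases; worth verifying against your installed version):

import tensorflow_transform as tft

print(tft.TFTransformOutput.TRANSFORM_FN_DIR)          # 'transform_fn'
print(tft.TFTransformOutput.TRANSFORMED_METADATA_DIR)  # 'transformed_metadata'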
Example #5
Source File: taxi_utils.py From tfx with Apache License 2.0

def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors)
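In the taxi pipeline this receiver fn is handed to the estimator at export time. A hedged sketch of that call, where the trained estimator, serving_model_dir, and schema are assumed context from the surrounding module:

# Sketch only: assumes a trained `estimator` and surrounding context.
estimator.export_saved_model(
    export_dir_base=serving_model_dir,
    serving_input_receiver_fn=lambda: _example_serving_receiver_fn(
        tf_transform_output, schema))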
Example #6
Source File: utils.py From professional-services with Apache License 2.0

def _sample_vocab(tft_output, vocab_name, label, k):
  """Samples the given vocab and returns the indices and samples.

  Args:
    tft_output: a TFTransformOutput object.
    vocab_name: the name of the embedding vocabulary made with tft.
    label: a label to assign each sample of the vocab.
    k: the maximum number of samples to take.

  Returns:
    A tuple of (indices, metadata):
      indices: a list of indices for the vocab sample.
      metadata: a list of lists of data corresponding to the indices.
  """
  vocab = tft_output.vocabulary_by_name(vocab_name)
  num_indices = min(k, len(vocab))
  indices = random.sample(range(len(vocab)), num_indices)
  return indices, [[label, vocab[i]] for i in indices]
Example #7
Source File: taxi_utils.py From pipelines with Apache License 2.0

def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors)
Example #8
Source File: taxi_utils_bqml.py From tfx with Apache License 2.0

def _flat_input_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving function for a flat list of Dense tensors as input.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  # We construct a receiver function that receives a flat list of Dense
  # tensors as features. This is as per BigQuery ML serving requirements.
  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.features)
Example #9
Source File: trainer_module.py From tfx with Apache License 2.0

def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors)
Example #10
Source File: model.py From tfx with Apache License 2.0

def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of TFRecord files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of Tensors, and
    indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = tf.compat.v1.data.make_one_shot_iterator(
      dataset).get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      features.transformed_name(features.LABEL_KEY))
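A hedged sketch of how such an input_fn plugs into estimator training. The file lists, step count, and the estimator itself are assumed context, not part of this snippet:

# Sketch only: `train_files`, `eval_files`, and `estimator` are placeholders.
train_spec = tf.estimator.TrainSpec(
    input_fn=lambda: _input_fn(train_files, tf_transform_output,
                               batch_size=100),
    max_steps=10000)
eval_spec = tf.estimator.EvalSpec(
    input_fn=lambda: _input_fn(eval_files, tf_transform_output,
                               batch_size=100))
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)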
Example #11
Source File: model.py From tfx with Apache License 2.0

def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    Tensorflow graph which parses examples, applying tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(features.LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors)
Example #12
Source File: model.py From tfx with Apache License 2.0

def _input_fn(file_pattern, tf_transform_output, batch_size=200):
  """Generates features and label for tuning/training.

  Args:
    file_pattern: input tfrecord file pattern.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=features.transformed_name(features.LABEL_KEY))

  return dataset
Example #13
Source File: mnist_utils_native_keras.py From tfx with Apache License 2.0

def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  train_dataset = base.input_fn(fn_args.train_files, tf_transform_output, 40)
  eval_dataset = base.input_fn(fn_args.eval_files, tf_transform_output, 40)

  mirrored_strategy = tf.distribute.MirroredStrategy()
  with mirrored_strategy.scope():
    model = base.build_keras_model()

  try:
    log_dir = fn_args.model_run_dir
  except KeyError:
    # TODO(b/158106209): use ModelRun instead of Model artifact for logging.
    log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')

  # Write logs to path
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir, update_freq='batch')

  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps,
      callbacks=[tensorboard_callback])

  signatures = {
      'serving_default':
          _get_serve_tf_examples_fn(
              model, tf_transform_output).get_concrete_function(
                  tf.TensorSpec(shape=[None], dtype=tf.string,
                                name='examples'))
  }
  model.save(fn_args.serving_model_dir, save_format='tf',
             signatures=signatures)
Example #14
Source File: input_util.py From professional-services with Apache License 2.0

def input_fn(input_dir,
             mode,
             batch_size,
             num_epochs,
             label_name=None,
             shuffle_buffer_size=10000,
             feature_spec=None):
  """Reads TFRecords and returns the features and labels."""
  if feature_spec is None:
    tf_transform_output = tft.TFTransformOutput(
        os.path.join(input_dir, 'transformed_metadata'))
    feature_spec = tf_transform_output.transformed_feature_spec()

  prefix = str(mode).lower()
  suffix = '.tfrecord'
  num_cpus = multiprocessing.cpu_count()

  file_pattern = os.path.join(input_dir, 'data', prefix, prefix + '*' + suffix)
  filenames = tf.matching_files(file_pattern)
  dataset = tf.data.TFRecordDataset(filenames=filenames,
                                    buffer_size=None,
                                    num_parallel_reads=num_cpus)

  if mode == tf.estimator.ModeKeys.TRAIN:
    dataset = dataset.shuffle(shuffle_buffer_size)

  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(
      lambda examples: tf.parse_example(examples, feature_spec))

  iterator = dataset.make_one_shot_iterator()
  features = iterator.get_next()

  if mode == tf.estimator.ModeKeys.PREDICT:
    return features

  label = features.pop(label_name)
  return features, label
Example #15
Source File: taxi_utils_solution.py From tfx with Apache License 2.0

def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(_LABEL_KEY))

  return dataset
Example #16
Source File: taxi_utils.py From tfx with Apache License 2.0

def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)])
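A hedged sketch of how this receiver fn is typically consumed, in the TFMA versions these examples target. The estimator and eval_model_dir are assumed context:

# Sketch only: `estimator` and `eval_model_dir` are placeholders.
tfma.export.export_eval_savedmodel(
    estimator=estimator,
    export_dir_base=eval_model_dir,
    eval_input_receiver_fn=lambda: _eval_input_receiver_fn(
        tf_transform_output, schema))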
Example #17
Source File: taxi_utils_native_keras.py From tfx with Apache License 2.0

def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(_LABEL_KEY))

  return dataset
Example #18
Source File: trainer_module.py From tfx with Apache License 2.0

def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)])
Example #19
Source File: trainer_module.py From tfx with Apache License 2.0

def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of TFRecord files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of Tensors, and
    indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = tf.compat.v1.data.make_one_shot_iterator(
      dataset).get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))

# TFX will call this function
Example #20
Source File: utils.py From professional-services with Apache License 2.0

def write_projector_metadata(metadata_dir, tft_dir):
  """Write a metadata file to use in tensorboard to visualize embeddings.

  Tensorboard expects a .tsv (tab-separated values) file encoding information
  about each sample. A header is required if there is more than one column.

  Args:
    metadata_dir: the directory where the projector config protobuf is
      written.
    tft_dir: the directory where tft outputs are written.

  Returns:
    A tuple of user and item indices:
      user_indices: indices of users that were sampled.
      item_indices: indices of items that were sampled.
  """
  tft_output = tft.TFTransformOutput(tft_dir)
  user_indices, user_metadata = _sample_vocab(tft_output,
                                              constants.USER_VOCAB_NAME,
                                              "user",
                                              constants.NUM_PROJECTOR_USERS)
  item_indices, item_metadata = _sample_vocab(tft_output,
                                              constants.ITEM_VOCAB_NAME,
                                              "item",
                                              constants.NUM_PROJECTOR_ITEMS)
  metadata = user_metadata + item_metadata
  metadata_path = os.path.join(metadata_dir, constants.PROJECTOR_PATH)
  tf.io.gfile.makedirs(metadata_dir)
  with tf.io.gfile.GFile(metadata_path, "w+") as f:
    f.write("label\tname\n")
    f.write("\n".join(["{}\t{}".format(label, name)
                       for label, name in metadata]))
  return user_indices, item_indices
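A hedged sketch of pointing the TensorBoard embedding projector at the metadata file written above. The embedding tensor name is a placeholder, and recent TensorBoard versions take a log directory rather than a summary writer:

from tensorboard.plugins import projector

# Sketch only: the tensor name below is a placeholder.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'embedding/Variable'
embedding.metadata_path = constants.PROJECTOR_PATH
projector.visualize_embeddings(metadata_dir, config)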
Example #21
Source File: model.py From professional-services with Apache License 2.0

def _get_net_features(features, tft_output, n_feats, n_lens, c_feats, vocabs):
  """Creates an input layer of features.

  Args:
    features: a batch of features.
    tft_output: a TFTransformOutput object.
    n_feats: a list of numerical feature names.
    n_lens: the lengths of each numerical feature.
    c_feats: a list of categorical feature names.
    vocabs: a list of vocabulary names corresponding to the features in
      c_feats.

  Returns:
    A tuple of (net_features, size):
      net_features: an n x d tensor, where n is the batch size and d is the
        length of all the features concatenated together.
      size: the size of the feature layer.
  """
  numerical_cols = [tf.feature_column.numeric_column(col, shape=length)
                    for col, length in zip(n_feats, n_lens)]
  categorical_cols = [_make_embedding_col(col, vocab_name, tft_output)
                      for col, vocab_name in zip(c_feats, vocabs)]
  cols = [x[0] for x in categorical_cols] + numerical_cols
  size = sum([x[1] for x in categorical_cols] +
             [x.shape[0] for x in numerical_cols])
  feature_names = {x: features[x] for x in n_feats + c_feats}
  net_features = tf.feature_column.input_layer(feature_names, cols)
  return net_features, size
Example #22
Source File: model.py From professional-services with Apache License 2.0

def _make_input_layer(features, tft_output, feature_name, vocab_name,
                      n_feats, n_lens, c_feats, vocabs, mult=1):
  """Creates an input layer containing embeddings and features.

  Args:
    features: a batch of features.
    tft_output: a TFTransformOutput object.
    feature_name: an attribute of features to get embedding vectors for.
    vocab_name: the name of the embedding vocabulary made with tft.
    n_feats: a list of numerical feature names.
    n_lens: the lengths of each numerical feature.
    c_feats: a list of categorical feature names.
    vocabs: a list of vocabulary names corresponding to the features in
      c_feats.
    mult: a multiplier on the embedding size.

  Returns:
    A tuple of (net, size):
      net: an n x d tensor, where n is the batch size and d is the embedding
        size.
      size: the size of the layer.
  """
  col, embedding_size = _make_embedding_col(feature_name, vocab_name,
                                            tft_output, mult)
  embedding_feature = tf.feature_column.input_layer(
      {feature_name: features[feature_name]}, [col])
  net_features, size = _get_net_features(features, tft_output, n_feats,
                                         n_lens, c_feats, vocabs)
  net = tf.concat([embedding_feature, net_features], 1)
  return net, embedding_size + size
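A hypothetical call site for _make_input_layer, to show how the helpers above fit together. Every feature and vocabulary name here is invented for illustration:

# Hypothetical wiring: feature/vocab names below are illustrative only.
net, layer_size = _make_input_layer(
    features, tft_output,
    feature_name='item_id', vocab_name='item_vocab',
    n_feats=['num_views'], n_lens=[1],
    c_feats=['category'], vocabs=['category_vocab'],
    mult=2)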
Example #23
Source File: taxi_utils_bqml.py From tfx with Apache License 2.0

def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of TFRecord files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of Tensors, and
    indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = tf.compat.v1.data.make_one_shot_iterator(
      dataset).get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))

# TFX will call this function
Example #24
Source File: transform_fn_io_test.py From transform with Apache License 2.0

def testWriteTransformFn(self):
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Combine test metadata with a dict of PCollections resolving futures.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Test reading with TFTransformOutput
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  metadata = tf_transform_output.transformed_metadata
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

  transform_fn_dir = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
Example #25
Source File: taxi_utils.py From pipelines with Apache License 2.0

def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - Tensorflow graph which parses raw untransformed features, applies the
        tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.parse_example(serialized_tf_example, raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)])
Example #26
Source File: taxi_utils.py From pipelines with Apache License 2.0

def _input_fn(filenames, tf_transform_output, batch_size=200):
  """Generates features and labels for training or evaluation.

  Args:
    filenames: [str] list of TFRecord files to read data from.
    tf_transform_output: A TFTransformOutput.
    batch_size: int, first dimension size of the Tensors returned by input_fn.

  Returns:
    A (features, indices) tuple where features is a dictionary of Tensors, and
    indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      filenames, batch_size, transformed_feature_spec, reader=_gzip_reader_fn)

  transformed_features = dataset.make_one_shot_iterator().get_next()
  # We pop the label because we do not want to use it as a feature while we're
  # training.
  return transformed_features, transformed_features.pop(
      _transformed_name(_LABEL_KEY))

# TFX will call this function
Example #27
Source File: iris_utils_sklearn.py From tfx with Apache License 2.0

def _input_fn(
    file_pattern: Text,
    tf_transform_output: tft.TFTransformOutput,
) -> Tuple[np.ndarray, np.ndarray]:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: input tfrecord file pattern.
    tf_transform_output: A TFTransformOutput.

  Returns:
    A (features, indices) tuple where features is a matrix of features, and
      indices is a single vector of label indices.
  """
  def _parse_example(example):
    """Parses a tfrecord into a (features, indices) tuple of Tensors."""
    parsed_example = tf.io.parse_single_example(
        serialized=example,
        features=tf_transform_output.transformed_feature_spec())
    label = parsed_example.pop(_transformed_name(_LABEL_KEY))
    return parsed_example, label

  filenames = tf.data.Dataset.list_files(file_pattern)
  dataset = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
  # TODO(b/157598676): Make AUTOTUNE the default.
  dataset = dataset.map(
      _parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.shuffle(_SHUFFLE_BUFFER)
  return _tf_dataset_to_numpy(dataset)

# TFX Transform will call this function.
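_tf_dataset_to_numpy is not shown in this snippet. A minimal sketch of a helper with that contract, assuming TF 2.x eager execution and dense fixed-length features (this body is an assumption, not the repo's actual code):

import numpy as np

def _tf_dataset_to_numpy(dataset):
  """Materializes a (features, label) dataset into numpy (matrix, vector)."""
  rows, labels = [], []
  for parsed_features, label in dataset.as_numpy_iterator():
    # Flatten each example's feature dict into one row, in stable key order.
    rows.append(np.concatenate(
        [np.ravel(parsed_features[key]) for key in sorted(parsed_features)]))
    labels.append(np.ravel(label)[0])
  return np.array(rows), np.array(labels)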
Example #28
Source File: iris_utils_sklearn.py From tfx with Apache License 2.0

def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  x_train, y_train = _input_fn(fn_args.train_files, tf_transform_output)
  x_eval, y_eval = _input_fn(fn_args.eval_files, tf_transform_output)

  steps_per_epoch = _TRAIN_DATA_SIZE / _TRAIN_BATCH_SIZE

  model = MLPClassifier(
      hidden_layer_sizes=[8, 8, 8],
      activation='relu',
      solver='adam',
      batch_size=_TRAIN_BATCH_SIZE,
      learning_rate_init=0.0005,
      max_iter=int(fn_args.train_steps / steps_per_epoch),
      verbose=True)
  model.fit(x_train, y_train)
  absl.logging.info(model)

  score = model.score(x_eval, y_eval)
  absl.logging.info('Accuracy: %f', score)

  os.makedirs(fn_args.serving_model_dir)
  # TODO(humichael): Export TFT graph for serving once a solution for serving
  # is determined.
  model_path = os.path.join(fn_args.serving_model_dir, 'model.joblib')
  with tf.io.gfile.GFile(model_path, 'wb+') as f:
    joblib.dump(model, f)
Example #29
Source File: iris_utils_native_keras.py From tfx with Apache License 2.0

def _input_fn(file_pattern: List[Text],
              tf_transform_output: tft.TFTransformOutput,
              batch_size: int = 200) -> tf.data.Dataset:
  """Generates features and label for tuning/training.

  Args:
    file_pattern: List of paths or patterns of input tfrecord files.
    tf_transform_output: A TFTransformOutput.
    batch_size: the number of consecutive elements of the returned dataset to
      combine in a single batch.

  Returns:
    A dataset that contains (features, indices) tuples where features is a
      dictionary of Tensors, and indices is a single Tensor of label indices.
  """
  transformed_feature_spec = (
      tf_transform_output.transformed_feature_spec().copy())

  dataset = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=transformed_feature_spec,
      reader=_gzip_reader_fn,
      label_key=_transformed_name(_LABEL_KEY))

  return dataset
Example #30
Source File: imdb_utils_native_keras.py From tfx with Apache License 2.0

def run_fn(fn_args: TrainerFnArgs):
  """Train the model based on given args.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  train_dataset = _input_fn(
      fn_args.train_files, tf_transform_output, batch_size=_TRAIN_BATCH_SIZE)
  eval_dataset = _input_fn(
      fn_args.eval_files, tf_transform_output, batch_size=_EVAL_BATCH_SIZE)

  mirrored_strategy = tf.distribute.MirroredStrategy()
  with mirrored_strategy.scope():
    model = _build_keras_model()

  # In distributed training, it is common to use num_steps instead of
  # num_epochs to control training.
  # Reference: https://stackoverflow.com/questions/45989971/
  # distributed-training-with-tf-estimator-resulting-in-more-training-steps
  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps)

  signatures = {
      'serving_default':
          _get_serve_tf_examples_fn(
              model, tf_transform_output).get_concrete_function(
                  tf.TensorSpec(shape=[None], dtype=tf.string,
                                name='examples')),
  }
  model.save(fn_args.serving_model_dir, save_format='tf',
             signatures=signatures)
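_get_serve_tf_examples_fn is defined elsewhere in these modules; in TFX's native-Keras examples it typically follows the pattern below, reproduced here as a sketch with _LABEL_KEY assumed from the surrounding module:

def _get_serve_tf_examples_fn(model, tf_transform_output):
  """Returns a function that parses serialized tf.Examples and applies TFT."""
  # Attach the transform layer to the model so its assets are tracked and
  # saved along with the SavedModel.
  model.tft_layer = tf_transform_output.transform_features_layer()

  @tf.function
  def serve_tf_examples_fn(serialized_tf_examples):
    """Parses raw tf.Examples, transforms them, and runs the model."""
    feature_spec = tf_transform_output.raw_feature_spec()
    feature_spec.pop(_LABEL_KEY)
    parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec)
    transformed_features = model.tft_layer(parsed_features)
    return model(transformed_features)

  return serve_tf_examples_fn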