Python tensorflow.transform() Examples

The following are 30 code examples of tensorflow.transform(), drawn from open-source projects. Each example links to its original project and source file. You may also want to check out all available functions and classes of the module, or try the search function.
Example #1
Source File: inspect_preprocessing_fn.py    From transform with Apache License 2.0
def get_analyze_input_columns(preprocessing_fn, specs):
  """Return columns that are required inputs of `AnalyzeDataset`.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    specs: A dict of feature name to feature specification or tf.TypeSpecs.

  Returns:
    A list of columns that are required inputs of analyzers.
  """

  with tf.compat.v1.Graph().as_default() as graph:
    input_signature = impl_helper.batched_placeholders_from_specs(
        specs)
    _ = preprocessing_fn(input_signature.copy())

    tensor_sinks = graph.get_collection(analyzer_nodes.TENSOR_REPLACEMENTS)
    visitor = _SourcedTensorsVisitor()
    for tensor_sink in tensor_sinks:
      nodes.Traverser(visitor).visit_value_node(tensor_sink.future)

    analyze_input_tensors = graph_tools.get_dependent_inputs(
        graph, input_signature, visitor.sourced_tensors)
    return list(analyze_input_tensors.keys()) 
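A minimal sketch of how this helper could be invoked, assuming the hypothetical feature spec and preprocessing_fn below, and assuming the helper is importable from tensorflow_transform.inspect_preprocessing_fn (the exact path may vary by version):

import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform import inspect_preprocessing_fn

specs = {
    'x': tf.io.FixedLenFeature([], tf.float32),
    's': tf.io.FixedLenFeature([], tf.string),
}

def my_preprocessing_fn(inputs):
  # Only 'x' feeds an analyzer (the z-score mean/variance); 's' passes through.
  return {
      'x_scaled': tft.scale_to_z_score(inputs['x']),
      's_copy': tf.identity(inputs['s']),
  }

# Expected to report just ['x'], since 's' needs no analysis.
print(inspect_preprocessing_fn.get_analyze_input_columns(
    my_preprocessing_fn, specs))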
Example #2
Source File: mcsv_coder.py    From code-snippets with Apache License 2.0
def encode(self, instance):
    """Encode a tf.transform encoded dict to a csv-formatted string.

    Args:
      instance: A python dictionary where the keys are the column names and
        the values are fixed len or var len encoded features.

    Returns:
      A csv-formatted string. The order of the columns is given by column_names.
    """
    string_list = [None] * len(self._column_names)
    for feature_handler in self._feature_handlers:
      try:
        feature_handler.encode_value(string_list,
                                     instance[feature_handler.name])
      except TypeError as e:
        raise TypeError('%s while encoding feature "%s"' %
                        (e, feature_handler.name))
    return self._encoder.encode_record(string_list)

  # Please run tensorflow_transform/coders/benchmark_coders_test.py
  # if you make any changes on these methods. 
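A hedged usage sketch for this encoder; the column names, schema, and values below are hypothetical, following the coder construction shown in Example #10:

# coder = mcsv_coder.CsvCoder(['fare', 'payment_type'], parsing_schema)
# coder.encode({'fare': [12.5], 'payment_type': ['CASH']})
# -> '12.5,CASH'  (column order is given by the coder's column_names)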
Example #3
Source File: trainer_module.py    From tfx with Apache License 2.0
def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    A TensorFlow graph which parses examples and applies tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors) 
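In a TFX trainer, a receiver fn like this is typically wrapped in a lambda and handed to the Estimator export. A hedged sketch, where estimator, schema, transform_output_dir, and serving_model_dir are all placeholders:

tf_transform_output = tft.TFTransformOutput(transform_output_dir)
estimator.export_saved_model(
    serving_model_dir,
    lambda: _example_serving_receiver_fn(tf_transform_output, schema))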
Example #4
Source File: taxi_utils_slack.py    From tfx with Apache License 2.0
def _example_serving_receiver_fn(transform_output, schema):
  """Build the serving in inputs.

  Args:
    transform_output: directory in which the tf-transform model was written
      during the preprocessing step.
    schema: the schema of the input data.

  Returns:
    A TensorFlow graph which parses examples and applies tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  _, transformed_features = (
      saved_transform_io.partially_apply_saved_transform(
          os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR),
          serving_input_receiver.features))

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors) 
Example #5
Source File: taxi_utils_bqml.py    From tfx with Apache License 2.0
def _flat_input_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving function for flat list of Dense tensors as input.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    A TensorFlow graph which parses examples and applies tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  # We construct a receiver function that receives a flat list of dense
  # tensors as features, as required for BigQuery ML serving.
  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.features) 
Example #6
Source File: taxi_utils.py    From pipelines with Apache License 2.0
def _example_serving_receiver_fn(tf_transform_output, schema):
  """Build the serving in inputs.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    A TensorFlow graph which parses examples and applies tf-transform to them.
  """
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_feature_spec.pop(_LABEL_KEY)

  raw_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
      raw_feature_spec, default_batch_size=None)
  serving_input_receiver = raw_input_fn()

  transformed_features = tf_transform_output.transform_raw_features(
      serving_input_receiver.features)

  return tf.estimator.export.ServingInputReceiver(
      transformed_features, serving_input_receiver.receiver_tensors) 
Example #7
Source File: example_proto_coder.py    From transform with Apache License 2.0
def encode(self, instance):
    """Encode a tf.transform encoded dict as tf.Example."""
    # The feature handlers encode into self._encode_example_cache.
    for feature_handler in self._feature_handlers:
      value = instance[feature_handler.name]
      try:
        feature_handler.encode_value(value)
      except TypeError as e:
        raise TypeError('%s while encoding feature "%s"' %
                        (e, feature_handler.name))

    if self._serialized:
      return self._encode_example_cache.SerializeToString()

    result = tf.train.Example()
    result.CopyFrom(self._encode_example_cache)
    return result 
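A hedged round-trip sketch, assuming an ExampleProtoCoder built from a schema as in the tf.transform coders (the schema and feature values are hypothetical):

# coder = example_proto_coder.ExampleProtoCoder(schema, serialized=True)
# serialized = coder.encode({'age': [42], 'name': [b'alice']})
# coder.decode(serialized)  # see Example #25; recovers the original dict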
Example #8
Source File: inspect_preprocessing_fn.py    From transform with Apache License 2.0
def get_transform_input_columns(preprocessing_fn, specs):
  """Return columns that are required inputs of `TransformDataset`.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    specs: A dict of feature name to feature specification or tf.TypeSpecs.

  Returns:
    A list of columns that are required inputs of the transform `tf.Graph`
    defined by `preprocessing_fn`.
  """
  with tf.compat.v1.Graph().as_default() as graph:
    input_signature = impl_helper.batched_placeholders_from_specs(
        specs)
    output_signature = preprocessing_fn(input_signature.copy())
    transform_input_tensors = graph_tools.get_dependent_inputs(
        graph, input_signature, output_signature)
    return list(transform_input_tensors.keys()) 
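Continuing the hypothetical spec from the sketch under Example #1: both features appear in the transform graph's outputs, so both columns are required.

print(inspect_preprocessing_fn.get_transform_input_columns(
    my_preprocessing_fn, specs))  # expected: ['x', 's'] (order may vary)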
Example #9
Source File: analysis_graph_builder.py    From transform with Apache License 2.0
def get_analysis_cache_entry_keys(preprocessing_fn, feature_spec, dataset_keys):
  """Computes the cache entry keys that would be useful for analysis.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    feature_spec: A dict of feature name to feature specification.
    dataset_keys: A set of strings which are dataset keys; they uniquely
      identify these datasets across analysis runs.

  Returns:
    A set of cache entry keys which would be useful for analysis.
  """
  _, cache_dict = _build_analysis_graph_for_inspection(
      preprocessing_fn, feature_spec, dataset_keys, {})
  return {cache_key for _, cache_key in cache_dict.keys()}
Example #10
Source File: taxi_preprocess_bq.py    From code-snippets with Apache License 2.0
def make_mcsv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return mcsv_coder.CsvCoder(taxi.CSV_COLUMN_NAMES, parsing_schema) 
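A hedged usage sketch; the CSV record is hypothetical and its fields must follow taxi.CSV_COLUMN_NAMES:

# coder = make_mcsv_coder(schema)
# instance = coder.decode(csv_line)  # one raw CSV record -> feature dict
# line = coder.encode(instance)      # and back to a CSV string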
Example #11
Source File: preprocessing.py    From kale with Apache License 2.0
def preprocess(inputs):
    """tf.transform's callback function for preprocessing inputs.
    Args:
      inputs: map from feature keys to raw not-yet-transformed features.
    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in DENSE_FLOAT_FEATURE_KEYS:
        # Preserve this feature as a dense float, setting NaNs to the mean.
        outputs[key] = transform.scale_to_z_score(inputs[key])

    for key in VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        if inputs[key].dtype == tf.string:
            vocab_tensor = inputs[key]
        else:
            vocab_tensor = tf.as_string(inputs[key])
        outputs[key] = transform.string_to_int(
            vocab_tensor, vocab_filename='vocab_' + key,
            top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

    for key in BUCKET_FEATURE_KEYS:
        outputs[key] = transform.bucketize(inputs[key], FEATURE_BUCKET_COUNT)

    for key in CATEGORICAL_FEATURE_KEYS:
        outputs[key] = tf.to_int64(inputs[key])

    taxi_fare = inputs[FARE_KEY]
    taxi_tip = inputs[LABEL_KEY]
    # Test if the tip was > 20% of the fare.
    tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
    outputs[LABEL_KEY] = tf.logical_and(
        tf.logical_not(tf.is_nan(taxi_fare)),
        tf.greater(taxi_tip, tip_threshold))

    return outputs 
Example #12
Source File: transform_module.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(_identity(inputs[key])))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
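The _fill_in_missing helper used throughout these TFX examples is not shown on this page. In the TFX taxi utilities it typically densifies a [None, 1] SparseTensor, filling missing entries with '' or 0; a sketch along those lines (not the exact source):

def _fill_in_missing(x):
  """Replace missing values in a SparseTensor and densify it.

  Fills in missing values of x with '' or 0, and converts a SparseTensor of
  shape [None, 1] to a dense tensor of shape [None].
  """
  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.sparse.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)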
Example #13
Source File: trainer_module.py    From tfx with Apache License 2.0
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - TensorFlow graph which parses raw untransformed features and applies
        the tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(
      features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
Example #14
Source File: output_wrapper.py    From transform with Apache License 2.0
def transform_features_layer(self):
    """Creates a `TransformFeaturesLayer` from this transform output.

    If a `TransformFeaturesLayer` has already been created for self, the same
    one will be returned.

    Returns:
      A `TransformFeaturesLayer` instance.
    """
    if self._transform_features_layer is None:
      self._transform_features_layer = TransformFeaturesLayer(self)
    return self._transform_features_layer 
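A hedged sketch of using the layer with a native Keras model at serving time; working_dir and raw_features are placeholders:

tf_transform_output = tft.TFTransformOutput(working_dir)
tft_layer = tf_transform_output.transform_features_layer()
# Inside a tf.function or a Keras model's call():
# transformed_features = tft_layer(raw_features)  # dict in, dict out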
Example #15
Source File: taxi_utils_slack.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
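The label construction above can be exercised in isolation. A standalone snippet running the same ops on made-up values:

import tensorflow as tf

taxi_fare = tf.constant([10.0, float('nan'), 5.0])
tips = tf.constant([3.0, 1.0, 0.5])
label = tf.compat.v1.where(
    tf.math.is_nan(taxi_fare),
    tf.cast(tf.zeros_like(taxi_fare), tf.int64),
    tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))
# Expected [1, 0, 0]: 3.0 > 2.0 -> 1; NaN fare -> 0; 0.5 <= 1.0 -> 0.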
Example #16
Source File: taxi_utils_native_keras.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

  return outputs


# TFX Trainer will call this function. 
Example #17
Source File: taxi_utils.py    From tfx with Apache License 2.0
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - TensorFlow graph which parses raw untransformed features and applies
        the tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(
      features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
Example #18
Source File: taxi_utils.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
Example #19
Source File: taxi_utils_bqml.py    From tfx with Apache License 2.0
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - TensorFlow graph which parses raw untransformed features and applies
        the tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
Example #20
Source File: output_wrapper.py    From transform with Apache License 2.0
def load_transform_graph(self):
    """Load the transform graph without replacing any placeholders.

    This is necessary to ensure that variables in the transform graph are
    included in the training checkpoint when using tf.Estimator.  This should
    be called in the training input_fn.
    """
    saved_transform_io.partially_apply_saved_transform_internal(
        self.transform_savedmodel_dir, {}) 
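A hedged sketch of the training input_fn this docstring refers to; file_pattern and batch_size are placeholders, and transformed_feature_spec() is the companion TFTransformOutput accessor:

def _input_fn(file_pattern, tf_transform_output, batch_size=200):
  # Calling load_transform_graph() pulls the transform graph's variables into
  # the training graph so they are saved in the checkpoint.
  tf_transform_output.load_transform_graph()
  return tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      features=tf_transform_output.transformed_feature_spec(),
      reader=tf.data.TFRecordDataset)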
Example #21
Source File: taxi_utils_bqml.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
Example #22
Source File: taxi_utils.py    From pipelines with Apache License 2.0
def _eval_input_receiver_fn(tf_transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    tf_transform_output: A TFTransformOutput.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - TensorFlow graph which parses raw untransformed features and applies
        the tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.parse_example(serialized_tf_example, raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  transformed_features = tf_transform_output.transform_raw_features(
      features)

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
Example #23
Source File: output_wrapper.py    From transform with Apache License 2.0
def raw_metadata(self):
    """A DatasetMetadata.

    Note: raw_metadata is not guaranteed to exist in the output of
    tf.transform, so using this property could fail if raw_metadata is not
    present in the TFTransformOutput.

    Returns:
      A DatasetMetadata
    """
    if self._raw_metadata is None:
      self._raw_metadata = metadata_io.read_metadata(
          os.path.join(self._transform_output_dir, self.RAW_METADATA_DIR))
    return self._raw_metadata 
Example #24
Source File: taxi_utils.py    From pipelines with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT,
        always_return_num_quantiles=False)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
Example #25
Source File: example_proto_coder.py    From transform with Apache License 2.0
def decode(self, example_proto):
    """Decode tf.Example as a tf.transform encoded dict."""
    if self._serialized:
      example = self._decode_example_cache
      example.ParseFromString(example_proto)
    else:
      example = example_proto

    feature_map = example.features.feature
    return {feature_handler.name: feature_handler.parse_value(feature_map)
            for feature_handler in self._feature_handlers} 
Example #26
Source File: output_wrapper.py    From transform with Apache License 2.0
def post_transform_statistics_path(self):
    """Returns the path to the post-transform datum statistics.

    Note: post_transform_statistics is not guaranteed to exist in the output of
    tf.transform and hence using this could fail, if post_transform statistics
    is not present in TFTransformOutput.
    """
    return os.path.join(
        self._transform_output_dir, self.POST_TRANSFORM_FEATURE_STATS_PATH)


# TODO(zoyahav): Use register_keras_serializable directly once we no longer support
# TF<2.1. 
Example #27
Source File: analysis_graph_builder.py    From transform with Apache License 2.0
def get_analysis_dataset_keys(
    preprocessing_fn, specs, dataset_keys, input_cache):
  """Computes the dataset keys that are required in order to perform analysis.

  Args:
    preprocessing_fn: A tf.transform preprocessing_fn.
    specs: A dict of feature name to feature specification or tf.TypeSpecs.
    dataset_keys: A set of strings which are dataset keys; they uniquely
      identify these datasets across analysis runs.
    input_cache: A cache dictionary.

  Returns:
    A set of dataset keys that are required for analysis.
  """
  transform_fn_future, _ = _build_analysis_graph_for_inspection(
      preprocessing_fn, specs, dataset_keys, input_cache)

  result = set()
  inspect_visitor = _InspectVisitor(result)
  inspect_traverser = nodes.Traverser(inspect_visitor)
  _ = inspect_traverser.visit_value_node(transform_fn_future)

  # If a flattened dataset key is present, a flattened version of the entire
  # dataset is required, so all of the given dataset_keys are returned.
  if any(k.is_flattened_dataset_key() for k in result):
    result = dataset_keys
  return result 
Example #28
Source File: taxi_utils_solution.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs


# TFX Trainer will call this function. 
Example #29
Source File: taxi_utils_slack.py    From tfx with Apache License 2.0
def _eval_input_receiver_fn(transform_output, schema):
  """Build everything needed for the tf-model-analysis to run the model.

  Args:
    transform_output: directory in which the tf-transform model was written
      during the preprocessing step.
    schema: the schema of the input data.

  Returns:
    EvalInputReceiver function, which contains:
      - TensorFlow graph which parses raw untransformed features and applies
        the tf-transform preprocessing operators.
      - Set of raw, untransformed features.
      - Label against which predictions will be compared.
  """
  # Notice that the inputs are raw features, not transformed features here.
  raw_feature_spec = _get_raw_feature_spec(schema)

  serialized_tf_example = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_example_tensor')

  # Add a parse_example operator to the tensorflow graph, which will parse
  # raw, untransformed, tf examples.
  features = tf.io.parse_example(
      serialized=serialized_tf_example, features=raw_feature_spec)

  # Now that we have our raw examples, process them through the tf-transform
  # function computed during the preprocessing step.
  _, transformed_features = (
      saved_transform_io.partially_apply_saved_transform(
          os.path.join(transform_output, transform_fn_io.TRANSFORM_FN_DIR),
          features))

  # The key name MUST be 'examples'.
  receiver_tensors = {'examples': serialized_tf_example}

  # NOTE: Model is driven by transformed features (since training works on the
  # materialized output of TFT), but slicing will happen on raw features.
  features.update(transformed_features)

  return tfma.export.EvalInputReceiver(
      features=features,
      receiver_tensors=receiver_tensors,
      labels=transformed_features[_transformed_name(_LABEL_KEY)]) 
Example #30
Source File: preprocessing.py    From code-snippets with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting NaNs to the mean.
    outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[
        taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

  for key in taxi.BUCKET_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = transform.bucketize(
        _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
  outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs