Python Examples of tensorflow_transform.scale_to_z_score

Source File: iris_utils_native_keras.py From tfx with Apache License 2.0

6 votes

def preprocessing_fn(inputs):
  """Standardizes every feature column and forwards the label unchanged.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  for feature_key in _FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(inputs[feature_key])
    transformed[_transformed_name(feature_key)] = z_scored

  # TODO(b/157064428): Support label transformation for Keras.
  # The label is copied as-is; transforming it would make evaluation wrong.
  raw_label = inputs[_LABEL_KEY]
  transformed[_transformed_name(_LABEL_KEY)] = raw_label

  return transformed


# TFX Tuner will call this function.

Source File: iris_utils_sklearn.py From tfx with Apache License 2.0

6 votes

def preprocessing_fn(inputs):
  """Standardizes every feature column and forwards the label unchanged.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  for feature_key in _FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(inputs[feature_key])
    # The scaled feature is expected to be a dense tensor.
    assert isinstance(z_scored, tf.Tensor)
    transformed[_transformed_name(feature_key)] = z_scored

  # The label is passed through without any transformation.
  raw_label = inputs[_LABEL_KEY]
  transformed[_transformed_name(_LABEL_KEY)] = raw_label

  return transformed


# TFX Trainer will call this function.

Source File: tft_test.py From spotify-tensorflow with Apache License 2.0

5 votes

def dummy_preprocessing_fn(inputs):
    """Return a one-entry dict holding the z-scored ``test_feature`` input."""
    scaled = tft.scale_to_z_score(inputs["test_feature"])
    return {"test_feature_fx": scaled}

Source File: transform_module.py From tfx with Apache License 2.0

5 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary big-tipper label.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_identity` and `_fill_in_missing`,
  # then scaled to a z-score.
  for feature_key in _DENSE_FLOAT_FEATURE_KEYS:
    dense_values = _fill_in_missing(_identity(inputs[feature_key]))
    z_scored = tft.scale_to_z_score(dense_values)
    transformed[_transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in _VOCAB_FEATURE_KEYS:
    vocab_ids = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)
    transformed[_transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `_FEATURE_BUCKET_COUNT` buckets.
  for feature_key in _BUCKET_FEATURE_KEYS:
    bucket_ids = tft.bucketize(
        _fill_in_missing(inputs[feature_key]), _FEATURE_BUCKET_COUNT)
    transformed[_transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in _CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[_transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[_FARE_KEY])
  tip = _fill_in_missing(inputs[_LABEL_KEY])
  fare_is_nan = tf.math.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.2))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed

Source File: preprocessing.py From tfx with Apache License 2.0

5 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and forwards the label unchanged.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in features.DENSE_FLOAT_FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[feature_key]))
    transformed[features.transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in features.VOCAB_FEATURE_KEYS:
    vocab_ids = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=features.VOCAB_SIZE,
        num_oov_buckets=features.OOV_SIZE)
    transformed[features.transformed_name(feature_key)] = vocab_ids

  # Each bucket feature is paired positionally with its own bucket count.
  bucket_specs = zip(features.BUCKET_FEATURE_KEYS,
                     features.BUCKET_FEATURE_BUCKET_COUNT)
  for feature_key, bucket_count in bucket_specs:
    bucket_ids = tft.bucketize(
        _fill_in_missing(inputs[feature_key]), bucket_count)
    transformed[features.transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in features.CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[features.transformed_name(feature_key)] = filled

  # TODO(b/157064428): Support label transformation for Keras.
  # The label is copied as-is; transforming it would make evaluation wrong.
  raw_label = inputs[features.LABEL_KEY]
  transformed[features.transformed_name(features.LABEL_KEY)] = raw_label

  return transformed

Source File: taxi_utils_slack.py From tfx with Apache License 2.0

5 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary big-tipper label.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in _DENSE_FLOAT_FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[feature_key]))
    transformed[_transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in _VOCAB_FEATURE_KEYS:
    vocab_ids = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)
    transformed[_transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `_FEATURE_BUCKET_COUNT` buckets.
  for feature_key in _BUCKET_FEATURE_KEYS:
    bucket_ids = tft.bucketize(
        _fill_in_missing(inputs[feature_key]), _FEATURE_BUCKET_COUNT)
    transformed[_transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in _CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[_transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[_FARE_KEY])
  tip = _fill_in_missing(inputs[_LABEL_KEY])
  fare_is_nan = tf.math.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.2))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed

Source File: taxi_utils.py From tfx with Apache License 2.0

5 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary big-tipper label.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in _DENSE_FLOAT_FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[feature_key]))
    transformed[_transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in _VOCAB_FEATURE_KEYS:
    vocab_ids = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)
    transformed[_transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `_FEATURE_BUCKET_COUNT` buckets.
  for feature_key in _BUCKET_FEATURE_KEYS:
    bucket_ids = tft.bucketize(
        _fill_in_missing(inputs[feature_key]), _FEATURE_BUCKET_COUNT)
    transformed[_transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in _CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[_transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[_FARE_KEY])
  tip = _fill_in_missing(inputs[_LABEL_KEY])
  fare_is_nan = tf.math.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.2))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed

Source File: taxi_utils_bqml.py From tfx with Apache License 2.0

5 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary big-tipper label.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in _DENSE_FLOAT_FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[feature_key]))
    transformed[_transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in _VOCAB_FEATURE_KEYS:
    vocab_ids = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)
    transformed[_transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `_FEATURE_BUCKET_COUNT` buckets.
  for feature_key in _BUCKET_FEATURE_KEYS:
    bucket_ids = tft.bucketize(
        _fill_in_missing(inputs[feature_key]), _FEATURE_BUCKET_COUNT)
    transformed[_transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in _CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[_transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[_FARE_KEY])
  tip = _fill_in_missing(inputs[_LABEL_KEY])
  fare_is_nan = tf.math.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.2))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed

Source File: taxi_utils.py From pipelines with Apache License 2.0

5 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary big-tipper label.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in _DENSE_FLOAT_FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[feature_key]))
    transformed[_transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in _VOCAB_FEATURE_KEYS:
    vocab_ids = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)
    transformed[_transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `_FEATURE_BUCKET_COUNT` buckets,
  # with `always_return_num_quantiles` explicitly switched off.
  for feature_key in _BUCKET_FEATURE_KEYS:
    bucket_ids = tft.bucketize(
        _fill_in_missing(inputs[feature_key]),
        _FEATURE_BUCKET_COUNT,
        always_return_num_quantiles=False)
    transformed[_transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in _CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[_transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[_FARE_KEY])
  tip = _fill_in_missing(inputs[_LABEL_KEY])
  fare_is_nan = tf.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.2))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[_transformed_name(_LABEL_KEY)] = tf.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed

Source File: mappers.py From transform with Apache License 2.0

5 votes

def scale_to_z_score(x, elementwise=False, name=None, output_dtype=None):
  """Standardizes a column to mean 0 and variance 1.

  The z-score is obtained by subtracting the mean and dividing by the
  standard deviation. The standard deviation used here comes from the biased
  variance (0 delta degrees of freedom), as computed by analyzers.var.

  Args:
    x: A numeric `Tensor` or `SparseTensor`.
    elementwise: If true, each element of the tensor is scaled independently;
        otherwise the mean and variance of the whole tensor are used.
    name: (Optional) A name for this operation.
    output_dtype: (Optional) If not None, casts the output tensor to this type.

  Returns:
    A `Tensor` or `SparseTensor` holding the input column scaled to mean 0 and
    variance 1 (standard deviation 1), i.e. (x - mean(x)) / std_dev(x).
    If `x` is floating point, the mean will have the same type as `x`. If `x` is
    integral, the output is cast to tf.float32.

    Note that TFLearn generally permits only tf.int64 and tf.float32, so casting
    this scaler's output may be necessary.
  """
  with tf.compat.v1.name_scope(name, 'scale_to_z_score'):
    # No key and no key vocabulary file: this is the non-per-key variant.
    scaled = _scale_to_z_score_internal(
        x=x,
        key=None,
        elementwise=elementwise,
        key_vocabulary_filename=None,
        output_dtype=output_dtype)
  return scaled

Source File: impl_test.py From transform with Apache License 2.0

5 votes

def testScaleToZScore(self, input_data, output_data, elementwise):
    # Checks both the dtype and the values produced by tft.scale_to_z_score
    # for the given input/expected arrays.
    self._SkipIfExternalEnvironmentAnd(
        self._UseTFXIO(), 'Skipping large test cases; b/147698868')

    def preprocessing_fn(inputs):
      # Cast to the dtype under test before scaling.
      typed_x = tf.cast(inputs['x'], tf.as_dtype(input_data.dtype))
      scaled = tft.scale_to_z_score(typed_x, elementwise=elementwise)
      self.assertEqual(scaled.dtype, tf.as_dtype(output_data.dtype))
      # Results are always compared as float32.
      return {'x_scaled': tf.cast(scaled, tf.float32)}

    input_rows = [{'x': row} for row in input_data]
    expected_rows = [{'x_scaled': row} for row in output_data]

    input_dtype = tft_unit.canonical_numeric_dtype(
        tf.as_dtype(input_data.dtype))
    input_metadata = tft_unit.metadata_from_feature_spec({
        'x': tf.io.FixedLenFeature(input_data.shape[1:], input_dtype),
    })
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'x_scaled': tf.io.FixedLenFeature(output_data.shape[1:], tf.float32),
    })

    self.assertAnalyzeAndTransformResults(
        input_rows, input_metadata, preprocessing_fn, expected_rows,
        expected_metadata)

Source File: preprocessing.py From kale with Apache License 2.0

5 votes

def preprocess(inputs):
    """Builds the transformed features and the boolean big-tipper label.

    Used as the tf.transform preprocessing callback. Outputs are stored under
    the same keys as the inputs they were derived from.

    Args:
      inputs: dict mapping each raw feature key to its untransformed feature.
    Returns:
      Dict mapping each feature key to its transformed feature.
    """
    transformed = {}

    # Dense float features are scaled to a z-score.
    for feature_key in DENSE_FLOAT_FEATURE_KEYS:
        transformed[feature_key] = transform.scale_to_z_score(
            inputs[feature_key])

    # Vocabulary features are mapped to integer ids; non-string inputs are
    # converted to strings first.
    for feature_key in VOCAB_FEATURE_KEYS:
        raw_feature = inputs[feature_key]
        if raw_feature.dtype == tf.string:
            as_text = raw_feature
        else:
            as_text = tf.as_string(raw_feature)
        transformed[feature_key] = transform.string_to_int(
            as_text,
            vocab_filename='vocab_' + feature_key,
            top_k=VOCAB_SIZE,
            num_oov_buckets=OOV_SIZE)

    # Bucket features are bucketized into FEATURE_BUCKET_COUNT buckets.
    for feature_key in BUCKET_FEATURE_KEYS:
        transformed[feature_key] = transform.bucketize(
            inputs[feature_key], FEATURE_BUCKET_COUNT)

    # Categorical features are converted to int64.
    for feature_key in CATEGORICAL_FEATURE_KEYS:
        transformed[feature_key] = tf.to_int64(inputs[feature_key])

    # Label: true when the fare is not NaN and the tip is more than 20% of
    # the fare.
    fare = inputs[FARE_KEY]
    tip = inputs[LABEL_KEY]
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    fare_is_valid = tf.logical_not(tf.is_nan(fare))
    tip_is_big = tf.greater(tip, tip_threshold)
    transformed[LABEL_KEY] = tf.logical_and(fare_is_valid, tip_is_big)

    return transformed

Source File: main.py From spotify-tensorflow with Apache License 2.0

5 votes

def preprocessing_fn(inputs):
    """Build the transformed features and the binary big-tipper label.

    Takes a dict of raw features keyed by feature name and returns a dict of
    transformed features keyed by ``taxi.transformed_name`` of each key.
    """
    transformed = {}

    # Dense float features are run through `taxi.fill_in_missing`, then
    # scaled to a z-score.
    for feature_key in taxi.DENSE_FLOAT_FEATURE_KEYS:
        z_scored = tft.scale_to_z_score(
            taxi.fill_in_missing(inputs[feature_key]))
        transformed[taxi.transformed_name(feature_key)] = z_scored

    # Vocabulary features get a computed vocabulary applied to them.
    for feature_key in taxi.VOCAB_FEATURE_KEYS:
        vocab_ids = tft.compute_and_apply_vocabulary(
            taxi.fill_in_missing(inputs[feature_key]),
            top_k=10,
            num_oov_buckets=10)
        transformed[taxi.transformed_name(feature_key)] = vocab_ids

    # Bucket features are bucketized into 10 buckets.
    for feature_key in taxi.BUCKET_FEATURE_KEYS:
        bucket_ids = tft.bucketize(
            taxi.fill_in_missing(inputs[feature_key]), num_buckets=10)
        transformed[taxi.transformed_name(feature_key)] = bucket_ids

    # Categorical features only go through `taxi.fill_in_missing`.
    for feature_key in taxi.CATEGORICAL_FEATURE_KEYS:
        filled = taxi.fill_in_missing(inputs[feature_key])
        transformed[taxi.transformed_name(feature_key)] = filled

    # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
    # A NaN fare always yields 0.
    fare = taxi.fill_in_missing(inputs[taxi.FARE_KEY])
    tip = taxi.fill_in_missing(inputs[taxi.LABEL_KEY])
    fare_is_nan = tf.is_nan(fare)
    zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
    tip_threshold = tf.multiply(fare, tf.constant(0.2))
    big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
    transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        fare_is_nan, zero_label, big_tipper)

    return transformed

Source File: mappers.py From transform with Apache License 2.0

4 votes

def _scale_to_z_score_internal(
    x, key, elementwise, key_vocabulary_filename, output_dtype):
  """Implementation for scale_to_z_score.

  Computes (x - mean) / sqrt(var), leaving `x - mean` undivided wherever the
  standard deviation is zero.

  Args:
    x: The input to scale. `tf.SparseTensor` inputs are handled through their
        `values`; any other input is used as-is.
    key: If None, one mean/variance is computed for `x`; otherwise the mean
        and variance are computed per key.
    elementwise: If true, statistics are not reduced over instance dimensions.
        Not supported together with a non-None `key`.
    key_vocabulary_filename: Forwarded to the per-key analyzer. When None, the
        analyzer result is unpacked as (vocab, means, vars); otherwise it is
        applied through `tf_utils.apply_per_key_vocabulary`.
    output_dtype: Forwarded to the mean/variance analyzers.

  Returns:
    The result of applying the wrapper obtained from
    `_make_sparse_tensor_wrapper_if_sparse(x)` to the scaled values.

  Raises:
    NotImplementedError: If `key` is not None and `elementwise` is true.
  """
  # x_mean will be float16, float32, or float64, depending on type of x
  if key is None:
    # Global statistics; reduce over instance dims unless scaling elementwise.
    x_mean, x_var = analyzers._mean_and_var(  # pylint: disable=protected-access
        x,
        reduce_instance_dims=not elementwise,
        output_dtype=output_dtype)
  else:
    if elementwise:
      raise NotImplementedError('Per-key elementwise reduction not supported')

    mean_and_var_per_key_result = analyzers._mean_and_var_per_key(  # pylint: disable=protected-access
        x, key, key_vocabulary_filename=key_vocabulary_filename,
        output_dtype=output_dtype)

    if key_vocabulary_filename is None:
      # The analyzer result is a (vocab, means, vars) triple that is mapped
      # onto `x` by key.
      key_vocab, key_means, key_vars = mean_and_var_per_key_result
      x_mean, x_var = tf_utils.map_per_key_reductions((key_means, key_vars),
                                                      key, key_vocab, x)
    else:
      # The looked-up result carries the mean in column 0 and the variance in
      # column 1.
      mean_var_for_key = tf_utils.apply_per_key_vocabulary(
          mean_and_var_per_key_result, key, target_ndims=x.get_shape().ndims)
      x_mean, x_var = (mean_var_for_key[:, 0], mean_var_for_key[:, 1])

  # Built from the original `x` before `x_values` is swapped for the sparse
  # values below; presumably re-wraps the result as a SparseTensor when `x`
  # is sparse — confirm against the helper's definition.
  compose_result_fn = _make_sparse_tensor_wrapper_if_sparse(x)
  x_values = x

  if isinstance(x, tf.SparseTensor):
    x_values = x.values
    if elementwise:
      # Only supports SparseTensors with rank 2.
      x.get_shape().assert_has_rank(2)

      # Pick, for every sparse value, the statistics at its second index
      # (`indices[:, 1]`).
      x_mean = tf.gather(x_mean, x.indices[:, 1])
      x_var = tf.gather(x_var, x.indices[:, 1])

  numerator = tf.cast(x_values, x_mean.dtype) - x_mean
  denominator = tf.sqrt(x_var)
  # True wherever the standard deviation is non-zero, i.e. dividing is safe.
  cond = tf.not_equal(denominator, 0)

  if cond.shape.as_list() != x_values.shape.as_list():
    # Repeats cond when necessary across the batch dimension for it to be
    # compatible with the shape of numerator.
    cond = tf.cast(
        tf.zeros_like(numerator) + tf.cast(cond, numerator.dtype),
        dtype=tf.bool)

  # Where the standard deviation is zero, keep the centered value undivided.
  deviation_values = tf.where(cond, tf.divide(numerator, denominator),
                              numerator)
  return compose_result_fn(deviation_values)

Source File: mappers.py From transform with Apache License 2.0

4 votes

def scale_to_z_score_per_key(x,
                             key,
                             elementwise=False,
                             key_vocabulary_filename=None,
                             name=None,
                             output_dtype=None):
  """Returns a standardized column with mean 0 and variance 1, grouped per key.

  Scaling to z-score subtracts out the mean and divides by standard deviation.
  Note that the standard deviation computed here is based on the biased variance
  (0 delta degrees of freedom), as computed by analyzers.var.

  Args:
    x: A numeric `Tensor` or `SparseTensor`.
    key: A Tensor or `SparseTensor` of dtype tf.string. Must not be None (use
        `tft.scale_to_z_score` for un-keyed scaling) and must meet one of the
        following conditions:
        1. Both x and key are dense,
        2. Both x and key are sparse and `key` must exactly match `x` in
        everything except values,
        3. The axis=1 index of each x matches its index of dense key.
    elementwise: If true, scales each element of the tensor independently;
        otherwise uses the mean and variance of the whole tensor.
        Currently, not supported for per-key operations.
    key_vocabulary_filename: (Optional) The file name for the per-key file.
      If None, this combiner will assume the keys fit in memory and will not
      store the analyzer result in a file. If '', a file name will be chosen
      based on the current TensorFlow scope. If not '', it should be unique
      within a given preprocessing function.
    name: (Optional) A name for this operation.
    output_dtype: (Optional) If not None, casts the output tensor to this type.

  Returns:
    A `Tensor` or `SparseTensor` containing the input column scaled to mean 0
    and variance 1 (standard deviation 1), grouped per key.

    That is, for all keys k: (x - mean(x)) / std_dev(x) for all x with key k.
    If `x` is floating point, the mean will have the same type as `x`. If `x` is
    integral, the output is cast to tf.float32.

    Note that TFLearn generally permits only tf.int64 and tf.float32, so casting
    this scaler's output may be necessary.

  Raises:
    ValueError: If `key` is None.
    NotImplementedError: If `elementwise` is true (per-key elementwise
      reduction is not supported by the implementation).
  """
  # Reject a missing key up front. The previous code only constructed an
  # exception object without raising it, so the intended guard never fired.
  if key is None:
    raise ValueError('key is None, call `tft.scale_to_z_score` instead')

  with tf.compat.v1.name_scope(name, 'scale_to_z_score_per_key'):
    return _scale_to_z_score_internal(
        x=x,
        key=key,
        elementwise=elementwise,
        key_vocabulary_filename=key_vocabulary_filename,
        output_dtype=output_dtype)

Source File: taxi_utils_solution.py From tfx with Apache License 2.0

4 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary big-tipper label.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in _DENSE_FLOAT_FEATURE_KEYS:
    z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[feature_key]))
    transformed[_transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in _VOCAB_FEATURE_KEYS:
    vocab_ids = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)
    transformed[_transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `_FEATURE_BUCKET_COUNT` buckets.
  for feature_key in _BUCKET_FEATURE_KEYS:
    bucket_ids = tft.bucketize(
        _fill_in_missing(inputs[feature_key]), _FEATURE_BUCKET_COUNT)
    transformed[_transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in _CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[_transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[_FARE_KEY])
  tip = _fill_in_missing(inputs[_LABEL_KEY])
  fare_is_nan = tf.math.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.2))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[_transformed_name(_LABEL_KEY)] = tf.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed


# TFX Trainer will call this function.

Source File: impl_test.py From transform with Apache License 2.0

4 votes

def testScaleToZScoreSparse(self, input_dtype, elementwise):
    # Checks tft.scale_to_z_score on a sparse feature with 4 slots of which
    # only the first two are populated.
    self._SkipIfExternalEnvironmentAnd(
        self._UseTFXIO(), 'Skipping large test cases; b/147698868')

    def preprocessing_fn(inputs):
      typed_x = tf.cast(inputs['x'], input_dtype)
      sparse_z = tft.scale_to_z_score(typed_x, elementwise=elementwise)
      # Densify so that slots missing from the sparse input show up as NaN.
      dense_z = tf.sparse.to_dense(sparse_z, default_value=np.nan)
      dense_z.set_shape([None, 4])
      self.assertEqual(dense_z.dtype, _mean_output_dtype(input_dtype))
      return {'x_scaled': tf.cast(dense_z, tf.float32)}

    nan = float('nan')
    input_data = [
        {'idx': [0, 1], 'val': [-4, 10]},
        {'idx': [0, 1], 'val': [2, 4]},
    ]
    sparse_feature = tf.io.SparseFeature(
        'idx', 'val', tft_unit.canonical_numeric_dtype(input_dtype), 4)
    input_metadata = tft_unit.metadata_from_feature_spec({'x': sparse_feature})

    if elementwise:
      # Per-column statistics:
      #   Mean(x) = [-1, 7], Var(x) = [9, 9], StdDev(x) = [3, 3]
      expected_data = [
          # [(-4 + 1) / 3, (10 - 7) / 3]
          {'x_scaled': [-1., 1., nan, nan]},
          # [(2 + 1) / 3, (4 - 7) / 3]
          {'x_scaled': [1., -1., nan, nan]},
      ]
    else:
      # Whole-tensor statistics: Mean = 3, Var = 25, Std Dev = 5
      expected_data = [
          # [(-4 - 3) / 5, (10 - 3) / 5]
          {'x_scaled': [-1.4, 1.4, nan, nan]},
          # [(2 - 3) / 5, (4 - 3) / 5]
          {'x_scaled': [-.2, .2, nan, nan]},
      ]

    expected_metadata = tft_unit.metadata_from_feature_spec(
        {'x_scaled': tf.io.FixedLenFeature([4], tf.float32)})
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data,
        expected_metadata)

Source File: preprocessing.py From code-snippets with Apache License 2.0

4 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary big-tipper label.

  Used as the tf.transform preprocessing callback.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    z_scored = transform.scale_to_z_score(
        _fill_in_missing(inputs[feature_key]))
    transformed[taxi.transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in taxi.VOCAB_FEATURE_KEYS:
    vocab_ids = transform.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=taxi.VOCAB_SIZE,
        num_oov_buckets=taxi.OOV_SIZE)
    transformed[taxi.transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `taxi.FEATURE_BUCKET_COUNT` buckets.
  for feature_key in taxi.BUCKET_FEATURE_KEYS:
    bucket_ids = transform.bucketize(
        _fill_in_missing(inputs[feature_key]), taxi.FEATURE_BUCKET_COUNT)
    transformed[taxi.transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in taxi.CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[taxi.transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 20% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tip = _fill_in_missing(inputs[taxi.LABEL_KEY])
  fare_is_nan = tf.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.2))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed

Source File: preprocessing2.py From code-snippets with Apache License 2.0

4 votes

def preprocessing_fn(inputs):
  """Builds the transformed features and the binary tipper label.

  Used as the tf.transform preprocessing callback. The label threshold in
  this variant is 5% of the fare.

  Args:
    inputs: dict mapping each raw feature key to its untransformed feature.

  Returns:
    Dict mapping each transformed feature name to its transformed feature.
  """
  transformed = {}

  # Dense float features are run through `_fill_in_missing`, then scaled to
  # a z-score.
  for feature_key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    z_scored = transform.scale_to_z_score(
        _fill_in_missing(inputs[feature_key]))
    transformed[taxi.transformed_name(feature_key)] = z_scored

  # Vocabulary features get a computed vocabulary applied to them.
  for feature_key in taxi.VOCAB_FEATURE_KEYS:
    vocab_ids = transform.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[feature_key]),
        top_k=taxi.VOCAB_SIZE,
        num_oov_buckets=taxi.OOV_SIZE)
    transformed[taxi.transformed_name(feature_key)] = vocab_ids

  # Bucket features are bucketized into `taxi.FEATURE_BUCKET_COUNT` buckets.
  for feature_key in taxi.BUCKET_FEATURE_KEYS:
    bucket_ids = transform.bucketize(
        _fill_in_missing(inputs[feature_key]), taxi.FEATURE_BUCKET_COUNT)
    transformed[taxi.transformed_name(feature_key)] = bucket_ids

  # Categorical features only go through `_fill_in_missing`.
  for feature_key in taxi.CATEGORICAL_FEATURE_KEYS:
    filled = _fill_in_missing(inputs[feature_key])
    transformed[taxi.transformed_name(feature_key)] = filled

  # Label: int64 1 when the tip is more than 5% of the fare, otherwise 0.
  # A NaN fare always yields 0.
  fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tip = _fill_in_missing(inputs[taxi.LABEL_KEY])
  fare_is_nan = tf.is_nan(fare)
  zero_label = tf.cast(tf.zeros_like(fare), tf.int64)
  tip_threshold = tf.multiply(fare, tf.constant(0.05))
  big_tipper = tf.cast(tf.greater(tip, tip_threshold), tf.int64)
  transformed[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      fare_is_nan, zero_label, big_tipper)

  return transformed

Python tensorflow_transform.scale_to_z_score() Examples