Python tensorflow_transform.compute_and_apply_vocabulary() Examples

The following are 22 code examples of tensorflow_transform.compute_and_apply_vocabulary(), collected from open-source projects. The source file, project, and license for each example are noted above it.
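Before diving in, here is a minimal sketch of the typical call pattern inside a preprocessing_fn (the feature name 'color' and output key 'color_id' are illustrative, not taken from any example below):

import tensorflow as tf
import tensorflow_transform as tft

def preprocessing_fn(inputs):
  # During the analysis phase, tf.Transform scans the whole dataset and writes
  # a vocabulary file ordered by descending frequency; during the transform
  # phase, each string is replaced by its integer index in that vocabulary.
  # Out-of-vocabulary values map to default_value, which defaults to -1.
  return {
      'color_id': tft.compute_and_apply_vocabulary(
          inputs['color'], default_value=-1)
  }

The examples below combine the same mapper with tokenization, TF-IDF, frequency thresholds, and OOV buckets.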
Example #1
Source File: impl_test.py    From transform with Apache License 2.0
def testTFIDFNoData(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.compute_and_apply_vocabulary(
          tf.compat.v1.strings.split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }
    input_data = [{'a': ''}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})
    expected_transformed_data = [{'tf_idf': [], 'index': []}]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'tf_idf': tf.io.VarLenFeature(tf.float32),
        'index': tf.io.VarLenFeature(tf.int64)
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_transformed_data,
        expected_metadata) 
Example #2
Source File: vocabulary_integration_test.py    From transform with Apache License 2.0
def testVocabularyAnalyzerWithTokenization(self):
    def preprocessing_fn(inputs):
      return {
          'index':
              tft.compute_and_apply_vocabulary(
                  tf.compat.v1.strings.split(inputs['a']))
      }

    input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})
    expected_data = [{'index': [0, 0, 1]}, {'index': [0, 2, 1]}]

    expected_metadata = tft_unit.metadata_from_feature_spec({
        'index': tf.io.VarLenFeature(tf.int64),
    }, {
        'index': schema_pb2.IntDomain(min=-1, max=2, is_categorical=True),
    })
    self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                          preprocessing_fn, expected_data,
                                          expected_metadata) 
Example #3
Source File: simple_example.py    From transform with Apache License 2.0
def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
            preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data) 
Example #4
Source File: preprocess.py    From professional-services with Apache License 2.0
def _preprocess_tft(raw_data, user_freq, item_freq):
  """Creates vocabularies for users and items and maps their ids to ints.

  Args:
    raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
    user_freq: minimum frequency of a user to include it in the user vocab.
    item_freq: minimum frequency of an item to include it in the item vocab.

  Returns:
    A dict containing int ids corresponding to a user_id and item_id and other
      features: {$user_key: $user_id, $item_key: $item_id, ...}.
  """
  features = {feature: raw_data[feature] for feature in constants.BQ_FEATURES}
  tft_features = {
      constants.TFT_USER_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.USER_KEY],
          vocab_filename=constants.USER_VOCAB_NAME,
          frequency_threshold=user_freq,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ITEM_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.ITEM_KEY],
          vocab_filename=constants.ITEM_VOCAB_NAME,
          frequency_threshold=item_freq,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ARTIST_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.ARTIST_KEY],
          vocab_filename=constants.ARTIST_VOCAB_NAME,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_TAGS_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.TAGS_KEY],
          vocab_filename=constants.TAG_VOCAB_NAME,
          default_value=constants.TFT_DEFAULT_ID),
  }
  features.update(tft_features)
  return features 
Example #5
Source File: transform_module.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(_identity(inputs[key])))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
Example #6
Source File: preprocessing.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in features.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[features.transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in features.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=features.VOCAB_SIZE,
        num_oov_buckets=features.OOV_SIZE)

  for key, num_buckets in zip(features.BUCKET_FEATURE_KEYS,
                              features.BUCKET_FEATURE_BUCKET_COUNT):
    outputs[features.transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        num_buckets)

  for key in features.CATEGORICAL_FEATURE_KEYS:
    outputs[features.transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[features.transformed_name(
      features.LABEL_KEY)] = inputs[features.LABEL_KEY]

  return outputs 
Example #7
Source File: taxi_utils_native_keras.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

  return outputs


# TFX Trainer will call this function. 
Example #8
Source File: taxi_utils.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
Example #9
Source File: taxi_utils_bqml.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
Example #10
Source File: imdb_utils_native_keras.py    From tfx with Apache License 2.0
def _tokenize_review(review):
  """Tokenize the reviews by spliting the reviews.

  Constructing a vocabulary. Map the words to their frequency index in the
  vocabulary.

  Args:
    review: tensors containing the reviews. (batch_size/None, 1)

  Returns:
    Tokenized and padded review tensors. (batch_size/None, _MAX_LEN)
  """
  review_sparse = tf.strings.split(tf.reshape(review, [-1])).to_sparse()
  # tft.apply_vocabulary doesn't reserve 0 for oov words. In order to comply
  # with convention and use mask_zero in keras.embedding layer, set oov value
  # to _VOCAB_SIZE and padding value to -1. Then add 1 to all the tokens.
  review_indices = tft.compute_and_apply_vocabulary(
      review_sparse, default_value=_VOCAB_SIZE, top_k=_VOCAB_SIZE)
  dense = tf.sparse.to_dense(review_indices, default_value=-1)
  # TFX transform expects the transform result to be FixedLenFeature.
  padding_config = [[0, 0], [0, _MAX_LEN]]
  dense = tf.pad(dense, padding_config, 'CONSTANT', -1)
  padded = tf.slice(dense, [0, 0], [-1, _MAX_LEN])
  padded += 1
  return padded


# TFX Transform will call this function. 
Example #11
Source File: taxi_utils.py    From pipelines with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT,
        always_return_num_quantiles=False)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs 
Example #12
Source File: impl_test.py    From transform with Apache License 2.0
def testPipelineAPICounters(self):

    def preprocessing_fn(inputs):
      _ = tft.vocabulary(inputs['a'])
      return {
          'a_int': tft.compute_and_apply_vocabulary(inputs['a']),
          'x_scaled': tft.scale_to_0_1(inputs['x']),
          'y_scaled': tft.scale_to_0_1(inputs['y'])
      }

    with self._makeTestPipeline() as pipeline:
      input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
          'x': 4,
          'y': 5,
          'a': 'hello'
      }, {
          'x': 1,
          'y': 3,
          'a': 'world'
      }])
      metadata = tft_unit.metadata_from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.float32),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'a': tf.io.FixedLenFeature([], tf.string)
      })
      with beam_impl.Context(temp_dir=self.get_temp_dir()):
        input_data, metadata = self._MaybeConvertInputsToTFXIO(
            input_data, metadata)
        _ = ((input_data, metadata)
             | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

    metrics = pipeline.metrics
    self.assertMetricsCounterEqual(metrics, 'tft_analyzer_vocabulary', 1)
    self.assertMetricsCounterEqual(metrics, 'tft_mapper_scale_to_0_1', 2)
    self.assertMetricsCounterEqual(metrics,
                                   'tft_mapper_compute_and_apply_vocabulary', 1)
    # compute_and_apply_vocabulary implicitly calls apply_vocabulary.
    # We check that this nested call is not logged.
    self.assertMetricsCounterEqual(metrics, 'tft_mapper_apply_vocabulary', 0) 
Example #13
Source File: impl_test.py    From transform with Apache License 2.0
def testStringToTFIDFEmptyDoc(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.compute_and_apply_vocabulary(
          tf.compat.v1.strings.split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }
    input_data = [{'a': 'hello hello world'},
                  {'a': ''},
                  {'a': 'hello goodbye hello world'},
                  {'a': 'I like pie pie pie'}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})

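    # A note on the constants below (inferred from the values, not stated in
    # the original source): tft.tfidf computes the smoothed IDF
    # 1 + log((N + 1) / (df + 1)); with N = 4 documents, a term appearing in
    # 2 of them gets 1 + log(5/3) and a term appearing in 1 gets 1 + log(5/2).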
    log_5_over_2 = 1.91629073187
    log_5_over_3 = 1.51082562376
    expected_transformed_data = [{
        'tf_idf': [(2/3)*log_5_over_3, (1/3)*log_5_over_3],
        'index': [0, 2]
    }, {
        'tf_idf': [],
        'index': []
    }, {
        'tf_idf': [(2/4)*log_5_over_3, (1/4)*log_5_over_3, (1/4)*log_5_over_2],
        'index': [0, 2, 4]
    }, {
        'tf_idf': [(3/5)*log_5_over_2, (1/5)*log_5_over_2, (1/5)*log_5_over_2],
        'index': [1, 3, 5]
    }]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'tf_idf': tf.io.VarLenFeature(tf.float32),
        'index': tf.io.VarLenFeature(tf.int64)
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_transformed_data, expected_metadata) 
Example #14
Source File: impl_test.py    From transform with Apache License 2.0
def testStringToTFIDF(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.compute_and_apply_vocabulary(
          tf.compat.v1.strings.split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }
    input_data = [{'a': 'hello hello world'},
                  {'a': 'hello goodbye hello world'},
                  {'a': 'I like pie pie pie'}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})

    # IDFs: tft.tfidf uses the smoothed form 1 + log((N + 1) / (df + 1)),
    # where N is the corpus size (3 documents) and df is the number of
    # documents containing the term:
    # hello = 1 + log(4/3) = 1.28768
    # world = 1 + log(4/3)
    # goodbye = 1 + log(4/2) = 1.69314
    # I = 1 + log(4/2)
    # like = 1 + log(4/2)
    # pie = 1 + log(4/2)
    log_4_over_2 = 1.69314718056
    log_4_over_3 = 1.28768207245
    expected_transformed_data = [{
        'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
        'index': [0, 2]
    }, {
        'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3, (1/4)*log_4_over_2],
        'index': [0, 2, 4]
    }, {
        'tf_idf': [(3/5)*log_4_over_2, (1/5)*log_4_over_2, (1/5)*log_4_over_2],
        'index': [1, 3, 5]
    }]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'tf_idf': tf.io.VarLenFeature(tf.float32),
        'index': tf.io.VarLenFeature(tf.int64)
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_transformed_data, expected_metadata) 
Example #15
Source File: impl_test.py    From transform with Apache License 2.0
def testWithMoreThanDesiredBatchSize(self):
    def preprocessing_fn(inputs):
      return {
          'ab': tf.multiply(inputs['a'], inputs['b']),
          'i': tft.compute_and_apply_vocabulary(inputs['c'])
      }

    batch_size = 100
    num_instances = batch_size + 1
    input_data = [{
        'a': 2,
        'b': i,
        'c': '%.10i' % i,  # Front-padded to facilitate lexicographic sorting.
    } for i in range(num_instances)]
    input_metadata = tft_unit.metadata_from_feature_spec({
        'a': tf.io.FixedLenFeature([], tf.float32),
        'b': tf.io.FixedLenFeature([], tf.float32),
        'c': tf.io.FixedLenFeature([], tf.string)
    })
    expected_data = [{
        'ab': 2*i,
        'i': (len(input_data) - 1) - i,  # Due to reverse lexicographic sorting.
    } for i in range(len(input_data))]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'ab': tf.io.FixedLenFeature([], tf.float32),
        'i': tf.io.FixedLenFeature([], tf.int64),
    }, {
        'i':
            schema_pb2.IntDomain(
                min=-1, max=num_instances - 1, is_categorical=True)
    })
    self.assertAnalyzeAndTransformResults(
        input_data,
        input_metadata,
        preprocessing_fn,
        expected_data,
        expected_metadata,
        desired_batch_size=batch_size) 
Example #16
Source File: main.py    From spotify-tensorflow with Apache License 2.0
def preprocessing_fn(inputs):
    out = dict()

    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
        # Preserve this feature as a dense float, setting nan's to the mean.
        out[taxi.transformed_name(key)] = tft.scale_to_z_score(
            taxi.fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
        # Build a vocabulary for this feature.
        out[taxi.transformed_name(key)] = tft.compute_and_apply_vocabulary(
            taxi.fill_in_missing(inputs[key]), top_k=10, num_oov_buckets=10)

    for key in taxi.BUCKET_FEATURE_KEYS:
        out[taxi.transformed_name(key)] = tft.bucketize(taxi.fill_in_missing(inputs[key]),
                                                        num_buckets=10)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
        out[taxi.transformed_name(key)] = taxi.fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = taxi.fill_in_missing(inputs[taxi.FARE_KEY])
    tips = taxi.fill_in_missing(inputs[taxi.LABEL_KEY])
    out[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64)
    )

    return out 
Example #17
Source File: cached_impl_test.py    From transform with Apache License 2.0
def test_changing_constant_fails_cache(self, use_tfxio):

    def make_preprocessing_fn(string):

      def preprocessing_fn(inputs):
        constant_str = tf.tile(tf.constant([string]), tf.shape(inputs['s']))
        joined = tf.strings.join([inputs['s'], constant_str])
        return {'id': tft.compute_and_apply_vocabulary(joined)}

      return preprocessing_fn

    feature_spec = {'s': tf.io.FixedLenFeature([], tf.string)}
    input_data_dict = {
        analyzer_cache.DatasetKey('span-0'): [dict(s='a'),
                                              dict(s='b')]
    }

    run_result = self._run_pipeline(feature_spec, input_data_dict,
                                    make_preprocessing_fn('1st_run'),
                                    use_tfxio=use_tfxio)
    first_cache_output, p1 = run_result.cache_output, run_result.pipeline

    for key in input_data_dict:
      self.assertIn(key, first_cache_output)
      self.assertEqual(1, len(first_cache_output[key]))

    self.assertMetricsCounterEqual(p1.metrics, 'num_instances', 2)
    self.assertMetricsCounterEqual(p1.metrics, 'cache_entries_decoded', 0)
    self.assertMetricsCounterEqual(p1.metrics, 'cache_entries_encoded', 1)
    self.assertMetricsCounterEqual(p1.metrics, 'saved_models_created',
                                   _SINGLE_PHASE_NUM_SAVED_MODELS)

    run_result = self._run_pipeline(feature_spec, input_data_dict,
                                    make_preprocessing_fn('2nd_run'),
                                    use_tfxio=use_tfxio)
    second_cache_output, p2 = run_result.cache_output, run_result.pipeline

    # We expect a full output cache again because the changed constant in the
    # preprocessing_fn invalidated the first run's cache entry.
    for key in input_data_dict:
      self.assertIn(key, second_cache_output)
      self.assertEqual(1, len(second_cache_output[key]))

    self.assertMetricsCounterEqual(p2.metrics, 'num_instances', 2)
    self.assertMetricsCounterEqual(p2.metrics, 'cache_entries_decoded', 0)
    self.assertMetricsCounterEqual(p2.metrics, 'cache_entries_encoded', 1)
    self.assertMetricsCounterEqual(p2.metrics, 'saved_models_created',
                                   _SINGLE_PHASE_NUM_SAVED_MODELS) 
Example #18
Source File: vocabulary_integration_test.py    From transform with Apache License 2.0
def testComputeAndApplyVocabulary(
      self, x_data, x_feature_spec, index_data, index_feature_spec,
      index_domain, label_data=None, label_feature_spec=None,
      weight_data=None, weight_feature_spec=None,
      expected_vocab_file_contents=None, **kwargs):
    """Test tft.compute_and_apply_vocabulary with various inputs."""

    input_data = [{'x': x} for x in x_data]
    input_feature_spec = {'x': x_feature_spec}
    expected_data = [{'index': index} for index in index_data]
    expected_feature_spec = {'index': index_feature_spec}
    expected_domains = {'index': index_domain}

    if label_data is not None:
      for idx, label in enumerate(label_data):
        input_data[idx]['label'] = label
      input_feature_spec['label'] = label_feature_spec

    if weight_data is not None:
      for idx, weight in enumerate(weight_data):
        input_data[idx]['weights'] = weight
      input_feature_spec['weights'] = weight_feature_spec

    input_metadata = tft_unit.metadata_from_feature_spec(input_feature_spec)
    expected_metadata = tft_unit.metadata_from_feature_spec(
        expected_feature_spec, expected_domains)

    def preprocessing_fn(inputs):
      x = inputs['x']
      labels = inputs.get('label')
      weights = inputs.get('weights')
      index = tft.compute_and_apply_vocabulary(
          x, labels=labels, weights=weights, **kwargs)
      return {'index': index}

    self.assertAnalyzeAndTransformResults(
        input_data,
        input_metadata,
        preprocessing_fn,
        expected_data,
        expected_metadata,
        expected_vocab_file_contents=expected_vocab_file_contents)

  # Example on how to use the vocab frequency as part of the transform
  # function. 
Example #19
Source File: taxi_utils_solution.py    From tfx with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))

  return outputs


# TFX Trainer will call this function. 
Example #20
Source File: impl_test.py    From transform with Apache License 2.0
def testSavedModelWithAnnotations(self):
    """Test serialization/deserialization as a saved model with annotations."""
    def preprocessing_fn(inputs):
      # Bucketization applies annotations to the output schema
      return {
          'x_bucketized': tft.bucketize(inputs['x'], num_buckets=4),
          'y_vocab': tft.compute_and_apply_vocabulary(inputs['y']),
      }

    input_data = [{
        'x': 1,
        'y': 'foo',
    }, {
        'x': 2,
        'y': 'bar',
    }, {
        'x': 3,
        'y': 'foo',
    }, {
        'x': 4,
        'y': 'foo',
    }]
    input_metadata = tft_unit.metadata_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.string),
    })
    temp_dir = self.get_temp_dir()
    # Force a batch size of 1 to ensure that occurrences are correctly aggregated
    # across batches when computing the total vocabulary size.
    with beam_impl.Context(temp_dir=temp_dir, desired_batch_size=1):
      input_data, input_metadata = self._MaybeConvertInputsToTFXIO(
          input_data, input_metadata)
      transform_fn = ((input_data, input_metadata)
                      | beam_impl.AnalyzeDataset(preprocessing_fn))
      #  Write transform_fn to serialize annotation collections to SavedModel
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

    # Ensure that the annotations survive the round trip to SavedModel.
    tf_transform_output = tft.TFTransformOutput(temp_dir)
    savedmodel_dir = tf_transform_output.transform_savedmodel_dir
    schema = beam_impl._infer_metadata_from_saved_model(savedmodel_dir)._schema
    self.assertLen(schema.feature, 2)
    for feature in schema.feature:
      if feature.name == 'x_bucketized':
        self.assertLen(feature.annotation.extra_metadata, 1)
        for annotation in feature.annotation.extra_metadata:
          message = annotations_pb2.BucketBoundaries()
          annotation.Unpack(message)
          self.assertAllClose(list(message.boundaries), [2, 3, 4])
      elif feature.name == 'y_vocab':
        self.assertLen(feature.annotation.extra_metadata, 0)
      else:
        raise ValueError('Unexpected feature with metadata: {}'.format(
            feature.name))
    # Vocabularies create a top-level schema annotation for each vocab file.
    self.assertLen(schema.annotation.extra_metadata, 1)
    message = annotations_pb2.VocabularyMetadata()
    annotation = schema.annotation.extra_metadata[0]
    annotation.Unpack(message)
    self.assertEqual(message.unfiltered_vocabulary_size, 2) 
Example #21
Source File: preprocessing.py    From code-snippets with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[
        taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

  for key in taxi.BUCKET_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = transform.bucketize(
        _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
  outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs 
Example #22
Source File: preprocessing2.py    From code-snippets with Apache License 2.0
def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[
        taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

  for key in taxi.BUCKET_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = transform.bucketize(
        _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
  outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 5% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.05))),
          tf.int64))

  return outputs