Python tensorflow_transform.compute_and_apply_vocabulary() Examples
The following are 22 code examples of tensorflow_transform.compute_and_apply_vocabulary(), drawn from open source projects. The source file, project, license, and reader votes are noted above each example. You may also want to check out the other available functions and classes of the tensorflow_transform module.
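Before diving into the examples, here is a minimal sketch of the call itself: compute_and_apply_vocabulary computes a vocabulary over a feature during the analysis phase, then maps each value to its index in that vocabulary (ordered by descending frequency) during the transform phase. Out-of-vocabulary values map to default_value unless num_oov_buckets is set, in which case they are hashed into extra buckets. The feature name below is illustrative, not taken from the examples that follow:

import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  # 'color' is a hypothetical string feature. During analysis, tft builds a
  # frequency-ordered vocabulary; during transform, each value is replaced by
  # its integer index in that vocabulary.
  return {
      'color_id': tft.compute_and_apply_vocabulary(
          inputs['color'],
          top_k=1000,          # keep only the 1000 most frequent values
          num_oov_buckets=10,  # hash unseen values into 10 extra buckets
      ),
  }

The examples below exercise these and other parameters (frequency_threshold, vocab_filename, default_value, labels, weights) in real pipelines.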
Example #1
Source File: impl_test.py From transform with Apache License 2.0 | 6 votes

def testTFIDFNoData(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.compute_and_apply_vocabulary(
          tf.compat.v1.strings.split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }

    input_data = [{'a': ''}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})
    expected_transformed_data = [{'tf_idf': [], 'index': []}]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'tf_idf': tf.io.VarLenFeature(tf.float32),
        'index': tf.io.VarLenFeature(tf.int64)
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_transformed_data, expected_metadata)
Example #2
Source File: vocabulary_integration_test.py From transform with Apache License 2.0 | 6 votes

def testVocabularyAnalyzerWithTokenization(self):
    def preprocessing_fn(inputs):
      return {
          'index': tft.compute_and_apply_vocabulary(
              tf.compat.v1.strings.split(inputs['a']))
      }

    input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})
    expected_data = [{'index': [0, 0, 1]}, {'index': [0, 2, 1]}]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'index': tf.io.VarLenFeature(tf.int64),
    }, {
        'index': schema_pb2.IntDomain(min=-1, max=2, is_categorical=True),
    })
    self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                          preprocessing_fn, expected_data,
                                          expected_metadata)
Example #3
Source File: simple_example.py From transform with Apache License 2.0 | 5 votes

def main():
  def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    x = inputs['x']
    y = inputs['y']
    s = inputs['s']
    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = (x_centered * y_normalized)
    return {
        'x_centered': x_centered,
        'y_normalized': y_normalized,
        'x_centered_times_y_normalized': x_centered_times_y_normalized,
        's_integerized': s_integerized
    }

  raw_data = [
      {'x': 1, 'y': 1, 's': 'hello'},
      {'x': 2, 'y': 2, 's': 'world'},
      {'x': 3, 'y': 3, 's': 'hello'}
  ]

  raw_data_metadata = dataset_metadata.DatasetMetadata(
      schema_utils.schema_from_feature_spec({
          's': tf.io.FixedLenFeature([], tf.string),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'x': tf.io.FixedLenFeature([], tf.float32),
      }))

  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))

  transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable

  pprint.pprint(transformed_data)
Example #4
Source File: preprocess.py From professional-services with Apache License 2.0 | 5 votes

def _preprocess_tft(raw_data, user_freq, item_freq):
  """Creates vocabularies for users and items and maps their ids to ints.

  Args:
    raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
    user_freq: minimum frequency of a user to include it in the user vocab.
    item_freq: minimum frequency of an item to include it in the item vocab.

  Returns:
    A dict containing int ids corresponding to a user_id and item_id and
      other features: {$user_key: $user_id, $item_key: $item_id, ...}.
  """
  features = {feature: raw_data[feature] for feature in constants.BQ_FEATURES}
  tft_features = {
      constants.TFT_USER_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.USER_KEY],
          vocab_filename=constants.USER_VOCAB_NAME,
          frequency_threshold=user_freq,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ITEM_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.ITEM_KEY],
          vocab_filename=constants.ITEM_VOCAB_NAME,
          frequency_threshold=item_freq,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_ARTIST_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.ARTIST_KEY],
          vocab_filename=constants.ARTIST_VOCAB_NAME,
          default_value=constants.TFT_DEFAULT_ID),
      constants.TFT_TAGS_KEY: tft.compute_and_apply_vocabulary(
          raw_data[constants.TAGS_KEY],
          vocab_filename=constants.TAG_VOCAB_NAME,
          default_value=constants.TFT_DEFAULT_ID),
  }
  features.update(tft_features)
  return features
Example #5
Source File: transform_module.py From tfx with Apache License 2.0 | 5 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(_identity(inputs[key])))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs
Example #6
Source File: preprocessing.py From tfx with Apache License 2.0 | 5 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in features.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[features.transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in features.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=features.VOCAB_SIZE,
        num_oov_buckets=features.OOV_SIZE)

  for key, num_buckets in zip(features.BUCKET_FEATURE_KEYS,
                              features.BUCKET_FEATURE_BUCKET_COUNT):
    outputs[features.transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), num_buckets)

  for key in features.CATEGORICAL_FEATURE_KEYS:
    outputs[features.transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[features.transformed_name(
      features.LABEL_KEY)] = inputs[features.LABEL_KEY]

  return outputs
Example #7
Source File: taxi_utils_native_keras.py From tfx with Apache License 2.0 | 5 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # TODO(b/157064428): Support label transformation for Keras.
  # Do not apply label transformation as it will result in wrong evaluation.
  outputs[_transformed_name(_LABEL_KEY)] = inputs[_LABEL_KEY]

  return outputs


# TFX Trainer will call this function.
Example #8
Source File: taxi_utils.py From tfx with Apache License 2.0 | 5 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs
Example #9
Source File: taxi_utils_bqml.py From tfx with Apache License 2.0 | 5 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.compat.v1.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs
Example #10
Source File: imdb_utils_native_keras.py From tfx with Apache License 2.0 | 5 votes

def _tokenize_review(review):
  """Tokenize the reviews by splitting them, construct a vocabulary, and map
  each word to its frequency index in the vocabulary.

  Args:
    review: tensors containing the reviews. (batch_size/None, 1)

  Returns:
    Tokenized and padded review tensors. (batch_size/None, _MAX_LEN)
  """
  review_sparse = tf.strings.split(tf.reshape(review, [-1])).to_sparse()
  # tft.apply_vocabulary doesn't reserve 0 for oov words. In order to comply
  # with convention and use mask_zero in keras.embedding layer, set oov value
  # to _VOCAB_SIZE and padding value to -1. Then add 1 to all the tokens.
  review_indices = tft.compute_and_apply_vocabulary(
      review_sparse, default_value=_VOCAB_SIZE, top_k=_VOCAB_SIZE)
  dense = tf.sparse.to_dense(review_indices, default_value=-1)
  # TFX transform expects the transform result to be FixedLenFeature.
  padding_config = [[0, 0], [0, _MAX_LEN]]
  dense = tf.pad(dense, padding_config, 'CONSTANT', -1)
  padded = tf.slice(dense, [0, 0], [-1, _MAX_LEN])
  padded += 1
  return padded


# TFX Transform will call this function.
Example #11
Source File: taxi_utils.py From pipelines with Apache License 2.0 | 5 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]),
        _FEATURE_BUCKET_COUNT,
        always_return_num_quantiles=False)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs
Example #12
Source File: impl_test.py From transform with Apache License 2.0 | 5 votes

def testPipelineAPICounters(self):
    def preprocessing_fn(inputs):
      _ = tft.vocabulary(inputs['a'])
      return {
          'a_int': tft.compute_and_apply_vocabulary(inputs['a']),
          'x_scaled': tft.scale_to_0_1(inputs['x']),
          'y_scaled': tft.scale_to_0_1(inputs['y'])
      }

    with self._makeTestPipeline() as pipeline:
      input_data = pipeline | 'CreateTrainingData' >> beam.Create([{
          'x': 4,
          'y': 5,
          'a': 'hello'
      }, {
          'x': 1,
          'y': 3,
          'a': 'world'
      }])
      metadata = tft_unit.metadata_from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.float32),
          'y': tf.io.FixedLenFeature([], tf.float32),
          'a': tf.io.FixedLenFeature([], tf.string)
      })
      with beam_impl.Context(temp_dir=self.get_temp_dir()):
        input_data, metadata = self._MaybeConvertInputsToTFXIO(
            input_data, metadata)
        _ = ((input_data, metadata)
             | 'AnalyzeDataset' >> beam_impl.AnalyzeDataset(preprocessing_fn))

    metrics = pipeline.metrics
    self.assertMetricsCounterEqual(metrics, 'tft_analyzer_vocabulary', 1)
    self.assertMetricsCounterEqual(metrics, 'tft_mapper_scale_to_0_1', 2)
    self.assertMetricsCounterEqual(
        metrics, 'tft_mapper_compute_and_apply_vocabulary', 1)
    # compute_and_apply_vocabulary implicitly calls apply_vocabulary.
    # We check that that call is not logged.
    self.assertMetricsCounterEqual(metrics, 'tft_mapper_apply_vocabulary', 0)
Example #13
Source File: impl_test.py From transform with Apache License 2.0 | 5 votes

def testStringToTFIDFEmptyDoc(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.compute_and_apply_vocabulary(
          tf.compat.v1.strings.split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }

    input_data = [{'a': 'hello hello world'},
                  {'a': ''},
                  {'a': 'hello goodbye hello world'},
                  {'a': 'I like pie pie pie'}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})

    log_5_over_2 = 1.91629073187
    log_5_over_3 = 1.51082562376
    expected_transformed_data = [{
        'tf_idf': [(2/3)*log_5_over_3, (1/3)*log_5_over_3],
        'index': [0, 2]
    }, {
        'tf_idf': [],
        'index': []
    }, {
        'tf_idf': [(2/4)*log_5_over_3, (1/4)*log_5_over_3, (1/4)*log_5_over_2],
        'index': [0, 2, 4]
    }, {
        'tf_idf': [(3/5)*log_5_over_2, (1/5)*log_5_over_2, (1/5)*log_5_over_2],
        'index': [1, 3, 5]
    }]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'tf_idf': tf.io.VarLenFeature(tf.float32),
        'index': tf.io.VarLenFeature(tf.int64)
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_transformed_data, expected_metadata)
Example #14
Source File: impl_test.py From transform with Apache License 2.0 | 5 votes

def testStringToTFIDF(self):
    def preprocessing_fn(inputs):
      inputs_as_ints = tft.compute_and_apply_vocabulary(
          tf.compat.v1.strings.split(inputs['a']))
      out_index, out_values = tft.tfidf(inputs_as_ints, 6)
      return {
          'tf_idf': out_values,
          'index': out_index
      }

    input_data = [{'a': 'hello hello world'},
                  {'a': 'hello goodbye hello world'},
                  {'a': 'I like pie pie pie'}]
    input_metadata = tft_unit.metadata_from_feature_spec(
        {'a': tf.io.FixedLenFeature([], tf.string)})

    # IDFs
    # hello = log(4/3) = 0.28768
    # world = log(4/3)
    # goodbye = log(4/2) = 0.69314
    # I = log(4/2)
    # like = log(4/2)
    # pie = log(4/2)
    log_4_over_2 = 1.69314718056
    log_4_over_3 = 1.28768207245
    expected_transformed_data = [{
        'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
        'index': [0, 2]
    }, {
        'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3, (1/4)*log_4_over_2],
        'index': [0, 2, 4]
    }, {
        'tf_idf': [(3/5)*log_4_over_2, (1/5)*log_4_over_2, (1/5)*log_4_over_2],
        'index': [1, 3, 5]
    }]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'tf_idf': tf.io.VarLenFeature(tf.float32),
        'index': tf.io.VarLenFeature(tf.int64)
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_transformed_data, expected_metadata)
Example #15
Source File: impl_test.py From transform with Apache License 2.0 | 5 votes

def testWithMoreThanDesiredBatchSize(self):
    def preprocessing_fn(inputs):
      return {
          'ab': tf.multiply(inputs['a'], inputs['b']),
          'i': tft.compute_and_apply_vocabulary(inputs['c'])
      }

    batch_size = 100
    num_instances = batch_size + 1
    input_data = [{
        'a': 2,
        'b': i,
        'c': '%.10i' % i,  # Front-padded to facilitate lexicographic sorting.
    } for i in range(num_instances)]
    input_metadata = tft_unit.metadata_from_feature_spec({
        'a': tf.io.FixedLenFeature([], tf.float32),
        'b': tf.io.FixedLenFeature([], tf.float32),
        'c': tf.io.FixedLenFeature([], tf.string)
    })
    expected_data = [{
        'ab': 2*i,
        'i': (len(input_data) - 1) - i,  # Due to reverse lexicographic sorting.
    } for i in range(len(input_data))]
    expected_metadata = tft_unit.metadata_from_feature_spec({
        'ab': tf.io.FixedLenFeature([], tf.float32),
        'i': tf.io.FixedLenFeature([], tf.int64),
    }, {
        'i': schema_pb2.IntDomain(
            min=-1, max=num_instances - 1, is_categorical=True)
    })
    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data,
        expected_metadata, desired_batch_size=batch_size)
Example #16
Source File: main.py From spotify-tensorflow with Apache License 2.0 | 5 votes

def preprocessing_fn(inputs):
  out = dict()

  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    out[taxi.transformed_name(key)] = tft.scale_to_z_score(
        taxi.fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    out[taxi.transformed_name(key)] = tft.compute_and_apply_vocabulary(
        taxi.fill_in_missing(inputs[key]),
        top_k=10,
        num_oov_buckets=10)

  for key in taxi.BUCKET_FEATURE_KEYS:
    out[taxi.transformed_name(key)] = tft.bucketize(
        taxi.fill_in_missing(inputs[key]), num_buckets=10)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    out[taxi.transformed_name(key)] = taxi.fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = taxi.fill_in_missing(inputs[taxi.FARE_KEY])
  tips = taxi.fill_in_missing(inputs[taxi.LABEL_KEY])
  out[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
              tf.int64)
  )

  return out
Example #17
Source File: cached_impl_test.py From transform with Apache License 2.0 | 4 votes

def test_changing_constant_fails_cache(self, use_tfxio):
    def make_preprocessing_fn(string):
      def preprocessing_fn(inputs):
        constant_str = tf.tile(tf.constant([string]), tf.shape(inputs['s']))
        joined = tf.strings.join([inputs['s'], constant_str])
        return {'id': tft.compute_and_apply_vocabulary(joined)}
      return preprocessing_fn

    feature_spec = {'s': tf.io.FixedLenFeature([], tf.string)}
    input_data_dict = {
        analyzer_cache.DatasetKey('span-0'): [dict(s='a'), dict(s='b')]
    }

    run_result = self._run_pipeline(feature_spec,
                                    input_data_dict,
                                    make_preprocessing_fn('1st_run'),
                                    use_tfxio=use_tfxio)
    first_cache_output, p1 = run_result.cache_output, run_result.pipeline

    for key in input_data_dict:
      self.assertIn(key, first_cache_output)
      self.assertEqual(1, len(first_cache_output[key]))

    self.assertMetricsCounterEqual(p1.metrics, 'num_instances', 2)
    self.assertMetricsCounterEqual(p1.metrics, 'cache_entries_decoded', 0)
    self.assertMetricsCounterEqual(p1.metrics, 'cache_entries_encoded', 1)
    self.assertMetricsCounterEqual(p1.metrics, 'saved_models_created',
                                   _SINGLE_PHASE_NUM_SAVED_MODELS)

    run_result = self._run_pipeline(feature_spec,
                                    input_data_dict,
                                    make_preprocessing_fn('2nd_run'),
                                    use_tfxio=use_tfxio)
    second_cache_output, p2 = run_result.cache_output, run_result.pipeline

    # We expect a full output cache again because tf.function in the
    # preprocessing_fn broke that cache entry.
    for key in input_data_dict:
      self.assertIn(key, second_cache_output)
      self.assertEqual(1, len(second_cache_output[key]))

    self.assertMetricsCounterEqual(p2.metrics, 'num_instances', 2)
    self.assertMetricsCounterEqual(p2.metrics, 'cache_entries_decoded', 0)
    self.assertMetricsCounterEqual(p2.metrics, 'cache_entries_encoded', 1)
    self.assertMetricsCounterEqual(p2.metrics, 'saved_models_created',
                                   _SINGLE_PHASE_NUM_SAVED_MODELS)
Example #18
Source File: vocabulary_integration_test.py From transform with Apache License 2.0 | 4 votes

def testComputeAndApplyVocabulary(
      self, x_data, x_feature_spec, index_data, index_feature_spec,
      index_domain, label_data=None, label_feature_spec=None,
      weight_data=None, weight_feature_spec=None,
      expected_vocab_file_contents=None, **kwargs):
    """Test tft.compute_and_apply_vocabulary with various inputs."""

    input_data = [{'x': x} for x in x_data]
    input_feature_spec = {'x': x_feature_spec}
    expected_data = [{'index': index} for index in index_data]
    expected_feature_spec = {'index': index_feature_spec}
    expected_domains = {'index': index_domain}

    if label_data is not None:
      for idx, label in enumerate(label_data):
        input_data[idx]['label'] = label
      input_feature_spec['label'] = label_feature_spec

    if weight_data is not None:
      for idx, weight in enumerate(weight_data):
        input_data[idx]['weights'] = weight
      input_feature_spec['weights'] = weight_feature_spec

    input_metadata = tft_unit.metadata_from_feature_spec(input_feature_spec)
    expected_metadata = tft_unit.metadata_from_feature_spec(
        expected_feature_spec, expected_domains)

    def preprocessing_fn(inputs):
      x = inputs['x']
      labels = inputs.get('label')
      weights = inputs.get('weights')
      index = tft.compute_and_apply_vocabulary(
          x, labels=labels, weights=weights, **kwargs)
      return {'index': index}

    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn, expected_data,
        expected_metadata,
        expected_vocab_file_contents=expected_vocab_file_contents)


# Example on how to use the vocab frequency as part of the transform
# function.
Example #19
Source File: taxi_utils_solution.py From tfx with Apache License 2.0 | 4 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in _DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[_transformed_name(key)] = tft.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in _VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(
        _fill_in_missing(inputs[key]),
        top_k=_VOCAB_SIZE,
        num_oov_buckets=_OOV_SIZE)

  for key in _BUCKET_FEATURE_KEYS:
    outputs[_transformed_name(key)] = tft.bucketize(
        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)

  for key in _CATEGORICAL_FEATURE_KEYS:
    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])
  tips = _fill_in_missing(inputs[_LABEL_KEY])
  outputs[_transformed_name(_LABEL_KEY)] = tf.where(
      tf.math.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs


# TFX Trainer will call this function.
Example #20
Source File: impl_test.py From transform with Apache License 2.0 | 4 votes

def testSavedModelWithAnnotations(self):
    """Test serialization/deserialization as a saved model with annotations."""
    def preprocessing_fn(inputs):
      # Bucketization applies annotations to the output schema
      return {
          'x_bucketized': tft.bucketize(inputs['x'], num_buckets=4),
          'y_vocab': tft.compute_and_apply_vocabulary(inputs['y']),
      }

    input_data = [{
        'x': 1,
        'y': 'foo',
    }, {
        'x': 2,
        'y': 'bar',
    }, {
        'x': 3,
        'y': 'foo',
    }, {
        'x': 4,
        'y': 'foo',
    }]
    input_metadata = tft_unit.metadata_from_feature_spec({
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.string),
    })
    temp_dir = self.get_temp_dir()
    # Force a batch size of 1 to ensure that occurrences are correctly
    # aggregated across batches when computing the total vocabulary size.
    with beam_impl.Context(temp_dir=temp_dir, desired_batch_size=1):
      input_data, input_metadata = self._MaybeConvertInputsToTFXIO(
          input_data, input_metadata)
      transform_fn = ((input_data, input_metadata)
                      | beam_impl.AnalyzeDataset(preprocessing_fn))
      # Write transform_fn to serialize annotation collections to SavedModel
      _ = transform_fn | transform_fn_io.WriteTransformFn(temp_dir)

    # Ensure that the annotations survive the round trip to SavedModel.
    tf_transform_output = tft.TFTransformOutput(temp_dir)
    savedmodel_dir = tf_transform_output.transform_savedmodel_dir
    schema = beam_impl._infer_metadata_from_saved_model(savedmodel_dir)._schema
    self.assertLen(schema.feature, 2)
    for feature in schema.feature:
      if feature.name == 'x_bucketized':
        self.assertLen(feature.annotation.extra_metadata, 1)
        for annotation in feature.annotation.extra_metadata:
          message = annotations_pb2.BucketBoundaries()
          annotation.Unpack(message)
          self.assertAllClose(list(message.boundaries), [2, 3, 4])
      elif feature.name == 'y_vocab':
        self.assertLen(feature.annotation.extra_metadata, 0)
      else:
        raise ValueError('Unexpected feature with metadata: {}'.format(
            feature.name))
    # Vocabularies create a top-level schema annotation for each vocab file.
    self.assertLen(schema.annotation.extra_metadata, 1)
    message = annotations_pb2.VocabularyMetadata()
    annotation = schema.annotation.extra_metadata[0]
    annotation.Unpack(message)
    self.assertEqual(message.unfiltered_vocabulary_size, 2)
Example #21
Source File: preprocessing.py From code-snippets with Apache License 2.0 | 4 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[
        taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

  for key in taxi.BUCKET_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = transform.bucketize(
        _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
  outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 20% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
          tf.int64))

  return outputs
Example #22
Source File: preprocessing2.py From code-snippets with Apache License 2.0 | 4 votes

def preprocessing_fn(inputs):
  """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
  outputs = {}
  for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
    # Preserve this feature as a dense float, setting nan's to the mean.
    outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
        _fill_in_missing(inputs[key]))

  for key in taxi.VOCAB_FEATURE_KEYS:
    # Build a vocabulary for this feature.
    outputs[
        taxi.transformed_name(key)] = transform.compute_and_apply_vocabulary(
            _fill_in_missing(inputs[key]),
            top_k=taxi.VOCAB_SIZE,
            num_oov_buckets=taxi.OOV_SIZE)

  for key in taxi.BUCKET_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = transform.bucketize(
        _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

  for key in taxi.CATEGORICAL_FEATURE_KEYS:
    outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

  # Was this passenger a big tipper?
  taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
  tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
  outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
      tf.is_nan(taxi_fare),
      tf.cast(tf.zeros_like(taxi_fare), tf.int64),
      # Test if the tip was > 5% of the fare.
      tf.cast(
          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.05))),
          tf.int64))

  return outputs
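A closing note on the vocab_filename argument seen in Example #4: it names the vocabulary asset written during analysis, and that file can be read back from the transform output once the transform_fn has been written (as in Examples #3 and #20). Below is a minimal sketch using tft.TFTransformOutput; the output directory and vocabulary name here are placeholders, not values from the examples above:

import tensorflow_transform as tft

# '/path/to/transform_output' stands in for the directory the transform_fn
# was written to (see WriteTransformFn in Example #20).
tf_transform_output = tft.TFTransformOutput('/path/to/transform_output')

# Look up the vocabulary file that compute_and_apply_vocabulary wrote when a
# vocab_filename was supplied (as in Example #4); 'user_vocab' is assumed.
vocab_path = tf_transform_output.vocabulary_file_by_name('user_vocab')
with open(vocab_path) as f:
  vocabulary = [line.strip() for line in f]

This is useful at serving or evaluation time, when the integer ids produced by the transform need to be mapped back to their original string values.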