Python tensorflow_transform.tf_metadata.dataset_schema.from_feature_spec() Examples
The following are 18 code examples of tensorflow_transform.tf_metadata.dataset_schema.from_feature_spec(). Each example links back to its original project and source file, noted above the code. You may also want to check out all the other functions and classes available in the tensorflow_transform.tf_metadata.dataset_schema module.
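Before the examples, here is a minimal, self-contained sketch of the call itself. It assumes TensorFlow 1.x and an older tensorflow_transform release that still exposes dataset_schema.from_feature_spec (later releases moved this functionality to tft.tf_metadata.schema_utils):

import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema

# A feature spec maps feature names to tf.FixedLenFeature / tf.VarLenFeature
# parsing configurations, exactly as tf.parse_example expects them.
feature_spec = {
    'age': tf.FixedLenFeature(shape=[], dtype=tf.int64),
    'name': tf.FixedLenFeature(shape=[], dtype=tf.string, default_value=''),
}

# Build a Schema object describing a dataset with these two features.
schema = dataset_schema.from_feature_spec(feature_spec)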
Example #1
Source File: criteo.py From cloudml-samples with Apache License 2.0
def make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Input schema definition.

  Args:
    mode: tf.contrib.learn.ModeKeys specifying if the schema is being used
      for train/eval or prediction.
  Returns:
    A `Schema` object.
  """
  result = ({} if mode == tf.contrib.learn.ModeKeys.INFER
            else {'clicked': tf.FixedLenFeature(shape=[], dtype=tf.int64)})
  for name in INTEGER_COLUMN_NAMES:
    result[name] = tf.FixedLenFeature(
        shape=[], dtype=tf.int64, default_value=-1)
  for name in CATEGORICAL_COLUMN_NAMES:
    result[name] = tf.FixedLenFeature(
        shape=[], dtype=tf.string, default_value='')
  return dataset_schema.from_feature_spec(result)
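A hedged usage sketch, not taken from criteo.py: the Schema returned by make_input_schema is typically wrapped in a DatasetMetadata object before being handed to a tf.Transform pipeline.

from tensorflow_transform.tf_metadata import dataset_metadata

# Wrap the Schema so it can describe the raw input data in a
# tf.Transform analyze-and-transform step (see Example #3 below).
input_schema = make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN)
input_metadata = dataset_metadata.DatasetMetadata(input_schema)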
Example #2
Source File: preprocess.py From professional-services with Apache License 2.0
def WriteOutput(p, prefix, output_dir, feature_spec, plain_text=False):
  """Writes the given pCollection as a TF-Record.

  Args:
    p: a pCollection.
    prefix: prefix for location tf-record will be written to.
    output_dir: the directory or bucket to write the json data.
    feature_spec: the feature spec of the tf-record to be written.
    plain_text: if true, write the output as plain text instead.
  """
  path = os.path.join(output_dir, prefix)
  shuffled = p | "ShuffleData" >> Shuffle()  # pylint: disable=no-value-for-parameter

  if plain_text:
    shuffled | "WriteToText" >> beam.io.WriteToText(
        path, file_name_suffix=".txt")
    return

  schema = dataset_schema.from_feature_spec(feature_spec)
  coder = coders.ExampleProtoCoder(schema)
  shuffled | "WriteTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
      path, coder=coder, file_name_suffix=".tfrecord")
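A sketch of how WriteOutput might be invoked, assuming the module's Shuffle transform is importable alongside it; the feature spec, prefix, and output directory below are placeholders, not values from preprocess.py.

import apache_beam as beam
import tensorflow as tf

feature_spec = {'text': tf.FixedLenFeature(shape=[], dtype=tf.string)}
with beam.Pipeline() as pipeline:
    records = pipeline | 'CreateRecords' >> beam.Create([{'text': 'hello'}])
    # Shuffles the records and writes them to /tmp/output/train-*.tfrecord.
    WriteOutput(records, 'train', '/tmp/output', feature_spec)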
Example #3
Source File: preprocess.py From professional-services with Apache License 2.0
def _run_tft_fn(raw_data, tft_fn, transform_fn_path, user_freq, item_freq):
  """Applies the TensorFlow Transform function to the given data.

  Args:
    raw_data: a dict of shape {$user_key: $user_id, $item_key: ...}.
    tft_fn: a TensorFlow Transform function.
    transform_fn_path: the location to save transformation outputs to.
    user_freq: minimum frequency of a user to include it in the user vocab.
    item_freq: minimum frequency of an item to include it in the item vocab.

  Returns:
    A pCollection of dicts, where each dict is an element of raw_data with the
      preprocess_fn applied to it:
      {$user_key: $user_id, $item_key: $item_id, $count_key: $count}.
  """
  raw_data_metadata = tft.tf_metadata.dataset_metadata.DatasetMetadata(
      tft.tf_metadata.dataset_schema.from_feature_spec(constants.TRAIN_SPEC))
  transformed_dataset, transform_fn = (
      (raw_data, raw_data_metadata)
      | beam_impl.AnalyzeAndTransformDataset(
          lambda x: tft_fn(x, user_freq, item_freq)))
  (transform_fn
   | "WriteTransformFn" >>
   tft.beam.tft_beam_io.transform_fn_io.WriteTransformFn(
       os.path.join(transform_fn_path, "transform_fn")))
  return transformed_dataset[0]
Example #4
Source File: preprocess.py From professional-services with Apache License 2.0
def make_input_schema():
  """Builds the schema of the data read from BigQuery.

  Appends key column to schema for inference.

  Returns:
    A `Schema` built from a feature spec mapping column names to
      `tf.FixedLenFeature` instances.
  """
  feature_spec = {}
  for c in constants.FEATURE_COLUMNS:
    feature_spec[c] = tf.FixedLenFeature(shape=[], dtype=tf.float32)
  feature_spec[constants.LABEL_COLUMN] = tf.FixedLenFeature(
      shape=[], dtype=tf.int64)
  feature_spec[constants.KEY_COLUMN] = tf.FixedLenFeature(
      shape=[], dtype=tf.int64)
  return dataset_schema.from_feature_spec(feature_spec)
Example #5
Source File: generate_vocab.py From text with Apache License 2.0
def main(_):
  # Define schema.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  # Add in padding tokens.
  reserved_tokens = FLAGS.reserved_tokens
  if FLAGS.num_pad_tokens:
    padded_tokens = ['<pad>']
    padded_tokens += ['<pad%d>' % i for i in range(1, FLAGS.num_pad_tokens)]
    reserved_tokens = padded_tokens + reserved_tokens

  params = learner.Params(FLAGS.upper_thresh, FLAGS.lower_thresh,
                          FLAGS.num_iterations, FLAGS.max_input_tokens,
                          FLAGS.max_token_length, FLAGS.max_unique_chars,
                          FLAGS.vocab_size, FLAGS.slack_ratio,
                          FLAGS.include_joiner_token, FLAGS.joiner,
                          reserved_tokens)

  generate_vocab(FLAGS.data_file, FLAGS.vocab_file, FLAGS.metrics_file,
                 raw_metadata, params)
Example #6
Source File: reddit.py From cloudml-samples with Apache License 2.0
def make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN):
  """Input schema definition.

  Args:
    mode: tf.contrib.learn.ModeKeys specifying if the schema is being used
      for train/eval or prediction.
  Returns:
    A `Schema` object.
  """
  result = ({} if mode == tf.contrib.learn.ModeKeys.INFER else {
      'score': tf.FixedLenFeature(shape=[], dtype=tf.float32)
  })
  result.update({
      'subreddit': tf.FixedLenFeature(shape=[], dtype=tf.string),
      'author': tf.FixedLenFeature(shape=[], dtype=tf.string),
      'comment_body': tf.FixedLenFeature(
          shape=[], dtype=tf.string, default_value=''),
      'comment_parent_body': tf.FixedLenFeature(
          shape=[], dtype=tf.string, default_value=''),
      'toplevel': tf.FixedLenFeature(shape=[], dtype=tf.int64),
  })
  return dataset_schema.from_feature_spec(result)
Example #7
Source File: movielens.py From cloudml-samples with Apache License 2.0
def _make_schema(columns, types, default_values):
  """Input schema definition.

  Args:
    columns: column names for fields appearing in input.
    types: column types for fields appearing in input.
    default_values: default values for fields appearing in input.
  Returns:
    A `Schema` built from the resulting feature spec (a dictionary of
      string to *Feature).
  """
  result = {}
  assert len(columns) == len(types)
  assert len(columns) == len(default_values)
  for c, t, v in zip(columns, types, default_values):
    if isinstance(t, list):
      result[c] = tf.VarLenFeature(dtype=t[0])
    else:
      result[c] = tf.FixedLenFeature(shape=[], dtype=t, default_value=v)
  return dataset_schema.from_feature_spec(result)
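A hypothetical call for illustration; the column names, types, and defaults are made up, not taken from movielens.py. Wrapping a type in a list, such as [tf.string], yields a tf.VarLenFeature, while a bare type yields a scalar tf.FixedLenFeature.

schema = _make_schema(
    columns=['user_id', 'genres', 'rating'],
    types=[tf.int64, [tf.string], tf.float32],  # [tf.string] -> VarLenFeature
    default_values=[-1, None, 0.0])  # the VarLenFeature ignores its default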
Example #8
Source File: generate_word_counts.py From text with Apache License 2.0
def main(_):
  # Generate schema of input data.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))

  pipeline = word_count(FLAGS.input_path, FLAGS.output_path, raw_metadata)
  pipeline.run().wait_until_finish()
Example #9
Source File: chicago_taxi_client.py From tfx with Apache License 2.0
def _make_proto_coder(schema):
  raw_feature_spec = _get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.ExampleProtoCoder(raw_schema)
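A hedged sketch of using the returned coder; the feature names are illustrative, and _get_raw_feature_spec comes from the surrounding module. ExampleProtoCoder.encode turns an instance dict into a serialized tf.Example proto, ready to be written to a TFRecord file.

proto_coder = _make_proto_coder(schema)
# Serialize one instance; the dict keys must match the raw feature spec.
serialized_example = proto_coder.encode({'fare': 12.5, 'trip_miles': 3.1})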
Example #10
Source File: chicago_taxi_client.py From tfx with Apache License 2.0
def _make_csv_coder(schema, column_names):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = _get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(column_names, parsing_schema)
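And the CSV counterpart, again as an illustrative sketch: in older tensorflow_transform releases, CsvCoder.decode parses one CSV line into an instance dict keyed by the given column names.

csv_coder = _make_csv_coder(schema, column_names=['fare', 'trip_miles'])
# Parse a single CSV line into a dict of feature values.
instance = csv_coder.decode('12.5,3.1')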
Example #11
Source File: preprocess.py From professional-services with Apache License 2.0
def run(p, params):
  """Defines Beam preprocessing pipeline.

  Performs the following:
    - Reads text files from pattern.
    - Splits text files into train and validation sets.

  Args:
    p: PCollection, initial pipeline.
    params: Object holding a set of parameters as name-value pairs.
  """
  path_pattern = os.path.join(params.input_dir, '*',
                              '*{}'.format(constants.FILE_EXTENSION))
  data = (
      p
      | 'ListFiles' >> beam.Create(gfile.Glob(path_pattern))
      | 'ReadFiles' >> beam.ParDo(ReadFile())
      | 'SplitData' >> beam.ParDo(
          _SplitData(),
          train_size=params.train_size,
          val_label=_DatasetType.VAL.name).with_outputs(
              _DatasetType.VAL.name, main=_DatasetType.TRAIN.name))

  schema = dataset_schema.from_feature_spec(utils.get_processed_data_schema())
  for dataset in _DatasetType:
    if not dataset.value:
      continue
    _ = (
        data[dataset.name]
        | 'Shuffle{}'.format(dataset.name) >> shuffle()  # pylint: disable=no-value-for-parameter
        | 'WriteFiles{}'.format(dataset.name) >> tfrecordio.WriteToTFRecord(
            os.path.join(params.output_dir, dataset.name + constants.TFRECORD),
            coder=example_proto_coder.ExampleProtoCoder(schema)))
Example #12
Source File: example_decoders.py From spotify-tensorflow with Apache License 2.0
def __init__(self, feature_spec):
  super(ExampleWithFeatureSpecDecoder, self).__init__()
  schema = dataset_schema.from_feature_spec(feature_spec)
  self._coder = example_proto_coder.ExampleProtoCoder(schema)
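A hedged construction sketch with an illustrative feature spec; the wrapped ExampleProtoCoder can then decode serialized tf.Example protos back into feature dicts.

decoder = ExampleWithFeatureSpecDecoder({
    'label': tf.FixedLenFeature([], tf.int64),
    'text': tf.FixedLenFeature([], tf.string),
})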
Example #13
Source File: features.py From professional-services with Apache License 2.0
def get_raw_dataset_metadata():
  return dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(RAW_FEATURE_SPEC))
Example #14
Source File: taxi.py From code-snippets with Apache License 2.0
def make_proto_coder(schema):
  raw_feature_spec = get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.ExampleProtoCoder(raw_schema)
Example #15
Source File: taxi.py From code-snippets with Apache License 2.0
def make_csv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(CSV_COLUMN_NAMES, parsing_schema)
Example #16
Source File: taxi_schema.py From code-snippets with Apache License 2.0
def make_proto_coder(schema):
  raw_feature_spec = get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.ExampleProtoCoder(raw_schema)
Example #17
Source File: taxi_schema.py From code-snippets with Apache License 2.0
def make_csv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return tft_coders.CsvCoder(CSV_COLUMN_NAMES, parsing_schema)
Example #18
Source File: taxi_preprocess_bq.py From code-snippets with Apache License 2.0
def make_mcsv_coder(schema):
  """Return a coder for tf.transform to read csv files."""
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  parsing_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  return mcsv_coder.CsvCoder(taxi.CSV_COLUMN_NAMES, parsing_schema)