Python tensorflow.SparseFeature() Examples
The following are 20 code examples of tensorflow.SparseFeature(), drawn from open-source projects; the project and source file for each example are noted above it.
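Before the project examples, here is a minimal, self-contained sketch of what a SparseFeature does when parsing Example protos. It assumes TensorFlow 2.x (where the spec lives at tf.io.SparseFeature); the feature names and data are illustrative, not taken from any project below.

```python
import tensorflow as tf

# One Example proto carrying a sparse vector as two aligned features:
# "idx" holds the positions, "val" holds the values (illustrative data).
example = tf.train.Example(features=tf.train.Features(feature={
    "idx": tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 4])),
    "val": tf.train.Feature(float_list=tf.train.FloatList(value=[12.0, 20.0])),
}))

# SparseFeature ties the index and value features together and declares the
# dense size of the resulting SparseTensor.
spec = {
    "sparse": tf.io.SparseFeature(index_key="idx", value_key="val",
                                  dtype=tf.float32, size=10),
}
parsed = tf.io.parse_single_example(example.SerializeToString(), spec)
print(parsed["sparse"])  # SparseTensor: indices [[1], [4]], values [12., 20.], dense_shape [10]
```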
Example #1
Source File: mcsv_coder.py From code-snippets with Apache License 2.0
```python
def encode_value(self, string_list, sparse_value):
    """Encode the value of this feature into the CSV line."""
    index, value = sparse_value
    if len(value) == len(index):
      if self._encoder:
        string_list[self._value_index] = self._encoder.encode_record(
            map(str, value))
        string_list[self._index_index] = self._encoder.encode_record(
            map(str, index))
      else:
        string_list[self._value_index] = str(value[0]) if value else ''
        string_list[self._index_index] = str(index[0]) if index else ''
    else:
      raise ValueError(
          'SparseFeature %r has value and index unaligned %r vs %r.' %
          (self._name, value, index))
```
Example #2
Source File: schema_utils.py From transform with Apache License 2.0
```python
def _feature_from_feature_spec(spec, name, domains):
  """Returns a representation of a Feature from a feature spec."""
  if isinstance(spec, tf.io.FixedLenFeature):
    if spec.default_value is not None:
      raise ValueError(
          'feature "{}" had default_value {}, but FixedLenFeature must have '
          'default_value=None'.format(name, spec.default_value))
    dims = [schema_pb2.FixedShape.Dim(size=size) for size in spec.shape]
    feature = schema_pb2.Feature(
        name=name,
        presence=schema_pb2.FeaturePresence(min_fraction=1.0),
        shape=schema_pb2.FixedShape(dim=dims))
  elif isinstance(spec, tf.io.VarLenFeature):
    feature = schema_pb2.Feature(name=name)
  else:
    raise TypeError(
        'Spec for feature "{}" was {} of type {}, expected a '
        'FixedLenFeature, VarLenFeature or SparseFeature'.format(
            name, spec, type(spec)))
  _set_type(name, feature, spec.dtype)
  _set_domain(name, feature, domains.get(name))
  return feature
```
Example #3
Source File: example_decoders_test.py From spotify-tensorflow with Apache License 2.0
```python
def test_example_with_feature_spec_decoder(self):
    feature_spec = {
        "scalar_feature_1": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_2": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_3": tf.FixedLenFeature(shape=[], dtype=tf.float32),
        "varlen_feature_1": tf.VarLenFeature(dtype=tf.float32),
        "varlen_feature_2": tf.VarLenFeature(dtype=tf.string),
        "1d_vector_feature": tf.FixedLenFeature(shape=[1], dtype=tf.string),
        "2d_vector_feature": tf.FixedLenFeature(shape=[2, 2], dtype=tf.float32),
        "sparse_feature": tf.SparseFeature("sparse_feature_idx",
                                           "sparse_feature_value",
                                           tf.float32, 10),
    }
    dec = ExampleWithFeatureSpecDecoder(feature_spec)
    actual_json = json.loads(dec.to_json(self.example_str))
    expected_decoded = {
        "scalar_feature_1": 12,
        "scalar_feature_2": 12,
        "scalar_feature_3": 1.0,
        "varlen_feature_1": [89.0],
        "1d_vector_feature": ["this is a ,text"],
        "2d_vector_feature": [[1.0, 2.0], [3.0, 4.0]],
        "varlen_feature_2": ["female"],
        "sparse_feature_idx": [1, 4],
        "sparse_feature_value": [12.0, 20.0],
    }
    self.assertEqual(actual_json, expected_decoded)
```
Example #4
Source File: dataset.py From spotify-tensorflow with Apache License 2.0
```python
def parse_schema(cls, schema_path):
    # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
    """
    Returns TensorFlow Feature Spec and parsed tf.metadata Schema for given
    tf.metadata Schema.

    :param schema_path: tf.metadata Schema path
    """
    schema = parse_schema_file(schema_path)
    return schema_to_feature_spec(schema), schema
```
Example #5
Source File: dataset.py From spotify-tensorflow with Apache License 2.0
```python
def parse_schema_from_stats(cls, stats_path):
    # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
    """
    Returns TensorFlow Feature Spec and parsed tf.metadata Schema for given
    tf.metadata DatasetFeatureStatisticsList.

    :param stats_path: tf.metadata DatasetFeatureStatisticsList path
    """
    import tensorflow_data_validation as tfdv
    stats = tfdv.load_statistics(stats_path)
    schema = tfdv.infer_schema(stats)
    return schema_to_feature_spec(schema), schema
```
Example #6
Source File: tf_schema_utils_test.py From spotify-tensorflow with Apache License 2.0
```python
def test_round_trip(self):
    feature_spec = {
        "scalar_feature_1": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_2": tf.FixedLenFeature(shape=[], dtype=tf.int64),
        "scalar_feature_3": tf.FixedLenFeature(shape=[], dtype=tf.float32),
        "varlen_feature_1": tf.VarLenFeature(dtype=tf.float32),
        "varlen_feature_2": tf.VarLenFeature(dtype=tf.string),
        "1d_vector_feature": tf.FixedLenFeature(shape=[1], dtype=tf.string),
        "2d_vector_feature": tf.FixedLenFeature(shape=[2, 2], dtype=tf.float32),
        "sparse_feature": tf.SparseFeature("idx", "value", tf.float32, 10),
    }
    inferred_schema = feature_spec_to_schema(feature_spec)
    inferred_feature_spec = schema_to_feature_spec(inferred_schema)
    self.assertEqual(inferred_feature_spec, feature_spec)
```
Example #7
Source File: mcsv_coder.py From code-snippets with Apache License 2.0
```python
def parse_value(self, string_list):
    """Parse the value of this feature from string list split from CSV line."""
    value_str = string_list[self._value_index]
    index_str = string_list[self._index_index]
    if value_str and self._reader:
      values = map(self._cast_fn, _decode_with_reader(value_str, self._reader))
    elif value_str:
      values = [self._cast_fn(value_str)]
    else:
      values = []

    # In Python 2, if the value is too large to fit into an int, int(..) returns
    # a long, but ints are cheaper to use when possible.
    if index_str and self._reader:
      indices = map(int, _decode_with_reader(index_str, self._reader))
    elif index_str:
      indices = [int(index_str)]
    else:
      indices = []

    # Check that all indices are in range.
    if indices:
      i_min, i_max = min(indices), max(indices)
      if i_min < 0 or i_max >= self._size:
        i_bad = i_min if i_min < 0 else i_max
        raise ValueError('SparseFeature %r has index %d out of range [0, %d)' %
                         (self._name, i_bad, self._size))

    if len(values) != len(indices):
      raise ValueError(
          'SparseFeature %r has indices and values of different lengths: '
          'values: %r, indices: %r' % (self._name, values, indices))
    return (np.asarray(indices), np.asarray(values))
```
Example #8
Source File: dataset.py From spotify-tensorflow with Apache License 2.0
```python
def _examples(cls,
              file_pattern,  # type: str
              schema_path=None,  # type: str
              feature_spec=None,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
              default_value=0,  # type: float
              compression_type=None,  # type: str
              batch_size=128,  # type: int
              shuffle=True,  # type: bool
              num_epochs=1,  # type: int
              shuffle_buffer_size=10000,  # type: int
              shuffle_seed=None,  # type: int
              prefetch_buffer_size=1,  # type: int
              reader_num_threads=1,  # type: int
              parser_num_threads=2,  # type: int
              sloppy_ordering=False,  # type: bool
              drop_final_batch=False  # type: bool
              ):
    # type: (...) -> Iterator[pd.DataFrame]
    Datasets._assert_eager("DataFrame")
    dataset = Datasets.dict._examples(file_pattern=file_pattern,
                                      schema_path=schema_path,
                                      default_value=default_value,
                                      feature_spec=feature_spec,
                                      compression_type=compression_type,
                                      batch_size=batch_size,
                                      shuffle=shuffle,
                                      num_epochs=num_epochs,
                                      shuffle_buffer_size=shuffle_buffer_size,
                                      shuffle_seed=shuffle_seed,
                                      prefetch_buffer_size=prefetch_buffer_size,
                                      reader_num_threads=reader_num_threads,
                                      parser_num_threads=parser_num_threads,
                                      sloppy_ordering=sloppy_ordering,
                                      drop_final_batch=drop_final_batch)
    for d in dataset:
        yield pd.DataFrame(data=d)
```
Example #9
Source File: schema_utils.py From transform with Apache License 2.0
```python
def _sparse_feature_from_feature_spec(spec, name, domains):
  """Returns a representation of a SparseFeature from a feature spec."""
  if isinstance(spec.index_key, list):
    raise ValueError(
        'SparseFeature "{}" had index_key {}, but size and index_key '
        'fields should be single values'.format(name, spec.index_key))
  if isinstance(spec.size, list):
    raise ValueError(
        'SparseFeature "{}" had size {}, but size and index_key fields '
        'should be single values'.format(name, spec.size))

  # Create an index feature.
  index_feature = schema_pb2.Feature(
      name=spec.index_key,
      type=schema_pb2.INT,
      int_domain=schema_pb2.IntDomain(min=0, max=spec.size - 1))

  # Create a value feature.
  value_feature = schema_pb2.Feature(name=spec.value_key)
  _set_type(name, value_feature, spec.dtype)
  _set_domain(name, value_feature, domains.get(name))

  # Create a sparse feature which refers to the index and value features.
  index_feature_ref = schema_pb2.SparseFeature.IndexFeature(
      name=spec.index_key)
  value_feature_ref = schema_pb2.SparseFeature.ValueFeature(
      name=spec.value_key)
  sparse_feature = schema_pb2.SparseFeature(
      name=name,
      is_sorted=True if spec.already_sorted else None,
      index_feature=[index_feature_ref],
      value_feature=value_feature_ref)

  return (index_feature, value_feature, sparse_feature)
```
Example #10
Source File: mcsv_coder.py From code-snippets with Apache License 2.0
```python
def decode(self, csv_string):
    """Decodes the given string record according to the schema.

    Missing value handling is as follows:
    1.a) If FixedLenFeature and has a default value, use that value for
         missing entries.
    1.b) If FixedLenFeature and doesn't have a default value, throw an
         Exception on missing entries.
    2) For VarLenFeature return an empty array.
    3) For SparseFeature throw an Exception if only one of the indices or
       values has a missing entry. If both indices and values are missing,
       return a tuple of 2 empty arrays.

    For the case of multivalent columns a ValueError will occur if
    FixedLenFeature gets the wrong number of values, or a SparseFeature gets
    different length indices and values.

    Args:
      csv_string: String to be decoded.

    Returns:
      Dictionary of column name to value.

    Raises:
      DecodeError: If columns do not match specified csv headers.
      ValueError: If some numeric column has non-numeric data, if a
          SparseFeature has missing indices but not values or vice versa or
          multivalent data has the wrong length.
    """
    try:
      raw_values = self._reader.read_record(csv_string)
    except Exception as e:  # pylint: disable=broad-except
      raise DecodeError('%s: %s' % (e, csv_string))

    # An empty string when we expect a single column is potentially valid. This
    # is probably more permissive than the csv standard but is useful for
    # testing so that we can test single column CSV lines.
    if not raw_values and len(self._column_names) == 1:
      raw_values = ['']

    # Check record length mismatches.
    if len(raw_values) != len(self._column_names):
      raise DecodeError(
          'Columns do not match specified csv headers: %s -> %s' % (
              self._column_names, raw_values))

    return {feature_handler.name: feature_handler.parse_value(raw_values)
            for feature_handler in self._feature_handlers}
```
Example #11
Source File: csv_coder.py From transform with Apache License 2.0
```python
def decode(self, csv_string):
    """Decodes the given string record according to the schema.

    Missing value handling is as follows:
    1. For FixedLenFeature:
       1. If FixedLenFeature and has a default value, use that value for
          missing entries.
       2. If FixedLenFeature and doesn't have a default value, throw an
          Exception on missing entries.
    2. For VarLenFeature return an empty array.
    3. For SparseFeature throw an Exception if only one of the indices or
       values has a missing entry. If both indices and values are missing,
       return a tuple of 2 empty arrays.

    For the case of multivalent columns a ValueError will occur if
    FixedLenFeature gets the wrong number of values, or a SparseFeature gets
    different length indices and values.

    Args:
      csv_string: String to be decoded.

    Returns:
      Dictionary of column name to value.

    Raises:
      DecodeError: If columns do not match specified csv headers.
      ValueError: If some numeric column has non-numeric data, if a
          SparseFeature has missing indices but not values or vice versa or
          multivalent data has the wrong length.
    """
    try:
      raw_values = self._reader.read_record(csv_string)
    except Exception as e:  # pylint: disable=broad-except
      raise DecodeError('%s: %s' % (e, csv_string))

    # An empty string when we expect a single column is potentially valid. This
    # is probably more permissive than the csv standard but is useful for
    # testing so that we can test single column CSV lines.
    if not raw_values and len(self._column_names) == 1:
      raw_values = ['']

    # Check record length mismatches.
    if len(raw_values) != len(self._column_names):
      raise DecodeError(
          'Columns do not match specified csv headers: {} -> {}'.format(
              self._column_names, raw_values))

    return {
        feature_handler.name: feature_handler.parse_value(raw_values)
        for feature_handler in self._feature_handlers
    }
```
Example #12
Source File: schema_utils.py From transform with Apache License 2.0
```python
def _sparse_feature_as_feature_spec(feature, feature_by_name, string_domains):
  """Returns a representation of a SparseFeature as a feature spec."""
  index_keys = [index_feature.name for index_feature in feature.index_feature]
  index_features = []
  for index_key in index_keys:
    try:
      index_features.append(feature_by_name.pop(index_key))
    except KeyError:
      raise ValueError(
          'sparse_feature "{}" referred to index feature "{}" which did not '
          'exist in the schema'.format(feature.name, index_key))

  if len(index_features) != 1:
    raise ValueError(
        'sparse_feature "{}" had rank {} but currently only rank 1'
        ' sparse features are supported'.format(
            feature.name, len(index_features)))

  value_key = feature.value_feature.name
  try:
    value_feature = feature_by_name.pop(value_key)
  except KeyError:
    raise ValueError(
        'sparse_feature "{}" referred to value feature "{}" which did not '
        'exist in the schema or was referred to as an index or value multiple '
        'times.'.format(feature.name, value_key))

  if index_features[0].HasField('int_domain'):
    # Currently we only handle 0-based INT index features whose minimum
    # domain value must be zero.
    if not index_features[0].int_domain.HasField('min'):
      raise ValueError('Cannot determine dense shape of sparse feature '
                       '"{}". The minimum domain value of index feature "{}"'
                       ' is not set.'.format(feature.name, index_keys[0]))
    if index_features[0].int_domain.min != 0:
      raise ValueError('Only 0-based index features are supported. Sparse '
                       'feature "{}" has index feature "{}" whose minimum '
                       'domain value is {}.'.format(
                           feature.name, index_keys[0],
                           index_features[0].int_domain.min))
    if not index_features[0].int_domain.HasField('max'):
      raise ValueError('Cannot determine dense shape of sparse feature '
                       '"{}". The maximum domain value of index feature "{}"'
                       ' is not set.'.format(feature.name, index_keys[0]))
    shape = [index_features[0].int_domain.max + 1]
  else:
    raise ValueError('Cannot determine dense shape of sparse feature "{}".'
                     ' The index feature "{}" had no int_domain set.'.format(
                         feature.name, index_keys[0]))

  dtype = _feature_dtype(value_feature)
  if len(index_keys) != len(shape):
    raise ValueError(
        'sparse_feature "{}" had rank {} (shape {}) but {} index keys were'
        ' given'.format(feature.name, len(shape), shape, len(index_keys)))
  spec = tf.io.SparseFeature(index_keys[0], value_key, dtype, shape[0],
                             feature.is_sorted)
  domain = _get_domain(value_feature, string_domains)
  return spec, domain
```
Example #13
Source File: schema_utils.py From transform with Apache License 2.0
```python
def schema_as_feature_spec(schema_proto):
  """Generates a feature spec from a Schema proto.

  For a Feature with a FixedShape we generate a FixedLenFeature with no
  default.  For a Feature without a FixedShape we generate a VarLenFeature.
  For a SparseFeature we generate a SparseFeature.

  Args:
    schema_proto: A Schema proto.

  Returns:
    A pair (feature spec, domains) where feature spec is a dict whose keys are
        feature names and values are instances of FixedLenFeature,
        VarLenFeature or SparseFeature, and `domains` is a dict whose keys are
        feature names and values are one of the `domain_info` oneof, e.g.
        IntDomain.

  Raises:
    ValueError: If the schema proto is invalid.
  """
  for feature in schema_proto.feature:
    if RAGGED_TENSOR_TAG in feature.annotation.tag:
      raise ValueError(
          'Feature "{}" had tag "{}". Features represented by a '
          'RaggedTensor cannot be serialized/deserialized to Example proto or '
          'other formats, and cannot have a feature spec generated for '
          'them.'.format(feature.name, RAGGED_TENSOR_TAG))
  if schema_utils_legacy.get_generate_legacy_feature_spec(schema_proto):
    return _legacy_schema_as_feature_spec(schema_proto)

  feature_spec = {}
  # Will hold the domain_info (IntDomain, FloatDomain etc.) of the feature. For
  # sparse features, will hold the domain_info of the values feature. Features
  # that do not have a domain set will not be present in `domains`.
  domains = {}
  feature_by_name = {feature.name: feature for feature in schema_proto.feature}
  string_domains = _get_string_domains(schema_proto)

  # Generate a `tf.SparseFeature` for each element of
  # `schema_proto.sparse_feature`. This also removes the features from
  # feature_by_name.
  # TODO(KesterTong): Allow sparse features to share index features.
  for feature in schema_proto.sparse_feature:
    if _include_in_parsing_spec(feature):
      feature_spec[feature.name], domains[feature.name] = (
          _sparse_feature_as_feature_spec(
              feature, feature_by_name, string_domains))

  # Generate a `tf.FixedLenFeature` or `tf.VarLenFeature` for each element of
  # `schema_proto.feature` that was not referenced by a `SparseFeature`.
  for name, feature in feature_by_name.items():
    if _include_in_parsing_spec(feature):
      feature_spec[name], domains[name] = _feature_as_feature_spec(
          feature, string_domains)

  schema_utils_legacy.check_for_unsupported_features(schema_proto)

  domains = {name: domain for name, domain in domains.items()
             if domain is not None}
  return SchemaAsFeatureSpecResult(feature_spec, domains)
```
Example #14
Source File: schema_utils.py From transform with Apache License 2.0
```python
def schema_from_feature_spec(feature_spec, domains=None):
  """Convert a feature spec to a Schema proto.

  Args:
    feature_spec: A TensorFlow feature spec
    domains: (optional) a dict whose keys are feature names and values are one
        of schema_pb2.IntDomain, schema_pb2.StringDomain or
        schema_pb2.FloatDomain.

  Returns:
    A Schema proto

  Raises:
    ValueError: If the feature spec cannot be converted to a Schema proto.
  """
  if domains is None:
    domains = {}

  result = schema_pb2.Schema()

  # Some feature specs can only be represented with the legacy schema, in
  # particular feature specs where any FixedLenFeature has default_value set.
  # We represent these (and only these) using a schema with
  # generate_legacy_feature_spec=True. Note the generate_legacy_feature_spec
  # field is not part of the open source codebase.
  if schema_utils_legacy.should_set_generate_legacy_feature_spec(feature_spec):
    return _legacy_schema_from_feature_spec(feature_spec, domains)

  schema_utils_legacy.set_generate_legacy_feature_spec(result, False)
  # Add the features to the schema.
  for name, spec in sorted(feature_spec.items()):
    if isinstance(spec, tf.io.SparseFeature):
      (index_feature, value_feature, sparse_feature) = (
          _sparse_feature_from_feature_spec(spec, name, domains))
      result.feature.add().CopyFrom(index_feature)
      result.feature.add().CopyFrom(value_feature)
      result.sparse_feature.add().CopyFrom(sparse_feature)
    else:
      result.feature.add().CopyFrom(
          _feature_from_feature_spec(spec, name, domains))
  return result
```
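As a brief round-trip sketch of Examples #13 and #14 (hedged: it assumes tensorflow_transform is installed and exposes these functions via tensorflow_transform.tf_metadata.schema_utils, consistent with the file they come from; the feature names are illustrative):

```python
import tensorflow as tf
from tensorflow_transform.tf_metadata import schema_utils

# Build a Schema proto from a spec containing a SparseFeature, then recover
# the spec from the Schema. The Schema holds two plain features ("idx" and
# "val") plus one sparse_feature entry that references them.
original_spec = {"sparse": tf.io.SparseFeature("idx", "val", tf.float32, 10)}
schema = schema_utils.schema_from_feature_spec(original_spec)
feature_spec, domains = schema_utils.schema_as_feature_spec(schema)
assert feature_spec == original_spec  # the SparseFeature survives the round trip
```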
Example #15
Source File: cscGAN.py From scGAN with MIT License
```python
def make_input_fn(self, file_paths, epochs=None):
    """
    Function that loads the TFRecords files and creates the placeholders
    for the data inputs.

    Parameters
    ----------
    file_paths : list
        List of TFRecord files from which to read.
    epochs : int
        Integer specifying the number of times to read through the dataset.
        If None, cycles through the dataset forever.
        NOTE - If specified, creates a variable that must be initialized,
        so call tf.local_variables_initializer() and run the op in a session.
        Default is None.

    Returns
    -------
    features : Tensor
        Tensor containing a batch of cells (vector of expression levels).
    cluster : Tensor
        Tensor containing (a batch of) the cluster indexes of the
        corresponding cells.
    """
    feature_map = {
        'scg': tf.SparseFeature(index_key='indices',
                                value_key='values',
                                dtype=tf.float32,
                                size=self.genes_no),
        'cluster_int': tf.FixedLenFeature(1, tf.int64)}

    options = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.GZIP)

    batched_features = tf.contrib.learn.read_batch_features(
        file_pattern=file_paths,
        batch_size=self.batch_size,
        features=feature_map,
        reader=lambda: tf.TFRecordReader(options=options),
        num_epochs=epochs)

    sgc = batched_features['scg']
    sparse = tf.sparse_reshape(sgc, (self.batch_size, self.genes_no))
    dense = tf.sparse_tensor_to_dense(sparse)
    cluster = tf.squeeze(tf.to_int32(batched_features['cluster_int']))
    features = tf.reshape(dense, (self.batch_size, self.genes_no))

    return features, cluster
```
Example #16
Source File: scGAN.py From scGAN with MIT License
```python
def make_input_fn(self, file_paths, epochs=None):
    """
    Function that loads the TFRecords files and creates the placeholders
    for the data inputs.

    Parameters
    ----------
    file_paths : list
        List of TFRecord files from which to read.
    epochs : int
        Integer specifying the number of times to read through the dataset.
        If None, cycles through the dataset forever.
        NOTE - If specified, creates a variable that must be initialized,
        so call tf.local_variables_initializer() and run the op in a session.
        Default is None.

    Returns
    -------
    features : Tensor
        Tensor containing a batch of cells (vector of expression levels).
    """
    feature_map = {
        'scg': tf.SparseFeature(index_key='indices',
                                value_key='values',
                                dtype=tf.float32,
                                size=self.genes_no)
    }

    options = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.GZIP)

    batched_features = tf.contrib.learn.read_batch_features(
        file_pattern=file_paths,
        batch_size=self.batch_size,
        features=feature_map,
        reader=lambda: tf.TFRecordReader(options=options),
        num_epochs=epochs
    )

    sgc = batched_features['scg']
    sparse = tf.sparse_reshape(sgc, (self.batch_size, self.genes_no))
    dense = tf.sparse_tensor_to_dense(sparse)
    features = tf.reshape(dense, (self.batch_size, self.genes_no))

    return features
```
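The two scGAN examples above rely on tf.contrib.learn.read_batch_features and other tf.contrib/tf.python_io APIs that were removed in TensorFlow 2.x. Below is a hedged sketch of an equivalent tf.data pipeline under TF 2.x; the function name and standalone parameters are illustrative (the originals read batch_size and genes_no from self):

```python
import tensorflow as tf

def make_dataset(file_paths, batch_size, genes_no, epochs=None):
    """Sketch of a TF 2.x replacement for the contrib-based input fn above."""
    feature_map = {
        "scg": tf.io.SparseFeature(index_key="indices", value_key="values",
                                   dtype=tf.float32, size=genes_no),
    }
    dataset = tf.data.TFRecordDataset(file_paths, compression_type="GZIP")
    dataset = dataset.repeat(epochs)  # repeat(None) cycles forever
    dataset = dataset.batch(batch_size, drop_remainder=True)
    # Parse whole batches at once; "scg" comes out as a SparseTensor of
    # dense_shape (batch_size, genes_no).
    dataset = dataset.map(lambda x: tf.io.parse_example(x, feature_map))
    # Densify into a (batch_size, genes_no) expression matrix.
    return dataset.map(lambda d: tf.sparse.to_dense(d["scg"]))
```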
Example #17
Source File: dataset.py From spotify-tensorflow with Apache License 2.0
```python
def examples_via_feature_spec(cls,
                              file_pattern,  # type: str
                              feature_spec,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                              default_value=0,  # type: float
                              batch_size=128,  # type: int
                              compression_type=None,  # type: str
                              shuffle=True,  # type: bool
                              num_epochs=1,  # type: int
                              shuffle_buffer_size=10000,  # type: int
                              shuffle_seed=42,  # type: int
                              prefetch_buffer_size=1,  # type: int
                              reader_num_threads=1,  # type: int
                              parser_num_threads=2,  # type: int
                              sloppy_ordering=False,  # type: bool
                              drop_final_batch=False  # type: bool
                              ):
    # type: (...) -> Iterator[pd.DataFrame]
    """
    Read a TF dataset in batches; each batch yields a Pandas DataFrame.

    :param file_pattern: List of files or patterns of file paths containing
                         `Example` records. See `tf.gfile.Glob` for pattern rules
    :param feature_spec: TensorFlow feature spec
    :param default_value: Value used if a sparse feature is missing
    :param batch_size: batch size, set to the size of the dataset to read all
                       data at once
    :param compression_type: TFRecord compression type, see
                             `tf.data.TFRecordDataset` doc
    :param shuffle: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param num_epochs: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param shuffle_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param shuffle_seed: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param prefetch_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param reader_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param parser_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param sloppy_ordering: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param drop_final_batch: see `tensorflow.contrib.data.make_batched_features_dataset` doc

    :return: A Python generator, yielding batches of data in a Pandas DataFrame
    """
    return cls._examples(file_pattern=file_pattern,
                         feature_spec=feature_spec,
                         default_value=default_value,
                         batch_size=batch_size,
                         compression_type=compression_type,
                         shuffle=shuffle,
                         num_epochs=num_epochs,
                         shuffle_buffer_size=shuffle_buffer_size,
                         shuffle_seed=shuffle_seed,
                         prefetch_buffer_size=prefetch_buffer_size,
                         reader_num_threads=reader_num_threads,
                         parser_num_threads=parser_num_threads,
                         sloppy_ordering=sloppy_ordering,
                         drop_final_batch=drop_final_batch)
```
Example #18
Source File: dataset.py From spotify-tensorflow with Apache License 2.0
```python
def _examples(cls,
              file_pattern,  # type: str
              schema_path=None,  # type: str
              feature_spec=None,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
              default_value=0,  # type: float
              compression_type=None,  # type: str
              batch_size=128,  # type: int
              shuffle=True,  # type: bool
              num_epochs=1,  # type: int
              shuffle_buffer_size=10000,  # type: int
              shuffle_seed=None,  # type: int
              prefetch_buffer_size=1,  # type: int
              reader_num_threads=1,  # type: int
              parser_num_threads=2,  # type: int
              sloppy_ordering=False,  # type: bool
              drop_final_batch=False  # type: bool
              ):
    # type: (...) -> Iterator[Dict[str, np.ndarray]]
    Datasets._assert_eager("Dictionary")

    def get_numpy(tensor):
        if isinstance(tensor, tf.Tensor):
            return tensor.numpy()
        elif isinstance(tensor, tf.SparseTensor):
            # If it's a SparseTensor, which is the representation of
            # VarLenFeature and SparseFeature, we convert it to a dense
            # representation; further, if it's a scalar, we reshape it to a
            # vector.
            shape = tensor.dense_shape.numpy()  # first element is batch size
            if shape[1] == 0:
                # this feature is not defined for any of the examples in the
                # batch
                return np.repeat(default_value, shape[0])
            numpy_dense = tf.sparse_tensor_to_dense(
                tensor, default_value=default_value).numpy()
            if shape[1] == 1:
                # this is a scalar feature, reshape to a vector
                return numpy_dense.reshape(shape[0])
            else:
                return numpy_dense
        else:
            raise ValueError("This type %s is not supported!" %
                             type(tensor).__name__)

    dataset = Datasets._examples(file_pattern=file_pattern,
                                 schema_path=schema_path,
                                 feature_spec=feature_spec,
                                 compression_type=compression_type,
                                 batch_size=batch_size,
                                 shuffle=shuffle,
                                 num_epochs=num_epochs,
                                 shuffle_buffer_size=shuffle_buffer_size,
                                 shuffle_seed=shuffle_seed,
                                 prefetch_buffer_size=prefetch_buffer_size,
                                 reader_num_threads=reader_num_threads,
                                 parser_num_threads=parser_num_threads,
                                 sloppy_ordering=sloppy_ordering,
                                 drop_final_batch=drop_final_batch)
    for batch in dataset:
        yield {name: get_numpy(eager_tensor)
               for name, eager_tensor in six.iteritems(batch)}
```
Example #19
Source File: dataset.py From spotify-tensorflow with Apache License 2.0
```python
def examples_via_feature_spec(cls,
                              file_pattern,  # type: str
                              feature_spec,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                              default_value=0,  # type: float
                              batch_size=128,  # type: int
                              compression_type=None,  # type: str
                              shuffle=True,  # type: bool
                              num_epochs=1,  # type: int
                              shuffle_buffer_size=10000,  # type: int
                              shuffle_seed=42,  # type: int
                              prefetch_buffer_size=1,  # type: int
                              reader_num_threads=1,  # type: int
                              parser_num_threads=2,  # type: int
                              sloppy_ordering=False,  # type: bool
                              drop_final_batch=False  # type: bool
                              ):
    # type: (...) -> Iterator[Dict[str, np.ndarray]]
    """
    Read a TF dataset and load it into a dictionary of NumPy arrays.

    :param file_pattern: List of files or patterns of file paths containing
                         `Example` records. See `tf.gfile.Glob` for pattern rules
    :param feature_spec: TensorFlow feature spec
    :param default_value: Value used if a sparse feature is missing
    :param compression_type: TFRecord compression type, see
                             `tf.data.TFRecordDataset` doc
    :param batch_size: batch size, set to the size of the dataset to read all
                       data at once
    :param shuffle: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param num_epochs: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param shuffle_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param shuffle_seed: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param prefetch_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param reader_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param parser_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param sloppy_ordering: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param drop_final_batch: see `tensorflow.contrib.data.make_batched_features_dataset` doc

    :return: Dictionary of NumPy arrays
    """
    return cls._examples(file_pattern=file_pattern,
                         feature_spec=feature_spec,
                         default_value=default_value,
                         batch_size=batch_size,
                         compression_type=compression_type,
                         shuffle=shuffle,
                         num_epochs=num_epochs,
                         shuffle_buffer_size=shuffle_buffer_size,
                         shuffle_seed=shuffle_seed,
                         prefetch_buffer_size=prefetch_buffer_size,
                         reader_num_threads=reader_num_threads,
                         parser_num_threads=parser_num_threads,
                         sloppy_ordering=sloppy_ordering,
                         drop_final_batch=drop_final_batch)
```
Example #20
Source File: dataset.py From spotify-tensorflow with Apache License 2.0
```python
def examples_via_feature_spec(cls,
                              file_pattern,  # type: str
                              feature_spec,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                              compression_type=None,  # type: str
                              batch_size=128,  # type: int
                              shuffle=True,  # type: bool
                              num_epochs=1,  # type: int
                              shuffle_buffer_size=10000,  # type: int
                              shuffle_seed=None,  # type: int
                              prefetch_buffer_size=1,  # type: int
                              reader_num_threads=1,  # type: int
                              parser_num_threads=2,  # type: int
                              sloppy_ordering=False,  # type: bool
                              drop_final_batch=False  # type: bool
                              ):
    # type: (...) -> tf.data.Dataset
    """Get `Dataset` of parsed `Example` protos.

    :param file_pattern: List of files or patterns of file paths containing
                         `Example` records. See `tf.gfile.Glob` for pattern rules
    :param feature_spec: TensorFlow feature spec
    :param compression_type: TFRecord compression type, see
                             `tf.data.TFRecordDataset` doc
    :param batch_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param shuffle: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param num_epochs: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param shuffle_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param shuffle_seed: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param prefetch_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param reader_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param parser_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param sloppy_ordering: see `tensorflow.contrib.data.make_batched_features_dataset` doc
    :param drop_final_batch: see `tensorflow.contrib.data.make_batched_features_dataset` doc

    :return: `Dataset`, which holds results of the parsing of `Example` protos
    """
    return cls._examples(file_pattern,
                         feature_spec=feature_spec,
                         compression_type=compression_type,
                         batch_size=batch_size,
                         shuffle=shuffle,
                         num_epochs=num_epochs,
                         shuffle_buffer_size=shuffle_buffer_size,
                         shuffle_seed=shuffle_seed,
                         prefetch_buffer_size=prefetch_buffer_size,
                         reader_num_threads=reader_num_threads,
                         parser_num_threads=parser_num_threads,
                         sloppy_ordering=sloppy_ordering,
                         drop_final_batch=drop_final_batch)
```