Python tensorflow.SparseFeature() Examples

The following are 20 code examples of tensorflow.SparseFeature(), drawn from open source projects; the source file and originating project are noted above each example. You may also want to check out all available functions/classes of the module tensorflow.
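Before the examples, here is a minimal sketch of what a SparseFeature spec does at parse time: it stitches an index feature and a value feature from the same Example proto into one SparseTensor. The feature names "idx" and "val" and the size of 10 are illustrative, not taken from any of the projects below.

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    "idx": tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 4])),
    "val": tf.train.Feature(float_list=tf.train.FloatList(value=[12.0, 20.0])),
})).SerializeToString()

spec = {
    # Indices are read from "idx", values from "val"; the dense shape is [10].
    "sparse": tf.io.SparseFeature(
        index_key="idx", value_key="val", dtype=tf.float32, size=10),
}
parsed = tf.io.parse_single_example(example, spec)
# parsed["sparse"] is a tf.SparseTensor with indices [[1], [4]],
# values [12.0, 20.0], and dense_shape [10].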
Example #1
Source File: mcsv_coder.py    From code-snippets with Apache License 2.0
def encode_value(self, string_list, sparse_value):
    """Encode the value of this feature into the CSV line."""
    index, value = sparse_value
    if len(value) == len(index):
      if self._encoder:
        string_list[self._value_index] = self._encoder.encode_record(
            list(map(str, value)))
        string_list[self._index_index] = self._encoder.encode_record(
            list(map(str, index)))
      else:
        string_list[self._value_index] = str(value[0]) if value else ''
        string_list[self._index_index] = str(index[0]) if index else ''
    else:
      raise ValueError(
          'SparseFeature %r has value and index unaligned %r vs %r.' %
          (self._name, value, index)) 
Example #2
Source File: schema_utils.py    From transform with Apache License 2.0
def _feature_from_feature_spec(spec, name, domains):
  """Returns a representation of a Feature from a feature spec."""
  if isinstance(spec, tf.io.FixedLenFeature):
    if spec.default_value is not None:
      raise ValueError(
          'feature "{}" had default_value {}, but FixedLenFeature must have '
          'default_value=None'.format(name, spec.default_value))
    dims = [schema_pb2.FixedShape.Dim(size=size) for size in spec.shape]
    feature = schema_pb2.Feature(
        name=name,
        presence=schema_pb2.FeaturePresence(min_fraction=1.0),
        shape=schema_pb2.FixedShape(dim=dims))
  elif isinstance(spec, tf.io.VarLenFeature):
    feature = schema_pb2.Feature(name=name)
  else:
    raise TypeError(
        'Spec for feature "{}" was {} of type {}, expected a '
        'FixedLenFeature, VarLenFeature or SparseFeature'.format(
            name, spec, type(spec)))

  _set_type(name, feature, spec.dtype)
  _set_domain(name, feature, domains.get(name))
  return feature 
Example #3
Source File: example_decoders_test.py    From spotify-tensorflow with Apache License 2.0
def test_example_with_feature_spec_decoder(self):
        feature_spec = {
            "scalar_feature_1": tf.FixedLenFeature(shape=[], dtype=tf.int64),
            "scalar_feature_2": tf.FixedLenFeature(shape=[], dtype=tf.int64),
            "scalar_feature_3": tf.FixedLenFeature(shape=[], dtype=tf.float32),
            "varlen_feature_1": tf.VarLenFeature(dtype=tf.float32),
            "varlen_feature_2": tf.VarLenFeature(dtype=tf.string),
            "1d_vector_feature": tf.FixedLenFeature(shape=[1], dtype=tf.string),
            "2d_vector_feature": tf.FixedLenFeature(shape=[2, 2], dtype=tf.float32),
            "sparse_feature": tf.SparseFeature("sparse_feature_idx", "sparse_feature_value",
                                               tf.float32, 10),
        }

        dec = ExampleWithFeatureSpecDecoder(feature_spec)
        actual_json = json.loads(dec.to_json(self.example_str))
        expected_decoded = {
            "scalar_feature_1": 12,
            "scalar_feature_2": 12,
            "scalar_feature_3": 1.0,
            "varlen_feature_1": [89.0],
            "1d_vector_feature": ["this is a ,text"],
            "2d_vector_feature": [[1.0, 2.0], [3.0, 4.0]],
            "varlen_feature_2": ["female"],
            "sparse_feature_idx": [1, 4],
            "sparse_feature_value": [12.0, 20.0],
        }
        self.assertEqual(actual_json, expected_decoded) 
Example #4
Source File: dataset.py    From spotify-tensorflow with Apache License 2.0
def parse_schema(cls, schema_path):
        # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
        """
        Returns TensorFlow Feature Spec and parsed tf.metadata Schema for given tf.metadata Schema.

        :param schema_path: tf.metadata Schema path
        """
        schema = parse_schema_file(schema_path)
        return schema_to_feature_spec(schema), schema 
Example #5
Source File: dataset.py    From spotify-tensorflow with Apache License 2.0
def parse_schema_from_stats(cls, stats_path):
        # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
        """
        Returns TensorFlow Feature Spec and parsed tf.metadata Schema for given tf.metadata
        DatasetFeatureStatisticsList.

        :param stats_path: tf.metadata DatasetFeatureStatisticsList path
        """
        import tensorflow_data_validation as tfdv
        stats = tfdv.load_statistics(stats_path)
        schema = tfdv.infer_schema(stats)
        return schema_to_feature_spec(schema), schema 
Example #6
Source File: tf_schema_utils_test.py    From spotify-tensorflow with Apache License 2.0
def test_round_trip(self):
        feature_spec = {
            "scalar_feature_1": tf.FixedLenFeature(shape=[], dtype=tf.int64),
            "scalar_feature_2": tf.FixedLenFeature(shape=[], dtype=tf.int64),
            "scalar_feature_3": tf.FixedLenFeature(shape=[], dtype=tf.float32),
            "varlen_feature_1": tf.VarLenFeature(dtype=tf.float32),
            "varlen_feature_2": tf.VarLenFeature(dtype=tf.string),
            "1d_vector_feature": tf.FixedLenFeature(shape=[1], dtype=tf.string),
            "2d_vector_feature": tf.FixedLenFeature(shape=[2, 2], dtype=tf.float32),
            "sparse_feature": tf.SparseFeature("idx", "value", tf.float32, 10),
        }
        inferred_schema = feature_spec_to_schema(feature_spec)
        inferred_feature_spec = schema_to_feature_spec(inferred_schema)
        self.assertEqual(inferred_feature_spec, feature_spec) 
Example #7
Source File: mcsv_coder.py    From code-snippets with Apache License 2.0
def parse_value(self, string_list):
    """Parse the value of this feature from string list split from CSV line."""
    value_str = string_list[self._value_index]
    index_str = string_list[self._index_index]

    if value_str and self._reader:
      values = list(
          map(self._cast_fn, _decode_with_reader(value_str, self._reader)))
    elif value_str:
      values = [self._cast_fn(value_str)]
    else:
      values = []

    # In Python 2, if the value is too large to fit into an int, int(..) returns
    # a long, but ints are cheaper to use when possible.
    if index_str and self._reader:
      indices = list(map(int, _decode_with_reader(index_str, self._reader)))
    elif index_str:
      indices = [int(index_str)]
    else:
      indices = []

    # Check that all indices are in range.
    if indices:
      i_min, i_max = min(indices), max(indices)
      if i_min < 0 or i_max >= self._size:
        i_bad = i_min if i_min < 0 else i_max
        raise ValueError('SparseFeature %r has index %d out of range [0, %d)'
                         % (self._name, i_bad, self._size))

    if len(values) != len(indices):
      raise ValueError(
          'SparseFeature %r has indices and values of different lengths: '
          'values: %r, indices: %r' % (self._name, values, indices))

    return (np.asarray(indices), np.asarray(values)) 
Example #8
Source File: dataset.py    From spotify-tensorflow with Apache License 2.0
def _examples(cls,
                      file_pattern,  # type: str
                      schema_path=None,  # type: str
                      feature_spec=None,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                      default_value=0,  # type: float
                      compression_type=None,  # type: str
                      batch_size=128,  # type: int
                      shuffle=True,  # type: bool
                      num_epochs=1,  # type: int
                      shuffle_buffer_size=10000,  # type: int
                      shuffle_seed=None,  # type: int
                      prefetch_buffer_size=1,  # type: int
                      reader_num_threads=1,  # type: int
                      parser_num_threads=2,  # type: int
                      sloppy_ordering=False,  # type: bool
                      drop_final_batch=False  # type: bool
                      ):
            # type: (...) -> Iterator[pd.DataFrame]
            Datasets._assert_eager("DataFrame")
            dataset = Datasets.dict._examples(file_pattern=file_pattern,
                                              schema_path=schema_path,
                                              default_value=default_value,
                                              feature_spec=feature_spec,
                                              compression_type=compression_type,
                                              batch_size=batch_size,
                                              shuffle=shuffle,
                                              num_epochs=num_epochs,
                                              shuffle_buffer_size=shuffle_buffer_size,
                                              shuffle_seed=shuffle_seed,
                                              prefetch_buffer_size=prefetch_buffer_size,
                                              reader_num_threads=reader_num_threads,
                                              parser_num_threads=parser_num_threads,
                                              sloppy_ordering=sloppy_ordering,
                                              drop_final_batch=drop_final_batch)
            for d in dataset:
                yield pd.DataFrame(data=d) 
Example #9
Source File: schema_utils.py    From transform with Apache License 2.0
def _sparse_feature_from_feature_spec(spec, name, domains):
  """Returns a representation of a SparseFeature from a feature spec."""
  if isinstance(spec.index_key, list):
    raise ValueError(
        'SparseFeature "{}" had index_key {}, but size and index_key '
        'fields should be single values'.format(name, spec.index_key))
  if isinstance(spec.size, list):
    raise ValueError(
        'SparseFeature "{}" had size {}, but size and index_key fields '
        'should be single values'.format(name, spec.size))

  # Create an index feature.
  index_feature = schema_pb2.Feature(
      name=spec.index_key, type=schema_pb2.INT,
      int_domain=schema_pb2.IntDomain(min=0, max=spec.size - 1))

  # Create a value feature.
  value_feature = schema_pb2.Feature(name=spec.value_key)
  _set_type(name, value_feature, spec.dtype)
  _set_domain(name, value_feature, domains.get(name))

  # Create a sparse feature which refers to the index and value features.
  index_feature_ref = schema_pb2.SparseFeature.IndexFeature(
      name=spec.index_key)
  value_feature_ref = schema_pb2.SparseFeature.ValueFeature(
      name=spec.value_key)
  sparse_feature = schema_pb2.SparseFeature(
      name=name, is_sorted=True if spec.already_sorted else None,
      index_feature=[index_feature_ref], value_feature=value_feature_ref)

  return (index_feature, value_feature, sparse_feature) 
Example #10
Source File: mcsv_coder.py    From code-snippets with Apache License 2.0
def decode(self, csv_string):
    """Decodes the given string record according to the schema.

    Missing value handling is as follows:

    1.a) If a FixedLenFeature has a default value, use that value for missing
         entries.
    1.b) If a FixedLenFeature has no default value, throw an Exception on
         missing entries.

    2) For VarLenFeature return an empty array.

    3) For SparseFeature throw an Exception if only one of the indices or values
       has a missing entry. If both indices and values are missing, return
       a tuple of 2 empty arrays.

    For the case of multivalent columns a ValueError will occur if
    FixedLenFeature gets the wrong number of values, or a SparseFeature gets
    different length indices and values.

    Args:
      csv_string: String to be decoded.

    Returns:
      Dictionary of column name to value.

    Raises:
      DecodeError: If columns do not match specified csv headers.
      ValueError: If some numeric column has non-numeric data, if a
          SparseFeature has missing indices but not values or vice versa or
          multivalent data has the wrong length.
    """
    try:
      raw_values = self._reader.read_record(csv_string)
    except Exception as e:  # pylint: disable=broad-except
      raise DecodeError('%s: %s' % (e, csv_string))

    # An empty string when we expect a single column is potentially valid.  This
    # is probably more permissive than the CSV standard, but it lets us test
    # single-column CSV lines.
    if not raw_values and len(self._column_names) == 1:
      raw_values = ['']

    # Check record length mismatches.
    if len(raw_values) != len(self._column_names):
      raise DecodeError(
          'Columns do not match specified csv headers: %s -> %s' % (
              self._column_names, raw_values))

    return {feature_handler.name: feature_handler.parse_value(raw_values)
            for feature_handler in self._feature_handlers} 
Example #11
Source File: csv_coder.py    From transform with Apache License 2.0
def decode(self, csv_string):
    """Decodes the given string record according to the schema.

    Missing value handling is as follows:

    1. For FixedLenFeature:
        1. If it has a default value, use that value for missing entries.
        2. If it has no default value, throw an Exception on missing entries.

    2. For VarLenFeature return an empty array.

    3. For SparseFeature throw an Exception if only one of the indices or values
       has a missing entry. If both indices and values are missing, return
       a tuple of 2 empty arrays.

    For the case of multivalent columns a ValueError will occur if
    FixedLenFeature gets the wrong number of values, or a SparseFeature gets
    different length indices and values.

    Args:
      csv_string: String to be decoded.

    Returns:
      Dictionary of column name to value.

    Raises:
      DecodeError: If columns do not match specified csv headers.
      ValueError: If some numeric column has non-numeric data, if a
          SparseFeature has missing indices but not values or vice versa or
          multivalent data has the wrong length.
    """
    try:
      raw_values = self._reader.read_record(csv_string)
    except Exception as e:  # pylint: disable=broad-except
      raise DecodeError('%s: %s' % (e, csv_string))

    # An empty string when we expect a single column is potentially valid.  This
    # is probably more permissive than the CSV standard, but it lets us test
    # single-column CSV lines.
    if not raw_values and len(self._column_names) == 1:
      raw_values = ['']

    # Check record length mismatches.
    if len(raw_values) != len(self._column_names):
      raise DecodeError(
          'Columns do not match specified csv headers: {} -> {}'.format(
              self._column_names, raw_values))

    return {
        feature_handler.name: feature_handler.parse_value(raw_values)
        for feature_handler in self._feature_handlers
    } 
Example #12
Source File: schema_utils.py    From transform with Apache License 2.0
def _sparse_feature_as_feature_spec(feature, feature_by_name, string_domains):
  """Returns a representation of a SparseFeature as a feature spec."""
  index_keys = [index_feature.name for index_feature in feature.index_feature]
  index_features = []
  for index_key in index_keys:
    try:
      index_features.append(feature_by_name.pop(index_key))
    except KeyError:
      raise ValueError(
          'sparse_feature "{}" referred to index feature "{}" which did not '
          'exist in the schema'.format(feature.name, index_key))

  if len(index_features) != 1:
    raise ValueError(
        'sparse_feature "{}" had rank {} but currently only rank 1'
        ' sparse features are supported'.format(
            feature.name, len(index_features)))

  value_key = feature.value_feature.name
  try:
    value_feature = feature_by_name.pop(value_key)
  except KeyError:
    raise ValueError(
        'sparse_feature "{}" referred to value feature "{}" which did not '
        'exist in the schema or was referred to as an index or value multiple '
        'times.'.format(feature.name, value_key))

  if index_features[0].HasField('int_domain'):
    # Currently we only handle 0-based INT index features, whose minimum
    # domain value must be zero.
    if not index_features[0].int_domain.HasField('min'):
      raise ValueError('Cannot determine dense shape of sparse feature '
                       '"{}". The minimum domain value of index feature "{}"'
                       ' is not set.'.format(feature.name, index_keys[0]))
    if index_features[0].int_domain.min != 0:
      raise ValueError('Only 0-based index features are supported. Sparse '
                       'feature "{}" has index feature "{}" whose minimum '
                       'domain value is {}.'.format(
                           feature.name, index_keys[0],
                           index_features[0].int_domain.min))

    if not index_features[0].int_domain.HasField('max'):
      raise ValueError('Cannot determine dense shape of sparse feature '
                       '"{}". The maximum domain value of index feature "{}"'
                       ' is not set.'.format(feature.name, index_keys[0]))
    shape = [index_features[0].int_domain.max + 1]
  else:
    raise ValueError('Cannot determine dense shape of sparse feature "{}".'
                     ' The index feature "{}" had no int_domain set.'.format(
                         feature.name, index_keys[0]))

  dtype = _feature_dtype(value_feature)
  if len(index_keys) != len(shape):
    raise ValueError(
        'sparse_feature "{}" had rank {} (shape {}) but {} index keys were'
        ' given'.format(feature.name, len(shape), shape, len(index_keys)))
  spec = tf.io.SparseFeature(index_keys[0], value_key, dtype, shape[0],
                             feature.is_sorted)
  domain = _get_domain(value_feature, string_domains)
  return spec, domain 
Example #13
Source File: schema_utils.py    From transform with Apache License 2.0
def schema_as_feature_spec(schema_proto):
  """Generates a feature spec from a Schema proto.

  For a Feature with a FixedShape we generate a FixedLenFeature with no default.
  For a Feature without a FixedShape we generate a VarLenFeature.  For a
  SparseFeature we generate a SparseFeature.

  Args:
    schema_proto: A Schema proto.

  Returns:
    A pair (feature spec, domains) where feature spec is a dict whose keys are
        feature names and values are instances of FixedLenFeature, VarLenFeature
        or SparseFeature, and `domains` is a dict whose keys are feature names
        and values are one of the `domain_info` oneof, e.g. IntDomain.

  Raises:
    ValueError: If the schema proto is invalid.
  """
  for feature in schema_proto.feature:
    if RAGGED_TENSOR_TAG in feature.annotation.tag:
      raise ValueError(
          'Feature "{}" had tag "{}".  Features represented by a '
          'RaggedTensor cannot be serialized/deserialized to Example proto or '
          'other formats, and cannot have a feature spec generated for '
          'them.'.format(feature.name, RAGGED_TENSOR_TAG))

  if schema_utils_legacy.get_generate_legacy_feature_spec(schema_proto):
    return _legacy_schema_as_feature_spec(schema_proto)
  feature_spec = {}
  # Will hold the domain_info (IntDomain, FloatDomain etc.) of the feature.  For
  # sparse features, will hold the domain_info of the values feature.  Features
  # that do not have a domain set will not be present in `domains`.
  domains = {}
  feature_by_name = {feature.name: feature for feature in schema_proto.feature}
  string_domains = _get_string_domains(schema_proto)

  # Generate a `tf.SparseFeature` for each element of
  # `schema_proto.sparse_feature`.  This also removes the features from
  # `feature_by_name`.
  # TODO(KesterTong): Allow sparse features to share index features.
  for feature in schema_proto.sparse_feature:
    if _include_in_parsing_spec(feature):
      feature_spec[feature.name], domains[feature.name] = (
          _sparse_feature_as_feature_spec(
              feature, feature_by_name, string_domains))

  # Generate a `tf.FixedLenFeature` or `tf.VarLenFeature` for each element of
  # `schema_proto.feature` that was not referenced by a `SparseFeature`.
  for name, feature in feature_by_name.items():
    if _include_in_parsing_spec(feature):
      feature_spec[name], domains[name] = _feature_as_feature_spec(
          feature, string_domains)

  schema_utils_legacy.check_for_unsupported_features(schema_proto)

  domains = {name: domain for name, domain in domains.items()
             if domain is not None}
  return SchemaAsFeatureSpecResult(feature_spec, domains) 
Example #14
Source File: schema_utils.py    From transform with Apache License 2.0
def schema_from_feature_spec(feature_spec, domains=None):
  """Convert a feature spec to a Schema proto.

  Args:
    feature_spec: A TensorFlow feature spec
    domains: (optional) a dict whose keys are feature names and values are one
        of schema_pb2.IntDomain, schema_pb2.StringDomain or
        schema_pb2.FloatDomain.

  Returns:
    A Schema proto

  Raises:
    ValueError: If the feature spec cannot be converted to a Schema proto.
  """
  if domains is None:
    domains = {}

  result = schema_pb2.Schema()

  # Some feature specs can only be represented with the legacy schema, in
  # particular feature specs where any FixedLenFeature has default_value set.
  # We represent these (and only these) using a schema with
  # generate_legacy_feature_spec=True.  Note the generate_legacy_feature_spec
  # field is not part of the open source codebase.
  if schema_utils_legacy.should_set_generate_legacy_feature_spec(feature_spec):
    return _legacy_schema_from_feature_spec(feature_spec, domains)

  schema_utils_legacy.set_generate_legacy_feature_spec(result, False)

  # Add the features to the schema.
  for name, spec in sorted(feature_spec.items()):
    if isinstance(spec, tf.io.SparseFeature):
      (index_feature, value_feature, sparse_feature) = (
          _sparse_feature_from_feature_spec(spec, name, domains))
      result.feature.add().CopyFrom(index_feature)
      result.feature.add().CopyFrom(value_feature)
      result.sparse_feature.add().CopyFrom(sparse_feature)
    else:
      result.feature.add().CopyFrom(
          _feature_from_feature_spec(spec, name, domains))
  return result 
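Together with schema_as_feature_spec from Example #13, this function gives a round trip between feature specs and Schema protos. A minimal usage sketch, assuming the public module path tensorflow_transform.tf_metadata.schema_utils (where the transform codebase exposes both functions):

import tensorflow as tf
from tensorflow_transform.tf_metadata import schema_utils

feature_spec = {
    "terms": tf.io.SparseFeature("term_idx", "term_val", tf.float32, 100),
}
schema = schema_utils.schema_from_feature_spec(feature_spec)
# The schema now holds two plain features ("term_idx", "term_val") plus a
# sparse_feature entry "terms" that references them.
result = schema_utils.schema_as_feature_spec(schema)
assert result.feature_spec == feature_spec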
Example #15
Source File: cscGAN.py    From scGAN with MIT License
def make_input_fn(self, file_paths, epochs=None):
        """
        Function that loads the TFRecords files and creates the placeholders
        for the data inputs.

        Parameters
        ----------
        file_paths : list
            List of TFRecord files from which to read from.
        epochs : int
            Integer specifying the number of times to read through the dataset.
            If None, cycles through the dataset forever.
            NOTE - If specified, creates a variable that must be initialized,
            so call tf.local_variables_initializer() and run the op in a session.
            Default is None.

        Returns
        -------
        features : Tensor
            Tensor containing a batch of cells (vector of expression levels).
        cluster : Tensor
            Tensor containing (a batch of) the cluster indexes of the
            corresponding cells.
        """

        feature_map = {'scg': tf.SparseFeature(index_key='indices',
                                               value_key='values',
                                               dtype=tf.float32,
                                               size=self.genes_no),
                       'cluster_int': tf.FixedLenFeature(1, tf.int64)}

        options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.GZIP)

        batched_features = tf.contrib.learn.read_batch_features(
            file_pattern=file_paths,
            batch_size=self.batch_size,
            features=feature_map,
            reader=lambda: tf.TFRecordReader(
                options=options),
            num_epochs=epochs)

        sgc = batched_features['scg']

        sparse = tf.sparse_reshape(sgc, (self.batch_size, self.genes_no))

        dense = tf.sparse_tensor_to_dense(sparse)

        cluster = tf.squeeze(tf.to_int32(batched_features['cluster_int']))

        features = tf.reshape(dense, (self.batch_size, self.genes_no))

        return features, cluster 
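The tf.contrib.learn.read_batch_features pipeline above is TF 1.x API that has since been removed. A rough tf.data equivalent of the same input function, sketched under the same feature_map (an illustration, not part of scGAN):

import tensorflow as tf

def make_dataset(file_paths, genes_no, batch_size, epochs=None):
    feature_map = {'scg': tf.io.SparseFeature(index_key='indices',
                                              value_key='values',
                                              dtype=tf.float32,
                                              size=genes_no),
                   'cluster_int': tf.io.FixedLenFeature(1, tf.int64)}
    dataset = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
    # repeat(None) cycles forever, matching the epochs=None behaviour above.
    dataset = dataset.repeat(epochs).batch(batch_size, drop_remainder=True)
    # parse_example on a batch of serialized protos yields a dict holding a
    # SparseTensor for 'scg' and a dense Tensor for 'cluster_int'.
    return dataset.map(lambda batch: tf.io.parse_example(batch, feature_map))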
Example #16
Source File: scGAN.py    From scGAN with MIT License
def make_input_fn(self, file_paths, epochs=None):
        """
        Function that loads the TFRecords files and creates the placeholders
        for the data inputs.

        Parameters
        ----------
        file_paths : list
            List of TFRecord files from which to read from.
        epochs : int
            Integer specifying the number of times to read through the dataset.
            If None, cycles through the dataset forever.
            NOTE - If specified, creates a variable that must be initialized,
            so call tf.local_variables_initializer() and run the op in a session.
            Default is None.

        Returns
        -------
        features : Tensor
            Tensor containing a batch of cells (vector of expression levels).
        """

        feature_map = {'scg': tf.SparseFeature(index_key='indices',
                                               value_key='values',
                                               dtype=tf.float32,
                                               size=self.genes_no)
                       }

        options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.GZIP)

        batched_features = tf.contrib.learn.read_batch_features(
            file_pattern=file_paths,
            batch_size=self.batch_size,
            features=feature_map,
            reader=lambda: tf.TFRecordReader(options=options),
            num_epochs=epochs
            )

        sgc = batched_features['scg']

        sparse = tf.sparse_reshape(sgc, (self.batch_size, self.genes_no))

        dense = tf.sparse_tensor_to_dense(sparse)

        features = tf.reshape(dense, (self.batch_size, self.genes_no))

        return features 
Example #17
Source File: dataset.py    From spotify-tensorflow with Apache License 2.0
def examples_via_feature_spec(cls,
                                      file_pattern,  # type: str
                                      feature_spec,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                                      default_value=0,  # type: float
                                      batch_size=128,  # type: int
                                      compression_type=None,  # type: str
                                      shuffle=True,  # type: bool
                                      num_epochs=1,  # type: int
                                      shuffle_buffer_size=10000,  # type: int
                                      shuffle_seed=42,  # type: int
                                      prefetch_buffer_size=1,  # type: int
                                      reader_num_threads=1,  # type: int
                                      parser_num_threads=2,  # type: int
                                      sloppy_ordering=False,  # type: bool
                                      drop_final_batch=False  # type: bool
                                      ):
            # type: (...) -> Iterator[pd.DataFrame]
            """
            Read a TF dataset in batches, yielding each batch as a Pandas DataFrame.

            :param file_pattern: List of files or patterns of file paths containing
                                 `Example` records. See `tf.gfile.Glob` for pattern rules
            :param feature_spec: TensorFlow feature spec
            :param default_value: Value used if a sparse feature is missing.
            :param batch_size: batch size, set to the size of the dataset to read all data at once
            :param compression_type: TFRecord compression type, see `tf.data.TFRecordDataset` doc
            :param shuffle: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param num_epochs: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param shuffle_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset`
                                        doc
            :param shuffle_seed: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param prefetch_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset`
                                         doc
            :param reader_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset`
                                       doc
            :param parser_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset`
                                       doc
            :param sloppy_ordering: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param drop_final_batch: see `tensorflow.contrib.data.make_batched_features_dataset` doc

            :return: A Python Generator, yielding batches of data in a Pandas DataFrame
            """
            return cls._examples(file_pattern=file_pattern,
                                 feature_spec=feature_spec,
                                 default_value=default_value,
                                 batch_size=batch_size,
                                 compression_type=compression_type,
                                 shuffle=shuffle,
                                 num_epochs=num_epochs,
                                 shuffle_buffer_size=shuffle_buffer_size,
                                 shuffle_seed=shuffle_seed,
                                 prefetch_buffer_size=prefetch_buffer_size,
                                 reader_num_threads=reader_num_threads,
                                 parser_num_threads=parser_num_threads,
                                 sloppy_ordering=sloppy_ordering,
                                 drop_final_batch=drop_final_batch) 
Example #18
Source File: dataset.py    From spotify-tensorflow with Apache License 2.0
def _examples(cls,
                      file_pattern,  # type: str
                      schema_path=None,  # type: str
                      feature_spec=None,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                      default_value=0,  # type: float
                      compression_type=None,  # type: str
                      batch_size=128,  # type: int
                      shuffle=True,  # type: bool
                      num_epochs=1,  # type: int
                      shuffle_buffer_size=10000,  # type: int
                      shuffle_seed=None,  # type: int
                      prefetch_buffer_size=1,  # type: int
                      reader_num_threads=1,  # type: int
                      parser_num_threads=2,  # type: int
                      sloppy_ordering=False,  # type: bool
                      drop_final_batch=False  # type: bool
                      ):
            # type: (...) -> Iterator[Dict[str, np.ndarray]]
            Datasets._assert_eager("Dictionary")

            def get_numpy(tensor):
                if isinstance(tensor, tf.Tensor):
                    return tensor.numpy()
                elif isinstance(tensor, tf.SparseTensor):
                    # If it's a SparseTensor, which is the representation of
                    # VarLenFeature and SparseFeature, we convert it to a dense
                    # representation, and if it's a scalar, we reshape it to a
                    # vector.

                    shape = tensor.dense_shape.numpy()
                    # first element is batch size
                    if shape[1] == 0:
                        # this feature is not defined for any of the examples in the batch
                        return np.repeat(default_value, shape[0])

                    numpy_dense = tf.sparse_tensor_to_dense(tensor,
                                                            default_value=default_value).numpy()
                    if shape[1] == 1:
                        # this is a scalar feature, reshape to a vector
                        return numpy_dense.reshape(shape[0])
                    else:
                        return numpy_dense
                else:
                    raise ValueError("This type %s is not supported!", type(tensor).__name__)

            dataset = Datasets._examples(file_pattern=file_pattern,
                                         schema_path=schema_path,
                                         feature_spec=feature_spec,
                                         compression_type=compression_type,
                                         batch_size=batch_size,
                                         shuffle=shuffle,
                                         num_epochs=num_epochs,
                                         shuffle_buffer_size=shuffle_buffer_size,
                                         shuffle_seed=shuffle_seed,
                                         prefetch_buffer_size=prefetch_buffer_size,
                                         reader_num_threads=reader_num_threads,
                                         parser_num_threads=parser_num_threads,
                                         sloppy_ordering=sloppy_ordering,
                                         drop_final_batch=drop_final_batch)
            for batch in dataset:
                yield {name: get_numpy(eager_tensor) for name, eager_tensor in six.iteritems(batch)} 
Example #19
Source File: dataset.py    From spotify-tensorflow with Apache License 2.0
def examples_via_feature_spec(cls,
                                      file_pattern,  # type: str
                                      feature_spec,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                                      default_value=0,  # type: float
                                      batch_size=128,  # type: int
                                      compression_type=None,  # type: str
                                      shuffle=True,  # type: bool
                                      num_epochs=1,  # type: int
                                      shuffle_buffer_size=10000,  # type: int
                                      shuffle_seed=42,  # type: int
                                      prefetch_buffer_size=1,  # type: int
                                      reader_num_threads=1,  # type: int
                                      parser_num_threads=2,  # type: int
                                      sloppy_ordering=False,  # type: bool
                                      drop_final_batch=False  # type: bool
                                      ):
            # type: (...) -> Iterator[Dict[str, np.ndarray]]
            """
            Read a TF dataset in batches, yielding each batch as a dictionary of NumPy arrays.

            :param file_pattern: List of files or patterns of file paths containing
                                 `Example` records. See `tf.gfile.Glob` for pattern rules
            :param feature_spec: TensorFlow feature spec
            :param default_value: Value used if a sparse feature is missing.
            :param compression_type: TFRecord compression type, see `tf.data.TFRecordDataset` doc
            :param batch_size: batch size, set to the size of the dataset to read all data at once
            :param shuffle: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param num_epochs: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param shuffle_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset`
                                        doc
            :param shuffle_seed: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param prefetch_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset`
                                         doc
            :param reader_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset`
                                       doc
            :param parser_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset`
                                       doc
            :param sloppy_ordering: see `tensorflow.contrib.data.make_batched_features_dataset` doc
            :param drop_final_batch: see `tensorflow.contrib.data.make_batched_features_dataset` doc

            :return: A Python Generator, yielding dictionaries of NumPy arrays
            """
            return cls._examples(file_pattern=file_pattern,
                                 feature_spec=feature_spec,
                                 default_value=default_value,
                                 batch_size=batch_size,
                                 compression_type=compression_type,
                                 shuffle=shuffle,
                                 num_epochs=num_epochs,
                                 shuffle_buffer_size=shuffle_buffer_size,
                                 shuffle_seed=shuffle_seed,
                                 prefetch_buffer_size=prefetch_buffer_size,
                                 reader_num_threads=reader_num_threads,
                                 parser_num_threads=parser_num_threads,
                                 sloppy_ordering=sloppy_ordering,
                                 drop_final_batch=drop_final_batch) 
Example #20
Source File: dataset.py    From spotify-tensorflow with Apache License 2.0
def examples_via_feature_spec(cls,
                                  file_pattern,  # type: str
                                  feature_spec,  # type: Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]]  # noqa: E501
                                  compression_type=None,  # type: str
                                  batch_size=128,  # type: int
                                  shuffle=True,  # type: bool
                                  num_epochs=1,  # type: int
                                  shuffle_buffer_size=10000,  # type: int
                                  shuffle_seed=None,  # type: int
                                  prefetch_buffer_size=1,  # type: int
                                  reader_num_threads=1,  # type: int
                                  parser_num_threads=2,  # type: int
                                  sloppy_ordering=False,  # type: bool
                                  drop_final_batch=False  # type: bool
                                  ):
        # type: (...) -> tf.data.Dataset
        """Get `Dataset` of parsed `Example` protos.

        :param file_pattern: List of files or patterns of file paths containing
                             `Example` records. See `tf.gfile.Glob` for pattern rules
        :param feature_spec: TensorFlow feature spec
        :param compression_type: TFRecord compression type, see `tf.data.TFRecordDataset` doc
        :param batch_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param shuffle: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param num_epochs: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param shuffle_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param shuffle_seed: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param prefetch_buffer_size: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param reader_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param parser_num_threads: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param sloppy_ordering: see `tensorflow.contrib.data.make_batched_features_dataset` doc
        :param drop_final_batch: see `tensorflow.contrib.data.make_batched_features_dataset` doc

        :return: A `Dataset` holding the parsed `Example` protos
        """
        return cls._examples(file_pattern,
                             feature_spec=feature_spec,
                             compression_type=compression_type,
                             batch_size=batch_size,
                             shuffle=shuffle,
                             num_epochs=num_epochs,
                             shuffle_buffer_size=shuffle_buffer_size,
                             shuffle_seed=shuffle_seed,
                             prefetch_buffer_size=prefetch_buffer_size,
                             reader_num_threads=reader_num_threads,
                             parser_num_threads=parser_num_threads,
                             sloppy_ordering=sloppy_ordering,
                             drop_final_batch=drop_final_batch)