Python tensorflow_datasets.load() Examples

The following are 30 code examples of tensorflow_datasets.load(), collected from open-source projects. The source project, file, and license for each example are noted above its code.
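Before the project-specific examples below, the call pattern most of them build on can be summarized in a minimal, self-contained sketch (the dataset name, batch size, and shuffle buffer here are arbitrary illustrative choices):

import tensorflow as tf
import tensorflow_datasets as tfds

# Load the train split together with its metadata; as_supervised=True yields
# (image, label) tuples instead of feature dictionaries.
ds, info = tfds.load("mnist", split="train", as_supervised=True, with_info=True)

# Typical input pipeline: normalize, shuffle, batch, prefetch.
ds = ds.map(lambda image, label: (tf.cast(image, tf.float32) / 255.0, label))
ds = ds.shuffle(10000).batch(32).prefetch(tf.data.experimental.AUTOTUNE)

print(info.features["label"].num_classes)  # 10 for MNIST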
Example #1
Source File: squad.py    From exbert with Apache License 2.0
def get_train_examples(self, data_dir, filename=None):
        """
        Returns the training examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the training file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        """
        if data_dir is None:
            data_dir = ""

        if self.train_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(
            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
        ) as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "train") 
Example #2
Source File: mnist_tutorial.py    From cleverhans with MIT License
def ld_mnist():
  """Load training and test data."""

  def convert_types(image, label):
    image = tf.cast(image, tf.float32)
    image /= 255
    return image, label

  dataset, info = tfds.load('mnist', 
                            data_dir='gs://tfds-data/datasets', 
                            with_info=True,
                            as_supervised=True)
  mnist_train, mnist_test = dataset['train'], dataset['test']
  mnist_train = mnist_train.map(convert_types).shuffle(10000).batch(128)
  mnist_test = mnist_test.map(convert_types).batch(128)
  return EasyDict(train=mnist_train, test=mnist_test) 
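For orientation, the EasyDict returned above can be iterated directly under TF2 eager execution; a hypothetical usage snippet, not part of the cleverhans tutorial:

data = ld_mnist()
for images, labels in data.train.take(1):
    print(images.shape, labels.shape)  # e.g. (128, 28, 28, 1) and (128,)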
Example #3
Source File: datasets.py    From compare_gan with Apache License 2.0
def _load_dataset(self, split):
    """Loads the underlying dataset split from disk.

    Args:
      split: Name of the split to load.

    Returns:
      Returns a `tf.data.Dataset` object with a tuple of image and label tensor.
    """
    if FLAGS.data_fake_dataset:
      return self._make_fake_dataset(split)
    ds = tfds.load(
        self._tfds_name,
        split=split,
        data_dir=FLAGS.tfds_data_dir,
        as_dataset_kwargs={"shuffle_files": False})
    ds = self._replace_labels(split, ds)
    ds = ds.map(self._parse_fn)
    return ds.prefetch(tf.contrib.data.AUTOTUNE) 
Example #4
Source File: data.py    From IIC with MIT License
def load(data_set_name, **kwargs):
    """
    :param data_set_name: data set name--call tfds.list_builders() for options
    :return:
        train_ds: TensorFlow Dataset object for the training data
        test_ds: TensorFlow Dataset object for the testing data
        info: data set info object
    """
    # get data and its info
    ds, info = tfds.load(name=data_set_name, split=tfds.Split.ALL, with_info=True)

    # configure the data sets
    if 'train' in info.splits:
        train_ds = configure_data_set(ds=ds, info=info, is_training=True, **kwargs)
    else:
        train_ds = None
    if 'test' in info.splits:
        test_ds = configure_data_set(ds=ds, info=info, is_training=False, **kwargs)
    else:
        test_ds = None

    return train_ds, test_ds, info 
Example #5
Source File: transfo_experiment.py    From axcell with Apache License 2.0
def prepare_glue_examples(tokenizer, task_name='mrpc', split_name='train'):
    processor = glue_processors[task_name]()

    def tf_mrpc_to_pytorch(d):
        for ex in d:
            ex = processor.get_example_from_tensor_dict(ex)
            #        ex = processor.tfds_map(ex)
            yield ex

    tf_data = tensorflow_datasets.load(f"glue/{task_name}")[split_name]
    examples = tf_mrpc_to_pytorch(tf_data)
    features = glue_convert_examples_to_features(examples,
                                                 tokenizer,
                                                 max_length=128,
                                                 task='mrpc')

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset 
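The TensorDataset built above plugs into a standard PyTorch DataLoader; a hypothetical usage sketch in which the tokenizer object and the batch size are assumptions, not taken from the axcell code:

from torch.utils.data import DataLoader

train_dataset = prepare_glue_examples(tokenizer, task_name='mrpc', split_name='train')
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
for input_ids, attention_mask, token_type_ids, labels in train_loader:
    pass  # feed the batch to the model here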
Example #6
Source File: squad.py    From exbert with Apache License 2.0
def get_dev_examples(self, data_dir, filename=None):
        """
        Returns the evaluation examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        """
        if data_dir is None:
            data_dir = ""

        if self.dev_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        with open(
            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
        ) as reader:
            input_data = json.load(reader)["data"]
        return self._create_examples(input_data, "dev") 
Example #7
Source File: loaders.py    From neural-structured-learning with Apache License 2.0
def load_data_planetoid(name, path, splits_path=None, row_normalize=False,
                        data_container_class=PlanetoidDataset):
  """Load Planetoid data."""
  if splits_path is None:
    # Load from file in Planetoid format.
    (adj, features, _, _, _, train_mask, val_mask, test_mask,
     labels) = load_from_planetoid_files(name, path)
  else:
    # Otherwise load from a path where we saved a pickle with random splits.
    logging.info('Loading from splits path: %s', splits_path)
    (adj, features, _, _, _, train_mask, val_mask, test_mask,
     labels) = pickle.load(open(splits_path, 'rb'))

  return data_container_class.build_from_adjacency_matrix(
      name,
      adj,
      features,
      train_mask,
      val_mask,
      test_mask,
      labels,
      row_normalize=row_normalize) 
Example #8
Source File: robust_model.py    From interval-bound-propagation with Apache License 2.0
def _build(self):
    dataset = tfds.load(name=self._dataset_name, split=self._mode)
    minibatch = dataset.map(parse).repeat()

    if self._shuffle:
      minibatch = minibatch.shuffle(self._batch_size*100)
    minibatch = minibatch.batch(
        self._batch_size).make_one_shot_iterator().get_next()
    minibatch['sentiment'].set_shape([self._batch_size])
    minibatch['sentence'] = tf.SparseTensor(
        indices=minibatch['sentence'].indices,
        values=minibatch['sentence'].values,
        dense_shape=[self._batch_size, minibatch['sentence'].dense_shape[1]])
    # minibatch['sentence'] is a sparse tensor with dense shape
    # [batch_size, seq_length]; num_tokens has shape [batch_size].
    return Dataset(
        tokens=minibatch['sentence'],
        num_tokens=self.get_row_lengths(minibatch['sentence']),
        sentiment=minibatch['sentiment'],
    ) 
Example #9
Source File: datasets.py    From mobilenetv3-tensorflow with Apache License 2.0
def build_dataset(
    shape: Tuple[int, int],
    name: str="mnist",
    train_batch_size: int=32,
    valid_batch_size: int=32
    ):

    dataset = {}
    builder = tfds.builder(name)
    dataset["num_train"] = builder.info.splits['train'].num_examples
    dataset["num_test"] = builder.info.splits['test'].num_examples

    [ds_train, ds_test], info = tfds.load(name=name, split=["train", "test"], with_info=True)
    dataset["num_classes"] = info.features["label"].num_classes
    dataset["channels"] = ds_train.output_shapes["image"][-1].value

    ds_train = ds_train.shuffle(1024).repeat()
    ds_train = ds_train.map(lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["train"] = ds_train.batch(train_batch_size)

    ds_test = ds_test.shuffle(1024).repeat()
    ds_test = ds_test.map(lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["test"] = ds_test.batch(valid_batch_size)

    return dataset 
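A hypothetical call of build_dataset with a Keras model (the input shape, batch sizes, and the model object are illustrative assumptions):

dataset = build_dataset(shape=(96, 96), name="mnist", train_batch_size=32, valid_batch_size=32)
# model.fit(dataset["train"],
#           steps_per_epoch=dataset["num_train"] // 32,
#           validation_data=dataset["test"],
#           validation_steps=dataset["num_test"] // 32)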
Example #10
Source File: image.py    From stacked_capsule_autoencoders with Apache License 2.0
def _create_mnist(subset, batch_size, **kwargs):
  return tfds.load(
      name='mnist', split=subset, **kwargs).repeat().batch(batch_size) 
Example #11
Source File: transfo_experiment.py    From axcell with Apache License 2.0
def glue_dataset_to_df(task_name):
    data = tensorflow_datasets.load(f"glue/{task_name}")
    new_dict = {}
    for name, dataset in data.items():
        new_dict[name] = pd.DataFrame.from_records([strip_tensors(r) for r in dataset],
                                                   columns=dataset.output_shapes.keys(),
                                                   index='idx')
    return new_dict.get('train', None), new_dict.get('validation', None), new_dict.get('test', None) 
Example #12
Source File: __init__.py    From graphics with Apache License 2.0
def load(*args, **kwargs):
    return tfds.load('model_net40', *args, **kwargs) 
Example #13
Source File: mnist_dpsgd_tutorial_common.py    From privacy with Apache License 2.0
def make_input_fn(split, input_batch_size=256, repetitions=-1, tpu=False):
  """Make input function on given MNIST split."""

  def input_fn(params=None):
    """A simple input function."""
    batch_size = params.get('batch_size', input_batch_size)

    def parser(example):
      image, label = example['image'], example['label']
      image = tf.cast(image, tf.float32)
      image /= 255.0
      label = tf.cast(label, tf.int32)
      return image, label

    dataset = tfds.load(name='mnist', split=split)
    dataset = dataset.map(parser).shuffle(60000).repeat(repetitions).batch(
        batch_size)
    # If this input function is not meant for TPUs, we can stop here.
    # Otherwise, we need to explicitly set its shape. Note that for unknown
    # reasons, returning the latter format causes performance regression
    # on non-TPUs.
    if not tpu:
      return dataset

    # Give inputs statically known shapes; needed for TPUs.
    images, labels = tf.data.make_one_shot_iterator(dataset).get_next()
    # return images, labels
    images.set_shape([batch_size, 28, 28, 1])
    labels.set_shape([
        batch_size,
    ])
    return images, labels

  return input_fn 
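The returned closure is intended for the tf.estimator API; a hypothetical wiring sketch in which the estimator object and step count are assumptions:

train_input_fn = make_input_fn('train', input_batch_size=256)
eval_input_fn = make_input_fn('test', input_batch_size=256, repetitions=1)
# estimator.train(input_fn=train_input_fn, steps=1000)
# estimator.evaluate(input_fn=eval_input_fn)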
Example #14
Source File: input.py    From BigGAN-TPU-TensorFlow with MIT License
def tfds_input_fn(params, dataset, is_training=True):

	dataset = tfds.load(
		name=dataset, 
		split=tfds.Split.TRAIN if is_training else tfds.Split.TEST,
		data_dir=params['data_dir'])

	if params['take_examples'] is not None:
		dataset = dataset.take(params['take_examples'])

	dataset = dataset.shuffle(params['batch_size']*20)
	dataset = dataset.repeat()
	dataset = dataset.batch(params['batch_size'], drop_remainder=True)
	dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

	def map_fn(features):
		image = tf.cast(features["image"], tf.float32) / 127.5 - 1

		if "label" in features:
			label = tf.one_hot(features["label"], params['num_labels'], dtype=tf.float32)
		else:
			label = tf.zeros([params['batch_size'], 1])

		return image, label

	dataset = dataset.map(map_fn)
	return dataset 
Example #15
Source File: t2t.py    From BERT with Apache License 2.0
def train_and_eval_dataset(dataset_name, data_dir):
  """Return train and evaluation datasets, feature info and supervised keys.

  Args:
    dataset_name: a string, the name of the dataset; if it starts with "v1_"
      then we'll search T2T Problem registry for it, otherwise we assume it
      is a dataset from TFDS and load it from there.
    data_dir: directory where the data is located.

  Returns:
    a 4-tuple consisting of:
     * the train tf.data.Dataset
     * the eval tf.data.Dataset
     * information about features: a python dictionary with feature names
         as keys and an object as value that provides .shape and .num_classes.
     * supervised_keys: information about which feature is the input and which is the target,
         i.e., a pair of lists with input and target feature names.
  """
  if dataset_name.startswith("v1_"):
    return _train_and_eval_dataset_v1(dataset_name[3:], data_dir)
  dataset_builder = tfds.builder(dataset_name, data_dir=data_dir)
  info = dataset_builder.info
  splits = dataset_builder.info.splits
  if tfds.Split.TRAIN not in splits:
    raise ValueError("To train we require a train split in the dataset.")
  if tfds.Split.VALIDATION not in splits and "test" not in splits:
    raise ValueError("We require a validation or test split in the dataset.")
  eval_split = tfds.Split.VALIDATION
  if tfds.Split.VALIDATION not in splits:
    eval_split = tfds.Split.TEST
  train, valid = tfds.load(
      name=dataset_name, split=[tfds.Split.TRAIN, eval_split])
  keys = None
  if info.supervised_keys:
    keys = ([info.supervised_keys[0]], [info.supervised_keys[1]])
  return train, valid, info.features, keys 
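A hypothetical unpacking of the 4-tuple documented above (the data_dir value is an illustrative assumption):

train_ds, eval_ds, feature_info, supervised_keys = train_and_eval_dataset(
    "mnist", data_dir="~/tensorflow_datasets")
# For MNIST, supervised_keys is (["image"], ["label"]) and
# feature_info["label"].num_classes is 10.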
Example #16
Source File: dataset.py    From mesh with Apache License 2.0
def untokenized_tfds_dataset(dataset_name=gin.REQUIRED,
                             text2self=gin.REQUIRED,
                             tfds_data_dir=gin.REQUIRED,
                             dataset_split=gin.REQUIRED,
                             batch_size=None,
                             sequence_length=gin.REQUIRED,
                             vocabulary=gin.REQUIRED,
                             pack=gin.REQUIRED):
  """Reads a tensorflow_datasets dataset.

  Returns a tf.data.Dataset containing single tokenized examples where each
  feature ends in EOS=1.

  Args:
    dataset_name: a string
    text2self: a boolean, if true, run unsupervised LM-style training. if false,
      the dataset must support supervised mode.
    tfds_data_dir: a string, path to the TFDS data directory
    dataset_split: a string
    batch_size: an integer
    sequence_length: an integer
    vocabulary: a vocabulary.Vocabulary
    pack: if True, multiple examples emitted by load_internal() are concatenated
        to form one combined example.
  Returns:
    a tf.data.Dataset of batches
  """
  del batch_size
  dataset = tfds.load(
      dataset_name, split=dataset_split,
      as_supervised=not text2self, data_dir=tfds_data_dir)
  if dataset_split == "train":
    dataset = dataset.repeat()
    dataset = dataset.shuffle(1000)
  if not text2self:
    dataset = supervised_to_dict(dataset, text2self)
  dataset = encode_all_features(dataset, vocabulary)
  return pack_or_pad(dataset, sequence_length, pack) 
Example #17
Source File: run.py    From polyaxon with Apache License 2.0
def make_datasets_unbatched():
    BUFFER_SIZE = 10000

    # Scaling MNIST data from (0, 255] to (0., 1.]
    def scale(image, label):
        image = tf.cast(image, tf.float32)
        image /= 255
        return image, label

    datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)

    return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE) 
Example #18
Source File: cifar10_input.py    From Live-feed-object-device-identification-using-Tensorflow-and-OpenCV with Apache License 2.0
def _get_images_labels(batch_size, split, distords=False):
  """Returns Dataset for given split."""
  dataset = tfds.load(name='cifar10', split=split)
  scope = 'data_augmentation' if distords else 'input'
  with tf.name_scope(scope):
    dataset = dataset.map(DataPreprocessor(distords), num_parallel_calls=10)
  # Dataset is small enough to be fully loaded on memory:
  dataset = dataset.prefetch(-1)
  dataset = dataset.repeat().batch(batch_size)
  iterator = dataset.make_one_shot_iterator()
  images_labels = iterator.get_next()
  images, labels = images_labels['input'], images_labels['target']
  tf.summary.image('images', images)
  return images, labels 
Example #19
Source File: utils.py    From text-to-text-transfer-transformer with Apache License 2.0
def load(self, split, shuffle_files):
    """Returns a tf.data.Dataset for the given split."""
    split = self._map_split(split)
    return tfds.load(
        self._name,
        split=split,
        data_dir=self.data_dir,
        shuffle_files=shuffle_files,
        download=True,
        try_gcs=True) 
Example #20
Source File: utils.py    From text-to-text-transfer-transformer with Apache License 2.0
def get_cached_stats(self, split=tfds.Split.TRAIN):
    """Returns basic statistics for cached dataset."""
    self.assert_cached()
    if split not in self._stats:
      stats_path = get_stats_path(self.cache_dir, split)
      if not tf.io.gfile.exists(stats_path):
        raise ValueError(
            "Stats do not exist for '%s' split: %s" % (self.name, split))
      with tf.io.gfile.GFile(stats_path) as f:
        self._stats[split] = json.load(f)
    return self._stats[split] 
Example #21
Source File: utils.py    From text-to-text-transfer-transformer with Apache License 2.0
def _get_cached_dataset(self, split=tfds.Split.TRAIN, shuffle=True):
    """Returns a tf.data.Dataset read from cached files."""
    self.assert_cached()
    with tf.io.gfile.GFile(get_info_path(self.cache_dir, split)) as f:
      split_info = json.load(f)

    # Use `FixedLenSequenceFeature` for sequences with variable length.
    def _feature_config(shape, dtype):
      if shape and shape[0] is None:
        return tf.io.FixedLenSequenceFeature(
            shape[1:], dtype, allow_missing=True)
      return tf.io.FixedLenFeature(shape, dtype)
    feature_desc = {
        feat: _feature_config(**desc)
        for feat, desc in split_info["features"].items()}

    ds = tf.data.Dataset.list_files(
        "%s-*-of-*%d" % (
            get_tfrecord_prefix(self.cache_dir, split),
            split_info["num_shards"]),
        shuffle=shuffle)
    ds = ds.interleave(
        tf.data.TFRecordDataset,
        cycle_length=16, block_length=16,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.map(lambda ex: tf.parse_single_example(ex, feature_desc),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if self.get_cached_stats(split)["examples"] <= _MAX_EXAMPLES_TO_MEM_CACHE:
      ds = ds.cache()
    return ds 
Example #22
Source File: lm_dpsgd_tutorial.py    From privacy with Apache License 2.0
def load_data():
  """Load training and validation data."""
  if not FLAGS.data_dir:
    print('FLAGS.data_dir containing train.txt and test.txt was not specified, '
          'using a substitute dataset from the tensorflow_datasets module.')
    train_dataset = tfds.load(name='lm1b/subwords8k',
                              split=tfds.Split.TRAIN,
                              batch_size=NB_TRAIN,
                              shuffle_files=True)
    test_dataset = tfds.load(name='lm1b/subwords8k',
                             split=tfds.Split.TEST,
                             batch_size=10000)
    train_data = next(tfds.as_numpy(train_dataset))
    test_data = next(tfds.as_numpy(test_dataset))
    train_data = train_data['text'].flatten()
    test_data = test_data['text'].flatten()
  else:
    train_fpath = os.path.join(FLAGS.data_dir, 'train.txt')
    test_fpath = os.path.join(FLAGS.data_dir, 'test.txt')
    train_txt = open(train_fpath).read().split()
    test_txt = open(test_fpath).read().split()
    keys = sorted(set(train_txt))
    remap = {k: i for i, k in enumerate(keys)}
    train_data = np.array([remap[x] for x in train_txt], dtype=np.uint8)
    test_data = np.array([remap[x] for x in test_txt], dtype=np.uint8)

  return train_data, test_data 
Example #23
Source File: tensorflow_estimator_integration.py    From optuna with MIT License
def train_input_fn():
    data = tfds.load(name="mnist", as_supervised=True)
    train_ds = data["train"]
    train_ds = train_ds.map(preprocess).shuffle(60000).batch(BATCH_SIZE).take(N_TRAIN_BATCHES)
    return train_ds 
Example #24
Source File: tensorflow_estimator_integration.py    From optuna with MIT License
def eval_input_fn():
    data = tfds.load(name="mnist", as_supervised=True)
    valid_ds = data["test"]
    valid_ds = valid_ds.map(preprocess).shuffle(10000).batch(BATCH_SIZE).take(N_VALID_BATCHES)
    return valid_ds 
Example #25
Source File: tfkeras_integration.py    From optuna with MIT License
def train_dataset():

    ds = tfds.load("mnist", split=tfds.Split.TRAIN, shuffle_files=True)
    ds = ds.map(lambda x: (tf.cast(x["image"], tf.float32) / 255.0, x["label"]))
    ds = ds.repeat().shuffle(1024).batch(BATCHSIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

    return ds 
Example #26
Source File: tfkeras_integration.py    From optuna with MIT License
def eval_dataset():

    ds = tfds.load("mnist", split=tfds.Split.TEST, shuffle_files=False)
    ds = ds.map(lambda x: (tf.cast(x["image"], tf.float32) / 255.0, x["label"]))
    ds = ds.repeat().batch(BATCHSIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

    return ds 
Example #27
Source File: tensorflow_estimator_simple.py    From optuna with MIT License
def train_input_fn():
    data = tfds.load(name="mnist", as_supervised=True)
    train_ds = data["train"]
    train_ds = train_ds.map(preprocess).shuffle(60000).batch(BATCH_SIZE).take(N_TRAIN_BATCHES)
    return train_ds 
Example #28
Source File: utils.py    From hub with Apache License 2.0
def load_data(dataset, split, num_examples=None):
  ds = tfds.load(dataset, split=split, shuffle_files=False)
  if num_examples:
    ds = ds.take(num_examples)
  return ds 
Example #29
Source File: search_test.py    From hub with Apache License 2.0
def test_run_e2e(self, mock_tfds_load):
    if not tf.executing_eagerly():
      self.skipTest("Test requires eager mode.")
    modules = self._create_image_models()
    #tfds.load = fake_image_dataset
    with flagsaver.flagsaver(
        dataset="cifar100",
        module=modules,
    ):
      search.main([]) 
Example #30
Source File: classifier_data_lib.py    From models with Apache License 2.0
def __init__(self,
               tfds_params,
               process_text_fn=tokenization.convert_to_unicode):
    super(TfdsProcessor, self).__init__(process_text_fn)
    self._process_tfds_params_str(tfds_params)
    if self.module_import:
      importlib.import_module(self.module_import)

    self.dataset, info = tfds.load(
        self.dataset_name, data_dir=self.data_dir, with_info=True)
    if self.is_regression:
      self._labels = None
    else:
      self._labels = list(range(info.features[self.label_key].num_classes))