Python tensorflow_datasets.load() Examples
The following are 30 code examples of tensorflow_datasets.load(), drawn from open-source projects; the source file, project, and license are noted above each example.
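Before the examples, here is a minimal sketch of a typical tfds.load() call for orientation; the dataset name "mnist" and the printed field are chosen only for illustration and are not taken from any example below:

```python
import tensorflow_datasets as tfds

# Load the canonical train/test splits together with the dataset metadata.
(ds_train, ds_test), info = tfds.load(
    "mnist",
    split=["train", "test"],
    as_supervised=True,   # yield (image, label) tuples instead of feature dicts
    with_info=True,
)
print(info.features["label"].num_classes)  # e.g. 10 for MNIST
```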
Example #1
Source File: squad.py From exbert with Apache License 2.0 | 6 votes |
```python
def get_train_examples(self, data_dir, filename=None):
    """
    Returns the training examples from the data directory.

    Args:
        data_dir: Directory containing the data files used for training and evaluating.
        filename: None by default, specify this if the training file has a different
            name than the original one which is `train-v1.1.json` and `train-v2.0.json`
            for squad versions 1.1 and 2.0 respectively.
    """
    if data_dir is None:
        data_dir = ""

    if self.train_file is None:
        raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

    with open(
        os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
    ) as reader:
        input_data = json.load(reader)["data"]
    return self._create_examples(input_data, "train")
```
Example #2
Source File: mnist_tutorial.py From cleverhans with MIT License | 6 votes |
```python
def ld_mnist():
    """Load training and test data."""

    def convert_types(image, label):
        image = tf.cast(image, tf.float32)
        image /= 255
        return image, label

    dataset, info = tfds.load('mnist',
                              data_dir='gs://tfds-data/datasets',
                              with_info=True,
                              as_supervised=True)
    mnist_train, mnist_test = dataset['train'], dataset['test']
    mnist_train = mnist_train.map(convert_types).shuffle(10000).batch(128)
    mnist_test = mnist_test.map(convert_types).batch(128)
    return EasyDict(train=mnist_train, test=mnist_test)
```
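A hypothetical way to consume the returned EasyDict; the loop and the train_step helper below are illustrative assumptions, not part of the cleverhans tutorial:

```python
data = ld_mnist()
for epoch in range(2):
    for images, labels in data.train:      # batches of 128 (image, label) pairs
        loss = train_step(images, labels)  # train_step is assumed to be defined elsewhere
```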
Example #3
Source File: datasets.py From compare_gan with Apache License 2.0 | 6 votes |
```python
def _load_dataset(self, split):
    """Loads the underlying dataset split from disk.

    Args:
        split: Name of the split to load.

    Returns:
        Returns a `tf.data.Dataset` object with a tuple of image and label tensor.
    """
    if FLAGS.data_fake_dataset:
        return self._make_fake_dataset(split)
    ds = tfds.load(
        self._tfds_name,
        split=split,
        data_dir=FLAGS.tfds_data_dir,
        as_dataset_kwargs={"shuffle_files": False})
    ds = self._replace_labels(split, ds)
    ds = ds.map(self._parse_fn)
    return ds.prefetch(tf.contrib.data.AUTOTUNE)
```
Example #4
Source File: data.py From IIC with MIT License | 6 votes |
```python
def load(data_set_name, **kwargs):
    """
    :param data_set_name: data set name--call tfds.list_builders() for options
    :return:
        train_ds: TensorFlow Dataset object for the training data
        test_ds: TensorFlow Dataset object for the testing data
        info: data set info object
    """
    # get data and its info
    ds, info = tfds.load(name=data_set_name, split=tfds.Split.ALL, with_info=True)

    # configure the data sets
    if 'train' in info.splits:
        train_ds = configure_data_set(ds=ds, info=info, is_training=True, **kwargs)
    else:
        train_ds = None
    if 'test' in info.splits:
        test_ds = configure_data_set(ds=ds, info=info, is_training=False, **kwargs)
    else:
        test_ds = None

    return train_ds, test_ds, info
```
Example #5
Source File: transfo_experiment.py From axcell with Apache License 2.0 | 6 votes |
```python
def prepare_glue_examples(tokenizer, task_name='mrpc', split_name='train'):
    processor = glue_processors[task_name]()

    def tf_mrpc_to_pytorch(d):
        for ex in d:
            ex = processor.get_example_from_tensor_dict(ex)
            # ex = processor.tfds_map(ex)
            yield ex

    tf_data = tensorflow_datasets.load(f"glue/{task_name}")[split_name]
    examples = tf_mrpc_to_pytorch(tf_data)
    features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task='mrpc')
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
```
Example #6
Source File: squad.py From exbert with Apache License 2.0 | 6 votes |
```python
def get_dev_examples(self, data_dir, filename=None):
    """
    Returns the evaluation example from the data directory.

    Args:
        data_dir: Directory containing the data files used for training and evaluating.
        filename: None by default, specify this if the evaluation file has a different
            name than the original one which is `train-v1.1.json` and `train-v2.0.json`
            for squad versions 1.1 and 2.0 respectively.
    """
    if data_dir is None:
        data_dir = ""

    if self.dev_file is None:
        raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

    with open(
        os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
    ) as reader:
        input_data = json.load(reader)["data"]
    return self._create_examples(input_data, "dev")
```
Example #7
Source File: loaders.py From neural-structured-learning with Apache License 2.0 | 6 votes |
```python
def load_data_planetoid(name, path, splits_path=None, row_normalize=False,
                        data_container_class=PlanetoidDataset):
    """Load Planetoid data."""
    if splits_path is None:
        # Load from file in Planetoid format.
        (adj, features, _, _, _, train_mask, val_mask, test_mask,
         labels) = load_from_planetoid_files(name, path)
    else:
        # Otherwise load from a path where we saved a pickle with random splits.
        logging.info('Loading from splits path: %s', splits_path)
        (adj, features, _, _, _, train_mask, val_mask, test_mask,
         labels) = pickle.load(open(splits_path, 'rb'))

    return data_container_class.build_from_adjacency_matrix(
        name, adj, features,
        train_mask, val_mask, test_mask,
        labels, row_normalize=row_normalize)
```
Example #8
Source File: robust_model.py From interval-bound-propagation with Apache License 2.0 | 6 votes |
```python
def _build(self):
    dataset = tfds.load(name=self._dataset_name, split=self._mode)
    minibatch = dataset.map(parse).repeat()
    if self._shuffle:
        minibatch = minibatch.shuffle(self._batch_size * 100)
    minibatch = minibatch.batch(
        self._batch_size).make_one_shot_iterator().get_next()
    minibatch['sentiment'].set_shape([self._batch_size])
    minibatch['sentence'] = tf.SparseTensor(
        indices=minibatch['sentence'].indices,
        values=minibatch['sentence'].values,
        dense_shape=[self._batch_size,
                     minibatch['sentence'].dense_shape[1]])
    # minibatch.sentence sparse tensor with dense shape
    # [batch_size x seq_length], length: [batch_size]
    return Dataset(
        tokens=minibatch['sentence'],
        num_tokens=self.get_row_lengths(minibatch['sentence']),
        sentiment=minibatch['sentiment'],
    )
```
Example #9
Source File: datasets.py From mobilenetv3-tensorflow with Apache License 2.0 | 6 votes |
```python
def build_dataset(
        shape: Tuple[int, int],
        name: str = "mnist",
        train_batch_size: int = 32,
        valid_batch_size: int = 32):
    dataset = {}
    builder = tfds.builder(name)
    dataset["num_train"] = builder.info.splits['train'].num_examples
    dataset["num_test"] = builder.info.splits['test'].num_examples

    [ds_train, ds_test], info = tfds.load(name=name, split=["train", "test"], with_info=True)
    dataset["num_classes"] = info.features["label"].num_classes
    dataset["channels"] = ds_train.output_shapes["image"][-1].value

    ds_train = ds_train.shuffle(1024).repeat()
    ds_train = ds_train.map(lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["train"] = ds_train.batch(train_batch_size)

    ds_test = ds_test.shuffle(1024).repeat()
    ds_test = ds_test.map(lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["test"] = ds_test.batch(valid_batch_size)

    return dataset
```
Example #10
Source File: image.py From stacked_capsule_autoencoders with Apache License 2.0 | 5 votes |
```python
def _create_mnist(subset, batch_size, **kwargs):
    return tfds.load(
        name='mnist', split=subset, **kwargs).repeat().batch(batch_size)
```
Example #11
Source File: transfo_experiment.py From axcell with Apache License 2.0 | 5 votes |
```python
def glue_dataset_to_df(task_name):
    data = tensorflow_datasets.load(f"glue/{task_name}")
    new_dict = {}
    for name, dataset in data.items():
        new_dict[name] = pd.DataFrame.from_records(
            [strip_tensors(r) for r in dataset],
            columns=dataset.output_shapes.keys(),
            index='idx')
    return new_dict.get('train', None), new_dict.get('validation', None), new_dict.get('test', None)
```
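A hypothetical call, with the task name chosen only for illustration; GLUE MRPC ships train, validation, and test splits in TFDS, so all three frames are populated:

```python
train_df, valid_df, test_df = glue_dataset_to_df("mrpc")
print(train_df.columns)  # e.g. sentence1, sentence2, label
```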
Example #12
Source File: __init__.py From graphics with Apache License 2.0 | 5 votes |
```python
def load(*args, **kwargs):
    return tfds.load('model_net40', *args, **kwargs)
```
Example #13
Source File: mnist_dpsgd_tutorial_common.py From privacy with Apache License 2.0 | 5 votes |
```python
def make_input_fn(split, input_batch_size=256, repetitions=-1, tpu=False):
    """Make input function on given MNIST split."""

    def input_fn(params=None):
        """A simple input function."""
        batch_size = params.get('batch_size', input_batch_size)

        def parser(example):
            image, label = example['image'], example['label']
            image = tf.cast(image, tf.float32)
            image /= 255.0
            label = tf.cast(label, tf.int32)
            return image, label

        dataset = tfds.load(name='mnist', split=split)
        dataset = dataset.map(parser).shuffle(60000).repeat(repetitions).batch(
            batch_size)

        # If this input function is not meant for TPUs, we can stop here.
        # Otherwise, we need to explicitly set its shape. Note that for unknown
        # reasons, returning the latter format causes performance regression
        # on non-TPUs.
        if not tpu:
            return dataset

        # Give inputs statically known shapes; needed for TPUs.
        images, labels = tf.data.make_one_shot_iterator(dataset).get_next()
        # return images, labels
        images.set_shape([batch_size, 28, 28, 1])
        labels.set_shape([batch_size])
        return images, labels

    return input_fn
```
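A hypothetical way to wire the returned input_fn into an Estimator; model_fn, the model directory, and the step counts below are illustrative assumptions, not part of the privacy tutorial:

```python
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="/tmp/mnist_model")
estimator.train(input_fn=make_input_fn("train", input_batch_size=256), steps=1000)
estimator.evaluate(input_fn=make_input_fn("test", repetitions=1), steps=100)
```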
Example #14
Source File: input.py From BigGAN-TPU-TensorFlow with MIT License | 5 votes |
```python
def tfds_input_fn(params, dataset, is_training=True):
    dataset = tfds.load(
        name=dataset,
        split=tfds.Split.TRAIN if is_training else tfds.Split.TEST,
        data_dir=params['data_dir'])

    if params['take_examples'] is not None:
        dataset = dataset.take(params['take_examples'])

    dataset = dataset.shuffle(params['batch_size'] * 20)
    dataset = dataset.repeat()
    dataset = dataset.batch(params['batch_size'], drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    def map_fn(features):
        image = tf.cast(features["image"], tf.float32) / 127.5 - 1
        if "label" in features:
            label = tf.one_hot(features["label"], params['num_labels'], dtype=tf.float32)
        else:
            label = tf.zeros([params['batch_size'], 1])
        return image, label

    dataset = dataset.map(map_fn)
    return dataset
```
Example #15
Source File: t2t.py From BERT with Apache License 2.0 | 5 votes |
```python
def train_and_eval_dataset(dataset_name, data_dir):
    """Return train and evaluation datasets, feature info and supervised keys.

    Args:
        dataset_name: a string, the name of the dataset; if it starts with "v1_"
            then we'll search T2T Problem registry for it, otherwise we assume it
            is a dataset from TFDS and load it from there.
        data_dir: directory where the data is located.

    Returns:
        a 4-tuple consisting of:
         * the train tf.data.Dataset
         * the eval tf.data.Dataset
         * information about features: a python dictionary with feature names
           as keys and an object as value that provides .shape and .num_classes.
         * supervised_keys: information what's the input and what's the target,
           ie., a pair of lists with input and target feature names.
    """
    if dataset_name.startswith("v1_"):
        return _train_and_eval_dataset_v1(dataset_name[3:], data_dir)
    dataset_builder = tfds.builder(dataset_name, data_dir=data_dir)
    info = dataset_builder.info
    splits = dataset_builder.info.splits
    if tfds.Split.TRAIN not in splits:
        raise ValueError("To train we require a train split in the dataset.")
    if tfds.Split.VALIDATION not in splits and "test" not in splits:
        raise ValueError("We require a validation or test split in the dataset.")
    eval_split = tfds.Split.VALIDATION
    if tfds.Split.VALIDATION not in splits:
        eval_split = tfds.Split.TEST
    train, valid = tfds.load(
        name=dataset_name, split=[tfds.Split.TRAIN, eval_split])
    keys = None
    if info.supervised_keys:
        keys = ([info.supervised_keys[0]], [info.supervised_keys[1]])
    return train, valid, info.features, keys
```
Example #16
Source File: dataset.py From mesh with Apache License 2.0 | 5 votes |
```python
def untokenized_tfds_dataset(dataset_name=gin.REQUIRED,
                             text2self=gin.REQUIRED,
                             tfds_data_dir=gin.REQUIRED,
                             dataset_split=gin.REQUIRED,
                             batch_size=None,
                             sequence_length=gin.REQUIRED,
                             vocabulary=gin.REQUIRED,
                             pack=gin.REQUIRED):
    """Reads a tensorflow_datasets dataset.

    Returns a tf.data.Dataset containing single tokenized examples where each
    feature ends in EOS=1.

    Args:
        dataset_name: a string
        text2self: a boolean, if true, run unsupervised LM-style training. if
            false, the dataset must support supervised mode.
        tfds_data_dir: a boolean
        dataset_split: a string
        batch_size: an integer
        sequence_length: an integer
        vocabulary: a vocabulary.Vocabulary
        pack: if True, multiple examples emitted by load_internal() are
            concatenated to form one combined example.

    Returns:
        a tf.data.Dataset of batches
    """
    del batch_size
    dataset = tfds.load(
        dataset_name,
        split=dataset_split,
        as_supervised=not text2self,
        data_dir=tfds_data_dir)
    if dataset_split == "train":
        dataset = dataset.repeat()
        dataset = dataset.shuffle(1000)
    if not text2self:
        dataset = supervised_to_dict(dataset, text2self)
    dataset = encode_all_features(dataset, vocabulary)
    return pack_or_pad(dataset, sequence_length, pack)
```
Example #17
Source File: run.py From polyaxon with Apache License 2.0 | 5 votes |
```python
def make_datasets_unbatched():
    BUFFER_SIZE = 10000

    # Scaling MNIST data from (0, 255] to (0., 1.]
    def scale(image, label):
        image = tf.cast(image, tf.float32)
        image /= 255
        return image, label

    datasets, info = tfds.load(name='mnist', with_info=True, as_supervised=True)
    return datasets['train'].map(scale).cache().shuffle(BUFFER_SIZE)
```
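Because the function deliberately returns an unbatched dataset, a hypothetical caller (for example under a distribution strategy) would batch it itself; the strategy choice and batch size below are illustrative assumptions:

```python
strategy = tf.distribute.MirroredStrategy()
GLOBAL_BATCH_SIZE = 64 * strategy.num_replicas_in_sync  # scale batch with replica count
train_ds = make_datasets_unbatched().batch(GLOBAL_BATCH_SIZE).repeat()
```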
Example #18
Source File: cifar10_input.py From Live-feed-object-device-identification-using-Tensorflow-and-OpenCV with Apache License 2.0 | 5 votes |
```python
def _get_images_labels(batch_size, split, distords=False):
    """Returns Dataset for given split."""
    dataset = tfds.load(name='cifar10', split=split)
    scope = 'data_augmentation' if distords else 'input'
    with tf.name_scope(scope):
        dataset = dataset.map(DataPreprocessor(distords), num_parallel_calls=10)
    # Dataset is small enough to be fully loaded on memory:
    dataset = dataset.prefetch(-1)
    dataset = dataset.repeat().batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    images_labels = iterator.get_next()
    images, labels = images_labels['input'], images_labels['target']
    tf.summary.image('images', images)
    return images, labels
```
Example #19
Source File: utils.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
```python
def load(self, split, shuffle_files):
    """Returns a tf.data.Dataset for the given split."""
    split = self._map_split(split)
    return tfds.load(
        self._name,
        split=split,
        data_dir=self.data_dir,
        shuffle_files=shuffle_files,
        download=True,
        try_gcs=True)
```
Example #20
Source File: utils.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
```python
def get_cached_stats(self, split=tfds.Split.TRAIN):
    """Returns basic statistics for cached dataset."""
    self.assert_cached()
    if split not in self._stats:
        stats_path = get_stats_path(self.cache_dir, split)
        if not tf.io.gfile.exists(stats_path):
            raise ValueError(
                "Stats do not exist for '%s' split: %s" % (self.name, split))
        with tf.io.gfile.GFile(stats_path) as f:
            self._stats[split] = json.load(f)
    return self._stats[split]
```
Example #21
Source File: utils.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
```python
def _get_cached_dataset(self, split=tfds.Split.TRAIN, shuffle=True):
    """Returns a tf.data.Dataset read from cached files."""
    self.assert_cached()
    with tf.io.gfile.GFile(get_info_path(self.cache_dir, split)) as f:
        split_info = json.load(f)

    # Use `FixedLenSequenceFeature` for sequences with variable length.
    def _feature_config(shape, dtype):
        if shape and shape[0] is None:
            return tf.io.FixedLenSequenceFeature(
                shape[1:], dtype, allow_missing=True)
        return tf.io.FixedLenFeature(shape, dtype)

    feature_desc = {
        feat: _feature_config(**desc)
        for feat, desc in split_info["features"].items()}

    ds = tf.data.Dataset.list_files(
        "%s-*-of-*%d" % (
            get_tfrecord_prefix(self.cache_dir, split),
            split_info["num_shards"]),
        shuffle=shuffle)
    ds = ds.interleave(
        tf.data.TFRecordDataset,
        cycle_length=16, block_length=16,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.map(lambda ex: tf.parse_single_example(ex, feature_desc),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if self.get_cached_stats(split)["examples"] <= _MAX_EXAMPLES_TO_MEM_CACHE:
        ds = ds.cache()
    return ds
```
Example #22
Source File: lm_dpsgd_tutorial.py From privacy with Apache License 2.0 | 5 votes |
```python
def load_data():
    """Load training and validation data."""
    if not FLAGS.data_dir:
        print('FLAGS.data_dir containing train.txt and test.txt was not specified, '
              'using a substitute dataset from the tensorflow_datasets module.')
        train_dataset = tfds.load(name='lm1b/subwords8k',
                                  split=tfds.Split.TRAIN,
                                  batch_size=NB_TRAIN,
                                  shuffle_files=True)
        test_dataset = tfds.load(name='lm1b/subwords8k',
                                 split=tfds.Split.TEST,
                                 batch_size=10000)
        train_data = next(tfds.as_numpy(train_dataset))
        test_data = next(tfds.as_numpy(test_dataset))
        train_data = train_data['text'].flatten()
        test_data = test_data['text'].flatten()
    else:
        train_fpath = os.path.join(FLAGS.data_dir, 'train.txt')
        test_fpath = os.path.join(FLAGS.data_dir, 'test.txt')
        train_txt = open(train_fpath).read().split()
        test_txt = open(test_fpath).read().split()
        keys = sorted(set(train_txt))
        remap = {k: i for i, k in enumerate(keys)}
        train_data = np.array([remap[x] for x in train_txt], dtype=np.uint8)
        test_data = np.array([remap[x] for x in test_txt], dtype=np.uint8)

    return train_data, test_data
```
Example #23
Source File: tensorflow_estimator_integration.py From optuna with MIT License | 5 votes |
```python
def train_input_fn():
    data = tfds.load(name="mnist", as_supervised=True)
    train_ds = data["train"]
    train_ds = train_ds.map(preprocess).shuffle(60000).batch(BATCH_SIZE).take(N_TRAIN_BATCHES)
    return train_ds
```
Example #24
Source File: tensorflow_estimator_integration.py From optuna with MIT License | 5 votes |
```python
def eval_input_fn():
    data = tfds.load(name="mnist", as_supervised=True)
    valid_ds = data["test"]
    valid_ds = valid_ds.map(preprocess).shuffle(10000).batch(BATCH_SIZE).take(N_VALID_BATCHES)
    return valid_ds
```
Example #25
Source File: tfkeras_integration.py From optuna with MIT License | 5 votes |
```python
def train_dataset():
    ds = tfds.load("mnist", split=tfds.Split.TRAIN, shuffle_files=True)
    ds = ds.map(lambda x: (tf.cast(x["image"], tf.float32) / 255.0, x["label"]))
    ds = ds.repeat().shuffle(1024).batch(BATCHSIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds
```
Example #26
Source File: tfkeras_integration.py From optuna with MIT License | 5 votes |
```python
def eval_dataset():
    ds = tfds.load("mnist", split=tfds.Split.TEST, shuffle_files=False)
    ds = ds.map(lambda x: (tf.cast(x["image"], tf.float32) / 255.0, x["label"]))
    ds = ds.repeat().batch(BATCHSIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds
```
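Since both train_dataset() and eval_dataset() repeat indefinitely, a hypothetical Keras training call has to bound each epoch explicitly; model and the step counts below are assumptions, not part of the optuna example:

```python
model.fit(
    train_dataset(),
    epochs=10,
    steps_per_epoch=steps_per_epoch,       # required because the dataset repeats forever
    validation_data=eval_dataset(),
    validation_steps=validation_steps,     # likewise bounds the infinite eval dataset
)
```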
Example #27
Source File: tensorflow_estimator_simple.py From optuna with MIT License | 5 votes |
```python
def train_input_fn():
    data = tfds.load(name="mnist", as_supervised=True)
    train_ds = data["train"]
    train_ds = train_ds.map(preprocess).shuffle(60000).batch(BATCH_SIZE).take(N_TRAIN_BATCHES)
    return train_ds
```
Example #28
Source File: utils.py From hub with Apache License 2.0 | 5 votes |
```python
def load_data(dataset, split, num_examples=None):
    ds = tfds.load(dataset, split=split, shuffle_files=False)
    if num_examples:
        ds = ds.take(num_examples)
    return ds
```
Example #29
Source File: search_test.py From hub with Apache License 2.0 | 5 votes |
```python
def test_run_e2e(self, mock_tfds_load):
    if not tf.executing_eagerly():
        self.skipTest("Test requires eager mode.")
    modules = self._create_image_models()
    # tfds.load = fake_image_dataset
    with flagsaver.flagsaver(
        dataset="cifar100",
        module=modules,
    ):
        search.main([])
```
Example #30
Source File: classifier_data_lib.py From models with Apache License 2.0 | 5 votes |
```python
def __init__(self, tfds_params, process_text_fn=tokenization.convert_to_unicode):
    super(TfdsProcessor, self).__init__(process_text_fn)
    self._process_tfds_params_str(tfds_params)
    if self.module_import:
        importlib.import_module(self.module_import)

    self.dataset, info = tfds.load(
        self.dataset_name, data_dir=self.data_dir, with_info=True)
    if self.is_regression:
        self._labels = None
    else:
        self._labels = list(range(info.features[self.label_key].num_classes))
```