Python tensorflow_datasets.builder() Examples

The following are 30 code examples of tensorflow_datasets.builder(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the tensorflow_datasets module, or try the search function.
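As a quick orientation before the excerpts: tfds.builder() returns a DatasetBuilder whose info, download_and_prepare(), and as_dataset() methods cover the typical workflow. A minimal sketch of that pattern is shown below (the dataset name "mnist" is only an illustrative choice):

import tensorflow_datasets as tfds

# Minimal, typical tfds.builder() workflow (dataset name is illustrative).
builder = tfds.builder("mnist")
builder.download_and_prepare()  # fetch the data and write it to disk
print(builder.info.splits["train"].num_examples)  # metadata via builder.info
ds_train = builder.as_dataset(split="train", shuffle_files=True)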
Example #1
Source File: datasets.py    From mobilenetv3-tensorflow with Apache License 2.0
def build_dataset(
    shape: Tuple[int, int],
    name: str="mnist",
    train_batch_size: int=32,
    valid_batch_size: int=32
    ):

    dataset = {}
    builder = tfds.builder(name)
    dataset["num_train"] = builder.info.splits['train'].num_examples
    dataset["num_test"] = builder.info.splits['test'].num_examples

    [ds_train, ds_test], info = tfds.load(name=name, split=["train", "test"], with_info=True)
    dataset["num_classes"] = info.features["label"].num_classes
    dataset["channels"] = ds_train.output_shapes["image"][-1].value

    ds_train = ds_train.shuffle(1024).repeat()
    ds_train = ds_train.map(lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["train"] = ds_train.batch(train_batch_size)

    ds_test = ds_test.shuffle(1024).repeat()
    ds_test = ds_test.map(lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["test"] = ds_test.batch(valid_batch_size)

    return dataset 
Example #2
Source File: post_training_quantization.py    From models with Apache License 2.0
def _representative_dataset_gen():
  """Gets a python generator of numpy arrays for the given dataset."""
  image_size = FLAGS.image_size
  dataset = tfds.builder(FLAGS.dataset_name, data_dir=FLAGS.dataset_dir)
  dataset.download_and_prepare()
  data = dataset.as_dataset()[FLAGS.dataset_split]
  iterator = tf.data.make_one_shot_iterator(data)
  if FLAGS.use_model_specific_preprocessing:
    preprocess_fn = functools.partial(
        preprocessing_factory.get_preprocessing(name=FLAGS.model_name),
        output_height=image_size,
        output_width=image_size)
  else:
    preprocess_fn = functools.partial(
        _preprocess_for_quantization, image_size=image_size)
  features = iterator.get_next()
  image = features["image"]
  image = preprocess_fn(image)
  image = tf.reshape(image, [1, image_size, image_size, 3])
  for _ in range(FLAGS.num_steps):
    yield [image.eval()] 
Example #3
Source File: tfds.py    From blueoil with Apache License 2.0
def count_max_boxes(cls, builder):
        sess = tf.compat.v1.Session()
        max_boxes = 0

        for split in builder.info.splits:
            tf_dataset = builder.as_dataset(split=split)
            iterator = tf.compat.v1.data.make_one_shot_iterator(tf_dataset)
            next_batch = iterator.get_next()

            while True:
                try:
                    data = sess.run(next_batch)
                    if max_boxes < data["objects"]["label"].shape[0]:
                        max_boxes = data["objects"]["label"].shape[0]
                except tf.errors.OutOfRangeError:
                    break

        return max_boxes 
Example #4
Source File: document_datasets.py    From datasets with Apache License 2.0
def document_single_builder(builder):
  """Doc string for a single builder, with or without configs."""
  print('Document builder %s...' % builder.name)
  get_config_builder = lambda config: tfds.builder(builder.name, config=config)
  config_builders = []
  if builder.builder_configs:
    with futures.ThreadPoolExecutor(max_workers=WORKER_COUNT_CONFIGS) as tpool:
      config_builders = list(
          tpool.map(get_config_builder, builder.BUILDER_CONFIGS))
  tmpl = get_mako_template('dataset')
  visu_doc_util = VisualizationDocUtil()
  out_str = tmpl.render_unicode(
      builder=builder,
      config_builders=config_builders,
      visu_doc_util=visu_doc_util,
      nightly_doc_util=NightlyDocUtil(),
  ).strip()
  schema_org_tmpl = get_mako_template('schema_org')
  schema_org_out_str = schema_org_tmpl.render_unicode(
      builder=builder,
      config_builders=config_builders,
      visu_doc_util=visu_doc_util,
  ).strip()
  out_str = schema_org_out_str + '\n' + out_str
  return out_str 
Example #5
Source File: generate_visualization.py    From datasets with Apache License 2.0
def _get_full_names(datasets: Optional[List[str]] = None) -> List[str]:
  """List all builder names `ds/version` and `ds/config/version` to generate.

  Args:
    datasets: List of datasets from which get the builder names.

  Returns:
    builder_names: The builder names.
  """
  if datasets is None:
    return tfds.core.registered.list_full_names(
        current_version_only=True,
    )
  else:
    builder_names = list(itertools.chain.from_iterable([
        tfds.core.registered.single_full_names(builder_name)
        for builder_name in datasets
    ]))
    return builder_names 
Example #6
Source File: oxford_iiit_pet.py    From task_adaptation with Apache License 2.0
def __init__(self, data_dir=None):

    dataset_builder = tfds.builder("oxford_iiit_pet:3.*.*", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    # Defines dataset specific train/val/trainval/test splits.
    tfds_splits = {}
    tfds_splits["train"] = "train[:{}%]".format(TRAIN_SPLIT_PERCENT)
    tfds_splits["val"] = "train[{}%:]".format(TRAIN_SPLIT_PERCENT)
    tfds_splits["trainval"] = tfds.Split.TRAIN
    tfds_splits["test"] = tfds.Split.TEST

    # Creates a dict with example counts for each split.
    num_samples_splits = {}
    trainval_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples
    num_samples_splits["train"] = (TRAIN_SPLIT_PERCENT * trainval_count) // 100
    num_samples_splits["val"] = trainval_count - num_samples_splits["train"]
    num_samples_splits["trainval"] = trainval_count
    num_samples_splits["test"] = test_count

    super(OxfordIIITPetData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Export only image and label tensors with their original types.
        base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]),
        num_classes=dataset_builder.info.features["label"].num_classes) 
Example #7
Source File: utils.py    From text-to-text-transfer-transformer with Apache License 2.0
def files(self, split):
    """Returns set of instructions for reading TFDS files for the dataset."""
    split = self._map_split(split)

    if "/" not in self.name and self.builder.BUILDER_CONFIGS:
      # If builder has multiple configs, and no particular config was
      # requested, raise an error.
      raise ValueError("Dataset '%s' has multiple configs." % self.name)

    split_info = self.builder.info.splits[split]
    files = split_info.file_instructions

    if not files:
      logging.fatal("No TFRecord files found for dataset: %s", self.name)
    return files 
Example #8
Source File: utils.py    From text-to-text-transfer-transformer with Apache License 2.0
def load_shard(self, file_instruction):
    """Returns a dataset for a single shard of the TFDS TFRecord files."""
    ds = self.builder._tfrecords_reader.read_files(  # pylint:disable=protected-access
        [file_instruction],
        read_config=tfds.ReadConfig(),
        shuffle_files=False)
    return ds 
Example #9
Source File: patch_camelyon.py    From task_adaptation with Apache License 2.0
def __init__(self, data_dir=None):

    dataset_builder = tfds.builder("patch_camelyon:2.*.*", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    # Defines dataset specific train/val/trainval/test splits.
    tfds_splits = {
        "test": "test",
        "train": "train",
        "val": "validation",
        "trainval": "train+validation",
    }
    # Creates a dict with example counts.
    num_samples_splits = {
        "test": dataset_builder.info.splits["test"].num_examples,
        "train": dataset_builder.info.splits["train"].num_examples,
        "val": dataset_builder.info.splits["validation"].num_examples,
    }
    num_samples_splits["trainval"] = (
        num_samples_splits["train"] + num_samples_splits["val"])
    super(PatchCamelyonData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Export only image and label tensors with their original types.
        base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]),
        num_classes=dataset_builder.info.features["label"].num_classes) 
Example #10
Source File: sun397.py    From task_adaptation with Apache License 2.0
def __init__(self, config="tfds", data_dir=None):

    if config == "tfds":
      dataset_builder = tfds.builder("sun397/tfds:4.*.*", data_dir=data_dir)
      dataset_builder.download_and_prepare()

      tfds_splits = {
          "train": "train",
          "val": "validation",
          "test": "test",
          "trainval": "train+validation",
      }
      # Creates a dict with example counts.
      num_samples_splits = {
          "test": dataset_builder.info.splits["test"].num_examples,
          "train": dataset_builder.info.splits["train"].num_examples,
          "val": dataset_builder.info.splits["validation"].num_examples,
      }
      num_samples_splits["trainval"] = (
          num_samples_splits["train"] + num_samples_splits["val"])
    else:

      raise ValueError("No supported config %r for Sun397Data." % config)

    super(Sun397Data, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Export only image and label tensors with their original types.
        base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]),
        num_classes=dataset_builder.info.features["label"].num_classes) 
Example #11
Source File: kitti.py    From task_adaptation with Apache License 2.0
def __init__(self, task, data_dir=None):

    if task not in _TASK_DICT:
      raise ValueError("Unknown task: %s" % task)

    dataset_builder = tfds.builder("kitti:3.1.0", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    tfds_splits = {
        "train": "train",
        "val": "validation",
        "trainval": "train+validation",
        "test": "test",
    }

    # Example counts are retrieved from the tensorflow dataset info.
    num_examples = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    train_count = num_examples * TRAIN_SPLIT_PERCENT // 100
    val_count = num_examples * VALIDATION_SPLIT_PERCENT // 100
    test_count = num_examples * TEST_SPLIT_PERCENT // 100
    # Creates a dict with example counts for each split.
    num_samples_splits = {
        "train": train_count,
        "val": val_count,
        "trainval": train_count + val_count,
        "test": test_count
    }

    task = _TASK_DICT[task]
    base_preprocess_fn = task["preprocess_fn"]
    super(KittiData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        base_preprocess_fn=base_preprocess_fn,
        num_classes=task["num_classes"]) 
Example #12
Source File: dmlab.py    From task_adaptation with Apache License 2.0
def __init__(self, data_dir=None):
    dataset_builder = tfds.builder("dmlab:2.0.0", data_dir=data_dir)

    tfds_splits = {
        "train": "train",
        "val": "validation",
        "trainval": "train+validation",
        "test": "test"
    }

    # Example counts are retrieved from the tensorflow dataset info.
    train_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    val_count = dataset_builder.info.splits[tfds.Split.VALIDATION].num_examples
    test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples

    # Creates a dict with example counts for each split.
    num_samples_splits = {
        "train": train_count,
        "val": val_count,
        "trainval": train_count + val_count,
        "test": test_count
    }

    super(DmlabData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        base_preprocess_fn=base.make_get_and_cast_tensors_fn({
            "image": ("image", None),
            "label": ("label", None),
        }),
        num_classes=dataset_builder.info.features["label"].num_classes,
        image_key="image") 
Example #13
Source File: oxford_flowers102.py    From task_adaptation with Apache License 2.0
def __init__(self, data_dir=None):
    dataset_builder = tfds.builder("oxford_flowers102:2.*.*", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    tfds_splits = {
        "train": "train",
        "val": "validation",
        "trainval": "train+validation",
        "test": "test",
    }

    # Example counts are retrieved from the tensorflow dataset info.
    train_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    val_count = dataset_builder.info.splits[tfds.Split.VALIDATION].num_examples
    test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples

    # Creates a dict with example counts for each split.
    num_samples_splits = {
        "train": train_count,
        "val": val_count,
        "trainval": train_count + val_count,
        "test": test_count
    }

    super(OxfordFlowers102Data, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Rename tensors but keep their original types.
        base_preprocess_fn=base.make_get_and_cast_tensors_fn({
            "image": ("image", None),
            "label": ("label", None),
        }),
        num_classes=dataset_builder.info.features["label"]
        .num_classes) 
Example #14
Source File: dtd.py    From task_adaptation with Apache License 2.0
def __init__(self, data_dir=None):

    dataset_builder = tfds.builder("dtd:3.*.*", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    # Defines dataset specific train/val/trainval/test splits.
    tfds_splits = {}
    tfds_splits["train"] = "train"
    tfds_splits["val"] = "validation"
    tfds_splits["trainval"] = "train+validation"
    tfds_splits["test"] = "test"

    # Creates a dict with example counts for each split.
    num_samples_splits = {}
    train_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    val_count = dataset_builder.info.splits[tfds.Split.VALIDATION].num_examples
    test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples
    num_samples_splits["train"] = train_count
    num_samples_splits["val"] = val_count
    num_samples_splits["trainval"] = train_count + val_count
    num_samples_splits["test"] = test_count

    super(DTDData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Export only image and label tensors with their original types.
        base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]),
        num_classes=dataset_builder.info.features["label"].num_classes) 
Example #15
Source File: utils.py    From text-to-text-transfer-transformer with Apache License 2.0
def info(self):
    return self.builder.info 
Example #16
Source File: imagenet.py    From task_adaptation with Apache License 2.0
def __init__(self, features=("image", "label")):

    dataset_builder = tfds.builder("imagenet2012:5.*.*")

    # Defines dataset specific train/val/trainval/test splits.
    # Note, that the test split for "imagenet2012" dataset is not available.
    # Thus, we use the val split as test. Moreover, we split the train split
    # into two parts: new train split and new val split.
    tfds_splits = {}
    tfds_splits["train"] = "train[:{}%]".format(TRAIN_SPLIT_PERCENT)
    tfds_splits["val"] = "train[{}%:]".format(TRAIN_SPLIT_PERCENT)
    tfds_splits["trainval"] = "train"
    tfds_splits["test"] = "validation"

    # Creates a dict with example counts.
    num_samples_splits = {}
    trainval_count = dataset_builder.info.splits["train"].num_examples
    test_count = dataset_builder.info.splits["validation"].num_examples
    num_samples_splits["train"] = (TRAIN_SPLIT_PERCENT * trainval_count) // 100
    num_samples_splits["val"] = trainval_count - num_samples_splits["train"]
    num_samples_splits["trainval"] = trainval_count
    num_samples_splits["test"] = test_count

    super(ImageNetData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Export only image and label tensors with their original types.
        base_preprocess_fn=base.make_get_tensors_fn(features),
        filter_fn=self._get_filter_fn(),
        num_classes=dataset_builder.info.features["label"].num_classes) 
Example #17
Source File: svhn.py    From task_adaptation with Apache License 2.0
def __init__(self, data_dir=None):
    dataset_builder = tfds.builder("svhn_cropped:3.*.*", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    # Defines dataset specific train/val/trainval/test splits.
    # The validation set is split out of the original training set, and the
    # remaining examples are used as the "train" split. The "trainval" split
    # corresponds to the original training set.
    tfds_splits = {
        "train": "train[:{}%]".format(TRAIN_SPLIT_PERCENT),
        "val": "train[{}%:]".format(TRAIN_SPLIT_PERCENT),
        "trainval": "train",
        "test": "test",
    }

    # Example counts are retrieved from the tensorflow dataset info.
    trainval_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples

    # Creates a dict with example counts for each split.
    num_samples_splits = {
        # Calculates the train/val split example count based on percent.
        "train": TRAIN_SPLIT_PERCENT * trainval_count // 100,
        "val": trainval_count - TRAIN_SPLIT_PERCENT * trainval_count // 100,
        "trainval": trainval_count,
        "test": test_count
    }

    super(SvhnData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Rename tensors but keep their original types.
        base_preprocess_fn=base.make_get_and_cast_tensors_fn({
            "image": ("image", None),
            "label": ("label", None),
        }),
        num_classes=dataset_builder.info.features["label"]
        .num_classes) 
Example #18
Source File: caltech.py    From task_adaptation with Apache License 2.0
def __init__(self, num_classes=10, data_dir=None):
    dataset_builder = tfds.builder("caltech101:3.*.*", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    # Defines dataset specific train/val/trainval/test splits.
    tfds_splits = {}
    tfds_splits["train"] = "train[:{}%]".format(_TRAIN_SPLIT_PERCENT)
    tfds_splits["val"] = "train[{}%:]".format(_TRAIN_SPLIT_PERCENT)
    tfds_splits["trainval"] = "train"
    tfds_splits["test"] = "test"

    # Creates a dict with example counts for each split.
    trainval_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    train_count = (_TRAIN_SPLIT_PERCENT * trainval_count) // 100
    test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples
    num_samples_splits = dict(
        train=train_count,
        val=trainval_count - train_count,
        trainval=trainval_count,
        test=test_count)

    super(Caltech101, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=3000,
        base_preprocess_fn=base.make_get_tensors_fn(("image", "label")),
        num_classes=dataset_builder.info.features["label"].num_classes) 
Example #19
Source File: dataset_factory.py    From models with Apache License 2.0
def __init__(self, config: DatasetConfig, **overrides: Any):
    """Initialize the builder from the config."""
    self.config = config.replace(**overrides)
    self.builder_info = None

    if self.config.augmenter is not None:
      logging.info('Using augmentation: %s', self.config.augmenter.name)
      self.augmenter = self.config.augmenter.build()
    else:
      self.augmenter = None 
Example #20
Source File: dataset_factory.py    From models with Apache License 2.0
def info(self) -> tfds.core.DatasetInfo:
    """The TFDS dataset info, if available."""
    if self.builder_info is None:
      self.builder_info = tfds.builder(self.config.name).info
    return self.builder_info 
Example #21
Source File: dataset_factory.py    From models with Apache License 2.0
def _build(self, input_context: tf.distribute.InputContext = None
             ) -> tf.data.Dataset:
    """Construct a dataset end-to-end and return it.

    Args:
      input_context: An optional context provided by `tf.distribute` for
        cross-replica training.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    builders = {
        'tfds': self.load_tfds,
        'records': self.load_records,
        'synthetic': self.load_synthetic,
    }

    builder = builders.get(self.config.builder, None)

    if builder is None:
      raise ValueError('Unknown builder type {}'.format(self.config.builder))

    self.input_context = input_context
    dataset = builder()
    dataset = self.pipeline(dataset)

    return dataset 
Example #22
Source File: dataset_factory.py    From models with Apache License 2.0
def load_tfds(self) -> tf.data.Dataset:
    """Return a dataset loading files from TFDS."""

    logging.info('Using TFDS to load data.')

    builder = tfds.builder(self.config.name,
                           data_dir=self.config.data_dir)

    if self.config.download:
      builder.download_and_prepare()

    decoders = {}

    if self.config.skip_decoding:
      decoders['image'] = tfds.decode.SkipDecoding()

    read_config = tfds.ReadConfig(
        interleave_cycle_length=10,
        interleave_block_length=1,
        input_context=self.input_context)

    dataset = builder.as_dataset(
        split=self.config.split,
        as_supervised=True,
        shuffle_files=True,
        decoders=decoders,
        read_config=read_config)

    return dataset 
Example #23
Source File: vocabulary.py    From mesh with Apache License 2.0
def get_tfds_vocabulary(dataset_name=gin.REQUIRED):
  info = tfds.builder(dataset_name).info
  # this assumes that either there are no inputs, or that the
  # inputs and targets have the same vocabulary.
  return TFDSVocabulary(info.features[info.supervised_keys[1]].encoder) 
Example #24
Source File: t2t.py    From BERT with Apache License 2.0
def train_and_eval_dataset(dataset_name, data_dir):
  """Return train and evaluation datasets, feature info and supervised keys.

  Args:
    dataset_name: a string, the name of the dataset; if it starts with "v1_"
      then we'll search T2T Problem registry for it, otherwise we assume it
      is a dataset from TFDS and load it from there.
    data_dir: directory where the data is located.

  Returns:
    a 4-tuple consisting of:
     * the train tf.data.Dataset
     * the eval tf.data.Dataset
     * information about features: a python dictionary with feature names
         as keys and an object as value that provides .shape and .num_classes.
     * supervised_keys: information what's the input and what's the target,
         ie., a pair of lists with input and target feature names.
  """
  if dataset_name.startswith("v1_"):
    return _train_and_eval_dataset_v1(dataset_name[3:], data_dir)
  dataset_builder = tfds.builder(dataset_name, data_dir=data_dir)
  info = dataset_builder.info
  splits = dataset_builder.info.splits
  if tfds.Split.TRAIN not in splits:
    raise ValueError("To train we require a train split in the dataset.")
  if tfds.Split.VALIDATION not in splits and "test" not in splits:
    raise ValueError("We require a validation or test split in the dataset.")
  eval_split = tfds.Split.VALIDATION
  if tfds.Split.VALIDATION not in splits:
    eval_split = tfds.Split.TEST
  train, valid = tfds.load(
      name=dataset_name, split=[tfds.Split.TRAIN, eval_split])
  keys = None
  if info.supervised_keys:
    keys = ([info.supervised_keys[0]], [info.supervised_keys[1]])
  return train, valid, info.features, keys 
Example #25
Source File: tf_inputs.py    From trax with Apache License 2.0
def download_and_prepare(dataset_name, data_dir):
  """Downloads and prepares T2T or TFDS dataset.

  Args:
    dataset_name: tfds dataset or t2t problem name prefixed by 't2t_'.
    data_dir: location of existing dataset or None.

  Returns:
    data_dir: path string of downloaded data.
  """
  if not data_dir:
    data_dir = os.path.expanduser('~/tensorflow_datasets/')
    dl_dir = os.path.join(data_dir, 'download')
    logging.info(
        'No dataset directory provided. '
        'Downloading and generating dataset for %s inside data directory %s '
        'For large datasets it is better to prepare datasets manually!',
        dataset_name, data_dir)
    if dataset_name.startswith('t2t_'):
      # Download and run dataset generator for T2T problem.
      data_dir = os.path.join(data_dir, dataset_name)
      tf.io.gfile.makedirs(data_dir)
      tf.io.gfile.makedirs(dl_dir)
      t2t_problems().problem(
          dataset_name[len('t2t_'):]).generate_data(data_dir, dl_dir)
    else:
      # Download and prepare TFDS dataset.
      tfds_builder = tfds.builder(dataset_name)
      tfds_builder.download_and_prepare(download_dir=dl_dir)
  else:
    data_dir = os.path.expanduser(data_dir)
  return data_dir 
Example #26
Source File: tfds.py    From blueoil with Apache License 2.0
def __init__(
            self,
            name,
            data_dir,
            image_size,
            download=False,
            num_max_boxes=None,
            tfds_pre_processor=None,
            tfds_augmentor=None,
            *args,
            **kwargs
    ):
        super().__init__(
            *args,
            **kwargs,
        )

        if name in tfds.list_builders():
            self._builder = tfds.builder(name, data_dir=data_dir)
            if download:
                self._builder.download_and_prepare()
        else:
            if not tf.io.gfile.exists(os.path.join(data_dir, name)):
                raise ValueError("Dataset directory does not exist: {}\n"
                                 "Please run `python blueoil/cmd/build_tfds.py -c <config file>` before training."
                                 .format(os.path.join(data_dir, name)))

            self._builder = self.builder_class(name, data_dir=data_dir)

        self.info = self._builder.info
        self._init_available_splits()
        self._validate_feature_structure()

        self.tf_dataset = self._builder.as_dataset(split=self.available_splits[self.subset])
        self.tfds_pre_processor = tfds_pre_processor
        self.tfds_augmentor = tfds_augmentor
        self._image_size = image_size
        self._num_max_boxes = num_max_boxes
        self._format_dataset() 
Example #27
Source File: download_and_prepare.py    From datasets with Apache License 2.0
def download_and_prepare(builder):
  """Generate data for a given dataset."""
  logging.info("download_and_prepare for dataset %s...", builder.info.full_name)

  dl_config = download_config()

  if isinstance(builder, tfds.core.BeamBasedBuilder):
    beam = tfds.core.lazy_imports.apache_beam
    # TODO(b/129149715): Restore compute stats. Currently skipped because it is
    # not supported for Beam-based builders.
    dl_config.compute_stats = tfds.download.ComputeStatsMode.SKIP
    dl_config.beam_options = beam.options.pipeline_options.PipelineOptions(
        flags=["--%s" % opt for opt in FLAGS.beam_pipeline_options])

  if FLAGS.add_name_to_manual_dir:
    dl_config.manual_dir = os.path.join(dl_config.manual_dir, builder.name)

  builder.download_and_prepare(
      download_dir=FLAGS.download_dir,
      download_config=dl_config,
  )
  termcolor.cprint(str(builder.info.as_proto), attrs=["bold"])

  if FLAGS.debug:
    dataset = builder.as_dataset(split=tfds.Split.TRAIN)
    pdb.set_trace()
    del dataset 
Example #28
Source File: document_datasets.py    From datasets with Apache License 2.0
def _get_name(self, builder):
    return builder.info.full_name.replace('/', '-') + '.png' 
Example #29
Source File: document_datasets.py    From datasets with Apache License 2.0
def get_url(self, builder):
    return self.BASE_URL + self._get_name(builder) 
Example #30
Source File: document_datasets.py    From datasets with Apache License 2.0
def has_visualization(self, builder):
    filepath = os.path.join(self.BASE_PATH, self._get_name(builder))
    return tf.io.gfile.exists(filepath)