Python tensorflow_datasets.builder() Examples
The following are 30 code examples of tensorflow_datasets.builder(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions and classes of the tensorflow_datasets module.
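Before diving into the examples, here is a minimal sketch of the typical tfds.builder() workflow: construct a builder by name, download and prepare the data, inspect its DatasetInfo, and read a split as a tf.data.Dataset. The dataset name "mnist" and the batch size below are placeholder choices for illustration, not taken from any particular example.

import tensorflow_datasets as tfds

# Construct a builder by name (the name is a placeholder here).
builder = tfds.builder("mnist")
builder.download_and_prepare()  # Downloads and writes the dataset to the TFDS data dir.

# DatasetInfo exposes splits, features, and other metadata.
info = builder.info
num_train = info.splits["train"].num_examples
num_classes = info.features["label"].num_classes

# Read a split as a tf.data.Dataset of feature dicts, e.g. {"image": ..., "label": ...}.
ds_train = builder.as_dataset(split="train", shuffle_files=True)
ds_train = ds_train.batch(32)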
Example #1
Source File: datasets.py From mobilenetv3-tensorflow with Apache License 2.0 | 6 votes |
def build_dataset(
        shape: Tuple[int, int],
        name: str = "mnist",
        train_batch_size: int = 32,
        valid_batch_size: int = 32
):
    dataset = {}
    builder = tfds.builder(name)
    dataset["num_train"] = builder.info.splits['train'].num_examples
    dataset["num_test"] = builder.info.splits['test'].num_examples

    [ds_train, ds_test], info = tfds.load(name=name, split=["train", "test"], with_info=True)
    dataset["num_classes"] = info.features["label"].num_classes
    dataset["channels"] = ds_train.output_shapes["image"][-1].value

    ds_train = ds_train.shuffle(1024).repeat()
    ds_train = ds_train.map(
        lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["train"] = ds_train.batch(train_batch_size)

    ds_test = ds_test.shuffle(1024).repeat()
    ds_test = ds_test.map(
        lambda data: _parse_function(data, shape, dataset["num_classes"], dataset["channels"]))
    dataset["test"] = ds_test.batch(valid_batch_size)

    return dataset
Example #2
Source File: post_training_quantization.py From models with Apache License 2.0 | 6 votes |
def _representative_dataset_gen():
    """Gets a python generator of numpy arrays for the given dataset."""
    image_size = FLAGS.image_size
    dataset = tfds.builder(FLAGS.dataset_name, data_dir=FLAGS.dataset_dir)
    dataset.download_and_prepare()
    data = dataset.as_dataset()[FLAGS.dataset_split]
    iterator = tf.data.make_one_shot_iterator(data)
    if FLAGS.use_model_specific_preprocessing:
        preprocess_fn = functools.partial(
            preprocessing_factory.get_preprocessing(name=FLAGS.model_name),
            output_height=image_size,
            output_width=image_size)
    else:
        preprocess_fn = functools.partial(
            _preprocess_for_quantization, image_size=image_size)
    features = iterator.get_next()
    image = features["image"]
    image = preprocess_fn(image)
    image = tf.reshape(image, [1, image_size, image_size, 3])
    for _ in range(FLAGS.num_steps):
        yield [image.eval()]
Example #3
Source File: tfds.py From blueoil with Apache License 2.0 | 6 votes |
def count_max_boxes(cls, builder):
    sess = tf.compat.v1.Session()
    max_boxes = 0

    for split in builder.info.splits:
        tf_dataset = builder.as_dataset(split=split)
        iterator = tf.compat.v1.data.make_one_shot_iterator(tf_dataset)
        next_batch = iterator.get_next()

        while True:
            try:
                data = sess.run(next_batch)
                if max_boxes < data["objects"]["label"].shape[0]:
                    max_boxes = data["objects"]["label"].shape[0]
            except tf.errors.OutOfRangeError:
                break

    return max_boxes
Example #4
Source File: document_datasets.py From datasets with Apache License 2.0 | 6 votes |
def document_single_builder(builder):
    """Doc string for a single builder, with or without configs."""
    print('Document builder %s...' % builder.name)
    get_config_builder = lambda config: tfds.builder(builder.name, config=config)
    config_builders = []
    if builder.builder_configs:
        with futures.ThreadPoolExecutor(max_workers=WORKER_COUNT_CONFIGS) as tpool:
            config_builders = list(
                tpool.map(get_config_builder, builder.BUILDER_CONFIGS))
    tmpl = get_mako_template('dataset')
    visu_doc_util = VisualizationDocUtil()
    out_str = tmpl.render_unicode(
        builder=builder,
        config_builders=config_builders,
        visu_doc_util=visu_doc_util,
        nightly_doc_util=NightlyDocUtil(),
    ).strip()
    schema_org_tmpl = get_mako_template('schema_org')
    schema_org_out_str = schema_org_tmpl.render_unicode(
        builder=builder,
        config_builders=config_builders,
        visu_doc_util=visu_doc_util,
    ).strip()
    out_str = schema_org_out_str + '\n' + out_str
    return out_str
Example #5
Source File: generate_visualization.py From datasets with Apache License 2.0 | 6 votes |
def _get_full_names(datasets: Optional[List[str]] = None) -> List[str]:
    """List all builder names `ds/version` and `ds/config/version` to generate.

    Args:
        datasets: List of datasets from which to get the builder names.

    Returns:
        builder_names: The builder names.
    """
    if datasets is None:
        return tfds.core.registered.list_full_names(
            current_version_only=True,
        )
    else:
        builder_names = list(itertools.chain.from_iterable([
            tfds.core.registered.single_full_names(builder_name)
            for builder_name in datasets
        ]))
        return builder_names
Example #6
Source File: oxford_iiit_pet.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, data_dir=None): dataset_builder = tfds.builder("oxford_iiit_pet:3.*.*", data_dir=data_dir) dataset_builder.download_and_prepare() # Defines dataset specific train/val/trainval/test splits. tfds_splits = {} tfds_splits["train"] = "train[:{}%]".format(TRAIN_SPLIT_PERCENT) tfds_splits["val"] = "train[{}%:]".format(TRAIN_SPLIT_PERCENT) tfds_splits["trainval"] = tfds.Split.TRAIN tfds_splits["test"] = tfds.Split.TEST # Creates a dict with example counts for each split. num_samples_splits = {} trainval_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples num_samples_splits["train"] = (TRAIN_SPLIT_PERCENT * trainval_count) // 100 num_samples_splits["val"] = trainval_count - num_samples_splits["train"] num_samples_splits["trainval"] = trainval_count num_samples_splits["test"] = test_count super(OxfordIIITPetData, self).__init__( dataset_builder=dataset_builder, tfds_splits=tfds_splits, num_samples_splits=num_samples_splits, num_preprocessing_threads=400, shuffle_buffer_size=10000, # Note: Export only image and label tensors with their original types. base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]), num_classes=dataset_builder.info.features["label"].num_classes)
Example #7
Source File: utils.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
def files(self, split):
    """Returns set of instructions for reading TFDS files for the dataset."""
    split = self._map_split(split)

    if "/" not in self.name and self.builder.BUILDER_CONFIGS:
        # If builder has multiple configs, and no particular config was
        # requested, raise an error.
        raise ValueError("Dataset '%s' has multiple configs." % self.name)

    split_info = self.builder.info.splits[split]
    files = split_info.file_instructions

    if not files:
        logging.fatal("No TFRecord files found for dataset: %s", self.name)
    return files
Example #8
Source File: utils.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
def load_shard(self, file_instruction):
    """Returns a dataset for a single shard of the TFDS TFRecord files."""
    ds = self.builder._tfrecords_reader.read_files(  # pylint:disable=protected-access
        [file_instruction],
        read_config=tfds.ReadConfig(),
        shuffle_files=False)
    return ds
Example #9
Source File: patch_camelyon.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, data_dir=None): dataset_builder = tfds.builder("patch_camelyon:2.*.*", data_dir=data_dir) dataset_builder.download_and_prepare() # Defines dataset specific train/val/trainval/test splits. tfds_splits = { "test": "test", "train": "train", "val": "validation", "trainval": "train+validation", } # Creates a dict with example counts. num_samples_splits = { "test": dataset_builder.info.splits["test"].num_examples, "train": dataset_builder.info.splits["train"].num_examples, "val": dataset_builder.info.splits["validation"].num_examples, } num_samples_splits["trainval"] = ( num_samples_splits["train"] + num_samples_splits["val"]) super(PatchCamelyonData, self).__init__( dataset_builder=dataset_builder, tfds_splits=tfds_splits, num_samples_splits=num_samples_splits, num_preprocessing_threads=400, shuffle_buffer_size=10000, # Note: Export only image and label tensors with their original types. base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]), num_classes=dataset_builder.info.features["label"].num_classes)
Example #10
Source File: sun397.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, config="tfds", data_dir=None): if config == "tfds": dataset_builder = tfds.builder("sun397/tfds:4.*.*", data_dir=data_dir) dataset_builder.download_and_prepare() tfds_splits = { "train": "train", "val": "validation", "test": "test", "trainval": "train+validation", } # Creates a dict with example counts. num_samples_splits = { "test": dataset_builder.info.splits["test"].num_examples, "train": dataset_builder.info.splits["train"].num_examples, "val": dataset_builder.info.splits["validation"].num_examples, } num_samples_splits["trainval"] = ( num_samples_splits["train"] + num_samples_splits["val"]) else: raise ValueError("No supported config %r for Sun397Data." % config) super(Sun397Data, self).__init__( dataset_builder=dataset_builder, tfds_splits=tfds_splits, num_samples_splits=num_samples_splits, num_preprocessing_threads=400, shuffle_buffer_size=10000, # Note: Export only image and label tensors with their original types. base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]), num_classes=dataset_builder.info.features["label"].num_classes)
Example #11
Source File: kitti.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, task, data_dir=None):
    if task not in _TASK_DICT:
        raise ValueError("Unknown task: %s" % task)

    dataset_builder = tfds.builder("kitti:3.1.0", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    tfds_splits = {
        "train": "train",
        "val": "validation",
        "trainval": "train+validation",
        "test": "test",
    }

    # Example counts are retrieved from the tensorflow dataset info.
    num_examples = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    train_count = num_examples * TRAIN_SPLIT_PERCENT // 100
    val_count = num_examples * VALIDATION_SPLIT_PERCENT // 100
    test_count = num_examples * TEST_SPLIT_PERCENT // 100

    # Creates a dict with example counts for each split.
    num_samples_splits = {
        "train": train_count,
        "val": val_count,
        "trainval": train_count + val_count,
        "test": test_count,
    }

    task = _TASK_DICT[task]
    base_preprocess_fn = task["preprocess_fn"]

    super(KittiData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        base_preprocess_fn=base_preprocess_fn,
        num_classes=task["num_classes"])
Example #12
Source File: dmlab.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, data_dir=None): dataset_builder = tfds.builder("dmlab:2.0.0", data_dir=data_dir) tfds_splits = { "train": "train", "val": "validation", "trainval": "train+validation", "test": "test" } # Example counts are retrieved from the tensorflow dataset info. train_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples val_count = dataset_builder.info.splits[tfds.Split.VALIDATION].num_examples test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples # Creates a dict with example counts for each split. num_samples_splits = { "train": train_count, "val": val_count, "trainval": train_count + val_count, "test": test_count } super(DmlabData, self).__init__( dataset_builder=dataset_builder, tfds_splits=tfds_splits, num_samples_splits=num_samples_splits, num_preprocessing_threads=400, shuffle_buffer_size=10000, base_preprocess_fn=base.make_get_and_cast_tensors_fn({ "image": ("image", None), "label": ("label", None), }), num_classes=dataset_builder.info.features["label"].num_classes, image_key="image")
Example #13
Source File: oxford_flowers102.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, data_dir=None): dataset_builder = tfds.builder("oxford_flowers102:2.*.*", data_dir=data_dir) dataset_builder.download_and_prepare() tfds_splits = { "train": "train", "val": "validation", "trainval": "train+validation", "test": "test", } # Example counts are retrieved from the tensorflow dataset info. train_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples val_count = dataset_builder.info.splits[tfds.Split.VALIDATION].num_examples test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples # Creates a dict with example counts for each split. num_samples_splits = { "train": train_count, "val": val_count, "trainval": train_count + val_count, "test": test_count } super(OxfordFlowers102Data, self).__init__( dataset_builder=dataset_builder, tfds_splits=tfds_splits, num_samples_splits=num_samples_splits, num_preprocessing_threads=400, shuffle_buffer_size=10000, # Note: Rename tensors but keep their original types. base_preprocess_fn=base.make_get_and_cast_tensors_fn({ "image": ("image", None), "label": ("label", None), }), num_classes=dataset_builder.info.features["label"] .num_classes)
Example #14
Source File: dtd.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, data_dir=None): dataset_builder = tfds.builder("dtd:3.*.*", data_dir=data_dir) dataset_builder.download_and_prepare() # Defines dataset specific train/val/trainval/test splits. tfds_splits = {} tfds_splits["train"] = "train" tfds_splits["val"] = "validation" tfds_splits["trainval"] = "train+validation" tfds_splits["test"] = "test" # Creates a dict with example counts for each split. num_samples_splits = {} train_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples val_count = dataset_builder.info.splits[tfds.Split.VALIDATION].num_examples test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples num_samples_splits["train"] = train_count num_samples_splits["val"] = val_count num_samples_splits["trainval"] = train_count + val_count num_samples_splits["test"] = test_count super(DTDData, self).__init__( dataset_builder=dataset_builder, tfds_splits=tfds_splits, num_samples_splits=num_samples_splits, num_preprocessing_threads=400, shuffle_buffer_size=10000, # Note: Export only image and label tensors with their original types. base_preprocess_fn=base.make_get_tensors_fn(["image", "label"]), num_classes=dataset_builder.info.features["label"].num_classes)
Example #15
Source File: utils.py From text-to-text-transfer-transformer with Apache License 2.0 | 5 votes |
def info(self):
    return self.builder.info
Example #16
Source File: imagenet.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, features=("image", "label")):
    dataset_builder = tfds.builder("imagenet2012:5.*.*")

    # Defines dataset specific train/val/trainval/test splits.
    # Note, that the test split for "imagenet2012" dataset is not available.
    # Thus, we use the val split as test. Moreover, we split the train split
    # into two parts: new train split and new val split.
    tfds_splits = {}
    tfds_splits["train"] = "train[:{}%]".format(TRAIN_SPLIT_PERCENT)
    tfds_splits["val"] = "train[{}%:]".format(TRAIN_SPLIT_PERCENT)
    tfds_splits["trainval"] = "train"
    tfds_splits["test"] = "validation"

    # Creates a dict with example counts.
    num_samples_splits = {}
    trainval_count = dataset_builder.info.splits["train"].num_examples
    test_count = dataset_builder.info.splits["validation"].num_examples
    num_samples_splits["train"] = (TRAIN_SPLIT_PERCENT * trainval_count) // 100
    num_samples_splits["val"] = trainval_count - num_samples_splits["train"]
    num_samples_splits["trainval"] = trainval_count
    num_samples_splits["test"] = test_count

    super(ImageNetData, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=10000,
        # Note: Export only image and label tensors with their original types.
        base_preprocess_fn=base.make_get_tensors_fn(features),
        filter_fn=self._get_filter_fn(),
        num_classes=dataset_builder.info.features["label"].num_classes)
Example #17
Source File: svhn.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, data_dir=None): dataset_builder = tfds.builder("svhn_cropped:3.*.*", data_dir=data_dir) dataset_builder.download_and_prepare() # Defines dataset specific train/val/trainval/test splits. # The validation set is split out of the original training set, and the # remaining examples are used as the "train" split. The "trainval" split # corresponds to the original training set. tfds_splits = { "train": "train[:{}%]".format(TRAIN_SPLIT_PERCENT), "val": "train[{}%:]".format(TRAIN_SPLIT_PERCENT), "trainval": "train", "test": "test", } # Example counts are retrieved from the tensorflow dataset info. trainval_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples # Creates a dict with example counts for each split. num_samples_splits = { # Calculates the train/val split example count based on percent. "train": TRAIN_SPLIT_PERCENT * trainval_count // 100, "val": trainval_count - TRAIN_SPLIT_PERCENT * trainval_count // 100, "trainval": trainval_count, "test": test_count } super(SvhnData, self).__init__( dataset_builder=dataset_builder, tfds_splits=tfds_splits, num_samples_splits=num_samples_splits, num_preprocessing_threads=400, shuffle_buffer_size=10000, # Note: Rename tensors but keep their original types. base_preprocess_fn=base.make_get_and_cast_tensors_fn({ "image": ("image", None), "label": ("label", None), }), num_classes=dataset_builder.info.features["label"] .num_classes)
Example #18
Source File: caltech.py From task_adaptation with Apache License 2.0 | 5 votes |
def __init__(self, num_classes=10, data_dir=None):
    dataset_builder = tfds.builder("caltech101:3.*.*", data_dir=data_dir)
    dataset_builder.download_and_prepare()

    # Defines dataset specific train/val/trainval/test splits.
    tfds_splits = {}
    tfds_splits["train"] = "train[:{}%]".format(_TRAIN_SPLIT_PERCENT)
    tfds_splits["val"] = "train[{}%:]".format(_TRAIN_SPLIT_PERCENT)
    tfds_splits["trainval"] = "train"
    tfds_splits["test"] = "test"

    # Creates a dict with example counts for each split.
    trainval_count = dataset_builder.info.splits[tfds.Split.TRAIN].num_examples
    train_count = (_TRAIN_SPLIT_PERCENT * trainval_count) // 100
    test_count = dataset_builder.info.splits[tfds.Split.TEST].num_examples
    num_samples_splits = dict(
        train=train_count,
        val=trainval_count - train_count,
        trainval=trainval_count,
        test=test_count)

    super(Caltech101, self).__init__(
        dataset_builder=dataset_builder,
        tfds_splits=tfds_splits,
        num_samples_splits=num_samples_splits,
        num_preprocessing_threads=400,
        shuffle_buffer_size=3000,
        base_preprocess_fn=base.make_get_tensors_fn(("image", "label")),
        num_classes=dataset_builder.info.features["label"].num_classes)
Example #19
Source File: dataset_factory.py From models with Apache License 2.0 | 5 votes |
def __init__(self, config: DatasetConfig, **overrides: Any):
    """Initialize the builder from the config."""
    self.config = config.replace(**overrides)
    self.builder_info = None

    if self.config.augmenter is not None:
        logging.info('Using augmentation: %s', self.config.augmenter.name)
        self.augmenter = self.config.augmenter.build()
    else:
        self.augmenter = None
Example #20
Source File: dataset_factory.py From models with Apache License 2.0 | 5 votes |
def info(self) -> tfds.core.DatasetInfo:
    """The TFDS dataset info, if available."""
    if self.builder_info is None:
        self.builder_info = tfds.builder(self.config.name).info
    return self.builder_info
Example #21
Source File: dataset_factory.py From models with Apache License 2.0 | 5 votes |
def _build(self,
           input_context: tf.distribute.InputContext = None
           ) -> tf.data.Dataset:
    """Construct a dataset end-to-end and return it.

    Args:
        input_context: An optional context provided by `tf.distribute` for
            cross-replica training.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    builders = {
        'tfds': self.load_tfds,
        'records': self.load_records,
        'synthetic': self.load_synthetic,
    }

    builder = builders.get(self.config.builder, None)
    if builder is None:
        raise ValueError('Unknown builder type {}'.format(self.config.builder))

    self.input_context = input_context
    dataset = builder()
    dataset = self.pipeline(dataset)

    return dataset
Example #22
Source File: dataset_factory.py From models with Apache License 2.0 | 5 votes |
def load_tfds(self) -> tf.data.Dataset:
    """Return a dataset loading files from TFDS."""
    logging.info('Using TFDS to load data.')

    builder = tfds.builder(self.config.name, data_dir=self.config.data_dir)

    if self.config.download:
        builder.download_and_prepare()

    decoders = {}
    if self.config.skip_decoding:
        decoders['image'] = tfds.decode.SkipDecoding()

    read_config = tfds.ReadConfig(
        interleave_cycle_length=10,
        interleave_block_length=1,
        input_context=self.input_context)

    dataset = builder.as_dataset(
        split=self.config.split,
        as_supervised=True,
        shuffle_files=True,
        decoders=decoders,
        read_config=read_config)

    return dataset
Example #23
Source File: vocabulary.py From mesh with Apache License 2.0 | 5 votes |
def get_tfds_vocabulary(dataset_name=gin.REQUIRED):
    info = tfds.builder(dataset_name).info
    # this assumes that either there are no inputs, or that the
    # inputs and targets have the same vocabulary.
    return TFDSVocabulary(info.features[info.supervised_keys[1]].encoder)
Example #24
Source File: t2t.py From BERT with Apache License 2.0 | 5 votes |
def train_and_eval_dataset(dataset_name, data_dir):
    """Return train and evaluation datasets, feature info and supervised keys.

    Args:
        dataset_name: a string, the name of the dataset; if it starts with "v1_"
            then we'll search T2T Problem registry for it, otherwise we assume it
            is a dataset from TFDS and load it from there.
        data_dir: directory where the data is located.

    Returns:
        a 4-tuple consisting of:
            * the train tf.data.Dataset
            * the eval tf.data.Dataset
            * information about features: a python dictionary with feature names
              as keys and an object as value that provides .shape and .num_classes.
            * supervised_keys: information about what is the input and what is the
              target, i.e., a pair of lists with input and target feature names.
    """
    if dataset_name.startswith("v1_"):
        return _train_and_eval_dataset_v1(dataset_name[3:], data_dir)
    dataset_builder = tfds.builder(dataset_name, data_dir=data_dir)
    info = dataset_builder.info
    splits = dataset_builder.info.splits
    if tfds.Split.TRAIN not in splits:
        raise ValueError("To train we require a train split in the dataset.")
    if tfds.Split.VALIDATION not in splits and "test" not in splits:
        raise ValueError("We require a validation or test split in the dataset.")
    eval_split = tfds.Split.VALIDATION
    if tfds.Split.VALIDATION not in splits:
        eval_split = tfds.Split.TEST
    train, valid = tfds.load(
        name=dataset_name, split=[tfds.Split.TRAIN, eval_split])
    keys = None
    if info.supervised_keys:
        keys = ([info.supervised_keys[0]], [info.supervised_keys[1]])
    return train, valid, info.features, keys
Example #25
Source File: tf_inputs.py From trax with Apache License 2.0 | 5 votes |
def download_and_prepare(dataset_name, data_dir):
    """Downloads and prepares T2T or TFDS dataset.

    Args:
        dataset_name: tfds dataset or t2t problem name prefixed by 't2t_'.
        data_dir: location of existing dataset or None.

    Returns:
        data_dir: path string of downloaded data.
    """
    if not data_dir:
        data_dir = os.path.expanduser('~/tensorflow_datasets/')
        dl_dir = os.path.join(data_dir, 'download')
        logging.info(
            'No dataset directory provided. '
            'Downloading and generating dataset for %s inside data directory %s '
            'For large datasets it is better to prepare datasets manually!',
            dataset_name, data_dir)
        if dataset_name.startswith('t2t_'):
            # Download and run dataset generator for T2T problem.
            data_dir = os.path.join(data_dir, dataset_name)
            tf.io.gfile.makedirs(data_dir)
            tf.io.gfile.makedirs(dl_dir)
            t2t_problems().problem(
                dataset_name[len('t2t_'):]).generate_data(data_dir, dl_dir)
        else:
            # Download and prepare TFDS dataset.
            tfds_builder = tfds.builder(dataset_name)
            tfds_builder.download_and_prepare(download_dir=dl_dir)
    else:
        data_dir = os.path.expanduser(data_dir)
    return data_dir
Example #26
Source File: tfds.py From blueoil with Apache License 2.0 | 5 votes |
def __init__(
        self,
        name,
        data_dir,
        image_size,
        download=False,
        num_max_boxes=None,
        tfds_pre_processor=None,
        tfds_augmentor=None,
        *args,
        **kwargs
):
    super().__init__(
        *args,
        **kwargs,
    )

    if name in tfds.list_builders():
        self._builder = tfds.builder(name, data_dir=data_dir)
        if download:
            self._builder.download_and_prepare()
    else:
        if not tf.io.gfile.exists(os.path.join(data_dir, name)):
            raise ValueError("Dataset directory does not exist: {}\n"
                             "Please run `python blueoil/cmd/build_tfds.py -c <config file>` before training."
                             .format(os.path.join(data_dir, name)))
        self._builder = self.builder_class(name, data_dir=data_dir)

    self.info = self._builder.info
    self._init_available_splits()
    self._validate_feature_structure()

    self.tf_dataset = self._builder.as_dataset(split=self.available_splits[self.subset])
    self.tfds_pre_processor = tfds_pre_processor
    self.tfds_augmentor = tfds_augmentor
    self._image_size = image_size
    self._num_max_boxes = num_max_boxes
    self._format_dataset()
Example #27
Source File: download_and_prepare.py From datasets with Apache License 2.0 | 5 votes |
def download_and_prepare(builder):
    """Generate data for a given dataset."""
    logging.info("download_and_prepare for dataset %s...", builder.info.full_name)

    dl_config = download_config()

    if isinstance(builder, tfds.core.BeamBasedBuilder):
        beam = tfds.core.lazy_imports.apache_beam
        # TODO(b/129149715): Restore compute stats. Currently skipped because not
        # beam supported.
        dl_config.compute_stats = tfds.download.ComputeStatsMode.SKIP
        dl_config.beam_options = beam.options.pipeline_options.PipelineOptions(
            flags=["--%s" % opt for opt in FLAGS.beam_pipeline_options])

    if FLAGS.add_name_to_manual_dir:
        dl_config.manual_dir = os.path.join(dl_config.manual_dir, builder.name)

    builder.download_and_prepare(
        download_dir=FLAGS.download_dir,
        download_config=dl_config,
    )
    termcolor.cprint(str(builder.info.as_proto), attrs=["bold"])

    if FLAGS.debug:
        dataset = builder.as_dataset(split=tfds.Split.TRAIN)
        pdb.set_trace()
        del dataset
Example #28
Source File: document_datasets.py From datasets with Apache License 2.0 | 5 votes |
def _get_name(self, builder):
    return builder.info.full_name.replace('/', '-') + '.png'
Example #29
Source File: document_datasets.py From datasets with Apache License 2.0 | 5 votes |
def get_url(self, builder):
    return self.BASE_URL + self._get_name(builder)
Example #30
Source File: document_datasets.py From datasets with Apache License 2.0 | 5 votes |
def has_visualization(self, builder):
    filepath = os.path.join(self.BASE_PATH, self._get_name(builder))
    return tf.io.gfile.exists(filepath)