Python chainer.dataset Examples

The following are 30 code examples of the chainer.dataset module, drawn from open-source projects. Each example notes the project and source file it was taken from. You may also want to check out all other available functions and classes of the chainer module.
Example #1
Source File: convert.py    From chainer with MIT License
def _call_converter(converter, batch, device):
    # Calls the converter.
    # Converter can be either new-style (accepts chainer.backend.Device) or
    # old-style (accepts int as device).
    assert device is None or isinstance(device, backend.Device)

    if isinstance(converter, Converter):
        # New-style converter
        return converter(batch, device)

    # Old-style converter
    if device is None:
        return converter(batch, None)
    if device.xp is numpy:
        return converter(batch, -1)
    if device.xp is cuda.cupy:
        return converter(batch, device.device.id)
    raise RuntimeError(
        'Converter does not support ChainerX. '
        'Use chainer.dataset.converter decorator.') 
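As a quick illustration of the old-style convention this helper dispatches on, here is a minimal sketch using the built-in chainer.dataset.concat_examples converter; the toy batch is made up for illustration.

import numpy as np
from chainer.dataset import concat_examples

# A toy batch of (input, label) pairs, as produced by a TupleDataset.
batch = [(np.zeros(3, dtype=np.float32), np.int32(i)) for i in range(4)]

# concat_examples is an old-style converter: device=-1 keeps the arrays on CPU,
# a non-negative integer would send them to the corresponding GPU.
xs, ts = concat_examples(batch, device=-1)
print(xs.shape, ts.shape)  # (4, 3) (4,)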
Example #2
Source File: utils_pretrain.py    From models with MIT License
def __next__(self):
        # This iterator returns a list representing a mini-batch. Each item
        # indicates a different position in the original sequence. Each item is
        # represented by a pair of two word IDs. The first word is at the
        # "current" position, while the second word at the next position.
        # At each iteration, the iteration count is incremented, which pushes
        # forward the "current" position.
        length = len(self.dataset)
        if not self.repeat and self.iteration * self.batch_size >= length:
            # If not self.repeat, this iterator stops at the end of the first
            # epoch (i.e., when all words are visited once).
            raise StopIteration
        cur_words = self.get_words()
        self._previous_epoch_detail = self.epoch_detail
        self.iteration += 1
        next_words = self.get_words()

        epoch = self.iteration * self.batch_size // length
        self.is_new_epoch = self.epoch < epoch
        if self.is_new_epoch:
            self.epoch = epoch

        return list(zip(cur_words, next_words)) 
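A hypothetical way to drive this iterator, assuming the surrounding class is a parallel-sequential iterator like the ParallelSequentialIterator in Chainer's PTB example and that the dataset is a flat array of word IDs:

import numpy as np

# Hypothetical usage; ParallelSequentialIterator stands in for whatever class
# this __next__ belongs to, and the word IDs are made up.
train_data = np.arange(1000, dtype=np.int32)
train_iter = ParallelSequentialIterator(train_data, batch_size=20)

batch = next(train_iter)        # list of (current_word, next_word) pairs
print(len(batch))               # 20
print(train_iter.epoch_detail)  # fraction of the data seen so far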
Example #3
Source File: train.py    From portrait_matting with GNU General Public License v3.0
def setup_dataset(mode, crop_dir, mask_dir=None, mean_mask_dir=None,
                  mean_grid_dir=None, trimap_dir=None, alpha_dir=None,
                  alpha_weight_dir=None):
    # Create dataset
    dataset = datasets.create(mode, crop_dir, mask_dir, mean_mask_dir,
                              mean_grid_dir, trimap_dir, alpha_dir,
                              alpha_weight_dir)

    # Create transform function
    transform = transforms.create(mode)
    transform_random = transforms.transform_random

    # Split into train and test
    train_raw, test_raw = datasets.split_dataset(dataset)

    # Increase data variety
    train_raw = chainer.datasets.TransformDataset(train_raw, transform_random)

    # Transform for network inputs
    train = chainer.datasets.TransformDataset(train_raw, transform)
    test = chainer.datasets.TransformDataset(test_raw, transform)

    return train, test 
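The core pattern here is stacking chainer.datasets.TransformDataset wrappers so that the augmentation and the input transform are applied lazily on access. A minimal sketch with placeholder transforms (not the project's own):

import chainer

base = list(range(10))

# First wrapper plays the role of transform_random, the second of transform.
augmented = chainer.datasets.TransformDataset(base, lambda x: x * 2)
train = chainer.datasets.TransformDataset(augmented, lambda x: float(x))

print(train[3])  # 6.0 -- both transforms run only when the example is accessed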
Example #4
Source File: chain_utils.py    From contextual_augmentation with MIT License
def convert_sequence_chain(batch, device):
    def to_device_batch(batch):
        if device is None:
            return batch
        elif device < 0:
            return [chainer.dataset.to_device(device, x) for x in batch]
        else:
            xp = cuda.cupy.get_array_module(*batch)
            concat = xp.concatenate(batch, axis=0)
            sections = np.cumsum([len(x) for x in batch[:-1]], dtype='i')
            concat_dev = chainer.dataset.to_device(device, concat)
            batch_dev = cuda.cupy.split(concat_dev, sections)
            return batch_dev

    return [to_device_batch([x[i] for x in batch])
            for i in range(len(batch[0]))] 
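A small sketch of what the converter produces on CPU; the toy batch of variable-length sequences is made up, and device=None simply leaves the arrays where they are.

import numpy as np

# Two examples, each a pair of variable-length sequences.
batch = [
    (np.arange(3, dtype=np.int32), np.arange(5, dtype=np.int32)),
    (np.arange(4, dtype=np.int32), np.arange(2, dtype=np.int32)),
]

cols = convert_sequence_chain(batch, device=None)
print(len(cols), [len(c) for c in cols])  # 2 columns, each a list of 2 arrays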
Example #5
Source File: single_machine_custom_loop.py    From sagemaker-chainer-container with Apache License 2.0
def _preprocess_mnist(raw, withlabel, ndim, scale, image_dtype, label_dtype, rgb_format):
    images = raw['x']
    if ndim == 2:
        images = images.reshape(-1, 28, 28)
    elif ndim == 3:
        images = images.reshape(-1, 1, 28, 28)
        if rgb_format:
            images = np.broadcast_to(images, (len(images), 3) + images.shape[2:])
    elif ndim != 1:
        raise ValueError('invalid ndim for MNIST dataset')
    images = images.astype(image_dtype)
    images *= scale / 255.

    if withlabel:
        labels = raw['y'].astype(label_dtype)
        return tuple_dataset.TupleDataset(images, labels)
    else:
        return images 
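A rough sketch of calling this preprocessor with a fabricated raw payload; in the actual script the raw dict comes from the downloaded MNIST .npz file.

import numpy as np

raw = {
    'x': np.random.randint(0, 256, (10, 784)).astype(np.uint8),
    'y': np.random.randint(0, 10, (10,)).astype(np.uint8),
}

train = _preprocess_mnist(raw, withlabel=True, ndim=3, scale=1.0,
                          image_dtype=np.float32, label_dtype=np.int32,
                          rgb_format=False)
print(train[0][0].shape, train[0][1].dtype)  # (1, 28, 28) int32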
Example #6
Source File: imagenet1k1.py    From imgclsmob with MIT License
def get_val_data_iterator(data_dir,
                          batch_size,
                          num_workers,
                          num_classes):

    val_dir_path = os.path.join(data_dir, 'val')
    val_dataset = DirectoryParsingLabelDataset(val_dir_path)
    val_dataset_len = len(val_dataset)
    assert(len(directory_parsing_label_names(val_dir_path)) == num_classes)

    val_iterator = iterators.MultiprocessIterator(
        dataset=val_dataset,
        batch_size=batch_size,
        repeat=False,
        shuffle=False,
        n_processes=num_workers,
        shared_mem=300000000)

    return val_iterator, val_dataset_len 
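A hypothetical call; the directory layout (a val/ folder with one subdirectory per class, as expected by chainercv's DirectoryParsingLabelDataset) and the paths are assumptions.

val_iter, val_len = get_val_data_iterator(
    data_dir='/data/imagenet', batch_size=64, num_workers=4, num_classes=1000)

for batch in val_iter:          # a single pass, because repeat=False
    images, labels = zip(*batch)
    # ... run the model on this mini-batch ...
val_iter.finalize()             # shut the worker processes down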
Example #7
Source File: utils_pretrain.py    From models with MIT License
def __init__(self, dataset, batch_size, repeat=True):
        self.dataset = dataset
        self.batch_size = batch_size  # batch size
        # Number of completed sweeps over the dataset. In this case, it is
        # incremented if every word is visited at least once after the last
        # increment.
        self.epoch = 0
        # True if the epoch is incremented at the last iteration.
        self.is_new_epoch = False
        self.repeat = repeat
        length = len(dataset)
        # Offsets maintain the position of each sequence in the mini-batch.
        self.offsets = [i * length // batch_size for i in range(batch_size)]
        # NOTE: this is not a count of parameter updates. It is just a count of
        # calls of ``__next__``.
        self.iteration = 0
        # use -1 instead of None internally
        self._previous_epoch_detail = -1. 
Example #8
Source File: cifar1.py    From imgclsmob with MIT License
def get_val_data_iterator(dataset_name,
                          batch_size,
                          num_workers):

    if dataset_name == "CIFAR10":
        _, test_ds = cifar.get_cifar10()
    elif dataset_name == "CIFAR100":
        _, test_ds = cifar.get_cifar100()
    elif dataset_name == "SVHN":
        _, test_ds = svhn.get_svhn()
    else:
        raise Exception('Unrecognized dataset: {}'.format(dataset_name))

    val_dataset = test_ds
    val_dataset_len = len(val_dataset)

    val_iterator = iterators.MultiprocessIterator(
        dataset=val_dataset,
        batch_size=batch_size,
        repeat=False,
        shuffle=False,
        n_processes=num_workers,
        shared_mem=300000000)

    return val_iterator, val_dataset_len 
Example #9
Source File: train_ptb_custom_loop.py    From chainer with MIT License
def __next__(self):
        # This iterator returns a list representing a mini-batch. Each item
        # indicates a different position in the original sequence. Each item is
        # represented by a pair of two word IDs. The first word is at the
        # "current" position, while the second word at the next position.
        # At each iteration, the iteration count is incremented, which pushes
        # forward the "current" position.
        length = len(self.dataset)
        if not self.repeat and self.iteration * self.batch_size >= length:
            # If not self.repeat, this iterator stops at the end of the first
            # epoch (i.e., when all words are visited once).
            raise StopIteration
        cur_words = self.get_words()
        self._previous_epoch_detail = self.epoch_detail
        self.iteration += 1
        next_words = self.get_words()

        epoch = self.iteration * self.batch_size // length
        self.is_new_epoch = self.epoch < epoch
        if self.is_new_epoch:
            self.epoch = epoch

        return list(zip(cur_words, next_words)) 
Example #10
Source File: seq2seq.py    From convolutional_seq2seq with BSD 3-Clause "New" or "Revised" License
def __call__(self, trainer):
        print('## Calculate BLEU')
        with chainer.no_backprop_mode():
            with chainer.using_config('train', False):
                references = []
                hypotheses = []
                for i in range(0, len(self.test_data), self.batch):
                    sources, targets = zip(*self.test_data[i:i + self.batch])
                    references.extend([[t.tolist()] for t in targets])

                    sources = [
                        chainer.dataset.to_device(self.device, x) for x in sources]
                    ys = [y.tolist()
                          for y in self.model.translate(sources, self.max_length)]
                    hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1) * 100
        print('BLEU:', bleu)
        reporter.report({self.key: bleu}) 
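A hypothetical way to hook this extension into a trainer, assuming the surrounding class is called CalculateBleu and takes the model, held-out data, a report key, the batch size and the device, as in Chainer's seq2seq examples:

trainer.extend(
    CalculateBleu(model, test_data, key='validation/main/bleu',
                  batch=64, device=args.gpu),
    trigger=(200, 'iteration'))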
Example #11
Source File: cifar.py    From chainer with MIT License
def _preprocess_cifar(images, labels, withlabel, ndim, scale, dtype):
    if ndim == 1:
        images = images.reshape(-1, 3072)
    elif ndim == 3:
        images = images.reshape(-1, 3, 32, 32)
    else:
        raise ValueError('invalid ndim for CIFAR dataset')
    dtype = chainer.get_dtype(dtype)
    images = images.astype(dtype)
    images *= scale / 255.

    if withlabel:
        labels = labels.astype(numpy.int32)
        return tuple_dataset.TupleDataset(images, labels)
    else:
        return images 
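A quick sketch with fabricated arrays; the real images and labels come from the downloaded CIFAR batches.

import numpy as np

images = np.random.randint(0, 256, (8, 3072)).astype(np.uint8)
labels = np.random.randint(0, 10, (8,))

train = _preprocess_cifar(images, labels, withlabel=True, ndim=3,
                          scale=1.0, dtype=np.float32)
print(train[0][0].shape, train[0][0].max() <= 1.0)  # (3, 32, 32) True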
Example #12
Source File: tabular_dataset.py    From chainer with MIT License
def convert(self, data):
        """Convert fetched data.

        This method takes data fetched by :meth:`fetch` and
        pre-process them before passing them to models.
        The default behaviour is converting each column into an ndarray.
        This behaviour can be overridden by :meth:`with_converter`.
        If the dataset is constructed by :meth:`concat` or :meth:`join`,
        the converter of the first dataset is used.

        Args:
            data (tuple or dict): Data from :meth:`fetch`.

        Returns:
            A tuple or dict.
            Each value is an ndarray.
        """
        if isinstance(data, tuple):
            return tuple(_as_array(d) for d in data)
        elif isinstance(data, dict):
            return {k: _as_array(v) for k, v in data.items()}
        else:
            return _as_array(data) 
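A minimal sketch of fetch/convert on a tabular dataset, assuming Chainer v7's chainer.dataset.tabular.from_data helper:

import numpy as np
import chainer

dataset = chainer.dataset.tabular.from_data((
    ('a', np.arange(4)),
    ('b', [0.1 * i for i in range(4)])))

data = dataset.fetch()            # column-major: one list/array per key
arrays = dataset.convert(data)    # every column becomes an ndarray
print([a.shape for a in arrays])  # [(4,), (4,)]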
Example #13
Source File: tabular_dataset.py    From chainer with MIT License
def fetch(self):
        """Fetch data.

        This method fetches all data of the dataset/view.
        Note that this method returns a column-major data
        (i.e. :obj:`([a[0], ..., a[3]], ..., [c[0], ... c[3]])`,
        :obj:`{'a': [a[0], ..., a[3]], ..., 'c': [c[0], ..., c[3]]}`, or
        :obj:`[a[0], ..., a[3]]`).

        Returns:
            If :attr:`mode` is :class:`tuple`,
            this method returns a tuple of lists/arrays.
            If :attr:`mode` is :class:`dict`,
            this method returns a dict of lists/arrays.
        """
        examples = self.get_examples(None, None)
        if self.mode is tuple:
            return examples
        elif self.mode is dict:
            return dict(six.moves.zip(self.keys, examples))
        elif self.mode is None:
            return examples[0] 
Example #14
Source File: train_ptb_custom_loop.py    From chainer with MIT License
def __init__(self, dataset, batch_size, repeat=True):
        self.dataset = dataset
        self.batch_size = batch_size  # batch size
        # Number of completed sweeps over the dataset. In this case, it is
        # incremented if every word is visited at least once after the last
        # increment.
        self.epoch = 0
        # True if the epoch is incremented at the last iteration.
        self.is_new_epoch = False
        self.repeat = repeat
        length = len(dataset)
        # Offsets maintain the position of each sequence in the mini-batch.
        self.offsets = [i * length // batch_size for i in range(batch_size)]
        # NOTE: this is not a count of parameter updates. It is just a count of
        # calls of ``__next__``.
        self.iteration = 0
        # use -1 instead of None internally
        self._previous_epoch_detail = -1. 
Example #15
Source File: train_ptb_custom_loop.py    From chainer with MIT License
def serialize(self, serializer):
        # It is important to serialize the state to be recovered on resume.
        self.iteration = serializer('iteration', self.iteration)
        self.epoch = serializer('epoch', self.epoch)
        try:
            self._previous_epoch_detail = serializer(
                'previous_epoch_detail', self._previous_epoch_detail)
        except KeyError:
            # guess previous_epoch_detail for older version
            self._previous_epoch_detail = self.epoch + \
                (self.current_position - self.batch_size) / len(self.dataset)
            if self.epoch_detail > 0:
                self._previous_epoch_detail = max(
                    self._previous_epoch_detail, 0.)
            else:
                self._previous_epoch_detail = -1. 
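Because the iterator exposes serialize(), it can be saved and restored with Chainer's standard serializers; a rough sketch (the file name is made up):

import chainer

# Snapshot and restore the iterator state (iteration, epoch, previous_epoch_detail).
chainer.serializers.save_npz('iterator_state.npz', train_iter)
chainer.serializers.load_npz('iterator_state.npz', train_iter)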
Example #16
Source File: train.py    From portrait_matting with GNU General Public License v3.0
def parse_arguments(argv):
    parser = argparse.ArgumentParser(description='Training Script')
    parser.add_argument('--config', '-c', default='config.json',
                        help='Configure json filepath')
    parser.add_argument('--batchsize', '-b', type=int, default=1,
                        help='Number of images in each mini-batch')
    parser.add_argument('--max_iteration', '-e', type=int, default=30000,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpus', '-g', type=int, default=[-1], nargs='*',
                        help='GPU IDs (negative value indicates CPU)')
    parser.add_argument('--lr', type=float, default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('--momentum', type=float, default=0.99,
                        help='Momentum for SGD')
    parser.add_argument('--weight_decay', type=float, default=0.0005,
                        help='Weight decay')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--mode', choices=['seg', 'seg+', 'seg_tri', 'mat'],
                        help='Training mode', required=True)
    parser.add_argument('--pretrained_fcn8s', default=None,
                        help='Pretrained model path of FCN8s')
    parser.add_argument('--pretrained_n_input_ch', default=3, type=int,
                        help='Input channel number of Pretrained model')
    parser.add_argument('--pretrained_n_output_ch', default=21, type=int,
                        help='Output channel number of Pretrained model')
    parser.add_argument('--mat_scale', default=4, type=int,
                        help='Matting scale for speed up')
    args = parser.parse_args(argv)
    return args 
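An example invocation with an explicit argv list (paths and values are placeholders); --mode is the only required flag:

args = parse_arguments(['--config', 'config.json', '--mode', 'seg', '--gpus', '0'])
print(args.mode, args.gpus, args.batchsize)  # seg [0] 1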
Example #17
Source File: image_dataset.py    From kiss with GNU General Public License v3.0
def __init__(self, image_size, npz_file=None, memory_manager=None, base_name=None, root='.', dtype=None, transform_probability=0, use_imgaug=True, keep_aspect_ratio=False, image_mode='RGB', full_normalize=False, resize_after_load=True):
        _check_pillow_availability()

        if not isinstance(image_size, Size):
            image_size = Size(*image_size)

        self.shared_buffers = []
        self.root = root
        self.dtype = chainer.get_dtype(dtype)
        self.image_size = image_size
        self.transform_probability = transform_probability
        self.use_imgaug = use_imgaug
        self.keep_aspect_ratio = keep_aspect_ratio
        self.image_mode = image_mode
        self.full_normalize = full_normalize  # normalize each image to be in range of [0, 1] even if brightest pixel is != 255
        self.resize_after_load = resize_after_load  # resize the image to self.image_size after loading

        if npz_file is not None:
            assert isinstance(npz_file, six.string_types), "paths must be a file name!"
            assert os.path.splitext(npz_file)[-1] == ".npz", "You have to supply gt information as npz file!"

            with numpy.load(npz_file, allow_pickle=True) as gt_data:
                self.gt_data = self.copy_npz_data_to_ram(gt_data)
            self.memory_manager = None
            self.base_name = None
            self.length = len(self.gt_data['file_name'])
        else:
            assert memory_manager is not None, "If you do not specify an npz file, you must specify a memory manager!"
            assert base_name is not None, "If you want to use shared memory, you'll need to supply a base name for each dataset"
            self.gt_data = None
            self.memory_manager = memory_manager
            self.base_name = base_name
            self.length = self.memory_manager.get_shape(self.base_name, 'file_name').pop(0)

        self.augmentations = self.init_augmentations() 
Example #18
Source File: test_delegate_dataset.py    From chainer with MIT License
def test_delegate_dataset(self):
        dataset = tabular.DelegateDataset(
            dummy_dataset.DummyDataset(mode=self.mode))

        self.assertIsInstance(dataset, chainer.dataset.TabularDataset)
        self.assertEqual(len(dataset), len(dataset.dataset))
        self.assertEqual(dataset.keys, dataset.dataset.keys)
        self.assertEqual(dataset.mode, dataset.dataset.mode)
        self.assertEqual(
            dataset.get_example(3), dataset.dataset.get_example(3)) 
Example #19
Source File: train_utils.py    From see with GNU General Public License v3.0
def evaluate(self):
        iterator = self._iterators['main']
        target = self._targets['main']
        eval_func = self.eval_func or target

        if self.eval_hook:
            self.eval_hook(self)
        it = copy.copy(iterator)
        summary = reporter_module.DictSummary()

        for _ in range(min(len(iterator.dataset) // iterator.batch_size, self.num_iterations)):
            batch = next(it, None)
            if batch is None:
                break

            observation = {}
            with reporter_module.report_scope(observation), chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
                in_arrays = self.converter(batch, self.device)
                if isinstance(in_arrays, tuple):
                    eval_func(*in_arrays)
                elif isinstance(in_arrays, dict):
                    eval_func(**in_arrays)
                else:
                    eval_func(in_arrays)

            summary.add(observation)

        return summary.compute_mean() 
Example #20
Source File: utils_pretrain.py    From models with MIT License
def count_words(dataset, alpha=0.4):
    counts = collections.defaultdict(int)
    for w in dataset:
        counts[w] += 1
    counts = [counts[i] for i in range(len(counts))]
    counts = np.array(counts, 'f')
    counts /= counts.sum()
    counts = counts ** alpha
    counts = counts.tolist()
    return counts 
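This assumes the dataset is a sequence of integer word IDs starting at 0; a toy run:

import numpy as np

dataset = np.array([0, 1, 1, 2, 2, 2, 0, 3], dtype=np.int32)
weights = count_words(dataset, alpha=0.4)
print(len(weights))  # 4 -- one smoothed unigram weight per vocabulary entry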
Example #21
Source File: kuzushiji_mnist.py    From chainer with MIT License
def _retrieve_kuzushiji_mnist_training():
    base_url = 'http://codh.rois.ac.jp/'
    urls = [base_url + 'kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
            base_url + 'kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz']
    return _retrieve_kuzushiji_mnist('train.npz', urls) 
Example #22
Source File: tabular_dataset.py    From chainer with MIT License
def transform_batch(self, keys, transform_batch):
        """Apply a transform to examples.

        Args:
            keys (tuple of strs): The keys of transformed examples.
            transform_batch (callable): A callable that takes examples
                and returns transformed examples. :attr:`mode` of
                transformed dataset is determined by the transformed
                examples.

        Returns:
            A transformed dataset.
        """
        return chainer.dataset.tabular._transform._TransformBatch(
            self, keys, transform_batch) 
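A sketch of a batch-wise transform, assuming Chainer v7's tabular API, where a tuple-mode dataset passes whole columns positionally to the callable:

import numpy as np
import chainer

dataset = chainer.dataset.tabular.from_data((
    ('x', np.arange(6).astype(np.float32)),
    ('y', np.arange(6))))

def scale_batch(x, y):
    # Receives (and must return) whole columns; returning a dict makes the
    # transformed dataset dict-mode.
    return {'x': x / 5.0, 'y': y}

scaled = dataset.transform_batch(('x', 'y'), scale_batch)
print(scaled.fetch()['x'])  # the x column rescaled into [0, 1]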
Example #23
Source File: tabular_dataset.py    From chainer with MIT License
def transform(self, keys, transform):
        """Apply a transform to each example.

        Args:
            keys (tuple of strs): The keys of transformed examples.
            transform (callable): A callable that takes an example
                and returns transformed example. :attr:`mode` of
                transformed dataset is determined by the transformed
                examples.

        Returns:
            A transformed dataset.
        """
        return chainer.dataset.tabular._transform._Transform(
            self, keys, transform) 
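The per-example counterpart; again a sketch against Chainer v7's tabular API, with the transform receiving one row at a time:

import numpy as np
import chainer

dataset = chainer.dataset.tabular.from_data((
    ('x', np.arange(6).astype(np.float32)),
    ('y', np.arange(6))))

# Returning a tuple keeps the transformed dataset in tuple mode.
squared = dataset.transform(('x', 'y'), lambda x, y: (x ** 2, y))
print(squared.get_example(2))  # (4.0, 2)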
Example #24
Source File: tabular_dataset.py    From chainer with MIT License
def concat(self, *datasets):
        """Stack datasets along rows.

        Args:
            datasets (iterable of :class:`TabularDataset`):
                Datasets to be concatenated.
                All datasets must have the same :attr:`keys`.

        Returns:
            A concatenated dataset.
        """
        return chainer.dataset.tabular._concat._Concat(self, *datasets) 
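A sketch of row-wise concatenation of two tabular datasets that share the same keys (Chainer v7 tabular API assumed):

import numpy as np
import chainer

a = chainer.dataset.tabular.from_data((('x', np.arange(3)), ('y', np.arange(3))))
b = chainer.dataset.tabular.from_data((('x', np.arange(3, 6)), ('y', np.arange(3, 6))))

combined = a.concat(b)
print(len(combined), combined.get_example(4))  # 6 rows; row 4 comes from b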
Example #25
Source File: tabular_dataset.py    From chainer with MIT License
def asdict(self):
        """Return a view with dict mode.

        Returns:
            A view whose :attr:`mode` is :class:`dict`.
        """
        return chainer.dataset.tabular._asmode._Asdict(self) 
Example #26
Source File: tabular_dataset.py    From chainer with MIT License
def astuple(self):
        """Return a view with tuple mode.

        Returns:
            A view whose :attr:`mode` is :class:`tuple`.
        """
        return chainer.dataset.tabular._asmode._Astuple(self) 
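asdict() and astuple() only change the view's mode; the underlying columns are shared. A tiny sketch (Chainer v7 tabular API assumed):

import numpy as np
import chainer

ds = chainer.dataset.tabular.from_data({'a': np.arange(3), 'b': np.arange(3)})
print(ds.asdict().mode is dict, ds.astuple().mode is tuple)  # True True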
Example #27
Source File: utils_pretrain.py    From models with MIT License
def get_words(self):
        # It returns a list of current words.
        return [self.dataset[(offset + self.iteration) % len(self.dataset)]
                for offset in self.offsets] 
Example #28
Source File: utils_pretrain.py    From models with MIT License
def epoch_detail(self):
        # Floating point version of epoch.
        return self.iteration * self.batch_size / len(self.dataset) 
Example #29
Source File: convert.py    From chainer with MIT License
def to_device(device, x):
    """Send an array to a given device.

    This method sends a given array to a given device. This method is used in
    :func:`~chainer.dataset.concat_examples`.
    You can also use this method in a custom converter method used in
    :class:`~chainer.training.Updater` and :class:`~chainer.training.Extension`
    such as :class:`~chainer.training.updaters.StandardUpdater` and
    :class:`~chainer.training.extensions.Evaluator`.

    See also :func:`chainer.dataset.concat_examples`.

    Args:
        device (None or int or device specifier): A device to which an array
            is sent. If it is a negative integer, an array is sent to CPU.
            If it is a positive integer, an array is sent to GPU with the
            given ID. If it is ``None``, an array is left in the original
            device. Also, any of device specifiers described at
            :class:`~chainer.backend.DeviceId` is accepted.
        x (:ref:`ndarray`): An array to send.

    Returns:
        Converted array.

    """
    device = _get_device(device)

    if device is None:
        return x
    return device.send(x) 
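A short sketch of the three device conventions described above; the GPU line is commented out because it needs CuPy and a CUDA device.

import numpy as np
from chainer.dataset import to_device

x = np.arange(6, dtype=np.float32)

x_cpu = to_device(-1, x)     # negative integer: keep (or move) the array on CPU
x_asis = to_device(None, x)  # None: leave the array on its current device
# x_gpu = to_device(0, x)    # non-negative integer: copy to GPU 0 (requires CuPy)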
Example #30
Source File: tabular_dataset.py    From chainer with MIT License
def with_converter(self, converter):
        """Override the behaviour of :meth:`convert`.

        This method overrides :meth:`convert`.

        Args:
            converter (callable): A new converter.

        Returns:
            A dataset with the new converter.
        """

        return chainer.dataset.tabular._with_converter._WithConverter(
            self, converter)
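A sketch of swapping in a custom converter (Chainer v7 tabular API assumed); the key names in the returned dict are made up:

import numpy as np
import chainer

dataset = chainer.dataset.tabular.from_data((('x', np.arange(4)), ('y', np.arange(4))))

dataset = dataset.with_converter(
    lambda x, y: {'input': np.asarray(x), 'label': np.asarray(y)})
print(sorted(dataset.convert(dataset.fetch())))  # ['input', 'label']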