Python torch.utils.data.Dataset() Examples

The following are 30 code examples of torch.utils.data.Dataset(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out the other available functions and classes of the torch.utils.data module.
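Before the project examples, here is a minimal, self-contained sketch of the Dataset protocol itself: a map-style subclass only needs __len__ and __getitem__, and a DataLoader handles batching and shuffling on top of it. The class name and tensor shapes below are illustrative, not taken from any project listed here.

import torch
from torch.utils.data import Dataset, DataLoader

class RandomVectorDataset(Dataset):
    """Toy map-style dataset: fixed-size random vectors with binary labels."""

    def __init__(self, num_samples=100, dim=8):
        self.features = torch.randn(num_samples, dim)
        self.labels = torch.randint(0, 2, (num_samples,))

    def __len__(self):
        # DataLoader relies on this to know how many indices it can sample.
        return self.features.size(0)

    def __getitem__(self, idx):
        # Return a single (feature, label) pair; DataLoader stacks them into batches.
        return self.features[idx], self.labels[idx]

loader = DataLoader(RandomVectorDataset(), batch_size=16, shuffle=True)
for x_batch, y_batch in loader:
    pass  # x_batch has shape [16, 8], y_batch has shape [16] (the last batch may be smaller)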
Example #1
Source File: data.py    From VSE-C with MIT License
def get_test_loader(split_name, data_name, vocab, crop_size, batch_size,
                    workers, opt, cap_suffix='caps'):
    dpath = os.path.join(opt.data_path, data_name)
    if opt.data_name.endswith('_precomp'):
        if not opt.use_external_captions:
            test_loader = get_precomp_loader(dpath, split_name, vocab, opt,
                                             batch_size, False, workers, cap_suffix)
        else:
            test_loader = get_precomp_train_caption_loader(dpath, split_name, vocab, opt,
                                                           batch_size, False, workers, cap_suffix)
    else:
        # Build Dataset Loader
        roots, ids = get_paths(dpath, data_name, opt.use_restval)

        transform = get_transform(data_name, split_name, opt)
        test_loader = get_loader_single(opt.data_name, split_name,
                                        roots[split_name]['img'],
                                        roots[split_name]['cap'],
                                        vocab, transform, ids=ids[split_name],
                                        batch_size=batch_size, shuffle=False,
                                        num_workers=workers,
                                        collate_fn=collate_fn)

    return test_loader 
Example #2
Source File: base.py    From nsf with MIT License
def __init__(self, num_epochs=None, *args, **kwargs):
        """Constructor.

        Args:
            dataset: A `Dataset` object to be loaded.
            batch_size: int, the size of each batch.
            shuffle: bool, whether to shuffle the dataset after each epoch.
            drop_last: bool, whether to drop last batch if its size is less than
                `batch_size`.
            num_epochs: int or None, number of epochs to iterate over the dataset.
                If None, defaults to infinity.
        """
        super().__init__(
            *args, **kwargs
        )
        self.finite_iterable = super().__iter__()
        self.counter = 0
        self.num_epochs = float('inf') if num_epochs is None else num_epochs 
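The constructor above only stores finite_iterable, counter, and num_epochs; the epoch-cycling itself presumably lives in the class's __iter__/__next__, which are not shown. A minimal sketch of how such a DataLoader wrapper could restart the finite iterator (the class name InfiniteLoader and the restart logic are assumptions, not code copied from nsf):

from torch.utils.data import DataLoader

class InfiniteLoader(DataLoader):  # hypothetical name, for illustration only
    def __init__(self, num_epochs=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.finite_iterable = super().__iter__()
        self.counter = 0
        self.num_epochs = float('inf') if num_epochs is None else num_epochs

    def __next__(self):
        try:
            return next(self.finite_iterable)
        except StopIteration:
            # One pass over the dataset finished; start another unless the
            # epoch budget is exhausted.
            self.counter += 1
            if self.counter >= self.num_epochs:
                raise
            self.finite_iterable = super().__iter__()
            return next(self.finite_iterable)

    def __iter__(self):
        return self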
Example #3
Source File: utils_NMT.py    From ConvLab with MIT License
def get_seq(pairs, lang, batch_size, type, max_len):
    x_seq = []
    y_seq = []
    ptr_seq = []
    for pair in pairs:
        x_seq.append(pair[0])
        y_seq.append(pair[1])
        ptr_seq.append(pair[2])
        if type:
            lang.index_words(pair[0])
            lang.index_words(pair[1])

    dataset = Dataset(x_seq, y_seq, ptr_seq, lang.word2index, lang.word2index, max_len)
    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=type,
                                              collate_fn=collate_fn)
    return data_loader
Example #4
Source File: data_loader.py    From LipReading with MIT License
def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False):
        """
        Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by
        a comma. Each new line is a different sample. Example below:

        /path/to/audio.wav,/path/to/audio.txt
        ...

        :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds
        :param manifest_filepath: Path to manifest csv as described above
        :param labels: String containing all the possible characters to map to
        :param normalize: Apply standard mean and deviation normalization to audio tensor
        :param augment: Apply random tempo and gain perturbations (default: False)
        """
        with open(manifest_filepath) as f:
            ids = f.readlines()
        ids = [x.strip().split(',') for x in ids]
        self.ids = ids
        self.size = len(ids)
        self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
        super(SpectrogramDataset, self).__init__(audio_conf, normalize, augment) 
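Given the manifest parsing above, self.ids[index] holds an [audio_path, transcript_path] pair and self.labels_map maps characters to integer ids. A hedged sketch of how a __getitem__ could consume them; the parse_audio helper (assumed to be inherited from the parser base class) and the exact return format are assumptions, not the LipReading source:

def __getitem__(self, index):
    audio_path, transcript_path = self.ids[index]
    with open(transcript_path, 'r', encoding='utf8') as f:
        transcript = f.read().strip()
    # Map each character to its integer id, dropping characters outside the label set.
    targets = [self.labels_map[ch] for ch in transcript if ch in self.labels_map]
    spect = self.parse_audio(audio_path)  # assumed helper from the parent class
    return spect, targets

def __len__(self):
    return self.size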
Example #5
Source File: utils_babi.py    From ConvLab with MIT License
def get_seq(pairs, lang, batch_size, type, max_len):
    x_seq = []
    y_seq = []
    ptr_seq = []
    gate_seq = []
    for pair in pairs:
        x_seq.append(pair[0])
        y_seq.append(pair[1])
        ptr_seq.append(pair[2])
        gate_seq.append(pair[3])
        if type:
            lang.index_words(pair[0])
            lang.index_words(pair[1])

    dataset = Dataset(x_seq, y_seq, ptr_seq, gate_seq, lang.word2index, lang.word2index, max_len)
    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=type,
                                              collate_fn=collate_fn)
    return data_loader
Example #6
Source File: sunrgbd_detection_dataset_hd.py    From H3DNet with MIT License
def __init__(self, data_path=None, split_set='train', num_points=20000,
        use_color=False, use_height=False, use_v1=False,
        augment=False, scan_idx_list=None):

        assert(num_points<=50000)
        self.use_v1 = use_v1 
        if use_v1:
            self.data_path = os.path.join(data_path, 'sunrgbd_pc_bbox_votes_50k_v1_' + split_set)
            # self.data_path = os.path.join('/scratch/cluster/yanght/Dataset/sunrgbd/sunrgbd_pc_bbox_votes_50k_v1_' + split_set)
        else:
            AssertionError("v2 data is not prepared")

        self.raw_data_path = os.path.join(ROOT_DIR, 'sunrgbd/sunrgbd_trainval')
        self.scan_names = sorted(list(set([os.path.basename(x)[0:6] \
            for x in os.listdir(self.data_path)])))

        if scan_idx_list is not None:
            self.scan_names = [self.scan_names[i] for i in scan_idx_list]
        self.num_points = num_points
        self.augment = augment
        self.use_color = use_color
        self.use_height = use_height 
Example #7
Source File: dataset.py    From seamseg with BSD 3-Clause "New" or "Revised" License
def __init__(self, root_dir, split_name, transform):
        super(ISSDataset, self).__init__()
        self.root_dir = root_dir
        self.split_name = split_name
        self.transform = transform

        # Folders
        self._img_dir = path.join(root_dir, ISSDataset._IMG_DIR)
        self._msk_dir = path.join(root_dir, ISSDataset._MSK_DIR)
        self._lst_dir = path.join(root_dir, ISSDataset._LST_DIR)
        for d in self._img_dir, self._msk_dir, self._lst_dir:
            if not path.isdir(d):
                raise IOError("Dataset sub-folder {} does not exist".format(d))

        # Load meta-data and split
        self._meta, self._images = self._load_split() 
Example #8
Source File: MyMNIST.py    From sgd-influence with MIT License
def __init__(self, root, train=True, transform=None, target_transform=None, download=False, seed=0):
        super(MNIST, self).__init__(root)
        self.seed = seed
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        if self.train:
            data_file = self.training_file
        else:
            data_file = self.test_file
        self.data, self.targets = torch.load(os.path.join(self.processed_folder, data_file)) 
Example #9
Source File: datasets.py    From bgd with MIT License
def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        if self.train:
            self.train_data, self.train_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.training_file))
        else:
            self.test_data, self.test_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file)) 
Example #10
Source File: dataset.py    From prediction-flow with MIT License
def __getitem__(self, idx):
        record = OrderedDict()

        for feat in chain(
                self.features.number_features,
                self.features.category_features):
            record[feat.name] = self.X_map[feat.name][idx]

        for feat in self.features.sequence_features:
            seq = self.X_map[feat.name][idx]
            record[feat.name] = Dataset.__pad_sequence(feat, seq)
            record[f"__{feat.name}_length"] = np.int64(seq.shape[0])

        if self.y is not None:
            record['label'] = self.y[idx]
        return record 
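Because each sample is returned as a dict of arrays and scalars, PyTorch's default collate function batches it into a dict of tensors keyed by feature name. A small illustrative usage sketch; the feature names are made up and dataset stands for an instance of the class above:

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in loader:
    # batch is a dict, e.g. batch['user_age'] -> tensor of shape [32],
    # batch['clicked_items'] -> tensor of shape [32, padded_seq_len],
    # batch['__clicked_items_length'] -> tensor of shape [32],
    # batch['label'] -> tensor of shape [32]
    pass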
Example #11
Source File: base.py    From LEDNet with MIT License
def transform(self, fn, lazy=True):
        """Returns a new dataset with each sample transformed by the
        transformer function `fn`.

        Parameters
        ----------
        fn : callable
            A transformer function that takes a sample as input and
            returns the transformed sample.
        lazy : bool, default True
            If False, transforms all samples at once. Otherwise,
            transforms each sample on demand. Note that if `fn`
            is stochastic, you must set lazy to True or you will
            get the same result on all epochs.

        Returns
        -------
        Dataset
            The transformed dataset.
        """
        trans = _LazyTransformDataset(self, fn)
        if lazy:
            return trans
        return SimpleDataset([i for i in trans]) 
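_LazyTransformDataset is not shown in this excerpt; based on the docstring, a minimal sketch of such a wrapper would apply fn at access time, so a stochastic fn is re-evaluated on every epoch. The class layout below is an assumption, not the LEDNet source:

from torch.utils.data import Dataset

class _LazyTransformDataset(Dataset):  # minimal sketch
    def __init__(self, dataset, fn):
        self._dataset = dataset
        self._fn = fn

    def __len__(self):
        return len(self._dataset)

    def __getitem__(self, idx):
        # The transform runs on access, so random augmentations differ per epoch.
        return self._fn(self._dataset[idx])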
Example #12
Source File: data_silo.py    From FARM with Apache License 2.0
def _dataset_from_chunk(cls, chunk, processor):
        """
        Creating a dataset for a chunk (= subset) of dicts. In multiprocessing:
          * we read in all dicts from a file
          * split all dicts into chunks
          * feed *one chunk* to *one process*
          => the *one chunk*  gets converted to *one dataset* (that's what we do here)
          * all datasets get collected and concatenated
        :param chunk: Instead of only having a list of dicts here we also supply an index (ascending int) for each.
            => [(0, dict), (1, dict) ...]
        :type chunk: list of tuples
        :param processor: FARM Processor (e.g. TextClassificationProcessor)
        :return: PyTorch Dataset
        """
        dicts = [d[1] for d in chunk]
        indices = [x[0] for x in chunk]
        dataset = processor.dataset_from_dicts(dicts=dicts, indices=indices)
        return dataset 
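The docstring says the per-chunk datasets "get collected and concatenated"; with plain PyTorch datasets that step can be expressed with torch.utils.data.ConcatDataset. The sequential loop below is an illustration only (in FARM the chunks are converted in a multiprocessing pool, and the concatenation code may differ); chunks is an assumed list of (index, dict) chunks:

from torch.utils.data import ConcatDataset

datasets = [DataSilo._dataset_from_chunk(chunk, processor) for chunk in chunks]
full_dataset = ConcatDataset(datasets)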
Example #13
Source File: dataloader.py    From FARM with Apache License 2.0
def covert_dataset_to_dataloader(dataset, sampler, batch_size):
    """
    Wraps a PyTorch Dataset with a DataLoader.

    :param dataset: Dataset to be wrapped.
    :type dataset: Dataset
    :param sampler: PyTorch sampler used to pick samples in a batch.
    :type sampler: Sampler
    :param batch_size: Number of samples in the batch.
    :return: A DataLoader that wraps the input Dataset.
    """
    sampler_initialized = sampler(dataset)
    data_loader = DataLoader(
        dataset, sampler=sampler_initialized, batch_size=batch_size
    )
    return data_loader 
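Note that sampler here is expected to be a sampler class rather than an instance, since the function instantiates it with the dataset (sampler(dataset)). A usage sketch, where my_dataset is a placeholder for any PyTorch Dataset:

from torch.utils.data import RandomSampler, SequentialSampler

eval_loader = covert_dataset_to_dataloader(my_dataset, SequentialSampler, batch_size=32)
train_loader = covert_dataset_to_dataloader(my_dataset, RandomSampler, batch_size=32)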
Example #14
Source File: classifier.py    From metal with Apache License 2.0
def resume_training(self, train_data, model_path, valid_data=None):
        """This model resume training of a classifier by reloading the appropriate state_dicts for each model

        Args:
           train_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
                X (data) and Y (labels) for the train split
            model_path: the path to the saved checpoint for resuming training
            valid_data: a tuple of Tensors (X,Y), a Dataset, or a DataLoader of
                X (data) and Y (labels) for the dev split
        """
        restore_state = self.checkpointer.restore(model_path)
        loss_fn = self._get_loss_fn()
        self.train()
        self._train_model(
            train_data=train_data,
            loss_fn=loss_fn,
            valid_data=valid_data,
            restore_state=restore_state,
        ) 
Example #15
Source File: classifier.py    From metal with Apache License 2.0
def _create_data_loader(self, data, **kwargs):
        """Converts input data into a DataLoader"""
        if data is None:
            return None

        # Set DataLoader config
        # NOTE: Not applicable if data is already a DataLoader
        config = {
            **self.config["train_config"]["data_loader_config"],
            **kwargs,
            "pin_memory": self.config["device"] != "cpu",
        }
        # Return data as DataLoader
        if isinstance(data, DataLoader):
            return data
        elif isinstance(data, Dataset):
            return DataLoader(data, **config)
        elif isinstance(data, (tuple, list)):
            return DataLoader(self._create_dataset(*data), **config)
        else:
            raise ValueError("Input data type not recognized.") 
Example #16
Source File: datasets.py    From signaltrain with GNU General Public License v3.0
def get_single_chunk(self):
        """
        Grabs audio and knobs either from files or from preloaded buffer(s)
        """
        if self.preload:  # This will typically be the case
            i = np.random.randint(0,high=len(self.x))  # pick a random line from preloaded audio
            in_audio, targ_audio, knobs_wc = self.x[i], self.y[i], self.knobs[i]  # note these might be, e.g. 10 seconds long
        else:
            in_audio, targ_audio, knobs_wc = self.read_one_new_file_pair() # read x, y, knobs

        # Grab a random chunk from within the total audio file
        assert len(in_audio) > self.chunk_size, f"Error: len(in_audio)={len(in_audio)}, must be > self.chunk_size={self.chunk_size}"
        ibgn = np.random.randint(0, len(in_audio) - self.chunk_size)
        x_item = in_audio[ibgn:ibgn+self.chunk_size]
        y_item = targ_audio[ibgn:ibgn+self.chunk_size]

        if self.rerun_effect:  # re-run the effect on this chunk, and replace the target audio
            y_item, x_item = self.effect.go_wc(x_item, knobs_wc)   # Apply the audio effect

        y_item = y_item[-self.y_size:]   # Format for expected output size

        # normalize knobs for nn usage
        kr = self.effect.knob_ranges   # kr is an abbreviation for 'knob ranges'
        knobs_nn = (knobs_wc - kr[:,0])/(kr[:,1]-kr[:,0]) - 0.5

        if self.augment:
            x_item, y_item = do_augment(x_item, y_item)

        return x_item.astype(self.dtype, copy=False), y_item.astype(self.dtype, copy=False), knobs_nn.astype(self.dtype, copy=False)

    # required part of the torch.utils.data.Dataset interface.  This is how DataLoader gets a new piece of data
Example #17
Source File: eval_hooks.py    From DenseMatchingBenchmark with MIT License
def __init__(self, cfg, dataset, interval=1):
        self.cfg = cfg.copy()
        assert isinstance(dataset, Dataset), \
            "dataset must be a Dataset object, not {}".format(type(dataset))
        self.dataset = dataset
        self.interval = interval 
Example #18
Source File: eval_hooks.py    From DenseMatchingBenchmark with MIT License
def __init__(self, cfg, dataset, interval=1):
        self.cfg = cfg.copy()
        assert isinstance(dataset, Dataset), \
            "dataset must be a Dataset object, not {}".format(type(dataset))
        self.dataset = dataset
        self.interval = interval 
Example #19
Source File: cocostuff.py    From SPNet with MIT License
def __repr__(self):
        fmt_str = "Dataset " + self.__class__.__name__ + "\n"
        fmt_str += "    Number of datapoints: {}\n".format(self.__len__())
        fmt_str += "    Split: {}\n".format(self.split)
        fmt_str += "    Root Location: {}\n".format(self.root)
        return fmt_str 
Example #20
Source File: dataset.py    From scVI with MIT License
def __getitem__(self, idx):
        """Implements @abstractcmethod in ``torch.utils.data.dataset.Dataset`` ."""
        return idx 
Example #21
Source File: utils_kvr.py    From ConvLab with MIT License
def get_seq(pairs, lang, batch_size, type, max_len):
    x_seq = []
    y_seq = []
    ptr_seq = []
    gate_seq = []
    entity = []
    entity_cal = []
    entity_nav = []
    entity_wet = []
    for pair in pairs:
        x_seq.append(pair[0])
        y_seq.append(pair[1])
        ptr_seq.append(pair[2])
        gate_seq.append(pair[3])
        entity.append(pair[4])
        entity_cal.append(pair[5])
        entity_nav.append(pair[6])
        entity_wet.append(pair[7])
        if type:
            lang.index_words(pair[0])
            lang.index_words(pair[1])

    dataset = Dataset(x_seq, y_seq, ptr_seq, gate_seq, lang.word2index, lang.word2index,
                      max_len, entity, entity_cal, entity_nav, entity_wet)
    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=type,
                                              collate_fn=collate_fn)
    return data_loader
Example #22
Source File: inc_ext.py    From iccv2019-inc with MIT License
def _load_meta(self):
        path = os.path.join(self.root, self.base_folder, self.meta['filename'])
        if not check_integrity(path, self.meta['md5']):
            raise RuntimeError('Dataset metadata file not found or corrupted.' +
                               ' You can use download=True to download it')
        with open(path, 'rb') as infile:
            if sys.version_info[0] == 2:
                data = pickle.load(infile)
            else:
                data = pickle.load(infile, encoding='latin1')
            self.classes = data[self.meta['key']]
        self.class_to_idx = {_class: i for i, _class in enumerate(self.classes)} 
Example #23
Source File: vis_hooks.py    From DenseMatchingBenchmark with MIT License
def __init__(self, dataset, cfg, interval=1):
        self.cfg = cfg.copy()
        if isinstance(dataset, Dataset):
            self.dataset = dataset
        else:
            raise TypeError("dataset must be a Dataset object, not {}".format(type(dataset)))
        self.interval = interval 
Example #24
Source File: data_utils.py    From cloudml-samples with Apache License 2.0
def download_data():
    """Download the data from Google Cloud Storage"""
    # Load the Dataset from the public GCS bucket
    bucket = storage.Client().bucket('cloud-samples-data')
    # Path to the data inside the public bucket
    blob = bucket.blob('ml-engine/sonar/sonar.all-data')
    # Download the data
    blob.download_to_filename('sonar.all-data') 
Example #25
Source File: utils.py    From audio with BSD 2-Clause "Simplified" License
def __init__(self, dataset: Dataset, location: str = ".cached") -> None:
        self.dataset = dataset
        self.location = location

        self._id = id(self)
        self._cache: List = [None] * len(dataset) 
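Only the constructor is shown. A plausible companion __getitem__ would serve items from self._cache after the first access and fall through to the wrapped dataset otherwise; this is a sketch of a simple in-memory variant, not necessarily how torchaudio's cached wrapper persists items to location:

def __getitem__(self, n: int):
    # Serve previously fetched items from the cache; otherwise fetch from the
    # wrapped dataset and remember the result.
    if self._cache[n] is not None:
        return self._cache[n]
    item = self.dataset[n]
    self._cache[n] = item
    return item

def __len__(self) -> int:
    return len(self.dataset)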
Example #26
Source File: classifier.py    From metal with Apache License 2.0
def _create_dataset(self, *data):
        """Converts input data to the appropriate Dataset"""
        # Make sure data is a tuple of dense tensors
        data = [self._to_torch(x, dtype=torch.FloatTensor) for x in data]
        return TensorDataset(*data) 
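TensorDataset simply zips the tensors row-wise, which is why the (X, Y) tuples accepted elsewhere in this class can be fed straight to a DataLoader. A small illustration with made-up shapes:

import torch
from torch.utils.data import TensorDataset, DataLoader

X = torch.randn(100, 5)
Y = torch.randint(0, 2, (100,)).float()
dataset = TensorDataset(X, Y)          # dataset[i] == (X[i], Y[i])
loader = DataLoader(dataset, batch_size=10)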
Example #27
Source File: convert_lmdb.py    From torch-toolbox with BSD 3-Clause "New" or "Revised" License
def generate_lmdb_dataset(
        data_set: Dataset,
        save_dir: str,
        name: str,
        num_workers=0,
        max_size_rate=1.0,
        write_frequency=5000):
    data_loader = DataLoader(
        data_set,
        num_workers=num_workers,
        collate_fn=lambda x: x)
    num_samples = len(data_set)
    check_dir(save_dir)
    lmdb_path = os.path.join(save_dir, '{}.lmdb'.format(name))
    db = lmdb.open(lmdb_path, subdir=False,
                   map_size=int(1099511627776 * max_size_rate),
                   readonly=False, meminit=True, map_async=True)
    txn = db.begin(write=True)
    for idx, data in enumerate(tqdm(data_loader)):
        txn.put(get_key(idx), dumps_pyarrow(data[0]))
        if idx % write_frequency == 0 and idx > 0:
            txn.commit()
            txn = db.begin(write=True)
    txn.put(b'__len__', dumps_pyarrow(num_samples))
    try:
        classes = data_set.classes
        class_to_idx = data_set.class_to_idx
        txn.put(b'classes', dumps_pyarrow(classes))
        txn.put(b'class_to_idx', dumps_pyarrow(class_to_idx))
    except AttributeError:
        pass

    txn.commit()
    db.sync()
    db.close() 
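A hedged usage sketch: the samples must be serializable by the project's dumps_pyarrow helper, so one common pattern is an ImageFolder whose loader returns raw file bytes instead of decoded images. The paths and the loader below are placeholders, not taken from the torch-toolbox documentation:

from torchvision.datasets import ImageFolder

def raw_bytes_loader(path):
    # Store the undecoded file contents; decoding happens when the LMDB is read back.
    with open(path, 'rb') as f:
        return f.read()

train_set = ImageFolder('data/train', loader=raw_bytes_loader)
generate_lmdb_dataset(train_set, save_dir='./lmdb', name='train', num_workers=4)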
Example #28
Source File: classifier.py    From metal with Apache License 2.0
def _get_predictions(self, data, break_ties="random", return_probs=False, **kwargs):
        """Computes predictions in batch, given a labeled dataset

        Args:
            data: a PyTorch DataLoader, Dataset, or tuple with Tensors (X,Y):
                X: The input for the predict method
                Y: An [n] or [n, 1] torch.Tensor or np.ndarray of target labels
                    in {1,...,k}
            break_ties: How to break ties when making predictions
            return_probs: Return the predicted probabilities as well

        Returns:
            Y_p: A Tensor of predictions
            Y: A Tensor of labels
            [Optionally: Y_s: An [n, k] np.ndarray of predicted probabilities]
        """
        data_loader = self._create_data_loader(data)
        Y_p = []
        Y = []
        Y_s = []

        # Do batch evaluation by default, getting the predictions and labels
        for batch_num, data in enumerate(data_loader):
            Xb, Yb = data
            Y.append(self._to_numpy(Yb))

            # Optionally move to device
            if self.config["device"] != "cpu":
                Xb = place_on_gpu(Xb)

            # Append predictions and labels from DataLoader
            Y_pb, Y_sb = self.predict(
                Xb, break_ties=break_ties, return_probs=True, **kwargs
            )
            Y_p.append(self._to_numpy(Y_pb))
            Y_s.append(self._to_numpy(Y_sb))
        Y_p, Y, Y_s = map(self._stack_batches, [Y_p, Y, Y_s])
        if return_probs:
            return Y_p, Y, Y_s
        else:
            return Y_p, Y 
Example #29
Source File: datasets.py    From signaltrain with GNU General Public License v3.0
def process_audio(self):  # TODO: not used yet following torchaudio
        """ Render raw audio as pytorch-friendly file. TODO: not done yet.
        """
        if os.path.exists(self.processed_dir):
            return

        # get a list of available audio files.  Note that knob settings are included in the target filenames
        input_filenames = sorted(glob.glob(self.path+'/'+'input_*'))
        self.target_filenames = sorted(glob.glob(self.path+'/'+'target_*'))
        assert len(input_filenames) == len(self.target_filenames)   # TODO: One can imagine a scheme with multiple targets per input
        print("Dataset: Found", self.__len__(), "raw audio i-o pairs in path", self.path)
Example #30
Source File: vis_hooks.py    From DenseMatchingBenchmark with MIT License
def __init__(self, dataset, cfg, interval=1):
        self.cfg = cfg.copy()
        if isinstance(dataset, Dataset):
            self.dataset = dataset
        else:
            raise TypeError("dataset must be a Dataset object, not {}".format(type(dataset)))
        self.interval = interval