Python torch.utils.data.ConcatDataset() Examples
The following are 30 code examples of torch.utils.data.ConcatDataset(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.utils.data, or try the search function.
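ConcatDataset chains several map-style datasets behind a single index space: len() is the sum of the part lengths, and indexing dispatches to the correct sub-dataset through a cumulative-size table. Before the project examples, here is a minimal, self-contained sketch (the toy TensorDatasets are illustrative, not taken from any project below):

import torch
from torch.utils.data import ConcatDataset, TensorDataset

# Two small datasets standing in for any map-style datasets.
a = TensorDataset(torch.zeros(3, 2))   # indices 0..2
b = TensorDataset(torch.ones(5, 2))    # indices 3..7 after concatenation

combined = ConcatDataset([a, b])

print(len(combined))              # 8 == len(a) + len(b)
print(combined[0][0])             # comes from `a`
print(combined[3][0])             # first sample of `b`
print(combined.cumulative_sizes)  # [3, 8] -- used internally to route indices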
Example #1
Source File: run_hnn.py From mt-dnn with MIT License | 6 votes

def build_training_data(args, tokenizer, tasks):
    dprd_task = DPRDTask(tokenizer)
    if args.wiki_data:
        wiki_task = WikiWSCRTask(tokenizer)
        train_data = wiki_task.get_train_dataset(args.wiki_data, args.max_seq_length, input_type=tasks)
    else:
        train_data = dprd_task.get_train_dataset(args.data_dir, args.max_seq_length, input_type=tasks)
    if args.dev_train:
        _data = dprd_task.get_dev_dataset(args.data_dir, args.max_seq_length, input_type=tasks)
        _data = [e.data for e in _data if e.name == 'DPRD-test'][0]
        train_data = ConcatDataset([train_data, _data])
    if args.gap_data:
        gap_data = gap_task.get_train_dataset(args.gap_data, args.max_seq_length, input_type=tasks)
        train_data = ConcatDataset([train_data, gap_data])
        if args.dev_train:
            gap_data = [e.data for e in gap_task.get_dev_dataset(args.gap_data, args.max_seq_length, input_type=tasks)]
            train_data = ConcatDataset(gap_data + [train_data])
    return train_data
Example #2
Source File: data.py From torch-kalman with MIT License | 6 votes

def from_dataframe(cls,
                   dataframe: 'DataFrame',
                   group_colname: str,
                   time_colname: str,
                   dt_unit: Optional[str],
                   measure_colnames: Optional[Sequence[str]] = None,
                   X_colnames: Optional[Sequence[str]] = None,
                   y_colnames: Optional[Sequence[str]] = None,
                   **kwargs) -> 'TimeSeriesDataLoader':
    dataset = ConcatDataset(
        datasets=[
            TimeSeriesDataset.from_dataframe(
                dataframe=df,
                group_colname=group_colname,
                time_colname=time_colname,
                measure_colnames=measure_colnames,
                X_colnames=X_colnames,
                y_colnames=y_colnames,
                dt_unit=dt_unit
            )
            for g, df in dataframe.groupby(group_colname)
        ]
    )
    return cls(dataset=dataset, **kwargs)
Example #3
Source File: data.py From IIC with MIT License | 6 votes

def _create_mapping_loader(config, dataset_class, partitions):
    imgs_list = []
    for partition in partitions:
        imgs_curr = dataset_class(
            **{"config": config,
               "split": partition,
               "purpose": "test"}  # return testing tuples, image and label
        )
        if config.use_doersch_datasets:
            imgs_curr = DoerschDataset(config, imgs_curr)
        imgs_list.append(imgs_curr)

    imgs = ConcatDataset(imgs_list)
    dataloader = torch.utils.data.DataLoader(imgs,
                                             batch_size=config.batch_sz,  # full batch
                                             shuffle=False,  # no point since not trained on
                                             num_workers=0,
                                             drop_last=False)
    return dataloader
Example #4
Source File: test_its_journal_2019.py From ehpi_action_recognition with MIT License | 6 votes

def get_test_set_lab(dataset_path: str, image_size: ImageSize):
    num_joints = 15
    datasets = [
        EhpiLSTMDataset(os.path.join(dataset_path, "JOURNAL_2019_03_TEST_VUE01_30FPS"),
                        transform=transforms.Compose([
                            RemoveJointsOutsideImgEhpi(image_size),
                            NormalizeEhpi(image_size)
                        ]), num_joints=num_joints, dataset_part=DatasetPart.TEST),
        EhpiLSTMDataset(os.path.join(dataset_path, "JOURNAL_2019_03_TEST_VUE02_30FPS"),
                        transform=transforms.Compose([
                            RemoveJointsOutsideImgEhpi(image_size),
                            NormalizeEhpi(image_size)
                        ]), num_joints=num_joints, dataset_part=DatasetPart.TEST),
    ]
    for dataset in datasets:
        dataset.print_label_statistics()
    return ConcatDataset(datasets)
Example #5
Source File: full_omniglot.py From learn2learn with MIT License | 6 votes

def __init__(self, root, transform=None, target_transform=None, download=False):
    self.root = os.path.expanduser(root)
    self.transform = transform
    self.target_transform = target_transform

    # Set up both the background and eval dataset
    omni_background = Omniglot(self.root, background=True, download=download)
    # Eval labels also start from 0.
    # It's important to add 964 to label values in eval so they don't overwrite background dataset.
    omni_evaluation = Omniglot(self.root,
                               background=False,
                               download=download,
                               target_transform=lambda x: x + len(omni_background._characters))

    self.dataset = ConcatDataset((omni_background, omni_evaluation))
    self._bookkeeping_path = os.path.join(self.root, 'omniglot-bookkeeping.pkl')
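The target_transform offset above is worth generalizing: ConcatDataset does not remap labels, so when merging classification datasets whose class indices both start at 0, the second dataset's labels must be shifted past the first dataset's classes. A minimal sketch of that pattern with hypothetical toy datasets:

import torch
from torch.utils.data import ConcatDataset, Dataset, TensorDataset

class OffsetTargets(Dataset):
    """Wrap a classification dataset and shift its integer labels by a fixed offset."""
    def __init__(self, dataset, offset):
        self.dataset = dataset
        self.offset = offset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        x, y = self.dataset[idx]
        return x, y + self.offset

# Two toy 2-class datasets whose labels both start at 0.
first = TensorDataset(torch.randn(4, 3), torch.tensor([0, 1, 0, 1]))
second = TensorDataset(torch.randn(4, 3), torch.tensor([0, 1, 1, 0]))

# Shift the second dataset's labels past the first one's classes (0..1 -> 2..3).
merged = ConcatDataset([first, OffsetTargets(second, offset=2)])
print(sorted({int(merged[i][1]) for i in range(len(merged))}))  # [0, 1, 2, 3]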
Example #6
Source File: __init__.py From SSD with MIT License | 6 votes

def build_dataset(dataset_list, transform=None, target_transform=None, is_train=True):
    assert len(dataset_list) > 0
    datasets = []
    for dataset_name in dataset_list:
        data = DatasetCatalog.get(dataset_name)
        args = data['args']
        factory = _DATASETS[data['factory']]
        args['transform'] = transform
        args['target_transform'] = target_transform
        if factory == VOCDataset:
            args['keep_difficult'] = not is_train
        elif factory == COCODataset:
            args['remove_empty'] = is_train
        dataset = factory(**args)
        datasets.append(dataset)
    # for testing, return a list of datasets
    if not is_train:
        return datasets
    dataset = datasets[0]
    if len(datasets) > 1:
        dataset = ConcatDataset(datasets)
    return [dataset]
Example #7
Source File: dataset_enum.py From BatchBALD with GNU General Public License v3.0 | 6 votes

def get_targets(dataset):
    """Get the targets of a dataset without any target transforms(!)."""
    if isinstance(dataset, TransformedDataset):
        return get_targets(dataset.dataset)
    if isinstance(dataset, data.Subset):
        targets = get_targets(dataset.dataset)
        return torch.as_tensor(targets)[dataset.indices]
    if isinstance(dataset, data.ConcatDataset):
        return torch.cat([get_targets(sub_dataset) for sub_dataset in dataset.datasets])

    if isinstance(dataset, (datasets.MNIST, datasets.ImageFolder,)):
        return torch.as_tensor(dataset.targets)
    if isinstance(dataset, datasets.SVHN):
        return dataset.labels

    raise NotImplementedError(f"Unknown dataset {dataset}!")
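The ConcatDataset branch above simply recurses into dataset.datasets and concatenates the per-part targets. A toy illustration of the same idea, specialized to TensorDataset for brevity (not BatchBALD code):

import torch
from torch.utils import data

# Toy stand-ins for two labeled datasets.
ds_a = data.TensorDataset(torch.randn(3, 2), torch.tensor([0, 1, 2]))
ds_b = data.TensorDataset(torch.randn(2, 2), torch.tensor([1, 0]))
concat = data.ConcatDataset([ds_a, ds_b])

def get_targets(dataset):
    if isinstance(dataset, data.ConcatDataset):
        return torch.cat([get_targets(d) for d in dataset.datasets])
    if isinstance(dataset, data.Subset):
        return get_targets(dataset.dataset)[dataset.indices]
    return dataset.tensors[1]  # TensorDataset stores (inputs, targets)

print(get_targets(concat))  # tensor([0, 1, 2, 1, 0])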
Example #8
Source File: dataset_enum.py From BatchBALD with GNU General Public License v3.0 | 6 votes

def get_CINIC10(root="./"):
    cinic_directory = root + "data/CINIC-10"
    cinic_mean = [0.47889522, 0.47227842, 0.43047404]
    cinic_std = [0.24205776, 0.23828046, 0.25874835]
    train_transform = transforms.Compose(
        [transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip()])

    shared_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mean=cinic_mean, std=cinic_std)])

    train_dataset = datasets.ImageFolder(cinic_directory + '/train')
    validation_dataset = datasets.ImageFolder(cinic_directory + '/valid')

    # Concatenate train and validation set to have more samples.
    merged_train_dataset = torch.utils.data.ConcatDataset([train_dataset, validation_dataset])

    test_dataset = datasets.ImageFolder(cinic_directory + '/test')

    return DataSource(
        train_dataset=merged_train_dataset,
        test_dataset=test_dataset,
        shared_transform=shared_transform,
        train_transform=train_transform,
    )
Example #9
Source File: train_its_journal_2019.py From ehpi_action_recognition with MIT License | 6 votes

def get_training_set_gt(dataset_path: str, image_size: ImageSize):
    num_joints = 15
    left_indexes: List[int] = [3, 4, 5, 9, 10, 11]
    right_indexes: List[int] = [6, 7, 8, 12, 13, 14]
    datasets: List[EhpiLSTMDataset] = [
        EhpiLSTMDataset(os.path.join(dataset_path, "JOURNAL_2019_03_GT_30fps"),
                        transform=transforms.Compose([
                            RemoveJointsOutsideImgEhpi(image_size),
                            ScaleEhpi(image_size),
                            TranslateEhpi(image_size),
                            FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                            NormalizeEhpi(image_size)
                        ]), num_joints=num_joints),
    ]
    for dataset in datasets:
        dataset.print_label_statistics()
    return ConcatDataset(datasets)
Example #10
Source File: marcuhmot.py From tracking_wo_bnw with GNU General Public License v3.0 | 5 votes

def __init__(self, split, dataloader):
    print("[*] Loading Market1501")
    market = Market1501('gt_bbox', **dataloader)
    print("[*] Loading CUHK03")
    cuhk = CUHK03('labeled', **dataloader)
    print("[*] Loading MOT")
    mot = MOTreIDWrapper(split, dataloader)

    self.dataset = ConcatDataset([market, cuhk, mot])
Example #11
Source File: data_silo.py From FARM with Apache License 2.0 | 5 votes

def random_split_ConcatDataset(self, ds, lengths):
    """
    Roughly split a ConcatDataset into non-overlapping new datasets of given lengths.
    Samples inside the ConcatDataset should already be shuffled.

    :param ds: Dataset to be split
    :type ds: Dataset
    :param lengths: lengths of splits to be produced
    :type lengths: list
    """
    if sum(lengths) != len(ds):
        raise ValueError("Sum of input lengths does not equal the length of the input dataset!")

    try:
        idx_dataset = np.where(np.array(ds.cumulative_sizes) > lengths[0])[0][0]
    except IndexError:
        raise Exception("All dataset chunks are being assigned to train set leaving no samples for dev set. "
                        "Either consider increasing dev_split or setting it to 0.0\n"
                        f"Cumulative chunk sizes: {ds.cumulative_sizes}\n"
                        f"train/dev split: {lengths}")

    assert idx_dataset >= 1, "Dev_split ratio is too large, there is no data in train set. " \
                             f"Please lower dev_split = {self.processor.dev_split}"

    train = ConcatDataset(ds.datasets[:idx_dataset])
    test = ConcatDataset(ds.datasets[idx_dataset:])
    return train, test
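FARM's helper relies on ConcatDataset.cumulative_sizes, the running total of sub-dataset lengths that ConcatDataset maintains for index routing. A small sketch of the same boundary search on toy data (the chunk sizes here are illustrative):

import numpy as np
import torch
from torch.utils.data import ConcatDataset, TensorDataset

# Three chunks of sizes 4, 4 and 2; cumulative_sizes becomes [4, 8, 10].
chunks = [TensorDataset(torch.zeros(n, 1)) for n in (4, 4, 2)]
ds = ConcatDataset(chunks)
print(ds.cumulative_sizes)  # [4, 8, 10]

# As in the helper above: find the first chunk boundary beyond the requested
# train length, then split at whole-chunk granularity (hence "roughly").
lengths = [7, 3]
idx_dataset = np.where(np.array(ds.cumulative_sizes) > lengths[0])[0][0]  # -> 1
train = ConcatDataset(ds.datasets[:idx_dataset])
dev = ConcatDataset(ds.datasets[idx_dataset:])
print(len(train), len(dev))  # 4 6 -- the split rounds down to a chunk boundary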
Example #12
Source File: base.py From DSD-SATN with Apache License 2.0 | 5 votes

def _create_data_loader(self, train_flag=True, hard_minging=False):
    print('gathering datasets')
    if self.internet:
        datasets = Internet(train_flag=train_flag, high_resolution=self.high_resolution,
                            spawn=self.receptive_field, video=self.video)
    elif self.test_single:
        datasets = Demo_Loader(train_flag=train_flag, high_resolution=self.high_resolution)
    elif self.eval_pw3d:
        datasets = PW3D(train_flag=train_flag, high_resolution=self.high_resolution,
                        spawn=self.receptive_field, video=self.video,
                        kps_alpha_format=self.alpha_format)
    else:
        datasets_list = []
        if self.with_h36m:
            h36m = hum36m_dataloader(scale_range=[1.0, 1.6], train_flag=train_flag)  # [1.4, 1.6]
            datasets_list = [h36m]
        if self.with_up:
            updataset = UP(train_flag=train_flag, high_resolution=self.high_resolution)
            datasets_list.append(updataset)
        if self.with_mpii:
            mpii = MPIIDataset(train_flag=train_flag, high_resolution=self.high_resolution)
            datasets_list.append(mpii)
        if self.with_aich:
            aich = AICH(train_flag=train_flag, high_resolution=self.high_resolution)
            datasets_list.append(aich)
        if self.with_pa:
            pa = Penn_Action(train_flag=train_flag, high_resolution=self.high_resolution,
                             kps_alpha_format=self.alpha_format, spawn=self.receptive_field,
                             video=self.video, receptive_field=self.receptive_field)
            datasets_list.append(pa)
        datasets = torch.utils.data.ConcatDataset(list(datasets_list))
    print('gathered datasets')
    return DataLoader(dataset=datasets,
                      batch_size=self.batch_size if train_flag else self.val_batch_size,
                      shuffle=True, drop_last=False, pin_memory=True, num_workers=self.nw)
Example #13
Source File: base.py From DSD-SATN with Apache License 2.0 | 5 votes

def _create_adv_data_loader(self, data_adv_set):
    data_set = []
    for data_set_name in data_adv_set:
        data_set_path = config.data_set_path[data_set_name]
        if data_set_name == 'mosh':
            mosh = Mosh(data_set_path=data_set_path)
            data_set.append(mosh)
        else:
            msg = 'invalid adv dataset'
            sys.exit(msg)

    con_adv_dataset = ConcatDataset(data_set)
    return DataLoader(dataset=con_adv_dataset, batch_size=self.batch_size,
                      shuffle=True, drop_last=True, pin_memory=True)
Example #14
Source File: dataset_enum.py From BatchBALD with GNU General Public License v3.0 | 5 votes

def get_RepeatedMNIST():
    # num_classes = 10, input_size = 28
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.1307,), (0.3081,))])
    org_train_dataset = datasets.MNIST("data", train=True, download=True, transform=transform)
    train_dataset = data.ConcatDataset([org_train_dataset] * 3)
    test_dataset = datasets.MNIST("data", train=False, transform=transform)
    return DataSource(train_dataset=train_dataset, test_dataset=test_dataset)
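Note the [org_train_dataset] * 3 idiom: the list holds the same dataset object three times, so ConcatDataset triples the length without copying any data (in contrast to the copy.deepcopy variants in Examples #18 and #19 below). A toy sketch of the behavior:

import torch
from torch.utils.data import ConcatDataset, TensorDataset

base = TensorDataset(torch.arange(5).unsqueeze(1))

# Repeat the same underlying dataset three times; no data is copied --
# all three entries reference the same object.
repeated = ConcatDataset([base] * 3)
print(len(repeated))                   # 15
print(repeated[0][0], repeated[5][0])  # same sample: index 5 wraps back to base[0]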
Example #15
Source File: data.py From IIC with MIT License | 5 votes

def _create_dataloaders(config, dataset_class):
    # unlike in clustering, each dataloader here returns pairs of images - we
    # need the matrix relation between them
    dataloaders = []
    do_shuffle = (config.num_dataloaders == 1)
    for d_i in xrange(config.num_dataloaders):
        print("Creating paired dataloader %d out of %d time %s" %
              (d_i, config.num_dataloaders, datetime.now()))
        sys.stdout.flush()

        train_imgs_list = []
        for train_partition in config.train_partitions:
            train_imgs_curr = dataset_class(
                **{"config": config,
                   "split": train_partition,
                   "purpose": "train"}  # return training tuples, not including labels
            )
            if config.use_doersch_datasets:
                train_imgs_curr = DoerschDataset(config, train_imgs_curr)
            train_imgs_list.append(train_imgs_curr)

        train_imgs = ConcatDataset(train_imgs_list)
        train_dataloader = torch.utils.data.DataLoader(train_imgs,
                                                       batch_size=config.dataloader_batch_sz,
                                                       shuffle=do_shuffle,
                                                       num_workers=0,
                                                       drop_last=False)
        if d_i > 0:
            assert (len(train_dataloader) == len(dataloaders[d_i - 1]))
        dataloaders.append(train_dataloader)

    num_train_batches = len(dataloaders[0])
    print("Length of paired datasets vector %d" % len(dataloaders))
    print("Number of batches per epoch: %d" % num_train_batches)
    sys.stdout.flush()
    return dataloaders
Example #16
Source File: build.py From afm_cvpr2019 with MIT License | 5 votes

def build_train_dataset(config):
    root_list = [osp.abspath(osp.join(osp.dirname(__file__), '..', 'data', f))
                 for f in config.DATASETS.TRAIN]
    IN_RES = [config.INPUT.IN_RES] * 2
    OUT_RES = [config.INPUT.OUT_RES] * 2
    get_dataset = lambda path: AFMTrainDataset(path, img_res=IN_RES, afm_res=OUT_RES)
    dataset = data.ConcatDataset(list(map(get_dataset, root_list)))
    dataset = data.DataLoader(dataset, batch_size=config.SOLVER.BATCH_SIZE, shuffle=True,
                              num_workers=config.DATALOADER.NUM_WORKERS, pin_memory=True)
    return dataset
Example #17
Source File: run_hnn.py From mt-dnn with MIT License | 5 votes

def build_training_data_mt(args, tokenizer):
    if args.group_tasks:
        return build_training_data(args, tokenizer, args.tasks)
    else:
        data = []
        for t in args.tasks:
            data.append(build_training_data(args, tokenizer, [t]))
        return ConcatDataset(data)
Example #18
Source File: data.py From continual-learning with MIT License | 5 votes

def get_dataset(name, type='train', download=True, capacity=None, permutation=None,
                dir='./datasets', verbose=False, target_transform=None):
    '''Create [train|valid|test]-dataset.'''

    data_name = 'mnist' if name == 'mnist28' else name
    dataset_class = AVAILABLE_DATASETS[data_name]

    # specify image-transformations to be applied
    dataset_transform = transforms.Compose([
        *AVAILABLE_TRANSFORMS[name],
        transforms.Lambda(lambda x: _permutate_image_pixels(x, permutation)),
    ])

    # load data-set
    dataset = dataset_class('{dir}/{name}'.format(dir=dir, name=data_name),
                            train=False if type == 'test' else True,
                            download=download, transform=dataset_transform,
                            target_transform=target_transform)

    # print information about dataset on the screen
    if verbose:
        print(" --> {}: '{}'-dataset consisting of {} samples".format(name, type, len(dataset)))

    # if dataset is (possibly) not large enough, create copies until it is.
    if capacity is not None and len(dataset) < capacity:
        dataset = ConcatDataset([copy.deepcopy(dataset)
                                 for _ in range(int(np.ceil(capacity / len(dataset))))])

    return dataset

#----------------------------------------------------------------------------------------------------------#
Example #19
Source File: data.py From pytorch-deep-generative-replay with MIT License | 5 votes

def get_dataset(name, train=True, permutation=None, capacity=None):
    dataset = (TRAIN_DATASETS[name] if train else TEST_DATASETS[name])()
    dataset.transform = transforms.Compose([
        dataset.transform,
        transforms.Lambda(lambda x: _permutate_image_pixels(x, permutation)),
    ])

    if capacity is not None and len(dataset) < capacity:
        return ConcatDataset([
            copy.deepcopy(dataset) for _ in
            range(math.ceil(capacity / len(dataset)))
        ])
    else:
        return dataset
Example #20
Source File: core.py From texture_fields with MIT License | 5 votes

def __init__(self, dataset_folder, fields, split=None, classes=None,
             no_except=True, transform=None):
    # Read metadata file
    metadata_file = os.path.join(dataset_folder, 'metadata.yaml')

    if os.path.exists(metadata_file):
        with open(metadata_file, 'r') as f:
            metadata = yaml.load(f)
    else:
        metadata = {}

    # If classes is None, use all subfolders
    if classes is None:
        classes = os.listdir(dataset_folder)
        classes = [c for c in classes
                   if os.path.isdir(os.path.join(dataset_folder, c))]

    # Get all sub-datasets
    self.datasets_classes = []
    for c in classes:
        subpath = os.path.join(dataset_folder, c)
        if not os.path.isdir(subpath):
            logger.warning('Class %s does not exist in dataset.' % c)
        metadata_c = metadata.get(c, {'id': c, 'name': 'n/a'})
        dataset = Shapes3dClassDataset(subpath, fields, split,
                                       metadata_c, no_except,
                                       transform=transform)
        self.datasets_classes.append(dataset)

    self._concat_dataset = data.ConcatDataset(self.datasets_classes)
Example #21
Source File: train_ehpi.py From ehpi_action_recognition with MIT License | 5 votes

def get_train_set(dataset_path: str, image_size: ImageSize):
    num_joints = 15
    left_indexes: List[int] = [3, 4, 5, 9, 10, 11]
    right_indexes: List[int] = [6, 7, 8, 12, 13, 14]
    datasets: List[EhpiDataset] = [
        # Set 1
        EhpiDataset(os.path.join(dataset_path, "ofp_record_2019_03_11_HSRT_30FPS"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints, dataset_part=DatasetPart.TEST),
        # Set 2
        EhpiDataset(os.path.join(dataset_path, "2019_03_13_Freilichtmuseum_30FPS"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints, dataset_part=DatasetPart.TRAIN),
    ]
    for dataset in datasets:
        dataset.print_label_statistics()
    return ConcatDataset(datasets)
Example #22
Source File: train_its_journal_2019.py From ehpi_action_recognition with MIT License | 5 votes

def get_training_set_both(dataset_path: str, image_size: ImageSize):
    num_joints = 15
    left_indexes: List[int] = [3, 4, 5, 9, 10, 11]
    right_indexes: List[int] = [6, 7, 8, 12, 13, 14]
    datasets: List[EhpiLSTMDataset] = [
        EhpiLSTMDataset(os.path.join(dataset_path, "JOURNAL_2019_03_POSEALGO_30fps"),
                        transform=transforms.Compose([
                            RemoveJointsOutsideImgEhpi(image_size),
                            ScaleEhpi(image_size),
                            TranslateEhpi(image_size),
                            FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                            NormalizeEhpi(image_size)
                        ]), num_joints=num_joints),
        EhpiLSTMDataset(os.path.join(dataset_path, "JOURNAL_2019_03_GT_30fps"),
                        transform=transforms.Compose([
                            RemoveJointsOutsideImgEhpi(image_size),
                            ScaleEhpi(image_size),
                            TranslateEhpi(image_size),
                            FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                            NormalizeEhpi(image_size)
                        ]), num_joints=num_joints),
    ]
    for dataset in datasets:
        dataset.print_label_statistics()
    return ConcatDataset(datasets)
Example #23
Source File: train_ehpi_itsc_2019_ofp.py From ehpi_action_recognition with MIT License | 5 votes

def get_sim_gt_only(dataset_path: str, image_size: ImageSize):
    num_joints = 15
    left_indexes: List[int] = [3, 4, 5, 9, 10, 11]
    right_indexes: List[int] = [6, 7, 8, 12, 13, 14]
    datasets: List[EhpiDataset] = [
        EhpiDataset(os.path.join(dataset_path, "ofp_sim_gt_equal_30fps"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        RemoveJointsEhpi(indexes_to_remove=foot_indexes,
                                         indexes_to_remove_2=knee_indexes,
                                         probability=0.25),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints),
        EhpiDataset(os.path.join(dataset_path, "ofp_from_mocap_gt_30fps"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        RemoveJointsEhpi(indexes_to_remove=foot_indexes,
                                         indexes_to_remove_2=knee_indexes,
                                         probability=0.25),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints),
    ]
    for dataset in datasets:
        dataset.print_label_statistics()
    return ConcatDataset(datasets)
Example #24
Source File: train_ehpi_itsc_2019_ofp.py From ehpi_action_recognition with MIT License | 5 votes

def get_sim_pose_algo_only(dataset_path: str, image_size: ImageSize):
    num_joints = 15
    left_indexes: List[int] = [3, 4, 5, 9, 10, 11]
    right_indexes: List[int] = [6, 7, 8, 12, 13, 14]
    datasets: List[EhpiDataset] = [
        EhpiDataset(os.path.join(dataset_path, "ofp_sim_pose_algo_equal_30fps"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        RemoveJointsEhpi(indexes_to_remove=foot_indexes,
                                         indexes_to_remove_2=knee_indexes,
                                         probability=0.25),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints),
        EhpiDataset(os.path.join(dataset_path, "ofp_from_mocap_pose_algo_30fps"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        RemoveJointsEhpi(indexes_to_remove=foot_indexes,
                                         indexes_to_remove_2=knee_indexes,
                                         probability=0.25),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints),
    ]
    for dataset in datasets:
        dataset.print_label_statistics()
    return ConcatDataset(datasets)
Example #25
Source File: dataset.py From source_separation with Apache License 2.0 | 5 votes

def get_concated_datasets(meta_dir_list: List[str], batch_size: int, num_workers: int,
                          meta_cls_list: List[MetaFrame], fix_len: int = 0,
                          skip_audio: bool = False, sample_rate: int = 44100,
                          audio_mask: bool = False) -> Tuple[SpeechDataLoader, SpeechDataLoader]:
    assert all([os.path.isdir(x) for x in meta_dir_list]), 'There are invalid directory paths!'
    assert len(meta_dir_list) == len(meta_cls_list), 'meta_dir_list and meta_cls_list must have the same length!'

    # datasets
    train_datasets = []
    valid_datasets = []
    for meta_cls, meta_dir in zip(meta_cls_list, meta_dir_list):
        train_file, valid_file = meta_cls.frame_file_names[1:]

        # load meta file
        train_meta = meta_cls(os.path.join(meta_dir, train_file), sr=sample_rate)
        valid_meta = meta_cls(os.path.join(meta_dir, valid_file), sr=sample_rate)

        # create dataset
        train_dataset = AugmentSpeechDataset(train_meta, fix_len=fix_len,
                                             skip_audio=skip_audio, audio_mask=audio_mask)
        valid_dataset = AugmentSpeechDataset(valid_meta, fix_len=fix_len,
                                             skip_audio=skip_audio, audio_mask=audio_mask)
        train_datasets.append(train_dataset)
        valid_datasets.append(valid_dataset)

    # make concat dataset
    train_conc_dataset = ConcatDataset(train_datasets)
    valid_conc_dataset = ConcatDataset(valid_datasets)

    # create data loader
    train_loader = SpeechDataLoader(train_conc_dataset, batch_size=batch_size, is_bucket=False,
                                    num_workers=num_workers, skip_last_bucket=False)
    valid_loader = SpeechDataLoader(valid_conc_dataset, batch_size=batch_size, is_bucket=False,
                                    num_workers=num_workers, skip_last_bucket=False)

    return train_loader, valid_loader
Example #26
Source File: language_modeling.py From training_results_v0.5 with Apache License 2.0 | 5 votes

def load_dataset(self, split, combine=False):
    """Load a dataset split."""
    loaded_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        path = os.path.join(self.args.data, split_k)

        if self.args.raw_text and IndexedRawTextDataset.exists(path):
            ds = IndexedRawTextDataset(path, self.dictionary)
            tokens = [t for l in ds.tokens_list for t in l]
        elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
            ds = IndexedInMemoryDataset(path, fix_lua_indexing=True)
            tokens = ds.buffer
        else:
            if k > 0:
                break
            else:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data))

        loaded_datasets.append(
            TokenBlockDataset(
                tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode,
                include_targets=True
            ))

        print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1])))

        if not combine:
            break

    if len(loaded_datasets) == 1:
        dataset = loaded_datasets[0]
        sizes = dataset.sizes
    else:
        dataset = ConcatDataset(loaded_datasets)
        sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

    self.datasets[split] = MonolingualDataset(dataset, sizes, self.dictionary, shuffle=False)
Example #27
Source File: slides.py From torchsupport with MIT License | 5 votes

def MultiSlideData(self, paths, size=(224, 224), level=0, transform=lambda x: x):
    datasets = []
    for path in paths:
        datasets.append(SingleSlideData(path, size=size, level=level, transform=transform))
    return ConcatDataset(datasets)
Example #28
Source File: dataloader.py From OpenLongTailRecognition-OLTR with BSD 3-Clause "New" or "Revised" License | 5 votes

def load_data(data_root, dataset, phase, batch_size, sampler_dic=None, num_workers=4,
              test_open=False, shuffle=True):
    txt = './data/%s/%s_%s.txt' % (dataset, dataset, (phase if phase != 'train_plain' else 'train'))

    print('Loading data from %s' % (txt))

    if phase not in ['train', 'val']:
        transform = data_transforms['test']
    else:
        transform = data_transforms[phase]

    print('Use data transformation:', transform)

    set_ = LT_Dataset(data_root, txt, transform)

    if phase == 'test' and test_open:
        open_txt = './data/%s/%s_open.txt' % (dataset, dataset)
        print('Testing with opensets from %s' % (open_txt))
        open_set_ = LT_Dataset('./data/%s/%s_open' % (dataset, dataset), open_txt, transform)
        set_ = ConcatDataset([set_, open_set_])

    if sampler_dic and phase == 'train':
        print('Using sampler.')
        print('Sample %s samples per-class.' % sampler_dic['num_samples_cls'])
        return DataLoader(dataset=set_, batch_size=batch_size, shuffle=False,
                          sampler=sampler_dic['sampler'](set_, sampler_dic['num_samples_cls']),
                          num_workers=num_workers)
    else:
        print('No sampler.')
        print('Shuffle is %s.' % (shuffle))
        return DataLoader(dataset=set_, batch_size=batch_size,
                          shuffle=shuffle, num_workers=num_workers)
Example #29
Source File: video_datasets.py From RCRNet-Pytorch with MIT License | 4 votes

def get_datasets(name_list, split_list, config_path, root, training, transforms,
                 read_clip=False, random_reverse_clip=False, label_interval=1,
                 frame_between_label_num=0, clip_len=4):
    """
    Return a single data.Dataset, or a data.ConcatDataset when several datasets are requested.
    """
    if not isinstance(name_list, list):
        name_list = [name_list]
    if not isinstance(split_list, list):
        split_list = [split_list]
    if len(name_list) != len(split_list):
        raise ValueError("Dataset numbers must match split numbers")

    # read dataset config
    datasets_config = yaml.load(open(config_path))

    # get datasets
    dataset_list = []
    for name, split in zip(name_list, split_list):
        if name not in datasets_config.keys():
            raise ValueError("Error dataset name {}".format(name))
        dataset_config = datasets_config[name]
        dataset_config['name'] = name
        dataset_config['root'] = root
        dataset_config['split'] = split
        dataset_config['training'] = training
        dataset_config['transforms'] = transforms
        if "video_split" in dataset_config:
            dataset_config['label_interval'] = label_interval
            dataset_config['frame_between_label_num'] = frame_between_label_num
            if read_clip:
                dataset = VideoClipDataset(clip_len=clip_len,
                                           random_reverse_clip=random_reverse_clip,
                                           **dataset_config)
            else:
                dataset = VideoImageDataset(**dataset_config)
        else:
            dataset = ImageDataset(**dataset_config)
        dataset_list.append(dataset)

    if len(dataset_list) == 1:
        return dataset_list[0]
    else:
        return data.ConcatDataset(dataset_list)
Example #30
Source File: train_ehpi_itsc_2019_ofp.py From ehpi_action_recognition with MIT License | 4 votes

def get_set_wo_sim(dataset_path: str, image_size: ImageSize):
    num_joints = 15
    left_indexes: List[int] = [3, 4, 5, 9, 10, 11]
    right_indexes: List[int] = [6, 7, 8, 12, 13, 14]
    datasets: List[EhpiDataset] = [
        EhpiDataset(os.path.join(dataset_path, "ofp_webcam"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints),
        EhpiDataset(os.path.join(dataset_path, "ofp_record_2019_03_11_30FPS"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints),
        EhpiDataset(os.path.join(dataset_path, "ofp_record_2019_03_11_HSRT_30FPS"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints, dataset_part=DatasetPart.TEST),
        EhpiDataset(os.path.join(dataset_path, "ofp_record_2019_03_11_HELLA_30FPS"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints, dataset_part=DatasetPart.TRAIN),
        # Freilichtmuseum
        EhpiDataset(os.path.join(dataset_path, "2019_03_13_Freilichtmuseum_30FPS"),
                    transform=transforms.Compose([
                        RemoveJointsOutsideImgEhpi(image_size),
                        ScaleEhpi(image_size),
                        TranslateEhpi(image_size),
                        FlipEhpi(left_indexes=left_indexes, right_indexes=right_indexes),
                        NormalizeEhpi(image_size)
                    ]), num_joints=num_joints, dataset_part=DatasetPart.TRAIN),
    ]
    for dataset in datasets:
        dataset.print_label_statistics()
    return ConcatDataset(datasets)