Python torch.utils.data.random_split() Examples

The following are 11 code examples of torch.utils.data.random_split(). You can go to the original project or source file by following the links above each example, or browse the other available functions and classes of the torch.utils.data module.
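Before the examples, here is a minimal sketch of the basic API, using a toy TensorDataset that is purely illustrative and not taken from any of the projects below. random_split takes a dataset and a list of lengths that must sum to len(dataset), and returns one torch.utils.data.Subset per length.

import torch
from torch.utils.data import TensorDataset, random_split

# A toy dataset of 100 (feature, label) pairs, purely for illustration.
features = torch.randn(100, 8)
labels = torch.randint(0, 2, (100,))
dataset = TensorDataset(features, labels)

# The lengths must sum to len(dataset); each returned object is a Subset
# that indexes into the original dataset.
train_set, val_set = random_split(dataset, [80, 20])
print(len(train_set), len(val_set))  # 80 20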
Example #1
Source File: data_utils.py    From cloudml-samples with Apache License 2.0
def load_data(test_split, batch_size):
    """Loads the data"""
    sonar_dataset = SonarDataset('./sonar.all-data')
    # Compute the sizes for the train/test split
    dataset_size = len(sonar_dataset)
    test_size = int(test_split * dataset_size)
    train_size = dataset_size - test_size

    train_dataset, test_dataset = random_split(sonar_dataset,
                                               [train_size, test_size])

    train_loader = DataLoader(
        train_dataset,  # pass the Subset itself; `train_dataset.dataset` is the full dataset
        batch_size=batch_size,
        shuffle=True)
    test_loader = DataLoader(
        test_dataset,  # likewise, keep the Subset so the split takes effect
        batch_size=batch_size,
        shuffle=False)  # no need to shuffle the evaluation set

    return train_loader, test_loader 
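Note that random_split returns torch.utils.data.Subset objects, and a Subset's .dataset attribute points back at the full underlying dataset; the loaders must therefore wrap the subsets themselves, or both loaders would silently iterate over every row.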
Example #2
Source File: utils.py    From Text-Classification-Models-Pytorch with MIT License
def get_iterators(config, train_file, test_file, val_file=None):
    train_set = MyDataset(train_file, config)
    test_set = MyDataset(test_file, config)
    
    # If validation file exists, load it. Otherwise get validation data from training data
    if val_file:
        val_set = MyDataset(val_file, config)
    else:
        train_size = int(0.9 * len(train_set))
        val_size = len(train_set) - train_size
        train_set, val_set = data.random_split(train_set, [train_size, val_size])
    
    train_iterator = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    test_iterator = DataLoader(test_set, batch_size=config.batch_size)
    val_iterator = DataLoader(val_set, batch_size=config.batch_size)
    return train_iterator, test_iterator, val_iterator 
Example #3
Source File: dataset.py    From jdit with Apache License 2.0
def _get_samples(dataset, sample_dataset_size=1):
        import math
        if int(len(dataset) * sample_dataset_size) <= 0:
            raise ValueError(
                "Dataset of length %d is too small to sample from. `sample_dataset_size` is %s"
                % (len(dataset), sample_dataset_size))
        size_is_prop = isinstance(sample_dataset_size, float)
        size_is_amount = isinstance(sample_dataset_size, int)
        if size_is_prop:
            if not (0 < sample_dataset_size <= 1):
                raise ValueError("`sample_dataset_size` proportion should be between 0 and 1.")
            subdata_size = math.floor(sample_dataset_size * len(dataset))
        elif size_is_amount:
            if not (sample_dataset_size < len(dataset)):
                raise ValueError("`sample_dataset_size` amount should be smaller than the length of the dataset.")
            subdata_size = sample_dataset_size
        else:
            raise TypeError("`sample_dataset_size` should be float or int. "
                            "%s was given" % str(sample_dataset_size))
        sample_dataset, _ = random_split(dataset, [subdata_size, len(dataset) - subdata_size])
        sample_loader = DataLoader(sample_dataset, batch_size=subdata_size, shuffle=True)
        # batch_size equals the subset size, so the loader yields exactly one batch
        [samples_data] = list(sample_loader)
        return samples_data 
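For instance (illustrative calls, assuming this method's enclosing class), _get_samples(dataset, 0.1) returns a single batch holding a random 10% of the dataset, stacked by the DataLoader's default collation, while _get_samples(dataset, 64) draws exactly 64 random items.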
Example #4
Source File: train.py    From sigver with BSD 3-Clause "New" or "Revised" License
def setup_data_loaders(data, batch_size, input_size):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(data[1])
    data = TensorDataset(torch.from_numpy(data[0]), torch.from_numpy(y), torch.from_numpy(data[2]))
    train_size = int(0.9 * len(data))
    sizes = (train_size, len(data) - train_size)
    train_set, val_set = random_split(data, sizes)
    train_transforms = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(input_size),
        transforms.ToTensor(),
    ])
    train_set = TransformDataset(train_set, train_transforms)
    val_transforms = transforms.Compose([
        transforms.ToPILImage(),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
    ])
    val_set = TransformDataset(val_set, val_transforms)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size)
    return train_loader, val_loader 
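Note the ordering here: the random split happens first, and each resulting Subset is then wrapped in its own TransformDataset, so RandomCrop augmentation is applied only to training samples while the held-out set gets a deterministic CenterCrop. The same pattern recurs in Examples #7 and #8.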
Example #5
Source File: data_utils.py    From cloudml-samples with Apache License 2.0
def load_data(test_split, batch_size):
    """Loads the data"""
    sonar_dataset = SonarDataset('./sonar.all-data')
    # Compute the sizes for the train/test split
    dataset_size = len(sonar_dataset)
    test_size = int(test_split * dataset_size)
    train_size = dataset_size - test_size

    train_dataset, test_dataset = random_split(sonar_dataset,
                                               [train_size, test_size])

    train_loader = DataLoader(
        train_dataset,  # pass the Subset itself; `train_dataset.dataset` is the full dataset
        batch_size=batch_size,
        shuffle=True)
    test_loader = DataLoader(
        test_dataset,  # likewise, keep the Subset so the split takes effect
        batch_size=batch_size,
        shuffle=False)  # no need to shuffle the evaluation set

    return train_loader, test_loader 
Example #6
Source File: data_utils.py    From cloudml-samples with Apache License 2.0
def load_data(test_split, seed, batch_size):
    """Loads the data"""
    sonar_dataset = SonarDataset('./sonar.all-data')
    # Compute the sizes for the train/test split
    dataset_size = len(sonar_dataset)
    test_size = int(test_split * dataset_size)
    train_size = dataset_size - test_size

    # Seed the RNG with the `seed` argument so the split is reproducible
    # (assumes `torch` is imported at module level)
    torch.manual_seed(seed)
    train_dataset, test_dataset = random_split(sonar_dataset,
                                               [train_size, test_size])

    train_loader = DataLoader(
        train_dataset,  # pass the Subset itself; `train_dataset.dataset` is the full dataset
        batch_size=batch_size,
        shuffle=True)
    test_loader = DataLoader(
        test_dataset,  # likewise, keep the Subset so the split takes effect
        batch_size=batch_size,
        shuffle=False)  # no need to shuffle the evaluation set

    return train_loader, test_loader 
Example #7
Source File: train_madry.py    From sigver with BSD 3-Clause "New" or "Revised" License
def setup_data_loaders(data, batch_size, input_size):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(data[1])
    data = TensorDataset(torch.from_numpy(data[0]), torch.from_numpy(y), torch.from_numpy(data[2]))
    train_size = int(0.9 * len(data))
    sizes = (train_size, len(data) - train_size)
    train_set, val_set = random_split(data, sizes)
    train_transforms = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(input_size),
        transforms.ToTensor(),
    ])
    train_set = TransformDataset(train_set, train_transforms)
    val_transforms = transforms.Compose([
        transforms.ToPILImage(),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
    ])
    val_set = TransformDataset(val_set, val_transforms)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size)
    return train_loader, val_loader 
Example #8
Source File: train_ensadv.py    From sigver with BSD 3-Clause "New" or "Revised" License
def setup_data_loaders(data, batch_size, input_size):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(data[1])
    data = TensorDataset(torch.from_numpy(data[0]), torch.from_numpy(y), torch.from_numpy(data[2]))
    train_size = int(0.9 * len(data))
    sizes = (train_size, len(data) - train_size)
    train_set, val_set = random_split(data, sizes)
    train_transforms = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(input_size),
        transforms.ToTensor(),
    ])
    train_set = TransformDataset(train_set, train_transforms)
    val_transforms = transforms.Compose([
        transforms.ToPILImage(),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
    ])
    val_set = TransformDataset(val_set, val_transforms)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size)
    return train_loader, val_loader 
Example #9
Source File: loader.py    From fine-grained-sentiment with MIT License
def create_dataloader(self,
                          df: pd.DataFrame,
                          batch_size: int = 32,
                          shuffle: bool = False,
                          valid_pct: float = None):
        "Process rows in pd.DataFrame using n_cpus and return a DataLoader"

        tqdm.pandas()
        with ProcessPoolExecutor(max_workers=n_cpu) as executor:
            result = list(
                tqdm(executor.map(self.process_row, df.iterrows(), chunksize=8192),
                     desc=f"Processing {len(df)} examples on {n_cpu} cores",
                     total=len(df)))

        features = [r[0] for r in result]
        labels = [r[1] for r in result]

        dataset = TensorDataset(torch.tensor(features, dtype=torch.long),
                                torch.tensor(labels, dtype=torch.long))

        if valid_pct is not None:
            valid_size = int(valid_pct * len(df))
            train_size = len(df) - valid_size
            valid_dataset, train_dataset = random_split(dataset, [valid_size, train_size])
            valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            return train_loader, valid_loader

        data_loader = DataLoader(dataset,
                                 batch_size=batch_size,
                                 num_workers=0,
                                 shuffle=shuffle,
                                 pin_memory=torch.cuda.is_available())
        return data_loader 
Example #10
Source File: mnist_pytorch_lightning.py    From ray with Apache License 2.0
def prepare_data(self):
        mnist_train = self.download_data(self.data_dir)

        self.mnist_train, self.mnist_val = random_split(
            mnist_train, [55000, 5000]) 
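The hard-coded lengths above rely on MNIST having exactly 60,000 training images. When the dataset size is not known in advance, the lengths can be computed from a proportion, and on PyTorch 1.6+ a generator argument makes the split reproducible without seeding the global RNG (newer releases can also accept fractional lengths directly). The helper below is an illustrative sketch, not part of the ray example:

import torch
from torch.utils.data import random_split

def split_train_val(dataset, val_frac=0.1, seed=42):
    # Compute integer lengths that always sum to len(dataset).
    val_len = int(val_frac * len(dataset))
    train_len = len(dataset) - val_len
    # The generator argument (PyTorch 1.6+) pins the permutation, so the
    # same split is produced on every run.
    return random_split(dataset, [train_len, val_len],
                        generator=torch.Generator().manual_seed(seed))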
Example #11
Source File: train.py    From asteroid with MIT License
def main(conf):
    total_set = DNSDataset(conf['data']['json_dir'])
    train_len = int(len(total_set) * (1 - conf['data']['val_prop']))
    val_len = len(total_set) - train_len
    train_set, val_set = random_split(total_set, [train_len, val_len])

    train_loader = DataLoader(train_set, shuffle=True,
                              batch_size=conf['training']['batch_size'],
                              num_workers=conf['training']['num_workers'],
                              drop_last=True)
    val_loader = DataLoader(val_set, shuffle=False,
                            batch_size=conf['training']['batch_size'],
                            num_workers=conf['training']['num_workers'],
                            drop_last=True)

    # Define the model and optimizer in a local function (defined in the
    # recipe); this makes re-instantiating them for retraining or for
    # evaluation straightforward.
    model, optimizer = make_model_and_optimizer(conf)
    # Define scheduler
    scheduler = None
    if conf['training']['half_lr']:
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5,
                                      patience=5)
    # Save the args just after instantiating, for easy loading in the future.
    exp_dir = conf['main_args']['exp_dir']
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, 'conf.yml')
    with open(conf_path, 'w') as outfile:
        yaml.safe_dump(conf, outfile)

    # Define Loss function.
    loss_func = partial(distance, is_complex=conf['main_args']['is_complex'])
    system = SimpleSystem(model=model, loss_func=loss_func, optimizer=optimizer,
                          train_loader=train_loader, val_loader=val_loader,
                          scheduler=scheduler, config=conf)

    # Define callbacks
    checkpoint_dir = os.path.join(exp_dir, 'checkpoints/')
    checkpoint = ModelCheckpoint(checkpoint_dir, monitor='val_loss',
                                 mode='min', save_top_k=5, verbose=1)
    early_stopping = False
    if conf['training']['early_stop']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                                       verbose=1)

    # Don't request GPUs if none are available.
    gpus = -1 if torch.cuda.is_available() else None
    trainer = pl.Trainer(max_nb_epochs=conf['training']['epochs'],
                         checkpoint_callback=checkpoint,
                         early_stop_callback=early_stopping,
                         default_save_path=exp_dir,
                         gpus=gpus,
                         distributed_backend='dp',
                         train_percent_check=1.0,  # useful for fast experiments
                         gradient_clip_val=5.,)
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)