Python torch.utils.data.TensorDataset() Examples
The following are 30 code examples of torch.utils.data.TensorDataset(), drawn from open-source projects; each example lists its source file, project, and license.
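Before the project-specific examples, a minimal sketch of what TensorDataset itself does may help: it wraps one or more tensors that share the same first dimension, indexing it returns a tuple with one row from each tensor, and it is typically passed straight to a DataLoader. The shapes and batch size below are illustrative only and are not taken from any of the projects that follow.

import torch
from torch.utils.data import TensorDataset, DataLoader

# Two tensors with the same first dimension: 100 feature rows and 100 labels.
features = torch.randn(100, 8)
labels = torch.randint(0, 2, (100,))

dataset = TensorDataset(features, labels)
x0, y0 = dataset[0]          # indexing returns (features[0], labels[0])

loader = DataLoader(dataset, batch_size=16, shuffle=True)
for batch_x, batch_y in loader:
    # batch_x has shape (16, 8) and batch_y has shape (16,), except possibly the last batch
    pass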
Example #1
Source File: base_trainer.py From BiaffineDependencyParsing with MIT License

def _unpack_batch(self, batch: TensorDataset) -> Dict:
    """
    Split the batch to obtain the encoder inputs, word mask, sentence lengths,
    dep ids, and any other input information.
    e.g.:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_pos, all_end_pos, all_dep_ids, all_pos_ids)
    Args:
        batch: a single input batch of type TensorDataset (or torchtext.dataset),
            whose fields can be accessed individually by index
    Returns:
        A dict: [1] is the inputs (a dict); [2] is the word mask; [3] is the
        sentence lengths (a Python list); [4] is the dep ids; depending on the
        setup, other inputs may also be included.
    """
    raise NotImplementedError('must implement in sub class')
Example #2
Source File: bertology_loader.py From BiaffineDependencyParsing with MIT License

def feature_to_dataset(features):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_start_pos = torch.tensor([t.start_pos for t in features], dtype=torch.long)
    # print([t.end_pos for t in features])
    all_end_pos = torch.tensor([t.end_pos for t in features], dtype=torch.long)
    all_dep_ids = torch.tensor([t.dep_ids for t in features], dtype=torch.long)
    tensors = [all_input_ids, all_input_mask, all_segment_ids,
               all_start_pos, all_end_pos, all_dep_ids]
    if hasattr(features[0], 'pos_ids'):
        all_pos_ids = torch.tensor([t.pos_ids for t in features], dtype=torch.long)
        tensors.append(all_pos_ids)
    dataset = TensorDataset(*tensors)
    # Input tensors:
    #   all_input_ids,
    #   all_input_mask,
    #   all_segment_ids,
    #   all_start_pos,
    #   all_end_pos,
    #   all_dep_ids,
    #   all_pos_ids (if present)
    return dataset
Example #3
Source File: serverstate.py From gandissect with MIT License

def feature_maps(self, z_batch, intervention=None, layers=None, quantiles=True):
    feature_map = defaultdict(list)
    with torch.no_grad(), self.modellock:
        batch_size = 10
        self.apply_intervention(intervention)
        test_loader = DataLoader(
            TensorDataset(z_batch[:, :, None, None]),
            batch_size=batch_size,
            pin_memory=('cuda' == self.device.type
                        and z_batch.device.type == 'cpu'))
        processed = 0
        for batch_num, [batch_z] in enumerate(test_loader):
            batch_z = batch_z.to(self.device)
            # Run model but disregard output
            self.model(batch_z)
            processing = batch_z.shape[0]
            for layer, feature in self.model.retained_features().items():
                for single_featuremap in feature:
                    if quantiles:
                        feature_map[layer].append(
                            self.quantiles[layer].normalize(single_featuremap))
                    else:
                        feature_map[layer].append(single_featuremap)
    return feature_map
Example #4
Source File: serverstate.py From gandissect with MIT License

def generate_images(self, z_batch, intervention=None):
    '''
    Makes some images.
    '''
    with torch.no_grad(), self.modellock:
        batch_size = 10
        self.apply_intervention(intervention)
        test_loader = DataLoader(TensorDataset(z_batch[:, :, None, None]),
                                 batch_size=batch_size,
                                 pin_memory=('cuda' == self.device.type
                                             and z_batch.device.type == 'cpu'))
        result_img = torch.zeros(
            *((len(z_batch), 3) + self.model.output_shape[2:]),
            dtype=torch.uint8, device=self.device)
        for batch_num, [batch_z, ] in enumerate(test_loader):
            batch_z = batch_z.to(self.device)
            out = self.model(batch_z)
            result_img[batch_num * batch_size:
                       batch_num * batch_size + len(batch_z)] = (
                (((out + 1) / 2) * 255).clamp(0, 255).byte())
    return result_img
Example #5
Source File: msda_preprocessed_amazon_dataset.py From man with MIT License

def get_msda_amazon_datasets(data_file, domain, kfold, feature_num):
    print(f'Loading mSDA Preprocessed Multi-Domain Amazon data for {domain} Domain')
    dataset = pickle.load(open(data_file, 'rb'))[domain]

    lx, ly = dataset['labeled']
    if feature_num > 0:
        lx = lx[:, : feature_num]
    lx = torch.from_numpy(lx.toarray()).float().to(opt.device)
    ly = torch.from_numpy(ly).long().to(opt.device)
    print(f'{domain} Domain has {len(ly)} labeled instances.')
    # if opt.use_cuda:
    #     lx, ly = lx.cuda(), ly.cuda()
    labeled_set = FoldedDataset(TensorDataset, kfold, lx, ly)

    ux, uy = dataset['unlabeled']
    if feature_num > 0:
        ux = ux[:, : feature_num]
    ux = torch.from_numpy(ux.toarray()).float().to(opt.device)
    uy = torch.from_numpy(uy).long().to(opt.device)
    print(f'{domain} Domain has {len(uy)} unlabeled instances.')
    # if opt.use_cuda:
    #     ux, uy = ux.cuda(), uy.cuda()
    unlabeled_set = TensorDataset(ux, uy)

    return labeled_set, unlabeled_set
Example #6
Source File: folded_dataset.py From man with MIT License

def get_folds(self, folds):
    indices = np.hstack([self.folds[f] for f in folds]).reshape(-1)
    if self.__class__.__bases__[0].__name__ == 'TensorDataset':
        indices = torch.from_numpy(indices).to(opt.device)
        # if opt.use_cuda:
        #     indices = indices.cuda()
        X = torch.index_select(self.tensors[0], 0, indices)
        Y = torch.index_select(self.tensors[1], 0, indices)
        return TensorDataset(X, Y)
    else:
        X = [self.X[i] for i in indices]
        indices = torch.from_numpy(indices).to(opt.device)
        # if opt.use_cuda:
        #     indices = indices.cuda()
        Y = torch.index_select(self.Y, 0, indices)
        return AmazonDataset(X, Y, self.max_seq_len)
Example #7
Source File: rnn_utils.py From guacamol_baselines with MIT License

def get_tensor_dataset(numpy_array):
    """
    Takes a numpy array of indices, converts it into a Torch tensor,
    splits it into inputs and targets, and wraps them in a TensorDataset.

    Args:
        numpy_array: to be converted

    Returns: a TensorDataset
    """
    tensor = torch.from_numpy(numpy_array).long()

    inp = tensor[:, :-1]
    target = tensor[:, 1:]

    return TensorDataset(inp, target)
Example #8
Source File: trial.py From torchbearer with MIT License

def with_test_data(self, x, batch_size=1, num_workers=1, steps=None):
    """Use this trial with the given test data. Returns self so that methods can be chained for convenience.

    Example: ::

        # Simple trial that runs for 10 test iterations on some random data
        >>> from torchbearer import Trial
        >>> data = torch.rand(10, 1)
        >>> trial = Trial(None).with_test_data(data).for_test_steps(10).run(1)

    Args:
        x (torch.Tensor): The test x data to use during calls to :meth:`.predict`
        batch_size (int): The size of each batch to sample from the data
        num_workers (int): Number of worker threads to use in the data loader
        steps (int): The number of steps per epoch to take when using this data

    Returns:
        Trial: self
    """
    dataset = TensorDataset(x)
    dataloader = DataLoader(dataset, batch_size, num_workers=num_workers)
    self.with_test_generator(dataloader, steps=steps)
    return self
Example #9
Source File: logistic.py From holoclean with Apache License 2.0

def train(self, num_epochs=3, batch_size=32):
    """
    Trains the LR model.

    :param num_epochs: (int) number of epochs.
    """
    batch_losses = []
    # We train only on cells that do not have their initial value as NULL.
    X_train, Y_train = self._X.index_select(0, self._train_idx), self._Y.index_select(0, self._train_idx)
    torch_ds = TensorDataset(X_train, Y_train)

    # Main training loop.
    for epoch_idx in range(1, num_epochs + 1):
        logging.debug("Logistic: epoch %d", epoch_idx)
        batch_cnt = 0
        for batch_X, batch_Y in tqdm(DataLoader(torch_ds, batch_size=batch_size)):
            batch_pred = self.forward(batch_X)
            batch_loss = self._loss(batch_pred, batch_Y.reshape(-1, 1))
            batch_losses.append(float(batch_loss))
            self.zero_grad()
            batch_loss.backward()
            self._optimizer.step()
            batch_cnt += 1
        logging.debug('Logistic: average batch loss: %f',
                      sum(batch_losses[-1 * batch_cnt:]) / batch_cnt)
    return batch_losses
Example #10
Source File: test_end_to_end.py From torchbearer with MIT License

def test_callbacks(self):
    from torch.utils.data import TensorDataset
    traingen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))
    valgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))
    testgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))

    model = torch.nn.Linear(3, 1)
    optim = torch.optim.SGD(model.parameters(), lr=0.01)

    cbs = []
    cbs.extend([c.EarlyStopping(), c.GradientClipping(10, model.parameters()),
                c.Best('test.pt'), c.MostRecent('test.pt'), c.ReduceLROnPlateau(),
                c.CosineAnnealingLR(0.1, 0.01), c.ExponentialLR(1),
                c.Interval('test.pt'), c.CSVLogger('test_csv.pt'),
                c.L1WeightDecay(), c.L2WeightDecay(),
                c.TerminateOnNaN(monitor='fail_metric')])

    trial = torchbearer.Trial(model, optim, torch.nn.MSELoss(), metrics=['loss'], callbacks=cbs)
    trial = trial.with_generators(traingen, valgen, testgen)
    trial.run(2)
    trial.predict()
    trial.evaluate(data_key=torchbearer.TEST_DATA)
    trial.evaluate()

    import os
    os.remove('test.pt')
    os.remove('test_csv.pt')
Example #11
Source File: run.py From s2cnn with MIT License

def load_data(path, batch_size):

    with gzip.open(path, 'rb') as f:
        dataset = pickle.load(f)

    train_data = torch.from_numpy(
        dataset["train"]["images"][:, None, :, :].astype(np.float32))
    train_labels = torch.from_numpy(
        dataset["train"]["labels"].astype(np.int64))

    # TODO normalize dataset
    # mean = train_data.mean()
    # stdv = train_data.std()

    train_dataset = data_utils.TensorDataset(train_data, train_labels)
    train_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_data = torch.from_numpy(
        dataset["test"]["images"][:, None, :, :].astype(np.float32))
    test_labels = torch.from_numpy(
        dataset["test"]["labels"].astype(np.int64))

    test_dataset = data_utils.TensorDataset(test_data, test_labels)
    test_loader = data_utils.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_loader, test_loader, train_dataset, test_dataset
Example #12
Source File: aceoptimize.py From gandissect with MIT License

def compute_feature_quantiles(args, corpus, cache_filename, model, full_sample):
    # Phase 1.6. Figure the 99% and 99.9%ile of every feature.
    if all(k in corpus for k in ['feature_99', 'feature_999']):
        return
    progress = default_progress()
    with torch.no_grad():
        rq = RunningQuantile(resolution=10000)  # 10x what's needed.
        for [zbatch] in progress(
                torch.utils.data.DataLoader(
                    TensorDataset(full_sample),
                    batch_size=args.inference_batch_size,
                    num_workers=10, pin_memory=True),
                desc="Calculating 0.999 quantile"):
            zbatch = zbatch.cuda()
            tensor_image = model(zbatch)
            feat = model.retained_layer(args.layer)
            rq.add(feat.permute(0, 2, 3, 1).contiguous().view(-1, feat.shape[1]))
        result = rq.quantiles([0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999])
        corpus.feature_001 = result[:, 0].cpu()
        corpus.feature_01 = result[:, 1].cpu()
        corpus.feature_10 = result[:, 2].cpu()
        corpus.feature_50 = result[:, 3].cpu()
        corpus.feature_90 = result[:, 4].cpu()
        corpus.feature_99 = result[:, 5].cpu()
        corpus.feature_999 = result[:, 6].cpu()
    numpy.savez(cache_filename, **corpus)
Example #13
Source File: data_utils_torch.py From CaptchaRecognition with MIT License

def load_dataset(batch_size, dir='data', n_workers=0, test_size=16384, total_size=None):
    print("Loading data...")
    data = np.load(os.path.join(dir, 'captcha.npz'))
    image = data['img'].astype(np.float32) / 127.5 - 1
    text = data['text']
    print("Loading dictionary...")
    vocab = pickle.load(open(os.path.join(dir, 'captcha.vocab_dict'), 'rb'), encoding='utf8')
    print("Convert to tensor...")
    if total_size is None:
        image = torch.Tensor(image).permute(0, 3, 1, 2)
        text = torch.LongTensor(text)
    else:
        image = torch.Tensor(image[:total_size]).permute(0, 3, 1, 2)
        text = torch.LongTensor(text[:total_size])
    image_train = image[:-test_size]
    image_test = image[-test_size:]
    text_train = text[:-test_size]
    text_test = text[-test_size:]
    print("Build dataset...")
    dataset_train = TensorDataset(image_train, text_train)
    dataset_test = TensorDataset(image_test, text_test)
    if torch.cuda.is_available():
        pm = True
    else:
        pm = False
    print("Build dataloader...")
    dataloader_train = DataLoader(dataset_train, batch_size, True, num_workers=n_workers)
    dataloader_test = DataLoader(dataset_test, batch_size, shuffle=False)
    print("data ready!")
    return dataloader_train, dataloader_test, vocab
Example #14
Source File: load_data.py From cnn-surrogate with MIT License

def load_data(data_dir, batch_size):
    """Return data loader

    Args:
        data_dir: directory to hdf5 file, e.g. `dir/to/kle4225_lhs256.hdf5`
        batch_size (int): mini-batch size for loading data

    Returns:
        (data_loader (torch.utils.data.DataLoader), stats)
    """
    with h5py.File(data_dir, 'r') as f:
        x_data = f['input'][()]
        y_data = f['output'][()]
        print("input data shape: {}".format(x_data.shape))
        print("output data shape: {}".format(y_data.shape))
    kwargs = {'num_workers': 4, 'pin_memory': True} if torch.cuda.is_available() else {}

    dataset = TensorDataset(torch.tensor(x_data), torch.tensor(y_data))
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)

    # simple statistics of output data
    y_data_mean = np.mean(y_data, 0)
    y_data_var = np.sum((y_data - y_data_mean) ** 2)
    stats = {}
    stats['y_mean'] = y_data_mean
    stats['y_var'] = y_data_var

    return data_loader, stats
Example #15
Source File: layers_classify_pytorch.py From mead-baseline with Apache License 2.0

def __init__(self, x, x_lengths, y):
    self.tensor_dataset = TensorDataset(x, x_lengths, y)
Example #16
Source File: eager_lm_pytorch.py From mead-baseline with Apache License 2.0

def __init__(self, x, y):
    self.tensor_dataset = TensorDataset(x, y)
Example #17
Source File: runners.py From bert_on_stilts with Apache License 2.0

def convert_to_dataset(features):
    full_batch = features_to_data(features)
    dataset_ls = [full_batch.input_ids, full_batch.input_mask,
                  full_batch.segment_ids, full_batch.lm_label_ids]
    if full_batch.is_next is not None:
        dataset_ls.append(full_batch.is_next)
    dataset = TensorDataset(*dataset_ls)
    return dataset, full_batch.tokens
Example #18
Source File: burgerLoader.py From ar-pde-cnn with MIT License

def createTestingLoader(self, data_dir, cases, t_start=0, dt=0.001, batch_size=1):
    '''
    Loads in training data from Fenics simulator
    Args:
        data_dir (string): directory of data
        cases (np.array): array of training cases, must be integers
        n_init (int): number of initial conditions to use from each case
        batch_size (int): mini-batch size
    '''
    testing_data = []
    target_data = []
    # Indexes of data time-steps to use as initial conditions
    nidx = int(t_start / dt)
    for i, val in enumerate(cases):
        file_name = data_dir + "/u{:d}.npy".format(val)
        print("Reading file: {}".format(file_name))
        u = np.load(file_name)
        # Convert to tensor and unsqueeze channel dim
        uTensor = torch.Tensor(u[nidx, :-1]).unsqueeze(0).unsqueeze(1)
        testing_data.append(uTensor.repeat(1, 20, 1))
        # Remove last element due to periodic conditions between [0,1]
        target_data.append(torch.Tensor(u[::int(self.dt / dt), :-1]))

    data_tuple = (torch.cat(testing_data, dim=0), torch.stack(target_data, dim=0))
    testing_loader = DataLoader(TensorDataset(*data_tuple),
                                batch_size=batch_size, shuffle=False, drop_last=False)

    return testing_loader
Example #19
Source File: aceoptimize.py From gandissect with MIT License

def compute_mean_present_features(args, corpus, cache_filename, model):
    # Phase 1.5. Figure mean activations for every channel where there
    # is a doorway.
    if all(k in corpus for k in ['mean_present_feature']):
        return
    progress = default_progress()
    with torch.no_grad():
        total_present_feature = 0
        for [zbatch, featloc] in progress(
                torch.utils.data.DataLoader(
                    TensorDataset(
                        corpus.object_present_sample,
                        corpus.object_present_location),
                    batch_size=args.inference_batch_size,
                    num_workers=10, pin_memory=True),
                desc="Mean activations"):
            zbatch = zbatch.cuda()
            featloc = featloc.cuda()
            tensor_image = model(zbatch)
            feat = model.retained_layer(args.layer)
            flatfeat = feat.view(feat.shape[0], feat.shape[1], -1)
            sum_feature_at_obj = flatfeat[
                torch.arange(feat.shape[0]).to(feat.device), :, featloc
            ].sum(0)
            total_present_feature = total_present_feature + sum_feature_at_obj
        corpus.mean_present_feature = (
            total_present_feature / len(corpus.object_present_sample)).cpu()
    if cache_filename:
        numpy.savez(cache_filename, **corpus)
Example #20
Source File: zdataset.py From gandissect with MIT License

def z_dataset_for_model(model, size=100, seed=1):
    return TensorDataset(z_sample_for_model(model, size, seed))
Example #21
Source File: run_classic.py From s2cnn with MIT License

def load_data(path, batch_size):

    with gzip.open(path, 'rb') as f:
        dataset = pickle.load(f)

    train_data = torch.from_numpy(
        dataset["train"]["images"][:, None, :, :].astype(np.float32))
    train_labels = torch.from_numpy(
        dataset["train"]["labels"].astype(np.int64))

    # TODO normalize dataset
    # mean = train_data.mean()
    # stdv = train_data.std()

    train_dataset = data_utils.TensorDataset(train_data, train_labels)
    train_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_data = torch.from_numpy(
        dataset["test"]["images"][:, None, :, :].astype(np.float32))
    test_labels = torch.from_numpy(
        dataset["test"]["labels"].astype(np.int64))

    test_dataset = data_utils.TensorDataset(test_data, test_labels)
    test_loader = data_utils.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_loader, test_loader, train_dataset, test_dataset
Example #22
Source File: runners.py From bert_on_stilts with Apache License 2.0

def convert_to_dataset(features, label_mode):
    full_batch = features_to_data(features, label_mode=label_mode)
    if full_batch.label_ids is None:
        dataset = TensorDataset(full_batch.input_ids, full_batch.input_mask,
                                full_batch.segment_ids)
    else:
        dataset = TensorDataset(full_batch.input_ids, full_batch.input_mask,
                                full_batch.segment_ids, full_batch.label_ids)
    return dataset, full_batch.tokens
Example #23
Source File: main_dense.py From BLINK with MIT License

def _process_crossencoder_dataloader(context_input, label_input, crossencoder_params):
    tensor_data = TensorDataset(context_input, label_input)
    sampler = SequentialSampler(tensor_data)
    dataloader = DataLoader(
        tensor_data, sampler=sampler, batch_size=crossencoder_params["eval_batch_size"]
    )
    return dataloader
Example #24
Source File: absa_data_util.py From BERT-for-RRC-ABSA with Apache License 2.0

def build_dataset(features):
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    label = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(input_ids, attention_mask, token_type_ids, label)
Example #25
Source File: run_asc.py From BERT-for-RRC-ABSA with Apache License 2.0

def test(args):
    # Load a trained model that you have fine-tuned (we assume evaluate on cpu)
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    eval_examples = processor.get_test_examples(args.data_dir)
    eval_features = data_utils.convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, "asc")

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model = torch.load(os.path.join(args.output_dir, "model.pt"))
    model.cuda()
    model.eval()

    full_logits = []
    full_label_ids = []
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.cuda() for t in batch)
        input_ids, segment_ids, input_mask, label_ids = batch

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.cpu().numpy()

        full_logits.extend(logits.tolist())
        full_label_ids.extend(label_ids.tolist())

    output_eval_json = os.path.join(args.output_dir, "predictions.json")
    with open(output_eval_json, "w") as fw:
        json.dump({"logits": full_logits, "label_ids": full_label_ids}, fw)
Example #26
Source File: rnn_utils.py From guacamol_baselines with MIT License

def get_tensor_dataset_on_device(numpy_array, device):
    """
    Get tensor dataset and send it to a device

    Args:
        numpy_array: to be converted
        device: cuda | cpu

    Returns:
        a TensorDataset on the required device
    """
    dataset = get_tensor_dataset(numpy_array)
    dataset.tensors = tuple(t.to(device) for t in dataset.tensors)
    return dataset
Example #27
Source File: burgerLoader2D.py From ar-pde-cnn with MIT License

def createTestingLoader(self, data_dir, cases, tMax=1.0, simdt=0.001, save_every=2, batch_size=1):
    '''
    Loads in training data from Fenics simulator, assumes simulator has saved
    each time-step at specified delta t
    Args:
        data_dir (string): directory of data
        cases (np.array): array of training cases, must be integers
        tMax (float): maximum time value to load simulator data up to
        simdt (float): time-step size used in the simulation
        save_every (int): Interval to load the training data at (default is 2 to match FEM simulator)
        batch_size (int): mini-batch size
    Returns:
        test_loader (Pytorch DataLoader): Returns testing loader
    '''
    testing_data = []
    target_data = []

    # Loop through test cases
    for i, val in enumerate(cases):
        case_dir = os.path.join(data_dir, "run{:d}".format(val))
        print("Reading test case: {}".format(case_dir))

        seq = []
        for j in range(0, int(tMax / simdt) + 1, save_every):
            file_dir = os.path.join(case_dir, "u{:d}.npy".format(j))
            u0 = np.load(file_dir)
            # Remove the periodic nodes
            seq.append(u0[:, :, :])

        file_dir = os.path.join(case_dir, "u0.npy")
        uInit = np.load(file_dir)
        uTarget = np.stack(seq, axis=0)

        # Remove the periodic nodes and unsqueeze first dim
        testing_data.append(torch.Tensor(uInit[:, :, :]).unsqueeze(0))
        target_data.append(torch.Tensor(uTarget))

    # Create data loader
    data_tuple = (torch.cat(testing_data, dim=0), torch.stack(target_data, dim=0))
    testing_loader = DataLoader(TensorDataset(*data_tuple),
                                batch_size=batch_size, shuffle=False, drop_last=False)

    return testing_loader
Example #28
Source File: 53_machine_translation.py From deep-learning-note with MIT License

def read_data(max_seq_len):
    # "in" and "out" are short for input and output
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    with io.open('./data/translation/fr-en-small.txt') as f:
        lines = f.readlines()
    for line in lines:
        in_seq, out_seq = line.rstrip().split('\t')
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            # Skip this sample if it would be longer than max_seq_len after adding EOS
            continue
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, Data.TensorDataset(in_data, out_data)
Example #29
Source File: ksLoader.py From ar-pde-cnn with MIT License

def createTestingLoader(self, data_dir, cases, dt=0.1, tmax=1000, batch_size=64):
    '''
    Loads in testing data from matlab simulator; includes target values in dataloader
    Args:
        data_dir (string): directory of data
        cases (np.array): array of training cases, must be integers
        n_init (int): number of initial conditions to use from each case
        batch_size (int): mini-batch size
    '''
    testing_data = []
    target_data = []
    for i, val in enumerate(cases):
        file_name = data_dir + "/ks_data_{:d}.dat".format(val)
        print("Reading file: {}".format(file_name))
        u = np.loadtxt(file_name, delimiter=',')
        u = u[:, :-1]
        # Initial state
        uTensor = torch.Tensor(u[int(100 / dt), :]).unsqueeze(0).unsqueeze(0)
        testing_data.append(uTensor.repeat(1, 5, 1))
        # Full target field
        target_data.append(torch.Tensor(u[int(100 / dt):int(100 / dt) + tmax + 1, :]))

    data_tuple = (torch.cat(testing_data, dim=0), torch.stack(target_data, dim=0))
    testing_data = DataLoader(TensorDataset(*data_tuple),
                              batch_size=batch_size, shuffle=False)

    return testing_data
Example #30
Source File: train.py From squash-generation with MIT License

def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    datasets_raw = {}
    logger.info("Loading training data")
    datasets_raw['train'] = get_dataset(tokenizer, args.dataset_cache, args.dataset_path, 'train')
    logger.info("Loading validation data")
    datasets_raw['valid'] = get_dataset(tokenizer, args.dataset_cache, args.dataset_path, 'dev')

    logger.info("Build inputs and labels")
    datasets = {
        "train": defaultdict(list),
        "valid": defaultdict(list)
    }
    for dataset_name, dataset in datasets_raw.items():
        for data_point in dataset:
            instance, _ = build_input_from_segments(data_point, tokenizer)
            for input_name, input_array in instance.items():
                datasets[dataset_name][input_name].append(input_array)

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset = TensorDataset(*tensor_datasets["train"])
    valid_dataset = TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size,
                              shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))

    return train_loader, valid_loader, train_sampler, valid_sampler