Python torch.utils.data.TensorDataset() Examples
The following are 30 code examples of torch.utils.data.TensorDataset(), drawn from open-source projects; each example lists its source file, project, and license.
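Before the project-specific examples, a minimal sketch of what TensorDataset itself does may help: it wraps one or more tensors that share the same first dimension, indexing it returns a tuple with one row from each tensor, and it is typically passed straight to a DataLoader. The shapes and batch size below are illustrative only and are not taken from any of the projects that follow.

import torch
from torch.utils.data import TensorDataset, DataLoader

# Two tensors with the same first dimension: 100 feature rows and 100 labels.
features = torch.randn(100, 8)
labels = torch.randint(0, 2, (100,))

dataset = TensorDataset(features, labels)
x0, y0 = dataset[0]          # indexing returns (features[0], labels[0])

loader = DataLoader(dataset, batch_size=16, shuffle=True)
for batch_x, batch_y in loader:
    # batch_x has shape (16, 8) and batch_y has shape (16,), except possibly the last batch
    pass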
Example #1
Source File: base_trainer.py From BiaffineDependencyParsing with MIT License

def _unpack_batch(self, batch: TensorDataset) -> Dict:
    """
    Split the batch to obtain the encoder inputs, word mask, sentence lengths,
    dep ids, and any other input information.
    e.g.:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_pos, all_end_pos, all_dep_ids, all_pos_ids)
    Args:
        batch: a single input batch of type TensorDataset (or torchtext.dataset),
            whose fields can be accessed individually by index
    Returns:
        A dict: [1] is the inputs (a dict); [2] is the word mask; [3] is the
        sentence lengths (a Python list); [4] is the dep ids; depending on the
        setup, other inputs may also be included.
    """
    raise NotImplementedError('must implement in sub class')
Example #2
Source File: bertology_loader.py From BiaffineDependencyParsing with MIT License

def feature_to_dataset(features):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_start_pos = torch.tensor([t.start_pos for t in features], dtype=torch.long)
    # print([t.end_pos for t in features])
    all_end_pos = torch.tensor([t.end_pos for t in features], dtype=torch.long)
    all_dep_ids = torch.tensor([t.dep_ids for t in features], dtype=torch.long)
    tensors = [all_input_ids, all_input_mask, all_segment_ids,
               all_start_pos, all_end_pos, all_dep_ids]
    if hasattr(features[0], 'pos_ids'):
        all_pos_ids = torch.tensor([t.pos_ids for t in features], dtype=torch.long)
        tensors.append(all_pos_ids)
    dataset = TensorDataset(*tensors)
    # Input tensors:
    #   all_input_ids,
    #   all_input_mask,
    #   all_segment_ids,
    #   all_start_pos,
    #   all_end_pos,
    #   all_dep_ids,
    #   all_pos_ids (if present)
    return dataset
Example #3
Source File: serverstate.py From gandissect with MIT License

def feature_maps(self, z_batch, intervention=None, layers=None, quantiles=True):
    feature_map = defaultdict(list)
    with torch.no_grad(), self.modellock:
        batch_size = 10
        self.apply_intervention(intervention)
        test_loader = DataLoader(
            TensorDataset(z_batch[:, :, None, None]),
            batch_size=batch_size,
            pin_memory=('cuda' == self.device.type
                        and z_batch.device.type == 'cpu'))
        processed = 0
        for batch_num, [batch_z] in enumerate(test_loader):
            batch_z = batch_z.to(self.device)
            # Run model but disregard output
            self.model(batch_z)
            processing = batch_z.shape[0]
            for layer, feature in self.model.retained_features().items():
                for single_featuremap in feature:
                    if quantiles:
                        feature_map[layer].append(
                            self.quantiles[layer].normalize(single_featuremap))
                    else:
                        feature_map[layer].append(single_featuremap)
    return feature_map
Example #4
Source File: serverstate.py From gandissect with MIT License

def generate_images(self, z_batch, intervention=None):
    '''
    Makes some images.
    '''
    with torch.no_grad(), self.modellock:
        batch_size = 10
        self.apply_intervention(intervention)
        test_loader = DataLoader(TensorDataset(z_batch[:, :, None, None]),
                                 batch_size=batch_size,
                                 pin_memory=('cuda' == self.device.type
                                             and z_batch.device.type == 'cpu'))
        result_img = torch.zeros(
            *((len(z_batch), 3) + self.model.output_shape[2:]),
            dtype=torch.uint8, device=self.device)
        for batch_num, [batch_z, ] in enumerate(test_loader):
            batch_z = batch_z.to(self.device)
            out = self.model(batch_z)
            result_img[batch_num * batch_size:
                       batch_num * batch_size + len(batch_z)] = (
                (((out + 1) / 2) * 255).clamp(0, 255).byte())
    return result_img
Example #5
Source File: msda_preprocessed_amazon_dataset.py From man with MIT License

def get_msda_amazon_datasets(data_file, domain, kfold, feature_num):
    print(f'Loading mSDA Preprocessed Multi-Domain Amazon data for {domain} Domain')
    dataset = pickle.load(open(data_file, 'rb'))[domain]

    lx, ly = dataset['labeled']
    if feature_num > 0:
        lx = lx[:, : feature_num]
    lx = torch.from_numpy(lx.toarray()).float().to(opt.device)
    ly = torch.from_numpy(ly).long().to(opt.device)
    print(f'{domain} Domain has {len(ly)} labeled instances.')
    # if opt.use_cuda:
    #     lx, ly = lx.cuda(), ly.cuda()
    labeled_set = FoldedDataset(TensorDataset, kfold, lx, ly)

    ux, uy = dataset['unlabeled']
    if feature_num > 0:
        ux = ux[:, : feature_num]
    ux = torch.from_numpy(ux.toarray()).float().to(opt.device)
    uy = torch.from_numpy(uy).long().to(opt.device)
    print(f'{domain} Domain has {len(uy)} unlabeled instances.')
    # if opt.use_cuda:
    #     ux, uy = ux.cuda(), uy.cuda()
    unlabeled_set = TensorDataset(ux, uy)

    return labeled_set, unlabeled_set
Example #6
Source File: folded_dataset.py From man with MIT License

def get_folds(self, folds):
    indices = np.hstack([self.folds[f] for f in folds]).reshape(-1)
    if self.__class__.__bases__[0].__name__ == 'TensorDataset':
        indices = torch.from_numpy(indices).to(opt.device)
        # if opt.use_cuda:
        #     indices = indices.cuda()
        X = torch.index_select(self.tensors[0], 0, indices)
        Y = torch.index_select(self.tensors[1], 0, indices)
        return TensorDataset(X, Y)
    else:
        X = [self.X[i] for i in indices]
        indices = torch.from_numpy(indices).to(opt.device)
        # if opt.use_cuda:
        #     indices = indices.cuda()
        Y = torch.index_select(self.Y, 0, indices)
        return AmazonDataset(X, Y, self.max_seq_len)
Example #7
Source File: rnn_utils.py From guacamol_baselines with MIT License

def get_tensor_dataset(numpy_array):
    """
    Takes a numpy array of indices, converts it into a Torch tensor,
    splits it into inputs and targets, and wraps them in a TensorDataset.

    Args:
        numpy_array: to be converted

    Returns: a TensorDataset
    """
    tensor = torch.from_numpy(numpy_array).long()

    inp = tensor[:, :-1]
    target = tensor[:, 1:]

    return TensorDataset(inp, target)
Example #8
Source File: trial.py From torchbearer with MIT License

def with_test_data(self, x, batch_size=1, num_workers=1, steps=None):
    """Use this trial with the given test data. Returns self so that methods can be chained for convenience.

    Example: ::

        # Simple trial that runs for 10 test iterations on some random data
        >>> from torchbearer import Trial
        >>> data = torch.rand(10, 1)
        >>> trial = Trial(None).with_test_data(data).for_test_steps(10).run(1)

    Args:
        x (torch.Tensor): The test x data to use during calls to :meth:`.predict`
        batch_size (int): The size of each batch to sample from the data
        num_workers (int): Number of worker threads to use in the data loader
        steps (int): The number of steps per epoch to take when using this data

    Returns:
        Trial: self
    """
    dataset = TensorDataset(x)
    dataloader = DataLoader(dataset, batch_size, num_workers=num_workers)
    self.with_test_generator(dataloader, steps=steps)
    return self
Example #9
Source File: logistic.py From holoclean with Apache License 2.0

def train(self, num_epochs=3, batch_size=32):
    """
    Trains the LR model.

    :param num_epochs: (int) number of epochs.
    """
    batch_losses = []
    # We train only on cells that do not have their initial value as NULL.
    X_train, Y_train = self._X.index_select(0, self._train_idx), self._Y.index_select(0, self._train_idx)
    torch_ds = TensorDataset(X_train, Y_train)

    # Main training loop.
    for epoch_idx in range(1, num_epochs + 1):
        logging.debug("Logistic: epoch %d", epoch_idx)
        batch_cnt = 0
        for batch_X, batch_Y in tqdm(DataLoader(torch_ds, batch_size=batch_size)):
            batch_pred = self.forward(batch_X)
            batch_loss = self._loss(batch_pred, batch_Y.reshape(-1, 1))
            batch_losses.append(float(batch_loss))
            self.zero_grad()
            batch_loss.backward()
            self._optimizer.step()
            batch_cnt += 1
        logging.debug('Logistic: average batch loss: %f',
                      sum(batch_losses[-1 * batch_cnt:]) / batch_cnt)
    return batch_losses
Example #10
Source File: test_end_to_end.py From torchbearer with MIT License

def test_callbacks(self):
    from torch.utils.data import TensorDataset
    traingen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))
    valgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))
    testgen = TensorDataset(torch.rand(10, 1, 3), torch.rand(10, 1))

    model = torch.nn.Linear(3, 1)
    optim = torch.optim.SGD(model.parameters(), lr=0.01)

    cbs = []
    cbs.extend([c.EarlyStopping(), c.GradientClipping(10, model.parameters()),
                c.Best('test.pt'), c.MostRecent('test.pt'), c.ReduceLROnPlateau(),
                c.CosineAnnealingLR(0.1, 0.01), c.ExponentialLR(1),
                c.Interval('test.pt'), c.CSVLogger('test_csv.pt'),
                c.L1WeightDecay(), c.L2WeightDecay(),
                c.TerminateOnNaN(monitor='fail_metric')])

    trial = torchbearer.Trial(model, optim, torch.nn.MSELoss(), metrics=['loss'], callbacks=cbs)
    trial = trial.with_generators(traingen, valgen, testgen)
    trial.run(2)
    trial.predict()
    trial.evaluate(data_key=torchbearer.TEST_DATA)
    trial.evaluate()

    import os
    os.remove('test.pt')
    os.remove('test_csv.pt')
Example #11
Source File: run.py From s2cnn with MIT License

def load_data(path, batch_size):

    with gzip.open(path, 'rb') as f:
        dataset = pickle.load(f)

    train_data = torch.from_numpy(
        dataset["train"]["images"][:, None, :, :].astype(np.float32))
    train_labels = torch.from_numpy(
        dataset["train"]["labels"].astype(np.int64))

    # TODO normalize dataset
    # mean = train_data.mean()
    # stdv = train_data.std()

    train_dataset = data_utils.TensorDataset(train_data, train_labels)
    train_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_data = torch.from_numpy(
        dataset["test"]["images"][:, None, :, :].astype(np.float32))
    test_labels = torch.from_numpy(
        dataset["test"]["labels"].astype(np.int64))

    test_dataset = data_utils.TensorDataset(test_data, test_labels)
    test_loader = data_utils.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_loader, test_loader, train_dataset, test_dataset
Example #12
Source File: aceoptimize.py From gandissect with MIT License

def compute_feature_quantiles(args, corpus, cache_filename, model, full_sample):
    # Phase 1.6. Figure the 99% and 99.9%ile of every feature.
    if all(k in corpus for k in ['feature_99', 'feature_999']):
        return
    progress = default_progress()
    with torch.no_grad():
        rq = RunningQuantile(resolution=10000)  # 10x what's needed.
        for [zbatch] in progress(
                torch.utils.data.DataLoader(
                    TensorDataset(full_sample),
                    batch_size=args.inference_batch_size,
                    num_workers=10, pin_memory=True),
                desc="Calculating 0.999 quantile"):
            zbatch = zbatch.cuda()
            tensor_image = model(zbatch)
            feat = model.retained_layer(args.layer)
            rq.add(feat.permute(0, 2, 3, 1).contiguous().view(-1, feat.shape[1]))
        result = rq.quantiles([0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999])
        corpus.feature_001 = result[:, 0].cpu()
        corpus.feature_01 = result[:, 1].cpu()
        corpus.feature_10 = result[:, 2].cpu()
        corpus.feature_50 = result[:, 3].cpu()
        corpus.feature_90 = result[:, 4].cpu()
        corpus.feature_99 = result[:, 5].cpu()
        corpus.feature_999 = result[:, 6].cpu()
    numpy.savez(cache_filename, **corpus)
Example #13
Source File: data_utils_torch.py From CaptchaRecognition with MIT License

def load_dataset(batch_size, dir='data', n_workers=0, test_size=16384, total_size=None):
    print("Loading data...")
    data = np.load(os.path.join(dir, 'captcha.npz'))
    image = data['img'].astype(np.float32) / 127.5 - 1
    text = data['text']
    print("Loading dictionary...")
    vocab = pickle.load(open(os.path.join(dir, 'captcha.vocab_dict'), 'rb'), encoding='utf8')
    print("Convert to tensor...")
    if total_size is None:
        image = torch.Tensor(image).permute(0, 3, 1, 2)
        text = torch.LongTensor(text)
    else:
        image = torch.Tensor(image[:total_size]).permute(0, 3, 1, 2)
        text = torch.LongTensor(text[:total_size])
    image_train = image[:-test_size]
    image_test = image[-test_size:]
    text_train = text[:-test_size]
    text_test = text[-test_size:]
    print("Build dataset...")
    dataset_train = TensorDataset(image_train, text_train)
    dataset_test = TensorDataset(image_test, text_test)
    if torch.cuda.is_available():
        pm = True
    else:
        pm = False
    print("Build dataloader...")
    dataloader_train = DataLoader(dataset_train, batch_size, True, num_workers=n_workers)
    dataloader_test = DataLoader(dataset_test, batch_size, shuffle=False)
    print("data ready!")
    return dataloader_train, dataloader_test, vocab
Example #14
Source File: load_data.py From cnn-surrogate with MIT License

def load_data(data_dir, batch_size):
    """Return data loader

    Args:
        data_dir: directory to hdf5 file, e.g. `dir/to/kle4225_lhs256.hdf5`
        batch_size (int): mini-batch size for loading data

    Returns:
        (data_loader (torch.utils.data.DataLoader), stats)
    """
    with h5py.File(data_dir, 'r') as f:
        x_data = f['input'][()]
        y_data = f['output'][()]
        print("input data shape: {}".format(x_data.shape))
        print("output data shape: {}".format(y_data.shape))
    kwargs = {'num_workers': 4, 'pin_memory': True} if torch.cuda.is_available() else {}

    dataset = TensorDataset(torch.tensor(x_data), torch.tensor(y_data))
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, **kwargs)

    # simple statistics of output data
    y_data_mean = np.mean(y_data, 0)
    y_data_var = np.sum((y_data - y_data_mean) ** 2)
    stats = {}
    stats['y_mean'] = y_data_mean
    stats['y_var'] = y_data_var

    return data_loader, stats
Example #15
Source File: layers_classify_pytorch.py From mead-baseline with Apache License 2.0

def __init__(self, x, x_lengths, y):
    self.tensor_dataset = TensorDataset(x, x_lengths, y)
Example #16
Source File: eager_lm_pytorch.py From mead-baseline with Apache License 2.0

def __init__(self, x, y):
    self.tensor_dataset = TensorDataset(x, y)
Example #17
Source File: runners.py From bert_on_stilts with Apache License 2.0

def convert_to_dataset(features):
    full_batch = features_to_data(features)
    dataset_ls = [full_batch.input_ids, full_batch.input_mask,
                  full_batch.segment_ids, full_batch.lm_label_ids]
    if full_batch.is_next is not None:
        dataset_ls.append(full_batch.is_next)
    dataset = TensorDataset(*dataset_ls)
    return dataset, full_batch.tokens
Example #18
Source File: burgerLoader.py From ar-pde-cnn with MIT License

def createTestingLoader(self, data_dir, cases, t_start=0, dt=0.001, batch_size=1):
    '''
    Loads in training data from Fenics simulator
    Args:
        data_dir (string): directory of data
        cases (np.array): array of training cases, must be integers
        n_init (int): number of initial conditions to use from each case
        batch_size (int): mini-batch size
    '''
    testing_data = []
    target_data = []
    # Indexes of data time-steps to use as initial conditions
    nidx = int(t_start / dt)
    for i, val in enumerate(cases):
        file_name = data_dir + "/u{:d}.npy".format(val)
        print("Reading file: {}".format(file_name))
        u = np.load(file_name)
        # Convert to tensor and unsqueeze channel dim
        uTensor = torch.Tensor(u[nidx, :-1]).unsqueeze(0).unsqueeze(1)
        testing_data.append(uTensor.repeat(1, 20, 1))
        # Remove last element due to periodic conditions between [0,1]
        target_data.append(torch.Tensor(u[::int(self.dt / dt), :-1]))

    data_tuple = (torch.cat(testing_data, dim=0), torch.stack(target_data, dim=0))
    testing_loader = DataLoader(TensorDataset(*data_tuple),
                                batch_size=batch_size, shuffle=False, drop_last=False)

    return testing_loader
Example #19
Source File: aceoptimize.py From gandissect with MIT License

def compute_mean_present_features(args, corpus, cache_filename, model):
    # Phase 1.5. Figure mean activations for every channel where there
    # is a doorway.
    if all(k in corpus for k in ['mean_present_feature']):
        return
    progress = default_progress()
    with torch.no_grad():
        total_present_feature = 0
        for [zbatch, featloc] in progress(
                torch.utils.data.DataLoader(
                    TensorDataset(
                        corpus.object_present_sample,
                        corpus.object_present_location),
                    batch_size=args.inference_batch_size,
                    num_workers=10, pin_memory=True),
                desc="Mean activations"):
            zbatch = zbatch.cuda()
            featloc = featloc.cuda()
            tensor_image = model(zbatch)
            feat = model.retained_layer(args.layer)
            flatfeat = feat.view(feat.shape[0], feat.shape[1], -1)
            sum_feature_at_obj = flatfeat[
                torch.arange(feat.shape[0]).to(feat.device), :, featloc
            ].sum(0)
            total_present_feature = total_present_feature + sum_feature_at_obj
        corpus.mean_present_feature = (
            total_present_feature / len(corpus.object_present_sample)).cpu()
    if cache_filename:
        numpy.savez(cache_filename, **corpus)
Example #20
Source File: zdataset.py From gandissect with MIT License

def z_dataset_for_model(model, size=100, seed=1):
    return TensorDataset(z_sample_for_model(model, size, seed))
Example #21
Source File: run_classic.py From s2cnn with MIT License

def load_data(path, batch_size):

    with gzip.open(path, 'rb') as f:
        dataset = pickle.load(f)

    train_data = torch.from_numpy(
        dataset["train"]["images"][:, None, :, :].astype(np.float32))
    train_labels = torch.from_numpy(
        dataset["train"]["labels"].astype(np.int64))

    # TODO normalize dataset
    # mean = train_data.mean()
    # stdv = train_data.std()

    train_dataset = data_utils.TensorDataset(train_data, train_labels)
    train_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_data = torch.from_numpy(
        dataset["test"]["images"][:, None, :, :].astype(np.float32))
    test_labels = torch.from_numpy(
        dataset["test"]["labels"].astype(np.int64))

    test_dataset = data_utils.TensorDataset(test_data, test_labels)
    test_loader = data_utils.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_loader, test_loader, train_dataset, test_dataset
Example #22
Source File: runners.py From bert_on_stilts with Apache License 2.0

def convert_to_dataset(features, label_mode):
    full_batch = features_to_data(features, label_mode=label_mode)
    if full_batch.label_ids is None:
        dataset = TensorDataset(full_batch.input_ids, full_batch.input_mask,
                                full_batch.segment_ids)
    else:
        dataset = TensorDataset(full_batch.input_ids, full_batch.input_mask,
                                full_batch.segment_ids, full_batch.label_ids)
    return dataset, full_batch.tokens
Example #23
Source File: main_dense.py From BLINK with MIT License

def _process_crossencoder_dataloader(context_input, label_input, crossencoder_params):
    tensor_data = TensorDataset(context_input, label_input)
    sampler = SequentialSampler(tensor_data)
    dataloader = DataLoader(
        tensor_data, sampler=sampler, batch_size=crossencoder_params["eval_batch_size"]
    )
    return dataloader
Example #24
Source File: absa_data_util.py From BERT-for-RRC-ABSA with Apache License 2.0

def build_dataset(features):
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    label = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(input_ids, attention_mask, token_type_ids, label)
Example #25
Source File: run_asc.py From BERT-for-RRC-ABSA with Apache License 2.0

def test(args):
    # Load a trained model that you have fine-tuned (we assume evaluate on cpu)
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    eval_examples = processor.get_test_examples(args.data_dir)
    eval_features = data_utils.convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, "asc")

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model = torch.load(os.path.join(args.output_dir, "model.pt"))
    model.cuda()
    model.eval()

    full_logits = []
    full_label_ids = []
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.cuda() for t in batch)
        input_ids, segment_ids, input_mask, label_ids = batch

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.cpu().numpy()

        full_logits.extend(logits.tolist())
        full_label_ids.extend(label_ids.tolist())

    output_eval_json = os.path.join(args.output_dir, "predictions.json")
    with open(output_eval_json, "w") as fw:
        json.dump({"logits": full_logits, "label_ids": full_label_ids}, fw)
Example #26
Source File: rnn_utils.py From guacamol_baselines with MIT License

def get_tensor_dataset_on_device(numpy_array, device):
    """
    Get tensor dataset and send it to a device

    Args:
        numpy_array: to be converted
        device: cuda | cpu

    Returns:
        a TensorDataset on the required device
    """
    dataset = get_tensor_dataset(numpy_array)
    dataset.tensors = tuple(t.to(device) for t in dataset.tensors)
    return dataset
Example #27
Source File: burgerLoader2D.py From ar-pde-cnn with MIT License

def createTestingLoader(self, data_dir, cases, tMax=1.0, simdt=0.001, save_every=2, batch_size=1):
    '''
    Loads in training data from Fenics simulator, assumes simulator has saved
    each time-step at specified delta t
    Args:
        data_dir (string): directory of data
        cases (np.array): array of training cases, must be integers
        tMax (float): maximum time value to load simulator data up to
        simdt (float): time-step size used in the simulation
        save_every (int): Interval to load the training data at (default is 2 to match FEM simulator)
        batch_size (int): mini-batch size
    Returns:
        test_loader (Pytorch DataLoader): Returns testing loader
    '''
    testing_data = []
    target_data = []

    # Loop through test cases
    for i, val in enumerate(cases):
        case_dir = os.path.join(data_dir, "run{:d}".format(val))
        print("Reading test case: {}".format(case_dir))

        seq = []
        for j in range(0, int(tMax / simdt) + 1, save_every):
            file_dir = os.path.join(case_dir, "u{:d}.npy".format(j))
            u0 = np.load(file_dir)
            # Remove the periodic nodes
            seq.append(u0[:, :, :])

        file_dir = os.path.join(case_dir, "u0.npy")
        uInit = np.load(file_dir)
        uTarget = np.stack(seq, axis=0)

        # Remove the periodic nodes and unsqueeze first dim
        testing_data.append(torch.Tensor(uInit[:, :, :]).unsqueeze(0))
        target_data.append(torch.Tensor(uTarget))

    # Create data loader
    data_tuple = (torch.cat(testing_data, dim=0), torch.stack(target_data, dim=0))
    testing_loader = DataLoader(TensorDataset(*data_tuple),
                                batch_size=batch_size, shuffle=False, drop_last=False)

    return testing_loader
Example #28
Source File: 53_machine_translation.py From deep-learning-note with MIT License

def read_data(max_seq_len):
    # "in" and "out" are short for input and output
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    with io.open('./data/translation/fr-en-small.txt') as f:
        lines = f.readlines()
    for line in lines:
        in_seq, out_seq = line.rstrip().split('\t')
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
            # Skip this sample if it would be longer than max_seq_len after adding EOS
            continue
        process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
        process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, Data.TensorDataset(in_data, out_data)
Example #29
Source File: ksLoader.py From ar-pde-cnn with MIT License

def createTestingLoader(self, data_dir, cases, dt=0.1, tmax=1000, batch_size=64):
    '''
    Loads in testing data from matlab simulator; includes target values in dataloader
    Args:
        data_dir (string): directory of data
        cases (np.array): array of training cases, must be integers
        n_init (int): number of initial conditions to use from each case
        batch_size (int): mini-batch size
    '''
    testing_data = []
    target_data = []
    for i, val in enumerate(cases):
        file_name = data_dir + "/ks_data_{:d}.dat".format(val)
        print("Reading file: {}".format(file_name))
        u = np.loadtxt(file_name, delimiter=',')
        u = u[:, :-1]
        # Initial state
        uTensor = torch.Tensor(u[int(100 / dt), :]).unsqueeze(0).unsqueeze(0)
        testing_data.append(uTensor.repeat(1, 5, 1))
        # Full target field
        target_data.append(torch.Tensor(u[int(100 / dt):int(100 / dt) + tmax + 1, :]))

    data_tuple = (torch.cat(testing_data, dim=0), torch.stack(target_data, dim=0))
    testing_data = DataLoader(TensorDataset(*data_tuple),
                              batch_size=batch_size, shuffle=False)

    return testing_data
Example #30
Source File: train.py From squash-generation with MIT License

def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    datasets_raw = {}
    logger.info("Loading training data")
    datasets_raw['train'] = get_dataset(tokenizer, args.dataset_cache, args.dataset_path, 'train')
    logger.info("Loading validation data")
    datasets_raw['valid'] = get_dataset(tokenizer, args.dataset_cache, args.dataset_path, 'dev')

    logger.info("Build inputs and labels")
    datasets = {
        "train": defaultdict(list),
        "valid": defaultdict(list)
    }
    for dataset_name, dataset in datasets_raw.items():
        for data_point in dataset:
            instance, _ = build_input_from_segments(data_point, tokenizer)
            for input_name, input_array in instance.items():
                datasets[dataset_name][input_name].append(input_array)

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset = TensorDataset(*tensor_datasets["train"])
    valid_dataset = TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size,
                              shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))

    return train_loader, valid_loader, train_sampler, valid_sampler