Python utils.repackage_hidden() Examples
The following are 30 code examples of utils.repackage_hidden(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module utils, or try the search function.
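Although its definition is not shown on this page, in most of these projects repackage_hidden() detaches the recurrent hidden state from the computation graph so that backpropagation through time stops at the current batch. A minimal sketch of a typical implementation, modeled on the PyTorch word-language-model example (the helper in any given repository below may differ slightly):

import torch

def repackage_hidden(h):
    """Wrap hidden states in new Tensors, detaching them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    # LSTMs return an (h, c) tuple; detach each element recursively.
    return tuple(repackage_hidden(v) for v in h)

Calling it once per batch, as the examples below do, keeps memory bounded and prevents gradients from flowing back through earlier batches.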
Example #1
Source File: train.py From NAO with GNU General Public License v3.0 | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #2
Source File: test.py From NAO with GNU General Public License v3.0 | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        print(i, data_source.size(0) - 1)
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)

# Load the best saved model.
Example #3
Source File: train.py From NAO with GNU General Public License v3.0 | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args.bptt, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #4
Source File: test.py From NAO with GNU General Public License v3.0 | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        print(i, data_source.size(0) - 1)
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden, args.arc)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)

# Load the best saved model.
Example #5
Source File: train.py From DREAM with MIT License | 6 votes |
def evaluate_reorder_dream():
    dr_model.eval()
    dr_hidden = dr_model.init_hidden(dr_config.batch_size)

    total_loss = 0
    start_time = time()
    num_batchs = ceil(len(test_ub) / dr_config.batch_size)
    for i, x in enumerate(batchify(test_ub, dr_config.batch_size, is_reordered=True)):
        baskets, lens, _, r_baskets, h_baskets = x
        dynamic_user, _ = dr_model(baskets, lens, dr_hidden)
        loss = reorder_bpr_loss(r_baskets, h_baskets, dynamic_user, dr_model.encode.weight, dr_config)
        dr_hidden = repackage_hidden(dr_hidden)
        total_loss += loss.data

    # Logging
    elapsed = (time() - start_time) * 1000 / num_batchs
    total_loss = total_loss[0] / num_batchs / dr_config.batch_size
    print('[Evaluation]| Epochs {:3d} | Elapsed {:02.2f} | Loss {:05.2f} |'.format(epoch, elapsed, total_loss))
    return total_loss
Example #6
Source File: finetune.py From mos with MIT License | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args)
            targets = targets.view(-1)
            log_prob, hidden = parallel_model(data, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
            total_loss += len(data) * loss
            hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
Example #7
Source File: main.py From mos with MIT License | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args)
            targets = targets.view(-1)
            log_prob, hidden = parallel_model(data, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
            total_loss += loss * len(data)
            hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
Example #8
Source File: train.py From DREAM with MIT License | 6 votes |
def evaluate_dream():
    dr_model.eval()
    dr_hidden = dr_model.init_hidden(dr_config.batch_size)

    total_loss = 0
    start_time = time()
    num_batchs = ceil(len(test_ub) / dr_config.batch_size)
    for i, x in enumerate(batchify(test_ub, dr_config.batch_size)):
        baskets, lens, _ = x
        dynamic_user, _ = dr_model(baskets, lens, dr_hidden)
        loss = bpr_loss(baskets, dynamic_user, dr_model.encode.weight, dr_config)
        dr_hidden = repackage_hidden(dr_hidden)
        total_loss += loss.data

    # Logging
    elapsed = (time() - start_time) * 1000 / num_batchs
    total_loss = total_loss[0] / num_batchs / dr_config.batch_size
    writer.add_scalar('model/eval_loss', total_loss, (epoch + 1) * num_batchs)
    print('[Evaluation]| Epochs {:3d} | Elapsed {:02.2f} | Loss {:05.2f} |'.format(epoch, elapsed, total_loss))
    return total_loss
Example #9
Source File: train_search.py From darts with Apache License 2.0 | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #10
Source File: test.py From darts with Apache License 2.0 | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        print(i, data_source.size(0) - 1)
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)

# Load the best saved model.
Example #11
Source File: train.py From darts with Apache License 2.0 | 6 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #12
Source File: eval.py From DREAM with MIT License | 6 votes |
def eval_pred(dr_model, ub):
    '''
        evaluate dream model for predicting next basket on all training users
        in batches
    '''
    item_embedding = dr_model.encode.weight
    dr_model.eval()
    dr_hidden = dr_model.init_hidden(dr_model.config.batch_size)

    start_time = time()
    id_u, score_u = [], []  # user's id, user's score
    num_batchs = ceil(len(ub) / dr_model.config.batch_size)
    for i, x in enumerate(batchify(ub, dr_model.config.batch_size)):
        print(i)
        baskets, lens, uids = x
        _, dynamic_user, _ = dr_model(baskets, lens, dr_hidden)  # shape: batch_size, max_len, embedding_size
        dr_hidden = repackage_hidden(dr_hidden)
        for i, l, du in zip(uids, lens, dynamic_user):
            du_latest = du[l - 1].unsqueeze(0)  # shape: 1, embedding_size
            score_up = torch.mm(du_latest, item_embedding.t())  # shape: 1, num_item
            score_u.append(score_up.cpu().data.numpy())
            id_u.append(i)
    elapsed = time() - start_time
    print('[Predicting] Elapsed: {:02.2f}'.format(elapsed))
    return score_u, id_u
Example #13
Source File: main.py From awd-lstm-lm with BSD 3-Clause "New" or "Revised" License | 5 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
Example #14
Source File: finetune.py From awd-lstm-lm with BSD 3-Clause "New" or "Revised" License | 5 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN': model.reset()
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #15
Source File: finetune.py From lm-context-analysis with Apache License 2.0 | 5 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN': model.reset()
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #16
Source File: eval.py From fraternal-dropout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)

# Load the best saved model.
Example #17
Source File: main.py From fraternal-dropout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #18
Source File: main.py From PyTorch-NLP with BSD 3-Clause "New" or "Revised" License | 5 votes |
def evaluate(data_source, source_sampler, target_sampler, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    hidden = model.init_hidden(batch_size)
    for source_sample, target_sample in zip(source_sampler, target_sampler):
        model.train()
        data = torch.stack([data_source[i] for i in source_sample])
        targets = torch.stack([data_source[i] for i in target_sample]).view(-1)
        with torch.no_grad():
            output, hidden = model(data, hidden)
            total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)
Example #19
Source File: eval.py From lm-context-analysis with Apache License 2.0 | 5 votes |
def evaluate(data_source, batch_size, seq_len):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    tokens = 0
    n = 0
    save_all_losses = []
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, seq_len):
        tokens += seq_len
        data, targets = get_batch(data_source, i, args, evaluation=True, seq_len=seq_len)
        output, hidden = model(data, hidden)
        output = nn.functional.log_softmax(output.permute(2, 1, 0)).permute(2, 1, 0)
        targets = targets.view(data.data.shape[0], batch_size, -1)
        CELoss = torch.gather(output.data, dim=2, index=targets.data).squeeze()
        CELoss = -1 * CELoss
        if tokens < args.start_token:
            continue  # We are not ready to accumulate error yet
        elif tokens >= args.start_token and tokens - seq_len < args.start_token:
            data.data = data.data[-(tokens - args.start_token + 1):]
            CELoss = CELoss[-(tokens - args.start_token + 1):]
            print('First word: %s' % (corpus.dictionary.idx2word[data.data[-(tokens - args.start_token + 1), 0]]))
        total_loss += torch.sum(CELoss)
        n += data.size(0)
        save_all_losses += CELoss.tolist()
        hidden = repackage_hidden(hidden)
    print('total: %d' % n)
    print('Last word: %s' % (corpus.dictionary.idx2word[data.data[-1, 0]]))
    return total_loss / float(n), save_all_losses
Example #20
Source File: test_phrase_grammar.py From Ordered-Neurons with BSD 3-Clause "New" or "Revised" License | 5 votes |
def evaluate(data_source, batch_size=1):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output = model.decoder(output)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)
Example #21
Source File: main.py From lm-context-analysis with Apache License 2.0 | 5 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
Example #22
Source File: main.py From Ordered-Neurons with BSD 3-Clause "New" or "Revised" License | 5 votes |
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
Example #23
Source File: pointer.py From awd-lstm-lm with BSD 3-Clause "New" or "Revised" License | 4 votes |
def evaluate(data_source, batch_size=10, window=args.window):
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN': model.reset()
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    next_word_history = None
    pointer_history = None
    for i in range(0, data_source.size(0) - 1, args.bptt):
        if i > 0: print(i, len(data_source), math.exp(total_loss / i))
        data, targets = get_batch(data_source, i, evaluation=True, args=args)
        output, hidden, rnn_outs, _ = model(data, hidden, return_h=True)
        rnn_out = rnn_outs[-1].squeeze()
        output_flat = output.view(-1, ntokens)
        ###
        # Fill pointer history
        start_idx = len(next_word_history) if next_word_history is not None else 0
        next_word_history = torch.cat([one_hot(t.data[0], ntokens) for t in targets]) if next_word_history is None else torch.cat([next_word_history, torch.cat([one_hot(t.data[0], ntokens) for t in targets])])
        #print(next_word_history)
        pointer_history = Variable(rnn_out.data) if pointer_history is None else torch.cat([pointer_history, Variable(rnn_out.data)], dim=0)
        #print(pointer_history)
        ###
        # Built-in cross entropy
        # total_loss += len(data) * criterion(output_flat, targets).data[0]
        ###
        # Manual cross entropy
        # softmax_output_flat = torch.nn.functional.softmax(output_flat)
        # soft = torch.gather(softmax_output_flat, dim=1, index=targets.view(-1, 1))
        # entropy = -torch.log(soft)
        # total_loss += len(data) * entropy.mean().data[0]
        ###
        # Pointer manual cross entropy
        loss = 0
        softmax_output_flat = torch.nn.functional.softmax(output_flat)
        for idx, vocab_loss in enumerate(softmax_output_flat):
            p = vocab_loss
            if start_idx + idx > window:
                valid_next_word = next_word_history[start_idx + idx - window:start_idx + idx]
                valid_pointer_history = pointer_history[start_idx + idx - window:start_idx + idx]
                logits = torch.mv(valid_pointer_history, rnn_out[idx])
                theta = args.theta
                ptr_attn = torch.nn.functional.softmax(theta * logits).view(-1, 1)
                ptr_dist = (ptr_attn.expand_as(valid_next_word) * valid_next_word).sum(0).squeeze()
                lambdah = args.lambdasm
                p = lambdah * ptr_dist + (1 - lambdah) * vocab_loss
            ###
            target_loss = p[targets[idx].data]
            loss += (-torch.log(target_loss)).data[0]
        total_loss += loss / batch_size
        ###
        hidden = repackage_hidden(hidden)
        next_word_history = next_word_history[-window:]
        pointer_history = pointer_history[-window:]
    return total_loss / len(data_source)

# Load the best saved model.
Example #24
Source File: pointer.py From outlier-exposure with Apache License 2.0 | 4 votes |
def evaluate(data_source, batch_size=10, window=args.window):
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN': model.reset()
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    next_word_history = None
    pointer_history = None
    for i in range(0, data_source.size(0) - 1, args.bptt):
        if i > 0: print(i, len(data_source), math.exp(total_loss / i))
        data, targets = get_batch(data_source, i, evaluation=True, args=args)
        output, hidden, rnn_outs, _ = model(data, hidden, return_h=True)
        rnn_out = rnn_outs[-1].squeeze()
        output_flat = output.view(-1, ntokens)
        ###
        # Fill pointer history
        start_idx = len(next_word_history) if next_word_history is not None else 0
        next_word_history = torch.cat([one_hot(t.data[0], ntokens) for t in targets]) if next_word_history is None else torch.cat([next_word_history, torch.cat([one_hot(t.data[0], ntokens) for t in targets])])
        #print(next_word_history)
        pointer_history = Variable(rnn_out.data) if pointer_history is None else torch.cat([pointer_history, Variable(rnn_out.data)], dim=0)
        #print(pointer_history)
        ###
        # Built-in cross entropy
        # total_loss += len(data) * criterion(output_flat, targets).data[0]
        ###
        # Manual cross entropy
        # softmax_output_flat = torch.nn.functional.softmax(output_flat)
        # soft = torch.gather(softmax_output_flat, dim=1, index=targets.view(-1, 1))
        # entropy = -torch.log(soft)
        # total_loss += len(data) * entropy.mean().data[0]
        ###
        # Pointer manual cross entropy
        loss = 0
        softmax_output_flat = torch.nn.functional.softmax(output_flat)
        for idx, vocab_loss in enumerate(softmax_output_flat):
            p = vocab_loss
            if start_idx + idx > window:
                valid_next_word = next_word_history[start_idx + idx - window:start_idx + idx]
                valid_pointer_history = pointer_history[start_idx + idx - window:start_idx + idx]
                logits = torch.mv(valid_pointer_history, rnn_out[idx])
                theta = args.theta
                ptr_attn = torch.nn.functional.softmax(theta * logits).view(-1, 1)
                ptr_dist = (ptr_attn.expand_as(valid_next_word) * valid_next_word).sum(0).squeeze()
                lambdah = args.lambdasm
                p = lambdah * ptr_dist + (1 - lambdah) * vocab_loss
            ###
            target_loss = p[targets[idx].data]
            loss += (-torch.log(target_loss)).data[0]
        total_loss += loss / batch_size
        ###
        hidden = repackage_hidden(hidden)
        next_word_history = next_word_history[-window:]
        pointer_history = pointer_history[-window:]
    return total_loss / len(data_source)

# Load the best saved model.
Example #25
Source File: main.py From awd-lstm-lm with BSD 3-Clause "New" or "Revised" License | 4 votes |
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)

        loss = raw_loss
        # Activiation Regularization
        if args.alpha: loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta: loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                      epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len

# Loop over epochs.
Example #26
Source File: train.py From DREAM with MIT License | 4 votes |
def train_dream():
    dr_model.train()  # turn on training mode for dropout
    dr_hidden = dr_model.init_hidden(dr_config.batch_size)
    total_loss = 0
    start_time = time()
    num_batchs = ceil(len(train_ub) / dr_config.batch_size)
    for i, x in enumerate(batchify(train_ub, dr_config.batch_size)):
        baskets, lens, _ = x
        dr_hidden = repackage_hidden(dr_hidden)  # repackage hidden state for RNN
        dr_model.zero_grad()  # optim.zero_grad()
        dynamic_user, _ = dr_model(baskets, lens, dr_hidden)
        loss = bpr_loss(baskets, dynamic_user, dr_model.encode.weight, dr_config)
        loss.backward()

        # Clip to avoid gradient exploding
        torch.nn.utils.clip_grad_norm(dr_model.parameters(), dr_config.clip)
        # Parameter updating
        # manual SGD
        # for p in dr_model.parameters():  # Update parameters by -lr*grad
        #     p.data.add_(- dr_config.learning_rate, p.grad.data)
        # adam
        grad_norm = get_grad_norm(dr_model)
        previous_params = deepcopy(list(dr_model.parameters()))
        optim.step()

        total_loss += loss.data
        params = deepcopy(list(dr_model.parameters()))
        delta = get_weight_update(previous_params, params)
        weight_update_ratio = get_ratio_update(delta, params)

        # Logging
        if i % dr_config.log_interval == 0 and i > 0:
            elapsed = (time() - start_time) * 1000 / dr_config.log_interval
            cur_loss = total_loss[0] / dr_config.log_interval / dr_config.batch_size  # turn tensor into float
            total_loss = 0
            start_time = time()
            print('[Training]| Epochs {:3d} | Batch {:5d} / {:5d} | ms/batch {:02.2f} | Loss {:05.2f} |'.format(
                epoch, i, num_batchs, elapsed, cur_loss))
            writer.add_scalar('model/train_loss', cur_loss, epoch * num_batchs + i)
            writer.add_scalar('model/grad_norm', grad_norm, epoch * num_batchs + i)
            writer.add_scalar('model/weight_update_ratio', weight_update_ratio, epoch * num_batchs + i)
Example #27
Source File: finetune.py From awd-lstm-lm with BSD 3-Clause "New" or "Revised" License | 4 votes |
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)

        loss = raw_loss
        # Activiation Regularization
        loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len

# Load the best saved model.
Example #28
Source File: train.py From DREAM with MIT License | 4 votes |
def train_reorder_dream():
    dr_model.train()  # turn on training mode for dropout
    dr_hidden = dr_model.init_hidden(dr_config.batch_size)
    total_loss = 0
    start_time = time()
    num_batchs = ceil(len(train_ub) / dr_config.batch_size)
    for i, x in enumerate(batchify(train_ub, dr_config.batch_size, is_reordered=True)):
        baskets, lens, ids, r_baskets, h_baskets = x
        dr_hidden = repackage_hidden(dr_hidden)  # repackage hidden state for RNN
        dr_model.zero_grad()  # optim.zero_grad()
        dynamic_user, _ = dr_model(baskets, lens, dr_hidden)
        loss = reorder_bpr_loss(r_baskets, h_baskets, dynamic_user, dr_model.encode.weight, dr_config)

        try:
            loss.backward()
        except RuntimeError:  # for debugging
            print('caching')
            tmp = {'baskets': baskets, 'ids': ids, 'r_baskets': r_baskets, 'h_baskets': h_baskets,
                   'dynamic_user': dynamic_user, 'item_embedding': dr_model.encode.weight}
            print(baskets)
            print(ids)
            print(r_baskets)
            print(h_baskets)
            print(dr_model.encode.weight)
            print(dynamic_user.data)
            with open('tmp.pkl', 'wb') as f:
                pickle.dump(tmp, f, pickle.HIGHEST_PROTOCOL)
            break

        # Clip to avoid gradient exploding
        torch.nn.utils.clip_grad_norm(dr_model.parameters(), dr_config.clip)
        # Parameter updating
        # manual SGD
        # for p in dr_model.parameters():  # Update parameters by -lr*grad
        #     p.data.add_(- dr_config.learning_rate, p.grad.data)
        # adam
        grad_norm = get_grad_norm(dr_model)
        previous_params = deepcopy(list(dr_model.parameters()))
        optim.step()

        total_loss += loss.data
        params = deepcopy(list(dr_model.parameters()))
        delta = get_weight_update(previous_params, params)
        weight_update_ratio = get_ratio_update(delta, params)

        # Logging
        if i % dr_config.log_interval == 0 and i > 0:
            elapsed = (time() - start_time) * 1000 / dr_config.log_interval
            cur_loss = total_loss[0] / dr_config.log_interval / dr_config.batch_size  # turn tensor into float
            total_loss = 0
            start_time = time()
            print('[Training]| Epochs {:3d} | Batch {:5d} / {:5d} | ms/batch {:02.2f} | Loss {:05.2f} |'.format(
                epoch, i, num_batchs, elapsed, cur_loss))
Example #29
Source File: main.py From PyTorch-NLP with BSD 3-Clause "New" or "Revised" License | 4 votes |
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    batch = 0
    for source_sample, target_sample in zip(train_source_sampler, train_target_sampler):
        model.train()
        data = torch.stack([train_data[i] for i in source_sample]).t_().contiguous()
        targets = torch.stack([train_data[i] for i in target_sample]).t_().contiguous().view(-1)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)

        loss = raw_loss
        # Activiation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                      epoch, batch, len(train_source_sampler) // args.bptt, optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1

# Loop over epochs.
Example #30
Source File: main.py From lm-context-analysis with Apache License 2.0 | 4 votes |
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)

        loss = raw_loss
        # Activiation Regularization
        loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len

# Loop over epochs.