Python pytorch_pretrained_bert.optimization.BertAdam() Examples
The following are 14 code examples of pytorch_pretrained_bert.optimization.BertAdam(), drawn from open-source projects. The source file, originating project, and license are noted above each example. You may also want to check out the other available functions and classes of the pytorch_pretrained_bert.optimization module.
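Most of the examples below share the same basic pattern: collect the model's named parameters, split them into one group that receives weight decay and one group (biases and LayerNorm parameters) that does not, and pass both groups to BertAdam together with a warmup proportion and the total number of optimization steps. The following is a minimal sketch of that pattern, not code from any particular example; the helper name build_bert_adam and the default hyperparameter values are placeholders.

from pytorch_pretrained_bert.optimization import BertAdam


def build_bert_adam(model, num_train_steps, lr=5e-5, warmup=0.1):
    # Parameters whose names contain any of these substrings get no weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    # With BertAdam's default schedule, the learning rate is warmed up linearly
    # over the first `warmup` fraction of `t_total` steps.
    return BertAdam(optimizer_grouped_parameters,
                    lr=lr,
                    warmup=warmup,
                    t_total=num_train_steps)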
Example #1
Source File: run_mrc_ner.py From mrc-for-flat-nested-ner with Apache License 2.0
def load_model(config, num_train_steps, label_list):
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    model = BertMRCNER(config)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0}]

    # optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=config.warmup_proportion,
                         t_total=num_train_steps,
                         max_grad_norm=config.clip_grad)

    return model, optimizer, device, n_gpu
Example #2
Source File: trainer.py From mrqa with Apache License 2.0
def get_opt(param_optimizer, num_train_optimization_steps, args):
    """Hack to remove the pooler, which is not used
    and thus produces None grads that break apex.
    """
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    return BertAdam(optimizer_grouped_parameters,
                    lr=args.lr,
                    warmup=args.warmup_proportion,
                    t_total=num_train_optimization_steps)
Example #3
Source File: utils.py From neutralizing-bias with MIT License
def build_optimizer(model, num_train_steps, learning_rate):
    global ARGS

    if ARGS.tagger_from_debiaser:
        parameters = list(model.cls_classifier.parameters()) + list(
            model.tok_classifier.parameters())
        parameters = list(filter(lambda p: p.requires_grad, parameters))
        return optim.Adam(parameters, lr=ARGS.learning_rate)
    else:
        param_optimizer = list(model.named_parameters())
        param_optimizer = list(filter(
            lambda name_param: name_param[1].requires_grad, param_optimizer))
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        return BertAdam(optimizer_grouped_parameters,
                        lr=learning_rate,
                        warmup=0.1,
                        t_total=num_train_steps)
Example #4
Source File: run_bert_tagger.py From mrc-for-flat-nested-ner with Apache License 2.0
def load_model(config, num_train_steps, label_list):
    # device = torch.device(torch.cuda.is_available())
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    model = BertTagger(config, num_labels=len(label_list))
    # model = BertForTagger.from_pretrained(config.bert_model, num_labels=13)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0}]

    # optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=config.warmup_proportion,
                         t_total=num_train_steps,
                         max_grad_norm=config.clip_grad)

    return model, optimizer, device, n_gpu
Example #5
Source File: base_task.py From Doc2EDAG with MIT License
def reset_bert_optimizer(self):
    # Prepare optimizer
    if self.setting.fp16:
        model_named_parameters = [
            (n, param.clone().detach().to('cpu').float().requires_grad_())
            for n, param in self.model.named_parameters()
        ]
    elif self.setting.optimize_on_cpu:
        model_named_parameters = [
            (n, param.clone().detach().to('cpu').requires_grad_())
            for n, param in self.model.named_parameters()
        ]
    else:
        model_named_parameters = list(self.model.named_parameters())

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model_named_parameters if n not in no_decay],
            'weight_decay_rate': 0.01
        },
        {
            'params': [p for n, p in model_named_parameters if n in no_decay],
            'weight_decay_rate': 0.0
        }
    ]

    num_train_steps = int(len(self.train_examples)
                          / self.setting.train_batch_size
                          / self.setting.gradient_accumulation_steps
                          * self.setting.num_train_epochs)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=self.setting.learning_rate,
                         warmup=self.setting.warmup_proportion,
                         t_total=num_train_steps)

    return optimizer, num_train_steps, model_named_parameters
Example #6
Source File: train.py From KernelGAT with MIT License
def train_model(model, args, trainset_reader, validset_reader):
    save_path = args.outdir + '/model'
    best_acc = 0.0
    running_loss = 0.0
    t_total = int(
        trainset_reader.total_num / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    global_step = 0
    crit = nn.CrossEntropyLoss()
    for epoch in range(int(args.num_train_epochs)):
        optimizer.zero_grad()
        for inp_tensor, msk_tensor, seg_tensor, label_tensor in trainset_reader:
            model.train()
            prob = model(inp_tensor, msk_tensor, seg_tensor)
            loss = crit(prob, label_tensor)
            running_loss += loss.item()
            loss.backward()
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                logger.info('Epoch: {0}, Step: {1}, Loss: {2}'.format(epoch, global_step, (running_loss / global_step)))
            if global_step % (args.eval_step * args.gradient_accumulation_steps) == 0:
                logger.info('Start eval!')
                eval_acc = eval_model(model, validset_reader)
                logger.info('Dev acc: {0}'.format(eval_acc))
                if eval_acc >= best_acc:
                    best_acc = eval_acc
                    torch.save({'epoch': epoch,
                                'model': model.state_dict()},
                               save_path + ".best.pt")
                    logger.info("Saved best epoch {0}, best acc {1}".format(epoch, best_acc))
Example #7
Source File: model_setup.py From bert_on_stilts with Apache License 2.0
def create_optimizer(model, learning_rate, t_total, loss_scale, fp16, warmup_proportion, state_dict):
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight',
        'adapter.down_project.weight', 'adapter.up_project.weight',
    ]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=t_total)
    if state_dict is not None:
        optimizer.load_state_dict(state_dict)
    return optimizer
Example #8
Source File: train.py From curriculum with GNU General Public License v3.0
def __init__(self, args, model, train_examples, use_gpu):
    self.use_gpu = use_gpu
    self.model = model
    self.epochs = args.epochs
    self.best_f1 = -1
    self.min_loss = 100
    self.save_dir = args.save_dir

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    self.lr = args.lr
    self.warmup_proportion = args.warmup_proportion
    self.t_total = int(train_examples / args.batch_size / 1 * args.epochs)

    self.optimizer = BertAdam(optimizer_grouped_parameters,
                              lr=args.lr,
                              warmup=args.warmup_proportion,
                              t_total=self.t_total)
    if self.use_gpu:
        self.loss_func = nn.CrossEntropyLoss(weight=torch.FloatTensor([1.0, args.weight]).cuda())
    else:
        self.loss_func = nn.CrossEntropyLoss(weight=torch.FloatTensor([1.0, args.weight]))
Example #9
Source File: utils.py From neutralizing-bias with MIT License
def build_optimizer(model, num_train_steps=None):
    global ARGS

    if ARGS.bert_encoder:
        assert num_train_steps

        param_optimizer = list(model.named_parameters())
        param_optimizer = list(filter(
            lambda name_param: name_param[1].requires_grad, param_optimizer))
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=5e-5,
                             warmup=0.1,
                             t_total=num_train_steps)
    else:
        params = list(model.parameters())
        params = list(filter(lambda p: p.requires_grad, params))
        optimizer = optim.Adam(params, lr=ARGS.learning_rate)

    return optimizer
Example #10
Source File: token_classification.py From nlp-recipes with MIT License
def _get_optimizer(self, learning_rate, num_train_optimization_steps, warmup_proportion):
    """
    Initializes the optimizer and configures which parameters to apply weight decay to.
    """
    param_optimizer = list(self.model.named_parameters())
    no_decay_params = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    params_weight_decay = 0.01
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay_params)
            ],
            "weight_decay": params_weight_decay,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay_params)],
            "weight_decay": 0.0,
        },
    ]

    if warmup_proportion is None:
        optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate)
    else:
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=learning_rate,
            t_total=num_train_optimization_steps,
            warmup=warmup_proportion,
        )

    return optimizer
Example #11
Source File: train.py From KernelGAT with MIT License
def train_model(model, ori_model, args, trainset_reader, validset_reader):
    save_path = args.outdir + '/model'
    best_accuracy = 0.0
    running_loss = 0.0
    t_total = int(
        trainset_reader.total_num / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    # optimizer = optim.Adam(model.parameters(),
    #                        lr=args.learning_rate)
    global_step = 0
    for epoch in range(int(args.num_train_epochs)):
        model.train()
        optimizer.zero_grad()
        for index, data in enumerate(trainset_reader):
            inputs, lab_tensor = data
            prob = model(inputs)
            loss = F.nll_loss(prob, lab_tensor)
            running_loss += loss.item()
            # if args.gradient_accumulation_steps > 1:
            #     loss = loss / args.gradient_accumulation_steps
            loss.backward()
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                logger.info('Epoch: {0}, Step: {1}, Loss: {2}'.format(epoch, global_step, (running_loss / global_step)))
            if global_step % (args.eval_step * args.gradient_accumulation_steps) == 0:
                logger.info('Start eval!')
                with torch.no_grad():
                    dev_accuracy = eval_model(model, validset_reader)
                    logger.info('Dev total acc: {0}'.format(dev_accuracy))
                    if dev_accuracy > best_accuracy:
                        best_accuracy = dev_accuracy
                        torch.save({'epoch': epoch,
                                    'model': ori_model.state_dict(),
                                    'best_accuracy': best_accuracy},
                                   save_path + ".best.pt")
                        logger.info("Saved best epoch {0}, best accuracy {1}".format(epoch, best_accuracy))
Example #12
Source File: train.py From KernelGAT with MIT License
def train_model(model, args, trainset_reader, validset_reader):
    save_path = args.outdir + '/model'
    best_acc = 0.0
    running_loss = 0.0
    t_total = int(
        trainset_reader.total_num / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    global_step = 0
    crit = nn.MarginRankingLoss(margin=1)
    for epoch in range(int(args.num_train_epochs)):
        optimizer.zero_grad()
        for inp_tensor_pos, msk_tensor_pos, seg_tensor_pos, inp_tensor_neg, msk_tensor_neg, seg_tensor_neg in trainset_reader:
            model.train()
            score_pos = model(inp_tensor_pos, msk_tensor_pos, seg_tensor_pos)
            score_neg = model(inp_tensor_neg, msk_tensor_neg, seg_tensor_neg)
            label = torch.ones(score_pos.size())
            if args.cuda:
                label = label.cuda()
            loss = crit(score_pos, score_neg, Variable(label, requires_grad=False))
            running_loss += loss.item()
            loss.backward()
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                logger.info('Epoch: {0}, Step: {1}, Loss: {2}'.format(epoch, global_step, (running_loss / global_step)))
            if global_step % (args.eval_step * args.gradient_accumulation_steps) == 0:
                logger.info('Start eval!')
                eval_acc = eval_model(model, validset_reader)
                logger.info('Dev acc: {0}'.format(eval_acc))
                if eval_acc >= best_acc:
                    best_acc = eval_acc
                    torch.save({'epoch': epoch,
                                'model': model.state_dict()},
                               save_path + ".best.pt")
                    logger.info("Saved best epoch {0}, best acc {1}".format(epoch, best_acc))
Example #13
Source File: train_eval.py From Bert-Chinese-Text-Classification-Pytorch with MIT License
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch at which the dev-set loss last improved
    flag = False  # whether training has gone a long time without improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # report performance on the training and dev sets every 100 batches
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training if the dev-set loss has not dropped for more than 1000 batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
Example #14
Source File: sequence_classification_distributed.py From nlp-recipes with MIT License
def create_optimizer(
    self,
    num_train_optimization_steps,
    lr=2e-5,
    fp16_allreduce=False,
    warmup_proportion=None,
):
    """
    Method to create a BERT optimizer based on the inputs from the user.

    Args:
        num_train_optimization_steps (int): Number of optimization steps.
        lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
        warmup_proportion (float, optional): Proportion of training to perform
            linear learning rate warmup for, e.g. 0.1 = 10% of training.
            Defaults to None.
        fp16_allreduce (bool, optional): If True, use fp16 compression during allreduce.

    Returns:
        pytorch_pretrained_bert.optimization.BertAdam: A BertAdam optimizer with
            the user-specified configuration.
    """
    if self.use_distributed:
        lr = lr * hvd.size()

    if warmup_proportion is None:
        optimizer = BertAdam(self.optimizer_params, lr=lr)
    else:
        optimizer = BertAdam(
            self.optimizer_params,
            lr=lr,
            t_total=num_train_optimization_steps,
            warmup=warmup_proportion,
        )

    if self.use_distributed:
        compression = (
            hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none
        )
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=self.model.named_parameters(),
            compression=compression,
        )

    return optimizer