Python pytorch_pretrained_bert.optimization.BertAdam() Examples

The following are 14 code examples of pytorch_pretrained_bert.optimization.BertAdam(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pytorch_pretrained_bert.optimization, or try the search function.
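Before the per-project examples, here is a minimal sketch of the pattern most of them share: parameters whose names contain "bias" or "LayerNorm" are grouped so that weight decay is applied only to the remaining weights, and BertAdam is given a warmup proportion and the total number of optimization steps for its learning-rate schedule. The names model, learning_rate, warmup_proportion, and num_train_steps below are placeholders, not taken from any of the projects listed here.

from pytorch_pretrained_bert.optimization import BertAdam

def build_bert_adam(model, learning_rate, warmup_proportion, num_train_steps):
    # Group parameters so biases and LayerNorm weights are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    return BertAdam(optimizer_grouped_parameters,
                    lr=learning_rate,           # e.g. 2e-5 or 5e-5 in the examples below
                    warmup=warmup_proportion,   # fraction of t_total spent on linear warmup
                    t_total=num_train_steps)    # total optimization steps for the schedule

When warmup and t_total are omitted (as in Examples #10 and #14 when warmup_proportion is None), BertAdam is used without its warmup/decay schedule and the learning rate stays constant.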
Example #1
Source File: run_mrc_ner.py    From mrc-for-flat-nested-ner with Apache License 2.0    6 votes
def load_model(config, num_train_steps, label_list):
    device = torch.device("cuda") 
    n_gpu = torch.cuda.device_count()
    model = BertMRCNER(config)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]

    # optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate) 
    optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=config.warmup_proportion, t_total=num_train_steps, max_grad_norm=config.clip_grad) 

    return model, optimizer, device, n_gpu 
Example #2
Source File: trainer.py    From mrqa with Apache License 2.0    6 votes
def get_opt(param_optimizer, num_train_optimization_steps, args):
    """
    Hack to remove the pooler, which is not used.
    Otherwise it produces None gradients that break apex.
    """
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    return BertAdam(optimizer_grouped_parameters,
                    lr=args.lr,
                    warmup=args.warmup_proportion,
                    t_total=num_train_optimization_steps) 
Example #3
Source File: utils.py    From neutralizing-bias with MIT License    6 votes
def build_optimizer(model, num_train_steps, learning_rate):
    global ARGS

    if ARGS.tagger_from_debiaser:
        parameters = list(model.cls_classifier.parameters()) + list(
            model.tok_classifier.parameters())
        parameters = list(filter(lambda p: p.requires_grad, parameters))
        return optim.Adam(parameters, lr=ARGS.learning_rate)
    else:
        param_optimizer = list(model.named_parameters())
        param_optimizer = list(filter(lambda name_param: name_param[1].requires_grad, param_optimizer))
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
        return BertAdam(optimizer_grouped_parameters,
                        lr=learning_rate,
                        warmup=0.1,
                        t_total=num_train_steps)
Example #4
Source File: run_bert_tagger.py    From mrc-for-flat-nested-ner with Apache License 2.0    5 votes
def load_model(config, num_train_steps, label_list):
    # device = torch.device(torch.cuda.is_available())
    device = torch.device("cuda") 
    n_gpu = torch.cuda.device_count()
    model = BertTagger(config, num_labels=len(label_list)) 
    # model = BertForTagger.from_pretrained(config.bert_model, num_labels=13)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}]

    # optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate) 
    optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=config.warmup_proportion, t_total=num_train_steps, max_grad_norm=config.clip_grad) 

    return model, optimizer, device, n_gpu 
Example #5
Source File: base_task.py    From Doc2EDAG with MIT License    5 votes
def reset_bert_optimizer(self):
        # Prepare optimizer
        if self.setting.fp16:
            model_named_parameters = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                                      for n, param in self.model.named_parameters()]
        elif self.setting.optimize_on_cpu:
            model_named_parameters = [(n, param.clone().detach().to('cpu').requires_grad_())
                                      for n, param in self.model.named_parameters()]
        else:
            model_named_parameters = list(self.model.named_parameters())

        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model_named_parameters if n not in no_decay],
                'weight_decay_rate': 0.01
            },
            {
                'params': [p for n, p in model_named_parameters if n in no_decay],
                'weight_decay_rate': 0.0
            }
        ]

        num_train_steps = int(len(self.train_examples)
                              / self.setting.train_batch_size
                              / self.setting.gradient_accumulation_steps
                              * self.setting.num_train_epochs)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=self.setting.learning_rate,
                             warmup=self.setting.warmup_proportion,
                             t_total=num_train_steps)

        return optimizer, num_train_steps, model_named_parameters 
Example #6
Source File: train.py    From KernelGAT with MIT License    5 votes
def train_model(model, args, trainset_reader, validset_reader):
    save_path = args.outdir + '/model'
    best_acc = 0.0
    running_loss = 0.0
    t_total = int(
        trainset_reader.total_num / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    global_step = 0
    crit = nn.CrossEntropyLoss()
    for epoch in range(int(args.num_train_epochs)):
        optimizer.zero_grad()
        for inp_tensor, msk_tensor, seg_tensor, label_tensor in trainset_reader:
            model.train()
            prob = model(inp_tensor, msk_tensor, seg_tensor)
            loss = crit(prob, label_tensor)
            running_loss += loss.item()
            loss.backward()
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                logger.info('Epoch: {0}, Step: {1}, Loss: {2}'.format(epoch, global_step, (running_loss / global_step)))
            if global_step % (args.eval_step * args.gradient_accumulation_steps) == 0:
                logger.info('Start eval!')
                eval_acc = eval_model(model, validset_reader)
                logger.info('Dev acc: {0}'.format(eval_acc))
                if eval_acc >= best_acc:
                    best_acc = eval_acc
                    torch.save({'epoch': epoch,
                                'model': model.state_dict()}, save_path + ".best.pt")
                    logger.info("Saved best epoch {0}, best acc {1}".format(epoch, best_acc)) 
Example #7
Source File: model_setup.py    From bert_on_stilts with Apache License 2.0    5 votes
def create_optimizer(model, learning_rate, t_total, loss_scale, fp16, warmup_proportion, state_dict):
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight',
        'adapter.down_project.weight', 'adapter.up_project.weight',
    ]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=t_total)

    if state_dict is not None:
        optimizer.load_state_dict(state_dict)
    return optimizer 
Example #8
Source File: train.py    From curriculum with GNU General Public License v3.0    5 votes
def __init__(self, args, model, train_examples, use_gpu):
        self.use_gpu = use_gpu
        self.model = model

        self.epochs = args.epochs
        self.best_f1 = -1
        self.min_loss = 100
        self.save_dir = args.save_dir

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        self.lr = args.lr
        self.warmup_proportion = args.warmup_proportion
        self.t_total = int(train_examples / args.batch_size / 1 * args.epochs)

        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=args.lr,
                                  warmup=args.warmup_proportion,
                                  t_total=self.t_total)

        if self.use_gpu:
            self.loss_func = nn.CrossEntropyLoss(weight=torch.FloatTensor([1.0, args.weight]).cuda())
        else:
            self.loss_func = nn.CrossEntropyLoss(weight=torch.FloatTensor([1.0, args.weight])) 
Example #9
Source File: utils.py    From neutralizing-bias with MIT License    5 votes
def build_optimizer(model, num_train_steps=None):
    global ARGS

    if ARGS.bert_encoder:
        assert num_train_steps

        param_optimizer = list(model.named_parameters())
        param_optimizer = list(filter(lambda name_param: name_param[1].requires_grad, param_optimizer))

        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=5e-5,
                             warmup=0.1,
                             t_total=num_train_steps)

    else:
        params = list(model.parameters())
        params = list(filter(lambda p: p.requires_grad, params))
        optimizer = optim.Adam(params, lr=ARGS.learning_rate)

    return optimizer 
Example #10
Source File: token_classification.py    From nlp-recipes with MIT License    5 votes
def _get_optimizer(self, learning_rate, num_train_optimization_steps, warmup_proportion):
        """
        Initializes the optimizer and configures which parameters weight
        decay is applied to.
        """
        param_optimizer = list(self.model.named_parameters())
        no_decay_params = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        params_weight_decay = 0.01
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay_params)
                ],
                "weight_decay": params_weight_decay,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay_params)],
                "weight_decay": 0.0,
            },
        ]

        if warmup_proportion is None:
            optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate)
        else:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=learning_rate,
                t_total=num_train_optimization_steps,
                warmup=warmup_proportion,
            )

        return optimizer 
Example #11
Source File: train.py    From KernelGAT with MIT License    4 votes
def train_model(model, ori_model, args, trainset_reader, validset_reader):
    save_path = args.outdir + '/model'
    best_accuracy = 0.0
    running_loss = 0.0
    t_total = int(
        trainset_reader.total_num / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    #optimizer = optim.Adam(model.parameters(),
    #                       lr=args.learning_rate)
    global_step = 0
    for epoch in range(int(args.num_train_epochs)):
        model.train()
        optimizer.zero_grad()
        for index, data in enumerate(trainset_reader):
            inputs, lab_tensor = data
            prob = model(inputs)
            loss = F.nll_loss(prob, lab_tensor)
            running_loss += loss.item()
            #if args.gradient_accumulation_steps > 1:
            #    loss = loss / args.gradient_accumulation_steps
            loss.backward()
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                logger.info('Epoch: {0}, Step: {1}, Loss: {2}'.format(epoch, global_step, (running_loss / global_step)))
            if global_step % (args.eval_step * args.gradient_accumulation_steps) == 0:
                logger.info('Start eval!')
                with torch.no_grad():
                    dev_accuracy = eval_model(model, validset_reader)
                    logger.info('Dev total acc: {0}'.format(dev_accuracy))
                    if dev_accuracy > best_accuracy:
                        best_accuracy = dev_accuracy

                        torch.save({'epoch': epoch,
                                    'model': ori_model.state_dict(),
                                    'best_accuracy': best_accuracy}, save_path + ".best.pt")
                        logger.info("Saved best epoch {0}, best accuracy {1}".format(epoch, best_accuracy)) 
Example #12
Source File: train.py    From KernelGAT with MIT License    4 votes
def train_model(model, args, trainset_reader, validset_reader):
    save_path = args.outdir + '/model'
    best_acc = 0.0
    running_loss = 0.0
    t_total = int(
        trainset_reader.total_num / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    global_step = 0
    crit = nn.MarginRankingLoss(margin=1)
    for epoch in range(int(args.num_train_epochs)):
        optimizer.zero_grad()
        for inp_tensor_pos, msk_tensor_pos, seg_tensor_pos, inp_tensor_neg, msk_tensor_neg, seg_tensor_neg in trainset_reader:
            model.train()
            score_pos = model(inp_tensor_pos, msk_tensor_pos, seg_tensor_pos)
            score_neg = model(inp_tensor_neg, msk_tensor_neg, seg_tensor_neg)
            label = torch.ones(score_pos.size())
            if args.cuda:
                label = label.cuda()
            loss = crit(score_pos, score_neg, Variable(label, requires_grad=False))
            running_loss += loss.item()
            loss.backward()
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                logger.info('Epoch: {0}, Step: {1}, Loss: {2}'.format(epoch, global_step, (running_loss / global_step)))
            if global_step % (args.eval_step * args.gradient_accumulation_steps) == 0:
                logger.info('Start eval!')
                eval_acc = eval_model(model, validset_reader)
                logger.info('Dev acc: {0}'.format(eval_acc))
                if eval_acc >= best_acc:
                    best_acc = eval_acc
                    torch.save({'epoch': epoch,
                                'model': model.state_dict()}, save_path + ".best.pt")
                    logger.info("Saved best epoch {0}, best acc {1}".format(epoch, best_acc)) 
Example #13
Source File: train_eval.py    From Bert-Chinese-Text-Classification-Pytorch with MIT License    4 votes
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last drop in dev-set loss
    flag = False  # whether training has gone a long time without improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # report performance on the training and dev sets every 100 batches
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop training if the dev-set loss has not dropped for more than 1000 batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter) 
Example #14
Source File: sequence_classification_distributed.py    From nlp-recipes with MIT License    4 votes
def create_optimizer(
        self,
        num_train_optimization_steps,
        lr=2e-5,
        fp16_allreduce=False,
        warmup_proportion=None,
    ):

        """
        Method to create a BERT optimizer based on the inputs from the user.

        Args:
            num_train_optimization_steps (int): Number of optimization steps.
            lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
            warmup_proportion (float, optional): Proportion of training to
                perform linear learning rate warmup for, e.g., 0.1 = 10% of
                training. Defaults to None.
            fp16_allreduce (bool, optional): If True, use fp16 compression
                during allreduce.

        Returns:
            pytorch_pretrained_bert.optimization.BertAdam: A BertAdam optimizer with
                user-specified config.

        """
        if self.use_distributed:
            lr = lr * hvd.size()

        if warmup_proportion is None:
            optimizer = BertAdam(self.optimizer_params, lr=lr)
        else:
            optimizer = BertAdam(
                self.optimizer_params,
                lr=lr,
                t_total=num_train_optimization_steps,
                warmup=warmup_proportion,
            )

        if self.use_distributed:
            compression = (
                hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none
            )
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self.model.named_parameters(),
                compression=compression,
            )

        return optimizer