Python transformers.AdamW() Examples
The following are 5 code examples of transformers.AdamW(), each drawn from an open-source project; the source file and license are listed above each example.
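Before the project examples, here is a minimal self-contained sketch of the usual pattern: build AdamW over a model's parameters and step it alongside a learning-rate schedule. The tiny linear model, the step counts, and the hyperparameter values are placeholders, not taken from any of the projects below, and the snippet assumes a transformers version that still ships AdamW and get_linear_schedule_with_warmup.

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)  # placeholder model, stands in for a transformer
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000)

for step in range(1000):
    loss = model(torch.randn(4, 10)).pow(2).mean()  # dummy loss for illustration
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()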
Example #1
Source File: common.py From nlp-recipes with MIT License
from transformers import AdamW


def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon
    )
    return optimizer
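A hedged usage sketch for the helper above; the checkpoint name and hyperparameter values are illustrative assumptions, not part of the nlp-recipes example.

from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")  # assumed checkpoint
optimizer = get_default_optimizer(
    model, weight_decay=0.01, learning_rate=5e-5, adam_epsilon=1e-8)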
Example #2
Source File: train.py From dialogue-generation with MIT License
from transformers import AdamW


def create_optimizer(args, parameters):
    """Creates an adam optimizer."""
    optimizer = AdamW(
        lr=args.lr,
        params=parameters,
        weight_decay=0.01)

    return optimizer  # implementation is from DialoGPT repo
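Unlike Example #1, this version applies weight_decay=0.01 uniformly, including to biases and LayerNorm weights. A possible call site is sketched below, assuming args only needs an lr attribute here; the Namespace and the placeholder model are illustrative, not from the dialogue-generation project.

import argparse
import torch

args = argparse.Namespace(lr=5e-5)  # illustrative stand-in for parsed CLI arguments
model = torch.nn.Linear(10, 2)      # placeholder model
optimizer = create_optimizer(args, model.parameters())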
Example #3
Source File: train.py From kaggle-google-quest with MIT License
from transformers import AdamW


def get_optimizer(model, lr, weight_decay, model_type='siamese'):
    param_groups = get_optimizer_param_groups(model.head, lr, weight_decay)
    if model_type == 'siamese':
        param_groups += get_optimizer_param_groups(model.transformer, lr / 100, weight_decay)
    elif model_type == 'double':
        param_groups += get_optimizer_param_groups(model.q_transformer, lr / 100, weight_decay)
        param_groups += get_optimizer_param_groups(model.a_transformer, lr / 100, weight_decay)
    return AdamW(param_groups)
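The helper get_optimizer_param_groups is defined elsewhere in the kaggle-google-quest repository and is not reproduced on this page. As a hypothetical stand-in (not the project's actual implementation), such a helper commonly splits a module's parameters into decay and no-decay groups with a per-group learning rate:

def get_optimizer_param_groups_sketch(module, lr, weight_decay):
    # Hypothetical stand-in, not the kaggle-google-quest implementation.
    no_decay = ["bias", "LayerNorm.weight"]
    return [
        {"params": [p for n, p in module.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "lr": lr, "weight_decay": weight_decay},
        {"params": [p for n, p in module.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "lr": lr, "weight_decay": 0.0},
    ]

The per-group "lr" keys are what let get_optimizer train the transformer body at lr / 100 while the head uses the full lr.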
Example #4
Source File: test_transformers.py From docker-python with Apache License 2.0
def test_adam_w(self):
    w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
    target = torch.tensor([0.4, 0.2, -0.5])
    criterion = torch.nn.MSELoss()
    # No warmup, constant schedule, no gradient clipping
    optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
    for _ in range(100):
        loss = criterion(w, target)
        loss.backward()
        optimizer.step()
        w.grad.detach_()  # No zero_grad() function on simple tensors. we do it ourselves.
        w.grad.zero_()
    self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
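Note that assertListAlmostEqual is not a standard unittest assertion; it is presumably a small helper defined on the test class that compares the two lists element-wise within the given tolerance, as the analogous helper in the Hugging Face optimization tests does.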
Example #5
Source File: run_seq2seq.py From unilm with MIT License
import torch
from transformers import AdamW


def prepare_for_training(args, model, checkpoint_state_dict, amp):
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

    if amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        if checkpoint_state_dict:
            amp.load_state_dict(checkpoint_state_dict['amp'])

    if checkpoint_state_dict:
        optimizer.load_state_dict(checkpoint_state_dict['optimizer'])
        model.load_state_dict(checkpoint_state_dict['model'])

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank, find_unused_parameters=True)

    return model, optimizer
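Note the ordering here: amp.initialize runs before the model is wrapped in DataParallel or DistributedDataParallel, and the amp state is restored from the checkpoint only after initialization, which matches NVIDIA apex's recommendations. Newer code would typically use torch.cuda.amp instead, which removes the need for amp.initialize.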