Python apex.optimizers.FusedAdam() Examples
The following are 12 code examples of apex.optimizers.FusedAdam(), drawn from open source projects. The header above each example names the original project, its source file, and its license.
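Before the examples, a minimal usage sketch may help orient the reader. This assumes apex is installed and a CUDA device is available; the model, data, and hyperparameters are placeholders and are not taken from any of the projects below.

import torch
from apex.optimizers import FusedAdam

# Placeholder model; FusedAdam expects its parameters to live on the GPU.
model = torch.nn.Linear(128, 10).cuda()
optimizer = FusedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.01)

inputs = torch.randn(32, 128, device="cuda")
targets = torch.randint(0, 10, (32,), device="cuda")
criterion = torch.nn.CrossEntropyLoss()

# One fused optimization step: forward, backward, update.
loss = criterion(model(inputs), targets)
optimizer.zero_grad()
loss.backward()
optimizer.step()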
Example #1
Source File: fused_adam.py From fairseq with MIT License
def get_fused_adam_class():
    """
    Look for the FusedAdam optimizer from apex. We first try to load the
    "contrib" interface, which is a bit faster than the main interface,
    but is technically deprecated.
    """
    try:
        # The "deprecated" interface in recent versions of apex is a bit
        # faster than the main interface, since we don't use the apex
        # optimizer. This can be installed by passing the
        # `--deprecated_fused_adam` option when building apex.
        global fused_adam_cuda
        import importlib
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
        return FusedAdamV1
    except ImportError:
        try:
            # fallback to the newer interface
            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
            from apex.multi_tensor_apply import multi_tensor_applier
            if multi_tensor_applier.available:
                return FusedAdamV2
        except ImportError:
            pass
    return None
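As a usage note, a caller might select and instantiate the optimizer as follows. This is a hypothetical sketch: the model and learning rate are placeholders, and the fallback to torch.optim.Adam is illustrative rather than fairseq's actual behavior.

import torch

model = torch.nn.Linear(1024, 1024).cuda()

fused_adam_cls = get_fused_adam_class()
if fused_adam_cls is not None:
    # Either FusedAdamV1 or FusedAdamV2, depending on which apex interface is installed.
    optimizer = fused_adam_cls(model.parameters(), lr=1e-3)
else:
    # apex is not available; fall back to the standard PyTorch Adam.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)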
Example #2
Source File: fused_adam.py From fairseq with MIT License
def __init__(self, params, lr=1e-3, bias_correction=True,
             betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False,
             weight_decay=0., max_grad_norm=0., amsgrad=False):
    global fused_adam_cuda
    import importlib
    fused_adam_cuda = importlib.import_module("fused_adam_cuda")

    if amsgrad:
        raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
    defaults = {
        'lr': lr,
        'bias_correction': bias_correction,
        'betas': betas,
        'eps': eps,
        'weight_decay': weight_decay,
        'max_grad_norm': max_grad_norm,
    }
    super().__init__(params, defaults)
    self.eps_mode = 0 if eps_inside_sqrt else 1
Example #3
Source File: fused_adam.py From attn2d with MIT License
def get_fused_adam_class():
    """
    Look for the FusedAdam optimizer from apex. We first try to load the
    "contrib" interface, which is a bit faster than the main interface,
    but is technically deprecated.
    """
    try:
        # The "deprecated" interface in recent versions of apex is a bit
        # faster than the main interface, since we don't use the apex
        # optimizer. This can be installed by passing the
        # `--deprecated_fused_adam` option when building apex.
        global fused_adam_cuda
        import importlib
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
        return FusedAdamV1
    except ImportError:
        try:
            # fallback to the newer interface
            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
            from apex.multi_tensor_apply import multi_tensor_applier
            if multi_tensor_applier.available:
                return FusedAdamV2
        except ImportError:
            pass
    return None
Example #4
Source File: fused_adam.py From attn2d with MIT License
def __init__(self, params, lr=1e-3, bias_correction=True,
             betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False,
             weight_decay=0., max_grad_norm=0., amsgrad=False):
    global fused_adam_cuda
    import importlib
    fused_adam_cuda = importlib.import_module("fused_adam_cuda")

    if amsgrad:
        raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
    defaults = {
        'lr': lr,
        'bias_correction': bias_correction,
        'betas': betas,
        'eps': eps,
        'weight_decay': weight_decay,
        'max_grad_norm': max_grad_norm,
    }
    super().__init__(params, defaults)
    self.eps_mode = 0 if eps_inside_sqrt else 1
Example #5
Source File: checkpointing_test_part2.py From apex with BSD 3-Clause "New" or "Revised" License
def main(step, args, model_state_dict, optimizer_state_dict):
    #
    # PART2
    #

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity,
                               whitelist=args.whitelist,
                               allow_recompute_mask=args.allow_recompute_mask)
    ASP.init_optimizer_for_pruning(optimizer)

    torch.manual_seed(args.seed2)
    model.load_state_dict(model_state_dict)
    optimizer.load_state_dict(optimizer_state_dict)

    print("Model sparsity is %s" % ("enabled" if ASP.sparsity_is_enabled() else "disabled"))

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)
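The apex sparsity (ASP) examples here and below call build_model and train_loop helpers that are defined elsewhere in the test scripts and are not shown on this page. A hypothetical minimal train_loop compatible with these call sites might look like the following; the attribute names on args are assumptions, not the actual script's arguments.

import torch

def train_loop(args, model, optimizer, step, num_steps):
    # A toy regression objective; the real test scripts define their own data and loss.
    criterion = torch.nn.MSELoss().cuda()
    for _ in range(num_steps):
        inputs = torch.randn(args.batch_size, args.input_features, device="cuda")
        targets = torch.randn(args.batch_size, args.output_features, device="cuda")
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        step += 1
    return step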
Example #6
Source File: model_setup.py From bert_on_stilts with Apache License 2.0
def create_optimizer(model, learning_rate, t_total, loss_scale, fp16,
                     warmup_proportion, state_dict):
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight',
        'adapter.down_project.weight', 'adapter.up_project.weight',
    ]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=t_total)
    if state_dict is not None:
        optimizer.load_state_dict(state_dict)
    return optimizer
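A hedged sketch of how this helper might be called; the model and hyperparameters are placeholders (BertAdam is the project's fallback optimizer when fp16 is off).

optimizer = create_optimizer(
    model=model,                # a BERT-style model exposing named_parameters()
    learning_rate=2e-5,
    t_total=num_train_steps,    # total number of optimization steps
    loss_scale=0,               # 0 selects dynamic loss scaling in fp16 mode
    fp16=True,
    warmup_proportion=0.1,
    state_dict=None,            # or a previously saved optimizer state dict
)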
Example #7
Source File: __init__.py From pytorch-tools with MIT License
def optimizer_from_name(optim_name):
    optim_name = optim_name.lower()
    if optim_name == "sgd":
        return optim.SGD
    elif optim_name == "sgdw":
        return SGDW
    elif optim_name == "adam":
        return partial(optim.Adam, eps=2e-5)
    elif optim_name == "adamw":
        return partial(AdamW_my, eps=2e-5)
    elif optim_name == "adamw_gc":
        # in this implementation eps is inside sqrt so it can be smaller
        return partial(AdamW_my, center=True, eps=1e-7)
    elif optim_name == "rmsprop":
        # in this implementation eps is inside sqrt so it can be smaller
        return partial(RMSprop, eps=1e-7)
    elif optim_name == "radam":
        return partial(RAdam, eps=2e-5)
    elif optim_name in ["fused_sgd", "fusedsgd"]:
        return FusedSGD
    elif optim_name in ["fused_adam", "fusedadam"]:
        return partial(FusedAdam, eps=2e-5)
    elif optim_name in ["fused_novograd", "fusednovograd", "novograd"]:
        return partial(FusedNovoGrad, eps=2e-5)
    else:
        raise ValueError(f"Optimizer {optim_name} not found")
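A brief usage sketch, assuming apex and pytorch-tools are installed; the model and hyperparameters are placeholders.

import torch

model = torch.nn.Linear(64, 8).cuda()

# "fused_adam" returns a partial of apex's FusedAdam with eps=2e-5 already bound;
# the remaining keyword arguments are supplied at construction time.
optimizer_cls = optimizer_from_name("fused_adam")
optimizer = optimizer_cls(model.parameters(), lr=3e-4, weight_decay=1e-2)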
Example #8
Source File: toy_problem.py From apex with BSD 3-Clause "New" or "Revised" License
def main(args):
    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    # only prune linear layers, even though we also support conv1d, conv2d and conv3d
    ASP.init_model_for_pruning(model, "m4n2_1d", whitelist=[torch.nn.Linear],
                               allow_recompute_mask=True)
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    # recompute sparse masks
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)

    # turn off sparsity
    print("SPARSE :: ", one_ll)
    ASP.restore_pruned_weights()

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps_2)
Example #9
Source File: checkpointing_test_part1.py From apex with BSD 3-Clause "New" or "Revised" License
def main(args):
    #
    # PART1
    #

    torch.manual_seed(args.seed)

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity,
                               whitelist=args.whitelist,
                               allow_recompute_mask=args.allow_recompute_mask)
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.enable_sparsity()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    torch.save({
        'step': step,
        'verbosity': args.verbosity,
        'seed2': args.seed2,
        'pattern': args.pattern,
        'whitelist': args.whitelist,
        'allow_recompute_mask': args.allow_recompute_mask,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, args.checkpoint_path)
Example #10
Source File: checkpointing_test_reference.py From apex with BSD 3-Clause "New" or "Revised" License
def main(args):
    #
    # PART1
    #

    torch.manual_seed(args.seed)

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(model, args.pattern, whitelist=args.whitelist,
                               allow_recompute_mask=args.allow_recompute_mask)
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.enable_sparsity()

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    #
    # PART 2
    #
    torch.manual_seed(args.seed2)

    # train for a few steps with sparse weights
    print("SPARSE :: ", one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)
Example #11
Source File: imagenet.py From apex with BSD 3-Clause "New" or "Revised" License
def main():
    args = parseArgs()

    pyprof.nvtx.init()
    # pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

    N = args.b
    C = 3
    H = d[args.m]['H']
    W = d[args.m]['W']
    opts = d[args.m]['opts']
    classes = 1000

    net = getattr(models, args.m)
    net = net(**opts).cuda().half()
    net.train()

    x = torch.rand(N, C, H, W).cuda().half()
    target = torch.empty(N, dtype=torch.long).random_(classes).cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    if (args.o == "sgd"):
        optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    elif (args.o == "adam"):
        optimizer = FusedAdam(net.parameters())
    else:
        assert False

    # Warm up without profiler
    for i in range(2):
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    with torch.autograd.profiler.emit_nvtx():
        profiler.start()
        output = net(x)
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        profiler.stop()
Example #12
Source File: runner.py From Self-Supervised-Speech-Pretraining-and-Representation-Learning with MIT License
def set_model(self):
    print('[Runner] - Initializing Transformer model...')

    # build the Transformer model with speech prediction head
    model_config = TransformerConfig(self.config)
    self.dr = model_config.downsample_rate
    self.hidden_size = model_config.hidden_size

    self.model = TransformerForMaskedAcousticModel(model_config, self.input_dim, self.output_dim).to(self.device)
    self.model.train()

    if self.args.multi_gpu:
        self.model = torch.nn.DataParallel(self.model)
        print('[Runner] - Multi-GPU training Enabled: ' + str(torch.cuda.device_count()))
    print('[Runner] - Number of parameters: ' + str(sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

    # Setup optimizer
    param_optimizer = list(self.model.named_parameters())

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    if self.apex:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=self.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if self.config['optimizer']['loss_scale'] == 0:
            self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            self.optimizer = FP16_Optimizer(optimizer, static_loss_scale=self.config['optimizer']['loss_scale'])
        self.warmup_linear = WarmupLinearSchedule(warmup=self.warmup_proportion,
                                                  t_total=self.total_steps)
    else:
        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=self.learning_rate,
                                  warmup=self.warmup_proportion,
                                  t_total=self.total_steps)
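Note that the deprecated FP16_Optimizer wrapper used here changes the update step slightly: the backward pass is routed through the wrapper so it can apply loss scaling. A hedged sketch of one optimization step under this setup follows; the loss is assumed to come from a forward pass of self.model on a batch.

if self.apex:
    # FP16_Optimizer scales the loss and manages the fp32 master weights.
    self.optimizer.backward(loss)
else:
    loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()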