Python torch.optim.optimizer() Examples
The following are 30 code examples of torch.optim.optimizer(). You can go to the original project or source file by following the link above each example. You may also want to check out all of the available functions and classes of the module torch.optim.
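Before the project-specific wrapper classes below, here is a minimal, self-contained sketch (not taken from any of the listed projects) of the torch.optim interface they all build on: construct an optimizer over a model's parameters, then alternate zero_grad(), backward(), and step() in the training loop.

import torch
import torch.nn as nn
import torch.optim as optim

# Toy model and data, purely for illustration.
model = nn.Linear(10, 1)
inputs = torch.randn(32, 10)
targets = torch.randn(32, 1)

optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999))
loss_fn = nn.MSELoss()

for _ in range(100):
    optimizer.zero_grad()                  # clear gradients from the previous step
    loss = loss_fn(model(inputs), targets)
    loss.backward()                        # populate .grad on every parameter
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # optional gradient clipping
    optimizer.step()                       # apply the parameter update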
Example #1
Source File: optimizer.py From stog with MIT License

def build_optim(opt, model):
    """ Build optimizer """
    optim = Optimizer(
        opt.optim, opt.learning_rate, opt.max_grad_norm,
        lr_decay=opt.learning_rate_decay,
        start_decay_steps=opt.start_decay_steps,
        decay_steps=opt.decay_steps,
        beta1=opt.adam_beta1,
        beta2=opt.adam_beta2,
        adagrad_accum=opt.adagrad_accumulator_init,
        decay_method=opt.decay_method,
        warmup_steps=opt.warmup_steps,
        model_size=opt.encoder_size)

    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optim.set_parameters(parameters)
    return optim
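A typical call site for a builder like this would pass the project's command-line options object; the sketch below uses a hypothetical argparse.Namespace with illustrative values rather than the project's real option parser.

from argparse import Namespace

# Hypothetical options; the real `opt` comes from the project's argument parser.
opt = Namespace(
    optim='adam', learning_rate=2.0, max_grad_norm=0.0,
    learning_rate_decay=0.5, start_decay_steps=None, decay_steps=1000,
    adam_beta1=0.9, adam_beta2=0.998, adagrad_accumulator_init=0.0,
    decay_method='noam', warmup_steps=8000, encoder_size=512)

optimizer = build_optim(opt, model)   # `model` is any torch.nn.Module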
Example #2
Source File: optimizers.py From PreSumm with MIT License

def set_parameters(self, params):
    """Keep the trainable parameters and build the wrapped torch.optim optimizer."""
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            if self.method != 'sparseadam' or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.learning_rate)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group['params']:
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.learning_rate,
                                    betas=self.betas, eps=1e-9)
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
Example #3
Source File: optimizer.py From gtos with MIT License

def build_optim(opt, model):
    """ Build optimizer """
    optim = Optimizer(
        opt.optim, opt.learning_rate, opt.max_grad_norm,
        lr_decay=opt.learning_rate_decay,
        start_decay_steps=opt.start_decay_steps,
        decay_steps=opt.decay_steps,
        beta1=opt.adam_beta1,
        beta2=opt.adam_beta2,
        adagrad_accum=opt.adagrad_accumulator_init,
        decay_method=opt.decay_method,
        warmup_steps=opt.warmup_steps,
        model_size=opt.encoder_size)

    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optim.set_parameters(parameters)
    return optim
Example #4
Source File: optimizers.py From BiSET with MIT License

def set_parameters(self, params):
    """Keep the trainable parameters and build the wrapped torch.optim optimizer."""
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            if self.method != 'sparseadam' or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.learning_rate)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group['params']:
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.learning_rate,
                                    betas=self.betas, eps=1e-9)
    elif self.method == 'sparseadam':
        self.optimizer = MultipleOptimizer(
            [optim.Adam(self.params, lr=self.learning_rate,
                        betas=self.betas, eps=1e-8),
             optim.SparseAdam(self.sparse_params, lr=self.learning_rate,
                              betas=self.betas, eps=1e-8)])
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
Example #5
Source File: optimizers.py From ITDD with MIT License

def param_groups(self):
    param_groups = []
    for optimizer in self.optimizers:
        param_groups.extend(optimizer.param_groups)
    return param_groups
Example #6
Source File: optimizer.py From hiersumm with Apache License 2.0

def step(self):
    """Update the model parameters based on current gradients.

    Optionally, will employ gradient modification or update learning
    rate.
    """
    self._step += 1

    # Decay method used in tensor2tensor.
    if self.decay_method == "noam":
        self._set_rate(
            self.original_lr *
            (self.model_size ** -0.5 *
             min(self._step ** (-0.5),
                 self._step * self.warmup_steps ** (-1.5))))
    else:
        if ((self.start_decay_steps is not None) and (
                self._step >= self.start_decay_steps)):
            self.start_decay = True
        if self.start_decay:
            if ((self._step - self.start_decay_steps)
                    % self.decay_steps == 0):
                self.learning_rate = self.learning_rate * self.lr_decay

    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.learning_rate

    if self.max_grad_norm:
        clip_grad_norm_(self.params, self.max_grad_norm)
    self.optimizer.step()
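The "noam" branch is the inverse-square-root warmup schedule from the Transformer paper: the rate grows linearly for the first warmup_steps updates and then decays as step ** -0.5, scaled by model_size ** -0.5. Written as a standalone helper (names chosen here for illustration), the rate being set is:

def noam_learning_rate(step, original_lr, model_size, warmup_steps):
    # Linear warmup for the first `warmup_steps` updates, then
    # inverse-square-root decay, both scaled by model_size ** -0.5.
    return original_lr * (model_size ** -0.5 *
                          min(step ** -0.5, step * warmup_steps ** -1.5))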
Example #7
Source File: optimizer.py From stog with MIT License

def lr(self):
    if self.method != 'sparseadam':
        return self.optimizer.param_groups[0]['lr']
    else:
        return max(op.param_groups[0]['lr']
                   for op in self.optimizer.optimizers)
Example #8
Source File: optimizer.py From stog with MIT License

def state_dict(self):
    return self.optimizer.state_dict()
Example #9
Source File: optimizer.py From stog with MIT License

def set_parameters(self, params):
    """Keep the trainable parameters and build the wrapped torch.optim optimizer."""
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            if self.method != 'sparseadam' or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.learning_rate)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group['params']:
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.learning_rate,
                                    betas=self.betas, eps=1e-8,
                                    weight_decay=3e-9)
    elif self.method == 'sparseadam':
        self.optimizer = MultipleOptimizer(
            [optim.Adam(self.params, lr=self.learning_rate,
                        betas=self.betas, eps=1e-8),
             optim.SparseAdam(self.sparse_params, lr=self.learning_rate,
                              betas=self.betas, eps=1e-8)])
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
Example #10
Source File: optimizer.py From stog with MIT License

def set_state(self, state_dict):
    """Load the checkpointed state of an optimizer.

    Call this after set_parameters, because
    optim.set_parameters(model.parameters()) overwrites optim.optimizer
    along with the values stored in optim.optimizer.state_dict().
    """
    self.optimizer.load_state_dict(state_dict)
    # Convert back the state values to cuda type if applicable
    for state in self.optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(self.device)
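In a resume-from-checkpoint flow this pairs with set_parameters. The sketch below is illustrative only; the checkpoint keys ('optim', 'optim_state_dict') and file name are assumptions rather than the project's actual layout.

# Illustrative resume flow; checkpoint layout varies by project.
checkpoint = torch.load('model_step_10000.pt', map_location='cpu')
optim = checkpoint['optim']                       # a pickled Optimizer wrapper
parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
optim.set_parameters(parameters)                  # rebuilds optim.optimizer
optim.set_state(checkpoint['optim_state_dict'])   # then restore its state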
Example #11
Source File: optimizer.py From stog with MIT License

def zero_grad(self):
    self.optimizer.zero_grad()
Example #12
Source File: optimizer.py From stog with MIT License

def step(self):
    """Update the model parameters based on current gradients.

    Optionally, will employ gradient modification or update learning
    rate.
    """
    self._step += 1

    # Decay method used in tensor2tensor.
    if self.decay_method == "noam":
        self._set_rate(
            self.original_lr *
            (self.model_size ** (-0.5) *
             min(self._step ** (-0.5),
                 self._step * self.warmup_steps ** (-1.5))))
    # Decay based on start_decay_steps every decay_steps
    else:
        if ((self.start_decay_steps is not None) and (
                self._step >= self.start_decay_steps)):
            self.start_decay = True
        if self.start_decay:
            if ((self._step - self.start_decay_steps)
                    % self.decay_steps == 0):
                self.learning_rate = self.learning_rate * self.lr_decay

    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.learning_rate

    if self.max_grad_norm:
        clip_grad_norm_(self.params, self.max_grad_norm)
    self.optimizer.step()
Example #13
Source File: optimizer.py From hiersumm with Apache License 2.0

def _set_rate(self, learning_rate):
    self.learning_rate = learning_rate
    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.learning_rate
    else:
        for op in self.optimizer.optimizers:
            op.param_groups[0]['lr'] = self.learning_rate
Example #14
Source File: optimizers.py From BiSET with MIT License

def _set_rate(self, learning_rate):
    self.learning_rate = learning_rate
    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.learning_rate
    else:
        for op in self.optimizer.optimizers:
            op.param_groups[0]['lr'] = self.learning_rate
Example #15
Source File: optimizers.py From BiSET with MIT License

def step(self):
    """Update the model parameters based on current gradients.

    Optionally, will employ gradient modification or update learning
    rate.
    """
    self._step += 1

    # Decay method used in tensor2tensor.
    if self.decay_method == "noam":
        self._set_rate(
            self.original_lr *
            (self.model_size ** (-0.5) *
             min(self._step ** (-0.5),
                 self._step * self.warmup_steps ** (-1.5))))
    # Decay based on start_decay_steps every decay_steps
    else:
        if ((self.start_decay_steps is not None) and (
                self._step >= self.start_decay_steps)):
            self.start_decay = True
        if self.start_decay:
            if ((self._step - self.start_decay_steps)
                    % self.decay_steps == 0):
                self.learning_rate = self.learning_rate * self.lr_decay

    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.learning_rate

    if self.max_grad_norm:
        clip_grad_norm_(self.params, self.max_grad_norm)
    self.optimizer.step()
Example #16
Source File: optimizers.py From nlp-recipes with MIT License

def set_parameters(self, params):
    """Keep the trainable parameters and build the wrapped torch.optim optimizer."""
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            if self.method != "sparseadam" or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == "sgd":
        self.optimizer = optim.SGD(self.params, lr=self.learning_rate)
    elif self.method == "adagrad":
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group["params"]:
                self.optimizer.state[p]["sum"] = self.optimizer.state[p][
                    "sum"
                ].fill_(self.adagrad_accum)
    elif self.method == "adadelta":
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == "adam":
        self.optimizer = optim.Adam(
            self.params, lr=self.learning_rate, betas=self.betas, eps=1e-9
        )
    else:
        raise RuntimeError("Invalid optim method: " + self.method)

    self.param_groups = self.optimizer.param_groups
    self.state = self.optimizer.state
Example #17
Source File: optimizers.py From nlp-recipes with MIT License

def _set_rate(self, learning_rate):
    self.learning_rate = learning_rate
    if self.method != "sparseadam":
        self.optimizer.param_groups[0]["lr"] = self.learning_rate
    else:
        for op in self.optimizer.optimizers:
            op.param_groups[0]["lr"] = self.learning_rate
Example #18
Source File: optimizers.py From nlp-recipes with MIT License

def step(self):
    """Update the model parameters based on current gradients.

    Optionally, will employ gradient modification or update learning
    rate.
    """
    self._step += 1

    # Decay method used in tensor2tensor.
    if self.decay_method == "noam":
        self._set_rate(
            self.original_lr
            * min(self._step ** (-0.5),
                  self._step * self.warmup_steps ** (-1.5))
        )
    else:
        if (self.start_decay_steps is not None) and (
            self._step >= self.start_decay_steps
        ):
            self.start_decay = True
        if self.start_decay:
            if (self._step - self.start_decay_steps) % self.decay_steps == 0:
                self.learning_rate = self.learning_rate * self.lr_decay

    if self.method != "sparseadam":
        self.optimizer.param_groups[0]["lr"] = self.learning_rate

    if self.max_grad_norm:
        clip_grad_norm_(self.params, self.max_grad_norm)
    self.optimizer.step()
Example #19
Source File: optimizers.py From nlp-recipes with MIT License

def state_dict(self):
    """Return the state dict of the wrapped torch.optim optimizer."""
    return self.optimizer.state_dict()
Example #20
Source File: optimizers.py From nlp-recipes with MIT License

def zero_grad(self):
    """Zero the gradients of all parameters handled by the wrapped optimizer."""
    self.optimizer.zero_grad()
Example #21
Source File: optimizer.py From gtos with MIT License

def set_parameters(self, params):
    """Keep the trainable parameters and build the wrapped torch.optim optimizer."""
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            if self.method != 'sparseadam' or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.learning_rate)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group['params']:
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.learning_rate,
                                    betas=self.betas, eps=1e-8,
                                    weight_decay=3e-9)
    elif self.method == 'sparseadam':
        self.optimizer = MultipleOptimizer(
            [optim.Adam(self.params, lr=self.learning_rate,
                        betas=self.betas, eps=1e-8),
             optim.SparseAdam(self.sparse_params, lr=self.learning_rate,
                              betas=self.betas, eps=1e-8)])
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
Example #22
Source File: optimizers.py From ITDD with MIT License

def set_parameters(self, model):
    """Build the torch.optim optimizer over the model's trainable parameters."""
    params = [p for p in model.parameters() if p.requires_grad]
    if self.method == 'sgd':
        self.optimizer = optim.SGD(params, lr=self.learning_rate)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(
            params,
            lr=self.learning_rate,
            initial_accumulator_value=self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(params, lr=self.learning_rate)
    elif self.method == 'adafactor':
        self.optimizer = AdaFactor(params, non_constant_decay=True,
                                   enable_factorization=True,
                                   weight_decay=0)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(params, lr=self.learning_rate,
                                    betas=self.betas, eps=1e-9)
    elif self.method == 'sparseadam':
        dense = []
        sparse = []
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            # TODO: Find a better way to check for sparse gradients.
            if 'embed' in name:
                sparse.append(param)
            else:
                dense.append(param)
        self.optimizer = MultipleOptimizer(
            [optim.Adam(dense, lr=self.learning_rate,
                        betas=self.betas, eps=1e-8),
             optim.SparseAdam(sparse, lr=self.learning_rate,
                              betas=self.betas, eps=1e-8)])
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
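The dense/sparse split exists because torch.optim.SparseAdam only accepts parameters that receive sparse gradients, which for these models means embedding tables created with sparse=True. A minimal sketch of that pairing, independent of the wrapper class above:

import torch.nn as nn
import torch.optim as optim

embedding = nn.Embedding(50000, 512, sparse=True)   # produces sparse gradients
projection = nn.Linear(512, 512)                    # produces dense gradients

dense_opt = optim.Adam(projection.parameters(), lr=1e-3)
sparse_opt = optim.SparseAdam(embedding.parameters(), lr=1e-3)

# After loss.backward(), step both, mirroring what MultipleOptimizer does:
for op in (dense_opt, sparse_opt):
    op.step()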
Example #23
Source File: optimizers.py From ITDD with MIT License

def step(self):
    """Update the model parameters based on current gradients.

    Optionally, will employ gradient modification or update learning
    rate.
    """
    self._step += 1

    # Decay method used in tensor2tensor.
    if self.decay_method == "noam":
        lr_scale = (
            self.model_size ** (-0.5) *
            min(self._step ** (-0.5),
                self._step * self.warmup_steps ** (-1.5)))
    # Decay based on start_decay_steps every decay_steps
    elif self.start_decay_steps is not None:
        step = self._step - self.start_decay_steps
        lr_scale = (self.lr_decay ** (
            max(step + self.decay_steps, 0) // self.decay_steps))
    else:
        lr_scale = 1

    self.learning_rate = lr_scale * self.original_lr

    for group in self.optimizer.param_groups:
        if self.method != 'adafactor':
            group['lr'] = self.learning_rate
        if self.max_grad_norm:
            clip_grad_norm_(group['params'], self.max_grad_norm)
    self.optimizer.step()

# Code below is an implementation of https://arxiv.org/pdf/1804.04235.pdf
# inspired but modified from https://github.com/DeadAt0m/adafactor-pytorch
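To make the non-noam branch concrete: with illustrative settings lr_decay=0.5, start_decay_steps=8000 and decay_steps=1000, the exponent max(step + decay_steps, 0) // decay_steps is 1 at training step 8000 and grows by one every decay_steps updates, so the learning rate is halved each interval once decay begins.

# Worked example of the exponential-decay branch (values chosen for illustration).
lr_decay, start_decay_steps, decay_steps, original_lr = 0.5, 8000, 1000, 1.0
for current_step in (8000, 8500, 9000, 10000):
    step = current_step - start_decay_steps
    lr_scale = lr_decay ** (max(step + decay_steps, 0) // decay_steps)
    print(current_step, lr_scale * original_lr)   # 0.5, 0.5, 0.25, 0.125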
Example #24
Source File: optimizers.py From PreSumm with MIT License

def build_optim(model, opt, checkpoint):
    """ Build optimizer """
    saved_optimizer_state_dict = None

    if opt.train_from:
        optim = checkpoint['optim']
        # We need to save a copy of optim.optimizer.state_dict() so we can
        # restore the optimizer state later in this method, since
        # optim.set_parameters(model.parameters()) overwrites optim.optimizer
        # along with the values stored in optim.optimizer.state_dict().
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_steps=opt.start_decay_steps,
            decay_steps=opt.decay_steps,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps)

    optim.set_parameters(model.named_parameters())

    if opt.train_from:
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if use_gpu(opt):
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
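On the saving side, a checkpoint this builder can resume from needs the wrapper object stored under the 'optim' key it reads; everything else in the sketch below (the other keys and the file name) is illustrative rather than the project's exact format.

# Illustrative checkpoint writing; `optimizer` is the wrapper returned above.
checkpoint = {
    'model': model.state_dict(),
    'opt': opt,
    'optim': optimizer,   # build_optim reads checkpoint['optim'] when opt.train_from is set
}
torch.save(checkpoint, 'model_step_10000.pt')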
Example #25
Source File: optimizers.py From PreSumm with MIT License

def _set_rate(self, learning_rate):
    self.learning_rate = learning_rate
    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.learning_rate
    else:
        for op in self.optimizer.optimizers:
            op.param_groups[0]['lr'] = self.learning_rate
Example #26
Source File: optimizers.py From PreSumm with MIT License

def step(self):
    """Update the model parameters based on current gradients.

    Optionally, will employ gradient modification or update learning
    rate.
    """
    self._step += 1

    # Decay method used in tensor2tensor.
    if self.decay_method == "noam":
        self._set_rate(
            self.original_lr *
            min(self._step ** (-0.5),
                self._step * self.warmup_steps ** (-1.5)))
    else:
        if ((self.start_decay_steps is not None) and (
                self._step >= self.start_decay_steps)):
            self.start_decay = True
        if self.start_decay:
            if ((self._step - self.start_decay_steps)
                    % self.decay_steps == 0):
                self.learning_rate = self.learning_rate * self.lr_decay

    if self.method != 'sparseadam':
        self.optimizer.param_groups[0]['lr'] = self.learning_rate

    if self.max_grad_norm:
        clip_grad_norm_(self.params, self.max_grad_norm)
    self.optimizer.step()
Example #27
Source File: optimizer.py From gtos with MIT License

def lr(self):
    if self.method != 'sparseadam':
        return self.optimizer.param_groups[0]['lr']
    else:
        return max(op.param_groups[0]['lr']
                   for op in self.optimizer.optimizers)
Example #28
Source File: optimizer.py From gtos with MIT License

def state_dict(self):
    return self.optimizer.state_dict()
Example #29
Source File: optimizer.py From hiersumm with Apache License 2.0

def set_parameters(self, params):
    """Keep the trainable parameters and build the wrapped torch.optim optimizer."""
    self.params = []
    self.sparse_params = []
    for k, p in params:
        if p.requires_grad:
            if self.method != 'sparseadam' or "embed" not in k:
                self.params.append(p)
            else:
                self.sparse_params.append(p)
    if self.method == 'sgd':
        self.optimizer = optim.SGD(self.params, lr=self.learning_rate)
    elif self.method == 'adagrad':
        self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
        for group in self.optimizer.param_groups:
            for p in group['params']:
                self.optimizer.state[p]['sum'] = self.optimizer\
                    .state[p]['sum'].fill_(self.adagrad_accum)
    elif self.method == 'adadelta':
        self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
    elif self.method == 'adam':
        self.optimizer = optim.Adam(self.params, lr=self.learning_rate,
                                    betas=self.betas, eps=1e-9)
    elif self.method == 'sparseadam':
        self.optimizer = MultipleOptimizer(
            [optim.Adam(self.params, lr=self.learning_rate,
                        betas=self.betas, eps=1e-8),
             optim.SparseAdam(self.sparse_params, lr=self.learning_rate,
                              betas=self.betas, eps=1e-8)])
    else:
        raise RuntimeError("Invalid optim method: " + self.method)
Example #30
Source File: optimizer.py From gtos with MIT License

def set_state(self, state_dict):
    """Load the checkpointed state of an optimizer.

    Call this after set_parameters, because
    optim.set_parameters(model.parameters()) overwrites optim.optimizer
    along with the values stored in optim.optimizer.state_dict().
    """
    self.optimizer.load_state_dict(state_dict)
    # Convert back the state values to cuda type if applicable
    for state in self.optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(self.device)