Python apex.optimizers.FP16_Optimizer() Examples
The following are 7 code examples of apex.optimizers.FP16_Optimizer(). Each example comes from an open-source project; the source file, project, and license are noted above each example. You may also want to check out all available functions and classes of the apex.optimizers module.
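Before the examples, here is a minimal sketch of how FP16_Optimizer is typically constructed and driven in a training loop. It mirrors the construction pattern in Examples #4 and #7 below; the model, data loader, and loss computation are placeholders, and the training-loop methods (zero_grad, backward(loss), step) follow the conventional FP16_Optimizer interface, so check them against the apex version you have installed.

import torch
from apex.optimizers import FP16_Optimizer, FusedAdam

model = build_model().cuda().half()      # `build_model` is a hypothetical helper returning an nn.Module
inner = FusedAdam(model.parameters(), lr=1e-4,
                  bias_correction=False, max_grad_norm=1.0)
optimizer = FP16_Optimizer(inner, dynamic_loss_scale=True)

for batch, target in loader:             # `loader` is a placeholder DataLoader
    optimizer.zero_grad()
    loss = compute_loss(model(batch), target)   # placeholder loss computation
    optimizer.backward(loss)             # scales the loss, then backpropagates
    optimizer.step()                     # unscales grads and updates the fp32 master weights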
Example #1
Source File: optimization_fp16.py From unilm with MIT License
def state_dict(self):
    """
    Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
    This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
    of the contained Pytorch optimizer.

    Example::

        checkpoint = {}
        checkpoint['model'] = model.state_dict()
        checkpoint['optimizer'] = optimizer.state_dict()
        torch.save(checkpoint, "saved.pth")
    """
    state_dict = {}
    state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
    state_dict['cur_scale'] = self.cur_scale
    state_dict['cur_iter'] = self.cur_iter
    if state_dict['dynamic_loss_scale']:
        state_dict['last_overflow_iter'] = self.last_overflow_iter
        state_dict['scale_factor'] = self.scale_factor
        state_dict['scale_window'] = self.scale_window
    state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
    state_dict['fp32_groups_flat'] = self.fp32_groups_flat
    return state_dict
Example #2
Source File: optimization_fp16.py From VLP with Apache License 2.0
def state_dict(self):
    """
    Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
    This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
    of the contained Pytorch optimizer.

    Example::

        checkpoint = {}
        checkpoint['model'] = model.state_dict()
        checkpoint['optimizer'] = optimizer.state_dict()
        torch.save(checkpoint, "saved.pth")
    """
    state_dict = {}
    state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
    state_dict['cur_scale'] = self.cur_scale
    state_dict['cur_iter'] = self.cur_iter
    if state_dict['dynamic_loss_scale']:
        state_dict['last_overflow_iter'] = self.last_overflow_iter
        state_dict['scale_factor'] = self.scale_factor
        state_dict['scale_window'] = self.scale_window
    state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
    state_dict['fp32_groups_flat'] = self.fp32_groups_flat
    return state_dict
Example #3
Source File: trainer.py From metal with Apache License 2.0
def _set_lr_scheduler(self, model):
    lr_scheduler = self.config["lr_scheduler"]
    lr_scheduler_config = self.config["lr_scheduler_config"]

    # Create warmup scheduler for first warmup_steps warmup_units if applicable
    self._set_warmup_scheduler(model)

    optimizer_to_config = self.optimizer
    # If using half precision, configure the underlying optimizer of FP16_Optimizer
    if model.config["fp16"]:
        optimizer_to_config = self.optimizer.optimizer

    # Create regular lr scheduler for use after warmup
    if lr_scheduler is None:
        lr_scheduler = None
    else:
        lr_scheduler_config = self.config["lr_scheduler_config"]
        if lr_scheduler == "linear":
            total_steps = self.batches_per_epoch * self.config["n_epochs"]
            cooldown_steps = total_steps - self.warmup_steps
            linear_cooldown_func = lambda x: (cooldown_steps - x) / cooldown_steps
            lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
                optimizer_to_config, linear_cooldown_func
            )
        elif lr_scheduler == "exponential":
            lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
                optimizer_to_config, **lr_scheduler_config["exponential_config"]
            )
        elif lr_scheduler == "reduce_on_plateau":
            lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer_to_config,
                min_lr=lr_scheduler_config["min_lr"],
                **lr_scheduler_config["plateau_config"],
            )
        else:
            raise ValueError(
                f"Did not recognize lr_scheduler option '{lr_scheduler}'"
            )
    self.lr_scheduler = lr_scheduler
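A detail worth noting in Example #3: FP16_Optimizer keeps the wrapped optimizer in its .optimizer attribute (the same attribute the state_dict examples above call into), so learning-rate schedulers must be pointed at that inner optimizer rather than the wrapper. A minimal sketch of the same idea, where `optimizer` and the scheduler choice are placeholder assumptions:

import torch
from apex.optimizers import FP16_Optimizer

# When training in fp16, drive the scheduler with the inner optimizer that
# FP16_Optimizer wraps, not with the wrapper itself.
inner = optimizer.optimizer if isinstance(optimizer, FP16_Optimizer) else optimizer
scheduler = torch.optim.lr_scheduler.ExponentialLR(inner, gamma=0.95)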
Example #4
Source File: model_setup.py From bert_on_stilts with Apache License 2.0
def create_optimizer(model, learning_rate, t_total, loss_scale, fp16,
                     warmup_proportion, state_dict):
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight',
        'adapter.down_project.weight', 'adapter.up_project.weight',
    ]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=t_total)
    if state_dict is not None:
        optimizer.load_state_dict(state_dict)
    return optimizer
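A hypothetical call to the helper above, just to show how its arguments line up (the values are illustrative, not from the original project):

optimizer = create_optimizer(
    model=model,                 # an already-constructed (and .half()'d, if fp16) model
    learning_rate=2e-5,
    t_total=10000,               # total number of optimization steps
    loss_scale=0,                # 0 selects dynamic loss scaling in the fp16 branch
    fp16=True,
    warmup_proportion=0.1,
    state_dict=None,             # or a previously saved optimizer state_dict
)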
Example #5
Source File: optimization_fp16.py From unilm with MIT License
def load_state_dict(self, state_dict):
    """
    Loads a state_dict created by an earlier call to state_dict().
    If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
    whose parameters in turn came from ``model``, it is expected that the user
    will call ``model.load_state_dict()`` before
    ``fp16_optimizer_instance.load_state_dict()`` is called.

    Example::

        model = torch.nn.Linear(D_in, D_out).cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
        ...
        checkpoint = torch.load("saved.pth")
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    """
    # I think it should actually be ok to reload the optimizer before the model.
    self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
    self.cur_scale = state_dict['cur_scale']
    self.cur_iter = state_dict['cur_iter']
    if state_dict['dynamic_loss_scale']:
        self.last_overflow_iter = state_dict['last_overflow_iter']
        self.scale_factor = state_dict['scale_factor']
        self.scale_window = state_dict['scale_window']
    self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    # At this point, the optimizer's references to the model's fp32 parameters are up to date.
    # The optimizer's hyperparameters and internal buffers are also up to date.
    # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
    # out of date. There are two options.
    # 1: Refresh the master params from the model's fp16 params.
    #    This requires less storage but incurs precision loss.
    # 2: Save and restore the fp32 master copies separately.
    #    We choose option 2.
    #
    # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
    # of their associated parameters, because it's possible those buffers might not exist yet in
    # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
    # constructed in the same way as the one whose state_dict we are loading, the same master params
    # are guaranteed to exist, so we can just copy_() from the saved master params.
    for current, saved in zip(self.fp32_groups_flat, state_dict['fp32_groups_flat']):
        current.data.copy_(saved.data)
Example #6
Source File: optimization_fp16.py From VLP with Apache License 2.0
def load_state_dict(self, state_dict):
    """
    Loads a state_dict created by an earlier call to state_dict().
    If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
    whose parameters in turn came from ``model``, it is expected that the user
    will call ``model.load_state_dict()`` before
    ``fp16_optimizer_instance.load_state_dict()`` is called.

    Example::

        model = torch.nn.Linear(D_in, D_out).cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
        ...
        checkpoint = torch.load("saved.pth")
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    """
    # I think it should actually be ok to reload the optimizer before the model.
    self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
    self.cur_scale = state_dict['cur_scale']
    self.cur_iter = state_dict['cur_iter']
    if state_dict['dynamic_loss_scale']:
        self.last_overflow_iter = state_dict['last_overflow_iter']
        self.scale_factor = state_dict['scale_factor']
        self.scale_window = state_dict['scale_window']
    self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    # At this point, the optimizer's references to the model's fp32 parameters are up to date.
    # The optimizer's hyperparameters and internal buffers are also up to date.
    # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
    # out of date. There are two options.
    # 1: Refresh the master params from the model's fp16 params.
    #    This requires less storage but incurs precision loss.
    # 2: Save and restore the fp32 master copies separately.
    #    We choose option 2.
    #
    # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
    # of their associated parameters, because it's possible those buffers might not exist yet in
    # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
    # constructed in the same way as the one whose state_dict we are loading, the same master params
    # are guaranteed to exist, so we can just copy_() from the saved master params.
    for current, saved in zip(self.fp32_groups_flat, state_dict['fp32_groups_flat']):
        current.data.copy_(saved.data)
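Putting the two halves together, a checkpoint round-trip with this FP16_Optimizer looks roughly like the snippets embedded in the docstrings above (the model, optimizer, and filename are placeholders):

# Save
checkpoint = {}
checkpoint['model'] = model.state_dict()
checkpoint['optimizer'] = optimizer.state_dict()   # the FP16_Optimizer state_dict from Examples #1/#2
torch.save(checkpoint, "saved.pth")

# Restore: load the model weights first, then the optimizer state
checkpoint = torch.load("saved.pth")
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])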
Example #7
Source File: runner.py From Self-Supervised-Speech-Pretraining-and-Representation-Learning with MIT License
def set_model(self):
    print('[Runner] - Initializing Transformer model...')

    # build the Transformer model with speech prediction head
    model_config = TransformerConfig(self.config)
    self.dr = model_config.downsample_rate
    self.hidden_size = model_config.hidden_size
    self.model = TransformerForMaskedAcousticModel(model_config, self.input_dim, self.output_dim).to(self.device)
    self.model.train()

    if self.args.multi_gpu:
        self.model = torch.nn.DataParallel(self.model)
        print('[Runner] - Multi-GPU training Enabled: ' + str(torch.cuda.device_count()))
    print('[Runner] - Number of parameters: ' + str(sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

    # Setup optimizer
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    if self.apex:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=self.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if self.config['optimizer']['loss_scale'] == 0:
            self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            self.optimizer = FP16_Optimizer(optimizer, static_loss_scale=self.config['optimizer']['loss_scale'])
        self.warmup_linear = WarmupLinearSchedule(warmup=self.warmup_proportion,
                                                  t_total=self.total_steps)
    else:
        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=self.learning_rate,
                                  warmup=self.warmup_proportion,
                                  t_total=self.total_steps)