Python apex.amp.master_params() Examples
The following are 15 code examples of apex.amp.master_params(), drawn from open-source projects. Each example lists its original project and source file, so you can follow it back to the full context. You may also want to check out the other available functions and classes of the apex.amp module.
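Before diving into the examples: amp.master_params(optimizer) iterates over the FP32 master copies of the model parameters that apex maintains when mixed-precision training is enabled. Operations such as gradient clipping must run on these master gradients rather than on model.parameters(), which is the common thread in the examples below. As a minimal sketch of where the call fits, assuming model, optimizer, criterion, and loader already exist:

import torch
from apex import amp

# One-time setup: "O1" patches selected ops to FP16 while apex keeps
# FP32 master weights for the optimizer.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

for data, target in loader:
    optimizer.zero_grad()
    loss = criterion(model(data), target)
    # Scale the loss so small FP16 gradients do not underflow to zero.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    # Clip the FP32 master gradients, not model.parameters(), per the
    # apex gradient-clipping recipe linked in Example #1.
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
    optimizer.step()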
Example #1
Source File: trainer.py From allennlp with Apache License 2.0 | 6 votes |
def rescale_gradients(self) -> float:
    """
    Performs gradient rescaling. Is a no-op if gradient rescaling is not enabled.

    Returns the norm of the gradients.
    """
    if self._opt_level is not None:
        # See: https://nvidia.github.io/apex/advanced.html#gradient-clipping
        parameters_to_clip = [
            p for p in amp.master_params(self.optimizer) if p.grad is not None
        ]
    else:
        parameters_to_clip = [p for p in self.model.parameters() if p.grad is not None]

    if self._grad_norm:
        return clip_grad_norm_(parameters_to_clip, self._grad_norm)
    else:
        return torch.norm(
            torch.stack([torch.norm(p.grad.detach()) for p in parameters_to_clip])
        )
Example #2
Source File: wrapper.py From pytorch-tools with MIT License | 6 votes |
def _make_step(self):
    data, target = self.state.input
    output = self.state.model(data)
    self.state.output = output
    loss = self.state.criterion(output, target)
    if self.state.is_train:
        with amp.scale_loss(loss / self.accumulate_steps, self.state.optimizer) as scaled_loss:
            scaled_loss.backward()
        if self.gradient_clip_val > 0:
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.state.optimizer), self.gradient_clip_val)
        if self.state.step % self.accumulate_steps == 0:
            self.state.optimizer.step()
            self.state.optimizer.zero_grad()
        torch.cuda.synchronize()

    # update metrics
    self.state.loss_meter.update(to_numpy(loss))
    with torch.no_grad():
        for metric, meter in zip(self.state.metrics, self.state.metric_meters):
            meter.update(to_numpy(metric(output, target).squeeze()))
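Note the design choice here: the loss is divided by self.accumulate_steps before amp.scale_loss, so gradients summed over several micro-batches end up averaged, and optimizer.step() only fires once per accumulation window.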
Example #3
Source File: training.py From tape with BSD 3-Clause "New" or "Revised" License | 6 votes |
def save_state(self, save_directory: typing.Union[str, Path], epoch_id: int):
    save_directory = Path(save_directory)
    if not save_directory.exists():
        save_directory.mkdir()
    else:
        assert save_directory.is_dir(), "Save path should be a directory"
    model_to_save = getattr(self.model, 'module', self.model)
    model_to_save.save_pretrained(save_directory)
    optimizer_state: typing.Dict[str, typing.Any] = {
        'optimizer': self.optimizer.state_dict(),
        'scheduler': self.scheduler.state_dict(),
        'epoch': epoch_id}
    if APEX_FOUND:
        optimizer_state['master params'] = list(amp.master_params(self.optimizer))
        try:
            optimizer_state['amp'] = amp.state_dict()
        except AttributeError:
            pass
    torch.save(optimizer_state, save_directory / 'checkpoint.bin')
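Saving list(amp.master_params(self.optimizer)) alongside amp.state_dict() preserves both the FP32 master weights and the loss-scaler state; Example #9 below shows the matching restore logic from the same project.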
Example #4
Source File: train.py From FARM with Apache License 2.0 | 5 votes |
def backward_propagate(self, loss, step):
    loss = self.adjust_loss(loss)
    if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
        if self.local_rank in [-1, 0]:
            MlLogger.log_metrics(
                {"Train_loss_total": float(loss.detach().cpu().numpy())},
                step=self.global_step,
            )
            if self.log_learning_rate:
                MlLogger.log_metrics(
                    {"learning_rate": self.lr_schedule.get_last_lr()[0]},
                    step=self.global_step,
                )
    if self.use_amp:
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    if step % self.grad_acc_steps == 0:
        if self.max_grad_norm is not None:
            if self.use_amp:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        if self.lr_schedule:
            self.lr_schedule.step()
    return loss
Example #5
Source File: callback.py From fastNLP with Apache License 2.0 | 5 votes |
def on_backward_end(self):
    if self.step % self.update_every == 0:
        if self.parameters is None:
            if getattr(self.trainer, 'fp16', ''):
                _check_fp16()
                self.clip_fun(amp.master_params(self.optimizer), self.clip_value)
            else:
                self.clip_fun(self.model.parameters(), self.clip_value)
        else:
            self.clip_fun(self.parameters, self.clip_value)
Example #6
Source File: transformer_main.py From Count-Sketch-Optimizers with Apache License 2.0 | 5 votes |
def train(train_step):
    train_loader = train_corpus.batch_generator()
    start_time = time.time()
    for batch, item in enumerate(train_loader):
        net.train()
        data, targets, word_cnt, batch_len = get_batch(item)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        optimizer.zero_grad()

        # Network
        logits, new_targets = net(data, targets)
        loss = F.cross_entropy(logits.view(-1, nsampled + 1), new_targets)

        # AMP
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.clip)
        optimizer.step()
        scheduler.step(train_step)
        train_step += 1

        interval = 125
        if batch % interval == 0:
            elapsed = time.time() - start_time
            print('Epoch: {:3d} | {:5d}/{:5d} batches | lr {:.6f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, batch_len, scheduler.get_lr()[0],
                      elapsed * 1000 / interval, loss.item(), math.exp(loss.item())))
            start_time = time.time()
            sys.stdout.flush()
    return train_step

# Load the saved model.
Example #7
Source File: distiller.py From exbert with Apache License 2.0 | 5 votes |
def optimize(self, loss):
    """
    Normalization on the loss (gradient accumulation or distributed training), followed by
    backward pass on the loss, possibly followed by a parameter update (depending on the
    gradient accumulation).
    Also update the metrics for tensorboard.
    """
    # Check for NaN
    if (loss != loss).data.any():
        logger.error("NaN detected")
        exit()

    if self.multi_gpu:
        loss = loss.mean()
    if self.params.gradient_accumulation_steps > 1:
        loss = loss / self.params.gradient_accumulation_steps

    if self.fp16:
        from apex import amp

        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.iter()
    if self.n_iter % self.params.gradient_accumulation_steps == 0:
        if self.fp16:
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
        else:
            torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.scheduler.step()
Example #8
Source File: distiller.py From DistilKoBERT with Apache License 2.0 | 5 votes |
def optimize(self, loss):
    """
    Normalization on the loss (gradient accumulation or distributed training), followed by
    backward pass on the loss, possibly followed by a parameter update (depending on the
    gradient accumulation).
    Also update the metrics for tensorboard.
    """
    # Check for NaN
    if (loss != loss).data.any():
        logger.error("NaN detected")
        exit()

    if self.multi_gpu:
        loss = loss.mean()
    if self.params.gradient_accumulation_steps > 1:
        loss = loss / self.params.gradient_accumulation_steps

    if self.fp16:
        from apex import amp

        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.iter()
    if self.n_iter % self.params.gradient_accumulation_steps == 0:
        if self.fp16:
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
        else:
            torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.scheduler.step()
Example #9
Source File: training.py From tape with BSD 3-Clause "New" or "Revised" License | 5 votes |
def resume_from_checkpoint(self, checkpoint_dir: str) -> int:
    checkpoint = torch.load(
        os.path.join(checkpoint_dir, 'checkpoint.bin'), map_location=self.device)
    self.optimizer.load_state_dict(checkpoint['optimizer'])
    if self.fp16:
        self.optimizer._lazy_init_maybe_master_weights()
        self.optimizer._amp_stash.lazy_init_called = True
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        for param, saved in zip(
                amp.master_params(self.optimizer), checkpoint['master params']):
            param.data.copy_(saved.data)
        amp.load_state_dict(checkpoint['amp'])
    self.scheduler.load_state_dict(checkpoint['scheduler'])
    start_epoch = checkpoint['epoch'] + 1
    return start_epoch
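The calls to self.optimizer._lazy_init_maybe_master_weights() and the _amp_stash flag force apex to allocate its master-weight buffers before they are overwritten from the checkpoint. These are private apex internals rather than public API, so this workaround may break across apex versions.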
Example #10
Source File: trainer.py From Multi-Label-Text-Classification-for-Chinese with MIT License | 4 votes |
def train_epoch(self, data):
    pbar = ProgressBar(n_total=len(data))
    tr_loss = AverageMeter()
    self.epoch_reset()
    for step, batch in enumerate(data):
        self.batch_reset()
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        print("input_ids, input_mask, segment_ids, label_ids SIZE: \n")
        print(input_ids.size(), input_mask.size(), segment_ids.size(), label_ids.size())
        logits = self.model(input_ids, input_mask, segment_ids)
        print("logits and label ids size: ", logits.size(), label_ids.size())
        loss = self.criterion(output=logits, target=label_ids)
        if len(self.n_gpu) >= 2:
            loss = loss.mean()
        if self.gradient_accumulation_steps > 1:
            loss = loss / self.gradient_accumulation_steps
        if self.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            clip_grad_norm_(amp.master_params(self.optimizer), self.grad_clip)
        else:
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.grad_clip)
        if (step + 1) % self.gradient_accumulation_steps == 0:
            self.lr_scheduler.step()
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.global_step += 1
        if self.batch_metrics:
            for metric in self.batch_metrics:
                metric(logits=logits, target=label_ids)
                self.info[metric.name()] = metric.value()
        self.info['loss'] = loss.item()
        tr_loss.update(loss.item(), n=1)
        if self.verbose >= 1:
            pbar.batch_step(step=step, info=self.info, bar_type='Training')
        self.outputs.append(logits.cpu().detach())
        self.targets.append(label_ids.cpu().detach())
    print("\n------------- train result --------------")
    # epoch metric
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    self.result['loss'] = tr_loss.avg
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'{metric.name()}'] = value
    if "cuda" in str(self.device):
        torch.cuda.empty_cache()
    return self.result
Example #11
Source File: trainer.py From BERT-for-RRC-ABSA with Apache License 2.0 | 4 votes |
def _train_batch(self, args, step, inputs, labels, masker, eval_dataset, eval_masker, model):
    inputs = self._to(args, inputs)
    labels = self._to(args, labels)
    model.train()
    loss = self._forward(args, inputs, labels, masker, model, backprop=True)
    self.tr_loss += loss.item()

    if (step + 1) % args.gradient_accumulation_steps == 0:
        if args.fp16:
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), args.max_grad_norm)
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        self.optimizer.step()
        self.scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        self._post_training()
        self.global_step += 1

        if args.local_rank in [-1, 0] and args.logging_steps > 0 and self.global_step % args.logging_steps == 0:
            # Log metrics
            if args.local_rank == -1 and args.evaluate_during_training:
                # Only evaluate when single GPU otherwise metrics may not average well
                results = self.evaluate(args, eval_dataset, eval_masker, model)
                for key, value in results.items():
                    self.tb_writer.add_scalar('eval_{}'.format(key), value, self.global_step)
            self._train_writer(args.logging_steps)

        if args.local_rank in [-1, 0] and args.save_steps > 0 and self.global_step % args.save_steps == 0:
            checkpoint_prefix = 'checkpoint'
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, self.global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            self.tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(self.scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            logger.info("Saving optimizer and scheduler states to %s", output_dir)
            self._rotate_checkpoints(args, checkpoint_prefix)
Example #12
Source File: transformers_example.py From ray with Apache License 2.0 | 4 votes |
def train_batch(self, batch, batch_info=None):
    args = self.args
    model = self.model
    optimizer = self.optimizer
    step = batch_info["batch_idx"]

    model.train()
    batch = tuple(t.to(self.device) for t in batch)
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "labels": batch[3]
    }
    if args.model_type != "distilbert":
        # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
        inputs["token_type_ids"] = (batch[2] if args.model_type in [
            "bert", "xlnet", "albert"
        ] else None)
    outputs = model(**inputs)
    # model outputs are always tuple in transformers (see doc)
    loss = outputs[0]

    if args.gradient_accumulation_steps > 1:
        loss = loss / args.gradient_accumulation_steps

    if args.fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    batch_loss = loss.item()

    # last step in epoch but step is always smaller
    # than gradient_accumulation_steps
    ending = (self.train_data_len <= args.gradient_accumulation_steps
              and (step + 1) == self.train_data_len)
    if (step + 1) % args.gradient_accumulation_steps == 0 or ending:
        if args.fp16:
            torch.nn.utils.clip_grad_norm_(
                amp.master_params(optimizer), args.max_grad_norm)
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

        self.optimizer.step()
        self._warmup_scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        self._global_step += 1

    learning_rate_scalar = self._warmup_scheduler.get_lr()[0]
    return {"learning_rate": learning_rate_scalar, "loss": batch_loss}
Example #13
Source File: trainer.py From Bert-Multi-Label-Text-Classification with MIT License | 4 votes |
def train_epoch(self, data):
    pbar = ProgressBar(n_total=len(data), desc='Training')
    tr_loss = AverageMeter()
    self.epoch_reset()
    for step, batch in enumerate(data):
        self.batch_reset()
        self.model.train()
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = self.model(input_ids, segment_ids, input_mask)
        loss = self.criterion(output=logits, target=label_ids)
        if len(self.args.n_gpu) >= 2:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps
        if self.args.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            clip_grad_norm_(amp.master_params(self.optimizer), self.args.grad_clip)
        else:
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.args.grad_clip)
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            self.scheduler.step()
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.global_step += 1
        if self.batch_metrics:
            for metric in self.batch_metrics:
                metric(logits=logits, target=label_ids)
                self.info[metric.name()] = metric.value()
        self.info['loss'] = loss.item()
        tr_loss.update(loss.item(), n=1)
        if self.verbose >= 1:
            pbar(step=step, info=self.info)
        self.outputs.append(logits.cpu().detach())
        self.targets.append(label_ids.cpu().detach())
    print("\n------------- train result --------------")
    # epoch metric
    self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
    self.targets = torch.cat(self.targets, dim=0).cpu().detach()
    self.result['loss'] = tr_loss.avg
    if self.epoch_metrics:
        for metric in self.epoch_metrics:
            metric(logits=self.outputs, target=self.targets)
            value = metric.value()
            if value:
                self.result[f'{metric.name()}'] = value
    if "cuda" in str(self.device):
        torch.cuda.empty_cache()
    return self.result
Example #14
Source File: training.py From tape with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _step_distributed_fp16(self) -> None:
    # manually allreduce gradients after all accumulation steps
    # check for Inf/NaN
    # 1. allocate an uninitialized buffer for flattened gradient
    scaler = _amp_state.loss_scalers[0]
    master_grads = [p.grad for p in amp.master_params(self.optimizer) if p.grad is not None]
    flat_grad_size = sum(p.numel() for p in master_grads)
    # allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else \
    #     torch.float32
    allreduce_dtype = torch.float16
    flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
    # 2. combine unflattening and predivision of unscaled 'raw' gradient
    allreduced_views = apex_C.unflatten(flat_raw, master_grads)
    self._overflow_buf.zero_()
    amp_C.multi_tensor_scale(
        65536, self._overflow_buf, [master_grads, allreduced_views],
        scaler.loss_scale() / (
            torch.distributed.get_world_size() * self.gradient_accumulation_steps))
    # 3. sum gradient across ranks. Because of the predivision, this averages the gradient
    torch.distributed.all_reduce(flat_raw)
    # 4. combine unscaling and unflattening of allreduced gradient
    self._overflow_buf.zero_()
    amp_C.multi_tensor_scale(
        65536, self._overflow_buf, [allreduced_views, master_grads],
        1. / scaler.loss_scale())
    # 5. update loss scale
    scaler = _amp_state.loss_scalers[0]
    old_overflow_buf = scaler._overflow_buf
    scaler._overflow_buf = self._overflow_buf
    had_overflow = scaler.update_scale()
    scaler._overflow_buf = old_overflow_buf
    # 6. call optimizer step function
    if had_overflow == 0:
        self._step()
    else:
        # Overflow detected, print message and clear gradients
        logger.info(f"Gradient overflow. Skipping step, reducing loss scale to "
                    f"{scaler.loss_scale()}")
        if _amp_state.opt_properties.master_weights:
            for param in self.optimizer._amp_stash.all_fp32_from_fp16_params:
                param.grad = None
        for param in self.model.parameters():
            param.grad = None
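This sequence (flatten the master gradients, pre-divide, all_reduce, unscale, then update the loss scaler by hand) appears to be adapted from NVIDIA's BERT pre-training reference code; it trades apex's per-step convenience for a single fused all-reduce after gradient accumulation. Note that the multi_tensor_scale chunk size of 65536 and the raw amp_C / apex_C bindings are apex internals rather than public API.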
Example #15
Source File: train_apex.py From FloWaveNet with MIT License | 4 votes |
def train(epoch, model, optimizer, scheduler):
    global global_step

    epoch_loss = 0.0
    running_num = 0
    running_loss = np.zeros(3)

    train_sampler.set_epoch(epoch)
    model.train()

    bar = tqdm(train_loader) if args.local_rank == 0 else train_loader
    for batch_idx, (x, c) in enumerate(bar):
        scheduler.step()
        global_step += 1

        x, c = x.to(device, non_blocking=True), c.to(device, non_blocking=True)

        optimizer.zero_grad()
        log_p, logdet = model(x, c)
        log_p, logdet = torch.mean(log_p), torch.mean(logdet)
        loss = -(log_p + logdet)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.)
        optimizer.step()

        running_num += 1
        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += logdet.item()
        epoch_loss += loss.item()

        if args.local_rank == 0:
            bar.set_description('{}/{}, [Log pdf, Log p(z), Log Det] : {}'
                                .format(epoch, global_step, running_loss / running_num))
            if (batch_idx + 1) % 100 == 0:
                running_num = 0
                running_loss = np.zeros(3)

        del x, c, log_p, logdet, loss

    del running_loss
    gc.collect()
    print('{}/{}/{} Training Loss : {:.4f}'.format(epoch, global_step, args.local_rank,
                                                   epoch_loss / (len(train_loader))))
    return epoch_loss / len(train_loader)