Python apex.amp.master_params() Examples

The following are 15 code examples of apex.amp.master_params(), drawn from open-source projects. The originating project and source file are noted above each example. You may also want to check out the other available functions and classes of the apex.amp module.
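Before the project examples, here is a minimal, self-contained sketch of the usual pattern (the model, optimizer, and loader below are placeholders, not taken from any of the listed projects): initialize amp, backpropagate through amp.scale_loss(), and clip the FP32 master parameters returned by amp.master_params() instead of the model's own, possibly FP16, parameters.

import torch
import torch.nn.functional as F
from apex import amp

model = torch.nn.Linear(128, 10).cuda()                     # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# "O1" is the commonly used mixed-precision opt level
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# placeholder data: a single dummy batch
loader = [(torch.randn(4, 128).cuda(), torch.randint(0, 10, (4,)).cuda())]

for data, target in loader:
    optimizer.zero_grad()
    loss = F.cross_entropy(model(data), target)
    # scale the loss so FP16 gradients do not underflow
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    # clip the FP32 master gradients, not model.parameters()
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
    optimizer.step()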
Example #1
Source File: trainer.py    From allennlp with Apache License 2.0
def rescale_gradients(self) -> float:
        """
        Performs gradient rescaling. Is a no-op if gradient rescaling is not enabled.

        Returns the norm of the gradients.
        """
        if self._opt_level is not None:
            # See: https://nvidia.github.io/apex/advanced.html#gradient-clipping
            parameters_to_clip = [
                p for p in amp.master_params(self.optimizer) if p.grad is not None
            ]
        else:
            parameters_to_clip = [p for p in self.model.parameters() if p.grad is not None]
        if self._grad_norm:
            return clip_grad_norm_(parameters_to_clip, self._grad_norm)
        else:
            return torch.norm(
                torch.stack([torch.norm(p.grad.detach()) for p in parameters_to_clip])
            ) 
Example #2
Source File: wrapper.py    From pytorch-tools with MIT License
def _make_step(self):
        data, target = self.state.input
        output = self.state.model(data)
        self.state.output = output
        loss = self.state.criterion(output, target)
        if self.state.is_train:
            with amp.scale_loss(loss / self.accumulate_steps, self.state.optimizer) as scaled_loss:
                scaled_loss.backward()
            if self.gradient_clip_val > 0:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.state.optimizer), self.gradient_clip_val)
            if self.state.step % self.accumulate_steps == 0:
                self.state.optimizer.step()
                self.state.optimizer.zero_grad()
            torch.cuda.synchronize()

        # update metrics
        self.state.loss_meter.update(to_numpy(loss))
        with torch.no_grad():
            for metric, meter in zip(self.state.metrics, self.state.metric_meters):
                meter.update(to_numpy(metric(output, target).squeeze())) 
Example #3
Source File: training.py    From tape with BSD 3-Clause "New" or "Revised" License
def save_state(self, save_directory: typing.Union[str, Path], epoch_id: int):
        save_directory = Path(save_directory)
        if not save_directory.exists():
            save_directory.mkdir()
        else:
            assert save_directory.is_dir(), "Save path should be a directory"
        model_to_save = getattr(self.model, 'module', self.model)
        model_to_save.save_pretrained(save_directory)
        optimizer_state: typing.Dict[str, typing.Any] = {
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'epoch': epoch_id}
        if APEX_FOUND:
            optimizer_state['master params'] = list(amp.master_params(self.optimizer))
            try:
                optimizer_state['amp'] = amp.state_dict()
            except AttributeError:
                pass
        torch.save(optimizer_state, save_directory / 'checkpoint.bin') 
Example #4
Source File: train.py    From FARM with Apache License 2.0
def backward_propagate(self, loss, step):
        loss = self.adjust_loss(loss)
        if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
            if self.local_rank in [-1, 0]:
                MlLogger.log_metrics(
                    {"Train_loss_total": float(loss.detach().cpu().numpy())},
                    step=self.global_step,
                )
                if self.log_learning_rate:
                    MlLogger.log_metrics({"learning_rate": self.lr_schedule.get_last_lr()[0]},
                                         step=self.global_step)
        if self.use_amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if step % self.grad_acc_steps == 0:
            if self.max_grad_norm is not None:
                if self.use_amp:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            if self.lr_schedule:
                self.lr_schedule.step()
        return loss 
Example #5
Source File: callback.py    From fastNLP with Apache License 2.0
def on_backward_end(self):
        if self.step%self.update_every==0:
            if self.parameters is None:
                if getattr(self.trainer, 'fp16', ''):
                    _check_fp16()
                    self.clip_fun(amp.master_params(self.optimizer), self.clip_value)
                else:
                    self.clip_fun(self.model.parameters(), self.clip_value)
            else:
                self.clip_fun(self.parameters, self.clip_value) 
Example #6
Source File: transformer_main.py    From Count-Sketch-Optimizers with Apache License 2.0
def train(train_step):
    train_loader = train_corpus.batch_generator()

    start_time = time.time()
    for batch, item in enumerate(train_loader):
        net.train()
        data, targets, word_cnt, batch_len = get_batch(item)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        optimizer.zero_grad()

        # Network
        logits, new_targets = net(data, targets)
        loss = F.cross_entropy(logits.view(-1, nsampled+1), new_targets)

        # AMP
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.clip)

        optimizer.step()
        scheduler.step(train_step)
        train_step += 1

        interval = 125
        if batch % interval == 0:
            elapsed = time.time() - start_time
            print('Epoch: {:3d} | {:5d}/{:5d} batches | lr {:.6f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                  .format(epoch, batch, batch_len, scheduler.get_lr()[0], elapsed * 1000 / interval, loss.item(), math.exp(loss.item())))
            start_time = time.time()
            sys.stdout.flush()
    return train_step

Example #7
Source File: distiller.py    From exbert with Apache License 2.0
def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
        Also update the metrics for tensorboard.
        """
        # Check for NaN
        if (loss != loss).data.any():
            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
            loss = loss.mean()
        if self.params.gradient_accumulation_steps > 1:
            loss = loss / self.params.gradient_accumulation_steps

        if self.fp16:
            from apex import amp

            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step() 
Example #8
Source File: distiller.py    From DistilKoBERT with Apache License 2.0
def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
        Also update the metrics for tensorboard.
        """
        # Check for NaN
        if (loss != loss).data.any():
            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
            loss = loss.mean()
        if self.params.gradient_accumulation_steps > 1:
            loss = loss / self.params.gradient_accumulation_steps

        if self.fp16:
            from apex import amp

            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step() 
Example #9
Source File: training.py    From tape with BSD 3-Clause "New" or "Revised" License
def resume_from_checkpoint(self, checkpoint_dir: str) -> int:
        checkpoint = torch.load(
            os.path.join(checkpoint_dir, 'checkpoint.bin'), map_location=self.device)
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        if self.fp16:
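            # Poke apex's private internals so the FP32 master weights exist
            # before the saved master params are copied back in below.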
            self.optimizer._lazy_init_maybe_master_weights()
            self.optimizer._amp_stash.lazy_init_called = True
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved in zip(
                    amp.master_params(self.optimizer), checkpoint['master params']):
                param.data.copy_(saved.data)
            amp.load_state_dict(checkpoint['amp'])
        self.scheduler.load_state_dict(checkpoint['scheduler'])
        start_epoch = checkpoint['epoch'] + 1
        return start_epoch 
Example #10
Source File: trainer.py    From Multi-Label-Text-Classification-for-Chinese with MIT License
def train_epoch(self, data):
        pbar = ProgressBar(n_total=len(data))
        tr_loss = AverageMeter()
        self.epoch_reset()
        for step,  batch in enumerate(data):
            self.batch_reset()
            self.model.train()
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            print("input_ids, input_mask, segment_ids, label_ids SIZE: \n")
            print(input_ids.size(), input_mask.size(),
                  segment_ids.size(), label_ids.size())
            logits = self.model(input_ids, input_mask, segment_ids)
            print("logits and label ids size: ",
                  logits.size(), label_ids.size())
            loss = self.criterion(output=logits, target=label_ids)
            if len(self.n_gpu) >= 2:
                loss = loss.mean()
            if self.gradient_accumulation_steps > 1:
                loss = loss / self.gradient_accumulation_steps
            if self.fp16:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
                clip_grad_norm_(amp.master_params(
                    self.optimizer), self.grad_clip)
            else:
                loss.backward()
                clip_grad_norm_(self.model.parameters(), self.grad_clip)
            if (step + 1) % self.gradient_accumulation_steps == 0:
                self.lr_scheduler.step()
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.global_step += 1
            if self.batch_metrics:
                for metric in self.batch_metrics:
                    metric(logits=logits, target=label_ids)
                    self.info[metric.name()] = metric.value()
            self.info['loss'] = loss.item()
            tr_loss.update(loss.item(), n=1)
            if self.verbose >= 1:
                pbar.batch_step(step=step, info=self.info, bar_type='Training')
            self.outputs.append(logits.cpu().detach())
            self.targets.append(label_ids.cpu().detach())
        print("\n------------- train result --------------")
        # epoch metric
        self.outputs = torch.cat(self.outputs, dim=0).cpu().detach()
        self.targets = torch.cat(self.targets, dim=0).cpu().detach()
        self.result['loss'] = tr_loss.avg
        if self.epoch_metrics:
            for metric in self.epoch_metrics:
                metric(logits=self.outputs, target=self.targets)
                value = metric.value()
                if value:
                    self.result[f'{metric.name()}'] = value
        if "cuda" in str(self.device):
            torch.cuda.empty_cache()
        return self.result 
Example #11
Source File: trainer.py    From BERT-for-RRC-ABSA with Apache License 2.0
def _train_batch(self, args, step, inputs, labels, masker, eval_dataset, eval_masker, model):
        inputs = self._to(args, inputs)        
        labels = self._to(args, labels)
        
        model.train()
        loss = self._forward(args, inputs, labels, masker, model, backprop=True)

        self.tr_loss += loss.item()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            self._post_training()
            self.global_step += 1
            
            if args.local_rank in [-1, 0] and args.logging_steps > 0 and self.global_step % args.logging_steps == 0:
                # Log metrics
                if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                    results = self.evaluate(args, eval_dataset, eval_masker, model)
                    for key, value in results.items():
                        self.tb_writer.add_scalar('eval_{}'.format(key), value, self.global_step)
                self._train_writer(args.logging_steps)

            if args.local_rank in [-1, 0] and args.save_steps > 0 and self.global_step % args.save_steps == 0:
                checkpoint_prefix = 'checkpoint'
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, self.global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                
                self.tokenizer.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

                torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                torch.save(self.scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s", output_dir)

                self._rotate_checkpoints(args, checkpoint_prefix) 
Example #12
Source File: transformers_example.py    From ray with Apache License 2.0
def train_batch(self, batch, batch_info=None):
        args = self.args
        model = self.model
        optimizer = self.optimizer
        step = batch_info["batch_idx"]

        model.train()
        batch = tuple(t.to(self.device) for t in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[3]
        }
        if args.model_type != "distilbert":
            # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            inputs["token_type_ids"] = (batch[2] if args.model_type in [
                "bert", "xlnet", "albert"
            ] else None)
        outputs = model(**inputs)

        # model outputs are always tuple in transformers (see doc)
        loss = outputs[0]

        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        batch_loss = loss.item()

        # also step at the end of the epoch when the epoch has fewer batches
        # than gradient_accumulation_steps (the modulo check below would
        # otherwise never trigger)
        ending = (self.train_data_len <= args.gradient_accumulation_steps
                  and (step + 1) == self.train_data_len)
        if (step + 1) % args.gradient_accumulation_steps == 0 or ending:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            self.optimizer.step()
            self._warmup_scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            self._global_step += 1

        learning_rate_scalar = self._warmup_scheduler.get_lr()[0]
        return {"learning_rate": learning_rate_scalar, "loss": batch_loss} 
Example #13
Source File: trainer.py    From Bert-Multi-Label-Text-Classification with MIT License
def train_epoch(self,data):
        pbar = ProgressBar(n_total = len(data),desc='Training')
        tr_loss = AverageMeter()
        self.epoch_reset()
        for step,  batch in enumerate(data):
            self.batch_reset()
            self.model.train()
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            logits = self.model(input_ids, segment_ids,input_mask)
            loss = self.criterion(output=logits,target=label_ids)
            if len(self.args.n_gpu) >= 2:
                loss = loss.mean()
            if self.args.gradient_accumulation_steps > 1:
                loss = loss / self.args.gradient_accumulation_steps
            if self.args.fp16:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
                clip_grad_norm_(amp.master_params(self.optimizer), self.args.grad_clip)
            else:
                loss.backward()
                clip_grad_norm_(self.model.parameters(), self.args.grad_clip)
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                self.scheduler.step()
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.global_step += 1
            if self.batch_metrics:
                for metric in self.batch_metrics:
                    metric(logits = logits,target = label_ids)
                    self.info[metric.name()] = metric.value()
            self.info['loss'] = loss.item()
            tr_loss.update(loss.item(),n = 1)
            if self.verbose >= 1:
                pbar(step= step,info = self.info)
            self.outputs.append(logits.cpu().detach())
            self.targets.append(label_ids.cpu().detach())
        print("\n------------- train result --------------")
        # epoch metric
        self.outputs = torch.cat(self.outputs, dim =0).cpu().detach()
        self.targets = torch.cat(self.targets, dim =0).cpu().detach()
        self.result['loss'] = tr_loss.avg
        if self.epoch_metrics:
            for metric in self.epoch_metrics:
                metric(logits=self.outputs, target=self.targets)
                value = metric.value()
                if value:
                    self.result[f'{metric.name()}'] = value
        if "cuda" in str(self.device):
            torch.cuda.empty_cache()
        return self.result 
Example #14
Source File: training.py    From tape with BSD 3-Clause "New" or "Revised" License
def _step_distributed_fp16(self) -> None:
        # manually allreduce gradients after all accumulation steps
        # check for Inf/NaN
        # 1. allocate an uninitialized buffer for flattened gradient
        scaler = _amp_state.loss_scalers[0]
        master_grads = [p.grad for p in amp.master_params(self.optimizer) if p.grad is not None]
        flat_grad_size = sum(p.numel() for p in master_grads)
        # allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else \
            # torch.float32
        allreduce_dtype = torch.float16
        flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
        # 2. combine unflattening and predivision of unscaled 'raw' gradient
        allreduced_views = apex_C.unflatten(flat_raw, master_grads)
        self._overflow_buf.zero_()
        amp_C.multi_tensor_scale(
            65536,
            self._overflow_buf,
            [master_grads, allreduced_views],
            scaler.loss_scale() / (
                torch.distributed.get_world_size() * self.gradient_accumulation_steps))
        # 3. sum gradient across ranks. Because of the predivision, this averages the gradient
        torch.distributed.all_reduce(flat_raw)
        # 4. combine unscaling and unflattening of allreduced gradient
        self._overflow_buf.zero_()
        amp_C.multi_tensor_scale(
            65536,
            self._overflow_buf,
            [allreduced_views, master_grads],
            1. / scaler.loss_scale())
        # 5. update loss scale
        scaler = _amp_state.loss_scalers[0]
        old_overflow_buf = scaler._overflow_buf
        scaler._overflow_buf = self._overflow_buf
        had_overflow = scaler.update_scale()
        scaler._overflow_buf = old_overflow_buf
        # 6. call optimizer step function
        if had_overflow == 0:
            self._step()
        else:
            # Overflow detected, print message and clear gradients
            logger.info(f"Gradient overflow.  Skipping step, reducing loss scale to "
                        f"{scaler.loss_scale()}")
            if _amp_state.opt_properties.master_weights:
                for param in self.optimizer._amp_stash.all_fp32_from_fp16_params:
                    param.grad = None
        for param in self.model.parameters():
            param.grad = None 
Example #15
Source File: train_apex.py    From FloWaveNet with MIT License
def train(epoch, model, optimizer, scheduler):
    global global_step

    epoch_loss = 0.0
    running_num = 0
    running_loss = np.zeros(3)

    train_sampler.set_epoch(epoch)
    model.train()

    bar = tqdm(train_loader) if args.local_rank == 0 else train_loader

    for batch_idx, (x, c) in enumerate(bar):

        scheduler.step()
        global_step += 1

        x, c = x.to(device, non_blocking=True), c.to(device, non_blocking=True)

        optimizer.zero_grad()

        log_p, logdet = model(x, c)
        log_p, logdet = torch.mean(log_p), torch.mean(logdet)

        loss = -(log_p + logdet)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.)
         
        optimizer.step()

        running_num += 1
        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += logdet.item()

        epoch_loss += loss.item()

        if args.local_rank == 0:
            bar.set_description('{}/{}, [Log pdf, Log p(z), Log Det] : {}'
                                .format(epoch, global_step, running_loss / running_num))
            if (batch_idx + 1) % 100 == 0:
                running_num = 0
                running_loss = np.zeros(3)

        del x, c, log_p, logdet, loss
    del running_loss
    gc.collect()
    print('{}/{}/{} Training Loss : {:.4f}'.format(epoch, global_step, args.local_rank, epoch_loss / (len(train_loader))))
    return epoch_loss / len(train_loader)