Python apex.amp.scale_loss() Examples

The following are 30 code examples of apex.amp.scale_loss(), collected from open-source projects; the project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the apex.amp module.
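All of the examples below follow the same basic pattern: wrap the model and optimizer with amp.initialize(), then replace loss.backward() with the amp.scale_loss() context manager. Here is a minimal sketch of that pattern, assuming a CUDA-capable GPU; the model, optimizer, and data are placeholders.

import torch
import torch.nn.functional as F
from apex import amp

# Placeholder model, optimizer, and data -- substitute your own.
model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
inputs = torch.randn(8, 10, device="cuda")
targets = torch.randint(0, 2, (8,), device="cuda")

# Wrap the model and optimizer once, before the training loop.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

optimizer.zero_grad()
loss = F.cross_entropy(model(inputs), targets)

# scale_loss() yields a scaled loss; calling backward() on it produces
# gradients that amp unscales before the optimizer step.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()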
Example #1
Source File: test_lr_finder.py    From pytorch-lr-finder with MIT License
def test_gradient_accumulation_with_apex_amp(self, mocker):
        desired_bs, accum_steps = 32, 4
        real_bs = desired_bs // accum_steps
        num_iter = 10
        task = mod_task.XORTask(batch_size=real_bs)

        # Wrap the model and optimizer with `amp.initialize`. Besides, `amp`
        # requires a CUDA GPU, so we have to move the model to the GPU first.
        model, optimizer, device = task.model, task.optimizer, task.device
        model = model.to(device)
        task.model, task.optimizer = amp.initialize(model, optimizer)

        lr_finder = prepare_lr_finder(task)
        spy = mocker.spy(amp, "scale_loss")

        lr_finder.range_test(
            task.train_loader, num_iter=num_iter, accumulation_steps=accum_steps
        )
        assert spy.call_count == accum_steps * num_iter 
Example #2
Source File: test_larc.py    From apex with BSD 3-Clause "New" or "Revised" License
def test_larc_mixed_precision(self):
        for opt_level in ["O0", "O1", "O2", "O3"]:
            model = MyModel(1)

            optimizer = LARC(
                torch.optim.SGD(
                    [{"params": model.parameters(), "lr": 0.25}], momentum=0.125
                )
            )

            model, optimizer = amp.initialize(
                model, optimizer, opt_level=opt_level, verbosity=0
            )

            optimizer.zero_grad()
            loss = model(self.x)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step() 
Example #3
Source File: loss.py    From DeepPrivacy with MIT License
def compute_gradient_penalty(self, real_data, fake_data, condition, landmarks):
        epsilon_shape = [real_data.shape[0]] + [1]*(real_data.dim() - 1)
        epsilon = torch.rand(epsilon_shape)
        epsilon = epsilon.to(fake_data.device, fake_data.dtype)
        real_data = real_data.to(fake_data.dtype)
        x_hat = epsilon * real_data + (1-epsilon) * fake_data.detach()
        x_hat.requires_grad = True
        logits = self.discriminator(x_hat, condition, landmarks)
        logits = logits.sum()
        grad = torch.autograd.grad(
            outputs=logits,
            inputs=x_hat,
            grad_outputs=torch.ones(logits.shape).to(fake_data.dtype).to(fake_data.device),
            create_graph=True
        )[0]
        grad = grad.view(x_hat.shape[0], -1)
        gradient_pen = ((grad.norm(p=2, dim=1) - 1)**2)
        to_backward = gradient_pen.sum() * 10
        with amp.scale_loss(to_backward, self.d_optimizer, loss_id=1) as scaled_loss:
            scaled_loss.backward(retain_graph=True)
        return gradient_pen.detach().mean() 
Example #4
Source File: wrapper.py    From pytorch-tools with MIT License
def _make_step(self):
        data, target = self.state.input
        output = self.state.model(data)
        self.state.output = output
        loss = self.state.criterion(output, target)
        if self.state.is_train:
            with amp.scale_loss(loss / self.accumulate_steps, self.state.optimizer) as scaled_loss:
                scaled_loss.backward()
            if self.gradient_clip_val > 0:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.state.optimizer), self.gradient_clip_val)
            if self.state.step % self.accumulate_steps == 0:
                self.state.optimizer.step()
                self.state.optimizer.zero_grad()
            torch.cuda.synchronize()

        # update metrics
        self.state.loss_meter.update(to_numpy(loss))
        with torch.no_grad():
            for metric, meter in zip(self.state.metrics, self.state.metric_meters):
                meter.update(to_numpy(metric(output, target).squeeze())) 
Example #5
Source File: trainer.py    From reid_baseline_with_syncbn with MIT License
def step(self, batch):
        self.model.train()
        self.optim.zero_grad()
        img, target = batch
        img, target = img.cuda(), target.cuda()
        score, feat = self.model(img)
        loss = self.loss_func(score, feat, target)
        if self.mix_precision:
            with amp.scale_loss(loss, self.optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optim.step()

        acc = (score.max(1)[1] == target).float().mean()

        self.loss_avg.update(loss.cpu().item())
        self.acc_avg.update(acc.cpu().item())
        
        return self.loss_avg.avg, self.acc_avg.avg 
Example #6
Source File: test_lr_finder.py    From pytorch-lr-finder with MIT License
def test_mixed_precision(self, mocker):
        batch_size = 32
        num_iter = 10
        task = mod_task.XORTask(batch_size=batch_size)

        # Wrap the model and optimizer with `amp.initialize`. Besides, `amp`
        # requires a CUDA GPU, so we have to move the model to the GPU first.
        model, optimizer, device = task.model, task.optimizer, task.device
        model = model.to(device)
        task.model, task.optimizer = amp.initialize(model, optimizer)
        assert hasattr(task.optimizer, "_amp_stash")

        lr_finder = prepare_lr_finder(task)
        spy = mocker.spy(amp, "scale_loss")

        lr_finder.range_test(task.train_loader, num_iter=num_iter)
        # NOTE: We did not perform gradient accumulation here, so the call
        # count of `amp.scale_loss` should equal `num_iter`.
        assert spy.call_count == num_iter 
Example #7
Source File: trainer.py    From BERT-for-RRC-ABSA with Apache License 2.0
def _forward(self, args, inputs, labels, masker, model, backprop=True):
        outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
        if backprop:
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            self._post_step(args, outputs)
            return loss
        else:
            return loss 
Example #8
Source File: train_nn.py    From cryodrgn with GNU General Public License v3.0
def train(model, lattice, optim, y, rot, trans=None, ctf_params=None, use_amp=False):
    model.train()
    optim.zero_grad()
    B = y.size(0)
    D = lattice.D
    # reconstruct circle of pixels instead of whole image
    mask = lattice.get_circular_mask(D//2)
    yhat = model(lattice.coords[mask] @ rot).view(B,-1)
    if ctf_params is not None:
        freqs = lattice.freqs2d[mask]
        freqs = freqs.unsqueeze(0).expand(B, *freqs.shape)/ctf_params[:,0].view(B,1,1)
        yhat *= ctf.compute_ctf(freqs, *torch.split(ctf_params[:,1:], 1, 1))
    y = y.view(B,-1)[:, mask]
    if trans is not None:
        y = lattice.translate_ht(y, trans.unsqueeze(1), mask).view(B,-1)
    loss = F.mse_loss(yhat, y)
    if use_amp:
        with amp.scale_loss(loss, optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optim.step()
    return loss.item() 
Example #9
Source File: test_checkpointing.py    From apex with BSD 3-Clause "New" or "Revised" License
def train_step(self, model, optimizer, data, loss_ids):
        optimizer.zero_grad()        

        output = model(data)

        # Call backward for num_losses-1
        for idx in loss_ids:
            loss = output.mean()
            with amp.scale_loss(loss, optimizer, loss_id=idx) as scaled_loss:
                scaled_loss.backward(retain_graph=True)

        optimizer.step()
        return output 
Example #10
Source File: hooks.py    From pytorch-lightning with Apache License 2.0
def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):
        if NATIVE_AMP_AVALAIBLE:
            scaled_loss = self.trainer.scaler.scale(unscaled_loss)
        else:
            scaled_loss = amp.scale_loss(unscaled_loss, optimizer)

        return scaled_loss 
Example #11
Source File: network_trainer.py    From nnUNet with Apache License 2.0
def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
        data_dict = next(data_generator)
        data = data_dict['data']
        target = data_dict['target']

        if not isinstance(data, torch.Tensor):
            data = torch.from_numpy(data).float()
        if not isinstance(target, torch.Tensor):
            target = torch.from_numpy(target).float()

        if torch.cuda.is_available():
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        self.optimizer.zero_grad()
        output = self.network(data)
        del data
        l = self.loss(output, target)

        if run_online_evaluation:
            self.run_online_evaluation(output, target)

        del target

        if do_backprop:
            if not self.fp16 or amp is None or not torch.cuda.is_available():
                l.backward()
            else:
                with amp.scale_loss(l, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            self.optimizer.step()

        return l.detach().cpu().numpy() 
Example #12
Source File: mixed_precision.py    From amdim-public with MIT License
def backward(loss, optimizer):
    """Calls backward on the loss. If mixed precision is on, will
    scale the loss.
    """
    if is_mixed_precision():
        from apex import amp
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward() 
Example #13
Source File: train_vae.py    From cryodrgn with GNU General Public License v3.0
def train_batch(model, lattice, y, yt, rot, trans, optim, beta, beta_control=None, tilt=None, ctf_params=None, yr=None, use_amp=False):
    optim.zero_grad()
    model.train()
    if trans is not None:
        y, yt = preprocess_input(y, yt, lattice, trans)
    z_mu, z_logvar, z, y_recon, y_recon_tilt, mask = run_batch(model, lattice, y, yt, rot, tilt, ctf_params, yr)
    loss, gen_loss, kld = loss_function(z_mu, z_logvar, y, yt, y_recon, mask, beta, y_recon_tilt, beta_control)
    if use_amp:
        with amp.scale_loss(loss, optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optim.step()
    return loss.item(), gen_loss.item(), kld.item() 
Example #14
Source File: horovod_benchmark_apex.py    From ray with Apache License 2.0
def benchmark_step():
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    # Apex
    if args.amp_fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
            optimizer.synchronize()
        with optimizer.skip_synchronize():
            optimizer.step()
    else:
        loss.backward()
        optimizer.step() 
Example #15
Source File: test_checkpointing.py    From apex with BSD 3-Clause "New" or "Revised" License
def test_state_dict(self):
        for opt_level in self.test_opt_levels:
            # Skip O3
            if opt_level == 'O3':
                continue

            model = MyModel().to('cuda')
            optimizer = optim.Adam(model.parameters(), lr=1e-3)
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=opt_level, verbosity=0)

            # Export state_dict and check for Half
            state_dict = model.state_dict()
            for key in state_dict:
                self.assertFalse('Half' in state_dict[key].type())

            # Check if the model is still trainable
            # Create dummy data
            data = torch.randn(10, 3, 4, 4, device='cuda')
            target = torch.randn(10, 6, 4, 4, device='cuda')
            
            # Get initial loss
            optimizer.zero_grad()
            output = model(data)
            loss = F.mse_loss(output, target)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            last_loss = loss.item()

            # train for some epochs
            for epoch in range(10):
                optimizer.zero_grad()
                output = model(data)
                loss = F.mse_loss(output, target)
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
                self.assertTrue(loss.item() < last_loss)
                last_loss = loss.item() 
Example #16
Source File: train_segmentation.py    From ray with Apache License 2.0
def train_batch(self, batch, batch_info):
        image, target = batch
        image, target = image.to(self.device), target.to(self.device)
        output = self.model(image)
        loss = criterion(output, target)
        self.optimizer.zero_grad()
        if self.use_fp16 and amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optimizer.step()
        lr = self.optimizer.param_groups[0]["lr"]
        return {"loss": loss.item(), "lr": lr, "num_samples": len(batch)} 
Example #17
Source File: distiller.py    From DistilKoBERT with Apache License 2.0
def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
        Also update the metrics for tensorboard.
        """
        # Check for NaN
        if (loss != loss).data.any():
            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
            loss = loss.mean()
        if self.params.gradient_accumulation_steps > 1:
            loss = loss / self.params.gradient_accumulation_steps

        if self.fp16:
            from apex import amp

            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step() 
Example #18
Source File: training.py    From tape with BSD 3-Clause "New" or "Revised" License
def backward(self, loss) -> None:
        if not self._delay_accumulation:
            loss = loss / self.gradient_accumulation_steps
        if self.fp16:
            with amp.scale_loss(loss, self.optimizer,
                                delay_overflow_check=self._delay_accumulation) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward() 
Example #19
Source File: nnUNetTrainerV2.py    From nnUNet with Apache License 2.0
def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
        """
        gradient clipping improves training stability

        :param data_generator:
        :param do_backprop:
        :param run_online_evaluation:
        :return:
        """
        data_dict = next(data_generator)
        data = data_dict['data']
        target = data_dict['target']

        data = maybe_to_torch(data)
        target = maybe_to_torch(target)

        if torch.cuda.is_available():
            data = to_cuda(data)
            target = to_cuda(target)

        self.optimizer.zero_grad()

        output = self.network(data)

        del data
        loss = self.loss(output, target)

        if run_online_evaluation:
            self.run_online_evaluation(output, target)
        del target

        if do_backprop:
            if not self.fp16 or amp is None or not torch.cuda.is_available():
                loss.backward()
            else:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            _ = clip_grad_norm_(self.network.parameters(), 12)
            self.optimizer.step()

        return loss.detach().cpu().numpy() 
Example #20
Source File: lm.py    From espnet with Apache License 2.0
def update_core(self):
        """Update the model."""
        # When we pass one iterator and optimizer to StandardUpdater.__init__,
        # they are automatically named 'main'.
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        # Progress the dataset iterator for sentences at each iteration.
        self.model.zero_grad()  # Clear the parameter gradients
        accum = {"loss": 0.0, "nll": 0.0, "count": 0}
        for _ in range(self.accum_grad):
            batch = train_iter.__next__()
            # Concatenate the token IDs to matrices and send them to the device
            # self.converter does this job
            # (it is chainer.dataset.concat_examples by default)
            x, t = concat_examples(batch, device=self.device[0], padding=(0, -100))
            if self.device[0] == -1:
                loss, nll, count = self.model(x, t)
            else:
                # apex does not support torch.nn.DataParallel
                loss, nll, count = data_parallel(self.model, (x, t), self.device)

            # backward
            loss = loss.mean() / self.accum_grad
            if self.use_apex:
                from apex import amp

                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()  # Backprop
            # accumulate stats
            accum["loss"] += float(loss)
            accum["nll"] += float(nll.sum())
            accum["count"] += int(count.sum())

        for k, v in accum.items():
            reporter.report({k: v}, optimizer.target)
        if self.gradclip is not None:
            nn.utils.clip_grad_norm_(self.model.parameters(), self.gradclip)
        optimizer.step()  # Update the parameters
        self.scheduler.step(n_iter=self.iteration) 
Example #21
Source File: train.py    From FARM with Apache License 2.0
def backward_propagate(self, loss, step):
        loss = self.adjust_loss(loss)
        if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
            if self.local_rank in [-1, 0]:
                MlLogger.log_metrics(
                    {"Train_loss_total": float(loss.detach().cpu().numpy())},
                    step=self.global_step,
                )
                if self.log_learning_rate:
                    MlLogger.log_metrics({"learning_rate": self.lr_schedule.get_last_lr()[0]},
                                         step=self.global_step)
        if self.use_amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if step % self.grad_acc_steps == 0:
            if self.max_grad_norm is not None:
                if self.use_amp:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            if self.lr_schedule:
                self.lr_schedule.step()
        return loss 
Example #22
Source File: dist_utils.py    From DenseMatchingBenchmark with MIT License
def after_train_iter(self, runner):
        runner.model.zero_grad()
        runner.optimizer.zero_grad()
        # Note: if mixed precision is not used, this ends up doing nothing.
        # Otherwise, apply loss scaling for the mixed-precision recipe.
        with amp.scale_loss(runner.outputs['loss'], runner.optimizer) as scaled_losses:
            scaled_losses.backward()
        all_reduce_grads(runner.model, self.coalesce, self.bucket_size_mb)
        if self.grad_clip is not None:
            self.clip_grads(runner.model.parameters())
        runner.optimizer.step() 
Example #23
Source File: bases.py    From torchbearer with MIT License
def apex_closure():
    from apex import amp

    def _apex_closure(state):
        # Zero grads
        state[torchbearer.OPTIMIZER].zero_grad()

        _forward_with_exceptions(torchbearer.X, torchbearer.MODEL, torchbearer.Y_PRED, state)

        state[torchbearer.CALLBACK_LIST].on_forward(state)

        # Loss Calculation
        try:
            state[torchbearer.LOSS] = state[torchbearer.CRITERION](state)
        except TypeError:
            loss_function_params = _get_param_list(state[torchbearer.Y_PRED]) + _get_param_list(state[torchbearer.Y_TRUE])
            state[torchbearer.LOSS] = state[torchbearer.CRITERION](*loss_function_params)

        state[torchbearer.CALLBACK_LIST].on_criterion(state)

        # Backwards pass
        with amp.scale_loss(state[torchbearer.LOSS], state[torchbearer.OPTIMIZER]) as scaled_loss:
            scaled_loss.backward(**state[torchbearer.BACKWARD_ARGS])

        state[torchbearer.CALLBACK_LIST].on_backward(state)
    return _apex_closure 
Example #24
Source File: main.py    From kaggle-kuzushiji-2019 with MIT License
def create_supervised_trainer(
        model, optimizer, loss_fn,
        device=None, non_blocking=False,
        prepare_batch=_prepare_batch,
        output_transform=lambda x, y, y_pred, loss: loss.item(),
        accumulation_steps: int = 1,
        fp16: bool = False,
        ):

    def update_fn(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        if fp16:
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output_transform(x, y, y_pred, loss)

    return Engine(update_fn) 
Example #25
Source File: callbacks.py    From kekas with MIT License
def on_batch_end(self, i: int, state: DotDict) -> None:
        if state.core.mode == "train":
            state.core.opt.zero_grad()
            if state.core.use_fp16:
                with amp.scale_loss(state.core.loss, state.core.opt) as scaled_loss:
                    scaled_loss.backward()
            else:
                state.core.loss.backward()
            state.core.opt.step() 
Example #26
Source File: lr_finder.py    From pytorch-lr-finder with MIT License
def _train_batch(self, train_iter, accumulation_steps, non_blocking_transfer=True):
        self.model.train()
        total_loss = None  # for late initialization

        self.optimizer.zero_grad()
        for i in range(accumulation_steps):
            inputs, labels = next(train_iter)
            inputs, labels = self._move_to_device(
                inputs, labels, non_blocking=non_blocking_transfer
            )

            # Forward pass
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)

            # Loss should be averaged in each step
            loss /= accumulation_steps

            # Backward pass
            if IS_AMP_AVAILABLE and hasattr(self.optimizer, "_amp_stash"):
                # For minor performance optimization, see also:
                # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations
                delay_unscale = ((i + 1) % accumulation_steps) != 0

                with amp.scale_loss(
                    loss, self.optimizer, delay_unscale=delay_unscale
                ) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if total_loss is None:
                total_loss = loss
            else:
                total_loss += loss

        self.optimizer.step()

        return total_loss.item() 
Example #27
Source File: transformer_main.py    From Count-Sketch-Optimizers with Apache License 2.0
def train(train_step):
    train_loader = train_corpus.batch_generator()

    start_time = time.time()
    for batch, item in enumerate(train_loader):
        net.train()
        data, targets, word_cnt, batch_len = get_batch(item)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        optimizer.zero_grad()

        # Network
        logits, new_targets = net(data, targets)
        loss = F.cross_entropy(logits.view(-1, nsampled+1), new_targets)

        # AMP
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.clip)

        optimizer.step()
        scheduler.step(train_step)
        train_step += 1

        interval = 125
        if batch % interval == 0:
            elapsed = time.time() - start_time
            print('Epoch: {:3d} | {:5d}/{:5d} batches | lr {:.6f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                  .format(epoch, batch, batch_len, scheduler.get_lr()[0], elapsed * 1000 / interval, loss.item(), math.exp(loss.item())))
            start_time = time.time()
            sys.stdout.flush()
    return train_step

Example #28
Source File: distiller.py    From exbert with Apache License 2.0
def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
        Also update the metrics for tensorboard.
        """
        # Check for NaN
        if (loss != loss).data.any():
            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
            loss = loss.mean()
        if self.params.gradient_accumulation_steps > 1:
            loss = loss / self.params.gradient_accumulation_steps

        if self.fp16:
            from apex import amp

            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step() 
Example #29
Source File: base_model.py    From TractSeg with Apache License 2.0
def train(self, X, y, weight_factor=None):
        X = X.contiguous().cuda(non_blocking=True)  # (bs, features, x, y)
        y = y.contiguous().cuda(non_blocking=True)  # (bs, classes, x, y)

        self.net.train()
        self.optimizer.zero_grad()
        outputs = self.net(X)  # (bs, classes, x, y)
        angle_err = None

        if weight_factor is not None:
            if len(y.shape) == 4:  # 2D
                weights = torch.ones((self.Config.BATCH_SIZE, self.Config.NR_OF_CLASSES,
                                      y.shape[2], y.shape[3])).cuda()
            else:  # 3D
                weights = torch.ones((self.Config.BATCH_SIZE, self.Config.NR_OF_CLASSES,
                                      y.shape[2], y.shape[3], y.shape[4])).cuda()
            bundle_mask = y > 0
            weights[bundle_mask.data] *= weight_factor  # 10

            if self.Config.EXPERIMENT_TYPE == "peak_regression":
                loss, angle_err = self.criterion(outputs, y, weights)
            else:
                loss = nn.BCEWithLogitsLoss(weight=weights)(outputs, y)
        else:
            if self.Config.LOSS_FUNCTION == "soft_sample_dice" or self.Config.LOSS_FUNCTION == "soft_batch_dice":
                loss = self.criterion(F.sigmoid(outputs), y)
                # loss = criterion(F.sigmoid(outputs), y) + nn.BCEWithLogitsLoss()(outputs, y)  # combined loss
            else:
                loss = self.criterion(outputs, y)

        if APEX_AVAILABLE and self.Config.FP16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optimizer.step()

        if self.Config.EXPERIMENT_TYPE == "peak_regression":
            f1 = metric_utils.calc_peak_length_dice_pytorch(self.Config.CLASSES, outputs.detach(), y.detach(),
                                                            max_angle_error=self.Config.PEAK_DICE_THR,
                                                            max_length_error=self.Config.PEAK_DICE_LEN_THR)
        elif self.Config.EXPERIMENT_TYPE == "dm_regression":
            f1 = pytorch_utils.f1_score_macro(y.detach() > self.Config.THRESHOLD, outputs.detach(),
                                              per_class=True, threshold=self.Config.THRESHOLD)
        else:
            f1 = pytorch_utils.f1_score_macro(y.detach(), F.sigmoid(outputs).detach(), per_class=True,
                                              threshold=self.Config.THRESHOLD)

        if self.Config.USE_VISLOGGER:
            probs = F.sigmoid(outputs)
        else:
            probs = None  # faster

        metrics = {}
        metrics["loss"] = loss.item()
        metrics["f1_macro"] = f1
        metrics["angle_err"] = angle_err if angle_err is not None else 0

        return probs, metrics 
Example #30
Source File: benchmark_nvidia_apex.py    From ignite with BSD 3-Clause "New" or "Revised" License
def main(dataset_path, batch_size=256, max_epochs=10, opt="O1"):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    model, optimizer = amp.initialize(model, optimizer, opt_level=opt)

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        y_pred = model(x)
        loss = criterion(y_pred, y)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer.step()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print("\t{} {}: {:.2f}".format(title, name, engine.state.metrics[name]))

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print("- Mean elapsed time for 1 epoch: {}".format(timer.value()))
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)