Python apex.amp.scale_loss() Examples

The following are 30 code examples of apex.amp.scale_loss(), collected from open-source projects; the project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the apex.amp module.
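All of the examples below follow the same basic pattern: wrap the model and optimizer with amp.initialize(), then replace loss.backward() with the amp.scale_loss() context manager. Here is a minimal sketch of that pattern, assuming a CUDA-capable GPU; the model, optimizer, and data are placeholders.

import torch
import torch.nn.functional as F
from apex import amp

# Placeholder model, optimizer, and data -- substitute your own.
model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
inputs = torch.randn(8, 10, device="cuda")
targets = torch.randint(0, 2, (8,), device="cuda")

# Wrap the model and optimizer once, before the training loop.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

optimizer.zero_grad()
loss = F.cross_entropy(model(inputs), targets)

# scale_loss() yields a scaled loss; calling backward() on it produces
# gradients that amp unscales before the optimizer step.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()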
Example #1
Source File: test_lr_finder.py    From pytorch-lr-finder with MIT License
def test_gradient_accumulation_with_apex_amp(self, mocker):
        desired_bs, accum_steps = 32, 4
        real_bs = desired_bs // accum_steps
        num_iter = 10
        task = mod_task.XORTask(batch_size=real_bs)

        # Wrap the model and optimizer with `amp.initialize`. Besides, `amp`
        # requires a CUDA GPU, so we have to move the model to the GPU first.
        model, optimizer, device = task.model, task.optimizer, task.device
        model = model.to(device)
        task.model, task.optimizer = amp.initialize(model, optimizer)

        lr_finder = prepare_lr_finder(task)
        spy = mocker.spy(amp, "scale_loss")

        lr_finder.range_test(
            task.train_loader, num_iter=num_iter, accumulation_steps=accum_steps
        )
        assert spy.call_count == accum_steps * num_iter 
Example #2
Source File: test_larc.py    From apex with BSD 3-Clause "New" or "Revised" License
def test_larc_mixed_precision(self):
        for opt_level in ["O0", "O1", "O2", "O3"]:
            model = MyModel(1)

            optimizer = LARC(
                torch.optim.SGD(
                    [{"params": model.parameters(), "lr": 0.25}], momentum=0.125
                )
            )

            model, optimizer = amp.initialize(
                model, optimizer, opt_level=opt_level, verbosity=0
            )

            optimizer.zero_grad()
            loss = model(self.x)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step() 
Example #3
Source File: loss.py    From DeepPrivacy with MIT License
def compute_gradient_penalty(self, real_data, fake_data, condition, landmarks):
        epsilon_shape = [real_data.shape[0]] + [1]*(real_data.dim() - 1)
        epsilon = torch.rand(epsilon_shape)
        epsilon = epsilon.to(fake_data.device, fake_data.dtype)
        real_data = real_data.to(fake_data.dtype)
        x_hat = epsilon * real_data + (1-epsilon) * fake_data.detach()
        x_hat.requires_grad = True
        logits = self.discriminator(x_hat, condition, landmarks)
        logits = logits.sum()
        grad = torch.autograd.grad(
            outputs=logits,
            inputs=x_hat,
            grad_outputs=torch.ones(logits.shape).to(fake_data.dtype).to(fake_data.device),
            create_graph=True
        )[0]
        grad = grad.view(x_hat.shape[0], -1)
        gradient_pen = ((grad.norm(p=2, dim=1) - 1)**2)
        to_backward = gradient_pen.sum() * 10
        with amp.scale_loss(to_backward, self.d_optimizer, loss_id=1) as scaled_loss:
            scaled_loss.backward(retain_graph=True)
        return gradient_pen.detach().mean() 
Example #4
Source File: wrapper.py    From pytorch-tools with MIT License
def _make_step(self):
        data, target = self.state.input
        output = self.state.model(data)
        self.state.output = output
        loss = self.state.criterion(output, target)
        if self.state.is_train:
            with amp.scale_loss(loss / self.accumulate_steps, self.state.optimizer) as scaled_loss:
                scaled_loss.backward()
            if self.gradient_clip_val > 0:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.state.optimizer), self.gradient_clip_val)
            if self.state.step % self.accumulate_steps == 0:
                self.state.optimizer.step()
                self.state.optimizer.zero_grad()
            torch.cuda.synchronize()

        # update metrics
        self.state.loss_meter.update(to_numpy(loss))
        with torch.no_grad():
            for metric, meter in zip(self.state.metrics, self.state.metric_meters):
                meter.update(to_numpy(metric(output, target).squeeze())) 
Example #5
Source File: trainer.py    From reid_baseline_with_syncbn with MIT License
def step(self, batch):
        self.model.train()
        self.optim.zero_grad()
        img, target = batch
        img, target = img.cuda(), target.cuda()
        score, feat = self.model(img)
        loss = self.loss_func(score, feat, target)
        if self.mix_precision:
            with amp.scale_loss(loss, self.optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optim.step()

        acc = (score.max(1)[1] == target).float().mean()

        self.loss_avg.update(loss.cpu().item())
        self.acc_avg.update(acc.cpu().item())
        
        return self.loss_avg.avg, self.acc_avg.avg 
Example #6
Source File: test_lr_finder.py    From pytorch-lr-finder with MIT License
def test_mixed_precision(self, mocker):
        batch_size = 32
        num_iter = 10
        task = mod_task.XORTask(batch_size=batch_size)

        # Wrap the model and optimizer with `amp.initialize`. Besides, `amp`
        # requires a CUDA GPU, so we have to move the model to the GPU first.
        model, optimizer, device = task.model, task.optimizer, task.device
        model = model.to(device)
        task.model, task.optimizer = amp.initialize(model, optimizer)
        assert hasattr(task.optimizer, "_amp_stash")

        lr_finder = prepare_lr_finder(task)
        spy = mocker.spy(amp, "scale_loss")

        lr_finder.range_test(task.train_loader, num_iter=num_iter)
        # NOTE: We did not perform gradient accumulation here, so the call
        # count of `amp.scale_loss` should equal `num_iter`.
        assert spy.call_count == num_iter 
Example #7
Source File: trainer.py    From BERT-for-RRC-ABSA with Apache License 2.0
def _forward(self, args, inputs, labels, masker, model, backprop=True):
        outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
        if backprop:
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            self._post_step(args, outputs)
            return loss
        else:
            return loss 
Example #8
Source File: train_nn.py    From cryodrgn with GNU General Public License v3.0
def train(model, lattice, optim, y, rot, trans=None, ctf_params=None, use_amp=False):
    model.train()
    optim.zero_grad()
    B = y.size(0)
    D = lattice.D
    # reconstruct circle of pixels instead of whole image
    mask = lattice.get_circular_mask(D//2)
    yhat = model(lattice.coords[mask] @ rot).view(B,-1)
    if ctf_params is not None:
        freqs = lattice.freqs2d[mask]
        freqs = freqs.unsqueeze(0).expand(B, *freqs.shape)/ctf_params[:,0].view(B,1,1)
        yhat *= ctf.compute_ctf(freqs, *torch.split(ctf_params[:,1:], 1, 1))
    y = y.view(B,-1)[:, mask]
    if trans is not None:
        y = lattice.translate_ht(y, trans.unsqueeze(1), mask).view(B,-1)
    loss = F.mse_loss(yhat, y)
    if use_amp:
        with amp.scale_loss(loss, optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optim.step()
    return loss.item() 
Example #9
Source File: test_checkpointing.py    From apex with BSD 3-Clause "New" or "Revised" License
def train_step(self, model, optimizer, data, loss_ids):
        optimizer.zero_grad()        

        output = model(data)

        # Call backward for num_losses-1
        for idx in loss_ids:
            loss = output.mean()
            with amp.scale_loss(loss, optimizer, loss_id=idx) as scaled_loss:
                scaled_loss.backward(retain_graph=True)

        optimizer.step()
        return output 
Example #10
Source File: hooks.py    From pytorch-lightning with Apache License 2.0
def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):
        if NATIVE_AMP_AVALAIBLE:
            scaled_loss = self.trainer.scaler.scale(unscaled_loss)
        else:
            scaled_loss = amp.scale_loss(unscaled_loss, optimizer)

        return scaled_loss 
Example #11
Source File: network_trainer.py    From nnUNet with Apache License 2.0
def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
        data_dict = next(data_generator)
        data = data_dict['data']
        target = data_dict['target']

        if not isinstance(data, torch.Tensor):
            data = torch.from_numpy(data).float()
        if not isinstance(target, torch.Tensor):
            target = torch.from_numpy(target).float()

        if torch.cuda.is_available():
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

        self.optimizer.zero_grad()
        output = self.network(data)
        del data
        l = self.loss(output, target)

        if run_online_evaluation:
            self.run_online_evaluation(output, target)

        del target

        if do_backprop:
            if not self.fp16 or amp is None or not torch.cuda.is_available():
                l.backward()
            else:
                with amp.scale_loss(l, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            self.optimizer.step()

        return l.detach().cpu().numpy() 
Example #12
Source File: mixed_precision.py    From amdim-public with MIT License
def backward(loss, optimizer):
    """Calls backward on the loss. If mixed precision is on, will
    scale the loss.
    """
    if is_mixed_precision():
        from apex import amp
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward() 
Example #13
Source File: train_vae.py    From cryodrgn with GNU General Public License v3.0
def train_batch(model, lattice, y, yt, rot, trans, optim, beta, beta_control=None, tilt=None, ctf_params=None, yr=None, use_amp=False):
    optim.zero_grad()
    model.train()
    if trans is not None:
        y, yt = preprocess_input(y, yt, lattice, trans)
    z_mu, z_logvar, z, y_recon, y_recon_tilt, mask = run_batch(model, lattice, y, yt, rot, tilt, ctf_params, yr)
    loss, gen_loss, kld = loss_function(z_mu, z_logvar, y, yt, y_recon, mask, beta, y_recon_tilt, beta_control)
    if use_amp:
        with amp.scale_loss(loss, optim) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optim.step()
    return loss.item(), gen_loss.item(), kld.item() 
Example #14
Source File: horovod_benchmark_apex.py    From ray with Apache License 2.0
def benchmark_step():
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    # Apex
    if args.amp_fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
            optimizer.synchronize()
        with optimizer.skip_synchronize():
            optimizer.step()
    else:
        loss.backward()
        optimizer.step() 
Example #15
Source File: test_checkpointing.py    From apex with BSD 3-Clause "New" or "Revised" License
def test_state_dict(self):
        for opt_level in self.test_opt_levels:
            # Skip O3
            if opt_level == 'O3':
                continue

            model = MyModel().to('cuda')
            optimizer = optim.Adam(model.parameters(), lr=1e-3)
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=opt_level, verbosity=0)

            # Export state_dict and check for Half
            state_dict = model.state_dict()
            for key in state_dict:
                self.assertFalse('Half' in state_dict[key].type())

            # Check if the model is still trainable
            # Create dummy data
            data = torch.randn(10, 3, 4, 4, device='cuda')
            target = torch.randn(10, 6, 4, 4, device='cuda')
            
            # Get initial loss
            optimizer.zero_grad()
            output = model(data)
            loss = F.mse_loss(output, target)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            last_loss = loss.item()

            # train for some epochs
            for epoch in range(10):
                optimizer.zero_grad()
                output = model(data)
                loss = F.mse_loss(output, target)
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
                self.assertTrue(loss.item() < last_loss)
                last_loss = loss.item() 
Example #16
Source File: train_segmentation.py    From ray with Apache License 2.0
def train_batch(self, batch, batch_info):
        image, target = batch
        image, target = image.to(self.device), target.to(self.device)
        output = self.model(image)
        loss = criterion(output, target)
        self.optimizer.zero_grad()
        if self.use_fp16 and amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optimizer.step()
        lr = self.optimizer.param_groups[0]["lr"]
        return {"loss": loss.item(), "lr": lr, "num_samples": len(batch)} 
Example #17
Source File: distiller.py    From DistilKoBERT with Apache License 2.0
def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
        Also update the metrics for tensorboard.
        """
        # Check for NaN
        if (loss != loss).data.any():
            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
            loss = loss.mean()
        if self.params.gradient_accumulation_steps > 1:
            loss = loss / self.params.gradient_accumulation_steps

        if self.fp16:
            from apex import amp

            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step() 
Example #18
Source File: training.py    From tape with BSD 3-Clause "New" or "Revised" License
def backward(self, loss) -> None:
        if not self._delay_accumulation:
            loss = loss / self.gradient_accumulation_steps
        if self.fp16:
            with amp.scale_loss(loss, self.optimizer,
                                delay_overflow_check=self._delay_accumulation) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward() 
Example #19
Source File: nnUNetTrainerV2.py    From nnUNet with Apache License 2.0
def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
        """
        gradient clipping improves training stability

        :param data_generator:
        :param do_backprop:
        :param run_online_evaluation:
        :return:
        """
        data_dict = next(data_generator)
        data = data_dict['data']
        target = data_dict['target']

        data = maybe_to_torch(data)
        target = maybe_to_torch(target)

        if torch.cuda.is_available():
            data = to_cuda(data)
            target = to_cuda(target)

        self.optimizer.zero_grad()

        output = self.network(data)

        del data
        loss = self.loss(output, target)

        if run_online_evaluation:
            self.run_online_evaluation(output, target)
        del target

        if do_backprop:
            if not self.fp16 or amp is None or not torch.cuda.is_available():
                loss.backward()
            else:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            _ = clip_grad_norm_(self.network.parameters(), 12)
            self.optimizer.step()

        return loss.detach().cpu().numpy() 
Example #20
Source File: lm.py    From espnet with Apache License 2.0
def update_core(self):
        """Update the model."""
        # When we pass one iterator and optimizer to StandardUpdater.__init__,
        # they are automatically named 'main'.
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        # Progress the dataset iterator for sentences at each iteration.
        self.model.zero_grad()  # Clear the parameter gradients
        accum = {"loss": 0.0, "nll": 0.0, "count": 0}
        for _ in range(self.accum_grad):
            batch = train_iter.__next__()
            # Concatenate the token IDs to matrices and send them to the device
            # self.converter does this job
            # (it is chainer.dataset.concat_examples by default)
            x, t = concat_examples(batch, device=self.device[0], padding=(0, -100))
            if self.device[0] == -1:
                loss, nll, count = self.model(x, t)
            else:
                # apex does not support torch.nn.DataParallel
                loss, nll, count = data_parallel(self.model, (x, t), self.device)

            # backward
            loss = loss.mean() / self.accum_grad
            if self.use_apex:
                from apex import amp

                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()  # Backprop
            # accumulate stats
            accum["loss"] += float(loss)
            accum["nll"] += float(nll.sum())
            accum["count"] += int(count.sum())

        for k, v in accum.items():
            reporter.report({k: v}, optimizer.target)
        if self.gradclip is not None:
            nn.utils.clip_grad_norm_(self.model.parameters(), self.gradclip)
        optimizer.step()  # Update the parameters
        self.scheduler.step(n_iter=self.iteration) 
Example #21
Source File: train.py    From FARM with Apache License 2.0
def backward_propagate(self, loss, step):
        loss = self.adjust_loss(loss)
        if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
            if self.local_rank in [-1, 0]:
                MlLogger.log_metrics(
                    {"Train_loss_total": float(loss.detach().cpu().numpy())},
                    step=self.global_step,
                )
                if self.log_learning_rate:
                    MlLogger.log_metrics({"learning_rate": self.lr_schedule.get_last_lr()[0]},
                                         step=self.global_step)
        if self.use_amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if step % self.grad_acc_steps == 0:
            if self.max_grad_norm is not None:
                if self.use_amp:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            if self.lr_schedule:
                self.lr_schedule.step()
        return loss 
Example #22
Source File: dist_utils.py    From DenseMatchingBenchmark with MIT License
def after_train_iter(self, runner):
        runner.model.zero_grad()
        runner.optimizer.zero_grad()
        # Note: if mixed precision is not used, this ends up doing nothing.
        # Otherwise, apply loss scaling for the mixed-precision recipe.
        with amp.scale_loss(runner.outputs['loss'], runner.optimizer) as scaled_losses:
            scaled_losses.backward()
        all_reduce_grads(runner.model, self.coalesce, self.bucket_size_mb)
        if self.grad_clip is not None:
            self.clip_grads(runner.model.parameters())
        runner.optimizer.step() 
Example #23
Source File: bases.py    From torchbearer with MIT License
def apex_closure():
    from apex import amp

    def _apex_closure(state):
        # Zero grads
        state[torchbearer.OPTIMIZER].zero_grad()

        _forward_with_exceptions(torchbearer.X, torchbearer.MODEL, torchbearer.Y_PRED, state)

        state[torchbearer.CALLBACK_LIST].on_forward(state)

        # Loss Calculation
        try:
            state[torchbearer.LOSS] = state[torchbearer.CRITERION](state)
        except TypeError:
            loss_function_params = _get_param_list(state[torchbearer.Y_PRED]) + _get_param_list(state[torchbearer.Y_TRUE])
            state[torchbearer.LOSS] = state[torchbearer.CRITERION](*loss_function_params)

        state[torchbearer.CALLBACK_LIST].on_criterion(state)

        # Backwards pass
        with amp.scale_loss(state[torchbearer.LOSS], state[torchbearer.OPTIMIZER]) as scaled_loss:
            scaled_loss.backward(**state[torchbearer.BACKWARD_ARGS])

        state[torchbearer.CALLBACK_LIST].on_backward(state)
    return _apex_closure 
Example #24
Source File: main.py    From kaggle-kuzushiji-2019 with MIT License
def create_supervised_trainer(
        model, optimizer, loss_fn,
        device=None, non_blocking=False,
        prepare_batch=_prepare_batch,
        output_transform=lambda x, y, y_pred, loss: loss.item(),
        accumulation_steps: int = 1,
        fp16: bool = False,
        ):

    def update_fn(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        if fp16:
            from apex import amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output_transform(x, y, y_pred, loss)

    return Engine(update_fn) 
Example #25
Source File: callbacks.py    From kekas with MIT License
def on_batch_end(self, i: int, state: DotDict) -> None:
        if state.core.mode == "train":
            state.core.opt.zero_grad()
            if state.core.use_fp16:
                with amp.scale_loss(state.core.loss, state.core.opt) as scaled_loss:
                    scaled_loss.backward()
            else:
                state.core.loss.backward()
            state.core.opt.step() 
Example #26
Source File: lr_finder.py    From pytorch-lr-finder with MIT License
def _train_batch(self, train_iter, accumulation_steps, non_blocking_transfer=True):
        self.model.train()
        total_loss = None  # for late initialization

        self.optimizer.zero_grad()
        for i in range(accumulation_steps):
            inputs, labels = next(train_iter)
            inputs, labels = self._move_to_device(
                inputs, labels, non_blocking=non_blocking_transfer
            )

            # Forward pass
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)

            # Loss should be averaged in each step
            loss /= accumulation_steps

            # Backward pass
            if IS_AMP_AVAILABLE and hasattr(self.optimizer, "_amp_stash"):
                # For minor performance optimization, see also:
                # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations
                delay_unscale = ((i + 1) % accumulation_steps) != 0

                with amp.scale_loss(
                    loss, self.optimizer, delay_unscale=delay_unscale
                ) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if total_loss is None:
                total_loss = loss
            else:
                total_loss += loss

        self.optimizer.step()

        return total_loss.item() 
Example #27
Source File: transformer_main.py    From Count-Sketch-Optimizers with Apache License 2.0
def train(train_step):
    train_loader = train_corpus.batch_generator()

    start_time = time.time()
    for batch, item in enumerate(train_loader):
        net.train()
        data, targets, word_cnt, batch_len = get_batch(item)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        optimizer.zero_grad()

        # Network
        logits, new_targets = net(data, targets)
        loss = F.cross_entropy(logits.view(-1, nsampled+1), new_targets)

        # AMP
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.clip)

        optimizer.step()
        scheduler.step(train_step)
        train_step += 1

        interval = 125
        if batch % interval == 0:
            elapsed = time.time() - start_time
            print('Epoch: {:3d} | {:5d}/{:5d} batches | lr {:.6f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                  .format(epoch, batch, batch_len, scheduler.get_lr()[0], elapsed * 1000 / interval, loss.item(), math.exp(loss.item())))
            start_time = time.time()
            sys.stdout.flush()
    return train_step

Example #28
Source File: distiller.py    From exbert with Apache License 2.0
def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
        Also update the metrics for tensorboard.
        """
        # Check for NaN
        if (loss != loss).data.any():
            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
            loss = loss.mean()
        if self.params.gradient_accumulation_steps > 1:
            loss = loss / self.params.gradient_accumulation_steps

        if self.fp16:
            from apex import amp

            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step() 
Example #29
Source File: base_model.py    From TractSeg with Apache License 2.0
def train(self, X, y, weight_factor=None):
        X = X.contiguous().cuda(non_blocking=True)  # (bs, features, x, y)
        y = y.contiguous().cuda(non_blocking=True)  # (bs, classes, x, y)

        self.net.train()
        self.optimizer.zero_grad()
        outputs = self.net(X)  # (bs, classes, x, y)
        angle_err = None

        if weight_factor is not None:
            if len(y.shape) == 4:  # 2D
                weights = torch.ones((self.Config.BATCH_SIZE, self.Config.NR_OF_CLASSES,
                                      y.shape[2], y.shape[3])).cuda()
            else:  # 3D
                weights = torch.ones((self.Config.BATCH_SIZE, self.Config.NR_OF_CLASSES,
                                      y.shape[2], y.shape[3], y.shape[4])).cuda()
            bundle_mask = y > 0
            weights[bundle_mask.data] *= weight_factor  # 10

            if self.Config.EXPERIMENT_TYPE == "peak_regression":
                loss, angle_err = self.criterion(outputs, y, weights)
            else:
                loss = nn.BCEWithLogitsLoss(weight=weights)(outputs, y)
        else:
            if self.Config.LOSS_FUNCTION == "soft_sample_dice" or self.Config.LOSS_FUNCTION == "soft_batch_dice":
                loss = self.criterion(F.sigmoid(outputs), y)
                # loss = criterion(F.sigmoid(outputs), y) + nn.BCEWithLogitsLoss()(outputs, y)  # combined loss
            else:
                loss = self.criterion(outputs, y)

        if APEX_AVAILABLE and self.Config.FP16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optimizer.step()

        if self.Config.EXPERIMENT_TYPE == "peak_regression":
            f1 = metric_utils.calc_peak_length_dice_pytorch(self.Config.CLASSES, outputs.detach(), y.detach(),
                                                            max_angle_error=self.Config.PEAK_DICE_THR,
                                                            max_length_error=self.Config.PEAK_DICE_LEN_THR)
        elif self.Config.EXPERIMENT_TYPE == "dm_regression":
            f1 = pytorch_utils.f1_score_macro(y.detach() > self.Config.THRESHOLD, outputs.detach(),
                                              per_class=True, threshold=self.Config.THRESHOLD)
        else:
            f1 = pytorch_utils.f1_score_macro(y.detach(), F.sigmoid(outputs).detach(), per_class=True,
                                              threshold=self.Config.THRESHOLD)

        if self.Config.USE_VISLOGGER:
            probs = F.sigmoid(outputs)
        else:
            probs = None  # faster

        metrics = {}
        metrics["loss"] = loss.item()
        metrics["f1_macro"] = f1
        metrics["angle_err"] = angle_err if angle_err is not None else 0

        return probs, metrics 
Example #30
Source File: benchmark_nvidia_apex.py    From ignite with BSD 3-Clause "New" or "Revised" License
def main(dataset_path, batch_size=256, max_epochs=10, opt="O1"):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    model, optimizer = amp.initialize(model, optimizer, opt_level=opt)

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        y_pred = model(x)
        loss = criterion(y_pred, y)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer.step()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print("\t{} {}: {:.2f}".format(title, name, engine.state.metrics[name]))

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print("- Mean elapsed time for 1 epoch: {}".format(timer.value()))
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)