Python apex.amp() Examples
The following are 13 code examples of apex.amp(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the apex module.
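Before diving into the examples, it may help to see the pattern most of them share. The sketch below is a minimal illustration of the usual apex.amp flow (amp.initialize followed by amp.scale_loss) and is not taken from any of the projects listed here; the model, optimizer, and data are toy placeholders, and it assumes apex and a CUDA device are available.

# A minimal sketch of the usual apex.amp training flow (toy names, not from the examples below).
import torch
from apex import amp

model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Patch the model and optimizer for mixed precision ("O1" casts ops to fp16 where safe).
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

for _ in range(10):
    inputs = torch.randn(4, 10, device="cuda")
    targets = torch.randint(0, 2, (4,), device="cuda")
    optimizer.zero_grad()
    loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    # Scale the loss so fp16 gradients don't underflow, then backprop through the scaled value.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

The examples that follow wrap this same pattern in project-specific training loops.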
Example #1
Source File: argus_models.py From argus-freesound with MIT License
def __init__(self, params):
    super().__init__(params)
    if 'aux' in params:
        self.aux_weights = params['aux']['weights']
    else:
        self.aux_weights = None
    self.use_amp = not config.kernel and 'amp' in params
    if self.use_amp:
        from apex import amp
        self.amp = amp
        self.nn_module, self.optimizer = self.amp.initialize(
            self.nn_module, self.optimizer,
            opt_level=params['amp']['opt_level'],
            keep_batchnorm_fp32=params['amp']['keep_batchnorm_fp32'],
            loss_scale=params['amp']['loss_scale']
        )
Example #2
Source File: distrib_parts.py From pytorch-lightning with Apache License 2.0
def single_gpu_train(self, model):
    # call setup
    self.setup('fit')
    if self.is_function_implemented('setup', model):
        model.setup('fit')

    model.cuda(self.root_gpu)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # TODO: remove with dropping NVIDIA AMP support
    if self.use_amp and not NATIVE_AMP_AVALAIBLE:
        # An example
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers
        self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

    self.run_pretrain_routine(model)
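The model.configure_apex(...) call above is a hook on the LightningModule that users can override. As a rough illustration only (the class name is hypothetical and the hook body simply mirrors the call signature used above, rather than quoting the library), an override might look like this:

# Hypothetical LightningModule with a configure_apex override; assumes an older
# pytorch-lightning release in which this hook is still called for NVIDIA apex AMP.
import pytorch_lightning as pl


class MyModule(pl.LightningModule):
    def configure_apex(self, amp, model, optimizers, amp_level):
        # `amp` is the apex.amp module handed in by the trainer.
        model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)
        return model, optimizers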
Example #3
Source File: argus_models.py From argus-freesound with MIT License
def train_step(self, batch) -> dict:
    if not self.nn_module.training:
        self.nn_module.train()
    self.optimizer.zero_grad()
    input, target, noisy = self.prepare_batch(batch, self.device)
    prediction = self.nn_module(input)
    if self.aux_weights is not None:
        loss = 0
        for pred, weight in zip(prediction, self.aux_weights):
            loss += self.loss(pred, target, noisy) * weight
    else:
        loss = self.loss(prediction, target, noisy)
    if self.use_amp:
        with self.amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    self.optimizer.step()
    prediction = deep_detach(prediction)
    target = deep_detach(target)
    return {
        'prediction': self.prediction_transform(prediction[0]),
        'target': target,
        'loss': loss.item(),
        'noisy': noisy
    }
Example #4
Source File: classification_task.py From ClassyVision with MIT License
def set_amp_args(self, amp_args: Optional[Dict[str, Any]]):
    """Disable / enable apex.amp and set the automatic mixed precision parameters.

    apex.amp can be utilized for mixed / half precision training.

    Args:
        amp_args: Dictionary containing arguments to be passed to
            amp.initialize. Set to None to disable amp. To enable mixed
            precision training, pass amp_args={"opt_level": "O1"} here.
            See https://nvidia.github.io/apex/amp.html for more info.

    Raises:
        RuntimeError: If opt_level is not None and apex is not installed.

    Warning: apex needs to be installed to utilize this feature.
    """
    self.amp_args = amp_args

    if amp_args is None:
        logging.info(f"AMP disabled")
    else:
        if not apex_available:
            raise RuntimeError("apex is not installed, cannot enable amp")

        logging.info(f"AMP enabled with args {amp_args}")
    return self
Example #5
Source File: classification_task.py From ClassyVision with MIT License
def get_classy_state(self, deep_copy: bool = False):
    """Returns serializable state of task

    Args:
        deep_copy: If true, does a deep copy of state before returning.
    """
    optimizer_state = {}
    if self.optimizer is not None:
        optimizer_state = self.optimizer.get_classy_state()

    classy_state_dict = {
        "train": self.train,
        "base_model": self.base_model.get_classy_state(),
        "meters": [meter.get_classy_state() for meter in self.meters],
        "optimizer": optimizer_state,
        "phase_idx": self.phase_idx,
        "train_phase_idx": self.train_phase_idx,
        "num_updates": self.num_updates,
        "losses": self.losses,
        "hooks": {hook.name(): hook.get_classy_state() for hook in self.hooks},
        "loss": {},
    }
    if "train" in self.datasets and self._is_checkpointable_dataset(
        self.datasets["train"]
    ):
        classy_state_dict["train_dataset_iterator"] = self.datasets[
            "train"
        ].get_classy_state()

    if isinstance(self.loss, ClassyLoss):
        classy_state_dict["loss"] = self.loss.get_classy_state()
    if self.amp_args is not None:
        classy_state_dict["amp"] = apex.amp.state_dict()

    if deep_copy:
        classy_state_dict = copy.deepcopy(classy_state_dict)
    return classy_state_dict
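Example #5 stores apex.amp.state_dict() alongside the rest of the task state so the loss scaler survives checkpointing. Outside ClassyVision, the same idea looks roughly like the sketch below; the model, optimizer, and file name are toy placeholders, and it assumes apex is installed and that amp.initialize has been called before both saving and loading.

# Minimal checkpoint sketch that also saves/restores apex.amp state (toy names).
import torch
from apex import amp

model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# Save model, optimizer, and amp (loss-scaler) state together.
torch.save({
    "model": model.state_dict(),
    "optimizer": optimizer.state_dict(),
    "amp": amp.state_dict(),
}, "checkpoint.pt")

# Restore: amp.initialize() must have been called on the fresh model/optimizer first.
checkpoint = torch.load("checkpoint.pt")
model.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
amp.load_state_dict(checkpoint["amp"])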
Example #6
Source File: utils.py From fastNLP with Apache License 2.0
def _check_fp16():
    if amp is None:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    if not torch.backends.cudnn.enabled:
        raise RuntimeError("Amp requires cudnn backend to be enabled.")
Example #7
Source File: infer.py From NLP_Toolkit with Apache License 2.0
def evaluate_corpus_bleu(args, early_stopping=True, stop_no=1000):
    args.batch_size = 1
    train_iter, FR, EN, train_length = load_dataloaders(args)
    src_vocab = len(EN.vocab)
    trg_vocab = len(FR.vocab)

    cuda = torch.cuda.is_available()
    if args.fp16:
        from apex import amp
    else:
        amp = None
    net, _, _, _, _, _ = load_model_and_optimizer(args, src_vocab, \
                                                  trg_vocab, cuda, amp=amp)
    net.eval()
    trg_init = FR.vocab.stoi["<sos>"]
    trg_init = Variable(torch.LongTensor([trg_init])).unsqueeze(0)

    logger.info("Evaluating corpus bleu...")
    refs = []; hyps = []
    with torch.no_grad():
        for i, data in tqdm(enumerate(train_iter), total=len(train_iter)):
            trg_input = trg_init
            labels = data.FR[:,1:].contiguous().view(-1)
            src_mask, trg_mask = create_masks(data.EN, trg_input)
            if cuda:
                data.EN = data.EN.cuda(); trg_input = trg_input.cuda(); labels = labels.cuda()
                src_mask = src_mask.cuda(); trg_mask = trg_mask.cuda()
            stepwise_translated_words, final_step_words = net(data.EN, trg_input, src_mask, None,\
                                                              infer=True, trg_vocab_obj=FR)
            refs.append([stepwise_translated_words])  # need to remove <eos> tokens
            hyps.append([FR.vocab.itos[i] for i in labels[:-1]])

            if early_stopping and ((i + 1) % stop_no == 0):
                print(refs); print(hyps)
                break

    score = calculate_bleu(refs, hyps, corpus_level=True)
    print("Corpus bleu score: %.5f" % score)
    return score
Example #8
Source File: classification_task.py From ClassyVision with MIT License
def set_classy_state(self, state):
    """Set task state

    Args:
        state: Dict containing state of a task
    """
    # some settings are different in test only
    self.train = False if self.test_only else state["train"]
    if not self.test_only:
        self.phase_idx = state["phase_idx"]
        self.num_updates = state["num_updates"]
        self.train_phase_idx = state["train_phase_idx"]
        self.losses = state["losses"]

    for meter, meter_state in zip(self.meters, state["meters"]):
        meter.set_classy_state(meter_state)
    self.base_model.set_classy_state(state["base_model"])
    if self.optimizer is not None:
        self.optimizer.set_classy_state(state["optimizer"])
    if state.get("loss") and isinstance(self.loss, ClassyLoss):
        self.loss.set_classy_state(state["loss"])

    if "amp" in state:
        apex.amp.load_state_dict(state["amp"])

    for hook in self.hooks:
        # we still want to be able to run when new hooks are added or old
        # hooks are removed
        if hook.name() in state["hooks"]:
            hook.set_classy_state(state["hooks"][hook.name()])
        else:
            logging.warn(f"No state found for hook: {hook.name()}")

    if "train" in self.datasets and self._is_checkpointable_dataset(
        self.datasets["train"]
    ):
        self.datasets["train"].set_classy_state(state.get("train_dataset_iterator"))

    # TODO (mannatsingh): Figure out how to set the state of the dataloaders
    # Re-build dataloader & re-create iterator.
    self._recreate_data_loader_from_dataset()
    self.create_data_iterator()
    # Set up pytorch module in train vs eval mode, update optimizer.
    self._set_model_train_mode()
Example #9
Source File: classification_task.py From ClassyVision with MIT License
def train_step(self):
    """Train step to be executed in train loop."""

    self.last_batch = None

    # Process next sample
    sample = next(self.get_data_iterator())

    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and"
        + "'target' keys"
    )

    # Copy sample to GPU
    target = sample["target"]
    if self.use_gpu:
        sample = recursive_copy_to_gpu(sample, non_blocking=True)

    if self.mixup_transform is not None:
        sample = self.mixup_transform(sample)

    with torch.enable_grad():
        # Forward pass
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()

        self.losses.append(loss.data.cpu().item() * target.size(0))

        self.update_meters(output, sample)

    # Run backwards pass / update optimizer
    if self.amp_args is not None:
        self.optimizer.zero_grad()
        with apex.amp.scale_loss(
            local_loss, self.optimizer.optimizer
        ) as scaled_loss:
            scaled_loss.backward()
    else:
        self.optimizer.backward(local_loss)

    self.check_inf_nan(loss)

    self.optimizer.update_schedule_on_step(self.where)
    self.optimizer.step()

    self.num_updates += self.get_global_batchsize()

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(
        loss=loss, output=output, target=target, sample=sample
    )
Example #10
Source File: infer.py From NLP_Toolkit with Apache License 2.0
def __init__(self, args=None):
    if args is None:
        self.args = load_pickle("args.pkl")
    else:
        self.args = args
    self.cuda = torch.cuda.is_available()
    self.args.batch_size = 1

    if self.args.model_no != 1:
        logger.info("Loading tokenizer and model...")
        self.tokenizer_en = tokener(args.src_lang)
        train_iter, FR, EN, train_length = load_dataloaders(self.args)
        self.FR = FR
        self.EN = EN
        self.train_iter = train_iter
        self.train_length = train_length
        self.src_vocab = len(EN.vocab)
        self.trg_vocab = len(FR.vocab)

        if self.args.fp16:
            from apex import amp
        else:
            amp = None
        self.amp = amp
        net, _, _, _, _, _ = load_model_and_optimizer(self.args, self.src_vocab, \
                                                      self.trg_vocab, self.cuda, amp=amp)
        self.net = net
        self.net.eval()
        trg_init = FR.vocab.stoi["<sos>"]
        self.trg_init = Variable(torch.LongTensor([trg_init])).unsqueeze(0)

    elif self.args.model_no == 1:
        from .mass.interactive import Translator
        src, tgt = "zh-en".split('-')
        logger.info("Loading translator, tokenizer...")
        self.translator = Translator(data_path='./data/data-bin/processed_data_%s_%s' % (src, tgt),\
                                     checkpoint_path="./data/checkpoints/%s_%s/checkpoint50.pt" % (src, tgt),\
                                     task='translation',\
                                     user_dir='',\
                                     s=src, t=tgt,\
                                     langs='%s,%s' % (src, tgt),\
                                     mt_steps='%s-%s' % (src, tgt),\
                                     source_langs=src,\
                                     target_langs=tgt,\
                                     beam=5,\
                                     use_cuda=args.cuda)
Example #11
Source File: training_loop.py From pytorch-lightning with Apache License 2.0
def call_optimizer_step(self, optimizer, opt_idx, batch_idx, split_batch):
    # calls .step(), .zero_grad()
    # override function to modify this behavior
    model = self.get_model()

    with self.profiler.profile('optimizer_step'):
        lambda_closure = lambda: self.optimizer_closure(
            split_batch,
            batch_idx,
            opt_idx,
            optimizer,
            self.hiddens
        ).loss

        # apply TPU optimizer
        if self.use_tpu and XLA_AVAILABLE:
            model.optimizer_step(self.current_epoch, batch_idx,
                                 optimizer, opt_idx, lambda_closure, on_tpu=True)

        # for LBFGS do something a bit different
        elif isinstance(optimizer, torch.optim.LBFGS):

            # native amp + lbfgs is a no go right now
            if self.use_amp and NATIVE_AMP_AVALAIBLE:
                raise MisconfigurationException(
                    'native PyTorch amp and lbfgs are not compatible.'
                    ' To request, please file a Github issue in PyTorch and tag @mcarilli')
            model.optimizer_step(self.current_epoch, batch_idx, optimizer, opt_idx, lambda_closure,
                                 using_lbfgs=True)

        # when using 16-bit
        else:
            native_amp = self.use_amp and NATIVE_AMP_AVALAIBLE
            model.optimizer_step(self.current_epoch, batch_idx, optimizer, opt_idx, lambda_closure,
                                 using_native_amp=native_amp)

        # in native 16-bit we need to update scaler after optimizer step
        if self.use_amp and NATIVE_AMP_AVALAIBLE:
            self.scaler.update()

        # model hook
        model.on_before_zero_grad(optimizer)

        # clear gradients
        model.optimizer_zero_grad(self.current_epoch, batch_idx, optimizer, opt_idx)
Example #12
Source File: distrib_parts.py From pytorch-lightning with Apache License 2.0
def dp_train(self, model):
    # call setup after the ddp process has connected
    self.setup('fit')
    if self.is_function_implemented('setup', model):
        model.setup('fit')

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    model.cuda(self.root_gpu)

    # hack forward to do autocast for the user
    model_autocast_original_forward = model.forward
    if self.use_amp and NATIVE_AMP_AVALAIBLE:
        # wrap the user's forward in autocast and give it back at the end
        model.forward = torch.cuda.amp.autocast()(model.forward)

    # TODO: remove with dropping NVIDIA AMP support
    # check for this bug (amp + dp + !01 doesn't work)
    # https://github.com/NVIDIA/apex/issues/227
    if self.use_dp and self.use_amp and not NATIVE_AMP_AVALAIBLE:
        if self.amp_level == 'O2':
            raise MisconfigurationException(
                f'Amp level {self.amp_level} with DataParallel is not supported.'
                f' See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.'
                f' We recommend you switch to ddp if you want to use amp')
        else:
            model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
            self.reinit_scheduler_properties(optimizers, self.lr_schedulers)

    # create list of device ids
    device_ids = self.data_parallel_device_ids
    if isinstance(device_ids, int):
        device_ids = list(range(device_ids))

    # set dp device
    torch.cuda.set_device(self.root_gpu)

    model = LightningDataParallel(model, device_ids=device_ids)

    self.run_pretrain_routine(model)

    model.forward = model_autocast_original_forward
Example #13
Source File: distrib_parts.py From pytorch-lightning with Apache License 2.0
def horovod_train(self, model):
    # call setup after the ddp process has connected
    self.setup('fit')
    if self.is_function_implemented('setup', model):
        model.setup('fit')

    if torch.cuda.is_available() and self.on_gpu:
        # Horovod: pin GPU to local rank
        assert self.root_gpu == hvd.local_rank()
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)

    # avoid duplicating progress bar
    if hvd.rank() != 0 and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in self.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    if self.use_amp:
        # An example
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers
        self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.optimizers = [
        hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.optimizers
    ]

    # Update logger rank info from Horovod to avoid race conditions from different ranks
    # creating directories / writing files in the same locations.
    self.global_rank = hvd.rank()
    rank_zero_only.rank = self.global_rank

    with ExitStack() as stack:
        for optimizer in self.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        self.run_pretrain_routine(model)

    # Make sure all workers have finished training before returning to the user
    hvd.join()