Python apex.optimizers.FusedAdam() Examples

The following are 12 code examples of apex.optimizers.FusedAdam(), collected from open-source projects. The originating project, source file, and license are listed above each example. You may also want to check out the other available functions and classes of the apex.optimizers module.
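
Before the examples, here is a minimal sketch of how FusedAdam is typically used as a drop-in replacement for torch.optim.Adam. The model, data, and hyperparameters are illustrative placeholders, and keyword support varies somewhat between apex versions, so treat this as a sketch rather than a canonical recipe.

import torch
from apex.optimizers import FusedAdam

# Illustrative model and data; FusedAdam runs its fused kernels on CUDA parameters.
model = torch.nn.Linear(128, 10).cuda()
optimizer = FusedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                      eps=1e-8, weight_decay=0.0)

x = torch.randn(32, 128, device="cuda")
target = torch.randint(0, 10, (32,), device="cuda")
criterion = torch.nn.CrossEntropyLoss()

for _ in range(10):
    optimizer.zero_grad()
    loss = criterion(model(x), target)
    loss.backward()
    optimizer.step()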
Example #1
Source File: fused_adam.py    From fairseq with MIT License
def get_fused_adam_class():
    """
    Look for the FusedAdam optimizer from apex. We first try to load the
    "contrib" interface, which is a bit faster than the main interface,
    but is technically deprecated.
    """
    try:
        # The "deprecated" interface in recent versions of apex is a bit
        # faster than the main interface, since we don't use the apex
        # optimizer. This can be installed by passing the
        # `--deprecated_fused_adam` option when building apex.
        global fused_adam_cuda
        import importlib
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
        return FusedAdamV1
    except ImportError:
        try:
            # fallback to the newer interface
            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
            from apex.multi_tensor_apply import multi_tensor_applier
            if multi_tensor_applier.available:
                return FusedAdamV2
        except ImportError:
            pass
    return None 
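
A hedged usage sketch for the helper above: the caller checks for None (apex missing, or neither interface usable) and falls back to something else. The torch.optim.Adam fallback and the `model` variable are illustrative, not part of the fairseq source.

import torch

fused_adam_cls = get_fused_adam_class()
if fused_adam_cls is not None:
    # FusedAdamV1 (fused_adam_cuda extension) or FusedAdamV2 (apex multi-tensor path)
    optimizer = fused_adam_cls(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8)
else:
    # Illustrative fallback when apex is not available
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8)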
Example #2
Source File: fused_adam.py    From fairseq with MIT License
def __init__(self, params,
                 lr=1e-3, bias_correction=True,
                 betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False,
                 weight_decay=0., max_grad_norm=0., amsgrad=False):
        # Load the fused CUDA kernel extension (built by passing
        # `--deprecated_fused_adam` when building apex).
        global fused_adam_cuda
        import importlib
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")

        if amsgrad:
            raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
        defaults = {
            'lr': lr,
            'bias_correction': bias_correction,
            'betas': betas,
            'eps': eps,
            'weight_decay': weight_decay,
            'max_grad_norm': max_grad_norm,
        }
        super().__init__(params, defaults)
        self.eps_mode = 0 if eps_inside_sqrt else 1 
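
For reference, eps_mode controls where eps enters the Adam denominator inside the fused kernel: mode 0 adds it before the square root, mode 1 after. A rough pure-PyTorch illustration of the difference (bias correction and weight decay omitted; all values are placeholders):

import torch

# Placeholder Adam state for a single parameter tensor.
param = torch.randn(4)
exp_avg = torch.randn(4)        # first-moment estimate
exp_avg_sq = torch.rand(4)      # second-moment estimate
eps, step_size, eps_mode = 1e-8, 1e-3, 0

# eps_mode == 0 (eps_inside_sqrt=True):  denom = sqrt(v + eps)
# eps_mode == 1 (eps_inside_sqrt=False): denom = sqrt(v) + eps
denom = (exp_avg_sq + eps).sqrt() if eps_mode == 0 else exp_avg_sq.sqrt() + eps
param -= step_size * exp_avg / denom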
Example #3
Source File: fused_adam.py    From attn2d with MIT License
def get_fused_adam_class():
    """
    Look for the FusedAdam optimizer from apex. We first try to load the
    "contrib" interface, which is a bit faster than the main interface,
    but is technically deprecated.
    """
    try:
        # The "deprecated" interface in recent versions of apex is a bit
        # faster than the main interface, since we don't use the apex
        # optimizer. This can be installed by passing the
        # `--deprecated_fused_adam` option when building apex.
        global fused_adam_cuda
        import importlib
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
        return FusedAdamV1
    except ImportError:
        try:
            # fallback to the newer interface
            from apex.optimizers import FusedAdam as _FusedAdam  # noqa
            from apex.multi_tensor_apply import multi_tensor_applier
            if multi_tensor_applier.available:
                return FusedAdamV2
        except ImportError:
            pass
    return None 
Example #4
Source File: fused_adam.py    From attn2d with MIT License
def __init__(self, params,
                 lr=1e-3, bias_correction=True,
                 betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False,
                 weight_decay=0., max_grad_norm=0., amsgrad=False):
        # Load the fused CUDA kernel extension (built by passing
        # `--deprecated_fused_adam` when building apex).
        global fused_adam_cuda
        import importlib
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")

        if amsgrad:
            raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
        defaults = {
            'lr': lr,
            'bias_correction': bias_correction,
            'betas': betas,
            'eps': eps,
            'weight_decay': weight_decay,
            'max_grad_norm': max_grad_norm,
        }
        super().__init__(params, defaults)
        self.eps_mode = 0 if eps_inside_sqrt else 1 
Example #5
Source File: checkpointing_test_part2.py    From apex with BSD 3-Clause "New" or "Revised" License
def main(step, args, model_state_dict, optimizer_state_dict):
    #
    # PART2
    #

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask)
    ASP.init_optimizer_for_pruning(optimizer)

    torch.manual_seed(args.seed2)
    model.load_state_dict(model_state_dict)
    optimizer.load_state_dict(optimizer_state_dict)

    print("Model sparsity is %s" % ("enabled" if ASP.sparsity_is_enabled() else "disabled"))

    # train for a few steps with sparse weights
    print("SPARSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 
Example #6
Source File: model_setup.py    From bert_on_stilts with Apache License 2.0
def create_optimizer(model, learning_rate, t_total, loss_scale, fp16, warmup_proportion, state_dict):
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight',
        'adapter.down_project.weight', 'adapter.up_project.weight',
    ]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=t_total)

    if state_dict is not None:
        optimizer.load_state_dict(state_dict)
    return optimizer 
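
For context, BertAdam is presumably imported elsewhere in the original module (it is the pure-PyTorch optimizer from the BERT codebase this project builds on), and FP16_Optimizer is the old apex wrapper that, as far as I can tell, is no longer shipped under apex.optimizers in recent releases. Below is a sketch of how the returned optimizer is typically driven in the fp16 branch, where the backward pass goes through the wrapper so it can apply loss scaling; the data loader, loss function, and hyperparameters are placeholders, not part of the original file.

optimizer = create_optimizer(model, learning_rate=2e-5, t_total=1000,
                             loss_scale=0, fp16=True, warmup_proportion=0.1,
                             state_dict=None)   # illustrative arguments

for batch in dataloader:              # `dataloader` and `compute_loss` are placeholders
    optimizer.zero_grad()
    loss = compute_loss(model, batch)
    optimizer.backward(loss)          # FP16_Optimizer scales the loss internally
    optimizer.step()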
Example #7
Source File: __init__.py    From pytorch-tools with MIT License
def optimizer_from_name(optim_name):
    optim_name = optim_name.lower()
    if optim_name == "sgd":
        return optim.SGD
    elif optim_name == "sgdw":
        return SGDW
    elif optim_name == "adam":
        return partial(optim.Adam, eps=2e-5)
    elif optim_name == "adamw":
        return partial(AdamW_my, eps=2e-5)
    elif optim_name == "adamw_gc":
        # in this implementation eps is inside the sqrt so it can be smaller
        return partial(AdamW_my, center=True, eps=1e-7)
    elif optim_name == "rmsprop":
        # in this implementation eps is inside the sqrt so it can be smaller
        return partial(RMSprop, eps=1e-7)
    elif optim_name == "radam":
        return partial(RAdam, eps=2e-5)
    elif optim_name in ["fused_sgd", "fusedsgd"]:
        return FusedSGD
    elif optim_name in ["fused_adam", "fusedadam"]:
        return partial(FusedAdam, eps=2e-5)
    elif optim_name in ["fused_novograd", "fusednovograd", "novograd"]:
        return partial(FusedNovoGrad, eps=2e-5)
    else:
        raise ValueError(f"Optimizer {optim_name} not found") 
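
An illustrative call pattern for this factory: it returns either an optimizer class or a functools.partial with the project's preferred eps baked in, which is then instantiated with the model parameters. The model and hyperparameters below are placeholders.

opt_class = optimizer_from_name("fused_adam")      # partial(FusedAdam, eps=2e-5)
optimizer = opt_class(model.parameters(), lr=1e-3, weight_decay=1e-2)   # `model` is a placeholder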
Example #8
Source File: toy_problem.py    From apex with BSD 3-Clause "New" or "Revised" License
def main(args):
    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    # only prune linear layers, even though we also support conv1d, conv2d and conv3d
    ASP.init_model_for_pruning(model, "m4n2_1d", whitelist=[torch.nn.Linear], allow_recompute_mask=True)
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    # recompute sparse masks
    ASP.compute_sparse_masks()

    # train for a few steps with sparse weights
    print("SPARSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2)

    # turn off sparsity
    print("SPARSE :: ",one_ll)
    ASP.restore_pruned_weights()

    # train for a few steps with dense weights
    print("DENSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps_2) 
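
The toy problem above drives ASP through its low-level calls so it can toggle sparsity on and off mid-training. For the simpler "prune once, then fine-tune" workflow, newer apex versions also expose a one-call helper; the sketch below is an assumption based on the apex.contrib.sparsity API, not part of these test scripts.

from apex.contrib.sparsity import ASP
from apex.optimizers import FusedAdam

model = build_model(args).cuda()        # placeholder, as in the examples above
optimizer = FusedAdam(model.parameters())

# Roughly equivalent to init_model_for_pruning + init_optimizer_for_pruning
# + compute_sparse_masks with the default "m4n2_1d" (2:4) pattern.
ASP.prune_trained_model(model, optimizer)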
Example #9
Source File: checkpointing_test_part1.py    From apex with BSD 3-Clause "New" or "Revised" License
def main(args):
    #
    # PART1
    #

    torch.manual_seed(args.seed)

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(model, args.pattern, verbosity=args.verbosity, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask)
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.enable_sparsity()

    # train for a few steps with sparse weights
    print("SPARSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    torch.save({
            'step': step,
            'verbosity': args.verbosity,
            'seed2': args.seed2,
            'pattern': args.pattern,
            'whitelist': args.whitelist,
            'allow_recompute_mask': args.allow_recompute_mask,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, args.checkpoint_path) 
Example #10
Source File: checkpointing_test_reference.py    From apex with BSD 3-Clause "New" or "Revised" License
def main(args):
    #
    # PART1
    #

    torch.manual_seed(args.seed)

    model = build_model(args).cuda()
    one_ll = next(model.children()).weight
    optimizer = FusedAdam(model.parameters())
    ASP.init_model_for_pruning(model, args.pattern, whitelist=args.whitelist, allow_recompute_mask=args.allow_recompute_mask)
    ASP.init_optimizer_for_pruning(optimizer)

    step = 0

    # train for a few steps with dense weights
    print("DENSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_dense_steps)

    # simulate sparsity by inserting zeros into existing dense weights
    ASP.enable_sparsity()

    # train for a few steps with sparse weights
    print("SPARSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps)

    #
    # PART 2
    #

    torch.manual_seed(args.seed2)

    # train for a few steps with sparse weights
    print("SPARSE :: ",one_ll)
    step = train_loop(args, model, optimizer, step, args.num_sparse_steps_2) 
Example #11
Source File: imagenet.py    From apex with BSD 3-Clause "New" or "Revised" License
def main():
	args = parseArgs()

	pyprof.nvtx.init()
#	pyprof.nvtx.wrap(fused_adam_cuda, 'adam')

	N = args.b
	C = 3
	H = d[args.m]['H']
	W = d[args.m]['W']
	opts = d[args.m]['opts']
	classes = 1000

	net = getattr(models, args.m)
	net = net(**opts).cuda().half()
	net.train()

	x = torch.rand(N, C, H, W).cuda().half()
	target = torch.empty(N, dtype=torch.long).random_(classes).cuda()

	criterion = nn.CrossEntropyLoss().cuda()
	if (args.o == "sgd"):
		optimizer = torch.optim.SGD(net.parameters(), lr = 0.01, momentum=0.9)
	elif (args.o == "adam"):
		optimizer = FusedAdam(net.parameters())
	else:
		assert False

	# Warm up without profiler
	for i in range(2):
		output = net(x)
		loss = criterion(output, target)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

	with torch.autograd.profiler.emit_nvtx():
		profiler.start()
		output = net(x)
		loss = criterion(output, target)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()
		profiler.stop() 
Example #12
Source File: runner.py    From Self-Supervised-Speech-Pretraining-and-Representation-Learning with MIT License
def set_model(self):
        print('[Runner] - Initializing Transformer model...')
        
        # build the Transformer model with speech prediction head
        model_config = TransformerConfig(self.config)
        self.dr = model_config.downsample_rate
        self.hidden_size = model_config.hidden_size
        
        self.model = TransformerForMaskedAcousticModel(model_config, self.input_dim, self.output_dim).to(self.device)
        self.model.train()

        if self.args.multi_gpu:
            self.model = torch.nn.DataParallel(self.model)
            print('[Runner] - Multi-GPU training Enabled: ' + str(torch.cuda.device_count()))
        print('[Runner] - Number of parameters: ' + str(sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

        # Setup optimizer
        param_optimizer = list(self.model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        if self.apex:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                    lr=self.learning_rate,
                                    bias_correction=False,
                                    max_grad_norm=1.0)
            if self.config['optimizer']['loss_scale'] == 0:
                self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(optimizer, static_loss_scale=self.config['optimizer']['loss_scale'])
            self.warmup_linear = WarmupLinearSchedule(warmup=self.warmup_proportion,
                                                      t_total=self.total_steps)
        else:
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=self.learning_rate,
                                      warmup=self.warmup_proportion,
                                      t_total=self.total_steps)
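
This follows the same older apex FP16_Optimizer pattern as Example #6; per step, the apex branch typically calls optimizer.backward(loss) followed by optimizer.step(), versus a plain loss.backward() with BertAdam. If FP16_Optimizer is not available in your apex build, a roughly equivalent mixed-precision setup can be assembled from FusedAdam plus PyTorch's native torch.cuda.amp; the sketch below is an assumption about a modern substitute, not part of the original runner.

import torch
from apex.optimizers import FusedAdam

optimizer = FusedAdam(optimizer_grouped_parameters, lr=3e-4, bias_correction=False)
scaler = torch.cuda.amp.GradScaler()              # dynamic loss scaling

for batch in dataloader:                          # `dataloader` and `compute_loss` are placeholders
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = compute_loss(model, batch)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()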