Python apex.parallel.DistributedDataParallel() Examples
The following are 15 code examples of apex.parallel.DistributedDataParallel().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module apex.parallel, or try the search function.
Example #1
Source File: nnUNetTrainerV2_DDP.py From nnUNet with Apache License 2.0 | 6 votes |
def run_training(self):
    """
    Run the parent training loop with deep supervision switched on.

    If we run with -c then the correct lr has to be set for the first epoch, otherwise the first continued
    epoch would run with self.initial_lr. Deep supervision must also be enabled in the network for training,
    which is why this wrapper exists. The previous value of the network's ``do_ds`` flag is restored afterwards,
    also when training is aborted by an exception.

    :return: whatever ``nnUNetTrainer.run_training`` returns
    """
    # if we dont overwrite epoch then self.epoch+1 is used which is not what we want at the start of the training
    self.maybe_update_lr(self.epoch)

    # the do_ds flag is read from the wrapped module when the network is wrapped in DDP
    net = self.network.module if isinstance(self.network, DDP) else self.network

    previous_do_ds = net.do_ds
    net.do_ds = True
    try:
        return nnUNetTrainer.run_training(self)
    finally:
        # runs on both the normal and the exceptional path, so the flag never stays flipped
        net.do_ds = previous_do_ds
Example #2
Source File: nnUNetTrainerV2_DDP.py From nnUNet with Apache License 2.0 | 6 votes |
def validate(self, do_mirroring: bool = True, use_sliding_window: bool = True, step_size: float = 0.5,
             save_softmax: bool = True, use_gaussian: bool = True, overwrite: bool = True,
             validation_folder_name: str = 'validation_raw', debug: bool = False, all_in_gpu: bool = False,
             force_separate_z: bool = None, interpolation_order: int = 3, interpolation_order_z=0):
    """
    Run the parent validation on local rank 0 with deep supervision switched off.

    All arguments are forwarded unchanged to ``nnUNetTrainer.validate``. The network's ``do_ds`` flag is set to
    False for the duration of the call and restored afterwards, also when validation raises.

    :return: the result of ``nnUNetTrainer.validate`` on local rank 0, None on every other rank
    """
    # only local rank 0 validates; all other ranks fall through without doing anything
    if self.local_rank != 0:
        return None

    # the do_ds flag is read from the wrapped module when the network is wrapped in DDP
    net = self.network.module if isinstance(self.network, DDP) else self.network

    previous_do_ds = net.do_ds
    net.do_ds = False
    try:
        return nnUNetTrainer.validate(self, do_mirroring, use_sliding_window, step_size, save_softmax,
                                      use_gaussian, overwrite, validation_folder_name, debug, all_in_gpu,
                                      force_separate_z=force_separate_z,
                                      interpolation_order=interpolation_order,
                                      interpolation_order_z=interpolation_order_z)
    finally:
        # restore the flag even if validation failed
        net.do_ds = previous_do_ds
Example #3
Source File: distributed.py From catalyst with Apache License 2.0 | 6 votes |
def check_ddp_wrapped(model: nn.Module) -> bool:
    """
    Checks whether model is wrapped with DataParallel/DistributedDataParallel.

    Apex's DistributedDataParallel is included in the check only when Apex
    can be imported.

    Args:
        model: the module to inspect

    Returns:
        True if ``model`` is an instance of one of the parallel wrappers
    """
    wrapper_types = [nn.DataParallel, nn.parallel.DistributedDataParallel]

    try:
        from apex.parallel import DistributedDataParallel as apex_DDP
    except ImportError:
        # Apex is not installed, only the torch wrappers are checked
        pass
    else:
        wrapper_types.append(apex_DDP)

    return isinstance(model, tuple(wrapper_types))
Example #4
Source File: py_factory.py From CornerNet-Lite-Pytorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, system_config, model, distributed=False, gpu=None):
    """
    Build the training network (model + loss), wrap it for parallel execution and create the optimizer.

    :param system_config: configuration object; this method reads ``chunk_sizes``, ``opt_algo`` and
        ``learning_rate`` from it
    :param model: the model to train; its ``loss`` attribute is used as the loss
    :param distributed: if True, convert to sync batchnorm and wrap with Apex DistributedDataParallel,
        otherwise wrap with DataParallel
    :param gpu: device index used for ``torch.cuda.set_device`` and ``.cuda`` in the distributed case
    :raises ValueError: if ``system_config.opt_algo`` is neither "adam" nor "sgd"
    """
    super(NetworkFactory, self).__init__()

    self.system_config = system_config
    self.gpu = gpu

    self.model = DummyModule(model)
    self.loss = model.loss
    self.network = Network(self.model, self.loss)

    if distributed:
        from apex.parallel import DistributedDataParallel, convert_syncbn_model
        torch.cuda.set_device(gpu)
        self.network = self.network.cuda(gpu)
        # batchnorm layers are converted before the network is wrapped
        self.network = convert_syncbn_model(self.network)
        self.network = DistributedDataParallel(self.network)
    else:
        self.network = DataParallel(self.network, chunk_sizes=system_config.chunk_sizes)

    # numel() is the product over all dimensions of a parameter tensor
    total_params = sum(params.numel() for params in self.model.parameters())
    print("\033[0;35m " + "total parameters: {}".format(
        total_params) + "\033[0m")

    # only parameters that require gradients are handed to the optimizer
    if system_config.opt_algo == "adam":
        self.optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.model.parameters())
        )
    elif system_config.opt_algo == "sgd":
        self.optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=system_config.learning_rate,
            momentum=0.9, weight_decay=0.0001
        )
    else:
        raise ValueError("unknown optimizer")
Example #5
Source File: distributed_mixin.py From fastMRI with MIT License | 5 votes |
def distribute_model_object(self, mdl):
    """
    Wrap ``mdl`` in a distributed data parallel wrapper.

    Depending on ``self.args.apex_distributed`` this is either Apex's DistributedDataParallel
    (with ``delay_allreduce=True``) or ``DDP`` bound to the device ``self.args.gpu``.

    :param mdl: the model to wrap
    :return: the wrapped model
    """
    settings = self.args

    if not settings.apex_distributed:
        return DDP(mdl, device_ids=[settings.gpu], output_device=settings.gpu)

    # Apex is only imported when it was actually requested
    from apex.parallel import DistributedDataParallel as ApexDDP
    return ApexDDP(mdl, delay_allreduce=True)
Example #6
Source File: model_setup.py From bert_on_stilts with Apache License 2.0 | 5 votes |
def stage_model(model, fp16, device, local_rank, n_gpu):
    """
    Move ``model`` to ``device`` and wrap it for multi-GPU use if requested.

    :param model: the model to stage
    :param fp16: if truthy, ``model.half()`` is called first
    :param device: target passed to ``model.to``
    :param local_rank: anything other than -1 selects Apex DistributedDataParallel
    :param n_gpu: with ``local_rank == -1``, more than one GPU selects ``torch.nn.DataParallel``
    :return: the (possibly wrapped) model
    :raises ImportError: if distributed wrapping is requested but Apex cannot be imported
    """
    if fp16:
        model.half()
    model.to(device)

    distributed = local_rank != -1
    if distributed:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        return DDP(model)

    if n_gpu > 1:
        return torch.nn.DataParallel(model)

    return model
Example #7
Source File: py_factory.py From CornerNet-Lite with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, system_config, model, distributed=False, gpu=None):
    """
    Build the training network (model + loss), wrap it for parallel execution and create the optimizer.

    :param system_config: configuration object; this method reads ``chunk_sizes``, ``opt_algo`` and
        ``learning_rate`` from it
    :param model: the model to train; its ``loss`` attribute is used as the loss
    :param distributed: if True, convert to sync batchnorm and wrap with Apex DistributedDataParallel,
        otherwise wrap with DataParallel
    :param gpu: device index used for ``torch.cuda.set_device`` and ``.cuda`` in the distributed case
    :raises ValueError: if ``system_config.opt_algo`` is neither "adam" nor "sgd"
    """
    super(NetworkFactory, self).__init__()

    self.system_config = system_config
    self.gpu = gpu

    self.model = DummyModule(model)
    self.loss = model.loss
    self.network = Network(self.model, self.loss)

    if distributed:
        from apex.parallel import DistributedDataParallel, convert_syncbn_model
        torch.cuda.set_device(gpu)
        self.network = self.network.cuda(gpu)
        # batchnorm layers are converted before the network is wrapped
        self.network = convert_syncbn_model(self.network)
        self.network = DistributedDataParallel(self.network)
    else:
        self.network = DataParallel(self.network, chunk_sizes=system_config.chunk_sizes)

    # numel() is the product over all dimensions of a parameter tensor
    total_params = sum(params.numel() for params in self.model.parameters())
    print("total parameters: {}".format(total_params))

    # only parameters that require gradients are handed to the optimizer
    if system_config.opt_algo == "adam":
        self.optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.model.parameters())
        )
    elif system_config.opt_algo == "sgd":
        self.optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=system_config.learning_rate,
            momentum=0.9, weight_decay=0.0001
        )
    else:
        raise ValueError("unknown optimizer")
Example #8
Source File: train.py From pkuseg with Apache License 2.0 | 5 votes |
def main(cfgs):
    """
    Set up logging, model, data and the process group from ``cfgs`` and start training.

    :param cfgs: configuration mapping; this function reads the keys 'logger', 'local_rank', 'log_dir',
        'pth_dir', 'network', 'criterion', 'optimizer', 'scheduler', 'dataset', 'transforms', 'sampler',
        'loader', 'frequency' and the optional key 'auxiliary'

    The world size is read from the ``WORLD_SIZE`` environment variable.
    """
    Logger.init(**cfgs['logger'])

    rank = cfgs['local_rank']
    n_procs = int(os.environ['WORLD_SIZE'])
    Log.info('rank: {}, world_size: {}'.format(rank, n_procs))

    log_dir, pth_dir = cfgs['log_dir'], cfgs['pth_dir']
    if rank == 0:
        # only rank 0 makes sure the output directories exist
        for directory in (log_dir, pth_dir):
            assure_dir(directory)

    aux_config = cfgs.get('auxiliary', None)
    # NOTE(review): .cuda() runs before torch.cuda.set_device(rank) below - confirm the intended device
    network = ModuleBuilder(cfgs['network'], aux_config).cuda()
    criterion = build_criterion(cfgs['criterion'], aux_config).cuda()
    optimizer = optim.SGD(network.parameters(), **cfgs['optimizer'])
    scheduler = PolyLRScheduler(optimizer, **cfgs['scheduler'])

    dataset = build_dataset(**cfgs['dataset'], **cfgs['transforms'])
    sampler = DistributedSampler4Iter(dataset, world_size=n_procs, rank=rank, **cfgs['sampler'])
    train_loader = DataLoader(dataset, sampler=sampler, **cfgs['loader'])

    cudnn.benchmark = True
    # fixed seed for CPU and CUDA random number generators
    seed = 666
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.cuda.set_device(rank)
    dist.init_process_group(backend='nccl', init_method='env://')

    # NOTE(review): sync-batchnorm conversion happens after the DDP wrap and after the optimizer was built;
    # Apex's examples convert first - confirm this order is intended
    model = apex.parallel.convert_syncbn_model(DistributedDataParallel(network))

    torch.cuda.empty_cache()
    train(rank, n_procs, pth_dir, cfgs['frequency'], criterion, train_loader, model, optimizer, scheduler)
Example #9
Source File: flownmt.py From flowseq with Apache License 2.0 | 5 votes |
def init_distributed(self, rank, local_rank):
    """
    Initialise the NCCL process group and wrap ``self.core`` for distributed training.

    Must only be called once: the method asserts that distributed mode is not enabled yet.

    :param rank: rank passed to ``dist.init_process_group``
    :param local_rank: device index passed to ``torch.cuda.set_device``
    """
    assert not self.distribured_enabled
    self.distribured_enabled = True

    message = "Initializing Distributed, rank {}, local rank {}".format(rank, local_rank)
    print(message)

    dist.init_process_group(backend='nccl', rank=rank)
    torch.cuda.set_device(local_rank)

    wrapped_core = DistributedDataParallel(self.core)
    self.core = wrapped_core
Example #10
Source File: nnUNetTrainerV2_DDP.py From nnUNet with Apache License 2.0 | 5 votes |
def predict_preprocessed_data_return_seg_and_softmax(self, data: np.ndarray, do_mirroring: bool = True,
                                                     mirror_axes: Tuple[int] = None,
                                                     use_sliding_window: bool = True, step_size: float = 0.5,
                                                     use_gaussian: bool = True, pad_border_mode: str = 'constant',
                                                     pad_kwargs: dict = None, all_in_gpu: bool = True,
                                                     verbose: bool = True) -> Tuple[np.ndarray, np.ndarray]:
    """
    Run ``predict_3D`` of the underlying network on ``data`` with deep supervision switched off.

    :param data: input forwarded to ``predict_3D``
    :param do_mirroring: use mirroring as test time augmentation; requires that training used mirroring
    :param mirror_axes: axes to mirror; defaults to ``self.data_aug_params['mirror_axes']`` when mirroring
    :param use_sliding_window: forwarded to ``predict_3D``
    :param step_size: forwarded to ``predict_3D``
    :param use_gaussian: forwarded to ``predict_3D``
    :param pad_border_mode: forwarded to ``predict_3D``
    :param pad_kwargs: forwarded to ``predict_3D``; defaults to ``{'constant_values': 0}`` for 'constant' mode
    :param all_in_gpu: forwarded to ``predict_3D``
    :param verbose: forwarded to ``predict_3D``
    :return: whatever ``predict_3D`` returns
    :raises AssertionError: if mirroring is requested although training was done without it, or if the network
        has an unexpected type
    """
    if pad_border_mode == 'constant' and pad_kwargs is None:
        pad_kwargs = {'constant_values': 0}

    if do_mirroring and mirror_axes is None:
        mirror_axes = self.data_aug_params['mirror_axes']

    if do_mirroring:
        assert self.data_aug_params["do_mirror"], "Cannot do mirroring as test time augmentation when training " \
                                                  "was done without mirroring"

    assert isinstance(self.network, (SegmentationNetwork, nn.DataParallel, DDP))

    # the do_ds flag is read from the wrapped module when the network is wrapped in DDP
    net = self.network.module if isinstance(self.network, DDP) else self.network

    previous_do_ds = net.do_ds
    net.do_ds = False
    try:
        return net.predict_3D(data, do_mirroring, mirror_axes, use_sliding_window, step_size, self.patch_size,
                              self.regions_class_order, use_gaussian, pad_border_mode, pad_kwargs, all_in_gpu,
                              verbose)
    finally:
        # restore the flag even if prediction failed
        net.do_ds = previous_do_ds
Example #11
Source File: train.py From BiSeNet with MIT License | 5 votes |
def set_model_dist(net):
    """
    Wrap ``net`` in a DistributedDataParallel wrapper.

    When ``has_apex`` is set, ``parallel.DistributedDataParallel`` is used with ``delay_allreduce=True``;
    otherwise ``nn.parallel.DistributedDataParallel`` is bound to the device given by ``dist.get_rank()``.

    :param net: the model to wrap
    :return: the wrapped model
    """
    if not has_apex:
        # NOTE(review): the value of dist.get_rank() is used as the device index - confirm for multi-node runs
        local_rank = dist.get_rank()
        return nn.parallel.DistributedDataParallel(
            net,
            device_ids=[local_rank, ],
            output_device=local_rank)

    return parallel.DistributedDataParallel(net, delay_allreduce=True)
Example #12
Source File: helpers.py From training with Apache License 2.0 | 5 votes |
def model_multi_gpu(model, multi_gpu=False):
    """
    Optionally wrap ``model`` in ``DDP``.

    :param model: the model to wrap
    :param multi_gpu: if truthy, the model is wrapped and a short notice is printed
    :return: the wrapped model, or ``model`` itself when ``multi_gpu`` is falsy
    """
    if not multi_gpu:
        return model

    wrapped = DDP(model)
    print('DDP(model)')
    return wrapped
Example #13
Source File: main.py From sparse_learning with MIT License | 5 votes |
def __init__(self, args, arch, loss, pretrained_weights=None, state=None, cuda=True, fp16=False,
             distributed=False):
    """
    Build the ResNet model and its loss and store both on the instance.

    :param args: not used by this method
    :param arch: indexable; ``arch[0]`` and ``arch[1]`` are passed to ``models.build_resnet``
    :param loss: callable without arguments that creates the criterion
    :param pretrained_weights: optional state dict loaded right after the model is built
    :param state: optional state dict loaded after the model was moved/converted/wrapped
    :param cuda: move model and criterion to the GPU
    :param fp16: convert the model with ``network_to_half``
    :param distributed: wrap the model with ``DDP``
    """
    super(ModelAndLoss, self).__init__()
    self.arch = arch
    self.mask = None

    print("=> creating model '{}'".format(arch))
    net = models.build_resnet(arch[0], arch[1])

    if pretrained_weights is not None:
        print("=> using pre-trained model from a file '{}'".format(arch))
        net.load_state_dict(pretrained_weights)

    # order matters: move to GPU, then convert to half precision, then wrap
    if cuda:
        net = net.cuda()
    if fp16:
        net = network_to_half(net)
    if distributed:
        net = DDP(net)

    if state is not None:
        net.load_state_dict(state)

    # define loss function (criterion)
    criterion = loss()
    if cuda:
        criterion = criterion.cuda()

    # model is assigned before loss, as in the original
    self.model, self.loss = net, criterion
Example #14
Source File: eval_detectron2.py From virtex with MIT License | 4 votes |
def __init__(self, cfg, weights: Union[str, Dict[str, Any]]):
    """
    Build model, optimizer, data loader and scheduler, load weights and set up (distributed) training.

    Args:
        cfg: config object; this method reads ``cfg.SOLVER.MAX_ITER`` and ``cfg.OUTPUT_DIR`` and passes
            ``cfg`` to the ``build_*`` methods.
        weights: either a ``str`` (ImageNet init or resume training) or a state dict (our pretrain init).
    """
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    # There is deliberately no super call: `__init__` of `DefaultTrainer` is re-implemented here because
    # the mixed precision model has to be initialized before it is wrapped to DDP.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)
    scheduler = self.build_lr_scheduler(cfg, optimizer)

    # Weights are loaded before wrapping to DDP because `ApexDDP` has some weird issue with
    # `DetectionCheckpointer`.
    if isinstance(weights, str):
        # A ``str`` means ImageNet init or resume training; continue after the stored iteration.
        resume_checkpointer = DetectionCheckpointer(model, optimizer=optimizer, scheduler=scheduler)
        loaded = resume_checkpointer.resume_or_load(weights, resume=True)
        self.start_iter = loaded.get("iteration", -1) + 1
    elif isinstance(weights, dict):
        # A state dict means our pretrain init.
        DetectionCheckpointer(model)._load_model(weights)

    # Distributed training is enabled only with multiple processes. Apex DDP is used when gradient
    # checkpointing is configured because its `delay_allreduce` functionality helps with it.
    if dist.get_world_size() > 1:
        use_apex = global_cfg.get("GRADIENT_CHECKPOINT", False)
        if use_apex:
            model = ApexDDP(model, delay_allreduce=True)
        else:
            model = nn.parallel.DistributedDataParallel(
                model,
                device_ids=[dist.get_rank()],
                broadcast_buffers=False,
            )

    # Call `__init__` from grandparent class: `SimpleTrainer`.
    SimpleTrainer.__init__(self, model, data_loader, optimizer)

    self.scheduler = scheduler
    self.checkpointer = DetectionCheckpointer(
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.register_hooks(self.build_hooks())
Example #15
Source File: nnUNetTrainerV2_DDP.py From nnUNet with Apache License 2.0 | 4 votes |
def load_checkpoint_ram(self, saved_model, train=True):
    """
    Restore trainer state from a checkpoint that is already in RAM.

    :param saved_model: checkpoint dict; this method reads the keys 'state_dict', 'epoch' and 'plot_stuff' and,
        when ``train`` is set, 'optimizer_state_dict' and 'lr_scheduler_state_dict'
    :param train: forwarded to ``self.initialize`` if the trainer was not initialized yet; also controls whether
        optimizer and lr scheduler state are restored
    :return: None
    """
    if not self.was_initialized:
        self.initialize(train)

    # a set gives constant-time membership tests in the loop below
    curr_state_dict_keys = set(self.network.state_dict().keys())
    new_state_dict = OrderedDict()
    # if state dict comes from nn.DataParallel but we use non-parallel model here then the state dict keys do not
    # match. Use heuristic to make it match: drop the first 7 characters (presumably the 'module.' prefix)
    for k, value in saved_model['state_dict'].items():
        key = k
        if key not in curr_state_dict_keys:
            print("duh")
            key = key[7:]
        new_state_dict[key] = value

    # if we are fp16, then we need to reinitialize the network and the optimizer. Otherwise amp will throw an error
    if self.fp16:
        self.network, self.optimizer, self.lr_scheduler = None, None, None
        self.initialize_network()
        self.initialize_optimizer_and_scheduler()
        # we need to reinitialize DDP here
        self.network = DDP(self.network)

    self.network.load_state_dict(new_state_dict)
    self.epoch = saved_model['epoch']

    if train:
        optimizer_state_dict = saved_model['optimizer_state_dict']
        if optimizer_state_dict is not None:
            self.optimizer.load_state_dict(optimizer_state_dict)

        if self.lr_scheduler is not None and hasattr(self.lr_scheduler, 'load_state_dict') and saved_model[
                'lr_scheduler_state_dict'] is not None:
            self.lr_scheduler.load_state_dict(saved_model['lr_scheduler_state_dict'])

        # False for a missing (None) scheduler, so no extra guard is needed
        if isinstance(self.lr_scheduler, _LRScheduler):
            self.lr_scheduler.step(self.epoch)

    self.all_tr_losses, self.all_val_losses, self.all_val_losses_tr_mode, self.all_val_eval_metrics = saved_model[
        'plot_stuff']

    # after the training is done, the epoch is incremented one more time in my old code. This results in
    # self.epoch = 1001 for old trained models when the epoch is actually 1000. This causes issues because
    # len(self.all_tr_losses) = 1000 and the plot function will fail. We can easily detect and correct that here
    if self.epoch != len(self.all_tr_losses):
        self.print_to_log_file("WARNING in loading checkpoint: self.epoch != len(self.all_tr_losses). This is "
                               "due to an old bug and should only appear when you are loading old models. New "
                               "models should have this fixed! self.epoch is now set to len(self.all_tr_losses)")
        self.epoch = len(self.all_tr_losses)
        self.all_tr_losses = self.all_tr_losses[:self.epoch]
        self.all_val_losses = self.all_val_losses[:self.epoch]
        self.all_val_losses_tr_mode = self.all_val_losses_tr_mode[:self.epoch]
        self.all_val_eval_metrics = self.all_val_eval_metrics[:self.epoch]

    # amp has to be set up again after the network was (re)loaded
    self.amp_initialized = False
    self._maybe_init_amp()