Python torch.distributed Examples
The following are 30 code examples of the torch.distributed module. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module torch, or try the search function.
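Before diving into the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the module's core workflow: initialize a process group, exchange a tensor with an all-reduce, and tear the group down. It assumes the script is started with one process per rank by a launcher such as torchrun, which sets RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT; the gloo backend is used so the sketch also runs without GPUs.

import torch
import torch.distributed as dist

def main():
    # The launcher (e.g. torchrun) provides RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT.
    dist.init_process_group(backend="gloo", init_method="env://")
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Every process contributes its rank; after all_reduce each process holds the sum.
    t = torch.tensor([float(rank)])
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    print("rank {}/{}: sum of ranks = {}".format(rank, world_size, t.item()))

    dist.destroy_process_group()

if __name__ == "__main__":
    main()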
Example #1
Source File: distributed.py From Parsing-R-CNN with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
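This __init__ recurs nearly verbatim in Examples #3, #5, #7, #8, #11, #15 and #16; it is the copy of torch.utils.data.distributed.DistributedSampler found in many detection and segmentation repositories. For context, here is a sketch of the companion methods such a sampler usually defines, modeled on the upstream DistributedSampler; the exact bodies in Parsing-R-CNN may differ.

def __iter__(self):
    if self.shuffle:
        # deterministically shuffle based on the epoch so all replicas agree
        g = torch.Generator()
        g.manual_seed(self.epoch)
        indices = torch.randperm(len(self.dataset), generator=g).tolist()
    else:
        indices = list(range(len(self.dataset)))

    # pad so the index list divides evenly across replicas
    indices += indices[: (self.total_size - len(indices))]
    assert len(indices) == self.total_size

    # take the strided shard belonging to this rank
    indices = indices[self.rank: self.total_size: self.num_replicas]
    assert len(indices) == self.num_samples
    return iter(indices)

def __len__(self):
    return self.num_samples

def set_epoch(self, epoch):
    # call once per epoch, before iterating, so every epoch reshuffles consistently
    self.epoch = epoch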
Example #2
Source File: distributed_utils.py From crosentgec with GNU General Public License v3.0 | 6 votes |
def distributed_init(args):
    if args.distributed_world_size == 1:
        raise ValueError('Cannot initialize distributed with distributed_world_size=1')

    print('| distributed init (rank {}): {}'.format(
        args.distributed_rank, args.distributed_init_method), flush=True)

    if args.distributed_init_method.startswith('tcp://'):
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            init_method=args.distributed_init_method,
            world_size=args.distributed_world_size,
            rank=args.distributed_rank)
    else:
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            init_method=args.distributed_init_method,
            world_size=args.distributed_world_size)

    args.distributed_rank = torch.distributed.get_rank()
    if not is_master(args):
        suppress_output()

    return args.distributed_rank
Example #3
Source File: distributed.py From R2CNN.pytorch with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #4
Source File: train.py From CornerNet-Lite-Pytorch with BSD 3-Clause "New" or "Revised" License | 6 votes |
def parse_args():
    parser = argparse.ArgumentParser(description="Training Script")
    parser.add_argument("cfg_file", help="config file", type=str)  # config file used for training
    parser.add_argument("--iter", dest="start_iter", help="train at iteration i",
                        default=0, type=int)  # start training from iteration i
    parser.add_argument("--workers", default=4, type=int)
    parser.add_argument("--initialize", action="store_true")
    parser.add_argument("--distributed", action="store_true")  # distributed training
    parser.add_argument("--world-size", default=-1, type=int,
                        help="number of nodes of distributed training")  # number of distributed nodes
    parser.add_argument("--rank", default=0, type=int,
                        help="node rank for distributed training")  # rank of this node
    parser.add_argument("--dist-url", default=None, type=str,
                        help="url used to set up distributed training")
    parser.add_argument("--dist-backend", default="nccl", type=str)

    args = parser.parse_args()
    return args
Example #5
Source File: distributed.py From Clothing-Detection with GNU General Public License v3.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #6
Source File: iterator.py From decaNLP with BSD 3-Clause "New" or "Revised" License | 6 votes |
def init_epoch(self):
    """Set up the batch generator for a new epoch."""
    if not self.distributed:
        if self._restored_from_state:
            self.random_shuffler.random_state = self._random_state_this_epoch
        else:
            self._random_state_this_epoch = self.random_shuffler.random_state

    self.create_batches()

    if not self.distributed:
        if self._restored_from_state:
            self._restored_from_state = False
        else:
            self._iterations_this_epoch = 0
    else:
        self._iterations_this_epoch = 0

    if not self.repeat:
        self.iterations = 0
    self.epoch += 1

    if self.distributed:
        self.random_shuffler.set_epoch(self.epoch)
Example #7
Source File: sampler.py From LEDNet with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #8
Source File: distributed.py From SegmenTron with Apache License 2.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #9
Source File: sampler.py From mars with Apache License 2.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    import torch.distributed as dist

    super().__init__(dataset)
    if num_replicas is None:  # pragma: no cover
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:  # pragma: no cover
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #10
Source File: test.py From DenseMatchingBenchmark with MIT License | 6 votes |
def parse_args():
    parser = argparse.ArgumentParser(description='Test dense matching benchmark')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--checkpoint', help='checkpoint file')
    parser.add_argument('--out_dir', help='output result directory')
    parser.add_argument('--show', type=str, default='False', help='show results in images')
    parser.add_argument('--validate', action='store_true', help='whether to evaluate the result')
    parser.add_argument('--gpus', type=int, default=1,
                        help='number of gpus to use (only applicable to non-distributed training)')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='pytorch',
        help='job launcher'
    )
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    return args
Example #11
Source File: distributed.py From awesome-semantic-segmentation-pytorch with Apache License 2.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #12
Source File: repeat_factor.py From Parsing-R-CNN with MIT License | 6 votes |
def __iter__(self):
    if self.shuffle:
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)
        indices = self._get_epoch_indices(g)
        randperm = torch.randperm(len(indices), generator=g).tolist()
        indices = indices[randperm]
    else:
        g = torch.Generator()
        g.manual_seed(self.epoch)
        indices = self._get_epoch_indices(g)
        # indices = torch.arange(len(self.dataset)).tolist()

    # when balancing, len(indices) may differ from the number of images in the dataset
    self.total_size = len(indices)
    logging_rank('balance sample total_size: {}'.format(self.total_size),
                 distributed=1, local_rank=self.rank)

    # subsample the shard belonging to this rank
    self.num_samples = int(len(indices) / self.num_replicas)
    offset = self.num_samples * self.rank
    indices = indices[offset: offset + self.num_samples]
    assert len(indices) == self.num_samples

    return iter(indices)
Example #13
Source File: mnist.py From sagemaker-python-sdk with Apache License 2.0 | 6 votes |
def _get_train_data_loader(training_dir, is_distributed, batch_size, **kwargs):
    logger.info("Get train data loader")
    dataset = datasets.MNIST(
        training_dir,
        train=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
        download=False,  # True sets a dependency on an external site for our canaries.
    )
    train_sampler = (
        torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None
    )
    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        sampler=train_sampler,
        **kwargs
    )
    return train_sampler, train_loader
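The function returns the sampler together with the loader because, when a DistributedSampler is in use, the training loop is expected to call set_epoch once per epoch so every epoch reshuffles consistently across workers. A small usage sketch under that assumption (the toy dataset and epoch count are illustrative, not taken from the SageMaker example):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

# Assumes torch.distributed has already been initialized (see Examples #2, #20 or #22).
dataset = TensorDataset(torch.arange(100, dtype=torch.float32))
sampler = DistributedSampler(dataset)
loader = DataLoader(dataset, batch_size=10, shuffle=False, sampler=sampler)

for epoch in range(3):
    sampler.set_epoch(epoch)  # reshuffle deterministically for this epoch
    for (batch,) in loader:
        pass  # forward/backward/optimizer step would go here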
Example #14
Source File: distributed.py From sparktorch with MIT License | 6 votes |
def process_generic_model(params: List, iters: int, has_early_stop: bool = False):
    """
    Runs a mock training with zero grads. This is due to a bug where the connection
    gets reset with custom new groups.

    :param params: The params of the model
    :param iters: Iterations.
    """
    # Hopefully this function can go away in newer versions.
    for i in range(iters):
        for p in params:
            z = torch.zeros(p)
            dist.all_reduce(z, op=torch.distributed.ReduceOp.SUM)

        if has_early_stop:
            dist.all_reduce(torch.tensor(0.0), op=torch.distributed.ReduceOp.SUM)
            zeros = torch.zeros(1)
            dist.all_reduce(zeros, op=torch.distributed.ReduceOp.SUM)
            if zeros.item() > 0:
                break
Example #15
Source File: distributed.py From DetNAS with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #16
Source File: distributed.py From Res2Net-maskrcnn with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #17
Source File: trainmyData.py From CornerNet-Lite-Pytorch with BSD 3-Clause "New" or "Revised" License | 6 votes |
def parse_args():
    parser = argparse.ArgumentParser(description="Training Script")
    parser.add_argument("cfg_file", help="config file", type=str)
    parser.add_argument("--iter", dest="start_iter", help="train at iteration i",
                        default=0, type=int)
    parser.add_argument("--workers", default=4, type=int)
    parser.add_argument("--initialize", action="store_true")
    parser.add_argument("--distributed", action="store_true")
    parser.add_argument("--world-size", default=-1, type=int,
                        help="number of nodes of distributed training")
    parser.add_argument("--rank", default=0, type=int,
                        help="node rank for distributed training")
    parser.add_argument("--dist-url", default=None, type=str,
                        help="url used to set up distributed training")
    parser.add_argument("--dist-backend", default="nccl", type=str)

    args = parser.parse_args()
    return args
Example #18
Source File: base_task.py From Doc2EDAG with MIT License | 6 votes |
def _decorate_model(self, parallel_decorate=True):
    self.logging('=' * 20 + 'Decorate Model' + '=' * 20)

    if self.setting.fp16:
        self.model.half()

    self.model.to(self.device)
    self.logging('Set model device to {}'.format(str(self.device)))

    if parallel_decorate:
        if self.in_distributed_mode():
            self.model = para.DistributedDataParallel(self.model,
                                                      device_ids=[self.setting.local_rank],
                                                      output_device=self.setting.local_rank)
            self.logging('Wrap distributed data parallel')
            # self.logging('In Distributed Mode, but do not use DistributedDataParallel Wrapper')
        elif self.n_gpu > 1:
            self.model = para.DataParallel(self.model)
            self.logging('Wrap data parallel')
    else:
        self.logging('Do not wrap parallel layers')
Example #19
Source File: base_task.py From Doc2EDAG with MIT License | 6 votes |
def _init_device(self):
    self.logging('=' * 20 + 'Init Device' + '=' * 20)
    # set device
    if self.setting.local_rank == -1 or self.setting.no_cuda:
        self.device = torch.device("cuda" if torch.cuda.is_available() and not self.setting.no_cuda else "cpu")
        self.n_gpu = torch.cuda.device_count()
    else:
        self.device = torch.device("cuda", self.setting.local_rank)
        self.n_gpu = 1
        if self.setting.fp16:
            self.logging("16-bits training currently not supported in distributed training")
            self.setting.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)

    self.logging("device {} n_gpu {} distributed training {}".format(
        self.device, self.n_gpu, self.in_distributed_mode()))
Example #20
Source File: utils.py From kaggle-kuzushiji-2019 with MIT License | 6 votes |
def init_distributed_mode(args):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
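This helper takes all of its configuration from environment variables, so it is normally paired with a launcher such as torch.distributed.launch or torchrun. A hedged sketch of the variables it expects, set by hand for a single-process job; the final call is left commented because the function hard-codes the NCCL backend and therefore needs a CUDA device:

import os
from types import SimpleNamespace

# Simulate what a launcher would export for rank 0 of a one-process job.
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '1')
os.environ.setdefault('LOCAL_RANK', '0')
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')

args = SimpleNamespace(dist_url='env://')
# init_distributed_mode(args)  # would set args.rank, args.gpu, args.world_size and init the NCCL group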
Example #21
Source File: distributed_utils.py From fairseq with MIT License | 5 votes |
def call_main(args, main, **kwargs):
    if args.distributed_init_method is None:
        infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed main
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            kwargs['start_rank'] = start_rank
            torch.multiprocessing.spawn(
                fn=_distributed_main,
                args=(main, args, kwargs),
                nprocs=torch.cuda.device_count(),
            )
        else:
            _distributed_main(args.device_id, main, args, kwargs)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=_distributed_main,
            args=(main, args, kwargs),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU main
        main(args, **kwargs)
Example #22
Source File: train.py From tn2-wg with BSD 3-Clause "New" or "Revised" License | 5 votes |
def init_distributed(hparams, n_gpus, rank, group_name):
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Set cuda device so everything is done on the right GPU.
    torch.cuda.set_device(rank % torch.cuda.device_count())

    # Initialize distributed communication
    dist.init_process_group(
        backend=hparams.dist_backend, init_method=hparams.dist_url,
        world_size=n_gpus, rank=rank, group_name=group_name)

    print("Done initializing distributed")
Example #23
Source File: train.py From EfficientDet.Pytorch with MIT License | 5 votes |
def main():
    args = parser.parse_args()
    if not os.path.exists(os.path.join(args.save_folder, args.dataset, args.network)):
        os.makedirs(os.path.join(args.save_folder, args.dataset, args.network))

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    os.environ['WORLD_SIZE'] = '2'
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
Example #24
Source File: distributed_data_parallel.py From torchbearer with MIT License | 5 votes |
def worker():
    setup()
    print("Rank and node: {}-{}".format(args.rank, platform.node()))

    model = ToyModel().to('cpu')
    ddp_model = DDP(model)

    kwargs = {}
    ds = datasets.MNIST('./data/mnist/', train=True, download=True,
                        transform=transforms.Compose([
                            transforms.ToTensor(),
                            transforms.Normalize((0.1307,), (0.3081,))
                        ]))
    train_sampler = torch.utils.data.distributed.DistributedSampler(ds)
    train_loader = torch.utils.data.DataLoader(ds, batch_size=128, sampler=train_sampler, **kwargs)

    test_ds = datasets.MNIST('./data/mnist', train=False,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.1307,), (0.3081,))
                             ]))
    test_sampler = torch.utils.data.distributed.DistributedSampler(test_ds)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=128, sampler=test_sampler, **kwargs)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    trial = torchbearer.Trial(ddp_model, optimizer, loss_fn, metrics=['loss', 'acc'],
                              callbacks=[sync, grad, flatten])
    trial.with_train_generator(train_loader)
    trial.run(10, verbose=2)

    print("Model hash: {}".format(hash(model)))
    print('First parameter: {}'.format(next(model.parameters())))

    cleanup()
Example #25
Source File: classification_task.py From ClassyVision with MIT License | 5 votes |
def init_distributed_data_parallel_model(self):
    """
    Initialize
    `torch.nn.parallel.distributed.DistributedDataParallel <https://pytorch.org/
    docs/stable/nn.html#distributeddataparallel>`_.

    Needed for distributed training. This is where a model should be wrapped by DDP.
    """
    if not is_distributed_training_run():
        return
    assert (
        self.distributed_model is None
    ), "init_ddp_non_elastic must only be called once"

    broadcast_buffers = (
        self.broadcast_buffers_mode == BroadcastBuffersMode.FORWARD_PASS
    )
    self.distributed_model = init_distributed_data_parallel_model(
        self.base_model,
        broadcast_buffers=broadcast_buffers,
        find_unused_parameters=self.find_unused_parameters,
    )
    if isinstance(self.loss, ClassyLoss) and self.loss.has_learned_parameters():
        logging.info("Initializing distributed loss")
        self.loss = init_distributed_data_parallel_model(
            self.loss,
            broadcast_buffers=broadcast_buffers,
            find_unused_parameters=self.find_unused_parameters,
        )
Example #26
Source File: engine.py From TreeFilter-Torch with MIT License | 5 votes |
def __init__(self, custom_parser=None):
    self.version = __version__
    logger.info(
        "PyTorch Version {}, Furnace Version {}".format(torch.__version__, self.version))
    self.state = State()
    self.devices = None
    self.distributed = False

    if custom_parser is None:
        self.parser = argparse.ArgumentParser()
    else:
        assert isinstance(custom_parser, argparse.ArgumentParser)
        self.parser = custom_parser

    self.inject_default_parser()
    self.args = self.parser.parse_args()

    self.continue_state_object = self.args.continue_fpath

    if 'WORLD_SIZE' in os.environ:
        self.distributed = int(os.environ['WORLD_SIZE']) > 1

    if self.distributed:
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.local_rank = self.args.local_rank
        torch.cuda.set_device(self.local_rank)
        dist.init_process_group(backend="nccl", init_method='env://')
        self.devices = [i for i in range(self.world_size)]
    else:
        self.devices = parse_devices(self.args.devices)
Example #27
Source File: adam.py From fairseq with MIT License | 5 votes |
def average_params(self):
    """Reduce Params is only used during BMUF distributed training."""
    state_dict = self.optimizer.state_dict()
    total_gpus = float(dist.get_world_size())

    for _, value in state_dict["state"].items():
        # dividing by the world size before the SUM all-reduce yields the average across workers
        value["exp_avg"] /= total_gpus
        value["exp_avg_sq"] /= total_gpus
        dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM)
        dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM)
Example #28
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #29
Source File: classy_meter.py From ClassyVision with MIT License | 5 votes |
def sync_state(self) -> None:
    """
    Syncs state with all other meters in distributed training.

    If not provided by child class this does nothing by default
    and meter only provides the local process stats. If implemented
    then the meter provides the global stats at last sync + any
    local updates since the last sync.

    Warning: Calls to sync_state could involve communications via
    :mod:`torch.distributed` which can result in a loss of performance or
    deadlocks if not coordinated among threads.
    """
    pass
Example #30
Source File: translation_train.py From dgl with Apache License 2.0 | 5 votes |
def run(dev_id, args):
    dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
        master_ip=args.master_ip, master_port=args.master_port)
    world_size = args.ngpu
    torch.distributed.init_process_group(backend="nccl",
                                         init_method=dist_init_method,
                                         world_size=world_size,
                                         rank=dev_id)
    gpu_rank = torch.distributed.get_rank()
    assert gpu_rank == dev_id
    main(dev_id, args)
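run() expects to be called once per GPU, with the device index doubling as the distributed rank. A plausible launcher, sketched here rather than copied from the DGL script, spawns one such process per GPU with torch.multiprocessing:

import torch.multiprocessing as mp

def launch(args):
    # mp.spawn passes the process index as the first argument, which run()
    # uses both as the CUDA device id and as the process-group rank
    mp.spawn(run, args=(args,), nprocs=args.ngpu, join=True)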