Python torch.distributed.get_world_size() Examples
The following are 30 code examples of torch.distributed.get_world_size(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.distributed, or try the search function.
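Before diving into the examples, note that torch.distributed.get_world_size() returns the number of processes participating in the default (or an explicitly passed) process group, so it may only be called after the process group has been initialized. The snippet below is a minimal, hypothetical sketch of that setup and is not taken from any project on this page; the gloo backend and the RANK/WORLD_SIZE environment variables (as set by launchers such as torchrun) are assumptions made purely for illustration.

import os
import torch.distributed as dist

def main():
    # Assumes the launcher exported RANK, WORLD_SIZE, MASTER_ADDR and
    # MASTER_PORT; "gloo" is chosen only because it needs no GPU.
    dist.init_process_group(
        backend="gloo",
        rank=int(os.environ["RANK"]),
        world_size=int(os.environ["WORLD_SIZE"]),
    )
    world_size = dist.get_world_size()  # total number of processes in the group
    rank = dist.get_rank()              # index of this process in [0, world_size)
    print("rank {} of {}".format(rank, world_size))
    dist.destroy_process_group()

if __name__ == "__main__":
    main()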
Example #1
Source File: distributed_utils.py From conditional-motion-propagation with MIT License | 8 votes |
def gather_tensors(input_array):
    world_size = dist.get_world_size()
    ## gather shapes first
    myshape = input_array.shape
    mycount = input_array.size
    shape_tensor = torch.Tensor(np.array(myshape)).cuda()
    all_shape = [torch.Tensor(np.array(myshape)).cuda() for i in range(world_size)]
    dist.all_gather(all_shape, shape_tensor)
    ## compute largest shapes
    all_shape = [x.cpu().numpy() for x in all_shape]
    all_count = [int(x.prod()) for x in all_shape]
    all_shape = [list(map(int, x)) for x in all_shape]
    max_count = max(all_count)
    ## padding tensors and gather them
    output_tensors = [torch.Tensor(max_count).cuda() for i in range(world_size)]
    padded_input_array = np.zeros(max_count)
    padded_input_array[:mycount] = input_array.reshape(-1)
    input_tensor = torch.Tensor(padded_input_array).cuda()
    dist.all_gather(output_tensors, input_tensor)
    ## unpadding gathered tensors
    padded_output = [x.cpu().numpy() for x in output_tensors]
    output = [x[:all_count[i]].reshape(all_shape[i]) for i, x in enumerate(padded_output)]
    return output
Example #2
Source File: distributed_operations.py From sagemaker-pytorch-training-toolkit with Apache License 2.0 | 7 votes |
def _gather(rank, rows, columns):
    dest = 0
    tensor = _get_tensor(rank, rows, columns)
    if rank == dest:
        tensors_list = _get_zeros_tensors_list(rows, columns)
        logger.debug('Rank: {},\nTensor BEFORE gather: {}. tensors_list: {}'.format(
            rank, tensor, tensors_list))
        dist.gather(tensor=tensor, gather_list=tensors_list)
        logger.debug('Rank: {},\nTensor AFTER gather: {}. tensors_list: {}\n'.format(
            rank, tensor, tensors_list))
        for i in range(dist.get_world_size()):
            assert torch.equal(tensors_list[i], _get_tensor(i, rows, columns)), \
                'Rank {}: tensors lists are not the same after gather.'.format(rank)
    else:
        logger.debug('Rank: {},\nTensor BEFORE gather: {}\n'.format(rank, tensor))
        dist.gather(tensor=tensor, dst=dest)
        logger.debug('Rank: {},\nTensor AFTER gather: {}\n'.format(rank, tensor))
        # tensor shouldn't have changed
        assert torch.equal(tensor, _get_tensor(rank, rows, columns)), \
            'Rank {}: Tensor got changed after gather.'.format(rank)
Example #3
Source File: batch_norm.py From SegmenTron with Apache License 2.0 | 6 votes |
def forward(self, input):
    if get_world_size() == 1 or not self.training:
        return super().forward(input)

    assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
    C = input.shape[1]
    mean = torch.mean(input, dim=[0, 2, 3])
    meansqr = torch.mean(input * input, dim=[0, 2, 3])

    vec = torch.cat([mean, meansqr], dim=0)
    vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())

    mean, meansqr = torch.split(vec, C)
    var = meansqr - mean * mean
    self.running_mean += self.momentum * (mean.detach() - self.running_mean)
    self.running_var += self.momentum * (var.detach() - self.running_var)

    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    scale = scale.reshape(1, -1, 1, 1)
    bias = bias.reshape(1, -1, 1, 1)
    return input * scale + bias
Example #4
Source File: batch_norm.py From Parsing-R-CNN with MIT License | 6 votes |
def forward(self, input):
    if get_world_size() == 1 or not self.training:
        return super().forward(input)

    assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
    C = input.shape[1]
    mean = torch.mean(input, dim=[0, 2, 3])
    meansqr = torch.mean(input * input, dim=[0, 2, 3])

    vec = torch.cat([mean, meansqr], dim=0)
    vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())

    mean, meansqr = torch.split(vec, C)
    var = meansqr - mean * mean
    self.running_mean += self.momentum * (mean.detach() - self.running_mean)
    self.running_var += self.momentum * (var.detach() - self.running_var)

    invstd = torch.rsqrt(var + self.eps)
    scale = self.weight * invstd
    bias = self.bias - mean * scale
    scale = scale.reshape(1, -1, 1, 1)
    bias = bias.reshape(1, -1, 1, 1)
    return input * scale + bias
Example #5
Source File: distributed.py From R2CNN.pytorch with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #6
Source File: samplers.py From DenseMatchingBenchmark with MIT License | 6 votes |
def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.dataset = dataset
    self.samples_per_gpu = samples_per_gpu
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    assert hasattr(self.dataset, 'flag')
    self.flag = self.dataset.flag
    self.group_sizes = np.bincount(self.flag)

    self.num_samples = 0
    for i, j in enumerate(self.group_sizes):
        self.num_samples += int(
            math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
                      self.num_replicas)) * self.samples_per_gpu
    self.total_size = self.num_samples * self.num_replicas
Example #7
Source File: sampler.py From mars with Apache License 2.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    import torch.distributed as dist
    super().__init__(dataset)
    if num_replicas is None:  # pragma: no cover
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:  # pragma: no cover
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #8
Source File: distributed.py From Parsing-R-CNN with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #9
Source File: distributed.py From SegmenTron with Apache License 2.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #10
Source File: distributed.py From Clothing-Detection with GNU General Public License v3.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #11
Source File: sampler.py From AerialDetection with Apache License 2.0 | 6 votes |
def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.dataset = dataset
    self.samples_per_gpu = samples_per_gpu
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    assert hasattr(self.dataset, 'flag')
    self.flag = self.dataset.flag
    self.group_sizes = np.bincount(self.flag)

    self.num_samples = 0
    for i, j in enumerate(self.group_sizes):
        self.num_samples += int(
            math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
                      self.num_replicas)) * self.samples_per_gpu
    self.total_size = self.num_samples * self.num_replicas
Example #12
Source File: distributed.py From SegmenTron with Apache License 2.0 | 6 votes |
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
Example #13
Source File: distributed.py From Res2Net-maskrcnn with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #14
Source File: parallel.py From LEDNet with MIT License | 6 votes |
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
Example #15
Source File: distributed_utils.py From conditional-motion-propagation with MIT License | 6 votes |
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1):
    if world_size is None:
        world_size = dist.get_world_size()
    if rank is None:
        rank = dist.get_rank()
    assert rank < world_size
    self.dataset = dataset
    self.total_iter = total_iter
    self.batch_size = batch_size
    self.world_size = world_size
    self.rank = rank
    self.last_iter = last_iter

    self.total_size = self.total_iter * self.batch_size

    self.indices = self.gen_new_list()
    self.call = 0
Example #16
Source File: dist_utils.py From mmdetection with Apache License 2.0 | 6 votes |
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce gradients.

    Args:
        params (list[torch.Parameters]): List of parameters of a model
        coalesce (bool, optional): Whether allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size))
Example #17
Source File: sampler.py From LEDNet with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #18
Source File: single_stage_model.py From conditional-motion-propagation with MIT License | 6 votes |
def __init__(self, params, dist_model=False):
    model_params = params['module']
    self.model = models.modules.__dict__[params['module']['arch']](model_params)
    utils.init_weights(self.model, init_type='xavier')
    self.model.cuda()
    if dist_model:
        self.model = utils.DistModule(self.model)
        self.world_size = dist.get_world_size()
    else:
        self.model = models.modules.FixModule(self.model)
        self.world_size = 1

    if params['optim'] == 'SGD':
        self.optim = torch.optim.SGD(
            self.model.parameters(), lr=params['lr'],
            momentum=0.9, weight_decay=0.0001)
    elif params['optim'] == 'Adam':
        self.optim = torch.optim.Adam(
            self.model.parameters(), lr=params['lr'],
            betas=(params['beta1'], 0.999))
    else:
        raise Exception("No such optimizer: {}".format(params['optim']))

    cudnn.benchmark = True
Example #19
Source File: mnist.py From sagemaker-pytorch-training-toolkit with Apache License 2.0 | 5 votes |
def _average_gradients(model):
    # Gradient averaging.
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM)
        param.grad.data /= size
Example #20
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def get_world_size():
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size()
Example #21
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.reduce(values, dst=0)
        if dist.get_rank() == 0 and average:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict
Example #22
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.LongTensor([tensor.numel()]).to("cuda")
    size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
    if local_size != max_size:
        padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list
Example #23
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #24
Source File: distributed_operations.py From sagemaker-pytorch-training-toolkit with Apache License 2.0 | 5 votes |
def _get_tensors_sum(rows, columns):
    device = torch.device(
        "cuda:{}".format(dist.get_rank() % torch.cuda.device_count())
        if torch.cuda.is_available() else "cpu"
    )
    result = (1 + dist.get_world_size()) * dist.get_world_size() / 2
    tensor = torch.ones(rows, columns) * result
    return tensor.to(device)
Example #25
Source File: distributed_operations.py From sagemaker-pytorch-training-toolkit with Apache License 2.0 | 5 votes |
def _get_zeros_tensors_list(rows, columns):
    return [_get_zeros_tensor(rows, columns) for _ in range(dist.get_world_size())]
Example #26
Source File: coco_eval.py From torchbench with Apache License 2.0 | 5 votes |
def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()
Example #27
Source File: dist_utils.py From DenseMatchingBenchmark with MIT License | 5 votes |
def all_reduce_grads(model, coalesce=True, bucket_size_mb=-1):
    grads = [
        param.grad.data for param in model.parameters()
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _all_reduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size))
Example #28
Source File: distributed.py From SegmenTron with Apache License 2.0 | 5 votes |
def get_world_size():
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size()
Example #29
Source File: distributed.py From SegmenTron with Apache License 2.0 | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #30
Source File: distributed.py From SegmenTron with Apache License 2.0 | 5 votes |
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.IntTensor([tensor.numel()]).to("cuda")
    size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
    if local_size != max_size:
        padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list