Python torch.distributed.all_reduce() Examples
The following are 30 code examples of torch.distributed.all_reduce(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.distributed, or try the search function.
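Before the examples, here is a minimal, self-contained sketch of the call itself. It assumes the script is launched with a tool such as torchrun (so rank and world size are available in the environment); the function and tensor names are illustrative only, not taken from any example below.

import torch
import torch.distributed as dist

def demo_all_reduce():
    # torchrun provides MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE for init_process_group.
    dist.init_process_group(backend="gloo")  # use "nccl" for GPU tensors
    t = torch.ones(4) * (dist.get_rank() + 1)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)  # in-place; every rank ends up with the sum
    t /= dist.get_world_size()                # convert the sum into a mean
    print(f"rank {dist.get_rank()}: {t.tolist()}")
    dist.destroy_process_group()

if __name__ == "__main__":
    demo_all_reduce()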
Example #1
Source File: ray_container.py From adeptRL with GNU General Public License v3.0 | 7 votes |
def step(self):
    print(f"learner {self.rank} step")
    # make sure exp_handles are done
    for handle in self.exp_handles:
        handle.wait()
    # batch together exp
    time.sleep(random.randint(0, 3))
    # update with other learners
    dist.barrier(self.learner_group)
    for p in self.network_grads:
        dist.all_reduce(p, group=self.learner_group)
    print(f"learner {self.rank} shared gradients")
    return True
Example #2
Source File: utils.py From kaggle-kuzushiji-2019 with MIT License | 7 votes |
def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all
    processes have the averaged results. Returns a dict with the same
    fields as input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict
Example #3
Source File: dist_utils.py From kaggle-kuzushiji-recognition with MIT License | 6 votes |
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
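In these projects the flatten/bucket helpers (_take_tensors, _flatten_dense_tensors, _unflatten_dense_tensors) are imported from torch._utils. Below is a hedged sketch of how such a helper is typically driven; the sync_gradients wrapper name is hypothetical (the real wrapper, allreduce_grads, appears in Example #17).

from collections import OrderedDict

import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _take_tensors,
                          _unflatten_dense_tensors)

def sync_gradients(model, bucket_size_mb=-1):
    # Collect dense gradients and all-reduce them bucket by bucket.
    grads = [p.grad.data for p in model.parameters()
             if p.requires_grad and p.grad is not None]
    _allreduce_coalesced(grads, dist.get_world_size(), bucket_size_mb)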
Example #4
Source File: dist_utils.py From mmdetection-annotated with Apache License 2.0 | 6 votes |
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #5
Source File: distrib.py From adeptRL with GNU General Public License v3.0 | 6 votes |
def step(self, loss):
    self.optimizer.zero_grad()
    loss.backward()
    dist.barrier()
    handles = []
    for param in self.network.parameters():
        handles.append(dist.all_reduce(param.grad, async_op=True))
    for handle in handles:
        handle.wait()
    if self.divide_grad:
        for param in self.network.parameters():
            param.grad.mul_(1.0 / self.world_sz)
    if self.grad_norm_clip:
        nn.utils.clip_grad_norm_(
            self.network.parameters(), self.grad_norm_clip
        )
    self.optimizer.step()
Example #6
Source File: distributed.py From virtex with MIT License | 6 votes |
def average_across_processes(t: Union[torch.Tensor, Dict[str, torch.Tensor]]):
    r"""
    Averages a tensor, or a dict of tensors across all processes in a process
    group. Objects in all processes will finally have same mean value.

    .. note::

        Nested dicts of tensors are not supported.

    Parameters
    ----------
    t: torch.Tensor or Dict[str, torch.Tensor]
        A tensor or dict of tensors to average across processes.
    """
    if dist.is_initialized():
        if isinstance(t, torch.Tensor):
            dist.all_reduce(t, op=dist.ReduceOp.SUM)
            t /= get_world_size()
        elif isinstance(t, dict):
            for k in t:
                dist.all_reduce(t[k], op=dist.ReduceOp.SUM)
                t[k] /= dist.get_world_size()
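A hypothetical call site for average_across_processes, e.g. averaging scalar validation metrics; the metric names, values, and the device variable are assumptions, not part of the original project.

metrics = {
    "loss": torch.tensor(0.42, device=device),
    "accuracy": torch.tensor(0.91, device=device),
}
average_across_processes(metrics)  # in-place; every rank now holds the mean values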
Example #7
Source File: sync_bn.py From mmcv with Apache License 2.0 | 6 votes |
def backward(self, grad_output):
    norm, std, weight = self.saved_tensors
    grad_weight = torch.empty_like(weight)
    grad_bias = torch.empty_like(weight)
    grad_input = torch.empty_like(grad_output)
    grad_output3d = grad_output.view(
        grad_output.size(0), grad_output.size(1), -1)
    grad_input3d = grad_input.view_as(grad_output3d)
    ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight,
                                      grad_bias)
    # all reduce
    if self.group_size > 1:
        dist.all_reduce(grad_weight, group=self.group)
        dist.all_reduce(grad_bias, group=self.group)
        grad_weight /= self.group_size
        grad_bias /= self.group_size
    ext_module.sync_bn_backward_data(grad_output3d, weight, grad_weight,
                                     grad_bias, norm, std, grad_input3d)
    return grad_input, None, None, grad_weight, grad_bias, \
        None, None, None, None
Example #8
Source File: dist_utils.py From GCNet with Apache License 2.0 | 6 votes |
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #9
Source File: dist_utils.py From DenseMatchingBenchmark with MIT License | 6 votes |
def _all_reduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #10
Source File: dist_utils.py From mmdetection_with_SENet154 with Apache License 2.0 | 6 votes |
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #11
Source File: distributed.py From sparktorch with MIT License | 6 votes |
def process_generic_model(params: List, iters: int, has_early_stop: bool = False):
    """
    Runs a mock training with zero grads. This is due to a bug where the
    connection gets reset with custom new groups.

    :param params: The params of the model
    :param iters: Iterations.
    """
    # Hopefully this function can go away in newer versions.
    for i in range(iters):
        for p in params:
            z = torch.zeros(p)
            dist.all_reduce(z, op=torch.distributed.ReduceOp.SUM)

        if has_early_stop:
            dist.all_reduce(torch.tensor(0.0), op=torch.distributed.ReduceOp.SUM)
            zeros = torch.zeros(1)
            dist.all_reduce(zeros, op=torch.distributed.ReduceOp.SUM)
            if zeros.item() > 0:
                break
Example #12
Source File: dist_utils.py From PolarMask with Apache License 2.0 | 6 votes |
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #13
Source File: sync_switchwhiten.py From Switchable-Whitening with MIT License | 6 votes |
def backward(ctx, grad_mean_out, grad_cov_out):
    in_data, mean_bn = ctx.saved_tensors

    if ctx.training:
        dist.all_reduce(grad_mean_out)
        dist.all_reduce(grad_cov_out)
        world_size = dist.get_world_size()
    else:
        world_size = 1

    grad_cov_out = (grad_cov_out + grad_cov_out.transpose(1, 2)) / 2
    grad_cov_in = 2 * torch.bmm(grad_cov_out, (in_data - mean_bn)) \
        / (ctx.NHW * world_size)  # g x c x (N x H x W)
    grad_mean_in = grad_mean_out / ctx.NHW / world_size
    inDiff = grad_mean_in + grad_cov_in
    return inDiff, None, None, None, None
Example #14
Source File: dist_utils.py From AerialDetection with Apache License 2.0 | 6 votes |
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #15
Source File: dist_utils.py From mmdetection with Apache License 2.0 | 6 votes |
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
Example #16
Source File: distributed_communicator.py From CrypTen with MIT License | 6 votes |
def all_reduce(self, input, op=ReduceOp.SUM, batched=False):
    """Reduces the input data across all parties; all get the final result."""
    assert dist.is_initialized(), "initialize the communicator first"
    if batched:
        assert isinstance(input, list), "batched reduce input must be a list"
        reqs = []
        result = [x.clone() for x in input]
        for tensor in result:
            reqs.append(
                dist.all_reduce(
                    tensor.data, op=op, group=self.main_group, async_op=True
                )
            )
        for req in reqs:
            req.wait()
    else:
        assert torch.is_tensor(
            input.data
        ), "unbatched input for reduce must be a torch tensor"
        result = input.clone()
        dist.all_reduce(result.data, op=op, group=self.main_group)
    return result
Example #17
Source File: dist_utils.py From mmdetection with Apache License 2.0 | 6 votes |
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce gradients.

    Args:
        params (list[torch.Parameters]): List of parameters of a model
        coalesce (bool, optional): Whether allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size))
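A hypothetical training step showing where allreduce_grads fits: gradients are synchronized after backward() and before optimizer.step(). The train_step function, the loss_fn, and the batch keys are placeholders, not part of mmdetection.

def train_step(model, optimizer, loss_fn, batch):
    optimizer.zero_grad()
    loss = loss_fn(model(batch["input"]), batch["target"])
    loss.backward()
    # Average gradients across ranks before the optimizer update.
    allreduce_grads(list(model.parameters()), coalesce=True)
    optimizer.step()
    return loss.detach()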
Example #18
Source File: sync_switchwhiten.py From Switchable-Whitening with MIT License | 5 votes |
def forward(ctx, in_data, running_mean, running_cov, momentum, training):
    g, c, NHW = in_data.size()
    ctx.g = g
    ctx.c = c
    ctx.NHW = NHW
    ctx.training = training

    if training:
        mean_bn = in_data.mean(-1, keepdim=True)  # g x c x 1
        dist.all_reduce(mean_bn)
        mean_bn /= dist.get_world_size()
        in_data_bn = in_data - mean_bn
        cov_bn = torch.bmm(in_data_bn, in_data_bn.transpose(1, 2)).div(NHW)
        dist.all_reduce(cov_bn)
        cov_bn /= dist.get_world_size()

        running_mean.mul_(momentum)
        running_mean.add_((1 - momentum) * mean_bn.data)
        running_cov.mul_(momentum)
        running_cov.add_((1 - momentum) * cov_bn.data)
    else:
        mean_bn = torch.autograd.Variable(running_mean)
        cov_bn = torch.autograd.Variable(running_cov)

    ctx.save_for_backward(in_data.data, mean_bn.data)
    return mean_bn, cov_bn
Example #19
Source File: mnist.py From sagemaker-python-sdk with Apache License 2.0 | 5 votes |
def _average_gradients(model):
    # Gradient averaging.
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
        param.grad.data /= size
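The snippet above uses the legacy dist.reduce_op alias and passes group=0 for the default group. A sketch of the same gradient averaging written against the current torch.distributed API (the function name and the assumption of a recent PyTorch release are mine):

def average_gradients_current(model):
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size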
Example #20
Source File: distributed_utils.py From Switchable-Whitening with MIT License | 5 votes |
def average_gradients(model):
    """ average gradients """
    for param in model.parameters():
        if param.requires_grad:
            dist.all_reduce(param.grad.data)
Example #21
Source File: dist_utils.py From PolarMask with Apache License 2.0 | 5 votes |
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size))
Example #22
Source File: utils.py From kaggle-kuzushiji-2019 with MIT License | 5 votes |
def synchronize_between_processes(self):
    """
    Warning: does not synchronize the deque!
    """
    if not is_dist_avail_and_initialized():
        return
    t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
    dist.barrier()
    dist.all_reduce(t)
    t = t.tolist()
    self.count = int(t[0])
    self.total = t[1]
Example #23
Source File: functions.py From DeepLab-v3-plus-cityscapes with MIT License | 5 votes |
def backward(ctx, dz):
    z, var, weight, bias = ctx.saved_tensors
    dz = dz.contiguous()

    # Undo activation
    _act_backward(ctx, z, dz)

    if ctx.training:
        edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
        edz_local = edz.clone()
        eydz_local = eydz.clone()

        if ctx.world_size > 1:
            edz *= ctx.factor
            dist.all_reduce(edz, dist.ReduceOp.SUM)

            eydz *= ctx.factor
            dist.all_reduce(eydz, dist.ReduceOp.SUM)
    else:
        edz_local = edz = dz.new_zeros(dz.size(1))
        eydz_local = eydz = dz.new_zeros(dz.size(1))

    dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
    dweight = eydz_local * weight.sign() if ctx.affine else None
    dbias = edz_local if ctx.affine else None

    return dx, dweight, dbias, None, None, None, None, None, None, None
Example #24
Source File: tpu_bn.py From fast-autoaugment with MIT License | 5 votes |
def _reduce_avg(self, t):
    dist.all_reduce(t, dist.ReduceOp.SUM)
    t.mul_(1. / dist.get_world_size())
Example #25
Source File: distributed_mixin.py From fastMRI with MIT License | 5 votes |
def stats(self, epoch, loader, setname):
    """
    We use All-reduce to sync the losses across all the processes
    """
    losses = self.compute_stats(epoch, loader, setname)
    logging.debug(f'Epoch: {epoch}. process-local losses: {losses}')
    sys.stdout.flush()

    losses_tensor = torch.zeros(len(losses)).to(self.device)
    all_dataset_size = loader.sampler.total_size
    all_local_size = loader.sampler.num_samples
    for i, k in enumerate(sorted(losses.keys())):
        local_size = all_local_size
        if k in loader.sampler.system_acquisition_local_count:
            local_size = loader.sampler.system_acquisition_local_count[k]
        losses_tensor[i] = losses[k] * local_size

    dist.all_reduce(losses_tensor, op=dist.ReduceOp.SUM)
    logging.debug(f'({self.args.rank}) Loss all-reduce complete')
    sys.stdout.flush()

    for i, k in enumerate(sorted(losses.keys())):
        dataset_size = all_dataset_size
        if k in loader.sampler.system_acquisition_total_count:
            dataset_size = loader.sampler.system_acquisition_total_count[k]
        losses[k] = losses_tensor[i].item() / dataset_size  # Average it

    self.test_loss_hook(losses)
    logging.info(f'Epoch: {epoch}. losses: {losses}')
    #print(f'Epoch: {epoch}. losses: {losses}')
    sys.stdout.flush()
    return losses["NMSE"]
Example #26
Source File: tree_filter.py From TreeFilter-Torch with MIT License | 5 votes |
def print_info(self, edge_weight):
    edge_weight = edge_weight.clone()
    info = torch.stack([edge_weight.mean(), edge_weight.std(),
                        edge_weight.max(), edge_weight.min()])
    if self.training and dist.is_initialized():
        dist.all_reduce(info / dist.get_world_size())
        info_str = (float(x) for x in info)
        if dist.get_rank() == 0:
            print('Mean:{0:.4f}, Std:{1:.4f}, Max:{2:.4f}, Min:{3:.4f}'.format(*info_str))
    else:
        info_str = [float(x) for x in info]
        print('Mean:{0:.4f}, Std:{1:.4f}, Max:{2:.4f}, Min:{3:.4f}'.format(*info_str))
Example #27
Source File: pyt_utils.py From TreeFilter-Torch with MIT License | 5 votes |
def all_reduce_tensor(tensor, op=dist.ReduceOp.SUM, world_size=1):
    tensor = tensor.clone()
    dist.all_reduce(tensor, op)
    tensor.div_(world_size)

    return tensor
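A hypothetical use of all_reduce_tensor for logging: average a detached scalar loss across ranks without modifying the tensor used for backpropagation. The loss variable is a placeholder; torch.distributed is assumed to be imported as dist and initialized.

reduced_loss = all_reduce_tensor(loss.detach(), world_size=dist.get_world_size())
if dist.get_rank() == 0:
    print(f"mean loss across ranks: {reduced_loss.item():.4f}")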
Example #28
Source File: distributed.py From torchsupport with MIT License | 5 votes |
def _average_gradients(net, world_size, group, cuda=False):
    for p in net.parameters():
        tensor = p.grad.data.cpu()
        distributed.all_reduce(tensor, op=distributed.reduce_op.SUM, group=group)
        tensor /= float(world_size)
        if cuda:
            p.grad.data = tensor.cuda()
        else:
            p.grad.data = tensor
Example #29
Source File: syncbn.py From DetNAS with MIT License | 5 votes |
def backward(ctx, grad_ouput):
    x, ex, exs, gamma, beta = ctx.saved_tensors
    grad_gamma, grad_beta, grad_ex, grad_exs = \
        syncbn_gpu.batch_norm_collect_grad_statistics(x, grad_ouput, gamma, ex,
                                                      exs, ctx.eps, ctx.cf)

    if ctx.training:
        if ctx.sync:
            world_size = dist.get_world_size()
            grad_ex_all_reduce = dist.all_reduce(grad_ex, async_op=True)
            grad_exs_all_reduce = dist.all_reduce(grad_exs, async_op=True)
            grad_gamma_all_reduce = dist.all_reduce(grad_gamma, async_op=True)
            grad_beta_all_reduce = dist.all_reduce(grad_beta, async_op=True)

            grad_ex_all_reduce.wait()
            grad_exs_all_reduce.wait()
            grad_gamma_all_reduce.wait()
            grad_beta_all_reduce.wait()

            grad_ex /= world_size
            grad_exs /= world_size

    grad_input = syncbn_gpu.batch_norm_input_backward(x, grad_ouput, gamma, ex,
                                                      exs, grad_ex, grad_exs,
                                                      ctx.eps, ctx.cf)

    return grad_input, grad_gamma, grad_beta, None, None, None, None, None, None
Example #30
Source File: distributed.py From torchsupport with MIT License | 5 votes |
def _gossip_grad(net, world_size, rank, groups, step, cuda=False):
    group = groups[step]
    for p in net.parameters():
        tensor = p.grad.data.cpu()
        distributed.all_reduce(tensor, op=distributed.reduce_op.SUM, group=group)
        tensor /= 2.0
        if cuda:
            p.grad.data = tensor.cuda()
        else:
            p.grad.data = tensor