Python torch.distributed.all_reduce() Examples

The following are 30 code examples of torch.distributed.all_reduce(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.distributed, or try the search function.
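Before the project-specific examples, here is a minimal, self-contained sketch of the call itself. It is not taken from any of the projects below; the gloo backend, the loopback MASTER_ADDR/MASTER_PORT values, and the world size of 2 are assumptions chosen so the snippet can run on a single CPU-only machine.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank, world_size):
    # Minimal single-machine setup; gloo works on CPU tensors.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Each rank contributes its own rank value; after all_reduce(SUM)
    # every rank holds 0 + 1 + ... + (world_size - 1).
    t = torch.tensor([float(rank)])
    dist.all_reduce(t, op=dist.ReduceOp.SUM)  # reduces in place on every rank
    print(f"rank {rank}: {t.item()}")

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(run, args=(world_size,), nprocs=world_size)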
Example #1
Source File: ray_container.py    From adeptRL with GNU General Public License v3.0 (7 votes)
def step(self):
        print(f"learner {self.rank} step")

        # make sure exp_handles are done
        for handle in self.exp_handles:
            handle.wait()

        # batch together exp
        time.sleep(random.randint(0, 3))

        # update with other learners
        dist.barrier(self.learner_group)
        for p in self.network_grads:
            dist.all_reduce(p, group=self.learner_group)
        print(f"learner {self.rank} shared gradients")
        return True 
Example #2
Source File: utils.py    From kaggle-kuzushiji-2019 with MIT License (7 votes)
def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict 
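A hedged usage sketch for reduce_dict follows. The loss names and values are invented for illustration, and the snippet assumes the default process group has already been initialized (with a world size of at least 2 for the reduction to change anything) and that reduce_dict from the excerpt above is in scope.

import torch

# Hypothetical per-process loss components (names made up for illustration).
loss_dict = {
    "loss_classifier": torch.tensor(0.42),
    "loss_box_reg": torch.tensor(0.17),
}
# Every process ends up with the same averaged values.
loss_dict_reduced = reduce_dict(loss_dict, average=True)
print({k: v.item() for k, v in loss_dict_reduced.items()})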
Example #3
Source File: dist_utils.py    From kaggle-kuzushiji-recognition with MIT License (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
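This excerpt, like the near-identical copies from other mmdetection-derived projects below, omits its imports. A sketch of what the function body appears to require is shown here; it is inferred from the code above rather than copied from the original file header.

from collections import OrderedDict

import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _take_tensors,
                          _unflatten_dense_tensors)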
Example #4
Source File: dist_utils.py    From mmdetection-annotated with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #5
Source File: distrib.py    From adeptRL with GNU General Public License v3.0 (6 votes)
def step(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        dist.barrier()
        handles = []
        for param in self.network.parameters():
            handles.append(dist.all_reduce(param.grad, async_op=True))
        for handle in handles:
            handle.wait()
        if self.divide_grad:
            for param in self.network.parameters():
                param.grad.mul_(1.0 / self.world_sz)
        if self.grad_norm_clip:
            nn.utils.clip_grad_norm_(
                self.network.parameters(), self.grad_norm_clip
            )
        self.optimizer.step() 
Example #6
Source File: distributed.py    From virtex with MIT License (6 votes)
def average_across_processes(t: Union[torch.Tensor, Dict[str, torch.Tensor]]):
    r"""
    Averages a tensor, or a dict of tensors across all processes in a process
    group. Objects in all processes will finally have same mean value.

    .. note::

        Nested dicts of tensors are not supported.

    Parameters
    ----------
    t: torch.Tensor or Dict[str, torch.Tensor]
        A tensor or dict of tensors to average across processes.
    """
    if dist.is_initialized():
        if isinstance(t, torch.Tensor):
            dist.all_reduce(t, op=dist.ReduceOp.SUM)
            t /= get_world_size()
        elif isinstance(t, dict):
            for k in t:
                dist.all_reduce(t[k], op=dist.ReduceOp.SUM)
                t[k] /= dist.get_world_size() 
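Unlike reduce_dict in Example #2, this helper averages in place and returns nothing. A small hypothetical usage sketch, assuming the default process group is initialized and average_across_processes from the excerpt above is in scope:

import torch

# Invented metrics; each process passes its own local values.
metrics = {"loss": torch.tensor(1.0), "accuracy": torch.tensor(0.5)}
average_across_processes(metrics)  # tensors are averaged in place
print({k: v.item() for k, v in metrics.items()})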
Example #7
Source File: sync_bn.py    From mmcv with Apache License 2.0 (6 votes)
def backward(self, grad_output):
        norm, std, weight = self.saved_tensors
        grad_weight = torch.empty_like(weight)
        grad_bias = torch.empty_like(weight)
        grad_input = torch.empty_like(grad_output)
        grad_output3d = grad_output.view(
            grad_output.size(0), grad_output.size(1), -1)
        grad_input3d = grad_input.view_as(grad_output3d)
        ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight,
                                          grad_bias)
        # all reduce
        if self.group_size > 1:
            dist.all_reduce(grad_weight, group=self.group)
            dist.all_reduce(grad_bias, group=self.group)
            grad_weight /= self.group_size
            grad_bias /= self.group_size
        ext_module.sync_bn_backward_data(grad_output3d, weight, grad_weight,
                                         grad_bias, norm, std, grad_input3d)
        return grad_input, None, None, grad_weight, grad_bias, \
            None, None, None, None 
Example #8
Source File: dist_utils.py    From GCNet with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #9
Source File: dist_utils.py    From DenseMatchingBenchmark with MIT License (6 votes)
def _all_reduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #10
Source File: dist_utils.py    From mmdetection_with_SENet154 with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #11
Source File: distributed.py    From sparktorch with MIT License (6 votes)
def process_generic_model(params: List, iters: int, has_early_stop: bool = False):
    """
    Runs a mock training with zero grads. This is due to a bug where the connection gets reset with custom new groups.
    :param params: The params of the model
    :param iters: Iterations.
    """
    # Hopefully this function can go away in newer versions.
    for i in range(iters):
        for p in params:
            z = torch.zeros(p)
            dist.all_reduce(z, op=torch.distributed.ReduceOp.SUM)

        if has_early_stop:
            dist.all_reduce(torch.tensor(0.0), op=torch.distributed.ReduceOp.SUM)
            zeros = torch.zeros(1)
            dist.all_reduce(zeros, op=torch.distributed.ReduceOp.SUM)
            if zeros.item() > 0:
                break 
Example #12
Source File: dist_utils.py    From PolarMask with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #13
Source File: sync_switchwhiten.py    From Switchable-Whitening with MIT License (6 votes)
def backward(ctx, grad_mean_out, grad_cov_out):
        in_data, mean_bn = ctx.saved_tensors

        if ctx.training:
            dist.all_reduce(grad_mean_out)
            dist.all_reduce(grad_cov_out)
            world_size = dist.get_world_size()
        else:
            world_size = 1

        grad_cov_out = (grad_cov_out + grad_cov_out.transpose(1, 2)) / 2
        grad_cov_in = 2 * torch.bmm(grad_cov_out, (in_data - mean_bn)) \
            / (ctx.NHW*world_size)   # g x c x (N x H x W)

        grad_mean_in = grad_mean_out / ctx.NHW / world_size
        inDiff = grad_mean_in + grad_cov_in
        return inDiff, None, None, None, None 
Example #14
Source File: dist_utils.py    From AerialDetection with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #15
Source File: dist_utils.py    From mmdetection with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #16
Source File: distributed_communicator.py    From CrypTen with MIT License (6 votes)
def all_reduce(self, input, op=ReduceOp.SUM, batched=False):
        """Reduces the input data across all parties; all get the final result."""
        assert dist.is_initialized(), "initialize the communicator first"

        if batched:
            assert isinstance(input, list), "batched reduce input must be a list"
            reqs = []
            result = [x.clone() for x in input]
            for tensor in result:
                reqs.append(
                    dist.all_reduce(
                        tensor.data, op=op, group=self.main_group, async_op=True
                    )
                )
            for req in reqs:
                req.wait()
        else:
            assert torch.is_tensor(
                input.data
            ), "unbatched input for reduce must be a torch tensor"
            result = input.clone()
            dist.all_reduce(result.data, op=op, group=self.main_group)
        return result 
Example #17
Source File: dist_utils.py    From mmdetection with Apache License 2.0 (6 votes)
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce gradients.

    Args:
        params (list[torch.Parameters]): List of parameters of a model
        coalesce (bool, optional): Whether allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size)) 
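A hedged sketch of where allreduce_grads might sit in a hand-rolled data-parallel training step. The tiny model, data, and optimizer are invented; the snippet assumes the default process group is initialized (for example via torchrun) and that allreduce_grads and _allreduce_coalesced from this module are in scope.

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

inputs = torch.randn(4, 10)
targets = torch.randn(4, 1)

optimizer.zero_grad()
loss = nn.functional.mse_loss(model(inputs), targets)
loss.backward()

# Average gradients across ranks before stepping the optimizer.
allreduce_grads(list(model.parameters()), coalesce=True, bucket_size_mb=-1)
optimizer.step()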
Example #18
Source File: sync_switchwhiten.py    From Switchable-Whitening with MIT License (5 votes)
def forward(ctx, in_data, running_mean, running_cov, momentum, training):
        g, c, NHW = in_data.size()
        ctx.g = g
        ctx.c = c
        ctx.NHW = NHW
        ctx.training = training

        if training:
            mean_bn = in_data.mean(-1, keepdim=True)  # g x c x 1
            dist.all_reduce(mean_bn)
            mean_bn /= dist.get_world_size()
            in_data_bn = in_data - mean_bn
            cov_bn = torch.bmm(in_data_bn, in_data_bn.transpose(1, 2)).div(NHW)
            dist.all_reduce(cov_bn)
            cov_bn /= dist.get_world_size()

            running_mean.mul_(momentum)
            running_mean.add_((1 - momentum) * mean_bn.data)
            running_cov.mul_(momentum)
            running_cov.add_((1 - momentum) * cov_bn.data)
        else:
            mean_bn = torch.autograd.Variable(running_mean)
            cov_bn = torch.autograd.Variable(running_cov)

        ctx.save_for_backward(in_data.data, mean_bn.data)
        return mean_bn, cov_bn 
Example #19
Source File: mnist.py    From sagemaker-python-sdk with Apache License 2.0 (5 votes)
def _average_gradients(model):
    # Gradient averaging.
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
        param.grad.data /= size 
Example #20
Source File: distributed_utils.py    From Switchable-Whitening with MIT License (5 votes)
def average_gradients(model):
    """ average gradients """
    for param in model.parameters():
        if param.requires_grad:
            dist.all_reduce(param.grad.data) 
Example #21
Source File: dist_utils.py    From PolarMask with Apache License 2.0 (5 votes)
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size)) 
Example #22
Source File: utils.py    From kaggle-kuzushiji-2019 with MIT License (5 votes)
def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1] 
Example #23
Source File: functions.py    From DeepLab-v3-plus-cityscapes with MIT License (5 votes)
def backward(ctx, dz):
        z, var, weight, bias = ctx.saved_tensors
        dz = dz.contiguous()

        # Undo activation
        _act_backward(ctx, z, dz)

        if ctx.training:
            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
            edz_local = edz.clone()
            eydz_local = eydz.clone()

            if ctx.world_size>1:
                edz *= ctx.factor
                dist.all_reduce(edz, dist.ReduceOp.SUM)

                eydz *= ctx.factor
                dist.all_reduce(eydz, dist.ReduceOp.SUM)
        else:
            edz_local = edz = dz.new_zeros(dz.size(1))
            eydz_local = eydz = dz.new_zeros(dz.size(1))

        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
        dweight = eydz_local * weight.sign() if ctx.affine else None
        dbias = edz_local if ctx.affine else None

        return dx, dweight, dbias, None, None, None, None, None, None, None 
Example #24
Source File: tpu_bn.py    From fast-autoaugment with MIT License (5 votes)
def _reduce_avg(self, t):
        dist.all_reduce(t, dist.ReduceOp.SUM)
        t.mul_(1. / dist.get_world_size()) 
Example #25
Source File: distributed_mixin.py    From fastMRI with MIT License (5 votes)
def stats(self, epoch, loader, setname):
        """ We use All-reduce to sync the losses across all the processes """
        losses = self.compute_stats(epoch, loader, setname)
        logging.debug(f'Epoch: {epoch}. process-local losses: {losses}')
        sys.stdout.flush()

        losses_tensor = torch.zeros(len(losses)).to(self.device)
        all_dataset_size = loader.sampler.total_size
        all_local_size = loader.sampler.num_samples

        for i, k in enumerate(sorted(losses.keys())):
            local_size = all_local_size
            if k in loader.sampler.system_acquisition_local_count:
                local_size = loader.sampler.system_acquisition_local_count[k]

            losses_tensor[i] = losses[k]*local_size

        dist.all_reduce(losses_tensor, op=dist.ReduceOp.SUM)
        logging.debug(f'({self.args.rank}) Loss all-reduce complete')
        sys.stdout.flush()

        for i, k in enumerate(sorted(losses.keys())):
            dataset_size = all_dataset_size
            if k in loader.sampler.system_acquisition_total_count:
                dataset_size = loader.sampler.system_acquisition_total_count[k]

            losses[k] = losses_tensor[i].item()/dataset_size # Average it

        self.test_loss_hook(losses)
        logging.info(f'Epoch: {epoch}. losses: {losses}')
        #print(f'Epoch: {epoch}. losses: {losses}')
        sys.stdout.flush()
        return losses["NMSE"] 
Example #26
Source File: tree_filter.py    From TreeFilter-Torch with MIT License (5 votes)
def print_info(self, edge_weight):
        edge_weight = edge_weight.clone()
        info = torch.stack([edge_weight.mean(), edge_weight.std(), edge_weight.max(), edge_weight.min()])
        if self.training and dist.is_initialized():
            dist.all_reduce(info / dist.get_world_size())
            info_str = (float(x) for x in info)
            if dist.get_rank() == 0:
                print('Mean:{0:.4f}, Std:{1:.4f}, Max:{2:.4f}, Min:{3:.4f}'.format(*info_str))
        else:
            info_str = [float(x) for x in info]
            print('Mean:{0:.4f}, Std:{1:.4f}, Max:{2:.4f}, Min:{3:.4f}'.format(*info_str)) 
Example #27
Source File: pyt_utils.py    From TreeFilter-Torch with MIT License (5 votes)
def all_reduce_tensor(tensor, op=dist.ReduceOp.SUM, world_size=1):
    tensor = tensor.clone()
    dist.all_reduce(tensor, op)
    tensor.div_(world_size)

    return tensor 
Example #28
Source File: distributed.py    From torchsupport with MIT License (5 votes)
def _average_gradients(net, world_size, group, cuda=False):
  for p in net.parameters():
    tensor = p.grad.data.cpu()
    distributed.all_reduce(tensor,
                           op=distributed.reduce_op.SUM,
                           group=group)
    tensor /= float(world_size)
    if cuda:
      p.grad.data = tensor.cuda()
    else:
      p.grad.data = tensor 
Example #29
Source File: syncbn.py    From DetNAS with MIT License (5 votes)
def backward(ctx, grad_ouput):
        x, ex, exs, gamma, beta = ctx.saved_tensors

        grad_gamma, grad_beta, grad_ex, grad_exs = \
                syncbn_gpu.batch_norm_collect_grad_statistics(x, grad_ouput, gamma, ex, exs, ctx.eps, ctx.cf)

        if ctx.training:
            if ctx.sync:
                world_size = dist.get_world_size()
                grad_ex_all_reduce = dist.all_reduce(grad_ex, async_op=True)
                grad_exs_all_reduce = dist.all_reduce(grad_exs, async_op=True)

                grad_gamma_all_reduce = dist.all_reduce(grad_gamma, async_op=True)
                grad_beta_all_reduce = dist.all_reduce(grad_beta, async_op=True)

                grad_ex_all_reduce.wait()
                grad_exs_all_reduce.wait()

                grad_gamma_all_reduce.wait()
                grad_beta_all_reduce.wait()

                grad_ex /= world_size
                grad_exs /= world_size

        grad_input = syncbn_gpu.batch_norm_input_backward(x, grad_ouput, gamma, ex, exs, grad_ex, grad_exs, ctx.eps, ctx.cf)

        return grad_input, grad_gamma, grad_beta, None, None, None, None, None, None 
Example #30
Source File: distributed.py    From torchsupport with MIT License (5 votes)
def _gossip_grad(net, world_size, rank, groups, step, cuda=False):
  group = groups[step]
  for p in net.parameters():
    tensor = p.grad.data.cpu()
    distributed.all_reduce(tensor,
                           op=distributed.reduce_op.SUM,
                           group=group)
    tensor /= 2.0
    if cuda:
      p.grad.data = tensor.cuda()
    else:
      p.grad.data = tensor