Python torch.distributed.all_reduce() Examples

The following are 30 code examples of torch.distributed.all_reduce(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.distributed, or try the search function.
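Before the project-specific examples, here is a minimal, self-contained sketch of the call itself. It is not taken from any of the projects below; the gloo backend, the loopback MASTER_ADDR/MASTER_PORT values, and the world size of 2 are assumptions chosen so the snippet can run on a single CPU-only machine.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank, world_size):
    # Minimal single-machine setup; gloo works on CPU tensors.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Each rank contributes its own rank value; after all_reduce(SUM)
    # every rank holds 0 + 1 + ... + (world_size - 1).
    t = torch.tensor([float(rank)])
    dist.all_reduce(t, op=dist.ReduceOp.SUM)  # reduces in place on every rank
    print(f"rank {rank}: {t.item()}")

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(run, args=(world_size,), nprocs=world_size)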
Example #1
Source File: ray_container.py    From adeptRL with GNU General Public License v3.0 (7 votes)
def step(self):
        print(f"learner {self.rank} step")

        # make sure exp_handles are done
        for handle in self.exp_handles:
            handle.wait()

        # batch together exp
        time.sleep(random.randint(0, 3))

        # update with other learners
        dist.barrier(self.learner_group)
        for p in self.network_grads:
            dist.all_reduce(p, group=self.learner_group)
        print(f"learner {self.rank} shared gradients")
        return True 
Example #2
Source File: utils.py    From kaggle-kuzushiji-2019 with MIT License (7 votes)
def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict 
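A hedged usage sketch for reduce_dict follows. The loss names and values are invented for illustration, and the snippet assumes the default process group has already been initialized (with a world size of at least 2 for the reduction to change anything) and that reduce_dict from the excerpt above is in scope.

import torch

# Hypothetical per-process loss components (names made up for illustration).
loss_dict = {
    "loss_classifier": torch.tensor(0.42),
    "loss_box_reg": torch.tensor(0.17),
}
# Every process ends up with the same averaged values.
loss_dict_reduced = reduce_dict(loss_dict, average=True)
print({k: v.item() for k, v in loss_dict_reduced.items()})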
Example #3
Source File: dist_utils.py    From kaggle-kuzushiji-recognition with MIT License (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
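This excerpt, like the near-identical copies from other mmdetection-derived projects below, omits its imports. A sketch of what the function body appears to require is shown here; it is inferred from the code above rather than copied from the original file header.

from collections import OrderedDict

import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _take_tensors,
                          _unflatten_dense_tensors)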
Example #4
Source File: dist_utils.py    From mmdetection-annotated with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #5
Source File: distrib.py    From adeptRL with GNU General Public License v3.0 (6 votes)
def step(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        dist.barrier()
        handles = []
        for param in self.network.parameters():
            handles.append(dist.all_reduce(param.grad, async_op=True))
        for handle in handles:
            handle.wait()
        if self.divide_grad:
            for param in self.network.parameters():
                param.grad.mul_(1.0 / self.world_sz)
        if self.grad_norm_clip:
            nn.utils.clip_grad_norm_(
                self.network.parameters(), self.grad_norm_clip
            )
        self.optimizer.step() 
Example #6
Source File: distributed.py    From virtex with MIT License (6 votes)
def average_across_processes(t: Union[torch.Tensor, Dict[str, torch.Tensor]]):
    r"""
    Averages a tensor, or a dict of tensors across all processes in a process
    group. Objects in all processes will finally have same mean value.

    .. note::

        Nested dicts of tensors are not supported.

    Parameters
    ----------
    t: torch.Tensor or Dict[str, torch.Tensor]
        A tensor or dict of tensors to average across processes.
    """
    if dist.is_initialized():
        if isinstance(t, torch.Tensor):
            dist.all_reduce(t, op=dist.ReduceOp.SUM)
            t /= get_world_size()
        elif isinstance(t, dict):
            for k in t:
                dist.all_reduce(t[k], op=dist.ReduceOp.SUM)
                t[k] /= dist.get_world_size() 
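Unlike reduce_dict in Example #2, this helper averages in place and returns nothing. A small hypothetical usage sketch, assuming the default process group is initialized and average_across_processes from the excerpt above is in scope:

import torch

# Invented metrics; each process passes its own local values.
metrics = {"loss": torch.tensor(1.0), "accuracy": torch.tensor(0.5)}
average_across_processes(metrics)  # tensors are averaged in place
print({k: v.item() for k, v in metrics.items()})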
Example #7
Source File: sync_bn.py    From mmcv with Apache License 2.0 (6 votes)
def backward(self, grad_output):
        norm, std, weight = self.saved_tensors
        grad_weight = torch.empty_like(weight)
        grad_bias = torch.empty_like(weight)
        grad_input = torch.empty_like(grad_output)
        grad_output3d = grad_output.view(
            grad_output.size(0), grad_output.size(1), -1)
        grad_input3d = grad_input.view_as(grad_output3d)
        ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight,
                                          grad_bias)
        # all reduce
        if self.group_size > 1:
            dist.all_reduce(grad_weight, group=self.group)
            dist.all_reduce(grad_bias, group=self.group)
            grad_weight /= self.group_size
            grad_bias /= self.group_size
        ext_module.sync_bn_backward_data(grad_output3d, weight, grad_weight,
                                         grad_bias, norm, std, grad_input3d)
        return grad_input, None, None, grad_weight, grad_bias, \
            None, None, None, None 
Example #8
Source File: dist_utils.py    From GCNet with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #9
Source File: dist_utils.py    From DenseMatchingBenchmark with MIT License (6 votes)
def _all_reduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #10
Source File: dist_utils.py    From mmdetection_with_SENet154 with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #11
Source File: distributed.py    From sparktorch with MIT License (6 votes)
def process_generic_model(params: List, iters: int, has_early_stop: bool = False):
    """
    Runs a mock training with zero grads. This is due to a bug where the connection gets reset with custom new groups.
    :param params: The params of the model
    :param iters: Iterations.
    """
    # Hopefully this function can go away in newer versions.
    for i in range(iters):
        for p in params:
            z = torch.zeros(p)
            dist.all_reduce(z, op=torch.distributed.ReduceOp.SUM)

        if has_early_stop:
            dist.all_reduce(torch.tensor(0.0), op=torch.distributed.ReduceOp.SUM)
            zeros = torch.zeros(1)
            dist.all_reduce(zeros, op=torch.distributed.ReduceOp.SUM)
            if zeros.item() > 0:
                break 
Example #12
Source File: dist_utils.py    From PolarMask with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #13
Source File: sync_switchwhiten.py    From Switchable-Whitening with MIT License (6 votes)
def backward(ctx, grad_mean_out, grad_cov_out):
        in_data, mean_bn = ctx.saved_tensors

        if ctx.training:
            dist.all_reduce(grad_mean_out)
            dist.all_reduce(grad_cov_out)
            world_size = dist.get_world_size()
        else:
            world_size = 1

        grad_cov_out = (grad_cov_out + grad_cov_out.transpose(1, 2)) / 2
        grad_cov_in = 2 * torch.bmm(grad_cov_out, (in_data - mean_bn)) \
            / (ctx.NHW*world_size)   # g x c x (N x H x W)

        grad_mean_in = grad_mean_out / ctx.NHW / world_size
        inDiff = grad_mean_in + grad_cov_in
        return inDiff, None, None, None, None 
Example #14
Source File: dist_utils.py    From AerialDetection with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #15
Source File: dist_utils.py    From mmdetection with Apache License 2.0 (6 votes)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced) 
Example #16
Source File: distributed_communicator.py    From CrypTen with MIT License (6 votes)
def all_reduce(self, input, op=ReduceOp.SUM, batched=False):
        """Reduces the input data across all parties; all get the final result."""
        assert dist.is_initialized(), "initialize the communicator first"

        if batched:
            assert isinstance(input, list), "batched reduce input must be a list"
            reqs = []
            result = [x.clone() for x in input]
            for tensor in result:
                reqs.append(
                    dist.all_reduce(
                        tensor.data, op=op, group=self.main_group, async_op=True
                    )
                )
            for req in reqs:
                req.wait()
        else:
            assert torch.is_tensor(
                input.data
            ), "unbatched input for reduce must be a torch tensor"
            result = input.clone()
            dist.all_reduce(result.data, op=op, group=self.main_group)
        return result 
Example #17
Source File: dist_utils.py    From mmdetection with Apache License 2.0 (6 votes)
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce gradients.

    Args:
        params (list[torch.Parameters]): List of parameters of a model
        coalesce (bool, optional): Whether allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size)) 
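A hedged sketch of where allreduce_grads might sit in a hand-rolled data-parallel training step. The tiny model, data, and optimizer are invented; the snippet assumes the default process group is initialized (for example via torchrun) and that allreduce_grads and _allreduce_coalesced from this module are in scope.

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

inputs = torch.randn(4, 10)
targets = torch.randn(4, 1)

optimizer.zero_grad()
loss = nn.functional.mse_loss(model(inputs), targets)
loss.backward()

# Average gradients across ranks before stepping the optimizer.
allreduce_grads(list(model.parameters()), coalesce=True, bucket_size_mb=-1)
optimizer.step()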
Example #18
Source File: sync_switchwhiten.py    From Switchable-Whitening with MIT License (5 votes)
def forward(ctx, in_data, running_mean, running_cov, momentum, training):
        g, c, NHW = in_data.size()
        ctx.g = g
        ctx.c = c
        ctx.NHW = NHW
        ctx.training = training

        if training:
            mean_bn = in_data.mean(-1, keepdim=True)  # g x c x 1
            dist.all_reduce(mean_bn)
            mean_bn /= dist.get_world_size()
            in_data_bn = in_data - mean_bn
            cov_bn = torch.bmm(in_data_bn, in_data_bn.transpose(1, 2)).div(NHW)
            dist.all_reduce(cov_bn)
            cov_bn /= dist.get_world_size()

            running_mean.mul_(momentum)
            running_mean.add_((1 - momentum) * mean_bn.data)
            running_cov.mul_(momentum)
            running_cov.add_((1 - momentum) * cov_bn.data)
        else:
            mean_bn = torch.autograd.Variable(running_mean)
            cov_bn = torch.autograd.Variable(running_cov)

        ctx.save_for_backward(in_data.data, mean_bn.data)
        return mean_bn, cov_bn 
Example #19
Source File: mnist.py    From sagemaker-python-sdk with Apache License 2.0 (5 votes)
def _average_gradients(model):
    # Gradient averaging.
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
        param.grad.data /= size 
Example #20
Source File: distributed_utils.py    From Switchable-Whitening with MIT License (5 votes)
def average_gradients(model):
    """ average gradients """
    for param in model.parameters():
        if param.requires_grad:
            dist.all_reduce(param.grad.data) 
Example #21
Source File: dist_utils.py    From PolarMask with Apache License 2.0 (5 votes)
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    world_size = dist.get_world_size()
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size)) 
Example #22
Source File: utils.py    From kaggle-kuzushiji-2019 with MIT License (5 votes)
def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1] 
Example #23
Source File: functions.py    From DeepLab-v3-plus-cityscapes with MIT License (5 votes)
def backward(ctx, dz):
        z, var, weight, bias = ctx.saved_tensors
        dz = dz.contiguous()

        # Undo activation
        _act_backward(ctx, z, dz)

        if ctx.training:
            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
            edz_local = edz.clone()
            eydz_local = eydz.clone()

            if ctx.world_size>1:
                edz *= ctx.factor
                dist.all_reduce(edz, dist.ReduceOp.SUM)

                eydz *= ctx.factor
                dist.all_reduce(eydz, dist.ReduceOp.SUM)
        else:
            edz_local = edz = dz.new_zeros(dz.size(1))
            eydz_local = eydz = dz.new_zeros(dz.size(1))

        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
        dweight = eydz_local * weight.sign() if ctx.affine else None
        dbias = edz_local if ctx.affine else None

        return dx, dweight, dbias, None, None, None, None, None, None, None 
Example #24
Source File: tpu_bn.py    From fast-autoaugment with MIT License (5 votes)
def _reduce_avg(self, t):
        dist.all_reduce(t, dist.ReduceOp.SUM)
        t.mul_(1. / dist.get_world_size()) 
Example #25
Source File: distributed_mixin.py    From fastMRI with MIT License (5 votes)
def stats(self, epoch, loader, setname):
        """ We use All-reduce to sync the losses across all the processes """
        losses = self.compute_stats(epoch, loader, setname)
        logging.debug(f'Epoch: {epoch}. process-local losses: {losses}')
        sys.stdout.flush()

        losses_tensor = torch.zeros(len(losses)).to(self.device)
        all_dataset_size = loader.sampler.total_size
        all_local_size = loader.sampler.num_samples

        for i, k in enumerate(sorted(losses.keys())):
            local_size = all_local_size
            if k in loader.sampler.system_acquisition_local_count:
                local_size = loader.sampler.system_acquisition_local_count[k]

            losses_tensor[i] = losses[k]*local_size

        dist.all_reduce(losses_tensor, op=dist.ReduceOp.SUM)
        logging.debug(f'({self.args.rank}) Loss all-reduce complete')
        sys.stdout.flush()

        for i, k in enumerate(sorted(losses.keys())):
            dataset_size = all_dataset_size
            if k in loader.sampler.system_acquisition_total_count:
                dataset_size = loader.sampler.system_acquisition_total_count[k]

            losses[k] = losses_tensor[i].item()/dataset_size # Average it

        self.test_loss_hook(losses)
        logging.info(f'Epoch: {epoch}. losses: {losses}')
        #print(f'Epoch: {epoch}. losses: {losses}')
        sys.stdout.flush()
        return losses["NMSE"] 
Example #26
Source File: tree_filter.py    From TreeFilter-Torch with MIT License (5 votes)
def print_info(self, edge_weight):
        edge_weight = edge_weight.clone()
        info = torch.stack([edge_weight.mean(), edge_weight.std(), edge_weight.max(), edge_weight.min()])
        if self.training and dist.is_initialized():
            dist.all_reduce(info / dist.get_world_size())
            info_str = (float(x) for x in info)
            if dist.get_rank() == 0:
                print('Mean:{0:.4f}, Std:{1:.4f}, Max:{2:.4f}, Min:{3:.4f}'.format(*info_str))
        else:
            info_str = [float(x) for x in info]
            print('Mean:{0:.4f}, Std:{1:.4f}, Max:{2:.4f}, Min:{3:.4f}'.format(*info_str)) 
Example #27
Source File: pyt_utils.py    From TreeFilter-Torch with MIT License (5 votes)
def all_reduce_tensor(tensor, op=dist.ReduceOp.SUM, world_size=1):
    tensor = tensor.clone()
    dist.all_reduce(tensor, op)
    tensor.div_(world_size)

    return tensor 
Example #28
Source File: distributed.py    From torchsupport with MIT License (5 votes)
def _average_gradients(net, world_size, group, cuda=False):
  for p in net.parameters():
    tensor = p.grad.data.cpu()
    distributed.all_reduce(tensor,
                           op=distributed.reduce_op.SUM,
                           group=group)
    tensor /= float(world_size)
    if cuda:
      p.grad.data = tensor.cuda()
    else:
      p.grad.data = tensor 
Example #29
Source File: syncbn.py    From DetNAS with MIT License (5 votes)
def backward(ctx, grad_ouput):
        x, ex, exs, gamma, beta = ctx.saved_tensors

        grad_gamma, grad_beta, grad_ex, grad_exs = \
                syncbn_gpu.batch_norm_collect_grad_statistics(x, grad_ouput, gamma, ex, exs, ctx.eps, ctx.cf)

        if ctx.training:
            if ctx.sync:
                world_size = dist.get_world_size()
                grad_ex_all_reduce = dist.all_reduce(grad_ex, async_op=True)
                grad_exs_all_reduce = dist.all_reduce(grad_exs, async_op=True)

                grad_gamma_all_reduce = dist.all_reduce(grad_gamma, async_op=True)
                grad_beta_all_reduce = dist.all_reduce(grad_beta, async_op=True)

                grad_ex_all_reduce.wait()
                grad_exs_all_reduce.wait()

                grad_gamma_all_reduce.wait()
                grad_beta_all_reduce.wait()

                grad_ex /= world_size
                grad_exs /= world_size

        grad_input = syncbn_gpu.batch_norm_input_backward(x, grad_ouput, gamma, ex, exs, grad_ex, grad_exs, ctx.eps, ctx.cf)

        return grad_input, grad_gamma, grad_beta, None, None, None, None, None, None 
Example #30
Source File: distributed.py    From torchsupport with MIT License (5 votes)
def _gossip_grad(net, world_size, rank, groups, step, cuda=False):
  group = groups[step]
  for p in net.parameters():
    tensor = p.grad.data.cpu()
    distributed.all_reduce(tensor,
                           op=distributed.reduce_op.SUM,
                           group=group)
    tensor /= 2.0
    if cuda:
      p.grad.data = tensor.cuda()
    else:
      p.grad.data = tensor