Python torch.distributed.broadcast() Examples
The following are 30 code examples of torch.distributed.broadcast(), collected from open-source projects. Each example notes its original project and source file so you can consult the surrounding code for context. You may also want to check out the other available functions and classes of the torch.distributed module.
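Before the project examples, here is a minimal, self-contained sketch of the call they all build on. It is not taken from any of the projects below; it assumes the gloo backend and the usual environment-variable rendezvous (e.g. a launch via torchrun):

import torch
import torch.distributed as dist

def main():
    # Assumes RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are set in the
    # environment (torchrun sets these for you).
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()

    # Every rank must pass a tensor of the same shape and dtype; after the
    # call, all ranks hold rank 0's values.
    t = torch.arange(4, dtype=torch.float32) if rank == 0 else torch.zeros(4)
    dist.broadcast(t, src=0)
    print(f"rank {rank}: {t.tolist()}")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()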
Example #1
Source File: base.py From adeptRL with GNU General Public License v3.0 | 7 votes |
def sync(self, src, grp=None, async_op=False):
    keys = []
    handles = []
    for k, t in self.state_dict().items():
        if grp is None:
            h = dist.broadcast(t, src, async_op=True)
        else:
            h = dist.broadcast(t, src, grp, async_op=True)
        keys.append(k)
        handles.append(h)
    if not async_op:
        for k, h in zip(keys, handles):
            h.wait()
    return handles
Example #2
Source File: distributed_communicator.py From CrypTen with MIT License | 6 votes |
def broadcast_obj(self, obj, src, group=None):
    """Broadcasts a given object to all parties."""
    if group is None:
        group = self.main_group

    if self.rank == src:
        assert obj is not None, "src party must provide obj for broadcast"
        buf = pickle.dumps(obj)
        size = torch.tensor(len(buf), dtype=torch.int32)
        arr = torch.from_numpy(numpy.frombuffer(buf, dtype=numpy.int8))
        dist.broadcast(size, src, group=group)
        dist.broadcast(arr, src, group=group)
    else:
        size = torch.tensor(1, dtype=torch.int32)
        dist.broadcast(size, src, group=group)
        data = torch.empty(size=(size,), dtype=torch.int8)
        dist.broadcast(data, src, group=group)
        buf = data.numpy().tobytes()
        obj = serial.restricted_loads(buf)
    return obj
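This helper works around the fact that dist.broadcast() only moves tensors whose shape is already known on every rank: the source first broadcasts the payload length, then the payload bytes, so receivers can allocate a matching buffer. A stripped-down sketch of the same two-step pattern, using plain pickle instead of CrypTen's restricted loader (a hypothetical helper, not part of the project above):

import pickle
import numpy as np
import torch
import torch.distributed as dist

def broadcast_pickled(obj, src, group=None):
    # Broadcast an arbitrary picklable object from `src` to all ranks.
    rank = dist.get_rank()
    if rank == src:
        buf = pickle.dumps(obj)
        size = torch.tensor(len(buf), dtype=torch.int64)
        payload = torch.from_numpy(np.frombuffer(buf, dtype=np.uint8).copy())
    else:
        size = torch.tensor(0, dtype=torch.int64)
        payload = None

    dist.broadcast(size, src, group=group)        # step 1: payload length
    if rank != src:
        payload = torch.empty(int(size.item()), dtype=torch.uint8)
    dist.broadcast(payload, src, group=group)     # step 2: payload bytes
    return obj if rank == src else pickle.loads(payload.numpy().tobytes())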
Example #3
Source File: main.py From ftlib with Apache License 2.0 | 6 votes |
def train_step(self, *args, **kwargs):
    if self.need_reinit:
        if dist.is_initialized():
            # parallel mode
            print("wait for barrier")
            dist.barrier()
            print("start to broadcast")
            for p in self._raw_model.parameters():
                dist.broadcast(p.data, 0)
            print("wrap with DDP")
            self._ddp_model = nn.parallel.DistributedDataParallel(
                self._raw_model,
                broadcast_buffers=False,
                check_reduction=True,
            )
        else:
            # single worker mode
            # skip all reduce
            print("single worker mode")
            self._ddp_model = self._raw_model
        self._optimizer = optim.SGD(self._ddp_model.parameters(), lr=1e-3)
        self.need_reinit = False
    self._train_step(*args, **kwargs)
Example #4
Source File: distributed_communicator.py From CrypTen with MIT License | 6 votes |
def broadcast(self, input, src, batched=False):
    """Broadcasts the tensor to all parties."""
    assert dist.is_initialized(), "initialize the communicator first"
    if batched:
        assert isinstance(input, list), "batched reduce input must be a list"
        reqs = []
        for tensor in input:
            reqs.append(
                dist.broadcast(tensor, src, group=self.main_group, async_op=True)
            )
        for req in reqs:
            req.wait()
    else:
        assert torch.is_tensor(
            input.data
        ), "unbatched input for reduce must be a torch tensor"
        dist.broadcast(input.data, src, group=self.main_group)
    return input
Example #5
Source File: distributed_utils.py From conditional-motion-propagation with MIT License | 6 votes |
def broadcast_params(model):
    """ broadcast model parameters """
    for p in model.state_dict().values():
        dist.broadcast(p, 0)
Example #6
Source File: point_to_point.py From pipedream with MIT License | 6 votes |
def receive_tensor_helper(tensor, src_rank, group, tag, num_iterations,
                          broadcast):
    dist.barrier()
    start_time = time.time()
    for i in range(num_iterations):
        if broadcast:
            dist.broadcast(tensor=tensor, group=group, src=src_rank)
        else:
            dist.recv(tensor=tensor.cpu(), src=src_rank, tag=tag)
    end_time = time.time()
    dist.barrier()
    size = tensor.size()[0]
    throughput = (size * 4. * num_iterations) / (
        (end_time - start_time) * 10**9)
    print("Time to receive %s MB: %.3f seconds" %
          ((size * 4.) / 10**6, (end_time - start_time) / num_iterations))
    print("Throughput: %.3f GB/s" % throughput)
Example #7
Source File: distributed.py From dataset-distillation with MIT License | 6 votes |
def broadcast_coalesced(tensors, src=0, buffer_size=10 * MB):
    r"""
    Broadcast a sequence of tensors to the default group from rank 0.
    Small tensors are first coalesced into a buffer to reduce the number
    of broadcasts.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the same GPU.
    src (int): src rank. Default: 0.
    buffer_size (int): maximum size of the buffer for coalescing. Default: 10MB.
    """
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, src)
        for old_t, new_t in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
            old_t.data = new_t
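A possible call site for this helper (hypothetical usage; `model` and the `MB` constant, presumably 2**20, come from the surrounding project rather than this snippet):

# Hypothetical usage: rank 0 pushes its parameter values to all other ranks
# in coalesced buffers instead of one broadcast per tensor.
params = [p.data for p in model.parameters()]
broadcast_coalesced(params, src=0, buffer_size=10 * MB)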
Example #8
Source File: distributed.py From pytorch-asr with GNU General Public License v3.0 | 5 votes |
def forward(self, *inputs, **kwargs):
    if self.first_call:
        print("first broadcast start")
        self.weight_broadcast()
        self.first_call = False
        print("first broadcast done")
    self.needs_reduction = True
    return self.module(*inputs, **kwargs)
Example #9
Source File: distributed.py From pytorch-asr with GNU General Public License v3.0 | 5 votes |
def weight_broadcast(self):
    for param in self.module.parameters():
        dist.broadcast(param.data, 0)
Example #10
Source File: impl.py From ftlib with Apache License 2.0 | 5 votes |
def broadcast(self, data, root_rank, *args, **kwargs):
    data = torch.from_numpy(data) if isinstance(data, np.ndarray) else data
    dist.broadcast(data, root_rank)
Example #11
Source File: test.py From Libra_R-CNN with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
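The broadcast here (and in the near-identical collect_results variants below) moves a string, not numbers: rank 0 creates the temporary directory, encodes its path into a fixed-length uint8 tensor padded with spaces (ASCII 32), and every other rank decodes the tensor and strips the padding after the broadcast. A CUDA tensor is used because the NCCL backend only supports GPU tensors. The trick in isolation might look like this (a sketch, assuming a CPU-capable backend such as gloo):

import tempfile
import torch
import torch.distributed as dist

def broadcast_tmpdir(max_len=512):
    # Encode a string into a fixed-size uint8 tensor padded with spaces,
    # broadcast it from rank 0, then decode and strip the padding everywhere.
    dir_tensor = torch.full((max_len,), 32, dtype=torch.uint8)
    if dist.get_rank() == 0:
        path = tempfile.mkdtemp().encode()
        dir_tensor[:len(path)] = torch.tensor(bytearray(path), dtype=torch.uint8)
    dist.broadcast(dir_tensor, 0)
    return dir_tensor.numpy().tobytes().decode().rstrip()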
Example #12
Source File: test.py From IoU-Uniform-R-CNN with Apache License 2.0 | 5 votes |
def collect_results_cpu(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #13
Source File: inference_network.py From pyprob with BSD 2-Clause "Simplified" License | 5 votes |
def _distributed_sync_parameters(self):
    """ broadcast rank 0 parameter to all ranks """
    # print('Distributed training synchronizing parameters across nodes...')
    for param in self.parameters():
        dist.broadcast(param.data, 0)
Example #14
Source File: distributed.py From imagenet-fast with Apache License 2.0 | 5 votes |
def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module
    param_list = [param for param in self.module.state_dict().values()
                  if torch.is_tensor(param)]
    if dist._backend == dist.dist_backend.NCCL:
        for param in param_list:
            assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."

    # broadcast parameters
    flat_dist_call(param_list, dist.broadcast, (0,))

    # all reduce gradient hook
    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
        else:
            return
        grads = [param.grad.data for param in self.module.parameters()
                 if param.grad is not None]
        flat_dist_call(grads, dist.all_reduce)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            torch.autograd.Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
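flat_dist_call is not shown in this excerpt. Judging from how it is called, a helper of roughly this shape would bucket same-typed tensors, run the collective once per flat buffer, and copy the results back in place. The following is a sketch under that assumption, not the project's actual implementation:

from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
                          _take_tensors)

def flat_dist_call(tensors, call, extra_args=None, bucket_bytes=256 * 1024 * 1024):
    # Sketch of a coalescing wrapper: bucket tensors, run the collective on
    # one flat buffer per bucket, then copy the synced values back in place.
    for bucket in _take_tensors(tensors, bucket_bytes):
        flat = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(flat, *extra_args)
        else:
            call(flat)
        for t, synced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
            t.copy_(synced)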
Example #15
Source File: distributed.py From imagenet-fast with Apache License 2.0 | 5 votes |
def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module
    param_list = [param for param in self.module.state_dict().values()
                  if torch.is_tensor(param)]
    if dist._backend == dist.dist_backend.NCCL:
        for param in param_list:
            assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."

    # broadcast parameters
    flat_dist_call(param_list, dist.broadcast, (0,))

    # all reduce gradient hook
    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
        else:
            return
        grads = [param.grad.data for param in self.module.parameters()
                 if param.grad is not None]
        flat_dist_call(grads, dist.all_reduce)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            torch.autograd.Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
Example #16
Source File: distributed.py From pysot with Apache License 2.0 | 5 votes |
def broadcast_params(model):
    """ broadcast model parameters """
    for p in model.state_dict().values():
        dist.broadcast(p, 0)
Example #17
Source File: test_robustness.py From IoU-Uniform-R-CNN with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #18
Source File: distributed.py From pysot with Apache License 2.0 | 5 votes |
def broadcast_buffers(model, method=0):
    """ broadcast model buffers """
    if method == 0:
        return

    world_size = get_world_size()

    for b in model._all_buffers():
        if method == 1:  # broadcast from main process
            dist.broadcast(b, 0)
        elif method == 2:  # average
            dist.all_reduce(b)
            b /= world_size
        else:
            raise Exception('Invalid buffer broadcast code {}'.format(method))
Example #19
Source File: distributed_utils.py From KBRD with MIT License | 5 votes |
def sync_object(data, max_size=16384):
    """
    Syncs an object among all workers, overriding everyone's version
    with the primary worker's. Data must be pickleable.
    """
    if not is_distributed():
        return data

    # prepare the buffer
    if (not hasattr(sync_object, '_buffer') or
            sync_object._buffer.numel() < max_size):
        # cuda is safe because distributed mode is only okay with CUDA
        sync_object._buffer = torch.cuda.ByteTensor(max_size)

    buffer = sync_object._buffer

    if is_primary_worker():
        enc = pickle.dumps(data)
        enc_size = len(enc)
        if (enc_size + 2 > max_size) or (enc_size > 255 * 255):
            # can't store the size in the first 2 bytes
            raise ValueError('encoded data exceeds max_size')
        buffer[0] = enc_size // 255
        buffer[1] = enc_size % 255
        buffer[2:enc_size + 2] = torch.ByteTensor(list(enc))

    dist.broadcast(buffer, 0)

    if not is_primary_worker():
        # deserialize the data
        enc_size = buffer[0].item() * 255 + buffer[1].item()
        try:
            data = pickle.loads(bytes(buffer[2:enc_size + 2].tolist()))
        except pickle.UnpicklingError:
            raise RuntimeError(
                'There was an unpickling error in sync_object. This likely '
                'means your workers got out of syncronization (e.g. one is '
                'expecting to sync and another is not.)'
            )

    return data
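Note that every worker has to enter sync_object() for the broadcast to complete, even those whose local value will be discarded. A hypothetical call site (assuming is_primary_worker() reflects rank 0):

# Every rank calls sync_object; only the primary worker's value survives.
best_checkpoint = {'epoch': 12, 'path': '/tmp/model.pt'} if is_primary_worker() else None
best_checkpoint = sync_object(best_checkpoint)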
Example #20
Source File: communication.py From pipedream with MIT License | 5 votes |
def _send(tensor, tensor_name, src_rank, dst_rank, tag, sub_process_group=None):
    """
    Sends tensor by calling PyTorch's send() call.

    If tensor is being sent not via broadcast(), it will
    be first copied to the CPU.
    """
    if sub_process_group is not None:
        assert tensor.is_cuda

        # Send tensor shape.
        tensor_shape = torch.tensor(tensor.shape, dtype=torch.int)
        dist.broadcast(tensor=tensor_shape, src=src_rank,
                       group=sub_process_group)

        # Send tensor.
        contiguous_tensor = tensor.detach().clone()
        dist.broadcast(tensor=contiguous_tensor.contiguous(),
                       src=src_rank,
                       group=sub_process_group)
    else:
        assert tensor.is_cuda
        tensor = tensor.cpu()

        # Send tensor shape.
        tensor_shape = torch.tensor(tensor.shape, dtype=torch.int)
        dist.send(tensor=tensor_shape, dst=dst_rank, tag=tag)

        # Send tensor.
        dist.send(tensor=tensor, dst=dst_rank, tag=tag)
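In the broadcast branch, the receiving rank must join the same two collectives: one broadcast for the shape, then one for the data, allocating the buffer in between. A simplified sketch of that receive side (an illustration of the pattern only, not PipeDream's actual receive path; the number of dimensions is assumed to be agreed on out of band):

def _recv_via_broadcast(src_rank, sub_process_group, dtype=torch.float32):
    # Receive the shape first so the data buffer can be allocated.
    ndims = 4  # assumption for this sketch; must match the sender
    tensor_shape = torch.zeros(ndims, dtype=torch.int)
    dist.broadcast(tensor=tensor_shape, src=src_rank, group=sub_process_group)

    # Allocate and receive the actual payload.
    tensor = torch.empty(tuple(tensor_shape.tolist()), dtype=dtype, device='cuda')
    dist.broadcast(tensor=tensor, src=src_rank, group=sub_process_group)
    return tensor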
Example #21
Source File: distributed_utils.py From Actor-Critic-Based-Resource-Allocation-for-Multimodal-Optical-Networks with GNU General Public License v3.0 | 5 votes |
def broadcast_params(model):
    """ broadcast model parameters """
    for p in model.state_dict().values():
        dist.broadcast(p, 0)
Example #22
Source File: test.py From Grid-R-CNN with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #23
Source File: test.py From kaggle-kuzushiji-recognition with MIT License | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #24
Source File: test_robustness.py From kaggle-kuzushiji-recognition with MIT License | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #25
Source File: test_robustness.py From FoveaBox with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #26
Source File: test_communicator.py From CrypTen with MIT License | 5 votes |
def test_batched_broadcast(self):
    sizes = [(), (1,), (5,), (5, 5), (5, 5, 5)]
    for rank in range(self.world_size):
        if self.rank == rank:
            tensors = [torch.ones(size) for size in sizes]
        else:
            tensors = [torch.zeros(size) for size in sizes]
        tensors = comm.get().broadcast(tensors, src=rank, batched=True)
        self.assertTrue(isinstance(tensors, list))
        for tensor in tensors:
            self.assertTrue(torch.is_tensor(tensor))
            self.assertTrue(tensor.eq(1).all())
Example #27
Source File: test_communicator.py From CrypTen with MIT License | 5 votes |
def test_broadcast(self):
    for rank in range(self.world_size):
        tensor = torch.LongTensor([0])
        if self.rank == rank:
            tensor += 1

        tensor = comm.get().broadcast(tensor, src=rank)
        self.assertTrue(torch.is_tensor(tensor))
        self.assertEqual(tensor.item(), 1)
Example #28
Source File: test.py From FoveaBox with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #29
Source File: test_robustness.py From mmdetection with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f'part_{i}.pkl')
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #30
Source File: optim.py From cherry with Apache License 2.0 | 5 votes |
def sync_parameters(self, root=0):
    """
    **Description**

    Broadcasts all parameters of root to all other replicas.

    **Arguments**

    * **root** (int, *optional*, default=0) - Rank of root replica.
    """
    if self.world_size > 1:
        for group in self.param_groups:
            for p in group['params']:
                dist.broadcast(p.data, src=root)
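This method is typically called once, right after every replica has built its optimizer, so training starts from identical weights. A hedged usage sketch (here `opt` is assumed to be an instance of the wrapper that defines sync_parameters, and the process group is assumed to be initialized already):

# Assumed: `opt` is the distributed optimizer wrapper and
# dist.init_process_group() has already run on every replica.
opt.sync_parameters(root=0)   # all replicas now hold rank 0's parameters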