Python torch.distributed.all_gather() Examples
The following are 30
code examples of torch.distributed.all_gather().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
torch.distributed
, or try the search function
.
Example #1
Source File: distributed_utils.py From conditional-motion-propagation with MIT License | 8 votes |
def gather_tensors(input_array): world_size = dist.get_world_size() ## gather shapes first myshape = input_array.shape mycount = input_array.size shape_tensor = torch.Tensor(np.array(myshape)).cuda() all_shape = [torch.Tensor(np.array(myshape)).cuda() for i in range(world_size)] dist.all_gather(all_shape, shape_tensor) ## compute largest shapes all_shape = [x.cpu().numpy() for x in all_shape] all_count = [int(x.prod()) for x in all_shape] all_shape = [list(map(int, x)) for x in all_shape] max_count = max(all_count) ## padding tensors and gather them output_tensors = [torch.Tensor(max_count).cuda() for i in range(world_size)] padded_input_array = np.zeros(max_count) padded_input_array[:mycount] = input_array.reshape(-1) input_tensor = torch.Tensor(padded_input_array).cuda() dist.all_gather(output_tensors, input_tensor) ## unpadding gathered tensors padded_output = [x.cpu().numpy() for x in output_tensors] output = [x[:all_count[i]].reshape(all_shape[i]) for i,x in enumerate(padded_output)] return output
Example #2
Source File: distributed.py From SlowFast with Apache License 2.0 | 7 votes |
def all_gather(tensors): """ All gathers the provided tensors from all processes across machines. Args: tensors (list): tensors to perform all gather across all processes in all machines. """ gather_list = [] output_tensor = [] world_size = dist.get_world_size() for tensor in tensors: tensor_placeholder = [ torch.ones_like(tensor) for _ in range(world_size) ] dist.all_gather(tensor_placeholder, tensor, async_op=False) gather_list.append(tensor_placeholder) for gathered_tensor in gather_list: output_tensor.append(torch.cat(gathered_tensor, dim=0)) return output_tensor
Example #3
Source File: comm.py From fast-reid with Apache License 2.0 | 6 votes |
def _pad_to_largest_tensor(tensor, group): """ Returns: list[int]: size of the tensor, on each rank Tensor: padded tensor that has the max size """ world_size = dist.get_world_size(group=group) assert ( world_size >= 1 ), "comm.gather/all_gather must be called from ranks within the given group!" local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) size_list = [ torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) ] dist.all_gather(size_list, local_size, group=group) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes if local_size != max_size: padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device) tensor = torch.cat((tensor, padding), dim=0) return size_list, tensor
Example #4
Source File: comm.py From detectron2 with Apache License 2.0 | 6 votes |
def _pad_to_largest_tensor(tensor, group): """ Returns: list[int]: size of the tensor, on each rank Tensor: padded tensor that has the max size """ world_size = dist.get_world_size(group=group) assert ( world_size >= 1 ), "comm.gather/all_gather must be called from ranks within the given group!" local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) size_list = [ torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) ] dist.all_gather(size_list, local_size, group=group) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes if local_size != max_size: padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device) tensor = torch.cat((tensor, padding), dim=0) return size_list, tensor
Example #5
Source File: functions.py From inplace_abn with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _gather_values(*tensors, group, world_size): # Start gather operations asynchronously gathered, gather_ops = [], [] for t in tensors: t_all = t.new_empty(world_size, *t.shape) t_op = distributed.all_gather(list(t_all.unbind(0)), t, group=group, async_op=True) gathered.append(t_all) gather_ops.append(t_op) # Wait for op in gather_ops: op.wait() # Return results return tuple(gathered)
Example #6
Source File: coco_eval.py From torchbench with Apache License 2.0 | 6 votes |
def merge(img_ids, eval_imgs): all_img_ids = all_gather(img_ids) all_eval_imgs = all_gather(eval_imgs) merged_img_ids = [] for p in all_img_ids: merged_img_ids.extend(p) merged_eval_imgs = [] for p in all_eval_imgs: merged_eval_imgs.append(p) merged_img_ids = np.array(merged_img_ids) merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) # keep only unique (and in sorted order) images merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) merged_eval_imgs = merged_eval_imgs[..., idx] return merged_img_ids, merged_eval_imgs
Example #7
Source File: comm.py From detectron2 with Apache License 2.0 | 6 votes |
def _pad_to_largest_tensor(tensor, group): """ Returns: list[int]: size of the tensor, on each rank Tensor: padded tensor that has the max size """ world_size = dist.get_world_size(group=group) assert ( world_size >= 1 ), "comm.gather/all_gather must be called from ranks within the given group!" local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) size_list = [ torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) ] dist.all_gather(size_list, local_size, group=group) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes if local_size != max_size: padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device) tensor = torch.cat((tensor, padding), dim=0) return size_list, tensor
Example #8
Source File: dist_utils.py From video_analyst with MIT License | 5 votes |
def _pad_to_largest_tensor(tensor, group): """ Returns: list[int]: size of the tensor, on each rank Tensor: padded tensor that has the max size """ world_size = dist.get_world_size(group=group) assert ( world_size >= 1 ), "comm.gather/all_gather must be called from ranks within the given group!" local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) size_list = [ torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) ] dist.all_gather(size_list, local_size, group=group) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes if local_size != max_size: padding = torch.zeros((max_size - local_size, ), dtype=torch.uint8, device=tensor.device) tensor = torch.cat((tensor, padding), dim=0) return size_list, tensor
Example #9
Source File: distributed.py From SlowFast with Apache License 2.0 | 5 votes |
def all_gather_unaligned(data, group=None): """ Run all_gather on arbitrary picklable data (not necessarily tensors). Args: data: any picklable object group: a torch process group. By default, will use a group which contains all ranks on gloo backend. Returns: list[data]: list of data gathered from each rank """ if get_world_size() == 1: return [data] if group is None: group = _get_global_gloo_group() if dist.get_world_size(group) == 1: return [data] tensor = _serialize_to_tensor(data, group) size_list, tensor = _pad_to_largest_tensor(tensor, group) max_size = max(size_list) # receiving Tensor from all ranks tensor_list = [ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list ] dist.all_gather(tensor_list, tensor, group=group) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #10
Source File: batchnorm_helper.py From SlowFast with Apache License 2.0 | 5 votes |
def forward(ctx, input, num_sync_devices, num_groups): """ Perform forwarding, gathering the stats across different process/ GPU group. """ ctx.num_sync_devices = num_sync_devices ctx.num_groups = num_groups input_list = [ torch.zeros_like(input) for k in range(du.get_local_size()) ] dist.all_gather( input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP ) inputs = torch.stack(input_list, dim=0) if num_groups > 1: rank = du.get_local_rank() group_idx = rank // num_sync_devices inputs = inputs[ group_idx * num_sync_devices : (group_idx + 1) * num_sync_devices ] inputs = torch.sum(inputs, dim=0) return inputs
Example #11
Source File: dist_utils.py From video_analyst with MIT License | 5 votes |
def shared_random_seed(): """ Returns: int: a random number that is the same across all workers. If workers need a shared RNG, they can use this shared seed to create one. All workers must call this function, otherwise it will deadlock. """ ints = np.random.randint(2**31) all_ints = all_gather(ints) return all_ints[0]
Example #12
Source File: comm.py From DF-Traffic-Sign-Identification with MIT License | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #13
Source File: comm.py From TinyBenchmark with MIT License | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #14
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def shared_random_seed(): """ Returns: int: a random number that is the same across all workers. If workers need a shared RNG, they can use this shared seed to create one. All workers must call this function, otherwise it will deadlock. """ ints = np.random.randint(2 ** 31) all_ints = all_gather(ints) return all_ints[0]
Example #15
Source File: dist_utils.py From video_analyst with MIT License | 5 votes |
def all_gather(data, group=None): """ Run all_gather on arbitrary picklable data (not necessarily tensors). Args: data: any picklable object group: a torch process group. By default, will use a group which contains all ranks on gloo backend. Returns: list[data]: list of data gathered from each rank """ if get_world_size() == 1: return [data] if group is None: group = _get_global_gloo_group() if dist.get_world_size(group) == 1: return [data] tensor = _serialize_to_tensor(data, group) size_list, tensor = _pad_to_largest_tensor(tensor, group) max_size = max(size_list) # receiving Tensor from all ranks tensor_list = [ torch.empty((max_size, ), dtype=torch.uint8, device=tensor.device) for _ in size_list ] dist.all_gather(tensor_list, tensor, group=group) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #16
Source File: dist_common.py From Det3D with Apache License 2.0 | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #17
Source File: norm.py From Det3D with Apache License 2.0 | 5 votes |
def forward(ctx, input): input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())] # Use allgather instead of allreduce since I don't trust in-place operations .. dist.all_gather(input_list, input, async_op=False) inputs = torch.stack(input_list, dim=0) return torch.sum(inputs, dim=0)
Example #18
Source File: comm.py From RRPN_pytorch with MIT License | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #19
Source File: utils.py From Det3D with Apache License 2.0 | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #20
Source File: distributed.py From SlowFast with Apache License 2.0 | 5 votes |
def _pad_to_largest_tensor(tensor, group): """ Padding all the tensors from different GPUs to the largest ones. Args: tensor (tensor): tensor to pad. group (group): pytorch dist group. Returns: list[int]: size of the tensor, on each rank Tensor: padded tensor that has the max size """ world_size = dist.get_world_size(group=group) assert ( world_size >= 1 ), "comm.gather/all_gather must be called from ranks within the given group!" local_size = torch.tensor( [tensor.numel()], dtype=torch.int64, device=tensor.device ) size_list = [ torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) ] dist.all_gather(size_list, local_size, group=group) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes if local_size != max_size: padding = torch.zeros( (max_size - local_size,), dtype=torch.uint8, device=tensor.device ) tensor = torch.cat((tensor, padding), dim=0) return size_list, tensor
Example #21
Source File: batch_norm.py From detectron2 with Apache License 2.0 | 5 votes |
def forward(ctx, input): input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())] # Use allgather instead of allreduce since I don't trust in-place operations .. dist.all_gather(input_list, input, async_op=False) inputs = torch.stack(input_list, dim=0) return torch.sum(inputs, dim=0)
Example #22
Source File: distributed.py From nnUNet with Apache License 2.0 | 5 votes |
def forward(ctx, input): world_size = distributed.get_world_size() # create a destination list for the allgather. I'm assuming you're gathering from 3 workers. allgather_list = [torch.empty_like(input) for _ in range(world_size)] #if distributed.get_rank() == 0: # import IPython;IPython.embed() distributed.all_gather(allgather_list, input) return torch.cat(allgather_list, dim=0)
Example #23
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def all_gather(data, group=None): """ Run all_gather on arbitrary picklable data (not necessarily tensors). Args: data: any picklable object group: a torch process group. By default, will use a group which contains all ranks on gloo backend. Returns: list[data]: list of data gathered from each rank """ if get_world_size() == 1: return [data] if group is None: group = _get_global_gloo_group() if dist.get_world_size(group) == 1: return [data] tensor = _serialize_to_tensor(data, group) size_list, tensor = _pad_to_largest_tensor(tensor, group) max_size = max(size_list) # receiving Tensor from all ranks tensor_list = [ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list ] dist.all_gather(tensor_list, tensor, group=group) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #24
Source File: batch_norm.py From detectron2 with Apache License 2.0 | 5 votes |
def forward(ctx, input): input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())] # Use allgather instead of allreduce since I don't trust in-place operations .. dist.all_gather(input_list, input, async_op=False) inputs = torch.stack(input_list, dim=0) return torch.sum(inputs, dim=0)
Example #25
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def shared_random_seed(): """ Returns: int: a random number that is the same across all workers. If workers need a shared RNG, they can use this shared seed to create one. All workers must call this function, otherwise it will deadlock. """ ints = np.random.randint(2 ** 31) all_ints = all_gather(ints) return all_ints[0]
Example #26
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def all_gather(data, group=None): """ Run all_gather on arbitrary picklable data (not necessarily tensors). Args: data: any picklable object group: a torch process group. By default, will use a group which contains all ranks on gloo backend. Returns: list[data]: list of data gathered from each rank """ if get_world_size() == 1: return [data] if group is None: group = _get_global_gloo_group() if dist.get_world_size(group) == 1: return [data] tensor = _serialize_to_tensor(data, group) size_list, tensor = _pad_to_largest_tensor(tensor, group) max_size = max(size_list) # receiving Tensor from all ranks tensor_list = [ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list ] dist.all_gather(tensor_list, tensor, group=group) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #27
Source File: distributed.py From mobilenetv3-segmentation with Apache License 2.0 | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #28
Source File: comm.py From EmbedMask with MIT License | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.IntTensor([tensor.numel()]).to("cuda") size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #29
Source File: comm.py From maskrcnn-benchmark with MIT License | 5 votes |
def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.LongTensor([tensor.numel()]).to("cuda") size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) if local_size != max_size: padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list
Example #30
Source File: test.py From IoU-Uniform-R-CNN with Apache License 2.0 | 5 votes |
def collect_results_gpu(result_part, size): rank, world_size = get_dist_info() # dump result part to tensor with pickle part_tensor = torch.tensor( bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') # gather all result part tensor shape shape_tensor = torch.tensor(part_tensor.shape, device='cuda') shape_list = [shape_tensor.clone() for _ in range(world_size)] dist.all_gather(shape_list, shape_tensor) # padding result part tensor to max length shape_max = torch.tensor(shape_list).max() part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') part_send[:shape_tensor[0]] = part_tensor part_recv_list = [ part_tensor.new_zeros(shape_max) for _ in range(world_size) ] # gather all result part dist.all_gather(part_recv_list, part_send) if rank == 0: part_list = [] for recv, shape in zip(part_recv_list, shape_list): part_list.append( pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results