Python torch.distributed.get_rank() Examples
The following are 30 code examples of torch.distributed.get_rank(), collected from open-source projects. The source file, originating project, and license are listed above each example. You may also want to check out the other available functions and classes of the torch.distributed module.
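Before the examples, here is a minimal, hedged sketch of how torch.distributed.get_rank() is typically used. It is not taken from any project below; the "gloo" backend and environment-variable initialization are assumptions chosen only so the sketch runs on CPU under a standard launcher.

import torch.distributed as dist


def main():
    # A launcher such as torchrun is assumed to have set RANK, WORLD_SIZE and
    # MASTER_ADDR/MASTER_PORT; "gloo" is used here only so the sketch runs on CPU.
    dist.init_process_group(backend="gloo", init_method="env://")

    rank = dist.get_rank()              # unique id of this process in the group
    world_size = dist.get_world_size()  # total number of processes

    if rank == 0:
        # Most recipes let only the main process log or save checkpoints.
        print("running with {} processes".format(world_size))

    dist.destroy_process_group()


if __name__ == "__main__":
    main()

Launched with, for example, torchrun --nproc_per_node=2 script.py, each process prints nothing except rank 0, which reports the world size.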
Example #1
Source File: trainer.py From Res2Net-maskrcnn with MIT License | 8 votes |
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
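For orientation, a hedged sketch of how a helper like this might be exercised; the dummy losses and the assumption of an already-initialized, CPU-friendly process group (e.g. "gloo") are for illustration only and are not taken from the Res2Net-maskrcnn source.

import torch
import torch.distributed as dist

# Assumes the process group is already initialized (e.g. via torchrun) and that
# reduce_loss_dict / get_world_size from the example above are in scope.
loss_dict = {
    "loss_cls": torch.tensor(0.7),
    "loss_box": torch.tensor(0.3),
}
reduced = reduce_loss_dict(loss_dict)
if dist.get_rank() == 0:
    # Only rank 0 holds the averaged values after dist.reduce(..., dst=0).
    print({k: v.item() for k, v in reduced.items()})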
Example #2
Source File: distributed.py From SegmenTron with Apache License 2.0 | 6 votes |
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
Example #3
Source File: distributed_utils.py From conditional-motion-propagation with MIT License | 6 votes |
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1):
    if world_size is None:
        world_size = dist.get_world_size()
    if rank is None:
        rank = dist.get_rank()
    assert rank < world_size
    self.dataset = dataset
    self.total_iter = total_iter
    self.batch_size = batch_size
    self.world_size = world_size
    self.rank = rank
    self.last_iter = last_iter

    self.total_size = self.total_iter * self.batch_size
    self.indices = self.gen_new_list()
    self.call = 0
Example #4
Source File: trainer.py From Clothing-Detection with GNU General Public License v3.0 | 6 votes |
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
Example #5
Source File: distributed.py From R2CNN.pytorch with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #6
Source File: trainer.py From R2CNN.pytorch with MIT License | 6 votes |
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
Example #7
Source File: distributed.py From Clothing-Detection with GNU General Public License v3.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #8
Source File: statistics.py From ITDD with MIT License | 6 votes |
def all_gather_stats_list(stat_list, max_size=4096):
    """
    Gather a `Statistics` list across all processes/nodes

    Args:
        stat_list(list([`Statistics`])): list of statistics objects to
            gather across all processes/nodes
        max_size(int): max buffer size to use

    Returns:
        our_stats(list([`Statistics`])): list of updated stats
    """
    # Get a list of world_size lists with len(stat_list) Statistics objects
    all_stats = all_gather_list(stat_list, max_size=max_size)

    our_rank = get_rank()
    our_stats = all_stats[our_rank]
    for other_rank, stats in enumerate(all_stats):
        if other_rank == our_rank:
            continue
        for i, stat in enumerate(stats):
            our_stats[i].update(stat, update_n_src_words=True)
    return our_stats
Example #9
Source File: distributed.py From Parsing-R-CNN with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #10
Source File: sampler.py From LEDNet with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #11
Source File: parallel.py From LEDNet with MIT License | 6 votes |
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        dist.reduce(all_losses, dst=0)
        if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
Example #12
Source File: distributed.py From Res2Net-maskrcnn with MIT License | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #13
Source File: samplers.py From DenseMatchingBenchmark with MIT License | 6 votes |
def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.dataset = dataset
    self.samples_per_gpu = samples_per_gpu
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    assert hasattr(self.dataset, 'flag')
    self.flag = self.dataset.flag
    self.group_sizes = np.bincount(self.flag)

    self.num_samples = 0
    for i, j in enumerate(self.group_sizes):
        self.num_samples += int(
            math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
                      self.num_replicas)) * self.samples_per_gpu
    self.total_size = self.num_samples * self.num_replicas
Example #14
Source File: sampler.py From mars with Apache License 2.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    import torch.distributed as dist
    super().__init__(dataset)
    if num_replicas is None:  # pragma: no cover
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:  # pragma: no cover
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #15
Source File: distributed.py From SegmenTron with Apache License 2.0 | 6 votes |
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
    self.shuffle = shuffle
Example #16
Source File: sampler.py From AerialDetection with Apache License 2.0 | 6 votes |
def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.dataset = dataset
    self.samples_per_gpu = samples_per_gpu
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0

    assert hasattr(self.dataset, 'flag')
    self.flag = self.dataset.flag
    self.group_sizes = np.bincount(self.flag)

    self.num_samples = 0
    for i, j in enumerate(self.group_sizes):
        self.num_samples += int(
            math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
                      self.num_replicas)) * self.samples_per_gpu
    self.total_size = self.num_samples * self.num_replicas
Example #17
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def get_rank():
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()
Example #18
Source File: distributed.py From awesome-semantic-segmentation-pytorch with Apache License 2.0 | 5 votes |
def get_rank():
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()
Example #19
Source File: utils.py From kaggle-kuzushiji-2019 with MIT License | 5 votes |
def is_main_process():
    return get_rank() == 0
Example #20
Source File: utils.py From kaggle-kuzushiji-2019 with MIT License | 5 votes |
def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()
Example #21
Source File: evaluate.py From DeepLab-v3-plus-cityscapes with MIT License | 5 votes |
def evaluate():
    ## setup
    cfg = config_factory['resnet_cityscapes']
    args = parse_args()
    if not args.local_rank == -1:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(
            backend = 'nccl',
            init_method = 'tcp://127.0.0.1:{}'.format(cfg.port),
            world_size = torch.cuda.device_count(),
            rank = args.local_rank
        )
        setup_logger(cfg.respth)
    else:
        FORMAT = '%(levelname)s %(filename)s(%(lineno)d): %(message)s'
        log_level = logging.INFO
        if dist.is_initialized() and dist.get_rank() != 0:
            log_level = logging.ERROR
        logging.basicConfig(level=log_level, format=FORMAT, stream=sys.stdout)
    logger = logging.getLogger()

    ## model
    logger.info('setup and restore model')
    net = Deeplab_v3plus(cfg)
    save_pth = osp.join(cfg.respth, 'model_final.pth')
    net.load_state_dict(torch.load(save_pth), strict=False)
    net.cuda()
    net.eval()
    if not args.local_rank == -1:
        net = nn.parallel.DistributedDataParallel(
            net,
            device_ids = [args.local_rank, ],
            output_device = args.local_rank
        )

    ## evaluator
    logger.info('compute the mIOU')
    evaluator = MscEval(cfg)

    mIOU = evaluator(net)
    logger.info('mIOU is: {:.6f}'.format(mIOU))
Example #22
Source File: logger.py From DeepLab-v3-plus-cityscapes with MIT License | 5 votes |
def setup_logger(logpth):
    logfile = 'Deeplab_v3plus-{}.log'.format(time.strftime('%Y-%m-%d-%H-%M-%S'))
    logfile = osp.join(logpth, logfile)
    FORMAT = '%(levelname)s %(filename)s(%(lineno)d): %(message)s'
    log_level = logging.INFO
    if dist.is_initialized() and dist.get_rank() != 0:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level, format=FORMAT, filename=logfile)
    logging.root.addHandler(logging.StreamHandler())
Example #23
Source File: distributed.py From torchsupport with MIT License | 5 votes |
def __init__(self, *args, **kwargs):
    super(SynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.group = distributed.new_group(ranks=list(range(self.world_size)))
Example #24
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.reduce(values, dst=0)
        if dist.get_rank() == 0 and average:
            # only main process gets accumulated, so only divide by
            # world_size in this case
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict
Example #25
Source File: data_sampler.py From BasicSR with Apache License 2.0 | 5 votes |
def __init__(self, dataset, num_replicas=None, rank=None, ratio=100):
    if num_replicas is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        num_replicas = dist.get_world_size()
    if rank is None:
        if not dist.is_available():
            raise RuntimeError("Requires distributed package to be available")
        rank = dist.get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * ratio / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
Example #26
Source File: comm.py From Clothing-Detection with GNU General Public License v3.0 | 5 votes |
def is_main_process():
    return get_rank() == 0
Example #27
Source File: distributed_utils.py From fairseq with MIT License | 5 votes |
def get_rank():
    return dist.get_rank()
Example #28
Source File: distributed_operations.py From sagemaker-pytorch-training-toolkit with Apache License 2.0 | 5 votes |
def _get_tensors_sum(rows, columns):
    device = torch.device(
        "cuda:{}".format(dist.get_rank() % torch.cuda.device_count())
        if torch.cuda.is_available()
        else "cpu"
    )
    result = (1 + dist.get_world_size()) * dist.get_world_size() / 2
    tensor = torch.ones(rows, columns) * result
    return tensor.to(device)
Example #29
Source File: distributed_operations.py From sagemaker-pytorch-training-toolkit with Apache License 2.0 | 5 votes |
def _get_tensor(rank, rows, columns):
    device = torch.device(
        "cuda:{}".format(dist.get_rank() % torch.cuda.device_count())
        if torch.cuda.is_available()
        else "cpu"
    )
    tensor = torch.ones(rows, columns) * (rank + 1)
    return tensor.to(device)
Example #30
Source File: distributed_operations.py From sagemaker-pytorch-training-toolkit with Apache License 2.0 | 5 votes |
def _get_zeros_tensor(rows, columns):
    device = torch.device(
        "cuda:{}".format(dist.get_rank() % torch.cuda.device_count())
        if torch.cuda.is_available()
        else "cpu"
    )
    tensor = torch.zeros(rows, columns)
    return tensor.to(device)