Python torch.distributed.new_group() Examples
The following are 17 code examples of torch.distributed.new_group(), drawn from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out the other available functions and classes of the torch.distributed module.
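
Before the examples, here is a minimal sketch of how new_group() is typically used: every rank must call new_group() with the same arguments, and the returned handle is then passed to collectives via the group argument. The gloo backend, the env:// rendezvous, the run_worker name, and the even-ranks split below are illustrative assumptions, not taken from any of the projects listed here.

import torch
import torch.distributed as dist

def run_worker(rank, world_size):
    # All ranks first join the default (WORLD) process group.
    dist.init_process_group(
        backend="gloo",
        init_method="env://",  # reads MASTER_ADDR / MASTER_PORT from the environment
        rank=rank,
        world_size=world_size,
    )

    # Every rank must call new_group() with the same ranks list,
    # even the ranks that are not members of the new group.
    even_ranks = list(range(0, world_size, 2))
    even_group = dist.new_group(ranks=even_ranks)

    # Only members of the group take part in collectives on it.
    tensor = torch.tensor([rank], dtype=torch.float32)
    if rank in even_ranks:
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=even_group)
        print(f"rank {rank}: sum over even ranks = {tensor.item()}")

    dist.destroy_process_group()

Such a worker would typically be launched once per process, for example via torch.multiprocessing.spawn or torchrun, with MASTER_ADDR and MASTER_PORT set in the environment.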
Example #1
Source File: main.py From pytorch-distributed-example with MIT License | 6 votes |
def run(world_size, rank, steps):
    for step in range(1, steps + 1):
        # get random int
        value = randint(0, 10)

        # group all ranks
        ranks = list(range(world_size))
        group = dist.new_group(ranks=ranks)

        # compute reduced sum
        tensor = torch.tensor(value, dtype=torch.int)
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)

        print('rank: {}, step: {}, value: {}, reduced sum: {}.'.format(
            rank, step, value, tensor.item()))

        sleep(1)
Example #2
Source File: ray_container.py From adeptRL with GNU General Public License v3.0 | 6 votes |
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
    world_size = len(learner_ranks) + len(worker_ranks)
    dist.init_process_group(
        "nccl",
        init_method="tcp://{}:{}".format(ip, port),
        rank=rank,
        world_size=world_size,
    )
    groups = {}
    for learner_rank in learner_ranks:
        for worker_rank in worker_ranks:
            g = dist.new_group([learner_rank, worker_rank])
            if worker_rank == rank:
                groups[learner_rank] = g
    dist.new_group(learner_ranks)
    self.groups = groups
    self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
    self.rank = rank
    self.network = torch.zeros(3).to(self.device)
    self.exp = None
    self.network_handle = None
Example #3
Source File: data_parallel_dist.py From ps_pytorch with MIT License | 6 votes |
def _start_reduction_threads(self):
    num_buckets = len(self.bucket_sizes)
    self._reduction_queues = [queue.Queue() for _ in range(num_buckets)]
    self._reduction_threads = []
    self._reduction_streams = [[] for _ in range(num_buckets)]
    self._nccl_streams = []
    self._default_streams = []
    for dev_id in self.device_ids:
        with torch.cuda.device(dev_id):
            # TODO: don't assume we're on a default stream
            self._default_streams.append(torch.cuda.current_stream())
            self._nccl_streams.append(torch.cuda.Stream())
    for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams):
        for dev_id in self.device_ids:
            with torch.cuda.device(dev_id):
                reduction_streams.append(torch.cuda.Stream())
        # We only use the first device for distributed reductions
        dist._register_stream(reduction_streams[0])
        group_id = dist.new_group()

        self._reduction_threads.append(threading.Thread(
            target=self._reduction_thread_fn,
            args=(reduction_queue, group_id, self.device_ids,
                  reduction_streams, self._nccl_streams)))
        self._reduction_threads[-1].daemon = True
        self._reduction_threads[-1].start()
Example #4
Source File: distributed.py From SlowFast with Apache License 2.0 | 6 votes |
def init_distributed_training(cfg):
    """
    Initialize variables needed for distributed training.
    """
    if cfg.NUM_GPUS == 1:
        return
    num_gpus_per_machine = cfg.NUM_GPUS
    num_machines = dist.get_world_size() // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(
            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
        )
        pg = dist.new_group(ranks_on_i)
        if i == cfg.SHARD_ID:
            global _LOCAL_PROCESS_GROUP
            _LOCAL_PROCESS_GROUP = pg
Example #5
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #6
Source File: dist_utils.py From video_analyst with MIT License | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #7
Source File: distributed.py From SlowFast with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    Returns:
        (group): pytorch dist group.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #8
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #9
Source File: launch.py From detectron2 with Apache License 2.0 | 5 votes |
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
Example #10
Source File: main.py From elastic with BSD 3-Clause "New" or "Revised" License | 5 votes |
def tmp_process_group(backend):
    cpu_pg = dist.new_group(backend=backend)
    try:
        yield cpu_pg
    finally:
        dist.destroy_process_group(cpu_pg)
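
Since this function yields the group and tears it down afterwards, it is presumably decorated with contextlib.contextmanager in the original source. A hypothetical usage sketch (the gloo backend and the all_reduce payload are illustrative assumptions, not from the project):

# Hypothetical usage, assuming tmp_process_group is wrapped with
# contextlib.contextmanager in the original source:
with tmp_process_group(backend="gloo") as pg:
    t = torch.tensor([1.0])
    dist.all_reduce(t, group=pg)  # collective runs on the temporary group
# the temporary group is destroyed when the with-block exits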
Example #11
Source File: distributed.py From torchsupport with MIT License | 5 votes |
def __init__(self, *args, **kwargs):
    super(SynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.group = distributed.new_group(ranks=list(range(self.world_size)))
Example #12
Source File: launch.py From detectron2 with Apache License 2.0 | 5 votes |
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
Example #13
Source File: comm.py From fast-reid with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #14
Source File: group.py From inplace_abn with BSD 3-Clause "New" or "Revised" License | 5 votes |
def active_group(active):
    """Initialize a distributed group where each process can independently decide whether to participate or not

    Parameters
    ----------
    active : bool
        Whether this process will be active in the group or not

    Returns
    -------
    A distributed group containing all processes that passed `active=True`, or `None` if all passed `False`
    """
    world_size = distributed.get_world_size()
    rank = distributed.get_rank()

    # Check if cache is initialized, add WORLD and None to it
    if not hasattr(active_group, "__cache__"):
        active_group.__cache__ = {
            frozenset(range(world_size)): distributed.group.WORLD,
            frozenset(): None
        }

    # Gather active status from all workers
    active = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device())
    active_workers = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device())
    distributed.all_gather(list(active_workers.unbind(0)), active)

    # Create and cache group if it doesn't exist yet
    active_workers = frozenset(int(i) for i in active_workers.tolist() if i != -1)
    if active_workers not in active_group.__cache__:
        group = distributed.new_group(list(active_workers))
        active_group.__cache__[active_workers] = group
    return active_group.__cache__[active_workers]
Example #15
Source File: distributed_communicator.py From CrypTen with MIT License | 5 votes |
def __init__(self, init_ttp=False):
    # no need to do anything if we already initialized the communicator:
    if not dist.is_initialized():
        # get configuration variables from environment:
        for key in ["distributed_backend", "rendezvous", "world_size", "rank"]:
            if key.upper() not in os.environ:
                raise ValueError("Environment variable %s must be set." % key)
            setattr(self, key.lower(), os.environ[key.upper()])

        # make sure world size and rank are integers; comms stats are reset:
        self.world_size = int(self.world_size)
        self.rank = int(self.rank)
        self.reset_communication_stats()
        self._name = f"rank{self.rank}"

        # logging:
        logging.info("==================")
        logging.info("DistributedCommunicator with rank %d" % self.rank)
        logging.info("==================")

        # initialize process group:
        total_ws = self.world_size + 1 if init_ttp else self.world_size
        dist.init_process_group(
            backend=self.distributed_backend,
            init_method=self.rendezvous,
            world_size=total_ws,
            rank=self.rank,
        )
        self.ttp_group = dist.new_group(list(range(total_ws)))
        self.main_group = dist.new_group(list(range(self.world_size)))
        self.ttp_initialized = init_ttp
        logging.info("World size = %d" % self.world_size)
Example #16
Source File: ray_container.py From adeptRL with GNU General Public License v3.0 | 5 votes |
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
    world_size = len(learner_ranks) + len(worker_ranks)
    dist.init_process_group(
        "nccl",
        init_method="tcp://{}:{}".format(ip, port),
        rank=rank,
        world_size=world_size,
    )
    groups = {}
    for learner_rank in learner_ranks:
        for worker_rank in worker_ranks:
            g = dist.new_group([learner_rank, worker_rank])
            if learner_rank == rank:
                groups[worker_rank] = g
    learner_group = dist.new_group(learner_ranks)
    self.groups = groups
    self.learner_group = learner_group
    self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
    self.rank = rank
    self.exps = {
        w_rank: torch.zeros(2).to(self.device) for w_rank in worker_ranks
    }
    self.network = torch.ones(3).to(self.device)
    self.network_grads = [torch.ones(3).to(self.device)]
    self.exp_handles = None
Example #17
Source File: distributed.py From torchsupport with MIT License | 5 votes |
def __init__(self, *args, **kwargs):
    super(AsynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.gossip_step = 0
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.groups = []
    for idx in range(self.world_size - 1):
        partner = (self.rank + idx + 1) % self.world_size
        group = distributed.new_group(ranks=[self.rank, partner])
        self.groups.append(group)