Python torch.distributed.new_group() Examples
The following are 17 code examples of torch.distributed.new_group(), drawn from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out the other available functions and classes of the torch.distributed module.
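
Before the examples, here is a minimal sketch of how new_group() is typically used: every rank must call new_group() with the same arguments, and the returned handle is then passed to collectives via the group argument. The gloo backend, the env:// rendezvous, the run_worker name, and the even-ranks split below are illustrative assumptions, not taken from any of the projects listed here.

import torch
import torch.distributed as dist

def run_worker(rank, world_size):
    # All ranks first join the default (WORLD) process group.
    dist.init_process_group(
        backend="gloo",
        init_method="env://",  # reads MASTER_ADDR / MASTER_PORT from the environment
        rank=rank,
        world_size=world_size,
    )

    # Every rank must call new_group() with the same ranks list,
    # even the ranks that are not members of the new group.
    even_ranks = list(range(0, world_size, 2))
    even_group = dist.new_group(ranks=even_ranks)

    # Only members of the group take part in collectives on it.
    tensor = torch.tensor([rank], dtype=torch.float32)
    if rank in even_ranks:
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=even_group)
        print(f"rank {rank}: sum over even ranks = {tensor.item()}")

    dist.destroy_process_group()

Such a worker would typically be launched once per process, for example via torch.multiprocessing.spawn or torchrun, with MASTER_ADDR and MASTER_PORT set in the environment.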
Example #1
Source File: main.py From pytorch-distributed-example with MIT License | 6 votes |
def run(world_size, rank, steps):
    for step in range(1, steps + 1):
        # get random int
        value = randint(0, 10)

        # group all ranks
        ranks = list(range(world_size))
        group = dist.new_group(ranks=ranks)

        # compute reduced sum
        tensor = torch.tensor(value, dtype=torch.int)
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)

        print('rank: {}, step: {}, value: {}, reduced sum: {}.'.format(
            rank, step, value, tensor.item()))

        sleep(1)
Example #2
Source File: ray_container.py From adeptRL with GNU General Public License v3.0 | 6 votes |
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
    world_size = len(learner_ranks) + len(worker_ranks)
    dist.init_process_group(
        "nccl",
        init_method="tcp://{}:{}".format(ip, port),
        rank=rank,
        world_size=world_size,
    )
    groups = {}
    for learner_rank in learner_ranks:
        for worker_rank in worker_ranks:
            g = dist.new_group([learner_rank, worker_rank])
            if worker_rank == rank:
                groups[learner_rank] = g
    dist.new_group(learner_ranks)
    self.groups = groups
    self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
    self.rank = rank
    self.network = torch.zeros(3).to(self.device)
    self.exp = None
    self.network_handle = None
Example #3
Source File: data_parallel_dist.py From ps_pytorch with MIT License | 6 votes |
def _start_reduction_threads(self):
    num_buckets = len(self.bucket_sizes)
    self._reduction_queues = [queue.Queue() for _ in range(num_buckets)]
    self._reduction_threads = []
    self._reduction_streams = [[] for _ in range(num_buckets)]
    self._nccl_streams = []
    self._default_streams = []
    for dev_id in self.device_ids:
        with torch.cuda.device(dev_id):
            # TODO: don't assume we're on a default stream
            self._default_streams.append(torch.cuda.current_stream())
            self._nccl_streams.append(torch.cuda.Stream())
    for reduction_queue, reduction_streams in zip(self._reduction_queues, self._reduction_streams):
        for dev_id in self.device_ids:
            with torch.cuda.device(dev_id):
                reduction_streams.append(torch.cuda.Stream())
        # We only use the first device for distributed reductions
        dist._register_stream(reduction_streams[0])
        group_id = dist.new_group()

        self._reduction_threads.append(threading.Thread(
            target=self._reduction_thread_fn,
            args=(reduction_queue, group_id, self.device_ids,
                  reduction_streams, self._nccl_streams)))
        self._reduction_threads[-1].daemon = True
        self._reduction_threads[-1].start()
Example #4
Source File: distributed.py From SlowFast with Apache License 2.0 | 6 votes |
def init_distributed_training(cfg):
    """
    Initialize variables needed for distributed training.
    """
    if cfg.NUM_GPUS == 1:
        return
    num_gpus_per_machine = cfg.NUM_GPUS
    num_machines = dist.get_world_size() // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(
            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
        )
        pg = dist.new_group(ranks_on_i)
        if i == cfg.SHARD_ID:
            global _LOCAL_PROCESS_GROUP
            _LOCAL_PROCESS_GROUP = pg
Example #5
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #6
Source File: dist_utils.py From video_analyst with MIT License | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #7
Source File: distributed.py From SlowFast with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    Returns:
        (group): pytorch dist group.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #8
Source File: comm.py From detectron2 with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #9
Source File: launch.py From detectron2 with Apache License 2.0 | 5 votes |
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
Example #10
Source File: main.py From elastic with BSD 3-Clause "New" or "Revised" License | 5 votes |
def tmp_process_group(backend):
    cpu_pg = dist.new_group(backend=backend)
    try:
        yield cpu_pg
    finally:
        dist.destroy_process_group(cpu_pg)
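
Since this function yields the group and tears it down afterwards, it is presumably decorated with contextlib.contextmanager in the original source. A hypothetical usage sketch (the gloo backend and the all_reduce payload are illustrative assumptions, not from the project):

# Hypothetical usage, assuming tmp_process_group is wrapped with
# contextlib.contextmanager in the original source:
with tmp_process_group(backend="gloo") as pg:
    t = torch.tensor([1.0])
    dist.all_reduce(t, group=pg)  # collective runs on the temporary group
# the temporary group is destroyed when the with-block exits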
Example #11
Source File: distributed.py From torchsupport with MIT License | 5 votes |
def __init__(self, *args, **kwargs):
    super(SynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.group = distributed.new_group(ranks=list(range(self.world_size)))
Example #12
Source File: launch.py From detectron2 with Apache License 2.0 | 5 votes |
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(
            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
        )
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
Example #13
Source File: comm.py From fast-reid with Apache License 2.0 | 5 votes |
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks.
    The result is cached.
    """
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
Example #14
Source File: group.py From inplace_abn with BSD 3-Clause "New" or "Revised" License | 5 votes |
def active_group(active):
    """Initialize a distributed group where each process can independently decide whether to participate or not

    Parameters
    ----------
    active : bool
        Whether this process will be active in the group or not

    Returns
    -------
    A distributed group containing all processes that passed `active=True`, or `None` if all passed `False`
    """
    world_size = distributed.get_world_size()
    rank = distributed.get_rank()

    # Check if cache is initialized, add WORLD and None to it
    if not hasattr(active_group, "__cache__"):
        active_group.__cache__ = {
            frozenset(range(world_size)): distributed.group.WORLD,
            frozenset(): None
        }

    # Gather active status from all workers
    active = torch.tensor(rank if active else -1, dtype=torch.long, device=torch.cuda.current_device())
    active_workers = torch.empty(world_size, dtype=torch.long, device=torch.cuda.current_device())
    distributed.all_gather(list(active_workers.unbind(0)), active)

    # Create and cache group if it doesn't exist yet
    active_workers = frozenset(int(i) for i in active_workers.tolist() if i != -1)
    if active_workers not in active_group.__cache__:
        group = distributed.new_group(list(active_workers))
        active_group.__cache__[active_workers] = group
    return active_group.__cache__[active_workers]
Example #15
Source File: distributed_communicator.py From CrypTen with MIT License | 5 votes |
def __init__(self, init_ttp=False):
    # no need to do anything if we already initialized the communicator:
    if not dist.is_initialized():
        # get configuration variables from environment:
        for key in ["distributed_backend", "rendezvous", "world_size", "rank"]:
            if key.upper() not in os.environ:
                raise ValueError("Environment variable %s must be set." % key)
            setattr(self, key.lower(), os.environ[key.upper()])

        # make sure world size and rank are integers; comms stats are reset:
        self.world_size = int(self.world_size)
        self.rank = int(self.rank)
        self.reset_communication_stats()
        self._name = f"rank{self.rank}"

        # logging:
        logging.info("==================")
        logging.info("DistributedCommunicator with rank %d" % self.rank)
        logging.info("==================")

        # initialize process group:
        total_ws = self.world_size + 1 if init_ttp else self.world_size
        dist.init_process_group(
            backend=self.distributed_backend,
            init_method=self.rendezvous,
            world_size=total_ws,
            rank=self.rank,
        )
        self.ttp_group = dist.new_group(list(range(total_ws)))
        self.main_group = dist.new_group(list(range(self.world_size)))
        self.ttp_initialized = init_ttp
        logging.info("World size = %d" % self.world_size)
Example #16
Source File: ray_container.py From adeptRL with GNU General Public License v3.0 | 5 votes |
def __init__(self, rank, learner_ranks, worker_ranks, ip, port):
    world_size = len(learner_ranks) + len(worker_ranks)
    dist.init_process_group(
        "nccl",
        init_method="tcp://{}:{}".format(ip, port),
        rank=rank,
        world_size=world_size,
    )
    groups = {}
    for learner_rank in learner_ranks:
        for worker_rank in worker_ranks:
            g = dist.new_group([learner_rank, worker_rank])
            if learner_rank == rank:
                groups[worker_rank] = g
    learner_group = dist.new_group(learner_ranks)
    self.groups = groups
    self.learner_group = learner_group
    self.device = torch.device(f"cuda:{ray.get_gpu_ids()[0]}")
    self.rank = rank
    self.exps = {
        w_rank: torch.zeros(2).to(self.device) for w_rank in worker_ranks
    }
    self.network = torch.ones(3).to(self.device)
    self.network_grads = [torch.ones(3).to(self.device)]
    self.exp_handles = None
Example #17
Source File: distributed.py From torchsupport with MIT License | 5 votes |
def __init__(self, *args, **kwargs):
    super(AsynchronousDistributedTraining, self).__init__(*args, **kwargs)
    self.gossip_step = 0
    self.world_size = distributed.get_world_size()
    self.rank = distributed.get_rank()
    self.groups = []
    for idx in range(self.world_size - 1):
        partner = (self.rank + idx + 1) % self.world_size
        group = distributed.new_group(ranks=[self.rank, partner])
        self.groups.append(group)