Python torch.distributed.barrier() Examples
The following are 30 code examples of torch.distributed.barrier(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.distributed, or try the search function.
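As a quick orientation before the project examples: the sketch below is a minimal, self-contained illustration of the barrier semantics. Every process blocks at dist.barrier() until all processes in the group have reached the call. The two-process spawn, the "gloo" backend, the port number, and the sleep on rank 0 are arbitrary choices for this demo, not taken from any of the projects below.

import os
import time

import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    # Minimal process-group setup; "gloo" works on CPU-only machines.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"  # arbitrary free port for the demo
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Rank 0 pretends to do slow setup work (e.g. downloading data).
    if rank == 0:
        time.sleep(2)
        print("rank 0 finished setup")

    # Every rank blocks here until all ranks have reached the barrier,
    # so no rank proceeds before rank 0's setup is done.
    dist.barrier()
    print(f"rank {rank} passed the barrier")

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(worker, args=(world_size,), nprocs=world_size)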
Example #1
Source File: ray_container.py From adeptRL with GNU General Public License v3.0 | 7 votes |
def step(self):
    print(f"learner {self.rank} step")

    # make sure exp_handles are done
    for handle in self.exp_handles:
        handle.wait()

    # batch together exp
    time.sleep(random.randint(0, 3))

    # update with other learners
    dist.barrier(self.learner_group)
    for p in self.network_grads:
        dist.all_reduce(p, group=self.learner_group)
    print(f"learner {self.rank} shared gradients")
    return True
Example #2
Source File: distrib.py From adeptRL with GNU General Public License v3.0 | 6 votes |
def step(self, loss):
    self.optimizer.zero_grad()
    loss.backward()
    dist.barrier()
    handles = []
    for param in self.network.parameters():
        handles.append(dist.all_reduce(param.grad, async_op=True))
    for handle in handles:
        handle.wait()
    if self.divide_grad:
        for param in self.network.parameters():
            param.grad.mul_(1.0 / self.world_sz)
    if self.grad_norm_clip:
        nn.utils.clip_grad_norm_(
            self.network.parameters(), self.grad_norm_clip
        )
    self.optimizer.step()
Example #3
Source File: utils.py From kaggle-kuzushiji-2019 with MIT License | 6 votes |
def init_distributed_mode(args):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend,
                                         init_method=args.dist_url,
                                         world_size=args.world_size,
                                         rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
Example #4
Source File: point_to_point.py From pipedream with MIT License | 6 votes |
def receive_tensor_helper(tensor, src_rank, group, tag, num_iterations,
                          broadcast):
    dist.barrier()
    start_time = time.time()
    for i in range(num_iterations):
        if broadcast:
            dist.broadcast(tensor=tensor, group=group, src=src_rank)
        else:
            dist.recv(tensor=tensor.cpu(), src=src_rank, tag=tag)
    end_time = time.time()
    dist.barrier()
    size = tensor.size()[0]
    throughput = (size * 4. * num_iterations) / (
        (end_time - start_time) * 10**9)
    print("Time to receive %s MB: %.3f seconds" %
          ((size * 4.) / 10**6, (end_time - start_time) / num_iterations))
    print("Throughput: %.3f GB/s" % throughput)
Example #5
Source File: eval_hooks.py From Grid-R-CNN with Apache License 2.0 | 5 votes |
def after_train_epoch(self, runner):
    if not self.every_n_epochs(runner, self.interval):
        return
    runner.model.eval()
    results = [None for _ in range(len(self.dataset))]
    if runner.rank == 0:
        prog_bar = mmcv.ProgressBar(len(self.dataset))
    for idx in range(runner.rank, len(self.dataset), runner.world_size):
        data = self.dataset[idx]
        data_gpu = scatter(
            collate([data], samples_per_gpu=1),
            [torch.cuda.current_device()])[0]

        # compute output
        with torch.no_grad():
            result = runner.model(
                return_loss=False, rescale=True, **data_gpu)
        results[idx] = result

        batch_size = runner.world_size
        if runner.rank == 0:
            for _ in range(batch_size):
                prog_bar.update()

    if runner.rank == 0:
        print('\n')
        dist.barrier()
        for i in range(1, runner.world_size):
            tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
            tmp_results = mmcv.load(tmp_file)
            for idx in range(i, len(results), runner.world_size):
                results[idx] = tmp_results[idx]
            os.remove(tmp_file)
        self.evaluate(runner, results)
    else:
        tmp_file = osp.join(runner.work_dir,
                            'temp_{}.pkl'.format(runner.rank))
        mmcv.dump(results, tmp_file)
        dist.barrier()
    dist.barrier()
Example #6
Source File: comm.py From maskrcnn-benchmark with MIT License | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #7
Source File: data_loading.py From pytorch-lightning with Apache License 2.0 | 5 votes |
def request_dataloader(self, dataloader_fx: Callable) -> DataLoader:
    """Handles downloading data in the GPU or TPU case.

    Args:
        dataloader_fx: The bound dataloader getter

    Returns:
        The dataloader
    """
    dataloader = dataloader_fx()  # get the function we'll use to get data

    if self.use_ddp or self.use_ddp2:
        # all processes wait until data download has happened
        torch_distrib.barrier()

    # data download/load on TPU
    elif self.use_tpu and XLA_AVAILABLE:
        # all processes wait until data download has happened
        torch_xla.core.xla_model.rendezvous('pl.TrainerDataLoadingMixin.get_dataloaders')

    elif self.use_horovod:
        # all processes wait until data download has happened
        hvd.join()

    return dataloader
Example #8
Source File: eval_hooks.py From Libra_R-CNN with Apache License 2.0 | 5 votes |
def after_train_epoch(self, runner):
    if not self.every_n_epochs(runner, self.interval):
        return
    runner.model.eval()
    results = [None for _ in range(len(self.dataset))]
    if runner.rank == 0:
        prog_bar = mmcv.ProgressBar(len(self.dataset))
    for idx in range(runner.rank, len(self.dataset), runner.world_size):
        data = self.dataset[idx]
        data_gpu = scatter(
            collate([data], samples_per_gpu=1),
            [torch.cuda.current_device()])[0]

        # compute output
        with torch.no_grad():
            result = runner.model(
                return_loss=False, rescale=True, **data_gpu)
        results[idx] = result

        batch_size = runner.world_size
        if runner.rank == 0:
            for _ in range(batch_size):
                prog_bar.update()

    if runner.rank == 0:
        print('\n')
        dist.barrier()
        for i in range(1, runner.world_size):
            tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
            tmp_results = mmcv.load(tmp_file)
            for idx in range(i, len(results), runner.world_size):
                results[idx] = tmp_results[idx]
            os.remove(tmp_file)
        self.evaluate(runner, results)
    else:
        tmp_file = osp.join(runner.work_dir,
                            'temp_{}.pkl'.format(runner.rank))
        mmcv.dump(results, tmp_file)
        dist.barrier()
    dist.barrier()
Example #9
Source File: trainer.py From pytorch-lightning with Apache License 2.0 | 5 votes |
def barrier(self, name):
    if self.use_ddp or self.use_ddp2:
        torch_distrib.barrier()

    if self.on_tpu and XLA_AVAILABLE:
        # wait for all processes to catch up
        torch_xla.core.xla_model.rendezvous(f'pl.Trainer.{name}')
Example #10
Source File: utils.py From deconvolution with GNU General Public License v3.0 | 5 votes |
def synchronize_between_processes(self):
    """
    Warning: does not synchronize the deque!
    """
    if not is_dist_avail_and_initialized():
        return
    t = torch.tensor([self.count, self.total],
                     dtype=torch.float64, device='cuda')
    dist.barrier()
    dist.all_reduce(t)
    t = t.tolist()
    self.count = int(t[0])
    self.total = t[1]
Example #11
Source File: distributed.py From mobilenetv3-segmentation with Apache License 2.0 | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #12
Source File: ddp_utils.py From habitat-api with MIT License | 5 votes |
def requeue_job():
    r"""Requeues the job by calling ``scontrol requeue ${SLURM_JOBID}``"""
    if SLURM_JOBID is None:
        return

    if not REQUEUE.is_set():
        return

    distrib.barrier()

    if distrib.get_rank() == 0:
        logger.info(f"Requeueing job {SLURM_JOBID}")
        subprocess.check_call(shlex.split(f"scontrol requeue {SLURM_JOBID}"))
Example #13
Source File: comm.py From EmbedMask with MIT License | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #14
Source File: utils.py From deconvolution with GNU General Public License v3.0 | 5 votes |
def reduce_from_all_processes(self):
    if not torch.distributed.is_available():
        return
    if not torch.distributed.is_initialized():
        return
    torch.distributed.barrier()
    torch.distributed.all_reduce(self.mat)
Example #15
Source File: all_to_all.py From pipedream with MIT License | 5 votes |
def all_reduce_helper(tensor, group, multiplier, num_iterations):
    dist.barrier()
    start_time = time.time()
    for i in range(num_iterations):
        dist.all_reduce(tensor=tensor, group=group)
    dist.barrier()
    size = tensor.size()[0]
    bandwidth = (size * 4. * NUM_TRIALS * multiplier) / (
        (time.time() - start_time) * 10**6)
    print("Bandwidth for tensor size %s: %.2f MB/s" % (size, bandwidth))
Example #16
Source File: download.py From gpt-2-output-dataset with MIT License | 5 votes |
def download(*datasets, data_dir='data'):
    os.makedirs(data_dir, exist_ok=True)

    if distributed() and dist.get_rank() > 0:
        dist.barrier()

    for ds in datasets:
        assert ds in ALL_DATASETS, f'Unknown dataset {ds}'
        for split in ['train', 'valid', 'test']:
            filename = ds + "." + split + '.jsonl'
            output_file = os.path.join(data_dir, filename)
            if os.path.isfile(output_file):
                continue
            r = requests.get(
                "https://storage.googleapis.com/gpt-2/output-dataset/v1/" + filename,
                stream=True)
            with open(output_file, 'wb') as f:
                file_size = int(r.headers["content-length"])
                chunk_size = 1000
                with tqdm(ncols=100, desc="Fetching " + filename,
                          total=file_size, unit_scale=True) as pbar:
                    # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        pbar.update(chunk_size)

    if distributed() and dist.get_rank() == 0:
        dist.barrier()
Example #17
Source File: test.py From Grid-R-CNN with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full(
            (MAX_LEN, ), 32, dtype=torch.uint8, device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #18
Source File: engine.py From Centripetal-SGD with Apache License 2.0 | 5 votes |
def __init__(self):
    self.version = 0.01
    self.state = State()
    self.devices = None
    self.distributed = False
    self.logger = None

    if 'WORLD_SIZE' in os.environ:
        self.distributed = int(os.environ['WORLD_SIZE']) >= 1

    if self.distributed:
        print('Initialize Engine for distributed training.')
        self.local_rank = 0  # TODO we only use single-machine-multi-gpus
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.world_rank = int(os.environ['RANK'])
        torch.cuda.set_device(self.local_rank)
        dist.init_process_group(backend="nccl", init_method='env://')
        dist.barrier()
        self.devices = [i for i in range(self.world_size)]
    else:
        # todo check non-distributed training
        print('Initialize Engine for non-distributed training.')
        self.world_size = 1
        self.world_rank = 1
        self.devices = parse_torch_devices('0')  # TODO correct?
    torch.backends.cudnn.benchmark = True
Example #19
Source File: test.py From kaggle-kuzushiji-recognition with MIT License | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ), 32,
                                dtype=torch.uint8, device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #20
Source File: test_robustness.py From kaggle-kuzushiji-recognition with MIT License | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ), 32,
                                dtype=torch.uint8, device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #21
Source File: eval_hooks.py From kaggle-kuzushiji-recognition with MIT License | 5 votes |
def after_train_epoch(self, runner):
    if not self.every_n_epochs(runner, self.interval):
        return
    runner.model.eval()
    results = [None for _ in range(len(self.dataset))]
    if runner.rank == 0:
        prog_bar = mmcv.ProgressBar(len(self.dataset))
    for idx in range(runner.rank, len(self.dataset), runner.world_size):
        data = self.dataset[idx]
        data_gpu = scatter(
            collate([data], samples_per_gpu=1),
            [torch.cuda.current_device()])[0]

        # compute output
        with torch.no_grad():
            result = runner.model(
                return_loss=False, rescale=True, **data_gpu)
        results[idx] = result

        batch_size = runner.world_size
        if runner.rank == 0:
            for _ in range(batch_size):
                prog_bar.update()

    if runner.rank == 0:
        print('\n')
        dist.barrier()
        for i in range(1, runner.world_size):
            tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
            tmp_results = mmcv.load(tmp_file)
            for idx in range(i, len(results), runner.world_size):
                results[idx] = tmp_results[idx]
            os.remove(tmp_file)
        self.evaluate(runner, results)
    else:
        tmp_file = osp.join(runner.work_dir,
                            'temp_{}.pkl'.format(runner.rank))
        mmcv.dump(results, tmp_file)
        dist.barrier()
    dist.barrier()
Example #22
Source File: distributed.py From ocr-pytorch with MIT License | 5 votes |
def synchronize():
    if not dist.is_available():
        return

    if not dist.is_initialized():
        return

    world_size = dist.get_world_size()

    if world_size == 1:
        return

    dist.barrier()
Example #23
Source File: distributed_communicator.py From CrypTen with MIT License | 5 votes |
def barrier(self):
    """Synchronizes all processes.

    This collective blocks processes until the whole group enters this
    function.
    """
    assert dist.is_initialized(), "initialize the communicator first"
    dist.barrier(group=self.main_group)
Example #24
Source File: ttp_provider.py From CrypTen with MIT License | 5 votes |
def _setup_generators(self):
    """Create random generator to send to a party"""
    ws = comm.get().get_world_size()

    seeds = [torch.randint(-(2 ** 63), 2 ** 63 - 1, size=()) for _ in range(ws)]
    reqs = [
        dist.isend(tensor=seeds[i], dst=i, group=self.group) for i in range(ws)
    ]
    self.generators = [torch.Generator() for _ in range(ws)]

    for i in range(ws):
        self.generators[i].manual_seed(seeds[i].item())
        reqs[i].wait()

    dist.barrier(group=self.group)
Example #25
Source File: ttp_provider.py From CrypTen with MIT License | 5 votes |
def _setup_generators(self):
    seed = torch.empty(size=(), dtype=torch.long)
    dist.irecv(
        tensor=seed, src=comm.get().get_ttp_rank(), group=self.group
    ).wait()
    dist.barrier(group=self.group)

    self.generator = torch.Generator()
    self.generator.manual_seed(seed.item())
Example #26
Source File: test.py From PolarMask with Apache License 2.0 | 5 votes |
def collect_results(result_part, size, tmpdir=None):
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ), 32,
                                dtype=torch.uint8, device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank)))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i))
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
Example #27
Source File: eval_hooks.py From PolarMask with Apache License 2.0 | 5 votes |
def after_train_epoch(self, runner):
    if not self.every_n_epochs(runner, self.interval):
        return
    runner.model.eval()
    results = [None for _ in range(len(self.dataset))]
    if runner.rank == 0:
        prog_bar = mmcv.ProgressBar(len(self.dataset))
    for idx in range(runner.rank, len(self.dataset), runner.world_size):
        data = self.dataset[idx]
        data_gpu = scatter(
            collate([data], samples_per_gpu=1),
            [torch.cuda.current_device()])[0]

        # compute output
        with torch.no_grad():
            result = runner.model(
                return_loss=False, rescale=True, **data_gpu)
        results[idx] = result

        batch_size = runner.world_size
        if runner.rank == 0:
            for _ in range(batch_size):
                prog_bar.update()

    if runner.rank == 0:
        print('\n')
        dist.barrier()
        for i in range(1, runner.world_size):
            tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
            tmp_results = mmcv.load(tmp_file)
            for idx in range(i, len(results), runner.world_size):
                results[idx] = tmp_results[idx]
            os.remove(tmp_file)
        self.evaluate(runner, results)
    else:
        tmp_file = osp.join(runner.work_dir,
                            'temp_{}.pkl'.format(runner.rank))
        mmcv.dump(results, tmp_file)
        dist.barrier()
    dist.barrier()
Example #28
Source File: comm.py From Centripetal-SGD with Apache License 2.0 | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #29
Source File: torch_utils.py From Centripetal-SGD with Apache License 2.0 | 5 votes |
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when
    using distributed training
    """
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    world_size = dist.get_world_size()
    if world_size == 1:
        return
    dist.barrier()
Example #30
Source File: eval_hooks.py From mmdetection_with_SENet154 with Apache License 2.0 | 5 votes |
def after_train_epoch(self, runner):
    if not self.every_n_epochs(runner, self.interval):
        return
    runner.model.eval()
    results = [None for _ in range(len(self.dataset))]
    if runner.rank == 0:
        prog_bar = mmcv.ProgressBar(len(self.dataset))
    for idx in range(runner.rank, len(self.dataset), runner.world_size):
        data = self.dataset[idx]
        data_gpu = scatter(
            collate([data], samples_per_gpu=1),
            [torch.cuda.current_device()])[0]

        # compute output
        with torch.no_grad():
            result = runner.model(
                return_loss=False, rescale=True, **data_gpu)
        results[idx] = result

        batch_size = runner.world_size
        if runner.rank == 0:
            for _ in range(batch_size):
                prog_bar.update()

    if runner.rank == 0:
        print('\n')
        dist.barrier()
        for i in range(1, runner.world_size):
            tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
            tmp_results = mmcv.load(tmp_file)
            for idx in range(i, len(results), runner.world_size):
                results[idx] = tmp_results[idx]
            os.remove(tmp_file)
        self.evaluate(runner, results)
    else:
        tmp_file = osp.join(runner.work_dir,
                            'temp_{}.pkl'.format(runner.rank))
        mmcv.dump(results, tmp_file)
        dist.barrier()
    dist.barrier()