Python torch.distributed._backend() Examples
The following are 17 code examples of torch.distributed._backend(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.distributed.
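Note that torch.distributed._backend is a private attribute from older PyTorch releases; newer releases expose the backend through dist.get_backend() and the Backend/DistBackend enums, which is why most of the examples below guard their backend checks. A minimal sketch of that compatibility pattern, mirroring the logic in Example #10 (the helper name is our own, and the process group must already be initialized):

import torch.distributed as dist

def backend_is_nccl():
    # Newer PyTorch: public API plus an enum holder whose name changed over time.
    if hasattr(dist, "get_backend"):
        backend = dist.get_backend()
        enum_holder = dist.DistBackend if hasattr(dist, "DistBackend") else dist.Backend
    # Older PyTorch: private attribute and the dist_backend enum.
    else:
        backend = dist._backend
        enum_holder = dist.dist_backend
    return backend == enum_holder.NCCL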
Example #1
Source File: distributed.py From apex with BSD 3-Clause "New" or "Revised" License

def __getstate__(self):
    attrs = copy.copy(self.__dict__)
    if self._backend != self.backend_enum_holder.NCCL:
        del attrs['self.bucket_streams']
        del attrs['self.bucket_events']
    return attrs
Example #2
Source File: utils.py From training_results_v0.5 with Apache License 2.0

def reduce(self, op):
    """
    Reduces average value over all workers.

    :param op: 'sum' or 'mean', reduction operator
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    distributed = (get_world_size() > 1)
    if distributed:
        if hasattr(dist, "get_backend"):
            backend = dist.get_backend()
        else:
            backend = dist._backend

        cuda = (backend == dist.dist_backend.NCCL)

        if cuda:
            avg = torch.cuda.FloatTensor([self.avg])
            _sum = torch.cuda.FloatTensor([self.sum])
        else:
            avg = torch.FloatTensor([self.avg])
            _sum = torch.FloatTensor([self.sum])

        dist.all_reduce(avg, op=dist.reduce_op.SUM)
        dist.all_reduce(_sum, op=dist.reduce_op.SUM)
        self.avg = avg.item()
        self.sum = _sum.item()

        if op == 'mean':
            self.avg /= get_world_size()
            self.sum /= get_world_size()
Example #3
Source File: utils.py From pipedream with MIT License

def reduce(self, op):
    """
    Reduces average value over all workers.

    :param op: 'sum' or 'mean', reduction operator
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    distributed = (get_world_size() > 1)
    if distributed:
        if hasattr(dist, "get_backend"):
            backend = dist.get_backend()
        else:
            backend = dist._backend

        cuda = (backend == dist.dist_backend.NCCL)

        if cuda:
            avg = torch.cuda.FloatTensor([self.avg])
            _sum = torch.cuda.FloatTensor([self.sum])
        else:
            avg = torch.FloatTensor([self.avg])
            _sum = torch.FloatTensor([self.sum])

        dist.all_reduce(avg, op=dist.reduce_op.SUM)
        dist.all_reduce(_sum, op=dist.reduce_op.SUM)
        self.avg = avg.item()
        self.sum = _sum.item()

        if op == 'mean':
            self.avg /= get_world_size()
            self.sum /= get_world_size()
Example #4
Source File: distributed_data_parallel.py From decaNLP with BSD 3-Clause "New" or "Revised" License

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True  # True if dist._backend == dist.dist_backend.GLOO else False
    self.module = module

    for p in self.module.state_dict().values():
        if torch.is_tensor(p):
            dist.broadcast(p, 0)

    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(self.module.parameters()):
        if param.requires_grad:
            def allreduce_hook(*unused):
                param._execution_engine.queue_callback(allreduce_params)
            param.register_hook(allreduce_hook)
Example #5
Source File: distributed.py From imagenet-fast with Apache License 2.0

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module
    param_list = [param for param in self.module.state_dict().values() if torch.is_tensor(param)]

    if dist._backend == dist.dist_backend.NCCL:
        for param in param_list:
            assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."

    # broadcast parameters
    flat_dist_call(param_list, dist.broadcast, (0,))

    # all reduce gradient hook
    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
        else:
            return
        grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
        flat_dist_call(grads, dist.all_reduce)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            torch.autograd.Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
Example #6
Source File: distributed.py From imagenet-fast with Apache License 2.0

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module
    param_list = [param for param in self.module.state_dict().values() if torch.is_tensor(param)]

    if dist._backend == dist.dist_backend.NCCL:
        for param in param_list:
            assert param.is_cuda, "NCCL backend only supports model parameters to be on GPU."

    # broadcast parameters
    flat_dist_call(param_list, dist.broadcast, (0,))

    # all reduce gradient hook
    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
        else:
            return
        grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
        flat_dist_call(grads, dist.all_reduce)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            torch.autograd.Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
Example #7
Source File: distributed.py From nonparaSeq2seqVC_code with MIT License

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()

    # fallback for PyTorch 0.3
    if not hasattr(dist, '_backend'):
        self.warn_on_half = True
    else:
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module

    for p in list(self.module.state_dict().values()):
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
Example #8
Source File: distributed.py From fac-via-ppg with Apache License 2.0

def apply_gradient_allreduce(module):
    """
    Modifies existing model to do gradient allreduce, but doesn't change class
    so you don't need "module"
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    module.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
            dir(param)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
Example #9
Source File: distributed.py From training with Apache License 2.0

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module

    for p in self.module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        if dist._backend == dist.dist_backend.NCCL:
            assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU."
        dist.broadcast(p, 0)

    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
Example #10
Source File: utils.py From training with Apache License 2.0

def reduce(self, op):
    """
    Reduces average value over all workers.

    :param op: 'sum' or 'mean', reduction operator
    """
    if op not in ('sum', 'mean'):
        raise NotImplementedError

    distributed = (get_world_size() > 1)
    if distributed:
        # Backward/forward compatibility around
        # https://github.com/pytorch/pytorch/commit/540ef9b1fc5506369a48491af8a285a686689b36 and
        # https://github.com/pytorch/pytorch/commit/044d00516ccd6572c0d6ab6d54587155b02a3b86
        # to accommodate the change in PyTorch's distributed API
        if hasattr(dist, "get_backend"):
            _backend = dist.get_backend()
            if hasattr(dist, "DistBackend"):
                backend_enum_holder = dist.DistBackend
            else:
                backend_enum_holder = dist.Backend
        else:
            _backend = dist._backend
            backend_enum_holder = dist.dist_backend

        cuda = _backend == backend_enum_holder.NCCL

        if cuda:
            avg = torch.cuda.FloatTensor([self.avg])
            _sum = torch.cuda.FloatTensor([self.sum])
        else:
            avg = torch.FloatTensor([self.avg])
            _sum = torch.FloatTensor([self.sum])

        _reduce_op = dist.reduce_op if hasattr(dist, "reduce_op") else dist.ReduceOp

        dist.all_reduce(avg, op=_reduce_op.SUM)
        dist.all_reduce(_sum, op=_reduce_op.SUM)
        self.avg = avg.item()
        self.sum = _sum.item()

        if op == 'mean':
            self.avg /= get_world_size()
            self.sum /= get_world_size()
Example #11
Source File: distributed.py From waveglow with BSD 3-Clause "New" or "Revised" License

def apply_gradient_allreduce(module):
    """
    Modifies existing model to do gradient allreduce, but doesn't change class
    so you don't need "module"
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    module.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
            dir(param)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
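The apply_gradient_allreduce helpers in Examples #8 and #11 (and the variants in Examples #12, #13, and #16) are typically applied to an already-constructed model after the process group has been initialized. A minimal usage sketch, not taken from any of these projects, assuming a hypothetical MyModel and the usual environment-variable rendezvous:

import torch.distributed as dist

# Initialize the process group first; backend and init_method depend on your setup.
dist.init_process_group(backend="gloo", init_method="env://")

model = MyModel().cuda()                 # hypothetical model; any nn.Module works
model = apply_gradient_allreduce(model)  # registers the forward and backward hooks shown above

# Training then proceeds as usual: the forward hook sets needs_reduction, and the
# per-parameter backward hooks queue allreduce_params(), which averages gradients
# across workers before the optimizer step.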
Example #12
Source File: distributed.py From nonparaSeq2seqVC_code with MIT License

def apply_gradient_allreduce(module):
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in list(module.state_dict().values()):
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    module.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
Example #13
Source File: distributed.py From nonparaSeq2seqVC_code with MIT License

def apply_gradient_allreduce(module):
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in list(module.state_dict().values()):
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    module.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
Example #14
Source File: distributed.py From nonparaSeq2seqVC_code with MIT License

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()

    # fallback for PyTorch 0.3
    if not hasattr(dist, '_backend'):
        self.warn_on_half = True
    else:
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module

    for p in list(self.module.state_dict().values()):
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
Example #15
Source File: distributed.py From training_results_v0.5 with Apache License 2.0

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module

    for p in self.module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        if dist._backend == dist.dist_backend.NCCL:
            assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU."
        dist.broadcast(p, 0)

    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
Example #16
Source File: distributed.py From tn2-wg with BSD 3-Clause "New" or "Revised" License

def apply_gradient_allreduce(module):
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.dtype
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    module.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(module.parameters()):
        def allreduce_hook(*unused):
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
Example #17
Source File: distributed.py From tn2-wg with BSD 3-Clause "New" or "Revised" License

def __init__(self, module):
    super(DistributedDataParallel, self).__init__()

    # fallback for PyTorch 0.3
    if not hasattr(dist, '_backend'):
        self.warn_on_half = True
    else:
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    self.module = module

    for p in self.module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          "PyTorch built from top of tree master.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)