Python horovod.tensorflow.init() Examples
The following are 30 code examples of horovod.tensorflow.init(), drawn from open-source projects. The originating project, source file, and license are noted above each example.
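Nearly all of the examples share the same pattern: call hvd.init() exactly once per process, pin the process to a single GPU via hvd.local_rank(), scale the learning rate by hvd.size(), wrap the optimizer in hvd.DistributedOptimizer, and broadcast rank 0's initial variables to the other workers. The sketch below is a minimal, self-contained illustration of that flow, assuming the TF1-style graph API used by most examples on this page; the toy least-squares model is a placeholder and is not taken from any of the projects listed here.

# Minimal sketch of the usual horovod.tensorflow.init() workflow (TF1 graph API).
# The model below is a toy placeholder so the graph is complete and runnable.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one init() per process

# Pin each process to one GPU, selected by its local rank.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Toy least-squares problem (placeholder model).
x = tf.random_uniform([32, 10], seed=1234)
w = tf.get_variable("w", [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - 1.0))

# Scale the learning rate by the number of workers and wrap the optimizer
# so gradients are averaged across processes with allreduce.
opt = tf.train.GradientDescentOptimizer(0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt)

global_step = tf.train.get_or_create_global_step()
train_op = opt.minimize(loss, global_step=global_step)

# Broadcast rank 0's initial variables so all workers start identically,
# and stop after a fixed number of steps.
hooks = [
    hvd.BroadcastGlobalVariablesHook(0),
    tf.train.StopAtStepHook(last_step=100),
]

with tf.train.MonitoredTrainingSession(config=config, hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)

Launched with, for example, horovodrun -np 4 python train.py, every process runs the same script; typically only rank 0 writes checkpoints and logs.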
Example #1
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 6 votes |
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
Example #2
Source File: trainers.py From tensorpack with Apache License 2.0 | 6 votes |
def __init__(self, average=True):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
    """
    import byteps.tensorflow as bps
    self.hvd = bps  # BytePS has the same interface as Horovod
    self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
    assert os.environ.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ

    bps.init()
    self.is_chief = bps.rank() == 0

    self._local_rank = bps.local_rank()
    self._rank = bps.rank()
    self._average = average
    self._compression = None
    self._has_compression = False
    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
Example #3
Source File: trainers.py From tensorpack with Apache License 2.0 | 6 votes |
def __init__(self, average=True, compression=None):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs.")
    # lazy import
    import horovod.tensorflow as hvd
    import horovod
    hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
    self.hvd = hvd

    hvd.init()
    self.is_chief = hvd.rank() == 0

    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    self._has_compression = hvd_version >= (0, 15, 0)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()
    self.BROADCAST_EVERY_EPOCH = True
Example #4
Source File: solver.py From athena with Apache License 2.0 | 6 votes |
def evaluate(self, dataset, epoch):
    """ evaluate the model """
    loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
    loss, metrics = None, None
    evaluate_step = self.evaluate_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        evaluate_step = tf.function(evaluate_step, input_signature=self.sample_signature)
    self.model.reset_metrics()  # init metric.result() with 0
    for batch, samples in enumerate(dataset):
        samples = self.model.prepare_samples(samples)
        loss, metrics = evaluate_step(samples)
        if batch % self.hparams.log_interval == 0:
            logging.info(self.metric_checker(loss, metrics, -2))
        loss_metric.update_state(loss)
    logging.info(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
    self.model.reset_metrics()
    return loss_metric.result()
Example #5
Source File: flow_training.py From flowpp with MIT License | 6 votes |
def setup_horovod():
    import horovod.tensorflow as hvd

    # Initialize Horovod
    hvd.init()
    # Verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()

    from mpi4py import MPI
    assert hvd.size() == MPI.COMM_WORLD.Get_size()

    is_root = hvd.rank() == 0

    def mpi_average(local_list):
        # _local_list_orig = local_list
        local_list = list(map(float, local_list))
        # print('RANK {} AVERAGING {} -> {}'.format(hvd.rank(), _local_list_orig, local_list))
        sums = MPI.COMM_WORLD.gather(sum(local_list), root=0)
        counts = MPI.COMM_WORLD.gather(len(local_list), root=0)
        sum_counts = sum(counts) if is_root else None
        avg = (sum(sums) / sum_counts) if is_root else None
        return avg, sum_counts

    return hvd, MPI, is_root, mpi_average
Example #6
Source File: tf_distributed_optimizer.py From deep500 with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, optimizer: TFOptimizer, comm=None):
    super().__init__(optimizer.executor, optimizer.loss)
    try:
        import horovod.tensorflow as hvd
    except ImportError:
        raise ImportError('Cannot import Horovod')

    hvd.init()
    self.op = hvd.DistributedOptimizer(optimizer.op)
    if comm is None:
        comm = CommunicationNetwork()
    self.communication = comm
    self.original_optimizer = optimizer
Example #7
Source File: trainers.py From ADL with MIT License | 6 votes |
def __init__(self, average=True):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
    """
    import byteps.tensorflow as bps
    self.hvd = bps  # BytePS has the same interface as Horovod
    self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
    assert os.environ.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ

    bps.init()
    self.is_chief = bps.rank() == 0

    self._local_rank = bps.local_rank()
    self._rank = bps.rank()
    self._average = average
    self._compression = None
    self._has_compression = False
    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
Example #8
Source File: horovod.py From blueoil with Apache License 2.0 | 6 votes |
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd
Example #9
Source File: horovod.py From blueoil with Apache License 2.0 | 6 votes |
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd
Example #10
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 6 votes |
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being broadcasted
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
Example #11
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 6 votes |
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(tensor))
Example #12
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 6 votes |
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(tensor))
Example #13
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 6 votes |
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try to
    perform reduction on CPU and GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    device = "/gpu:%d" % local_rank if local_rank % 2 == 0 else "/cpu:0"
    with self.test_session(config=self.config) as session:
        with tf.device(device):
            # Same rank, different dimension
            dims = [17] * 3
            tensor = tf.ones(dims, dtype=tf.int32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
Example #14
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 6 votes |
def test_horovod_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different type."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        # Same rank, different dimension
        dims = [17] * 3
        tensor = tf.ones(dims, dtype=tf.int32 if rank % 2 == 0 else tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
Example #15
Source File: solver.py From athena with Apache License 2.0 | 6 votes |
def evaluate(self, dataset, epoch):
    """ evaluate the model """
    loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
    loss, metrics = None, None
    evaluate_step = self.evaluate_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        evaluate_step = tf.function(evaluate_step, input_signature=self.sample_signature)
    self.model.reset_metrics()  # init metric.result() with 0
    for batch, samples in enumerate(dataset):
        samples = self.model.prepare_samples(samples)
        loss, metrics = evaluate_step(samples)
        if batch % self.hparams.log_interval == 0:
            logging.info(self.metric_checker(loss, metrics, -2))
        total_loss = sum(list(loss.values())) if isinstance(loss, dict) else loss
        loss_metric.update_state(total_loss)
    logging.info(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
    self.model.reset_metrics()
    return loss_metric.result(), metrics
Example #16
Source File: trainers.py From ADL with MIT License | 5 votes |
def __init__(self, average=True, compression=None):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                    "Uninstall pyarrow and use msgpack instead.")
    # lazy import
    import horovod.tensorflow as hvd
    import horovod
    hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
    self.hvd = hvd

    hvd.init()
    self.is_chief = hvd.rank() == 0

    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    self._has_compression = hvd_version >= (0, 15, 0)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()
    self.BROADCAST_EVERY_EPOCH = True
Example #17
Source File: gloo_allred_task.py From tf-yarn with Apache License 2.0 | 5 votes |
def _driver_fn(client, net_if):
    cluster_tasks = _task_commons._get_cluster_tasks(client)
    # Worker discovery
    worker_list = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"]
    n_workers = 1
    for cluster_task in cluster_tasks:
        if 'worker' in cluster_task:
            worker_addr = event.wait(client, f"{cluster_task}/addr")
            logger.info(f"{cluster_task}: {worker_addr}")
            worker_list.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}")
            n_workers += 1

    # Worker task allocation to workers
    hosts = gloo_run.parse_hosts(','.join(worker_list))
    host_alloc_plan = gloo_run.get_host_assignments(hosts, n_workers)
    for host in host_alloc_plan:
        host_info = f"""\
{host.rank},{host.size},{host.local_rank},\
{host.local_size},{host.cross_rank},{host.cross_size}\
"""
        event.broadcast(client, f"{cluster.get_task()}/{host.hostname}", host_info)

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start_server()
    global_rendezv.httpd.init(host_alloc_plan)
    event.broadcast(client, f"{cluster.get_task()}/sock_addr",
                    f"{net_if[1]}:{global_rendezv_port}")
    return global_rendezv.listen_thread
Example #18
Source File: tensorflow_executor.py From rlgraph with Apache License 2.0 | 5 votes |
def setup_horovod_execution(self):
    """
    Sets up Horovod.
    """
    # Check again to avoid import if unset which will crash if horovod is not installed.
    if get_distributed_backend() == "horovod":
        import horovod.tensorflow as hvd
        self.logger.info("Setting up Horovod execution.")
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
Example #19
Source File: cape_ablate_horovod.py From cape-document-qa with Apache License 2.0 | 5 votes |
def run_training(savename: str,
                 train_config: TrainConfig,
                 dataset_oversampling: Dict[str, int],
                 n_processes: int,
                 use_cudnn: bool
                 ):
    """Train a Cape-Flavoured DocumentQA model.

    After preparing the datasets for training, a model will be created and saved
    in a directory specified by `savename`. Logging (Tensorboard) can be found
    in the log subdirectory of the model directory.

    The datasets to train the model on are specified in the `dataset_oversampling`
    dictionary. E.g. {'squad': 2, 'wiki': 1} will train a model on one equivalence
    of triviaqa wiki and two equivalences of squad.

    :param savename: Name of model
    :param train_config: cape_config.TrainConfig object containing hyperparameters etc
    :param dataset_oversampling: dictionary mapping dataset names to integer counts
        of how much to oversample them
    :param n_processes: Number of processes to paralellize prepro on
    :param use_cudnn: Whether to train with GRU's optimized for Cudnn (recommended)
    """
    hvd.init()
    model = build_model(WithIndicators(), train_config, use_cudnn=use_cudnn)
    data = prepare_data(model, train_config, dataset_oversampling, n_processes)
    eval = get_evaluators(train_config)
    params = get_training_params(train_config)

    with open(__file__, "r", encoding='utf8') as f:
        notes = f.read()
    notes = "Mode: " + train_config.trivia_qa_mode + "\n" + notes
    notes += '\nDataset oversampling : ' + str(dataset_oversampling)

    # pull the trigger
    trainer.start_training(data, model, params, eval, model_dir.ModelDir(savename), notes)
Example #20
Source File: solver.py From athena with Apache License 2.0 | 5 votes |
def initialize_devices(visible_gpu_idx=None):
    """ initialize hvd devices, should be called firstly """
    if visible_gpu_idx is not None:
        warnings.warn("we can not set the visible gpu idx like this")
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
Example #21
Source File: train.py From glow with MIT License | 5 votes |
def main(hps):
    # Initialize Horovod.
    hvd.init()

    # Create tensorflow session
    sess = tensorflow_session()

    # Download and load dataset.
    tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
    np.random.seed(hvd.rank() + hvd.size() * hps.seed)

    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)

    # Create log dir
    logdir = os.path.abspath(hps.logdir) + "/"
    if not os.path.exists(logdir):
        os.mkdir(logdir)

    # Create model
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)

    # Initialize visualization functions
    visualise = init_visualizations(hps, model, logdir)

    if not hps.inference:
        # Perform training
        train(sess, model, hps, logdir, visualise)
    else:
        infer(sess, model, hps, test_iterator)
Example #22
Source File: distribute.py From THUMT with BSD 3-Clause "New" or "Revised" License | 5 votes |
def enable_distributed_training():
    global _ENGINE
    try:
        import horovod.tensorflow as hvd
        _ENGINE = hvd
        hvd.init()
    except ImportError:
        sys.stderr.write("Error: You must install horovod first in order to"
                         " enable distributed training.\n")
        exit()
Example #23
Source File: speech2text_test.py From OpenSeq2Seq with Apache License 2.0 | 5 votes |
def convergence_with_iter_size_test(self):
    try:
        import horovod.tensorflow as hvd
        hvd.init()
    except ImportError:
        print("Horovod not installed skipping test_convergence_with_iter_size")
        return

    for dtype in [tf.float32, "mixed"]:
        train_config, eval_config = self.prepare_config()
        train_config.update({
            "dtype": dtype,
            "iter_size": 5,
            "batch_size_per_gpu": 2,
            "use_horovod": True,
            "num_epochs": 200,
        })
        eval_config.update({
            "dtype": dtype,
            "iter_size": 5,
            "batch_size_per_gpu": 2,
            "use_horovod": True,
        })
        loss, eval_loss, eval_dict = self.run_model(
            train_config, eval_config, hvd,
        )

        self.assertLess(loss, 10.0)
        self.assertLess(eval_loss, 30.0)
        self.assertLess(eval_dict['Eval WER'], 0.2)
Example #24
Source File: multi_gpu_wrapper.py From tf-hrnet with BSD 3-Clause "New" or "Revised" License | 5 votes |
def init(cls, *args):
    """Initialization."""
    try:
        return mgw.init(*args)
    except NameError:
        raise NameError('module <mgw> not imported')
Example #25
Source File: train_model.py From DistributedDeepLearning with MIT License | 5 votes |
def main():
    """Train your model
    """
    logger = logging.getLogger(__name__)
    if DISTRIBUTED:
        # Horovod: initialize Horovod.
        hvd.init()
        logger.info("Running Distributed")
        logger.info("Num GPUs: {:.3f}".format(hvd.size()))

    input_function = input_fn
    run_config = _get_runconfig()
    params = {
        "learning_rate": LR,
        "momentum": MOMENTUM,
        "classes": NUM_CLASSES,
    }
    logger.info("Creating estimator with params: {}".format(params))
    model = tf.estimator.Estimator(
        model_fn=model_fn, params=params, config=run_config
    )
    hooks = _get_hooks()
    model.train(input_fn=input_function, hooks=hooks)
    model.evaluate(input_fn=input_function)
Example #26
Source File: graph_transform.py From parallax with Apache License 2.0 | 5 votes |
def graph_transform_mpi(single_gpu_meta_graph_def, config, op_library_path=None):
    if op_library_path is not None:
        tf.load_op_library(op_library_path)

    with tf.Graph().as_default() as replica:
        tf.train.import_meta_graph(single_gpu_meta_graph_def)

        tensor_or_op_name_to_replica_names = {}
        for op in replica.get_operations():
            tensor_or_op_name_to_replica_names[op.name] = [op.name]
            for output in op.outputs:
                tensor_or_op_name_to_replica_names[output.name] = [output.name]

        # Initialize horovod
        hvd.init()

        num_workers = hvd.size()
        worker_id = hvd.rank()
        update_shard_values_for_worker(num_workers, worker_id)

        op_to_control_consumer_ops = get_all_control_consumers(replica)
        trainable_variable_ops = [var.op for var in tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES)]

        for gradients_info in tf.get_collection(tf.GraphKeys.GRADIENTS_INFO):
            target_tensor = gradients_info._target
            if target_tensor.op not in trainable_variable_ops:
                parallax_log.debug(
                    "Gradient for non-trainable variable %s is created, ignore"
                    % target_tensor.op.name)
                continue
            _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config)
        _add_broadcast_ops()

    return tf.train.export_meta_graph(graph=replica), \
        tensor_or_op_name_to_replica_names
Example #27
Source File: solver.py From athena with Apache License 2.0 | 5 votes |
def initialize_devices(visible_gpu_idx=None):
    """ initialize hvd devices, should be called firstly """
    if visible_gpu_idx is not None:
        warnings.warn("we can not set the visible gpu idx like this")
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
Example #28
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 5 votes |
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor = tf.ones([17] * 3, dtype=tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, rank))
Example #29
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 5 votes |
def test_horovod_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different rank or dimension."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        # Same rank, different dimension
        tf.set_random_seed(1234)
        dims = [17 + rank] * 3
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))

        # Same number of elements, different rank
        tf.set_random_seed(1234)
        if rank == 0:
            dims = [17, 23 * 57]
        else:
            dims = [17, 23, 57]
        tensor = tf.random_uniform(dims, -1.0, 1.0)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
Example #30
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0 | 5 votes |
def test_horovod_allreduce_cpu_fused(self):
    """Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors
    with Tensor Fusion."""
    hvd.init()
    size = hvd.size()
    with self.test_session(config=self.config) as session:
        dtypes = [tf.int32, tf.int64, tf.float32, tf.float64]
        dims = [1, 2, 3]
        tests = []
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/cpu:0"):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [tf.int32, tf.int64]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            test = max_difference <= threshold
            tests.append(test)
        self.assertTrue(session.run(tf.reduce_all(tests)),
                        "hvd.allreduce produces incorrect results")