Python horovod.tensorflow.rank() Examples
The following are 30 code examples of horovod.tensorflow.rank().
The source file and project each example was taken from are listed above it.
You may also want to check out all available functions and classes of the horovod.tensorflow module.
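For orientation, here is a minimal sketch (not taken from any of the projects below) of how hvd.rank() fits together with hvd.init(), hvd.size(), and hvd.local_rank(): the rank is the unique id of the current worker process, and rank 0 is conventionally treated as the chief that handles logging and checkpointing.

import horovod.tensorflow as hvd

hvd.init()  # must run before any other Horovod call

# rank(): global id of this worker (0 .. size()-1), unique across all machines.
# local_rank(): id of this worker on its own machine, often used to pick a GPU.
print("worker {} of {}, local rank {}".format(hvd.rank(), hvd.size(), hvd.local_rank()))

if hvd.rank() == 0:
    # Rank 0 is conventionally the chief: restrict logging/checkpointing to it.
    print("running as chief")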
Example #1
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0
def test_horovod_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different type."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        # Same rank, different dimension
        dims = [17] * 3
        tensor = tf.ones(dims, dtype=tf.int32 if rank % 2 == 0 else tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
Example #2
Source File: tfops.py From glow with MIT License
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)),
             tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "[" + _str + "] " + x.name)

# Allreduce methods
Example #3
Source File: eval.py From tensorpack with Apache License 2.0
def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750
        buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)]

        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu if buggy_tf else num_gpu * 2
        self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)]
        self.dataflows = [get_eval_dataflow(self._eval_dataset,
                                            shard=k, num_shards=self.num_predictor)
                          for k in range(self.num_predictor)]
    else:
        # Only eval on the first machine,
        # Because evaluation assumes that all horovod workers share the filesystem.
        # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
        self._horovod_run_eval = hvd.rank() == hvd.local_rank()
        if self._horovod_run_eval:
            self.predictor = self._build_predictor(0)
            self.dataflow = get_eval_dataflow(self._eval_dataset,
                                              shard=hvd.local_rank(), num_shards=hvd.local_size())
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
Example #4
Source File: hvd_distributed_tf_data_utils.py From BERT with Apache License 2.0
def train_input_fn(input_file, _parse_fn, name_to_features, params, **kargs):
    if_shard = kargs.get("if_shard", "1")

    dataset = tf.data.TFRecordDataset(input_file, buffer_size=params.get("buffer_size", 100))
    print("==hvd size {}, rank {}==".format(hvd.size(), hvd.rank()))
    if if_shard == "1":
        dataset = dataset.shard(hvd.size(), hvd.rank())
    dataset = dataset.map(lambda x: _parse_fn(x, name_to_features))
    dataset = dataset.shuffle(
        buffer_size=params.get("buffer_size", 1024) + 3 * params.get("batch_size", 32),
        seed=np.random.randint(0, 1e10, 1)[0],
        reshuffle_each_iteration=True)
    dataset = dataset.batch(params.get("batch_size", 32))
    dataset = dataset.repeat(params.get("epoch", 100))
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    return features
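The dataset.shard(hvd.size(), hvd.rank()) call above is the core of the pattern: each worker keeps only every size()-th record, offset by its own rank, so workers read disjoint slices of the input. A stripped-down sketch of just that step (the file name is a placeholder, not from the example above):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
dataset = tf.data.TFRecordDataset("train.tfrecord")  # placeholder path
# Worker i keeps records i, i + size, i + 2*size, ..., so shards do not overlap.
dataset = dataset.shard(hvd.size(), hvd.rank())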
Example #5
Source File: tfops.py From pix2pix-flow with MIT License
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)),
             tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "[" + _str + "] " + x.name)

# Allreduce methods
Example #6
Source File: trainers.py From tensorpack with Apache License 2.0
def __init__(self, average=True, compression=None):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs.")
    # lazy import
    import horovod.tensorflow as hvd
    import horovod
    hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
    self.hvd = hvd

    hvd.init()
    self.is_chief = hvd.rank() == 0
    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    self._has_compression = hvd_version >= (0, 15, 0)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()

    self.BROADCAST_EVERY_EPOCH = True
Example #7
Source File: solver.py From athena with Apache License 2.0
def train(self, dataset, total_batches=-1):
    """ Update the model in 1 epoch """
    train_step = self.train_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        train_step = tf.function(train_step, input_signature=self.sample_signature)
    for batch, samples in enumerate(dataset.take(total_batches)):
        # train 1 step
        samples = self.model.prepare_samples(samples)
        loss, metrics = train_step(samples)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if batch == 0:
            hvd.broadcast_variables(self.model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

        if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
            logging.info(self.metric_checker(loss, metrics))
            self.model.reset_metrics()
Example #8
Source File: horovod_patches.py From cape-document-qa with Apache License 2.0
def _train(model: Model,
           data: TrainingData,
           checkpoint: Union[str, None],
           parameter_checkpoint: Union[str, None],
           save_start: bool,
           train_params: trainer.TrainParams,
           evaluators: List[Evaluator],
           out: ModelDir,
           notes=None,
           dry_run=False,
           start_eval=False):
    print('Horovod size: ', hvd.size())
    print('Horovod rank: ', hvd.rank())
    print('Horovod local rank: ', hvd.local_rank())
    if train_params.async_encoding:
        _train_async(model, data, checkpoint, parameter_checkpoint, save_start,
                     train_params, evaluators, out, notes, dry_run, start_eval)
        return
    else:
        raise NotImplementedError('Synchronous training with Horovod not supported yet')
Example #9
Source File: solver.py From athena with Apache License 2.0
def evaluate(self, dataset, epoch=0):
    """ evaluate the model """
    loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
    loss, metrics = None, None
    evaluate_step = self.evaluate_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        evaluate_step = tf.function(evaluate_step, input_signature=self.sample_signature)
    self.model.reset_metrics()
    for batch, samples in enumerate(dataset):
        samples = self.model.prepare_samples(samples)
        loss, metrics = evaluate_step(samples)
        if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
            logging.info(self.metric_checker(loss, metrics, -2))
        loss_metric.update_state(loss)
    if hvd.rank() == 0:
        logging.info(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
    self.model.reset_metrics()
    return loss_metric.result()
Example #10
Source File: trainers.py From tensorpack with Apache License 2.0
def __init__(self, average=True):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
    """
    import byteps.tensorflow as bps
    self.hvd = bps  # BytePS has the same interface as Horovod
    self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
    assert os.environ.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
    bps.init()
    self.is_chief = bps.rank() == 0

    self._local_rank = bps.local_rank()
    self._rank = bps.rank()
    self._average = average
    self._compression = None
    self._has_compression = False
    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
Example #11
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try to
    perform reduction on CPU and GPU."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    device = "/gpu:%d" % local_rank if local_rank % 2 == 0 else "/cpu:0"
    with self.test_session(config=self.config) as session:
        with tf.device(device):
            # Same rank, different dimension
            dims = [17] * 3
            tensor = tf.ones(dims, dtype=tf.int32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor))
Example #12
Source File: flow_training.py From flowpp with MIT License
def setup_horovod():
    import horovod.tensorflow as hvd

    # Initialize Horovod
    hvd.init()
    # Verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()

    from mpi4py import MPI
    assert hvd.size() == MPI.COMM_WORLD.Get_size()

    is_root = hvd.rank() == 0

    def mpi_average(local_list):
        # _local_list_orig = local_list
        local_list = list(map(float, local_list))
        # print('RANK {} AVERAGING {} -> {}'.format(hvd.rank(), _local_list_orig, local_list))
        sums = MPI.COMM_WORLD.gather(sum(local_list), root=0)
        counts = MPI.COMM_WORLD.gather(len(local_list), root=0)
        sum_counts = sum(counts) if is_root else None
        avg = (sum(sums) / sum_counts) if is_root else None
        return avg, sum_counts

    return hvd, MPI, is_root, mpi_average
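The mpi_average helper returned above gathers each rank's local values on rank 0 and computes the global mean there (non-root ranks get None back). A hedged usage sketch, assuming every rank calls it with its own list of per-batch numbers:

hvd, MPI, is_root, mpi_average = setup_horovod()

local_losses = [0.72, 0.65, 0.71]  # e.g. losses from this rank's batches
avg_loss, total_count = mpi_average(local_losses)
if is_root:
    print("mean loss over all ranks:", avg_loss, "from", total_count, "values")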
Example #13
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(tensor))
Example #14
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(tensor))
Example #15
Source File: trainers.py From ADL with MIT License
def __init__(self, average=True):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
    """
    import byteps.tensorflow as bps
    self.hvd = bps  # BytePS has the same interface as Horovod
    self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
    assert os.environ.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
    bps.init()
    self.is_chief = bps.rank() == 0

    self._local_rank = bps.local_rank()
    self._rank = bps.rank()
    self._average = average
    self._compression = None
    self._has_compression = False
    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
Example #16
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being broadcasted
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    with self.test_session(config=self.config) as session:
        tensor_size = [17] * 3
        dtype = tf.int32 if rank % 2 == 0 else tf.float32
        tensor = tf.ones(tensor_size, dtype=dtype) * rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(tensor, 0))
Example #17
Source File: horovod.py From blueoil with Apache License 2.0
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd
Example #18
Source File: horovod.py From blueoil with Apache License 2.0
def is_enabled():
    if os.getenv("USE_HOROVOD"):
        return True
    ppid = os.getppid()
    if ppid <= 1:
        return False
    parent_process_name = _get_pname(ppid)
    if parent_process_name.startswith("horovodrun") or parent_process_name.startswith("mpirun"):
        if horovod_installed:
            return True
        else:
            print("you're trying to run on horovod, but importing Horovod failed. exit.")
            sys.exit(1)
    else:
        return False

# return True if horovod is not enabled, or enabled and the process is rank 0.
Example #19
Source File: horovod.py From blueoil with Apache License 2.0
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd
Example #20
Source File: horovod.py From blueoil with Apache License 2.0
def is_enabled():
    if os.getenv("USE_HOROVOD"):
        return True
    ppid = os.getppid()
    if ppid <= 1:
        return False
    parent_process_name = _get_pname(ppid)
    if parent_process_name.startswith("horovodrun") or parent_process_name.startswith("mpirun"):
        if horovod_installed:
            return True
        else:
            print("you're trying to run on horovod, but importing Horovod failed. exit.")
            sys.exit(1)
    else:
        return False

# return True if horovod is not enabled, or enabled and the process is rank 0.
Example #21
Source File: classifier.py From tensorflow_fasttext with MIT License
def FastTrain():
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_input = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    hooks = None
    if FLAGS.horovod:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=train_input, steps=FLAGS.train_steps, hooks=hooks)
    print("TRAIN COMPLETE")
    if not FLAGS.horovod or hvd.rank() == 0:
        print("EVALUATE")
        eval_input = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        # eval_metrics = { "accuracy": tf.metrics.accuracy(labels, predictions) }
        result = estimator.evaluate(input_fn=eval_input, steps=FLAGS.eval_steps, hooks=None)
        print(result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            estimator.export_savedmodel(FLAGS.export_dir, inputs.ServingInputFn(FLAGS.use_ngrams))
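This example shows only the hook side of the usual Estimator recipe: hvd.BroadcastGlobalVariablesHook(0) syncs initial weights from rank 0, and evaluation and export are restricted to rank 0 via hvd.rank(). The gradient-averaging half, wrapping the optimizer inside the model function, is not shown above; a minimal hedged sketch of that step (optimizer choice and learning rate are illustrative, not from this project):

import tensorflow as tf
import horovod.tensorflow as hvd

opt = tf.train.AdamOptimizer(0.001 * hvd.size())  # scale LR by worker count
opt = hvd.DistributedOptimizer(opt)               # allreduce gradients across ranks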
Example #22
Source File: resnet_main.py From DistributedDeepLearning with MIT License
def _get_runconfig(is_distributed=defaults.DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        )
Example #23
Source File: train_model.py From DistributedDeepLearning with MIT License
def _get_runconfig(is_distributed=DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        )
Example #24
Source File: horovod_mnist.py From sagemaker-tensorflow-training-toolkit with Apache License 2.0
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value

# Horovod: adjust number of steps based on number of GPUs.
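The step function above assumes that mnist_model, loss, and opt already exist in the enclosing scope. A hedged sketch of how those objects are typically created for Horovod (the model architecture and learning rate are illustrative; scaling the learning rate by hvd.size() is the usual convention, not necessarily what this project does):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
mnist_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10),
])
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Larger effective batch across workers, so scale the learning rate accordingly.
opt = tf.keras.optimizers.Adam(0.001 * hvd.size())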
Example #25
Source File: variational_autoencoder_deconv_tfdataset_horovod.py From keras_experiments with The Unlicense
def parser_(desc):
    parser = ap.ArgumentParser(description=desc, formatter_class=CustomFormatter)

    parser.add_argument(
        '--epochs', type=int, default=5,
        help='Number of epochs to run training for. Default: %(default)s')

    parser.add_argument(
        '--nranks_per_gpu', type=int, default=1,
        help='S|Number of ranks to run on each GPUs. Use this parameter to\n'
             'oversubscribe a GPU. When oversubscribing a GPU use in combination\n'
             'with MPS (multi-process service). Default: %(default)s')

    parser.add_argument(
        '--speedup', action='store', nargs='?', type=str.lower,
        const=SpeedupOpts.epoch, default=SpeedupOpts.epoch,
        choices=[SpeedupOpts.epoch, SpeedupOpts.imgspersec],
        help='S|Speedup the fit to run each epoch faster or overall images\n'
             'per seconds faster. With the "imgspersec" option each rank\n'
             'fits the fullset and with epoch option only a subset. Loss\n'
             'convergence stability might be better with imgspersec option.\n'
             'Default: %(default)s')

    args = parser.parse_args()

    return args
Example #26
Source File: graph_transform.py From parallax with Apache License 2.0
def graph_transform_mpi(single_gpu_meta_graph_def, config,
                        op_library_path=None):
    if op_library_path is not None:
        tf.load_op_library(op_library_path)

    with tf.Graph().as_default() as replica:
        tf.train.import_meta_graph(single_gpu_meta_graph_def)

        tensor_or_op_name_to_replica_names = {}
        for op in replica.get_operations():
            tensor_or_op_name_to_replica_names[op.name] = [op.name]
            for output in op.outputs:
                tensor_or_op_name_to_replica_names[output.name] = [output.name]

        # Initialize horovod
        hvd.init()

        num_workers = hvd.size()
        worker_id = hvd.rank()
        update_shard_values_for_worker(num_workers, worker_id)

        op_to_control_consumer_ops = get_all_control_consumers(replica)
        trainable_variable_ops = [var.op for var in tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES)]

        for gradients_info in tf.get_collection(tf.GraphKeys.GRADIENTS_INFO):
            target_tensor = gradients_info._target
            if target_tensor.op not in trainable_variable_ops:
                parallax_log.debug(
                    "Gradient for non-trainable variable %s is created, ignore"
                    % target_tensor.op.name)
                continue
            _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config)
        _add_broadcast_ops()

    return tf.train.export_meta_graph(graph=replica), \
        tensor_or_op_name_to_replica_names
Example #27
Source File: eval.py From tensorpack with Apache License 2.0
def _eval(self):
    logdir = self._output_dir
    if cfg.TRAINER == 'replicated':
        all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
    else:
        filenames = [os.path.join(
            logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
        ) for rank in range(hvd.local_size())]

        if self._horovod_run_eval:
            local_results = predict_dataflow(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(fname)

    scores = DatasetRegistry.get(self._eval_dataset).eval_inference_results(all_results)
    for k, v in scores.items():
        self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v)
Example #28
Source File: resnet_main.py From DistributedDeepLearning with MIT License
def _get_rank():
    if defaults.DISTRIBUTED:
        try:
            return hvd.rank()
        except:
            return 0
    else:
        return 0

# Data processing
###############################################################################
Example #29
Source File: trainers.py From tensorpack with Apache License 2.0
def initialize(self, session_creator, session_init):
    # broadcast_op should be the last setup_graph: it needs to be created
    # "right before" the graph is finalized,
    # because it needs to capture all the variables (which may be created by callbacks).
    self._broadcast_op = self.hvd.broadcast_global_variables(0)

    # it's important that our NewSessionCreator does not finalize the graph
    if not isinstance(session_creator, NewSessionCreator):
        raise ValueError(
            "session_creator has to be `NewSessionCreator` for horovod/byteps training! ")
    # NOTE It will fail if GPU was already detected before initializing the session
    # https://github.com/tensorflow/tensorflow/issues/8136
    session_creator.config.gpu_options.visible_device_list = str(self._local_rank)
    try:
        session_creator.config.inter_op_parallelism_threads = \
            mp.cpu_count() // self.hvd.local_size()
    except AttributeError:  # old horovod does not have local_size
        pass
    super(HorovodTrainer, self).initialize(session_creator, session_init)

    # This broadcast belongs to the "initialize" stage.
    # It should not be delayed to the "before_train" stage.
    # TODO:
    # 1. a allgather helper to concat strings
    # 2. check variables on each rank match each other, print warnings, and broadcast the common set.
    if self.is_chief:
        logger.info("Broadcasting initialized variables ...")
    else:
        logger.info("Rank {} waiting for initialization broadcasting ...".format(self._rank))
    self.sess.run(self._broadcast_op)
Example #30
Source File: trainers.py From ADL with MIT License
def __init__(self, average=True, compression=None):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                    "Uninstall pyarrow and use msgpack instead.")
    # lazy import
    import horovod.tensorflow as hvd
    import horovod
    hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
    self.hvd = hvd

    hvd.init()
    self.is_chief = hvd.rank() == 0
    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    self._has_compression = hvd_version >= (0, 15, 0)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()

    self.BROADCAST_EVERY_EPOCH = True