Python horovod.tensorflow.rank() Examples

The following are 30 code examples of horovod.tensorflow.rank(), collected from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out all available functions/classes of the module horovod.tensorflow, or try the search function.
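Quick orientation before the examples: hvd.rank() returns the global rank of the calling process (0 through hvd.size() - 1) and is only valid after hvd.init() has been called. It is most often used to shard data and to gate chief-only work such as logging and checkpointing. A minimal sketch, assuming a script named train.py:

import horovod.tensorflow as hvd

hvd.init()  # must run before rank()/size()/local_rank()

# Global rank is unique across all processes; local rank is
# unique only among the processes on the same machine.
print("rank {} of {} (local rank {})".format(
    hvd.rank(), hvd.size(), hvd.local_rank()))

if hvd.rank() == 0:
    pass  # chief-only work: logging, checkpoints, evaluation

Launched with, e.g., horovodrun -np 4 python train.py, each of the four processes prints a distinct rank.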
Example #1
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allreduce_type_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different type."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            # Same shape, different dtype
            dims = [17] * 3
            tensor = tf.ones(dims,
                             dtype=tf.int32 if rank % 2 == 0 else tf.float32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor)) 
Example #2
Source File: tfops.py    From glow with MIT License
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "["+_str+"] "+x.name)

# Allreduce methods 
Example #3
Source File: eval.py    From tensorpack with Apache License 2.0
def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750
            buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)]

            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu if buggy_tf else num_gpu * 2
            self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)]
            self.dataflows = [get_eval_dataflow(self._eval_dataset,
                                                shard=k, num_shards=self.num_predictor)
                              for k in range(self.num_predictor)]
        else:
            # Only eval on the first machine,
            # because evaluation assumes that all horovod workers share the filesystem.
            # Alternatively, we could eval on all ranks and use allgather, but allgather sometimes hangs.
            self._horovod_run_eval = hvd.rank() == hvd.local_rank()
            if self._horovod_run_eval:
                self.predictor = self._build_predictor(0)
                self.dataflow = get_eval_dataflow(self._eval_dataset,
                                                  shard=hvd.local_rank(), num_shards=hvd.local_size())

            self.barrier = hvd.allreduce(tf.random_normal(shape=[1])) 
Example #4
Source File: hvd_distributed_tf_data_utils.py    From BERT with Apache License 2.0
def train_input_fn(input_file, _parse_fn, name_to_features,
		params, **kargs):
	if_shard = kargs.get("if_shard", "1")
	dataset = tf.data.TFRecordDataset(input_file, buffer_size=params.get("buffer_size", 100))
	print("==hvd size {}, rank {}==".format(hvd.size(), hvd.rank()))
	if if_shard == "1":
		dataset = dataset.shard(hvd.size(), hvd.rank())
	dataset = dataset.map(lambda x:_parse_fn(x, name_to_features))
	dataset = dataset.shuffle(
							buffer_size=params.get("buffer_size", 1024)+3*params.get("batch_size", 32),
							seed=np.random.randint(0,1e10,1)[0],
							reshuffle_each_iteration=True)
	dataset = dataset.batch(params.get("batch_size", 32))
	dataset = dataset.repeat(params.get("epoch", 100))
	iterator = dataset.make_one_shot_iterator()
	features = iterator.get_next()
	return features 
Example #5
Source File: tfops.py    From pix2pix-flow with MIT License
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "["+_str+"] "+x.name)

# Allreduce methods 
Example #6
Source File: trainers.py    From tensorpack with Apache License 2.0
def __init__(self, average=True, compression=None):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
            compression: `hvd.Compression.fp16` or `hvd.Compression.none`
        """
        if 'pyarrow' in sys.modules:
            logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs.")
        # lazy import
        import horovod.tensorflow as hvd
        import horovod
        hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
        self.hvd = hvd

        hvd.init()
        self.is_chief = hvd.rank() == 0
        self._local_rank = hvd.local_rank()
        self._rank = hvd.rank()
        self._average = average
        self._compression = compression
        self._has_compression = hvd_version >= (0, 15, 0)
        logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
        super(HorovodTrainer, self).__init__()

        self.BROADCAST_EVERY_EPOCH = True 
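A usage sketch for this trainer, assuming tensorpack's standard launch helper and a TrainConfig named config defined elsewhere (hvd.Compression.fp16 requires Horovod >= 0.15; this is a sketch, not part of the example above):

import horovod.tensorflow as hvd
from tensorpack import HorovodTrainer, launch_train_with_config

# fp16 gradient compression halves allreduce traffic at a small
# precision cost; pass hvd.Compression.none to keep full precision.
trainer = HorovodTrainer(average=True, compression=hvd.Compression.fp16)
launch_train_with_config(config, trainer)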
Example #7
Source File: solver.py    From athena with Apache License 2.0
def train(self, dataset, total_batches=-1):
        """ Update the model in 1 epoch """
        train_step = self.train_step
        if self.hparams.enable_tf_function:
            logging.info("please be patient, enable tf.function, it takes time ...")
            train_step = tf.function(train_step, input_signature=self.sample_signature)
        for batch, samples in enumerate(dataset.take(total_batches)):
            # train 1 step
            samples = self.model.prepare_samples(samples)
            loss, metrics = train_step(samples)
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            #
            # Note: broadcast should be done after the first gradient step to ensure optimizer
            # initialization.
            if batch == 0:
                hvd.broadcast_variables(self.model.trainable_variables, root_rank=0)
                hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)
            if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
                logging.info(self.metric_checker(loss, metrics))
                self.model.reset_metrics() 
Example #8
Source File: horovod_patches.py    From cape-document-qa with Apache License 2.0
def _train(model: Model,
           data: TrainingData,
           checkpoint: Union[str, None],
           parameter_checkpoint: Union[str, None],
           save_start: bool,
           train_params: trainer.TrainParams,
           evaluators: List[Evaluator],
           out: ModelDir,
           notes=None,
           dry_run=False,
           start_eval=False):
    print('Horovod size: ', hvd.size())
    print('Horovod rank: ', hvd.rank())
    print('Horovod local rank: ', hvd.local_rank())

    if train_params.async_encoding:
        _train_async(model, data, checkpoint, parameter_checkpoint, save_start, train_params,
                 evaluators, out, notes, dry_run, start_eval)
        return
    else:
        raise NotImplementedError('Synchronous training with Horovod not supported yet')
Example #9
Source File: solver.py    From athena with Apache License 2.0
def evaluate(self, dataset, epoch=0):
        """ evaluate the model """
        loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
        loss, metrics = None, None
        evaluate_step = self.evaluate_step
        if self.hparams.enable_tf_function:
            logging.info("please be patient, enable tf.function, it takes time ...")
            evaluate_step = tf.function(evaluate_step, input_signature=self.sample_signature)
        self.model.reset_metrics()
        for batch, samples in enumerate(dataset):
            samples = self.model.prepare_samples(samples)
            loss, metrics = evaluate_step(samples)
            if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
                logging.info(self.metric_checker(loss, metrics, -2))
            loss_metric.update_state(loss)
        if hvd.rank() == 0:
            logging.info(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
            self.model.reset_metrics()
        return loss_metric.result() 
Example #10
Source File: trainers.py    From tensorpack with Apache License 2.0
def __init__(self, average=True):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
        """
        import byteps.tensorflow as bps
        self.hvd = bps  # BytePS has the same interface as Horovod
        self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
        assert os.environ.get("DMLC_ROLE", None) == "worker"
        assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
        bps.init()
        self.is_chief = bps.rank() == 0

        self._local_rank = bps.local_rank()
        self._rank = bps.rank()
        self._average = average

        self._compression = None
        self._has_compression = False
        logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
        SingleCostTrainer.__init__(self) 
Example #11
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
        perform reduction on CPU and GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        device = "/gpu:%d" % local_rank if local_rank % 2 == 0 else "/cpu:0"
        with self.test_session(config=self.config) as session:
            with tf.device(device):
                # Same shape and dtype, different devices
                dims = [17] * 3
                tensor = tf.ones(dims, dtype=tf.int32)
                with self.assertRaises(tf.errors.FailedPreconditionError):
                    session.run(hvd.allreduce(tensor)) 
Example #12
Source File: flow_training.py    From flowpp with MIT License
def setup_horovod():
    import horovod.tensorflow as hvd

    # Initialize Horovod
    hvd.init()
    # Verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()

    from mpi4py import MPI

    assert hvd.size() == MPI.COMM_WORLD.Get_size()

    is_root = hvd.rank() == 0

    def mpi_average(local_list):
        # _local_list_orig = local_list
        local_list = list(map(float, local_list))
        # print('RANK {} AVERAGING {} -> {}'.format(hvd.rank(), _local_list_orig, local_list))
        sums = MPI.COMM_WORLD.gather(sum(local_list), root=0)
        counts = MPI.COMM_WORLD.gather(len(local_list), root=0)
        sum_counts = sum(counts) if is_root else None
        avg = (sum(sums) / sum_counts) if is_root else None
        return avg, sum_counts

    return hvd, MPI, is_root, mpi_average 
Example #13
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allgather_error(self):
        """Test that the allgather returns an error if any dimension besides
        the first is different among the tensors being gathered."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            tensor_size = [17] * 3
            tensor_size[1] = 10 * (rank + 1)
            tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allgather(tensor)) 
Example #14
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allgather_type_error(self):
        """Test that the allgather returns an error if the types being gathered
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            tensor_size = [17] * 3
            dtype = tf.int32 if rank % 2 == 0 else tf.float32
            tensor = tf.ones(tensor_size, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allgather(tensor)) 
Example #15
Source File: trainers.py    From ADL with MIT License
def __init__(self, average=True):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
        """
        import byteps.tensorflow as bps
        self.hvd = bps  # BytePS has the same interface as Horovod
        self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
        assert os.environ.get("DMLC_ROLE", None) == "worker"
        assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
        bps.init()
        self.is_chief = bps.rank() == 0

        self._local_rank = bps.local_rank()
        self._rank = bps.rank()
        self._average = average

        self._compression = None
        self._has_compression = False
        logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
        SingleCostTrainer.__init__(self) 
Example #16
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_broadcast_type_error(self):
        """Test that the broadcast returns an error if the types being broadcasted
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            tensor_size = [17] * 3
            dtype = tf.int32 if rank % 2 == 0 else tf.float32
            tensor = tf.ones(tensor_size, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, 0)) 
Example #17
Source File: horovod.py    From blueoil with Apache License 2.0
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd 
Example #18
Source File: horovod.py    From blueoil with Apache License 2.0
def is_enabled():
    if os.getenv("USE_HOROVOD"):
        return True
    ppid = os.getppid()
    if ppid <= 1:
        return False

    parent_process_name = _get_pname(ppid)
    if parent_process_name.startswith("horovodrun") or parent_process_name.startswith("mpirun"):
        if horovod_installed:
            return True
        else:
            print("you're trying to run on horovod, but importing Horovod failed. exit.")
            sys.exit(1)
    else:
        return False


# Return True if Horovod is not enabled, or it is enabled and the process is rank 0.
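The trailing comment above describes a helper that follows in the original horovod.py; a minimal sketch of such a helper (the name is_rank0 is an assumption), reusing the is_enabled() and hvd already defined in this module:

def is_rank0():
    # True when Horovod is disabled (plain single-process run),
    # or when this process is the chief (global rank 0).
    if not is_enabled():
        return True
    return hvd.rank() == 0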
Example #19
Source File: horovod.py    From blueoil with Apache License 2.0
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd 
Example #20
Source File: horovod.py    From blueoil with Apache License 2.0
def is_enabled():
    if os.getenv("USE_HOROVOD"):
        return True
    ppid = os.getppid()
    if ppid <= 1:
        return False

    parent_process_name = _get_pname(ppid)
    if parent_process_name.startswith("horovodrun") or parent_process_name.startswith("mpirun"):
        if horovod_installed:
            return True
        else:
            print("you're trying to run on horovod, but importing Horovod failed. exit.")
            sys.exit(1)
    else:
        return False


# Return True if Horovod is not enabled, or it is enabled and the process is rank 0.
Example #21
Source File: classifier.py    From tensorflow_fasttext with MIT License
def FastTrain():
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_input = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    hooks = None
    if FLAGS.horovod:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=train_input, steps=FLAGS.train_steps, hooks=hooks)
    print("TRAIN COMPLETE")
    if not FLAGS.horovod or hvd.rank() == 0:
        print("EVALUATE")
        eval_input = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        #eval_metrics = { "accuracy": tf.metrics.accuracy(labels, predictions) }
        result = estimator.evaluate(input_fn=eval_input, steps=FLAGS.eval_steps, hooks=None)
        print(result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            estimator.export_savedmodel(FLAGS.export_dir,
                                        inputs.ServingInputFn(FLAGS.use_ngrams)) 
Example #22
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_runconfig(is_distributed=defaults.DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        ) 
Example #23
Source File: train_model.py    From DistributedDeepLearning with MIT License
def _get_runconfig(is_distributed=DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        ) 
Example #24
Source File: horovod_mnist.py    From sagemaker-tensorflow-training-toolkit with Apache License 2.0
def training_step(images, labels, first_batch):
  with tf.GradientTape() as tape:
    probs = mnist_model(images, training=True)
    loss_value = loss(labels, probs)

  # Horovod: add Horovod Distributed GradientTape.
  tape = hvd.DistributedGradientTape(tape)

  grads = tape.gradient(loss_value, mnist_model.trainable_variables)
  opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

  # Horovod: broadcast initial variable states from rank 0 to all other processes.
  # This is necessary to ensure consistent initialization of all workers when
  # training is started with random weights or restored from a checkpoint.
  #
  # Note: broadcast should be done after the first gradient step to ensure optimizer
  # initialization.
  if first_batch:
    hvd.broadcast_variables(mnist_model.variables, root_rank=0)
    hvd.broadcast_variables(opt.variables(), root_rank=0)

  return loss_value


# Horovod: adjust number of steps based on number of GPUs. 
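The trailing comment refers to the training loop that follows training_step in the original script; a minimal sketch of that idea, assuming an already-batched tf.data pipeline named dataset and a nominal 10000 single-worker steps:

# Each worker takes 1/size() of the steps, so the total work stays
# roughly constant as workers are added.
for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())):
    loss_value = training_step(images, labels, batch == 0)
    if batch % 10 == 0 and hvd.local_rank() == 0:
        print('Step #%d\tLoss: %.6f' % (batch, loss_value))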
Example #25
Source File: variational_autoencoder_deconv_tfdataset_horovod.py    From keras_experiments with The Unlicense
def parser_(desc):
    parser = ap.ArgumentParser(description=desc,
                               formatter_class=CustomFormatter)

    parser.add_argument(
        '--epochs', type=int, default=5,
        help='Number of epochs to run training for. Default: %(default)s')

    parser.add_argument(
        '--nranks_per_gpu', type=int, default=1,
        help='S|Number of ranks to run on each GPU. Use this parameter to\n'
        'oversubscribe a GPU. When oversubscribing a GPU use in combination\n'
        'with MPS (multi-process service). Default: %(default)s')

    parser.add_argument(
        '--speedup', action='store', nargs='?', type=str.lower,
        const=SpeedupOpts.epoch, default=SpeedupOpts.epoch,
        choices=[SpeedupOpts.epoch, SpeedupOpts.imgspersec],
        help='S|Speedup the fit to run each epoch faster or overall images\n'
        'per second faster. With the "imgspersec" option each rank\n'
        'fits the full set; with the epoch option, only a subset. Loss\n'
        'convergence stability might be better with the imgspersec option.\n'
        'Default: %(default)s')

    args = parser.parse_args()

    return args 
Example #26
Source File: graph_transform.py    From parallax with Apache License 2.0
def graph_transform_mpi(single_gpu_meta_graph_def, config,
                        op_library_path=None):
    if op_library_path is not None:
        tf.load_op_library(op_library_path)

    with tf.Graph().as_default() as replica:
        tf.train.import_meta_graph(single_gpu_meta_graph_def)

        tensor_or_op_name_to_replica_names = {}
        for op in replica.get_operations():
            tensor_or_op_name_to_replica_names[op.name] = [op.name]
            for output in op.outputs:
                tensor_or_op_name_to_replica_names[output.name] = [output.name]

        # Initialize horovod
        hvd.init()

        num_workers = hvd.size()
        worker_id = hvd.rank()
        update_shard_values_for_worker(num_workers, worker_id)

        op_to_control_consumer_ops = get_all_control_consumers(replica)
        trainable_variable_ops = [var.op for var in tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES)]

        for gradients_info in tf.get_collection(tf.GraphKeys.GRADIENTS_INFO):
            target_tensor = gradients_info._target
            if target_tensor.op not in trainable_variable_ops:
                parallax_log.debug(
                    "Gradient for non-trainable variable %s is created, ignore"
                    % target_tensor.op.name)
                continue

            _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config)
        _add_broadcast_ops()

    return tf.train.export_meta_graph(graph=replica), \
           tensor_or_op_name_to_replica_names 
Example #27
Source File: eval.py    From tensorpack with Apache License 2.0
def _eval(self):
        logdir = self._output_dir
        if cfg.TRAINER == 'replicated':
            all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
        else:
            filenames = [os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
            ) for rank in range(hvd.local_size())]

            if self._horovod_run_eval:
                local_results = predict_dataflow(self.dataflow, self.predictor)
                fname = filenames[hvd.local_rank()]
                with open(fname, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for fname in filenames:
                with open(fname, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(fname)

        scores = DatasetRegistry.get(self._eval_dataset).eval_inference_results(all_results)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v) 
Example #28
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_rank():
    if defaults.DISTRIBUTED:
        try:
            return hvd.rank()
        except:
            return 0
    else:
        return 0


# Data processing
############################################################################### 
Example #29
Source File: trainers.py    From tensorpack with Apache License 2.0
def initialize(self, session_creator, session_init):
        # broadcast_op should be the last setup_graph: it needs to be created
        # "right before" the graph is finalized,
        # because it needs to capture all the variables (which may be created by callbacks).
        self._broadcast_op = self.hvd.broadcast_global_variables(0)

        # it's important that our NewSessionCreator does not finalize the graph
        if not isinstance(session_creator, NewSessionCreator):
            raise ValueError(
                "session_creator has to be `NewSessionCreator` for horovod/byteps training! ")
        # NOTE It will fail if GPU was already detected before initializing the session
        # https://github.com/tensorflow/tensorflow/issues/8136
        session_creator.config.gpu_options.visible_device_list = str(self._local_rank)
        try:
            session_creator.config.inter_op_parallelism_threads = mp.cpu_count() // self.hvd.local_size()
        except AttributeError:  # old horovod does not have local_size
            pass
        super(HorovodTrainer, self).initialize(session_creator, session_init)

        # This broadcast belongs to the "initialize" stage.
        # It should not be delayed to the "before_train" stage.
        # TODO:
        # 1. an allgather helper to concat strings
        # 2. check variables on each rank match each other, print warnings, and broadcast the common set.
        if self.is_chief:
            logger.info("Broadcasting initialized variables ...")
        else:
            logger.info("Rank {} waiting for initialization broadcasting ...".format(self._rank))
        self.sess.run(self._broadcast_op) 
Example #30
Source File: trainers.py    From ADL with MIT License
def __init__(self, average=True, compression=None):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
            compression: `hvd.Compression.fp16` or `hvd.Compression.none`
        """
        if 'pyarrow' in sys.modules:
            logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                        "Uninstall pyarrow and use msgpack instead.")
        # lazy import
        import horovod.tensorflow as hvd
        import horovod
        hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
        self.hvd = hvd

        hvd.init()
        self.is_chief = hvd.rank() == 0
        self._local_rank = hvd.local_rank()
        self._rank = hvd.rank()
        self._average = average
        self._compression = compression
        self._has_compression = hvd_version >= (0, 15, 0)
        logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
        super(HorovodTrainer, self).__init__()

        self.BROADCAST_EVERY_EPOCH = True