Python horovod.tensorflow.rank() Examples

The following are 30 code examples of horovod.tensorflow.rank(), collected from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out all available functions/classes of the module horovod.tensorflow, or try the search function.
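Quick orientation before the examples: hvd.rank() returns the global rank of the calling process (0 through hvd.size() - 1) and is only valid after hvd.init() has been called. It is most often used to shard data and to gate chief-only work such as logging and checkpointing. A minimal sketch, assuming a script named train.py:

import horovod.tensorflow as hvd

hvd.init()  # must run before rank()/size()/local_rank()

# Global rank is unique across all processes; local rank is
# unique only among the processes on the same machine.
print("rank {} of {} (local rank {})".format(
    hvd.rank(), hvd.size(), hvd.local_rank()))

if hvd.rank() == 0:
    pass  # chief-only work: logging, checkpoints, evaluation

Launched with, e.g., horovodrun -np 4 python train.py, each of the four processes prints a distinct rank.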
Example #1
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allreduce_type_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different type."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            # Same shape, different dtype
            dims = [17] * 3
            tensor = tf.ones(dims,
                             dtype=tf.int32 if rank % 2 == 0 else tf.float32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(tensor)) 
Example #2
Source File: tfops.py    From glow with MIT License
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "["+_str+"] "+x.name)

# Allreduce methods 
Example #3
Source File: eval.py    From tensorpack with Apache License 2.0
def _setup_graph(self):
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750
            buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)]

            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu if buggy_tf else num_gpu * 2
            self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)]
            self.dataflows = [get_eval_dataflow(self._eval_dataset,
                                                shard=k, num_shards=self.num_predictor)
                              for k in range(self.num_predictor)]
        else:
            # Only eval on the first machine,
            # because evaluation assumes that all horovod workers share the filesystem.
            # Alternatively, we could eval on all ranks and use allgather, but allgather sometimes hangs.
            self._horovod_run_eval = hvd.rank() == hvd.local_rank()
            if self._horovod_run_eval:
                self.predictor = self._build_predictor(0)
                self.dataflow = get_eval_dataflow(self._eval_dataset,
                                                  shard=hvd.local_rank(), num_shards=hvd.local_size())

            self.barrier = hvd.allreduce(tf.random_normal(shape=[1])) 
Example #4
Source File: hvd_distributed_tf_data_utils.py    From BERT with Apache License 2.0
def train_input_fn(input_file, _parse_fn, name_to_features,
		params, **kargs):
	if_shard = kargs.get("if_shard", "1")
	dataset = tf.data.TFRecordDataset(input_file, buffer_size=params.get("buffer_size", 100))
	print("==hvd size {}, rank {}==".format(hvd.size(), hvd.rank()))
	if if_shard == "1":
		dataset = dataset.shard(hvd.size(), hvd.rank())
	dataset = dataset.map(lambda x:_parse_fn(x, name_to_features))
	dataset = dataset.shuffle(
							buffer_size=params.get("buffer_size", 1024)+3*params.get("batch_size", 32),
							seed=np.random.randint(0,1e10,1)[0],
							reshuffle_each_iteration=True)
	dataset = dataset.batch(params.get("batch_size", 32))
	dataset = dataset.repeat(params.get("epoch", 100))
	iterator = dataset.make_one_shot_iterator()
	features = iterator.get_next()
	return features 
Example #5
Source File: tfops.py    From pix2pix-flow with MIT License
def print_act_stats(x, _str=""):
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    if len(x.get_shape()) == 1:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 2:
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    if len(x.get_shape()) == 4:
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(tf.sqrt(x_var)), tf.reduce_mean(tf.sqrt(x_var)), tf.reduce_max(tf.sqrt(x_var))]
    return tf.Print(x, stats, "["+_str+"] "+x.name)

# Allreduce methods 
Example #6
Source File: trainers.py    From tensorpack with Apache License 2.0
def __init__(self, average=True, compression=None):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
            compression: `hvd.Compression.fp16` or `hvd.Compression.none`
        """
        if 'pyarrow' in sys.modules:
            logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs.")
        # lazy import
        import horovod.tensorflow as hvd
        import horovod
        hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
        self.hvd = hvd

        hvd.init()
        self.is_chief = hvd.rank() == 0
        self._local_rank = hvd.local_rank()
        self._rank = hvd.rank()
        self._average = average
        self._compression = compression
        self._has_compression = hvd_version >= (0, 15, 0)
        logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
        super(HorovodTrainer, self).__init__()

        self.BROADCAST_EVERY_EPOCH = True 
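A usage sketch for this trainer, assuming tensorpack's standard launch helper and a TrainConfig named config defined elsewhere (hvd.Compression.fp16 requires Horovod >= 0.15; this is a sketch, not part of the example above):

import horovod.tensorflow as hvd
from tensorpack import HorovodTrainer, launch_train_with_config

# fp16 gradient compression halves allreduce traffic at a small
# precision cost; pass hvd.Compression.none to keep full precision.
trainer = HorovodTrainer(average=True, compression=hvd.Compression.fp16)
launch_train_with_config(config, trainer)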
Example #7
Source File: solver.py    From athena with Apache License 2.0
def train(self, dataset, total_batches=-1):
        """ Update the model in 1 epoch """
        train_step = self.train_step
        if self.hparams.enable_tf_function:
            logging.info("please be patient, enable tf.function, it takes time ...")
            train_step = tf.function(train_step, input_signature=self.sample_signature)
        for batch, samples in enumerate(dataset.take(total_batches)):
            # train 1 step
            samples = self.model.prepare_samples(samples)
            loss, metrics = train_step(samples)
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            #
            # Note: broadcast should be done after the first gradient step to ensure optimizer
            # initialization.
            if batch == 0:
                hvd.broadcast_variables(self.model.trainable_variables, root_rank=0)
                hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)
            if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
                logging.info(self.metric_checker(loss, metrics))
                self.model.reset_metrics() 
Example #8
Source File: horovod_patches.py    From cape-document-qa with Apache License 2.0
def _train(model: Model,
           data: TrainingData,
           checkpoint: Union[str, None],
           parameter_checkpoint: Union[str, None],
           save_start: bool,
           train_params: trainer.TrainParams,
           evaluators: List[Evaluator],
           out: ModelDir,
           notes=None,
           dry_run=False,
           start_eval=False):
    print('Horovod size: ', hvd.size())
    print('Horovod rank: ', hvd.rank())
    print('Horovod local rank: ', hvd.local_rank())

    if train_params.async_encoding:
        _train_async(model, data, checkpoint, parameter_checkpoint, save_start, train_params,
                 evaluators, out, notes, dry_run, start_eval)
        return
    else:
        raise NotImplementedError('Synchronous training with Horovod not supported yet')
Example #9
Source File: solver.py    From athena with Apache License 2.0
def evaluate(self, dataset, epoch=0):
        """ evaluate the model """
        loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
        loss, metrics = None, None
        evaluate_step = self.evaluate_step
        if self.hparams.enable_tf_function:
            logging.info("please be patient, enable tf.function, it takes time ...")
            evaluate_step = tf.function(evaluate_step, input_signature=self.sample_signature)
        self.model.reset_metrics()
        for batch, samples in enumerate(dataset):
            samples = self.model.prepare_samples(samples)
            loss, metrics = evaluate_step(samples)
            if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
                logging.info(self.metric_checker(loss, metrics, -2))
            loss_metric.update_state(loss)
        if hvd.rank() == 0:
            logging.info(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
            self.model.reset_metrics()
        return loss_metric.result() 
Example #10
Source File: trainers.py    From tensorpack with Apache License 2.0
def __init__(self, average=True):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
        """
        import byteps.tensorflow as bps
        self.hvd = bps  # BytePS has the same interface as Horovod
        self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
        assert os.environ.get("DMLC_ROLE", None) == "worker"
        assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
        bps.init()
        self.is_chief = bps.rank() == 0

        self._local_rank = bps.local_rank()
        self._rank = bps.rank()
        self._average = average

        self._compression = None
        self._has_compression = False
        logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
        SingleCostTrainer.__init__(self) 
Example #11
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
        perform reduction on CPU and GPU."""
        # Only do this test if there are GPUs available.
        if not tf.test.is_gpu_available(cuda_only=True):
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        device = "/gpu:%d" % local_rank if local_rank % 2 == 0 else "/cpu:0"
        with self.test_session(config=self.config) as session:
            with tf.device(device):
                # Same shape and dtype, different devices
                dims = [17] * 3
                tensor = tf.ones(dims, dtype=tf.int32)
                with self.assertRaises(tf.errors.FailedPreconditionError):
                    session.run(hvd.allreduce(tensor)) 
Example #12
Source File: flow_training.py    From flowpp with MIT License
def setup_horovod():
    import horovod.tensorflow as hvd

    # Initialize Horovod
    hvd.init()
    # Verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()

    from mpi4py import MPI

    assert hvd.size() == MPI.COMM_WORLD.Get_size()

    is_root = hvd.rank() == 0

    def mpi_average(local_list):
        # _local_list_orig = local_list
        local_list = list(map(float, local_list))
        # print('RANK {} AVERAGING {} -> {}'.format(hvd.rank(), _local_list_orig, local_list))
        sums = MPI.COMM_WORLD.gather(sum(local_list), root=0)
        counts = MPI.COMM_WORLD.gather(len(local_list), root=0)
        sum_counts = sum(counts) if is_root else None
        avg = (sum(sums) / sum_counts) if is_root else None
        return avg, sum_counts

    return hvd, MPI, is_root, mpi_average 
Example #13
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allgather_error(self):
        """Test that the allgather returns an error if any dimension besides
        the first is different among the tensors being gathered."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            tensor_size = [17] * 3
            tensor_size[1] = 10 * (rank + 1)
            tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allgather(tensor)) 
Example #14
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_allgather_type_error(self):
        """Test that the allgather returns an error if the types being gathered
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            tensor_size = [17] * 3
            dtype = tf.int32 if rank % 2 == 0 else tf.float32
            tensor = tf.ones(tensor_size, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allgather(tensor)) 
Example #15
Source File: trainers.py    From ADL with MIT License
def __init__(self, average=True):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
        """
        import byteps.tensorflow as bps
        self.hvd = bps  # BytePS has the same interface as Horovod
        self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
        assert os.environ.get("DMLC_ROLE", None) == "worker"
        assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
        bps.init()
        self.is_chief = bps.rank() == 0

        self._local_rank = bps.local_rank()
        self._rank = bps.rank()
        self._average = average

        self._compression = None
        self._has_compression = False
        logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
        SingleCostTrainer.__init__(self) 
Example #16
Source File: test_tensorflow.py    From training_results_v0.6 with Apache License 2.0
def test_horovod_broadcast_type_error(self):
        """Test that the broadcast returns an error if the types being broadcasted
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            tensor_size = [17] * 3
            dtype = tf.int32 if rank % 2 == 0 else tf.float32
            tensor = tf.ones(tensor_size, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(tensor, 0)) 
Example #17
Source File: horovod.py    From blueoil with Apache License 2.0
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd 
Example #18
Source File: horovod.py    From blueoil with Apache License 2.0
def is_enabled():
    if os.getenv("USE_HOROVOD"):
        return True
    ppid = os.getppid()
    if ppid <= 1:
        return False

    parent_process_name = _get_pname(ppid)
    if parent_process_name.startswith("horovodrun") or parent_process_name.startswith("mpirun"):
        if horovod_installed:
            return True
        else:
            print("you're trying to run on horovod, but importing Horovod failed. exit.")
            sys.exit(1)
    else:
        return False


# Return True if Horovod is not enabled, or it is enabled and the process is rank 0.
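The trailing comment above describes a helper that follows in the original horovod.py; a minimal sketch of such a helper (the name is_rank0 is an assumption), reusing the is_enabled() and hvd already defined in this module:

def is_rank0():
    # True when Horovod is disabled (plain single-process run),
    # or when this process is the chief (global rank 0).
    if not is_enabled():
        return True
    return hvd.rank() == 0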
Example #19
Source File: horovod.py    From blueoil with Apache License 2.0
def setup():
    if not horovod_installed:
        return False

    global horovod_initialized
    if horovod_initialized:
        return hvd

    hvd.init()
    horovod_initialized = True

    horovod_num_worker = hvd.size()
    horovod_rank = hvd.rank()
    # verify that MPI multi-threading is supported.
    assert hvd.mpi_threads_supported()
    # make sure MPI is not re-initialized.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    # import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # check size and rank are synchronized
    assert horovod_num_worker == comm.Get_size()
    assert horovod_rank == comm.Get_rank()
    return hvd 
Example #20
Source File: horovod.py    From blueoil with Apache License 2.0
def is_enabled():
    if os.getenv("USE_HOROVOD"):
        return True
    ppid = os.getppid()
    if ppid <= 1:
        return False

    parent_process_name = _get_pname(ppid)
    if parent_process_name.startswith("horovodrun") or parent_process_name.startswith("mpirun"):
        if horovod_installed:
            return True
        else:
            print("you're trying to run on horovod, but importing Horovod failed. exit.")
            sys.exit(1)
    else:
        return False


# Return True if Horovod is not enabled, or it is enabled and the process is rank 0.
Example #21
Source File: classifier.py    From tensorflow_fasttext with MIT License
def FastTrain():
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_input = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    hooks = None
    if FLAGS.horovod:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=train_input, steps=FLAGS.train_steps, hooks=hooks)
    print("TRAIN COMPLETE")
    if not FLAGS.horovod or hvd.rank() == 0:
        print("EVALUATE")
        eval_input = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        #eval_metrics = { "accuracy": tf.metrics.accuracy(labels, predictions) }
        result = estimator.evaluate(input_fn=eval_input, steps=FLAGS.eval_steps, hooks=None)
        print(result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            estimator.export_savedmodel(FLAGS.export_dir,
                                        inputs.ServingInputFn(FLAGS.use_ngrams)) 
Example #22
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_runconfig(is_distributed=defaults.DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        ) 
Example #23
Source File: train_model.py    From DistributedDeepLearning with MIT License
def _get_runconfig(is_distributed=DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        ) 
Example #24
Source File: horovod_mnist.py    From sagemaker-tensorflow-training-toolkit with Apache License 2.0
def training_step(images, labels, first_batch):
  with tf.GradientTape() as tape:
    probs = mnist_model(images, training=True)
    loss_value = loss(labels, probs)

  # Horovod: add Horovod Distributed GradientTape.
  tape = hvd.DistributedGradientTape(tape)

  grads = tape.gradient(loss_value, mnist_model.trainable_variables)
  opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

  # Horovod: broadcast initial variable states from rank 0 to all other processes.
  # This is necessary to ensure consistent initialization of all workers when
  # training is started with random weights or restored from a checkpoint.
  #
  # Note: broadcast should be done after the first gradient step to ensure optimizer
  # initialization.
  if first_batch:
    hvd.broadcast_variables(mnist_model.variables, root_rank=0)
    hvd.broadcast_variables(opt.variables(), root_rank=0)

  return loss_value


# Horovod: adjust number of steps based on number of GPUs. 
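The trailing comment refers to the training loop that follows training_step in the original script; a minimal sketch of that idea, assuming an already-batched tf.data pipeline named dataset and a nominal 10000 single-worker steps:

# Each worker takes 1/size() of the steps, so the total work stays
# roughly constant as workers are added.
for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())):
    loss_value = training_step(images, labels, batch == 0)
    if batch % 10 == 0 and hvd.local_rank() == 0:
        print('Step #%d\tLoss: %.6f' % (batch, loss_value))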
Example #25
Source File: variational_autoencoder_deconv_tfdataset_horovod.py    From keras_experiments with The Unlicense
def parser_(desc):
    parser = ap.ArgumentParser(description=desc,
                               formatter_class=CustomFormatter)

    parser.add_argument(
        '--epochs', type=int, default=5,
        help='Number of epochs to run training for. Default: %(default)s')

    parser.add_argument(
        '--nranks_per_gpu', type=int, default=1,
        help='S|Number of ranks to run on each GPU. Use this parameter to\n'
        'oversubscribe a GPU. When oversubscribing a GPU use in combination\n'
        'with MPS (multi-process service). Default: %(default)s')

    parser.add_argument(
        '--speedup', action='store', nargs='?', type=str.lower,
        const=SpeedupOpts.epoch, default=SpeedupOpts.epoch,
        choices=[SpeedupOpts.epoch, SpeedupOpts.imgspersec],
        help='S|Speedup the fit to run each epoch faster or overall images\n'
        'per second faster. With the "imgspersec" option each rank\n'
        'fits the full set; with the epoch option, only a subset. Loss\n'
        'convergence stability might be better with the imgspersec option.\n'
        'Default: %(default)s')

    args = parser.parse_args()

    return args 
Example #26
Source File: graph_transform.py    From parallax with Apache License 2.0
def graph_transform_mpi(single_gpu_meta_graph_def, config,
                        op_library_path=None):
    if op_library_path is not None:
        tf.load_op_library(op_library_path)

    with tf.Graph().as_default() as replica:
        tf.train.import_meta_graph(single_gpu_meta_graph_def)

        tensor_or_op_name_to_replica_names = {}
        for op in replica.get_operations():
            tensor_or_op_name_to_replica_names[op.name] = [op.name]
            for output in op.outputs:
                tensor_or_op_name_to_replica_names[output.name] = [output.name]

        # Initialize horovod
        hvd.init()

        num_workers = hvd.size()
        worker_id = hvd.rank()
        update_shard_values_for_worker(num_workers, worker_id)

        op_to_control_consumer_ops = get_all_control_consumers(replica)
        trainable_variable_ops = [var.op for var in tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES)]

        for gradients_info in tf.get_collection(tf.GraphKeys.GRADIENTS_INFO):
            target_tensor = gradients_info._target
            if target_tensor.op not in trainable_variable_ops:
                parallax_log.debug(
                    "Gradient for non-trainable variable %s is created, ignore"
                    % target_tensor.op.name)
                continue

            _add_aggregation_ops(gradients_info, op_to_control_consumer_ops, config)
        _add_broadcast_ops()

    return tf.train.export_meta_graph(graph=replica), \
           tensor_or_op_name_to_replica_names 
Example #27
Source File: eval.py    From tensorpack with Apache License 2.0
def _eval(self):
        logdir = self._output_dir
        if cfg.TRAINER == 'replicated':
            all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
        else:
            filenames = [os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
            ) for rank in range(hvd.local_size())]

            if self._horovod_run_eval:
                local_results = predict_dataflow(self.dataflow, self.predictor)
                fname = filenames[hvd.local_rank()]
                with open(fname, 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            all_results = []
            for fname in filenames:
                with open(fname, 'r') as f:
                    obj = json.load(f)
                all_results.extend(obj)
                os.unlink(fname)

        scores = DatasetRegistry.get(self._eval_dataset).eval_inference_results(all_results)
        for k, v in scores.items():
            self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v) 
Example #28
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_rank():
    if defaults.DISTRIBUTED:
        try:
            return hvd.rank()
        except:
            return 0
    else:
        return 0


# Data processing
############################################################################### 
Example #29
Source File: trainers.py    From tensorpack with Apache License 2.0
def initialize(self, session_creator, session_init):
        # broadcast_op should be the last setup_graph: it needs to be created
        # "right before" the graph is finalized,
        # because it needs to capture all the variables (which may be created by callbacks).
        self._broadcast_op = self.hvd.broadcast_global_variables(0)

        # it's important that our NewSessionCreator does not finalize the graph
        if not isinstance(session_creator, NewSessionCreator):
            raise ValueError(
                "session_creator has to be `NewSessionCreator` for horovod/byteps training! ")
        # NOTE It will fail if GPU was already detected before initializing the session
        # https://github.com/tensorflow/tensorflow/issues/8136
        session_creator.config.gpu_options.visible_device_list = str(self._local_rank)
        try:
            session_creator.config.inter_op_parallelism_threads = mp.cpu_count() // self.hvd.local_size()
        except AttributeError:  # old horovod does not have local_size
            pass
        super(HorovodTrainer, self).initialize(session_creator, session_init)

        # This broadcast belongs to the "initialize" stage.
        # It should not be delayed to the "before_train" stage.
        # TODO:
        # 1. an allgather helper to concat strings
        # 2. check variables on each rank match each other, print warnings, and broadcast the common set.
        if self.is_chief:
            logger.info("Broadcasting initialized variables ...")
        else:
            logger.info("Rank {} waiting for initialization broadcasting ...".format(self._rank))
        self.sess.run(self._broadcast_op) 
Example #30
Source File: trainers.py    From ADL with MIT License
def __init__(self, average=True, compression=None):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
            compression: `hvd.Compression.fp16` or `hvd.Compression.none`
        """
        if 'pyarrow' in sys.modules:
            logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                        "Uninstall pyarrow and use msgpack instead.")
        # lazy import
        import horovod.tensorflow as hvd
        import horovod
        hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
        self.hvd = hvd

        hvd.init()
        self.is_chief = hvd.rank() == 0
        self._local_rank = hvd.local_rank()
        self._rank = hvd.rank()
        self._average = average
        self._compression = compression
        self._has_compression = hvd_version >= (0, 15, 0)
        logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
        super(HorovodTrainer, self).__init__()

        self.BROADCAST_EVERY_EPOCH = True