Python horovod.tensorflow.BroadcastGlobalVariablesHook() Examples

The following are 8 code examples of horovod.tensorflow.BroadcastGlobalVariablesHook(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the horovod.tensorflow module.
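Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical pattern: initialize Horovod, wrap the optimizer with hvd.DistributedOptimizer, and pass hvd.BroadcastGlobalVariablesHook(0) so that rank 0's initial variable values are broadcast to every other worker before training starts. The toy model and step counts are illustrative only.

# Minimal usage sketch (assumes TensorFlow 1.x graph mode via tf.compat.v1 and an installed Horovod build).
import tensorflow.compat.v1 as tf
import horovod.tensorflow as hvd

tf.disable_eager_execution()
hvd.init()

# Toy linear model on random data, just to have variables and a train_op.
x = tf.random.normal([32, 10])
y = tf.random.normal([32, 1])
w = tf.get_variable("w", [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

# Scale the learning rate by the number of workers and wrap the optimizer.
opt = hvd.DistributedOptimizer(tf.train.GradientDescentOptimizer(0.01 * hvd.size()))
train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())

hooks = [
    # Broadcast initial variable states from rank 0 to all other processes.
    hvd.BroadcastGlobalVariablesHook(0),
    # Divide the total step budget across workers.
    tf.train.StopAtStepHook(last_step=100 // hvd.size()),
]

# Only rank 0 writes checkpoints, so workers do not overwrite each other.
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)

A script like this would typically be launched with something like horovodrun -np 4 python train.py, one process per GPU.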
Example #1
Source File: classifier.py    From tensorflow_fasttext with MIT License
def FastTrain():
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_input = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    hooks = None
    if FLAGS.horovod:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=train_input, steps=FLAGS.train_steps, hooks=hooks)
    print("TRAIN COMPLETE")
    if not FLAGS.horovod or hvd.rank() == 0:
        print("EVALUATE")
        eval_input = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        #eval_metrics = { "accuracy": tf.metrics.accuracy(labels, predictions) }
        result = estimator.evaluate(input_fn=eval_input, steps=FLAGS.eval_steps, hooks=None)
        print(result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            estimator.export_savedmodel(FLAGS.export_dir,
                                        inputs.ServingInputFn(FLAGS.use_ngrams)) 
Example #2
Source File: collective_all_reduce_example.py    From tf-yarn with Apache License 2.0
def experiment_fn() -> Experiment:
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def train_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
        return dataset.shuffle(1000).batch(128).repeat()

    def eval_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        return dataset.shuffle(1000).batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    ) 
Example #3
Source File: train_model.py    From DistributedDeepLearning with MIT License
def _get_hooks(is_distributed=DISTRIBUTED):
    logger = logging.getLogger(__name__)
    if is_distributed:
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
        return [bcast_hook]
    else:
        return [] 
Example #4
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
    logger = logging.getLogger(__name__)

    if is_distributed:
        exps_hook = ExamplesPerSecondHook(batch_size * hvd.size())
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.rank(), hvd.size()))
        return [bcast_hook, exps_hook]
    else:
        exps_hook = ExamplesPerSecondHook(batch_size)
        return [exps_hook] 
Example #5
Source File: tf_distributed_optimizer.py    From deep500 with BSD 3-Clause "New" or "Revised" License
def as_operator(self):
    try:
        import horovod.tensorflow as hvd
    except ImportError:
        raise ImportError('Cannot import Horovod')

    # Pin this process to its local GPU and broadcast initial variables from rank 0.
    self.network.session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    self.network.add_hooks(hooks)
    return self.op.minimize(self.network.fetch_internal_tensor(self.loss))
Example #6
Source File: distributed_optimizer.py    From BERT with Apache License 2.0
def get_opt(self, init_lr, 
				num_train_steps, **kargs):

		learning_rate = init_lr
		if self.config.get("decay", "no") == "decay":
			print("==apply lr decay==")
			learning_rate = self.lr_decay_fn(learning_rate, num_train_steps, **kargs)
		if self.config.get("warmup", "no") == "warmup":
			print("==apply warmup==")
			learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		else:
			learning_rate = tf.cast(tf.constant(learning_rate), tf.float32)
		self.learning_rate = learning_rate #* (self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate / np.sqrt(self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate * np.sqrt(self.config.get('gpu_count', 1)) * 2
		self.single_node_learning = learning_rate
		
		# add Uber Horovod distributed optimizer
		if hvd and self.config["opt_type"] == "hvd":
			print("==optimizer hvd size=={}".format(self.config.get("worker_count", hvd.size())))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", hvd.size()), **kargs)
			self.opt = hvd.DistributedOptimizer(opt)
			self.distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
		# add pai soar distributed optimizer
		elif pai and self.config["opt_type"] == "pai_soar":
			print("==optimizer pai_soar size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = pai.ReplicatedVarsOptimizer(opt, clip_norm=self.config.get("clip_norm", 1.0))
			self.distributed_hooks = []
		# add tensorflow ps sync distributed optimizer
		elif self.config["opt_type"] == "ps_sync":
			print("==optimizer ps_sync size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = tf.train.SyncReplicasOptimizer(opt, 
											replicas_to_aggregate=self.config.get("worker_count", 4), 
											total_num_replicas=self.config.get("worker_count", 4))
			self.distributed_hooks = [self.opt.make_session_run_hook(self.config["is_chief"], num_tokens=0)]
		elif self.config["opt_type"] == "ps":
			print("==optimizer ps_async size=={}".format(self.config.get("worker_count", 4)))
			self.opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
		else:
			print("==initialization of single node optimizer==")
			self.opt = self.optimizer_op(self.learning_rate, **kargs)
			self.distributed_hooks = [] 
Example #7
Source File: tensorflow_mnist.py    From training_results_v0.6 with Apache License 2.0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_}) 
Example #8
Source File: tensorflow_mnist_estimator.py    From training_results_v0.6 with Apache License 2.0
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)