Python horovod.tensorflow.BroadcastGlobalVariablesHook() Examples

The following are 8 code examples of horovod.tensorflow.BroadcastGlobalVariablesHook(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the horovod.tensorflow module.
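Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical pattern: initialize Horovod, wrap the optimizer with hvd.DistributedOptimizer, and pass hvd.BroadcastGlobalVariablesHook(0) so that rank 0's initial variable values are broadcast to every other worker before training starts. The toy model and step counts are illustrative only.

# Minimal usage sketch (assumes TensorFlow 1.x graph mode via tf.compat.v1 and an installed Horovod build).
import tensorflow.compat.v1 as tf
import horovod.tensorflow as hvd

tf.disable_eager_execution()
hvd.init()

# Toy linear model on random data, just to have variables and a train_op.
x = tf.random.normal([32, 10])
y = tf.random.normal([32, 1])
w = tf.get_variable("w", [10, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

# Scale the learning rate by the number of workers and wrap the optimizer.
opt = hvd.DistributedOptimizer(tf.train.GradientDescentOptimizer(0.01 * hvd.size()))
train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())

hooks = [
    # Broadcast initial variable states from rank 0 to all other processes.
    hvd.BroadcastGlobalVariablesHook(0),
    # Divide the total step budget across workers.
    tf.train.StopAtStepHook(last_step=100 // hvd.size()),
]

# Only rank 0 writes checkpoints, so workers do not overwrite each other.
checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)

A script like this would typically be launched with something like horovodrun -np 4 python train.py, one process per GPU.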
Example #1
Source File: classifier.py    From tensorflow_fasttext with MIT License
def FastTrain():
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_input = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    hooks = None
    if FLAGS.horovod:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=train_input, steps=FLAGS.train_steps, hooks=hooks)
    print("TRAIN COMPLETE")
    if not FLAGS.horovod or hvd.rank() == 0:
        print("EVALUATE")
        eval_input = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        #eval_metrics = { "accuracy": tf.metrics.accuracy(labels, predictions) }
        result = estimator.evaluate(input_fn=eval_input, steps=FLAGS.eval_steps, hooks=None)
        print(result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            estimator.export_savedmodel(FLAGS.export_dir,
                                        inputs.ServingInputFn(FLAGS.use_ngrams)) 
Example #2
Source File: collective_all_reduce_example.py    From tf-yarn with Apache License 2.0
def experiment_fn() -> Experiment:
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def train_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
        return dataset.shuffle(1000).batch(128).repeat()

    def eval_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        return dataset.shuffle(1000).batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    ) 
Example #3
Source File: train_model.py    From DistributedDeepLearning with MIT License
def _get_hooks(is_distributed=DISTRIBUTED):
    logger = logging.getLogger(__name__)
    if is_distributed:
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
        return [bcast_hook]
    else:
        return [] 
Example #4
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
    logger = logging.getLogger(__name__)

    if is_distributed:
        exps_hook = ExamplesPerSecondHook(batch_size * hvd.size())
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.rank(), hvd.size()))
        return [bcast_hook, exps_hook]
    else:
        exps_hook = ExamplesPerSecondHook(batch_size)
        return [exps_hook] 
Example #5
Source File: tf_distributed_optimizer.py    From deep500 with BSD 3-Clause "New" or "Revised" License
def as_operator(self):
    try:
        import horovod.tensorflow as hvd
    except ImportError:
        raise ImportError('Cannot import Horovod')

    # Pin this process to its local GPU and broadcast initial variables from rank 0.
    self.network.session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    self.network.add_hooks(hooks)
    return self.op.minimize(self.network.fetch_internal_tensor(self.loss))
Example #6
Source File: distributed_optimizer.py    From BERT with Apache License 2.0
def get_opt(self, init_lr, 
				num_train_steps, **kargs):

		learning_rate = init_lr
		if self.config.get("decay", "no") == "decay":
			print("==apply lr decay==")
			learning_rate = self.lr_decay_fn(learning_rate, num_train_steps, **kargs)
		if self.config.get("warmup", "no") == "warmup":
			print("==apply warmup==")
			learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		else:
			learning_rate = tf.cast(tf.constant(learning_rate), tf.float32)
		self.learning_rate = learning_rate #* (self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate / np.sqrt(self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate * np.sqrt(self.config.get('gpu_count', 1)) * 2
		self.single_node_learning = learning_rate
		
		# add Uber Horovod distributed optimizer
		if hvd and self.config["opt_type"] == "hvd":
			print("==optimizer hvd size=={}".format(self.config.get("worker_count", hvd.size())))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", hvd.size()), **kargs)
			self.opt = hvd.DistributedOptimizer(opt)
			self.distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
		# add pai soar distributed optimizer
		elif pai and self.config["opt_type"] == "pai_soar":
			print("==optimizer pai_soar size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = pai.ReplicatedVarsOptimizer(opt, clip_norm=self.config.get("clip_norm", 1.0))
			self.distributed_hooks = []
		# add tensorflow ps sync distributed optimizer
		elif self.config["opt_type"] == "ps_sync":
			print("==optimizer ps_sync size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = tf.train.SyncReplicasOptimizer(opt, 
											replicas_to_aggregate=self.config.get("worker_count", 4), 
											total_num_replicas=self.config.get("worker_count", 4))
			self.distributed_hooks = [self.opt.make_session_run_hook(self.config["is_chief"], num_tokens=0)]
		elif self.config["opt_type"] == "ps":
			print("==optimizer ps_async size=={}".format(self.config.get("worker_count", 4)))
			self.opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
		else:
			print("==initialization of single node optimizer==")
			self.opt = self.optimizer_op(self.learning_rate, **kargs)
			self.distributed_hooks = [] 
Example #7
Source File: tensorflow_mnist.py    From training_results_v0.6 with Apache License 2.0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_}) 
Example #8
Source File: tensorflow_mnist_estimator.py    From training_results_v0.6 with Apache License 2.0
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)