Python horovod.tensorflow.BroadcastGlobalVariablesHook() Examples
The following are 8 code examples of horovod.tensorflow.BroadcastGlobalVariablesHook().
Each example lists its original project, source file, and license. You may also want to check out the other available functions and classes of the horovod.tensorflow module.
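All of the examples below follow the same basic pattern, so here is a minimal sketch of it up front: hvd.init() runs first, each process pins one GPU via its local rank, the optimizer is wrapped in hvd.DistributedOptimizer, and BroadcastGlobalVariablesHook(0) copies rank 0's initial variable values to every other worker before the first step. The toy variable, loss, and learning rate below are illustrative placeholders, not code from any of the projects listed.

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # must be called before any other Horovod function

# Toy variable and loss; a real model graph would go here.
w = tf.get_variable("w", shape=[10])
loss = tf.reduce_mean(tf.square(w))

# Scale the learning rate by the number of workers and wrap the optimizer.
opt = hvd.DistributedOptimizer(tf.train.GradientDescentOptimizer(0.01 * hvd.size()))
train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())

# Broadcast rank 0's initial variable values to every other worker.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]

# Pin each process to a single GPU.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as sess:
    for _ in range(100):
        sess.run(train_op)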
Example #1
Source File: classifier.py From tensorflow_fasttext with MIT License
def FastTrain():
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_input = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    # Broadcast rank 0's initial variables to all workers when running with Horovod.
    hooks = None
    if FLAGS.horovod:
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=train_input, steps=FLAGS.train_steps, hooks=hooks)
    print("TRAIN COMPLETE")
    # Evaluate and export only on rank 0 (or when not running distributed).
    if not FLAGS.horovod or hvd.rank() == 0:
        print("EVALUATE")
        eval_input = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        # eval_metrics = {"accuracy": tf.metrics.accuracy(labels, predictions)}
        result = estimator.evaluate(input_fn=eval_input, steps=FLAGS.eval_steps, hooks=None)
        print(result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            estimator.export_savedmodel(FLAGS.export_dir, inputs.ServingInputFn(FLAGS.use_ngrams))
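Example #1 only builds the hook; hvd.init() and GPU pinning happen elsewhere in the project when FLAGS.horovod is set. A hedged sketch of what that setup typically looks like, with illustrative names that are not taken from tensorflow_fasttext:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # must run before hvd.rank()/hvd.local_rank() are used above

# Pin each process to a single GPU and hand the config to the estimator.
session_config = tf.ConfigProto()
session_config.gpu_options.visible_device_list = str(hvd.local_rank())
run_config = tf.estimator.RunConfig(session_config=session_config)
# FastTextEstimator would then be constructed with a RunConfig like this, so the
# BroadcastGlobalVariablesHook(0) passed in FastTrain() syncs one process per GPU.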
Example #2
Source File: collective_all_reduce_example.py From tf-yarn with Apache License 2.0
def experiment_fn() -> Experiment:
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def train_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
        return dataset.shuffle(1000).batch(128).repeat()

    def eval_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        return dataset.shuffle(1000).batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    )
Example #3
Source File: train_model.py From DistributedDeepLearning with MIT License
def _get_hooks(is_distributed=DISTRIBUTED):
    logger = logging.getLogger(__name__)
    if is_distributed:
        # Hook that broadcasts rank 0's initial variables to every worker.
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
        return [bcast_hook]
    else:
        return []
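A hedged sketch of how the returned hooks would typically be consumed; the estimator and input function below are generic placeholders, not code from DistributedDeepLearning:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Generic placeholder estimator and input_fn for illustration only.
feature_col = tf.feature_column.numeric_column("x")
estimator = tf.estimator.LinearClassifier(feature_columns=[feature_col])

def train_input_fn():
    features = {"x": [[0.0], [1.0]]}
    labels = [0, 1]
    return tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(2)

# _get_hooks adds BroadcastGlobalVariablesHook(0), so every worker starts
# from rank 0's initial weights.
estimator.train(input_fn=train_input_fn, max_steps=10, hooks=_get_hooks(is_distributed=True))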
Example #4
Source File: resnet_main.py From DistributedDeepLearning with MIT License
def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
    logger = logging.getLogger(__name__)
    if is_distributed:
        exps_hook = ExamplesPerSecondHook(batch_size * hvd.size())
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.rank(), hvd.size()))
        return [bcast_hook, exps_hook]
    else:
        exps_hook = ExamplesPerSecondHook(batch_size)
        return [exps_hook]
Example #5
Source File: tf_distributed_optimizer.py From deep500 with BSD 3-Clause "New" or "Revised" License
def as_operator(self):
    try:
        import horovod.tensorflow as hvd
    except ImportError:
        raise ImportError('Cannot import Horovod')

    # Pin this process to its local GPU and broadcast initial variables from rank 0.
    self.network.session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    self.network.add_hooks(hooks)
    return self.op.minimize(self.network.fetch_internal_tensor(self.loss))
Example #6
Source File: distributed_optimizer.py From BERT with Apache License 2.0
def get_opt(self, init_lr, num_train_steps, **kargs):
    learning_rate = init_lr
    if self.config.get("decay", "no") == "decay":
        print("==apply lr decay==")
        learning_rate = self.lr_decay_fn(learning_rate, num_train_steps, **kargs)
    if self.config.get("warmup", "no") == "warmup":
        print("==apply warmup==")
        learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
    else:
        learning_rate = tf.cast(tf.constant(learning_rate), tf.float32)
    self.learning_rate = learning_rate  # * (self.config.get('gpu_count', 1) / 2)
    # self.learning_rate = learning_rate / np.sqrt(self.config.get('gpu_count', 1) / 2)
    # self.learning_rate = learning_rate * np.sqrt(self.config.get('gpu_count', 1)) * 2
    self.single_node_learning = learning_rate
    # add uber horovod distributed optimizer
    if hvd and self.config["opt_type"] == "hvd":
        print("==optimizer hvd size=={}".format(self.config.get("worker_count", hvd.size())))
        opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", hvd.size()), **kargs)
        self.opt = hvd.DistributedOptimizer(opt)
        self.distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    # add pai soar distributed optimizer
    elif pai and self.config["opt_type"] == "pai_soar":
        print("==optimizer pai_soar size=={}".format(self.config.get("worker_count", 4)))
        opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", 4), **kargs)
        self.opt = pai.ReplicatedVarsOptimizer(opt, clip_norm=self.config.get("clip_norm", 1.0))
        self.distributed_hooks = []
    # add tensorflow ps sync distributed optimizer
    elif self.config["opt_type"] == "ps_sync":
        print("==optimizer ps_sync size=={}".format(self.config.get("worker_count", 4)))
        opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", 4), **kargs)
        self.opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=self.config.get("worker_count", 4),
            total_num_replicas=self.config.get("worker_count", 4))
        self.distributed_hooks = [self.opt.make_session_run_hook(self.config["is_chief"], num_tokens=0)]
    elif self.config["opt_type"] == "ps":
        print("==optimizer ps_async size=={}".format(self.config.get("worker_count", 4)))
        self.opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", 4), **kargs)
    else:
        print("==initialization of single node optimizer==")
        self.opt = self.optimizer_op(self.learning_rate, **kargs)
        self.distributed_hooks = []
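The hvd branch above is the one relevant to this page. A minimal, self-contained sketch of just that branch, with an illustrative base learning rate and optimizer in place of self.optimizer_op and the config lookups:

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

base_lr = 1e-4                              # illustrative value
scaled_lr = base_lr * hvd.size()            # scale the learning rate by worker count
opt = hvd.DistributedOptimizer(tf.train.AdamOptimizer(scaled_lr))
distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
# opt and distributed_hooks would then be consumed the same way self.opt and
# self.distributed_hooks are used in the rest of this project.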
Example #7
Source File: tensorflow_mnist.py From training_results_v0.6 with Apache License 2.0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
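A usage note: this script is meant to be started with one process per GPU. With recent Horovod releases that is typically horovodrun -np 4 python tensorflow_mnist.py (or an equivalent mpirun command); each process then pins its own GPU through visible_device_list, and only rank 0 writes checkpoints.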
Example #8
Source File: tensorflow_mnist_estimator.py From training_results_v0.6 with Apache License 2.0
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)