Python horovod.tensorflow.DistributedOptimizer() Examples
The following are 10 code examples of horovod.tensorflow.DistributedOptimizer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module horovod.tensorflow, or try the search function.
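Most of the examples below follow the same basic recipe from the Horovod documentation: initialize Horovod, scale the learning rate by the number of workers, wrap a regular TensorFlow optimizer in hvd.DistributedOptimizer() so that gradients are averaged across workers before being applied, and broadcast the initial variables from rank 0. A minimal sketch of that recipe, assuming TensorFlow 1.x graph mode and using a toy loss in place of a real model:

import tensorflow as tf
import horovod.tensorflow as hvd

# Initialize Horovod (one process per GPU).
hvd.init()

# Toy scalar "model" so the sketch stays self-contained.
w = tf.Variable(1.0)
loss = tf.square(w - 3.0)

# Scale the learning rate by the number of workers, then wrap the optimizer so
# gradients are averaged with allreduce before being applied.
opt = tf.train.GradientDescentOptimizer(0.01 * hvd.size())
opt = hvd.DistributedOptimizer(opt)
train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())

# Broadcast initial variable states from rank 0 so every worker starts identically.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]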
Example #1
Source File: horovod_optimizer.py From rlgraph with Apache License 2.0 | 6 votes

def __init__(self, local_optimizer=None, **kwargs):
    """
    Initializes a distributed horovod optimizer by wrapping a local optimizer.

    Args:
        local_optimizer (Optional[dict,LocalOptimizer]): The spec-dict for the wrapped
            LocalOptimizer object or a LocalOptimizer object itself.
    """
    super(HorovodOptimizer, self).__init__(**kwargs)
    # Create the horovod wrapper.
    wrapped_local_optimizer = Optimizer.from_spec(local_optimizer)
    self.local_optimizer = hvd.DistributedOptimizer(wrapped_local_optimizer)

    @rlgraph_api
    def step(self, variables, loss, time_percentage, *inputs):
        grads_and_vars = self._graph_fn_calculate_gradients(variables, loss, time_percentage, *inputs)
        return self._graph_fn_apply_gradients(grads_and_vars)
Example #2
Source File: hvd_distributed_optimizer.py From BERT with Apache License 2.0 | 6 votes

def get_train_op(self, loss, tvars, init_lr, num_train_steps, **kargs):
    learning_rate = self.lr_decay_fn(init_lr, num_train_steps, **kargs)
    learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
    print("==optimizer hvd size=={}".format(hvd.size()))
    opt = self.optimizer_op(learning_rate * hvd.size(), **kargs)
    # add Uber's Horovod distributed optimizer
    self.opt = hvd.DistributedOptimizer(opt)
    grads = self.grad_clip_fn(self.opt, loss, tvars, **kargs)
    # self.grad_summaries_merged = optimizer_utils.add_grad_summaries(
    #     zip(grads, tvars))
    train_op = self.opt.apply_gradients(
        zip(grads, tvars), global_step=self.global_step)
    new_global_step = self.global_step + 1
    train_op = tf.group(train_op, [self.global_step.assign(new_global_step)])
    return train_op
Example #3
Source File: tf_distributed_optimizer.py From deep500 with BSD 3-Clause "New" or "Revised" License | 6 votes

def __init__(self, optimizer: TFOptimizer, comm=None):
    super().__init__(optimizer.executor, optimizer.loss)
    try:
        import horovod.tensorflow as hvd
    except ImportError:
        raise ImportError('Cannot import Horovod')

    hvd.init()
    self.op = hvd.DistributedOptimizer(optimizer.op)
    if comm is None:
        comm = CommunicationNetwork()
    self.communication = comm
    self.original_optimizer = optimizer
Example #4
Source File: collective_all_reduce_example.py From tf-yarn with Apache License 2.0 | 5 votes

def experiment_fn() -> Experiment:
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def train_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
        return dataset.shuffle(1000).batch(128).repeat()

    def eval_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        return dataset.shuffle(1000).batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    )
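Two details in this example are worth noting: the optimizer is passed to LinearClassifier as a lambda, presumably so that hvd.DistributedOptimizer is constructed lazily inside each worker's graph rather than when the experiment is defined, and hvd.BroadcastGlobalVariablesHook(0) is attached to the TrainSpec so that every worker starts training from rank 0's initial variable values.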
Example #5
Source File: hvd_distributed_optimizer.py From BERT with Apache License 2.0 | 5 votes

def get_opt(self, init_lr, num_train_steps, **kargs):
    learning_rate = self.lr_decay_fn(init_lr, num_train_steps, **kargs)
    learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
    print("==optimizer hvd size=={}".format(hvd.size()))
    opt = self.optimizer_op(learning_rate * hvd.size(), **kargs)
    # add Uber's Horovod distributed optimizer
    self.opt = hvd.DistributedOptimizer(opt)
Example #6
Source File: multi_gpu_wrapper.py From tf-hrnet with BSD 3-Clause "New" or "Revised" License | 5 votes

def DistributedOptimizer(cls, *args):
    """Get a distributed optimizer from the base optimizer."""
    try:
        return mgw.DistributedOptimizer(*args)
    except NameError:
        raise NameError('module <mgw> not imported')
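The cls parameter indicates this is a method on a wrapper class that aliases Horovod as mgw. A sketch of the import guard it presumably relies on (the guard itself is an assumption; only the mgw alias appears in the snippet): if the import fails, the name mgw is never bound and the call above raises NameError.

try:
    # Bind Horovod under the alias the wrapper expects; leave the name unbound
    # on failure so callers hit the NameError branch above.
    import horovod.tensorflow as mgw
except ImportError:
    pass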
Example #7
Source File: train_model.py From DistributedDeepLearning with MIT License | 5 votes

def _get_optimizer(params, is_distributed=DISTRIBUTED):
    if is_distributed:
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(
                learning_rate=params["learning_rate"] * hvd.size(),
                momentum=params["momentum"],
            )
        )
    else:
        return tf.train.MomentumOptimizer(
            learning_rate=params["learning_rate"], momentum=params["momentum"]
        )
Example #8
Source File: resnet_main.py From DistributedDeepLearning with MIT License | 5 votes

def _get_optimizer(params, is_distributed=defaults.DISTRIBUTED):
    if is_distributed:
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(
                learning_rate=params["learning_rate"] * hvd.size(),
                momentum=params["momentum"],
            )
        )
    else:
        return tf.train.MomentumOptimizer(
            learning_rate=params["learning_rate"], momentum=params["momentum"]
        )
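A hypothetical call site for the _get_optimizer helper shown in Examples #7 and #8 (the params keys match the ones the function reads; the toy loss and everything else is illustrative):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Toy scalar loss so the sketch stays self-contained.
w = tf.Variable(1.0)
loss = tf.square(w - 3.0)

params = {"learning_rate": 0.01, "momentum": 0.9}
optimizer = _get_optimizer(params, is_distributed=True)
train_op = optimizer.minimize(loss, global_step=tf.train.get_or_create_global_step())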
Example #9
Source File: distributed_optimizer.py From BERT with Apache License 2.0 | 4 votes

def get_opt(self, init_lr, num_train_steps, **kargs):
    learning_rate = init_lr
    if self.config.get("decay", "no") == "decay":
        print("==apply lr decay==")
        learning_rate = self.lr_decay_fn(learning_rate, num_train_steps, **kargs)
    if self.config.get("warmup", "no") == "warmup":
        print("==apply warmup==")
        learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
    else:
        learning_rate = tf.cast(tf.constant(learning_rate), tf.float32)
    self.learning_rate = learning_rate  # * (self.config.get('gpu_count', 1) / 2)
    # self.learning_rate = learning_rate / np.sqrt(self.config.get('gpu_count', 1) / 2)
    # self.learning_rate = learning_rate * np.sqrt(self.config.get('gpu_count', 1)) * 2
    self.single_node_learning = learning_rate

    # add Uber's Horovod distributed optimizer
    if hvd and self.config["opt_type"] == "hvd":
        print("==optimizer hvd size=={}".format(self.config.get("worker_count", hvd.size())))
        opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", hvd.size()), **kargs)
        self.opt = hvd.DistributedOptimizer(opt)
        self.distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    # add pai soar distributed optimizer
    elif pai and self.config["opt_type"] == "pai_soar":
        print("==optimizer pai_soar size=={}".format(self.config.get("worker_count", 4)))
        opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", 4), **kargs)
        self.opt = pai.ReplicatedVarsOptimizer(opt, clip_norm=self.config.get("clip_norm", 1.0))
        self.distributed_hooks = []
    # add tensorflow ps sync distributed optimizer
    elif self.config["opt_type"] == "ps_sync":
        print("==optimizer ps_sync size=={}".format(self.config.get("worker_count", 4)))
        opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", 4), **kargs)
        self.opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=self.config.get("worker_count", 4),
            total_num_replicas=self.config.get("worker_count", 4))
        self.distributed_hooks = [self.opt.make_session_run_hook(self.config["is_chief"], num_tokens=0)]
    elif self.config["opt_type"] == "ps":
        print("==optimizer ps_async size=={}".format(self.config.get("worker_count", 4)))
        self.opt = self.optimizer_op(self.learning_rate * self.config.get("worker_count", 4), **kargs)
    else:
        print("==initialization of single node optimizer==")
        self.opt = self.optimizer_op(self.learning_rate, **kargs)
        self.distributed_hooks = []
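For reference, an illustrative config for the Horovod branch of get_opt above; the keys are the ones the function reads, and the values are made up for the sketch:

config = {
    "opt_type": "hvd",        # selects hvd.DistributedOptimizer
    "decay": "decay",         # enables self.lr_decay_fn
    "warmup": "warmup",       # enables self.warm_up
    "worker_count": 4,        # overrides hvd.size() when scaling the learning rate
    "is_chief": True,         # only consulted by the ps_sync branch
}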
Example #10
Source File: tensorflow_mnist.py From training_results_v0.6 with Apache License 2.0 | 4 votes

def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
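Scripts structured like this one are typically started with one Horovod process per GPU, for example with horovodrun -np 4 python tensorflow_mnist.py (or an equivalent mpirun command); each process then pins its own GPU through hvd.local_rank() as shown in the session config above.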