Python horovod.tensorflow.DistributedOptimizer() Examples

The following are 10 code examples of horovod.tensorflow.DistributedOptimizer(), drawn from open-source projects. Each example notes the project and source file it comes from.
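All of the examples below share the same basic pattern: initialize Horovod, scale the learning rate by hvd.size(), wrap the base optimizer in hvd.DistributedOptimizer(), and broadcast the initial variable states from rank 0. The snippet below is a minimal illustrative sketch of that pattern, assuming TensorFlow 1.x graph mode and a scalar loss tensor defined elsewhere; it is not taken from any of the projects listed here.

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one Horovod process per GPU

# Scale the learning rate by the number of workers, as most examples below do.
base_opt = tf.train.AdamOptimizer(0.001 * hvd.size())

# Wrap the base optimizer; gradients are averaged across workers via allreduce.
opt = hvd.DistributedOptimizer(base_opt)

# "loss" is assumed to be a scalar loss tensor built by the model.
train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())

# Broadcast initial variable states from rank 0 so all workers start identically.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]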
Example #1
Source File: horovod_optimizer.py    From rlgraph with Apache License 2.0
def __init__(self, local_optimizer=None, **kwargs):
        """
        Initializes a distributed Horovod optimizer by wrapping a local optimizer.

        Args:
            local_optimizer (Optional[dict,LocalOptimizer]): The spec-dict for the wrapped LocalOptimizer object or
                a LocalOptimizer object itself.
        """
        super(HorovodOptimizer, self).__init__(**kwargs)

        # Create the horovod wrapper.
        wrapped_local_optimizer = Optimizer.from_spec(local_optimizer)
        self.local_optimizer = hvd.DistributedOptimizer(wrapped_local_optimizer)

        @rlgraph_api
        def step(self, variables, loss, time_percentage, *inputs):
            grads_and_vars = self._graph_fn_calculate_gradients(variables, loss, time_percentage, *inputs)
            return self._graph_fn_apply_gradients(grads_and_vars) 
Example #2
Source File: hvd_distributed_optimizer.py    From BERT with Apache License 2.0
def get_train_op(self, loss, tvars, init_lr, 
							num_train_steps, **kargs):
		learning_rate = self.lr_decay_fn(init_lr, num_train_steps, **kargs)
		learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		print("==optimizer hvd size=={}".format(hvd.size()))
		opt = self.optimizer_op(learning_rate*hvd.size(), **kargs)

		# Add the Uber Horovod distributed optimizer wrapper.
		self.opt = hvd.DistributedOptimizer(opt)
		grads = self.grad_clip_fn(self.opt, loss, tvars, **kargs)

		# self.grad_summaries_merged = optimizer_utils.add_grad_summaries(
		# 						zip(grads, tvars))

		train_op = self.opt.apply_gradients(
					zip(grads, tvars), global_step=self.global_step)
		new_global_step = self.global_step + 1
		train_op = tf.group(train_op, [self.global_step.assign(new_global_step)])
		return train_op 
Example #3
Source File: tf_distributed_optimizer.py    From deep500 with BSD 3-Clause "New" or "Revised" License
def __init__(self, optimizer: TFOptimizer, comm=None):
        super().__init__(optimizer.executor, optimizer.loss)

        try:
            import horovod.tensorflow as hvd
        except ImportError:
            raise ImportError('Cannot import Horovod')
            
        hvd.init()
        self.op = hvd.DistributedOptimizer(optimizer.op)


        if comm is None:
            comm = CommunicationNetwork()
        self.communication = comm
        self.original_optimizer = optimizer 
Example #4
Source File: collective_all_reduce_example.py    From tf-yarn with Apache License 2.0
def experiment_fn() -> Experiment:
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def train_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
        return dataset.shuffle(1000).batch(128).repeat()

    def eval_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        return dataset.shuffle(1000).batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    ) 
Example #5
Source File: hvd_distributed_optimizer.py    From BERT with Apache License 2.0
def get_opt(self, init_lr, 
				num_train_steps, **kargs):
		learning_rate = self.lr_decay_fn(init_lr, num_train_steps, **kargs)
		learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		print("==optimizer hvd size=={}".format(hvd.size()))
		opt = self.optimizer_op(learning_rate*hvd.size(), **kargs)

		# Add the Uber Horovod distributed optimizer wrapper.
		self.opt = hvd.DistributedOptimizer(opt) 
Example #6
Source File: multi_gpu_wrapper.py    From tf-hrnet with BSD 3-Clause "New" or "Revised" License
def DistributedOptimizer(cls, *args):
    """Get a distributed optimizer from the base optimizer."""

    try:
      return mgw.DistributedOptimizer(*args)
    except NameError:
      raise NameError('module <mgw> not imported') 
Example #7
Source File: train_model.py    From DistributedDeepLearning with MIT License
def _get_optimizer(params, is_distributed=DISTRIBUTED):
    if is_distributed:
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(
                learning_rate=params["learning_rate"] * hvd.size(),
                momentum=params["momentum"],
            )
        )
    else:
        return tf.train.MomentumOptimizer(
            learning_rate=params["learning_rate"], momentum=params["momentum"]
        ) 
Example #8
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_optimizer(params, is_distributed=defaults.DISTRIBUTED):
    if is_distributed:
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(
                learning_rate=params["learning_rate"] * hvd.size(),
                momentum=params["momentum"],
            )
        )
    else:
        return tf.train.MomentumOptimizer(
            learning_rate=params["learning_rate"], momentum=params["momentum"]
        ) 
Example #9
Source File: distributed_optimizer.py    From BERT with Apache License 2.0
def get_opt(self, init_lr, 
				num_train_steps, **kargs):

		learning_rate = init_lr
		if self.config.get("decay", "no") == "decay":
			print("==apply lr decay==")
			learning_rate = self.lr_decay_fn(learning_rate, num_train_steps, **kargs)
		if self.config.get("warmup", "no") == "warmup":
			print("==apply warmup==")
			learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		else:
			learning_rate = tf.cast(tf.constant(learning_rate), tf.float32)
		self.learning_rate = learning_rate #* (self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate / np.sqrt(self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate * np.sqrt(self.config.get('gpu_count', 1)) * 2
		self.single_node_learning = learning_rate
		
		# Add the Uber Horovod distributed optimizer wrapper.
		if hvd and self.config["opt_type"] == "hvd":
			print("==optimizer hvd size=={}".format(self.config.get("worker_count", hvd.size())))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", hvd.size()), **kargs)
			self.opt = hvd.DistributedOptimizer(opt)
			self.distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
		# add pai soar distributed optimizer
		elif pai and self.config["opt_type"] == "pai_soar":
			print("==optimizer pai_soar size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = pai.ReplicatedVarsOptimizer(opt, clip_norm=self.config.get("clip_norm", 1.0))
			self.distributed_hooks = []
		# add tensorflow ps sync distributed optimizer
		elif self.config["opt_type"] == "ps_sync":
			print("==optimizer ps_sync size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = tf.train.SyncReplicasOptimizer(opt, 
											replicas_to_aggregate=self.config.get("worker_count", 4), 
											total_num_replicas=self.config.get("worker_count", 4))
			self.distributed_hooks = [self.opt.make_session_run_hook(self.config["is_chief"], num_tokens=0)]
		elif self.config["opt_type"] == "ps":
			print("==optimizer ps_async size=={}".format(self.config.get("worker_count", 4)))
			self.opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
		else:
			print("==initialization of single node optimizer==")
			self.opt = self.optimizer_op(self.learning_rate, **kargs)
			self.distributed_hooks = [] 
Example #10
Source File: tensorflow_mnist.py    From training_results_v0.6 with Apache License 2.0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})