Python horovod.tensorflow.DistributedOptimizer() Examples

The following are 10 code examples of horovod.tensorflow.DistributedOptimizer(), drawn from open-source projects. Each example notes the project and source file it comes from.
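All of the examples below share the same basic pattern: initialize Horovod, scale the learning rate by hvd.size(), wrap the base optimizer in hvd.DistributedOptimizer(), and broadcast the initial variable states from rank 0. The snippet below is a minimal illustrative sketch of that pattern, assuming TensorFlow 1.x graph mode and a scalar loss tensor defined elsewhere; it is not taken from any of the projects listed here.

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one Horovod process per GPU

# Scale the learning rate by the number of workers, as most examples below do.
base_opt = tf.train.AdamOptimizer(0.001 * hvd.size())

# Wrap the base optimizer; gradients are averaged across workers via allreduce.
opt = hvd.DistributedOptimizer(base_opt)

# "loss" is assumed to be a scalar loss tensor built by the model.
train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())

# Broadcast initial variable states from rank 0 so all workers start identically.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]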
Example #1
Source File: horovod_optimizer.py    From rlgraph with Apache License 2.0
def __init__(self, local_optimizer=None, **kwargs):
        """
        Initializes a distributed Horovod optimizer by wrapping a local optimizer.

        Args:
            local_optimizer (Optional[dict,LocalOptimizer]): The spec-dict for the wrapped LocalOptimizer object or
                a LocalOptimizer object itself.
        """
        super(HorovodOptimizer, self).__init__(**kwargs)

        # Create the horovod wrapper.
        wrapped_local_optimizer = Optimizer.from_spec(local_optimizer)
        self.local_optimizer = hvd.DistributedOptimizer(wrapped_local_optimizer)

        @rlgraph_api
        def step(self, variables, loss, time_percentage, *inputs):
            grads_and_vars = self._graph_fn_calculate_gradients(variables, loss, time_percentage, *inputs)
            return self._graph_fn_apply_gradients(grads_and_vars) 
Example #2
Source File: hvd_distributed_optimizer.py    From BERT with Apache License 2.0
def get_train_op(self, loss, tvars, init_lr, 
							num_train_steps, **kargs):
		learning_rate = self.lr_decay_fn(init_lr, num_train_steps, **kargs)
		learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		print("==optimizer hvd size=={}".format(hvd.size()))
		opt = self.optimizer_op(learning_rate*hvd.size(), **kargs)

		# Add the Uber Horovod distributed optimizer wrapper.
		self.opt = hvd.DistributedOptimizer(opt)
		grads = self.grad_clip_fn(self.opt, loss, tvars, **kargs)

		# self.grad_summaries_merged = optimizer_utils.add_grad_summaries(
		# 						zip(grads, tvars))

		train_op = self.opt.apply_gradients(
					zip(grads, tvars), global_step=self.global_step)
		new_global_step = self.global_step + 1
		train_op = tf.group(train_op, [self.global_step.assign(new_global_step)])
		return train_op 
Example #3
Source File: tf_distributed_optimizer.py    From deep500 with BSD 3-Clause "New" or "Revised" License
def __init__(self, optimizer: TFOptimizer, comm=None):
        super().__init__(optimizer.executor, optimizer.loss)

        try:
            import horovod.tensorflow as hvd
        except ImportError:
            raise ImportError('Cannot import Horovod')
            
        hvd.init()
        self.op = hvd.DistributedOptimizer(optimizer.op)


        if comm is None:
            comm = CommunicationNetwork()
        self.communication = comm
        self.original_optimizer = optimizer 
Example #4
Source File: collective_all_reduce_example.py    From tf-yarn with Apache License 2.0
def experiment_fn() -> Experiment:
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def train_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
        return dataset.shuffle(1000).batch(128).repeat()

    def eval_input_fn():
        dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        return dataset.shuffle(1000).batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    ) 
Example #5
Source File: hvd_distributed_optimizer.py    From BERT with Apache License 2.0
def get_opt(self, init_lr, 
				num_train_steps, **kargs):
		learning_rate = self.lr_decay_fn(init_lr, num_train_steps, **kargs)
		learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		print("==optimizer hvd size=={}".format(hvd.size()))
		opt = self.optimizer_op(learning_rate*hvd.size(), **kargs)

		# Add the Uber Horovod distributed optimizer wrapper.
		self.opt = hvd.DistributedOptimizer(opt) 
Example #6
Source File: multi_gpu_wrapper.py    From tf-hrnet with BSD 3-Clause "New" or "Revised" License
def DistributedOptimizer(cls, *args):
    """Get a distributed optimizer from the base optimizer."""

    try:
      return mgw.DistributedOptimizer(*args)
    except NameError:
      raise NameError('module <mgw> not imported') 
Example #7
Source File: train_model.py    From DistributedDeepLearning with MIT License
def _get_optimizer(params, is_distributed=DISTRIBUTED):
    if is_distributed:
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(
                learning_rate=params["learning_rate"] * hvd.size(),
                momentum=params["momentum"],
            )
        )
    else:
        return tf.train.MomentumOptimizer(
            learning_rate=params["learning_rate"], momentum=params["momentum"]
        ) 
Example #8
Source File: resnet_main.py    From DistributedDeepLearning with MIT License
def _get_optimizer(params, is_distributed=defaults.DISTRIBUTED):
    if is_distributed:
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(
            tf.train.MomentumOptimizer(
                learning_rate=params["learning_rate"] * hvd.size(),
                momentum=params["momentum"],
            )
        )
    else:
        return tf.train.MomentumOptimizer(
            learning_rate=params["learning_rate"], momentum=params["momentum"]
        ) 
Example #9
Source File: distributed_optimizer.py    From BERT with Apache License 2.0
def get_opt(self, init_lr, 
				num_train_steps, **kargs):

		learning_rate = init_lr
		if self.config.get("decay", "no") == "decay":
			print("==apply lr decay==")
			learning_rate = self.lr_decay_fn(learning_rate, num_train_steps, **kargs)
		if self.config.get("warmup", "no") == "warmup":
			print("==apply warmup==")
			learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
		else:
			learning_rate = tf.cast(tf.constant(learning_rate), tf.float32)
		self.learning_rate = learning_rate #* (self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate / np.sqrt(self.config.get('gpu_count', 1) / 2)
		# self.learning_rate = learning_rate * np.sqrt(self.config.get('gpu_count', 1)) * 2
		self.single_node_learning = learning_rate
		
		# Add the Uber Horovod distributed optimizer wrapper.
		if hvd and self.config["opt_type"] == "hvd":
			print("==optimizer hvd size=={}".format(self.config.get("worker_count", hvd.size())))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", hvd.size()), **kargs)
			self.opt = hvd.DistributedOptimizer(opt)
			self.distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
		# add pai soar distributed optimizer
		elif pai and self.config["opt_type"] == "pai_soar":
			print("==optimizer pai_soar size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = pai.ReplicatedVarsOptimizer(opt, clip_norm=self.config.get("clip_norm", 1.0))
			self.distributed_hooks = []
		# add tensorflow ps sync distributed optimizer
		elif self.config["opt_type"] == "ps_sync":
			print("==optimizer ps_sync size=={}".format(self.config.get("worker_count", 4)))
			opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
			self.opt = tf.train.SyncReplicasOptimizer(opt, 
											replicas_to_aggregate=self.config.get("worker_count", 4), 
											total_num_replicas=self.config.get("worker_count", 4))
			self.distributed_hooks = [self.opt.make_session_run_hook(self.config["is_chief"], num_tokens=0)]
		elif self.config["opt_type"] == "ps":
			print("==optimizer ps_async size=={}".format(self.config.get("worker_count", 4)))
			self.opt = self.optimizer_op(self.learning_rate*self.config.get("worker_count", 4), **kargs)
		else:
			print("==initialization of single node optimizer==")
			self.opt = self.optimizer_op(self.learning_rate, **kargs)
			self.distributed_hooks = [] 
Example #10
Source File: tensorflow_mnist.py    From training_results_v0.6 with Apache License 2.0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})