Python horovod.tensorflow.local_rank() Examples
The following are 26 code examples of horovod.tensorflow.local_rank(), drawn from open-source projects. The originating project and source file are noted above each example. You may also want to check out all available functions/classes of the module horovod.tensorflow.
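In Horovod, hvd.local_rank() returns the rank of the calling process within its node, as opposed to hvd.rank(), which is unique across all nodes. Its most common use, and the pattern that recurs throughout the examples below, is to pin each process to a single GPU. Here is a minimal sketch of that pattern, assuming TensorFlow 2.x and one process per GPU (the TF1-style examples below achieve the same thing via ConfigProto's visible_device_list):

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()

# Make only the GPU matching this process's local rank visible,
# so each process on the node drives a different device.
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")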
Example #1
Source File: trainers.py From ADL with MIT License

def __init__(self, average=True):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
    """
    import byteps.tensorflow as bps
    self.hvd = bps  # BytePS has the same interface as Horovod
    self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
    assert os.environ.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
    bps.init()
    self.is_chief = bps.rank() == 0

    self._local_rank = bps.local_rank()
    self._rank = bps.rank()
    self._average = average
    self._compression = None
    self._has_compression = False
    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
Example #2
Source File: trainers.py From tensorpack with Apache License 2.0

def __init__(self, average=True):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
    """
    import byteps.tensorflow as bps
    self.hvd = bps  # BytePS has the same interface as Horovod
    self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
    assert os.environ.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
    bps.init()
    self.is_chief = bps.rank() == 0

    self._local_rank = bps.local_rank()
    self._rank = bps.rank()
    self._average = average
    self._compression = None
    self._has_compression = False
    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
Example #3
Source File: trainers.py From tensorpack with Apache License 2.0

def __init__(self, average=True, compression=None):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs.")
    # lazy import
    import horovod.tensorflow as hvd
    import horovod
    hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
    self.hvd = hvd

    hvd.init()
    self.is_chief = hvd.rank() == 0

    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    self._has_compression = hvd_version >= (0, 15, 0)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()

    self.BROADCAST_EVERY_EPOCH = True
Example #4
Source File: eval.py From tensorpack with Apache License 2.0

def _setup_graph(self):
    num_gpu = cfg.TRAIN.NUM_GPUS
    if cfg.TRAINER == 'replicated':
        # TF bug in version 1.11, 1.12: https://github.com/tensorflow/tensorflow/issues/22750
        buggy_tf = get_tf_version_tuple() in [(1, 11), (1, 12)]

        # Use two predictor threads per GPU to get better throughput
        self.num_predictor = num_gpu if buggy_tf else num_gpu * 2
        self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)]
        self.dataflows = [get_eval_dataflow(self._eval_dataset,
                                            shard=k, num_shards=self.num_predictor)
                          for k in range(self.num_predictor)]
    else:
        # Only eval on the first machine,
        # because evaluation assumes that all horovod workers share the filesystem.
        # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
        self._horovod_run_eval = hvd.rank() == hvd.local_rank()
        if self._horovod_run_eval:
            self.predictor = self._build_predictor(0)
            self.dataflow = get_eval_dataflow(self._eval_dataset,
                                              shard=hvd.local_rank(), num_shards=hvd.local_size())
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
Example #5
Source File: horovod_patches.py From cape-document-qa with Apache License 2.0

def _train(model: Model,
           data: TrainingData,
           checkpoint: Union[str, None],
           parameter_checkpoint: Union[str, None],
           save_start: bool,
           train_params: trainer.TrainParams,
           evaluators: List[Evaluator],
           out: ModelDir,
           notes=None,
           dry_run=False,
           start_eval=False):
    print('Horovod size: ', hvd.size())
    print('Horovod rank: ', hvd.rank())
    print('Horovod local rank: ', hvd.local_rank())
    if train_params.async_encoding:
        _train_async(model, data, checkpoint, parameter_checkpoint, save_start,
                     train_params, evaluators, out, notes, dry_run, start_eval)
        return
    else:
        raise NotImplementedError('Synchronous training with Horovod not supported yet')
Example #6
Source File: solver.py From athena with Apache License 2.0

def evaluate(self, dataset, epoch=0):
    """ evaluate the model """
    loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
    loss, metrics = None, None
    evaluate_step = self.evaluate_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        evaluate_step = tf.function(evaluate_step, input_signature=self.sample_signature)
    self.model.reset_metrics()
    for batch, samples in enumerate(dataset):
        samples = self.model.prepare_samples(samples)
        loss, metrics = evaluate_step(samples)
        if batch % self.hparams.log_interval == 0 and hvd.local_rank() == 0:
            logging.info(self.metric_checker(loss, metrics, -2))
        loss_metric.update_state(loss)
    if hvd.local_rank() == 0:
        logging.info(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
    self.model.reset_metrics()
    return loss_metric.result()
Example #7
Source File: solver.py From athena with Apache License 2.0

def train(self, dataset, total_batches=-1):
    """ Update the model in 1 epoch """
    train_step = self.train_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        train_step = tf.function(train_step, input_signature=self.sample_signature)
    for batch, samples in enumerate(dataset.take(total_batches)):
        # train 1 step
        samples = self.model.prepare_samples(samples)
        loss, metrics = train_step(samples)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if batch == 0:
            hvd.broadcast_variables(self.model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

        if batch % self.hparams.log_interval == 0 and hvd.local_rank() == 0:
            logging.info(self.metric_checker(loss, metrics))
            self.model.reset_metrics()
Example #8
Source File: resnet_main.py From DistributedDeepLearning with MIT License

def _get_runconfig(is_distributed=defaults.DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        )
Example #9
Source File: train_model.py From DistributedDeepLearning with MIT License

def _get_runconfig(is_distributed=DISTRIBUTED, save_checkpoints_steps=None):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            session_config=config,
            log_step_count_steps=100,
        )
    else:
        return tf.estimator.RunConfig(
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=None,
            log_step_count_steps=100,
        )
Example #10
Source File: train.py From pix2pix-flow with MIT License

def tensorflow_session():
    # Init session and params
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Pin GPU to local rank (one GPU per process)
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    sess = tf.Session(config=config)
    return sess
Example #11
Source File: train.py From glow with MIT License

def tensorflow_session():
    # Init session and params
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Pin GPU to local rank (one GPU per process)
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    sess = tf.Session(config=config)
    return sess
Example #12
Source File: multi_gpu_wrapper.py From tf-hrnet with BSD 3-Clause "New" or "Revised" License

def local_rank(cls, *args):
    """Get the rank of current worker at the current node."""
    try:
        return mgw.local_rank(*args)
    except NameError:
        raise NameError('module <mgw> not imported')
Example #13
Source File: eval.py From tensorpack with Apache License 2.0

def _eval(self):
    logdir = self._output_dir
    if cfg.TRAINER == 'replicated':
        all_results = multithread_predict_dataflow(self.dataflows, self.predictors)
    else:
        filenames = [os.path.join(
            logdir, 'outputs{}-part{}.json'.format(self.global_step, rank)
        ) for rank in range(hvd.local_size())]

        if self._horovod_run_eval:
            local_results = predict_dataflow(self.dataflow, self.predictor)
            fname = filenames[hvd.local_rank()]
            with open(fname, 'w') as f:
                json.dump(local_results, f)
        self.barrier.eval()
        if hvd.rank() > 0:
            return
        all_results = []
        for fname in filenames:
            with open(fname, 'r') as f:
                obj = json.load(f)
            all_results.extend(obj)
            os.unlink(fname)

    scores = DatasetRegistry.get(self._eval_dataset).eval_inference_results(all_results)
    for k, v in scores.items():
        self.trainer.monitors.put_scalar(self._eval_dataset + '-' + k, v)
Example #14
Source File: solver.py From athena with Apache License 2.0

def initialize_devices(visible_gpu_idx=None):
    """ initialize hvd devices, should be called firstly """
    if visible_gpu_idx is not None:
        warnings.warn("we can not set the visible gpu idx like this")
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
Example #15
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0

def test_horovod_allreduce_gpu(self):
    """Test that the allreduce works on GPUs.

    This test will crash badly if used with an MPI implementation that
    does not support GPU memory transfers directly, as it will call
    MPI_Send on a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()
    with self.test_session(config=self.config) as session:
        dtypes = [tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/gpu:%d" % local_rank):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [tf.int32, tf.int64]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "hvd.allreduce on GPU produces incorrect results")
Example #16
Source File: solver.py From athena with Apache License 2.0

def initialize_devices(visible_gpu_idx=None):
    """ initialize hvd devices, should be called firstly """
    if visible_gpu_idx is not None:
        warnings.warn("we can not set the visible gpu idx like this")
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
Example #17
Source File: tf_distributed_optimizer.py From deep500 with BSD 3-Clause "New" or "Revised" License

def as_operator(self):
    try:
        import horovod.tensorflow as hvd
    except ImportError:
        raise ImportError('Cannot import Horovod')

    self.network.session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    self.network.add_hooks(hooks)
    return self.op.minimize(self.network.fetch_internal_tensor(self.loss))
Example #18
Source File: tensorflow_executor.py From rlgraph with Apache License 2.0

def setup_horovod_execution(self):
    """
    Sets up Horovod.
    """
    # Check again to avoid import if unset, which will crash if horovod is not installed.
    if get_distributed_backend() == "horovod":
        import horovod.tensorflow as hvd
        self.logger.info("Setting up Horovod execution.")
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
Example #19
Source File: trainers.py From ADL with MIT License

def __init__(self, average=True, compression=None):
    """
    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs. "
                    "Uninstall pyarrow and use msgpack instead.")
    # lazy import
    import horovod.tensorflow as hvd
    import horovod
    hvd_version = tuple(map(int, horovod.__version__.split('.')[:3]))
    self.hvd = hvd

    hvd.init()
    self.is_chief = hvd.rank() == 0

    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    self._has_compression = hvd_version >= (0, 15, 0)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()

    self.BROADCAST_EVERY_EPOCH = True
Example #20
Source File: train_model.py From DistributedDeepLearning with MIT License

def _get_hooks(is_distributed=DISTRIBUTED):
    logger = logging.getLogger(__name__)
    if is_distributed:
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
        return [bcast_hook]
    else:
        return []
Example #21
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0

def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs.

    This test will crash badly if used with an MPI implementation that
    does not support GPU memory transfers directly, as it will call
    MPI_Send on a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()
    iter = 0
    gpu_ids = [local_rank * 2, local_rank * 2 + 1]
    with self.test_session(config=self.config) as session:
        dtypes = [tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            iter += 1
            with tf.device("/gpu:%d" % gpu_ids[(iter + local_rank) % 2]):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [tf.int32, tf.int64]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            diff = session.run(max_difference)
            self.assertTrue(diff <= threshold,
                            "hvd.allreduce on GPU produces incorrect results")
Example #22
Source File: test_atari.py From atari-reset with MIT License

def test(game_name, num_timesteps, policy, load_path, save_path,
         noops=False, sticky=False, epsgreedy=False):
    import tensorflow as tf
    import horovod.tensorflow as hvd
    hvd.init()
    print('initialized worker %d' % hvd.rank(), flush=True)

    from baselines import bench
    from baselines.common import set_global_seeds
    from atari_reset.wrappers import VecFrameStack, VideoWriter, my_wrapper, \
        EpsGreedyEnv, StickyActionEnv, NoopResetEnv, SubprocVecEnv
    from atari_reset.ppo import learn
    from atari_reset.policies import CnnPolicy, GRUPolicy

    set_global_seeds(hvd.rank())
    ncpu = 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def env_fn():
            env = gym.make(game_name + 'NoFrameskip-v4')
            env = bench.Monitor(env, "{}.monitor.json".format(rank))
            if rank % nenvs == 0 and hvd.local_rank() == 0:
                os.makedirs('results/' + game_name, exist_ok=True)
                videofile_prefix = 'results/' + game_name
                env = VideoWriter(env, videofile_prefix)
            if noops:
                env = NoopResetEnv(env)
            if sticky:
                env = StickyActionEnv(env)
            env = my_wrapper(env, clip_rewards=True)
            if epsgreedy:
                env = EpsGreedyEnv(env)
            return env
        return env_fn

    nenvs = 8
    env = SubprocVecEnv([make_env(i + nenvs * hvd.rank()) for i in range(nenvs)])
    env = VecFrameStack(env, 4)

    policy = {'cnn': CnnPolicy, 'gru': GRUPolicy}[policy]
    learn(policy=policy, env=env, nsteps=256, log_interval=1, save_interval=100,
          total_timesteps=num_timesteps, load_path=load_path, save_path=save_path,
          game_name=game_name, test_mode=True)
Example #23
Source File: train_atari.py From atari-reset with MIT License

def train(game_name, policy, num_timesteps, lr, entropy_coef, load_path, starting_point, save_path):
    import tensorflow as tf
    import horovod.tensorflow as hvd
    hvd.init()
    print('initialized worker %d' % hvd.rank(), flush=True)
    from baselines.common import set_global_seeds
    set_global_seeds(hvd.rank())
    from atari_reset.ppo import learn
    from atari_reset.policies import CnnPolicy, GRUPolicy
    from atari_reset.wrappers import ReplayResetEnv, ResetManager, SubprocVecEnv, VideoWriter, VecFrameStack, my_wrapper

    ncpu = 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.Session(config=config).__enter__()

    nrstartsteps = 320  # number of non frameskipped steps to divide workers over
    nenvs = 16
    nrworkers = hvd.size() * nenvs
    workers_per_sp = int(np.ceil(nrworkers / nrstartsteps))

    def make_env(rank):
        def env_fn():
            env = gym.make(game_name + 'NoFrameskip-v4')
            env = ReplayResetEnv(env, demo_file_name='demos/' + game_name + '.demo',
                                 seed=rank, workers_per_sp=workers_per_sp)
            if rank % nenvs == 0 and hvd.local_rank() == 0:
                # write videos during training to track progress
                dir = os.path.join(save_path, game_name)
                os.makedirs(dir, exist_ok=True)
                videofile_prefix = os.path.join(dir, 'episode')
                env = VideoWriter(env, videofile_prefix)
            env = my_wrapper(env, clip_rewards=True)
            return env
        return env_fn

    env = SubprocVecEnv([make_env(i + nenvs * hvd.rank()) for i in range(nenvs)])
    env = ResetManager(env)
    env = VecFrameStack(env, 4)

    if starting_point is not None:
        env.set_max_starting_point(starting_point)

    policy = {'cnn': CnnPolicy, 'gru': GRUPolicy}[policy]
    learn(policy=policy, env=env, nsteps=128, lam=.95, gamma=.999, noptepochs=4,
          log_interval=1, save_interval=100, ent_coef=entropy_coef, l2_coef=1e-7,
          lr=lr, cliprange=0.1, total_timesteps=num_timesteps, norm_adv=True,
          load_path=load_path, save_path=save_path, game_name=game_name)
Example #24
Source File: tensorflow_mnist_estimator.py From training_results_v0.6 with Apache License 2.0

def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model' if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=20000 // hvd.size(),
        hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
Example #25
Source File: tensorflow_mnist.py From training_results_v0.6 with Apache License 2.0

def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
Example #26
Source File: test_tensorflow.py From training_results_v0.6 with Apache License 2.0

def test_horovod_allreduce_gpu_fused(self):
    """Test that the allreduce works on GPUs with Tensor Fusion.

    This test will crash badly if used with an MPI implementation that
    does not support GPU memory transfers directly, as it will call
    MPI_Send on a GPU data pointer."""
    # Only do this test if there are GPUs available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()
    with self.test_session(config=self.config) as session:
        dtypes = [tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
        dims = [1, 2, 3]
        tests = []
        for dtype, dim in itertools.product(dtypes, dims):
            with tf.device("/gpu:%d" % local_rank):
                tf.set_random_seed(1234)
                tensor = tf.random_uniform(
                    [17] * dim, -100, 100, dtype=dtype)
                summed = hvd.allreduce(tensor, average=False)
            multiplied = tensor * size
            max_difference = tf.reduce_max(tf.abs(summed - multiplied))

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [tf.int32, tf.int64]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                return

            test = max_difference <= threshold
            tests.append(test)
        self.assertTrue(session.run(tf.reduce_all(tests)),
                        "hvd.allreduce produces incorrect results")