Python tensorflow.clip_by_global_norm() Examples
The following are 30 code examples of tensorflow.clip_by_global_norm().
Each example is drawn from an open-source project; the source file and license are noted above each one. You may also want to check out all available functions and classes of the tensorflow module.
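As a quick orientation before the examples, here is a minimal sketch (assuming TensorFlow 1.x with a Session, matching the API used throughout these examples) of what tf.clip_by_global_norm() does: it computes the global L2 norm over all tensors in the list and, if that norm exceeds clip_norm, rescales every tensor by clip_norm / global_norm, preserving the relative scale between gradients.

import tensorflow as tf

# Two toy "gradients" whose global norm is sqrt(3.0**2 + 4.0**2) = 5.0.
grads = [tf.constant([3.0]), tf.constant([4.0])]

# clip_norm=1.0 is below the global norm, so every tensor is rescaled
# by clip_norm / global_norm = 1.0 / 5.0.
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)

with tf.Session() as sess:
    print(sess.run(global_norm))  # 5.0
    print(sess.run(clipped))      # values 0.6 and 0.8

In the training code below, the clipped list is then typically passed to optimizer.apply_gradients() together with the matching variables.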
Example #1
Source File: optimizer.py From BERT with Apache License 2.0
def grad_clip_fn(self, loss, tvars, **kargs):
    grads = tf.gradients(loss, tvars)
    grad_clip = self.config.get("grad_clip", "global_norm")
    tf.logging.info(" gradient clip method {}".format(grad_clip))
    if grad_clip == "global_norm":
        clip_norm = self.config.get("clip_norm", 1.0)
        [grads, _] = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
    elif grad_clip == "norm":
        clip_norm = self.config.get("clip_norm", 1.0)
        grads = [tf.clip_by_norm(grad, clip_norm) for grad in grads]
    elif grad_clip == "value":
        clip_min_value = self.config.get("clip_min_value", -1.0)
        clip_max_value = self.config.get("clip_max_value", 1.0)
        # Note: the original passed the undefined name clip_norm here;
        # tf.clip_by_value expects explicit min and max bounds.
        grads = [tf.clip_by_value(grad, clip_min_value, clip_max_value)
                 for grad in grads]
    else:
        grads = grads  # no clipping
    return grads
Example #2
Source File: dqn.py From TransferRL with MIT License
def _add_train_op(self):
    # In regression, the objective loss is Mean Squared Error (MSE).
    self.loss = tf.losses.mean_squared_error(labels=self._y, predictions=self.output)

    tvars = tf.trainable_variables()
    gradients = tf.gradients(self.loss, tvars,
                             aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply the Adam optimizer
    optimizer = tf.train.AdamOptimizer(self._hps.lr)
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step, name='train_step')

    self.variable_summaries('dqn_loss', self.loss)
Example #3
Source File: seq2seq_attention_model.py From DOTA_models with Apache License 2.0
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    hps = self._hps

    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    with tf.device(self._get_gpu(self._num_gpus - 1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #4
Source File: agent.py From ppo-lstm-parallel with MIT License
def get_train_op(self, loss, clip_factor, clip, step):
    import tensorflow as tf
    optimizer = tf.train.AdamOptimizer(learning_rate=step)
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    # Drop parameters that received no gradient.
    filtered_grads = []
    filtered_vars = []
    for i in range(len(gradients)):
        if gradients[i] is not None:
            filtered_grads.append(gradients[i])
            filtered_vars.append(variables[i])
    gradients = filtered_grads
    variables = filtered_vars
    if clip:
        gradients, _ = tf.clip_by_global_norm(gradients, clip_factor)
    grad_norm = tf.reduce_sum([tf.norm(grad) for grad in gradients])
    train_op = optimizer.apply_gradients(zip(gradients, variables))
    return optimizer, train_op, grad_norm
Example #5
Source File: adem_graphs.py From ADEM with MIT License
def adem(context_vector, model_response_vector, reference_response_vector,
         context_dim, model_response_dim, reference_response_dim,
         human_score_place, lr, max_grad_norm):
    model_score, M, N = tf_dynamic_adem_score(
        context=context_vector,
        model_response=model_response_vector,
        reference_response=reference_response_vector,
        shape_info={'batch_size': None,
                    'ct_dim': context_dim,
                    'mr_dim': model_response_dim,
                    'rr_dim': reference_response_dim})
    loss = compute_adem_l1_loss(human_score_place, model_score, M, N)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(loss, tvars), max_grad_norm)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())
    return train_op, loss, model_score
Example #6
Source File: train.py From hierarchical-attention-networks with MIT License
def train_fn(loss):
    trained_vars = tf.trainable_variables()
    count_parameters(trained_vars)

    # Gradient clipping
    gradients = tf.gradients(loss, trained_vars)
    clipped_grads, global_norm = tf.clip_by_global_norm(gradients, FLAGS.max_grad_norm)
    tf.summary.scalar('global_grad_norm', global_norm)

    # Add gradients and vars to summary
    # for gradient, var in list(zip(clipped_grads, trained_vars)):
    #     if 'attention' in var.name:
    #         tf.summary.histogram(var.name + '/gradient', gradient)
    #         tf.summary.histogram(var.name, var)

    # Define optimizer
    global_step = tf.train.get_or_create_global_step()
    optimizer = tf.train.RMSPropOptimizer(FLAGS.learning_rate)
    train_op = optimizer.apply_gradients(zip(clipped_grads, trained_vars),
                                         name='train_op', global_step=global_step)
    return train_op, global_step
Example #7
Source File: policy_gradient.py From EasyRL with Apache License 2.0
def _build_train(self, loss, optimizer, vars=None, global_step=None):
    grads_and_vars = optimizer.compute_gradients(loss=loss, var_list=vars)
    grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                      if grad is not None]
    # apply grad clipping
    grads, vars = zip(*grads_and_vars)
    clipped_grads, _ = tf.clip_by_global_norm(
        grads, clip_norm=self.config.get('global_norm_clip', 40))
    grads_and_vars = list(zip(clipped_grads, vars))

    train_op = optimizer.apply_gradients(
        grads_and_vars, global_step=global_step)
    return train_op
Example #8
Source File: batch_dqn.py From EasyRL with Apache License 2.0
def _build_train(self, loss, optimizer, vars, global_step=None):
    """Construct the operation for optimization.

    Arguments:
        loss: the objective loss function to minimize
        optimizer: optimizer to implement the optimization
        vars: the available variables to optimize
        global_step: records the total number of optimization steps
    """
    # compute gradients
    grads_and_vars = optimizer.compute_gradients(loss=loss, var_list=vars)
    grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                      if grad is not None]
    # apply grad clipping
    grads, vars = zip(*grads_and_vars)
    clipped_grads, _ = tf.clip_by_global_norm(
        grads, clip_norm=self.config.get('global_norm_clip', 40))
    grads_and_vars = list(zip(clipped_grads, vars))

    train_op = optimizer.apply_gradients(
        grads_and_vars, global_step=global_step)
    return train_op
Example #9
Source File: tripletext2seq.py From Zeroshot-QuestionGeneration with MIT License
def __create_optimizer(self):
    print('creating optimizer...')
    start = time.time()

    learning_rate = tf.train.exponential_decay(self.config.LR, self.global_step,
                                               200, 0.97, staircase=True)
    self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    # self.opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

    # normalize the gradients of a parameter vector when its L2 norm exceeds a certain threshold according to
    trainable_params = tf.trainable_variables()

    # calculate gradients of the loss given all the trainable parameters
    gradients = tf.gradients(self.loss, trainable_params)

    # Gradient clipping: new_gradients = gradients * threshold / l2_norm(gradients)
    clip_gradients, _ = tf.clip_by_global_norm(gradients, self.config.MAX_GRAD_NORM)

    self.updates = self.opt.apply_gradients(zip(clip_gradients, trainable_params),
                                            global_step=self.global_step)

    print('Building optimizer in: ', time.time() - start, ' secs')
Example #10
Source File: triples2seq.py From Zeroshot-QuestionGeneration with MIT License
def __create_optimizer(self):
    print('creating optimizer...')
    start = time.time()

    learning_rate = tf.train.exponential_decay(self.config.LR, self.global_step,
                                               200, 0.97, staircase=True)
    self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    # learning_rate = tf.train.exponential_decay(self.config.LR, self.global_step, 100, 0.96, staircase=True)
    # self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    # self.opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

    # normalize the gradients of a parameter vector when its L2 norm exceeds a certain threshold according to
    trainable_params = tf.trainable_variables()

    # calculate gradients of the loss given all the trainable parameters
    gradients = tf.gradients(self.loss, trainable_params)

    # Gradient clipping: new_gradients = gradients * threshold / l2_norm(gradients)
    clip_gradients, _ = tf.clip_by_global_norm(gradients, self.config.MAX_GRAD_NORM)

    self.updates = self.opt.apply_gradients(zip(clip_gradients, trainable_params),
                                            global_step=self.global_step)

    print('Building optimizer in: ', time.time() - start, ' secs')
Example #11
Source File: model_updater.py From nematus with BSD 3-Clause "New" or "Revised" License
def _define_apply_ops(self):
    """Defines the graph nodes for applying the accumulated gradients."""
    final_loss = self._accumulated_loss
    final_grad_vars = [(self._accumulated_gradients[key], self._trainables[key])
                       for key in self._trainables.keys()]
    if self._config.clip_c > 0.0:
        grads, varss = list(zip(*final_grad_vars))
        clipped_grads, global_norm = tf.clip_by_global_norm(
            grads, clip_norm=self._config.clip_c)
        # Might be interesting to see how the global norm changes over
        # time, attach a summary?
        final_grad_vars = list(zip(clipped_grads, varss))
    apply_grads = self._optimizer.apply_gradients(
        final_grad_vars, global_step=self._global_step)
    self._apply_ops = [self._global_step, apply_grads, final_loss]
Example #12
Source File: tacotron.py From vae_tacotron with MIT License
def add_optimizer(self, global_step):
    '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

    Args:
        global_step: int32 scalar Tensor representing current global step in training
    '''
    with tf.variable_scope('optimizer') as scope:
        hp = self._hparams
        if hp.decay_learning_rate:
            self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step)
        else:
            self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate)
        optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
        gradients, variables = zip(*optimizer.compute_gradients(self.loss))
        self.gradients = gradients
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

        # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
        # https://github.com/tensorflow/tensorflow/issues/1122
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
                                                      global_step=global_step)
Example #13
Source File: dqn.py From RLSeq2Seq with MIT License
def _add_train_op(self):
    # In regression, the objective loss is Mean Squared Error (MSE).
    self.loss = tf.losses.mean_squared_error(labels=self._y, predictions=self.output)

    tvars = tf.trainable_variables()
    gradients = tf.gradients(self.loss, tvars,
                             aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply the Adam optimizer
    optimizer = tf.train.AdamOptimizer(self._hps.lr)
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step, name='train_step')

    self.variable_summaries('dqn_loss', self.loss)
Example #14
Source File: hvd_distributed_optimizer.py From BERT with Apache License 2.0
def grad_clip_fn(self, opt, loss, tvars, **kargs):
    grads_and_vars = opt.compute_gradients(loss, tvars)
    grads = [grad for grad, _ in grads_and_vars]
    grad_clip = self.config.get("grad_clip", "global_norm")
    tf.logging.info(" gradient clip method {}".format(grad_clip))
    if grad_clip == "global_norm":
        clip_norm = self.config.get("clip_norm", 1.0)
        [grads, _] = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
    elif grad_clip == "norm":
        clip_norm = self.config.get("clip_norm", 1.0)
        grads = [tf.clip_by_norm(grad, clip_norm) for grad in grads]
    elif grad_clip == "value":
        clip_min_value = self.config.get("clip_min_value", -1.0)
        clip_max_value = self.config.get("clip_max_value", 1.0)
        # Note: the original passed the undefined name clip_norm here;
        # tf.clip_by_value expects explicit min and max bounds.
        grads = [tf.clip_by_value(grad, clip_min_value, clip_max_value)
                 for grad in grads]
    else:
        grads = grads  # no clipping
    return grads
Example #15
Source File: agent.py From async-deeprl with MIT License
def __init__(self, session, action_size, h, w, channels,
             opt=tf.train.AdamOptimizer(1e-4)):
    """Creates Q-Learning agent
    :param session: tensorflow session
    :param action_size: (int) length of action space
    :param h: (int) input image height
    :param w: (int) input image width
    :param channels: (int) number of image channels
    :param opt: tensorflow optimizer (by default: Adam optimizer)"""
    self.action_size = action_size
    self.opt = opt
    self.global_step = tf.Variable(0, name='frame', trainable=False)
    self.frame_inc_op = self.global_step.assign_add(1, use_locking=True)
    K.set_session(session)
    self.sess = session
    with tf.variable_scope('network'):
        self.action = tf.placeholder('int32', [None], name='action')
        self.reward = tf.placeholder('float32', [None], name='reward')
        model, self.state, self.q_values = self._build_model(h, w, channels)
        self.weights = model.trainable_weights
    with tf.variable_scope('optimizer'):
        # Zero all actions, except one that was performed
        action_onehot = tf.one_hot(self.action, self.action_size, 1.0, 0.0)
        # Predict expected future reward for performed action
        q_value = tf.reduce_sum(tf.multiply(self.q_values, action_onehot),
                                reduction_indices=1)
        # Define squared mean loss function: (y - y_)^2
        self.loss = tf.reduce_mean(tf.square(self.reward - q_value))
        # Compute gradients w.r.t. weights
        grads = tf.gradients(self.loss, self.weights)
        # Apply gradient norm clipping
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        grads_vars = list(zip(grads, self.weights))
        self.train_op = opt.apply_gradients(grads_vars)
    with tf.variable_scope('target_network'):
        target_m, self.target_state, self.target_q_values = \
            self._build_model(h, w, channels)
        target_w = target_m.trainable_weights
    with tf.variable_scope('target_update'):
        self.target_update = [target_w[i].assign(self.weights[i])
                              for i in range(len(target_w))]
Example #16
Source File: kfac.py From stable-baselines with MIT License
def apply_gradients(self, grads):
    """
    apply the gradient

    :param grads: ([TensorFlow Tensor]) the gradient
    :return: (function, QueueRunner) train operation, queue operation runner
    """
    cold_optim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum)

    def _cold_sgd_start():
        sgd_grads, sgd_var = zip(*grads)

        if self.max_grad_norm is not None:
            sgd_grads, _ = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm)

        sgd_grads = list(zip(sgd_grads, sgd_var))

        sgd_step_op = tf.assign_add(self.sgd_step, 1)
        cold_optim_op = cold_optim.apply_gradients(sgd_grads)
        if KFAC_DEBUG:
            with tf.control_dependencies([sgd_step_op, cold_optim_op]):
                sgd_step_op = tf.Print(
                    sgd_step_op,
                    [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
        return tf.group(*[sgd_step_op, cold_optim_op])

    # remove unused variables
    grads = [(grad, var) for (grad, var) in grads if grad is not None]
    kfac_optim_op, queue_runner = self.apply_gradients_kfac(grads)

    def _warm_kfac_start():
        return kfac_optim_op

    return tf.cond(tf.greater(self.sgd_step, self._cold_iter),
                   _warm_kfac_start, _cold_sgd_start), queue_runner
Example #17
Source File: kfac.py From rl_graph_generation with BSD 3-Clause "New" or "Revised" License
def apply_gradients(self, grads):
    coldOptim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum)

    def coldSGDstart():
        sgd_grads, sgd_var = zip(*grads)

        if self.max_grad_norm is not None:
            sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,
                                                              self.max_grad_norm)

        sgd_grads = list(zip(sgd_grads, sgd_var))

        sgd_step_op = tf.assign_add(self.sgd_step, 1)
        coldOptim_op = coldOptim.apply_gradients(sgd_grads)
        if KFAC_DEBUG:
            with tf.control_dependencies([sgd_step_op, coldOptim_op]):
                sgd_step_op = tf.Print(
                    sgd_step_op,
                    [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
        return tf.group(*[sgd_step_op, coldOptim_op])

    kfacOptim_op, qr = self.apply_gradients_kfac(grads)

    def warmKFACstart():
        return kfacOptim_op

    return tf.cond(tf.greater(self.sgd_step, self._cold_iter),
                   warmKFACstart, coldSGDstart), qr
Example #18
Source File: optimization_utils.py From nucleus7 with Mozilla Public License 2.0
def clip_grads_and_vars(grads_and_vars: _GRAD_AND_VARS_TYPE,
                        gradient_clip: float,
                        gradient_l2_norm: Optional[tf.Tensor] = None
                        ) -> _GRAD_AND_VARS_TYPE:
    """
    Clip all the gradients according to the global norm using gradient_clip

    Parameters
    ----------
    grads_and_vars
        list of (gradient, variable)
    gradient_clip
        value to clip gradient
    gradient_l2_norm
        gradient l2 norm used for the gradient clipping

    Returns
    -------
    grads_and_vars
        list of (clipped gradient, variable)
    """
    grads, variables = zip(*grads_and_vars)
    if gradient_l2_norm is None:
        grads_clipped, gradient_l2_norm = tf.clip_by_global_norm(
            grads, gradient_clip)
    else:
        # Same rescaling that tf.clip_by_global_norm applies, but with a
        # precomputed norm: grad * clip / max(norm, clip).
        grads_clipped = [each_grad * gradient_clip
                         / tf.maximum(gradient_l2_norm, gradient_clip)
                         for each_grad in grads]
    grads_and_vars_clipped = list(zip(grads_clipped, variables))
    return grads_and_vars_clipped
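A side note on Example #18: the else branch reproduces what tf.clip_by_global_norm() computes internally, just with a norm supplied by the caller. A minimal check (a sketch assuming TensorFlow 1.x, with made-up gradient values) could look like this:

import tensorflow as tf

grads = [tf.constant([3.0]), tf.constant([4.0])]  # global norm = 5.0
clip = 1.0

clipped_a, norm = tf.clip_by_global_norm(grads, clip)
# Manual rescaling with a precomputed norm, as in the else branch above.
clipped_b = [g * clip / tf.maximum(norm, clip) for g in grads]

with tf.Session() as sess:
    print(sess.run([clipped_a, clipped_b]))  # both yield 0.6 and 0.8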
Example #19
Source File: policy_value_network_tf2.py From cchess-zero with MIT License
def train_step(self, positions, pi, z, learning_rate=0):
    # Record the operations used to compute the loss, so that the gradient
    # of the loss with respect to the variables can be computed.
    # metrics = 0
    with tf.GradientTape() as tape:
        policy_head, value_head = self.model(positions, training=True)
        loss = self.compute_loss(pi, z, policy_head, value_head)
        # self.ComputeMetrics(y, logits)
        metrics = self.compute_metrics(pi, policy_head)
    grads = tape.gradient(loss, self.model.trainable_variables)

    # grads = self.average_gradients(tower_grads)
    # grads = self.optimizer.compute_gradients(self.loss)

    # defensive step 2 to clip norm
    # grads0_lst = tf.map_fn(lambda x: x[0], grads)  # [g for g, _ in grads]
    clipped_grads, self.norm = tf.clip_by_global_norm(grads, self.global_norm)

    # defensive step 3 check NaN
    # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
    grad_check = [tf.debugging.check_numerics(g, message='NaN Found!')
                  for g in clipped_grads]
    with tf.control_dependencies(grad_check):
        self.optimizer.apply_gradients(
            zip(clipped_grads, self.model.trainable_variables),  # [v for _, v in grads]
            global_step=self.global_step, name='train_step')

    if self.is_logging:
        for grad, var in zip(grads, self.model.trainable_variables):
            if grad is not None:
                summary_ops_v2.histogram(var.name + '/gradients', grad)
        for var in self.model.trainable_variables:
            summary_ops_v2.histogram(var.name, var)

    return metrics, loss, self.global_step

#@profile
Example #20
Source File: ppo.py From fine-lm with MIT License
def define_ppo_step(data_points, optimizer, hparams):
    """Define ppo step."""
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points
    new_policy_dist, new_value, _ = get_policy(observation, hparams)
    new_pdf = new_policy_dist.prob(action)

    ratio = new_pdf / old_pdf
    clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                     1 + hparams.clipping_coef)

    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

    entropy = new_policy_dist.entropy()
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

    losses = [policy_loss, value_loss, entropy_loss]

    gradients = [list(zip(*optimizer.compute_gradients(loss)))
                 for loss in losses]

    gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]

    gradients_flat = sum([gradient[0] for gradient in gradients], ())
    gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())

    if hparams.max_gradients_norm:
        gradients_flat, _ = tf.clip_by_global_norm(gradients_flat,
                                                   hparams.max_gradients_norm)

    optimize_op = optimizer.apply_gradients(zip(gradients_flat,
                                                gradients_variables_flat))

    with tf.control_dependencies([optimize_op]):
        return [tf.identity(x) for x in losses + gradients_norms]
Example #21
Source File: model.py From cs294-112_hws with MIT License
def __init__(self, FLAGS, algorithm, expert_returns=None, expert_policy_fn=None):
    print('Initializing the model...')
    if not algorithm.strip().lower() in ['behavioral_cloning', 'dagger']:
        raise NotImplementedError('Algorithm {} not implemented.'.format(algorithm))
    self.FLAGS = FLAGS
    self.algorithm = algorithm.strip().lower()
    self.expert_returns = expert_returns
    self.expert_policy_fn = expert_policy_fn
    if self.algorithm == 'dagger' and self.expert_policy_fn is None:
        raise ValueError('No expert policy found.')
    self.scope = self.algorithm + '_' + time.strftime('%Y-%m-%d-%H-%M-%S')

    with tf.variable_scope(
            self.scope,
            initializer=tf.keras.initializers.he_normal(),
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            reuse=tf.AUTO_REUSE):
        self.add_placeholders()
        self.build_graph()
        self.add_loss()

        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(
            gradients, self.FLAGS['max_gradient_norm'])
        self.param_norm = tf.global_norm(params)

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        lr = self.FLAGS['learning_rate']
        opt = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.8, beta2=0.999,
                                     epsilon=1e-7)
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.bestmodel_saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.summaries = tf.summary.merge_all()
Example #22
Source File: base_model.py From PCNN with Apache License 2.0
def add_train_op(self, lr_method, lr, loss, clip=-1):
    """Defines self.train_op that performs an update on a batch

    Args:
        lr_method: (string) sgd method, for example "adam"
        lr: (tf.placeholder) tf.float32, learning rate
        loss: (tensor) tf.float32 loss to minimize
        clip: (python float) clipping of gradient. If < 0, no clipping
    """
    _lr_m = lr_method.lower()  # lower to make sure

    with tf.variable_scope("train_step"):
        if _lr_m == 'adam':  # sgd method
            optimizer = tf.train.AdamOptimizer(lr)
        elif _lr_m == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(lr)
        elif _lr_m == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(lr)
        elif _lr_m == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(lr)
        elif _lr_m == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(lr)
        else:
            raise NotImplementedError("Unknown method {}".format(_lr_m))

        if clip > 0:  # gradient clipping if clip is positive
            grads, vs = zip(*optimizer.compute_gradients(loss))
            grads, gnorm = tf.clip_by_global_norm(grads, clip)
            self.train_op = optimizer.apply_gradients(zip(grads, vs))
        else:
            self.train_op = optimizer.minimize(loss)
Example #23
Source File: model.py From Python-Deep-Learning-SE with MIT License
def init_train_op(self, optimizer):
    # Flatten the targets to be compatible with the flattened logits
    targets_flat = tf.reshape(self.targets, (-1,))
    # Get the loss over all outputs
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.logits_flat, labels=targets_flat, name='x_entropy')
    self.loss = tf.reduce_mean(loss)
    trainable_variables = tf.trainable_variables()
    gradients = tf.gradients(loss, trainable_variables)
    gradients, _ = tf.clip_by_global_norm(gradients, 5)
    self.train_op = optimizer.apply_gradients(zip(gradients, trainable_variables))
Example #24
Source File: bert_cnn_model.py From BERT with Apache License 2.0
def train_lm(self):
    """based on the loss, use SGD to update parameter"""
    learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step,
                                               self.decay_steps, self.decay_rate,
                                               staircase=True)
    self.learning_rate_ = learning_rate
    # noise_std_dev = tf.constant(0.3) / (tf.sqrt(tf.cast(tf.constant(1) + self.global_step, tf.float32)))
    # gradient_noise_scale=noise_std_dev
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients, variables = zip(*optimizer.compute_gradients(self.loss_val_lm))
    gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # ADD 2018.06.01
    with tf.control_dependencies(update_ops):  # ADD 2018.06.01
        train_op = optimizer.apply_gradients(zip(gradients, variables))
    # train_op = tf_contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step, learning_rate=learning_rate, optimizer="Adam", clip_gradients=self.clip_gradients)
    return train_op
Example #25
Source File: kfac.py From lirpg with MIT License
def apply_gradients(self, grads):
    coldOptim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum)

    def coldSGDstart():
        sgd_grads, sgd_var = zip(*grads)

        if self.max_grad_norm is not None:
            sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,
                                                              self.max_grad_norm)

        sgd_grads = list(zip(sgd_grads, sgd_var))

        sgd_step_op = tf.assign_add(self.sgd_step, 1)
        coldOptim_op = coldOptim.apply_gradients(sgd_grads)
        if KFAC_DEBUG:
            with tf.control_dependencies([sgd_step_op, coldOptim_op]):
                sgd_step_op = tf.Print(
                    sgd_step_op,
                    [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
        return tf.group(*[sgd_step_op, coldOptim_op])

    kfacOptim_op, qr = self.apply_gradients_kfac(grads)

    def warmKFACstart():
        return kfacOptim_op

    return tf.cond(tf.greater(self.sgd_step, self._cold_iter),
                   warmKFACstart, coldSGDstart), qr
Example #26
Source File: objective.py From DOTA_models with Apache License 2.0
def training_ops(self, loss, learning_rate=None):
    """Gradient ops."""
    opt = self.get_optimizer(learning_rate)
    params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    if self.clip_norm:
        grads, global_norm = tf.clip_by_global_norm(grads, self.clip_norm)
        tf.summary.scalar('grad_global_norm', global_norm)
    return opt.apply_gradients(zip(grads, params))
Example #27
Source File: model.py From RLSeq2Seq with MIT License
def _add_shared_train_op(self):
    """Sets self._train_op, the op to run for training."""
    # Take gradients of the trainable variables w.r.t. the loss function to minimize
    if self._hps.rl_training or self._hps.ac_training:
        loss_to_minimize = self._reinforce_shared_loss
        if self._hps.coverage:
            loss_to_minimize = self._reinforce_cov_total_loss
    else:
        loss_to_minimize = self._pgen_loss
        if self._hps.coverage:
            loss_to_minimize = self._pointer_cov_total_loss

    tvars = tf.trainable_variables()
    gradients = tf.gradients(loss_to_minimize, tvars,
                             aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    with tf.device("/gpu:{}".format(self._hps.gpu_num)):
        grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply adagrad optimizer
    optimizer = tf.train.AdagradOptimizer(
        self._hps.lr, initial_accumulator_value=self._hps.adagrad_init_acc)
    with tf.device("/gpu:{}".format(self._hps.gpu_num)):
        self._shared_train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #28
Source File: language_model.py From lm with MIT License
def _backward(self, loss, summaries=False):
    hps = self.hps

    loss = loss * hps.num_steps

    emb_vars = find_trainable_variables("emb")
    lstm_vars = find_trainable_variables("LSTM")
    softmax_vars = find_trainable_variables("softmax")

    all_vars = emb_vars + lstm_vars + softmax_vars
    grads = tf.gradients(loss, all_vars)
    orig_grads = grads[:]
    emb_grads = grads[:len(emb_vars)]
    grads = grads[len(emb_vars):]
    for i in range(len(emb_grads)):
        assert isinstance(emb_grads[i], tf.IndexedSlices)
        emb_grads[i] = tf.IndexedSlices(emb_grads[i].values * hps.batch_size,
                                        emb_grads[i].indices,
                                        emb_grads[i].dense_shape)

    lstm_grads = grads[:len(lstm_vars)]
    softmax_grads = grads[len(lstm_vars):]

    lstm_grads, lstm_norm = tf.clip_by_global_norm(lstm_grads, hps.max_grad_norm)
    clipped_grads = emb_grads + lstm_grads + softmax_grads
    assert len(clipped_grads) == len(orig_grads)

    if summaries:
        tf.scalar_summary("model/lstm_grad_norm", lstm_norm)
        tf.scalar_summary("model/lstm_grad_scale",
                          tf.minimum(hps.max_grad_norm / lstm_norm, 1.0))
        tf.scalar_summary("model/lstm_weight_norm", tf.global_norm(lstm_vars))
        # for v, g, cg in zip(all_vars, orig_grads, clipped_grads):
        #     name = v.name.lstrip("model/")
        #     tf.histogram_summary(name + "/var", v)
        #     tf.histogram_summary(name + "/grad", g)
        #     tf.histogram_summary(name + "/clipped_grad", cg)

    return list(zip(clipped_grads, all_vars))
Example #29
Source File: marwil.py From EasyRL with Apache License 2.0
def _build_train(self, loss, optimizer, vars=None, global_step=None):
    grads_and_vars = optimizer.compute_gradients(loss=loss, var_list=vars)
    grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                      if grad is not None]
    # apply grad clipping
    grads, vars = zip(*grads_and_vars)
    clipped_grads, _ = tf.clip_by_global_norm(
        grads, clip_norm=self.config.get('global_norm_clip', 40))
    grads_and_vars = list(zip(clipped_grads, vars))

    train_op = optimizer.apply_gradients(
        grads_and_vars, global_step=global_step)
    return train_op
Example #30
Source File: model.py From rgn with MIT License
def _training(config, loss):
    """ Creates loss optimizer and returns minimization op. """

    # helper function
    optimizer_args = lambda o: o.__init__.__code__.co_varnames[:o.__init__.__code__.co_argcount]

    # select appropriate optimization function and construct arg list based on config
    optimizer_func = {'steepest': tf.train.GradientDescentOptimizer,  # doesn't support momentum, unlike autograd
                      'rmsprop': tf.train.RMSPropOptimizer,
                      'adam': tf.train.AdamOptimizer,
                      'momentum': tf.train.MomentumOptimizer,
                      'adagrad': tf.train.AdagradOptimizer,
                      'adadelta': tf.train.AdadeltaOptimizer}[config['optimizer']]
    optimizer_params = config.viewkeys() & set(optimizer_args(optimizer_func))
    optimizer_params_and_values = {param: config[param] for param in optimizer_params}
    optimizer = optimizer_func(**optimizer_params_and_values)

    # obtain and process gradients
    grads_and_vars = optimizer.compute_gradients(loss)
    threshold = config['gradient_threshold']

    if threshold != float('inf'):
        # switch() is a project-level helper implementing a switch/case pattern
        for case in switch(config['rescale_behavior']):
            if case('norm_rescaling'):
                grads, _ = tf.clip_by_global_norm([g for g, _ in grads_and_vars], threshold)
                vars_ = [v for _, v in grads_and_vars]
                grads_and_vars = list(zip(grads, vars_))
            elif case('hard_clipping'):
                grads_and_vars = [(tf.clip_by_value(g, -threshold, threshold), v)
                                  for g, v in grads_and_vars]

    # apply gradients and return stepping op
    global_step = tf.get_variable(initializer=tf.constant_initializer(0), shape=[],
                                  trainable=False, dtype=tf.int32, name='global_step')
    minimize_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # dict useful for diagnostics
    grads_and_vars_dict = {}
    grads_and_vars_dict.update({('g' + str(i)): g for i, (g, _) in enumerate(grads_and_vars)})
    grads_and_vars_dict.update({('v' + str(i)): v for i, (_, v) in enumerate(grads_and_vars)})

    return global_step, minimize_op, grads_and_vars_dict