Python tensorflow.gradients() Examples
The following are 30
code examples of tensorflow.gradients().
Example #1
Source File: From BERT-Classification-Tutorial with Apache License 2.0 | 6 votes |
def test_adam(self):
    """Check that AdamWeightDecayOptimizer moves `w` onto the target `x`.

    Minimises mean((x - w)^2) for 100 steps and asserts that the variable
    ends up within 1e-2 of the target vector.
    """
    with self.test_session() as sess:
        w = tf.get_variable(
            "w",
            shape=[3],
            initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
        x = tf.constant([0.4, 0.2, -0.5])
        loss = tf.reduce_mean(tf.square(x - w))
        tvars = tf.trainable_variables()
        grads = tf.gradients(loss, tvars)
        global_step = tf.train.get_or_create_global_step()
        optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
        # Both global and local variables must be initialised before training.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        for _ in range(100):
            sess.run(train_op)
        w_np = sess.run(w)
        self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
Example #2
Source File: From reinforcement_learning with MIT License | 6 votes |
def __init__(self, state_size, action_size, lr, name, n_h1=400, n_h2=300, global_name='global'):
    """Build the network, its losses and the gradient ops for one scope.

    Args:
        state_size: stored on the instance as `state_size`.
        action_size: stored on the instance as `action_size`.
        lr: learning rate handed to `tf.train.AdamOptimizer`.
        name: scope name passed to `_build_network`.
        n_h1: stored as `n_h1` (presumably first hidden layer width -- TODO confirm).
        n_h2: stored as `n_h2` (presumably second hidden layer width -- TODO confirm).
        global_name: scope whose trainable variables receive the gradients
            when `name` differs from it.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.name = name
    self.n_h1 = n_h1
    self.n_h2 = n_h2

    self.optimizer = tf.train.AdamOptimizer(lr)
    (self.input_s, self.input_a, self.advantage, self.target_v,
     self.policy, self.value, self.action_est,
     self.model_variables) = self._build_network(name)

    # Alternative loss weights noted by the author: 0.5, 0.2, 1.0
    self.value_loss = 0.5 * tf.reduce_sum(
        tf.square(self.target_v - tf.reshape(self.value, [-1])))
    self.entropy_loss = 1.0 * tf.reduce_sum(self.policy * tf.log(self.policy))
    self.policy_loss = 1.0 * tf.reduce_sum(-tf.log(self.action_est) * self.advantage)
    # Exposed as an attribute but not part of `self.loss` below.
    self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.model_variables])
    self.loss = self.value_loss + self.policy_loss + self.entropy_loss
    self.gradients = tf.gradients(self.loss, self.model_variables)

    if name != global_name:
        self.var_norms = tf.global_norm(self.model_variables)
        # Gradients computed on this scope's variables are applied to the
        # trainable variables collected under `global_name`.
        global_variables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, global_name)
        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.gradients, global_variables))
Example #3
Source File: From Adversarial-Face-Attack with GNU General Public License v3.0 | 6 votes |
def build_pgd_attack(self, eps):
    """Build the iterative attack graph and return the adversarial image.

    Runs `one_step_attack` for 100 iterations inside `tf.while_loop`, keeping
    every iterate within `eps` of the float-cast `self.image_batch` and inside
    [0, 255]. The result is also stored on `self.adv_image`.

    Args:
        eps: maximum per-pixel perturbation applied around the input batch.

    Returns:
        The adversarial image tensor produced by the loop.
    """
    victim_embeddings = tf.constant(self.victim_embeddings, dtype=tf.float32)

    def one_step_attack(image, grad):
        """
        core components of this attack are:
        (a) PGD adversarial attack
        (b) momentum
        (c) input diversity
        """
        orig_image = image
        image = self.structure(image)
        image = (image - 127.5) / 128.0
        image = image + tf.random_uniform(tf.shape(image), minval=-1e-2, maxval=1e-2)
        # NOTE(review): the callee was lost from the listing; reconstructed as
        # `self.network.inference(image, ...)` -- confirm against the class.
        prelogits, _ = self.network.inference(
            image, 1.0, False, bottleneck_layer_size=512)
        embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings')

        embeddings = tf.reshape(embeddings[0], [512, 1])
        objective = tf.reduce_mean(tf.matmul(victim_embeddings, embeddings))  # to be maximized

        noise, = tf.gradients(objective, orig_image)

        # Normalise by the mean absolute gradient, then blend with the
        # previous step's accumulated direction.
        noise = noise / tf.reduce_mean(tf.abs(noise), [1, 2, 3], keep_dims=True)
        noise = 0.9 * grad + noise

        adv = tf.clip_by_value(orig_image + tf.sign(noise) * 1.0, lower_bound, upper_bound)
        return adv, noise

    # Named `image_batch` rather than `input` to avoid shadowing the builtin.
    image_batch = tf.to_float(self.image_batch)
    lower_bound = tf.clip_by_value(image_batch - eps, 0, 255.)
    upper_bound = tf.clip_by_value(image_batch + eps, 0, 255.)

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        adv, _ = tf.while_loop(
            lambda _, __: True, one_step_attack,
            (image_batch, tf.zeros_like(image_batch)),
            back_prop=False,
            maximum_iterations=100,
            parallel_iterations=1)
    self.adv_image = adv
    return adv
Example #4
Source File: From TransferRL with MIT License | 6 votes |
def _add_train_op(self):
    """Define `self.loss` (MSE) and `self.train_op` (clipped Adam update)."""
    # In regression, the objective loss is Mean Squared Error (MSE).
    self.loss = tf.losses.mean_squared_error(labels=self._y, predictions=self.output)

    tvars = tf.trainable_variables()
    gradients = tf.gradients(
        self.loss, tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply Adam optimizer
    # NOTE(review): the learning-rate argument was lost from the listing;
    # reconstructed as `self._hps.lr` -- confirm against the hyper-parameters.
    optimizer = tf.train.AdamOptimizer(self._hps.lr)
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step, name='train_step')

    self.variable_summaries('dqn_loss', self.loss)
Example #5
Source File: From post--memorization-in-rnns with MIT License | 6 votes |
def connectivity(logits, target, embedding, embedding_matrix, offset):
    """Return the squared gradient magnitude of one logit w.r.t. each source token.

    The logit chosen by `select_dim_value(logits, target)` at position
    `[0, offset[0]]` is differentiated with respect to `embedding`; the result
    is multiplied by the transposed `embedding_matrix`, squared, summed over
    axis 1 and reshaped to `[1, -1]`.
    """
    correct_logits = select_dim_value(logits, target)
    scalar_logit = correct_logits[0, offset[0]]

    # Partial gradient with respect to the embedding (first entry only).
    embedding_grad = tf.gradients(scalar_logit, embedding)[0][0, ...]

    # Finalize the chain rule to obtain the gradient with respect to the
    # one-hot encoding of the source. The one-hot encoding is not part of the
    # graph, which is why it cannot be differentiated directly.
    one_hot_grad = tf.matmul(embedding_grad, tf.transpose(embedding_matrix))

    squared_magnitude = tf.reduce_sum(one_hot_grad ** 2, axis=1)
    return tf.reshape(squared_magnitude, [1, -1])
Example #6
Source File: From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
    """Check that a second `generate_np` call does not rebuild gradients.

    After a first call, `tf.gradients` is swapped for a function that raises;
    the second call (different eps / clip / xi values) must therefore succeed
    without calling `tf.gradients` again.
    """
    x_val = np.random.rand(1, 2)
    x_val = np.array(x_val, dtype=np.float32)

    self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                            clip_max=-5.0, clip_min=-5.0,
                            xi=1e-6)

    old_grads = tf.gradients

    def fn(*x, **y):
        raise RuntimeError()

    tf.gradients = fn
    try:
        self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                                clip_max=-4.0, clip_min=-4.0,
                                xi=1e-5)
    finally:
        # Always undo the monkey patch so a failure here cannot break
        # every test that runs afterwards.
        tf.gradients = old_grads
Example #7
Source File: From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_fgm_gradient_max(self):
    """Check the gradient of one `fgm` output element w.r.t. the input.

    The gradient must exist and must be 1 at the selected (example, feature)
    position and 0 everywhere else.
    """
    input_dim = 2
    num_classes = 3
    batch_size = 4
    rng = np.random.RandomState([2017, 8, 23])
    x = tf.placeholder(tf.float32, [batch_size, input_dim])
    weights = tf.placeholder(tf.float32, [input_dim, num_classes])
    logits = tf.matmul(x, weights)
    probs = tf.nn.softmax(logits)
    adv_x = fgm(x, probs)
    random_example = rng.randint(batch_size)
    random_feature = rng.randint(input_dim)
    output = tf.slice(adv_x, [random_example, random_feature], [1, 1])
    dx, = tf.gradients(output, x)
    # The following line catches GitHub issue #243
    self.assertIsNotNone(dx)
    # NOTE(review): the session call was lost from the listing; reconstructed
    # as `self.sess.run` -- confirm the test case exposes `self.sess`.
    dx = self.sess.run(dx, feed_dict=random_feed_dict(rng, [x, weights]))
    ground_truth = np.zeros((batch_size, input_dim))
    ground_truth[random_example, random_feature] = 1.
    self.assertClose(dx, ground_truth)
Example #8
Source File: From ADEM with MIT License | 6 votes |
def adem(context_vector, model_response_vector, reference_response_vector,
         context_dim, model_response_dim, reference_response_dim,
         human_score_place, lr, max_grad_norm):
    """Build the scoring graph, its loss and a clipped Adam training op.

    Returns:
        A `(train_op, loss, model_score)` tuple.
    """
    dims = {
        'batch_size': None,
        'ct_dim': context_dim,
        'mr_dim': model_response_dim,
        'rr_dim': reference_response_dim,
    }
    model_score, M, N = tf_dynamic_adem_score(
        context=context_vector,
        model_response=model_response_vector,
        reference_response=reference_response_vector,
        shape_info=dims)
    loss = compute_adem_l1_loss(human_score_place, model_score, M, N)

    # Clip by global norm before handing the gradients to the optimizer.
    variables = tf.trainable_variables()
    raw_grads = tf.gradients(loss, variables)
    clipped, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)

    step = tf.contrib.framework.get_or_create_global_step()
    train_op = tf.train.AdamOptimizer(lr).apply_gradients(
        zip(clipped, variables), global_step=step)
    return train_op, loss, model_score
Example #9
Source File: From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License | 6 votes |
def jacobian_graph(predictions, x, nb_classes):
    """
    Create the Jacobian graph to be ran later in a TF session
    :param predictions: the model's symbolic output (linear output,
        pre-softmax)
    :param x: the input placeholder
    :param nb_classes: the number of classes the model has
    :return: a list of `nb_classes` gradient tensors, one per class, each
        being the gradient of `predictions[:, class_ind]` with respect to `x`
    """
    # This function will return a list of TF gradients
    list_derivatives = []

    # Define the TF graph elements to compute our derivatives for each class.
    # `range` rather than `xrange` so this also runs on Python 3.
    for class_ind in range(nb_classes):
        derivatives, = tf.gradients(predictions[:, class_ind], x)
        list_derivatives.append(derivatives)

    return list_derivatives
Example #10
Source File: From reinforcement_learning with MIT License | 6 votes |
def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001):
    """Build the "critic" and "critic_target" networks and their training ops.

    Args:
        state_size: stored on the instance as `state_size`.
        action_size: stored on the instance as `action_size`.
        lr: learning rate handed to `tf.train.AdamOptimizer`.
        n_h1: stored as `n_h1` (presumably first hidden layer width -- TODO confirm).
        n_h2: stored as `n_h2` (presumably second hidden layer width -- TODO confirm).
        tau: interpolation factor used by `update_target_op`.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.optimizer = tf.train.AdamOptimizer(lr)
    self.tau = tau

    self.n_h1 = n_h1
    self.n_h2 = n_h2

    (self.input_s, self.action, self.critic_variables,
     self.q_value) = self._build_network("critic")
    (self.input_s_target, self.action_target, self.critic_variables_target,
     self.q_value_target) = self._build_network("critic_target")

    # NOTE(review): the attribute name was lost from the listing in both
    # places it is used; reconstructed as `self.target` -- confirm callers.
    self.target = tf.placeholder(tf.float32, [None])
    self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.critic_variables])
    self.loss = tf.reduce_mean(tf.square(self.target - self.q_value)) + 0.01 * self.l2_loss
    self.optimize = self.optimizer.minimize(self.loss)
    # Target variable <- tau * variable + (1 - tau) * target variable.
    self.update_target_op = [
        self.critic_variables_target[i].assign(
            tf.multiply(var, self.tau)
            + tf.multiply(self.critic_variables_target[i], 1 - self.tau))
        for i, var in enumerate(self.critic_variables)
    ]
    self.action_gradients = tf.gradients(self.q_value, self.action)
Example #11
Source File: From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _compute_gradients(self, loss_fn, x, unused_optim_state):
    """Compute the gradient of the batch-averaged loss with respect to `x[0]`.

    Args:
        loss_fn: a callable that takes the single tensor in `x` and returns
            a batch of loss values.
        x: A list holding exactly one tensor (a batch of images).
        unused_optim_state: ignored.

    Returns:
        The result of `tf.gradients(loss, x[0])`, where `loss` is
        `loss_fn(x[0])` averaged over axis 0.
    """
    # Check the type first so that a non-list argument fails with the
    # explanatory message instead of an unrelated error raised by `len`.
    assert isinstance(x, list) and len(x) == 1, \
        'x should be a list and contain only one image tensor'
    x = x[0]
    loss = reduce_mean(loss_fn(x), axis=0)
    return tf.gradients(loss, x)
Example #12
Source File: From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_clip_eta_goldilocks(self):
    """Check `clip_eta` values and gradients for small, exact and large inputs."""
    # Test that the clipping handles perturbations that are
    # too small, just right, and too big correctly
    eta = tf.constant([[2.], [3.], [4.]])
    assert eta.dtype == tf.float32, eta.dtype
    eps = 3.
    for ord_arg in [np.inf, 1, 2]:
        for sign in [-1., 1.]:
            clipped = clip_eta(eta * sign, ord_arg, eps)
            # NOTE(review): the session calls were lost from the listing;
            # reconstructed as `self.sess.run` -- confirm `self.sess` exists.
            clipped_value = self.sess.run(clipped)
            gold = sign * np.array([[2.], [3.], [3.]])
            self.assertClose(clipped_value, gold)
            grad, = tf.gradients(clipped, eta)
            grad_value = self.sess.run(grad)
            # Note: the second 1. is debatable (the left-sided derivative
            # and the right-sided derivative do not match, so formally
            # the derivative is not defined). This test makes sure that
            # we at least handle this oddity consistently across all the
            # argument values we test
            gold = sign * np.array([[1.], [1.], [0.]])
            assert np.allclose(grad_value, gold)
Example #13
Source File: From GroundeR with MIT License | 6 votes |
def build_train_op(self, loss):
    """Return the training op for `loss`, selected by `self.optim`.

    With `self.optim == 'adam'`, variables whose name contains 'vis_enc' are
    updated by a second Adam optimizer running at a tenth of the learning
    rate; every other value of `self.optim` falls back to plain SGD over all
    trainable variables.
    """
    if self.optim == 'adam':
        print('Adam optimizer')
        v_dict = self.get_variables_by_name([""], True)
        var_list1 = [i for i in v_dict[""] if 'vis_enc' not in i.name]
        var_list2 = self.get_variables_by_name(["vis_enc"], True)
        var_list2 = var_list2["vis_enc"]

        # NOTE(review): the learning-rate expression was lost from the
        # listing; reconstructed as `self.lr`. The SGD branch below reads
        # `self._lr` -- confirm which attribute the class really defines.
        opt1 = tf.train.AdamOptimizer(self.lr, name="Adam")
        opt2 = tf.train.AdamOptimizer(self.lr * 0.1, name="Adam_vis_enc")
        # One backward pass for both groups, then split by position.
        grads = tf.gradients(loss, var_list1 + var_list2)
        grads1 = grads[:len(var_list1)]
        grads2 = grads[len(var_list1):]
        train_op1 = opt1.apply_gradients(zip(grads1, var_list1))
        train_op2 = opt2.apply_gradients(zip(grads2, var_list2))
        train_op = tf.group(train_op1, train_op2)
    else:
        print('SGD optimizer')
        tvars = tf.trainable_variables()
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        # Differentiate the `loss` argument; the previous name `cost` is not
        # defined anywhere in this function.
        grads = tf.gradients(loss, tvars)
        train_op = optimizer.apply_gradients(zip(grads, tvars))
    return train_op
Example #14
Source File: From DOTA_models with Apache License 2.0 | 6 votes |
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    hps = self._hps

    # Exponentially decayed learning rate, floored at `hps.min_lr`.
    # NOTE(review): the initial-rate argument was lost from the listing;
    # reconstructed as `hps.lr` -- confirm against the hyper-parameters.
    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    with tf.device(self._get_gpu(self._num_gpus - 1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #15
Source File: From DOTA_models with Apache License 2.0 | 6 votes |
def _build_train_op(self):
    """Build training specific ops for the graph.

    Raises:
        ValueError: if `self.hps.optimizer` is neither 'sgd' nor 'mom'.
    """
    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
    tf.summary.scalar('learning_rate', self.lrn_rate)

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(self.cost, trainable_variables)

    if self.hps.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
    elif self.hps.optimizer == 'mom':
        optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)
    else:
        # Previously this fell through and failed later with an unbound
        # local `optimizer`; fail here with an explicit message instead.
        raise ValueError(
            'Unsupported optimizer: {!r}'.format(self.hps.optimizer))

    apply_op = optimizer.apply_gradients(
        zip(grads, trainable_variables),
        global_step=self.global_step, name='train_step')

    # Run the extra ops together with the parameter update.
    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)

# TODO(xpan): Consider batch_norm in contrib/layers/python/layers/
Example #16
Source File: From reinforcement_learning with MIT License | 6 votes |
def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001):
    """Build the "actor" and "actor_target" networks and their update ops.

    `action_gradients` is a placeholder; its negation is used as `grad_ys`
    when differentiating the action values with respect to the actor
    variables, and the result is applied with Adam.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.optimizer = tf.train.AdamOptimizer(lr)
    self.tau = tau

    self.n_h1 = n_h1
    self.n_h2 = n_h2

    online = self._build_network("actor")
    self.input_s, self.actor_variables, self.action_values = online
    delayed = self._build_network("actor_target")
    (self.input_s_target, self.actor_variables_target,
     self.action_values_target) = delayed

    self.action_gradients = tf.placeholder(tf.float32, [None, self.action_size])
    self.actor_gradients = tf.gradients(
        self.action_values, self.actor_variables, -self.action_gradients)

    # Target variable <- tau * variable + (1 - tau) * target variable.
    soft_updates = []
    for idx, source_var in enumerate(self.actor_variables):
        target_var = self.actor_variables_target[idx]
        blended = (tf.multiply(source_var, self.tau)
                   + tf.multiply(self.actor_variables_target[idx], 1 - self.tau))
        soft_updates.append(target_var.assign(blended))
    self.update_target_op = soft_updates

    self.optimize = self.optimizer.apply_gradients(
        zip(self.actor_gradients, self.actor_variables))
Example #17
Source File: From fine-lm with MIT License | 5 votes |
def testConvHiddenReluMemoryEfficient(self):
    """Check that `forget=True` and `forget=False` give equal outputs and gradients."""
    batch = 3
    length = 23
    io_size = 16
    filter_size = 7
    x = np.random.rand(batch, length, io_size)
    dy = np.random.rand(batch, length, io_size)
    with self.test_session() as session:
        x = tf.to_float(x)
        dy = tf.to_float(dy)
        f1 = tf.get_variable("f1", [1, io_size, filter_size])
        f2 = tf.get_variable("f2", [1, filter_size, io_size])
        norm_scale, norm_bias = common_layers.layer_norm_vars(io_size)
        y = common_layers.conv_hidden_relu_memory_efficient(
            x, filter_size, forget=False,
            test_vars=(f1, f2, norm_scale, norm_bias))
        y_forget = common_layers.conv_hidden_relu_memory_efficient(
            x, filter_size, forget=True,
            test_vars=(f1, f2, norm_scale, norm_bias))
        dx, df1, df2, dnorm_scale, dnorm_bias = tf.gradients(
            ys=[y], xs=[x, f1, f2, norm_scale, norm_bias], grad_ys=[dy])
        dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f = tf.gradients(
            ys=[y_forget], xs=[x, f1, f2, norm_scale, norm_bias], grad_ys=[dy])
        # The variables created above must be initialised before evaluation.
        session.run(tf.global_variables_initializer())
        (y, y_forget,
         dx, df1, df2, dnorm_scale, dnorm_bias,
         dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f) = session.run(
             [y, y_forget,
              dx, df1, df2, dnorm_scale, dnorm_bias,
              dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f])
    self.assertAllClose(y, y_forget)
    self.assertAllClose(df2, df2_f)
    self.assertAllClose(df1, df1_f)
    self.assertAllClose(dnorm_scale, dnorm_scale_f)
    self.assertAllClose(dnorm_bias, dnorm_bias_f)
    self.assertAllClose(dx, dx_f)
Example #18
Source File: From TransferRL with MIT License | 5 votes |
def _add_shared_train_op(self):
    """Sets self._shared_train_op, the op to run for training.

    The loss is chosen from the `coverage` and `rl_training` hyper-parameters,
    its gradients are clipped by global norm and applied with Adagrad.
    """
    # Take gradients of the trainable variables w.r.t. the loss function to minimize
    loss_to_minimize = self._pgen_loss
    if self._hps.coverage:
        loss_to_minimize = self._pointer_cov_total_loss
    if self._hps.rl_training:
        loss_to_minimize = self._reinforce_shared_loss
        # Nested on purpose: the reinforce + coverage loss only applies when
        # RL training is on.
        if self._hps.coverage:
            loss_to_minimize = self._reinforce_cov_total_loss

    tvars = tf.trainable_variables()
    gradients = tf.gradients(
        loss_to_minimize, tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply adagrad optimizer
    self.epoch = (self.global_step * FLAGS.batch_size) / FLAGS.train_size
    # NOTE(review): both branch expressions were lost from the listing. They
    # are reconstructed as "base rate divided by the epoch once the epoch is
    # positive, otherwise the base rate" -- confirm against the original file.
    new_lr = tf.cond(
        tf.greater(self.epoch, 0),
        lambda: self._hps.lr / tf.cast(self.epoch, tf.float32),
        lambda: self._hps.lr)
    optimizer = tf.train.AdagradOptimizer(
        new_lr, initial_accumulator_value=self._hps.adagrad_init_acc)
    self._shared_train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #19
Source File: From lirpg with MIT License | 5 votes |
def compute_gradients(self, loss, var_list=None):
    """Return `(gradient, variable)` pairs for `loss`.

    Args:
        loss: tensor to differentiate.
        var_list: variables to differentiate with respect to; all trainable
            variables are used when it is None.
    """
    variables = tf.trainable_variables() if var_list is None else var_list
    grads = tf.gradients(loss, variables)
    return list(zip(grads, variables))
Example #20
Source File: From fine-lm with MIT License | 5 votes |
def testCustomGrad(self):
    """Check that `fn_with_custom_grad` returns the gradients from `grad_fn`."""

    def fn(a, b, c):
        return tf.layers.dense(a, 10, use_bias=False) + tf.matmul(b, c)

    def grad_fn(inputs, variables, unused_outputs, unused_grad_outputs):
        # Tensor number k (inputs first, then variables) gets gradient k + 1.
        grad_inputs = [tf.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)]
        grad_vars = [
            tf.ones_like(t) * (i + len(inputs) + 1.)
            for i, t in enumerate(variables)
        ]
        return grad_inputs, grad_vars

    a = tf.random_uniform([11, 6])
    b = tf.random_uniform([11, 7])
    c = tf.random_uniform([7, 10])
    # Stand-in with the same shape as the dense kernel; only used to build
    # the expected gradient.
    w = tf.random_uniform([6, 10])
    out = common_layers.fn_with_custom_grad(grad_fn)(fn)(a, b, c)
    loss = tf.reduce_mean(out)
    grads = tf.gradients(loss, [a, b, c, tf.trainable_variables()[0]])
    expected_grads = [
        tf.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w])
    ]
    with self.test_session() as sess:
        # The dense layer's kernel must be initialised before evaluation.
        sess.run(tf.global_variables_initializer())
        g_val, eg_val = sess.run([grads, expected_grads])
        for g1, g2 in zip(g_val, eg_val):
            self.assertAllClose(g1, g2)
Example #21
Source File: From fine-lm with MIT License | 5 votes |
def shakeshake2_grad(x1, x2, dy):
    """Overriding gradient for shake-shake of 2 tensors.

    Recomputes the forward value with `shakeshake2_py` and returns its
    gradients with respect to `x1` and `x2`, using `dy` as the upstream
    gradient.
    """
    forward = shakeshake2_py(x1, x2)
    return tf.gradients(ys=[forward], xs=[x1, x2], grad_ys=[dy])
Example #22
Source File: From fine-lm with MIT License | 5 votes |
def _acc_grads(*lists_of_grads):
    """Accumulates lists of gradients.

    Gradients at the same position are summed with `tf.add_n`, skipping
    `None` entries; a position where every entry is `None` yields `None`.
    """
    summed = []
    for per_position in zip(*lists_of_grads):
        present = [grad for grad in per_position if grad is not None]
        summed.append(tf.add_n(present) if present else None)
    return summed
Example #23
Source File: From fine-lm with MIT License | 5 votes |
def testDiet(self):
    """Check that every global variable changes after 10 training steps."""
    params = diet.diet_adam_optimizer_params()

    @diet.fn_with_diet_vars(params)
    def model_fn(x):
        y = tf.layers.dense(x, 10, use_bias=False)
        return y

    @diet.fn_with_diet_vars(params)
    def model_fn2(x):
        y = tf.layers.dense(x, 10, use_bias=False)
        return y

    x = tf.random_uniform((10, 10))
    y = model_fn(x) + 10.
    y = model_fn2(y) + 10.
    grads = tf.gradients(y, [x])
    with tf.control_dependencies(grads):
        incr_step = tf.assign_add(tf.train.get_or_create_global_step(), 1)

    train_op = tf.group(incr_step, *grads)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        orig_vals = sess.run(tf.global_variables())
        for _ in range(10):
            sess.run(train_op)
        new_vals = sess.run(tf.global_variables())

        # Count the variables whose value moved; all of them should have.
        different = []
        for old, new in zip(orig_vals, new_vals):
            try:
                self.assertAllClose(old, new)
            except AssertionError:
                different.append(True)
        self.assertEqual(len(different), len(tf.global_variables()))
Example #24
Source File: From HardRLWithYoutube with MIT License | 5 votes |
def compute_gradients(self, loss, var_list=None):
    """Return `(gradient, variable)` pairs for `loss`.

    Args:
        loss: tensor to differentiate.
        var_list: variables to differentiate with respect to; all trainable
            variables are used when it is None.
    """
    variables = tf.trainable_variables() if var_list is None else var_list
    grads = tf.gradients(loss, variables)
    return list(zip(grads, variables))
Example #25
Source File: From fine-lm with MIT License | 5 votes |
def compute_gradients(self,
                      loss,
                      var_list,
                      global_step=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      name=None,
                      grad_loss=None):
    """Compute gradients through momentum optimizer.

    Thin wrapper: every argument except `global_step` and `name` is passed
    on unchanged to `self._momentum_optimizer.compute_gradients`.

    Args:
      loss: A Tensor containing the value to minimize.
      var_list: list or tuple of tf.Variable to update to minimize loss.
      global_step: Accepted for interface compatibility; currently unused.
      gate_gradients: How to gate the computation of gradients. Can be
        GATE_NONE, GATE_OP, or GATE_GRAPH.
      aggregation_method: Specifies the method used to combine gradient
        terms. Valid values are defined in the class AggregationMethod.
      colocate_gradients_with_ops: If True, try collocating gradients with
        the corresponding op.
      name: Accepted for interface compatibility; currently unused.
      grad_loss: Optional. A Tensor holding the gradient computed for loss.

    Returns:
      Whatever the wrapped optimizer returns (per the original docstring, a
      list of (gradient, variable) pairs in which the gradient can be None).
    """
    del global_step, name  # Unused for now.
    wrapped = self._momentum_optimizer
    forwarded = wrapped.compute_gradients(
        loss,
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)
    return forwarded
Example #26
Source File: From DOTA_models with Apache License 2.0 | 5 votes |
def __call__(self, w, z_grads):
    """Return the stacked gradients of the rebuilt convolution w.r.t. its filters.

    Args:
        w: a tensor that must be an input of `self.op` (expected at index 1).
        z_grads: a one-element sequence holding the upstream gradient.

    Raises:
        ValueError: if `w` is not among the inputs of `self.op`.
        AssertionError: if `z_grads` does not match the op's outputs, or `w`
            is not input number 1.
    """
    # `list.index` raises ValueError when `w` is missing, so `op` is known to
    # have been applied to `w` from here on; it never returns -1.
    idx = list(self.op.inputs).index(w)
    assert len(z_grads) == len(self.op.outputs)
    # The following assert may be removed when we are ready to use this
    # for general purpose code.
    # This assert is only expected to hold in the context of our preliminary
    # MNIST experiments.
    assert idx == 1  # We expect convolution weights to be arg 1

    images, filters = self.op.inputs
    strides = self.op.get_attr("strides")
    padding = self.op.get_attr("padding")
    # Currently assuming that one specifies at most these four arguments and
    # that all other arguments to conv2d are set to default.

    conv, w_px = self._PxConv2DBuilder(images, filters, strides, padding)
    z_grads, = z_grads

    gradients_list = tf.gradients(
        conv, w_px, z_grads,
        colocate_gradients_with_ops=self.colocate_gradients_with_ops,
        gate_gradients=self.gate_gradients)

    return tf.stack(gradients_list)
Example #27
Source File: From cs294-112_hws with MIT License | 5 votes |
def __init__(self, FLAGS, algorithm, expert_returns=None, expert_policy_fn=None):
    """Validate the algorithm name, then build the graph, optimizer and savers.

    Args:
        FLAGS: mapping read for 'max_gradient_norm' and 'learning_rate'.
        algorithm: 'behavioral_cloning' or 'dagger' (case and surrounding
            whitespace are ignored).
        expert_returns: stored on the instance as-is.
        expert_policy_fn: stored on the instance; required for 'dagger'.

    Raises:
        NotImplementedError: for any other algorithm name.
        ValueError: when 'dagger' is requested without `expert_policy_fn`.
    """
    print('Initializing the model...')
    normalized = algorithm.strip().lower()
    if normalized not in ['behavioral_cloning', 'dagger']:
        raise NotImplementedError('Algorithm {} not implemented.'.format(algorithm))

    self.FLAGS = FLAGS
    self.algorithm = normalized
    self.expert_returns = expert_returns
    self.expert_policy_fn = expert_policy_fn
    if self.algorithm == 'dagger' and self.expert_policy_fn is None:
        raise ValueError('No expert policy found.')

    # A timestamp in the scope name keeps separately built models apart.
    self.scope = self.algorithm + '_' + time.strftime('%Y-%m-%d-%H-%M-%S')

    with tf.variable_scope(
        self.scope,
        initializer=tf.keras.initializers.he_normal(),
        regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
        reuse=tf.AUTO_REUSE
    ):
        self.add_placeholders()
        self.build_graph()
        self.add_loss()

    # Gradients of the loss, clipped by global norm before the Adam update.
    params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, params)
    self.gradient_norm = tf.global_norm(gradients)
    max_norm = self.FLAGS['max_gradient_norm']
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_norm)
    self.param_norm = tf.global_norm(params)

    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    lr = self.FLAGS['learning_rate']
    opt = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
    self.updates = opt.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.bestmodel_saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.summaries = tf.summary.merge_all()
Example #28
Source File: From DOTA_models with Apache License 2.0 | 5 votes |
def flatgrad(loss, var_list):
    """Return the gradients of `loss` w.r.t. `var_list` as one flat vector.

    Each gradient returned by `gradients` is reshaped to 1-D and all are
    concatenated along axis 0; `None` entries are skipped.
    """
    flat_parts = []
    for _, grad in zip(var_list, gradients(loss, var_list)):
        if grad is None:
            continue
        flat_parts.append(tf.reshape(grad, [-1]))
    return tf.concat(flat_parts, 0)
Example #29
Source File: From DOTA_models with Apache License 2.0 | 5 votes |
def gradients(loss, var_list):
    """Like `tf.gradients`, but with zeros in place of missing gradients.

    Args:
        loss: tensor to differentiate.
        var_list: variables to differentiate with respect to.

    Returns:
        A list with one tensor per entry of `var_list`. Where `tf.gradients`
        yields `None`, a zero tensor shaped like the variable is returned.
    """
    grads = tf.gradients(loss, var_list)
    # `zeros_like` keeps the variable's dtype and also works when the static
    # shape is not fully known; `tf.zeros(v.shape)` is always float32.
    return [g if g is not None else tf.zeros_like(v)
            for g, v in zip(grads, var_list)]
Example #30
Source File: From fine-lm with MIT License | 5 votes |
def smoothing_cross_entropy_factored_grad(op, dy):
    """Gradient function for smoothing_cross_entropy_factored.

    The inputs are split into 16 parts, and each part's logits, loss and
    gradients are built under control dependencies on the previous part's
    results, so the parts are evaluated one after another.

    Returns:
        `(a_grad, b_grad, None, None)`, matching the four inputs of `op`.
    """
    a, b, labels, confidence = (op.inputs[i] for i in range(4))
    num_splits = 16
    vocab_size = shape_list(b)[0]

    label_chunks = approximate_split(labels, num_splits)
    a_chunks = approximate_split(a, num_splits)
    dy_chunks = approximate_split(dy, num_splits)

    b_grad = None
    a_grad_parts = []
    deps = []
    for part in range(num_splits):
        with tf.control_dependencies(deps):
            logits = tf.matmul(a_chunks[part], b, transpose_b=True)
            output_part = smoothing_cross_entropy(
                logits, label_chunks[part], vocab_size, confidence)
            a_grad_part, b_grad_part = tf.gradients(
                ys=[output_part], xs=[a_chunks[part], b],
                grad_ys=[dy_chunks[part]])
            a_grad_parts.append(a_grad_part)
            # The gradient for `b` accumulates over all parts.
            if part == 0:
                b_grad = b_grad_part
            else:
                b_grad += b_grad_part
            deps = [b_grad, a_grad_part]
    a_grad = tf.concat(a_grad_parts, 0)
    return a_grad, b_grad, None, None