Python tensorflow.gradients() Examples
The following are 30 code examples of tensorflow.gradients(), collected from open-source projects. The project, source file, and license for each example are listed above it. You may also want to browse the other available functions and classes of the tensorflow module the same way.
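Every example below targets the TensorFlow 1.x graph-mode API. As a minimal orientation sketch before the examples (the tensors here are illustrative, not taken from any project below): tf.gradients(ys, xs) builds symbolic gradient ops and returns a list with exactly one gradient tensor per entry in xs.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 3])
w = tf.Variable(tf.ones([3, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

# One returned gradient per entry in xs, in the same order.
dloss_dx, dloss_dw = tf.gradients(loss, [x, w])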
Example #1
Source File: optimization_test.py From BERT-Classification-Tutorial with Apache License 2.0

def test_adam(self):
    with self.test_session() as sess:
        w = tf.get_variable(
            "w",
            shape=[3],
            initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
        x = tf.constant([0.4, 0.2, -0.5])
        loss = tf.reduce_mean(tf.square(x - w))
        tvars = tf.trainable_variables()
        grads = tf.gradients(loss, tvars)
        global_step = tf.train.get_or_create_global_step()
        optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        for _ in range(100):
            sess.run(train_op)
        w_np = sess.run(w)
        self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
Example #2
Source File: ac_net.py From reinforcement_learning with MIT License

def __init__(self, state_size, action_size, lr, name, n_h1=400, n_h2=300,
             global_name='global'):
    self.state_size = state_size
    self.action_size = action_size
    self.name = name
    self.n_h1 = n_h1
    self.n_h2 = n_h2
    self.optimizer = tf.train.AdamOptimizer(lr)
    (self.input_s, self.input_a, self.advantage, self.target_v, self.policy,
     self.value, self.action_est, self.model_variables) = self._build_network(name)

    # 0.5, 0.2, 1.0
    self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1])))
    self.entropy_loss = 1.0 * tf.reduce_sum(self.policy * tf.log(self.policy))
    self.policy_loss = 1.0 * tf.reduce_sum(-tf.log(self.action_est) * self.advantage)
    self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.model_variables])
    # self.loss = 0.5 * self.value_loss + self.policy_loss + 0.2 * self.entropy_loss
    self.loss = self.value_loss + self.policy_loss + self.entropy_loss
    self.gradients = tf.gradients(self.loss, self.model_variables)
    if name != global_name:
        self.var_norms = tf.global_norm(self.model_variables)
        global_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, global_name)
        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.gradients, global_variables))
Example #3
Source File: face_attack.py From Adversarial-Face-Attack with GNU General Public License v3.0

def build_pgd_attack(self, eps):
    victim_embeddings = tf.constant(self.victim_embeddings, dtype=tf.float32)

    def one_step_attack(image, grad):
        """
        Core components of this attack are:
        (a) PGD adversarial attack (https://arxiv.org/pdf/1706.06083.pdf)
        (b) momentum (https://arxiv.org/pdf/1710.06081.pdf)
        (c) input diversity (https://arxiv.org/pdf/1803.06978.pdf)
        """
        orig_image = image
        image = self.structure(image)
        image = (image - 127.5) / 128.0
        image = image + tf.random_uniform(tf.shape(image), minval=-1e-2, maxval=1e-2)
        prelogits, _ = self.network.inference(image, 1.0, False, bottleneck_layer_size=512)
        embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings')
        embeddings = tf.reshape(embeddings[0], [512, 1])
        objective = tf.reduce_mean(tf.matmul(victim_embeddings, embeddings))  # to be maximized
        noise, = tf.gradients(objective, orig_image)
        noise = noise / tf.reduce_mean(tf.abs(noise), [1, 2, 3], keep_dims=True)
        noise = 0.9 * grad + noise
        adv = tf.clip_by_value(orig_image + tf.sign(noise) * 1.0, lower_bound, upper_bound)
        return adv, noise

    input = tf.to_float(self.image_batch)
    lower_bound = tf.clip_by_value(input - eps, 0, 255.)
    upper_bound = tf.clip_by_value(input + eps, 0, 255.)

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        adv, _ = tf.while_loop(
            lambda _, __: True, one_step_attack,
            (input, tf.zeros_like(input)),
            back_prop=False,
            maximum_iterations=100,
            parallel_iterations=1)
    self.adv_image = adv
    return adv
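One idiom in the attack above recurs throughout these examples: tf.gradients always returns a Python list with one entry per tensor in xs, so the line noise, = tf.gradients(objective, orig_image) uses trailing-comma unpacking to extract the single gradient. A self-contained illustration of the equivalence (the tensors are made up for this sketch):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 8])
objective = tf.reduce_mean(tf.square(x))

# tf.gradients returns a list even when xs is a single tensor, so the
# trailing-comma unpack and the [0] index extract the same gradient.
g, = tf.gradients(objective, x)
g = tf.gradients(objective, x)[0]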
Example #4
Source File: dqn.py From TransferRL with MIT License

def _add_train_op(self):
    # In regression, the objective loss is Mean Squared Error (MSE).
    self.loss = tf.losses.mean_squared_error(labels=self._y, predictions=self.output)

    tvars = tf.trainable_variables()
    gradients = tf.gradients(self.loss, tvars,
                             aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply the Adam optimizer
    optimizer = tf.train.AdamOptimizer(self._hps.lr)
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step, name='train_step')

    self.variable_summaries('dqn_loss', self.loss)
Example #5
Source File: __init__.py From post--memorization-in-rnns with MIT License

def connectivity(logits, target, embedding, embedding_matrix, offset):
    logits_correct = select_dim_value(logits, target)

    # Compute the partial gradient with respect to the embedding
    partial_gradient = tf.gradients(
        logits_correct[0, offset[0]],
        embedding
    )[0][0, ...]

    # Finalize the chain rule and compute the gradient with respect
    # to the one-hot encoding of the source. Note that the
    # one-hot encoding is not part of the graph, which is why the
    # gradient can't be computed directly this way.
    full_gradient = tf.matmul(partial_gradient, tf.transpose(embedding_matrix))

    connectivity = tf.reduce_sum(full_gradient ** 2, axis=1)
    return tf.reshape(connectivity, [1, -1])
Example #6
Source File: test_attacks.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
    x_val = np.random.rand(1, 2)
    x_val = np.array(x_val, dtype=np.float32)

    self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                            clip_max=-5.0, clip_min=-5.0, xi=1e-6)

    old_grads = tf.gradients

    def fn(*x, **y):
        raise RuntimeError()

    tf.gradients = fn

    self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                            clip_max=-4.0, clip_min=-4.0, xi=1e-5)

    tf.gradients = old_grads
Example #7
Source File: test_attacks_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def test_fgm_gradient_max(self):
    input_dim = 2
    num_classes = 3
    batch_size = 4
    rng = np.random.RandomState([2017, 8, 23])
    x = tf.placeholder(tf.float32, [batch_size, input_dim])
    weights = tf.placeholder(tf.float32, [input_dim, num_classes])
    logits = tf.matmul(x, weights)
    probs = tf.nn.softmax(logits)
    adv_x = fgm(x, probs)
    random_example = rng.randint(batch_size)
    random_feature = rng.randint(input_dim)
    output = tf.slice(adv_x, [random_example, random_feature], [1, 1])
    dx, = tf.gradients(output, x)
    # The following line catches GitHub issue #243
    self.assertIsNotNone(dx)
    dx = self.sess.run(dx, feed_dict=random_feed_dict(rng, [x, weights]))
    ground_truth = np.zeros((batch_size, input_dim))
    ground_truth[random_example, random_feature] = 1.
    self.assertClose(dx, ground_truth)
Example #8
Source File: adem_graphs.py From ADEM with MIT License

def adem(context_vector, model_response_vector, reference_response_vector,
         context_dim, model_response_dim, reference_response_dim,
         human_score_place, lr, max_grad_norm):
    model_score, M, N = tf_dynamic_adem_score(
        context=context_vector,
        model_response=model_response_vector,
        reference_response=reference_response_vector,
        shape_info={'batch_size': None,
                    'ct_dim': context_dim,
                    'mr_dim': model_response_dim,
                    'rr_dim': reference_response_dim})
    loss = compute_adem_l1_loss(human_score_place, model_score, M, N)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(loss, tvars), max_grad_norm)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())
    return train_op, loss, model_score
Example #9
Source File: attacks_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def jacobian_graph(predictions, x, nb_classes):
    """
    Create the Jacobian graph to be run later in a TF session
    :param predictions: the model's symbolic output (linear output,
        pre-softmax)
    :param x: the input placeholder
    :param nb_classes: the number of classes the model has
    :return:
    """
    # This function will return a list of TF gradients
    list_derivatives = []

    # Define the TF graph elements to compute our derivatives for each class
    for class_ind in xrange(nb_classes):
        derivatives, = tf.gradients(predictions[:, class_ind], x)
        list_derivatives.append(derivatives)

    return list_derivatives
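The list that jacobian_graph returns can also be stacked into a single tensor. A brief sketch (the stacking is our addition for illustration, not part of the cleverhans source; predictions, x, and nb_classes are the arguments above):

# Stack the per-class gradients into one [nb_classes, batch, input_dim] tensor.
jacobian = tf.stack(
    [tf.gradients(predictions[:, c], x)[0] for c in range(nb_classes)])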
Example #10
Source File: critic.py From reinforcement_learning with MIT License

def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001):
    self.state_size = state_size
    self.action_size = action_size
    self.optimizer = tf.train.AdamOptimizer(lr)
    self.tau = tau
    self.n_h1 = n_h1
    self.n_h2 = n_h2

    self.input_s, self.action, self.critic_variables, self.q_value = self._build_network("critic")
    self.input_s_target, self.action_target, self.critic_variables_target, self.q_value_target = self._build_network("critic_target")

    self.target = tf.placeholder(tf.float32, [None])
    self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.critic_variables])
    self.loss = tf.reduce_mean(tf.square(self.target - self.q_value)) + 0.01 * self.l2_loss
    self.optimize = self.optimizer.minimize(self.loss)
    self.update_target_op = [
        self.critic_variables_target[i].assign(
            tf.multiply(self.critic_variables[i], self.tau) +
            tf.multiply(self.critic_variables_target[i], 1 - self.tau))
        for i in range(len(self.critic_variables))]
    self.action_gradients = tf.gradients(self.q_value, self.action)
Example #11
Source File: attacks_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def _compute_gradients(self, loss_fn, x, unused_optim_state):
    """Compute a new value of `x` to minimize `loss_fn`.

    Args:
        loss_fn: a callable that takes `x`, a batch of images, and returns
            a batch of loss values. `x` will be optimized to minimize
            `loss_fn(x)`.
        x: A list of Tensors, the values to be updated. This is analogous
            to the `var_list` argument in standard TF Optimizer.
        unused_optim_state: A (possibly nested) dict, containing any state
            info needed for the optimizer.

    Returns:
        new_x: A list of Tensors, the same length as `x`, which are updated
        new_optim_state: A dict, with the same structure as `optim_state`,
            which have been updated.
    """

    # Assumes `x` is a list,
    # and contains a tensor representing a batch of images
    assert len(x) == 1 and isinstance(x, list), \
        'x should be a list and contain only one image tensor'
    x = x[0]
    loss = reduce_mean(loss_fn(x), axis=0)
    return tf.gradients(loss, x)
Example #12
Source File: test_utils_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def test_clip_eta_goldilocks(self):
    # Test that the clipping handles perturbations that are
    # too small, just right, and too big correctly
    eta = tf.constant([[2.], [3.], [4.]])
    assert eta.dtype == tf.float32, eta.dtype
    eps = 3.
    for ord_arg in [np.inf, 1, 2]:
        for sign in [-1., 1.]:
            clipped = clip_eta(eta * sign, ord_arg, eps)
            clipped_value = self.sess.run(clipped)
            gold = sign * np.array([[2.], [3.], [3.]])
            self.assertClose(clipped_value, gold)
            grad, = tf.gradients(clipped, eta)
            grad_value = self.sess.run(grad)
            # Note: the second 1. is debatable (the left-sided derivative
            # and the right-sided derivative do not match, so formally
            # the derivative is not defined). This test makes sure that
            # we at least handle this oddity consistently across all the
            # argument values we test
            gold = sign * np.array([[1.], [1.], [0.]])
            assert np.allclose(grad_value, gold)
Example #13
Source File: model_unsupervise.py From GroundeR with MIT License

def build_train_op(self, loss):
    if self.optim == 'adam':
        print('Adam optimizer')
        v_dict = self.get_variables_by_name([""], True)
        var_list1 = [i for i in v_dict[""] if 'vis_enc' not in i.name]
        var_list2 = self.get_variables_by_name(["vis_enc"], True)
        var_list2 = var_list2["vis_enc"]
        opt1 = tf.train.AdamOptimizer(self.lr, name="Adam")
        opt2 = tf.train.AdamOptimizer(self.lr * 0.1, name="Adam_vis_enc")
        grads = tf.gradients(loss, var_list1 + var_list2)
        grads1 = grads[:len(var_list1)]
        grads2 = grads[len(var_list1):]
        train_op1 = opt1.apply_gradients(zip(grads1, var_list1))
        train_op2 = opt2.apply_gradients(zip(grads2, var_list2))
        train_op = tf.group(train_op1, train_op2)
    else:
        print('SGD optimizer')
        tvars = tf.trainable_variables()
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        grads = tf.gradients(loss, tvars)
        train_op = optimizer.apply_gradients(zip(grads, tvars))
    return train_op
Example #14
Source File: seq2seq_attention_model.py From DOTA_models with Apache License 2.0

def _add_train_op(self):
    """Sets self._train_op, the op to run for training."""
    hps = self._hps

    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    with tf.device(self._get_gpu(self._num_gpus - 1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)

    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #15
Source File: resnet_model.py From DOTA_models with Apache License 2.0

def _build_train_op(self):
    """Build training specific ops for the graph."""
    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
    tf.summary.scalar('learning_rate', self.lrn_rate)

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(self.cost, trainable_variables)

    if self.hps.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
    elif self.hps.optimizer == 'mom':
        optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)

    apply_op = optimizer.apply_gradients(
        zip(grads, trainable_variables),
        global_step=self.global_step, name='train_step')

    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)

# TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
Example #16
Source File: actor.py From reinforcement_learning with MIT License

def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001):
    self.state_size = state_size
    self.action_size = action_size
    self.optimizer = tf.train.AdamOptimizer(lr)
    self.tau = tau
    self.n_h1 = n_h1
    self.n_h2 = n_h2

    self.input_s, self.actor_variables, self.action_values = self._build_network("actor")
    self.input_s_target, self.actor_variables_target, self.action_values_target = self._build_network("actor_target")

    self.action_gradients = tf.placeholder(tf.float32, [None, self.action_size])
    self.actor_gradients = tf.gradients(self.action_values, self.actor_variables,
                                        -self.action_gradients)
    self.update_target_op = [
        self.actor_variables_target[i].assign(
            tf.multiply(self.actor_variables[i], self.tau) +
            tf.multiply(self.actor_variables_target[i], 1 - self.tau))
        for i in range(len(self.actor_variables))]
    self.optimize = self.optimizer.apply_gradients(
        zip(self.actor_gradients, self.actor_variables))
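Example #16 is the actor half of a DDPG setup and pairs with the critic in Example #10: the third positional argument to tf.gradients is grad_ys, which seeds the backward pass, so d(actions)/d(theta) is multiplied by -dQ/da to form the deterministic policy gradient. A minimal sketch of that pattern, assuming a stand-in linear actor rather than the example's network:

import tensorflow as tf

state = tf.placeholder(tf.float32, [None, 4])
theta = tf.Variable(tf.random_normal([4, 2]))
actions = tf.matmul(state, theta)              # stand-in actor output
dq_da = tf.placeholder(tf.float32, [None, 2])  # fed from the critic's
                                               # action_gradients (Example #10)

# Seeding the backprop with -dQ/da makes apply_gradients ascend Q.
policy_grads = tf.gradients(actions, [theta], -dq_da)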
Example #17
Source File: common_layers_test.py From fine-lm with MIT License

def testConvHiddenReluMemoryEfficient(self):
    batch = 3
    length = 23
    io_size = 16
    filter_size = 7
    x = np.random.rand(batch, length, io_size)
    dy = np.random.rand(batch, length, io_size)
    with self.test_session() as session:
        x = tf.to_float(x)
        dy = tf.to_float(dy)
        f1 = tf.get_variable("f1", [1, io_size, filter_size])
        f2 = tf.get_variable("f2", [1, filter_size, io_size])
        norm_scale, norm_bias = common_layers.layer_norm_vars(io_size)
        y = common_layers.conv_hidden_relu_memory_efficient(
            x, filter_size, forget=False,
            test_vars=(f1, f2, norm_scale, norm_bias))
        y_forget = common_layers.conv_hidden_relu_memory_efficient(
            x, filter_size, forget=True,
            test_vars=(f1, f2, norm_scale, norm_bias))
        dx, df1, df2, dnorm_scale, dnorm_bias = tf.gradients(
            ys=[y], xs=[x, f1, f2, norm_scale, norm_bias], grad_ys=[dy])
        dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f = tf.gradients(
            ys=[y_forget], xs=[x, f1, f2, norm_scale, norm_bias], grad_ys=[dy])
        session.run(tf.global_variables_initializer())
        (y, y_forget,
         dx, df1, df2, dnorm_scale, dnorm_bias,
         dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f) = session.run(
             [y, y_forget,
              dx, df1, df2, dnorm_scale, dnorm_bias,
              dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f])
        self.assertAllClose(y, y_forget)
        self.assertAllClose(df2, df2_f)
        self.assertAllClose(df1, df1_f)
        self.assertAllClose(dnorm_scale, dnorm_scale_f)
        self.assertAllClose(dnorm_bias, dnorm_bias_f)
        self.assertAllClose(dx, dx_f)
Example #18
Source File: model.py From TransferRL with MIT License

def _add_shared_train_op(self):
    """Sets self._train_op, the op to run for training."""
    # Take gradients of the trainable variables w.r.t. the loss function to minimize
    loss_to_minimize = self._pgen_loss
    if self._hps.coverage:
        loss_to_minimize = self._pointer_cov_total_loss
    if self._hps.rl_training:
        loss_to_minimize = self._reinforce_shared_loss
        if self._hps.coverage:
            loss_to_minimize = self._reinforce_cov_total_loss

    tvars = tf.trainable_variables()
    gradients = tf.gradients(loss_to_minimize, tvars,
                             aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply adagrad optimizer
    self.epoch = (self.global_step * FLAGS.batch_size) / FLAGS.train_size
    new_lr = tf.cond(tf.greater(self.epoch, 0),
                     lambda: self._hps.lr / tf.cast(self.epoch, tf.float32),
                     lambda: self._hps.lr)
    # new_lr = self._hps.lr / self.epoch if self.epoch > 0 else self._hps.lr
    optimizer = tf.train.AdagradOptimizer(
        new_lr, initial_accumulator_value=self._hps.adagrad_init_acc)
    # optimizer = tf.train.AdamOptimizer()
    self._shared_train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #19
Source File: kfac.py From lirpg with MIT License

def compute_gradients(self, loss, var_list=None):
    varlist = var_list
    if varlist is None:
        varlist = tf.trainable_variables()
    g = tf.gradients(loss, varlist)
    return [(a, b) for a, b in zip(g, varlist)]
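Example #19 above and Example #24 below are the same helper from two baselines forks; both reproduce the (gradient, variable) pair format that tf.train.Optimizer.compute_gradients returns. A sketch of the correspondence (the loss here is illustrative, and both lines assume graph mode):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
w = tf.Variable(tf.zeros([4, 1]))
loss = tf.reduce_mean(tf.matmul(x, w))

# Both produce a list of (grad, var) pairs over the trainable variables.
pairs_manual = list(zip(tf.gradients(loss, tf.trainable_variables()),
                        tf.trainable_variables()))
pairs_builtin = tf.train.AdamOptimizer(1e-3).compute_gradients(loss)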
Example #20
Source File: common_layers_test.py From fine-lm with MIT License

def testCustomGrad(self):
    def fn(a, b, c):
        return tf.layers.dense(a, 10, use_bias=False) + tf.matmul(b, c)

    def grad_fn(inputs, variables, unused_outputs, unused_grad_outputs):
        grad_inputs = [tf.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)]
        grad_vars = [
            tf.ones_like(t) * (i + len(inputs) + 1.)
            for i, t in enumerate(variables)
        ]
        return grad_inputs, grad_vars

    a = tf.random_uniform([11, 6])
    b = tf.random_uniform([11, 7])
    c = tf.random_uniform([7, 10])
    w = tf.random_uniform([6, 10])
    out = common_layers.fn_with_custom_grad(grad_fn)(fn)(a, b, c)
    loss = tf.reduce_mean(out)
    grads = tf.gradients(loss, [a, b, c, tf.trainable_variables()[0]])
    expected_grads = [
        tf.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w])
    ]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        g_val, eg_val = sess.run([grads, expected_grads])
        for g1, g2 in zip(g_val, eg_val):
            self.assertAllClose(g1, g2)
Example #21
Source File: common_layers.py From fine-lm with MIT License

def shakeshake2_grad(x1, x2, dy):
    """Overriding gradient for shake-shake of 2 tensors."""
    y = shakeshake2_py(x1, x2)
    dx = tf.gradients(ys=[y], xs=[x1, x2], grad_ys=[dy])
    return dx
Example #22
Source File: rev_block.py From fine-lm with MIT License

def _acc_grads(*lists_of_grads):
    """Accumulates lists of gradients."""
    acc_grads = []
    for grads in zip(*lists_of_grads):
        grads = [g for g in grads if g is not None]
        if grads:
            acc_grads.append(tf.add_n(grads))
        else:
            acc_grads.append(None)
    return acc_grads
Example #23
Source File: diet_test.py From fine-lm with MIT License

def testDiet(self):
    params = diet.diet_adam_optimizer_params()

    @diet.fn_with_diet_vars(params)
    def model_fn(x):
        y = tf.layers.dense(x, 10, use_bias=False)
        return y

    @diet.fn_with_diet_vars(params)
    def model_fn2(x):
        y = tf.layers.dense(x, 10, use_bias=False)
        return y

    x = tf.random_uniform((10, 10))
    y = model_fn(x) + 10.
    y = model_fn2(y) + 10.
    grads = tf.gradients(y, [x])
    with tf.control_dependencies(grads):
        incr_step = tf.assign_add(tf.train.get_or_create_global_step(), 1)

    train_op = tf.group(incr_step, *grads)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        orig_vals = sess.run(tf.global_variables())
        for _ in range(10):
            sess.run(train_op)
        new_vals = sess.run(tf.global_variables())

        different = []
        for old, new in zip(orig_vals, new_vals):
            try:
                self.assertAllClose(old, new)
            except AssertionError:
                different.append(True)
        self.assertEqual(len(different), len(tf.global_variables()))
Example #24
Source File: kfac.py From HardRLWithYoutube with MIT License

def compute_gradients(self, loss, var_list=None):
    varlist = var_list
    if varlist is None:
        varlist = tf.trainable_variables()
    g = tf.gradients(loss, varlist)
    return [(a, b) for a, b in zip(g, varlist)]
Example #25
Source File: yellowfin.py From fine-lm with MIT License

def compute_gradients(self,
                      loss,
                      var_list,
                      global_step=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      name=None,
                      grad_loss=None):
    """Compute gradients through momentum optimizer.

    Args:
        loss: A Tensor containing the value to minimize.
        var_list: Optional list or tuple of tf.Variable to update to
            minimize loss. Defaults to the list of variables collected in
            the graph under the key GraphKeys.TRAINABLE_VARIABLES.
        global_step: Optional Variable to increment by one after the
            variables have been updated.
        gate_gradients: How to gate the computation of gradients.
            Can be GATE_NONE, GATE_OP, or GATE_GRAPH.
        aggregation_method: Specifies the method used to combine gradient
            terms. Valid values are defined in the class AggregationMethod.
        colocate_gradients_with_ops: If True, try colocating gradients with
            the corresponding op.
        name: Optional name for the returned operation. Defaults to the
            name passed to the Optimizer constructor.
        grad_loss: Optional. A Tensor holding the gradient computed for loss.

    Returns:
        A list of (gradient, variable) pairs. Variable is always present,
        but gradient can be None.
    """
    del global_step, name  # Unused for now.
    return self._momentum_optimizer.compute_gradients(
        loss,
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)
Example #26
Source File: per_example_gradients.py From DOTA_models with Apache License 2.0

def __call__(self, w, z_grads):
    idx = list(self.op.inputs).index(w)
    # Make sure that `op` was actually applied to `w`
    assert idx != -1
    assert len(z_grads) == len(self.op.outputs)
    # The following assert may be removed when we are ready to use this
    # for general purpose code.
    # This assert is only expected to hold in the context of our preliminary
    # MNIST experiments.
    assert idx == 1  # We expect convolution weights to be arg 1

    images, filters = self.op.inputs
    strides = self.op.get_attr("strides")
    padding = self.op.get_attr("padding")
    # Currently assuming that one specifies at most these four arguments and
    # that all other arguments to conv2d are set to default.
    conv, w_px = self._PxConv2DBuilder(images, filters, strides, padding)
    z_grads, = z_grads

    gradients_list = tf.gradients(
        conv, w_px, z_grads,
        colocate_gradients_with_ops=self.colocate_gradients_with_ops,
        gate_gradients=self.gate_gradients)

    return tf.stack(gradients_list)
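Example #26 forwards two of the rarer keyword arguments. For orientation, a paraphrase of the TF 1.x signature of tf.gradients (based on the late-1.x documentation; exact parameters vary slightly across 1.x releases, so treat this as a reference sketch):

# tf.gradients(ys, xs,
#              grad_ys=None,                       # seed gradients (Examples #16, #17, #21)
#              name='gradients',
#              colocate_gradients_with_ops=False,  # place grad ops with their forward ops
#              gate_gradients=False,               # a bool here, unlike the Optimizer enum
#              aggregation_method=None,            # e.g. EXPERIMENTAL_TREE (Examples #4, #18)
#              stop_gradients=None)                # tensors to treat as constants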
Example #27
Source File: model.py From cs294-112_hws with MIT License

def __init__(self, FLAGS, algorithm, expert_returns=None, expert_policy_fn=None):
    print('Initializing the model...')

    if not algorithm.strip().lower() in ['behavioral_cloning', 'dagger']:
        raise NotImplementedError('Algorithm {} not implemented.'.format(algorithm))

    self.FLAGS = FLAGS
    self.algorithm = algorithm.strip().lower()
    self.expert_returns = expert_returns
    self.expert_policy_fn = expert_policy_fn
    if self.algorithm == 'dagger' and self.expert_policy_fn is None:
        raise ValueError('No expert policy found.')
    self.scope = self.algorithm + '_' + time.strftime('%Y-%m-%d-%H-%M-%S')

    with tf.variable_scope(
        self.scope,
        initializer=tf.keras.initializers.he_normal(),
        regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
        reuse=tf.AUTO_REUSE
    ):
        self.add_placeholders()
        self.build_graph()
        self.add_loss()

        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.FLAGS['max_gradient_norm'])
        self.param_norm = tf.global_norm(params)

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        lr = self.FLAGS['learning_rate']
        opt = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.bestmodel_saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.summaries = tf.summary.merge_all()
Example #28
Source File: trust_region.py From DOTA_models with Apache License 2.0

def flatgrad(loss, var_list):
    grads = gradients(loss, var_list)
    return tf.concat(
        [tf.reshape(grad, [-1]) for (v, grad) in zip(var_list, grads)
         if grad is not None], 0)
Example #29
Source File: trust_region.py From DOTA_models with Apache License 2.0

def gradients(loss, var_list):
    grads = tf.gradients(loss, var_list)
    return [g if g is not None else tf.zeros(v.shape)
            for g, v in zip(grads, var_list)]
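Examples #28 and #29 exist because tf.gradients returns None, rather than a zero tensor, for any entry of var_list that the loss does not depend on; the helper above zero-fills those slots so flatgrad can concatenate everything into one flat vector. A minimal illustration with hypothetical variables:

import tensorflow as tf

a = tf.Variable(1.0)
b = tf.Variable(2.0)  # plays no role in the loss

loss = 3.0 * a
grads = tf.gradients(loss, [a, b])  # grads == [<Tensor>, None]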
Example #30
Source File: common_layers.py From fine-lm with MIT License

def smoothing_cross_entropy_factored_grad(op, dy):
    """Gradient function for smoothing_cross_entropy_factored."""
    a = op.inputs[0]
    b = op.inputs[1]
    labels = op.inputs[2]
    confidence = op.inputs[3]
    num_splits = 16
    vocab_size = shape_list(b)[0]
    labels = approximate_split(labels, num_splits)
    a = approximate_split(a, num_splits)
    dy = approximate_split(dy, num_splits)
    b_grad = None
    a_grad_parts = []
    deps = []
    for part in range(num_splits):
        with tf.control_dependencies(deps):
            logits = tf.matmul(a[part], b, transpose_b=True)
            output_part = smoothing_cross_entropy(logits, labels[part],
                                                  vocab_size, confidence)
            a_grad_part, b_grad_part = tf.gradients(
                ys=[output_part], xs=[a[part], b], grad_ys=[dy[part]])
            a_grad_parts.append(a_grad_part)
            if part > 0:
                b_grad += b_grad_part
            else:
                b_grad = b_grad_part
            deps = [b_grad, a_grad_part]
    a_grad = tf.concat(a_grad_parts, 0)
    return a_grad, b_grad, None, None