Python tensorflow.gradients() Examples
The following are 30 code examples of tensorflow.gradients(), collected from open-source projects. The project, source file, and license for each example are listed above it. You may also want to browse the other available functions and classes of the tensorflow module the same way.
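Every example below targets the TensorFlow 1.x graph-mode API. As a minimal orientation sketch before the examples (the tensors here are illustrative, not taken from any project below): tf.gradients(ys, xs) builds symbolic gradient ops and returns a list with exactly one gradient tensor per entry in xs.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 3])
w = tf.Variable(tf.ones([3, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

# One returned gradient per entry in xs, in the same order.
dloss_dx, dloss_dw = tf.gradients(loss, [x, w])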
Example #1
Source File: optimization_test.py From BERT-Classification-Tutorial with Apache License 2.0

def test_adam(self):
    with self.test_session() as sess:
        w = tf.get_variable(
            "w",
            shape=[3],
            initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
        x = tf.constant([0.4, 0.2, -0.5])
        loss = tf.reduce_mean(tf.square(x - w))
        tvars = tf.trainable_variables()
        grads = tf.gradients(loss, tvars)
        global_step = tf.train.get_or_create_global_step()
        optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        for _ in range(100):
            sess.run(train_op)
        w_np = sess.run(w)
        self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
Example #2
Source File: ac_net.py From reinforcement_learning with MIT License

def __init__(self, state_size, action_size, lr, name, n_h1=400, n_h2=300,
             global_name='global'):
    self.state_size = state_size
    self.action_size = action_size
    self.name = name
    self.n_h1 = n_h1
    self.n_h2 = n_h2
    self.optimizer = tf.train.AdamOptimizer(lr)
    (self.input_s, self.input_a, self.advantage, self.target_v, self.policy,
     self.value, self.action_est, self.model_variables) = self._build_network(name)

    # 0.5, 0.2, 1.0
    self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1])))
    self.entropy_loss = 1.0 * tf.reduce_sum(self.policy * tf.log(self.policy))
    self.policy_loss = 1.0 * tf.reduce_sum(-tf.log(self.action_est) * self.advantage)
    self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.model_variables])
    # self.loss = 0.5 * self.value_loss + self.policy_loss + 0.2 * self.entropy_loss
    self.loss = self.value_loss + self.policy_loss + self.entropy_loss
    self.gradients = tf.gradients(self.loss, self.model_variables)
    if name != global_name:
        self.var_norms = tf.global_norm(self.model_variables)
        global_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, global_name)
        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.gradients, global_variables))
Example #3
Source File: face_attack.py From Adversarial-Face-Attack with GNU General Public License v3.0

def build_pgd_attack(self, eps):
    victim_embeddings = tf.constant(self.victim_embeddings, dtype=tf.float32)

    def one_step_attack(image, grad):
        """
        Core components of this attack are:
        (a) PGD adversarial attack (https://arxiv.org/pdf/1706.06083.pdf)
        (b) momentum (https://arxiv.org/pdf/1710.06081.pdf)
        (c) input diversity (https://arxiv.org/pdf/1803.06978.pdf)
        """
        orig_image = image
        image = self.structure(image)
        image = (image - 127.5) / 128.0
        image = image + tf.random_uniform(tf.shape(image), minval=-1e-2, maxval=1e-2)
        prelogits, _ = self.network.inference(image, 1.0, False, bottleneck_layer_size=512)
        embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings')
        embeddings = tf.reshape(embeddings[0], [512, 1])
        objective = tf.reduce_mean(tf.matmul(victim_embeddings, embeddings))  # to be maximized
        noise, = tf.gradients(objective, orig_image)
        noise = noise / tf.reduce_mean(tf.abs(noise), [1, 2, 3], keep_dims=True)
        noise = 0.9 * grad + noise
        adv = tf.clip_by_value(orig_image + tf.sign(noise) * 1.0, lower_bound, upper_bound)
        return adv, noise

    input = tf.to_float(self.image_batch)
    lower_bound = tf.clip_by_value(input - eps, 0, 255.)
    upper_bound = tf.clip_by_value(input + eps, 0, 255.)

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        adv, _ = tf.while_loop(
            lambda _, __: True, one_step_attack,
            (input, tf.zeros_like(input)),
            back_prop=False,
            maximum_iterations=100,
            parallel_iterations=1)
    self.adv_image = adv
    return adv
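One idiom in the attack above recurs throughout these examples: tf.gradients always returns a Python list with one entry per tensor in xs, so the line noise, = tf.gradients(objective, orig_image) uses trailing-comma unpacking to extract the single gradient. A self-contained illustration of the equivalence (the tensors are made up for this sketch):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 8])
objective = tf.reduce_mean(tf.square(x))

# tf.gradients returns a list even when xs is a single tensor, so the
# trailing-comma unpack and the [0] index extract the same gradient.
g, = tf.gradients(objective, x)
g = tf.gradients(objective, x)[0]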
Example #4
Source File: dqn.py From TransferRL with MIT License

def _add_train_op(self):
    # In regression, the objective loss is Mean Squared Error (MSE).
    self.loss = tf.losses.mean_squared_error(labels=self._y, predictions=self.output)

    tvars = tf.trainable_variables()
    gradients = tf.gradients(self.loss, tvars,
                             aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply the Adam optimizer
    optimizer = tf.train.AdamOptimizer(self._hps.lr)
    with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step, name='train_step')

    self.variable_summaries('dqn_loss', self.loss)
Example #5
Source File: __init__.py From post--memorization-in-rnns with MIT License

def connectivity(logits, target, embedding, embedding_matrix, offset):
    logits_correct = select_dim_value(logits, target)

    # Compute the partial gradient with respect to the embedding
    partial_gradient = tf.gradients(
        logits_correct[0, offset[0]],
        embedding
    )[0][0, ...]

    # Finalize the chain rule and compute the gradient with respect
    # to the one-hot encoding of the source. Note that the
    # one-hot encoding is not part of the graph, which is why the
    # gradient can't be computed directly this way.
    full_gradient = tf.matmul(partial_gradient, tf.transpose(embedding_matrix))

    connectivity = tf.reduce_sum(full_gradient ** 2, axis=1)
    return tf.reshape(connectivity, [1, -1])
Example #6
Source File: test_attacks.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
    x_val = np.random.rand(1, 2)
    x_val = np.array(x_val, dtype=np.float32)

    self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                            clip_max=-5.0, clip_min=-5.0, xi=1e-6)

    old_grads = tf.gradients

    def fn(*x, **y):
        raise RuntimeError()

    tf.gradients = fn

    self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                            clip_max=-4.0, clip_min=-4.0, xi=1e-5)

    tf.gradients = old_grads
Example #7
Source File: test_attacks_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def test_fgm_gradient_max(self):
    input_dim = 2
    num_classes = 3
    batch_size = 4
    rng = np.random.RandomState([2017, 8, 23])
    x = tf.placeholder(tf.float32, [batch_size, input_dim])
    weights = tf.placeholder(tf.float32, [input_dim, num_classes])
    logits = tf.matmul(x, weights)
    probs = tf.nn.softmax(logits)
    adv_x = fgm(x, probs)
    random_example = rng.randint(batch_size)
    random_feature = rng.randint(input_dim)
    output = tf.slice(adv_x, [random_example, random_feature], [1, 1])
    dx, = tf.gradients(output, x)
    # The following line catches GitHub issue #243
    self.assertIsNotNone(dx)
    dx = self.sess.run(dx, feed_dict=random_feed_dict(rng, [x, weights]))
    ground_truth = np.zeros((batch_size, input_dim))
    ground_truth[random_example, random_feature] = 1.
    self.assertClose(dx, ground_truth)
Example #8
Source File: adem_graphs.py From ADEM with MIT License

def adem(context_vector, model_response_vector, reference_response_vector,
         context_dim, model_response_dim, reference_response_dim,
         human_score_place, lr, max_grad_norm):
    model_score, M, N = tf_dynamic_adem_score(
        context=context_vector,
        model_response=model_response_vector,
        reference_response=reference_response_vector,
        shape_info={'batch_size': None,
                    'ct_dim': context_dim,
                    'mr_dim': model_response_dim,
                    'rr_dim': reference_response_dim})
    loss = compute_adem_l1_loss(human_score_place, model_score, M, N)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(loss, tvars), max_grad_norm)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())
    return train_op, loss, model_score
Example #9
Source File: attacks_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def jacobian_graph(predictions, x, nb_classes):
    """
    Create the Jacobian graph to be run later in a TF session
    :param predictions: the model's symbolic output (linear output,
        pre-softmax)
    :param x: the input placeholder
    :param nb_classes: the number of classes the model has
    :return:
    """
    # This function will return a list of TF gradients
    list_derivatives = []

    # Define the TF graph elements to compute our derivatives for each class
    for class_ind in xrange(nb_classes):
        derivatives, = tf.gradients(predictions[:, class_ind], x)
        list_derivatives.append(derivatives)

    return list_derivatives
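The list that jacobian_graph returns can also be stacked into a single tensor. A brief sketch (the stacking is our addition for illustration, not part of the cleverhans source; predictions, x, and nb_classes are the arguments above):

# Stack the per-class gradients into one [nb_classes, batch, input_dim] tensor.
jacobian = tf.stack(
    [tf.gradients(predictions[:, c], x)[0] for c in range(nb_classes)])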
Example #10
Source File: critic.py From reinforcement_learning with MIT License

def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001):
    self.state_size = state_size
    self.action_size = action_size
    self.optimizer = tf.train.AdamOptimizer(lr)
    self.tau = tau
    self.n_h1 = n_h1
    self.n_h2 = n_h2

    self.input_s, self.action, self.critic_variables, self.q_value = self._build_network("critic")
    self.input_s_target, self.action_target, self.critic_variables_target, self.q_value_target = self._build_network("critic_target")

    self.target = tf.placeholder(tf.float32, [None])
    self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.critic_variables])
    self.loss = tf.reduce_mean(tf.square(self.target - self.q_value)) + 0.01 * self.l2_loss
    self.optimize = self.optimizer.minimize(self.loss)
    self.update_target_op = [
        self.critic_variables_target[i].assign(
            tf.multiply(self.critic_variables[i], self.tau) +
            tf.multiply(self.critic_variables_target[i], 1 - self.tau))
        for i in range(len(self.critic_variables))]
    self.action_gradients = tf.gradients(self.q_value, self.action)
Example #11
Source File: attacks_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def _compute_gradients(self, loss_fn, x, unused_optim_state):
    """Compute a new value of `x` to minimize `loss_fn`.

    Args:
        loss_fn: a callable that takes `x`, a batch of images, and returns
            a batch of loss values. `x` will be optimized to minimize
            `loss_fn(x)`.
        x: A list of Tensors, the values to be updated. This is analogous
            to the `var_list` argument in standard TF Optimizer.
        unused_optim_state: A (possibly nested) dict, containing any state
            info needed for the optimizer.

    Returns:
        new_x: A list of Tensors, the same length as `x`, which are updated
        new_optim_state: A dict, with the same structure as `optim_state`,
            which have been updated.
    """

    # Assumes `x` is a list,
    # and contains a tensor representing a batch of images
    assert len(x) == 1 and isinstance(x, list), \
        'x should be a list and contain only one image tensor'
    x = x[0]
    loss = reduce_mean(loss_fn(x), axis=0)
    return tf.gradients(loss, x)
Example #12
Source File: test_utils_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License

def test_clip_eta_goldilocks(self):
    # Test that the clipping handles perturbations that are
    # too small, just right, and too big correctly
    eta = tf.constant([[2.], [3.], [4.]])
    assert eta.dtype == tf.float32, eta.dtype
    eps = 3.
    for ord_arg in [np.inf, 1, 2]:
        for sign in [-1., 1.]:
            clipped = clip_eta(eta * sign, ord_arg, eps)
            clipped_value = self.sess.run(clipped)
            gold = sign * np.array([[2.], [3.], [3.]])
            self.assertClose(clipped_value, gold)
            grad, = tf.gradients(clipped, eta)
            grad_value = self.sess.run(grad)
            # Note: the second 1. is debatable (the left-sided derivative
            # and the right-sided derivative do not match, so formally
            # the derivative is not defined). This test makes sure that
            # we at least handle this oddity consistently across all the
            # argument values we test
            gold = sign * np.array([[1.], [1.], [0.]])
            assert np.allclose(grad_value, gold)
Example #13
Source File: model_unsupervise.py From GroundeR with MIT License

def build_train_op(self, loss):
    if self.optim == 'adam':
        print('Adam optimizer')
        v_dict = self.get_variables_by_name([""], True)
        var_list1 = [i for i in v_dict[""] if 'vis_enc' not in i.name]
        var_list2 = self.get_variables_by_name(["vis_enc"], True)
        var_list2 = var_list2["vis_enc"]
        opt1 = tf.train.AdamOptimizer(self.lr, name="Adam")
        opt2 = tf.train.AdamOptimizer(self.lr * 0.1, name="Adam_vis_enc")
        grads = tf.gradients(loss, var_list1 + var_list2)
        grads1 = grads[:len(var_list1)]
        grads2 = grads[len(var_list1):]
        train_op1 = opt1.apply_gradients(zip(grads1, var_list1))
        train_op2 = opt2.apply_gradients(zip(grads2, var_list2))
        train_op = tf.group(train_op1, train_op2)
    else:
        print('SGD optimizer')
        tvars = tf.trainable_variables()
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        grads = tf.gradients(loss, tvars)
        train_op = optimizer.apply_gradients(zip(grads, tvars))
    return train_op
Example #14
Source File: seq2seq_attention_model.py From DOTA_models with Apache License 2.0

def _add_train_op(self):
    """Sets self._train_op, the op to run for training."""
    hps = self._hps

    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    with tf.device(self._get_gpu(self._num_gpus - 1)):
        grads, global_norm = tf.clip_by_global_norm(
            tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)

    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #15
Source File: resnet_model.py From DOTA_models with Apache License 2.0

def _build_train_op(self):
    """Build training specific ops for the graph."""
    self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
    tf.summary.scalar('learning_rate', self.lrn_rate)

    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(self.cost, trainable_variables)

    if self.hps.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
    elif self.hps.optimizer == 'mom':
        optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)

    apply_op = optimizer.apply_gradients(
        zip(grads, trainable_variables),
        global_step=self.global_step, name='train_step')

    train_ops = [apply_op] + self._extra_train_ops
    self.train_op = tf.group(*train_ops)

# TODO(xpan): Consider batch_norm in contrib/layers/python/layers/layers.py
Example #16
Source File: actor.py From reinforcement_learning with MIT License

def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001):
    self.state_size = state_size
    self.action_size = action_size
    self.optimizer = tf.train.AdamOptimizer(lr)
    self.tau = tau
    self.n_h1 = n_h1
    self.n_h2 = n_h2

    self.input_s, self.actor_variables, self.action_values = self._build_network("actor")
    self.input_s_target, self.actor_variables_target, self.action_values_target = self._build_network("actor_target")

    self.action_gradients = tf.placeholder(tf.float32, [None, self.action_size])
    self.actor_gradients = tf.gradients(self.action_values, self.actor_variables,
                                        -self.action_gradients)
    self.update_target_op = [
        self.actor_variables_target[i].assign(
            tf.multiply(self.actor_variables[i], self.tau) +
            tf.multiply(self.actor_variables_target[i], 1 - self.tau))
        for i in range(len(self.actor_variables))]
    self.optimize = self.optimizer.apply_gradients(
        zip(self.actor_gradients, self.actor_variables))
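Example #16 is the actor half of a DDPG setup and pairs with the critic in Example #10: the third positional argument to tf.gradients is grad_ys, which seeds the backward pass, so d(actions)/d(theta) is multiplied by -dQ/da to form the deterministic policy gradient. A minimal sketch of that pattern, assuming a stand-in linear actor rather than the example's network:

import tensorflow as tf

state = tf.placeholder(tf.float32, [None, 4])
theta = tf.Variable(tf.random_normal([4, 2]))
actions = tf.matmul(state, theta)              # stand-in actor output
dq_da = tf.placeholder(tf.float32, [None, 2])  # fed from the critic's
                                               # action_gradients (Example #10)

# Seeding the backprop with -dQ/da makes apply_gradients ascend Q.
policy_grads = tf.gradients(actions, [theta], -dq_da)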
Example #17
Source File: common_layers_test.py From fine-lm with MIT License

def testConvHiddenReluMemoryEfficient(self):
    batch = 3
    length = 23
    io_size = 16
    filter_size = 7
    x = np.random.rand(batch, length, io_size)
    dy = np.random.rand(batch, length, io_size)
    with self.test_session() as session:
        x = tf.to_float(x)
        dy = tf.to_float(dy)
        f1 = tf.get_variable("f1", [1, io_size, filter_size])
        f2 = tf.get_variable("f2", [1, filter_size, io_size])
        norm_scale, norm_bias = common_layers.layer_norm_vars(io_size)
        y = common_layers.conv_hidden_relu_memory_efficient(
            x, filter_size, forget=False,
            test_vars=(f1, f2, norm_scale, norm_bias))
        y_forget = common_layers.conv_hidden_relu_memory_efficient(
            x, filter_size, forget=True,
            test_vars=(f1, f2, norm_scale, norm_bias))
        dx, df1, df2, dnorm_scale, dnorm_bias = tf.gradients(
            ys=[y], xs=[x, f1, f2, norm_scale, norm_bias], grad_ys=[dy])
        dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f = tf.gradients(
            ys=[y_forget], xs=[x, f1, f2, norm_scale, norm_bias], grad_ys=[dy])
        session.run(tf.global_variables_initializer())
        (y, y_forget,
         dx, df1, df2, dnorm_scale, dnorm_bias,
         dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f) = session.run(
             [y, y_forget,
              dx, df1, df2, dnorm_scale, dnorm_bias,
              dx_f, df1_f, df2_f, dnorm_scale_f, dnorm_bias_f])
        self.assertAllClose(y, y_forget)
        self.assertAllClose(df2, df2_f)
        self.assertAllClose(df1, df1_f)
        self.assertAllClose(dnorm_scale, dnorm_scale_f)
        self.assertAllClose(dnorm_bias, dnorm_bias_f)
        self.assertAllClose(dx, dx_f)
Example #18
Source File: model.py From TransferRL with MIT License

def _add_shared_train_op(self):
    """Sets self._train_op, the op to run for training."""
    # Take gradients of the trainable variables w.r.t. the loss function to minimize
    loss_to_minimize = self._pgen_loss
    if self._hps.coverage:
        loss_to_minimize = self._pointer_cov_total_loss
    if self._hps.rl_training:
        loss_to_minimize = self._reinforce_shared_loss
        if self._hps.coverage:
            loss_to_minimize = self._reinforce_cov_total_loss

    tvars = tf.trainable_variables()
    gradients = tf.gradients(loss_to_minimize, tvars,
                             aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply adagrad optimizer
    self.epoch = (self.global_step * FLAGS.batch_size) / FLAGS.train_size
    new_lr = tf.cond(tf.greater(self.epoch, 0),
                     lambda: self._hps.lr / tf.cast(self.epoch, tf.float32),
                     lambda: self._hps.lr)
    # new_lr = self._hps.lr / self.epoch if self.epoch > 0 else self._hps.lr
    optimizer = tf.train.AdagradOptimizer(
        new_lr, initial_accumulator_value=self._hps.adagrad_init_acc)
    # optimizer = tf.train.AdamOptimizer()
    self._shared_train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')
Example #19
Source File: kfac.py From lirpg with MIT License

def compute_gradients(self, loss, var_list=None):
    varlist = var_list
    if varlist is None:
        varlist = tf.trainable_variables()
    g = tf.gradients(loss, varlist)
    return [(a, b) for a, b in zip(g, varlist)]
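Example #19 above and Example #24 below are the same helper from two baselines forks; both reproduce the (gradient, variable) pair format that tf.train.Optimizer.compute_gradients returns. A sketch of the correspondence (the loss here is illustrative, and both lines assume graph mode):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
w = tf.Variable(tf.zeros([4, 1]))
loss = tf.reduce_mean(tf.matmul(x, w))

# Both produce a list of (grad, var) pairs over the trainable variables.
pairs_manual = list(zip(tf.gradients(loss, tf.trainable_variables()),
                        tf.trainable_variables()))
pairs_builtin = tf.train.AdamOptimizer(1e-3).compute_gradients(loss)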
Example #20
Source File: common_layers_test.py From fine-lm with MIT License

def testCustomGrad(self):
    def fn(a, b, c):
        return tf.layers.dense(a, 10, use_bias=False) + tf.matmul(b, c)

    def grad_fn(inputs, variables, unused_outputs, unused_grad_outputs):
        grad_inputs = [tf.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)]
        grad_vars = [
            tf.ones_like(t) * (i + len(inputs) + 1.)
            for i, t in enumerate(variables)
        ]
        return grad_inputs, grad_vars

    a = tf.random_uniform([11, 6])
    b = tf.random_uniform([11, 7])
    c = tf.random_uniform([7, 10])
    w = tf.random_uniform([6, 10])
    out = common_layers.fn_with_custom_grad(grad_fn)(fn)(a, b, c)
    loss = tf.reduce_mean(out)
    grads = tf.gradients(loss, [a, b, c, tf.trainable_variables()[0]])
    expected_grads = [
        tf.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w])
    ]
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        g_val, eg_val = sess.run([grads, expected_grads])
        for g1, g2 in zip(g_val, eg_val):
            self.assertAllClose(g1, g2)
Example #21
Source File: common_layers.py From fine-lm with MIT License

def shakeshake2_grad(x1, x2, dy):
    """Overriding gradient for shake-shake of 2 tensors."""
    y = shakeshake2_py(x1, x2)
    dx = tf.gradients(ys=[y], xs=[x1, x2], grad_ys=[dy])
    return dx
Example #22
Source File: rev_block.py From fine-lm with MIT License

def _acc_grads(*lists_of_grads):
    """Accumulates lists of gradients."""
    acc_grads = []
    for grads in zip(*lists_of_grads):
        grads = [g for g in grads if g is not None]
        if grads:
            acc_grads.append(tf.add_n(grads))
        else:
            acc_grads.append(None)
    return acc_grads
Example #23
Source File: diet_test.py From fine-lm with MIT License

def testDiet(self):
    params = diet.diet_adam_optimizer_params()

    @diet.fn_with_diet_vars(params)
    def model_fn(x):
        y = tf.layers.dense(x, 10, use_bias=False)
        return y

    @diet.fn_with_diet_vars(params)
    def model_fn2(x):
        y = tf.layers.dense(x, 10, use_bias=False)
        return y

    x = tf.random_uniform((10, 10))
    y = model_fn(x) + 10.
    y = model_fn2(y) + 10.
    grads = tf.gradients(y, [x])
    with tf.control_dependencies(grads):
        incr_step = tf.assign_add(tf.train.get_or_create_global_step(), 1)

    train_op = tf.group(incr_step, *grads)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        orig_vals = sess.run(tf.global_variables())
        for _ in range(10):
            sess.run(train_op)
        new_vals = sess.run(tf.global_variables())

        different = []
        for old, new in zip(orig_vals, new_vals):
            try:
                self.assertAllClose(old, new)
            except AssertionError:
                different.append(True)
        self.assertEqual(len(different), len(tf.global_variables()))
Example #24
Source File: kfac.py From HardRLWithYoutube with MIT License

def compute_gradients(self, loss, var_list=None):
    varlist = var_list
    if varlist is None:
        varlist = tf.trainable_variables()
    g = tf.gradients(loss, varlist)
    return [(a, b) for a, b in zip(g, varlist)]
Example #25
Source File: yellowfin.py From fine-lm with MIT License

def compute_gradients(self,
                      loss,
                      var_list,
                      global_step=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      name=None,
                      grad_loss=None):
    """Compute gradients through momentum optimizer.

    Args:
        loss: A Tensor containing the value to minimize.
        var_list: Optional list or tuple of tf.Variable to update to
            minimize loss. Defaults to the list of variables collected in
            the graph under the key GraphKeys.TRAINABLE_VARIABLES.
        global_step: Optional Variable to increment by one after the
            variables have been updated.
        gate_gradients: How to gate the computation of gradients.
            Can be GATE_NONE, GATE_OP, or GATE_GRAPH.
        aggregation_method: Specifies the method used to combine gradient
            terms. Valid values are defined in the class AggregationMethod.
        colocate_gradients_with_ops: If True, try colocating gradients with
            the corresponding op.
        name: Optional name for the returned operation. Defaults to the
            name passed to the Optimizer constructor.
        grad_loss: Optional. A Tensor holding the gradient computed for loss.

    Returns:
        A list of (gradient, variable) pairs. Variable is always present,
        but gradient can be None.
    """
    del global_step, name  # Unused for now.
    return self._momentum_optimizer.compute_gradients(
        loss,
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)
Example #26
Source File: per_example_gradients.py From DOTA_models with Apache License 2.0

def __call__(self, w, z_grads):
    idx = list(self.op.inputs).index(w)
    # Make sure that `op` was actually applied to `w`
    assert idx != -1
    assert len(z_grads) == len(self.op.outputs)
    # The following assert may be removed when we are ready to use this
    # for general purpose code.
    # This assert is only expected to hold in the context of our preliminary
    # MNIST experiments.
    assert idx == 1  # We expect convolution weights to be arg 1

    images, filters = self.op.inputs
    strides = self.op.get_attr("strides")
    padding = self.op.get_attr("padding")
    # Currently assuming that one specifies at most these four arguments and
    # that all other arguments to conv2d are set to default.
    conv, w_px = self._PxConv2DBuilder(images, filters, strides, padding)
    z_grads, = z_grads

    gradients_list = tf.gradients(
        conv, w_px, z_grads,
        colocate_gradients_with_ops=self.colocate_gradients_with_ops,
        gate_gradients=self.gate_gradients)

    return tf.stack(gradients_list)
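Example #26 forwards two of the rarer keyword arguments. For orientation, a paraphrase of the TF 1.x signature of tf.gradients (based on the late-1.x documentation; exact parameters vary slightly across 1.x releases, so treat this as a reference sketch):

# tf.gradients(ys, xs,
#              grad_ys=None,                       # seed gradients (Examples #16, #17, #21)
#              name='gradients',
#              colocate_gradients_with_ops=False,  # place grad ops with their forward ops
#              gate_gradients=False,               # a bool here, unlike the Optimizer enum
#              aggregation_method=None,            # e.g. EXPERIMENTAL_TREE (Examples #4, #18)
#              stop_gradients=None)                # tensors to treat as constants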
Example #27
Source File: model.py From cs294-112_hws with MIT License

def __init__(self, FLAGS, algorithm, expert_returns=None, expert_policy_fn=None):
    print('Initializing the model...')

    if not algorithm.strip().lower() in ['behavioral_cloning', 'dagger']:
        raise NotImplementedError('Algorithm {} not implemented.'.format(algorithm))

    self.FLAGS = FLAGS
    self.algorithm = algorithm.strip().lower()
    self.expert_returns = expert_returns
    self.expert_policy_fn = expert_policy_fn
    if self.algorithm == 'dagger' and self.expert_policy_fn is None:
        raise ValueError('No expert policy found.')
    self.scope = self.algorithm + '_' + time.strftime('%Y-%m-%d-%H-%M-%S')

    with tf.variable_scope(
        self.scope,
        initializer=tf.keras.initializers.he_normal(),
        regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
        reuse=tf.AUTO_REUSE
    ):
        self.add_placeholders()
        self.build_graph()
        self.add_loss()

        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.FLAGS['max_gradient_norm'])
        self.param_norm = tf.global_norm(params)

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        lr = self.FLAGS['learning_rate']
        opt = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.bestmodel_saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    self.summaries = tf.summary.merge_all()
Example #28
Source File: trust_region.py From DOTA_models with Apache License 2.0

def flatgrad(loss, var_list):
    grads = gradients(loss, var_list)
    return tf.concat(
        [tf.reshape(grad, [-1]) for (v, grad) in zip(var_list, grads)
         if grad is not None], 0)
Example #29
Source File: trust_region.py From DOTA_models with Apache License 2.0

def gradients(loss, var_list):
    grads = tf.gradients(loss, var_list)
    return [g if g is not None else tf.zeros(v.shape)
            for g, v in zip(grads, var_list)]
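Examples #28 and #29 exist because tf.gradients returns None, rather than a zero tensor, for any entry of var_list that the loss does not depend on; the helper above zero-fills those slots so flatgrad can concatenate everything into one flat vector. A minimal illustration with hypothetical variables:

import tensorflow as tf

a = tf.Variable(1.0)
b = tf.Variable(2.0)  # plays no role in the loss

loss = 3.0 * a
grads = tf.gradients(loss, [a, b])  # grads == [<Tensor>, None]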
Example #30
Source File: common_layers.py From fine-lm with MIT License

def smoothing_cross_entropy_factored_grad(op, dy):
    """Gradient function for smoothing_cross_entropy_factored."""
    a = op.inputs[0]
    b = op.inputs[1]
    labels = op.inputs[2]
    confidence = op.inputs[3]
    num_splits = 16
    vocab_size = shape_list(b)[0]
    labels = approximate_split(labels, num_splits)
    a = approximate_split(a, num_splits)
    dy = approximate_split(dy, num_splits)
    b_grad = None
    a_grad_parts = []
    deps = []
    for part in range(num_splits):
        with tf.control_dependencies(deps):
            logits = tf.matmul(a[part], b, transpose_b=True)
            output_part = smoothing_cross_entropy(logits, labels[part],
                                                  vocab_size, confidence)
            a_grad_part, b_grad_part = tf.gradients(
                ys=[output_part], xs=[a[part], b], grad_ys=[dy[part]])
            a_grad_parts.append(a_grad_part)
            if part > 0:
                b_grad += b_grad_part
            else:
                b_grad = b_grad_part
            deps = [b_grad, a_grad_part]
    a_grad = tf.concat(a_grad_parts, 0)
    return a_grad, b_grad, None, None