Python Examples of tensorflow.compat.v2.GradientTape

Source File: extensions_test.py From trax with Apache License 2.0

7 votes

def testGrad(self):
  """Compares `extensions.grad` with a gradient taken via `tf.GradientTape`."""

  def objective(a, b):
    exp_a = tf_np.exp(a)
    return tf_np.sum(tf_np.sqrt(exp_a) + b)

  grad_fn = extensions.grad(objective)

  def check(lhs, rhs):
    # Reference gradient with respect to the first argument only.
    with tf.GradientTape() as ref_tape:
      ref_tape.watch(lhs.data)
      result = objective(lhs, rhs)
    reference = ref_tape.gradient(result.data, lhs.data)
    self.assertAllEqual(reference, grad_fn(lhs, rhs))

  dims = [10]
  first = tf_np.random.randn(*dims)
  second = tf_np.random.randn(*dims)
  check(first, second)

Source File: export.py From hub with Apache License 2.0

6 votes

def train_step(model, loss_fn, optimizer_fn, metric, image, label):
  """Run one gradient update on `model` and feed the step loss to `metric`.

  Args:
    model: Keras model to train.
    loss_fn: Loss function to use; called as `loss_fn(one_hot_labels, preds)`.
    optimizer_fn: Optimizer function to use; its `apply_gradients` is called
      with (gradient, variable) pairs.
    metric: keras.metric to use; called with the step loss.
    image: Tensor of training images of shape [batch_size, 28, 28, 1].
    label: Tensor of class labels of shape [batch_size].
  """
  with tf.GradientTape() as tape:
    predictions = model(image)
    # Labels are expanded to one-hot vectors of depth 10.
    one_hot_labels = tf.one_hot(label, 10)
    step_loss = loss_fn(one_hot_labels, predictions)
  # Look the variable list up once (after the forward pass) and reuse it.
  trainable = model.trainable_variables
  gradients = tape.gradient(step_loss, trainable)
  optimizer_fn.apply_gradients(zip(gradients, trainable))
  metric(step_loss)

Source File: test_util.py From spectral-density with Apache License 2.0

6 votes

def hessian(function: Callable[[Parameters], tf.Tensor],
            parameters: Parameters) -> Parameters:
  """Builds the full Hessian of `function` at `parameters`.

  Intended for tests; the cost grows very quickly with the parameter count.

  Args:
    function: The function whose Hessian is wanted.
    parameters: The parameters the Hessian is taken with respect to.

  Returns:
    A tensor or list of tensors of same nested structure as `Parameters`,
      representing the Hessian.
  """
  with tf.GradientTape() as jacobian_tape:
    # The first-order gradient is computed while the outer tape is recording,
    # so that it can be differentiated a second time below.
    with tf.GradientTape() as gradient_tape:
      output = function(parameters)
    first_order = gradient_tape.gradient(output, parameters)
    flat_gradient = tensor_list_util.tensor_list_to_vector(first_order)
  return jacobian_tape.jacobian(flat_gradient, parameters)

Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0

6 votes

def test_multiple_state_vars(self):
  """Differentiates a three-variable `for_loop` state w.r.t. `alpha` and `beta`."""
  start_x = tf.constant([3.0, 4.0])
  start_y = tf.constant([5.0, 6.0])
  start_z = tf.constant([7.0, 8.0])
  alpha = tf.constant(2.0)
  beta = tf.constant(1.0)

  def step(i, state):
    cur_x, cur_y, cur_z = state
    k = tf.cast(i + 1, tf.float32)
    next_x = cur_x * alpha - beta
    next_y = cur_y * k * alpha * beta
    next_z = cur_z * beta + cur_x
    return [next_x, next_y, next_z]

  # The tape is persistent because two separate gradients are taken from it.
  with tf.GradientTape(persistent=True) as tape:
    tape.watch([alpha, beta])
    out = for_loop(step, [start_x, start_y, start_z], [alpha, beta], 3)

  with self.subTest("independent_vars"):
    d_out1_d_alpha = tape.gradient(out[1], alpha)
    self.assertAllEqual(792, d_out1_d_alpha)
  with self.subTest("dependent_vars"):
    d_out2_d_beta = tape.gradient(out[2], beta)
    self.assertAllEqual(63, d_out2_d_beta)

Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0

6 votes

def test_batching(self):
  """Same as the multi-state test, but with a leading batch dimension of 2."""
  start_x = tf.constant([[3.0, 4.0], [30.0, 40.0]])
  start_y = tf.constant([[5.0, 6.0], [50.0, 60.0]])
  start_z = tf.constant([[7.0, 8.0], [70.0, 80.0]])
  alpha = tf.constant(2.0)
  beta = tf.constant(1.0)

  def step(i, state):
    cur_x, cur_y, cur_z = state
    k = tf.cast(i + 1, tf.float32)
    next_x = cur_x * alpha - beta
    next_y = cur_y * k * alpha * beta
    next_z = cur_z * beta + cur_x
    return [next_x, next_y, next_z]

  # The tape is persistent because two separate gradients are taken from it.
  with tf.GradientTape(persistent=True) as tape:
    tape.watch([alpha, beta])
    out = for_loop(step, [start_x, start_y, start_z], [alpha, beta], 3)

  with self.subTest("independent_vars"):
    d_out1_d_alpha = tape.gradient(out[1], alpha)
    self.assertAllEqual(8712, d_out1_d_alpha)
  with self.subTest("dependent_vars"):
    d_out2_d_beta = tape.gradient(out[2], beta)
    self.assertAllEqual(783, d_out2_d_beta)

Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0

6 votes

def test_with_xla(self):
  """Checks the `for_loop` gradient w.r.t. `beta` when compiled through XLA."""

  @tf.function
  def fn():
    start_x = tf.constant([[3.0, 4.0], [30.0, 40.0]])
    start_y = tf.constant([[7.0, 8.0], [70.0, 80.0]])
    alpha = tf.constant(2.0)
    beta = tf.constant(1.0)

    def step(i, state):
      del i
      cur_x, cur_y = state
      return [cur_x * alpha - beta, cur_y * beta + cur_x]

    with tf.GradientTape(persistent=True) as tape:
      tape.watch([alpha, beta])
      out = for_loop(step, [start_x, start_y], [alpha, beta], 3)
    return tape.gradient(out[1], beta)

  compiled = tf.xla.experimental.compile(fn)
  # The compiled call yields a sequence; the gradient is its first entry.
  beta_grad = self.evaluate(compiled)[0]
  self.assertAllEqual(783, beta_grad)

Source File: model.py From trax with Apache License 2.0

5 votes

def train(self, x, y, learning_rate=0.01):
  """Performs one gradient-descent update of the weights and biases.

  The loss is the sum of squared differences between `self.forward(x)` and
  `y`, divided by the length of the prediction.

  Args:
    x: 2-d array of size batch_size x image_size.
    y: 2-d array of size batch_size x num_classes in one-hot notation.
    learning_rate: The learning rate.
  """
  x = np.array(x, copy=False)
  y = np.array(y, copy=False)

  def squared_error_loss(predictions, targets):
    residual = predictions - targets
    return np.sum(residual * residual) / len(predictions)

  watched = [param.data for param in self.weights + self.biases]
  with tf.GradientTape() as tape:
    tape.watch(watched)
    loss = squared_error_loss(self.forward(x), y)
  raw_gradients = tape.gradient(loss.data, watched)
  gradients = [np.asarray(raw) for raw in raw_gradients]

  updated = [
      param - learning_rate * step
      for param, step in zip(self.weights + self.biases, gradients)
  ]

  # The updated list is split down the middle: first half weights, second half
  # biases (this relies on there being as many weights as biases).
  half = len(updated) // 2
  self.weights = updated[:half]
  self.biases = updated[half:]

Source File: deep_factorized_test.py From compression with Apache License 2.0

5 votes

def test_variables_receive_gradients(self):
  """Checks that all 8 trainable variables of `DeepFactorized` get a gradient."""
  model = deep_factorized.DeepFactorized()
  with tf.GradientTape() as tape:
    samples = tf.random.normal([20])
    mean_log_prob = tf.reduce_mean(model.log_prob(samples))
    negative_log_likelihood = -mean_log_prob
  gradients = tape.gradient(negative_log_likelihood, model.trainable_variables)
  self.assertLen(gradients, 8)
  self.assertNotIn(None, gradients)

Source File: uniform_noise_test.py From compression with Apache License 2.0

5 votes

def test_variables_receive_gradients(self):
  """Checks that loc, log-scale and weight-logit variables all get gradients."""
  loc = tf.Variable(tf.ones([2], dtype=tf.float32))
  log_scale = tf.Variable(tf.zeros([2], dtype=tf.float32))
  logit_weight = tf.Variable(tf.constant([.3, .7], dtype=tf.float32))
  variables = [loc, log_scale, logit_weight]
  with tf.GradientTape() as tape:
    scale = tf.exp(log_scale)
    weight = tf.nn.softmax(logit_weight)
    dist = self.dist_cls(loc=loc, scale=scale, weight=weight)
    samples = tf.random.normal([20])
    negative_log_likelihood = -tf.reduce_mean(dist.log_prob(samples))
  gradients = tape.gradient(negative_log_likelihood, variables)
  self.assertLen(gradients, 3)
  self.assertNotIn(None, gradients)

Source File: uniform_noise_test.py From compression with Apache License 2.0

5 votes

def test_variables_receive_gradients(self):
  """Checks that scalar loc and log-scale variables both get gradients."""
  loc = tf.Variable(1., dtype=tf.float32)
  log_scale = tf.Variable(0., dtype=tf.float32)
  variables = [loc, log_scale]
  with tf.GradientTape() as tape:
    scale = tf.exp(log_scale)
    dist = self.dist_cls(loc=loc, scale=scale)
    samples = tf.random.normal([20])
    negative_log_likelihood = -tf.reduce_mean(dist.log_prob(samples))
  gradients = tape.gradient(negative_log_likelihood, variables)
  self.assertLen(gradients, 2)
  self.assertNotIn(None, gradients)

Source File: gdn_test.py From compression with Apache License 2.0

5 votes

def test_variables_receive_gradients(self):
  """Checks that both trainable variables of a GDN layer get a gradient."""
  inputs = tf.random.uniform((1, 2), dtype=tf.float32)
  gdn_layer = gdn.GDN(inverse=False, rectify=True)
  with tf.GradientTape() as tape:
    outputs = gdn_layer(inputs)
  gradients = tape.gradient(outputs, gdn_layer.trainable_variables)
  self.assertLen(gradients, 2)
  self.assertNotIn(None, gradients)

Source File: nql_test.py From language with Apache License 2.0

5 votes

def test_gradients(self):
  """Checks two squared-error losses and their gradients.

  Both losses compare the same weighted neighbourhood expression against a
  target expression; the gradients are taken with respect to the underlying
  parameter of 'trained_distance_to'.
  """

  def place(a, b):
    # Shorthand for the one-hot 'place_t' expression of a grid cell.
    return self.context.one(cell(a, b), 'place_t')

  with tf.GradientTape(persistent=True) as g:
    x = place(2, 2)
    near_x = x.follow('n') + x.follow('s') + x.follow('e') + x.follow('w')
    lr_near_x = near_x.weighted_by('trained_distance_to', 'ul')
    g.watch(self.context.get_underlying_parameter('trained_distance_to'))
    expected_y = (
        place(1, 2) * 3 + place(2, 1) * 3 + place(3, 2) * 5 + place(2, 3) * 5)
    almost_y = (
        place(1, 2) * 2 + place(2, 1) * 3 + place(3, 2) * 4 + place(2, 3) * 5)
    # Only the losses are built while the tape is recording.
    loss_1 = tf.reduce_sum(
        input_tensor=tf.multiply(lr_near_x.tf - expected_y.tf,
                                 lr_near_x.tf - expected_y.tf))
    loss_2 = tf.reduce_sum(
        input_tensor=tf.multiply(lr_near_x.tf - almost_y.tf,
                                 lr_near_x.tf - almost_y.tf))

  # Gradients are taken after leaving the tape context: calling `gradient` on
  # a persistent tape inside its context records the gradient ops on the tape
  # as well, which costs extra CPU and memory and is only useful for
  # higher-order derivatives.
  grad_1 = g.gradient(
      target=loss_1,
      sources=self.context.get_underlying_parameter('trained_distance_to'))
  grad_2 = g.gradient(
      target=loss_2,
      sources=self.context.get_underlying_parameter('trained_distance_to'))
  # A persistent tape keeps its resources until the reference is dropped.
  del g

  self.assertEqual(loss_1.numpy(), 0.0)
  self.assertEqual(loss_2.numpy(), 2.0)
  sum_grad_1 = tf.reduce_sum(input_tensor=grad_1)
  sum_grad_2 = tf.reduce_sum(input_tensor=grad_2)
  self.assertEqual(sum_grad_1.numpy(), 0.0)
  self.assertEqual(sum_grad_2.numpy(), 4.0)

Source File: cms_swap.py From tf-quant-finance with Apache License 2.0

5 votes

def _f_atm_second_derivative(self, s, cms_rates):
  """Returns the second derivative of `_f_atm` with respect to `s`."""
  with tf.GradientTape() as outer_tape:
    outer_tape.watch(s)
    # The first derivative is taken while the outer tape is still recording
    # so that it can be differentiated once more.
    with tf.GradientTape() as inner_tape:
      inner_tape.watch(s)
      value = self._f_atm(s, cms_rates)
    first_derivative = tf.squeeze(inner_tape.gradient(value, s))
  second_derivative = outer_tape.gradient(first_derivative, s)
  return tf.squeeze(second_derivative)

Source File: cms_swap.py From tf-quant-finance with Apache License 2.0

5 votes

def _f_atm_first_derivative(self, s, cms_rates):
  """Returns the first derivative of `_f_atm` with respect to `s`."""
  with tf.GradientTape() as tape:
    tape.watch(s)
    value = self._f_atm(s, cms_rates)
  derivative = tape.gradient(value, s)
  return tf.squeeze(derivative)

Source File: linear_interpolation_test.py From tf-quant-finance with Apache License 2.0

5 votes

def test_valid_gradients(self, optimize_for_tpu):
  """Checks that interpolation gradients w.r.t. `y_data` contain no nan."""

  # The first two entries of each row of `x` lie at or below the first knot
  # in `x_data`, and the last two lie at or above the last knot. This
  # exercises every tf.where branch of the implementation: a nan in a branch
  # that is not selected could still leak into the gradient and turn the
  # final result into nan.
  x = [[-10.0, -1.0, 1.0, 3.0, 6.0, 7.0], [8.0, 15.0, 18.0, 25.0, 30.0, 35.0]]
  x_data = [[-1.0, 2.0, 6.0], [8.0, 18.0, 30.0]]
  y_data = tf.convert_to_tensor([[10.0, -1.0, -5.0], [7.0, 9.0, 20.0]],
                                dtype=tf.float64)

  def _sum_of_squared_interpolations(knot_values):
    """Interpolates at `x` and returns the sum of the squared results."""
    interpolated = tff.math.interpolation.linear.interpolate(
        x, x_data, knot_values,
        optimize_for_tpu=optimize_for_tpu,
        dtype=tf.float64)
    return tf.reduce_sum(tf.math.square(interpolated))

  if not tf.executing_eagerly():
    value = _sum_of_squared_interpolations(y_data)
    gradients = tf.gradients(value, y_data)[0]
  else:
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(y_data)
      value = _sum_of_squared_interpolations(y_data)
      gradients = tape.gradient(value, y_data)

  gradients = tf.convert_to_tensor(gradients)
  has_nan = tf.reduce_any(tf.math.is_nan(gradients))
  self.assertFalse(self.evaluate(has_nan))

Source File: custom_loops.py From tf-quant-finance with Apache License 2.0

5 votes

def _jacobian_wrt_parameter(y, param, tape):
  """Returns the Jacobian of `y` with respect to `param`.

  For input shapes (b, dy) the result has shape (b, dy, 1); the trailing 1 is
  added for convenience elsewhere. If `param` is not connected to `y`, zeros
  of that shape are returned.
  """
  # A forward gradient is used so that `param` does not have to be broadcast
  # to the shape of `y`: differentiate the vector-Jacobian product with
  # respect to its (dummy) cotangent `cotangent`.
  with tf.GradientTape() as cotangent_tape:
    cotangent = tf.zeros_like(y)
    cotangent_tape.watch(cotangent)
    vjp = tape.gradient(y, param, output_gradients=cotangent)
  if vjp is None:  # Unconnected.
    jacobian = tf.zeros_like(y)
  else:
    jacobian = cotangent_tape.gradient(vjp, cotangent)
  return tf.expand_dims(jacobian, axis=-1)

Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0

5 votes

def test_shapes(self, state_dims, num_params, times):
  """Checks gradient shapes of `for_loop` for several batch shapes."""

  def check_batch_shape(batch_shape):
    initial_state = [tf.ones(shape=batch_shape + (dim,)) for dim in state_dims]
    params = [tf.constant(1.0) for _ in range(num_params)]

    def step(i, state):
      del i
      if not params:
        return state
      total = tf.add_n(params)
      return [component * total for component in state]

    # Persistent, since many gradients are taken from the same tape below.
    with tf.GradientTape(persistent=True) as tape:
      tape.watch(initial_state)
      tape.watch(params)
      final_state = for_loop(step, initial_state, params, times)

    # Gradient w.r.t. an initial state component has that component's shape.
    for start in initial_state:
      for end in final_state:
        state_grad = tape.gradient(end, start)
        self.assertAllEqual(start.shape, state_grad.shape)

    # Gradient w.r.t. a scalar parameter is a scalar.
    for param in params:
      for end in final_state:
        param_grad = tape.gradient(end, param)
        self.assertAllEqual([], param_grad.shape)

  cases = (
      ("no_batch", ()),
      ("simple_batch", (5,)),
      ("complex_batch", (2, 8, 3)),
  )
  for label, batch_shape in cases:
    with self.subTest(label):
      check_batch_shape(batch_shape=batch_shape)

Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0

5 votes

def test_simple_grad_wrt_initial_state(self):
  """Three multiplications by sigma=2 give d(out)/d(x) = 2**3 = 8."""
  start = tf.constant([3.0])
  sigma = tf.constant(2.0)

  def step(i, state):
    del i
    return [state[0] * sigma]

  with tf.GradientTape() as tape:
    tape.watch(start)
    final = for_loop(step, [start], [sigma], 3)
    out = final[0]

  start_grad = tape.gradient(out, start)
  self.assertAllEqual([8], start_grad)

Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0

5 votes

def test_simple_grad_wrt_parameter(self):
  """With out = x * sigma**3, d(out)/d(sigma) = 3 * 3 * 2**2 = 36."""
  start = tf.constant([3.0])
  sigma = tf.constant(2.0)

  def step(i, state):
    del i
    return [state[0] * sigma]

  with tf.GradientTape() as tape:
    tape.watch(sigma)
    final = for_loop(step, [start], [sigma], 3)
    out = final[0]

  sigma_grad = tape.gradient(out, sigma)
  self.assertAllEqual(36, sigma_grad)

Source File: matrix_vector_product.py From spectral-density with Apache License 2.0

5 votes

def _hessian_vector_product(
    function: Callable[[Parameters], tf.Tensor],
    parameters: Parameters,
    v: Parameters) -> Parameters:
  """Returns H.v without materializing the Hessian H.

  H is the Hessian of `function` evaluated at `parameters` and `v` is an
  arbitrary vector. The value equals what an explicit Hessian multiplied by
  `v` would give, but it is obtained by taking the gradient in backward mode
  and then pushing `v` through that gradient in forward mode.

  Args:
    function: A (twice) differentiable function that takes as input a tensor or
      a list of tensors and returns a scalar.
    parameters: The parameters with respect to which the Hessian of the
      Hessian-vector product is taken.
    v: An arbitrary vector or list of vectors of the same nested structure as
      `parameters`.

  Returns:
    A vector or list of vectors of the same nested structure as
      `parameters`, equal to H.v.
  """
  with tf.autodiff.ForwardAccumulator(
      primals=parameters, tangents=v) as accumulator:
    # Backward pass: gradient of the scalar output.
    with tf.GradientTape() as grad_tape:
      grad_tape.watch(parameters)
      output = function(parameters)
    gradient = grad_tape.gradient(output, parameters)
  # Forward pass: directional derivative of the gradient along `v`.
  product = accumulator.jvp(gradient)
  return product

Source File: backprop_test.py From trax with Apache License 2.0

5 votes

def test_setitem(self):
  """Checks gradients through item assignment on arrays."""
  # Case 1: a single integer index.
  target = array_ops.array([1., 2., 3.])
  addend_1 = array_ops.array(5.)
  addend_2 = array_ops.array(10.)

  watched = [arr.data for arr in [target, addend_1, addend_2]]
  with tf.GradientTape() as tape:
    tape.watch(watched)
    target[1] = addend_1 + addend_2
    total = array_ops.sum(target)

  grads = tape.gradient(total.data, watched)
  # The overwritten element no longer contributes to the sum.
  self.assertSequenceEqual(
      array_ops.array(grads[0]).tolist(), [1., 0., 1.])
  self.assertEqual(array_ops.array(grads[1]).tolist(), 1.)
  self.assertEqual(array_ops.array(grads[2]).tolist(), 1.)

  # Case 2: a tuple index into a 2x2x2 array.
  target = array_ops.array([[[1., 2.], [3., 4.]],
                            [[5., 6.], [7., 8.]]])
  row = array_ops.array([10., 11.])

  watched = [arr.data for arr in [target, row]]
  with tf.GradientTape() as tape:
    tape.watch(watched)
    target[(1, 0)] = row
    total = array_ops.sum(target)

  grads = tape.gradient(total.data, watched)
  self.assertSequenceEqual(
      array_ops.array(grads[0]).tolist(),
      [[[1., 1.], [1., 1.]], [[0., 0.], [1., 1.]]])
  self.assertEqual(array_ops.array(grads[1]).tolist(), [1., 1.])

Source File: extensions_test.py From trax with Apache License 2.0

5 votes

def testVjp(self, has_aux):
  """Compares `extensions.vjp` with `tf.GradientTape` for two cotangents."""
  x_shape = (tf.TensorShape([10]), tf.TensorShape([1, 10]))
  y_shape = (tf.TensorShape([]))
  dtype = np.float32

  def f(a, b):
    y = tf_np.sum(tf_np.sqrt(tf_np.exp(a)) + b)
    if not has_aux:
      return y
    return y, tf_np.asarray(1)

  rng = tf.random.Generator.from_seed(1234)

  def sample(shape):
    return uniform(rng, shape, dtype)

  # Draw the inputs and two output cotangents in one structure-mapped call.
  x, dy_list = tf.nest.map_structure(sample, [x_shape, [y_shape] * 2])
  tf_x = to_tf(x)

  vjp_result = extensions.vjp(f, *x, has_aux=has_aux)
  if has_aux:
    y, vjp, aux = vjp_result
  else:
    y, vjp = vjp_result

  # Persistent, since one gradient is taken per cotangent below.
  with tf.GradientTape(persistent=True) as tape:
    tape.watch(tf_x)
    reference = f(*x)
    if has_aux:
      expected_y, expected_aux = reference
      self.assertAllClose(to_tf(expected_aux), to_tf(aux))
    else:
      expected_y = reference

  self.assertAllClose(to_tf(expected_y), to_tf(y))
  for dy in dy_list:
    expected_dx = tape.gradient(
        to_tf(expected_y), tf_x, output_gradients=to_tf(dy))
    self.assertAllClose(expected_dx, to_tf(vjp(dy)))

Source File: model_tf2.py From machine-learning-for-programming-samples with MIT License

5 votes

def run_one_epoch(
    self, minibatches: Iterable[np.ndarray], training: bool = False,
):
    """Run the model once over `minibatches` and report loss and accuracy.

    For every minibatch the logits and the loss are computed under a
    `tf.GradientTape`. When `training` is true, the gradients of the token
    cross-entropy loss with respect to `self.trainable_variables` are applied
    through `self.optimizer`. A progress line is printed after each batch and
    overwritten by the next one; it is cleared before returning.

    Args:
        minibatches: Iterable of minibatches; `shape[0]` of each one is
            counted as its number of samples.
        training: Whether to apply gradient updates. It is also forwarded to
            `self.compute_logits`.

    Returns:
        A pair `(loss per sample, token accuracy)` accumulated over the whole
        epoch. If `minibatches` is empty the result is `(0.0, 0.0)` rather
        than a `ZeroDivisionError`.
    """
    total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0
    for step, minibatch_data in enumerate(minibatches):
        with tf.GradientTape() as tape:
            model_outputs = self.compute_logits(minibatch_data, training=training)
            result = self.compute_loss_and_acc(model_outputs, minibatch_data)

        total_loss += result.token_ce_loss
        num_samples += minibatch_data.shape[0]
        num_tokens += result.num_predictions
        num_correct_tokens += result.num_correct_token_predictions

        if training:
            gradients = tape.gradient(
                result.token_ce_loss, self.trainable_variables
            )
            self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        print(
            "   Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f"
            % (
                step,
                # Guard the denominator the same way as in the return value.
                total_loss / max(num_samples, 1),
                result.token_ce_loss,
                result.num_correct_token_predictions
                / (float(result.num_predictions) + 1e-7),
            ),
            end="\r",
        )
    # Return to the line start and erase the progress line.
    print("\r\x1b[K", end="")
    return (
        # `max(..., 1)` only matters when no sample was seen; otherwise the
        # quotient is unchanged.
        total_loss / max(num_samples, 1),
        num_correct_tokens / (float(num_tokens) + 1e-7),
    )

Source File: gradient.py From tf-quant-finance with Apache License 2.0

4 votes

def value_and_gradient(f,
                       xs,
                       output_gradients=None,
                       use_gradient_tape=False,
                       unconnected_gradients=None,
                       name=None):
  """Evaluates `f(*xs)` together with its gradients with respect to `*xs`.

  Args:
    f: Python `callable` to be differentiated. If `f` returns a scalar, this
      scalar will be differentiated. If `f` returns a tensor or list of tensors,
      by default a scalar will be computed by adding all their values to produce
      a single scalar. If desired, the tensors can be elementwise multiplied by
      the tensors passed as the `dy` keyword argument to the returned gradient
      function.
    xs: Python list of parameters of `f` for which to differentiate. (Can also
      be single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient implementation
      (i.e., either the `grad_ys` argument of `tf.gradients` or the
      `output_gradients` argument of `tf.GradientTape.gradient`).
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should be
      used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies the
      gradient value returned when the given input tensors are unconnected.
      Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., `'value_and_gradient'`).

  Returns:
    A tuple `(y, dydx)`:
    y: `y = f(*xs)`, the value of the function at `xs`.
    dydx: Gradient of `y` wrt each of `xs`; a single `Tensor` when `xs` was
      not list-like, otherwise a list of `Tensor`s.
  """
  if not unconnected_gradients:
    unconnected_gradients = tf.UnconnectedGradients.NONE
  args, args_are_list_like = _prepare_args(xs)
  with tf.name_scope(name or "value_and_gradient"):
    record_on_tape = tf.executing_eagerly() or use_gradient_tape
    if not record_on_tape:
      # Graph mode: differentiate symbolically.
      y = f(*args)
      grad = tf.gradients(ys=y, xs=args, grad_ys=output_gradients,
                          unconnected_gradients=unconnected_gradients)
    else:
      with tf.GradientTape() as tape:
        for arg in args:
          tape.watch(arg)
        y = f(*args)
      grad = tape.gradient(y, args, output_gradients=output_gradients,
                           unconnected_gradients=unconnected_gradients)
    # Unwrap the single gradient when the caller passed a bare tensor.
    return (y, grad) if args_are_list_like else (y, grad[0])

Source File: gradient.py From tf-quant-finance with Apache License 2.0

4 votes

def gradients(func_or_y, xs, output_gradients=None, use_gradient_tape=False,
              unconnected_gradients=None,
              name=None):
  """Returns the gradients of `func_or_y` with respect to `*xs`.

  Args:
    func_or_y: Either a `Tensor` connected to the input `x` or a Python callable
      accepting one `Tensor` of shape of `x` and returning a `Tensor` of any
      shape. The function whose gradient is to be computed. If eagerly
      executing, can only be a callable, i.e., one should not supply a Tensor
      in eager mode.
    xs: Python list of parameters of `f` for which to differentiate. (Can also
      be single `Tensor`.)
    output_gradients: A `Tensor` or list of `Tensor`s the same size as the
      result `ys = f(*xs)` and holding the gradients computed for each `y` in
      `ys`. This argument is forwarded to the underlying gradient implementation
      (i.e., either the `grad_ys` argument of `tf.gradients` or the
      `output_gradients` argument of `tf.GradientTape.gradient`).
      Default value: `None` which maps to a ones-like `Tensor` of `ys`.
    use_gradient_tape: Python `bool` indicating that `tf.GradientTape` should be
      used regardless of `tf.executing_eagerly()` status.
      Default value: `False`.
    unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies the
      gradient value returned when the given input tensors are unconnected.
      Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'gradients').

  Returns:
    A `Tensor` with the gradient of `y` wrt each of `xs` or a list of `Tensor`s
    if `xs` is a list.

  Raises:
    ValueError: If `func_or_y` is not callable while executing eagerly or
      while `use_gradient_tape` is set.
  """
  if not unconnected_gradients:
    unconnected_gradients = tf.UnconnectedGradients.NONE
  f = _prepare_func(func_or_y)
  with tf.name_scope(name or "gradients"):
    xs, is_xs_list_like = _prepare_args(xs)
    if tf.executing_eagerly() or use_gradient_tape:
      # A tape can only differentiate something it has watched being
      # computed, so a ready-made tensor is rejected here.
      if not callable(func_or_y):
        raise ValueError("`func_or_y` should be a callable in eager mode or "
                         "when `tf.GradientTape` is used.")
      with tf.GradientTape() as tape:
        for arg in xs:
          tape.watch(arg)
        y = f(*xs)
      grad = tape.gradient(y, xs, output_gradients=output_gradients,
                           unconnected_gradients=unconnected_gradients)
    else:
      y = f(*xs)
      grad = tf.gradients(y, xs, grad_ys=output_gradients,
                          unconnected_gradients=unconnected_gradients)
    return grad if is_xs_list_like else grad[0]

Source File: extensions.py From trax with Apache License 2.0

4 votes

def grad(f, has_aux=False):
  """Builds a function that evaluates the gradient of `f`.

  Gradients can only flow through numpy and tensorflow operations, not through
  python float operations and values.

  Args:
    f: a function of type (params, *args) -> scalar. 'params' can be a nested
      structure (made of lists and tuples) of ndarrays and the gradient is
      evaluated against it. `scalar` is a scalar ndarray.
    has_aux: bool, indicates whether `f` returns a pair where the first element
      is considered the output of the mathematical function to be differentiated
      and the second element is auxiliary data.

  Returns:
    A gradient function of type (params, *args) -> gradients, where the result
    'gradients' has the same structure and shapes as 'params'. When `has_aux`
    is set, the function returns the pair (gradients, aux) instead.
  """

  def _validate_loss(candidate):
    # The differentiated value must be a scalar ndarray.
    is_ndarray = isinstance(candidate, tf_np.ndarray)
    if not is_ndarray:
      raise ValueError(
          "The result of the function to take gradient must be an ndarray.")
    is_scalar = candidate.data.shape.is_compatible_with([])
    if not is_scalar:
      raise ValueError(
          "The result of the function to take gradient must be a scalar.")

  def _f(params, *args):
    """The gradient function to be returned."""
    tf_params = _np_to_tf(params)
    with tf.GradientTape() as tape:
      tape.watch(tf.nest.flatten(tf_params))
      outputs = f(params, *args)
      np_loss, aux = outputs if has_aux else (outputs, None)
      _validate_loss(np_loss)
      tf_grads = tape.gradient(np_loss.data, tf_params)
      result = (tf_grads, aux) if has_aux else tf_grads
      return _tf_to_np(result)

  return _f


# A workaround for b/121383831

Source File: extensions.py From trax with Apache License 2.0

4 votes

def vjp(f, *primals, has_aux=False):
  """Evaluates `f` and returns its vector-Jacobian-product (VJP) function.

  Args:
    f: a function from (nested structures of) tf_np.ndarrays to a (nested
      structure of) tf_np.ndarray. If `has_aux` is True, it should return an
      extra output.
    *primals: the inputs to be fed to `f`.
    has_aux: if True, the second output of `f` will be regarded as an auxiliary,
      non-differentiable output that will be ignored by the VJP function.

  Returns:
    A pair `(y, vjpfun)` if `has_aux` is False; a tuple `(y, vjpfun, aux)`
    otherwise. `y` and `aux` are the outputs of `f`, i.e. `y, aux =
    f(*primals)`. `vjpfun` is a function `dx = vjpfun(dy)`, where `dy` is the
    cotangents of `y`, having the same structures, shapes and dtypes as
    `y`. `dx` is the cotangents of `x`, having the same structures, shapes and
    dtypes as `x`.
  """
  tf_primals = _np_to_tf(primals)
  # Persistent so that the returned VJP function can be called repeatedly.
  with tf.GradientTape(persistent=True) as tape:
    tape.watch(tf.nest.flatten(tf_primals))
    outputs = f(*primals)
    np_out, aux = outputs if has_aux else (outputs, None)
    tf_out = _np_to_tf(np_out)

  def _vjp(dy):
    cotangent = _np_to_tf(dy)
    tf_dx = tape.gradient(tf_out, tf_primals, output_gradients=cotangent)
    return _tf_to_np(tf_dx)

  return (np_out, _vjp, aux) if has_aux else (np_out, _vjp)


# TODO(wangpeng): match JAX's handling of kwargs and non-ndarray args

Source File: helpers.py From compression with Apache License 2.0

4 votes

def estimate_tails(func, target, shape, dtype):
  """Approximates tail quantiles with a simple Adam iteration.

  The goal is an `x` for which:
  ```
  func(x) == target
  ```
  For example, with `func` a CDF and `target` a quantile value, the result is
  the approximate location of that quantile. `func` is assumed to be
  monotonic. Once every tail estimate has passed the optimal value of `x`, 10
  further iterations are run and the loop stops.

  The computation is vectorized. `shape` gives the tensor shape of `x`, and
  `target` must be broadcastable to the output of `func(x)`.

  Arguments:
    func: A callable that computes cumulative distribution function, survival
      function, or similar.
    target: The desired target value.
    shape: The shape of the `tf.Tensor` representing `x`.
    dtype: The `tf.dtypes.Dtype` of the computation (and the return value).

  Returns:
    A `tf.Tensor` representing the solution (`x`).
  """
  with tf.name_scope("estimate_tails"):
    dtype = tf.as_dtype(dtype)
    shape = tf.convert_to_tensor(shape, tf.int32)
    target = tf.convert_to_tensor(target, dtype)

    def keep_going(position, mean, variance, steps_past):
      del position, mean, variance  # unused
      # Continue until every element has counted 10 steps.
      return tf.reduce_min(steps_past) < 10

    def adam_step(position, mean, variance, steps_past):
      with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(position)
        loss = abs(func(position) - target)
      grad = tape.gradient(loss, position)
      mean = .5 * mean + .5 * grad  # Adam mean estimate.
      variance = .9 * variance + .1 * tf.square(grad)  # Adam variance estimate.
      update = .5 * mean / (tf.sqrt(variance) + 1e-7)
      position = position - update
      # Counting begins once the gradient flips sign (this relies on the
      # position being initialized to zero) and never stops afterwards.
      counting = tf.math.logical_or(steps_past > 0, position * grad > 0)
      steps_past = tf.where(counting, steps_past + 1, steps_past)
      return position, mean, variance, steps_past

    initial_state = (
        tf.zeros(shape, dtype=dtype),  # position
        tf.zeros(shape, dtype=dtype),  # mean estimate
        tf.ones(shape, dtype=dtype),  # variance estimate
        tf.zeros(shape, dtype=tf.int32),  # step counter
    )
    final_state = tf.while_loop(
        keep_going, adam_step, initial_state, back_prop=False)
    return final_state[0]

Source File: grad_utils.py From models with Apache License 2.0

4 votes

def minimize_using_explicit_allreduce(tape,
                                      optimizer,
                                      loss,
                                      trainable_variables,
                                      pre_allreduce_callbacks=None,
                                      post_allreduce_callbacks=None):
  """Minimizes loss for one step by updating `trainable_variables`.

  The gradient allreduce is performed explicitly here instead of relying on
  the implicit allreduce in optimizer.apply_gradients(). If training using
  FP16 mixed precision, explicit allreduce will aggregate gradients in FP16
  format. For TPU and GPU training using FP32, explicit allreduce will
  aggregate gradients in FP32 format.

  Arguments:
      tape: An instance of `tf.GradientTape`.
      optimizer: An instance of `tf.keras.optimizers.Optimizer`.
      loss: the loss tensor.
      trainable_variables: A list of model Variables.
      pre_allreduce_callbacks: A list of callback functions that takes gradients
        and model variables pairs as input, manipulate them, and returns a new
        gradients and model variables pairs. The callback functions will be
        invoked in the list order and before gradients are allreduced.
        With mixed precision training, the pre_allreduce_callbacks will be
        applied on scaled_gradients. Default is no callbacks.
      post_allreduce_callbacks: A list of callback functions that takes
        gradients and model variables pairs as input, manipulate them, and
        returns a new gradients and model variables pairs. The callback
        functions will be invoked in the list order and right before gradients
        are applied to variables for updates. Default is no callbacks.
  """
  uses_loss_scaling = isinstance(
      optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer)
  if uses_loss_scaling:
    # FP16 GPU code path: differentiate the scaled loss, which has to be
    # computed while the tape is recording.
    with tape:
      target = optimizer.get_scaled_loss(loss)
    precision = "float16"
  else:
    # TPU or FP32 GPU code path
    target = loss
    precision = "float32"

  grads = tape.gradient(target, trainable_variables)
  grads_and_vars = zip(grads, trainable_variables)
  if pre_allreduce_callbacks:
    grads_and_vars = _run_callbacks(pre_allreduce_callbacks, grads_and_vars)
  allreduced_grads, filtered_training_vars = _filter_and_allreduce_gradients(
      grads_and_vars, allreduce_precision=precision)
  if uses_loss_scaling:
    # Undo the loss scaling only after the gradients have been aggregated.
    allreduced_grads = optimizer.get_unscaled_gradients(allreduced_grads)
  grads_and_vars = zip(allreduced_grads, filtered_training_vars)

  if post_allreduce_callbacks:
    grads_and_vars = _run_callbacks(post_allreduce_callbacks, grads_and_vars)
  optimizer.apply_gradients(
      grads_and_vars, experimental_aggregate_gradients=False)

Python tensorflow.compat.v2.GradientTape() Examples