Python tensorflow.compat.v2.GradientTape() Examples
The following are 29 code examples of tensorflow.compat.v2.GradientTape().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module tensorflow.compat.v2, or try the search function.
Example #1
Source File: extensions_test.py From trax with Apache License 2.0 | 7 votes |
def testGrad(self):
    """Checks `extensions.grad` against a reference `tf.GradientTape` gradient."""

    def f(a, b):
        return tf_np.sum(tf_np.sqrt(tf_np.exp(a)) + b)

    grad_fn = extensions.grad(f)

    def compare(a, b):
        # Reference gradient of f with respect to its first argument.
        with tf.GradientTape() as reference_tape:
            reference_tape.watch(a.data)
            result = f(a, b)
        expected = reference_tape.gradient(result.data, a.data)
        self.assertAllEqual(expected, grad_fn(a, b))

    dims = [10]
    lhs = tf_np.random.randn(*dims)
    rhs = tf_np.random.randn(*dims)
    compare(lhs, rhs)
Example #2
Source File: export.py From hub with Apache License 2.0 | 6 votes |
def train_step(model, loss_fn, optimizer_fn, metric, image, label,
               num_classes=10):
    """Perform one training step for the model.

    Args:
      model: Keras model to train.
      loss_fn: Loss function to use; called as `loss_fn(one_hot_labels, preds)`.
      optimizer_fn: Optimizer whose `apply_gradients` is called with the
        (gradient, variable) pairs.
      metric: keras.metric to use; it is called with this step's loss value.
      image: Tensor of training images of shape [batch_size, 28, 28, 1].
      label: Tensor of class labels of shape [batch_size].
      num_classes: Depth of the one-hot label encoding. Defaults to 10, the
        value that was previously hard-coded.
    """
    with tf.GradientTape() as tape:
        preds = model(image)
        label_onehot = tf.one_hot(label, num_classes)
        loss_ = loss_fn(label_onehot, preds)
    # Read the variable list once so gradients and updates are paired
    # against the very same sequence.
    variables = model.trainable_variables
    grads = tape.gradient(loss_, variables)
    optimizer_fn.apply_gradients(zip(grads, variables))
    metric(loss_)
Example #3
Source File: test_util.py From spectral-density with Apache License 2.0 | 6 votes |
def hessian(function: Callable[[Parameters], tf.Tensor],
            parameters: Parameters) -> Parameters:
    """Computes the Hessian of `function` evaluated at `parameters`.

    Useful for testing, although it scales very poorly.

    Args:
      function: The function whose Hessian is wanted.
      parameters: Parameters with respect to which the Hessian is computed.

    Returns:
      A tensor or list of tensors of same nested structure as `Parameters`,
      representing the Hessian.
    """
    with tf.GradientTape() as jacobian_tape:
        with tf.GradientTape() as gradient_tape:
            output = function(parameters)
        # The first-order gradient is taken while the outer tape is still
        # recording, so the outer tape can differentiate it again.
        first_order = gradient_tape.gradient(output, parameters)
        flat_gradient = tensor_list_util.tensor_list_to_vector(first_order)
    return jacobian_tape.jacobian(flat_gradient, parameters)
Example #4
Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0 | 6 votes |
def test_multiple_state_vars(self):
    """Checks `for_loop` gradients w.r.t. parameters with three state variables."""
    x = tf.constant([3.0, 4.0])
    y = tf.constant([5.0, 6.0])
    z = tf.constant([7.0, 8.0])
    alpha = tf.constant(2.0)
    beta = tf.constant(1.0)

    def body(i, state):
        # Local names deliberately shadow the outer initial-state tensors.
        x, y, z = state
        k = tf.cast(i + 1, tf.float32)
        return [x * alpha - beta, y * k * alpha * beta, z * beta + x]

    with tf.GradientTape(persistent=True) as tape:
        tape.watch([alpha, beta])
        out = for_loop(body, [x, y, z], [alpha, beta], 3)

    with self.subTest("independent_vars"):
        alpha_grad = tape.gradient(out[1], alpha)
        self.assertAllEqual(792, alpha_grad)

    with self.subTest("dependent_vars"):
        beta_grad = tape.gradient(out[2], beta)
        self.assertAllEqual(63, beta_grad)
Example #5
Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0 | 6 votes |
def test_batching(self):
    """Checks `for_loop` parameter gradients when the state carries a batch axis."""
    x = tf.constant([[3.0, 4.0], [30.0, 40.0]])
    y = tf.constant([[5.0, 6.0], [50.0, 60.0]])
    z = tf.constant([[7.0, 8.0], [70.0, 80.0]])
    alpha = tf.constant(2.0)
    beta = tf.constant(1.0)

    def body(i, state):
        # Local names deliberately shadow the outer initial-state tensors.
        x, y, z = state
        k = tf.cast(i + 1, tf.float32)
        return [x * alpha - beta, y * k * alpha * beta, z * beta + x]

    with tf.GradientTape(persistent=True) as tape:
        tape.watch([alpha, beta])
        out = for_loop(body, [x, y, z], [alpha, beta], 3)

    with self.subTest("independent_vars"):
        alpha_grad = tape.gradient(out[1], alpha)
        self.assertAllEqual(8712, alpha_grad)

    with self.subTest("dependent_vars"):
        beta_grad = tape.gradient(out[2], beta)
        self.assertAllEqual(783, beta_grad)
Example #6
Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0 | 6 votes |
def test_with_xla(self):
    """Checks the `for_loop` gradient when the computation is compiled with XLA."""

    @tf.function
    def fn():
        x = tf.constant([[3.0, 4.0], [30.0, 40.0]])
        y = tf.constant([[7.0, 8.0], [70.0, 80.0]])
        alpha = tf.constant(2.0)
        beta = tf.constant(1.0)

        with tf.GradientTape(persistent=True) as tape:
            tape.watch([alpha, beta])

            def body(i, state):
                del i  # The step index is not used by this loop body.
                x, y = state
                return [x * alpha - beta, y * beta + x]

            out = for_loop(body, [x, y], [alpha, beta], 3)
        return tape.gradient(out[1], beta)

    compiled_outputs = self.evaluate(tf.xla.experimental.compile(fn))
    self.assertAllEqual(783, compiled_outputs[0])
Example #7
Source File: model.py From trax with Apache License 2.0 | 5 votes |
def train(self, x, y, learning_rate=0.01):
    """Runs a single training pass.

    Args:
      x: 2-d array of size batch_size x image_size.
      y: 2-d array of size batch_size x num_classes in one-hot notation.
      learning_rate: The learning rate.
    """
    x = np.array(x, copy=False)
    y = np.array(y, copy=False)

    def mean_squared_error(x, y):
        diff = x - y
        return np.sum(diff * diff) / len(x)

    # Raw tensors behind every weight and bias, in weights-then-biases order.
    wb_tensors = [param.data for param in self.weights + self.biases]
    with tf.GradientTape() as tape:
        tape.watch(wb_tensors)
        loss = mean_squared_error(self.forward(x), y)
    raw_gradients = tape.gradient(loss.data, wb_tensors)
    gradients = [np.asarray(raw) for raw in raw_gradients]

    # Plain gradient-descent step applied to each parameter.
    updated = [
        value - learning_rate * delta
        for value, delta in zip(self.weights + self.biases, gradients)
    ]

    # The first half of the updated list is written back as weights, the second half as biases.
    half = len(updated) // 2
    self.weights = updated[:half]
    self.biases = updated[half:]
Example #8
Source File: deep_factorized_test.py From compression with Apache License 2.0 | 5 votes |
def test_variables_receive_gradients(self):
    """Checks that every trainable variable of DeepFactorized gets a gradient."""
    model = deep_factorized.DeepFactorized()
    with tf.GradientTape() as tape:
        samples = tf.random.normal([20])
        negative_log_likelihood = -tf.reduce_mean(model.log_prob(samples))
    gradients = tape.gradient(negative_log_likelihood, model.trainable_variables)
    self.assertLen(gradients, 8)
    self.assertNotIn(None, gradients)
Example #9
Source File: uniform_noise_test.py From compression with Apache License 2.0 | 5 votes |
def test_variables_receive_gradients(self):
    """Checks that loc, scale and mixture-weight variables all get gradients."""
    loc = tf.Variable(tf.ones([2], dtype=tf.float32))
    log_scale = tf.Variable(tf.zeros([2], dtype=tf.float32))
    logit_weight = tf.Variable(tf.constant([.3, .7], dtype=tf.float32))
    with tf.GradientTape() as tape:
        distribution = self.dist_cls(
            loc=loc,
            scale=tf.exp(log_scale),
            weight=tf.nn.softmax(logit_weight))
        samples = tf.random.normal([20])
        negative_log_likelihood = -tf.reduce_mean(distribution.log_prob(samples))
    sources = [loc, log_scale, logit_weight]
    gradients = tape.gradient(negative_log_likelihood, sources)
    self.assertLen(gradients, 3)
    self.assertNotIn(None, gradients)
Example #10
Source File: uniform_noise_test.py From compression with Apache License 2.0 | 5 votes |
def test_variables_receive_gradients(self):
    """Checks that the scalar loc and log-scale variables both get gradients."""
    loc = tf.Variable(1., dtype=tf.float32)
    log_scale = tf.Variable(0., dtype=tf.float32)
    with tf.GradientTape() as tape:
        distribution = self.dist_cls(loc=loc, scale=tf.exp(log_scale))
        samples = tf.random.normal([20])
        negative_log_likelihood = -tf.reduce_mean(distribution.log_prob(samples))
    sources = [loc, log_scale]
    gradients = tape.gradient(negative_log_likelihood, sources)
    self.assertLen(gradients, 2)
    self.assertNotIn(None, gradients)
Example #11
Source File: gdn_test.py From compression with Apache License 2.0 | 5 votes |
def test_variables_receive_gradients(self):
    """Checks that both trainable variables of a GDN layer get gradients."""
    inputs = tf.random.uniform((1, 2), dtype=tf.float32)
    gdn_layer = gdn.GDN(inverse=False, rectify=True)
    with tf.GradientTape() as tape:
        outputs = gdn_layer(inputs)
    gradients = tape.gradient(outputs, gdn_layer.trainable_variables)
    self.assertLen(gradients, 2)
    self.assertNotIn(None, gradients)
Example #12
Source File: nql_test.py From language with Apache License 2.0 | 5 votes |
def test_gradients(self):
    """Checks two losses and their gradients w.r.t. 'trained_distance_to'."""

    def place(i, j):
        # One-hot 'place_t' expression for the grid cell (i, j).
        return self.context.one(cell(i, j), 'place_t')

    def squared_error(prediction, reference):
        difference = prediction.tf - reference.tf
        return tf.reduce_sum(input_tensor=tf.multiply(difference, difference))

    with tf.GradientTape(persistent=True) as g:
        x = place(2, 2)
        near_x = x.follow('n') + x.follow('s') + x.follow('e') + x.follow('w')
        lr_near_x = near_x.weighted_by('trained_distance_to', 'ul')
        g.watch(self.context.get_underlying_parameter('trained_distance_to'))
        expected_y = (
            place(1, 2) * 3 + place(2, 1) * 3 + place(3, 2) * 5 + place(2, 3) * 5)
        almost_y = (
            place(1, 2) * 2 + place(2, 1) * 3 + place(3, 2) * 4 + place(2, 3) * 5)
        # compute some gradients
        loss_1 = squared_error(lr_near_x, expected_y)
        loss_2 = squared_error(lr_near_x, almost_y)

    grad_1 = g.gradient(
        target=loss_1,
        sources=self.context.get_underlying_parameter('trained_distance_to'))
    grad_2 = g.gradient(
        target=loss_2,
        sources=self.context.get_underlying_parameter('trained_distance_to'))

    self.assertEqual(loss_1.numpy(), 0.0)
    self.assertEqual(loss_2.numpy(), 2.0)
    sum_grad_1 = tf.reduce_sum(input_tensor=grad_1)
    sum_grad_2 = tf.reduce_sum(input_tensor=grad_2)
    self.assertEqual(sum_grad_1.numpy(), 0.0)
    self.assertEqual(sum_grad_2.numpy(), 4.0)
Example #13
Source File: cms_swap.py From tf-quant-finance with Apache License 2.0 | 5 votes |
def _f_atm_second_derivative(self, s, cms_rates):
    """Returns the second-order derivative of `_f_atm` with respect to `s`."""
    with tf.GradientTape() as outer_tape:
        outer_tape.watch(s)
        with tf.GradientTape() as inner_tape:
            inner_tape.watch(s)
            value = self._f_atm(s, cms_rates)
        # Taken while the outer tape is recording, so it can be differentiated again.
        first_derivative = tf.squeeze(inner_tape.gradient(value, s))
    return tf.squeeze(outer_tape.gradient(first_derivative, s))
Example #14
Source File: cms_swap.py From tf-quant-finance with Apache License 2.0 | 5 votes |
def _f_atm_first_derivative(self, s, cms_rates):
    """Returns the first-order derivative of `_f_atm` with respect to `s`."""
    with tf.GradientTape() as tape:
        tape.watch(s)
        value = self._f_atm(s, cms_rates)
    return tf.squeeze(tape.gradient(value, s))
Example #15
Source File: linear_interpolation_test.py From tf-quant-finance with Apache License 2.0 | 5 votes |
def test_valid_gradients(self, optimize_for_tpu):
    """Tests that no gradient of the interpolation is nan."""
    # The first two entries of each row of `x` are less than or equal to
    # `x_data[0]` and the last two are greater than or equal to `x_data[-1]`.
    # This exercises every tf.where branch of the implementation: an
    # unselected nan could still propagate through the gradient calculation
    # and make the end result nan.
    x = [[-10.0, -1.0, 1.0, 3.0, 6.0, 7.0],
         [8.0, 15.0, 18.0, 25.0, 30.0, 35.0]]
    x_data = [[-1.0, 2.0, 6.0], [8.0, 18.0, 30.0]]

    def _value_helper_fn(y_data):
        """Returns the sum of squared interpolated values."""
        interpolated = tff.math.interpolation.linear.interpolate(
            x, x_data, y_data,
            optimize_for_tpu=optimize_for_tpu,
            dtype=tf.float64)
        return tf.reduce_sum(tf.math.square(interpolated))

    y_data = tf.convert_to_tensor(
        [[10.0, -1.0, -5.0], [7.0, 9.0, 20.0]], dtype=tf.float64)

    if not tf.executing_eagerly():
        value = _value_helper_fn(y_data=y_data)
        gradients = tf.gradients(value, y_data)[0]
    else:
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(y_data)
            value = _value_helper_fn(y_data=y_data)
        gradients = tape.gradient(value, y_data)

    gradients = tf.convert_to_tensor(gradients)
    has_nan = tf.reduce_any(tf.math.is_nan(gradients))
    self.assertFalse(self.evaluate(has_nan))
Example #16
Source File: custom_loops.py From tf-quant-finance with Apache License 2.0 | 5 votes |
def _jacobian_wrt_parameter(y, param, tape):
    """Computes a Jacobian w.r.t. a parameter."""
    # For input shapes (b, dy), yields shape (b, dy, 1) (1 is added for
    # convenience elsewhere).
    # To avoid having to broadcast param to y's shape, we need to take a
    # forward gradient.
    with tf.GradientTape() as w_tape:
        weights = tf.zeros_like(y)
        w_tape.watch(weights)
        # Computed while `w_tape` records, so it can be differentiated w.r.t. `weights`.
        vjp = tape.gradient(y, param, output_gradients=weights)

    if vjp is not None:
        jacobian = w_tape.gradient(vjp, weights)
    else:
        # Unconnected.
        jacobian = tf.zeros_like(y)
    return tf.expand_dims(jacobian, axis=-1)
Example #17
Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0 | 5 votes |
def test_shapes(self, state_dims, num_params, times):
    """Checks that the loop handles various shapes and yields correct shapes."""

    def test_with_batch_shape(batch_shape):
        initial_state = [tf.ones(shape=batch_shape + (dim,)) for dim in state_dims]
        params = [tf.constant(1.0) for _ in range(num_params)]

        def body(i, state):
            del i  # The step index is not used by this loop body.
            if not params:
                return state
            total = tf.add_n(params)
            return [component * total for component in state]

        with tf.GradientTape(persistent=True) as tape:
            tape.watch(initial_state)
            tape.watch(params)
            final_state = for_loop(body, initial_state, params, times)

        # Gradients w.r.t. the initial state keep the state's shape.
        for state_in in initial_state:
            for state_out in final_state:
                state_grad = tape.gradient(state_out, state_in)
                self.assertAllEqual(state_in.shape, state_grad.shape)

        # Gradients w.r.t. the scalar parameters are scalars.
        for param in params:
            for state_out in final_state:
                param_grad = tape.gradient(state_out, param)
                self.assertAllEqual([], param_grad.shape)

    cases = (
        ("no_batch", ()),
        ("simple_batch", (5,)),
        ("complex_batch", (2, 8, 3)),
    )
    for label, batch_shape in cases:
        with self.subTest(label):
            test_with_batch_shape(batch_shape=batch_shape)
Example #18
Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0 | 5 votes |
def test_simple_grad_wrt_initial_state(self):
    """Checks the `for_loop` gradient with respect to the initial state."""
    x = tf.constant([3.0])
    sigma = tf.constant(2.0)

    def body(i, state):
        del i  # The step index is not used by this loop body.
        current = state[0]
        return [current * sigma]

    with tf.GradientTape() as tape:
        tape.watch(x)
        out = for_loop(body, [x], [sigma], 3)[0]

    state_grad = tape.gradient(out, x)
    self.assertAllEqual([8], state_grad)
Example #19
Source File: custom_loops_test.py From tf-quant-finance with Apache License 2.0 | 5 votes |
def test_simple_grad_wrt_parameter(self):
    """Checks the `for_loop` gradient with respect to a parameter."""
    x = tf.constant([3.0])
    sigma = tf.constant(2.0)

    def body(i, state):
        del i  # The step index is not used by this loop body.
        current = state[0]
        return [current * sigma]

    with tf.GradientTape() as tape:
        tape.watch(sigma)
        out = for_loop(body, [x], [sigma], 3)[0]

    param_grad = tape.gradient(out, sigma)
    self.assertAllEqual(36, param_grad)
Example #20
Source File: matrix_vector_product.py From spectral-density with Apache License 2.0 | 5 votes |
def _hessian_vector_product(
    function: Callable[[Parameters], tf.Tensor],
    parameters: Parameters,
    v: Parameters) -> Parameters:
    """Computes Hessian-vector products.

    Computes the product H.v where v is an arbitrary vector and H is the
    Hessian of a function evaluated at `parameters`.

    The result is the same as if the Hessian was computed explicitly and
    multiplied the vector. However, this function uses autodiff in backward
    then forward mode, so the Hessian itself is never materialised.

    Args:
      function: A (twice) differentiable function that takes as input a tensor
        or a list of tensors and returns a scalar.
      parameters: The parameters with respect to which we want to compute the
        Hessian for the Hessian-vector product.
      v: An arbitrary vector or list of vectors of the same nested structure
        as `parameters`.

    Returns:
      A vector or list of vectors of the same nested structure as
      `parameters`, equal to H.v.
    """
    forward_accumulator = tf.autodiff.ForwardAccumulator(
        primals=parameters, tangents=v)
    with forward_accumulator as acc:
        with tf.GradientTape() as tape:
            tape.watch(parameters)
            value = function(parameters)
        # Backward pass runs inside the accumulator so its JVP can be read below.
        gradient = tape.gradient(value, parameters)
    return acc.jvp(gradient)
Example #21
Source File: backprop_test.py From trax with Apache License 2.0 | 5 votes |
def test_setitem(self):
    """Checks gradients that flow through ndarray item assignment."""

    def as_list(tensor):
        return array_ops.array(tensor).tolist()

    # Single integer index.
    a = array_ops.array([1., 2., 3.])
    b = array_ops.array(5.)
    c = array_ops.array(10.)

    tensors = [arr.data for arr in [a, b, c]]
    with tf.GradientTape() as tape:
        tape.watch(tensors)
        a[1] = b + c
        loss = array_ops.sum(a)

    grad_a, grad_b, grad_c = tape.gradient(loss.data, tensors)
    self.assertSequenceEqual(as_list(grad_a), [1., 0., 1.])
    self.assertEqual(as_list(grad_b), 1.)
    self.assertEqual(as_list(grad_c), 1.)

    # Tuple index.
    a = array_ops.array([[[1., 2.], [3., 4.]],
                         [[5., 6.], [7., 8.]]])  # 2x2x2 array.
    b = array_ops.array([10., 11.])

    tensors = [arr.data for arr in [a, b]]
    with tf.GradientTape() as tape:
        tape.watch(tensors)
        a[(1, 0)] = b
        loss = array_ops.sum(a)

    grad_a, grad_b = tape.gradient(loss.data, tensors)
    self.assertSequenceEqual(
        as_list(grad_a),
        [[[1., 1.], [1., 1.]], [[0., 0.], [1., 1.]]])
    self.assertEqual(as_list(grad_b), [1., 1.])
Example #22
Source File: extensions_test.py From trax with Apache License 2.0 | 5 votes |
def testVjp(self, has_aux):
    """Checks `extensions.vjp` against a reference `tf.GradientTape`."""
    x_shape = (tf.TensorShape([10]), tf.TensorShape([1, 10]))
    y_shape = tf.TensorShape([])
    dtype = np.float32

    def f(a, b):
        y = tf_np.sum(tf_np.sqrt(tf_np.exp(a)) + b)
        if has_aux:
            return y, tf_np.asarray(1)
        return y

    rng = tf.random.Generator.from_seed(1234)

    def sample(shape):
        return uniform(rng, shape, dtype)

    x, dy_list = tf.nest.map_structure(sample, [x_shape, [y_shape] * 2])
    tf_x = to_tf(x)

    vjp_outputs = extensions.vjp(f, *x, has_aux=has_aux)
    if has_aux:
        y, vjp, aux = vjp_outputs
    else:
        y, vjp = vjp_outputs

    with tf.GradientTape(persistent=True) as tape:
        tape.watch(tf_x)
        reference_outputs = f(*x)

    if has_aux:
        expected_y, expected_aux = reference_outputs
        self.assertAllClose(to_tf(expected_aux), to_tf(aux))
    else:
        expected_y = reference_outputs
    self.assertAllClose(to_tf(expected_y), to_tf(y))

    # The tape is persistent, so it can serve one gradient call per cotangent.
    for dy in dy_list:
        expected_dx = tape.gradient(
            to_tf(expected_y), tf_x, output_gradients=to_tf(dy))
        self.assertAllClose(expected_dx, to_tf(vjp(dy)))
Example #23
Source File: model_tf2.py From machine-learning-for-programming-samples with MIT License | 5 votes |
def run_one_epoch(
    self, minibatches: Iterable[np.ndarray], training: bool = False,
):
    """Runs the model over `minibatches`, updating variables when training.

    Prints a progress line per batch (overwritten in place) and clears it at
    the end.

    Args:
      minibatches: Iterable of minibatch arrays; the first axis length of each
        is added to the sample count.
      training: When True, gradients of the token cross-entropy loss are
        applied through `self.optimizer`.

    Returns:
      A pair `(total_loss / num_samples, token accuracy)` for the epoch.
    """
    total_loss = 0.0
    num_samples = 0
    num_tokens = 0
    num_correct_tokens = 0

    for step, minibatch_data in enumerate(minibatches):
        with tf.GradientTape() as tape:
            model_outputs = self.compute_logits(minibatch_data, training=training)
            result = self.compute_loss_and_acc(model_outputs, minibatch_data)

        total_loss += result.token_ce_loss
        num_samples += minibatch_data.shape[0]
        num_tokens += result.num_predictions
        num_correct_tokens += result.num_correct_token_predictions

        if training:
            gradients = tape.gradient(
                result.token_ce_loss, self.trainable_variables
            )
            self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        # The small epsilon keeps the ratio finite when there are no predictions.
        batch_accuracy = result.num_correct_token_predictions / (
            float(result.num_predictions) + 1e-7
        )
        progress = " Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f" % (
            step,
            total_loss / num_samples,
            result.token_ce_loss,
            batch_accuracy,
        )
        print(progress, end="\r")

    # Erase the progress line.
    print("\r\x1b[K", end="")
    epoch_loss = total_loss / num_samples
    epoch_accuracy = num_correct_tokens / (float(num_tokens) + 1e-7)
    return (epoch_loss, epoch_accuracy)
Example #24
Source File: gradient.py From tf-quant-finance with Apache License 2.0 | 4 votes |
def value_and_gradient(f,
                       xs,
                       output_gradients=None,
                       use_gradient_tape=False,
                       unconnected_gradients=None,
                       name=None):
    """Computes `f(*xs)` and its gradients wrt to `*xs`.

    Args:
      f: Python `callable` to be differentiated. If `f` returns a scalar, this
        scalar will be differentiated. If `f` returns a tensor or list of
        tensors, by default a scalar will be computed by adding all their
        values to produce a single scalar. If desired, the tensors can be
        elementwise multiplied by the tensors passed as `output_gradients`.
      xs: Python list of parameters of `f` for which to differentiate. (Can
        also be single `Tensor`.)
      output_gradients: A `Tensor` or list of `Tensor`s the same size as the
        result `ys = f(*xs)` and holding the gradients computed for each `y`
        in `ys`. This argument is forwarded to the underlying gradient
        implementation (i.e., either the `grad_ys` argument of `tf.gradients`
        or the `output_gradients` argument of `tf.GradientTape.gradient`).
      use_gradient_tape: Python `bool` indicating that `tf.GradientTape`
        should be used regardless of `tf.executing_eagerly()` status.
        Default value: `False`.
      unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies
        the gradient value returned when the given input tensors are
        unconnected.
        Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
      name: Python `str` name prefixed to ops created by this function.
        Default value: `None` (i.e., `'value_and_gradient'`).

    Returns:
      A tuple `(y, dydx)`. `y` is the value `f(*xs)`. `dydx` is the gradient
      of `y` wrt `xs`: a list with one entry per element when `xs` is
      list-like, otherwise the single gradient.
    """
    if not unconnected_gradients:
        unconnected_gradients = tf.UnconnectedGradients.NONE
    xs, is_xs_list_like = _prepare_args(xs)
    with tf.name_scope(name or "value_and_gradient"):
        if not (tf.executing_eagerly() or use_gradient_tape):
            # Graph mode without a tape: use the symbolic gradient.
            y = f(*xs)
            grad = tf.gradients(ys=y, xs=xs, grad_ys=output_gradients,
                                unconnected_gradients=unconnected_gradients)
        else:
            with tf.GradientTape() as tape:
                for x in xs:
                    tape.watch(x)
                y = f(*xs)
            grad = tape.gradient(y, xs, output_gradients=output_gradients,
                                 unconnected_gradients=unconnected_gradients)
        # A non-list input gets its single gradient back unwrapped.
        return (y, grad) if is_xs_list_like else (y, grad[0])
Example #25
Source File: gradient.py From tf-quant-finance with Apache License 2.0 | 4 votes |
def gradients(func_or_y,
              xs,
              output_gradients=None,
              use_gradient_tape=False,
              unconnected_gradients=None,
              name=None):
    """Computes the gradients of `func_or_y` wrt to `*xs`.

    Args:
      func_or_y: Either a `Tensor` connected to the input `x` or a Python
        callable accepting one `Tensor` of shape of `x` and returning a
        `Tensor` of any shape. The function whose gradient is to be computed.
        If eagerly executing, can only be a callable, i.e., one should not
        supply a Tensor in eager mode.
      xs: Python list of parameters of `f` for which to differentiate. (Can
        also be single `Tensor`.)
      output_gradients: A `Tensor` or list of `Tensor`s the same size as the
        result `ys = f(*xs)` and holding the gradients computed for each `y`
        in `ys`. This argument is forwarded to the underlying gradient
        implementation (i.e., either the `grad_ys` argument of `tf.gradients`
        or the `output_gradients` argument of `tf.GradientTape.gradient`).
        Default value: `None` which maps to a ones-like `Tensor` of `ys`.
      use_gradient_tape: Python `bool` indicating that `tf.GradientTape`
        should be used regardless of `tf.executing_eagerly()` status.
        Default value: `False`.
      unconnected_gradients: An enum `tf.UnconnectedGradients` which specifies
        the gradient value returned when the given input tensors are
        unconnected.
        Default value: `None`, which maps to `tf.UnconnectedGradients.NONE`.
      name: Python `str` name prefixed to ops created by this function.
        Default value: `None` (i.e., 'gradients').

    Returns:
      A `Tensor` with the gradient of `y` wrt each of `xs` or a list of
      `Tensor`s if `xs` is a list.

    Raises:
      ValueError: If the tape path is taken (eager mode or
        `use_gradient_tape=True`) and `func_or_y` is not callable.
    """
    if not unconnected_gradients:
        unconnected_gradients = tf.UnconnectedGradients.NONE
    f = _prepare_func(func_or_y)
    with tf.name_scope(name or "gradients"):
        xs, is_xs_list_like = _prepare_args(xs)
        if tf.executing_eagerly() or use_gradient_tape:
            if not callable(func_or_y):
                raise ValueError("`func_or_y` should be a callable in eager mode or "
                                 "when `tf.GradientTape` is used.")
            with tf.GradientTape() as tape:
                for x in xs:
                    tape.watch(x)
                y = f(*xs)
            grad = tape.gradient(y, xs, output_gradients=output_gradients,
                                 unconnected_gradients=unconnected_gradients)
        else:
            # Graph mode without a tape: use the symbolic gradient.
            y = f(*xs)
            grad = tf.gradients(y, xs, grad_ys=output_gradients,
                                unconnected_gradients=unconnected_gradients)
        # A non-list input gets its single gradient back unwrapped.
        return grad if is_xs_list_like else grad[0]
Example #26
Source File: extensions.py From trax with Apache License 2.0 | 4 votes |
def grad(f, has_aux=False):
    """Returns a function that computes gradient of f.

    Gradients can only be computed through numpy and tensorflow operations and
    not through python float operations and values.

    Args:
      f: a function of type (params, *args) -> scalar. 'params' can be a nested
        structure (made of lists and tuples) of ndarrays and the gradient is
        evaluated against it. `scalar` is a scalar ndarray.
      has_aux: bool, indicates whether `f` returns a pair where the first
        element is considered the output of the mathematical function to be
        differentiated and the second element is auxiliary data.

    Returns:
      A gradient function of type (params, *args) -> gradients, where the
      result 'gradients' has the same structure and shapes as 'params'. When
      `has_aux` is True the result is the pair (gradients, aux).
    """

    def check_loss_shape(np_loss):
        is_ndarray = isinstance(np_loss, tf_np.ndarray)
        if not is_ndarray:
            raise ValueError(
                "The result of the function to take gradient must be an ndarray.")
        is_scalar = np_loss.data.shape.is_compatible_with([])
        if not is_scalar:
            raise ValueError(
                "The result of the function to take gradient must be a scalar.")

    def _f(params, *args):
        """The gradient function to be returned."""
        tf_params = _np_to_tf(params)
        with tf.GradientTape() as tape:
            tape.watch(tf.nest.flatten(tf_params))
            outputs = f(params, *args)

        aux = None
        np_loss = outputs
        if has_aux:
            np_loss, aux = outputs
        check_loss_shape(np_loss)

        tf_grads = tape.gradient(np_loss.data, tf_params)
        result = (tf_grads, aux) if has_aux else tf_grads
        return _tf_to_np(result)

    return _f


# A workaround for b/121383831
Example #27
Source File: extensions.py From trax with Apache License 2.0 | 4 votes |
def vjp(f, *primals, has_aux=False):
    """Returns the result and the VJP function of `f`.

    This function returns the result and the vector-Jacobian-product (VJP)
    function of `f`.

    Args:
      f: a function from (nested structures of) tf_np.ndarrays to a (nested
        structure of) tf_np.ndarray. If `has_aux` is True, it should return an
        extra output.
      *primals: the inputs to be fed to `f`.
      has_aux: if True, the second output of `f` will be regarded as an
        auxiliary, non-differentiable output that will be ignored by the VJP
        function.

    Returns:
      A pair `(y, vjpfun)` if `has_aux` is False; a tuple `(y, vjpfun, aux)`
      otherwise. `y` and `aux` are the outputs of `f`, i.e.
      `y, aux = f(*primals)`. `vjpfun` is a function `dx = vjpfun(dy)`, where
      `dy` is the cotangents of `y`, having the same structures, shapes and
      dtypes as `y`. `dx` is the cotangents of `x`, having the same
      structures, shapes and dtypes as `x`.
    """
    tf_primals = _np_to_tf(primals)
    # Persistent so the returned VJP function can be called more than once.
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(tf.nest.flatten(tf_primals))
        outputs = f(*primals)

    aux = None
    np_out = outputs
    if has_aux:
        np_out, aux = outputs
    tf_out = _np_to_tf(np_out)

    def _vjp(dy):
        cotangent = _np_to_tf(dy)
        tf_dx = tape.gradient(tf_out, tf_primals, output_gradients=cotangent)
        return _tf_to_np(tf_dx)

    return (np_out, _vjp, aux) if has_aux else (np_out, _vjp)


# TODO(wangpeng): match JAX's handling of kwargs and non-ndarray args
Example #28
Source File: helpers.py From compression with Apache License 2.0 | 4 votes |
def estimate_tails(func, target, shape, dtype):
    """Estimates approximate tail quantiles.

    This runs a simple Adam iteration to determine tail quantiles. The
    objective is to find an `x` such that:
    ```
    func(x) == target
    ```
    For instance, if `func` is a CDF and the target is a quantile value, this
    would find the approximate location of that quantile. Note that `func` is
    assumed to be monotonic. When each tail estimate has passed the optimal
    value of `x`, the algorithm does 10 additional iterations and then stops.

    This operation is vectorized. The tensor shape of `x` is given by `shape`,
    and `target` must have a shape that is broadcastable to the output of
    `func(x)`.

    Arguments:
      func: A callable that computes cumulative distribution function,
        survival function, or similar.
      target: The desired target value.
      shape: The shape of the `tf.Tensor` representing `x`.
      dtype: The `tf.dtypes.Dtype` of the computation (and the return value).

    Returns:
      A `tf.Tensor` representing the solution (`x`).
    """
    with tf.name_scope("estimate_tails"):
        dtype = tf.as_dtype(dtype)
        shape = tf.convert_to_tensor(shape, tf.int32)
        target = tf.convert_to_tensor(target, dtype)

        def loop_cond(tails, m, v, count):
            del tails, m, v  # unused
            # Keep iterating until every element has counted 10 steps.
            return tf.reduce_min(count) < 10

        def loop_body(tails, m, v, count):
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(tails)
                loss = abs(func(tails) - target)
            grad = tape.gradient(loss, tails)
            m = .5 * m + .5 * grad  # Adam mean estimate.
            v = .9 * v + .1 * tf.square(grad)  # Adam variance estimate.
            tails -= .5 * m / (tf.sqrt(v) + 1e-7)
            # Start counting when the gradient flips sign (note that this
            # assumes `tails` is initialized to zero).
            counting = tf.math.logical_or(count > 0, tails * grad > 0)
            count = tf.where(counting, count + 1, count)
            return tails, m, v, count

        initial_state = (
            tf.zeros(shape, dtype=dtype),     # tails
            tf.zeros(shape, dtype=dtype),     # m
            tf.ones(shape, dtype=dtype),      # v
            tf.zeros(shape, dtype=tf.int32),  # count
        )
        final_state = tf.while_loop(
            loop_cond, loop_body, initial_state, back_prop=False)
        return final_state[0]
Example #29
Source File: grad_utils.py From models with Apache License 2.0 | 4 votes |
def minimize_using_explicit_allreduce(tape,
                                      optimizer,
                                      loss,
                                      trainable_variables,
                                      pre_allreduce_callbacks=None,
                                      post_allreduce_callbacks=None):
    """Minimizes loss for one step by updating `trainable_variables`.

    This explicitly performs gradient allreduce, instead of relying on
    implicit allreduce in optimizer.apply_gradients(). If training using FP16
    mixed precision, explicit allreduce will aggregate gradients in FP16
    format. For TPU and GPU training using FP32, explicit allreduce will
    aggregate gradients in FP32 format.

    Arguments:
      tape: An instance of `tf.GradientTape`.
      optimizer: An instance of `tf.keras.optimizers.Optimizer`.
      loss: the loss tensor.
      trainable_variables: A list of model Variables.
      pre_allreduce_callbacks: A list of callback functions that takes
        gradients and model variables pairs as input, manipulate them, and
        returns a new gradients and model variables pairs. The callback
        functions will be invoked in the list order and before gradients are
        allreduced. With mixed precision training, the pre_allreduce_callbacks
        will be applied on scaled_gradients. Default is no callbacks.
      post_allreduce_callbacks: A list of callback functions that takes
        gradients and model variables pairs as input, manipulate them, and
        returns a new gradients and model variables pairs. The callback
        functions will be invoked in the list order and right before gradients
        are applied to variables for updates. Default is no callbacks.
    """
    uses_loss_scaling = isinstance(
        optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer)

    if uses_loss_scaling:
        # FP16 GPU code path: differentiate the scaled loss.
        with tape:
            target = optimizer.get_scaled_loss(loss)
        precision = "float16"
    else:
        # TPU or FP32 GPU code path.
        target = loss
        precision = "float32"

    grads = tape.gradient(target, trainable_variables)
    grads_and_vars = zip(grads, trainable_variables)
    if pre_allreduce_callbacks:
        grads_and_vars = _run_callbacks(pre_allreduce_callbacks, grads_and_vars)

    (allreduced_grads,
     filtered_training_vars) = _filter_and_allreduce_gradients(
         grads_and_vars, allreduce_precision=precision)
    if uses_loss_scaling:
        # Undo the loss scaling only after the allreduce.
        allreduced_grads = optimizer.get_unscaled_gradients(allreduced_grads)
    grads_and_vars = zip(allreduced_grads, filtered_training_vars)

    if post_allreduce_callbacks:
        grads_and_vars = _run_callbacks(post_allreduce_callbacks, grads_and_vars)
    optimizer.apply_gradients(
        grads_and_vars, experimental_aggregate_gradients=False)