Python tensorflow.compat.v1.stop_gradient() Examples
The following are 30 code examples of tensorflow.compat.v1.stop_gradient(), drawn from open-source projects. The project and source file each example comes from are noted above it.
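Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) illustrating the behavior all of these examples rely on: tf.stop_gradient acts as the identity in the forward pass but blocks gradient flow in the backward pass. It assumes TensorFlow 2.x with the v1 compatibility API enabled.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

x = tf.constant(3.0)
w = tf.Variable(2.0)

y = w * x                            # dy/dw is x = 3.0
y_stopped = tf.stop_gradient(w) * x  # w is treated as a constant here

grad = tf.gradients(y, w)                  # list with one tensor evaluating to 3.0
grad_stopped = tf.gradients(y_stopped, w)  # [None]: no gradient path back to w

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run([y, y_stopped]))  # [6.0, 6.0] -- identical forward values
  print(sess.run(grad))            # [3.0]
  print(grad_stopped)              # [None]

Many of the examples below build on this with the straight-through trick, e.g. result + tf.stop_gradient(tf.round(result) - result), which rounds on the forward pass while passing the gradient of result unchanged on the backward pass.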
Example #1
Source File: transformer_nat.py From tensor2tensor with Apache License 2.0 | 6 votes |
def vq_nearest_neighbor(x, hparams):
  """Find the nearest element in means to elements in x."""
  bottleneck_size = 2**hparams.bottleneck_bits
  means = hparams.means
  x_norm_sq = tf.reduce_sum(tf.square(x), axis=-1, keepdims=True)
  means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keepdims=True)
  scalar_prod = tf.matmul(x, means, transpose_b=True)
  dist = x_norm_sq + tf.transpose(means_norm_sq) - 2 * scalar_prod
  if hparams.bottleneck_kind == "em":
    x_means_idx = tf.multinomial(-dist, num_samples=hparams.num_samples)
    x_means_hot = tf.one_hot(
        x_means_idx, depth=bottleneck_size)
    x_means_hot = tf.reduce_mean(x_means_hot, axis=1)
  else:
    x_means_idx = tf.argmax(-dist, axis=-1)
    x_means_hot = tf.one_hot(x_means_idx, depth=bottleneck_size)
  x_means = tf.matmul(x_means_hot, means)
  e_loss = tf.reduce_mean(tf.squared_difference(x, tf.stop_gradient(x_means)))
  return x_means_hot, e_loss
Example #2
Source File: multi_head_dqn_agent.py From batch_rl with Apache License 2.0 | 6 votes |
def _build_train_op(self):
  """Builds a training op.

  Returns:
    train_op: An op performing one step of training from replay data.
  """
  actions = self._replay.actions
  indices = tf.stack([tf.range(actions.shape[0]), actions], axis=-1)
  replay_chosen_q = tf.gather_nd(
      self._replay_net_outputs.q_heads, indices=indices)
  target = tf.stop_gradient(self._build_target_q_op())
  loss = tf.losses.huber_loss(
      target, replay_chosen_q, reduction=tf.losses.Reduction.NONE)
  q_head_losses = tf.reduce_mean(loss, axis=0)
  final_loss = tf.reduce_mean(q_head_losses)
  if self.summary_writer is not None:
    with tf.variable_scope('Losses'):
      tf.summary.scalar('HuberLoss', final_loss)
  return self.optimizer.minimize(final_loss)
Example #3
Source File: autoencoders.py From tensor2tensor with Apache License 2.0 | 6 votes |
def gumbel_sample(self, reconstr_gan):
  hparams = self.hparams
  is_training = hparams.mode == tf.estimator.ModeKeys.TRAIN
  vocab_size = self._problem_hparams.vocab_size["targets"]
  if hasattr(self._hparams, "vocab_divisor"):
    vocab_size += (-vocab_size) % self._hparams.vocab_divisor
  reconstr_gan = tf.nn.log_softmax(reconstr_gan)
  if is_training and hparams.gumbel_temperature > 0.0:
    gumbel_samples = discretization.gumbel_sample(
        common_layers.shape_list(reconstr_gan))
    gumbel_samples *= hparams.gumbel_noise_factor
    reconstr_gan += gumbel_samples
    reconstr_sample = latent_layers.multinomial_sample(
        reconstr_gan, temperature=hparams.gumbel_temperature)
    reconstr_gan = tf.nn.softmax(reconstr_gan / hparams.gumbel_temperature)
  else:
    reconstr_sample = tf.argmax(reconstr_gan, axis=-1)
    reconstr_gan = tf.nn.softmax(reconstr_gan / 0.1)  # Sharpen a bit.
  # Use 1-hot forward, softmax backward.
  reconstr_hot = tf.one_hot(reconstr_sample, vocab_size)
  reconstr_gan += reconstr_hot - tf.stop_gradient(reconstr_gan)
  return reconstr_gan
Example #4
Source File: discretization.py From tensor2tensor with Apache License 2.0 | 6 votes |
def tanh_discrete_bottleneck(x, bottleneck_bits, bottleneck_noise,
                             discretize_warmup_steps, mode):
  """Simple discretization through tanh, flip bottleneck_noise many bits."""
  x = tf.layers.dense(x, bottleneck_bits, name="tanh_discrete_bottleneck")
  d0 = tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x))) - 1.0
  if mode == tf.estimator.ModeKeys.TRAIN:
    x += tf.truncated_normal(
        common_layers.shape_list(x), mean=0.0, stddev=0.2)
  x = tf.tanh(x)
  d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)
  if mode == tf.estimator.ModeKeys.TRAIN:
    noise = tf.random_uniform(common_layers.shape_list(x))
    noise = 2.0 * tf.to_float(tf.less(bottleneck_noise, noise)) - 1.0
    d *= noise
  d = common_layers.mix(d, x, discretize_warmup_steps,
                        mode == tf.estimator.ModeKeys.TRAIN)
  return d, d0
Example #5
Source File: utils.py From lamb with Apache License 2.0 | 6 votes |
def mask_from_lengths(lengths, max_length=None, dtype=None, name=None):
  """Convert a vector of lengths to a matrix of binary masks.

  This function will convert a vector of lengths to a matrix of binary masks.
  E.g. [2, 4, 3] will become [[1, 1, 0, 0], [1, 1, 1, 1], [1, 1, 1, 0]]

  Args:
    lengths: a d-dimensional vector of integers corresponding to lengths.
    max_length: an optional (default: None) scalar-like or 0-dimensional tensor
      indicating the maximum length of the masks. If not provided, the maximum
      length will be inferred from the lengths vector.
    dtype: the dtype of the returned mask, if specified. If None, the dtype of
      the lengths will be used.
    name: a name for the operation (optional).

  Returns:
    A d x max_length tensor of binary masks (int32).
  """
  with tf.name_scope(name, 'mask_from_lengths'):
    dtype = lengths.dtype if dtype is None else dtype
    max_length = tf.reduce_max(lengths) if max_length is None else max_length
    indexes = tf.range(max_length, dtype=lengths.dtype)
    mask = tf.less(tf.expand_dims(indexes, 0), tf.expand_dims(lengths, 1))
    cast_mask = tf.cast(mask, dtype)
  return tf.stop_gradient(cast_mask)
Example #6
Source File: discretization.py From tensor2tensor with Apache License 2.0 | 6 votes |
def bit_to_int(x_bit, num_bits, base=2):
  """Turn x_bit representing numbers bitwise (lower-endian) to int tensor.

  Args:
    x_bit: Tensor containing numbers in a particular base to be converted to
      int.
    num_bits: Number of bits in the representation.
    base: Base of the representation.

  Returns:
    Integer representation of this number.
  """
  x_l = tf.stop_gradient(tf.to_int32(tf.reshape(x_bit, [-1, num_bits])))
  x_labels = [
      x_l[:, i] * tf.to_int32(base)**tf.to_int32(i) for i in range(num_bits)]
  res = sum(x_labels)
  return tf.to_int32(tf.reshape(res, common_layers.shape_list(x_bit)[:-1]))
Example #7
Source File: shake_shake.py From tensor2tensor with Apache License 2.0 | 6 votes |
def shake_shake_branch(x, output_filters, stride, rand_forward, rand_backward,
                       hparams):
  """Building a 2 branching convnet."""
  is_training = hparams.mode == tf.estimator.ModeKeys.TRAIN
  x = tf.nn.relu(x)
  x = tf.layers.conv2d(
      x,
      output_filters, (3, 3),
      strides=(stride, stride),
      padding="SAME",
      name="conv1")
  x = tf.layers.batch_normalization(x, training=is_training, name="bn1")
  x = tf.nn.relu(x)
  x = tf.layers.conv2d(x, output_filters, (3, 3), padding="SAME", name="conv2")
  x = tf.layers.batch_normalization(x, training=is_training, name="bn2")
  if is_training:
    x = x * rand_backward + tf.stop_gradient(x * rand_forward -
                                             x * rand_backward)
  else:
    x *= 1.0 / hparams.shake_shake_num_branches
  return x
Example #8
Source File: vq_discrete.py From tensor2tensor with Apache License 2.0 | 6 votes |
def bit_to_int(self, x_bit, num_bits, base=2):
  """Turn x_bit representing numbers bitwise (lower-endian) to int tensor.

  Args:
    x_bit: Tensor containing numbers in a particular base to be converted to
      int.
    num_bits: Number of bits in the representation.
    base: Base of the representation.

  Returns:
    Integer representation of this number.
  """
  x_l = tf.stop_gradient(tf.to_int32(tf.reshape(x_bit, [-1, num_bits])))
  # pylint: disable=g-complex-comprehension
  x_labels = [
      x_l[:, i] * tf.to_int32(base)**tf.to_int32(i) for i in range(num_bits)]
  res = sum(x_labels)
  return tf.to_int32(tf.reshape(res, common_layers.shape_list(x_bit)[:-1]))
Example #9
Source File: vq_discrete.py From tensor2tensor with Apache License 2.0 | 6 votes |
def embedding_lookup(self, x, means):
  """Compute nearest neighbors and loss for training the embeddings.

  Args:
    x: Batch of encoder continuous latent states sliced/projected into shape
      [-1, num_blocks, block_dim].
    means: Embedding means.

  Returns:
    The nearest neighbor in one hot form, the nearest neighbor itself, the
    commitment loss, embedding training loss.
  """
  x_means_hot = self.nearest_neighbor(x, means)
  x_means_hot_flat = tf.reshape(
      x_means_hot, [-1, self.hparams.num_blocks, self.hparams.block_v_size])
  x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means)
  x_means = tf.transpose(x_means, [1, 0, 2])
  q_loss = tf.reduce_mean(
      tf.squared_difference(tf.stop_gradient(x), x_means))
  e_loss = tf.reduce_mean(
      tf.squared_difference(x, tf.stop_gradient(x_means)))
  return x_means_hot, x_means, q_loss, e_loss
Example #10
Source File: base.py From tensor2tensor with Apache License 2.0 | 6 votes |
def pixels_from_softmax(frame_logits, pure_sampling=False,
                        temperature=1.0, gumbel_noise_factor=0.2):
  """Given frame_logits from a per-pixel softmax, generate colors."""
  # If we're purely sampling, just sample each pixel.
  if pure_sampling or temperature == 0.0:
    return common_layers.sample_with_temperature(frame_logits, temperature)

  # Gumbel-sample from the pixel softmax and average by pixel values.
  pixel_range = tf.to_float(tf.range(256))
  for _ in range(len(frame_logits.get_shape().as_list()) - 1):
    pixel_range = tf.expand_dims(pixel_range, axis=0)

  frame_logits = tf.nn.log_softmax(frame_logits)
  gumbel_samples = discretization.gumbel_sample(
      common_layers.shape_list(frame_logits)) * gumbel_noise_factor

  frame = tf.nn.softmax((frame_logits + gumbel_samples) / temperature, axis=-1)
  result = tf.reduce_sum(frame * pixel_range, axis=-1)
  # Round on the forward pass, not on the backward one.
  return result + tf.stop_gradient(tf.round(result) - result)
Example #11
Source File: archs.py From compression with Apache License 2.0 | 5 votes |
def _get_moments(self, inputs):
  # Like tf.nn.moments but unbiased sample std. deviation.
  # Reduce over channels only.
  mean = tf.reduce_mean(inputs, [self.axis], keepdims=True, name="mean")
  variance = tf.reduce_sum(
      tf.squared_difference(inputs, tf.stop_gradient(mean)),
      [self.axis], keepdims=True, name="variance_sum")
  # Divide by N-1
  inputs_shape = tf.shape(inputs)
  counts = tf.reduce_prod([inputs_shape[ax] for ax in [self.axis]])
  variance /= (tf.cast(counts, tf.float32) - 1)
  return mean, variance
Example #12
Source File: layers.py From interval-bound-propagation with Apache License 2.0 | 5 votes |
def scale(self):
  self._ensure_is_connected()
  return tf.stop_gradient(self._gamma) if self._gamma is not None else None
Example #13
Source File: layers.py From interval-bound-propagation with Apache License 2.0 | 5 votes |
def bias(self):
  self._ensure_is_connected()
  return tf.stop_gradient(self._beta) if self._beta is not None else None
Example #14
Source File: layers.py From interval-bound-propagation with Apache License 2.0 | 5 votes |
def variance(self):
  self._ensure_is_connected()
  return tf.stop_gradient(self._variance)
Example #15
Source File: model.py From compression with Apache License 2.0 | 5 votes |
def _compute_discriminator_out(self,
                               nodes: Nodes,
                               create_summaries,
                               gradients_to_generator=True
                               ) -> archs.DiscOutSplit:
  """Get discriminator outputs."""
  with tf.name_scope("disc"):
    input_image = nodes.input_image_scaled
    reconstruction = nodes.reconstruction_scaled

    if not gradients_to_generator:
      reconstruction = tf.stop_gradient(reconstruction)

    discriminator_in = tf.concat([input_image, reconstruction], axis=0)

    # Condition D.
    latent = tf.stop_gradient(nodes.latent_quantized)
    latent = tf.concat([latent, latent], axis=0)
    discriminator_in = (discriminator_in, latent)

    disc_out_all = self._discriminator(discriminator_in,
                                       training=self.training)

  d_real, d_fake = tf.split(disc_out_all.d_all, 2)
  d_real_logits, d_fake_logits = tf.split(disc_out_all.d_all_logits, 2)
  disc_out_split = archs.DiscOutSplit(d_real, d_fake,
                                      d_real_logits, d_fake_logits)

  if create_summaries:
    tf.summary.scalar("d_real", tf.reduce_mean(disc_out_split.d_real))
    tf.summary.scalar("d_fake", tf.reduce_mean(disc_out_split.d_fake))

  return disc_out_split
Example #16
Source File: layers.py From interval-bound-propagation with Apache License 2.0 | 5 votes |
def mean(self):
  self._ensure_is_connected()
  return tf.stop_gradient(self._mean)
Example #17
Source File: archs.py From compression with Apache License 2.0 | 5 votes |
def _quantize(inputs, mean):
  half = tf.constant(.5, dtype=tf.float32)
  outputs = inputs
  outputs -= mean
  # Rounding latents for the forward pass (straight-through).
  outputs = outputs + tf.stop_gradient(tf.math.floor(outputs + half) - outputs)
  outputs += mean
  return outputs
Example #18
Source File: entropy_models.py From compression with Apache License 2.0 | 5 votes |
def _logits_cumulative(self, inputs, stop_gradient):
  """Evaluate logits of the cumulative densities.

  Arguments:
    inputs: The values at which to evaluate the cumulative densities, expected
      to be a `Tensor` of shape `(channels, 1, batch)`.
    stop_gradient: Boolean. Whether to add `tf.stop_gradient` calls so that the
      gradient of the output with respect to the density model parameters is
      disconnected (the gradient with respect to `inputs` is left untouched).

  Returns:
    A `Tensor` of the same shape as `inputs`, containing the logits of the
    cumulative densities evaluated at the given inputs.
  """
  logits = inputs
  for i in range(len(self.filters) + 1):
    matrix = self._matrices[i]
    if stop_gradient:
      matrix = tf.stop_gradient(matrix)
    logits = tf.linalg.matmul(matrix, logits)

    bias = self._biases[i]
    if stop_gradient:
      bias = tf.stop_gradient(bias)
    logits += bias

    if i < len(self._factors):
      factor = self._factors[i]
      if stop_gradient:
        factor = tf.stop_gradient(factor)
      logits += factor * tf.math.tanh(logits)
  return logits
Example #19
Source File: entropy_models.py From compression with Apache License 2.0 | 5 votes |
def _likelihood(self, inputs):
  ndim, channel_axis, _, _ = self._get_input_dims()
  half = tf.constant(.5, dtype=self.dtype)

  # Convert to (channels, 1, batch) format by commuting channels to front
  # and then collapsing.
  order = list(range(ndim))
  order.pop(channel_axis)
  order.insert(0, channel_axis)
  inputs = tf.transpose(inputs, order)
  shape = tf.shape(inputs)
  inputs = tf.reshape(inputs, (shape[0], 1, -1))

  # Evaluate densities.
  # We can use the special rule below to only compute differences in the left
  # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1
  # for large x, 0 for small x. Subtracting two numbers close to 0 can be done
  # with much higher precision than subtracting two numbers close to 1.
  lower = self._logits_cumulative(inputs - half, stop_gradient=False)
  upper = self._logits_cumulative(inputs + half, stop_gradient=False)
  # Flip signs if we can move more towards the left tail of the sigmoid.
  sign = -tf.math.sign(tf.math.add_n([lower, upper]))
  sign = tf.stop_gradient(sign)
  likelihood = abs(
      tf.math.sigmoid(sign * upper) - tf.math.sigmoid(sign * lower))

  # Convert back to input tensor shape.
  order = list(range(1, ndim))
  order.insert(channel_axis, 0)
  likelihood = tf.reshape(likelihood, shape)
  likelihood = tf.transpose(likelihood, order)

  return likelihood
Example #20
Source File: value_ops.py From trfl with Apache License 2.0 | 5 votes |
def td_learning(v_tm1, r_t, pcont_t, v_t, name="TDLearning"):
  """Implements the TD(0)-learning loss as a TensorFlow op.

  The TD loss is `0.5` times the squared difference between `v_tm1` and
  the target `r_t + pcont_t * v_t`.

  See "Learning to Predict by the Methods of Temporal Differences" by Sutton.
  (https://link.springer.com/article/10.1023/A:1022633531479).

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    v_t: Tensor holding values at current timestep, shape `[B]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[v_tm1, v_t, r_t, pcont_t]], [1], name)

  # TD(0)-learning op.
  with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, v_t]):

    # Build target.
    target = tf.stop_gradient(r_t + pcont_t * v_t)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - v_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, TDExtra(target, td_error))
Example #21
Source File: value_ops.py From trfl with Apache License 2.0 | 5 votes |
def qv_max(v_tm1, r_t, pcont_t, q_t, name="QVMAX"):
  """Implements the QVMAX learning loss as a TensorFlow op.

  The QVMAX loss is `0.5` times the squared difference between `v_tm1` and
  the target `r_t + pcont_t * max q_t`, where `q_t` is separately learned
  through QV learning (c.f. `action_value_ops.qv_learning`).

  See "The QV Family Compared to Other Reinforcement Learning Algorithms" by
  Wiering and van Hasselt (2009).
  (http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.713.1931)

  Args:
    v_tm1: Tensor holding values at previous timestep, shape `[B]`.
    r_t: Tensor holding rewards, shape `[B]`.
    pcont_t: Tensor holding pcontinue values, shape `[B]`.
    q_t: Tensor of action values at current timestep, shape `[B, num_actions]`.
    name: name to prefix ops created by this function.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `v_tm1`, shape `[B]`.
        * `td_error`: batch of temporal difference errors, shape `[B]`.
  """
  # Rank and compatibility checks.
  base_ops.wrap_rank_shape_assert([[v_tm1, r_t, pcont_t], [q_t]], [1, 2], name)

  # The QVMAX op.
  with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, q_t]):

    # Build target.
    target = tf.stop_gradient(r_t + pcont_t * tf.reduce_max(q_t, axis=1))

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - v_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, TDExtra(target, td_error))
Example #22
Source File: dist_value_ops.py From trfl with Apache License 2.0 | 5 votes |
def _slice_with_actions(embeddings, actions):
  """Slice a Tensor.

  Take embeddings of the form [batch_size, num_actions, embed_dim] and actions
  of the form [batch_size, 1], and return the sliced embeddings like
  embeddings[:, actions, :].

  Args:
    embeddings: Tensor of embeddings to index.
    actions: int Tensor to use as index into embeddings

  Returns:
    Tensor of embeddings indexed by actions
  """
  batch_size, num_actions = embeddings.get_shape()[:2]

  # Values are the 'values' in a sparse tensor we will be setting
  act_indx = tf.cast(actions, tf.int64)[:, None]
  values = tf.reshape(tf.cast(tf.ones(tf.shape(actions)), tf.bool), [-1])

  # Create a range for each index into the batch
  act_range = tf.range(0, batch_size, dtype=tf.int64)[:, None]
  # Combine this into coordinates with the action indices
  indices = tf.concat([act_range, act_indx], 1)

  actions_mask = tf.SparseTensor(indices, values, [batch_size, num_actions])
  actions_mask = tf.stop_gradient(
      tf.sparse_tensor_to_dense(actions_mask, default_value=False))
  sliced_emb = tf.boolean_mask(embeddings, actions_mask)
  return sliced_emb
Example #23
Source File: policy_gradient_ops.py From trfl with Apache License 2.0 | 5 votes |
def policy_gradient(policies, actions, action_values, policy_vars=None,
                    name="policy_gradient"):
  """Computes policy gradient losses for a batch of trajectories.

  See `policy_gradient_loss` for more information on expected inputs and usage.

  Args:
    policies: A distribution over a batch supporting a `log_prob` method, e.g.
        an instance of `tfp.distributions.Distribution`. For example, for
        a diagonal gaussian policy:
        `policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)`
    actions: An action batch Tensor used as the argument for `log_prob`. Has
        shape equal to the batch shape of the policies concatenated with the
        event shape of the policies (which may be scalar, in which case
        concatenation leaves shape just equal to batch shape).
    action_values: A Tensor containing estimates of the values of the `actions`.
        Has shape equal to the batch shape of the policies.
    policy_vars: An optional iterable of Tensors used by `policies`. If provided
        is used in scope checks. For the multivariate normal example above this
        would be `[mus, sigmas]`.
    name: Customises the name_scope for this op.

  Returns:
    loss: Tensor with same shape as `actions` containing the total loss for
        each element in the batch. Differentiable w.r.t the variables in
        `policies` only.
  """
  policy_vars = list(policy_vars) if policy_vars else list()
  with tf.name_scope(values=policy_vars + [actions, action_values], name=name):
    actions = tf.stop_gradient(actions)
    action_values = tf.stop_gradient(action_values)
    log_prob_actions = policies.log_prob(actions)
    # Prevent accidental broadcasting if possible at construction time.
    action_values.get_shape().assert_is_compatible_with(
        log_prob_actions.get_shape())
    return -tf.multiply(log_prob_actions, action_values)
Example #24
Source File: learner.py From meta-dataset with Apache License 2.0 | 5 votes |
def gradient_descent_step(loss,
                          variables,
                          stop_grads,
                          allow_grads_to_batch_norm_vars,
                          learning_rate,
                          get_update_ops=True):
  """Returns the updated vars after one step of gradient descent."""
  grads = tf.gradients(loss, variables)

  if stop_grads:
    grads = [tf.stop_gradient(dv) for dv in grads]

  def _apply_grads(variables, grads):
    """Applies gradients using SGD on a list of variables."""
    v_new, update_ops = [], []
    for (v, dv) in zip(variables, grads):
      if (not allow_grads_to_batch_norm_vars and
          ('offset' in v.name or 'scale' in v.name)):
        updated_value = v  # no update.
      else:
        updated_value = v - learning_rate * dv  # gradient descent update.
        if get_update_ops:
          update_ops.append(tf.assign(v, updated_value))
      v_new.append(updated_value)
    return v_new, update_ops

  updated_vars, update_ops = _apply_grads(variables, grads)
  return {'updated_vars': updated_vars, 'update_ops': update_ops}
Example #25
Source File: cycle_gan.py From tensor2tensor with Apache License 2.0 | 5 votes |
def discriminator(x, compress, hparams, name, reuse=None):
  with tf.variable_scope(name, reuse=reuse):
    x = tf.stop_gradient(2 * x) - x  # Reverse gradient.
    if compress:
      x = transformer_vae.compress(x, None, False, hparams, "compress")
    else:
      x = transformer_vae.residual_conv(x, 1, 3, hparams, "compress_rc")
    y = tf.reduce_mean(x, axis=1)
    return tf.tanh(tf.layers.dense(y, 1, name="reduce"))
Example #26
Source File: utils.py From lamb with Apache License 2.0 | 5 votes |
def compute_lengths(symbols_list, eos_symbol, name=None, dtype=tf.int64):
  """Computes sequence lengths given end-of-sequence symbol.

  Args:
    symbols_list: list of [batch_size] tensors of symbols (e.g. integers).
    eos_symbol: end of sequence symbol (e.g. integer).
    name: name for the name scope of this op.
    dtype: type of symbols, default: tf.int64.

  Returns:
    Tensor [batch_size] of lengths of sequences.
  """
  with tf.name_scope(name, 'compute_lengths'):
    max_len = len(symbols_list)
    eos_symbol_ = tf.constant(eos_symbol, dtype=dtype)
    # Array with max_len-time where we have EOS, 0 otherwise. Maximum of this
    # is the first EOS in that example.
    ends = [tf.constant(max_len - i, dtype=tf.int64)
            * tf.to_int64(tf.equal(s, eos_symbol_))
            for i, s in enumerate(symbols_list)]
    # Lengths of sequences, or max_len for sequences that didn't have EOS.
    # Note: examples that don't have EOS will have max value of 0 and value of
    # max_len+1 in lens_.
    lens_ = max_len + 1 - tf.reduce_max(tf.stack(ends, 1), axis=1)
    # For examples that didn't have EOS decrease max_len+1 to max_len as the
    # length.
    lens = tf.subtract(lens_, tf.to_int64(tf.equal(lens_, max_len + 1)))
    return tf.stop_gradient(tf.reshape(lens, [-1]))
Example #27
Source File: transformer_nat.py From tensor2tensor with Apache License 2.0 | 5 votes |
def get_latent_pred_loss(latents_pred, latents_discrete_hot, hparams):
  """Latent prediction and loss."""
  latents_logits = tf.layers.dense(
      latents_pred, 2**hparams.bottleneck_bits, name="extra_logits")
  loss = tf.nn.softmax_cross_entropy_with_logits_v2(
      labels=tf.stop_gradient(latents_discrete_hot), logits=latents_logits)
  return loss
Example #28
Source File: autoencoders.py From tensor2tensor with Apache License 2.0 | 5 votes |
def reverse_gradient(x, lr=1.0):
  return -lr * x + tf.stop_gradient((1.0 + lr) * x)
Example #29
Source File: autoencoders.py From tensor2tensor with Apache License 2.0 | 5 votes |
def bottleneck(self, x):
  hparams = self.hparams
  x = tf.tanh(tf.layers.dense(x, hparams.bottleneck_bits, name="bottleneck"))
  d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)
  if hparams.mode == tf.estimator.ModeKeys.TRAIN:
    noise = tf.random_uniform(common_layers.shape_list(x))
    noise = 2.0 * tf.to_float(tf.less(hparams.bottleneck_noise, noise)) - 1.0
    d *= noise
  x = common_layers.mix(d, x, hparams.discretize_warmup_steps,
                        hparams.mode == tf.estimator.ModeKeys.TRAIN)
  return x, 0.0
Example #30
Source File: transformer_vae_flow_prior_ops.py From tensor2tensor with Apache License 2.0 | 5 votes |
def predict_target_lengths(
    encoder_output, inputs_mask, hparams, length_diff=None):
  """Predict target lengths."""
  bound = hparams.lendiff_bound
  inputs_length = tf.cast(tf.reduce_sum(inputs_mask, 1), tf.int32)
  targets_length = inputs_length
  loss = None
  if hparams.predict_target_length:
    encoder_output = gops.reduce_mean_over_l(encoder_output, inputs_mask)
    logits = tf.stop_gradient(encoder_output)
    logits = lenpred_mlp("lenpred", logits, hparams.hidden_size, bound)
    if length_diff is not None:
      labels = tf.maximum(tf.minimum(length_diff, bound), -bound)
      labels = tf.cast(labels + bound, tf.int32)
      labels = tf.stop_gradient(labels)
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)
      loss = tf.reduce_mean(loss)
    diff_pred = tf.argmax(logits, 1)
    diff_pred = tf.cast(diff_pred - bound, tf.int32)
    targets_length = inputs_length + diff_pred
    targets_length = tf.maximum(targets_length, 1)
  divi = 4
  targets_length = tf.ceil(targets_length / divi) * divi
  targets_length = tf.cast(targets_length, tf.int32)
  return targets_length, loss