Python tensorflow.Tensor() Examples
The following are 30 code examples of tensorflow.Tensor(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow, or try the search function.
Example #1
Source File: graph_builder.py From DOTA_models with Apache License 2.0 | 9 votes

def _clip_gradients(self, grad):
  """Clips gradients if the hyperparameter `gradient_clip_norm` requires it.

  Sparse tensors, in the form of IndexedSlices returned for the gradients
  of embeddings, require special handling.

  Args:
    grad: Gradient Tensor, IndexedSlices, or None.

  Returns:
    Optionally clipped gradient.
  """
  if grad is not None and self.hyperparams.gradient_clip_norm > 0:
    logging.info('Clipping gradient %s', grad)
    if isinstance(grad, tf.IndexedSlices):
      tmp = tf.clip_by_norm(grad.values, self.hyperparams.gradient_clip_norm)
      return tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
    else:
      return tf.clip_by_norm(grad, self.hyperparams.gradient_clip_norm)
  else:
    return grad
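To see what the two branches do, here is a minimal standalone sketch (TensorFlow 1.x assumed; the threshold and toy tensors are hypothetical, not taken from the hyperparams proto):

import tensorflow as tf

GRADIENT_CLIP_NORM = 1.0  # hypothetical clip threshold

dense_grad = tf.constant([[3.0, 4.0]])  # L2 norm 5.0, so it gets rescaled
clipped_dense = tf.clip_by_norm(dense_grad, GRADIENT_CLIP_NORM)

# For sparse gradients only the values are clipped; indices and
# dense_shape pass through unchanged.
sparse_grad = tf.IndexedSlices(
    values=tf.constant([[6.0, 8.0]]),
    indices=tf.constant([2], dtype=tf.int64),
    dense_shape=tf.constant([10, 2], dtype=tf.int64))
clipped_sparse = tf.IndexedSlices(
    tf.clip_by_norm(sparse_grad.values, GRADIENT_CLIP_NORM),
    sparse_grad.indices, sparse_grad.dense_shape)

with tf.Session() as sess:
  print(sess.run(clipped_dense))          # ~[[0.6, 0.8]]
  print(sess.run(clipped_sparse.values))  # ~[[0.6, 0.8]]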
Example #2
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def restore(self, x):
  """Add padding back to the given tensor.

  Args:
    x (tf.Tensor): of shape [dim_compressed,...]

  Returns:
    a tensor of shape [dim_origin,...] with dim_origin >= dim_compressed.
    The dim is restored from the original reference tensor.
  """
  with tf.name_scope("pad_reduce/restore"):
    x = tf.scatter_nd(
        indices=self.nonpad_ids,
        updates=x,
        shape=tf.concat([self.dim_origin, tf.shape(x)[1:]], axis=0),
    )
  return x
Example #3
Source File: graph_builder.py From DOTA_models with Apache License 2.0 | 6 votes

def _create_learning_rate(hyperparams, step_var):
  """Creates learning rate var, with decay and switching for CompositeOptimizer.

  Args:
    hyperparams: a GridPoint proto containing optimizer spec, particularly
      learning_method to determine optimizer class to use.
    step_var: tf.Variable, global training step.

  Returns:
    a scalar `Tensor`, the learning rate based on current step and hyperparams.
  """
  if hyperparams.learning_method != 'composite':
    base_rate = hyperparams.learning_rate
  else:
    spec = hyperparams.composite_optimizer_spec
    switch = tf.less(step_var, spec.switch_after_steps)
    base_rate = tf.cond(switch,
                        lambda: tf.constant(spec.method1.learning_rate),
                        lambda: tf.constant(spec.method2.learning_rate))
  return tf.train.exponential_decay(
      base_rate,
      step_var,
      hyperparams.decay_steps,
      hyperparams.decay_base,
      staircase=hyperparams.decay_staircase)
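As a quick illustration of the decay op used on the last line, here is a minimal sketch (TensorFlow 1.x assumed; the base rate and decay schedule are hypothetical stand-ins for the hyperparams fields):

import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(
    learning_rate=0.1,      # hypothetical base rate
    global_step=global_step,
    decay_steps=1000,       # decay every 1000 steps...
    decay_rate=0.96,        # ...by a factor of 0.96
    staircase=True)         # step-wise rather than continuous decay

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(learning_rate))  # 0.1 at step 0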
Example #4
Source File: attention_lm_moe.py From fine-lm with MIT License | 6 votes

def remove_pad(x, pad_remover, mode):
  """Remove padding by concatenating all dimensions into one.

  Args:
    x (tf.Tensor): input of shape [batch_size, length, depth]
    pad_remover (obj): a PadRemover object
    mode (ModeKeys): infer, train or eval. If inference, the padding remover
      is not applied.

  Returns:
    tf.Tensor of shape [1, length_nonpad, depth] where
    length_nonpad <= batch_size * length
  """
  # Concatenate all tokens (without padding)
  x = expert_utils.flatten_all_but_last(x)

  # Remove padding for training and eval
  if mode != ModeKeys.PREDICT:
    # This is a hack that allows inference when the <go> token is detected
    # as padding and removed. This works for now because there is no padding
    # at inference.
    x = pad_remover.remove(x)

  x = tf.expand_dims(x, axis=0)  # Now batch_size=1
  return x
Example #5
Source File: attention_lm_moe.py From fine-lm with MIT License | 6 votes

def expand_batch_coordinates(bc, length_factor):
  """Duplicate elements of bc by length_factor.

  Args:
    bc (tf.Tensor): int32 tensor of shape [1, length, 1]
    length_factor (int): number of copies of each element

  Returns:
    tf.Tensor: of shape [1, length*length_factor, 1] where every element has
    been duplicated length_factor times.
  """
  assert bc.get_shape().as_list() == [1, None, 1]
  # bc has shape [1, length, 1]
  bc *= tf.constant([[1] * length_factor])
  # bc has shape [1, length, length_factor]
  bc = tf.reshape(bc, [1, -1, 1])
  # bc has shape [1, length*length_factor, 1]
  return bc
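The broadcast-then-reshape trick is compact but easy to misread, so here is a tiny runnable check (TensorFlow 1.x assumed; the input values are arbitrary):

import tensorflow as tf

bc = tf.constant([[[0], [1], [2]]], dtype=tf.int32)  # shape [1, 3, 1]
length_factor = 2

# Multiplying [1, 3, 1] by [[1, 1]] broadcasts to [1, 3, 2]; reshaping
# back to [1, -1, 1] interleaves the copies.
expanded = tf.reshape(bc * tf.constant([[1] * length_factor]), [1, -1, 1])

with tf.Session() as sess:
  print(sess.run(expanded)[:, :, 0])  # [[0 0 1 1 2 2]]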
Example #6
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def dispatch(self, inp):
  """Create one input Tensor for each expert.

  Args:
    inp: a list of length num_datashards `Tensor`s with shapes
      `[batch_size[d], <extra_input_dims>]`.

  Returns:
    a list of `num_experts` `Tensor`s with shapes
    `[num_examples[i], <extra_input_dims>]`.
  """
  dispatched = self._dp(lambda a, b: a.dispatch(b), self._dispatchers, inp)
  ret = self._ep(tf.concat, transpose_list_of_lists(dispatched), 0)
  if ret[0].dtype == tf.float32:
    # see comments on common_layers.convert_gradient_to_tensor
    ret = self._ep(common_layers.convert_gradient_to_tensor, ret)
  return ret
Example #7
Source File: t2t_model.py From fine-lm with MIT License | 6 votes

def body(self, features):
  """Most models will override this function.

  Compute label logits for one shard as a function of the transformed
  features.

  Args:
    features: A dictionary of key to Tensor. Each Tensor has shape
      [batch_size, ?, ?, hidden_size].

  Returns:
    output: tensor of logits with shape [batch_size, O, P, body_output_size].
    losses: either single loss as a scalar, a list, a tensor (to be averaged)
      or a dictionary of losses.
  """
  raise NotImplementedError("Abstract Method")
Example #8
Source File: t2t_model.py From fine-lm with MIT License | 6 votes

def eval_autoregressive(self, features=None, decode_length=50):
  """Autoregressive eval.

  Quadratic time in decode_length.

  Args:
    features: a map of string to `Tensor`
    decode_length: an integer. How many additional timesteps to decode.

  Returns:
    logits: `Tensor`
    losses: a dictionary: {loss-name (string): floating point `Scalar`}.
      Contains a single key "training".
  """
  results = self._slow_greedy_infer(features, decode_length=decode_length)
  return results["logits"], results["losses"]
Example #9
Source File: t2t_model.py From fine-lm with MIT License | 6 votes

def average_sharded_losses(sharded_losses):
  """Average losses across datashards.

  Args:
    sharded_losses: list<dict<str loss_name, Tensor loss>>. The loss can be
      a single Tensor or a 2-tuple (numerator and denominator).

  Returns:
    losses: dict<str loss_name, Tensor avg_loss>
  """
  losses = {}
  for loss_name in sorted(sharded_losses[0]):
    all_shards = [shard_losses[loss_name] for shard_losses in sharded_losses]
    if isinstance(all_shards[0], tuple):
      sharded_num, sharded_den = zip(*all_shards)
      mean_loss = (
          tf.add_n(sharded_num) / tf.maximum(
              tf.cast(1.0, sharded_den[0].dtype), tf.add_n(sharded_den)))
    else:
      mean_loss = tf.reduce_mean(all_shards)
    losses[loss_name] = mean_loss
  return losses
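To make the tuple branch concrete, here is a minimal sketch of averaging one (numerator, denominator) loss across two shards (TensorFlow 1.x assumed; the shard values are made up):

import tensorflow as tf

sharded_losses = [
    {"training": (tf.constant(6.0), tf.constant(2.0))},
    {"training": (tf.constant(4.0), tf.constant(3.0))},
]
nums, dens = zip(*[shard["training"] for shard in sharded_losses])
# Sum numerators and denominators separately; the tf.maximum guards
# against an all-zero denominator.
avg = tf.add_n(list(nums)) / tf.maximum(
    tf.cast(1.0, dens[0].dtype), tf.add_n(list(dens)))

with tf.Session() as sess:
  print(sess.run(avg))  # (6 + 4) / (2 + 3) = 2.0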
Example #10
Source File: modeling.py From BERT-Classification-Tutorial with Apache License 2.0 | 6 votes

def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value
      (NOT of *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output
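Note the probability inversion: this helper takes a drop probability, while TF 1.x's tf.nn.dropout takes a keep probability. A minimal sketch (the shapes and rate are arbitrary):

import tensorflow as tf

x = tf.ones([2, 4])
dropout_prob = 0.1  # probability of dropping, so keep_prob is 0.9

# Surviving values are scaled by 1 / keep_prob so the expected sum
# of the tensor is unchanged.
y = tf.nn.dropout(x, keep_prob=1.0 - dropout_prob)

with tf.Session() as sess:
  print(sess.run(y))  # mostly ~1.11, with roughly 10% of entries zeroed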
Example #11
Source File: t2t_model.py From fine-lm with MIT License | 6 votes

def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha):
  """Beam search decoding.

  Models should ideally implement a more efficient version of this function.

  Args:
    features: a map of string to `Tensor`
    decode_length: an integer. How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer. How many of the beams to return.
    alpha: Float that controls the length penalty. The larger the alpha,
      the stronger the preference for longer translations.

  Returns:
    samples: an integer `Tensor`. Top samples from the beam search.
  """
  return self._beam_decode_slow(features, decode_length, beam_size,
                                top_beams, alpha)
Example #12
Source File: t2t_model.py From fine-lm with MIT License | 6 votes

def _greedy_infer(self, features, decode_length, use_tpu=False):
  """A greedy inference method.

  Models should ideally implement a more efficient version of this function.

  Args:
    features: a map of string to `Tensor`
    decode_length: an integer. How many additional timesteps to decode.
    use_tpu: A bool, whether to build the inference graph for TPU.

  Returns:
    A dict of decoding results {
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": None
        "logits": `Tensor` of shape [batch_size, time, 1, 1, vocab_size].
        "losses": a dictionary: {loss-name (string): floating point `Scalar`}
    }
  """
  return (self._slow_greedy_infer_tpu(features, decode_length)
          if use_tpu else self._slow_greedy_infer(features, decode_length))
Example #13
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def combine(self, x):
  """Return the output from the experts.

  When one example goes to multiple experts, the outputs are summed.

  Args:
    x: a Tensor with shape [batch, num_experts, expert_capacity, depth]

  Returns:
    a `Tensor` with shape `[batch, length, depth]`
  """
  depth = tf.shape(x)[-1]
  x *= tf.expand_dims(self._nonpadding, -1)
  ret = tf.unsorted_segment_sum(
      x, self._flat_indices, num_segments=self._batch * self._length)
  ret = tf.reshape(ret, [self._batch, self._length, depth])
  return ret
Example #14
Source File: common_attention.py From fine-lm with MIT License | 6 votes

def attention_bias_local(length, max_backward, max_forward):
  """Create a bias tensor to be added to attention logits.

  A position may attend to positions at most max_distance from it,
  forward and backwards.

  This does not actually save any computation.

  Args:
    length: int
    max_backward: int, maximum distance backward to attend. Negative values
      indicate unlimited.
    max_forward: int, maximum distance forward to attend. Negative values
      indicate unlimited.

  Returns:
    a `Tensor` with shape [1, 1, length, length].
  """
  band = common_layers.ones_matrix_band_part(
      length,
      length,
      max_backward,
      max_forward,
      out_shape=[1, 1, length, length])
  return -1e9 * (1.0 - band)
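ones_matrix_band_part is a tensor2tensor helper, but for non-negative limits it appears to behave like TF's built-in band op, so the bias can be sketched as follows (TensorFlow 1.x assumed; the window sizes are hypothetical and tf.matrix_band_part stands in for the library helper):

import tensorflow as tf

length, max_backward, max_forward = 5, 1, 0  # attend to self and 1 step back

# tf.matrix_band_part keeps max_backward subdiagonals and max_forward
# superdiagonals of the all-ones matrix, zeroing the rest.
band = tf.matrix_band_part(
    tf.ones([length, length]), max_backward, max_forward)
bias = -1e9 * (1.0 - tf.reshape(band, [1, 1, length, length]))

with tf.Session() as sess:
  print(sess.run(bias)[0, 0])  # 0.0 on the band, -1e9 everywhere else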
Example #15
Source File: common_attention.py From fine-lm with MIT License | 6 votes

def get_layer_timing_signal_learned_1d(channels, layer, num_layers):
  """Get n-dimensional embedding as the layer (vertical) timing signal.

  Adds embeddings to represent the position of the layer in the tower.

  Args:
    channels: dimension of the timing signal
    layer: layer num
    num_layers: total number of layers

  Returns:
    a Tensor of timing signals [1, 1, channels].
  """
  shape = [num_layers, 1, 1, channels]
  layer_embedding = (
      tf.get_variable(
          "layer_embedding",
          shape,
          initializer=tf.random_normal_initializer(0, channels**-0.5)) *
      (channels**0.5))
  return layer_embedding[layer, :, :, :]
Example #16
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def combine(self, expert_out, multiply_by_gates=True):
  """Sum together the expert output, weighted by the gates.

  The slice corresponding to a particular batch element `b` is computed
  as the sum over all experts `i` of the expert output, weighted by the
  corresponding gate values. If `multiply_by_gates` is set to False, the
  gate values are ignored.

  Args:
    expert_out: a list of `num_experts` `Tensor`s, each with shape
      `[expert_batch_size_i, <extra_output_dims>]`.
    multiply_by_gates: a boolean

  Returns:
    a `Tensor` with shape `[batch_size, <extra_output_dims>]`.
  """
  # see comments on convert_gradient_to_tensor
  stitched = common_layers.convert_gradient_to_tensor(
      tf.concat(expert_out, 0))
  if multiply_by_gates:
    stitched *= tf.expand_dims(self._nonzero_gates, 1)
  combined = tf.unsorted_segment_sum(stitched, self._batch_index,
                                     tf.shape(self._gates)[0])
  return combined
Example #17
Source File: optimizer_builder_test.py From DOTA_models with Apache License 2.0 | 6 votes

def testBuildManualStepLearningRate(self):
  learning_rate_text_proto = """
    manual_step_learning_rate {
      schedule {
        step: 0
        learning_rate: 0.006
      }
      schedule {
        step: 90000
        learning_rate: 0.00006
      }
    }
  """
  global_summaries = set([])
  learning_rate_proto = optimizer_pb2.LearningRate()
  text_format.Merge(learning_rate_text_proto, learning_rate_proto)
  learning_rate = optimizer_builder._create_learning_rate(
      learning_rate_proto, global_summaries)
  self.assertTrue(isinstance(learning_rate, tf.Tensor))
Example #18
Source File: dqn_utils.py From cs294-112_hws with MIT License | 6 votes

def compute_exponential_averages(variables, decay):
    """Given a list of tensorflow scalar variables create ops corresponding
    to their exponential averages.

    Parameters
    ----------
    variables: [tf.Tensor]
        List of scalar tensors.

    Returns
    -------
    averages: [tf.Tensor]
        List of scalar tensors corresponding to averages of all the
        `variables` (in order)
    apply_op: tf.runnable
        Op to be run to update the averages with current value of variables.
    """
    averager = tf.train.ExponentialMovingAverage(decay=decay)
    apply_op = averager.apply(variables)
    return [averager.average(v) for v in variables], apply_op
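A minimal usage sketch of the underlying API (TensorFlow 1.x assumed; the variable, decay, and update are illustrative only):

import tensorflow as tf

v = tf.Variable(0.0)
averager = tf.train.ExponentialMovingAverage(decay=0.9)
apply_op = averager.apply([v])   # creates a shadow variable for v
avg = averager.average(v)        # reads the shadow variable

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())  # shadow starts at v's init, 0.0
  sess.run(v.assign(1.0))
  sess.run(apply_op)    # shadow := 0.9 * shadow + 0.1 * v
  print(sess.run(avg))  # 0.1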
Example #19
Source File: common_attention.py From fine-lm with MIT License | 6 votes

def add_positional_embedding(x, max_length, name, positions=None):
  """Add positional embedding.

  Args:
    x: a Tensor with shape [batch, length, depth]
    max_length: an integer, the static maximum size of any dimension.
    name: a name for this layer.
    positions: an optional tensor with shape [batch, length]

  Returns:
    a Tensor the same shape as x.
  """
  _, length, depth = common_layers.shape_list(x)
  var = tf.cast(tf.get_variable(name, [max_length, depth]), x.dtype)
  if positions is None:
    sliced = tf.cond(
        tf.less(length, max_length),
        lambda: tf.slice(var, [0, 0], [length, -1]),
        lambda: tf.pad(var, [[0, length - max_length], [0, 0]]))
    return x + tf.expand_dims(sliced, 0)
  else:
    return x + tf.gather(var, tf.to_int32(positions))
Example #20
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def __init__(self, num_experts, gates):
  """Create a SparseDispatcher.

  Args:
    num_experts: an integer.
    gates: a `Tensor` of shape `[batch_size, num_experts]`.

  Returns:
    a SparseDispatcher
  """
  self._gates = gates
  self._num_experts = num_experts

  where = tf.to_int32(tf.where(tf.transpose(gates) > 0))
  self._expert_index, self._batch_index = tf.unstack(where, num=2, axis=1)
  self._part_sizes_tensor = tf.reduce_sum(tf.to_int32(gates > 0), [0])
  self._nonzero_gates = tf.gather(
      tf.reshape(self._gates, [-1]),
      self._batch_index * num_experts + self._expert_index)
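The tf.where/tf.transpose combination is what groups examples by expert; a small runnable sketch (TensorFlow 1.x assumed; the gate values are made up):

import tensorflow as tf

# 3 examples x 2 experts; a zero gate means "not routed to that expert".
gates = tf.constant([[0.7, 0.0],
                     [0.0, 0.3],
                     [0.5, 0.5]])
# Transposing first makes tf.where enumerate pairs in expert-major order.
where = tf.to_int32(tf.where(tf.transpose(gates) > 0))
expert_index, batch_index = tf.unstack(where, num=2, axis=1)

with tf.Session() as sess:
  print(sess.run(expert_index))  # [0 0 1 1]  -> entries grouped by expert
  print(sess.run(batch_index))   # [0 2 1 2]  -> which example each entry is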
Example #21
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def _normal_distribution_cdf(x, stddev):
  """Evaluates the CDF of the normal distribution.

  Normal distribution with mean 0 and standard deviation stddev,
  evaluated at x. Input and output `Tensor`s have matching shapes.

  Args:
    x: a `Tensor`
    stddev: a `Tensor` with the same shape as `x`.

  Returns:
    a `Tensor` with the same shape as `x`.
  """
  return 0.5 * (1.0 + tf.erf(x / (math.sqrt(2) * stddev + 1e-20)))
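This is the standard identity Phi(x/sigma) = 0.5 * (1 + erf(x / (sigma * sqrt(2)))); the 1e-20 term only guards against a zero stddev. A quick numerical check (TensorFlow 1.x assumed):

import math
import tensorflow as tf

x = tf.constant([-1.0, 0.0, 1.0])
stddev = tf.constant(1.0)

cdf = 0.5 * (1.0 + tf.erf(x / (math.sqrt(2) * stddev + 1e-20)))

with tf.Session() as sess:
  # Matches the standard normal CDF: ~[0.159, 0.5, 0.841]
  print(sess.run(cdf))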
Example #22
Source File: common_attention.py From fine-lm with MIT License | 6 votes

def add_layer_timing_signal_sinusoid_1d(x, layer, num_layers):
  """Add sinusoids of different frequencies as layer (vertical) timing signal.

  Args:
    x: a Tensor with shape [batch, length, channels]
    layer: layer num
    num_layers: total number of layers

  Returns:
    a Tensor the same shape as x.
  """
  channels = common_layers.shape_list(x)[-1]
  signal = get_layer_timing_signal_sinusoid_1d(channels, layer, num_layers)

  return x + signal
Example #23
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def cv_squared(x):
  """The squared coefficient of variation of a sample.

  Useful as a loss to encourage a positive distribution to be more uniform.
  Epsilons added for numerical stability.
  Returns 0 for an empty Tensor.

  Args:
    x: a `Tensor`.

  Returns:
    a `Scalar`.
  """
  epsilon = 1e-10
  float_size = tf.to_float(tf.size(x)) + epsilon
  mean = tf.reduce_sum(x) / float_size
  variance = tf.reduce_sum(tf.square(x - mean)) / float_size
  return variance / (tf.square(mean) + epsilon)
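CV^2 is the variance divided by the squared mean, so a perfectly uniform load gives ~0 and a skewed load gives a large value. A self-contained check (TensorFlow 1.x assumed; the sample values are arbitrary):

import tensorflow as tf

def cv_squared_demo(values):
  # Same arithmetic as cv_squared above, inlined for a constant input.
  x = tf.constant(values)
  epsilon = 1e-10
  float_size = tf.to_float(tf.size(x)) + epsilon
  mean = tf.reduce_sum(x) / float_size
  variance = tf.reduce_sum(tf.square(x - mean)) / float_size
  return variance / (tf.square(mean) + epsilon)

with tf.Session() as sess:
  print(sess.run(cv_squared_demo([1.0, 1.0, 1.0])))  # ~0.0 (uniform)
  print(sess.run(cv_squared_demo([0.0, 0.0, 3.0])))  # ~2.0 (skewed)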
Example #24
Source File: common_attention.py From fine-lm with MIT License | 6 votes

def get_layer_timing_signal_sinusoid_1d(channels, layer, num_layers):
  """Get sinusoids of different frequencies as layer (vertical) timing signal.

  Args:
    channels: dimension of the timing signal
    layer: layer num
    num_layers: total number of layers

  Returns:
    a Tensor of timing signals [1, 1, channels].
  """
  signal = get_timing_signal_1d(num_layers, channels)
  layer_signal = tf.expand_dims(signal[:, layer, :], axis=1)

  return layer_signal
Example #25
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def __init__(self, pad_mask):
  """Compute and store the location of the padding.

  Args:
    pad_mask (tf.Tensor): Reference padding tensor of shape
      [batch_size, length] or [dim_origin] (dim_origin=batch_size*length)
      containing non-zero positive values to indicate padding locations.
  """
  self.nonpad_ids = None
  self.dim_origin = None

  with tf.name_scope("pad_reduce/get_ids"):
    pad_mask = tf.reshape(pad_mask, [-1])  # Flatten the batch
    # nonpad_ids contains coordinates of zero rows (as pad_mask is
    # float32, checking zero equality is done with |x| < epsilon, with
    # epsilon=1e-9 as standard; here pad_mask only contains positive values
    # so tf.abs would be redundant)
    self.nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))
    self.dim_origin = tf.shape(pad_mask)[:1]
Example #26
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def remove(self, x):
  """Remove padding from the given tensor.

  Args:
    x (tf.Tensor): of shape [dim_origin,...]

  Returns:
    a tensor of shape [dim_compressed,...] with dim_compressed <= dim_origin
  """
  with tf.name_scope("pad_reduce/remove"):
    x_shape = x.get_shape().as_list()
    x = tf.gather_nd(
        x,
        indices=self.nonpad_ids,
    )
    if not tf.contrib.eager.in_eager_mode():
      # This is a hack but for some reason, gather_nd returns a tensor of
      # undefined shape, so the shape is set up manually
      x.set_shape([None] + x_shape[1:])
  return x
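Examples #2, #25, and #26 together form a remove/restore round trip; here is the core of that round trip with the PadRemover class stripped away (TensorFlow 1.x assumed; the mask and values are toy data):

import tensorflow as tf

pad_mask = tf.constant([0.0, 1.0, 0.0, 1.0])   # positive values mark padding
x = tf.constant([[1.0], [2.0], [3.0], [4.0]])  # [dim_origin, depth]

nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9))
dim_origin = tf.shape(pad_mask)[:1]

compressed = tf.gather_nd(x, nonpad_ids)  # keep only non-padded rows
restored = tf.scatter_nd(                 # zero-fill the padded rows back
    indices=nonpad_ids,
    updates=compressed,
    shape=tf.concat([dim_origin, tf.shape(compressed)[1:]], axis=0))

with tf.Session() as sess:
  print(sess.run(compressed))  # [[1.], [3.]]
  print(sess.run(restored))    # [[1.], [0.], [3.], [0.]]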
Example #27
Source File: expert_utils.py From fine-lm with MIT License | 6 votes

def __init__(self, data_parallelism, expert_parallelism, gates):
  """Create a DistributedSparseDispatcher.

  Args:
    data_parallelism: a Parallelism object.
    expert_parallelism: a Parallelism object.
    gates: a list of datashard_parallelism.n `Tensor`s of shapes
      `[batch_size[d], num_experts]`.

  Returns:
    a DistributedSparseDispatcher
  """
  self._gates = gates
  self._dp = data_parallelism
  self._ep = expert_parallelism
  assert len(gates) == self._dp.n
  self._dispatchers = self._dp(SparseDispatcher, self._ep.n, gates)
Example #28
Source File: common_attention.py From fine-lm with MIT License | 5 votes

def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4,
                         start_index=0):
  """Adds a bunch of sinusoids of different frequencies to a Tensor.

  Each channel of the input Tensor is incremented by a sinusoid of a
  different frequency and phase.

  This allows attention to learn to use absolute and relative positions.
  Timing signals should be added to some precursors of both the query and
  the memory inputs to attention.

  The use of relative position is possible because sin(x+y) and cos(x+y)
  can be expressed in terms of y, sin(x) and cos(x).

  In particular, we use a geometric sequence of timescales starting with
  min_timescale and ending with max_timescale. The number of different
  timescales is equal to channels / 2. For each timescale, we generate the
  two sinusoidal signals sin(timestep/timescale) and cos(timestep/timescale).
  All of these sinusoids are concatenated in the channels dimension.

  Args:
    x: a Tensor with shape [batch, length, channels]
    min_timescale: a float
    max_timescale: a float
    start_index: index of first position

  Returns:
    a Tensor the same shape as x.
  """
  length = common_layers.shape_list(x)[1]
  channels = common_layers.shape_list(x)[2]
  signal = get_timing_signal_1d(length, channels, min_timescale,
                                max_timescale, start_index)
  return x + signal
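get_timing_signal_1d itself is defined elsewhere in tensor2tensor, but the geometric-timescale computation the docstring describes can be sketched directly (TensorFlow 1.x assumed; this is an illustration for even channels and more than one timescale, not the library function):

import math
import tensorflow as tf

length, channels = 6, 4
min_timescale, max_timescale = 1.0, 1.0e4

position = tf.to_float(tf.range(length))
num_timescales = channels // 2
# Geometric sequence of timescales from min_timescale to max_timescale.
log_timescale_increment = (
    math.log(max_timescale / min_timescale) / (num_timescales - 1))
inv_timescales = min_timescale * tf.exp(
    tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
# sin/cos for each timescale, concatenated along the channel axis.
signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)

with tf.Session() as sess:
  print(sess.run(signal).shape)  # (6, 4): one row of channels per position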
Example #29
Source File: common_attention.py From fine-lm with MIT License | 5 votes

def add_timing_signal_1d_given_position(x, position, min_timescale=1.0,
                                        max_timescale=1.0e4):
  """Adds sinusoids of diff frequencies to a Tensor, with timing position given.

  Args:
    x: a Tensor with shape [batch, length, channels]
    position: a Tensor with shape [batch, length]
    min_timescale: a float
    max_timescale: a float

  Returns:
    a Tensor the same shape as x.
  """
  channels = common_layers.shape_list(x)[2]
  num_timescales = channels // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.to_float(num_timescales) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
  scaled_time = (
      tf.expand_dims(tf.to_float(position), 2) * tf.expand_dims(
          tf.expand_dims(inv_timescales, 0), 0))
  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=2)
  signal = tf.pad(signal, [[0, 0], [0, 0], [0, tf.mod(channels, 2)]])
  signal = common_layers.cast_like(signal, x)
  return x + signal
Example #30
Source File: attention_lm_moe.py From fine-lm with MIT License | 5 votes

def attention_lm_moe_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
      to implement masked attention and possibly biases for diagonal
      alignments
    pad_remover (expert_utils.PadRemover): a util object to remove padding
  """
  targets_pad_mask = common_attention.embedding_to_padding(targets)
  with tf.name_scope("pad_remover"):
    # Because of the shift_right, the <eos> token will be considered as
    # padding. In practice, it doesn't really matter: due to the triangular
    # mask, this token should never be attended to.
    pad_remover = expert_utils.PadRemover(targets_pad_mask)

  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepend_inputs_full_attention(
            targets_pad_mask))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias, pad_remover)