Python tensorflow.compat.v1.zeros_initializer() Examples
The following are 30 code examples of tensorflow.compat.v1.zeros_initializer(), drawn from open-source projects.
Each example notes the project and source file it was taken from.
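Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of these snippets share: passing tf.zeros_initializer() to tf.get_variable() so that a bias (or other variable) starts at zero. It assumes TensorFlow 2.x with the compat.v1 API; the variable name and shape below are only illustrative.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # run in v1-style graph mode under TF 2.x

# A bias vector created via the variable mechanism, starting at all zeros.
bias = tf.get_variable("bias", shape=[10], initializer=tf.zeros_initializer())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(bias))  # -> [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]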
Example #1
Source File: neural_stack.py From tensor2tensor with Apache License 2.0

def add_vector_projection(self, name, size):
  """A helper function for mapping embedding controller outputs.

  Args:
    name: A prefix for the variable names.
    size: The desired number of embedding outputs.

  Returns:
    A tuple of (weights, bias) where weights has shape
    [num_units, size * embedding_size] and bias has shape
    [size * embedding_size].
  """
  weights = self.add_variable(
      name + "_projection_weights",
      shape=[self._num_units, size * self._embedding_size],
      dtype=self.dtype)
  bias = self.add_variable(
      name + "_projection_bias",
      shape=[size * self._embedding_size],
      initializer=tf.zeros_initializer(dtype=self.dtype))
  return weights, bias
Example #2
Source File: export_checkpoints.py From albert with Apache License 2.0

def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(input_tensor, mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits
Example #3
Source File: transformer_glow_layers_ops.py From tensor2tensor with Apache License 2.0

def dense_weightnorm(
    name, x, n_out, x_mask, init_scale, init, dtype=tf.float32):
  """Dense layer with weight normalization."""
  n_in = common_layers.shape_list(x)[2]
  eps = tf.keras.backend.epsilon()
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    v = tf.get_variable(
        "v", [n_in, n_out], dtype,
        initializer=tf.random_normal_initializer(0, 0.05), trainable=True)
    v = v / tf.norm(v, axis=0, keepdims=True)
    t = tf.matmul(x, v)  # [B, L, n_out]
    mean, var = moments_over_bl(t, x_mask)
    g_init = init_scale / (tf.sqrt(var) + eps)
    g = get_variable_ddi(
        "g", [n_out], g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    b = get_variable_ddi(
        "b", [n_out], -mean * g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    w = g * v
    y = tf.matmul(x, w) + b
    tf.summary.histogram("_g", g)
    return y
Example #4
Source File: common_layers.py From tensor2tensor with Apache License 2.0

def zero_add(previous_value, x, name=None, reuse=None):
  """Resnet connection with zero initialization.

  Another type of resnet connection which returns previous_value + gamma * x.
  gamma is a trainable scalar and initialized with zero. It is useful when a
  module is plugged into a trained model and we want to make sure it matches
  the original model's performance.

  Args:
    previous_value: A tensor.
    x: A tensor.
    name: name of variable scope; defaults to zero_add.
    reuse: reuse scope.

  Returns:
    previous_value + gamma * x.
  """
  with tf.variable_scope(name, default_name="zero_add", reuse=reuse):
    gamma = tf.get_variable("gamma", (), initializer=tf.zeros_initializer())
    return previous_value + gamma * x
Example #5
Source File: export_checkpoints.py From albert with Apache License 2.0

def get_sentence_order_logits(input_tensor, albert_config):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    return logits
Example #6
Source File: common_layers.py From tensor2tensor with Apache License 2.0

def group_norm(x, filters=None, num_groups=8, epsilon=1e-5):
  """Group normalization as in https://arxiv.org/abs/1803.08494."""
  x_shape = shape_list(x)
  if filters is None:
    filters = x_shape[-1]
  assert len(x_shape) == 4
  assert filters % num_groups == 0
  # Prepare variables.
  scale = tf.get_variable(
      "group_norm_scale", [filters], initializer=tf.ones_initializer())
  bias = tf.get_variable(
      "group_norm_bias", [filters], initializer=tf.zeros_initializer())
  epsilon, scale, bias = [cast_like(t, x) for t in [epsilon, scale, bias]]
  # Reshape and compute group norm.
  x = tf.reshape(x, x_shape[:-1] + [num_groups, filters // num_groups])
  # Calculate mean and variance on heights, width, channels (not groups).
  mean, variance = tf.nn.moments(x, [1, 2, 4], keep_dims=True)
  norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
  return tf.reshape(norm_x, x_shape) * scale + bias
Example #7
Source File: ops_test.py From mesh with Apache License 2.0

def testVariableOperations(self):
  var = mtf.Variable(self.mesh, "test_variable", self.ab_shape,
                     mtf.VariableDType(tf.int32, tf.int32, tf.int32),
                     initializer=tf.zeros_initializer(),
                     trainable=True)
  self.assertEqual(var.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(var.unsplittable_dims, frozenset())

  read_variable = mtf.ReadVariable(var)
  self.assertEqual(read_variable.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(read_variable.unsplittable_dims, frozenset())

  assign = mtf.Assign([var], [self.x])
  self.assertEqual(assign.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(assign.unsplittable_dims, frozenset())

  depend = mtf.Depend(read_variable.outputs[0], [assign])
  self.assertEqual(depend.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(depend.unsplittable_dims, frozenset())
Example #8
Source File: transformer_glow_layers.py From tensor2tensor with Apache License 2.0

def actnorm(name, x, x_mask, inverse, init, logscale_factor=3.0):
  """Activation normalization, returns logabsdet of shape [B]."""
  eps = tf.keras.backend.epsilon()
  n_channels = common_layers.shape_list(x)[2]
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    x_mean, x_var = gops.moments_over_bl(x, x_mask)
    b = gops.get_variable_ddi(
        "b", (n_channels), -x_mean, init, tf.zeros_initializer)
    log_w_init = -0.5 * tf.log(x_var + eps) / logscale_factor
    log_w = gops.get_variable_ddi(
        "log_w", (n_channels), log_w_init, init,
        tf.zeros_initializer) * logscale_factor

    if not inverse:
      x = (x + b) * tf.exp(log_w)
    else:
      x = x * tf.exp(-log_w) - b

    x_length = tf.reduce_sum(x_mask, -1)
    logabsdet = x_length * tf.reduce_sum(log_w)
    if inverse:
      logabsdet *= -1
    return x, logabsdet
Example #9
Source File: common_layers.py From language with Apache License 2.0

def linear_transform(x, output_size, scope, bias=False, input_size=None):
  """Simple linear transform of x.

  Args:
    x: <float>[batch_size, length, input_size]
    output_size: Integer specifying output size.
    scope: String name for variable scope.
    bias: If True, adds a learned bias term.
    input_size: Explicitly specify input_size if not set as static shape.

  Returns:
    <float>[batch_size, length, output_size]
  """
  input_size = input_size or x.get_shape()[-1]
  with tf.variable_scope(scope):
    batch_size = tf.shape(x)[0]
    weights = tf.get_variable("weights", shape=(input_size, output_size))
    weights = tf.expand_dims(weights, 0)
    weights = tf.tile(weights, [batch_size, 1, 1])
    x = tf.matmul(x, weights)
    if bias:
      bias = tf.get_variable(
          "bias", shape=(output_size), initializer=tf.zeros_initializer())
      x += bias
    return x
Example #10
Source File: common_layers.py From language with Apache License 2.0

def apply_norm(x, epsilon=1e-6):
  """Applies layer normalization to x.

  Based on "Layer Normalization":
  https://arxiv.org/abs/1607.06450

  Args:
    x: <float>[..., input_size]
    epsilon: Used to avoid division by 0.

  Returns:
    <float>[..., input_size]
  """
  input_size = x.get_shape()[-1]
  with tf.variable_scope("layer_norm", values=[x]):
    scale = tf.get_variable(
        "layer_norm_scale", [input_size], initializer=tf.ones_initializer())
    bias = tf.get_variable(
        "layer_norm_bias", [input_size], initializer=tf.zeros_initializer())
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    result = norm_x * scale + bias
    return result
Example #11
Source File: glow_ops.py From tensor2tensor with Apache License 2.0

def scale_gaussian_prior(name, z, logscale_factor=3.0, trainable=True):
  """Returns N(s^i * z^i, std^i) where s^i and std^i are pre-component.

  s^i is a learnable parameter with identity initialization.
  std^i is optionally learnable with identity initialization.

  Args:
    name: variable scope.
    z: input_tensor
    logscale_factor: equivalent to scaling up the learning_rate by a factor
                     of logscale_factor.
    trainable: Whether or not std^i is learnt.
  """
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    z_shape = common_layers.shape_list(z)
    latent_multiplier = tf.get_variable(
        "latent_multiplier", shape=z_shape, dtype=tf.float32,
        initializer=tf.ones_initializer())
    log_scale = tf.get_variable(
        "log_scale_latent", shape=z_shape, dtype=tf.float32,
        initializer=tf.zeros_initializer(), trainable=trainable)
    log_scale = log_scale * logscale_factor
    return tfp.distributions.Normal(
        loc=latent_multiplier * z, scale=tf.exp(log_scale))
Example #12
Source File: neural_stack.py From tensor2tensor with Apache License 2.0

def add_scalar_projection(self, name, size):
  """A helper function for mapping scalar controller outputs.

  Args:
    name: A prefix for the variable names.
    size: The desired number of scalar outputs.

  Returns:
    A tuple of (weights, bias) where weights has shape [num_units, size]
    and bias has shape [size].
  """
  weights = self.add_variable(
      name + "_projection_weights",
      shape=[self._num_units, size],
      dtype=self.dtype)
  bias = self.add_variable(
      name + "_projection_bias",
      shape=[size],
      initializer=tf.zeros_initializer(dtype=self.dtype))
  return weights, bias
Example #13
Source File: export_to_tfhub.py From albert with Apache License 2.0

def get_mlm_logits(model, albert_config, mlm_positions):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, model.get_embedding_table(), transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits
Example #14
Source File: mnist_benchmark.py From autograph with Apache License 2.0

def get_data_and_params():
  """Set up input dataset and variables."""
  (train_x, train_y), _ = tf.keras.datasets.mnist.load_data()
  tf.set_random_seed(0)
  hparams = contrib_training.HParams(
      batch_size=200,
      learning_rate=0.1,
      train_steps=101,
  )
  dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
  dataset = dataset.repeat()
  dataset = dataset.shuffle(hparams.batch_size * 10)
  dataset = dataset.batch(hparams.batch_size)

  def reshape_ex(x, y):
    return (tf.to_float(tf.reshape(x, (-1, 28 * 28))) / 256.0,
            tf.one_hot(tf.squeeze(y), 10))

  dataset = dataset.map(reshape_ex)
  w = tf.get_variable('w0', (28 * 28, 10))
  b = tf.get_variable('b0', (10,), initializer=tf.zeros_initializer())
  opt = tf.train.GradientDescentOptimizer(hparams.learning_rate)
  return dataset, opt, hparams, w, b
Example #15
Source File: slate_decomp_q_agent.py From recsim with Apache License 2.0

def _build_select_slate_op(self):
  p_no_click = self._prob_no_click_ph
  p = self._doc_affinity_scores_ph
  q = self._net_outputs.q_values[0]
  with tf.name_scope('select_slate'):
    self._output_slate = self._select_slate_fn(self._slate_size, p_no_click,
                                               p, q)

    self._output_slate = tf.Print(
        self._output_slate, [tf.constant('cp 1'), self._output_slate, p, q],
        summarize=10000)
    self._output_slate = tf.reshape(self._output_slate, (self._slate_size,))

    self._action_counts = tf.get_variable(
        'action_counts',
        shape=[self._num_candidates],
        initializer=tf.zeros_initializer())
    output_slate = tf.reshape(self._output_slate, [-1])
    output_one_hot = tf.one_hot(output_slate, self._num_candidates)
    update_ops = []
    for i in range(self._slate_size):
      update_ops.append(tf.assign_add(self._action_counts, output_one_hot[i]))

    self._select_action_update_op = tf.group(*update_ops)
Example #16
Source File: bert_as_summarizer.py From DeepPavlov with Apache License 2.0

def _init_graph(self):
    self._init_placeholders()

    self.bert = BertModel(config=self.bert_config,
                          is_training=self.is_train_ph,
                          input_ids=self.input_ids_ph,
                          input_mask=self.input_masks_ph,
                          token_type_ids=self.token_types_ph,
                          use_one_hot_embeddings=False,
                          )

    # next sentence prediction head
    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, self.bert_config.hidden_size],
            initializer=create_initializer(self.bert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        nsp_logits = tf.matmul(self.bert.get_pooled_output(), output_weights,
                               transpose_b=True)
        nsp_logits = tf.nn.bias_add(nsp_logits, output_bias)
        self.nsp_probs = tf.nn.softmax(nsp_logits, axis=-1)
Example #17
Source File: tiled_linear.py From lamb with Apache License 2.0

def _build_tiled_linear(self, inputs, input_name_and_sizes,
                        output_name_and_sizes, add_bias):
  results = []
  for output_name, output_size in output_name_and_sizes:
    r = 0.0
    for input_, (input_name, input_size) in zip(inputs, input_name_and_sizes):
      name = 'W_{}_{}'.format(input_name, output_name)
      weight = self._get_variable(
          name, shape=[output_size, input_size])
      r += tf.sparse_tensor_dense_matmul(weight, input_, adjoint_b=True)
    r = tf.transpose(r)
    if add_bias:
      # Biases are dense, hence we call _get_variable of the base
      # class.
      r += super(SparseTiledLinear, self)._get_variable(
          'B_{}'.format(output_name), shape=[output_size],
          default_initializer=tf.zeros_initializer())
    results.append(r)
  return results

# TODO(melisgl): Since computation is the same as in TiledLinear,
# perhaps this should be implemented as a custom getter (see
# tf.get_variable) instead of being tied to tiling.
Example #18
Source File: run_pretraining.py From albert with Apache License 2.0

def get_sentence_order_output(albert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
Example #19
Source File: utils.py From lamb with Apache License 2.0

def layer_norm(x, reduction_indices, epsilon=1e-9, gain=None, bias=None,
               per_element=True, scope=None):
  """DOC."""
  reduction_indices = ensure_list(reduction_indices)
  mean = tf.reduce_mean(x, reduction_indices, keep_dims=True)
  variance = tf.reduce_mean(tf.squared_difference(x, mean),
                            reduction_indices, keep_dims=True)
  normalized = (x - mean) / tf.sqrt(variance + epsilon)
  dtype = x.dtype
  shape = x.get_shape().as_list()
  for i in six.moves.range(len(shape)):
    if i not in reduction_indices or not per_element:
      shape[i] = 1
  with tf.variable_scope(scope or 'layer_norm'):
    if gain is None:
      gain = tf.get_variable('gain', shape=shape, dtype=dtype,
                             initializer=tf.ones_initializer())
    if bias is None:
      bias = tf.get_variable('bias', shape=shape, dtype=dtype,
                             initializer=tf.zeros_initializer())
    return gain*normalized+bias
Example #20
Source File: averaged.py From lamb with Apache License 2.0

def __init__(self, tensors):
  tensors = list(tensors)
  with tf.variable_scope('averaged'):
    self._num_samples = tf.Variable(0, name='num_samples', trainable=False)
    with tf.variable_scope('avg'):
      self._averages = [
          tf.get_variable(
              tensor.name.replace('/', '-').replace(':', '-'),
              tensor.get_shape(), initializer=tf.zeros_initializer(),
              trainable=False)
          for tensor in tensors]
    with tf.variable_scope('save'):
      self._saves = [
          tf.get_variable(
              tensor.name.replace('/', '-').replace(':', '-'),
              tensor.get_shape(), initializer=tf.zeros_initializer(),
              trainable=False)
          for tensor in tensors]
  self._tensors = tensors
  self._take_sample = self._make_take_sample()
  self._switch = self._make_swith_to_average()
  self._restore = self._make_restore()
  self._reset = self._make_reset()
Example #21
Source File: model_fns.py From language with Apache License 2.0

def _get_bert_embeddings(model, layers_to_use, aggregation_fn, name="bert"):
  """Extract embeddings from BERT model."""
  all_hidden = model.get_all_encoder_layers()
  layers_hidden = [all_hidden[i] for i in layers_to_use]
  hidden_shapes = [
      modeling.get_shape_list(hid, expected_rank=3) for hid in all_hidden
  ]

  if len(layers_hidden) == 1:
    hidden_emb = layers_hidden[0]
    hidden_size = hidden_shapes[0][2]
  elif aggregation_fn == "concat":
    hidden_emb = tf.concat(layers_hidden, 2)
    hidden_size = sum([hidden_shapes[i][2] for i in layers_to_use])
  elif aggregation_fn == "average":
    hidden_size = hidden_shapes[0][2]
    assert all([shape[2] == hidden_size for shape in hidden_shapes
               ]), hidden_shapes
    hidden_emb = tf.add_n(layers_hidden) / len(layers_hidden)
  elif aggregation_fn == "attention":
    hidden_size = hidden_shapes[0][2]
    mixing_weights = tf.get_variable(
        name + "/mixing/weights", [len(layers_hidden)],
        initializer=tf.zeros_initializer())
    mixing_scores = tf.nn.softmax(mixing_weights)
    hidden_emb = tf.tensordot(
        tf.stack(layers_hidden, axis=-1), mixing_scores, [[-1], [0]])
  else:
    raise ValueError("Unrecognized aggregation function %s." % aggregation_fn)

  return hidden_emb, hidden_size
Example #22
Source File: run_dualencoder_qa.py From language with Apache License 2.0

def _get_bert_embeddings(model, layers_to_use, aggregation_fn, name="bert"):
  """Extract embeddings from BERT model."""
  all_hidden = model.get_all_encoder_layers()
  layers_hidden = [all_hidden[i] for i in layers_to_use]
  hidden_shapes = [
      modeling.get_shape_list(hid, expected_rank=3) for hid in all_hidden
  ]

  if len(layers_hidden) == 1:
    hidden_emb = layers_hidden[0]
    hidden_size = hidden_shapes[0][2]
  elif aggregation_fn == "concat":
    hidden_emb = tf.concat(layers_hidden, 2)
    hidden_size = sum([hidden_shapes[i][2] for i in layers_to_use])
  elif aggregation_fn == "average":
    hidden_size = hidden_shapes[0][2]
    assert all([shape[2] == hidden_size for shape in hidden_shapes
               ]), hidden_shapes
    hidden_emb = tf.add_n(layers_hidden) / len(layers_hidden)
  elif aggregation_fn == "attention":
    hidden_size = hidden_shapes[0][2]
    mixing_weights = tf.get_variable(
        name + "/mixing/weights", [len(layers_hidden)],
        initializer=tf.zeros_initializer())
    mixing_scores = tf.nn.softmax(mixing_weights)
    hidden_emb = tf.tensordot(
        tf.stack(layers_hidden, axis=-1), mixing_scores, [[-1], [0]])
  else:
    raise ValueError("Unrecognized aggregation function %s." % aggregation_fn)

  return hidden_emb, hidden_size
Example #23
Source File: modeling.py From training with Apache License 2.0

def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   size_per_head,
                   initializer,
                   activation,
                   name=None):
  """A dense layer with 3D kernel.

  Args:
    input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
    num_attention_heads: Number of attention heads.
    size_per_head: The size per attention head.
    initializer: Kernel initializer.
    activation: Actication function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  last_dim = get_shape_list(input_tensor)[-1]

  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[last_dim, num_attention_heads * size_per_head],
        initializer=initializer)
    w = tf.reshape(w, [last_dim, num_attention_heads, size_per_head])
    b = tf.get_variable(
        name="bias",
        shape=[num_attention_heads * size_per_head],
        initializer=tf.zeros_initializer)
    b = tf.reshape(b, [num_attention_heads, size_per_head])
    ret = tf.einsum("abc,cde->abde", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    else:
      return ret
Example #24
Source File: ops.py From language with Apache License 2.0

def affine(x, output_size, weight_name, bias_name=None, weight_init=None):
  """Affine transformation of the input `x`.

  Args:
    x: <float32>[..., x_dim]
    output_size: size of the last output dimension
    weight_name: Name of the weight variable to use
    bias_name: Name for the bias variable, if one should be used
    weight_init: Initializer of the weight variable

  Returns:
    transformed <float32>[..., `output_size`]
  """
  dim = x.shape.as_list()[-1]
  w = tf.get_variable(
      weight_name, (dim, output_size), tf.float32, initializer=weight_init)
  out = tf.tensordot(x, w, [[len(x.shape) - 1], [0]])
  if bias_name:
    b = tf.get_variable(
        bias_name, (output_size,), tf.float32,
        initializer=tf.zeros_initializer())
    for _ in range(len(out.shape) - 1):
      b = tf.expand_dims(b, 0)
    out += b
  return out
Example #25
Source File: run_squad.py From language with Apache License 2.0

def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  final_hidden = model.get_sequence_output()

  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]
  hidden_size = final_hidden_shape[2]

  output_weights = tf.get_variable(
      "cls/squad/output_weights", [2, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

  final_hidden_matrix = tf.reshape(final_hidden,
                                   [batch_size * seq_length, hidden_size])
  logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
  logits = tf.nn.bias_add(logits, output_bias)

  logits = tf.reshape(logits, [batch_size, seq_length, 2])
  logits = tf.transpose(logits, [2, 0, 1])

  unstacked_logits = tf.unstack(logits, axis=0)

  (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

  return (start_logits, end_logits)
Example #26
Source File: bert.py From mesh with Apache License 2.0

def get_masked_lm_output(self, positions, label_ids, label_weights):
  """Get loss and logits for the masked LM."""
  input_tensor = self.get_sequence_output()
  output_weights = self.get_embedding_table()

  # [batch_size, num_position, hidden]
  input_tensor = mtf.gather(input_tensor, positions, self.seq_dim)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = mtf.layers.dense(
          input_tensor,
          reduced_dims=[self.model_dim],
          new_dims=[self.model_dim],
          activation=get_activation(self.config.feedforward_intermediate_act),
          kernel_initializer=self.dense_initializer,
          use_bias=self.config.use_bias)
      input_tensor = self.normalize(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = mtf.get_variable(
        input_tensor.mesh,
        name="output_bias",
        shape=[self.vocab_dim],
        initializer=tf.zeros_initializer())
    logits = mtf.einsum([input_tensor, output_weights],
                        reduced_dims=[self.model_dim]) + output_bias
    per_example_loss = mtf.layers.softmax_cross_entropy_with_logits(
        logits, label_ids, self.vocab_dim, z_loss=1e-4)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    numerator = mtf.reduce_sum(label_weights * per_example_loss)
    denominator = mtf.reduce_sum(label_weights) + mtf.constant(
        input_tensor.mesh, 1e-5, dtype=tf.float32)
    loss = numerator / denominator
    return (loss, per_example_loss, logits)
Example #27
Source File: layers.py From mesh with Apache License 2.0

def layer_norm(x, dim, epsilon=1e-6, name="layer_prepostprocess"):
  """Layer normalization over dimension dim.

  Args:
    x: a mtf.Tensor whose shape contains dim.
    dim: a mtf.Dimension
    epsilon: a floating point number
    name: a string used for tf.variable_scope.

  Returns:
    a mtf.Tensor with same shape as x.
  """
  with tf.variable_scope(name + "/layer_norm"):
    scale = mtf.get_variable(
        x.mesh,
        "layer_norm_scale",
        mtf.Shape([dim]),
        initializer=tf.ones_initializer(),
        activation_dtype=x.dtype)
    bias = mtf.get_variable(
        x.mesh,
        "layer_norm_bias",
        mtf.Shape([dim]),
        initializer=tf.zeros_initializer(),
        activation_dtype=x.dtype)
    reduced_shape = x.shape - dim
    mean = mtf.reduce_mean(x, output_shape=reduced_shape)
    variance = mtf.reduce_mean(
        mtf.square(x - mean), output_shape=reduced_shape)
    norm_x = (x - mean) * mtf.rsqrt(variance + epsilon)
    return norm_x * scale + bias
Example #28
Source File: optimize.py From mesh with Apache License 2.0

def apply_grad(self, grad, var):
  if grad is None:
    tf.logging.warning("Gradient is None for variable %s" % var.name)
    return []

  updates = []
  v = mtf.get_variable(
      var.mesh, var.name + "_momentum_v", var.shape,
      dtype=var.dtype, initializer=tf.zeros_initializer(), trainable=False)

  with tf.variable_scope(var.name + "/sgd_momentum"):
    updates.append(mtf.assign(v, grad * self.lr + v * self.momentum))
    updates.append(mtf.assign_sub(var, v))
  return updates
Example #29
Source File: model.py From interval-bound-propagation with Apache License 2.0

def _create_linear_initializer(input_size, output_size, dtype=tf.float32):  # pylint: disable=unused-argument
  """Returns a default initializer for the weights of a linear module."""
  return {
      'w': tf.orthogonal_initializer(),
      'b': tf.zeros_initializer(dtype=dtype),
  }
Example #30
Source File: convnet_builder.py From benchmarks with Apache License 2.0

def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
  """Batch normalization on `input_layer` without tf.layers."""
  # We make this function as similar as possible to the
  # tf.contrib.layers.batch_norm, to minimize the differences between using
  # layers and not using layers.
  shape = input_layer.shape
  num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
  beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                           initializer=tf.zeros_initializer())
  if use_scale:
    gamma = self.get_variable('gamma', [num_channels], tf.float32,
                              tf.float32, initializer=tf.ones_initializer())
  else:
    gamma = tf.constant(1.0, tf.float32, [num_channels])
  # For moving variables, we use tf.get_variable instead of self.get_variable,
  # since self.get_variable returns the result of tf.cast which we cannot
  # assign to.
  moving_mean = tf.get_variable('moving_mean', [num_channels],
                                tf.float32,
                                initializer=tf.zeros_initializer(),
                                trainable=False)
  moving_variance = tf.get_variable('moving_variance', [num_channels],
                                    tf.float32,
                                    initializer=tf.ones_initializer(),
                                    trainable=False)
  if self.phase_train:
    bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
        input_layer, gamma, beta, epsilon=epsilon,
        data_format=self.data_format, is_training=True)
    mean_update = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay=decay, zero_debias=False)
    variance_update = moving_averages.assign_moving_average(
        moving_variance, batch_variance, decay=decay, zero_debias=False)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
  else:
    bn, _, _ = tf.nn.fused_batch_norm(
        input_layer, gamma, beta, mean=moving_mean,
        variance=moving_variance, epsilon=epsilon,
        data_format=self.data_format, is_training=False)
  return bn