Python tensorflow.einsum() Examples
The following are 30 code examples of tensorflow.einsum(), drawn from open-source projects; the source file and license are noted above each example.
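Before the examples, a minimal illustrative sketch of what the einsum subscript notation expresses (the tensors and shapes below are made up):

import tensorflow as tf

# 'ij,jk->ik' contracts the shared index j: an ordinary matrix product.
a = tf.ones([2, 3])
b = tf.ones([3, 4])
matmul = tf.einsum('ij,jk->ik', a, b)        # shape (2, 4)

# A batch index that appears in every operand is carried through:
# 'bij,bjk->bik' is a batched matrix product.
x = tf.ones([5, 2, 3])
y = tf.ones([5, 3, 4])
batched = tf.einsum('bij,bjk->bik', x, y)    # shape (5, 2, 4)

# Indices missing from the output are summed out after broadcasting.
w = tf.ones([4])
weighted = tf.einsum('ijk,k->ij', tf.ones([5, 2, 4]), w)   # shape (5, 2)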
Example #1
Source File: matmul.py From spektral with MIT License | 7 votes |
def mixed_mode_dot(a, b):
    """
    Computes the equivalent of `tf.einsum('ij,bjk->bik', a, b)`, but
    works for both dense and sparse inputs.
    :param a: Tensor or SparseTensor with rank 2.
    :param b: Tensor or SparseTensor with rank 3.
    :return: Tensor or SparseTensor with rank 3.
    """
    s_0_, s_1_, s_2_ = K.int_shape(b)
    B_T = ops.transpose(b, (1, 2, 0))
    B_T = ops.reshape(B_T, (s_1_, -1))
    output = dot(a, B_T)
    output = ops.reshape(output, (s_1_, s_2_, -1))
    output = ops.transpose(output, (2, 0, 1))
    return output
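For dense inputs, the routine above reduces to a transpose/reshape/matmul sequence. A minimal pure-TensorFlow sketch of that equivalence (shapes are hypothetical, and spektral's ops/dot helpers are not used):

import tensorflow as tf

N, F, batch = 5, 3, 2
a = tf.random.normal([N, N])         # rank-2, e.g. an adjacency-like matrix
b = tf.random.normal([batch, N, F])  # rank-3 batch of node features
ref = tf.einsum('ij,bjk->bik', a, b)

# same result via transpose/reshape/matmul, mirroring mixed_mode_dot's dense path
b_t = tf.reshape(tf.transpose(b, (1, 2, 0)), (N, -1))                     # (N, F*batch)
out = tf.transpose(tf.reshape(tf.matmul(a, b_t), (N, F, -1)), (2, 0, 1))  # (batch, N, F)
# ref and out agree up to floating-point error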
Example #2
Source File: gaussian_process.py From BERT with Apache License 2.0 | 6 votes |
def laplace_attention(q, k, v, scale, normalise):
    """Computes laplace exponential attention.

    Args:
      q: queries. Tensor of shape [batch_size, m, d_k].
      k: keys. Tensor of shape [batch_size, n, d_k].
      v: values. Tensor of shape [batch_size, n, d_v].
      scale: float that scales the L1 distance.
      normalise: Boolean that determines whether weights sum to 1.

    Returns:
      Tensor of shape [batch_size, m, d_v].
    """
    k = tf.expand_dims(k, axis=1)  # [batch_size, 1, n, d_k]
    q = tf.expand_dims(q, axis=2)  # [batch_size, m, 1, d_k]
    unnorm_weights = - tf.abs((k - q) / scale)  # [batch_size, m, n, d_k]
    unnorm_weights = tf.reduce_sum(unnorm_weights, axis=-1)  # [batch_size, m, n]
    if normalise:
        weight_fn = tf.nn.softmax
    else:
        weight_fn = lambda x: 1 + tf.tanh(x)
    weights = weight_fn(unnorm_weights)  # [batch_size, m, n]
    rep = tf.einsum('bik,bkj->bij', weights, v)  # [batch_size, m, d_v]
    return rep
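The final einsum here, 'bik,bkj->bij', is an ordinary batched matrix product; a quick illustrative check with made-up shapes:

import tensorflow as tf

weights = tf.random.normal([2, 4, 5])   # [batch_size, m, n]
v = tf.random.normal([2, 5, 3])         # [batch_size, n, d_v]
rep_einsum = tf.einsum('bik,bkj->bij', weights, v)
rep_matmul = tf.matmul(weights, v)      # the same contraction written as a batched matmul
# both are [2, 4, 3] and agree up to floating-point error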
Example #3
Source File: input_moe_model.py From youtube-8m with Apache License 2.0 | 6 votes |
def create_model(self, model_input, vocab_size, num_mixtures=None,
                 l2_penalty=1e-8, sub_scope="", original_input=None,
                 **unused_params):
    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]
    original_input = tf.nn.l2_normalize(original_input, dim=1)
    gate_activations = slim.fully_connected(
        original_input,
        num_methods,
        activation_fn=tf.nn.softmax,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates"+sub_scope)
    output = tf.einsum("ijk,ik->ij", model_input, gate_activations)
    return {"predictions": output}
Example #4
Source File: losses.py From youtube-8m with Apache License 2.0 | 6 votes |
def calculate_loss(self, predictions, labels, weights=None, **unused_params):
    with tf.name_scope("loss_xent"):
        epsilon = 10e-6
        if FLAGS.label_smoothing:
            float_labels = smoothing(labels)
        else:
            float_labels = tf.cast(labels, tf.float32)
        cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + (
            1 - float_labels) * tf.log(1 - predictions + epsilon)
        cross_entropy_loss = tf.negative(cross_entropy_loss)
        if weights is not None:
            print(cross_entropy_loss, weights)
            weighted_loss = tf.einsum("ij,i->ij", cross_entropy_loss, weights)
            print("create weighted_loss", weighted_loss)
            return tf.reduce_mean(tf.reduce_sum(weighted_loss, 1))
        else:
            return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1))
Example #5
Source File: linear_regression_model.py From youtube-8m with Apache License 2.0 | 6 votes |
def create_model(self, model_input, vocab_size, l2_penalty=1e-8,
                 original_input=None, **unused_params):
    """Creates a linear regression model.

    Args:
      model_input: 'batch' x 'num_features' x 'num_methods' matrix of input
                   features.
      vocab_size: The number of classes in the dataset.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_methods = model_input.get_shape().as_list()[-1]
    weight = tf.get_variable("ensemble_weight",
        shape=[num_methods],
        regularizer=slim.l2_regularizer(l2_penalty))
    weight = tf.nn.softmax(weight)
    output = tf.einsum("ijk,k->ij", model_input, weight)
    return {"predictions": output}
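The "ijk,k->ij" contraction above is a weighted sum over the methods axis; a toy check with hypothetical shapes:

import tensorflow as tf

model_input = tf.random.normal([2, 4, 3])     # batch x num_classes x num_methods
weight = tf.nn.softmax(tf.zeros([3]))         # uniform ensemble weights over 3 methods
output = tf.einsum("ijk,k->ij", model_input, weight)      # batch x num_classes
# identical to broadcasting the weights over the last axis and summing it out
same = tf.reduce_sum(model_input * weight, axis=-1)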
Example #6
Source File: losses.py From youtube-8m with Apache License 2.0 | 6 votes |
def calculate_loss(self, predictions, labels, weights=None, **unused_params):
    with tf.name_scope("loss_xent"):
        epsilon = 10e-6
        if FLAGS.label_smoothing:
            float_labels = smoothing(labels)
        else:
            float_labels = tf.cast(labels, tf.float32)
        cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + (
            1 - float_labels) * tf.log(1 - predictions + epsilon)
        cross_entropy_loss = tf.negative(cross_entropy_loss)
        if weights is not None:
            print(cross_entropy_loss, weights)
            weighted_loss = tf.einsum("ij,i->ij", cross_entropy_loss, weights)
            print("create weighted_loss", weighted_loss)
            return tf.reduce_mean(tf.reduce_sum(weighted_loss, 1))
        else:
            return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1))
Example #7
Source File: bnn_vi.py From zhusuan with MIT License | 6 votes |
def build_bnn(x, layer_sizes, n_particles):
    bn = zs.BayesianNet()
    h = tf.tile(x[None, ...], [n_particles, 1, 1])
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        w = bn.normal("w" + str(i), tf.zeros([n_out, n_in + 1]), std=1.,
                      group_ndims=2, n_samples=n_particles)
        h = tf.concat([h, tf.ones(tf.shape(h)[:-1])[..., None]], -1)
        h = tf.einsum("imk,ijk->ijm", w, h) / tf.sqrt(
            tf.cast(tf.shape(h)[2], tf.float32))
        if i < len(layer_sizes) - 2:
            h = tf.nn.relu(h)

    y_mean = bn.deterministic("y_mean", tf.squeeze(h, 2))
    y_logstd = tf.get_variable("y_logstd", shape=[],
                               initializer=tf.constant_initializer(0.))
    bn.normal("y", y_mean, logstd=y_logstd)
    return bn
Example #8
Source File: bnn_sgmcmc.py From zhusuan with MIT License | 6 votes |
def build_bnn(x, layer_sizes, logstds, n_particles):
    bn = zs.BayesianNet()
    h = tf.tile(x[None, ...], [n_particles, 1, 1])
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        w = bn.normal("w" + str(i), tf.zeros([n_out, n_in + 1]),
                      logstd=logstds[i], group_ndims=2, n_samples=n_particles)
        h = tf.concat([h, tf.ones(tf.shape(h)[:-1])[..., None]], -1)
        h = tf.einsum("imk,ijk->ijm", w, h) / tf.sqrt(
            tf.cast(tf.shape(h)[2], tf.float32))
        if i < len(layer_sizes) - 2:
            h = tf.nn.relu(h)

    y_mean = bn.deterministic("y_mean", tf.squeeze(h, 2))
    y_logstd = -0.95
    bn.normal("y", y_mean, logstd=y_logstd)
    return bn
Example #9
Source File: layers.py From Pixel2MeshPlusPlus with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _call(self, inputs):
    x = inputs  # N, S, VF

    # dropout
    x = tf.nn.dropout(x, 1 - self.dropout)

    # convolve
    supports = list()
    for i in range(len(self.support)):
        pre_sup = tf.einsum('ijk,kl->ijl', x, self.vars['weights_' + str(i)])
        support = tf.einsum('ij,kjl->kil', self.support[i], pre_sup)
        supports.append(support)
    output = tf.add_n(supports)

    # bias
    if self.bias:
        output += self.vars['bias']
    return self.act(output)
Example #10
Source File: gaussian_process.py From BERT with Apache License 2.0 | 6 votes |
def fit(self, x=None, y=None):
    # p(coeffs | x, y) = Normal(coeffs |
    #   mean = (1/noise_variance) (1/noise_variance x^T x + I)^{-1} x^T y,
    #   covariance = (1/noise_variance x^T x + I)^{-1})
    # TODO(trandustin): We newly fit the data at each call. Extend to do
    # Bayesian updating.
    kernel_matrix = tf.matmul(x, x, transpose_a=True) / self.noise_variance
    coeffs_precision = tf.matrix_set_diag(
        kernel_matrix, tf.matrix_diag_part(kernel_matrix) + 1.)
    coeffs_precision_tril = tf.linalg.cholesky(coeffs_precision)
    self.coeffs_precision_tril_op = tf.linalg.LinearOperatorLowerTriangular(
        coeffs_precision_tril)
    self.coeffs_mean = self.coeffs_precision_tril_op.solvevec(
        self.coeffs_precision_tril_op.solvevec(tf.einsum('nm,n->m', x, y)),
        adjoint=True) / self.noise_variance
    # TODO(trandustin): To be fully Keras-compatible, return History object.
    return
Example #11
Source File: gaussian_process.py From BERT with Apache License 2.0 | 6 votes |
def call(self, inputs):
    if self.coeffs_mean is None and self.coeffs_precision_tril_op is None:
        # p(mean(ynew) | xnew) = Normal(ynew | mean = 0, variance = xnew xnew^T)
        predictive_mean = 0.
        predictive_variance = tf.reduce_sum(tf.square(inputs), -1)
    else:
        # p(mean(ynew) | xnew, x, y) = Normal(ynew |
        #   mean = xnew (1/noise_variance) (1/noise_variance x^T x + I)^{-1} x^T y,
        #   variance = xnew (1/noise_variance x^T x + I)^{-1} xnew^T)
        predictive_mean = tf.einsum('nm,m->n', inputs, self.coeffs_mean)
        predictive_covariance = tf.matmul(
            inputs,
            self.coeffs_precision_tril_op.solve(
                self.coeffs_precision_tril_op.solve(inputs, adjoint_arg=True),
                adjoint=True))
        predictive_variance = tf.diag_part(predictive_covariance)
    return ed.Normal(loc=predictive_mean, scale=tf.sqrt(predictive_variance))
Example #12
Source File: graph_attention.py From spektral with MIT License | 6 votes |
def _call_dense(self, X, A):
    shape = tf.shape(A)[:-1]
    A = tf.linalg.set_diag(A, tf.zeros(shape, A.dtype))
    A = tf.linalg.set_diag(A, tf.ones(shape, A.dtype))
    X = tf.einsum("...NI , IHO -> ...NHO", X, self.kernel)
    attn_for_self = tf.einsum("...NHI , IHO -> ...NHO", X, self.attn_kernel_self)
    attn_for_neighs = tf.einsum("...NHI , IHO -> ...NHO", X, self.attn_kernel_neighs)
    attn_for_neighs = tf.einsum("...ABC -> ...CBA", attn_for_neighs)

    attn_coef = attn_for_self + attn_for_neighs
    attn_coef = tf.nn.leaky_relu(attn_coef, alpha=0.2)

    mask = -10e9 * (1.0 - A)
    attn_coef += mask[..., None, :]

    attn_coef = tf.nn.softmax(attn_coef, axis=-1)
    attn_coef_drop = self.dropout(attn_coef)

    output = tf.einsum("...NHM , ...MHI -> ...NHI", attn_coef_drop, X)

    return output, attn_coef
Example #13
Source File: modeling.py From XLnet-gen with MIT License | 6 votes |
def post_attention(h, attn_vec, d_model, n_head, d_head, dropout, is_training,
                   kernel_initializer, residual=True):
    """Post-attention processing."""
    # post-attention projection (back to `d_model`)
    proj_o = tf.get_variable('o/kernel', [d_model, n_head, d_head],
                             dtype=h.dtype, initializer=kernel_initializer)
    attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)

    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
    if residual:
        output = tf.contrib.layers.layer_norm(attn_out + h, begin_norm_axis=-1,
                                              scope='LayerNorm')
    else:
        output = tf.contrib.layers.layer_norm(attn_out, begin_norm_axis=-1,
                                              scope='LayerNorm')

    return output
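The projection einsum 'ibnd,hnd->ibh' contracts the head and per-head-dimension axes together; an equivalent reshape-plus-tensordot sketch with invented sizes:

import tensorflow as tf

seq, batch, n_head, d_head, d_model = 7, 2, 4, 8, 16
attn_vec = tf.random.normal([seq, batch, n_head, d_head])
proj_o = tf.random.normal([d_model, n_head, d_head])

out_einsum = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)   # [seq, batch, d_model]

# same contraction after flattening the (head, per-head-dim) axes
flat_vec = tf.reshape(attn_vec, [seq, batch, n_head * d_head])
flat_proj = tf.reshape(proj_o, [d_model, n_head * d_head])
out_tensordot = tf.tensordot(flat_vec, flat_proj, axes=[[2], [1]])  # [seq, batch, d_model]
# out_einsum and out_tensordot agree up to floating-point error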
Example #14
Source File: bert_esim.py From BERT with Apache License 2.0 | 6 votes |
def bert_layer_aggerate(encoding_lst, max_len, scope, reuse):
    with tf.variable_scope(scope, reuse=reuse):
        valid_tensor = tf.stack(encoding_lst, axis=1)  # batch x num_layer x seq x dim
        attn = tf.get_variable(scope+"/layer_attention",
                               dtype=tf.float32,
                               shape=[len(encoding_lst),],
                               initializer=tf.initializers.random_uniform(0, 1))
        prob = tf.exp(tf.nn.log_softmax(attn))
        layer_repres = tf.einsum("abcd,b->acd", valid_tensor, prob)
        # layer_repres = encoding_lst[-1]
        # since input_target_a means b->a
        # and input_target_b means a->b
        layer_repres = layer_repres[:, 0:max_len, :]
        # print(" bert layer output shape w{}".format(layer_repres.get_shape()))
        return layer_repres
Example #15
Source File: bert_esim_v1.py From BERT with Apache License 2.0 | 6 votes |
def bert_layer_aggerate(encoding_lst, scope, reuse):
    with tf.variable_scope(scope, reuse=reuse):
        valid_tensor = tf.stack(encoding_lst, axis=1)  # batch x num_layer x seq x dim
        attn = tf.get_variable(scope+"/layer_attention",
                               dtype=tf.float32,
                               shape=[len(encoding_lst),],
                               initializer=tf.initializers.random_uniform(-0.01, 0.01))
        prob = tf.exp(tf.nn.log_softmax(attn))
        layer_repres = tf.einsum("abcd,b->acd", valid_tensor, prob)
        # since input_target_a means b->a
        # and input_target_b means a->b
        # print(" bert layer output shape w{}".format(layer_repres.get_shape()))
        return layer_repres
Example #16
Source File: textcnn.py From BERT with Apache License 2.0 | 5 votes |
def build_output_logits(self, **kargs):
    input_tensor = self.sequence_output
    input_shape_list = bert_utils.get_shape_list(self.sequence_output,
                                                 expected_rank=3)
    batch_size = input_shape_list[0]
    seq_length = input_shape_list[1]
    hidden_dims = input_shape_list[2]

    embedding_projection = kargs.get('embedding_projection', None)

    scope = kargs.get('scope', None)
    if scope:
        scope = scope + '/' + 'cls/predictions'
    else:
        scope = 'cls/predictions'

    tf.logging.info("**** mlm generator scope **** %s", str(scope))

    # with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        projection_width = self.config.emb_size

        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=projection_width,
                activation=bert_modules.get_activation(self.config.hidden_act),
                kernel_initializer=bert_modules.create_initializer(
                    self.config.initializer_range))

        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.config.vocab_size],
            initializer=tf.zeros_initializer())

        # batch x seq x embedding
        logits = tf.einsum("abc,dc->abd", input_tensor, self.emb_mat)
        self.logits = tf.nn.bias_add(logits, output_bias)
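The "abc,dc->abd" contraction scores each position's projected hidden vector against every row of the embedding matrix (tied output embeddings); a toy sketch with invented sizes:

import tensorflow as tf

batch, seq, emb, vocab = 2, 5, 8, 11
hidden = tf.random.normal([batch, seq, emb])      # stands in for input_tensor after "transform"
emb_mat = tf.random.normal([vocab, emb])          # stands in for self.emb_mat

logits = tf.einsum("abc,dc->abd", hidden, emb_mat)           # [batch, seq, vocab]
# same contraction written as a tensordot over the shared embedding axis
logits_ref = tf.tensordot(hidden, emb_mat, axes=[[2], [1]])  # [batch, seq, vocab]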
Example #17
Source File: PNN_TensorFlow.py From Awesome-RecSystem-Models with MIT License | 5 votes |
def call(self, feat_index, feat_value, use_dropout=True):
    # embedding part
    feat_embedding = self.feat_embeddings(feat_index)  # Batch * N * M

    # linear part
    lz = tf.einsum('bnm,dnm->bd', feat_embedding, self.linear_weights)  # Batch * D1

    # quadratic part
    if self.product_type == 'inner':
        theta = tf.einsum('bnm,dn->bdnm', feat_embedding, self.theta)  # Batch * D1 * N * M
        lp = tf.einsum('bdnm,bdnm->bd', theta, theta)
    else:
        embed_sum = tf.reduce_sum(feat_embedding, axis=1)
        p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum)
        lp = tf.einsum('bmn,dmn->bd', p, self.quadratic_weights)  # Batch * D1

    y_deep = tf.concat((lz, lp), axis=1)
    if use_dropout:
        y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)

    for i in range(len(self.deep_layer_sizes)):
        y_deep = getattr(self, 'dense_' + str(i))(y_deep)
        y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep)
        y_deep = getattr(self, 'activation_' + str(i))(y_deep)
        if use_dropout:
            y_deep = getattr(self, 'dropout_' + str(i))(y_deep)

    output = self.fc(y_deep)
    return output
Example #18
Source File: match_pyramid.py From BERT with Apache License 2.0 | 5 votes |
def _encode(self, input_ids, input_char_ids, is_training, **kargs):
    reuse = kargs.get("reuse", None)
    with tf.variable_scope(self.config.scope+"_semantic_encode", reuse=reuse):
        emb_seq = self._embd_seq(input_ids, input_char_ids, is_training, reuse=reuse)
        if self.config.compress_emb:
            eW = tf.get_variable(self.scope+"_eW",
                                 initializer=tf.truncated_normal_initializer(
                                     mean=0.0, stddev=0.2, dtype=tf.float32),
                                 dtype=tf.float32,
                                 shape=[emb_seq.shape[-1].value,
                                        self.config["embedding_dim_compressed"]])
            emb_seq = tf.einsum("abd,dc->abc", emb_seq, eW)

        input_dim = emb_seq.shape[-1].value
        input_mask = tf.cast(input_ids, tf.bool)
        input_len = tf.reduce_sum(tf.cast(input_mask, tf.int32), -1)

        enc_seq = encode(emb_seq, method=self.config["encode_method"],
                         input_dim=input_dim,
                         params=self.config,
                         sequence_length=input_len,
                         mask_zero=self.config["embedding_mask_zero"],
                         scope_name=self.scope + "enc_seq",
                         reuse=reuse,
                         training=is_training)

    return emb_seq, enc_seq
Example #19
Source File: match_pyramid.py From BERT with Apache License 2.0 | 5 votes |
def _semantic_interaction(self, input_ids_a, input_char_ids_a,
                          input_ids_b, input_char_ids_b,
                          emb_seq_a, enc_seq_a,
                          emb_seq_b, enc_seq_b,
                          is_training, **kargs):
    emb_match_matrix_dot_product = tf.einsum("abd,acd->abc",
                                             emb_seq_a, emb_seq_b)
    emb_match_matrix_dot_product = tf.expand_dims(
        emb_match_matrix_dot_product, axis=-1)  # batch x seq_len_a x seq_len_b x 1

    match_matrix_identity = tf.expand_dims(tf.cast(
        tf.equal(
            tf.expand_dims(input_ids_a, 2),
            tf.expand_dims(input_ids_b, 1)
        ), tf.float32), axis=-1)  # batch x seq_len_a x seq_len_b x 1

    input_mask_a = tf.expand_dims(tf.cast(tf.cast(input_ids_a, tf.bool), tf.float32), axis=2)  # batch x seq_len_a x 1
    input_mask_b = tf.expand_dims(tf.cast(tf.cast(input_ids_b, tf.bool), tf.float32), axis=1)  # batch x 1 x seq_len_b

    match_matrix_identity *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

    emb_match_matrix_element_product = tf.expand_dims(emb_seq_a, 2) * tf.expand_dims(
        emb_seq_b, 1)
    # emb_match_matrix_element_product *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

    enc_match_matrix_dot_product = tf.expand_dims(
        tf.einsum("abd,acd->abc", enc_seq_a, enc_seq_b), axis=-1)
    # enc_match_matrix_dot_product *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

    enc_match_matrix_element_product = tf.expand_dims(enc_seq_a, 2) * tf.expand_dims(
        enc_seq_b, 1)
    # enc_match_matrix_element_product *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

    match_matrix = tf.concat([
        emb_match_matrix_dot_product,
        match_matrix_identity,
        emb_match_matrix_element_product,
        enc_match_matrix_dot_product,
        enc_match_matrix_element_product
    ], axis=-1)

    return match_matrix
Example #20
Source File: textcnn.py From BERT with Apache License 2.0 | 5 votes |
def build_other_output_logits(self, sequence_output, **kargs):
    input_tensor = sequence_output
    input_shape_list = bert_utils.get_shape_list(sequence_output,
                                                 expected_rank=3)
    batch_size = input_shape_list[0]
    seq_length = input_shape_list[1]
    hidden_dims = input_shape_list[2]

    embedding_projection = kargs.get('embedding_projection', None)

    scope = kargs.get('scope', None)
    if scope:
        scope = scope + '/' + 'cls/predictions'
    else:
        scope = 'cls/predictions'

    tf.logging.info("**** mlm generator scope **** %s", str(scope))

    # with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        projection_width = self.config.emb_size

        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=projection_width,
                activation=bert_modules.get_activation(self.config.hidden_act),
                kernel_initializer=bert_modules.create_initializer(
                    self.config.initializer_range))

        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.config.vocab_size],
            initializer=tf.zeros_initializer())

        # batch x seq x embedding
        logits = tf.einsum("abc,dc->abd", input_tensor, self.emb_mat)
        logits = tf.nn.bias_add(logits, output_bias)
        return logits
Example #21
Source File: attention_rectified_linear_model.py From youtube-8m with Apache License 2.0 | 5 votes |
def create_model(self, model_input, vocab_size, num_mixtures=None,
                 l2_penalty=1e-8, sub_scope="", original_input=None,
                 **unused_params):
    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]
    num_mixtures = FLAGS.moe_num_mixtures

    # gating coefficients
    original_input = tf.nn.l2_normalize(original_input, dim=1)
    mean_output = tf.reduce_mean(model_input, axis=2)
    ## batch_size x moe_num_mixtures
    gate_activations = slim.fully_connected(
        tf.concat([original_input, mean_output], axis=1),
        num_mixtures,
        activation_fn=tf.nn.softmax,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates"+sub_scope)

    # matrix
    weight_var = tf.get_variable("ensemble_weight",
        shape=[num_mixtures, num_methods],
        regularizer=slim.l2_regularizer(l2_penalty))

    # weight
    gated_weight = tf.einsum("ij,jk->ik", gate_activations, weight_var)
    rl_gated_weight = tf.nn.relu(gated_weight) + 1e-9
    sum_gated_weight = tf.reduce_sum(rl_gated_weight, axis=1, keep_dims=True)
    weight = rl_gated_weight / sum_gated_weight

    # weighted output
    output = tf.einsum("ik,ijk->ij", weight, model_input)
    return {"predictions": output}
Example #22
Source File: attention_linear_model.py From youtube-8m with Apache License 2.0 | 5 votes |
def create_model(self, model_input, vocab_size, num_mixtures=None,
                 l2_penalty=1e-8, sub_scope="", original_input=None,
                 **unused_params):
    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]
    num_mixtures = FLAGS.moe_num_mixtures

    # gating coefficients
    original_input = tf.nn.l2_normalize(original_input, dim=1)
    mean_output = tf.reduce_mean(model_input, axis=2)
    ## batch_size x moe_num_mixtures
    gate_activations = slim.fully_connected(
        tf.concat([original_input, mean_output], axis=1),
        num_mixtures,
        activation_fn=tf.nn.softmax,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates"+sub_scope)

    # matrix
    weight_var = tf.get_variable("ensemble_weight",
        shape=[num_mixtures, num_methods],
        regularizer=slim.l2_regularizer(l2_penalty))

    # weight
    gated_weight = tf.einsum("ij,jk->ik", gate_activations, weight_var)
    weight = tf.nn.softmax(gated_weight)

    # weighted output
    output = tf.einsum("ik,ijk->ij", weight, model_input)
    return {"predictions": output}
Example #23
Source File: attention_moe_model.py From youtube-8m with Apache License 2.0 | 5 votes |
def create_model(self, model_input, vocab_size, num_mixtures=None,
                 l2_penalty=1e-8, sub_scope="", original_input=None,
                 **unused_params):
    num_relu = FLAGS.attention_relu_cells
    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]

    original_input = tf.nn.l2_normalize(original_input, dim=1)
    model_input_list = tf.unstack(model_input, axis=2)

    relu_units = [self.relu(original_input, num_relu, sub_scope="input")]
    i = 0
    for mi in model_input_list:
        relu_units.append(self.relu(mi, num_relu, sub_scope="sub"+str(i)))
        i += 1

    gate_activations = slim.fully_connected(
        tf.concat(relu_units, axis=1),
        num_methods,
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gate")
    gate = tf.nn.softmax(gate_activations)
    output = tf.einsum("ijk,ik->ij", model_input, gate)
    return {"predictions": output}
Example #24
Source File: man_utils.py From BERT with Apache License 2.0 | 5 votes |
def minus_attention(query, context, query_mask, context_mask,
                    dropout_ratio, scope, reuse=None):
    hidden_dim = query.get_shape()[-1]
    Wm = tf.get_variable("Wm", dtype=tf.float32,
                         shape=[hidden_dim, hidden_dim],
                         initializer=initializer)
    Vm = tf.get_variable("Vm", dtype=tf.float32,
                         shape=[hidden_dim, 1],
                         initializer=initializer)

    # batch x len_query x 1 x hidden_dim
    query_ = tf.expand_dims(query, 2)
    # batch x 1 x len_context x hidden_dim
    context_ = tf.expand_dims(context, 1)
    # batch x len_query x len_context x hidden_dim
    minus_attention = tf.abs(query_ - context_)
    minus_attention = tf.einsum("abcd,de->abce", minus_attention, Wm)
    minus_attention = tf.einsum("abce,ef->abcf", minus_attention, Vm)
    # batch x len_query x len_context
    S = tf.squeeze(minus_attention, -1)

    mask_q = tf.expand_dims(query_mask, 1)    # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1)  # batch x 1 x context_len

    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
    c2q = tf.matmul(S_, context)

    S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
    q2c = tf.matmul(S_T, query)

    return c2q, q2c
Example #25
Source File: man_utils.py From BERT with Apache License 2.0 | 5 votes |
def dot_attention(query, context, query_mask, context_mask,
                  dropout_ratio, scope, reuse=None):
    hidden_dim = query.get_shape()[-1]
    Wd = tf.get_variable("Wd", dtype=tf.float32,
                         shape=[hidden_dim, hidden_dim],
                         initializer=initializer)
    Vd = tf.get_variable("Vd", dtype=tf.float32,
                         shape=[hidden_dim, 1],
                         initializer=initializer)

    # batch x len_query x 1 x hidden_dim
    query_ = tf.expand_dims(query, 2)
    # batch x 1 x len_context x hidden_dim
    context_ = tf.expand_dims(context, 1)
    # batch x len_query x len_context x hidden_dim
    dot_attention = query_ * context_
    dot_attention = tf.einsum("abcd,de->abce", dot_attention, Wd)
    dot_attention = tf.einsum("abce,ef->abcf", dot_attention, Vd)
    # batch x len_query x len_context
    S = tf.squeeze(dot_attention, -1)

    mask_q = tf.expand_dims(query_mask, 1)    # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1)  # batch x 1 x context_len

    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask=mask_c))
    c2q = tf.matmul(S_, context)

    S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0, 2, 1]), mask=mask_q))
    q2c = tf.matmul(S_T, query)

    return c2q, q2c
Example #26
Source File: generator.py From UROP-Adversarial-Feature-Matching-for-Text-Generation with GNU Affero General Public License v3.0 | 5 votes |
def lstm(self, prev_y, prev_h, prev_c, z):
    hs = self.hidden_size
    preact = tf.einsum('ijk,ka->ija', prev_h, self.h2h_W) + \
             tf.einsum('ijk,ka->ija', prev_y, self.i2h_W) + \
             tf.matmul(z, self.z2h_W) + \
             self.b  # preactivation
    # [1, batch_size, hidden_size * 4]
    i = tf.sigmoid(preact[:, :, 0*hs: 1*hs])
    f = tf.sigmoid(preact[:, :, 1*hs: 2*hs])
    o = tf.sigmoid(preact[:, :, 2*hs: 3*hs])
    c = tf.tanh(preact[:, :, 3*hs: 4*hs])
    c = f * prev_c + i * c  # [1, batch_size, hidden_size] (element-wise multiply)
    h = o * tf.tanh(c)      # [1, batch_size, hidden_size]
    y = tf.einsum('ijk,ka->ija', h, self.Vhid) + self.bhid  # [1, batch_size, vocab_size]

    # Author doesn't mention this part in his paper, but it appears in his code
    # So I assume this is part of his soft-max approx. strategy ---|
    max_y = tf.reduce_max(y, axis=1, keep_dims=True)     # [1, 1, vocab_size]
    e = tf.exp((y - max_y) * self.L)                     # [1, batch_size, vocab_size]
    w = e / tf.reduce_sum(e, axis=1, keep_dims=True)     # [1, batch_size, vocab_size]
    # Assumption ends here ----------------------------------------|

    y = tf.einsum('ijk,ka->ija', w, self.Wemb)           # [1, batch_size, input_dim]
    return y, h, c
Example #27
Source File: trf_bert_ebm_gpt.py From BERT with Apache License 2.0 | 5 votes |
def ebm_logz_length_cond_loss(config, features, ebm_all_loss, valid_mask=None):
    """
    we group by length and mean over loss by length
    and apply sgd to optimize logz's parameters just like center-loss
    for center updating
    """
    input_mask = features['input_mask']
    shape = bert_utils.get_shape_list(input_mask)
    valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32)  # batch_size
    onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)
    onehot_length_ids = tf.cast(onehot_length_ids, tf.float32)

    if_provided = 1
    if valid_mask is None:
        valid_mask = tf.ones(shape=[shape[0]])
        if_provided = 0
        tf.logging.info("====ones valid mask ====")
    if if_provided == 1:
        tf.logging.info("====provided valid mask ====")

    valid_mask = tf.expand_dims(tf.cast(valid_mask, tf.float32), axis=-1)  # batch_size x 1

    length_accumulate_loss = tf.einsum("ab,a->ab", onehot_length_ids, ebm_all_loss)
    length_loss = tf.reduce_sum(length_accumulate_loss*valid_mask, axis=0)
    length_appear_time = tf.reduce_sum(onehot_length_ids*valid_mask, axis=0) + 1

    logz_length_attribute_loss = length_loss / length_appear_time  # 1 x max_position_embeddings
    logz_length_loss = tf.reduce_sum(logz_length_attribute_loss)
    return logz_length_loss
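The "ab,a->ab" einsum simply scales each row of the one-hot length matrix by that example's loss; a tiny equivalence check with made-up values:

import tensorflow as tf

onehot_length_ids = tf.constant([[0., 1., 0.],
                                 [0., 0., 1.]])   # batch x max_position_embeddings
ebm_all_loss = tf.constant([2.0, 3.0])            # per-example loss
scaled = tf.einsum("ab,a->ab", onehot_length_ids, ebm_all_loss)
# identical to broadcasting the loss over the length axis
same = onehot_length_ids * ebm_all_loss[:, None]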
Example #28
Source File: classifier_adapter.py From BERT with Apache License 2.0 | 5 votes |
def multi_choice_classifier(config, pooled_output,
                            num_labels, labels, dropout_prob):
    output_layer = pooled_output

    final_hidden_shape = bert_utils.get_shape_list(output_layer,
                                                   expected_rank=2)
    print(final_hidden_shape, "====multi-choice shape====")

    output_layer = tf.reshape(output_layer, [-1, num_labels, final_hidden_shape[-1]])  # batch x num_choices x hidden_dim
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels],
        initializer=tf.zeros_initializer())

    output_layer = tf.nn.dropout(output_layer, keep_prob=1 - dropout_prob)
    logits = tf.einsum("abc,c->ab", output_layer, output_weights)
    logits = tf.nn.bias_add(logits, output_bias)  # batch x num_labels

    if config.get("loss_type", "entropy") == "focal_loss":
        per_example_loss = loss_utils.focal_loss_multi_v1(logits=logits,
                                                          labels=labels)
    else:
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits,
            labels=tf.stop_gradient(labels))
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits)
Example #29
Source File: engineer_transformer.py From youtube-8m with Apache License 2.0 | 5 votes |
def std(self, model_input_raw, num_frames, mask):
    mean_input = self.avg(model_input_raw, num_frames, mask)
    error = tf.einsum("ijk,ij->ijk", model_input_raw - mean_input, mask)
    return error
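Since every index appears in the output, "ijk,ij->ijk" is just a broadcast multiply that masks padded frames; a minimal sketch with hypothetical shapes:

import tensorflow as tf

error = tf.random.normal([2, 4, 3])        # batch x frames x features
mask = tf.constant([[1., 1., 0., 0.],
                    [1., 1., 1., 0.]])     # batch x frames
masked = tf.einsum("ijk,ij->ijk", error, mask)
same = error * mask[:, :, None]            # the same broadcast multiply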
Example #30
Source File: albert_modules_official.py From BERT with Apache License 2.0 | 5 votes |
def dense_layer_3d_proj(input_tensor,
                        hidden_size,
                        head_size,
                        initializer,
                        activation,
                        name=None):
    """A dense layer with 3D kernel for projection.

    Args:
      input_tensor: float Tensor of shape [batch, from_seq_length,
        num_attention_heads, size_per_head].
      hidden_size: The size of hidden layer (the output dimension).
      head_size: The size of head.
      initializer: Kernel initializer.
      activation: Activation function.
      name: The name scope of this layer.

    Returns:
      float logits Tensor.
    """
    input_shape = albert_utils_official.get_shape_list(input_tensor)
    num_attention_heads = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[num_attention_heads * head_size, hidden_size],
            initializer=initializer)
        w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
        b = tf.get_variable(
            name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)
        ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret