Python tensorflow.sparse_add() Examples

The following are 17 code examples of tensorflow.sparse_add(), drawn from open-source projects. You can go to the original project or source file by following the links above each example, or check out all available functions and classes of the module tensorflow.
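Before diving into the project examples, here is a minimal standalone sketch of the two calling conventions of tf.sparse_add (TensorFlow 1.x API, which the examples below use; the tensors here are made up for illustration). Adding two SparseTensors yields a SparseTensor, while adding a SparseTensor and a dense Tensor yields a dense Tensor:

import tensorflow as tf

sp_a = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1.0, 2.0], dense_shape=[3, 3])
sp_b = tf.SparseTensor(indices=[[0, 0], [2, 1]], values=[3.0, 4.0], dense_shape=[3, 3])
dense = tf.ones([3, 3])

sp_sum = tf.sparse_add(sp_a, sp_b)      # SparseTensor + SparseTensor -> SparseTensor
dense_sum = tf.sparse_add(sp_a, dense)  # SparseTensor + dense Tensor -> dense Tensor

with tf.Session() as sess:
    print(sess.run(tf.sparse_tensor_to_dense(sp_sum)))
    print(sess.run(dense_sum))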
Example #1
Source File: sparse_add_op_test.py    From deep_image_model with Apache License 2.0
def testAddSparseDense(self):
    np.random.seed(1618)  # Make it reproducible.
    n, m = np.random.randint(30, size=2)
    for dtype in [np.float32, np.float64, np.int64, np.complex64]:
      for index_dtype in [np.int32, np.int64]:
        rand_vals_np = np.random.randn(n, m).astype(dtype)
        dense_np = np.random.randn(n, m).astype(dtype)

        with self.test_session(use_gpu=False):
          sparse, unused_nnz = _sparsify(rand_vals_np, index_dtype=index_dtype)
          s = tf.sparse_add(sparse, tf.constant(dense_np)).eval()
          self.assertAllEqual(dense_np + rand_vals_np, s)
          self.assertTrue(s.dtype == dtype)

          # check commutativity
          s = tf.sparse_add(tf.constant(dense_np), sparse).eval()
          self.assertAllEqual(dense_np + rand_vals_np, s)
          self.assertTrue(s.dtype == dtype) 
Example #2
Source File: sparse_add_op_test.py    From deep_image_model with Apache License 2.0
def _s2d_add_vs_sparse_add(sparsity, n, m, num_iters=50):
  np.random.seed(1618)

  with tf.Session(graph=tf.Graph()) as sess:
    sp_vals = np.random.rand(n, m).astype(np.float32)
    sp_t, unused_nnz = _sparsify(sp_vals, thresh=sparsity, index_dtype=np.int32)
    vals = np.random.rand(n, m).astype(np.float32)

    s2d = tf.add(tf.sparse_tensor_to_dense(sp_t), tf.constant(vals))
    sa = tf.sparse_add(sp_t, tf.constant(vals))

    timeit.timeit(lambda: sess.run(s2d), number=3)
    timeit.timeit(lambda: sess.run(sa), number=3)

    s2d_total = timeit.timeit(lambda: sess.run(s2d), number=num_iters)
    sa_total = timeit.timeit(lambda: sess.run(sa), number=num_iters)

  # per-iter latency; secs to millis
  return s2d_total * 1e3 / num_iters, sa_total * 1e3 / num_iters 
Example #3
Source File: model_utils.py    From gcnn-survey-paper with Apache License 2.0
def sp_compute_adj_att(node_features, adj_matrix_sp):
  """Self-attention for edges as in GAT with sparse adjacency."""
  out_dim = node_features.shape[-1]
  # Self-attention mechanism
  a_row = tf.get_variable(
      initializer=WEIGHT_INIT,
      dtype=tf.float32,
      name='selfatt-row',
      shape=(out_dim, 1))
  a_col = tf.get_variable(
      initializer=WEIGHT_INIT,
      dtype=tf.float32,
      name='selfatt-col',
      shape=(out_dim, 1))
  alpha_row = tf.matmul(node_features, a_row)
  alpha_col = tf.matmul(node_features, a_col)
  # Compute matrix with self-attention scores using broadcasting
  alpha = tf.sparse_add(adj_matrix_sp * alpha_row,
                        adj_matrix_sp * tf.transpose(alpha_col, perm=[1, 0]))
  return alpha 
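The broadcasting pattern above, multiplying the sparse adjacency by a dense per-node score vector and combining the two products with tf.sparse_add, materializes attention logits only at edge positions. A hedged sketch on a made-up 3-node graph (names and values are illustrative only):

import tensorflow as tf

# Sparse adjacency of a 3-node graph with edges (0, 1) and (2, 0).
adj = tf.SparseTensor(indices=[[0, 1], [2, 0]], values=[1.0, 1.0], dense_shape=[3, 3])

row_scores = tf.constant([[0.1], [0.2], [0.3]])  # one score per source node
col_scores = tf.constant([[1.0], [2.0], [3.0]])  # one score per target node

# Element-wise sparse * dense broadcasts the dense vector over the nonzeros,
# so each edge (i, j) ends up carrying row_scores[i] + col_scores[j].
alpha = tf.sparse_add(adj * row_scores, adj * tf.transpose(col_scores))

with tf.Session() as sess:
    out = sess.run(alpha)
    print(out.indices)  # [[0, 1], [2, 0]]
    print(out.values)   # [2.1, 1.3]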
Example #4
Source File: sparse_add_op_test.py    From deep_image_model with Apache License 2.0
def testAddSelfAndNegation(self):
    with self.test_session(use_gpu=False) as sess:
      sp_a = self._SparseTensor_3x3()
      sp_b = self._SparseTensor_3x3(negate=True)

      sp_sum = tf.sparse_add(sp_a, sp_b, 0.1)
      sum_out = sess.run(sp_sum)

      self.assertEqual(sp_sum.shape.get_shape(), [2])
      self.assertAllEqual(sum_out.indices, np.empty([0, 2]))
      self.assertAllEqual(sum_out.values, [])
      self.assertAllEqual(sum_out.shape, [3, 3]) 
Example #5
Source File: sparse_add_op_test.py    From deep_image_model with Apache License 2.0
def testSmallValuesShouldVanish(self):
    with self.test_session(use_gpu=False) as sess:
      sp_a = self._SparseTensor_3x3()
      sp_b = self._SparseTensor_3x3_v2()

      # sum:
      # [       2]
      # [.1      ]
      # [ 6   -.2]

      # two values should vanish: |.1| < .21, and |-.2| < .21
      sp_sum = tf.sparse_add(sp_a, sp_b, thresh=0.21)
      sum_out = sess.run(sp_sum)

      self.assertEqual(sp_sum.shape.get_shape(), [2])
      self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0]])
      self.assertAllEqual(sum_out.values, [2, 6])
      self.assertAllEqual(sum_out.shape, [3, 3])

      # only .1 vanishes
      sp_sum = tf.sparse_add(sp_a, sp_b, thresh=0.11)
      sum_out = sess.run(sp_sum)

      self.assertEqual(sp_sum.shape.get_shape(), [2])
      self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0], [2, 1]])
      self.assertAllClose(sum_out.values, [2, 6, -.2])
      self.assertAllEqual(sum_out.shape, [3, 3]) 
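The thresh argument exercised in the two assertions above prunes entries of the sparse sum whose magnitude falls below the threshold. A minimal standalone sketch (values chosen purely for illustration):

import tensorflow as tf

sp_a = tf.SparseTensor(indices=[[0, 0], [1, 1]], values=[0.05, 1.0], dense_shape=[2, 2])
sp_b = tf.SparseTensor(indices=[[0, 0], [1, 1]], values=[0.05, 1.0], dense_shape=[2, 2])

# The summed entry at [0, 0] is 0.1, and |0.1| < 0.2, so it is dropped.
sp_sum = tf.sparse_add(sp_a, sp_b, thresh=0.2)

with tf.Session() as sess:
    out = sess.run(sp_sum)
    print(out.indices)  # [[1, 1]]
    print(out.values)   # [2.0]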
Example #6
Source File: sparse_add_op_test.py    From deep_image_model with Apache License 2.0
def testGradients(self):
    np.random.seed(1618)  # Make it reproducible.
    with self.test_session(use_gpu=False):
      for n in [10, 31]:
        for m in [4, 17]:
          sp_a, nnz_a = self._randomTensor([n, m], np.float32)
          sp_b, nnz_b = self._randomTensor([n, m], np.float32)
          sp_sum = tf.sparse_add(sp_a, sp_b)
          nnz_sum = len(sp_sum.values.eval())

          err = tf.test.compute_gradient_error([sp_a.values, sp_b.values],
                                               [(nnz_a,), (nnz_b,)],
                                               sp_sum.values, (nnz_sum,))
          self.assertLess(err, 1e-3) 
Example #7
Source File: sparse_add_op_test.py    From deep_image_model with Apache License 2.0
def benchmarkSparseAddDense(self):

    print("SparseAddDense: add with sparse_to_dense vs. sparse_add")
    print("%nnz \t n \t m \t millis(s2d) \t millis(sparse_add) \t speedup")

    for sparsity in [0.99, 0.5, 0.01]:
      for n in [1, 256, 50000]:
        for m in [100, 1000]:
          s2d_dt, sa_dt = _s2d_add_vs_sparse_add(sparsity, n, m)
          print("%.2f \t %d \t %d \t %.4f \t %.4f \t %.2f" % (sparsity, n, m,
                                                              s2d_dt, sa_dt,
                                                              s2d_dt / sa_dt)) 
Example #8
Source File: loss_graphs.py    From tensorrec with Apache License 2.0
def connect_loss_graph(self, tf_interactions, tf_prediction, **kwargs):
        error = tf.sparse_add(tf_interactions, -1.0 * tf_prediction)
        return tf.sqrt(tf.reduce_mean(tf.square(error))) 
Example #9
Source File: sparse_add_op_test.py    From deep_image_model with Apache License 2.0
def testAddSelf(self):
    with self.test_session(use_gpu=False) as sess:
      for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
        for sp_b in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
          sp_sum = tf.sparse_add(sp_a, sp_b)

          sum_out = sess.run(sp_sum)

          self.assertEqual(sp_sum.shape.get_shape(), [2])
          self.assertAllEqual(
              sum_out.indices, [[0, 1], [1, 0], [2, 0], [2, 1]])
          self.assertAllEqual(sum_out.values, [2, 4, 6, 8])
          self.assertAllEqual(sum_out.shape, [3, 3]) 
Example #10
Source File: rotation_matrix_builder.py    From MedicalDataAugmentationTool with GNU General Public License v3.0
def create(self):
        rotation = tf.eye(self.dim)
        #rotation = tf.SparseTensor(indices=[[i, i] for i in range(self.dim)], values=[1.0 for _ in range(self.dim)], dense_shape=[self.dim, self.dim])

        for plane in self.planes:
            old_rotation = rotation
            new_rotation = self.rotation_matrix(plane)
            #rotation = tf.matmul(old_rotation, tf.sparse_add(tf.zeros([self.dim, self.dim]), new_rotation))
            rotation = tf.sparse_tensor_dense_matmul(new_rotation, old_rotation)

        return rotation 
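The commented-out line in create() hints at a common densification idiom: adding a SparseTensor to an all-zeros dense tensor with tf.sparse_add returns a dense tensor with the sparse values filled in (Example #13 below relies on the same trick). A minimal sketch with made-up values:

import tensorflow as tf

sp = tf.SparseTensor(indices=[[0, 0], [1, 1]], values=[1.0, 2.0], dense_shape=[2, 2])

# Dense zeros + SparseTensor -> dense tensor; comparable in effect to
# tf.sparse_tensor_to_dense for this simple case.
dense = tf.sparse_add(tf.zeros([2, 2]), sp)

with tf.Session() as sess:
    print(sess.run(dense))  # [[1. 0.] [0. 2.]]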
Example #11
Source File: layers.py    From GAT with MIT License
def sp_attn_head(seq, out_sz, adj_mat, activation, nb_nodes, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('sp_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        
        f_1 = tf.reshape(f_1, (nb_nodes, 1))
        f_2 = tf.reshape(f_2, (nb_nodes, 1))

        f_1 = adj_mat*f_1
        f_2 = adj_mat * tf.transpose(f_2, [1,0])

        logits = tf.sparse_add(f_1, f_2)
        lrelu = tf.SparseTensor(indices=logits.indices, 
                values=tf.nn.leaky_relu(logits.values), 
                dense_shape=logits.dense_shape)
        coefs = tf.sparse_softmax(lrelu)

        if coef_drop != 0.0:
            coefs = tf.SparseTensor(indices=coefs.indices,
                    values=tf.nn.dropout(coefs.values, 1.0 - coef_drop),
                    dense_shape=coefs.dense_shape)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # As tf.sparse_tensor_dense_matmul expects its arguments to have rank-2,
        # here we make an assumption that our input is of batch size 1, and reshape appropriately.
        # The method will fail in all other cases!
        coefs = tf.sparse_reshape(coefs, [nb_nodes, nb_nodes])
        seq_fts = tf.squeeze(seq_fts)
        vals = tf.sparse_tensor_dense_matmul(coefs, seq_fts)
        vals = tf.expand_dims(vals, axis=0)
        vals.set_shape([1, nb_nodes, out_sz])
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + conv1d(seq, ret.shape[-1], 1) # activation
            else:
                ret = ret + seq

        return activation(ret)  # activation 
Example #12
Source File: GAT.py    From OpenHINE with MIT License
def sp_attn_head(seq, out_sz, adj_mat, activation, nb_nodes, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('sp_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        logits = tf.sparse_add(adj_mat * f_1, adj_mat *
                               tf.transpose(f_2, [0, 2, 1]))
        lrelu = tf.SparseTensor(indices=logits.indices,
                                values=tf.nn.leaky_relu(logits.values),
                                dense_shape=logits.dense_shape)
        coefs = tf.sparse_softmax(lrelu)

        if coef_drop != 0.0:
            coefs = tf.SparseTensor(indices=coefs.indices,
                                    values=tf.nn.dropout(
                                        coefs.values, 1.0 - coef_drop),
                                    dense_shape=coefs.dense_shape)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # As tf.sparse_tensor_dense_matmul expects its arguments to have rank-2,
        # here we make an assumption that our input is of batch size 1, and reshape appropriately.
        # The method will fail in all other cases!
        coefs = tf.sparse_reshape(coefs, [nb_nodes, nb_nodes])
        seq_fts = tf.squeeze(seq_fts)
        vals = tf.sparse_tensor_dense_matmul(coefs, seq_fts)
        vals = tf.expand_dims(vals, axis=0)
        vals.set_shape([1, nb_nodes, out_sz])
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + conv1d(seq, ret.shape[-1], 1)  # activation
            else:
                seq_fts = ret + seq

        return activation(ret)  # activation


# final_embed, att_val = layers.SimpleAttLayer(multi_embed, mp_att_size,
#                                                      time_major=False,
#                                                      return_alphas=True) 
Example #13
Source File: models.py    From GP-VAE with MIT License
def __call__(self, x):
        mapped = self.net(x)

        batch_size = mapped.shape.as_list()[0]
        time_length = mapped.shape.as_list()[1]

        # Obtain mean and precision matrix components
        num_dim = len(mapped.shape.as_list())
        perm = list(range(num_dim - 2)) + [num_dim - 1, num_dim - 2]
        mapped_transposed = tf.transpose(mapped, perm=perm)
        mapped_mean = mapped_transposed[:, :self.z_size]
        mapped_covar = mapped_transposed[:, self.z_size:]

        # tf.nn.sigmoid provides more stable performance on Physionet dataset
        if self.data_type == 'physionet':
            mapped_covar = tf.nn.sigmoid(mapped_covar)
        else:
            mapped_covar = tf.nn.softplus(mapped_covar)

        mapped_reshaped = tf.reshape(mapped_covar, [batch_size, self.z_size, 2*time_length])

        dense_shape = [batch_size, self.z_size, time_length, time_length]
        idxs_1 = np.repeat(np.arange(batch_size), self.z_size*(2*time_length-1))
        idxs_2 = np.tile(np.repeat(np.arange(self.z_size), (2*time_length-1)), batch_size)
        idxs_3 = np.tile(np.concatenate([np.arange(time_length), np.arange(time_length-1)]), batch_size*self.z_size)
        idxs_4 = np.tile(np.concatenate([np.arange(time_length), np.arange(1,time_length)]), batch_size*self.z_size)
        idxs_all = np.stack([idxs_1, idxs_2, idxs_3, idxs_4], axis=1)

        # ~10x faster on CPU than on GPU
        with tf.device('/cpu:0'):
            # Obtain covariance matrix from precision one
            mapped_values = tf.reshape(mapped_reshaped[:, :, :-1], [-1])
            prec_sparse = tf.sparse.SparseTensor(indices=idxs_all, values=mapped_values, dense_shape=dense_shape)
            prec_sparse = tf.sparse.reorder(prec_sparse)
            prec_tril = tf.sparse_add(tf.zeros(prec_sparse.dense_shape, dtype=tf.float32), prec_sparse)
            eye = tf.eye(num_rows=prec_tril.shape.as_list()[-1], batch_shape=prec_tril.shape.as_list()[:-2])
            prec_tril = prec_tril + eye
            cov_tril = tf.linalg.triangular_solve(matrix=prec_tril, rhs=eye, lower=False)
            cov_tril = tf.where(tf.math.is_finite(cov_tril), cov_tril, tf.zeros_like(cov_tril))

        num_dim = len(cov_tril.shape)
        perm = list(range(num_dim - 2)) + [num_dim - 1, num_dim - 2]
        cov_tril_lower = tf.transpose(cov_tril, perm=perm)
        z_dist = tfd.MultivariateNormalTriL(loc=mapped_mean, scale_tril=cov_tril_lower)
        return z_dist


# Decoders 
Example #14
Source File: layers.py    From hetsann with Apache License 2.0
def sp_attn_head(seq, out_sz, adj_mat, activation, nb_nodes, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('sp_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        
        f_1 = tf.reshape(f_1, (nb_nodes, 1))
        f_2 = tf.reshape(f_2, (nb_nodes, 1))

        f_1 = adj_mat*f_1
        f_2 = adj_mat * tf.transpose(f_2, [1,0])

        logits = tf.sparse_add(f_1, f_2)
        lrelu = tf.SparseTensor(indices=logits.indices, 
                values=tf.nn.leaky_relu(logits.values), 
                dense_shape=logits.dense_shape)
        coefs = tf.sparse_softmax(lrelu)

        if coef_drop != 0.0:
            coefs = tf.SparseTensor(indices=coefs.indices,
                    values=tf.nn.dropout(coefs.values, 1.0 - coef_drop),
                    dense_shape=coefs.dense_shape)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # As tf.sparse_tensor_dense_matmul expects its arguments to have rank-2,
        # here we make an assumption that our input is of batch size 1, and reshape appropriately.
        # The method will fail in all other cases!
        coefs = tf.sparse_reshape(coefs, [nb_nodes, nb_nodes])
        seq_fts = tf.squeeze(seq_fts)
        vals = tf.sparse_tensor_dense_matmul(coefs, seq_fts)
        vals = tf.expand_dims(vals, axis=0)
        vals.set_shape([1, nb_nodes, out_sz])
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + conv1d(seq, ret.shape[-1], 1) # activation
            else:
                ret = ret + seq

        return activation(ret)  # activation 
Example #15
Source File: layers.py    From hetsann with Apache License 2.0
def sp_attn_head(seq, out_sz, adj_mat, activation, nb_nodes, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('sp_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        
        f_1 = tf.reshape(f_1, (nb_nodes, 1))
        f_2 = tf.reshape(f_2, (nb_nodes, 1))

        f_1 = adj_mat*f_1
        f_2 = adj_mat * tf.transpose(f_2, [1,0])

        logits = tf.sparse_add(f_1, f_2)
        lrelu = tf.SparseTensor(indices=logits.indices, 
                values=tf.nn.leaky_relu(logits.values), 
                dense_shape=logits.dense_shape)
        coefs = tf.sparse_softmax(lrelu)

        if coef_drop != 0.0:
            coefs = tf.SparseTensor(indices=coefs.indices,
                    values=tf.nn.dropout(coefs.values, 1.0 - coef_drop),
                    dense_shape=coefs.dense_shape)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # As tf.sparse_tensor_dense_matmul expects its arguments to have rank-2,
        # here we make an assumption that our input is of batch size 1, and reshape appropriately.
        # The method will fail in all other cases!
        coefs = tf.sparse_reshape(coefs, [nb_nodes, nb_nodes])
        seq_fts = tf.squeeze(seq_fts)
        vals = tf.sparse_tensor_dense_matmul(coefs, seq_fts)
        vals = tf.expand_dims(vals, axis=0)
        vals.set_shape([1, nb_nodes, out_sz])
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + conv1d(seq, ret.shape[-1], 1) # activation
            else:
                ret = ret + seq

        return activation(ret)  # activation 
Example #16
Source File: layers.py    From hetsann with Apache License 2.0
def sp_attn_head(seq, out_sz, adj_mat, activation, nb_nodes, in_drop=0.0, coef_drop=0.0, residual=False):
    with tf.name_scope('sp_attn'):
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        
        f_1 = tf.reshape(f_1, (nb_nodes, 1))
        f_2 = tf.reshape(f_2, (nb_nodes, 1))

        f_1 = adj_mat*f_1
        f_2 = adj_mat * tf.transpose(f_2, [1,0])

        logits = tf.sparse_add(f_1, f_2)
        lrelu = tf.SparseTensor(indices=logits.indices, 
                values=tf.nn.leaky_relu(logits.values), 
                dense_shape=logits.dense_shape)
        coefs = tf.sparse_softmax(lrelu)

        if coef_drop != 0.0:
            coefs = tf.SparseTensor(indices=coefs.indices,
                    values=tf.nn.dropout(coefs.values, 1.0 - coef_drop),
                    dense_shape=coefs.dense_shape)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # As tf.sparse_tensor_dense_matmul expects its arguments to have rank-2,
        # here we make an assumption that our input is of batch size 1, and reshape appropriately.
        # The method will fail in all other cases!
        coefs = tf.sparse_reshape(coefs, [nb_nodes, nb_nodes])
        seq_fts = tf.squeeze(seq_fts)
        vals = tf.sparse_tensor_dense_matmul(coefs, seq_fts)
        vals = tf.expand_dims(vals, axis=0)
        vals.set_shape([1, nb_nodes, out_sz])
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + conv1d(seq, ret.shape[-1], 1) # activation
            else:
                ret = ret + seq

        return activation(ret)  # activation 
Example #17
Source File: adversarial_sparse.py    From neural-structured-learning with Apache License 2.0
def get_loss_vat(inputs, predictions, mask, is_train, model, placeholders,
                 predictions_var_scope):
  """Computes the virtual adversarial loss for the provided inputs.

  Args:
    inputs: A batch of input features, where the batch is the first dimension.
    predictions: The logits predicted by a model on the provided inputs.
    mask: A tensor of booleans specifying which samples to apply the virtual
      adversarial loss to.
    is_train: A boolean placeholder specifying if this is a training or testing
      setting.
    model: The model that generated the logits.
    placeholders: Placeholders for model encodings.
    predictions_var_scope: Variable scope for obtaining the predictions.

  Returns:
    A float value representing the virtual adversarial loss.
  """
  mask = tf.cast(mask, dtype=tf.float32)
  r_vadv = generate_virtual_adversarial_perturbation(
      inputs,
      predictions,
      model,
      placeholders,
      mask,
      predictions_var_scope,
      is_train=is_train)
  predictions = tf.stop_gradient(predictions)
  logit_p = predictions
  new_inputs = tf.sparse_add(inputs, r_vadv)
  with tf.variable_scope(
      predictions_var_scope, auxiliary_name_scope=False, reuse=True):
    encoding_m, _, _ = model.get_encoding_and_params(
        inputs=new_inputs,
        is_train=is_train,
        update_batch_stats=False,
        **placeholders)
    logit_m, _, _ = model.get_predictions_and_params(
        encoding=encoding_m, is_train=is_train, **placeholders)
  num_non_zero = tf.reduce_sum(mask)
  loss = kl_divergence_with_logit(logit_p, logit_m, mask)
  return tf.reduce_sum(loss) / num_non_zero