Python tensorflow.compat.v1.matmul() Examples

The following are 30 code examples of tensorflow.compat.v1.matmul(), drawn from open-source projects; the source file and project for each example are listed above it. You may also want to check out all available functions/classes of the module tensorflow.compat.v1.
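Before the project excerpts, here is a minimal, self-contained sketch of a typical tf.compat.v1.matmul() call in graph mode. The tensor values, shapes, and the transpose_b usage are illustrative assumptions for this page, not taken from any of the projects below.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # run in graph mode, as the excerpts below assume

# a has shape [2, 3] and b has shape [4, 3]; transpose_b=True multiplies
# a by the transpose of b, so the result has shape [2, 4].
a = tf.constant([[1., 2., 3.], [4., 5., 6.]])
b = tf.random_normal([4, 3])
product = tf.matmul(a, b, transpose_b=True)

with tf.Session() as sess:
    print(sess.run(product))  # prints a 2x4 matrix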
Example #1
Source File: flop_regularizer_test.py    From morph-net with Apache License 2.0
def testFlopRegularizerDontConvertToVariable(self):
    tf.reset_default_graph()
    tf.set_random_seed(1234)

    x = tf.constant(1.0, shape=[2, 6], name='x', dtype=tf.float32)
    w = tf.Variable(tf.truncated_normal([6, 4], stddev=1.0), use_resource=True)
    net = tf.matmul(x, w)

    # Create FLOPs network regularizer.
    threshold = 0.9
    flop_reg = flop_regularizer.GroupLassoFlopsRegularizer([net.op], threshold,
                                                           0)

    with self.cached_session():
      tf.global_variables_initializer().run()
      flop_reg.get_regularization_term().eval() 
Example #2
Source File: run_pretraining.py    From albert with Apache License 2.0
def get_sentence_order_output(albert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs) 
Example #3
Source File: utils.py    From magenta with Apache License 2.0
def linear(x, n_inputs, n_outputs, name):
  """Simple linear layer.

  Args:
    x: The [mb, time, channels] tensor input.
    n_inputs: The input number of channels.
    n_outputs: The output number of channels.
    name: The variable scope to provide to W and biases.

  Returns:
    y: The output of the operation.
  """
  w = tf.get_variable(
      name=name + "/W", shape=[1, 1, n_inputs, n_outputs], dtype=tf.float32)
  b = tf.get_variable(
      name=name + "/biases", shape=[n_outputs], dtype=tf.float32)
  y = tf.nn.bias_add(tf.matmul(x[:, 0, :], w[0][0]), b)
  y = tf.expand_dims(y, 1)
  return y 
Example #4
Source File: common_layers.py    From tensor2tensor with Apache License 2.0
def smoothing_cross_entropy_factored(a, b, labels, confidence):
  """Memory-efficient computation of smoothing cross-entropy.

  Avoids realizing the entire logits matrix at once.

  Args:
    a: a Tensor with shape [batch, inner_dim]
    b: a Tensor with shape [vocab_size, inner_dim]
    labels: an integer Tensor with shape [batch]
    confidence: a float

  Returns:
    A Tensor with shape [batch]
  """
  num_splits = 16
  vocab_size = shape_list(b)[0]
  labels = approximate_split(labels, num_splits)
  a = approximate_split(a, num_splits)
  parts = []
  for part in range(num_splits):
    with tf.control_dependencies(parts[-1:]):
      logits = tf.matmul(a[part], b, transpose_b=True)
      parts.append(
          smoothing_cross_entropy(logits, labels[part], vocab_size, confidence))
  return tf.concat(parts, 0) 
Example #5
Source File: export_to_tfhub.py    From albert with Apache License 2.0
def get_mlm_logits(model, albert_config, mlm_positions):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, model.get_embedding_table(), transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits 
Example #6
Source File: nade.py    From magenta with Apache License 2.0
def _cond_prob(self, a, w_dec_i, b_dec_i):
    """Gets the conditional probability for a single dimension.

    Args:
      a: Model's hidden state, sized `[batch_size, num_hidden]`.
      w_dec_i: The decoder weight terms for the dimension, sized
          `[num_hidden, 1]`.
      b_dec_i: The decoder bias terms, sized `[batch_size, 1]`.

    Returns:
      cond_p_i: The conditional probability of the dimension, sized
        `[batch_size, 1]`.
      cond_l_i: The conditional logits of the dimension, sized
        `[batch_size, 1]`.
    """
    # Decode hidden units to get conditional probability.
    h = tf.sigmoid(a)
    cond_l_i = b_dec_i + tf.matmul(h, w_dec_i)
    cond_p_i = tf.sigmoid(cond_l_i)
    return cond_p_i, cond_l_i 
Example #7
Source File: transformer_glow_layers_ops.py    From tensor2tensor with Apache License 2.0
def dense_weightnorm(
    name, x, n_out, x_mask, init_scale, init, dtype=tf.float32):
  """Dense layer with weight normalization."""
  n_in = common_layers.shape_list(x)[2]
  eps = tf.keras.backend.epsilon()
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    v = tf.get_variable(
        "v", [n_in, n_out], dtype,
        initializer=tf.random_normal_initializer(0, 0.05), trainable=True)
    v = v / tf.norm(v, axis=0, keepdims=True)
    t = tf.matmul(x, v)  # [B, L, n_out]
    mean, var = moments_over_bl(t, x_mask)
    g_init = init_scale / (tf.sqrt(var) + eps)
    g = get_variable_ddi(
        "g", [n_out], g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    b = get_variable_ddi(
        "b", [n_out], -mean*g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    w = g * v
    y = tf.matmul(x, w) + b
    tf.summary.histogram("_g", g)
    return y 
Example #8
Source File: relative_bounds.py    From interval-bound-propagation with Apache License 2.0
def apply_linear(self, wrapper, w, b):
    """Propagates the bounds through a linear layer.

    Args:
      wrapper: Contains prior bounds from a previous iteration.
      w: 2D tensor of shape (input_size, output_size) containing
        weights for the linear layer.
      b: 1D tensor of shape (output_size) containing biases for the linear
        layer, or `None` if no bias.

    Returns:
      Output bounds.
    """
    w_pos = tf.maximum(w, 0)
    w_neg = tf.minimum(w, 0)
    lb = (tf.matmul(self.lower_offset, w_pos) +
          tf.matmul(self.upper_offset, w_neg))
    ub = (tf.matmul(self.upper_offset, w_pos) +
          tf.matmul(self.lower_offset, w_neg))

    nominal_out = tf.matmul(self.nominal, w)
    if b is not None:
      nominal_out += b

    return RelativeIntervalBounds(lb, ub, nominal_out) 
Example #9
Source File: vq_discrete.py    From tensor2tensor with Apache License 2.0
def embedding_lookup(self, x, means):
    """Compute nearest neighbors and loss for training the embeddings.

    Args:
        x: Batch of encoder continuous latent states sliced/projected into
            shape [-1, num_blocks, block_dim].
        means: Embedding means.

    Returns:
        The nearest neighbor in one hot form, the nearest neighbor itself,
        the commitment loss, and the embedding training loss.
    """
    x_means_hot = self.nearest_neighbor(x, means)
    x_means_hot_flat = tf.reshape(
        x_means_hot, [-1, self.hparams.num_blocks, self.hparams.block_v_size])
    x_means = tf.matmul(tf.transpose(x_means_hot_flat, perm=[1, 0, 2]), means)
    x_means = tf.transpose(x_means, [1, 0, 2])
    q_loss = tf.reduce_mean(
        tf.squared_difference(tf.stop_gradient(x), x_means))
    e_loss = tf.reduce_mean(
        tf.squared_difference(x, tf.stop_gradient(x_means)))
    return x_means_hot, x_means, q_loss, e_loss 
Example #10
Source File: neural_assistant.py    From tensor2tensor with Apache License 2.0
def compute_average_embedding(input_embeddings, input_lengths):
  """Computes bag-of-words embedding.

  Args:
    input_embeddings: <tf.float32>[bs, max_seq_len, emb_dim]
    input_lengths: <tf.int64>[bs, 1]

  Returns:
    bow_embedding: <tf.float32>[bs, emb_dim]
  """
  max_seq_len = tf.shape(input_embeddings)[1]
  # <tf.float32>[bs, 1, max_seq_len]
  mask = tf.sequence_mask(input_lengths, max_seq_len, dtype=tf.float32)
  # <tf.float32>[bs, 1, emb_dim]
  sum_embedding = tf.matmul(mask, input_embeddings)
  # <tf.float32>[bs, 1, emb_dim]
  avg_embedding = sum_embedding / tf.to_float(tf.expand_dims(input_lengths, 2))
  # <tf.float32>[bs, dim]
  return tf.squeeze(avg_embedding, 1) 
Example #11
Source File: neural_assistant.py    From tensor2tensor with Apache License 2.0
def compute_last_embedding(input_embeddings, input_lengths, hparams):
  """Computes average of last K embedding.

  Args:
    input_embeddings: <tf.float32>[bs, max_seq_len, emb_dim]
    input_lengths: <tf.int64>[bs, 1]
    hparams: model hparams

  Returns:
    last_k_embedding: <tf.float32>[bs, emb_dim]
  """
  max_seq_len = tf.shape(input_embeddings)[1]
  # <tf.float32>[bs, 1, max_seq_len]
  mask = tf.sequence_mask(input_lengths, max_seq_len, dtype=tf.float32)
  del_mask = tf.sequence_mask(
      input_lengths - hparams.last_k, max_seq_len, dtype=tf.float32)
  final_mask = mask - del_mask
  # <tf.float32>[bs, 1, emb_dim]
  sum_embedding = tf.matmul(final_mask, input_embeddings)
  # <tf.float32>[bs, 1, emb_dim]
  last_k_embedding = sum_embedding / tf.to_float(
      tf.expand_dims(
          tf.ones([tf.shape(input_embeddings)[0], 1]) * hparams.last_k, 2))
  # <tf.float32>[bs, dim]
  return tf.squeeze(last_k_embedding, 1) 
Example #12
Source File: vqa_attention.py    From tensor2tensor with Apache License 2.0
def attn(image_feat, query, hparams, name="attn"):
  """Attention on image feature with question as query."""
  with tf.variable_scope(name, "attn", values=[image_feat, query]):
    attn_dim = hparams.attn_dim
    num_glimps = hparams.num_glimps
    num_channels = common_layers.shape_list(image_feat)[-1]
    if len(common_layers.shape_list(image_feat)) == 4:
      image_feat = common_layers.flatten4d3d(image_feat)
    query = tf.expand_dims(query, 1)
    image_proj = common_attention.compute_attention_component(
        image_feat, attn_dim, name="image_proj")
    query_proj = common_attention.compute_attention_component(
        query, attn_dim, name="query_proj")
    h = tf.nn.relu(image_proj + query_proj)
    h_proj = common_attention.compute_attention_component(
        h, num_glimps, name="h_proj")
    p = tf.nn.softmax(h_proj, axis=1)
    image_ave = tf.matmul(image_feat, p, transpose_a=True)
    image_ave = tf.reshape(image_ave, [-1, num_channels*num_glimps])

    return image_ave 
Example #13
Source File: message_passing_attention.py    From tensor2tensor with Apache License 2.0
def compute_values(edge_compatibility, v):
  """Compute values. If edge compatibilities is just adjacency, we get ggnn.

  Args:
    edge_compatibility: A tensor of shape [batch, num_transforms, length, depth]
    v: A tensor of shape [batch, num_transforms, length, depth]

  Returns:
    output: A [batch, length, depth] tensor
  """

  # Computes the incoming value vectors for each node by weighting them
  # according to the attention weights. These values are still segregated by
  # edge type.
  # Shape = [B, T, N, V].
  all_edge_values = tf.matmul(tf.to_float(edge_compatibility), v)

  # Combines the weighted value vectors together across edge types into a
  # single N x V matrix for each batch.
  output = tf.reduce_sum(all_edge_values, axis=1)  # Shape [B, N, V].
  return output 
Example #14
Source File: transformer_nat.py    From tensor2tensor with Apache License 2.0
def vq_nearest_neighbor(x, hparams):
  """Find the nearest element in means to elements in x."""
  bottleneck_size = 2**hparams.bottleneck_bits
  means = hparams.means
  x_norm_sq = tf.reduce_sum(tf.square(x), axis=-1, keepdims=True)
  means_norm_sq = tf.reduce_sum(tf.square(means), axis=-1, keepdims=True)
  scalar_prod = tf.matmul(x, means, transpose_b=True)
  dist = x_norm_sq + tf.transpose(means_norm_sq) - 2 * scalar_prod
  if hparams.bottleneck_kind == "em":
    x_means_idx = tf.multinomial(-dist, num_samples=hparams.num_samples)
    x_means_hot = tf.one_hot(
        x_means_idx, depth=bottleneck_size)
    x_means_hot = tf.reduce_mean(x_means_hot, axis=1)
  else:
    x_means_idx = tf.argmax(-dist, axis=-1)
    x_means_hot = tf.one_hot(x_means_idx, depth=bottleneck_size)
  x_means = tf.matmul(x_means_hot, means)
  e_loss = tf.reduce_mean(tf.squared_difference(x, tf.stop_gradient(x_means)))
  return x_means_hot, e_loss 
Example #15
Source File: export_checkpoints.py    From albert with Apache License 2.0
def get_sentence_order_logits(input_tensor, albert_config):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    return logits 
Example #16
Source File: export_checkpoints.py    From albert with Apache License 2.0
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(input_tensor, mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits 
Example #17
Source File: matmul_source_op_handler_test.py    From morph-net with Apache License 2.0
def testMatMul2D(self, size):
    inputs = tf.zeros((13, 2))
    handler = matmul_source_op_handler.MatMulSourceOpHandler(0.1)

    kernel = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.float32)
    x = tf.matmul(inputs, kernel, transpose_b=False, name='MatMul')
    op_slice = orm.OpSlice(x.op, orm.Slice(0, size))

    transpose_kernel = tf.constant([[1, 4], [2, 5], [3, 6]], dtype=tf.float32)
    x_other = tf.matmul(
        inputs,
        transpose_kernel,
        transpose_b=True,
        name='MatMulTransposedKernel')
    op_slice_other = orm.OpSlice(x_other.op, orm.Slice(0, size))

    self.assertAllClose(
        handler.create_regularizer(op_slice).regularization_vector,
        handler.create_regularizer(op_slice_other).regularization_vector) 
Example #18
Source File: util.py    From nni with MIT License
def lstm(xs, ms, s, scope, nh, init_scale=1.0):
    """lstm cell"""
    _, nin = [v.value for v in xs[0].get_shape()] # the first is nbatch
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
        wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0))

    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c*(1-m)
        h = h*(1-m)
        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f)
        o = tf.nn.sigmoid(o)
        u = tf.tanh(u)
        c = f*c + i*u
        h = o*tf.tanh(c)
        xs[idx] = h
    s = tf.concat(axis=1, values=[c, h])
    return xs, s 
Example #19
Source File: tiled_linear.py    From lamb with Apache License 2.0
def _build_tiled_linear(self, inputs, input_name_and_sizes,
                          output_name_and_sizes, add_bias):
    # pylint: disable=missing-docstring
    def split_output(output):
      if len(output_name_and_sizes) == 1:
        return output
      elif len(set([size for _, size in output_name_and_sizes])) == 1:
        # This is a bit faster than several tf.slice calls.
        return tf.split(output, len(output_name_and_sizes), axis=1)
      else:
        outputs = []
        offset = 0
        for _, output_size in output_name_and_sizes:
          outputs.append(tf.slice(output, [0, offset], [-1, output_size]))
          offset += output_size
        return outputs

    weights = self._ensure_weights()
    if len(inputs) > 1:
      inputs = tf.concat(inputs, 1)
    if add_bias:
      biases = self._ensure_biases()
      return split_output(tf.nn.xw_plus_b(inputs, weights, biases))
    else:
      return split_output(tf.matmul(inputs, weights)) 
Example #20
Source File: transformer_memory.py    From tensor2tensor with Apache License 2.0
def _address_content(self, x):
    """Address the memory based on content similarity.

    Args:
      x: a tensor in the shape of [batch_size, length, depth].
    Returns:
      the logits for each memory entry [batch_size, length, memory_size].
    """
    mem_keys = tf.layers.dense(self.mem_vals, self.key_depth,
                               bias_initializer=tf.constant_initializer(1.0),
                               name="mem_key")
    mem_query = tf.layers.dense(x, self.key_depth,
                                bias_initializer=tf.constant_initializer(1.0),
                                name="mem_query")
    norm = tf.matmul(self._norm(mem_query), self._norm(mem_keys),
                     transpose_b=True)
    dot_product = tf.matmul(mem_query, mem_keys, transpose_b=True)
    cos_dist = tf.div(dot_product, norm + 1e-7, name="cos_dist")
    access_logits = self.sharpen_factor * cos_dist
    return access_logits 
Example #21
Source File: fastlin.py    From interval-bound-propagation with Apache License 2.0
def apply_linear(self, wrapper, w, b):
    bounds_out = super(RelativeSymbolicBounds, self).apply_linear(
        wrapper, w, b=None)

    nominal_out = tf.matmul(self._nominal, w)
    if b is not None:
      nominal_out += b

    return RelativeSymbolicBounds(
        bounds_out.lower, bounds_out.upper, nominal_out).with_priors(
            wrapper.output_bounds) 
Example #22
Source File: modeling.py    From albert with Apache License 2.0
def dense_layer_2d(input_tensor,
                   output_size,
                   initializer,
                   activation,
                   use_einsum,
                   num_attention_heads=1,
                   name=None):
  """A dense layer with 2D kernel.

  Args:
    input_tensor: Float tensor with rank 3.
    output_size: The size of output dimension.
    initializer: Kernel initializer.
    activation: Activation function.
    use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers.
    num_attention_heads: Number of attention heads in the attention layer.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  del num_attention_heads  # unused
  input_shape = get_shape_list(input_tensor)
  hidden_size = input_shape[2]
  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[hidden_size, output_size],
        initializer=initializer)
    b = tf.get_variable(
        name="bias", shape=[output_size], initializer=tf.zeros_initializer)
    if use_einsum:
      ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
    else:
      ret = tf.matmul(input_tensor, w)
    ret += b
  if activation is not None:
    return activation(ret)
  else:
    return ret 
Example #23
Source File: model.py    From gpt2-estimator with MIT License
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable(
            'w', [1, nx, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(tf.matmul(tf.reshape(
            x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, start+[nf])
        return c 
Example #24
Source File: snail_test.py    From tensor2robot with Apache License 2.0
def test_CausallyMaskedSoftmax(self):
    num_rows = 5
    x = tf.random.normal((num_rows, 3))
    logits = tf.matmul(x, tf.linalg.transpose(x))
    y = snail.CausallyMaskedSoftmax(logits)
    with self.test_session() as sess:
      y_ = sess.run(y)
      idx = np.triu_indices(num_rows, 1)
      np.testing.assert_array_equal(y_[idx], 0.)
      # Testing that each row sums to 1.
      for i in range(num_rows):
        np.testing.assert_almost_equal(np.sum(y_[i, :]), 1.0) 
Example #25
Source File: snail.py    From tensor2robot with Apache License 2.0
def AttentionBlock(x, key_size, value_size, scope = ""):
  """Self-attention key-value lookup, styled after Vaswani et al. '17.

  query and key are of shape [T, K]. query * transpose(key) yields logits of
  shape [T, T]. logits[i, j] corresponds to unnormalized attention vector over
  values [T, V] for each timestep i. Because this attention is over a set of
  temporal values, we causally mask the pre-softmax logits[i, j] := 0, for all
  j > i.

  Citations:
    Vaswani et al. '17: Attention is All you need
      https://arxiv.org/abs/1706.03762.

  Args:
    x: Input tensor of shape [batch, sequence_length, channels].
    key_size: Integer key dimensionality.
    value_size: Integer value dimensionality.
    scope: Variable scope for this layer.
  Returns:
    result: Tensor of shape [batch, sequence_length, channels + value_size]
    end_points: Dictionary of intermediate values (e.g. debugging).
  """
  end_points = {}
  with tf.variable_scope(scope):
    key = layers.fully_connected(x, key_size, activation_fn=None)  # [T, K]
    query = layers.fully_connected(x, key_size, activation_fn=None)  # [T, K]
    logits = tf.matmul(query, key, transpose_b=True)  # [T, T]
    # Useful for visualizing attention alignment matrices.
    probs = CausallyMaskedSoftmax(logits/np.sqrt(key_size))  # [T, T]
    end_points["attn_prob"] = probs
    values = layers.fully_connected(x, value_size, activation_fn=None)  # [T, V]
    read = tf.matmul(probs, values)  # [T, V]
    result = tf.concat([x, read], axis=2)  # [T, K + V]
    return result, end_points 
Example #26
Source File: rnn.py    From magenta with Apache License 2.0
def super_linear(x,
                 output_size,
                 scope=None,
                 reuse=False,
                 init_w='ortho',
                 weight_start=0.0,
                 use_bias=True,
                 bias_start=0.0,
                 input_size=None):
  """Performs linear operation. Uses ortho init defined earlier."""
  shape = x.get_shape().as_list()
  with tf.variable_scope(scope or 'linear'):
    if reuse:
      tf.get_variable_scope().reuse_variables()

    w_init = None  # uniform
    if input_size is None:
      x_size = shape[1]
    else:
      x_size = input_size
    if init_w == 'zeros':
      w_init = tf.constant_initializer(0.0)
    elif init_w == 'constant':
      w_init = tf.constant_initializer(weight_start)
    elif init_w == 'gaussian':
      w_init = tf.random_normal_initializer(stddev=weight_start)
    elif init_w == 'ortho':
      w_init = lstm_ortho_initializer(1.0)

    w = tf.get_variable(
        'super_linear_w', [x_size, output_size], tf.float32, initializer=w_init)
    if use_bias:
      b = tf.get_variable(
          'super_linear_b', [output_size],
          tf.float32,
          initializer=tf.constant_initializer(bias_start))
      return tf.matmul(x, w) + b
    return tf.matmul(x, w) 
Example #27
Source File: rnn.py    From magenta with Apache License 2.0
def __call__(self, x, state, scope=None):
    with tf.variable_scope(scope or type(self).__name__):
      c, h = tf.split(state, 2, 1)

      x_size = x.get_shape().as_list()[1]

      w_init = None  # uniform

      h_init = lstm_ortho_initializer(1.0)

      # Keep W_xh and W_hh separate here as well to use different init methods.
      w_xh = tf.get_variable(
          'W_xh', [x_size, 4 * self.num_units], initializer=w_init)
      w_hh = tf.get_variable(
          'W_hh', [self.num_units, 4 * self.num_units], initializer=h_init)
      bias = tf.get_variable(
          'bias', [4 * self.num_units],
          initializer=tf.constant_initializer(0.0))

      concat = tf.concat([x, h], 1)
      w_full = tf.concat([w_xh, w_hh], 0)
      hidden = tf.matmul(concat, w_full) + bias

      i, j, f, o = tf.split(hidden, 4, 1)

      if self.use_recurrent_dropout:
        g = tf.nn.dropout(tf.tanh(j), self.dropout_keep_prob)
      else:
        g = tf.tanh(j)

      new_c = c * tf.sigmoid(f + self.forget_bias) + tf.sigmoid(i) * g
      new_h = tf.tanh(new_c) * tf.sigmoid(o)

      return new_h, tf.concat([new_c, new_h], 1)  # fuk tuples. 
Example #28
Source File: learning.py    From magenta with Apache License 2.0
def gram_matrix(feature_maps):
  """Computes the Gram matrix for a set of feature maps."""
  batch_size, height, width, channels = tf.unstack(tf.shape(feature_maps))
  denominator = tf.to_float(height * width)
  feature_maps = tf.reshape(
      feature_maps, tf.stack([batch_size, height * width, channels]))
  matrix = tf.matmul(feature_maps, feature_maps, adjoint_a=True)
  return matrix / denominator 
Example #29
Source File: models.py    From graphics with Apache License 2.0
def _compute_sdf(self, x, translations, blend_terms, points):
    """Compute signed distances between query points and hyperplanes."""
    n_parts = tf.shape(x)[1]
    n_planes = tf.shape(x)[2]
    norm_logit = x[..., 0:self._dims - 1]
    offset = (-(tf.nn.sigmoid(x[..., self._dims - 1:self._dims]) *
                self._offset_scale + self._offset_lbound))
    blend_planes = (
        tf.nn.sigmoid(blend_terms[..., :n_parts]) * self._blend_scale +
        self._blend_lbound)

    # Norm of the boundary line
    norm_rad = tf.tanh(norm_logit) * np.pi  # [..., (azimuth, altitude)]
    if self._dims == 3:
      norm = tf.stack([
          tf.sin(norm_rad[..., 1]) * tf.cos(norm_rad[..., 0]),
          tf.sin(norm_rad[..., 1]) * tf.sin(norm_rad[..., 0]),
          tf.cos(norm_rad[..., 1])
      ],
                      axis=-1)
    else:
      norm = tf.concat([tf.cos(norm_rad), tf.sin(norm_rad)], axis=-1)

    # Calculate signed distances to hyperplanes.
    points = (
        tf.expand_dims(points, axis=1) - tf.expand_dims(translations, axis=2))
    points = tf.expand_dims(points, axis=2)
    points = tf.tile(points, [1, 1, n_planes, 1, 1])
    signed_dis = tf.matmul(points, tf.expand_dims(norm, axis=-1))
    signed_dis = signed_dis + tf.expand_dims(offset, axis=-2)

    return signed_dis, translations, blend_planes, offset 
Example #30
Source File: modeling.py    From albert with Apache License 2.0
def dense_layer_3d_proj(input_tensor,
                        hidden_size,
                        head_size,
                        initializer,
                        activation,
                        use_einsum,
                        name=None):
  """A dense layer with 3D kernel for projection.

  Args:
    input_tensor: float Tensor of shape [batch,from_seq_length,
      num_attention_heads, size_per_head].
    hidden_size: The size of hidden layer.
    head_size: The size of head.
    initializer: Kernel initializer.
    activation: Activation function.
    use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  input_shape = get_shape_list(input_tensor)
  num_attention_heads = input_shape[2]
  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[num_attention_heads * head_size, hidden_size],
        initializer=initializer)
    w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
    b = tf.get_variable(
        name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)
    if use_einsum:
      ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
    else:
      ret = einsum_via_matmul(input_tensor, w, 2)
    ret += b
  if activation is not None:
    return activation(ret)
  else:
    return ret