Python tensorflow.compat.v1.zeros_initializer() Examples
The following are 30 code examples of tensorflow.compat.v1.zeros_initializer(), drawn from open-source projects.
Each example notes the project and source file it was taken from.
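Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of these snippets share: passing tf.zeros_initializer() to tf.get_variable() so that a bias (or other variable) starts at zero. It assumes TensorFlow 2.x with the compat.v1 API; the variable name and shape below are only illustrative.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # run in v1-style graph mode under TF 2.x

# A bias vector created via the variable mechanism, starting at all zeros.
bias = tf.get_variable("bias", shape=[10], initializer=tf.zeros_initializer())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(bias))  # -> [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]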
Example #1
Source File: neural_stack.py From tensor2tensor with Apache License 2.0

def add_vector_projection(self, name, size):
  """A helper function for mapping embedding controller outputs.

  Args:
    name: A prefix for the variable names.
    size: The desired number of embedding outputs.

  Returns:
    A tuple of (weights, bias) where weights has shape
    [num_units, size * embedding_size] and bias has shape
    [size * embedding_size].
  """
  weights = self.add_variable(
      name + "_projection_weights",
      shape=[self._num_units, size * self._embedding_size],
      dtype=self.dtype)
  bias = self.add_variable(
      name + "_projection_bias",
      shape=[size * self._embedding_size],
      initializer=tf.zeros_initializer(dtype=self.dtype))
  return weights, bias
Example #2
Source File: export_checkpoints.py From albert with Apache License 2.0

def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(input_tensor, mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits
Example #3
Source File: transformer_glow_layers_ops.py From tensor2tensor with Apache License 2.0

def dense_weightnorm(
    name, x, n_out, x_mask, init_scale, init, dtype=tf.float32):
  """Dense layer with weight normalization."""
  n_in = common_layers.shape_list(x)[2]
  eps = tf.keras.backend.epsilon()
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    v = tf.get_variable(
        "v", [n_in, n_out], dtype,
        initializer=tf.random_normal_initializer(0, 0.05), trainable=True)
    v = v / tf.norm(v, axis=0, keepdims=True)
    t = tf.matmul(x, v)  # [B, L, n_out]
    mean, var = moments_over_bl(t, x_mask)
    g_init = init_scale / (tf.sqrt(var) + eps)
    g = get_variable_ddi(
        "g", [n_out], g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    b = get_variable_ddi(
        "b", [n_out], -mean * g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    w = g * v
    y = tf.matmul(x, w) + b
    tf.summary.histogram("_g", g)
    return y
Example #4
Source File: common_layers.py From tensor2tensor with Apache License 2.0

def zero_add(previous_value, x, name=None, reuse=None):
  """Resnet connection with zero initialization.

  Another type of resnet connection which returns previous_value + gamma * x.
  gamma is a trainable scalar and initialized with zero. It is useful when a
  module is plugged into a trained model and we want to make sure it matches
  the original model's performance.

  Args:
    previous_value: A tensor.
    x: A tensor.
    name: name of variable scope; defaults to zero_add.
    reuse: reuse scope.

  Returns:
    previous_value + gamma * x.
  """
  with tf.variable_scope(name, default_name="zero_add", reuse=reuse):
    gamma = tf.get_variable("gamma", (), initializer=tf.zeros_initializer())
    return previous_value + gamma * x
Example #5
Source File: export_checkpoints.py From albert with Apache License 2.0

def get_sentence_order_logits(input_tensor, albert_config):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    return logits
Example #6
Source File: common_layers.py From tensor2tensor with Apache License 2.0

def group_norm(x, filters=None, num_groups=8, epsilon=1e-5):
  """Group normalization as in https://arxiv.org/abs/1803.08494."""
  x_shape = shape_list(x)
  if filters is None:
    filters = x_shape[-1]
  assert len(x_shape) == 4
  assert filters % num_groups == 0
  # Prepare variables.
  scale = tf.get_variable(
      "group_norm_scale", [filters], initializer=tf.ones_initializer())
  bias = tf.get_variable(
      "group_norm_bias", [filters], initializer=tf.zeros_initializer())
  epsilon, scale, bias = [cast_like(t, x) for t in [epsilon, scale, bias]]
  # Reshape and compute group norm.
  x = tf.reshape(x, x_shape[:-1] + [num_groups, filters // num_groups])
  # Calculate mean and variance on heights, width, channels (not groups).
  mean, variance = tf.nn.moments(x, [1, 2, 4], keep_dims=True)
  norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
  return tf.reshape(norm_x, x_shape) * scale + bias
Example #7
Source File: ops_test.py From mesh with Apache License 2.0

def testVariableOperations(self):
  var = mtf.Variable(self.mesh, "test_variable", self.ab_shape,
                     mtf.VariableDType(tf.int32, tf.int32, tf.int32),
                     initializer=tf.zeros_initializer(),
                     trainable=True)
  self.assertEqual(var.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(var.unsplittable_dims, frozenset())

  read_variable = mtf.ReadVariable(var)
  self.assertEqual(read_variable.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(read_variable.unsplittable_dims, frozenset())

  assign = mtf.Assign([var], [self.x])
  self.assertEqual(assign.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(assign.unsplittable_dims, frozenset())

  depend = mtf.Depend(read_variable.outputs[0], [assign])
  self.assertEqual(depend.splittable_dims, frozenset(["a", "b"]))
  self.assertEqual(depend.unsplittable_dims, frozenset())
Example #8
Source File: transformer_glow_layers.py From tensor2tensor with Apache License 2.0

def actnorm(name, x, x_mask, inverse, init, logscale_factor=3.0):
  """Activation normalization, returns logabsdet of shape [B]."""
  eps = tf.keras.backend.epsilon()
  n_channels = common_layers.shape_list(x)[2]
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    x_mean, x_var = gops.moments_over_bl(x, x_mask)
    b = gops.get_variable_ddi(
        "b", (n_channels), -x_mean, init, tf.zeros_initializer)
    log_w_init = -0.5 * tf.log(x_var + eps) / logscale_factor
    log_w = gops.get_variable_ddi(
        "log_w", (n_channels), log_w_init, init,
        tf.zeros_initializer) * logscale_factor

    if not inverse:
      x = (x + b) * tf.exp(log_w)
    else:
      x = x * tf.exp(-log_w) - b

    x_length = tf.reduce_sum(x_mask, -1)
    logabsdet = x_length * tf.reduce_sum(log_w)
    if inverse:
      logabsdet *= -1
    return x, logabsdet
Example #9
Source File: common_layers.py From language with Apache License 2.0

def linear_transform(x, output_size, scope, bias=False, input_size=None):
  """Simple linear transform of x.

  Args:
    x: <float>[batch_size, length, input_size]
    output_size: Integer specifying output size.
    scope: String name for variable scope.
    bias: If True, adds a learned bias term.
    input_size: Explicitly specify input_size if not set as static shape.

  Returns:
    <float>[batch_size, length, output_size]
  """
  input_size = input_size or x.get_shape()[-1]
  with tf.variable_scope(scope):
    batch_size = tf.shape(x)[0]
    weights = tf.get_variable("weights", shape=(input_size, output_size))
    weights = tf.expand_dims(weights, 0)
    weights = tf.tile(weights, [batch_size, 1, 1])
    x = tf.matmul(x, weights)
    if bias:
      bias = tf.get_variable(
          "bias", shape=(output_size), initializer=tf.zeros_initializer())
      x += bias
    return x
Example #10
Source File: common_layers.py From language with Apache License 2.0

def apply_norm(x, epsilon=1e-6):
  """Applies layer normalization to x.

  Based on "Layer Normalization":
  https://arxiv.org/abs/1607.06450

  Args:
    x: <float>[..., input_size]
    epsilon: Used to avoid division by 0.

  Returns:
    <float>[..., input_size]
  """
  input_size = x.get_shape()[-1]
  with tf.variable_scope("layer_norm", values=[x]):
    scale = tf.get_variable(
        "layer_norm_scale", [input_size], initializer=tf.ones_initializer())
    bias = tf.get_variable(
        "layer_norm_bias", [input_size], initializer=tf.zeros_initializer())
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    result = norm_x * scale + bias
    return result
Example #11
Source File: glow_ops.py From tensor2tensor with Apache License 2.0

def scale_gaussian_prior(name, z, logscale_factor=3.0, trainable=True):
  """Returns N(s^i * z^i, std^i) where s^i and std^i are pre-component.

  s^i is a learnable parameter with identity initialization.
  std^i is optionally learnable with identity initialization.

  Args:
    name: variable scope.
    z: input_tensor
    logscale_factor: equivalent to scaling up the learning_rate by a factor
                     of logscale_factor.
    trainable: Whether or not std^i is learnt.
  """
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    z_shape = common_layers.shape_list(z)
    latent_multiplier = tf.get_variable(
        "latent_multiplier", shape=z_shape, dtype=tf.float32,
        initializer=tf.ones_initializer())
    log_scale = tf.get_variable(
        "log_scale_latent", shape=z_shape, dtype=tf.float32,
        initializer=tf.zeros_initializer(), trainable=trainable)
    log_scale = log_scale * logscale_factor
    return tfp.distributions.Normal(
        loc=latent_multiplier * z, scale=tf.exp(log_scale))
Example #12
Source File: neural_stack.py From tensor2tensor with Apache License 2.0

def add_scalar_projection(self, name, size):
  """A helper function for mapping scalar controller outputs.

  Args:
    name: A prefix for the variable names.
    size: The desired number of scalar outputs.

  Returns:
    A tuple of (weights, bias) where weights has shape [num_units, size]
    and bias has shape [size].
  """
  weights = self.add_variable(
      name + "_projection_weights",
      shape=[self._num_units, size],
      dtype=self.dtype)
  bias = self.add_variable(
      name + "_projection_bias",
      shape=[size],
      initializer=tf.zeros_initializer(dtype=self.dtype))
  return weights, bias
Example #13
Source File: export_to_tfhub.py From albert with Apache License 2.0

def get_mlm_logits(model, albert_config, mlm_positions):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, model.get_embedding_table(), transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits
Example #14
Source File: mnist_benchmark.py From autograph with Apache License 2.0

def get_data_and_params():
  """Set up input dataset and variables."""
  (train_x, train_y), _ = tf.keras.datasets.mnist.load_data()
  tf.set_random_seed(0)
  hparams = contrib_training.HParams(
      batch_size=200,
      learning_rate=0.1,
      train_steps=101,
  )
  dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
  dataset = dataset.repeat()
  dataset = dataset.shuffle(hparams.batch_size * 10)
  dataset = dataset.batch(hparams.batch_size)

  def reshape_ex(x, y):
    return (tf.to_float(tf.reshape(x, (-1, 28 * 28))) / 256.0,
            tf.one_hot(tf.squeeze(y), 10))

  dataset = dataset.map(reshape_ex)
  w = tf.get_variable('w0', (28 * 28, 10))
  b = tf.get_variable('b0', (10,), initializer=tf.zeros_initializer())
  opt = tf.train.GradientDescentOptimizer(hparams.learning_rate)
  return dataset, opt, hparams, w, b
Example #15
Source File: slate_decomp_q_agent.py From recsim with Apache License 2.0

def _build_select_slate_op(self):
  p_no_click = self._prob_no_click_ph
  p = self._doc_affinity_scores_ph
  q = self._net_outputs.q_values[0]
  with tf.name_scope('select_slate'):
    self._output_slate = self._select_slate_fn(self._slate_size, p_no_click,
                                               p, q)

    self._output_slate = tf.Print(
        self._output_slate, [tf.constant('cp 1'), self._output_slate, p, q],
        summarize=10000)
    self._output_slate = tf.reshape(self._output_slate, (self._slate_size,))

    self._action_counts = tf.get_variable(
        'action_counts',
        shape=[self._num_candidates],
        initializer=tf.zeros_initializer())
    output_slate = tf.reshape(self._output_slate, [-1])
    output_one_hot = tf.one_hot(output_slate, self._num_candidates)
    update_ops = []
    for i in range(self._slate_size):
      update_ops.append(tf.assign_add(self._action_counts, output_one_hot[i]))

    self._select_action_update_op = tf.group(*update_ops)
Example #16
Source File: bert_as_summarizer.py From DeepPavlov with Apache License 2.0

def _init_graph(self):
    self._init_placeholders()

    self.bert = BertModel(config=self.bert_config,
                          is_training=self.is_train_ph,
                          input_ids=self.input_ids_ph,
                          input_mask=self.input_masks_ph,
                          token_type_ids=self.token_types_ph,
                          use_one_hot_embeddings=False,
                          )

    # next sentence prediction head
    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, self.bert_config.hidden_size],
            initializer=create_initializer(self.bert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        nsp_logits = tf.matmul(self.bert.get_pooled_output(), output_weights,
                               transpose_b=True)
        nsp_logits = tf.nn.bias_add(nsp_logits, output_bias)
        self.nsp_probs = tf.nn.softmax(nsp_logits, axis=-1)
Example #17
Source File: tiled_linear.py From lamb with Apache License 2.0

def _build_tiled_linear(self, inputs, input_name_and_sizes,
                        output_name_and_sizes, add_bias):
  results = []
  for output_name, output_size in output_name_and_sizes:
    r = 0.0
    for input_, (input_name, input_size) in zip(inputs, input_name_and_sizes):
      name = 'W_{}_{}'.format(input_name, output_name)
      weight = self._get_variable(
          name, shape=[output_size, input_size])
      r += tf.sparse_tensor_dense_matmul(weight, input_, adjoint_b=True)
    r = tf.transpose(r)
    if add_bias:
      # Biases are dense, hence we call _get_variable of the base
      # class.
      r += super(SparseTiledLinear, self)._get_variable(
          'B_{}'.format(output_name), shape=[output_size],
          default_initializer=tf.zeros_initializer())
    results.append(r)
  return results

# TODO(melisgl): Since computation is the same as in TiledLinear,
# perhaps this should be implemented as a custom getter (see
# tf.get_variable) instead of being tied to tiling.
Example #18
Source File: run_pretraining.py From albert with Apache License 2.0

def get_sentence_order_output(albert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
Example #19
Source File: utils.py From lamb with Apache License 2.0

def layer_norm(x, reduction_indices, epsilon=1e-9, gain=None, bias=None,
               per_element=True, scope=None):
  """DOC."""
  reduction_indices = ensure_list(reduction_indices)
  mean = tf.reduce_mean(x, reduction_indices, keep_dims=True)
  variance = tf.reduce_mean(tf.squared_difference(x, mean),
                            reduction_indices, keep_dims=True)
  normalized = (x - mean) / tf.sqrt(variance + epsilon)
  dtype = x.dtype
  shape = x.get_shape().as_list()
  for i in six.moves.range(len(shape)):
    if i not in reduction_indices or not per_element:
      shape[i] = 1
  with tf.variable_scope(scope or 'layer_norm'):
    if gain is None:
      gain = tf.get_variable('gain', shape=shape, dtype=dtype,
                             initializer=tf.ones_initializer())
    if bias is None:
      bias = tf.get_variable('bias', shape=shape, dtype=dtype,
                             initializer=tf.zeros_initializer())
    return gain*normalized+bias
Example #20
Source File: averaged.py From lamb with Apache License 2.0

def __init__(self, tensors):
  tensors = list(tensors)
  with tf.variable_scope('averaged'):
    self._num_samples = tf.Variable(0, name='num_samples', trainable=False)
    with tf.variable_scope('avg'):
      self._averages = [
          tf.get_variable(
              tensor.name.replace('/', '-').replace(':', '-'),
              tensor.get_shape(), initializer=tf.zeros_initializer(),
              trainable=False)
          for tensor in tensors]
    with tf.variable_scope('save'):
      self._saves = [
          tf.get_variable(
              tensor.name.replace('/', '-').replace(':', '-'),
              tensor.get_shape(), initializer=tf.zeros_initializer(),
              trainable=False)
          for tensor in tensors]
  self._tensors = tensors
  self._take_sample = self._make_take_sample()
  self._switch = self._make_swith_to_average()
  self._restore = self._make_restore()
  self._reset = self._make_reset()
Example #21
Source File: model_fns.py From language with Apache License 2.0

def _get_bert_embeddings(model, layers_to_use, aggregation_fn, name="bert"):
  """Extract embeddings from BERT model."""
  all_hidden = model.get_all_encoder_layers()
  layers_hidden = [all_hidden[i] for i in layers_to_use]
  hidden_shapes = [
      modeling.get_shape_list(hid, expected_rank=3) for hid in all_hidden
  ]

  if len(layers_hidden) == 1:
    hidden_emb = layers_hidden[0]
    hidden_size = hidden_shapes[0][2]
  elif aggregation_fn == "concat":
    hidden_emb = tf.concat(layers_hidden, 2)
    hidden_size = sum([hidden_shapes[i][2] for i in layers_to_use])
  elif aggregation_fn == "average":
    hidden_size = hidden_shapes[0][2]
    assert all([shape[2] == hidden_size for shape in hidden_shapes
               ]), hidden_shapes
    hidden_emb = tf.add_n(layers_hidden) / len(layers_hidden)
  elif aggregation_fn == "attention":
    hidden_size = hidden_shapes[0][2]
    mixing_weights = tf.get_variable(
        name + "/mixing/weights", [len(layers_hidden)],
        initializer=tf.zeros_initializer())
    mixing_scores = tf.nn.softmax(mixing_weights)
    hidden_emb = tf.tensordot(
        tf.stack(layers_hidden, axis=-1), mixing_scores, [[-1], [0]])
  else:
    raise ValueError("Unrecognized aggregation function %s." % aggregation_fn)

  return hidden_emb, hidden_size
Example #22
Source File: run_dualencoder_qa.py From language with Apache License 2.0

def _get_bert_embeddings(model, layers_to_use, aggregation_fn, name="bert"):
  """Extract embeddings from BERT model."""
  all_hidden = model.get_all_encoder_layers()
  layers_hidden = [all_hidden[i] for i in layers_to_use]
  hidden_shapes = [
      modeling.get_shape_list(hid, expected_rank=3) for hid in all_hidden
  ]

  if len(layers_hidden) == 1:
    hidden_emb = layers_hidden[0]
    hidden_size = hidden_shapes[0][2]
  elif aggregation_fn == "concat":
    hidden_emb = tf.concat(layers_hidden, 2)
    hidden_size = sum([hidden_shapes[i][2] for i in layers_to_use])
  elif aggregation_fn == "average":
    hidden_size = hidden_shapes[0][2]
    assert all([shape[2] == hidden_size for shape in hidden_shapes
               ]), hidden_shapes
    hidden_emb = tf.add_n(layers_hidden) / len(layers_hidden)
  elif aggregation_fn == "attention":
    hidden_size = hidden_shapes[0][2]
    mixing_weights = tf.get_variable(
        name + "/mixing/weights", [len(layers_hidden)],
        initializer=tf.zeros_initializer())
    mixing_scores = tf.nn.softmax(mixing_weights)
    hidden_emb = tf.tensordot(
        tf.stack(layers_hidden, axis=-1), mixing_scores, [[-1], [0]])
  else:
    raise ValueError("Unrecognized aggregation function %s." % aggregation_fn)

  return hidden_emb, hidden_size
Example #23
Source File: modeling.py From training with Apache License 2.0

def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   size_per_head,
                   initializer,
                   activation,
                   name=None):
  """A dense layer with 3D kernel.

  Args:
    input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
    num_attention_heads: Number of attention heads.
    size_per_head: The size per attention head.
    initializer: Kernel initializer.
    activation: Actication function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  last_dim = get_shape_list(input_tensor)[-1]

  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[last_dim, num_attention_heads * size_per_head],
        initializer=initializer)
    w = tf.reshape(w, [last_dim, num_attention_heads, size_per_head])
    b = tf.get_variable(
        name="bias",
        shape=[num_attention_heads * size_per_head],
        initializer=tf.zeros_initializer)
    b = tf.reshape(b, [num_attention_heads, size_per_head])
    ret = tf.einsum("abc,cde->abde", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    else:
      return ret
Example #24
Source File: ops.py From language with Apache License 2.0

def affine(x, output_size, weight_name, bias_name=None, weight_init=None):
  """Affine transformation of the input `x`.

  Args:
    x: <float32>[..., x_dim]
    output_size: size of the last output dimension
    weight_name: Name of the weight variable to use
    bias_name: Name for the bias variable, if one should be used
    weight_init: Initializer of the weight variable

  Returns:
    transformed <float32>[..., `output_size`]
  """
  dim = x.shape.as_list()[-1]
  w = tf.get_variable(
      weight_name, (dim, output_size), tf.float32, initializer=weight_init)
  out = tf.tensordot(x, w, [[len(x.shape) - 1], [0]])
  if bias_name:
    b = tf.get_variable(
        bias_name, (output_size,), tf.float32,
        initializer=tf.zeros_initializer())
    for _ in range(len(out.shape) - 1):
      b = tf.expand_dims(b, 0)
    out += b
  return out
Example #25
Source File: run_squad.py From language with Apache License 2.0

def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  final_hidden = model.get_sequence_output()

  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]
  hidden_size = final_hidden_shape[2]

  output_weights = tf.get_variable(
      "cls/squad/output_weights", [2, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

  final_hidden_matrix = tf.reshape(final_hidden,
                                   [batch_size * seq_length, hidden_size])
  logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
  logits = tf.nn.bias_add(logits, output_bias)

  logits = tf.reshape(logits, [batch_size, seq_length, 2])
  logits = tf.transpose(logits, [2, 0, 1])

  unstacked_logits = tf.unstack(logits, axis=0)

  (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

  return (start_logits, end_logits)
Example #26
Source File: bert.py From mesh with Apache License 2.0

def get_masked_lm_output(self, positions, label_ids, label_weights):
  """Get loss and logits for the masked LM."""
  input_tensor = self.get_sequence_output()
  output_weights = self.get_embedding_table()

  # [batch_size, num_position, hidden]
  input_tensor = mtf.gather(input_tensor, positions, self.seq_dim)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = mtf.layers.dense(
          input_tensor,
          reduced_dims=[self.model_dim],
          new_dims=[self.model_dim],
          activation=get_activation(self.config.feedforward_intermediate_act),
          kernel_initializer=self.dense_initializer,
          use_bias=self.config.use_bias)
      input_tensor = self.normalize(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = mtf.get_variable(
        input_tensor.mesh,
        name="output_bias",
        shape=[self.vocab_dim],
        initializer=tf.zeros_initializer())
    logits = mtf.einsum([input_tensor, output_weights],
                        reduced_dims=[self.model_dim]) + output_bias
    per_example_loss = mtf.layers.softmax_cross_entropy_with_logits(
        logits, label_ids, self.vocab_dim, z_loss=1e-4)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    numerator = mtf.reduce_sum(label_weights * per_example_loss)
    denominator = mtf.reduce_sum(label_weights) + mtf.constant(
        input_tensor.mesh, 1e-5, dtype=tf.float32)
    loss = numerator / denominator
    return (loss, per_example_loss, logits)
Example #27
Source File: layers.py From mesh with Apache License 2.0

def layer_norm(x, dim, epsilon=1e-6, name="layer_prepostprocess"):
  """Layer normalization over dimension dim.

  Args:
    x: a mtf.Tensor whose shape contains dim.
    dim: a mtf.Dimension
    epsilon: a floating point number
    name: a string used for tf.variable_scope.

  Returns:
    a mtf.Tensor with same shape as x.
  """
  with tf.variable_scope(name + "/layer_norm"):
    scale = mtf.get_variable(
        x.mesh,
        "layer_norm_scale",
        mtf.Shape([dim]),
        initializer=tf.ones_initializer(),
        activation_dtype=x.dtype)
    bias = mtf.get_variable(
        x.mesh,
        "layer_norm_bias",
        mtf.Shape([dim]),
        initializer=tf.zeros_initializer(),
        activation_dtype=x.dtype)
    reduced_shape = x.shape - dim
    mean = mtf.reduce_mean(x, output_shape=reduced_shape)
    variance = mtf.reduce_mean(
        mtf.square(x - mean), output_shape=reduced_shape)
    norm_x = (x - mean) * mtf.rsqrt(variance + epsilon)
    return norm_x * scale + bias
Example #28
Source File: optimize.py From mesh with Apache License 2.0

def apply_grad(self, grad, var):
  if grad is None:
    tf.logging.warning("Gradient is None for variable %s" % var.name)
    return []

  updates = []
  v = mtf.get_variable(
      var.mesh, var.name + "_momentum_v", var.shape,
      dtype=var.dtype, initializer=tf.zeros_initializer(), trainable=False)

  with tf.variable_scope(var.name + "/sgd_momentum"):
    updates.append(mtf.assign(v, grad * self.lr + v * self.momentum))
    updates.append(mtf.assign_sub(var, v))
  return updates
Example #29
Source File: model.py From interval-bound-propagation with Apache License 2.0

def _create_linear_initializer(input_size, output_size, dtype=tf.float32):  # pylint: disable=unused-argument
  """Returns a default initializer for the weights of a linear module."""
  return {
      'w': tf.orthogonal_initializer(),
      'b': tf.zeros_initializer(dtype=dtype),
  }
Example #30
Source File: convnet_builder.py From benchmarks with Apache License 2.0

def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
  """Batch normalization on `input_layer` without tf.layers."""
  # We make this function as similar as possible to the
  # tf.contrib.layers.batch_norm, to minimize the differences between using
  # layers and not using layers.
  shape = input_layer.shape
  num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
  beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                           initializer=tf.zeros_initializer())
  if use_scale:
    gamma = self.get_variable('gamma', [num_channels], tf.float32,
                              tf.float32, initializer=tf.ones_initializer())
  else:
    gamma = tf.constant(1.0, tf.float32, [num_channels])
  # For moving variables, we use tf.get_variable instead of self.get_variable,
  # since self.get_variable returns the result of tf.cast which we cannot
  # assign to.
  moving_mean = tf.get_variable('moving_mean', [num_channels],
                                tf.float32,
                                initializer=tf.zeros_initializer(),
                                trainable=False)
  moving_variance = tf.get_variable('moving_variance', [num_channels],
                                    tf.float32,
                                    initializer=tf.ones_initializer(),
                                    trainable=False)
  if self.phase_train:
    bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
        input_layer, gamma, beta, epsilon=epsilon,
        data_format=self.data_format, is_training=True)
    mean_update = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay=decay, zero_debias=False)
    variance_update = moving_averages.assign_moving_average(
        moving_variance, batch_variance, decay=decay, zero_debias=False)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
  else:
    bn, _, _ = tf.nn.fused_batch_norm(
        input_layer, gamma, beta, mean=moving_mean,
        variance=moving_variance, epsilon=epsilon,
        data_format=self.data_format, is_training=False)
  return bn