Python Examples of tensor2tensor.models.transformer.transformer_ffn

Source File: universal_transformer_util.py From fine-lm with MIT License

5 votes

def transformer_decoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None):
  """Applies a feed-forward function which is parametrised for decoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.

  Returns:
    the output tensor

  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          conv_padding="LEFT",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="LEFT",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: universal_transformer_util.py From tensor2tensor with Apache License 2.0

5 votes

def transformer_decoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None):
  """Applies a feed-forward function which is parametrised for decoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.

  Returns:
    the output tensor

  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          conv_padding="LEFT",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="LEFT",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: universal_transformer_util.py From BERT with Apache License 2.0

5 votes

def transformer_decoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None):
  """Applies a feed-forward function which is parametrised for decoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.

  Returns:
    the output tensor

  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          conv_padding="LEFT",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="LEFT",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: universal_transformer_util.py From training_results_v0.5 with Apache License 2.0

5 votes

def transformer_decoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None):
  """Applies a feed-forward function which is parametrised for decoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.

  Returns:
    the output tensor

  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          conv_padding="LEFT",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="LEFT",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: universal_transformer_modified_utils.py From Graph-Transformer with Apache License 2.0

5 votes

def transformer_decoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None):
  """Applies a feed-forward function which is parametrised for decoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.

  Returns:
    the output tensor

  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          conv_padding="LEFT",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="LEFT",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: transformer_revnet.py From fine-lm with MIT License

4 votes

def transformer_revnet_encoder(encoder_input,
                               encoder_self_attention_bias,
                               hparams,
                               name="encoder"):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensors
  """

  def f(x, side_input):
    """f(x) for reversible layer, self-attention layer."""
    encoder_self_attention_bias = side_input[0]

    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("self_attention"):
      y = common_attention.multihead_attention(
          common_layers.layer_preprocess(
              x, hparams), None, encoder_self_attention_bias,
          hparams.attention_key_channels or hparams.hidden_size,
          hparams.attention_value_channels or hparams.hidden_size,
          hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  def g(x):
    """g(x) for reversible layer, feed-forward layer."""
    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("ffn"):
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams), hparams)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  x1, x2 = tf.split(encoder_input, 2, axis=-1)

  with tf.variable_scope(name):
    y1, y2 = rev_block.rev_block(
        x1,
        x2,
        f,
        g,
        num_layers=hparams.num_hidden_layers,
        f_side_input=[encoder_self_attention_bias],
        is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN)
    y = tf.concat([y1, y2], axis=-1)

  return common_layers.layer_preprocess(y, hparams)

Source File: universal_transformer_util.py From fine-lm with MIT License

4 votes

def transformer_encoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None,
                                 pad_remover=None):
  """Applies a feed-forward function which is parametrised for encoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.
    pad_remover: to mask out padding in convolutional layers (efficiency).

  Returns:
    the output tensor
  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          pad_remover,
          conv_padding="SAME",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      assert nonpadding_mask is not None, (
          "The nonpadding_mask should be provided, otherwise the model uses "
          "the leaked padding information to estimate the length!")
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="SAME",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: transformer_revnet.py From tensor2tensor with Apache License 2.0

4 votes

def transformer_revnet_encoder(encoder_input,
                               encoder_self_attention_bias,
                               hparams,
                               name="encoder"):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensors
  """

  def f(x, side_input):
    """f(x) for reversible layer, self-attention layer."""
    encoder_self_attention_bias = side_input[0]

    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("self_attention"):
      y = common_attention.multihead_attention(
          common_layers.layer_preprocess(
              x, hparams), None, encoder_self_attention_bias,
          hparams.attention_key_channels or hparams.hidden_size,
          hparams.attention_value_channels or hparams.hidden_size,
          hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  def g(x):
    """g(x) for reversible layer, feed-forward layer."""
    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("ffn"):
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams), hparams)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  x1, x2 = tf.split(encoder_input, 2, axis=-1)

  with tf.variable_scope(name):
    y1, y2 = contrib.layers().rev_block(
        x1,
        x2,
        f,
        g,
        num_layers=hparams.num_hidden_layers,
        f_side_input=[encoder_self_attention_bias],
        is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN)
    y = tf.concat([y1, y2], axis=-1)

  return common_layers.layer_preprocess(y, hparams)

Source File: universal_transformer_util.py From tensor2tensor with Apache License 2.0

4 votes

def transformer_encoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None,
                                 pad_remover=None):
  """Applies a feed-forward function which is parametrised for encoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.
    pad_remover: to mask out padding in convolutional layers (efficiency).

  Returns:
    the output tensor
  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          pad_remover,
          conv_padding="SAME",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      assert nonpadding_mask is not None, (
          "The nonpadding_mask should be provided, otherwise the model uses "
          "the leaked padding information to estimate the length!")
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="SAME",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: transformer_revnet.py From BERT with Apache License 2.0

4 votes

def transformer_revnet_encoder(encoder_input,
                               encoder_self_attention_bias,
                               hparams,
                               name="encoder"):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensors
  """

  def f(x, side_input):
    """f(x) for reversible layer, self-attention layer."""
    encoder_self_attention_bias = side_input[0]

    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("self_attention"):
      y = common_attention.multihead_attention(
          common_layers.layer_preprocess(
              x, hparams), None, encoder_self_attention_bias,
          hparams.attention_key_channels or hparams.hidden_size,
          hparams.attention_value_channels or hparams.hidden_size,
          hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  def g(x):
    """g(x) for reversible layer, feed-forward layer."""
    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("ffn"):
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams), hparams)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  x1, x2 = tf.split(encoder_input, 2, axis=-1)

  with tf.variable_scope(name):
    y1, y2 = tf.contrib.layers.rev_block(
        x1,
        x2,
        f,
        g,
        num_layers=hparams.num_hidden_layers,
        f_side_input=[encoder_self_attention_bias],
        is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN)
    y = tf.concat([y1, y2], axis=-1)

  return common_layers.layer_preprocess(y, hparams)

Source File: universal_transformer_util.py From BERT with Apache License 2.0

4 votes

def transformer_encoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None,
                                 pad_remover=None):
  """Applies a feed-forward function which is parametrised for encoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.
    pad_remover: to mask out padding in convolutional layers (efficiency).

  Returns:
    the output tensor
  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          pad_remover,
          conv_padding="SAME",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      assert nonpadding_mask is not None, (
          "The nonpadding_mask should be provided, otherwise the model uses "
          "the leaked padding information to estimate the length!")
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="SAME",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: transformer_revnet.py From training_results_v0.5 with Apache License 2.0

4 votes

def transformer_revnet_encoder(encoder_input,
                               encoder_self_attention_bias,
                               hparams,
                               name="encoder"):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensors
  """

  def f(x, side_input):
    """f(x) for reversible layer, self-attention layer."""
    encoder_self_attention_bias = side_input[0]

    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("self_attention"):
      y = common_attention.multihead_attention(
          common_layers.layer_preprocess(
              x, hparams), None, encoder_self_attention_bias,
          hparams.attention_key_channels or hparams.hidden_size,
          hparams.attention_value_channels or hparams.hidden_size,
          hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  def g(x):
    """g(x) for reversible layer, feed-forward layer."""
    old_hid_size = hparams.hidden_size
    hparams.hidden_size = old_hid_size // 2

    with tf.variable_scope("ffn"):
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams), hparams)
      y = common_layers.layer_postprocess(x, y, hparams)
    hparams.hidden_size = old_hid_size
    return y

  x1, x2 = tf.split(encoder_input, 2, axis=-1)

  with tf.variable_scope(name):
    y1, y2 = tf.contrib.layers.rev_block(
        x1,
        x2,
        f,
        g,
        num_layers=hparams.num_hidden_layers,
        f_side_input=[encoder_self_attention_bias],
        is_training=hparams.mode == tf.estimator.ModeKeys.TRAIN)
    y = tf.concat([y1, y2], axis=-1)

  return common_layers.layer_preprocess(y, hparams)

Source File: universal_transformer_util.py From training_results_v0.5 with Apache License 2.0

4 votes

def transformer_encoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None,
                                 pad_remover=None):
  """Applies a feed-forward function which is parametrised for encoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.
    pad_remover: to mask out padding in convolutional layers (efficiency).

  Returns:
    the output tensor
  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          pad_remover,
          conv_padding="SAME",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      assert nonpadding_mask is not None, (
          "The nonpadding_mask should be provided, otherwise the model uses "
          "the leaked padding information to estimate the length!")
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="SAME",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Source File: universal_transformer_modified_utils.py From Graph-Transformer with Apache License 2.0

4 votes

def transformer_encoder_ffn_unit(x,
                                 hparams,
                                 nonpadding_mask=None,
                                 pad_remover=None):
  """Applies a feed-forward function which is parametrised for encoding.

  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convoltutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.
    pad_remover: to mask out padding in convolutional layers (efficiency).

  Returns:
    the output tensor
  """

  with tf.variable_scope("ffn"):
    if hparams.transformer_ffn_type == "fc":
      y = transformer.transformer_ffn_layer(
          common_layers.layer_preprocess(x, hparams),
          hparams,
          pad_remover,
          conv_padding="SAME",
          nonpadding_mask=nonpadding_mask)

    if hparams.transformer_ffn_type == "sepconv":
      assert nonpadding_mask is not None, (
          "The nonpadding_mask should be provided, otherwise the model uses "
          "the leaked padding information to estimate the length!")
      y = common_layers.sepconv_relu_sepconv(
          common_layers.layer_preprocess(x, hparams),
          filter_size=hparams.filter_size,
          output_size=hparams.hidden_size,
          first_kernel_size=(3, 1),
          second_kernel_size=(5, 1),
          padding="SAME",
          nonpadding_mask=nonpadding_mask,
          dropout=hparams.relu_dropout)

    x = common_layers.layer_postprocess(x, y, hparams)

  return x

Python tensor2tensor.models.transformer.transformer_ffn_layer() Examples