Python tensor2tensor.models.transformer.transformer_prepare_decoder() Examples

The following are 17 code examples of tensor2tensor.models.transformer.transformer_prepare_decoder(), drawn from open-source projects. Each example is labeled with its source file, the project it comes from, and that project's license. You may also want to check out all available functions and classes of the module tensor2tensor.models.transformer.
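All of the examples below follow the same basic pattern: flatten the 4-D targets tensor to 3-D, call transformer_prepare_decoder() to obtain the right-shifted decoder input together with the decoder self-attention bias, apply dropout, and feed the result to a decoder. The following minimal sketch distills that pattern; it assumes a TF1-style graph, the stock transformer_base() hyperparameter set, and a dummy targets tensor, so it is an illustrative summary rather than code taken from any of the projects below.

import tensorflow as tf

from tensor2tensor.layers import common_layers
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()

# tensor2tensor passes targets around as a 4-D [batch, length, 1, hidden_size]
# tensor; a zero tensor stands in for real embedded targets here.
targets = tf.zeros([8, 16, 1, hparams.hidden_size])
targets = common_layers.flatten4d3d(targets)

# Returns the right-shifted decoder input (with the timing signal added when
# hparams.pos == "timing") and the causal self-attention bias.
decoder_input, decoder_self_attention_bias = (
    transformer.transformer_prepare_decoder(targets, hparams, features=None))

decoder_input = tf.nn.dropout(decoder_input,
                              1.0 - hparams.layer_prepostprocess_dropout)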
Example #1
Source File: transformer_nat.py    From fine-lm with MIT License
def decode_transformer(encoder_output, encoder_decoder_attention_bias, targets,
                       hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output 
Example #2
Source File: transformer_nat.py    From tensor2tensor with Apache License 2.0
def decode_transformer(encoder_output, encoder_decoder_attention_bias, targets,
                       hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output 
Example #3
Source File: transformer_nat.py    From training_results_v0.5 with Apache License 2.0
def decode_transformer(encoder_output, encoder_decoder_attention_bias, targets,
                       hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output 
Example #4
Source File: transformer_nat.py    From BERT with Apache License 2.0
def decode_transformer(encoder_output, encoder_decoder_attention_bias, targets,
                       hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output 
Example #5
Source File: transformer_moe.py    From fine-lm with MIT License
def _prepare_decoder(self, targets):
    """Process the transformer decoder input."""
    targets = common_layers.flatten4d3d(targets)

    output = transformer.transformer_prepare_decoder(
        targets, self._hparams, features=None,
    )
    deco_input, deco_self_attention_bias = output

    deco_input = tf.nn.dropout(
        deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
    )
    return deco_input, deco_self_attention_bias 
Example #6
Source File: transformer_revnet.py    From fine-lm with MIT License
def body(self, features):
    hparams = self._hparams
    targets = features["targets"]
    inputs = features["inputs"]
    target_space = features["target_space_id"]

    inputs = common_layers.flatten4d3d(inputs)
    targets = common_layers.flatten4d3d(targets)

    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
         inputs, target_space, hparams))
    (decoder_input,
     decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
         targets, hparams)

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    encoder_output = transformer_revnet_encoder(
        encoder_input, encoder_self_attention_bias, hparams)

    decoder_output = transformer_revnet_decoder(
        decoder_input, encoder_output, decoder_self_attention_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, 2)

    return decoder_output 
Example #7
Source File: transformer_revnet.py    From tensor2tensor with Apache License 2.0
def body(self, features):
    hparams = self._hparams
    targets = features["targets"]
    inputs = features["inputs"]
    target_space = features["target_space_id"]

    inputs = common_layers.flatten4d3d(inputs)
    targets = common_layers.flatten4d3d(targets)

    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
         inputs, target_space, hparams))
    (decoder_input,
     decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
         targets, hparams)

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    encoder_output = transformer_revnet_encoder(
        encoder_input, encoder_self_attention_bias, hparams)

    decoder_output = transformer_revnet_decoder(
        decoder_input, encoder_output, decoder_self_attention_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, 2)

    return decoder_output 
Example #8
Source File: transformer_moe.py    From tensor2tensor with Apache License 2.0
def _prepare_decoder(self, targets):
    """Process the transformer decoder input."""
    targets = common_layers.flatten4d3d(targets)

    output = transformer.transformer_prepare_decoder(
        targets, self._hparams, features=None,
    )
    deco_input, deco_self_attention_bias = output

    deco_input = tf.nn.dropout(
        deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
    )
    return deco_input, deco_self_attention_bias 
Example #9
Source File: transformer_moe.py    From training_results_v0.5 with Apache License 2.0
def _prepare_decoder(self, targets):
    """Process the transformer decoder input."""
    targets = common_layers.flatten4d3d(targets)

    output = transformer.transformer_prepare_decoder(
        targets, self._hparams, features=None,
    )
    deco_input, deco_self_attention_bias = output

    deco_input = tf.nn.dropout(
        deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
    )
    return deco_input, deco_self_attention_bias 
Example #10
Source File: transformer_revnet.py    From BERT with Apache License 2.0
def body(self, features):
    hparams = self._hparams
    targets = features["targets"]
    inputs = features["inputs"]
    target_space = features["target_space_id"]

    inputs = common_layers.flatten4d3d(inputs)
    targets = common_layers.flatten4d3d(targets)

    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
         inputs, target_space, hparams))
    (decoder_input,
     decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
         targets, hparams)

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    encoder_output = transformer_revnet_encoder(
        encoder_input, encoder_self_attention_bias, hparams)

    decoder_output = transformer_revnet_decoder(
        decoder_input, encoder_output, decoder_self_attention_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, 2)

    return decoder_output 
Example #11
Source File: transformer_moe.py    From BERT with Apache License 2.0
def _prepare_decoder(self, targets):
    """Process the transformer decoder input."""
    targets = common_layers.flatten4d3d(targets)

    output = transformer.transformer_prepare_decoder(
        targets, self._hparams, features=None,
    )
    deco_input, deco_self_attention_bias = output

    deco_input = tf.nn.dropout(
        deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
    )
    return deco_input, deco_self_attention_bias 
Example #12
Source File: transformer_revnet.py    From training_results_v0.5 with Apache License 2.0
def body(self, features):
    hparams = self._hparams
    targets = features["targets"]
    inputs = features["inputs"]
    target_space = features["target_space_id"]

    inputs = common_layers.flatten4d3d(inputs)
    targets = common_layers.flatten4d3d(targets)

    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
         inputs, target_space, hparams))
    (decoder_input,
     decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
         targets, hparams)

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    encoder_output = transformer_revnet_encoder(
        encoder_input, encoder_self_attention_bias, hparams)

    decoder_output = transformer_revnet_decoder(
        decoder_input, encoder_output, decoder_self_attention_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, 2)

    return decoder_output 
Example #13
Source File: transformer_moe.py    From training_results_v0.5 with Apache License 2.0
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True
  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layer will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")

  return hparams 
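The three hyperparameters registered just above ("pos", "proximity_bias", and "causal_decoder_self_attention") are the ones transformer_prepare_decoder() reads when it builds the decoder input and the self-attention bias. As a small hedged sketch (the variant name below is hypothetical, not part of tensor2tensor), they can be overridden in a derived hyperparameter set:

def transformer_moe_base_non_causal():
  """Hypothetical variant: no timing signal, non-causal decoder self-attention."""
  hparams = transformer_moe_base()
  hparams.pos = "none"                           # skip the positional timing signal
  hparams.proximity_bias = True                  # bias self-attention toward nearby positions
  hparams.causal_decoder_self_attention = False  # allow full (non-causal) self-attention
  return hparams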
Example #14
Source File: transformer_moe.py    From BERT with Apache License 2.0
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True
  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layer will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")

  return hparams 
Example #15
Source File: transformer.py    From BERT with Apache License 2.0
def __init__(self, features_info=None, input_names=None, target_names=None,
               hidden_size=512, filter_size=2048):
    super(Transformer, self).__init__()
    # TODO(lukaszkaiser): gin'ify and split into encoder/decoder classes.
    self._has_input = True if input_names else False
    self._input_name = input_names[0]
    self._target_name = target_names[0]
    try:
      target_vocab_size = features_info[self._target_name].num_classes
    except AttributeError:
      target_vocab_size = features_info[self._target_name].encoder.vocab_size
    hparams = transformer.transformer_base()
    hparams.hidden_size = hidden_size
    hparams.filter_size = filter_size

    # Now the model.
    self._embedding = tf.keras.layers.Embedding(
        target_vocab_size, hidden_size, mask_zero=True)
    def transformer_encoder(inputs, features):
      return transformer.transformer_encode(
          transformer_layers.transformer_encoder, inputs, None,
          hparams, features=features)

    def transformer_prepare_decoder(targets, features):
      return transformer.transformer_prepare_decoder(targets, hparams, features)

    def transformer_decoder(decoder_input, encoder_output,
                            encoder_decoder_attention_bias,
                            decoder_self_attention_bias,
                            features):
      return transformer.transformer_decode(
          transformer.transformer_decoder,
          decoder_input,
          encoder_output,
          encoder_decoder_attention_bias,
          decoder_self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "targets"))

    if self._has_input:
      self._encoder = keras_utils.FunctionLayer(transformer_encoder)
    self._prepare_decoder = keras_utils.FunctionLayer(
        transformer_prepare_decoder)
    self._decoder = keras_utils.FunctionLayer(transformer_decoder)
    self._logits = tf.keras.layers.Dense(
        target_vocab_size, activation=None) 
Example #16
Source File: transformer_moe.py    From tensor2tensor with Apache License 2.0
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True
  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layer will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")

  return hparams 
Example #17
Source File: transformer_moe.py    From fine-lm with MIT License
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True
  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layer will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")

  return hparams