Python tensor2tensor.models.transformer.Transformer() Examples

The following are 30 code examples of tensor2tensor.models.transformer.Transformer(), drawn from the open-source projects and source files named above each example. You may also want to check out all available functions and classes of the module tensor2tensor.models.transformer, or try the search function.
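Before the individual listings, here is a minimal sketch of the pattern most of them share: build a set of hparams, embed inputs and targets into the 4-D shape the model expects, and call the model on a feature dict. It assumes tensor2tensor and TensorFlow 1.x are installed; the batch and length constants are illustrative placeholders rather than values from any particular project.

import numpy as np
import tensorflow as tf
from tensor2tensor.models import transformer

BATCH_SIZE, INPUT_LENGTH, TARGET_LENGTH = 2, 5, 7  # illustrative test sizes

# Tiny hparams meant for tests; this mirrors Example #1 below.
hparams = transformer.transformer_test()

embedded_inputs = np.random.random_sample(
    (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
embedded_targets = np.random.random_sample(
    (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

features = {
    "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
    "targets": tf.constant(embedded_targets, dtype=tf.float32),
}

model = transformer.Transformer(hparams)
body_out, _ = model(features)  # [BATCH_SIZE, TARGET_LENGTH, 1, hidden_size]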
Example #1
Source File: transformer_test.py    From training_results_v0.5 with Apache License 2.0
def testTransformerWithoutProblem(self):
    hparams = transformer.transformer_test()

    embedded_inputs = np.random.random_sample(
        (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
    embedded_targets = np.random.random_sample(
        (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

    transformed_features = {
        "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
        "targets": tf.constant(embedded_targets, dtype=tf.float32)
    }

    model = transformer.Transformer(hparams)
    body_out, _ = model(transformed_features)

    self.assertAllEqual(
        body_out.get_shape().as_list(),
        [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) 
Example #2
Source File: evolved_transformer.py    From tensor2tensor with Apache License 2.0
def add_evolved_transformer_hparams(hparams):
  """Add Evolved Transformer hparams.

  Note: These are for the Adam optimizer, not the Adafactor optimizer used in
  the paper.

  Args:
    hparams: Current hparams.

  Returns:
    hparams updated with Evolved Transformer values.
  """
  # Evolved Transformer "layers" are twice as deep as Transformer, so roughly
  # halve the number that we use. These numbers are taken from
  # arxiv.org/abs/1901.11117 .
  hparams.num_encoder_layers = 3
  hparams.num_decoder_layers = 4

  # Learning rate and decay scheme that mimics the transformer Adam config,
  # but with cosine decay instead of rsqrt.
  hparams.learning_rate_constant /= hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = (
      "constant*linear_warmup*single_cycle_cos_decay*rsqrt_hidden_size")
  return hparams 
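As a usage note, this helper is meant to be layered on top of one of the registered Transformer hparams sets, which is exactly what evolved_transformer_base and evolved_transformer_deep do later on this page. A minimal sketch, assuming the evolved_transformer module is importable from tensor2tensor.models:

from tensor2tensor.models import evolved_transformer, transformer

# Start from Transformer base hparams and apply the Evolved Transformer
# overrides defined above (3 encoder layers, 4 decoder layers, cosine decay).
hparams = evolved_transformer.add_evolved_transformer_hparams(
    transformer.transformer_base())
print(hparams.num_encoder_layers)     # 3
print(hparams.num_decoder_layers)     # 4
print(hparams.learning_rate_schedule)
# constant*linear_warmup*single_cycle_cos_decay*rsqrt_hidden_size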
Example #3
Source File: transformer_test.py    From BERT with Apache License 2.0
def testTransformerWithoutProblem(self):
    hparams = transformer.transformer_test()

    embedded_inputs = np.random.random_sample(
        (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
    embedded_targets = np.random.random_sample(
        (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

    transformed_features = {
        "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
        "targets": tf.constant(embedded_targets, dtype=tf.float32)
    }

    model = transformer.Transformer(hparams)
    body_out, _ = model(transformed_features)

    self.assertAllEqual(
        body_out.get_shape().as_list(),
        [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) 
Example #4
Source File: transformer_test.py    From BERT with Apache License 2.0
def testVarNames(self):
    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.PREDICT,
          model_cls=transformer.TransformerScorer)
      _ = model.infer(features)
      scorer_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.TransformerScorer)
      _ = model(features)
      scorer_eval_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.Transformer)
      _ = model(features)
      transformer_vars = [v.name for v in tf.global_variables()]

    self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
    self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars)) 
Example #5
Source File: evolved_transformer.py    From BERT with Apache License 2.0
def add_evolved_transformer_hparams(hparams):
  """Add Evolved Transformer hparams.

  Note: These are for the Adam optimizer, not the Adafactor optimizer used in
  the paper.

  Args:
    hparams: Current hparams.

  Returns:
    hparams updated with Evolved Transformer values.
  """
  # Evolved Transformer "layers" are twice as deep as Transformer, so roughly
  # halve the number that we use. These numbers are taken from
  # arxiv.org/abs/1901.11117 .
  hparams.num_encoder_layers = 3
  hparams.num_decoder_layers = 4

  # Learning rate and decay scheme that mimics the transformer Adam config,
  # but with cosine decay instead of rsqrt.
  hparams.learning_rate_constant /= hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = (
      "constant*linear_warmup*single_cycle_cos_decay*rsqrt_hidden_size")
  return hparams 
Example #6
Source File: universal_transformer.py    From training_results_v0.5 with Apache License 2.0
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha):
    """Beam search decoding.

    Args:
      features: a map of string to `Tensor`
      decode_length: an integer.  How many additional timesteps to decode.
      beam_size: number of beams.
      top_beams: an integer. How many of the beams to return.
      alpha: Float that controls the length penalty. The larger the alpha, the
        stronger the preference for longer translations.

    Returns:
      A dict of decoding results {
          "outputs": integer `Tensor` of decoded ids of shape
              [batch_size, <= decode_length] if beam_size == 1 or
              [batch_size, top_beams, <= decode_length]
          "scores": decoding log probs from the beam search,
              None if using greedy decoding (beam_size=1)
      }
    """
    # Caching is not enabled in the Universal Transformer.
    # TODO(dehghani): Support fast decoding for Universal Transformer
    return self._beam_decode_slow(features, decode_length, beam_size,
                                  top_beams, alpha) 
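Callers normally reach _beam_decode through the model's public infer() method rather than invoking it directly. The sketch below shows that call, assuming model and features were built by a helper like the get_model() shown later on this page and that the keyword names of infer() match the arguments documented above (they can vary slightly between tensor2tensor versions):

# Hedged sketch: beam search through the public infer() entry point.
decoded = model.infer(
    features,
    decode_length=20,  # additional timesteps to decode
    beam_size=4,       # number of beams
    top_beams=1,       # return only the best beam
    alpha=0.6)         # length-penalty strength
outputs = decoded["outputs"]  # [batch_size, <= decode_length] for top_beams == 1
scores = decoded["scores"]    # beam-search log probs; None for greedy decoding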
Example #7
Source File: transformer_test.py    From tensor2tensor with Apache License 2.0
def testTransformerWithoutProblem(self):
    hparams = transformer.transformer_test()

    embedded_inputs = np.random.random_sample(
        (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
    embedded_targets = np.random.random_sample(
        (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

    transformed_features = {
        "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
        "targets": tf.constant(embedded_targets, dtype=tf.float32)
    }

    model = transformer.Transformer(hparams)
    body_out, _ = model(transformed_features)

    self.assertAllEqual(
        body_out.get_shape().as_list(),
        [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) 
Example #8
Source File: transformer_test.py    From fine-lm with MIT License
def testVarNames(self):
    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.PREDICT,
          model_cls=transformer.TransformerScorer)
      _ = model.infer(features)
      scorer_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.TransformerScorer)
      _ = model(features)
      scorer_eval_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.Transformer)
      _ = model(features)
      transformer_vars = [v.name for v in tf.global_variables()]

    self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
    self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars)) 
Example #9
Source File: transformer_test.py    From fine-lm with MIT License
def testTransformerWithoutProblem(self):
    hparams = transformer.transformer_test()

    embedded_inputs = np.random.random_sample(
        (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
    embedded_targets = np.random.random_sample(
        (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

    transformed_features = {
        "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
        "targets": tf.constant(embedded_targets, dtype=tf.float32)
    }

    model = transformer.Transformer(hparams)
    body_out, _ = model(transformed_features)

    self.assertAllEqual(
        body_out.get_shape().as_list(),
        [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size]) 
Example #10
Source File: transformer_test.py    From training_results_v0.5 with Apache License 2.0
def testVarNames(self):
    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.PREDICT,
          model_cls=transformer.TransformerScorer)
      _ = model.infer(features)
      scorer_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.TransformerScorer)
      _ = model(features)
      scorer_eval_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.Transformer)
      _ = model(features)
      transformer_vars = [v.name for v in tf.global_variables()]

    self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
    self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars)) 
Example #11
Source File: universal_transformer_modified.py    From Graph-Transformer with Apache License 2.0
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha):
    """Beam search decoding.

    Args:
      features: a map of string to `Tensor`
      decode_length: an integer.  How many additional timesteps to decode.
      beam_size: number of beams.
      top_beams: an integer. How many of the beams to return.
      alpha: Float that controls the length penalty. The larger the alpha, the
        stronger the preference for longer translations.

    Returns:
      A dict of decoding results {
          "outputs": integer `Tensor` of decoded ids of shape
              [batch_size, <= decode_length] if beam_size == 1 or
              [batch_size, top_beams, <= decode_length]
          "scores": decoding log probs from the beam search,
              None if using greedy decoding (beam_size=1)
      }
    """
    # Caching is not enabled in the Universal Transformer.
    # TODO(dehghani): Support fast decoding for Universal Transformer
    return self._beam_decode_slow(features, decode_length, beam_size,
                                  top_beams, alpha) 
Example #12
Source File: universal_transformer.py    From fine-lm with MIT License
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha):
    """Beam search decoding.

    Args:
      features: a map of string to `Tensor`
      decode_length: an integer.  How many additional timesteps to decode.
      beam_size: number of beams.
      top_beams: an integer. How many of the beams to return.
      alpha: Float that controls the length penalty. The larger the alpha, the
        stronger the preference for longer translations.

    Returns:
      A dict of decoding results {
          "outputs": integer `Tensor` of decoded ids of shape
              [batch_size, <= decode_length] if beam_size == 1 or
              [batch_size, top_beams, <= decode_length]
          "scores": decoding log probs from the beam search,
              None if using greedy decoding (beam_size=1)
      }
    """
    # Caching is not enabled in the Universal Transformer.
    # TODO(dehghani): Support fast decoding for Universal Transformer
    return self._beam_decode_slow(features, decode_length, beam_size,
                                  top_beams, alpha) 
Example #13
Source File: transformer_test.py    From tensor2tensor with Apache License 2.0
def testVarNames(self):
    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.PREDICT,
          model_cls=transformer.TransformerScorer)
      _ = model.infer(features)
      scorer_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.TransformerScorer)
      _ = model(features)
      scorer_eval_vars = [v.name for v in tf.global_variables()]

    with tf.Graph().as_default():
      model, features = get_model(
          mode=tf.estimator.ModeKeys.EVAL,
          model_cls=transformer.Transformer)
      _ = model(features)
      transformer_vars = [v.name for v in tf.global_variables()]

    self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
    self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars)) 
Example #14
Source File: universal_transformer_modified.py    From Graph-Transformer with Apache License 2.0
def body(self, features):
    """Universal Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
          "targets": Target decoder outputs.
              [batch_size, decoder_length, hidden_dim]
          "target_space_id"

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    hparams = self._hparams

    assert self.has_input, ("universal_transformer_encoder is only applicable "
                            "to problems with inputs")

    inputs = features["inputs"]
    target_space = features["target_space_id"]
    encoder_output, enc_extra_output = self.encode(
        inputs, target_space, hparams, features=features)

    encoder_output = tf.expand_dims(encoder_output, 2)

    if hparams.recurrence_type == "act" and hparams.act_loss_weight != 0:
      ponder_times, remainders = enc_extra_output
      act_loss = hparams.act_loss_weight * tf.reduce_mean(ponder_times +
                                                          remainders)
      tf.contrib.summary.scalar("act_loss", act_loss)

      return encoder_output, {"act_loss": act_loss}
    return encoder_output 
Example #15
Source File: universal_transformer_modified.py    From Graph-Transformer with Apache License 2.0
def encode(self, inputs, target_space, hparams, features=None, losses=None):
    """Encode transformer inputs.

    Args:
      inputs: Transformer inputs [batch_size, input_length, input_height,
        hidden_dim] which will be flattened along the two spatial dimensions.
      target_space: scalar, target space ID.
      hparams: hyperparameters for the model.
      features: optionally pass the entire features dictionary as well.
        This is needed now for "packed" datasets.
      losses: Unused.

    Returns:
      Tuple of:
          encoder_output: Encoder representation.
              [batch_size, input_length, hidden_dim]
          encoder_extra_output: extra encoder output used in some variants of
            the model (e.g. in ACT, to pass the ponder time to the body)
    """
    del losses
    inputs = common_layers.flatten4d3d(inputs)

    (encoder_input, self_attention_bias, _) = (
        transformer.transformer_prepare_encoder(inputs, target_space, hparams))

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    (encoder_output, encoder_extra_output) = (
        universal_transformer_util.universal_transformer_encoder(
            encoder_input,
            self_attention_bias,
            hparams,
            nonpadding=transformer.features_to_nonpadding(features, "inputs"),
            save_weights_to=self.attention_weights))

    return encoder_output, encoder_extra_output 
Example #16
Source File: transformer_test.py    From training_results_v0.5 with Apache License 2.0
def get_model(hparams=None, mode=tf.estimator.ModeKeys.TRAIN,
              has_input=True, model_cls=transformer.Transformer):
  if hparams is None:
    hparams = transformer.transformer_tiny()
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0

  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE,
                                                   VOCAB_SIZE,
                                                   hparams)
  if not has_input:
    del p_hparams.modality["inputs"]
  hparams.problem_hparams = p_hparams

  # Use randint (random_integers is deprecated); values lie in [0, VOCAB_SIZE).
  inputs = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
      "target_space_id": tf.constant(1, dtype=tf.int32)
  }
  if has_input:
    features["inputs"] = tf.constant(inputs, dtype=tf.int32, name="inputs")

  return model_cls(hparams, mode, p_hparams), features 
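The tests above consume this helper by calling the returned model on the returned features and evaluating the logits in a session, roughly as in this minimal sketch (the logits shape noted in the final comment is the usual one for this setup, stated here as an assumption rather than taken from the listing):

with tf.Graph().as_default():
  model, features = get_model(mode=tf.estimator.ModeKeys.TRAIN)
  logits, losses = model(features)  # builds the Transformer graph
  with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
# Typically res.shape == (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE) here.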
Example #17
Source File: evolved_transformer.py    From BERT with Apache License 2.0
def evolved_transformer_deep():
  """Deep parameters for Evolved Transformer model on WMT."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_big())
  hparams.num_encoder_layers = 9
  hparams.num_decoder_layers = 10
  hparams.hidden_size = 640
  return hparams 
Example #18
Source File: universal_transformer.py    From training_results_v0.5 with Apache License 2.0
def body(self, features):
    """Universal Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
          "targets": Target decoder outputs.
              [batch_size, decoder_length, hidden_dim]
          "target_space_id"

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    hparams = self._hparams

    assert self.has_input, ("universal_transformer_encoder is only applicable "
                            "to problems with inputs")

    inputs = features["inputs"]
    target_space = features["target_space_id"]
    encoder_output, enc_extra_output = self.encode(
        inputs, target_space, hparams, features=features)

    encoder_output = tf.expand_dims(encoder_output, 2)

    if hparams.recurrence_type == "act" and hparams.act_loss_weight != 0:
      ponder_times, remainders = enc_extra_output
      act_loss = hparams.act_loss_weight * tf.reduce_mean(ponder_times +
                                                          remainders)
      tf.contrib.summary.scalar("act_loss", act_loss)

      return encoder_output, {"act_loss": act_loss}
    return encoder_output 
Example #19
Source File: transformer_test.py    From BERT with Apache License 2.0
def get_model(hparams=None, mode=tf.estimator.ModeKeys.TRAIN,
              has_input=True, model_cls=transformer.Transformer):
  if hparams is None:
    hparams = transformer.transformer_tiny()
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0

  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE,
                                                   VOCAB_SIZE,
                                                   hparams)
  if not has_input:
    del p_hparams.modality["inputs"]
  hparams.problem_hparams = p_hparams

  inputs = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
      "target_space_id": tf.constant(1, dtype=tf.int32)
  }
  if has_input:
    features["inputs"] = tf.constant(inputs, dtype=tf.int32, name="inputs")

  return model_cls(hparams, mode, p_hparams), features 
Example #20
Source File: evolved_transformer.py    From BERT with Apache License 2.0
def evolved_transformer_base_tpu():
  """Base parameters for Evolved Transformer model on TPU."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_tpu())
  hparams.learning_rate_constant = 1 / hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = (
      "constant*single_cycle_cos_decay")
  return hparams 
Example #21
Source File: evolved_transformer.py    From BERT with Apache License 2.0
def evolved_transformer_big_tpu():
  """Big parameters for Evolved Transformer model on TPU."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_big_tpu())
  hparams.learning_rate_constant = 1 / hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = (
      "constant*single_cycle_cos_decay")
  return hparams 
Example #22
Source File: transformer_revnet.py    From fine-lm with MIT License
def transformer_revnet_big():
  """Base hparams for TransformerRevnet."""
  hparams = transformer_revnet_base()

  # The TransformerRevnet uses significantly less memory than the Transformer.
  # Increase batch size and model size.
  hparams.batch_size *= 2
  hparams.hidden_size *= 2
  hparams.num_heads *= 2
  hparams.num_hidden_layers += 1
  return hparams 
Example #23
Source File: transformer_aux.py    From training_results_v0.5 with Apache License 2.0
def body(self, features):
    """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs.
              [batch_size, input_length, 1, hidden_dim].
          "targets": Target decoder outputs.
              [batch_size, target_length, 1, hidden_dim]
          "target_space_id": A scalar int from data_generators.problem.SpaceID.

    Returns:
      A 2-tuple containing:
          Logit tensor. [batch_size, decoder_length, vocab_size]
          Map of keys to loss tensors. Should contain the following:
              "training": Training loss (shift == 0).
              "auxiliary": Auxiliary loss (shift != 0).
    """
    output = super(TransformerAux, self).body(features)
    output, losses = self._normalize_body_output(output)

    aux = 0.0
    for shift in self._extract_shift_values():
      loss_num, loss_den = self.auxiliary_loss(output, features, shift)
      aux += loss_num / loss_den
    losses["auxiliary"] = aux

    return output, losses 
Example #24
Source File: evolved_transformer.py    From BERT with Apache License 2.0
def evolved_transformer_base():
  """Base parameters for Evolved Transformer model."""
  return add_evolved_transformer_hparams(transformer.transformer_base()) 
Example #25
Source File: universal_transformer.py    From BERT with Apache License 2.0
def adaptive_universal_transformer_multilayer_hard():
  """Multi-layer config for adaptive Transformer with hard attention."""
  hparams = adaptive_universal_transformer_multilayer_tpu()
  hparams.batch_size = 256
  hparams.hard_attention_k = 8
  hparams.add_step_timing_signal = True
  # hparams.add_sru = True  # This is very slow on GPUs, does it help?
  hparams.self_attention_type = "dot_product_relative_v2"
  hparams.max_relative_position = 256
  return hparams 
Example #26
Source File: universal_transformer.py    From BERT with Apache License 2.0
def adaptive_universal_transformer_multilayer_tpu():
  """Multi-layer config for adaptive Transformer on TPU."""
  hparams = adaptive_universal_transformer_base_tpu()
  hparams.num_inrecurrence_layers = 2
  hparams.mix_with_transformer = "before_ut,after_ut"
  hparams.num_mixedin_layers = 1
  hparams.transformer_ffn_type = "sepconv"
  # TODO(lukaszkaiser): the options below don't work on TPU yet, make them work.
  # hparams.add_step_timing_signal = True
  # hparams.add_sru = True
  # hparams.self_attention_type = "dot_product_relative_v2"
  # hparams.max_relative_position = 256
  return hparams 
Example #27
Source File: universal_transformer.py    From BERT with Apache License 2.0
def universal_transformer_base():
  """Base parameters for Universal Transformer."""
  hparams = transformer.transformer_base()
  # To match the capacity of transformer_base with 6 layers, we increase the
  # size of the UT layer, since the UT applies a single layer multiple times.
  hparams.hidden_size = 1024
  hparams.filter_size = 4096
  hparams.num_heads = 16
  hparams.layer_prepostprocess_dropout = 0.3
  hparams = update_hparams_for_universal_transformer(hparams)
  return hparams 
Example #28
Source File: universal_transformer.py    From BERT with Apache License 2.0 5 votes vote down vote up
def encode(self, inputs, target_space, hparams, features=None, losses=None):
    """Encode transformer inputs.

    Args:
      inputs: Transformer inputs [batch_size, input_length, input_height,
        hidden_dim] which will be flattened along the two spatial dimensions.
      target_space: scalar, target space ID.
      hparams: hyperparameters for the model.
      features: optionally pass the entire features dictionary as well.
        This is needed now for "packed" datasets.
      losses: Unused.

    Returns:
      Tuple of:
          encoder_output: Encoder representation.
              [batch_size, input_length, hidden_dim]
          encoder_extra_output: extra encoder output used in some variants of
            the model (e.g. in ACT, to pass the ponder time to the body)
    """
    del losses
    inputs = common_layers.flatten4d3d(inputs)

    (encoder_input, self_attention_bias, _) = (
        transformer.transformer_prepare_encoder(inputs, target_space, hparams))

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    (encoder_output, encoder_extra_output) = (
        universal_transformer_util.universal_transformer_encoder(
            encoder_input,
            self_attention_bias,
            hparams,
            nonpadding=transformer.features_to_nonpadding(features, "inputs"),
            save_weights_to=self.attention_weights))

    return encoder_output, encoder_extra_output 
Example #29
Source File: universal_transformer.py    From BERT with Apache License 2.0
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha,
                   use_tpu=False):
    """Beam search decoding.

    Args:
      features: a map of string to `Tensor`
      decode_length: an integer.  How many additional timesteps to decode.
      beam_size: number of beams.
      top_beams: an integer. How many of the beams to return.
      alpha: Float that controls the length penalty. The larger the alpha, the
        stronger the preference for longer translations.
      use_tpu: Whether we should use TPU or not.

    Returns:
      A dict of decoding results {
          "outputs": integer `Tensor` of decoded ids of shape
              [batch_size, <= decode_length] if beam_size == 1 or
              [batch_size, top_beams, <= decode_length]
          "scores": decoding log probs from the beam search,
              None if using greedy decoding (beam_size=1)
      }
    """
    # Caching is not enabled in the Universal Transformer.
    # TODO(dehghani): Support fast decoding for Universal Transformer
    return self._beam_decode_slow(features, decode_length, beam_size,
                                  top_beams, alpha, use_tpu) 
Example #30
Source File: transformer_aux.py    From BERT with Apache License 2.0
def body(self, features):
    """Transformer main model_fn.

    Args:
      features: Map of features to the model. Should contain the following:
          "inputs": Transformer inputs.
              [batch_size, input_length, 1, hidden_dim].
          "targets": Target decoder outputs.
              [batch_size, target_length, 1, hidden_dim]
          "target_space_id": A scalar int from data_generators.problem.SpaceID.

    Returns:
      A 2-tuple containing:
          Logit tensor. [batch_size, decoder_length, vocab_size]
          Map of keys to loss tensors. Should contain the following:
              "training": Training loss (shift == 0).
              "auxiliary": Auxiliary loss (shift != 0).
    """
    output = super(TransformerAux, self).body(features)
    output, losses = self._normalize_body_output(output)

    aux = 0.0
    for shift in self._extract_shift_values():
      loss_num, loss_den = self.auxiliary_loss(output, features, shift)
      aux += loss_num / loss_den
    losses["auxiliary"] = aux

    return output, losses