Python tensor2tensor.models.transformer.Transformer() Examples
The following are 30 code examples of tensor2tensor.models.transformer.Transformer(), drawn from open-source projects that use the tensor2tensor library. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module tensor2tensor.models.transformer.
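Most of the examples below follow the same pattern: pick an hparams set, construct transformer.Transformer, and call the model on a dictionary of (already embedded) features. The minimal sketch below is adapted from the testTransformerWithoutProblem examples on this page; the batch and length constants are illustrative placeholders, not part of the library, and it assumes the TF1-style tensor2tensor API used throughout these examples.

import numpy as np
import tensorflow as tf
from tensor2tensor.models import transformer

# Placeholder sizes for this sketch only.
BATCH_SIZE, INPUT_LENGTH, TARGET_LENGTH = 2, 5, 7

hparams = transformer.transformer_test()  # tiny hparams intended for tests

# Features are pre-embedded here, so no Problem/modality setup is needed.
embedded_inputs = np.random.random_sample(
    (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
embedded_targets = np.random.random_sample(
    (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))
features = {
    "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
    "targets": tf.constant(embedded_targets, dtype=tf.float32),
}

model = transformer.Transformer(hparams)
body_out, _ = model(features)  # [BATCH_SIZE, TARGET_LENGTH, 1, hidden_size]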
Example #1
Source File: transformer_test.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def testTransformerWithoutProblem(self):
  hparams = transformer.transformer_test()

  embedded_inputs = np.random.random_sample(
      (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
  embedded_targets = np.random.random_sample(
      (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

  transformed_features = {
      "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
      "targets": tf.constant(embedded_targets, dtype=tf.float32)
  }

  model = transformer.Transformer(hparams)
  body_out, _ = model(transformed_features)

  self.assertAllEqual(
      body_out.get_shape().as_list(),
      [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size])
Example #2
Source File: evolved_transformer.py From tensor2tensor with Apache License 2.0 | 6 votes |
def add_evolved_transformer_hparams(hparams):
  """Add Evolved Transformer hparams.

  Note: These are for the Adam optimizer, not the Adafactor optimizer used in
  the paper.

  Args:
    hparams: Current hparams.

  Returns:
    hparams updated with Evolved Transformer values.
  """
  # Evolved Transformer "layers" are twice as deep as Transformer, so roughly
  # halve the number that we use. These numbers are taken from
  # arxiv.org/abs/1901.11117 .
  hparams.num_encoder_layers = 3
  hparams.num_decoder_layers = 4

  # Learning rate and decay scheme that mimics the transformer Adam config,
  # but with cosine decay instead of rsqrt.
  hparams.learning_rate_constant /= hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = (
      "constant*linear_warmup*single_cycle_cos_decay*rsqrt_hidden_size")
  return hparams
Example #3
Source File: transformer_test.py From BERT with Apache License 2.0 | 6 votes |
def testTransformerWithoutProblem(self):
  hparams = transformer.transformer_test()

  embedded_inputs = np.random.random_sample(
      (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
  embedded_targets = np.random.random_sample(
      (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

  transformed_features = {
      "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
      "targets": tf.constant(embedded_targets, dtype=tf.float32)
  }

  model = transformer.Transformer(hparams)
  body_out, _ = model(transformed_features)

  self.assertAllEqual(
      body_out.get_shape().as_list(),
      [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size])
Example #4
Source File: transformer_test.py From BERT with Apache License 2.0 | 6 votes |
def testVarNames(self):
  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.PREDICT,
        model_cls=transformer.TransformerScorer)
    _ = model.infer(features)
    scorer_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.TransformerScorer)
    _ = model(features)
    scorer_eval_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.Transformer)
    _ = model(features)
    transformer_vars = [v.name for v in tf.global_variables()]

  self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
  self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars))
Example #5
Source File: evolved_transformer.py From BERT with Apache License 2.0 | 6 votes |
def add_evolved_transformer_hparams(hparams):
  """Add Evolved Transformer hparams.

  Note: These are for the Adam optimizer, not the Adafactor optimizer used in
  the paper.

  Args:
    hparams: Current hparams.

  Returns:
    hparams updated with Evolved Transformer values.
  """
  # Evolved Transformer "layers" are twice as deep as Transformer, so roughly
  # halve the number that we use. These numbers are taken from
  # arxiv.org/abs/1901.11117 .
  hparams.num_encoder_layers = 3
  hparams.num_decoder_layers = 4

  # Learning rate and decay scheme that mimics the transformer Adam config,
  # but with cosine decay instead of rsqrt.
  hparams.learning_rate_constant /= hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = (
      "constant*linear_warmup*single_cycle_cos_decay*rsqrt_hidden_size")
  return hparams
Example #6
Source File: universal_transformer.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha):
  """Beam search decoding.

  Args:
    features: a map of string to `Tensor`.
    decode_length: an integer. How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer. How many of the beams to return.
    alpha: Float that controls the length penalty. The larger the alpha, the
      stronger the preference for longer translations.

  Returns:
    A dict of decoding results {
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search,
            None if using greedy decoding (beam_size=1)
    }
  """
  # Caching is not enabled in the Universal Transformer, so fall back to
  # slow decoding.
  # TODO(dehghani): Support fast decoding for Universal Transformer.
  return self._beam_decode_slow(features, decode_length, beam_size, top_beams,
                                alpha)
Example #7
Source File: transformer_test.py From tensor2tensor with Apache License 2.0 | 6 votes |
def testTransformerWithoutProblem(self):
  hparams = transformer.transformer_test()

  embedded_inputs = np.random.random_sample(
      (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
  embedded_targets = np.random.random_sample(
      (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

  transformed_features = {
      "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
      "targets": tf.constant(embedded_targets, dtype=tf.float32)
  }

  model = transformer.Transformer(hparams)
  body_out, _ = model(transformed_features)

  self.assertAllEqual(
      body_out.get_shape().as_list(),
      [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size])
Example #8
Source File: transformer_test.py From fine-lm with MIT License | 6 votes |
def testVarNames(self):
  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.PREDICT,
        model_cls=transformer.TransformerScorer)
    _ = model.infer(features)
    scorer_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.TransformerScorer)
    _ = model(features)
    scorer_eval_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.Transformer)
    _ = model(features)
    transformer_vars = [v.name for v in tf.global_variables()]

  self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
  self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars))
Example #9
Source File: transformer_test.py From fine-lm with MIT License | 6 votes |
def testTransformerWithoutProblem(self):
  hparams = transformer.transformer_test()

  embedded_inputs = np.random.random_sample(
      (BATCH_SIZE, INPUT_LENGTH, 1, hparams.hidden_size))
  embedded_targets = np.random.random_sample(
      (BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size))

  transformed_features = {
      "inputs": tf.constant(embedded_inputs, dtype=tf.float32),
      "targets": tf.constant(embedded_targets, dtype=tf.float32)
  }

  model = transformer.Transformer(hparams)
  body_out, _ = model(transformed_features)

  self.assertAllEqual(
      body_out.get_shape().as_list(),
      [BATCH_SIZE, TARGET_LENGTH, 1, hparams.hidden_size])
Example #10
Source File: transformer_test.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def testVarNames(self):
  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.PREDICT,
        model_cls=transformer.TransformerScorer)
    _ = model.infer(features)
    scorer_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.TransformerScorer)
    _ = model(features)
    scorer_eval_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.Transformer)
    _ = model(features)
    transformer_vars = [v.name for v in tf.global_variables()]

  self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
  self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars))
Example #11
Source File: universal_transformer_modified.py From Graph-Transformer with Apache License 2.0 | 6 votes |
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha):
  """Beam search decoding.

  Args:
    features: a map of string to `Tensor`.
    decode_length: an integer. How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer. How many of the beams to return.
    alpha: Float that controls the length penalty. The larger the alpha, the
      stronger the preference for longer translations.

  Returns:
    A dict of decoding results {
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search,
            None if using greedy decoding (beam_size=1)
    }
  """
  # Caching is not enabled in the Universal Transformer, so fall back to
  # slow decoding.
  # TODO(dehghani): Support fast decoding for Universal Transformer.
  return self._beam_decode_slow(features, decode_length, beam_size, top_beams,
                                alpha)
Example #12
Source File: universal_transformer.py From fine-lm with MIT License | 6 votes |
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha):
  """Beam search decoding.

  Args:
    features: a map of string to `Tensor`.
    decode_length: an integer. How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer. How many of the beams to return.
    alpha: Float that controls the length penalty. The larger the alpha, the
      stronger the preference for longer translations.

  Returns:
    A dict of decoding results {
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search,
            None if using greedy decoding (beam_size=1)
    }
  """
  # Caching is not enabled in the Universal Transformer, so fall back to
  # slow decoding.
  # TODO(dehghani): Support fast decoding for Universal Transformer.
  return self._beam_decode_slow(features, decode_length, beam_size, top_beams,
                                alpha)
Example #13
Source File: transformer_test.py From tensor2tensor with Apache License 2.0 | 6 votes |
def testVarNames(self):
  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.PREDICT,
        model_cls=transformer.TransformerScorer)
    _ = model.infer(features)
    scorer_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.TransformerScorer)
    _ = model(features)
    scorer_eval_vars = [v.name for v in tf.global_variables()]

  with tf.Graph().as_default():
    model, features = get_model(
        mode=tf.estimator.ModeKeys.EVAL,
        model_cls=transformer.Transformer)
    _ = model(features)
    transformer_vars = [v.name for v in tf.global_variables()]

  self.assertEqual(sorted(scorer_vars), sorted(transformer_vars))
  self.assertEqual(sorted(scorer_eval_vars), sorted(transformer_vars))
Example #14
Source File: universal_transformer_modified.py From Graph-Transformer with Apache License 2.0 | 5 votes |
def body(self, features):
  """Universal Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "target_space_id"

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams
  assert self.has_input, ("universal_transformer_encoder is applicable on "
                          "problems with inputs")

  inputs = features["inputs"]
  target_space = features["target_space_id"]
  encoder_output, enc_extra_output = self.encode(
      inputs, target_space, hparams, features=features)

  encoder_output = tf.expand_dims(encoder_output, 2)

  if hparams.recurrence_type == "act" and hparams.act_loss_weight != 0:
    ponder_times, remainders = enc_extra_output
    act_loss = hparams.act_loss_weight * tf.reduce_mean(ponder_times +
                                                        remainders)
    tf.contrib.summary.scalar("act_loss", act_loss)
    return encoder_output, {"act_loss": act_loss}
  return encoder_output
Example #15
Source File: universal_transformer_modified.py From Graph-Transformer with Apache License 2.0 | 5 votes |
def encode(self, inputs, target_space, hparams, features=None, losses=None):
  """Encode transformer inputs.

  Args:
    inputs: Transformer inputs [batch_size, input_length, input_height,
        hidden_dim] which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    features: optionally pass the entire features dictionary as well. This is
        needed now for "packed" datasets.
    losses: Unused.

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_extra_output: extra encoder output used in some variants of
            the model (e.g. in ACT, to pass the ponder-time to body)
  """
  del losses

  inputs = common_layers.flatten4d3d(inputs)

  (encoder_input, self_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, target_space, hparams))

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  (encoder_output, encoder_extra_output) = (
      universal_transformer_util.universal_transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "inputs"),
          save_weights_to=self.attention_weights))

  return encoder_output, encoder_extra_output
Example #16
Source File: transformer_test.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def get_model(hparams=None, mode=tf.estimator.ModeKeys.TRAIN,
              has_input=True, model_cls=transformer.Transformer):
  if hparams is None:
    hparams = transformer.transformer_tiny()
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0

  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE,
                                                   VOCAB_SIZE,
                                                   hparams)
  if not has_input:
    del p_hparams.modality["inputs"]
  hparams.problem_hparams = p_hparams

  inputs = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
      "target_space_id": tf.constant(1, dtype=tf.int32)
  }
  if has_input:
    features["inputs"] = tf.constant(inputs, dtype=tf.int32, name="inputs")

  return model_cls(hparams, mode, p_hparams), features
Example #17
Source File: evolved_transformer.py From BERT with Apache License 2.0 | 5 votes |
def evolved_transformer_deep():
  """Deep parameters for Evolved Transformer model on WMT."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_big())
  hparams.num_encoder_layers = 9
  hparams.num_decoder_layers = 10
  hparams.hidden_size = 640
  return hparams
Example #18
Source File: universal_transformer.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def body(self, features):
  """Universal Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "target_space_id"

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams
  assert self.has_input, ("universal_transformer_encoder is applicable on "
                          "problems with inputs")

  inputs = features["inputs"]
  target_space = features["target_space_id"]
  encoder_output, enc_extra_output = self.encode(
      inputs, target_space, hparams, features=features)

  encoder_output = tf.expand_dims(encoder_output, 2)

  if hparams.recurrence_type == "act" and hparams.act_loss_weight != 0:
    ponder_times, remainders = enc_extra_output
    act_loss = hparams.act_loss_weight * tf.reduce_mean(ponder_times +
                                                        remainders)
    tf.contrib.summary.scalar("act_loss", act_loss)
    return encoder_output, {"act_loss": act_loss}
  return encoder_output
Example #19
Source File: transformer_test.py From BERT with Apache License 2.0 | 5 votes |
def get_model(hparams=None, mode=tf.estimator.ModeKeys.TRAIN,
              has_input=True, model_cls=transformer.Transformer):
  if hparams is None:
    hparams = transformer.transformer_tiny()
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0

  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE,
                                                   VOCAB_SIZE,
                                                   hparams)
  if not has_input:
    del p_hparams.modality["inputs"]
  hparams.problem_hparams = p_hparams

  inputs = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
      "target_space_id": tf.constant(1, dtype=tf.int32)
  }
  if has_input:
    features["inputs"] = tf.constant(inputs, dtype=tf.int32, name="inputs")

  return model_cls(hparams, mode, p_hparams), features
Example #20
Source File: evolved_transformer.py From BERT with Apache License 2.0 | 5 votes |
def evolved_transformer_base_tpu():
  """Base parameters for Evolved Transformer model on TPU."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_tpu())
  hparams.learning_rate_constant = 1 / hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = ("constant*single_cycle_cos_decay")
  return hparams
Example #21
Source File: evolved_transformer.py From BERT with Apache License 2.0 | 5 votes |
def evolved_transformer_big_tpu():
  """Big parameters for Evolved Transformer model on TPU."""
  hparams = add_evolved_transformer_hparams(transformer.transformer_big_tpu())
  hparams.learning_rate_constant = 1 / hparams.learning_rate_warmup_steps ** 0.5
  hparams.learning_rate_schedule = ("constant*single_cycle_cos_decay")
  return hparams
Example #22
Source File: transformer_revnet.py From fine-lm with MIT License | 5 votes |
def transformer_revnet_big():
  """Base hparams for TransformerRevnet."""
  hparams = transformer_revnet_base()

  # The TransformerRevnet uses significantly less memory than the Transformer.
  # Increase batch size and model size.
  hparams.batch_size *= 2
  hparams.hidden_size *= 2
  hparams.num_heads *= 2
  hparams.num_hidden_layers += 1
  return hparams
Example #23
Source File: transformer_aux.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, 1, hidden_dim].
        "targets": Target decoder outputs.
            [batch_size, target_length, 1, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    A 2-tuple containing:
        Logit tensor. [batch_size, decoder_length, vocab_size]
        Map of keys to loss tensors. Should contain the following:
            "training": Training loss (shift == 0).
            "auxiliary": Auxiliary loss (shift != 0).
  """
  output = super(TransformerAux, self).body(features)
  output, losses = self._normalize_body_output(output)

  aux = 0.0
  for shift in self._extract_shift_values():
    loss_num, loss_den = self.auxiliary_loss(output, features, shift)
    aux += loss_num / loss_den
  losses["auxiliary"] = aux

  return output, losses
Example #24
Source File: evolved_transformer.py From BERT with Apache License 2.0 | 5 votes |
def evolved_transformer_base():
  """Base parameters for Evolved Transformer model."""
  return add_evolved_transformer_hparams(transformer.transformer_base())
Example #25
Source File: universal_transformer.py From BERT with Apache License 2.0 | 5 votes |
def adaptive_universal_transformer_multilayer_hard():
  """Multi-layer config for adaptive Transformer with hard attention."""
  hparams = adaptive_universal_transformer_multilayer_tpu()
  hparams.batch_size = 256
  hparams.hard_attention_k = 8
  hparams.add_step_timing_signal = True
  # hparams.add_sru = True  # This is very slow on GPUs, does it help?
  hparams.self_attention_type = "dot_product_relative_v2"
  hparams.max_relative_position = 256
  return hparams
Example #26
Source File: universal_transformer.py From BERT with Apache License 2.0 | 5 votes |
def adaptive_universal_transformer_multilayer_tpu():
  """Multi-layer config for adaptive Transformer on TPU."""
  hparams = adaptive_universal_transformer_base_tpu()
  hparams.num_inrecurrence_layers = 2
  hparams.mix_with_transformer = "before_ut,after_ut"
  hparams.num_mixedin_layers = 1
  hparams.transformer_ffn_type = "sepconv"
  # TODO(lukaszkaiser): the options below don't work on TPU yet, make them work.
  # hparams.add_step_timing_signal = True
  # hparams.add_sru = True
  # hparams.self_attention_type = "dot_product_relative_v2"
  # hparams.max_relative_position = 256
  return hparams
Example #27
Source File: universal_transformer.py From BERT with Apache License 2.0 | 5 votes |
def universal_transformer_base():
  """Base parameters for Universal Transformer."""
  hparams = transformer.transformer_base()
  # To have a similar capacity to the transformer_base with 6 layers,
  # we need to increase the size of the UT's layer,
  # since, in fact, UT has a single layer repeating multiple times.
  hparams.hidden_size = 1024
  hparams.filter_size = 4096
  hparams.num_heads = 16
  hparams.layer_prepostprocess_dropout = 0.3
  hparams = update_hparams_for_universal_transformer(hparams)
  return hparams
Example #28
Source File: universal_transformer.py From BERT with Apache License 2.0 | 5 votes |
def encode(self, inputs, target_space, hparams, features=None, losses=None):
  """Encode transformer inputs.

  Args:
    inputs: Transformer inputs [batch_size, input_length, input_height,
        hidden_dim] which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    features: optionally pass the entire features dictionary as well. This is
        needed now for "packed" datasets.
    losses: Unused.

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_extra_output: extra encoder output used in some variants of
            the model (e.g. in ACT, to pass the ponder-time to body)
  """
  del losses

  inputs = common_layers.flatten4d3d(inputs)

  (encoder_input, self_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, target_space, hparams))

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  (encoder_output, encoder_extra_output) = (
      universal_transformer_util.universal_transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "inputs"),
          save_weights_to=self.attention_weights))

  return encoder_output, encoder_extra_output
Example #29
Source File: universal_transformer.py From BERT with Apache License 2.0 | 5 votes |
def _beam_decode(self, features, decode_length, beam_size, top_beams, alpha,
                 use_tpu=False):
  """Beam search decoding.

  Args:
    features: a map of string to `Tensor`.
    decode_length: an integer. How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer. How many of the beams to return.
    alpha: Float that controls the length penalty. The larger the alpha, the
      stronger the preference for longer translations.
    use_tpu: Whether we should use TPU or not.

  Returns:
    A dict of decoding results {
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search,
            None if using greedy decoding (beam_size=1)
    }
  """
  # Caching is not enabled in the Universal Transformer, so fall back to
  # slow decoding.
  # TODO(dehghani): Support fast decoding for Universal Transformer.
  return self._beam_decode_slow(features, decode_length, beam_size, top_beams,
                                alpha, use_tpu)
Example #30
Source File: transformer_aux.py From BERT with Apache License 2.0 | 5 votes |
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, 1, hidden_dim].
        "targets": Target decoder outputs.
            [batch_size, target_length, 1, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    A 2-tuple containing:
        Logit tensor. [batch_size, decoder_length, vocab_size]
        Map of keys to loss tensors. Should contain the following:
            "training": Training loss (shift == 0).
            "auxiliary": Auxiliary loss (shift != 0).
  """
  output = super(TransformerAux, self).body(features)
  output, losses = self._normalize_body_output(output)

  aux = 0.0
  for shift in self._extract_shift_values():
    loss_num, loss_den = self.auxiliary_loss(output, features, shift)
    aux += loss_num / loss_den
  losses["auxiliary"] = aux

  return output, losses