Python tensor2tensor.models.transformer.transformer_prepare_decoder() Examples
The following are 17 code examples of tensor2tensor.models.transformer.transformer_prepare_decoder().
Each example is taken from an open-source project; the source file and license are listed above each snippet.
You may also want to check out the other available functions and classes of the module tensor2tensor.models.transformer.
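Before the examples, here is a minimal, self-contained sketch of the call itself (shapes and hparams are illustrative, assuming the TF1-style tensor2tensor API): the function takes 3-D targets plus an hparams set and returns the shifted decoder input together with the causal self-attention bias.

import tensorflow as tf
from tensor2tensor.layers import common_layers
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
# t2t stores targets as 4-D [batch, length, 1, hidden]; flatten to 3-D first.
targets = tf.zeros([8, 50, 1, hparams.hidden_size])
targets = common_layers.flatten4d3d(targets)
decoder_input, decoder_self_attention_bias = (
    transformer.transformer_prepare_decoder(targets, hparams))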
Example #1
Source File: transformer_nat.py From fine-lm with MIT License
def decode_transformer(encoder_output, encoder_decoder_attention_bias,
                       targets, hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output
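For orientation, a simplified re-implementation sketch of what transformer_prepare_decoder() computes in the default configuration (this omits the proximity_bias, partial-targets, and target-space branches of the real function): it builds a lower-triangular causal bias and shifts the targets right by one position for teacher forcing.

from tensor2tensor.layers import common_attention, common_layers

def prepare_decoder_sketch(targets, hparams):
  """Simplified sketch of transformer_prepare_decoder, default branches only."""
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(
          common_layers.shape_list(targets)[1]))  # mask out future positions
  decoder_input = common_layers.shift_right_3d(targets)  # shift targets right
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return decoder_input, decoder_self_attention_bias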
Example #2
Source File: transformer_nat.py From tensor2tensor with Apache License 2.0
def decode_transformer(encoder_output, encoder_decoder_attention_bias,
                       targets, hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output
Example #3
Source File: transformer_nat.py From training_results_v0.5 with Apache License 2.0
def decode_transformer(encoder_output, encoder_decoder_attention_bias,
                       targets, hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output
Example #4
Source File: transformer_nat.py From BERT with Apache License 2.0
def decode_transformer(encoder_output, encoder_decoder_attention_bias,
                       targets, hparams, name):
  """Original Transformer decoder."""
  with tf.variable_scope(name):
    targets = common_layers.flatten4d3d(targets)

    decoder_input, decoder_self_bias = (
        transformer.transformer_prepare_decoder(targets, hparams))

    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, decoder_self_bias,
        encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, axis=2)
    decoder_output_shape = common_layers.shape_list(decoder_output)
    decoder_output = tf.reshape(
        decoder_output, [decoder_output_shape[0], -1, 1, hparams.hidden_size])
    # Expand since t2t expects 4d tensors.
    return decoder_output
Example #5
Source File: transformer_moe.py From fine-lm with MIT License
def _prepare_decoder(self, targets):
  """Process the transformer decoder input."""
  targets = common_layers.flatten4d3d(targets)

  output = transformer.transformer_prepare_decoder(
      targets, self._hparams, features=None,
  )
  deco_input, deco_self_attention_bias = output

  deco_input = tf.nn.dropout(
      deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
  )

  return deco_input, deco_self_attention_bias
Example #6
Source File: transformer_revnet.py From fine-lm with MIT License
def body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features["inputs"]
  target_space = features["target_space_id"]

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_self_attention_bias,
   encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
       inputs, target_space, hparams))
  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams)

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer_revnet_encoder(
      encoder_input, encoder_self_attention_bias, hparams)

  decoder_output = transformer_revnet_decoder(
      decoder_input, encoder_output, decoder_self_attention_bias,
      encoder_decoder_attention_bias, hparams)
  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
Example #7
Source File: transformer_revnet.py From tensor2tensor with Apache License 2.0
def body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features["inputs"]
  target_space = features["target_space_id"]

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_self_attention_bias,
   encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
       inputs, target_space, hparams))
  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams)

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer_revnet_encoder(
      encoder_input, encoder_self_attention_bias, hparams)

  decoder_output = transformer_revnet_decoder(
      decoder_input, encoder_output, decoder_self_attention_bias,
      encoder_decoder_attention_bias, hparams)
  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
Example #8
Source File: transformer_moe.py From tensor2tensor with Apache License 2.0
def _prepare_decoder(self, targets):
  """Process the transformer decoder input."""
  targets = common_layers.flatten4d3d(targets)

  output = transformer.transformer_prepare_decoder(
      targets, self._hparams, features=None,
  )
  deco_input, deco_self_attention_bias = output

  deco_input = tf.nn.dropout(
      deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
  )

  return deco_input, deco_self_attention_bias
Example #9
Source File: transformer_moe.py From training_results_v0.5 with Apache License 2.0
def _prepare_decoder(self, targets):
  """Process the transformer decoder input."""
  targets = common_layers.flatten4d3d(targets)

  output = transformer.transformer_prepare_decoder(
      targets, self._hparams, features=None,
  )
  deco_input, deco_self_attention_bias = output

  deco_input = tf.nn.dropout(
      deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
  )

  return deco_input, deco_self_attention_bias
Example #10
Source File: transformer_revnet.py From BERT with Apache License 2.0
def body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features["inputs"]
  target_space = features["target_space_id"]

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_self_attention_bias,
   encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
       inputs, target_space, hparams))
  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams)

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer_revnet_encoder(
      encoder_input, encoder_self_attention_bias, hparams)

  decoder_output = transformer_revnet_decoder(
      decoder_input, encoder_output, decoder_self_attention_bias,
      encoder_decoder_attention_bias, hparams)
  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
Example #11
Source File: transformer_moe.py From BERT with Apache License 2.0
def _prepare_decoder(self, targets):
  """Process the transformer decoder input."""
  targets = common_layers.flatten4d3d(targets)

  output = transformer.transformer_prepare_decoder(
      targets, self._hparams, features=None,
  )
  deco_input, deco_self_attention_bias = output

  deco_input = tf.nn.dropout(
      deco_input, 1.0 - self._hparams.layer_prepostprocess_dropout
  )

  return deco_input, deco_self_attention_bias
Example #12
Source File: transformer_revnet.py From training_results_v0.5 with Apache License 2.0
def body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features["inputs"]
  target_space = features["target_space_id"]

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_self_attention_bias,
   encoder_decoder_attention_bias) = (transformer.transformer_prepare_encoder(
       inputs, target_space, hparams))
  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams)

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer_revnet_encoder(
      encoder_input, encoder_self_attention_bias, hparams)

  decoder_output = transformer_revnet_decoder(
      decoder_input, encoder_output, decoder_self_attention_bias,
      encoder_decoder_attention_bias, hparams)
  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
Example #13
Source File: transformer_moe.py From training_results_v0.5 with Apache License 2.0
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True

  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layers will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")
  return hparams
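Since the comment above flags pos, proximity_bias, and causal_decoder_self_attention as the hparams consumed by transformer_prepare_decoder(), here is a short sketch of overriding them (the override values are illustrative, not taken from the example):

hparams = transformer_moe_base()
hparams.pos = "none"  # skip the timing (positional) signal
hparams.proximity_bias = True  # bias self-attention toward nearby positions
hparams.causal_decoder_self_attention = True  # keep the lower-triangular mask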
Example #14
Source File: transformer_moe.py From BERT with Apache License 2.0
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True

  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layers will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")
  return hparams
Example #15
Source File: transformer.py From BERT with Apache License 2.0
def __init__(self, features_info=None, input_names=None, target_names=None,
             hidden_size=512, filter_size=2048):
  super(Transformer, self).__init__()
  # TODO(lukaszkaiser): gin'ify and split into encoder/decoder classes.
  self._has_input = True if input_names else False
  self._input_name = input_names[0]
  self._target_name = target_names[0]
  try:
    target_vocab_size = features_info[self._target_name].num_classes
  except AttributeError:
    target_vocab_size = features_info[self._target_name].encoder.vocab_size
  hparams = transformer.transformer_base()
  hparams.hidden_size = hidden_size
  hparams.filter_size = filter_size
  # Now the model.
  self._embedding = tf.keras.layers.Embedding(
      target_vocab_size, hidden_size, mask_zero=True)

  def transformer_encoder(inputs, features):
    return transformer.transformer_encode(
        transformer_layers.transformer_encoder, inputs, None, hparams,
        features=features)

  def transformer_prepare_decoder(targets, features):
    return transformer.transformer_prepare_decoder(targets, hparams, features)

  def transformer_decoder(decoder_input, encoder_output,
                          encoder_decoder_attention_bias,
                          decoder_self_attention_bias, features):
    return transformer.transformer_decode(
        transformer.transformer_decoder, decoder_input, encoder_output,
        encoder_decoder_attention_bias, decoder_self_attention_bias,
        hparams,
        nonpadding=transformer.features_to_nonpadding(features, "targets"))

  if self._has_input:
    self._encoder = keras_utils.FunctionLayer(transformer_encoder)
  self._prepare_decoder = keras_utils.FunctionLayer(
      transformer_prepare_decoder)
  self._decoder = keras_utils.FunctionLayer(transformer_decoder)
  self._logits = tf.keras.layers.Dense(
      target_vocab_size, activation=None)
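A hedged sketch of how these wrappers might be chained in a forward pass (hypothetical: the call method below is not part of the example, and it assumes keras_utils.FunctionLayer simply applies the wrapped function to its arguments):

def call(self, features):  # hypothetical forward pass
  inputs = self._embedding(features[self._input_name])
  targets = self._embedding(features[self._target_name])
  encoder_output, encoder_decoder_attention_bias = self._encoder(
      inputs, features)
  decoder_input, decoder_self_attention_bias = self._prepare_decoder(
      targets, features)
  decoder_output = self._decoder(
      decoder_input, encoder_output, encoder_decoder_attention_bias,
      decoder_self_attention_bias, features)
  return self._logits(decoder_output)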
Example #16
Source File: transformer_moe.py From tensor2tensor with Apache License 2.0
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True

  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layers will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")
  return hparams
Example #17
Source File: transformer_moe.py From fine-lm with MIT License
def transformer_moe_base():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 2001
  hparams.max_input_seq_length = 2000
  hparams.max_target_seq_length = 2000
  hparams.dropout = 0.0
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 2000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 5
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.0
  hparams.shared_embedding_and_softmax_weights = True

  # According to noam, ("n", "da") seems better for harder-to-learn models
  hparams.layer_preprocess_sequence = "n"
  hparams.layer_postprocess_sequence = "da"

  # Hparams used by transformer_prepare_decoder() function
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)

  hparams = common_attention.add_standard_attention_hparams(hparams)

  # Decoder layers type. If set, num_decoder_layers parameter will be ignored
  # and the number of decoder layers will be deduced from the string
  # See top file comment for example of usage
  hparams.add_hparam("layer_types", "")
  # Default attention type (ex: a, loc, red,...) and feed-forward type (ex: fc,
  # sep, moe,...)
  hparams.add_hparam("default_att", "a")
  hparams.add_hparam("default_ff", "fc")
  return hparams