Python torch.nn.LayerNorm() Examples
The following are 30 code examples of torch.nn.LayerNorm(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.nn, or try the search function.
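Before the project examples below, here is a minimal, self-contained sketch of what nn.LayerNorm does: it normalizes over the trailing dimension(s) given by normalized_shape and, by default, applies a learned elementwise affine transform. The tensor sizes are arbitrary illustration values.

import torch
import torch.nn as nn

# Normalize the last dimension (size 64) of each token embedding.
ln = nn.LayerNorm(64)          # elementwise_affine=True by default
x = torch.randn(8, 16, 64)     # (batch, sequence, features)
y = ln(x)                      # same shape as x
print(y.shape)                 # torch.Size([8, 16, 64])

# LayerNorm can also normalize over several trailing dimensions at once,
# e.g. a (channels, height, width) block:
ln2d = nn.LayerNorm([3, 10, 10])
img = torch.randn(4, 3, 10, 10)
print(ln2d(img).shape)         # torch.Size([4, 3, 10, 10])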
Example #1
Source File: scorenet.py From ncsn with GNU General Public License v3.0 | 7 votes |
def __init__(self, config):
    super().__init__()
    self.config = config
    self.main = nn.Sequential(
        nn.Linear(10 * 10, 1024),
        nn.LayerNorm(1024),
        nn.ELU(),
        nn.Linear(1024, 1024),
        nn.LayerNorm(1024),
        nn.ELU(),
        nn.Linear(1024, 512),
        nn.LayerNorm(512),
        nn.ELU(),
        nn.Linear(512, 100),
        nn.LayerNorm(100)
    )
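The constructor above only defines its layers; the surrounding ncsn code (not shown here) is what feeds data through self.main. Purely as a hedged illustration of the Linear -> LayerNorm -> ELU stack it builds, an equivalent standalone stack can be exercised like this (the batch size and random input are made-up values):

import torch
import torch.nn as nn

# Same layer pattern as self.main above, built standalone for illustration.
mlp = nn.Sequential(
    nn.Linear(10 * 10, 1024), nn.LayerNorm(1024), nn.ELU(),
    nn.Linear(1024, 1024), nn.LayerNorm(1024), nn.ELU(),
    nn.Linear(1024, 512), nn.LayerNorm(512), nn.ELU(),
    nn.Linear(512, 100), nn.LayerNorm(100),
)

x = torch.randn(32, 100)   # 32 flattened 10x10 inputs
out = mlp(x)
print(out.shape)           # torch.Size([32, 100])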
Example #2
Source File: modules.py From dgl with Apache License 2.0 | 7 votes |
def __init__(self, in_feats, out_feats, activation, dropout,
             bias=True, use_pp=False, use_lynorm=True):
    super(GraphSAGELayer, self).__init__()
    # The input feature size gets doubled as we concatenated the original
    # features with the new features.
    self.linear = nn.Linear(2 * in_feats, out_feats, bias=bias)
    self.activation = activation
    self.use_pp = use_pp
    if dropout:
        self.dropout = nn.Dropout(p=dropout)
    else:
        self.dropout = 0.
    if use_lynorm:
        self.lynorm = nn.LayerNorm(out_feats, elementwise_affine=True)
    else:
        self.lynorm = lambda x: x
    self.reset_parameters()
Example #3
Source File: model.py From dgl with Apache License 2.0 | 6 votes |
def __init__(self, c, T, n, Lk, p, num_layers, control_str='TNTSTNTST'):
    super(STGCN_WAVE, self).__init__()
    self.control_str = control_str  # model structure controller
    self.num_layers = len(control_str)
    self.layers = []
    cnt = 0
    diapower = 0
    for i in range(self.num_layers):
        i_layer = control_str[i]
        if i_layer == 'T':  # Temporal Layer
            self.layers.append(TemporalConvLayer(c[cnt], c[cnt + 1], dia=2**diapower))
            diapower += 1
            cnt += 1
        if i_layer == 'S':  # Spatio Layer
            self.layers.append(SpatioConvLayer(c[cnt], Lk))
        if i_layer == 'N':  # Norm Layer
            self.layers.append(nn.LayerNorm([n, c[cnt]]))
    self.output = OutputLayer(c[cnt], T + 1 - 2**(diapower), n)
    for layer in self.layers:
        layer = layer.cuda()
Example #4
Source File: Transformer.py From ConvLab with MIT License | 6 votes |
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(AverageHeadAttention, self).__init__()

    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Linear(d_model, n_head * d_k)
    self.w_ks = nn.Linear(d_model, n_head * d_k)
    self.w_vs = nn.Linear(d_model, n_head * d_v)
    nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

    self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
    self.layer_norm = nn.LayerNorm(d_model)

    self.fc = nn.Linear(d_v, d_model)
    nn.init.xavier_normal_(self.fc.weight)

    self.dropout = nn.Dropout(dropout)
Example #5
Source File: Transformer.py From ConvLab with MIT License | 6 votes |
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(MultiHeadAttention, self).__init__()

    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Linear(d_model, n_head * d_k)
    self.w_ks = nn.Linear(d_model, n_head * d_k)
    self.w_vs = nn.Linear(d_model, n_head * d_v)
    nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

    self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
    self.layer_norm = nn.LayerNorm(d_model)

    self.fc = nn.Linear(n_head * d_v, d_model)
    nn.init.xavier_normal_(self.fc.weight)

    self.dropout = nn.Dropout(dropout)
Example #6
Source File: attention.py From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, d_model, d_k, d_v, h, dropout=.1, identity_map_reordering=False,
             can_be_stateful=False, attention_module=None, attention_module_kwargs=None):
    super(MultiHeadAttention, self).__init__()
    self.identity_map_reordering = identity_map_reordering
    if attention_module is not None:
        if attention_module_kwargs is not None:
            self.attention = attention_module(d_model=d_model, d_k=d_k, d_v=d_v, h=h,
                                              **attention_module_kwargs)
        else:
            self.attention = attention_module(d_model=d_model, d_k=d_k, d_v=d_v, h=h)
    else:
        self.attention = ScaledDotProductAttention(d_model=d_model, d_k=d_k, d_v=d_v, h=h)
    self.dropout = nn.Dropout(p=dropout)
    self.layer_norm = nn.LayerNorm(d_model)

    self.can_be_stateful = can_be_stateful
    if self.can_be_stateful:
        self.register_state('running_keys', torch.zeros((0, d_model)))
        self.register_state('running_values', torch.zeros((0, d_model)))
Example #7
Source File: encoder.py From TVQAplus with MIT License | 6 votes |
def __init__(self, n_conv, kernel_size=7, n_filters=128, dropout=0.1, num_heads=4):
    super(EncoderBlock, self).__init__()
    self.dropout = dropout
    self.n_conv = n_conv
    self.num_heads = num_heads
    self.position_encoding = PositionEncoding(n_filters=n_filters)
    self.layer_norm = nn.ModuleList([nn.LayerNorm(n_filters) for _ in range(n_conv)])
    self.final_layer_norm = nn.LayerNorm(n_filters)
    self.conv = nn.ModuleList([
        DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True)
        for _ in range(n_conv)])
    if self.num_heads != 0:
        self.multi_head_attn = MultiHeadedAttention(nh=num_heads, d_model=n_filters)
        self.attn_layer_norm = nn.LayerNorm(n_filters)
Example #8
Source File: encoder.py From pytorch_sac_ae with MIT License | 6 votes |
def __init__(self, obs_shape, feature_dim, num_layers=2, num_filters=32):
    super().__init__()

    assert len(obs_shape) == 3

    self.feature_dim = feature_dim
    self.num_layers = num_layers

    self.convs = nn.ModuleList(
        [nn.Conv2d(obs_shape[0], num_filters, 3, stride=2)]
    )
    for i in range(num_layers - 1):
        self.convs.append(nn.Conv2d(num_filters, num_filters, 3, stride=1))

    out_dim = OUT_DIM[num_layers]
    self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim)
    self.ln = nn.LayerNorm(self.feature_dim)

    self.outputs = dict()
Example #9
Source File: meta.py From ScenarioMeta with MIT License | 6 votes |
def __init__(self, hidden_size, layer_norm=False, input_gate=True, forget_gate=True):
    nn.Module.__init__(self)
    self.hidden_size = hidden_size
    # gradient(2), param(2), loss
    self.lstm = nn.LSTMCell(input_size=5, hidden_size=hidden_size)
    if layer_norm:
        self.layer_norm = nn.LayerNorm(hidden_size)
    else:
        self.layer_norm = None
    self.input_gate = input_gate
    self.forget_gate = forget_gate
    if self.input_gate:
        self.lr_layer = nn.Linear(hidden_size, 1)
        self.lrs = []
    else:
        self.output_layer = nn.Linear(hidden_size, 1)
        self.dets = []
    if forget_gate:
        self.fg_layer = nn.Linear(hidden_size, 1)
        self.fgs = []
    self.h_0 = nn.Parameter(torch.randn((hidden_size,), requires_grad=True))
    self.c_0 = nn.Parameter(torch.randn((hidden_size,), requires_grad=True))
Example #10
Source File: transformer_blocks.py From Character-Level-Language-Modeling-with-Deeper-Self-Attention-pytorch with MIT License | 6 votes |
def __init__(self, input_size, inner_linear, inner_groups=1, layer_norm=True,
             weight_norm=False, dropout=0, batch_first=True):
    super(AverageNetwork, self).__init__()
    wn_func = wn if weight_norm else lambda x: x
    self.input_size = input_size
    self.time_step = 0
    self.batch_dim, self.time_dim = (0, 1) if batch_first else (1, 0)
    self.gates = nn.Sequential(
        wn_func(nn.Linear(2 * input_size, 2 * input_size)),
        nn.Sigmoid()
    )
    if layer_norm:
        self.lnorm = nn.LayerNorm(input_size)
    self.fc = nn.Sequential(wn_func(Linear(input_size, inner_linear, groups=inner_groups)),
                            nn.ReLU(inplace=True),
                            nn.Dropout(dropout),
                            wn_func(Linear(inner_linear, input_size, groups=inner_groups)))
Example #11
Source File: transformer.py From crosentgec with GNU General Public License v3.0 | 6 votes |
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim, args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    self.encoder_attn = MultiheadAttention(
        self.embed_dim, args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(3)])
Example #12
Source File: absa_layer.py From BERT-E2E-ABSA with Apache License 2.0 | 6 votes |
def __init__(self, input_size, hidden_size, bidirectional=True):
    """
    :param input_size:
    :param hidden_size:
    :param bidirectional:
    """
    super(LSTM, self).__init__()
    self.input_size = input_size
    if bidirectional:
        self.hidden_size = hidden_size // 2
    else:
        self.hidden_size = hidden_size
    self.bidirectional = bidirectional
    self.LNx = nn.LayerNorm(4 * self.hidden_size)
    self.LNh = nn.LayerNorm(4 * self.hidden_size)
    self.LNc = nn.LayerNorm(self.hidden_size)
    self.Wx = nn.Linear(in_features=self.input_size, out_features=4 * self.hidden_size, bias=True)
    self.Wh = nn.Linear(in_features=self.hidden_size, out_features=4 * self.hidden_size, bias=True)
Example #13
Source File: utils.py From cortex with BSD 3-Clause "New" or "Revised" License | 6 votes |
def finish_layer_1d(models, name, dim_out,
                    dropout=False, layer_norm=False, batch_norm=False,
                    nonlinearity=None):
    if layer_norm and batch_norm:
        logger.warning('Ignoring layer_norm because batch_norm is True')

    if dropout:
        models.add_module(name + '_do', nn.Dropout(p=dropout))
    if layer_norm:
        models.add_module(name + '_ln', nn.LayerNorm(dim_out))
    elif batch_norm:
        models.add_module(name + '_bn', nn.BatchNorm1d(dim_out))
    if nonlinearity:
        nonlinearity = get_nonlinearity(nonlinearity)
        models.add_module(
            '{}_{}'.format(name, nonlinearity.__class__.__name__),
            nonlinearity)
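For context, `models` here is any module that exposes add_module (for example an nn.Sequential), while `logger` and `get_nonlinearity` are cortex helpers not shown in this snippet. A hypothetical call that appends only a LayerNorm, and so avoids those helpers, might look like this (the names and sizes are made up for illustration):

import torch.nn as nn

block = nn.Sequential()
block.add_module('fc', nn.Linear(64, 128))

# Tack a LayerNorm onto the output of the 'fc' layer.
finish_layer_1d(block, 'fc', 128, layer_norm=True)
# block now contains modules 'fc' and 'fc_ln'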
Example #14
Source File: utils.py From cortex with BSD 3-Clause "New" or "Revised" License | 6 votes |
def finish_layer_2d(models, name, dim_x, dim_y, dim_out,
                    dropout=False, layer_norm=False, batch_norm=False,
                    nonlinearity=None):
    if layer_norm and batch_norm:
        logger.warning('Ignoring layer_norm because batch_norm is True')

    if dropout:
        models.add_module(name + '_do', nn.Dropout2d(p=dropout))
    if layer_norm:
        models.add_module(name + '_ln', nn.LayerNorm((dim_out, dim_x, dim_y)))
    elif batch_norm:
        models.add_module(name + '_bn', nn.BatchNorm2d(dim_out))
    if nonlinearity:
        nonlinearity = get_nonlinearity(nonlinearity)
        models.add_module(
            '{}_{}'.format(name, nonlinearity.__class__.__name__),
            nonlinearity)
Example #15
Source File: absa_layer.py From BERT-E2E-ABSA with Apache License 2.0 | 6 votes |
def __init__(self, input_size, hidden_size, bidirectional=True):
    """
    :param input_size:
    :param hidden_size:
    :param bidirectional:
    """
    super(GRU, self).__init__()
    self.input_size = input_size
    if bidirectional:
        self.hidden_size = hidden_size // 2
    else:
        self.hidden_size = hidden_size
    self.bidirectional = bidirectional
    self.Wxrz = nn.Linear(in_features=self.input_size, out_features=2 * self.hidden_size, bias=True)
    self.Whrz = nn.Linear(in_features=self.hidden_size, out_features=2 * self.hidden_size, bias=True)
    self.Wxn = nn.Linear(in_features=self.input_size, out_features=self.hidden_size, bias=True)
    self.Whn = nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=True)
    self.LNx1 = nn.LayerNorm(2 * self.hidden_size)
    self.LNh1 = nn.LayerNorm(2 * self.hidden_size)
    self.LNx2 = nn.LayerNorm(self.hidden_size)
    self.LNh2 = nn.LayerNorm(self.hidden_size)
Example #16
Source File: models.py From cerl with Apache License 2.0 | 6 votes |
def __init__(self, state_dim, action_dim, wwid):
    super(Actor, self).__init__()
    self.wwid = torch.Tensor([wwid])

    l1 = 400; l2 = 300

    # Construct Hidden Layer 1
    self.f1 = nn.Linear(state_dim, l1)
    self.ln1 = nn.LayerNorm(l1)

    # Hidden Layer 2
    self.f2 = nn.Linear(l1, l2)
    self.ln2 = nn.LayerNorm(l2)

    # Out
    self.w_out = nn.Linear(l2, action_dim)
Example #17
Source File: module.py From Transformer-TTS with MIT License | 6 votes |
def __init__(self, num_hidden, h=4):
    """
    :param num_hidden: dimension of hidden
    :param h: num of heads
    """
    super(Attention, self).__init__()

    self.num_hidden = num_hidden
    self.num_hidden_per_attn = num_hidden // h
    self.h = h

    self.key = Linear(num_hidden, num_hidden, bias=False)
    self.value = Linear(num_hidden, num_hidden, bias=False)
    self.query = Linear(num_hidden, num_hidden, bias=False)

    self.multihead = MultiheadAttention(self.num_hidden_per_attn)

    self.residual_dropout = nn.Dropout(p=0.1)

    self.final_linear = Linear(num_hidden * 2, num_hidden)

    self.layer_norm_1 = nn.LayerNorm(num_hidden)
Example #18
Source File: transformer.py From ITDD with MIT License | 5 votes |
def __init__(self, num_layers, d_model, heads, d_ff, attn_type,
             copy_attn, self_attn_type, dropout, embeddings):
    super(TransformerDecoder, self).__init__()

    # Basic attributes.
    self.decoder_type = 'transformer'
    self.num_layers = num_layers
    self.embeddings = embeddings
    self.self_attn_type = self_attn_type

    # Decoder State
    self.state = {}

    # Build TransformerDecoder.
    self.transformer_layers = nn.ModuleList(
        [TransformerDecoderLayer(d_model, heads, d_ff, dropout,
                                 self_attn_type=self_attn_type)
         for _ in range(num_layers)])

    # TransformerDecoder has its own attention mechanism.
    # Set up a separated copy attention layer, if needed.
    self._copy = False
    if copy_attn:
        self.copy_attn = onmt.modules.GlobalAttention(
            d_model, attn_type=attn_type)
        self._copy = True
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
Example #19
Source File: transformer.py From ITDD with MIT License | 5 votes |
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings):
    super(TransformerEncoder, self).__init__()

    self.num_layers = num_layers
    self.embeddings = embeddings
    self.transformer = nn.ModuleList(
        [TransformerEncoderLayer(d_model, heads, d_ff, dropout)
         for _ in range(num_layers)])
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
Example #20
Source File: transformer.py From ITDD with MIT License | 5 votes |
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
Example #21
Source File: position_ffn.py From ITDD with MIT License | 5 votes |
def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout_1 = nn.Dropout(dropout)
    self.relu = nn.ReLU()
    self.dropout_2 = nn.Dropout(dropout)
Example #22
Source File: utils.py From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, d_model=512, d_ff=2048, dropout=.1, identity_map_reordering=False):
    super(PositionWiseFeedForward, self).__init__()
    self.identity_map_reordering = identity_map_reordering
    self.fc1 = nn.Linear(d_model, d_ff)
    self.fc2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(p=dropout)
    self.dropout_2 = nn.Dropout(p=dropout)
    self.layer_norm = nn.LayerNorm(d_model)
Example #23
Source File: layer_norm.py From fairseq with MIT License | 5 votes |
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    if not export and torch.cuda.is_available() and has_fused_layernorm:
        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
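The `has_fused_layernorm` flag and `FusedLayerNorm` referenced above are defined elsewhere in fairseq and are not shown in this snippet; they presumably come from a guarded optional import. A minimal sketch of how such a guard is commonly written, assuming NVIDIA apex provides the fused kernel:

# Sketch of an optional-dependency guard (not the verbatim fairseq code).
try:
    from apex.normalization import FusedLayerNorm
    has_fused_layernorm = True
except ImportError:
    has_fused_layernorm = False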
Example #24
Source File: dummy_model.py From fairseq with MIT License | 5 votes |
def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
    super().__init__(Dictionary())
    self.embed = nn.Embedding(
        num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
    )
    self.layers_a = nn.ModuleList([
        nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 3 * embed_dim),  # q, k, v input projection
            nn.Linear(3 * embed_dim, embed_dim),  # skip self-attention
            nn.Linear(embed_dim, embed_dim),      # output projection
            nn.Dropout(),
        )
        for i in range(num_layers)
    ])
    self.layers_b = nn.ModuleList([
        nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 4 * embed_dim),  # FFN
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),  # FFN
            nn.Dropout(0.1),
        )
        for i in range(num_layers)
    ])
    self.out_proj = nn.Linear(embed_dim, num_embed)
Example #25
Source File: vggtransformer.py From fairseq with MIT License | 5 votes |
def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    return m


# seq2seq models
Example #26
Source File: encoders.py From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, N, padding_idx, d_in=2048, **kwargs):
    super(MemoryAugmentedEncoder, self).__init__(N, padding_idx, **kwargs)
    self.fc = nn.Linear(d_in, self.d_model)
    self.dropout = nn.Dropout(p=self.dropout)
    self.layer_norm = nn.LayerNorm(self.d_model)
Example #27
Source File: module.py From Transformer-TTS with MIT License | 5 votes |
def __init__(self, num_hidden):
    """
    :param num_hidden: dimension of hidden
    """
    super(FFN, self).__init__()
    self.w_1 = Conv(num_hidden, num_hidden * 4, kernel_size=1, w_init='relu')
    self.w_2 = Conv(num_hidden * 4, num_hidden, kernel_size=1)
    self.dropout = nn.Dropout(p=0.1)
    self.layer_norm = nn.LayerNorm(num_hidden)
Example #28
Source File: layers.py From dgl with Apache License 2.0 | 5 votes |
def __init__(self, size, dropout):
    super(SubLayerWrapper, self).__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)
Example #29
Source File: scorenet.py From ncsn with GNU General Public License v3.0 | 5 votes |
def __init__(self, config):
    super().__init__()
    self.config = config
    nef = config.model.nef * 4
    self.u_net = nn.Sequential(
        # input is (nc) x 10 x 10
        nn.Conv2d(config.data.channels, nef, 4, stride=2, padding=1),
        # nn.Softplus(),
        nn.GroupNorm(4, nef),
        nn.ELU(),
        # state size. (nef) x 6 x 6
        nn.Conv2d(nef, nef * 2, 3, stride=1, padding=1),
        nn.GroupNorm(4, nef * 2),
        # nn.Softplus(),
        nn.ELU(),
        # state size. (nef*2) x 6 x 6
        nn.ConvTranspose2d(nef * 2, nef, 3, stride=1, padding=1),
        nn.GroupNorm(4, nef),
        # nn.Softplus(),
        nn.ELU(),
        # state size. (nef*2) x 6 x 6
        nn.ConvTranspose2d(nef, config.data.channels, 4, stride=2, padding=1),
        # nn.Softplus(),
        nn.ELU(),
    )

    self.fc = nn.Sequential(
        nn.Linear(config.data.channels * 10 ** 2, 256),
        nn.LayerNorm(256),
        nn.ELU(),
        nn.Linear(256, config.data.channels * 10 ** 2)
    )
Example #30
Source File: ktransformer.py From ITDD with MIT License | 5 votes |
def __init__(self, d_model, heads, d_ff, dropout):
    super(HTransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.knowledge_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)