Python torch.nn.LayerNorm() Examples
The following are 30 code examples of torch.nn.LayerNorm(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.nn, or try the search function.
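Before the project examples below, here is a minimal, self-contained sketch of what nn.LayerNorm does: it normalizes over the trailing dimension(s) given by normalized_shape and, by default, applies a learned elementwise affine transform. The tensor sizes are arbitrary illustration values.

import torch
import torch.nn as nn

# Normalize the last dimension (size 64) of each token embedding.
ln = nn.LayerNorm(64)          # elementwise_affine=True by default
x = torch.randn(8, 16, 64)     # (batch, sequence, features)
y = ln(x)                      # same shape as x
print(y.shape)                 # torch.Size([8, 16, 64])

# LayerNorm can also normalize over several trailing dimensions at once,
# e.g. a (channels, height, width) block:
ln2d = nn.LayerNorm([3, 10, 10])
img = torch.randn(4, 3, 10, 10)
print(ln2d(img).shape)         # torch.Size([4, 3, 10, 10])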
Example #1
Source File: scorenet.py From ncsn with GNU General Public License v3.0 | 7 votes |
def __init__(self, config):
    super().__init__()
    self.config = config
    self.main = nn.Sequential(
        nn.Linear(10 * 10, 1024),
        nn.LayerNorm(1024),
        nn.ELU(),
        nn.Linear(1024, 1024),
        nn.LayerNorm(1024),
        nn.ELU(),
        nn.Linear(1024, 512),
        nn.LayerNorm(512),
        nn.ELU(),
        nn.Linear(512, 100),
        nn.LayerNorm(100)
    )
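The constructor above only defines its layers; the surrounding ncsn code (not shown here) is what feeds data through self.main. Purely as a hedged illustration of the Linear -> LayerNorm -> ELU stack it builds, an equivalent standalone stack can be exercised like this (the batch size and random input are made-up values):

import torch
import torch.nn as nn

# Same layer pattern as self.main above, built standalone for illustration.
mlp = nn.Sequential(
    nn.Linear(10 * 10, 1024), nn.LayerNorm(1024), nn.ELU(),
    nn.Linear(1024, 1024), nn.LayerNorm(1024), nn.ELU(),
    nn.Linear(1024, 512), nn.LayerNorm(512), nn.ELU(),
    nn.Linear(512, 100), nn.LayerNorm(100),
)

x = torch.randn(32, 100)   # 32 flattened 10x10 inputs
out = mlp(x)
print(out.shape)           # torch.Size([32, 100])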
Example #2
Source File: modules.py From dgl with Apache License 2.0 | 7 votes |
def __init__(self, in_feats, out_feats, activation, dropout,
             bias=True, use_pp=False, use_lynorm=True):
    super(GraphSAGELayer, self).__init__()
    # The input feature size gets doubled as we concatenated the original
    # features with the new features.
    self.linear = nn.Linear(2 * in_feats, out_feats, bias=bias)
    self.activation = activation
    self.use_pp = use_pp
    if dropout:
        self.dropout = nn.Dropout(p=dropout)
    else:
        self.dropout = 0.
    if use_lynorm:
        self.lynorm = nn.LayerNorm(out_feats, elementwise_affine=True)
    else:
        self.lynorm = lambda x: x
    self.reset_parameters()
Example #3
Source File: model.py From dgl with Apache License 2.0 | 6 votes |
def __init__(self, c, T, n, Lk, p, num_layers, control_str='TNTSTNTST'):
    super(STGCN_WAVE, self).__init__()
    self.control_str = control_str  # model structure controller
    self.num_layers = len(control_str)
    self.layers = []
    cnt = 0
    diapower = 0
    for i in range(self.num_layers):
        i_layer = control_str[i]
        if i_layer == 'T':  # Temporal Layer
            self.layers.append(TemporalConvLayer(c[cnt], c[cnt + 1], dia=2**diapower))
            diapower += 1
            cnt += 1
        if i_layer == 'S':  # Spatio Layer
            self.layers.append(SpatioConvLayer(c[cnt], Lk))
        if i_layer == 'N':  # Norm Layer
            self.layers.append(nn.LayerNorm([n, c[cnt]]))
    self.output = OutputLayer(c[cnt], T + 1 - 2**(diapower), n)
    for layer in self.layers:
        layer = layer.cuda()
Example #4
Source File: Transformer.py From ConvLab with MIT License | 6 votes |
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(AverageHeadAttention, self).__init__()

    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Linear(d_model, n_head * d_k)
    self.w_ks = nn.Linear(d_model, n_head * d_k)
    self.w_vs = nn.Linear(d_model, n_head * d_v)
    nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

    self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
    self.layer_norm = nn.LayerNorm(d_model)

    self.fc = nn.Linear(d_v, d_model)
    nn.init.xavier_normal_(self.fc.weight)

    self.dropout = nn.Dropout(dropout)
Example #5
Source File: Transformer.py From ConvLab with MIT License | 6 votes |
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(MultiHeadAttention, self).__init__()

    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Linear(d_model, n_head * d_k)
    self.w_ks = nn.Linear(d_model, n_head * d_k)
    self.w_vs = nn.Linear(d_model, n_head * d_v)
    nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

    self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
    self.layer_norm = nn.LayerNorm(d_model)

    self.fc = nn.Linear(n_head * d_v, d_model)
    nn.init.xavier_normal_(self.fc.weight)

    self.dropout = nn.Dropout(dropout)
Example #6
Source File: attention.py From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, d_model, d_k, d_v, h, dropout=.1, identity_map_reordering=False,
             can_be_stateful=False, attention_module=None, attention_module_kwargs=None):
    super(MultiHeadAttention, self).__init__()
    self.identity_map_reordering = identity_map_reordering
    if attention_module is not None:
        if attention_module_kwargs is not None:
            self.attention = attention_module(d_model=d_model, d_k=d_k, d_v=d_v, h=h,
                                              **attention_module_kwargs)
        else:
            self.attention = attention_module(d_model=d_model, d_k=d_k, d_v=d_v, h=h)
    else:
        self.attention = ScaledDotProductAttention(d_model=d_model, d_k=d_k, d_v=d_v, h=h)
    self.dropout = nn.Dropout(p=dropout)
    self.layer_norm = nn.LayerNorm(d_model)

    self.can_be_stateful = can_be_stateful
    if self.can_be_stateful:
        self.register_state('running_keys', torch.zeros((0, d_model)))
        self.register_state('running_values', torch.zeros((0, d_model)))
Example #7
Source File: encoder.py From TVQAplus with MIT License | 6 votes |
def __init__(self, n_conv, kernel_size=7, n_filters=128, dropout=0.1, num_heads=4):
    super(EncoderBlock, self).__init__()
    self.dropout = dropout
    self.n_conv = n_conv
    self.num_heads = num_heads
    self.position_encoding = PositionEncoding(n_filters=n_filters)
    self.layer_norm = nn.ModuleList([nn.LayerNorm(n_filters) for _ in range(n_conv)])
    self.final_layer_norm = nn.LayerNorm(n_filters)
    self.conv = nn.ModuleList([
        DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True)
        for _ in range(n_conv)])
    if self.num_heads != 0:
        self.multi_head_attn = MultiHeadedAttention(nh=num_heads, d_model=n_filters)
        self.attn_layer_norm = nn.LayerNorm(n_filters)
Example #8
Source File: encoder.py From pytorch_sac_ae with MIT License | 6 votes |
def __init__(self, obs_shape, feature_dim, num_layers=2, num_filters=32):
    super().__init__()

    assert len(obs_shape) == 3

    self.feature_dim = feature_dim
    self.num_layers = num_layers

    self.convs = nn.ModuleList(
        [nn.Conv2d(obs_shape[0], num_filters, 3, stride=2)]
    )
    for i in range(num_layers - 1):
        self.convs.append(nn.Conv2d(num_filters, num_filters, 3, stride=1))

    out_dim = OUT_DIM[num_layers]
    self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim)
    self.ln = nn.LayerNorm(self.feature_dim)

    self.outputs = dict()
Example #9
Source File: meta.py From ScenarioMeta with MIT License | 6 votes |
def __init__(self, hidden_size, layer_norm=False, input_gate=True, forget_gate=True):
    nn.Module.__init__(self)
    self.hidden_size = hidden_size
    # gradient(2), param(2), loss
    self.lstm = nn.LSTMCell(input_size=5, hidden_size=hidden_size)
    if layer_norm:
        self.layer_norm = nn.LayerNorm(hidden_size)
    else:
        self.layer_norm = None
    self.input_gate = input_gate
    self.forget_gate = forget_gate
    if self.input_gate:
        self.lr_layer = nn.Linear(hidden_size, 1)
        self.lrs = []
    else:
        self.output_layer = nn.Linear(hidden_size, 1)
        self.dets = []
    if forget_gate:
        self.fg_layer = nn.Linear(hidden_size, 1)
        self.fgs = []
    self.h_0 = nn.Parameter(torch.randn((hidden_size,), requires_grad=True))
    self.c_0 = nn.Parameter(torch.randn((hidden_size,), requires_grad=True))
Example #10
Source File: transformer_blocks.py From Character-Level-Language-Modeling-with-Deeper-Self-Attention-pytorch with MIT License | 6 votes |
def __init__(self, input_size, inner_linear, inner_groups=1, layer_norm=True,
             weight_norm=False, dropout=0, batch_first=True):
    super(AverageNetwork, self).__init__()
    wn_func = wn if weight_norm else lambda x: x
    self.input_size = input_size
    self.time_step = 0
    self.batch_dim, self.time_dim = (0, 1) if batch_first else (1, 0)
    self.gates = nn.Sequential(
        wn_func(nn.Linear(2 * input_size, 2 * input_size)),
        nn.Sigmoid()
    )
    if layer_norm:
        self.lnorm = nn.LayerNorm(input_size)
    self.fc = nn.Sequential(wn_func(Linear(input_size, inner_linear, groups=inner_groups)),
                            nn.ReLU(inplace=True),
                            nn.Dropout(dropout),
                            wn_func(Linear(inner_linear, input_size, groups=inner_groups)))
Example #11
Source File: transformer.py From crosentgec with GNU General Public License v3.0 | 6 votes |
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim, args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.normalize_before = args.decoder_normalize_before
    self.encoder_attn = MultiheadAttention(
        self.embed_dim, args.decoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(3)])
Example #12
Source File: absa_layer.py From BERT-E2E-ABSA with Apache License 2.0 | 6 votes |
def __init__(self, input_size, hidden_size, bidirectional=True):
    """
    :param input_size:
    :param hidden_size:
    :param bidirectional:
    """
    super(LSTM, self).__init__()
    self.input_size = input_size
    if bidirectional:
        self.hidden_size = hidden_size // 2
    else:
        self.hidden_size = hidden_size
    self.bidirectional = bidirectional
    self.LNx = nn.LayerNorm(4 * self.hidden_size)
    self.LNh = nn.LayerNorm(4 * self.hidden_size)
    self.LNc = nn.LayerNorm(self.hidden_size)
    self.Wx = nn.Linear(in_features=self.input_size, out_features=4 * self.hidden_size, bias=True)
    self.Wh = nn.Linear(in_features=self.hidden_size, out_features=4 * self.hidden_size, bias=True)
Example #13
Source File: utils.py From cortex with BSD 3-Clause "New" or "Revised" License | 6 votes |
def finish_layer_1d(models, name, dim_out,
                    dropout=False, layer_norm=False, batch_norm=False,
                    nonlinearity=None):
    if layer_norm and batch_norm:
        logger.warning('Ignoring layer_norm because batch_norm is True')

    if dropout:
        models.add_module(name + '_do', nn.Dropout(p=dropout))
    if layer_norm:
        models.add_module(name + '_ln', nn.LayerNorm(dim_out))
    elif batch_norm:
        models.add_module(name + '_bn', nn.BatchNorm1d(dim_out))
    if nonlinearity:
        nonlinearity = get_nonlinearity(nonlinearity)
        models.add_module(
            '{}_{}'.format(name, nonlinearity.__class__.__name__),
            nonlinearity)
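For context, `models` here is any module that exposes add_module (for example an nn.Sequential), while `logger` and `get_nonlinearity` are cortex helpers not shown in this snippet. A hypothetical call that appends only a LayerNorm, and so avoids those helpers, might look like this (the names and sizes are made up for illustration):

import torch.nn as nn

block = nn.Sequential()
block.add_module('fc', nn.Linear(64, 128))

# Tack a LayerNorm onto the output of the 'fc' layer.
finish_layer_1d(block, 'fc', 128, layer_norm=True)
# block now contains modules 'fc' and 'fc_ln'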
Example #14
Source File: utils.py From cortex with BSD 3-Clause "New" or "Revised" License | 6 votes |
def finish_layer_2d(models, name, dim_x, dim_y, dim_out,
                    dropout=False, layer_norm=False, batch_norm=False,
                    nonlinearity=None):
    if layer_norm and batch_norm:
        logger.warning('Ignoring layer_norm because batch_norm is True')

    if dropout:
        models.add_module(name + '_do', nn.Dropout2d(p=dropout))
    if layer_norm:
        models.add_module(name + '_ln', nn.LayerNorm((dim_out, dim_x, dim_y)))
    elif batch_norm:
        models.add_module(name + '_bn', nn.BatchNorm2d(dim_out))
    if nonlinearity:
        nonlinearity = get_nonlinearity(nonlinearity)
        models.add_module(
            '{}_{}'.format(name, nonlinearity.__class__.__name__),
            nonlinearity)
Example #15
Source File: absa_layer.py From BERT-E2E-ABSA with Apache License 2.0 | 6 votes |
def __init__(self, input_size, hidden_size, bidirectional=True):
    """
    :param input_size:
    :param hidden_size:
    :param bidirectional:
    """
    super(GRU, self).__init__()
    self.input_size = input_size
    if bidirectional:
        self.hidden_size = hidden_size // 2
    else:
        self.hidden_size = hidden_size
    self.bidirectional = bidirectional
    self.Wxrz = nn.Linear(in_features=self.input_size, out_features=2 * self.hidden_size, bias=True)
    self.Whrz = nn.Linear(in_features=self.hidden_size, out_features=2 * self.hidden_size, bias=True)
    self.Wxn = nn.Linear(in_features=self.input_size, out_features=self.hidden_size, bias=True)
    self.Whn = nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=True)
    self.LNx1 = nn.LayerNorm(2 * self.hidden_size)
    self.LNh1 = nn.LayerNorm(2 * self.hidden_size)
    self.LNx2 = nn.LayerNorm(self.hidden_size)
    self.LNh2 = nn.LayerNorm(self.hidden_size)
Example #16
Source File: models.py From cerl with Apache License 2.0 | 6 votes |
def __init__(self, state_dim, action_dim, wwid):
    super(Actor, self).__init__()
    self.wwid = torch.Tensor([wwid])

    l1 = 400; l2 = 300

    # Construct Hidden Layer 1
    self.f1 = nn.Linear(state_dim, l1)
    self.ln1 = nn.LayerNorm(l1)

    # Hidden Layer 2
    self.f2 = nn.Linear(l1, l2)
    self.ln2 = nn.LayerNorm(l2)

    # Out
    self.w_out = nn.Linear(l2, action_dim)
Example #17
Source File: module.py From Transformer-TTS with MIT License | 6 votes |
def __init__(self, num_hidden, h=4):
    """
    :param num_hidden: dimension of hidden
    :param h: num of heads
    """
    super(Attention, self).__init__()

    self.num_hidden = num_hidden
    self.num_hidden_per_attn = num_hidden // h
    self.h = h

    self.key = Linear(num_hidden, num_hidden, bias=False)
    self.value = Linear(num_hidden, num_hidden, bias=False)
    self.query = Linear(num_hidden, num_hidden, bias=False)

    self.multihead = MultiheadAttention(self.num_hidden_per_attn)

    self.residual_dropout = nn.Dropout(p=0.1)

    self.final_linear = Linear(num_hidden * 2, num_hidden)

    self.layer_norm_1 = nn.LayerNorm(num_hidden)
Example #18
Source File: transformer.py From ITDD with MIT License | 5 votes |
def __init__(self, num_layers, d_model, heads, d_ff, attn_type,
             copy_attn, self_attn_type, dropout, embeddings):
    super(TransformerDecoder, self).__init__()

    # Basic attributes.
    self.decoder_type = 'transformer'
    self.num_layers = num_layers
    self.embeddings = embeddings
    self.self_attn_type = self_attn_type

    # Decoder State
    self.state = {}

    # Build TransformerDecoder.
    self.transformer_layers = nn.ModuleList(
        [TransformerDecoderLayer(d_model, heads, d_ff, dropout,
                                 self_attn_type=self_attn_type)
         for _ in range(num_layers)])

    # TransformerDecoder has its own attention mechanism.
    # Set up a separated copy attention layer, if needed.
    self._copy = False
    if copy_attn:
        self.copy_attn = onmt.modules.GlobalAttention(
            d_model, attn_type=attn_type)
        self._copy = True
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
Example #19
Source File: transformer.py From ITDD with MIT License | 5 votes |
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings):
    super(TransformerEncoder, self).__init__()

    self.num_layers = num_layers
    self.embeddings = embeddings
    self.transformer = nn.ModuleList(
        [TransformerEncoderLayer(d_model, heads, d_ff, dropout)
         for _ in range(num_layers)])
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
Example #20
Source File: transformer.py From ITDD with MIT License | 5 votes |
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
Example #21
Source File: position_ffn.py From ITDD with MIT License | 5 votes |
def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout_1 = nn.Dropout(dropout)
    self.relu = nn.ReLU()
    self.dropout_2 = nn.Dropout(dropout)
Example #22
Source File: utils.py From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, d_model=512, d_ff=2048, dropout=.1, identity_map_reordering=False):
    super(PositionWiseFeedForward, self).__init__()
    self.identity_map_reordering = identity_map_reordering
    self.fc1 = nn.Linear(d_model, d_ff)
    self.fc2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(p=dropout)
    self.dropout_2 = nn.Dropout(p=dropout)
    self.layer_norm = nn.LayerNorm(d_model)
Example #23
Source File: layer_norm.py From fairseq with MIT License | 5 votes |
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    if not export and torch.cuda.is_available() and has_fused_layernorm:
        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
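The `has_fused_layernorm` flag and `FusedLayerNorm` referenced above are defined elsewhere in fairseq and are not shown in this snippet; they presumably come from a guarded optional import. A minimal sketch of how such a guard is commonly written, assuming NVIDIA apex provides the fused kernel:

# Sketch of an optional-dependency guard (not the verbatim fairseq code).
try:
    from apex.normalization import FusedLayerNorm
    has_fused_layernorm = True
except ImportError:
    has_fused_layernorm = False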
Example #24
Source File: dummy_model.py From fairseq with MIT License | 5 votes |
def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
    super().__init__(Dictionary())
    self.embed = nn.Embedding(
        num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
    )
    self.layers_a = nn.ModuleList([
        nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 3 * embed_dim),  # q, k, v input projection
            nn.Linear(3 * embed_dim, embed_dim),  # skip self-attention
            nn.Linear(embed_dim, embed_dim),      # output projection
            nn.Dropout(),
        )
        for i in range(num_layers)
    ])
    self.layers_b = nn.ModuleList([
        nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 4 * embed_dim),  # FFN
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),  # FFN
            nn.Dropout(0.1),
        )
        for i in range(num_layers)
    ])
    self.out_proj = nn.Linear(embed_dim, num_embed)
Example #25
Source File: vggtransformer.py From fairseq with MIT License | 5 votes |
def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    return m


# seq2seq models
Example #26
Source File: encoders.py From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, N, padding_idx, d_in=2048, **kwargs):
    super(MemoryAugmentedEncoder, self).__init__(N, padding_idx, **kwargs)
    self.fc = nn.Linear(d_in, self.d_model)
    self.dropout = nn.Dropout(p=self.dropout)
    self.layer_norm = nn.LayerNorm(self.d_model)
Example #27
Source File: module.py From Transformer-TTS with MIT License | 5 votes |
def __init__(self, num_hidden):
    """
    :param num_hidden: dimension of hidden
    """
    super(FFN, self).__init__()
    self.w_1 = Conv(num_hidden, num_hidden * 4, kernel_size=1, w_init='relu')
    self.w_2 = Conv(num_hidden * 4, num_hidden, kernel_size=1)
    self.dropout = nn.Dropout(p=0.1)
    self.layer_norm = nn.LayerNorm(num_hidden)
Example #28
Source File: layers.py From dgl with Apache License 2.0 | 5 votes |
def __init__(self, size, dropout):
    super(SubLayerWrapper, self).__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)
Example #29
Source File: scorenet.py From ncsn with GNU General Public License v3.0 | 5 votes |
def __init__(self, config):
    super().__init__()
    self.config = config
    nef = config.model.nef * 4
    self.u_net = nn.Sequential(
        # input is (nc) x 10 x 10
        nn.Conv2d(config.data.channels, nef, 4, stride=2, padding=1),
        # nn.Softplus(),
        nn.GroupNorm(4, nef),
        nn.ELU(),
        # state size. (nef) x 6 x 6
        nn.Conv2d(nef, nef * 2, 3, stride=1, padding=1),
        nn.GroupNorm(4, nef * 2),
        # nn.Softplus(),
        nn.ELU(),
        # state size. (nef*2) x 6 x 6
        nn.ConvTranspose2d(nef * 2, nef, 3, stride=1, padding=1),
        nn.GroupNorm(4, nef),
        # nn.Softplus(),
        nn.ELU(),
        # state size. (nef*2) x 6 x 6
        nn.ConvTranspose2d(nef, config.data.channels, 4, stride=2, padding=1),
        # nn.Softplus(),
        nn.ELU(),
    )

    self.fc = nn.Sequential(
        nn.Linear(config.data.channels * 10 ** 2, 256),
        nn.LayerNorm(256),
        nn.ELU(),
        nn.Linear(256, config.data.channels * 10 ** 2)
    )
Example #30
Source File: ktransformer.py From ITDD with MIT License | 5 votes |
def __init__(self, d_model, heads, d_ff, dropout):
    super(HTransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.knowledge_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)