Python torch.softmax() Examples

The following are 30 code examples of torch.softmax(), extracted from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch, or try the search function.
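As a quick reference before the project examples: torch.softmax(input, dim) exponentiates input and normalizes along dim, so the result is non-negative and sums to 1 over that dimension. A minimal self-contained sketch (the tensor values are illustrative only):

import torch

logits = torch.tensor([[1.0, 2.0, 3.0],
                       [1.0, 1.0, 1.0]])
probs = torch.softmax(logits, dim=1)  # normalize each row independently
print(probs.sum(dim=1))               # tensor([1., 1.]) -- each row sums to 1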
Example #1
Source File: modules.py    From BAMnet with Apache License 2.0
def forward(self, x, x_len, atten_mask):
        CoAtt = torch.bmm(x, x.transpose(1, 2))
        CoAtt = atten_mask.unsqueeze(1) * CoAtt - (1 - atten_mask).unsqueeze(1) * INF
        CoAtt = torch.softmax(CoAtt, dim=-1)
        new_x = torch.cat([torch.bmm(CoAtt, x), x], -1)

        sorted_x_len, indx = torch.sort(x_len, 0, descending=True)
        new_x = pack_padded_sequence(new_x[indx], sorted_x_len.data.tolist(), batch_first=True)

        h0 = to_cuda(torch.zeros(2, x_len.size(0), self.hidden_size // 2), self.use_cuda)
        c0 = to_cuda(torch.zeros(2, x_len.size(0), self.hidden_size // 2), self.use_cuda)
        packed_h, (packed_h_t, _) = self.model(new_x, (h0, c0))

        # restore the sorting
        _, inverse_indx = torch.sort(indx, 0)
        packed_h_t = torch.cat([packed_h_t[i] for i in range(packed_h_t.size(0))], -1)
        restore_packed_h_t = packed_h_t[inverse_indx]
        output = restore_packed_h_t
        return output 
Example #2
Source File: tool.py    From lightNLP with Apache License 2.0
def get_score(self, model, texts, labels, score_type='f1'):
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score
        }
        metric_func = metrics_map[score_type] if score_type in metrics_map else metrics_map['f1']
        assert texts.size(0) == len(labels)
        vec_predict = model(texts)
        soft_predict = torch.softmax(vec_predict, dim=1)
        predict_prob, predict_index = torch.max(soft_predict.cpu().data, dim=1)
        # print('prob', predict_prob)
        # print('index', predict_index)
        # print('labels', labels)
        labels = labels.view(-1).cpu().data.numpy()
        return metric_func(labels, predict_index, average='micro')  # sklearn metrics expect (y_true, y_pred)
Example #3
Source File: modules.py    From BAMnet with Apache License 2.0
def update_coatt_cat_maxpool(self, query_embed, in_memory_embed, out_memory_embed, query_att, atten_mask=None, ctx_mask=None, query_mask=None):
        attention = torch.bmm(query_embed, in_memory_embed.view(in_memory_embed.size(0), -1, in_memory_embed.size(-1))\
            .transpose(1, 2)).view(query_embed.size(0), query_embed.size(1), in_memory_embed.size(1), -1) # bs * N * M * k
        if ctx_mask is not None:
            attention[:, :, :, -1] = ctx_mask.unsqueeze(1) * attention[:, :, :, -1].clone() - (1 - ctx_mask).unsqueeze(1) * INF
        if atten_mask is not None:
            attention = atten_mask.unsqueeze(1).unsqueeze(-1) * attention - (1 - atten_mask).unsqueeze(1).unsqueeze(-1) * INF
        if query_mask is not None:
            attention = query_mask.unsqueeze(2).unsqueeze(-1) * attention - (1 - query_mask).unsqueeze(2).unsqueeze(-1) * INF

        # Importance module
        kb_feature_att = F.max_pool1d(attention.view(attention.size(0), attention.size(1), -1).transpose(1, 2), kernel_size=attention.size(1)).squeeze(-1).view(attention.size(0), -1, attention.size(-1))
        kb_feature_att = torch.softmax(kb_feature_att, dim=-1).view(-1, kb_feature_att.size(-1)).unsqueeze(1)
        in_memory_embed = torch.bmm(kb_feature_att, in_memory_embed.view(-1, in_memory_embed.size(2), in_memory_embed.size(-1))).squeeze(1).view(in_memory_embed.size(0), in_memory_embed.size(1), -1)
        out_memory_embed = out_memory_embed.sum(2)

        # Enhanced module
        attention = F.max_pool1d(attention.view(attention.size(0), -1, attention.size(-1)), kernel_size=attention.size(-1)).squeeze(-1).view(attention.size(0), attention.size(1), attention.size(2))
        probs = torch.softmax(attention, dim=-1)
        new_query_embed = query_embed + query_att.unsqueeze(2) * torch.bmm(probs, out_memory_embed)

        probs2 = torch.softmax(attention, dim=1)
        kb_att = torch.bmm(query_att.unsqueeze(1), probs).squeeze(1)
        in_memory_embed = in_memory_embed + kb_att.unsqueeze(2) * torch.bmm(probs2.transpose(1, 2), new_query_embed)
        return new_query_embed, in_memory_embed, out_memory_embed 
Example #4
Source File: modules.py    From BAMnet with Apache License 2.0
def forward(self, query_embed, in_memory_embed, atten_mask=None):
        if self.atten_type == 'simple': # simple attention
            attention = torch.bmm(in_memory_embed, query_embed.unsqueeze(2)).squeeze(2)
        elif self.atten_type == 'mul': # multiplicative attention
            attention = torch.bmm(in_memory_embed, torch.mm(query_embed, self.W).unsqueeze(2)).squeeze(2)
        elif self.atten_type == 'add': # additive attention
            attention = torch.tanh(torch.mm(in_memory_embed.view(-1, in_memory_embed.size(-1)), self.W2)\
                .view(in_memory_embed.size(0), -1, self.W2.size(-1)) \
                + torch.mm(query_embed, self.W).unsqueeze(1))
            attention = torch.mm(attention.view(-1, attention.size(-1)), self.W3).view(attention.size(0), -1)
        else:
            raise RuntimeError('Unknown atten_type: {}'.format(self.atten_type))

        if atten_mask is not None:
            # Exclude masked elements from the softmax
            attention = atten_mask * attention - (1 - atten_mask) * INF
        return attention 
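Examples #1, #3 and #4 all rely on the same masking idiom: multiply the scores by the mask and push excluded positions toward a large negative value, so softmax assigns them (numerically) zero probability. A minimal sketch of just that pattern, independent of BAMnet (INF and the shapes are illustrative):

import torch

INF = 1e20
scores = torch.randn(2, 4)                    # (batch, positions)
mask = torch.tensor([[1., 1., 1., 0.],
                     [1., 1., 0., 0.]])       # 1 = attend, 0 = exclude
masked = mask * scores - (1 - mask) * INF     # excluded positions -> -INF
probs = torch.softmax(masked, dim=-1)         # ~0 probability on excluded positions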
Example #5
Source File: fpn.py    From seamseg with BSD 3-Clause "New" or "Revised" License
def inference(self, head, x, proposals, valid_size, img_size):
        x = x[self.min_level:self.min_level + self.levels]

        if not proposals.all_none:
            # Run head on the given proposals
            proposals, proposals_idx = proposals.contiguous
            cls_logits, bbx_logits = self._head(head, x, proposals, proposals_idx, img_size)

            # Shift the proposals according to the logits
            bbx_reg_weights = x[0].new(self.bbx_reg_weights)
            boxes = shift_boxes(proposals.unsqueeze(1), bbx_logits / bbx_reg_weights)
            scores = torch.softmax(cls_logits, dim=1)

            # Split boxes and scores by image, clip to valid size
            boxes, scores = self._split_and_clip(boxes, scores, proposals_idx, valid_size)

            bbx_pred, cls_pred, obj_pred = self.prediction_generator(boxes, scores)
        else:
            bbx_pred = PackedSequence([None for _ in range(x[0].size(0))])
            cls_pred = PackedSequence([None for _ in range(x[0].size(0))])
            obj_pred = PackedSequence([None for _ in range(x[0].size(0))])

        return bbx_pred, cls_pred, obj_pred 
Example #6
Source File: self_attention.py    From TVQAplus with MIT License
def attention(self, query, key, value, mask=None, dropout=None):
        """ Compute 'Scaled Dot Product Attention'
        Args:
            query: (N, nh, L, d_k)
            key: (N, nh, L, d_k)
            value: (N, nh, L, d_k)
            mask: (N, 1, L, 1)
            dropout:
        """
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)  # (N, nh, L, L)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = torch.softmax(scores, dim=-1)
        if dropout is not None:
            p_attn = dropout(p_attn)
        return torch.matmul(p_attn, value), p_attn  # (N, nh, L, d_k), (N, nh, L, L) 
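For reference, the same scaled dot-product computation in standalone form, with concrete shapes matching the docstring above (a sketch, not the TVQAplus code):

import math
import torch

N, nh, L, d_k = 2, 4, 6, 8
query, key, value = (torch.randn(N, nh, L, d_k) for _ in range(3))

scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)  # (N, nh, L, L)
p_attn = torch.softmax(scores, dim=-1)  # attention weights sum to 1 over the last dim
out = torch.matmul(p_attn, value)       # (N, nh, L, d_k)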
Example #7
Source File: logistic_mixture.py    From L3C-PyTorch with GNU General Public License v3.0
def _visualize_params(logits_pis, means, log_scales, channel):
    """
    :param logits_pis:  NCKHW
    :param means: NCKHW
    :param log_scales: NCKHW
    :param channel: int
    :return:
    """
    assert logits_pis.shape == means.shape == log_scales.shape
    logits_pis = logits_pis[0, channel, ...].detach()
    means = means[0, channel, ...].detach()
    log_scales = log_scales[0, channel, ...].detach()

    pis = torch.softmax(logits_pis, dim=0)  # Kdim==0 -> KHW

    mixtures = ft.lconcat(
            zip(_iter_Kdim_normalized(pis, normalize=False),
                _iter_Kdim_normalized(means),
                _iter_Kdim_normalized(log_scales)))
    grid = vis.grid.prep_for_grid(mixtures)
    img = torchvision.utils.make_grid(grid, nrow=3)
    return img 
Example #8
Source File: jumping_knowledge.py    From pytorch_geometric with MIT License
def forward(self, xs):
        r"""Aggregates representations across different layers.

        Args:
            xs (list or tuple): List containing layer-wise representations.
        """

        assert isinstance(xs, list) or isinstance(xs, tuple)

        if self.mode == 'cat':
            return torch.cat(xs, dim=-1)
        elif self.mode == 'max':
            return torch.stack(xs, dim=-1).max(dim=-1)[0]
        elif self.mode == 'lstm':
            x = torch.stack(xs, dim=1)  # [num_nodes, num_layers, num_channels]
            alpha, _ = self.lstm(x)
            alpha = self.att(alpha).squeeze(-1)  # [num_nodes, num_layers]
            alpha = torch.softmax(alpha, dim=-1)
            return (x * alpha.unsqueeze(-1)).sum(dim=1) 
Example #9
Source File: gat.py    From pytorch_geometric with MIT License
def __init__(self,
                 g,
                 in_channels,
                 out_channels,
                 heads=1,
                 negative_slope=0.2,
                 dropout=0):
        super(GATSPMVConv, self).__init__()
        self.g = g
        self.out_channels = out_channels
        self.heads = heads
        self.negative_slope = negative_slope
        self.dropout = dropout
        self.weight = Parameter(
            torch.Tensor(in_channels, heads * out_channels))
        self.att_l = Parameter(torch.Tensor(heads, out_channels, 1))
        self.att_r = Parameter(torch.Tensor(heads, out_channels, 1))
        self.bias = Parameter(torch.Tensor(heads * out_channels))
        self.softmax = EdgeSoftmax()
        self.reset_parameters() 
Example #10
Source File: infer.py    From BERT-Relation-Extraction with Apache License 2.0
def infer_one_sentence(self, sentence):
        self.net.eval()
        tokenized = self.tokenizer.encode(sentence); #print(tokenized)
        e1_e2_start = self.get_e1e2_start(tokenized); #print(e1_e2_start)
        tokenized = torch.LongTensor(tokenized).unsqueeze(0)
        e1_e2_start = torch.LongTensor(e1_e2_start).unsqueeze(0)
        attention_mask = (tokenized != self.pad_id).float()
        token_type_ids = torch.zeros((tokenized.shape[0], tokenized.shape[1])).long()
        
        if self.cuda:
            tokenized = tokenized.cuda()
            attention_mask = attention_mask.cuda()
            token_type_ids = token_type_ids.cuda()
        
        with torch.no_grad():
            classification_logits = self.net(tokenized, token_type_ids=token_type_ids, attention_mask=attention_mask, Q=None,\
                                        e1_e2_start=e1_e2_start)
            predicted = torch.softmax(classification_logits, dim=1).max(1)[1].item()
        print("Sentence: ", sentence)
        print("Predicted: ", self.rm.idx2rel[predicted].strip(), '\n')
        return predicted 
Example #11
Source File: train.py    From dgl with Apache License 2.0
def evaluate(args, net, dataset, segment='valid'):
    possible_rating_values = dataset.possible_rating_values
    nd_possible_rating_values = th.FloatTensor(possible_rating_values).to(args.device)

    if segment == "valid":
        rating_values = dataset.valid_truths
        enc_graph = dataset.valid_enc_graph
        dec_graph = dataset.valid_dec_graph
    elif segment == "test":
        rating_values = dataset.test_truths
        enc_graph = dataset.test_enc_graph
        dec_graph = dataset.test_dec_graph
    else:
        raise NotImplementedError

    # Evaluate RMSE
    net.eval()
    with th.no_grad():
        pred_ratings = net(enc_graph, dec_graph,
                           dataset.user_feature, dataset.movie_feature)
    real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                         nd_possible_rating_values.view(1, -1)).sum(dim=1)
    rmse = ((real_pred_ratings - rating_values) ** 2.).mean().item()
    rmse = np.sqrt(rmse)
    return rmse 
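The softmax in Example #11 turns the per-class logits into a distribution over the possible rating values, and the weighted sum is the expected rating. The same step in isolation (tensor contents are made up):

import torch

pred_ratings = torch.randn(4, 5)                      # 4 samples, 5 rating classes
rating_values = torch.tensor([1., 2., 3., 4., 5.])
probs = torch.softmax(pred_ratings, dim=1)            # distribution per sample
expected = (probs * rating_values.view(1, -1)).sum(dim=1)  # expected rating, shape (4,)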
Example #12
Source File: losses.py    From centerpose with MIT License
def _fspecial_gauss(window_size, sigma=1.5):
    # Function to mimic the 'fspecial' gaussian MATLAB function.
    coords = np.arange(0, window_size, dtype=np.float32)
    coords -= (window_size - 1) / 2.0

    g = coords ** 2
    g *= (-0.5 / (sigma ** 2))
    g = np.reshape(g, (1, -1)) + np.reshape(g, (-1, 1))
    g = torch.from_numpy(np.reshape(g, (1, -1)))
    g = torch.softmax(g, dim=1)
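    # softmax already normalizes g to sum to 1, so the division below is effectively a no-op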
    g = g / g.sum()
    return g


# 2019.05.26. butterworth filter.
# ref: http://www.cnblogs.com/laumians-notes/p/8592968.html 
Example #13
Source File: attention.py    From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License
def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
        '''
        Computes scaled dot-product attention.
        :param queries: Queries (b_s, nq, d_model)
        :param keys: Keys (b_s, nk, d_model)
        :param values: Values (b_s, nk, d_model)
        :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking.
        :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
        :return: Attention output (b_s, nq, d_model)
        '''
        b_s, nq = queries.shape[:2]
        nk = keys.shape[1]
        q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = self.fc_k(keys).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1)  # (b_s, h, d_k, nk)
        v = self.fc_v(values).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3)  # (b_s, h, nk, d_v)

        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
        if attention_weights is not None:
            att = att * attention_weights
        if attention_mask is not None:
            att = att.masked_fill(attention_mask, -np.inf)
        att = torch.softmax(att, -1)
        out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out 
Example #14
Source File: module.py    From Transformer-TTS with MIT License
def forward(self, key, value, query, mask=None, query_mask=None):
        # Get attention score
        attn = t.bmm(query, key.transpose(1, 2))
        attn = attn / math.sqrt(self.num_hidden_k)

        # Masking to ignore padding (key side)
        if mask is not None:
            attn = attn.masked_fill(mask, -2 ** 32 + 1)
            attn = t.softmax(attn, dim=-1)
        else:
            attn = t.softmax(attn, dim=-1)

        # Masking to ignore padding (query side)
        if query_mask is not None:
            attn = attn * query_mask

        # Dropout
        # attn = self.attn_dropout(attn)
        
        # Get Context Vector
        result = t.bmm(attn, value)

        return result, attn 
Example #15
Source File: tool.py    From lightNLP with Apache License 2.0
def get_score(self, model, texta, textb, labels, score_type='f1'):
        metrics_map = {
            'f1': f1_score,
            'p': precision_score,
            'r': recall_score,
            'acc': accuracy_score
        }
        metric_func = metrics_map[score_type] if score_type in metrics_map else metrics_map['f1']
        assert texta.size(1) == textb.size(1) == len(labels)
        vec_predict = model(texta, textb)
        soft_predict = torch.softmax(vec_predict, dim=1)
        predict_prob, predict_index = torch.max(soft_predict.cpu().data, dim=1)
        # print('prob', predict_prob)
        # print('index', predict_index)
        # print('labels', labels)
        labels = labels.view(-1).cpu().data.numpy()
        return metric_func(labels, predict_index, average='micro')  # sklearn metrics expect (y_true, y_pred)
Example #16
Source File: model.py    From FlexTensor with MIT License
def message(self, edge_index_i, x_i, x_j, size_i, edge_type_index):
        # Compute attention coefficients.
        assert len(edge_type_index) == self.num_edge_type + 1
        x_j = x_j.view(-1, self.heads, self.out_channels)

        x_i = x_i.view(-1, self.heads, self.out_channels)
        alpha_lst = []

        for i in range(0, self.num_edge_type):
            beg = edge_type_index[i]
            end = edge_type_index[i + 1]
            alpha_tmp = (torch.cat([x_i[beg:end], x_j[beg:end]], dim=-1) * getattr(self, "edge_weight_%d" % i)).sum(dim=-1)
            alpha_lst.append(alpha_tmp)

        alpha = torch.cat(alpha_lst)

        alpha = F.leaky_relu(alpha, self.negative_slope)
        alpha = softmax(alpha, edge_index_i, size_i)
        # Sample attention coefficients stochastically.
        alpha = F.dropout(alpha, p=self.dropout, training=self.training)

        return x_j * alpha.view(-1, self.heads, 1) 
Example #17
Source File: vae.py    From torchsupport with MIT License
def tc_discriminator_loss(discriminator, true_batch, shuffle_batch):
  shuffle_indices = [
    shuffle_batch[torch.randperm(shuffle_batch.size(0)), idx:idx+1]
    for idx in range(shuffle_batch.size(-1))
  ]
  shuffle_batch = torch.cat(shuffle_indices, dim=1)

  sample_prediction = discriminator(true_batch)
  shuffle_prediction = discriminator(shuffle_batch)

  softmax_sample = torch.softmax(sample_prediction, dim=1)
  softmax_shuffle = torch.softmax(shuffle_prediction, dim=1)

  discriminator_loss = \
    -0.5 * (torch.log(softmax_sample[:, 0]).mean() \
    + torch.log(softmax_shuffle[:, 1]).mean())

  return discriminator_loss 
Example #18
Source File: mwan.py    From fastNLP with Apache License 2.0
def forward(self, hs, mask):
        '''
            hs: [(batch_size, len_q, input_size), ...]
            mask: (batch_size, len_q)
        '''
        
        hs = tc.cat([h.unsqueeze(0) for h in hs], dim=0)# (4, batch_size, len_q, input_size)

        vq = self.vq.view(1,1,1,-1).expand(hs.size(0), hs.size(1), hs.size(2), self.vq.size(0))

        s = self.v(tc.tanh(self.ln(tc.cat([hs,vq],-1)))).squeeze(-1)# (4, batch_size, len_q)

        s = s - ((mask.unsqueeze(0).eq(False)).float() * 10000)
        a = tc.softmax(s, dim=0)

        x = a.unsqueeze(-1) * hs
        x = tc.sum(x, dim=0)#(batch_size, len_q, input_size)

        return self.drop(x) 
Example #19
Source File: mwan.py    From fastNLP with Apache License 2.0
def Attention(hq, hp, mask_hq, mask_hp, my_method):
    standard_size = (hq.size(0), hq.size(1), hp.size(1), hq.size(-1))
    mask_mat = get_2dmask(mask_hq, mask_hp, standard_size[:-1])

    hq_mat = hq.unsqueeze(2).expand(standard_size)
    hp_mat = hp.unsqueeze(1).expand(standard_size)

    s = my_method(hq_mat, hp_mat)           # (batch_size, len_q, len_p)

    s = s - ((mask_mat.eq(False)).float() * 10000)
    a = tc.softmax(s, dim=1)

    q = a.unsqueeze(-1) * hq_mat            #(batch_size, len_q, len_p, input_size)
    q = tc.sum(q, dim=1)                    #(batch_size, len_p, input_size)

    return q 
Example #20
Source File: mwan.py    From fastNLP with Apache License 2.0
def forward(self, h, mask):
        '''
            h: (batch_size, len, input_size)
            mask: (batch_size, len)
        '''

        vq = self.vq.view(1,1,-1).expand(h.size(0), h.size(1), self.vq.size(0))

        s = self.v(tc.tanh(self.ln(tc.cat([h,vq],-1)))).squeeze(-1)    # (batch_size, len)
        
        s = s - ((mask.eq(False)).float() * 10000)
        a = tc.softmax(s, dim=1)

        r = a.unsqueeze(-1) * h       # (batch_size, len, input_size)
        r = tc.sum(r, dim=1)          # (batch_size, input_size)

        return self.drop(r) 
Example #21
Source File: semantic_composite.py    From MatchZoo-py with Apache License 2.0
def forward(self, x):
        """Forward."""
        seq_length = x.shape[1]

        x_1 = x.unsqueeze(dim=2).repeat(1, 1, seq_length, 1)
        x_2 = x.unsqueeze(dim=1).repeat(1, seq_length, 1, 1)
        x_concat = torch.cat([x_1, x_2, x_1 * x_2], dim=-1)

        # Self-attention layer.
        x_concat = self.dropout(x_concat)
        attn_matrix = self.att_linear(x_concat).squeeze(dim=-1)
        attn_weight = torch.softmax(attn_matrix, dim=2)
        attn = torch.bmm(attn_weight, x)

        # Semantic composite fuse gate.
        x_attn_concat = self.dropout(torch.cat([x, attn], dim=-1))
        z = torch.tanh(self.z_gate(x_attn_concat))
        r = torch.sigmoid(self.r_gate(x_attn_concat))
        f = torch.sigmoid(self.f_gate(x_attn_concat))
        encoding = r * x + f * z

        return encoding 
Example #22
Source File: recurrent.py    From Tagger with BSD 3-Clause "New" or "Revised" License
def top_k_softmax(logits, k, n):
        top_logits, top_indices = torch.topk(logits, k=min(k + 1, n))

        top_k_logits = top_logits[:, :k]
        top_k_indices = top_indices[:, :k]

        probs = torch.softmax(top_k_logits, dim=-1)
        batch = top_k_logits.shape[0]
        k = top_k_logits.shape[1]

        # Flat to 1D
        indices_flat = torch.reshape(top_k_indices, [-1])
        indices_flat = indices_flat + (torch.arange(
            batch * k, device=logits.device) // k) * n  # integer floor division

        tensor = torch.zeros([batch * n], dtype=logits.dtype,
                             device=logits.device)
        tensor = tensor.scatter_add(0, indices_flat.long(),
                                    torch.reshape(probs, [-1]))

        return torch.reshape(tensor, [batch, n]) 
Example #23
Source File: tc.py    From torchtest with GNU General Public License v3.0
def predict(model, sentence, _fields):
  # expand fields
  text_field, label_field = _fields
  # encode sentence
  encoded_sequence = torch.LongTensor([ text_field.vocab.stoi[token]
      for token in text_field.preprocess(sentence) ]).view(1, -1)
  if torch.cuda.is_available():
    encoded_sequence = encoded_sequence.cuda()

  # forward; explicitly state batch_size
  with torch.no_grad():
    likelihood = model(encoded_sequence, batch_size=1)

  sentiment = label_field.vocab.itos[
      torch.softmax(likelihood.view(2), dim=-1).argmax().item()
      ]
  # present results
  print('\ninput : {}\noutput : {}\n'.format(sentence, sentiment))

  return sentiment 
Example #24
Source File: mlp_policy_disc.py    From PyTorch-RL with MIT License
def forward(self, x):
        for affine in self.affine_layers:
            x = self.activation(affine(x))

        action_prob = torch.softmax(self.action_head(x), dim=1)
        return action_prob 
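A common next step with such action probabilities (not shown in the project code above) is to sample an action from a categorical distribution; a hedged sketch with a stand-in policy output:

import torch

action_prob = torch.softmax(torch.randn(1, 4), dim=1)  # stand-in for the policy output
action = torch.distributions.Categorical(probs=action_prob).sample()  # shape (1,)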
Example #25
Source File: attention.py    From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License
def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
        '''
        Computes scaled dot-product attention with additional learned memory slots.
        :param queries: Queries (b_s, nq, d_model)
        :param keys: Keys (b_s, nk, d_model)
        :param values: Values (b_s, nk, d_model)
        :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking.
        :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
        :return: Attention output (b_s, nq, d_model)
        '''
        b_s, nq = queries.shape[:2]
        nk = keys.shape[1]

        m_k = np.sqrt(self.d_k) * self.m_k.expand(b_s, self.m, self.h * self.d_k)
        m_v = np.sqrt(self.m) * self.m_v.expand(b_s, self.m, self.h * self.d_v)

        q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = torch.cat([self.fc_k(keys), m_k], 1).view(b_s, nk + self.m, self.h, self.d_k).permute(0, 2, 3, 1)  # (b_s, h, d_k, nk + m)
        v = torch.cat([self.fc_v(values), m_v], 1).view(b_s, nk + self.m, self.h, self.d_v).permute(0, 2, 1, 3)  # (b_s, h, nk + m, d_v)

        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk + m)
        if attention_weights is not None:
            att = torch.cat([att[:, :, :, :nk] * attention_weights, att[:, :, :, nk:]], -1)
        if attention_mask is not None:
            att[:, :, :, :nk] = att[:, :, :, :nk].masked_fill(attention_mask, -np.inf)
        att = torch.softmax(att, -1)
        out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out 
Example #26
Source File: gradient.py    From torchsupport with MIT License
def _k_hot_compute_probability(alpha, temperature=0.1):
  alpha_linear = alpha.view(alpha.size(0), alpha.size(1), -1)
  result = torch.softmax(alpha_linear / temperature, dim=-1)
  return result.view(*alpha.size()) 
Example #27
Source File: model.py    From OpenKiwi with GNU Affero General Public License v3.0
def predict(self, batch, class_name=const.BAD, unmask=True):
        model_out = self(batch)
        predictions = {}
        class_index = torch.tensor([const.LABELS.index(class_name)])

        for key in model_out:
            if key in [const.TARGET_TAGS, const.SOURCE_TAGS, const.GAP_TAGS]:
                # Models are assumed to return logits, not probabilities
                logits = model_out[key]
                probs = torch.softmax(logits, dim=-1)
                class_probs = probs.index_select(
                    -1, class_index.to(device=probs.device)
                )
                class_probs = class_probs.squeeze(-1).tolist()
                if unmask:
                    if key == const.SOURCE_TAGS:
                        input_key = const.SOURCE
                    else:
                        input_key = const.TARGET
                    mask = self.get_mask(batch, input_key)
                    if key == const.GAP_TAGS:
                        # Append one extra token
                        mask = torch.cat(
                            [mask.new_ones((mask.shape[0], 1)), mask], dim=1
                        )

                    lengths = mask.int().sum(dim=-1)
                    for i, x in enumerate(class_probs):
                        class_probs[i] = x[: lengths[i]]
                predictions[key] = class_probs
            elif key == const.SENTENCE_SCORES:
                predictions[key] = model_out[key].tolist()
            elif key == const.BINARY:
                logits = model_out[key]
                probs = torch.softmax(logits, dim=-1)
                class_probs = probs.index_select(
                    -1, class_index.to(device=probs.device)
                )
                predictions[key] = class_probs.tolist()

        return predictions 
Example #28
Source File: crossentropyloss.py    From backpack with MIT License
def _get_probs(self, module):
        return softmax(module.input0, dim=1) 
Example #29
Source File: logistic_mixture.py    From L3C-PyTorch with GNU General Public License v3.0
def cdf_step_non_shared(self, l, targets, c_cur, C, x_c=None) -> CDFOut:
        assert c_cur < C

        # NKHW         NKHW     NKHW
        logit_probs_c, means_c, log_scales_c, K = self._extract_non_shared_c(c_cur, C, l, x_c)

        logit_probs_c_softmax = F.softmax(logit_probs_c, dim=1)  # NKHW, pi_k
        return CDFOut(logit_probs_c_softmax, means_c, log_scales_c, K, targets.to(l.device)) 
Example #30
Source File: pooling.py    From torchsupport with MIT License
def forward(self, nodes, indices):
    weights = torch.zeros(nodes.size(0), 1, device=nodes.device)
    for idx in range(self.k):
      smax = torch.softmax(weights, dim=0)
      sigma = scatter.add(smax * nodes, indices)  # segment-wise sum of weighted node features
      norm = torch.norm(sigma, dim=1, keepdim=True)
      norm2 = norm ** 2
      sval = sigma / norm * norm2 / (1 + norm2)
      weights = weights + nodes * sigma[indices]
    return sigma