Python gensim.matutils.argsort() Examples

The following are 11 code examples of gensim.matutils.argsort(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.matutils, or try the search function.
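Before the project examples, a minimal standalone sketch of the function itself. gensim.matutils.argsort(x, topn=None, reverse=False) returns the indices of the topn smallest values of x (the largest when reverse=True), sorted best first; with topn=None it ranks the whole array:

import numpy as np
from gensim import matutils

scores = np.array([0.1, 0.9, 0.4, 0.7])

print(matutils.argsort(scores, topn=2, reverse=True))  # [1 3], indices of the two largest values
print(matutils.argsort(scores))                        # [0 2 3 1], full ascending argsort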
Example #1
Source File: chatbot_sentence_vec_by_word.py    From nlp_xiaojiang with MIT License 6 votes
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
      Find the most similar sentences: dot product of the question's
      sentence vector with the normalized sentence matrix.
    :param vec_ques: sentence vector of the question
    :param matrix_org_norm: unit-normalized matrix of standard-question vectors
    :param matrix_org_index: mapping from matrix rows to standard-question indices
    :param top_vec: number of top matches to return
    :return: list of [index, score] pairs
    """
    # Normalize the question vector: scale it to unit length (the zero vector is returned unchanged)
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # Dot the question vector against every question in the standard-question matrix
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # Rank by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # Collect the index and score of the most similar standard questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
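A hypothetical call pattern for the function above; sentence_vectors, question_vector, and numpy_type (e.g. np.float32) are illustrative stand-ins for the module-level objects the original file defines:

# Unit-normalize every standard-question vector once, up front (hypothetical setup)
numpy_type = np.float32
matrix_org_norm = np.vstack([matutils.unitvec(v) for v in sentence_vectors]).astype(numpy_type)
matrix_org_index = list(range(len(sentence_vectors)))

# Top-5 most similar standard questions for one question vector
index_score = calculate_text_similar(question_vector, matrix_org_norm, matrix_org_index, top_vec=5)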
Example #2
Source File: chatbot_sentence_vec_by_char.py    From nlp_xiaojiang with MIT License 6 votes
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
      Find the most similar sentences: dot product of the question's
      sentence vector with the normalized sentence matrix.
    :param vec_ques: sentence vector of the question
    :param matrix_org_norm: unit-normalized matrix of standard-question vectors
    :param matrix_org_index: mapping from matrix rows to standard-question indices
    :param top_vec: number of top matches to return
    :return: list of [index, score] pairs
    """
    # Normalize the question vector: scale it to unit length (the zero vector is returned unchanged)
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # Dot the question vector against every question in the standard-question matrix
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # Rank by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # Collect the index and score of the most similar standard questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
Example #3
Source File: tdlm_test.py    From topically-driven-language-model with Apache License 2.0 6 votes
def compute_dt_dist(docs, labels, tags, model, max_len, batch_size, pad_id, idxvocab, output_file):
    # generate batches
    num_batches = int(math.ceil(float(len(docs)) / batch_size))
    dt_dist = []
    t = []
    combined = []
    docid = 0
    for i in range(num_batches):
        x, _, _, t, s = get_batch_doc(docs, labels, tags, i, max_len, cf.tag_len, batch_size, pad_id)
        attention, mean_topic = sess.run([model.attention, model.mean_topic], {model.doc: x, model.tag: t})
        dt_dist.extend(attention[:s])

        if debug:
            for si in range(s):
                d = x[si]
                print("\n\nDoc", docid, "=", " ".join([idxvocab[item] for item in d if (item != pad_id)]))
                # rank all topics for this document, most probable first (topn omitted = full sort)
                sorted_dist = matutils.argsort(attention[si], reverse=True)
                for ti in sorted_dist:
                    print("Topic", ti, "=", attention[si][ti])
                docid += 1

    np.save(open(output_file, "wb"), dt_dist)  # np.save needs a binary-mode handle
Example #4
Source File: tdlm_model.py    From topically-driven-language-model with Apache License 2.0 5 votes
def get_topics(self, sess, topn):
        topics = []
        entropy = []
        tw_dist = sess.run(tf.nn.softmax(tf.matmul(self.topic_output_embedding, self.tm_softmax_w) + self.tm_softmax_b))
        for ti in range(self.config.topic_number):
            best = matutils.argsort(tw_dist[ti], topn=topn, reverse=True)
            topics.append(best)
            entropy.append(scipy.stats.entropy(tw_dist[ti]))

        return topics, entropy

    #get top topics and words given a doc 
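The per-topic extraction above is just a row-wise top-N over a probability matrix; a minimal sketch with a hypothetical topic-word matrix (np and matutils imported as in the first sketch):

tw_dist = np.random.rand(3, 1000)              # hypothetical: 3 topics x 1000 words
tw_dist /= tw_dist.sum(axis=1, keepdims=True)  # make each row a probability distribution
topics = [matutils.argsort(row, topn=10, reverse=True) for row in tw_dist]  # top 10 words per topic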
Example #5
Source File: tdlm_model.py    From topically-driven-language-model with Apache License 2.0 5 votes
def get_topics_on_doc(self, sess, doc, tag, topn):
        tw_dist, logits = sess.run([self.attention, self.tm_logits], {self.doc: doc, self.tag: tag})
        probs = sess.run(tf.nn.softmax(logits))[0]
        best_words = matutils.argsort(probs, topn=topn, reverse=True)
        best_words = [ (item, probs[item]) for item in best_words ] #attach word probability
        best_topics = matutils.argsort(tw_dist[0], topn=topn, reverse=True)
        best_topics = [ (item, tw_dist[0][item]) for item in best_topics ] #attach topic probability

        return best_topics, best_words

#convolutional topic model + lstm language model 
Example #6
Source File: wordEmbeddings.py    From semeval2017-scienceie with Apache License 2.0 5 votes
def most_similar(self, sWord, iTopN=10, fMinDist=-1.0):
    npaWord_unit = self.getUnitVector(sWord)

    if npaWord_unit is None:
      return None

    npaCosineSimilarities = np.dot(self.npaWordEmbeddings_units, npaWord_unit)

    npaBestIndices = \
        matutils.argsort(npaCosineSimilarities, topn=iTopN + 1, reverse=True)

    # npaBestIndices[1:] - Ignore the first one (which is sWord itself)
    return [(self.oVocab.index2word(x), npaCosineSimilarities[x]) for x in npaBestIndices[1:] if npaCosineSimilarities[x] > fMinDist] 
Example #7
Source File: wordEmbeddings.py    From semeval2017-scienceie with Apache License 2.0 5 votes
def sortByNorm(self, iMin, iMax):
    if not hasattr(self, 'npaIndicesByNorm'):
      self.npaNorms = np.sqrt(np.square(self.npaWordEmbeddings).sum(axis=1))
      self.npaIndicesByNorm = matutils.argsort(self.npaNorms)

    return [(self.oVocab.index2word(x), self.npaNorms[x]) for x in self.npaIndicesByNorm[iMin:iMax]] 
Example #8
Source File: wordEmbeddings.py    From semeval2017-scienceie with Apache License 2.0 5 votes
def most_similar_simple(self, sWord, iTopN=10):
    npaWordEmbedding = self[sWord]

    if npaWordEmbedding is None:
      return None

    npaSimilarities = np.dot(self.npaWordEmbeddings, npaWordEmbedding)

    npaBestIndices = \
        matutils.argsort(npaSimilarities, topn=iTopN + 1, reverse=True)

    # npaBestIndices[1:] - Ignore the first one (which is sWord itself)
    return [(self.oVocab.index2word(x), npaSimilarities[x]) for x in npaBestIndices[1:]] 
Example #9
Source File: NLP.py    From Financial-NLP with Apache License 2.0 4 votes
def topn_similarity_label(self, words, topn=10, normalization=True):
        if self.model is None:
            raise Exception('no model.')
        if isinstance(words, string_types):
            words=[words]
        
            """ we can discard this version.
            vectors=np.transpose(self.model.wv.__getitem__(words))
            if normalization:
                unit_vector=np.zeros((len(vectors),len(words)))
                for i in range(len(words)):
                    unit_vector[:,i]=matutils.unitvec(vectors[:,i])
                dists=np.dot(self.Label_vec_u, unit_vector)
            else:
                dists=np.dot(self.Label_vec, vectors)
            # exclude the word itself (the word may already be in label_dict)
            # best = matutils.argsort(dists, topn = topn+1, reverse=True)
            # result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
            best = matutils.argsort(dists[:,0], topn = topn, reverse=True)
            result = [(self.Label_index[sim], float(dists[sim])) for sim in best]
            return result
        else:
            """
        vectors=np.transpose(self.model.wv.__getitem__(words))
        if normalization:
            unit_vector=unitvec(vectors,ax=0)
            dists=np.dot(self.Label_vec_u, unit_vector)
        else:
            dists=np.dot(self.Label_vec, vectors)
            #topwords=np.empty((topn,len(words)), np.string_)
        topwords=[]
        topsims=np.empty((topn,len(words)))
        best = np.argsort(dists, axis=0)
        for i in range(topn):
            topword=[]
            for j in range(len(words)):
                topword.append(self.Label_index[best[-i-1][j]])
                topsims[i][j]=dists[best[-i-1][j]][j]
            topwords.append(topword)
        result=[(topwords[i], topsims[i]) for i in range(topn)]
        return result
        """ print this result by:

            | for iword,isim in result:  |
            |     print(iword, isim)     |
            or
            | for iword, isim in b:                               |
            |     for i in range(len(b[0])):                      |
            |         print("%s:%f\t" %(iword[i],isim[i]),end="") |
            |     print("")                                       |
                
        """ 
Example #10
Source File: NLP.py    From Financial-NLP with Apache License 2.0 4 votes
def topn_synonym_label(self, word, topn=10, calc='all', calc_k=5):
        ww=list()
        for w in self.findWordNet(word):
            ww.append(self.id2ss(w))
        if (len(ww)==0):
            return 0
        else:
            similarities=[0]*len(self.Label_index)
            if calc=='all': # default: average over all senses
                for i in range(len(self.Label_index)):
                    count=0
                    for w in ww:
                        for l in self.Label_wn[self.Label_index[i]]:
                            sim=w.path_similarity(l)
                            if(sim!=None):
                                similarities[i]+=sim
                            else:
                                count+=1
                    try:
                        similarities[i]/=(len(ww)*len(self.Label_wn[self.Label_index[i]])-count) # average similarity
                    except:
                        similarities[i]=0
                        
            elif calc=='calc_k': # only use the top calc_k word senses
                for i in range(len(self.Label_index)):
                    count=0
                    simlist=[]
                    for w in ww:
                        for l in self.Label_wn[self.Label_index[i]]:
                            sim=w.path_similarity(l)
                            if(sim!=None):
                                simlist.append(sim)
                                count+=1
                    if count<=calc_k:
                        similarities[i]=np.mean(simlist)
                    else:
                        simlist=sorted(simlist,reverse=True)
                        similarities[i]=sum(simlist[:calc_k])/calc_k # average of the calc_k largest similarities
                        
        best=matutils.argsort(similarities, topn = topn, reverse=True)
        result = [(self.Label_index[sim], float(similarities[sim])) for sim in best]
        return result 
Example #11
Source File: dag_emb_model.py    From hyperbolic_cones with Apache License 2.0 4 votes
def most_similar(self, node_or_vector, topn=10, restrict_vocab=None):
        """
        Find the top-N most similar nodes to the given node or vector, sorted in increasing order of distance.

        Parameters
        ----------

        node_or_vector : str/int or numpy.array
            node key or vector for which similar nodes are to be found.
        topn : int or None, optional
            number of similar nodes to return, if `None`, returns all.
        restrict_vocab : int or None, optional
            Optional integer which limits the range of vectors which are searched for most-similar values.
            For example, restrict_vocab=10000 would only check the first 10000 node vectors in the vocabulary order.
            This may be meaningful if vocabulary is sorted by descending frequency.

        Returns
        -------
        list of tuples (str, float)
            List of tuples containing (node, distance) pairs in increasing order of distance.

        Examples
        --------
        >>> vectors.most_similar('lion.n.01')
        [('lion_cub.n.01', 0.4484), ('lionet.n.01', 0.6552), ...]

        """
        if not restrict_vocab:
            all_distances = self.distances(node_or_vector)
        else:
            nodes_to_use = self.index2word[:restrict_vocab]
            all_distances = self.distances(node_or_vector, nodes_to_use)

        if isinstance(node_or_vector, string_types + (int,)):
            node_index = self.vocab[node_or_vector].index
        else:
            node_index = None
        if not topn:
            closest_indices = matutils.argsort(all_distances)
        else:
            closest_indices = matutils.argsort(all_distances, topn=1 + topn)
        result = [
            (self.index2word[index], float(all_distances[index]))
            for index in closest_indices if (node_index is None or index != node_index)  # ignore the input node (works even when its index is 0)
        ]
        if topn:
            result = result[:topn]
        return result
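As the final branches show, a falsy topn makes matutils.argsort rank every node in ascending order of distance, so the method returns the entire vocabulary sorted nearest-first. A hypothetical call, reusing the vectors instance from the docstring example:

all_neighbors = vectors.most_similar('lion.n.01', topn=None)  # every node, nearest first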