Python gensim.matutils.cossim() Examples

The following are 13 code examples of gensim.matutils.cossim(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.matutils , or try the search function .
Example #1
Source File: Snowball.py    From Snowball with GNU General Public License v3.0 5 votes vote down vote up
def similarity(self, t, extraction_pattern):

        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and extraction_pattern.centroid_bef is not None:
            bef = cossim(t.bef_vector, extraction_pattern.centroid_bef)

        if t.bet_vector is not None and extraction_pattern.centroid_bet is not None:
            bet = cossim(t.bet_vector, extraction_pattern.centroid_bet)

        if t.aft_vector is not None and extraction_pattern.centroid_aft is not None:
            aft = cossim(t.aft_vector, extraction_pattern.centroid_aft)

        return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft 
Example #2
Source File: textpro.py    From comparable-text-miner with Apache License 2.0 5 votes vote down vote up
def getComparable(source_lsi_doc, target_lsi_corpus):
	sims = []
	for i in range(len(target_lsi_corpus)):
		sims.append( matutils.cossim(source_lsi_doc, target_lsi_corpus[i]) )
	sortedSims = sorted(enumerate(sims), key=lambda item: -item[1])
	topIndex = sortedSims[0][0]
	topSim = sortedSims[0][1]
	return sortedSims[0]

##################################################################################


##################################################################################
# takses wiki text and a list of language codes, and returns the interlanguage links
# language code list:
# ar arabic
# en english
# fr french
# es Español
# de Deutsch
# it Italiano
# pt portuguese
# fa farsi
# ur urdo
# he hebrew
# ps peshto (Afghānī)
# sd Sindhi (sindi)
# ug Uyghur أويغورية
# pnb punjabi (Pakistan - India)
# ckb kurdi
# arz egyptian 
Example #3
Source File: cluster_for_data.py    From single-pass-clustering-for-chinese-text with MIT License 5 votes vote down vote up
def getMaxSimilarity(dictTopic, vector):
    maxValue = 0
    maxIndex = -1
    for k,cluster in dictTopic.iteritems():
        oneSimilarity = mean([matutils.cossim(vector, v) for v in cluster])
        if oneSimilarity > maxValue:
            maxValue = oneSimilarity
            maxIndex = k
    return maxIndex, maxValue 
Example #4
Source File: tf_idf_helpers.py    From coling2018_fake-news-challenge with Apache License 2.0 5 votes vote down vote up
def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
        headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = self.tfidf_model[headline_bow]

        scored_sentences = []
        'Replace newlines with blank, since the punkt tokenizer does not recognize .[newline]'
        #sentences = sentences.replace('\n', ' ')

        for sentence in self.tokenizer.tokenize(sentences):
            sentence_tfidf = self.vocab.doc2bow(sent2stokens_wostop(sentence))
            sim = cossim(headline_tfidf, sentence_tfidf)
            #print(str(sim))
            scored_sentences.append([sentence, sim])

        sorted_sentences= sorted(scored_sentences, key=lambda scored_sentences: scored_sentences[1], reverse= True)
        '''
        for sentence in sorted_sentences:
        print(str(sentence))
        '''
        ' return sorted_sentences '

        sentences_string = ""
        current_sentence_number = 0
        for sentence in sorted_sentences:
            current_sentence_number += 1
            sentences_string += sentence[0] + ' '
            if current_sentence_number == number_of_sentences:
                break
        #print("Ranked: \n " + sentences_string)
        return sentences_string 
Example #5
Source File: models.py    From coling2018_fake-news-challenge with Apache License 2.0 5 votes vote down vote up
def tfidf_sim(self, train_data, body_dict, threshold):
        '''
        :param 
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID:'bodyText'}
        threshold : used distinguish between similar and not similar
        '''
        bodyText_list = list(body_dict.values())
        bodyIds_index = {k:index for index, k in enumerate(body_dict.keys())}
        
        bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
        
        vocab = corpora.Dictionary(bodyText_w)
        corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
        tfidf_model = models.TfidfModel(corporaBody_bow)
        
        unrelated, related, y_true, y_pred = [], [], [], []
        for headline, bodyID, stance in train_data:        
            headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))
            
            headlines_tfidf = tfidf_model[headline_bow]
            corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]
            
            sim = cossim(headlines_tfidf, corporaBody_tfidf)
            unrelated, related, y_true, y_pred = create_lists(sim, stance, threshold, [unrelated, related, y_true, y_pred])
        
        print_results([unrelated, related, y_true, y_pred], self.model_type) 
Example #6
Source File: sentence_sim_feature.py    From nlp_xiaojiang with MIT License 5 votes vote down vote up
def tdidf_all_vec(self):

        return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2) 
Example #7
Source File: sentence_sim_feature.py    From nlp_xiaojiang with MIT License 5 votes vote down vote up
def tdidf_all_vec_pinyin(self):

        return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin) 
Example #8
Source File: test_lee.py    From topical_word_embeddings with MIT License 4 votes vote down vote up
def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)


    # def test_lee_mallet(self):
    #     global bg_corpus, corpus, bg_corpus2, corpus2

    #     # create a dictionary and corpus (bag of words)
    #     dictionary = corpora.Dictionary(bg_corpus2)
    #     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
    #     corpus = [dictionary.doc2bow(text) for text in corpus2]

    #     # initialize an LDA transformation from background corpus
    #     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
    #         corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
    #     corpus_lda = lda[corpus]

    #     # compute pairwise similarity matrix and extract upper triangular
    #     res = np.zeros((len(corpus), len(corpus)))
    #     for i, par1 in enumerate(corpus_lda):
    #         for j, par2 in enumerate(corpus_lda):
    #             res[i, j] = matutils.cossim(par1, par2)
    #     flat = res[matutils.triu_indices(len(corpus), 1)]

    #     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    #     logging.info("LDA correlation coefficient is %s" % cor)
    #     self.assertTrue(cor > 0.35) 
Example #9
Source File: test_lee.py    From topical_word_embeddings with MIT License 4 votes vote down vote up
def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)


    # def test_lee_mallet(self):
    #     global bg_corpus, corpus, bg_corpus2, corpus2

    #     # create a dictionary and corpus (bag of words)
    #     dictionary = corpora.Dictionary(bg_corpus2)
    #     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
    #     corpus = [dictionary.doc2bow(text) for text in corpus2]

    #     # initialize an LDA transformation from background corpus
    #     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
    #         corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
    #     corpus_lda = lda[corpus]

    #     # compute pairwise similarity matrix and extract upper triangular
    #     res = np.zeros((len(corpus), len(corpus)))
    #     for i, par1 in enumerate(corpus_lda):
    #         for j, par2 in enumerate(corpus_lda):
    #             res[i, j] = matutils.cossim(par1, par2)
    #     flat = res[matutils.triu_indices(len(corpus), 1)]

    #     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    #     logging.info("LDA correlation coefficient is %s" % cor)
    #     self.assertTrue(cor > 0.35) 
Example #10
Source File: test_lee.py    From topical_word_embeddings with MIT License 4 votes vote down vote up
def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)


    # def test_lee_mallet(self):
    #     global bg_corpus, corpus, bg_corpus2, corpus2

    #     # create a dictionary and corpus (bag of words)
    #     dictionary = corpora.Dictionary(bg_corpus2)
    #     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
    #     corpus = [dictionary.doc2bow(text) for text in corpus2]

    #     # initialize an LDA transformation from background corpus
    #     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
    #         corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
    #     corpus_lda = lda[corpus]

    #     # compute pairwise similarity matrix and extract upper triangular
    #     res = np.zeros((len(corpus), len(corpus)))
    #     for i, par1 in enumerate(corpus_lda):
    #         for j, par2 in enumerate(corpus_lda):
    #             res[i, j] = matutils.cossim(par1, par2)
    #     flat = res[matutils.triu_indices(len(corpus), 1)]

    #     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    #     logging.info("LDA correlation coefficient is %s" % cor)
    #     self.assertTrue(cor > 0.35) 
Example #11
Source File: test_lee.py    From topical_word_embeddings with MIT License 4 votes vote down vote up
def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)


    # def test_lee_mallet(self):
    #     global bg_corpus, corpus, bg_corpus2, corpus2

    #     # create a dictionary and corpus (bag of words)
    #     dictionary = corpora.Dictionary(bg_corpus2)
    #     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
    #     corpus = [dictionary.doc2bow(text) for text in corpus2]

    #     # initialize an LDA transformation from background corpus
    #     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
    #         corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
    #     corpus_lda = lda[corpus]

    #     # compute pairwise similarity matrix and extract upper triangular
    #     res = np.zeros((len(corpus), len(corpus)))
    #     for i, par1 in enumerate(corpus_lda):
    #         for j, par2 in enumerate(corpus_lda):
    #             res[i, j] = matutils.cossim(par1, par2)
    #     flat = res[matutils.triu_indices(len(corpus), 1)]

    #     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    #     logging.info("LDA correlation coefficient is %s" % cor)
    #     self.assertTrue(cor > 0.35) 
Example #12
Source File: test_lee.py    From topical_word_embeddings with MIT License 4 votes vote down vote up
def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)


    # def test_lee_mallet(self):
    #     global bg_corpus, corpus, bg_corpus2, corpus2

    #     # create a dictionary and corpus (bag of words)
    #     dictionary = corpora.Dictionary(bg_corpus2)
    #     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
    #     corpus = [dictionary.doc2bow(text) for text in corpus2]

    #     # initialize an LDA transformation from background corpus
    #     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
    #         corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
    #     corpus_lda = lda[corpus]

    #     # compute pairwise similarity matrix and extract upper triangular
    #     res = np.zeros((len(corpus), len(corpus)))
    #     for i, par1 in enumerate(corpus_lda):
    #         for j, par2 in enumerate(corpus_lda):
    #             res[i, j] = matutils.cossim(par1, par2)
    #     flat = res[matutils.triu_indices(len(corpus), 1)]

    #     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    #     logging.info("LDA correlation coefficient is %s" % cor)
    #     self.assertTrue(cor > 0.35) 
Example #13
Source File: test_lee.py    From topical_word_embeddings with MIT License 4 votes vote down vote up
def test_lee(self):
        """correlation with human data > 0.6
        (this is the value which was achieved in the original paper)
        """

        global bg_corpus, corpus

        # create a dictionary and corpus (bag of words)
        dictionary = corpora.Dictionary(bg_corpus)
        bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
        corpus = [dictionary.doc2bow(text) for text in corpus]

        # transform the bag of words with log_entropy normalization
        log_ent = models.LogEntropyModel(bg_corpus)
        bg_corpus_ent = log_ent[bg_corpus]

        # initialize an LSI transformation from background corpus
        lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
        # transform small corpus to lsi bow->log_ent->fold-in-lsi
        corpus_lsi = lsi[log_ent[corpus]]

        # compute pairwise similarity matrix and extract upper triangular
        res = np.zeros((len(corpus), len(corpus)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                res[i, j] = matutils.cossim(par1, par2)
        flat = res[matutils.triu_indices(len(corpus), 1)]

        cor = np.corrcoef(flat, human_sim_vector)[0, 1]
        logging.info("LSI correlation coefficient is %s" % cor)
        self.assertTrue(cor > 0.6)


    # def test_lee_mallet(self):
    #     global bg_corpus, corpus, bg_corpus2, corpus2

    #     # create a dictionary and corpus (bag of words)
    #     dictionary = corpora.Dictionary(bg_corpus2)
    #     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
    #     corpus = [dictionary.doc2bow(text) for text in corpus2]

    #     # initialize an LDA transformation from background corpus
    #     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
    #         corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
    #     corpus_lda = lda[corpus]

    #     # compute pairwise similarity matrix and extract upper triangular
    #     res = np.zeros((len(corpus), len(corpus)))
    #     for i, par1 in enumerate(corpus_lda):
    #         for j, par2 in enumerate(corpus_lda):
    #             res[i, j] = matutils.cossim(par1, par2)
    #     flat = res[matutils.triu_indices(len(corpus), 1)]

    #     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    #     logging.info("LDA correlation coefficient is %s" % cor)
    #     self.assertTrue(cor > 0.35)