Python Examples of gensim.matutils.cossim

Source File: Snowball.py From Snowball with GNU General Public License v3.0

5 votes

def similarity(self, t, extraction_pattern):
        """Weighted sum of the before/between/after context similarities.

        For each of the three contexts, ``cossim`` is applied to the tuple's
        vector and the pattern's centroid; a context contributes 0 when
        either side is ``None``.  The three scores are combined using
        ``self.config.alpha``, ``self.config.beta`` and ``self.config.gamma``.

        :param t: object exposing ``bef_vector``, ``bet_vector``, ``aft_vector``
        :param extraction_pattern: object exposing ``centroid_bef``,
            ``centroid_bet``, ``centroid_aft``
        :return: ``alpha*bef + beta*bet + gamma*aft``
        """
        if t.bef_vector is None or extraction_pattern.centroid_bef is None:
            before_score = 0
        else:
            before_score = cossim(t.bef_vector, extraction_pattern.centroid_bef)

        if t.bet_vector is None or extraction_pattern.centroid_bet is None:
            between_score = 0
        else:
            between_score = cossim(t.bet_vector, extraction_pattern.centroid_bet)

        if t.aft_vector is None or extraction_pattern.centroid_aft is None:
            after_score = 0
        else:
            after_score = cossim(t.aft_vector, extraction_pattern.centroid_aft)

        return (self.config.alpha * before_score
                + self.config.beta * between_score
                + self.config.gamma * after_score)

Source File: textpro.py From comparable-text-miner with Apache License 2.0

5 votes

def getComparable(source_lsi_doc, target_lsi_corpus):
	"""Find the target document most similar to the source document.

	Each document in ``target_lsi_corpus`` is compared with
	``source_lsi_doc`` using ``matutils.cossim``.

	:param source_lsi_doc: the source document vector
	:param target_lsi_corpus: iterable of target document vectors
	:return: ``(index, similarity)`` tuple for the best-scoring target
		document; on ties the lowest index wins
	:raises IndexError: if ``target_lsi_corpus`` is empty
	"""
	sims = [matutils.cossim(source_lsi_doc, target_doc) for target_doc in target_lsi_corpus]
	if not sims:
		# Keep the exception type callers saw from the old ``sortedSims[0]``.
		raise IndexError("target_lsi_corpus is empty; no comparable document")
	# max() returns the first maximal item, matching the previous stable
	# descending sort, without sorting the whole list.
	return max(enumerate(sims), key=lambda item: item[1])

##################################################################################


##################################################################################
# takes wiki text and a list of language codes, and returns the interlanguage links
# language code list:
# ar arabic
# en english
# fr french
# es Español
# de Deutsch
# it Italiano
# pt portuguese
# fa farsi
# ur urdu
# he hebrew
# ps pashto (Afghānī)
# sd Sindhi (sindi)
# ug Uyghur أويغورية
# pnb punjabi (Pakistan - India)
# ckb kurdi
# arz egyptian

Source File: cluster_for_data.py From single-pass-clustering-for-chinese-text with MIT License

5 votes

def getMaxSimilarity(dictTopic, vector):
    """Find the cluster whose members are, on average, most similar to ``vector``.

    For every cluster in ``dictTopic`` the mean of ``matutils.cossim(vector, v)``
    over its members ``v`` is computed, and the cluster with the largest
    mean is selected.

    :param dictTopic: mapping of cluster key -> iterable of member vectors
    :param vector: the vector to compare against each cluster member
    :return: ``(maxIndex, maxValue)``; ``(-1, 0)`` when no cluster has a
        mean similarity strictly greater than 0 (including an empty mapping)
    """
    maxValue = 0
    maxIndex = -1
    # items() exists on both Python 2 and 3; iteritems() is Python 2 only
    # and raises AttributeError on Python 3.
    for k, cluster in dictTopic.items():
        oneSimilarity = mean([matutils.cossim(vector, v) for v in cluster])
        if oneSimilarity > maxValue:
            maxValue = oneSimilarity
            maxIndex = k
    return maxIndex, maxValue

Source File: tf_idf_helpers.py From coling2018_fake-news-challenge with Apache License 2.0

5 votes

def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
        """Rank the sentences of a text by similarity to a headline.

        ``sentences`` is split with ``self.tokenizer.tokenize``; every
        sentence is scored with ``cossim`` against the headline and the
        best-scoring sentences are concatenated.

        :param headline: headline text
        :param sentences: text to split into sentences and rank
        :param number_of_sentences: how many top-ranked sentences to keep;
            if it never equals a rank (e.g. 0, or larger than the number of
            sentences) all sentences are returned
        :return: the selected sentences in descending similarity order, each
            followed by a single space (empty string if there are none)
        """
        headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
        headline_tfidf = self.tfidf_model[headline_bow]

        # NOTE: newlines are intentionally not replaced with blanks here
        # (that step was disabled), even though the punkt tokenizer does not
        # recognize ".[newline]" as a sentence boundary.
        scored_sentences = []
        for sentence in self.tokenizer.tokenize(sentences):
            sentence_bow = self.vocab.doc2bow(sent2stokens_wostop(sentence))
            # Weight the sentence with the same TF-IDF model as the headline.
            # Previously the raw bag-of-words counts were compared against
            # the TF-IDF weighted headline, mixing two vector spaces.
            sentence_tfidf = self.tfidf_model[sentence_bow]
            sim = cossim(headline_tfidf, sentence_tfidf)
            scored_sentences.append((sentence, sim))

        # sorted() is stable, so equally scored sentences keep text order.
        ranked = sorted(scored_sentences, key=lambda pair: pair[1], reverse=True)

        selected = []
        for rank, (sentence, _score) in enumerate(ranked, start=1):
            selected.append(sentence)
            if rank == number_of_sentences:
                break
        return ''.join(sentence + ' ' for sentence in selected)

Source File: models.py From coling2018_fake-news-challenge with Apache License 2.0

5 votes

def tfidf_sim(self, train_data, body_dict, threshold):
        '''
        Score every (headline, body) training pair by TF-IDF cosine
        similarity and report the outcome through ``print_results``.

        A dictionary and TF-IDF model are built from the tokenized body
        texts; each headline and its referenced body are then projected with
        that model and compared using ``cossim``. The similarity, stance and
        threshold are handed to ``create_lists``, which yields the updated
        result lists.

        :param train_data: iterable of ``(headline, bodyID, stance)`` samples
        :param body_dict: mapping ``{bodyID: bodyText}``
        :param threshold: forwarded to ``create_lists`` (used to distinguish
            between similar and not similar)
        :return: None; results are passed to ``print_results`` together with
            ``self.model_type``
        '''
        body_texts = list(body_dict.values())
        # Position of each body ID, aligned with the order of body_texts.
        position_of_body = {}
        for position, body_id in enumerate(body_dict.keys()):
            position_of_body[body_id] = position

        tokenized_bodies = [sent2stokens_wostop(body) for body in body_texts]

        vocab = corpora.Dictionary(tokenized_bodies)
        body_bows = [vocab.doc2bow(tokens) for tokens in tokenized_bodies]
        tfidf_model = models.TfidfModel(body_bows)

        unrelated, related, y_true, y_pred = [], [], [], []
        for headline, bodyID, stance in train_data:
            headline_vec = tfidf_model[vocab.doc2bow(sent2stokens_wostop(headline))]
            body_vec = tfidf_model[body_bows[position_of_body[bodyID]]]

            unrelated, related, y_true, y_pred = create_lists(
                cossim(headline_vec, body_vec),
                stance,
                threshold,
                [unrelated, related, y_true, y_pred],
            )

        print_results([unrelated, related, y_true, y_pred], self.model_type)

Source File: sentence_sim_feature.py From nlp_xiaojiang with MIT License

5 votes

def tdidf_all_vec(self):
        """Return ``matutils.cossim`` of ``self.tfidf_vec1`` and ``self.tfidf_vec2``."""
        first_vec, second_vec = self.tfidf_vec1, self.tfidf_vec2
        return matutils.cossim(first_vec, second_vec)

Source File: sentence_sim_feature.py From nlp_xiaojiang with MIT License

5 votes

def tdidf_all_vec_pinyin(self):
        """Return ``matutils.cossim`` of ``self.tfidf_vec1_pinyin`` and ``self.tfidf_vec2_pinyin``."""
        first_vec, second_vec = self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin
        return matutils.cossim(first_vec, second_vec)