Python gensim.matutils.cossim() Examples
The following are 8 code examples of gensim.matutils.cossim().
You can go to the original project or source file by following the link above each example, or browse the other available functions and classes of the gensim.matutils module.
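Before the project examples, here is a minimal, self-contained sketch of the input format cossim() expects: two sparse bag-of-words vectors, i.e. lists of (token_id, weight) pairs such as those produced by Dictionary.doc2bow(). The toy documents below are illustrative only and do not come from any of the projects.

from gensim import corpora, matutils

# Two toy documents, tokenized into lists of words.
doc1 = ["cosine", "similarity", "between", "sparse", "vectors"]
doc2 = ["cosine", "similarity", "of", "bag", "of", "words", "vectors"]

# Build a shared dictionary and convert both documents into sparse
# bag-of-words vectors: lists of (token_id, count) pairs.
dictionary = corpora.Dictionary([doc1, doc2])
vec1 = dictionary.doc2bow(doc1)
vec2 = dictionary.doc2bow(doc2)

# cossim() returns a float; for non-negative bag-of-words weights it lies
# between 0.0 (no shared terms) and 1.0 (identical direction).
print(matutils.cossim(vec1, vec2))

The same call works unchanged on tf-idf or LSI vectors, since those transformations keep the sparse (id, weight) format; that is exactly how the examples below use it.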
Example #1
Source File: Snowball.py From Snowball with GNU General Public License v3.0
def similarity(self, t, extraction_pattern):
    # Compare the tuple's before/between/after context vectors against the
    # pattern's centroids, then combine the three scores with configured weights.
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and extraction_pattern.centroid_bef is not None:
        bef = cossim(t.bef_vector, extraction_pattern.centroid_bef)
    if t.bet_vector is not None and extraction_pattern.centroid_bet is not None:
        bet = cossim(t.bet_vector, extraction_pattern.centroid_bet)
    if t.aft_vector is not None and extraction_pattern.centroid_aft is not None:
        aft = cossim(t.aft_vector, extraction_pattern.centroid_aft)
    return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft
Example #2
Source File: textpro.py From comparable-text-miner with Apache License 2.0
def getComparable(source_lsi_doc, target_lsi_corpus):
    # Score the source document against every target document, then return the
    # (index, similarity) pair of the most similar one.
    sims = []
    for i in range(len(target_lsi_corpus)):
        sims.append(matutils.cossim(source_lsi_doc, target_lsi_corpus[i]))
    sortedSims = sorted(enumerate(sims), key=lambda item: -item[1])
    return sortedSims[0]

##################################################################################
##################################################################################
# takes wiki text and a list of language codes, and returns the interlanguage links
# language code list:
# ar   Arabic
# en   English
# fr   French
# es   Spanish
# de   German
# it   Italian
# pt   Portuguese
# fa   Farsi
# ur   Urdu
# he   Hebrew
# ps   Pashto
# sd   Sindhi
# ug   Uyghur
# pnb  Punjabi (Pakistan/India)
# ckb  Kurdish (Sorani)
# arz  Egyptian Arabic
Example #3
Source File: cluster_for_data.py From single-pass-clustering-for-chinese-text with MIT License
def getMaxSimilarity(dictTopic, vector):
    # dictTopic maps a cluster id to a list of document vectors; `mean` is
    # presumably numpy.mean and `matutils` is gensim.matutils.
    maxValue = 0
    maxIndex = -1
    for k, cluster in dictTopic.items():  # .iteritems() in the original Python 2 code
        oneSimilarity = mean([matutils.cossim(vector, v) for v in cluster])
        if oneSimilarity > maxValue:
            maxValue = oneSimilarity
            maxIndex = k
    return maxIndex, maxValue
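For context, getMaxSimilarity is the scoring step of a single-pass clustering loop. The driver below is a hypothetical sketch (the name single_pass_cluster and its threshold parameter are illustrative, not from the project): each incoming vector joins the best-matching existing cluster if the mean similarity clears the threshold, and otherwise opens a new cluster.

def single_pass_cluster(vectors, threshold):
    # Maps cluster id -> list of member vectors (sparse BOW/tf-idf vectors).
    dictTopic = {}
    numTopic = 0
    for vector in vectors:
        if not dictTopic:
            # The first vector seeds the first cluster.
            dictTopic[numTopic] = [vector]
            numTopic += 1
            continue
        maxIndex, maxValue = getMaxSimilarity(dictTopic, vector)
        if maxValue >= threshold:
            # Similar enough: join the closest existing cluster.
            dictTopic[maxIndex].append(vector)
        else:
            # Not similar to anything seen so far: start a new cluster.
            dictTopic[numTopic] = [vector]
            numTopic += 1
    return dictTopic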
Example #4
Source File: tf_idf_helpers.py From coling2018_fake-news-challenge with Apache License 2.0
def order_by_tf_id_rank(self, headline, sentences, number_of_sentences):
    headline_bow = self.vocab.doc2bow(sent2stokens_wostop(headline))
    headline_tfidf = self.tfidf_model[headline_bow]
    scored_sentences = []
    # The punkt tokenizer does not recognize ".[newline]", so newlines may need
    # to be replaced with blanks first:
    # sentences = sentences.replace('\n', ' ')
    for sentence in self.tokenizer.tokenize(sentences):
        # Note: only the headline is tf-idf weighted here; each sentence stays
        # a raw bag-of-words vector.
        sentence_bow = self.vocab.doc2bow(sent2stokens_wostop(sentence))
        sim = cossim(headline_tfidf, sentence_bow)
        scored_sentences.append([sentence, sim])
    sorted_sentences = sorted(scored_sentences, key=lambda pair: pair[1], reverse=True)
    # Concatenate the top number_of_sentences sentences into a single string.
    sentences_string = ""
    current_sentence_number = 0
    for sentence in sorted_sentences:
        current_sentence_number += 1
        sentences_string += sentence[0] + ' '
        if current_sentence_number == number_of_sentences:
            break
    return sentences_string
Example #5
Source File: models.py From coling2018_fake-news-challenge with Apache License 2.0
def tfidf_sim(self, train_data, body_dict, threshold):
    """
    :param train_data: a list of training samples of type ['headline', 'bodyID', 'stance']
    :param body_dict: a dictionary of values containing {bodyID: 'bodyText'}
    :param threshold: used to distinguish between similar and not similar
    """
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    # Build a tf-idf model over the tokenized body texts.
    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
    vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
    tfidf_model = models.TfidfModel(corporaBody_bow)

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))
        headlines_tfidf = tfidf_model[headline_bow]
        corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]
        sim = cossim(headlines_tfidf, corporaBody_tfidf)
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
Example #6
Source File: sentence_sim_feature.py From nlp_xiaojiang with MIT License
def tdidf_all_vec(self):
    return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2)
Example #7
Source File: sentence_sim_feature.py From nlp_xiaojiang with MIT License
def tdidf_all_vec_pinyin(self):
    return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin)
Example #8
Source File: test_lee.py From topical_word_embeddings with MIT License
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform small corpus to lsi bow->log_ent->fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]

    # compute pairwise similarity matrix and extract upper triangular
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[matutils.triu_indices(len(corpus), 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s" % cor)
    self.assertTrue(cor > 0.6)

# def test_lee_mallet(self):
#     global bg_corpus, corpus, bg_corpus2, corpus2
#     # create a dictionary and corpus (bag of words)
#     dictionary = corpora.Dictionary(bg_corpus2)
#     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
#     corpus = [dictionary.doc2bow(text) for text in corpus2]
#     # initialize an LDA transformation from background corpus
#     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
#                            corpus=bg_corpus, id2word=dictionary,
#                            num_topics=200, optimize_interval=10)
#     corpus_lda = lda[corpus]
#     # compute pairwise similarity matrix and extract upper triangular
#     res = np.zeros((len(corpus), len(corpus)))
#     for i, par1 in enumerate(corpus_lda):
#         for j, par2 in enumerate(corpus_lda):
#             res[i, j] = matutils.cossim(par1, par2)
#     flat = res[matutils.triu_indices(len(corpus), 1)]
#     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
#     logging.info("LDA correlation coefficient is %s" % cor)
#     self.assertTrue(cor > 0.35)