Python gensim.models.LsiModel() Examples
The following are 18 code examples of gensim.models.LsiModel(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models, or try the search function.
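Before diving into the examples, here is a minimal, self-contained sketch of the pipeline most of them follow (dictionary -> bag-of-words -> TF-IDF -> LSI). The toy documents and the num_topics value are illustrative assumptions, not taken from any of the projects below.

from gensim import corpora, models

# Toy corpus of pre-tokenized documents (illustrative only)
docs = [
    ["human", "machine", "interface"],
    ["graph", "minors", "survey"],
    ["graph", "trees", "interface"],
]

dictionary = corpora.Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# Weight the corpus with TF-IDF, then fold it into a low-dimensional LSI space
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

# Project a single document through the same chain: bow -> tfidf -> lsi
print(lsi[tfidf[bow_corpus[0]]])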
Example #1
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 6 votes |
def build_lsi_model(corpus_name, corpus_path, topics=300):
    logging.info('building lsi model for %s corpus', corpus_name)
    dictFile = corpus_path + corpus_name + '.dict'
    corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'

    logging.info('loading dictionary ...')
    dictionary = corpora.Dictionary.load(dictFile)
    logging.info('loading tfidf corpus ...')
    corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
    logging.info('building lsi model')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    logging.info('saving lsi')
    lsiFile = corpus_path + corpus_name + '.lsi'
    lsi.save(lsiFile)
    logging.info('lsi model is ready')
##################################################################################
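As a companion sketch (not part of the project), the artifacts saved by build_lsi_model() could be reloaded later using the same file-naming convention; `tokens` here is a hypothetical tokenized document:

# Hypothetical reload, mirroring the paths used in build_lsi_model()
dictionary = corpora.Dictionary.load(corpus_path + corpus_name + '.dict')
lsi = models.LsiModel.load(corpus_path + corpus_name + '.lsi')
doc_vec = lsi[dictionary.doc2bow(tokens)]  # project a new document into LSI space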
Example #2
Source File: builder.py From Greynir with GNU General Public License v3.0 | 6 votes |
def create_lsi_model(self, **kwargs):
    """ Create an LSI model from the entire words database table """
    corpus_tfidf = self.load_tfidf_corpus()
    if self._dictionary is None:
        self.load_dictionary()
    # Initialize an LSI transformation
    lsi = models.LsiModel(
        corpus_tfidf,
        id2word=self._dictionary,
        num_topics=self._dimensions,
        **kwargs
    )
    # if self._verbose:
    #     lsi.print_topics(num_topics=self._dimensions)
    # Save the generated model
    lsi.save(self._LSI_MODEL_FILE.format(self._dimensions))
Example #3
Source File: text2vec.py From text2vec with Apache License 2.0 | 5 votes |
def get_lsi(self, num_topics=300):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lsi = models.LsiModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lsi = model_lsi[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
    return docs_vecs

# Get Random Projections (RP) vector for document list
Example #4
Source File: builder.py From Greynir with GNU General Public License v3.0 | 5 votes |
def load_lsi_model(self):
    """ Load a previously generated LSI model """
    self._model = models.LsiModel.load(
        self._LSI_MODEL_FILE.format(self._dimensions), mmap="r"
    )
    self._model_name = "lsi"
Example #5
Source File: lsi_neighbor.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
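At query time the returned triple is typically chained in the same order; a hedged sketch, where `query_bow` is a hypothetical doc2bow vector for a new document:

# Transform the query exactly as the corpus was transformed, then rank against the index
sims = corpus_simi_matrix[lsi_model[tfidf_model[query_bow]]]
top5 = sorted(enumerate(sims), key=lambda x: -x[1])[:5]  # five most similar documents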
Example #6
Source File: lsi_author.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
Example #7
Source File: lsi_model.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    # lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    # corpus_lsi = lsi_model[corpus]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    # corpus_simi_matrix = similarities.MatrixSimilarity(corpus_tfidf)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
Example #8
Source File: topic_modeling.py From text-analytics-with-python with Apache License 2.0 | 5 votes |
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=total_topics)
    return lsi
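After training, the extracted topics can be inspected with print_topics(); a short usage sketch in which `toy_corpus` is a hypothetical list of raw documents:

lsi = train_lsi_model_gensim(toy_corpus, total_topics=2)
for topic_id, terms in lsi.print_topics(2):
    print(topic_id, terms)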
Example #9
Source File: sentenceSimilarity.py From QAmodel-for-Retrievalchatbot with MIT License | 5 votes |
def LsiModel(self):
    self.simple_model()

    # Transformation model
    self.model = models.LsiModel(self.corpus_simple)
    self.corpus = self.model[self.corpus_simple]

    # Build the similarity matrix
    self.index = similarities.MatrixSimilarity(self.corpus)

# LDA model
Example #10
Source File: sentenceSimilarity.py From Customer-Chatbot with MIT License | 5 votes |
def LsiModel(self):
    self.simple_model()

    # Transformation model
    self.model = models.LsiModel(self.corpus_simple)
    self.corpus = self.model[self.corpus_simple]

    # Build the similarity matrix
    self.index = similarities.MatrixSimilarity(self.corpus)

# LDA model
Example #11
Source File: similarity.py From bugbug with Mozilla Public License 2.0 | 5 votes |
def __init__(self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []
    for bug in bugzilla.get_bugs():
        textual_features = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], textual_features])

    # Assigning unique integer ids to all words
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)

    # Conversion to BoW
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

    # Initializing and applying the TF-IDF transformation model on the same corpus; the resulting corpus has the same dimensions
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]

    # Transform the TF-IDF corpus to a latent 300-D space via Latent Semantic Indexing
    self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, num_topics=300)
    corpus_lsi = self.lsi[corpus_tfidf]

    # Indexing the corpus
    self.index = similarities.Similarity(
        output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
    )
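A hedged sketch of how this index might be queried for an incoming bug; note the code above does not keep the TF-IDF model on self, so this assumes it were stored as self.tfidf, and `new_bug` is hypothetical:

# Hypothetical search, reusing the dictionary -> tfidf -> lsi chain from __init__
query = self.text_preprocess(self.get_text(new_bug))
sims = self.index[self.lsi[self.tfidf[self.dictionary.doc2bow(query)]]]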
Example #12
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License | 5 votes |
def CalSim(self, test_document, Type, best_num):
    '''Calculate similarities between the test document and all news (articles/documents).

    # Arguments:
        test_document: List of raw documents.
        Type: Model used for calculating similarities.
        best_num: Refers to the 'num_best' parameter in the Gensim module.
    '''
    if Type == 'Similarity-tfidf-index':
        tfidf = models.TfidfModel(self._BowVecOfEachDoc)
        tfidfVec = tfidf[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        self._similarity = similarities.Similarity(Type, tfidfVec, \
            num_features=self._num_features, num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
    elif Type == 'Similarity-LSI-index':
        lsi_model = models.LsiModel(self._BowVecOfEachDoc)
        corpus_lsi = lsi_model[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        self._similarity = similarities.Similarity(Type, corpus_lsi, \
            num_features=self._num_features, num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
    self.Print_CalSim()

    IdLst = []
    SimRltLst = []
    SimTxLst = []
    for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
        IdLst.append(Id)
        SimRltLst.append(Sim)
        SimTxLst.append(self._raw_documents[Id])
    return IdLst, SimTxLst, SimRltLst
Example #13
Source File: docsim.py From nlp_learning with MIT License | 5 votes |
def calc_similarity(self, prefix: str, text: str):
    """Compute similarities; return the indices and cosine values.

    Arguments:
        prefix {str} -- model file prefix
        text {str} -- text data
        value {float} -- threshold; only entries whose similarity exceeds it are returned
    """
    dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
    corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
    tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]

    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('./models/similarity-lsi-index', corpus_lsi,
                                             num_features=400, num_best=3)

    cut_raw = self.segment(text)          # 1. tokenize
    corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a BoW vector
    corpus_tfidf = tfidf_model[corpus]    # 3. compute TF-IDF values
    corpus_lsi = lsi[corpus_tfidf]        # 4. compute LSI values
    sims = similarity_lsi[corpus_lsi]

    with open('./data/idx_dic.dic', 'r') as f:
        dt = f.read()
        idx_dic = eval(dt)

    result = []
    if sims is not None:
        result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]
    return result
Example #14
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 5 votes |
def align_sentences_lsi(source_sentences, target_sentences, model_path, model_name):
    logging.info('Sentence level alignment using LSI')
    dictionaryFile = model_path + model_name + '.dict'
    lsiFile = model_path + model_name + '.lsi'
    dictionary = corpora.Dictionary.load(dictionaryFile)
    logging.info('dictionary loaded')
    lsi = models.LsiModel.load(lsiFile)
    logging.info('lsi model loaded')
    source_lsi_sentences = generateLSIvectors(source_sentences, dictionary, lsi)
    logging.info('projects source sentences into LSI space')
    target_lsi_sentences = generateLSIvectors(target_sentences, dictionary, lsi)
    logging.info('projects target sentences into LSI space')
    source_index = 0
    new_source_doc = []
    new_target_doc = []
    for d in source_lsi_sentences:
        target_index, sim = getComparable(d, target_lsi_sentences)
        source_sent = source_sentences[source_index]
        target_sent = target_sentences[target_index]
        # remove the already aligned sentences from the target document
        del target_lsi_sentences[target_index]
        del target_sentences[target_index]
        new_source_doc.append(source_sent)
        new_target_doc.append(target_sent)
        if not target_lsi_sentences:
            break  # all target sentences are aligned
        source_index += 1
    return new_source_doc, new_target_doc
##################################################################################
# projecting a corpus into LSI space
Example #15
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License | 4 votes |
def CallTransformationModel(self, Dict, Bowvec, **kwarg):
    '''Invoke specific transformation models of the Gensim module.

    # Arguments:
        Dict: Dictionary made from all tokenized news (articles/documents).
        Bowvec: BoW vectors created from all tokenized news (articles/documents).
        modelType: Transformation model type: 'lsi', 'lda' or 'None' ('None' means the TF-IDF model).
        tfDim: The number of topics that will be extracted from each news item (articles/documents).
        renewModel: Whether to re-train the transformation models (bool type).
        modelPath: The path where trained transformation models are saved.
    '''
    if kwarg['renewModel']:
        tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
        tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        if kwarg['modelType'] == 'lsi':
            model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
            modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'lda':
            model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
            modelVec = model[tfidfVec]  # LDA vector of each document; sparse, each value is the membership weight for the corresponding topic
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    else:
        if not os.path.exists(kwarg['modelPath'] + "tfidf_model.tfidf"):
            tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
            tfidfVec = tfidf[Bowvec]
            tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        else:
            tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
            tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        if kwarg['modelType'] == 'lsi':
            if not os.path.exists(kwarg['modelPath'] + "lsi_model.lsi"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
                modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                model.save(kwarg['modelPath'] + "lsi_model.lsi")  # same for tfidf, lda, ...
            else:
                model = models.LsiModel.load(kwarg['modelPath'] + "lsi_model.lsi")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'lda':
            if not os.path.exists(kwarg['modelPath'] + "lda_model.lda"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                modelVec = model[tfidfVec]  # LDA vector of each document; sparse, each value is the membership weight for the corresponding topic
                model.save(kwarg['modelPath'] + "lda_model.lda")  # same for tfidf, lda, ...
            else:
                model = models.LdaModel.load(kwarg['modelPath'] + "lda_model.lda")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    return tfidfVec, modelVec
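A hypothetical invocation of the method above; the parameter names come from the kwargs it reads, while the object and inputs are assumed:

tfidfVec, lsiVec = processor.CallTransformationModel(
    Dict, Bowvec,
    modelType='lsi', tfDim=200,
    renewModel=True, modelPath='./models/',
)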
Example #16
Source File: test_lee.py From topical_word_embeddings with MIT License | 4 votes |
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform small corpus to lsi bow->log_ent->fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]

    # compute pairwise similarity matrix and extract upper triangular
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[matutils.triu_indices(len(corpus), 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s" % cor)
    self.assertTrue(cor > 0.6)

# def test_lee_mallet(self):
#     global bg_corpus, corpus, bg_corpus2, corpus2
#     # create a dictionary and corpus (bag of words)
#     dictionary = corpora.Dictionary(bg_corpus2)
#     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
#     corpus = [dictionary.doc2bow(text) for text in corpus2]
#     # initialize an LDA transformation from background corpus
#     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
#                            corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
#     corpus_lda = lda[corpus]
#     # compute pairwise similarity matrix and extract upper triangular
#     res = np.zeros((len(corpus), len(corpus)))
#     for i, par1 in enumerate(corpus_lda):
#         for j, par2 in enumerate(corpus_lda):
#             res[i, j] = matutils.cossim(par1, par2)
#     flat = res[matutils.triu_indices(len(corpus), 1)]
#     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
#     logging.info("LDA correlation coefficient is %s" % cor)
#     self.assertTrue(cor > 0.35)
Example #17
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 4 votes |
def align_documents_lsi(source_test_corpus, target_test_corpus, model_path, model_name, output_path, top_n=20, doc_separator=x_seperator):
    logging.info('aligning source and target documents using LSI model')
    dictionaryFile = model_path + model_name + '.dict'
    lsiFile = model_path + model_name + '.lsi'
    dictionary = corpora.Dictionary.load(dictionaryFile)
    logging.info('dictionary loaded')
    lsi = models.LsiModel.load(lsiFile)
    logging.info('lsi model loaded')
    logging.info('# of source docs %d \t# of target docs %d', len(source_test_corpus), len(target_test_corpus))
    source_lsi_corpus = generateLSIvectors(source_test_corpus, dictionary, lsi)
    logging.info('projects source corpus into LSI space')
    target_lsi_corpus = generateLSIvectors(target_test_corpus, dictionary, lsi)
    logging.info('projects target corpus into LSI space')
    allSims = []
    doc_tuple = []
    source_index = 0
    for d in source_lsi_corpus:
        target_index, sim = getComparable(d, target_lsi_corpus)
        allSims.append(sim)
        source_doc = source_test_corpus[source_index]
        target_doc = target_test_corpus[target_index]
        # remove the already aligned document from the target corpus
        del target_lsi_corpus[target_index]
        del target_test_corpus[target_index]
        doc_tuple.append((source_index, target_index, source_doc, target_doc))
        if not target_lsi_corpus:
            break  # all target docs are aligned
        source_index += 1
    sortedAllSims = sorted(enumerate(allSims), key=lambda item: -item[1])
    topNList = sortedAllSims[:top_n]
    out = open(output_path + 'results.txt', 'w')
    count = 0
    print('\n#, src, target, sim')  # Python 3 print() calls; the original used Python 2 print statements
    for e in topNList:
        i, sim = e
        srcIndx = doc_tuple[i][0]
        targetIndx = doc_tuple[i][1]
        sdoc = doc_tuple[i][2]
        tdoc = doc_tuple[i][3]
        print(count, srcIndx, targetIndx, '%0.2f' % sim)
        print(count, srcIndx, targetIndx, '%0.2f' % sim, file=out)
        source_out = open(output_path + str(count) + '.source.txt', 'w')
        target_out = open(output_path + str(count) + '.target.txt', 'w')
        print(sdoc, file=source_out)  # files opened in text mode; the original explicitly encoded to UTF-8
        print(tdoc, file=target_out)
        source_out.close()
        target_out.close()
        count += 1
    out.close()
    logging.info('aligning source and target documents using LSI model is done!')
##################################################################################
Example #18
Source File: topics_analysis.py From contextualLSTM with Apache License 2.0 | 4 votes |
def topic_analysis(corpus, dictionary, models_path, technique):
    import uuid
    uuid = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
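A hypothetical call, assuming a BoW corpus and dictionary were built beforehand:

topic_analysis(bow_corpus, dictionary, models_path='./models', technique='lsa')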