Python gensim.models.LdaModel() Examples
The following are 11 code examples of gensim.models.LdaModel(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions/classes of the module gensim.models.
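Before the project examples, here is a minimal, self-contained sketch of the typical LdaModel() workflow. The toy corpus, topic count, and pass count are illustrative only, not taken from any project below:

from gensim import corpora, models

# Toy corpus: each document is a list of tokens
texts = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["graph", "trees", "minors", "survey"],
]

dictionary = corpora.Dictionary(texts)            # token -> integer id mapping
corpus = [dictionary.doc2bow(t) for t in texts]   # bag-of-words vectors

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10)
print(lda.print_topics(num_topics=2, num_words=4))  # inspect the learned topics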
Example #1
Source File: GensimLDA.py From termite-data-server with BSD 3-Clause "New" or "Revised" License
def Execute( self, tokenRegex, numTopics, numPasses ):
    if not os.path.exists( self.modelPath ):
        os.makedirs( self.modelPath )

    # Generate gensim objects
    corpus = GensimTermiteCorpusReader( self.corpusPath, tokenRegex )
    corpus.dictionary.filter_extremes( no_above = 0.2 )  # remove words that are too frequent/too infrequent
    model = models.LdaModel( corpus, id2word = corpus.dictionary, num_topics = numTopics, passes = numPasses )

    self.logger.info( 'Saving dictionary to disk: %s', self.dictionaryInGensim )
    corpus.dictionary.save( self.dictionaryInGensim )

    self.logger.info( 'Saving corpus to disk: %s', self.corpusInGensim )
    corpus.save( self.corpusInGensim )

    self.logger.info( 'Saving model to disk: %s', self.modelInGensim )
    model.save( self.modelInGensim )
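Since this example persists the dictionary, corpus, and model, the natural counterpart is reloading them in a later session. A minimal sketch using gensim's load methods; the literal paths are placeholders (in the project they come from self.dictionaryInGensim and self.modelInGensim):

from gensim import corpora, models

# Placeholder paths, not the project's actual ones
dictionary = corpora.Dictionary.load('model/dictionary.gensim')
lda = models.LdaModel.load('model/lda.gensim')

print(lda.show_topics(num_topics=5, num_words=8))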
Example #2
Source File: text2vec.py From text2vec with Apache License 2.0
def get_lda(self, num_topics=100):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lda = model_lda[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
    return docs_vecs

# Get Hierarchical Dirichlet Process (HDP) vector for document list
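The sparse2full call (from gensim.matutils) converts gensim's sparse (id, weight) pairs into a fixed-length dense NumPy vector, so the per-document topic mixtures can be stacked with np.vstack. A standalone sketch of just that step, with made-up values:

from gensim.matutils import sparse2full

doc_lda = [(0, 0.4), (2, 0.6)]   # sparse LDA output: (topic_id, weight) pairs
dense = sparse2full(doc_lda, 5)  # pad to a length-5 dense vector
print(dense)                     # [0.4 0.  0.6 0.  0. ]

Note that the example above pads each vector to the dictionary length; since LDA document vectors index topics rather than vocabulary terms, padding to num_topics would give more compact vectors.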
Example #3
Source File: topic_modeling.py From text-analytics-with-python with Apache License 2.0
def train_lda_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf,
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
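This function trains LdaModel on tf-idf-weighted vectors. LDA is formulated over word counts, so feeding the bag-of-words corpus directly is the more conventional choice; a minimal sketch of that variant (train_lda_model_bow is a hypothetical name, and normalize_corpus is assumed to behave as in the project):

def train_lda_model_bow(corpus, total_topics=2):
    # Same pipeline, but passing raw bag-of-words counts to LdaModel
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    return models.LdaModel(mapped_corpus, id2word=dictionary,
                           iterations=1000, num_topics=total_topics)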
Example #4
Source File: sentenceSimilarity.py From QAmodel-for-Retrievalchatbot with MIT License
def LdaModel(self):
    self.simple_model()

    # Apply the transformation model
    self.model = models.LdaModel(self.corpus_simple)
    self.corpus = self.model[self.corpus_simple]

    # Build the similarity-matrix index
    self.index = similarities.MatrixSimilarity(self.corpus)

# Preprocess a newly input sentence (the sentence to compare against)
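Once the MatrixSimilarity index is built, querying it with a new sentence follows the same bow -> LDA -> index path. A hedged sketch of such a query method; the self.dictionary attribute and self.tokenize helper are assumptions about the surrounding class, not code from the project:

def similarity(self, sentence):
    # Hypothetical helper: tokenize and convert the sentence to bag-of-words
    query_bow = self.dictionary.doc2bow(self.tokenize(sentence))
    query_lda = self.model[query_bow]  # map into LDA topic space
    sims = self.index[query_lda]       # cosine similarities against the corpus
    return sims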
Example #5
Source File: sentenceSimilarity.py From Customer-Chatbot with MIT License
def LdaModel(self):
    self.simple_model()

    # Apply the transformation model
    self.model = models.LdaModel(self.corpus_simple)
    self.corpus = self.model[self.corpus_simple]

    # Build the similarity-matrix index
    self.index = similarities.MatrixSimilarity(self.corpus)

# Preprocess a newly input sentence (the sentence to compare against)
Example #6
Source File: LDAModel_English.py From LDA_RecEngine with Apache License 2.0
def trainModel(self):
    '''
    Train an LDA model in 4 steps:
    1. Parse the whole corpora into unigram token collections and a document mapping (for later use)
    2. Filter out tokens that are too rare (no_below_this_number) or too common (no_above_fraction_of_doc)
    3. Index the token collections and apply the TF-IDF transformation
    4. Call gensim.models.LdaModel and generate topic distributions of the corpora
    '''
    print 'Start preparing unigram tokens....'

    ## Start of preparing the list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise the Bag-Of-Words (BOW)
    # Get document_count, tokens, and document-index mapping from the corpora
    doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
    # Put the training data into gensim.corpora for later use
    dic = corpora.Dictionary(train_set)
    denominator = len(dic)
    # Filter infrequent words & common stopwords, thus reducing the dimension of terms (which prevents the curse of dimensionality)
    dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
    nominator = len(dic)
    corpus = [dic.doc2bow(text) for text in train_set]  # transform every token into BOW
    print 'There are %i documents in the pool' % (doc_count)
    print "In the corpus there are ", denominator, " raw tokens"
    print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1 - (nominator / denominator)), "%"
    print 'Finished preparing unigram tokens....'
    ## END

    print 'Start training LDA model....'
    ## Implement TF-IDF as a vector for each document, and train the LDA model on top of that
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics, iterations=self.num_of_iterations, passes=self.passes)
    corpus_lda = lda[corpus_tfidf]

    # Once done training, print all the topics and related words
    print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'
    for i in range(self.num_topics):
        print 'Topic %s : ' % (str(i)) + lda.print_topic(i)

    # Exhibit the perplexity of the current model under the specific topic hyperparameter k. The lower the better
    print '==============================='
    print 'Model perplexity : ', lda.bound(corpus_lda), ' when topic k =', str(self.num_topics)
    print '==============================='

    return lda, doc_mapping, link_mapping, corpus
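One caveat on the "perplexity" printed above: LdaModel.bound() returns the variational lower bound on log likelihood, not perplexity itself. If actual per-word perplexity is wanted, gensim exposes log_perplexity; a minimal sketch reusing lda and corpus_tfidf from the example:

import numpy as np

per_word_bound = lda.log_perplexity(corpus_tfidf)  # per-word likelihood bound, log base 2
perplexity = np.exp2(-per_word_bound)              # lower is better
print('Per-word perplexity: %.2f' % perplexity)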
Example #7
Source File: lda_demo.py From Zhihu_Spider with Apache License 2.0
def test_model():
    '''
    after setting the lda model, test the model
    '''
    lda_model = models.LdaModel.load('./zhihu_dat/zhihu_10.lda')
    # transform the question_corpus into lda space, print the lda feature
    question_lda = lda_model[question_corpus]
    for doc in question_lda:
        print doc
Example #8
Source File: lda_demo.py From Zhihu_Spider with Apache License 2.0
def build_lda_mode():
    # corpus is bag of words, which is the original feature
    corpus = corpora.BleiCorpus('./zhihu_dat/item.dat')  # the bag-of-words feature of the question data
    # Build the LDA model: given a bag-of-words feature, return the topic feature;
    # the topic model thus reduces the dimension of a document's features.
    # NOTE: `dictionary` must be defined elsewhere in the module for this call to work.
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    # save the model to disk for future use (given a document such as a question, return its topic feature)
    lda_model.save('./zhihu_dat/zhihu_10.lda')
    print 'Building complete'
Example #9
Source File: topic_modelling.py From Sarcasm-Detection with MIT License
def gensim_lda_topic_modelling(path, documents, num_of_topics=6, passes=50, verbose=True, plotTopicsResults=True):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)
    start = time.time()
    ldamodel = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))
    ldatopics = ldamodel.show_topics(formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic] for topicid, topic in ldatopics]
    if verbose:
        print("\nList of words associated with each topic:\n")
        for i in range(len(ldatopics_words)):
            print("\nTopic %d:\n" % i)
            for w, p in ldatopics_words[i]:
                print(p, " - ", w)
    if plotTopicsResults:
        plot_top_10_words_per_topic(path, ldatopics_words, num_topics=6, num_top_words=10)
    all_documents_topics = [(doc_topics, word_topics, word_phis)
                            for doc_topics, word_topics, word_phis
                            in ldamodel.get_document_topics(corpus, per_word_topics=True)]
    all_doc_topics = []
    for i in range(len(all_documents_topics)):
        doc_topics, word_topics, phi_values = all_documents_topics[i]
        all_doc_topics.append([doc_topics[i][1] for i in range(len(doc_topics))])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')
    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)
    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc)
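get_document_topics can also be called on a single bag-of-words document without the word-level detail, which is often all that is needed; a minimal sketch reusing ldamodel and corpus from above:

# Topic mixture of the first document only, ignoring word-level assignments
doc_topics = ldamodel.get_document_topics(corpus[0], minimum_probability=0.05)
for topic_id, prob in doc_topics:
    print("topic %d: %.3f" % (topic_id, prob))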
Example #10
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License
def CallTransformationModel(self, Dict, Bowvec, **kwarg):
    '''Invoke specific transformation models of the Gensim module.
    # Arguments:
        Dict: Dictionary made from all tokenized news (articles/documents).
        Bowvec: Bow-vector created from all tokenized news (articles/documents).
        modelType: Transformation model type: 'lsi', 'lda' or 'None' ('None' means the TF-IDF model).
        tfDim: The number of topics that will be extracted from each news item (articles/documents).
        renewModel: Re-train the transformation models or not (bool type).
        modelPath: The path for saving trained transformation models.
    '''
    if kwarg['renewModel']:
        tfidf = models.TfidfModel(Bowvec)  # initialize the tfidf model
        tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        if kwarg['modelType'] == 'lsi':
            model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
            modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'lda':
            model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
            modelVec = model[tfidfVec]  # the LDA vector of each document: sparse, each element being the membership weight for the corresponding topic
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    else:
        if not os.path.exists(kwarg['modelPath'] + "tfidf_model.tfidf"):
            tfidf = models.TfidfModel(Bowvec)  # initialize the tfidf model
            tfidfVec = tfidf[Bowvec]
            tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        else:
            tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
            tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        if kwarg['modelType'] == 'lsi':
            if not os.path.exists(kwarg['modelPath'] + "lsi_model.lsi"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
                modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                model.save(kwarg['modelPath'] + "lsi_model.lsi")  # same for tfidf, lda, ...
            else:
                model = models.LsiModel.load(kwarg['modelPath'] + "lsi_model.lsi")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'lda':
            if not os.path.exists(kwarg['modelPath'] + "lda_model.lda"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                modelVec = model[tfidfVec]  # the LDA vector of each document: sparse, each element being the membership weight for the corresponding topic
                model.save(kwarg['modelPath'] + "lda_model.lda")  # same for tfidf, lda, ...
            else:
                model = models.LdaModel.load(kwarg['modelPath'] + "lda_model.lda")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    return tfidfVec, modelVec
Example #11
Source File: topics_analysis.py From contextualLSTM with Apache License 2.0
def topic_analysis(corpus, dictionary, models_path, technique):
    import uuid
    uuid = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
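When training several variants like this, topic coherence is a common way to compare them afterwards. A hedged sketch using gensim's CoherenceModel; the 'u_mass' measure is chosen here because it works directly from the bag-of-words corpus, with no raw texts needed:

from gensim.models import CoherenceModel, LdaModel

# corpus and dictionary as in the function above
lda = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20)
cm = CoherenceModel(model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print("u_mass coherence: %.3f" % cm.get_coherence())  # closer to zero is generally better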