Python gensim.models.LdaModel() Examples

The following are 10 code examples of gensim.models.LdaModel(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions/classes of the module gensim.models, or try the search function.
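Before diving in, here is a minimal, self-contained sketch of the call pattern shared by the examples below (the toy documents are hypothetical): build a gensim Dictionary, convert each document to a bag-of-words with doc2bow, then hand the BoW corpus to models.LdaModel.

from gensim import corpora, models

documents = [["human", "machine", "interface"],
             ["graph", "minors", "survey"],
             ["graph", "trees", "interface"]]           # hypothetical toy corpus
dictionary = corpora.Dictionary(documents)              # token -> id mapping
corpus = [dictionary.doc2bow(doc) for doc in documents] # sparse (id, count) vectors
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10)
print(lda.print_topics())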
Example #1
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License
def Execute( self, tokenRegex, numTopics, numPasses ):
		if not os.path.exists( self.modelPath ):
			os.makedirs( self.modelPath )

		# Generate gensim objects
		corpus = GensimTermiteCorpusReader( self.corpusPath, tokenRegex )
		corpus.dictionary.filter_extremes( no_above = 0.2 )  # remove words that are too frequent/too infrequent
		model = models.LdaModel( corpus, id2word = corpus.dictionary, num_topics = numTopics, passes = numPasses )

		self.logger.info( 'Saving dictionary to disk: %s', self.dictionaryInGensim )
		corpus.dictionary.save( self.dictionaryInGensim )
		
		self.logger.info( 'Saving corpus to disk: %s', self.corpusInGensim )
		corpus.save( self.corpusInGensim )

		self.logger.info( 'Saving model to disk: %s', self.modelInGensim )
		model.save( self.modelInGensim ) 
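Not part of the original project, but for reference: the artifacts saved above can be reloaded with gensim's standard load methods (the path variables here stand in for the same attributes).

from gensim import corpora, models

dictionary = corpora.Dictionary.load(dictionary_path)  # e.g. self.dictionaryInGensim
model = models.LdaModel.load(model_path)               # e.g. self.modelInGensim
# corpus.save() above pickles the reader object itself, so it would be restored
# with GensimTermiteCorpusReader.load(corpus_path) rather than a gensim corpus class.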
Example #2
Source File: text2vec.py    From text2vec with Apache License 2.0
def get_lda(self, num_topics=100):
        docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
        model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
        docs_lda = model_lda[docs_corpus]
        # dense length should be the number of topics, not the vocabulary size
        docs_vecs = np.vstack([sparse2full(c, num_topics) for c in docs_lda])
        return docs_vecs

    # Get Hierarchical Dirichlet Process (HDP) vector for document list
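The excerpt relies on sparse2full from gensim.matutils (imported outside the excerpt). A standalone sketch of the same densification step, with hypothetical data:

import numpy as np
from gensim.matutils import sparse2full

docs_lda = [[(0, 0.9), (1, 0.1)], [(1, 1.0)]]        # sparse (topic_id, weight) pairs per doc
docs_vecs = np.vstack([sparse2full(doc, 2) for doc in docs_lda])
print(docs_vecs.shape)                                # (2, 2): one dense row per document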
Example #3
Source File: topic_modeling.py    From text-analytics-with-python with Apache License 2.0
def train_lda_model_gensim(corpus, total_topics=2):
    
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda 
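Two notes on this example. First, it trains LdaModel on TF-IDF weights; gensim accepts this, but LDA is defined over word counts, so passing the plain BoW corpus is the more orthodox choice. Second, a hedged usage sketch (normalize_corpus is project-specific, so the input here is just raw text documents, hypothetical):

toy_corpus = ["the sky is blue and beautiful",
              "the sun is bright today"]              # hypothetical input
lda = train_lda_model_gensim(toy_corpus, total_topics=2)
for topic_id, terms in lda.show_topics(num_topics=2, formatted=True):
    print(topic_id, ':', terms)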
Example #4
Source File: sentenceSimilarity.py    From QAmodel-for-Retrievalchatbot with MIT License
def LdaModel(self):
        self.simple_model()

        # Transform the corpus with an LDA model
        self.model = models.LdaModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]

        # Build the similarity matrix
        self.index = similarities.MatrixSimilarity(self.corpus)

    # Preprocess a newly input sentence (the sentence to compare)
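simple_model() and the corpus attributes are defined elsewhere in the class; a standalone sketch of the query pattern this sets up (names hypothetical):

query_bow = self.dictionary.doc2bow(new_sentence_tokens)  # assumes simple_model() built self.dictionary
query_lda = self.model[query_bow]                         # map the query into LDA space
sims = self.index[query_lda]                              # cosine similarities to every corpus sentence
best_match = sims.argmax()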
Example #5
Source File: LDAModel_English.py    From LDA_RecEngine with Apache License 2.0
def trainModel(self):
        '''
        Train an LDA model, in 4 steps:
        1. Parse the whole corpus into unigram token collections and a document mapping (for later use)
        2. Filter tokens that are too rare (no_below_this_number) or too common (no_above_fraction_of_doc)
        3. Index the token collections and apply a TF-IDF transformation
        4. Call gensim.models.LdaModel and generate topic distributions for the corpus
        '''
        print('Start preparing unigram tokens....')
        ## Prepare the list of documents and tokens [[words_in_1st_doc], [words_in_2nd_doc], ...], i.e. Bag-Of-Words (BOW)
        # Get the document count, tokens, and document-index mapping from the corpus
        doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
        # Put the training data into a gensim dictionary for later use
        dic = corpora.Dictionary(train_set)
        raw_token_count = len(dic)
        # Filter infrequent words & common stopwords, reducing the dimensionality of the term space
        dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
        kept_token_count = len(dic)
        corpus = [dic.doc2bow(text) for text in train_set]  # transform every document into BOW
        print('There are %i documents in the pool' % doc_count)
        print('In the corpus there are %i raw tokens' % raw_token_count)
        print('After filtering, the corpus has %i unique tokens, a reduction of %.1f%%'
              % (kept_token_count, 100.0 * (1 - float(kept_token_count) / raw_token_count)))
        print('Finished preparing unigram tokens....')
        ## END

        print('Start training LDA model....')
        ## Represent each document as a TF-IDF vector, and train the LDA model on top of that
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics, iterations=self.num_of_iterations, passes=self.passes)
        corpus_lda = lda[corpus_tfidf]
        # Once training is done, print all the topics and their most frequent words
        print('Finished training LDA model.......Here is the list of all topics & their most frequent words')
        for i in range(self.num_topics):
            print('Topic %s : %s' % (str(i), lda.print_topic(i)))
        # Report the variational bound of the current model for this choice of topic count k (higher is better);
        # bound() expects the (TF-IDF/BoW) document corpus, not the LDA-transformed one
        print('===============================')
        print('Model log-likelihood bound : %s when topic k = %s' % (lda.bound(corpus_tfidf), str(self.num_topics)))
        print('===============================')

        return lda, doc_mapping, link_mapping, corpus
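One caveat on the metric above: LdaModel.bound() returns the variational lower bound on the log likelihood, not perplexity. If perplexity is the metric wanted, gensim provides log_perplexity(); a sketch:

import numpy as np

perwordbound = lda.log_perplexity(corpus_tfidf)  # per-word likelihood bound (log base 2)
print('Per-word perplexity: %.2f' % np.exp2(-perwordbound))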
Example #6
Source File: lda_demo.py    From Zhihu_Spider with Apache License 2.0
def test_model():
    '''
    After training the LDA model, test it.
    '''
    lda_model = models.LdaModel.load('./zhihu_dat/zhihu_10.lda')

    # transform question_corpus into LDA space and print the LDA features
    question_lda = lda_model[question_corpus]
    for doc in question_lda:
        print(doc)
Example #7
Source File: lda_demo.py    From Zhihu_Spider with Apache License 2.0
def build_lda_model():
    # corpus is the bag-of-words (BoW) representation, i.e. the original features of the question data
    corpus = corpora.BleiCorpus('./zhihu_dat/item.dat')

    # Build the LDA model: given a bag-of-words feature vector, it returns a topic feature vector,
    # so the topic model effectively reduces the dimensionality of a document's features
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=10)

    # Save the model to disk for future use (given a document such as a question, return its topic features)
    lda_model.save('./zhihu_dat/zhihu_10.lda')
    print('Building complete')
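The snippet above refers to a module-level dictionary defined elsewhere in lda_demo.py; it would typically be loaded alongside the corpus, along these lines (path hypothetical):

from gensim import corpora
dictionary = corpora.Dictionary.load('./zhihu_dat/zhihu.dict')  # hypothetical path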
Example #8
Source File: topic_modelling.py    From Sarcasm-Detection with MIT License
def gensim_lda_topic_modelling(path, documents, num_of_topics=6, passes=50, verbose=True, plotTopicsResults=True):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)
    start = time.time()
    ldamodel = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

    ldatopics = ldamodel.show_topics(formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic] for topicid, topic in ldatopics]

    if verbose:
        print("\nList of words associated with each topic:\n")
        for i in range(len(ldatopics_words)):
            print("\nTopic %d:\n" % i)
            for w, p in ldatopics_words[i]:
                print(p, " - ", w)

    if plotTopicsResults:
        plot_top_10_words_per_topic(path, ldatopics_words, num_topics=num_of_topics, num_top_words=10)

    all_documents_topics = [(doc_topics, word_topics, word_phis)
                            for doc_topics, word_topics, word_phis
                            in ldamodel.get_document_topics(corpus, per_word_topics=True)]
    all_doc_topics = []
    for i in range(len(all_documents_topics)):
        doc_topics, word_topics, phi_values = all_documents_topics[i]
        all_doc_topics.append([doc_topics[i][1] for i in range(len(doc_topics))])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')

    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)

    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc) 
Example #9
Source File: text_processing.py    From Listed-company-news-crawl-and-text-analysis with MIT License
def CallTransformationModel(self, Dict, Bowvec, **kwarg):
        '''Invoke specific transformation models of the Gensim module.

        # Arguments:
            Dict: Dictionary built from all tokenized news (articles/documents).
            Bowvec: BoW vectors built from all tokenized news (articles/documents).
            modelType: Transformation model type: 'lsi', 'lda' or 'None'; 'None' means the TF-IDF model.
            tfDim: The number of topics to extract from the news (articles/documents).
            renewModel: Whether to re-train the transformation models (bool).
            modelPath: The path for saving trained transformation models.
        '''
        if kwarg['renewModel']:
            tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
            tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
            tfidf.save(kwarg['modelPath']+"tfidf_model.tfidf")
            if kwarg['modelType'] == 'lsi':
                model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim']) # initialize an LSI transformation
                modelVec = model[tfidfVec] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                model.save(kwarg['modelPath']+"lsi_model.lsi") # save under the same name the load branch below expects
            elif kwarg['modelType'] == 'lda':
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                modelVec = model[tfidfVec] # the LDA vector of each document: sparse; each element is the membership weight for the corresponding topic
                model.save(kwarg['modelPath']+"lda_model.lda") # save under the same name the load branch below expects
            elif kwarg['modelType'] == 'None':
                model = tfidf
                modelVec = tfidfVec
        else:
            if not os.path.exists(kwarg['modelPath']+"tfidf_model.tfidf"):
                tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
                tfidfVec = tfidf[Bowvec] # use the model to transform the whole corpus
                tfidf.save(kwarg['modelPath']+"tfidf_model.tfidf")
            else:
                tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf") 
                tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
            if kwarg['modelType'] == 'lsi':
                if not os.path.exists(kwarg['modelPath']+"lsi_model.lsi"):
                    tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf") 
                    tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
                    model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim']) # initialize an LSI transformation
                    modelVec = model[tfidfVec] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                    model.save(kwarg['modelPath']+"lsi_model.lsi") # same for tfidf, lda, ...
                else:
                    model = models.LsiModel.load(kwarg['modelPath']+"lsi_model.lsi")
                    modelVec = model[tfidfVec] 
            elif kwarg['modelType'] == 'lda':
                if not os.path.exists(kwarg['modelPath']+"lda_model.lda"):
                    tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf") 
                    tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
                    model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                    modelVec = model[tfidfVec] # the LDA vector of each document: sparse; each element is the membership weight for the corresponding topic
                    model.save(kwarg['modelPath']+"lda_model.lda") # same for tfidf, lda, ...
                else:
                    model = models.LdaModel.load(kwarg['modelPath']+"lda_model.lda")
                    modelVec = model[tfidfVec] 
            elif kwarg['modelType'] == 'None': 
                model = tfidf
                modelVec = tfidfVec
        return tfidfVec, modelVec 
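A hedged call sketch for the method above (the instance and inputs are hypothetical; keyword names follow the docstring):

tfidf_vec, lda_vec = processor.CallTransformationModel(
    Dict=dictionary,        # hypothetical gensim Dictionary built from the tokenized news
    Bowvec=bow_corpus,      # hypothetical BoW corpus for the same news
    modelType='lda',
    tfDim=50,
    renewModel=False,
    modelPath='./models/')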
Example #10
Source File: topics_analysis.py    From contextualLSTM with Apache License 2.0
def topic_analysis(corpus, dictionary, models_path, technique):

    import uuid
    run_id = str(uuid.uuid4())  # avoid shadowing the uuid module with a plain string
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100,  workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
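        # (update_every=1 means online variational Bayes, updating after each chunk of 10000 docs)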
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
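        # (update_every=0 means batch variational Bayes over the full corpus each pass)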
        model = LdaModel(corpus, id2word=dictionary, num_topics=100,  update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()