Python gensim.models.LdaModel() Examples

The following are 10 code examples of gensim.models.LdaModel(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions/classes of the module gensim.models, or try the search function.
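Before diving in, here is a minimal, self-contained sketch of the call pattern shared by the examples below (the toy documents are hypothetical): build a gensim Dictionary, convert each document to a bag-of-words with doc2bow, then hand the BoW corpus to models.LdaModel.

from gensim import corpora, models

documents = [["human", "machine", "interface"],
             ["graph", "minors", "survey"],
             ["graph", "trees", "interface"]]           # hypothetical toy corpus
dictionary = corpora.Dictionary(documents)              # token -> id mapping
corpus = [dictionary.doc2bow(doc) for doc in documents] # sparse (id, count) vectors
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10)
print(lda.print_topics())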
Example #1
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License
def Execute( self, tokenRegex, numTopics, numPasses ):
		if not os.path.exists( self.modelPath ):
			os.makedirs( self.modelPath )

		# Generate gensim objects
		corpus = GensimTermiteCorpusReader( self.corpusPath, tokenRegex )
		corpus.dictionary.filter_extremes( no_above = 0.2 )  # remove words that are too frequent/too infrequent
		model = models.LdaModel( corpus, id2word = corpus.dictionary, num_topics = numTopics, passes = numPasses )

		self.logger.info( 'Saving dictionary to disk: %s', self.dictionaryInGensim )
		corpus.dictionary.save( self.dictionaryInGensim )
		
		self.logger.info( 'Saving corpus to disk: %s', self.corpusInGensim )
		corpus.save( self.corpusInGensim )

		self.logger.info( 'Saving model to disk: %s', self.modelInGensim )
		model.save( self.modelInGensim ) 
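Not part of the original project, but for reference: the artifacts saved above can be reloaded with gensim's standard load methods (the path variables here stand in for the same attributes).

from gensim import corpora, models

dictionary = corpora.Dictionary.load(dictionary_path)  # e.g. self.dictionaryInGensim
model = models.LdaModel.load(model_path)               # e.g. self.modelInGensim
# corpus.save() above pickles the reader object itself, so it would be restored
# with GensimTermiteCorpusReader.load(corpus_path) rather than a gensim corpus class.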
Example #2
Source File: text2vec.py    From text2vec with Apache License 2.0
def get_lda(self, num_topics=100):
        docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
        model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
        docs_lda = model_lda[docs_corpus]
        # dense length should be the number of topics, not the vocabulary size
        docs_vecs = np.vstack([sparse2full(c, num_topics) for c in docs_lda])
        return docs_vecs

    # Get Hierarchical Dirichlet Process (HDP) vector for document list
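The excerpt relies on sparse2full from gensim.matutils (imported outside the excerpt). A standalone sketch of the same densification step, with hypothetical data:

import numpy as np
from gensim.matutils import sparse2full

docs_lda = [[(0, 0.9), (1, 0.1)], [(1, 1.0)]]        # sparse (topic_id, weight) pairs per doc
docs_vecs = np.vstack([sparse2full(doc, 2) for doc in docs_lda])
print(docs_vecs.shape)                                # (2, 2): one dense row per document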
Example #3
Source File: topic_modeling.py    From text-analytics-with-python with Apache License 2.0
def train_lda_model_gensim(corpus, total_topics=2):
    
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda 
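Two notes on this example. First, it trains LdaModel on TF-IDF weights; gensim accepts this, but LDA is defined over word counts, so passing the plain BoW corpus is the more orthodox choice. Second, a hedged usage sketch (normalize_corpus is project-specific, so the input here is just raw text documents, hypothetical):

toy_corpus = ["the sky is blue and beautiful",
              "the sun is bright today"]              # hypothetical input
lda = train_lda_model_gensim(toy_corpus, total_topics=2)
for topic_id, terms in lda.show_topics(num_topics=2, formatted=True):
    print(topic_id, ':', terms)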
Example #4
Source File: sentenceSimilarity.py    From QAmodel-for-Retrievalchatbot with MIT License
def LdaModel(self):
        self.simple_model()

        # Transform the corpus with an LDA model
        self.model = models.LdaModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]

        # Build the similarity matrix
        self.index = similarities.MatrixSimilarity(self.corpus)

    # Preprocess a newly input sentence (the sentence to compare)
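simple_model() and the corpus attributes are defined elsewhere in the class; a standalone sketch of the query pattern this sets up (names hypothetical):

query_bow = self.dictionary.doc2bow(new_sentence_tokens)  # assumes simple_model() built self.dictionary
query_lda = self.model[query_bow]                         # map the query into LDA space
sims = self.index[query_lda]                              # cosine similarities to every corpus sentence
best_match = sims.argmax()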
Example #5
Source File: LDAModel_English.py    From LDA_RecEngine with Apache License 2.0
def trainModel(self):
        '''
        Train an LDA model, in 4 steps:
        1. Parse the whole corpus into unigram token collections and a document mapping (for later use)
        2. Filter tokens that are too rare (no_below_this_number) or too common (no_above_fraction_of_doc)
        3. Index the token collections and apply a TF-IDF transformation
        4. Call gensim.models.LdaModel and generate topic distributions for the corpus
        '''
        print('Start preparing unigram tokens....')
        ## Prepare the list of documents and tokens [[words_in_1st_doc], [words_in_2nd_doc], ...], i.e. Bag-Of-Words (BOW)
        # Get the document count, tokens, and document-index mapping from the corpus
        doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
        # Put the training data into a gensim dictionary for later use
        dic = corpora.Dictionary(train_set)
        raw_token_count = len(dic)
        # Filter infrequent words & common stopwords, reducing the dimensionality of the term space
        dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
        kept_token_count = len(dic)
        corpus = [dic.doc2bow(text) for text in train_set]  # transform every document into BOW
        print('There are %i documents in the pool' % doc_count)
        print('In the corpus there are %i raw tokens' % raw_token_count)
        print('After filtering, the corpus has %i unique tokens, a reduction of %.1f%%'
              % (kept_token_count, 100.0 * (1 - float(kept_token_count) / raw_token_count)))
        print('Finished preparing unigram tokens....')
        ## END

        print('Start training LDA model....')
        ## Represent each document as a TF-IDF vector, and train the LDA model on top of that
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics, iterations=self.num_of_iterations, passes=self.passes)
        corpus_lda = lda[corpus_tfidf]
        # Once training is done, print all the topics and their most frequent words
        print('Finished training LDA model.......Here is the list of all topics & their most frequent words')
        for i in range(self.num_topics):
            print('Topic %s : %s' % (str(i), lda.print_topic(i)))
        # Report the variational bound of the current model for this choice of topic count k (higher is better);
        # bound() expects the (TF-IDF/BoW) document corpus, not the LDA-transformed one
        print('===============================')
        print('Model log-likelihood bound : %s when topic k = %s' % (lda.bound(corpus_tfidf), str(self.num_topics)))
        print('===============================')

        return lda, doc_mapping, link_mapping, corpus
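One caveat on the metric above: LdaModel.bound() returns the variational lower bound on the log likelihood, not perplexity. If perplexity is the metric wanted, gensim provides log_perplexity(); a sketch:

import numpy as np

perwordbound = lda.log_perplexity(corpus_tfidf)  # per-word likelihood bound (log base 2)
print('Per-word perplexity: %.2f' % np.exp2(-perwordbound))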
Example #6
Source File: lda_demo.py    From Zhihu_Spider with Apache License 2.0
def test_model():
    '''
    After training the LDA model, test it.
    '''
    lda_model = models.LdaModel.load('./zhihu_dat/zhihu_10.lda')

    # transform question_corpus into LDA space and print the LDA features
    question_lda = lda_model[question_corpus]
    for doc in question_lda:
        print(doc)
Example #7
Source File: lda_demo.py    From Zhihu_Spider with Apache License 2.0
def build_lda_model():
    # corpus is the bag-of-words (BoW) representation, i.e. the original features of the question data
    corpus = corpora.BleiCorpus('./zhihu_dat/item.dat')

    # Build the LDA model: given a bag-of-words feature vector, it returns a topic feature vector,
    # so the topic model effectively reduces the dimensionality of a document's features
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=10)

    # Save the model to disk for future use (given a document such as a question, return its topic features)
    lda_model.save('./zhihu_dat/zhihu_10.lda')
    print('Building complete')
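The snippet above refers to a module-level dictionary defined elsewhere in lda_demo.py; it would typically be loaded alongside the corpus, along these lines (path hypothetical):

from gensim import corpora
dictionary = corpora.Dictionary.load('./zhihu_dat/zhihu.dict')  # hypothetical path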
Example #8
Source File: topic_modelling.py    From Sarcasm-Detection with MIT License
def gensim_lda_topic_modelling(path, documents, num_of_topics=6, passes=50, verbose=True, plotTopicsResults=True):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)
    start = time.time()
    ldamodel = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

    ldatopics = ldamodel.show_topics(formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic] for topicid, topic in ldatopics]

    if verbose:
        print("\nList of words associated with each topic:\n")
        for i in range(len(ldatopics_words)):
            print("\nTopic %d:\n" % i)
            for w, p in ldatopics_words[i]:
                print(p, " - ", w)

    if plotTopicsResults:
        plot_top_10_words_per_topic(path, ldatopics_words, num_topics=num_of_topics, num_top_words=10)

    all_documents_topics = [(doc_topics, word_topics, word_phis)
                            for doc_topics, word_topics, word_phis
                            in ldamodel.get_document_topics(corpus, per_word_topics=True)]
    all_doc_topics = []
    for i in range(len(all_documents_topics)):
        doc_topics, word_topics, phi_values = all_documents_topics[i]
        all_doc_topics.append([doc_topics[i][1] for i in range(len(doc_topics))])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')

    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)

    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc) 
Example #9
Source File: text_processing.py    From Listed-company-news-crawl-and-text-analysis with MIT License
def CallTransformationModel(self, Dict, Bowvec, **kwarg):
        '''Invoke specific transformation models of the Gensim module.

        # Arguments:
            Dict: Dictionary built from all tokenized news (articles/documents).
            Bowvec: BoW vectors built from all tokenized news (articles/documents).
            modelType: Transformation model type: 'lsi', 'lda' or 'None'; 'None' means the TF-IDF model.
            tfDim: The number of topics to extract from the news (articles/documents).
            renewModel: Whether to re-train the transformation models (bool).
            modelPath: The path for saving trained transformation models.
        '''
        if kwarg['renewModel']:
            tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
            tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
            tfidf.save(kwarg['modelPath']+"tfidf_model.tfidf")
            if kwarg['modelType'] == 'lsi':
                model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim']) # initialize an LSI transformation
                modelVec = model[tfidfVec] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                model.save(kwarg['modelPath']+"lsi_model.lsi") # save under the same name the load branch below expects
            elif kwarg['modelType'] == 'lda':
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                modelVec = model[tfidfVec] # the LDA vector of each document: sparse; each element is the membership weight for the corresponding topic
                model.save(kwarg['modelPath']+"lda_model.lda") # save under the same name the load branch below expects
            elif kwarg['modelType'] == 'None':
                model = tfidf
                modelVec = tfidfVec
        else:
            if not os.path.exists(kwarg['modelPath']+"tfidf_model.tfidf"):
                tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
                tfidfVec = tfidf[Bowvec] # use the model to transform the whole corpus
                tfidf.save(kwarg['modelPath']+"tfidf_model.tfidf")
            else:
                tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf") 
                tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
            if kwarg['modelType'] == 'lsi':
                if not os.path.exists(kwarg['modelPath']+"lsi_model.lsi"):
                    tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf") 
                    tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
                    model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim']) # initialize an LSI transformation
                    modelVec = model[tfidfVec] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                    model.save(kwarg['modelPath']+"lsi_model.lsi") # same for tfidf, lda, ...
                else:
                    model = models.LsiModel.load(kwarg['modelPath']+"lsi_model.lsi")
                    modelVec = model[tfidfVec] 
            elif kwarg['modelType'] == 'lda':
                if not os.path.exists(kwarg['modelPath']+"lda_model.lda"):
                    tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf") 
                    tfidfVec = tfidf[Bowvec] # use the model to transform whole corpus
                    model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                    modelVec = model[tfidfVec] # the LDA vector of each document: sparse; each element is the membership weight for the corresponding topic
                    model.save(kwarg['modelPath']+"lda_model.lda") # same for tfidf, lda, ...
                else:
                    model = models.LdaModel.load(kwarg['modelPath']+"lda_model.lda")
                    modelVec = model[tfidfVec] 
            elif kwarg['modelType'] == 'None': 
                model = tfidf
                modelVec = tfidfVec
        return tfidfVec, modelVec 
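A hedged call sketch for the method above (the instance and inputs are hypothetical; keyword names follow the docstring):

tfidf_vec, lda_vec = processor.CallTransformationModel(
    Dict=dictionary,        # hypothetical gensim Dictionary built from the tokenized news
    Bowvec=bow_corpus,      # hypothetical BoW corpus for the same news
    modelType='lda',
    tfDim=50,
    renewModel=False,
    modelPath='./models/')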
Example #10
Source File: topics_analysis.py    From contextualLSTM with Apache License 2.0
def topic_analysis(corpus, dictionary, models_path, technique):

    import uuid
    run_id = str(uuid.uuid4())  # avoid shadowing the uuid module with a plain string
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100,  workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
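        # (update_every=1 means online variational Bayes, updating after each chunk of 10000 docs)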
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
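        # (update_every=0 means batch variational Bayes over the full corpus each pass)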
        model = LdaModel(corpus, id2word=dictionary, num_topics=100,  update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2-t1, 2)))
        sys.stdout.flush()