Python gensim.models.LsiModel() Examples
The following are 18 code examples of gensim.models.LsiModel(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models, or try the search function.
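Before diving into the examples, here is a minimal, self-contained sketch of the pipeline most of them follow (dictionary -> bag-of-words -> TF-IDF -> LSI). The toy documents and the num_topics value are illustrative assumptions, not taken from any of the projects below.

from gensim import corpora, models

# Toy corpus of pre-tokenized documents (illustrative only)
docs = [
    ["human", "machine", "interface"],
    ["graph", "minors", "survey"],
    ["graph", "trees", "interface"],
]

dictionary = corpora.Dictionary(docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# Weight the corpus with TF-IDF, then fold it into a low-dimensional LSI space
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

# Project a single document through the same chain: bow -> tfidf -> lsi
print(lsi[tfidf[bow_corpus[0]]])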
Example #1
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 6 votes |
def build_lsi_model(corpus_name, corpus_path, topics=300):
    logging.info('building lsi model for %s corpus', corpus_name)
    dictFile = corpus_path + corpus_name + '.dict'
    corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'

    logging.info('loading dictionary ...')
    dictionary = corpora.Dictionary.load(dictFile)
    logging.info('loading tfidf corpus ...')
    corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
    logging.info('building lsi model')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    logging.info('saving lsi')
    lsiFile = corpus_path + corpus_name + '.lsi'
    lsi.save(lsiFile)
    logging.info('lsi model is ready')
##################################################################################
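As a companion sketch (not part of the project), the artifacts saved by build_lsi_model() could be reloaded later using the same file-naming convention; `tokens` here is a hypothetical tokenized document:

# Hypothetical reload, mirroring the paths used in build_lsi_model()
dictionary = corpora.Dictionary.load(corpus_path + corpus_name + '.dict')
lsi = models.LsiModel.load(corpus_path + corpus_name + '.lsi')
doc_vec = lsi[dictionary.doc2bow(tokens)]  # project a new document into LSI space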
Example #2
Source File: builder.py From Greynir with GNU General Public License v3.0 | 6 votes |
def create_lsi_model(self, **kwargs):
    """ Create an LSI model from the entire words database table """
    corpus_tfidf = self.load_tfidf_corpus()
    if self._dictionary is None:
        self.load_dictionary()
    # Initialize an LSI transformation
    lsi = models.LsiModel(
        corpus_tfidf,
        id2word=self._dictionary,
        num_topics=self._dimensions,
        **kwargs
    )
    # if self._verbose:
    #     lsi.print_topics(num_topics=self._dimensions)
    # Save the generated model
    lsi.save(self._LSI_MODEL_FILE.format(self._dimensions))
Example #3
Source File: text2vec.py From text2vec with Apache License 2.0 | 5 votes |
def get_lsi(self, num_topics=300):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lsi = models.LsiModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lsi = model_lsi[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
    return docs_vecs

# Get Random Projections (RP) vector for document list
Example #4
Source File: builder.py From Greynir with GNU General Public License v3.0 | 5 votes |
def load_lsi_model(self):
    """ Load a previously generated LSI model """
    self._model = models.LsiModel.load(
        self._LSI_MODEL_FILE.format(self._dimensions), mmap="r"
    )
    self._model_name = "lsi"
Example #5
Source File: lsi_neighbor.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
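At query time the returned triple is typically chained in the same order; a hedged sketch, where `query_bow` is a hypothetical doc2bow vector for a new document:

# Transform the query exactly as the corpus was transformed, then rank against the index
sims = corpus_simi_matrix[lsi_model[tfidf_model[query_bow]]]
top5 = sorted(enumerate(sims), key=lambda x: -x[1])[:5]  # five most similar documents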
Example #6
Source File: lsi_author.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
Example #7
Source File: lsi_model.py From aca with MIT License | 5 votes |
def create_lsi_model(num_topics, dictionary, corpus):
    print("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    # lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    # corpus_lsi = lsi_model[corpus]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    # corpus_simi_matrix = similarities.MatrixSimilarity(corpus_tfidf)
    return (tfidf_model, lsi_model, corpus_simi_matrix)
Example #8
Source File: topic_modeling.py From text-analytics-with-python with Apache License 2.0 | 5 votes |
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=total_topics)
    return lsi
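After training, the extracted topics can be inspected with print_topics(); a short usage sketch in which `toy_corpus` is a hypothetical list of raw documents:

lsi = train_lsi_model_gensim(toy_corpus, total_topics=2)
for topic_id, terms in lsi.print_topics(2):
    print(topic_id, terms)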
Example #9
Source File: sentenceSimilarity.py From QAmodel-for-Retrievalchatbot with MIT License | 5 votes |
def LsiModel(self):
    self.simple_model()

    # Transformation model
    self.model = models.LsiModel(self.corpus_simple)
    self.corpus = self.model[self.corpus_simple]

    # Build the similarity matrix
    self.index = similarities.MatrixSimilarity(self.corpus)

# LDA model
Example #10
Source File: sentenceSimilarity.py From Customer-Chatbot with MIT License | 5 votes |
def LsiModel(self):
    self.simple_model()

    # Transformation model
    self.model = models.LsiModel(self.corpus_simple)
    self.corpus = self.model[self.corpus_simple]

    # Build the similarity matrix
    self.index = similarities.MatrixSimilarity(self.corpus)

# LDA model
Example #11
Source File: similarity.py From bugbug with Mozilla Public License 2.0 | 5 votes |
def __init__(self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []
    for bug in bugzilla.get_bugs():
        textual_features = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], textual_features])

    # Assigning unique integer ids to all words
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)

    # Conversion to BoW
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

    # Initializing and applying the TF-IDF transformation model on the same corpus; the resulting corpus has the same dimensions
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]

    # Transform the TF-IDF corpus to a latent 300-D space via Latent Semantic Indexing
    self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, num_topics=300)
    corpus_lsi = self.lsi[corpus_tfidf]

    # Indexing the corpus
    self.index = similarities.Similarity(
        output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
    )
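A hedged sketch of how this index might be queried for an incoming bug; note the code above does not keep the TF-IDF model on self, so this assumes it were stored as self.tfidf, and `new_bug` is hypothetical:

# Hypothetical search, reusing the dictionary -> tfidf -> lsi chain from __init__
query = self.text_preprocess(self.get_text(new_bug))
sims = self.index[self.lsi[self.tfidf[self.dictionary.doc2bow(query)]]]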
Example #12
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License | 5 votes |
def CalSim(self, test_document, Type, best_num):
    '''Calculate similarities between the test document and all news (articles/documents).

    # Arguments:
        test_document: List of raw documents.
        Type: Model used for calculating similarities.
        best_num: Refers to the 'num_best' parameter in the Gensim module.
    '''
    if Type == 'Similarity-tfidf-index':
        tfidf = models.TfidfModel(self._BowVecOfEachDoc)
        tfidfVec = tfidf[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        self._similarity = similarities.Similarity(Type, tfidfVec, \
            num_features=self._num_features, num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
    elif Type == 'Similarity-LSI-index':
        lsi_model = models.LsiModel(self._BowVecOfEachDoc)
        corpus_lsi = lsi_model[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        self._similarity = similarities.Similarity(Type, corpus_lsi, \
            num_features=self._num_features, num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
    self.Print_CalSim()

    IdLst = []
    SimRltLst = []
    SimTxLst = []
    for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
        IdLst.append(Id)
        SimRltLst.append(Sim)
        SimTxLst.append(self._raw_documents[Id])
    return IdLst, SimTxLst, SimRltLst
Example #13
Source File: docsim.py From nlp_learning with MIT License | 5 votes |
def calc_similarity(self, prefix: str, text: str):
    """Compute similarities; return the indices and cosine values.

    Arguments:
        prefix {str} -- model file prefix
        text {str} -- text data
        value {float} -- threshold; only entries whose similarity exceeds it are returned
    """
    dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
    corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
    tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the TF-IDF model
    corpus_tfidf = tfidf_model[corpus]

    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('./models/similarity-lsi-index', corpus_lsi,
                                             num_features=400, num_best=3)

    cut_raw = self.segment(text)          # 1. tokenize
    corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a BoW vector
    corpus_tfidf = tfidf_model[corpus]    # 3. compute TF-IDF values
    corpus_lsi = lsi[corpus_tfidf]        # 4. compute LSI values
    sims = similarity_lsi[corpus_lsi]

    with open('./data/idx_dic.dic', 'r') as f:
        dt = f.read()
        idx_dic = eval(dt)

    result = []
    if sims is not None:
        result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]
    return result
Example #14
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 5 votes |
def align_sentences_lsi(source_sentences, target_sentences, model_path, model_name):
    logging.info('Sentence level alignment using LSI')
    dictionaryFile = model_path + model_name + '.dict'
    lsiFile = model_path + model_name + '.lsi'
    dictionary = corpora.Dictionary.load(dictionaryFile)
    logging.info('dictionary loaded')
    lsi = models.LsiModel.load(lsiFile)
    logging.info('lsi model loaded')
    source_lsi_sentences = generateLSIvectors(source_sentences, dictionary, lsi)
    logging.info('projects source sentences into LSI space')
    target_lsi_sentences = generateLSIvectors(target_sentences, dictionary, lsi)
    logging.info('projects target sentences into LSI space')
    source_index = 0
    new_source_doc = []
    new_target_doc = []
    for d in source_lsi_sentences:
        target_index, sim = getComparable(d, target_lsi_sentences)
        source_sent = source_sentences[source_index]
        target_sent = target_sentences[target_index]
        # remove the already aligned sentences from the target document
        del target_lsi_sentences[target_index]
        del target_sentences[target_index]
        new_source_doc.append(source_sent)
        new_target_doc.append(target_sent)
        if not target_lsi_sentences:
            break  # all target sentences are aligned
        source_index += 1
    return new_source_doc, new_target_doc
##################################################################################
# projecting a corpus into LSI space
Example #15
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License | 4 votes |
def CallTransformationModel(self, Dict, Bowvec, **kwarg):
    '''Invoke specific transformation models of the Gensim module.

    # Arguments:
        Dict: Dictionary made from all tokenized news (articles/documents).
        Bowvec: BoW vectors created from all tokenized news (articles/documents).
        modelType: Transformation model type: 'lsi', 'lda' or 'None' ('None' means the TF-IDF model).
        tfDim: The number of topics that will be extracted from each news item (articles/documents).
        renewModel: Whether to re-train the transformation models (bool type).
        modelPath: The path where trained transformation models are saved.
    '''
    if kwarg['renewModel']:
        tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
        tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        if kwarg['modelType'] == 'lsi':
            model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
            modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'lda':
            model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
            modelVec = model[tfidfVec]  # LDA vector of each document; sparse, each value is the membership weight for the corresponding topic
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    else:
        if not os.path.exists(kwarg['modelPath'] + "tfidf_model.tfidf"):
            tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
            tfidfVec = tfidf[Bowvec]
            tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        else:
            tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
            tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        if kwarg['modelType'] == 'lsi':
            if not os.path.exists(kwarg['modelPath'] + "lsi_model.lsi"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
                modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                model.save(kwarg['modelPath'] + "lsi_model.lsi")  # same for tfidf, lda, ...
            else:
                model = models.LsiModel.load(kwarg['modelPath'] + "lsi_model.lsi")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'lda':
            if not os.path.exists(kwarg['modelPath'] + "lda_model.lda"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                modelVec = model[tfidfVec]  # LDA vector of each document; sparse, each value is the membership weight for the corresponding topic
                model.save(kwarg['modelPath'] + "lda_model.lda")  # same for tfidf, lda, ...
            else:
                model = models.LdaModel.load(kwarg['modelPath'] + "lda_model.lda")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    return tfidfVec, modelVec
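A hypothetical invocation of the method above; the parameter names come from the kwargs it reads, while the object and inputs are assumed:

tfidfVec, lsiVec = processor.CallTransformationModel(
    Dict, Bowvec,
    modelType='lsi', tfDim=200,
    renewModel=True, modelPath='./models/',
)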
Example #16
Source File: test_lee.py From topical_word_embeddings with MIT License | 4 votes |
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform small corpus to lsi bow->log_ent->fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]

    # compute pairwise similarity matrix and extract upper triangular
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[matutils.triu_indices(len(corpus), 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s" % cor)
    self.assertTrue(cor > 0.6)

# def test_lee_mallet(self):
#     global bg_corpus, corpus, bg_corpus2, corpus2
#     # create a dictionary and corpus (bag of words)
#     dictionary = corpora.Dictionary(bg_corpus2)
#     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
#     corpus = [dictionary.doc2bow(text) for text in corpus2]
#     # initialize an LDA transformation from background corpus
#     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
#                            corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
#     corpus_lda = lda[corpus]
#     # compute pairwise similarity matrix and extract upper triangular
#     res = np.zeros((len(corpus), len(corpus)))
#     for i, par1 in enumerate(corpus_lda):
#         for j, par2 in enumerate(corpus_lda):
#             res[i, j] = matutils.cossim(par1, par2)
#     flat = res[matutils.triu_indices(len(corpus), 1)]
#     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
#     logging.info("LDA correlation coefficient is %s" % cor)
#     self.assertTrue(cor > 0.35)
Example #17
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 4 votes |
def align_documents_lsi(source_test_corpus, target_test_corpus, model_path, model_name, output_path, top_n=20, doc_separator=x_seperator):
    logging.info('aligning source and target documents using LSI model')
    dictionaryFile = model_path + model_name + '.dict'
    lsiFile = model_path + model_name + '.lsi'
    dictionary = corpora.Dictionary.load(dictionaryFile)
    logging.info('dictionary loaded')
    lsi = models.LsiModel.load(lsiFile)
    logging.info('lsi model loaded')
    logging.info('# of source docs %d \t# of target docs %d', len(source_test_corpus), len(target_test_corpus))
    source_lsi_corpus = generateLSIvectors(source_test_corpus, dictionary, lsi)
    logging.info('projects source corpus into LSI space')
    target_lsi_corpus = generateLSIvectors(target_test_corpus, dictionary, lsi)
    logging.info('projects target corpus into LSI space')
    allSims = []
    doc_tuple = []
    source_index = 0
    for d in source_lsi_corpus:
        target_index, sim = getComparable(d, target_lsi_corpus)
        allSims.append(sim)
        source_doc = source_test_corpus[source_index]
        target_doc = target_test_corpus[target_index]
        # remove the already aligned document from the target corpus
        del target_lsi_corpus[target_index]
        del target_test_corpus[target_index]
        doc_tuple.append((source_index, target_index, source_doc, target_doc))
        if not target_lsi_corpus:
            break  # all target docs are aligned
        source_index += 1
    sortedAllSims = sorted(enumerate(allSims), key=lambda item: -item[1])
    topNList = sortedAllSims[:top_n]
    out = open(output_path + 'results.txt', 'w')
    count = 0
    print('\n#, src, target, sim')  # Python 3 print() calls; the original used Python 2 print statements
    for e in topNList:
        i, sim = e
        srcIndx = doc_tuple[i][0]
        targetIndx = doc_tuple[i][1]
        sdoc = doc_tuple[i][2]
        tdoc = doc_tuple[i][3]
        print(count, srcIndx, targetIndx, '%0.2f' % sim)
        print(count, srcIndx, targetIndx, '%0.2f' % sim, file=out)
        source_out = open(output_path + str(count) + '.source.txt', 'w')
        target_out = open(output_path + str(count) + '.target.txt', 'w')
        print(sdoc, file=source_out)  # files opened in text mode; the original explicitly encoded to UTF-8
        print(tdoc, file=target_out)
        source_out.close()
        target_out.close()
        count += 1
    out.close()
    logging.info('aligning source and target documents using LSI model is done!')
##################################################################################
Example #18
Source File: topics_analysis.py From contextualLSTM with Apache License 2.0 | 4 votes |
def topic_analysis(corpus, dictionary, models_path, technique):
    import uuid
    uuid = str(uuid.uuid4())
    print("[BLOCK] Starting models for context")
    sys.stdout.flush()

    if technique == "all" or technique == "hdp":
        t1 = time()
        # HDP model
        model = HdpModel(corpus, id2word=dictionary)
        model.save("%s/hdp_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for HDP model: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldap":
        t1 = time()
        # Parallel LDA model
        model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20)
        model.save("%s/lda_parallel_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA multicore: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lsa":
        t1 = time()
        # LSA model
        model = LsiModel(corpus, id2word=dictionary, num_topics=400)
        model.save("%s/lsa_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LSA: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "ldao":
        t1 = time()
        # Online LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)
        model.save("%s/lda_online_%s" % (models_path, uuid))
        t2 = time()
        print("[BLOCK] Training time for LDA online: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()

    if technique == "all" or technique == "lda":
        t1 = time()
        # Offline LDA model
        model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20)
        model.save("%s/lda_offline_%s" % (models_path, uuid))
        del model
        t2 = time()
        print("[BLOCK] Training time for LDA offline: %s" % (round(t2 - t1, 2)))
        sys.stdout.flush()
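A hypothetical call, assuming a BoW corpus and dictionary were built beforehand:

topic_analysis(bow_corpus, dictionary, models_path='./models', technique='lsa')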