Python gensim.models.TfidfModel() Examples

The following are 24 code examples of gensim.models.TfidfModel(), drawn from open-source projects. The source file, project, and license are listed above each example. You may also want to check out all available functions/classes of the module gensim.models, or try the search function.
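Most examples follow the same basic pattern: build a Dictionary, convert each document to a bag-of-words vector, fit a TfidfModel on those vectors, then apply the model as a transformation. A minimal, self-contained sketch of that workflow (the toy documents are hypothetical, not taken from any example below):

from gensim import corpora, models

# Toy corpus: each document is a list of tokens.
documents = [["human", "machine", "interface"],
             ["graph", "of", "trees"],
             ["graph", "minors", "survey"]]

# 1. Map each distinct token to an integer id.
dictionary = corpora.Dictionary(documents)

# 2. Convert each document to a sparse bag-of-words vector.
corpus = [dictionary.doc2bow(doc) for doc in documents]

# 3. Fit the tf-idf weighting on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

# 4. Apply the model to one vector, or lazily to the whole corpus.
print(tfidf[corpus[0]])  # [(token_id, tf-idf weight), ...]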
Example #1
Source File: test_miislita.py    From topical_word_embeddings with MIT License
def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2) 
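Since sims_tfidf holds one cosine similarity per corpus document, in corpus order, the same result can be ranked rather than checked positionally. A small sketch reusing sims_tfidf from the test above:

# Rank the corpus documents by similarity to the query, best match first.
ranked = sorted(enumerate(sims_tfidf), key=lambda pair: pair[1], reverse=True)
for doc_id, score in ranked:
    print(doc_id, round(float(score), 4))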
Example #2
Source File: lexrankr.py    From lexrankr with MIT License
def summarize(self, text):
        self.sentences = self.factory.text2sentences(text)
        self.num_sentences = len(self.sentences)
        self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count, self.no_above_word_portion, self.max_dictionary_size)
        self.model = TfidfModel(self.corpus.bows, id2word=self.corpus.dictionary, normalize=True)
        self.tfidfs = self.model[self.corpus.bows]
        self._inject_tfidfs()
        self._build_matrix()
        self._clustering()
        if self.compactify:
            self._compactify()
        self.graphs = []
        for i in range(self.num_clusters):
            graph = self.sentences2graph(self.clusters[i])
            pagerank = networkx.pagerank(graph, weight='weight')
            self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True)
            self.graphs.append(graph) 
Example #3
Source File: VectorSpaceModel.py    From Snowball with GNU General Public License v3.0
def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print("Gathering sentences and removing stopwords")
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print(len(documents), "documents read")
        print(len(self.dictionary), "unique tokens") 
Example #4
Source File: train_TFIDF_model.py    From Short-Text-Summarization with Apache License 2.0
def train_TFIDF():

    list_cut_short_text = get_data.get_cut_PARTI_short_text()

    print("list_cut_short_text is %d" % len(list_cut_short_text))

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(list_cut_short_text)
    dictionary.save("dictionary.tfidf.dic")

    corpus = [dictionary.doc2bow(text) for text in list_cut_short_text]

    tfidf = models.TfidfModel(corpus)
    tfidf.save('./model/PARTI_tfidf_model') 
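The saved dictionary and model can be loaded back later for scoring new text. A sketch assuming the paths written above, where short_text_tokens is a hypothetical list of tokens:

from gensim import corpora, models

dictionary = corpora.Dictionary.load("dictionary.tfidf.dic")
tfidf = models.TfidfModel.load('./model/PARTI_tfidf_model')

bow = dictionary.doc2bow(short_text_tokens)
weights = tfidf[bow]  # [(token_id, tf-idf weight), ...]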
Example #5
Source File: cut_td_idf.py    From nlp_xiaojiang with MIT License
def init_tfidf_chinese_or_pinyin(sources_path):
    """
      构建td_idf
    :param path: 
    :return: 
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    with open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') as f:
        pickle.dump([dictionary, tfidf_model], f) 
Example #6
Source File: docsim.py    From nlp_learning with MIT License
def train(self, prefix: str, corporas: list):
        """ 训练模型
        保存字典,语料,模型到磁盘

        Arguments:
            prefix {str} -- 模型名称前缀
            corpora_documents {list} -- 分词后的文本
        """
        # 生成字典和向量语料
        dictionary = corpora.Dictionary(corporas)
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # 保存生成的词典

        corpus = [dictionary.doc2bow(text) for text in corporas]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # 保存生成的语料
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # 保存Tfidf模型 
Example #7
Source File: corpusbuilder.py    From simsearch with MIT License
def buildCorpus(self):
        """
        Build the corpus from the documents:
            1. Remove words that only appeared once.
            2. Create the Dictionary object.
            3. Convert the documents to simple bag-of-words representation.
            4. Convert the bag-of-words vectors to tf-idf.
        """
        # Remove words that only appear once.
        self.documents = [[token for token in doc if self.frequency[token] > 1]
                          for doc in self.documents]
        
        # Build a dictionary from the text.
        self.dictionary = corpora.Dictionary(self.documents)
        
        # Map the documents to vectors.
        corpus = [self.dictionary.doc2bow(text) for text in self.documents]

        # Delete the tokenized representation of the documents--no need to
        # carry this around!
        del self.documents[:]

        # Convert the simple bag-of-words vectors to a tf-idf representation.        
        self.tfidf_model = TfidfModel(corpus)
        self.corpus_tfidf = self.tfidf_model[corpus] 
Example #8
Source File: dtm.py    From PyShortTextCategorization with MIT License
def generate_dtm(self, corpus, tfidf=False):
        """ Generate the inside document-term matrix and other peripherical information
        objects. This is run when the class is instantiated.

        :param corpus: corpus.
        :param tfidf: whether to weigh using tf-idf. (Default: False)
        :return: None
        :type corpus: list
        :type tfidf: bool
        """
        self.dictionary = Dictionary(corpus)
        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=float)
        bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
        if tfidf:
            weighted_model = TfidfModel(bow_corpus)
            bow_corpus = weighted_model[bow_corpus]
        for docid in self.docids:
            for tokenid, count in bow_corpus[self.docid_dict[docid]]:
                self.dtm[self.docid_dict[docid], tokenid] = count 
Example #9
Source File: topic_modeling.py    From text-analytics-with-python with Apache License 2.0
def train_lsi_model_gensim(corpus, total_topics=2):
    
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, 
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi 
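A hedged usage sketch for the function above, assuming corpus is a list of raw text documents and that normalize_corpus() is importable from the book's accompanying toolkit:

lsi = train_lsi_model_gensim(corpus, total_topics=2)
for topic in lsi.print_topics(2):  # one entry per topic
    print(topic)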
Example #10
Source File: sentenceSimilarity.py    From Customer-Chatbot with MIT License
def TfidfModel(self):
        self.simple_model()

        # Build the tf-idf transformation model
        self.model = models.TfidfModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]

        # Build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LSI model 
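The method builds the index but never queries it. A sketch of the typical lookup, assuming simple_model() also stores the Dictionary as self.dictionary (a hypothetical attribute name) and that query_tokens is an already-segmented token list:

query_bow = self.dictionary.doc2bow(query_tokens)
sims = self.index[self.model[query_bow]]  # cosine similarity to every document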
Example #11
Source File: document_embedder.py    From fake-news-detection-pipeline with Apache License 2.0
def _set_tfidf(self):
        self._tfidf = TfidfModel(corpus=self.docs.get_bow())
        self._tfidf_score = [[(index, score) for index, score in self._tfidf[doc]] for doc in self.docs.get_bow()] 
Example #12
Source File: tfidf.py    From DeepNews with Apache License 2.0
def train_tfidf_model(self,file_path='../../temp_results/corpus.txt'):
        textfile = codecs.open(file_path, "r", "utf-8")   
        
        print("Reading and Processing Text File")
        first_lines=[]
        for line in textfile:
            first_lines.append(line.strip())
        
        print ("--------Building Corpora Dictionary---------------" )
        dictionary = corpora.Dictionary(line.split('#|#')[1].split() for line in first_lines)
        
        #remove words that appear fewer than 2 times
        #twoids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq < 2]
        #dictionary.filter_tokens(twoids)
        
        #Remove Gaps
        dictionary.compactify()
        dictionary.save_as_text('../../temp_results/tfidf_dictionary.txt',sort_by_word=False)
        dictionary.save('../../temp_results/tfidf_dictionary')
        print("Dictionary Saved")
                    
        print ("--Now Transforming to Bag of Words Vectors on the Fly--")
        class MyCorpus(object):
            def __iter__(self):
                for line in first_lines:
                    yield dictionary.doc2bow(line.split()) 
                
        news_corpus  = MyCorpus()
        print("Corpus Built...Now Starting Model Training")
        tfidf_model = models.TfidfModel(news_corpus)
        tfidf_model.save('../../temp_results/tfidf_model')
        print("Model Trained & Saved") 
Example #13
Source File: sentenceSimilarity.py    From QAmodel-for-Retrievalchatbot with MIT License
def TfidfModel(self):
        self.simple_model()

        # Build the tf-idf transformation model
        self.model = models.TfidfModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]

        # Build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LSI model 
Example #14
Source File: keyphrase_extraction.py    From text-analytics-with-python with Apache License 2.0
def get_tfidf_weighted_keyphrases(sentences, 
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):
    
    valid_chunks = get_chunks(sentences, grammar=grammar)
                                     
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    weighted_phrases = {dictionary.get(id): round(value,3) 
                        for doc in corpus_tfidf 
                        for id, value in doc}
                            
    weighted_phrases = sorted(weighted_phrases.items(), 
                              key=itemgetter(1), reverse=True)
    
    return weighted_phrases[:top_n] 
Example #15
Source File: lsi_neighbor.py    From aca with MIT License
def create_lsi_model(num_topics,dictionary,corpus):
    print ("create lsi model ...")

    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics = num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model,lsi_model,corpus_simi_matrix) 
Example #16
Source File: topic_modeling.py    From text-analytics-with-python with Apache License 2.0
def train_lda_model_gensim(corpus, total_topics=2):
    
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda 
Example #17
Source File: builder.py    From Greynir with GNU General Public License v3.0
def load_tfidf_model(self):
        """ Load an already generated TFIDF model """
        self._tfidf = models.TfidfModel.load(self._TFIDF_MODEL_FILE, mmap="r") 
Example #18
Source File: builder.py    From Greynir with GNU General Public License v3.0
def create_tfidf_model(self):
        """ Create a fresh TFIDF model from a dictionary """
        if self._dictionary is None:
            self.load_dictionary()
        tfidf = models.TfidfModel(dictionary=self._dictionary)
        tfidf.save(self._TFIDF_MODEL_FILE)
        self._tfidf = tfidf 
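When built from a dictionary alone, the model takes its idf statistics from the dictionary's document frequencies, so no pass over a bag-of-words corpus is needed. A sketch of applying it to unseen text, where tokens is a hypothetical token list:

bow = self._dictionary.doc2bow(tokens)
weights = self._tfidf[bow]  # [(token_id, tf-idf weight), ...]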
Example #19
Source File: lsi_model.py    From aca with MIT License
def create_lsi_model(num_topics,dictionary,corpus):

    print ("create lsi model ...")
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics = num_topics)
    #lsi_model = models.LsiModel(corpus,id2word=dictionary,num_topics = num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    #corpus_lsi = lsi_model[corpus]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    #corpus_simi_matrix = similarities.MatrixSimilarity(corpus_tfidf)
    return (tfidf_model,lsi_model,corpus_simi_matrix) 
Example #20
Source File: lsi_author.py    From aca with MIT License
def create_lsi_model(num_topics,dictionary,corpus):
    print ("create lsi model ...")

    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    lsi_model = models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics = num_topics)
    corpus_lsi = lsi_model[corpus_tfidf]
    corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
    return (tfidf_model,lsi_model,corpus_simi_matrix) 
Example #21
Source File: docsim.py    From nlp_learning with MIT License
def update_model(self, prefix: str, sysno: int, doc: str):
        """
        更新字典
        :param prefix:
        :param sysno: 系统编号
        :param doc:   文本内容
        :return:
        """

        corporas = self.segment(doc)
        # Update the dictionary
        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load
        dictionary.add_documents([corporas])
        dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the updated dictionary

        corporas_docs = np.load("./data/{}_words.npy".format(prefix))
        corporas_docs = list(corporas_docs)
        corporas_docs.append(corporas)
        np.save("./data/{}_words.npy".format(prefix), corporas_docs)
        # Update the corpus
        corpus = [dictionary.doc2bow(text) for text in corporas_docs]
        corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)

        # Update the TfidfModel
        tfidf_model = models.TfidfModel(corpus)
        tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))

        # Update the index dictionary
        with open('./data/idx_dic.dic', 'r') as f:
            dt = f.read()
            idx_dic = eval(dt)

        if sysno not in idx_dic.values():
            idx_dic[len(idx_dic)] = sysno

        with open('./data/idx_dic.dic', 'w') as f:
            f.write(str(idx_dic)) 
Example #22
Source File: tfidfmodel.py    From dialogbot with Apache License 2.0
def _train_model(self, min_freq=1):
        # Create tfidf model.
        self.dct = corpora.Dictionary(self.contexts)
        # Filter low frequency words from dictionary.
        low_freq_ids = [id_ for id_, freq in
                        self.dct.dfs.items() if freq <= min_freq]
        self.dct.filter_tokens(low_freq_ids)
        self.dct.compactify()
        # Build tfidf model.
        self.corpus = [self.dct.doc2bow(s) for s in self.contexts]
        self.tfidf_model = models.TfidfModel(self.corpus) 
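Roughly the same pruning is available in one call through Dictionary.filter_extremes, which compactifies as a side effect; a hedged alternative sketch (the parameter mapping is an assumption, not the project's code):

# Keep only tokens that appear in more than min_freq documents.
self.dct.filter_extremes(no_below=min_freq + 1, no_above=1.0, keep_n=None)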
Example #23
Source File: textpro.py    From comparable-text-miner with Apache License 2.0
def prepare_gensim_corpus(corpus_name, corpus, output_path, min_freq=5):
	if not output_path.endswith('/'): output_path = output_path + '/'
	check_dir(output_path) # if directory does not exist, then create
	
	logging.info( 'building gensim corpus and dictionary for %s corpus', corpus_name )
	logging.info( 'loading corpus' )
	texts = [[word for word in process_text(document, removePunct=True, removeSW=True, removeNum=True)] for document in corpus]
	logging.info( 'tokenizing' )
	all_tokens = [item for sublist in texts for item in sublist]
	logging.info( 'mark tokens which have frequency less than %d', min_freq )
	tokens_once = set([k for k, v in collections.Counter(all_tokens).items() if v < min_freq])
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'filter low frequency tokens' )
	texts = [[word for word in text if word not in tokens_once] for text in texts]
	logging.info( '|D|=%d' , len(texts) )
	logging.info( 'building dictionary' )
	dictionary = corpora.Dictionary(texts)
	logging.info( 'saving dictionary' )
	dictFile = output_path + corpus_name + '.dict'
	dictionary.save(dictFile) 
	logging.info( 'building corpus in  mm format' )
	corpus = [dictionary.doc2bow(text) for text in texts]
	logging.info( 'saving corpus' )
	gensim_corpus_file = output_path + corpus_name + '.mm'
	corpora.MmCorpus.serialize(gensim_corpus_file, corpus)
	logging.info( 'computing tfidf' )
	tfidf = models.TfidfModel(corpus) # tfidf model 
	corpus_tfidf = tfidf[corpus] # tfidf corpus 
	logging.info( 'saving tfidf corpus' )
	corpus_tfidf_file = output_path + corpus_name + '.tfidf.mm'
	corpora.MmCorpus.serialize(corpus_tfidf_file, corpus_tfidf)
	logging.info( 'gensim corpus is ready' )
################################################################################## 
Example #24
Source File: scdv.py    From redshells with MIT License
def _build_idf(dictionary: gensim.corpora.Dictionary) -> np.ndarray:
        model = TfidfModel(dictionary=dictionary)
        idf = np.zeros(len(dictionary.token2id))
        for idx, value in model.idfs.items():
            idf[idx] = value
        return idf
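A toy check of the helper above, assuming _build_idf is reachable as a plain function (in the project it belongs to a class); the two-document dictionary is hypothetical:

from gensim.corpora import Dictionary

dct = Dictionary([["a", "b"], ["a", "c"]])
idf = _build_idf(dct)
print(idf.shape)  # (3,) -- one idf value per token id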