Python gensim.corpora.Dictionary() Examples
The following are 30 code examples of gensim.corpora.Dictionary(), drawn from open-source projects. The original project and source file are noted above each example.
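Before the examples, here is a minimal sketch of the workflow most of these snippets share: build a Dictionary from tokenized documents, inspect the token-to-id mapping, convert documents to bag-of-words vectors with doc2bow, and save/load the dictionary. The toy documents and file name below are illustrative only.

from gensim import corpora

# Toy tokenized documents (illustrative only)
tokenized_docs = [
    ["human", "computer", "interaction"],
    ["survey", "of", "user", "computer", "systems"],
]

# Build the dictionary: maps each unique token to an integer id
dictionary = corpora.Dictionary(tokenized_docs)
print(dictionary.token2id)  # e.g. {'computer': 0, 'human': 1, ...}

# Convert a tokenized document to a bag-of-words vector of (token_id, count) pairs
bow = dictionary.doc2bow(["human", "computer", "computer"])
print(bow)  # e.g. [(0, 2), (1, 1)]

# Persist and reload, as several of the examples below do
dictionary.save("example.dict")
loaded = corpora.Dictionary.load("example.dict")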
Example #1
Source File: text.py From nlp_learning with MIT License

def main():
    corpora_documents = []
    for item_text in raw_documents:
        item_str = list(jieba.cut(item_text))
        corpora_documents.append(item_str)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]

    similarity = similarities.Similarity('-Similarity-index', corpus, num_features=400)

    test_data_1 = '你好,我想问一下我想离婚他不想离,孩子他说不要,是六个月就自动生效离婚'
    test_cut_raw_1 = jieba.cut(test_data_1)
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 5
    # Return the most similar documents as (index_of_document, similarity) tuples
    print(similarity[test_corpus_1])
Example #2
Source File: textpro.py From comparable-text-miner with Apache License 2.0

def build_lsi_model(corpus_name, corpus_path, topics=300):
    logging.info('building lsi model for %s corpus', corpus_name)
    dictFile = corpus_path + corpus_name + '.dict'
    corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'

    logging.info('loading dictionary ...')
    dictionary = corpora.Dictionary.load(dictFile)
    logging.info('loading tfidf corpus ...')
    corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
    logging.info('building lsi model')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    logging.info('saving lsi')
    lsiFile = corpus_path + corpus_name + '.lsi'
    lsi.save(lsiFile)
    logging.info('lsi model is ready')
##################################################################################
Example #3
Source File: similarity.py From bugbug with Mozilla Public License 2.0

def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []
    self.bug_ids = []

    for bug in bugzilla.get_bugs():
        self.corpus.append(self.text_preprocess(self.get_text(bug)))
        self.bug_ids.append(bug["id"])

    indexes = list(range(len(self.corpus)))
    random.shuffle(indexes)
    self.corpus = [self.corpus[idx] for idx in indexes]
    self.bug_ids = [self.bug_ids[idx] for idx in indexes]

    self.dictionary = Dictionary(self.corpus)

    self.model = LdaModel([self.dictionary.doc2bow(text) for text in self.corpus])
Example #4
Source File: similarity.py From bugbug with Mozilla Public License 2.0

def __init__(
    self,
    cut_off=0.2,
    cleanup_urls=True,
    nltk_tokenizer=False,
    confidence_threshold=0.8,
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    terms_idx = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
    self.dictionary = Dictionary(self.corpus)

    bow = [self.dictionary.doc2bow(doc) for doc in self.corpus]

    similarity_matrix = SparseTermSimilarityMatrix(terms_idx, self.dictionary)
    self.softcosinesimilarity = SoftCosineSimilarity(
        bow, similarity_matrix, num_best=10
    )
Example #5
Source File: lex_sem_ft.py From DeepLearn with MIT License

def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in en_stop]
            red.append(stopped_tokens)
        except:
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns Average Of Probability Of Word Present In LDA Model For Input Document (Returns Float):
Example #6
Source File: lex_sem_ft.py From DeepLearn with MIT License

def sum_bigram(sent, model):
    sent = sent.split()
    first = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None][sent[i]]
                first = False
            else:
                tot += model[sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Training Trigram Model [Returns Dictionary of Dictionaries]:
Example #7
Source File: utils.py From CLAtoolkit with GNU General Public License v3.0

def get_LDAVis_JSON(platform, num_topics, course_code, start_date=None, end_date=None):
    #print "get_LDAVis_JSON"
    docs, ids = get_allcontent_byplatform(platform, course_code, start_date=start_date, end_date=end_date)
    documents = remove_stopwords(docs)

    # Make dictionary
    dictionary = corpora.Dictionary(documents)

    # Create and save corpus
    corpus = [dictionary.doc2bow(text) for text in documents]

    # Run LDA
    model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    tmp = pyLDAvis.gensim.prepare(model, corpus, dictionary).to_json()
    #print tmp
    #tmp = model.show_topics(num_topics=20, num_words=5, log=False, formatted=False)
    return tmp
Example #8
Source File: textrank_gensim.py From nlg-yongzhuo with MIT License

def _build_corpus(sentences):
    """Construct corpus from provided sentences.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.

    Returns
    -------
    list of list of (int, int)
        Corpus built from sentences.

    """
    split_tokens = [jieba_cut(sentence) for sentence in sentences]
    dictionary = Dictionary(split_tokens)
    return [dictionary.doc2bow(token) for token in split_tokens]
Example #9
Source File: fastfm_recommender.py From yelp with GNU Lesser General Public License v2.1

def preprocess_records(train_records, test_records):
    """
    Creates a bag of words and a corpus for each record and creates a
    dictionary based on all the text contained in the records
    """
    records = train_records + test_records

    all_words = []
    for record in records:
        bow = record['context_text'].split()
        record[Constants.BOW_FIELD] = bow
        all_words.append(bow)

    dictionary = corpora.Dictionary(all_words)

    for record in records:
        record[Constants.CORPUS_FIELD] = \
            dictionary.doc2bow(record[Constants.BOW_FIELD])

    return dictionary
Example #10
Source File: context_knn.py From yelp with GNU Lesser General Public License v2.1

def get_topic_distribution(self, review):
    """
    :type review: str
    """
    review_bow = lda_context_utils.create_bag_of_words([review])
    dictionary = corpora.Dictionary(review_bow)
    corpus = dictionary.doc2bow(review_bow[0])
    lda_corpus = self.lda_model.get_document_topics(corpus)

    topic_distribution = \
        lda_document_to_topic_distribution(lda_corpus, self.num_topics)

    return topic_distribution

# TODO: Adapt this to a data structure in which a user can rate the same
#  item multiple times in different contexts
Example #11
Source File: reviews_preprocessor.py From yelp with GNU Lesser General Public License v2.1

def build_dictionary(self):
    print('%s: build dictionary' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if self.use_cache and os.path.exists(Constants.DICTIONARY_FILE):
        print('Dictionary already exists')
        self.dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)
        return

    all_words = []
    for record in self.records:
        all_words.append(record[Constants.BOW_FIELD])

    self.dictionary = corpora.Dictionary(all_words)
    self.dictionary.filter_extremes(
        Constants.MIN_DICTIONARY_WORD_COUNT,
        Constants.MAX_DICTIONARY_WORD_COUNT)
    self.dictionary.save(Constants.DICTIONARY_FILE)
Example #12
Source File: lex_sem_ft.py From DL-text with MIT License

def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in en_stop]
            red.append(stopped_tokens)
        except:
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns Average Of Probability Of Word Present In LDA Model For Input Document (Returns Float):
Example #13
Source File: VectorSpaceModel.py From Snowball with GNU General Public License v3.0

def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower())
                    if word not in stopwords]
        documents.append(document)

        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents read")
    print(len(self.dictionary), " unique tokens")
Example #14
Source File: docsim.py From nlp_learning with MIT License

def train(self, prefix: str, corporas: list):
    """Train the model and save the dictionary, corpus, and model to disk.

    Arguments:
        prefix {str} -- model name prefix
        corpora_documents {list} -- tokenized texts
    """
    # Build the dictionary and the vector corpus
    dictionary = corpora.Dictionary(corporas)
    dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the generated dictionary

    corpus = [dictionary.doc2bow(text) for text in corporas]
    corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # save the generated corpus

    tfidf_model = models.TfidfModel(corpus)
    tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # save the TF-IDF model
Example #15
Source File: sent_utils.py From embedding with MIT License

def latent_dirichlet_allocation(corpus_fname, output_fname, tokenizer_name="mecab"):
    make_save_path(output_fname)
    documents, tokenized_corpus = [], []
    tokenizer = get_tokenizer(tokenizer_name)
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for document in f:
            tokens = list(set(tokenizer.morphs(document.strip())))
            documents.append(document)
            tokenized_corpus.append(tokens)
    dictionary = corpora.Dictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    LDA = ldamulticore.LdaMulticore(corpus, id2word=dictionary,
                                    num_topics=30,
                                    minimum_probability=0.0,
                                    workers=4)
    # Return data only when a topic's probability is greater than 0.5.
    # Since the probabilities sum to 1, that topic is the most probable topic for the document.
    all_topics = LDA.get_document_topics(corpus, minimum_probability=0.5, per_word_topics=False)
    with open(output_fname + ".results", 'w') as f:
        for doc_idx, topic in enumerate(all_topics):
            if len(topic) == 1:
                topic_id, prob = topic[0]
                f.writelines(documents[doc_idx].strip() + "\u241E" +
                             ' '.join(tokenized_corpus[doc_idx]) + "\u241E" +
                             str(topic_id) + "\u241E" + str(prob) + "\n")
    LDA.save(output_fname + ".model")
Example #16
Source File: util.py From seq2seq with MIT License

def load_dictionary(filename: str) -> corpora.Dictionary:
    """Load a dictionary.

    Args:
        filename (str): File name.
    Returns:
        corpora.Dictionary: The loaded dictionary.
    """
    dic = corpora.Dictionary.load(filename)
    # if with_symbol and \
    #         not (dic.token2id["<S>"] == 0 and dic.token2id["</S>"] == 1):
    #     raise Exception("<S> and </S> ids should be 0 and 1")
    print("load dictionary: {} items".format(len(dic.values())))
    # print([item for item in dic.items()][:10])
    return dic
Example #17
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License

def genDictionary(self, documents, **kwarg):
    '''Generate dictionary and bow-vector of all tokenized news (articles).

    # Arguments:
        documents: List of news (articles).
        saveDict: Save dictionary or not (bool type).
        saveBowvec: Save bow-vector or not (bool type).
        returnValue: Return value or not (bool type).
    '''
    self._raw_documents = documents
    token = self.jieba_tokenize(documents)  # jieba tokenize
    # corpora_documents = self.RemoveWordAppearOnce(token)  # remove the words appearing once in the dictionary
    self._dictionary = corpora.Dictionary(token)  # generate dictionary using tokenized documents
    if kwarg['saveDict']:
        self._dictionary.save(kwarg['saveDictPath'])  # store the dictionary, for future reference
    self._BowVecOfEachDoc = [self._dictionary.doc2bow(text) for text in token]  # convert tokenized documents to vectors
    if kwarg['saveBowvec']:
        corpora.MmCorpus.serialize(kwarg['saveBowvecPath'], self._BowVecOfEachDoc)  # store to disk, for later use
    if kwarg['returnValue']:
        return token, self._dictionary, self._BowVecOfEachDoc
Example #18
Source File: util.py From seq2seq with MIT License

def tokens2ids(
        tokens: List[str],
        dictionary: corpora.Dictionary,
        verbose: bool = False
) -> List[int]:
    if verbose:
        not_found_lst = [
            word for word in tokens
            if word not in dictionary.token2id
        ]
        if not_found_lst:
            print("not found in dict: {}".format(not_found_lst))
    for word in tokens:
        if word in dictionary and dictionary.token2id[word] < 0:
            raise ValueError("word id < 0: {}".format(word))
    # Map unknown words to the UNK symbol
    return [
        dictionary.token2id[word]
        if word in dictionary.token2id
        else dictionary.token2id[config.UNK_SYMBOL]
        for word in tokens
    ]
Example #19
Source File: cut_td_idf.py From nlp_xiaojiang with MIT License

def init_tfidf_chinese_or_pinyin(sources_path):
    """
    Build the TF-IDF model.
    :param path:
    :return:
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb')
    pickle.dump([dictionary, tfidf_model], file)
Example #20
Source File: dbscan_analysis.py From ns4_chatbot with Apache License 2.0

def get_dictionary(df):
    logger.debug("一共%d行文本数据", len(df))

    all_rows = []
    for one in df['html_cut']:
        # For some reason, some html_cut values loaded from CSV are NaN and get parsed as float,
        # so handle that case here
        if not isinstance(one, str):
            logger.error("当前行的html_cut数据类型不是Str:%r", one)
            continue
        cut_content_list = one.split(" ")
        all_rows.append(cut_content_list)

    # Build the dictionary
    dictionary = corpora.Dictionary(all_rows)
    # To keep the vocabulary stable, save a fixed vocabulary first;
    # the later classification/validation step will reuse it
    # dictionary.save("out/dictionary.dic")

    logger.debug("词袋一共%d个词", len(dictionary.keys()))
    return dictionary
Example #21
Source File: lex_sem_ft.py From DL-text with MIT License

def sum_bigram(sent, model):
    sent = sent.split()
    first = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None][sent[i]]
                first = False
            else:
                tot += model[sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Training Trigram Model [Returns Dictionary of Dictionaries]:
Example #22
Source File: rock_gensim.py From MusicTaster with MIT License

def prepare_song_artist_dict(tag=''):
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    print playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000)
    # Combine the song names in each playlist into a sequence of song names
    total_song_artist_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # Save the song sequence of this playlist
        song_artist_seq = []
        for song in item['tracks']:
            sname = song['name']
            artist = song['artists'][0]['name'].lower()
            song_artist_seq.append((sname.lower(), artist))
        total_song_artist_set.append(song_artist_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    # song_dictionary = corpora.Dictionary(total_song_artist_set)
    # print u'歌单数', song_dictionary.num_docs
    # print u'歌曲数', song_dictionary.num_pos
    data_process_logger.info('start saving datas')
    # song_dictionary.save('../datas/song_artist_dictionary_%s.dict' % tag)
    pickle.dump(total_song_artist_set, open('../datas/songs_artists_seq_%s.dat' % tag, 'wb'))
    # return song_dictionary
Example #23
Source File: document_sequence.py From fake-news-detection-pipeline with Apache License 2.0

def _set_dictionary(self):
    """stores the dictionary of current corpus"""
    self._dictionary = Dictionary(self._tokenized)
Example #24
Source File: corpora.py From Topic_Disc with MIT License

def _build_vocab(self, max_vocab_cnt):
    all_words = []
    for dialog in self.train_corpus:
        for turn in dialog:
            all_words.append(turn.utt)
    self.vocab_bow = Dictionary(all_words)
    raw_vocab_size = len(self.vocab_bow)
    raw_wc = np.sum(list(self.vocab_bow.dfs.values()))

    # build useless stopwords vocab (e.g, very few words, single ascii words, some punctuation ,."')
    self.vocab_bow.filter_extremes(no_below=20)
    self.vocab_bow.filter_extremes(keep_n=max_vocab_cnt)
    bad_ids = [HT, MEN, URL] + TWITTER_STOPWORDS
    self.vocab_bow.filter_tokens(list(map(self.vocab_bow.token2id.get, bad_ids)))
    len_1_words = list(filter(
        lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w)
        and w not in ["?", "!", "\"", "i"] and True or False,
        self.vocab_bow.values()))
    self.vocab_bow.filter_tokens(list(map(self.vocab_bow.token2id.get, len_1_words)))
    self.vocab_bow.compactify()
    # here we keep stopwords and some meaningful punctuations
    non_stopwords = filter(
        lambda w: re.match(r"^(?=.*[a-zA-Z\d])[\w\d_-]*$", w)
        and w not in STOPWORDS and True or False,
        self.vocab_bow.values())
    self.vocab_bow_stopwords = copy.deepcopy(self.vocab_bow)
    self.vocab_bow_stopwords.filter_tokens(
        map(self.vocab_bow_stopwords.token2id.get, non_stopwords))
    self.vocab_bow_stopwords.compactify()
    self.vocab_bow_non_stopwords = copy.deepcopy(self.vocab_bow)
    self.vocab_bow_non_stopwords.filter_tokens(
        map(self.vocab_bow_non_stopwords.token2id.get, self.vocab_bow_stopwords.values()))
    self.vocab_bow_non_stopwords.compactify()
    remain_wc = np.sum(list(self.vocab_bow.dfs.values()))
    min_count = np.min(list(self.vocab_bow.dfs.values()))
    # create vocabulary list sorted by count
    print("Load corpus with train size %d, valid size %d, "
          "test size %d raw vocab size %d vocab size %d at cut_off %d OOV rate %f"
          % (len(self.train_corpus), len(self.valid_corpus), len(self.test_corpus),
             raw_vocab_size, len(self.vocab_bow), min_count, 1 - float(remain_wc) / raw_wc))
Example #25
Source File: models.py From coling2018_fake-news-challenge with Apache License 2.0

def tfidf_sim(self, train_data, body_dict, threshold):
    '''
    :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID: 'bodyText'}
        threshold : used to distinguish between similar and not similar
    '''
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

    vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
    tfidf_model = models.TfidfModel(corporaBody_bow)

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))

        headlines_tfidf = tfidf_model[headline_bow]
        corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]

        sim = cossim(headlines_tfidf, corporaBody_tfidf)
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
Example #26
Source File: lda_model_calculator.py From moviegeek with MIT License

def build_lda_model(self, data, docs, n_topics=5):
    texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    for d in tqdm(data):
        raw = d.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = self.remove_stopwords(tokens)

        stemmed_tokens = stopped_tokens
        #stemmer = PorterStemmer()
        #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

        texts.append(stemmed_tokens)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                         id2word=dictionary,
                                         num_topics=n_topics)

    index = similarities.MatrixSimilarity(corpus)

    self.save_lda_model(lda_model, corpus, dictionary, index)
    self.save_similarities(index, docs)

    return dictionary, texts, lda_model
Example #27
Source File: corpora.py From Document2Vec with MIT License

def __init__(self, series, vocab=None, stem=False, bigram=None, labels=True):
    """ Create a corpus that returns one row at a time out
        of a Pandas Series"""
    self.series = series
    self.metadata = False
    if vocab is not None:
        vocab = set(vocab)
    self.vocab = vocab
    self.labels = labels
    self.kwargs = dict(stem=stem, bigram=bigram)
    logging.info("Building SeriesCorpus")
    self.dictionary = Dictionary()
    self.dictionary.add_documents(self.get_texts())
Example #28
Source File: similarity.py From bugbug with Mozilla Public License 2.0

def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []

    for bug in bugzilla.get_bugs():
        textual_features = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], textual_features])

    # Assigning unique integer ids to all words
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)

    # Conversion to BoW
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

    # Initializing and applying the tfidf transformation model on the same corpus;
    # the resulting corpus has the same dimensions
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]

    # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
    self.lsi = models.LsiModel(
        corpus_tfidf, id2word=self.dictionary, num_topics=300
    )
    corpus_lsi = self.lsi[corpus_tfidf]

    # Indexing the corpus
    self.index = similarities.Similarity(
        output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
    )
Example #29
Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0

def build_statistic(self):
    self.sentences = self.train_df['splited_spn_1'].tolist() + \
                     self.train_df['splited_spn_2'].tolist() + \
                     self.test_df['splited_spn_1'].tolist() + \
                     self.test_df['splited_spn_2'].tolist() + \
                     self.unlabeled_df['splited_spn_1'].tolist()
    self.sentences = np.unique(np.array(self.sentences)).tolist()

    words = []
    for comment in self.sentences:
        for w in comment:
            words.append(w)
    counts = Counter(words)
    self.weights = {word: self._get_weight(count) for word, count in counts.items()}

    self.dictionary = corpora.Dictionary(self.sentences)
    self.dictionary.compactify()
    print("No of words in the dictionary = %s" % len(self.dictionary.token2id))
Example #30
Source File: tf_idf_helpers.py From coling2018_fake-news-challenge with Apache License 2.0

def generate_tf_idf_corpora(self):
    #data_path = myConstants.data_path
    #reader = CorpusReader(data_path)
    #body_dict = reader.load_body(myConstants.train_bodies)
    body_dict = myConstants.d.articles
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
    self.vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [self.vocab.doc2bow(text) for text in bodyText_w]
    self.tfidf_model = models.TfidfModel(corporaBody_bow)