Python gensim.corpora.Dictionary() Examples
The following are 30 code examples of gensim.corpora.Dictionary(), drawn from open-source projects. The original project and source file are noted above each example.
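Before the examples, here is a minimal sketch of the workflow most of these snippets share: build a Dictionary from tokenized documents, inspect the token-to-id mapping, convert documents to bag-of-words vectors with doc2bow, and save/load the dictionary. The toy documents and file name below are illustrative only.

from gensim import corpora

# Toy tokenized documents (illustrative only)
tokenized_docs = [
    ["human", "computer", "interaction"],
    ["survey", "of", "user", "computer", "systems"],
]

# Build the dictionary: maps each unique token to an integer id
dictionary = corpora.Dictionary(tokenized_docs)
print(dictionary.token2id)  # e.g. {'computer': 0, 'human': 1, ...}

# Convert a tokenized document to a bag-of-words vector of (token_id, count) pairs
bow = dictionary.doc2bow(["human", "computer", "computer"])
print(bow)  # e.g. [(0, 2), (1, 1)]

# Persist and reload, as several of the examples below do
dictionary.save("example.dict")
loaded = corpora.Dictionary.load("example.dict")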
Example #1
Source File: text.py From nlp_learning with MIT License

def main():
    corpora_documents = []
    for item_text in raw_documents:
        item_str = list(jieba.cut(item_text))
        corpora_documents.append(item_str)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]

    similarity = similarities.Similarity('-Similarity-index', corpus, num_features=400)

    test_data_1 = '你好,我想问一下我想离婚他不想离,孩子他说不要,是六个月就自动生效离婚'
    test_cut_raw_1 = jieba.cut(test_data_1)
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 5
    # Return the most similar documents as (index_of_document, similarity) tuples
    print(similarity[test_corpus_1])
Example #2
Source File: textpro.py From comparable-text-miner with Apache License 2.0

def build_lsi_model(corpus_name, corpus_path, topics=300):
    logging.info('building lsi model for %s corpus', corpus_name)
    dictFile = corpus_path + corpus_name + '.dict'
    corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'

    logging.info('loading dictionary ...')
    dictionary = corpora.Dictionary.load(dictFile)
    logging.info('loading tfidf corpus ...')
    corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
    logging.info('building lsi model')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    logging.info('saving lsi')
    lsiFile = corpus_path + corpus_name + '.lsi'
    lsi.save(lsiFile)
    logging.info('lsi model is ready')
##################################################################################
Example #3
Source File: similarity.py From bugbug with Mozilla Public License 2.0

def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []
    self.bug_ids = []

    for bug in bugzilla.get_bugs():
        self.corpus.append(self.text_preprocess(self.get_text(bug)))
        self.bug_ids.append(bug["id"])

    indexes = list(range(len(self.corpus)))
    random.shuffle(indexes)
    self.corpus = [self.corpus[idx] for idx in indexes]
    self.bug_ids = [self.bug_ids[idx] for idx in indexes]

    self.dictionary = Dictionary(self.corpus)

    self.model = LdaModel([self.dictionary.doc2bow(text) for text in self.corpus])
Example #4
Source File: similarity.py From bugbug with Mozilla Public License 2.0

def __init__(
    self,
    cut_off=0.2,
    cleanup_urls=True,
    nltk_tokenizer=False,
    confidence_threshold=0.8,
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    terms_idx = WordEmbeddingSimilarityIndex(self.w2vmodel.wv)
    self.dictionary = Dictionary(self.corpus)

    bow = [self.dictionary.doc2bow(doc) for doc in self.corpus]

    similarity_matrix = SparseTermSimilarityMatrix(terms_idx, self.dictionary)
    self.softcosinesimilarity = SoftCosineSimilarity(
        bow, similarity_matrix, num_best=10
    )
Example #5
Source File: lex_sem_ft.py From DeepLearn with MIT License

def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in en_stop]
            red.append(stopped_tokens)
        except:
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns Average Of Probability Of Word Present In LDA Model For Input Document (Returns Float):
Example #6
Source File: lex_sem_ft.py From DeepLearn with MIT License

def sum_bigram(sent, model):
    sent = sent.split()
    first = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None][sent[i]]
                first = False
            else:
                tot += model[sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Training Trigram Model [Returns Dictionary of Dictionaries]:
Example #7
Source File: utils.py From CLAtoolkit with GNU General Public License v3.0

def get_LDAVis_JSON(platform, num_topics, course_code, start_date=None, end_date=None):
    #print "get_LDAVis_JSON"
    docs, ids = get_allcontent_byplatform(platform, course_code, start_date=start_date, end_date=end_date)
    documents = remove_stopwords(docs)

    # Make dictionary
    dictionary = corpora.Dictionary(documents)

    # Create and save corpus
    corpus = [dictionary.doc2bow(text) for text in documents]

    # Run LDA
    model = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

    tmp = pyLDAvis.gensim.prepare(model, corpus, dictionary).to_json()
    #print tmp
    #tmp = model.show_topics(num_topics=20, num_words=5, log=False, formatted=False)
    return tmp
Example #8
Source File: textrank_gensim.py From nlg-yongzhuo with MIT License

def _build_corpus(sentences):
    """Construct corpus from provided sentences.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.

    Returns
    -------
    list of list of (int, int)
        Corpus built from sentences.

    """
    split_tokens = [jieba_cut(sentence) for sentence in sentences]
    dictionary = Dictionary(split_tokens)
    return [dictionary.doc2bow(token) for token in split_tokens]
Example #9
Source File: fastfm_recommender.py From yelp with GNU Lesser General Public License v2.1

def preprocess_records(train_records, test_records):
    """
    Creates a bag of words and a corpus for each record and creates a
    dictionary based on all the text contained in the records
    """
    records = train_records + test_records

    all_words = []
    for record in records:
        bow = record['context_text'].split()
        record[Constants.BOW_FIELD] = bow
        all_words.append(bow)

    dictionary = corpora.Dictionary(all_words)

    for record in records:
        record[Constants.CORPUS_FIELD] = \
            dictionary.doc2bow(record[Constants.BOW_FIELD])

    return dictionary
Example #10
Source File: context_knn.py From yelp with GNU Lesser General Public License v2.1

def get_topic_distribution(self, review):
    """
    :type review: str
    """
    review_bow = lda_context_utils.create_bag_of_words([review])
    dictionary = corpora.Dictionary(review_bow)
    corpus = dictionary.doc2bow(review_bow[0])
    lda_corpus = self.lda_model.get_document_topics(corpus)

    topic_distribution = \
        lda_document_to_topic_distribution(lda_corpus, self.num_topics)

    return topic_distribution

# TODO: Adapt this to a data structure in which a user can rate the same
#  item multiple times in different contexts
Example #11
Source File: reviews_preprocessor.py From yelp with GNU Lesser General Public License v2.1

def build_dictionary(self):
    print('%s: build dictionary' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if self.use_cache and os.path.exists(Constants.DICTIONARY_FILE):
        print('Dictionary already exists')
        self.dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)
        return

    all_words = []
    for record in self.records:
        all_words.append(record[Constants.BOW_FIELD])

    self.dictionary = corpora.Dictionary(all_words)
    self.dictionary.filter_extremes(
        Constants.MIN_DICTIONARY_WORD_COUNT,
        Constants.MAX_DICTIONARY_WORD_COUNT)
    self.dictionary.save(Constants.DICTIONARY_FILE)
Example #12
Source File: lex_sem_ft.py From DL-text with MIT License

def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in en_stop]
            red.append(stopped_tokens)
        except:
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns Average Of Probability Of Word Present In LDA Model For Input Document (Returns Float):
Example #13
Source File: VectorSpaceModel.py From Snowball with GNU General Public License v3.0

def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower())
                    if word not in stopwords]
        documents.append(document)

        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents read")
    print(len(self.dictionary), " unique tokens")
Example #14
Source File: docsim.py From nlp_learning with MIT License

def train(self, prefix: str, corporas: list):
    """Train the model and save the dictionary, corpus, and model to disk.

    Arguments:
        prefix {str} -- model name prefix
        corpora_documents {list} -- tokenized texts
    """
    # Build the dictionary and the vector corpus
    dictionary = corpora.Dictionary(corporas)
    dictionary.save('./models/{}_dict.dic'.format(prefix))  # save the generated dictionary

    corpus = [dictionary.doc2bow(text) for text in corporas]
    corpora.MmCorpus.serialize('./models/{}_corpuse.mm'.format(prefix), corpus)  # save the generated corpus

    tfidf_model = models.TfidfModel(corpus)
    tfidf_model.save("./models/{}_tfidf_model.model".format(prefix))  # save the TF-IDF model
Example #15
Source File: sent_utils.py From embedding with MIT License

def latent_dirichlet_allocation(corpus_fname, output_fname, tokenizer_name="mecab"):
    make_save_path(output_fname)
    documents, tokenized_corpus = [], []
    tokenizer = get_tokenizer(tokenizer_name)
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for document in f:
            tokens = list(set(tokenizer.morphs(document.strip())))
            documents.append(document)
            tokenized_corpus.append(tokens)
    dictionary = corpora.Dictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    LDA = ldamulticore.LdaMulticore(corpus, id2word=dictionary,
                                    num_topics=30,
                                    minimum_probability=0.0,
                                    workers=4)
    # Return data only when a topic's probability is greater than 0.5.
    # Since the probabilities sum to 1, that topic is the most probable topic for the document.
    all_topics = LDA.get_document_topics(corpus, minimum_probability=0.5, per_word_topics=False)
    with open(output_fname + ".results", 'w') as f:
        for doc_idx, topic in enumerate(all_topics):
            if len(topic) == 1:
                topic_id, prob = topic[0]
                f.writelines(documents[doc_idx].strip() + "\u241E" +
                             ' '.join(tokenized_corpus[doc_idx]) + "\u241E" +
                             str(topic_id) + "\u241E" + str(prob) + "\n")
    LDA.save(output_fname + ".model")
Example #16
Source File: util.py From seq2seq with MIT License

def load_dictionary(filename: str) -> corpora.Dictionary:
    """Load a dictionary.

    Args:
        filename (str): File name.
    Returns:
        corpora.Dictionary: The loaded dictionary.
    """
    dic = corpora.Dictionary.load(filename)
    # if with_symbol and \
    #         not (dic.token2id["<S>"] == 0 and dic.token2id["</S>"] == 1):
    #     raise Exception("<S> and </S> ids should be 0 and 1")
    print("load dictionary: {} items".format(len(dic.values())))
    # print([item for item in dic.items()][:10])
    return dic
Example #17
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License

def genDictionary(self, documents, **kwarg):
    '''Generate dictionary and bow-vector of all tokenized news (articles).

    # Arguments:
        documents: List of news (articles).
        saveDict: Save dictionary or not (bool type).
        saveBowvec: Save bow-vector or not (bool type).
        returnValue: Return value or not (bool type).
    '''
    self._raw_documents = documents
    token = self.jieba_tokenize(documents)  # jieba tokenize
    # corpora_documents = self.RemoveWordAppearOnce(token)  # remove the words appearing once in the dictionary
    self._dictionary = corpora.Dictionary(token)  # generate dictionary using tokenized documents
    if kwarg['saveDict']:
        self._dictionary.save(kwarg['saveDictPath'])  # store the dictionary, for future reference
    self._BowVecOfEachDoc = [self._dictionary.doc2bow(text) for text in token]  # convert tokenized documents to vectors
    if kwarg['saveBowvec']:
        corpora.MmCorpus.serialize(kwarg['saveBowvecPath'], self._BowVecOfEachDoc)  # store to disk, for later use
    if kwarg['returnValue']:
        return token, self._dictionary, self._BowVecOfEachDoc
Example #18
Source File: util.py From seq2seq with MIT License

def tokens2ids(
        tokens: List[str],
        dictionary: corpora.Dictionary,
        verbose: bool = False
) -> List[int]:
    if verbose:
        not_found_lst = [
            word for word in tokens
            if word not in dictionary.token2id
        ]
        if not_found_lst:
            print("not found in dict: {}".format(not_found_lst))
    for word in tokens:
        if word in dictionary and dictionary.token2id[word] < 0:
            raise ValueError("word id < 0: {}".format(word))
    # Map unknown words to the UNK symbol
    return [
        dictionary.token2id[word]
        if word in dictionary.token2id
        else dictionary.token2id[config.UNK_SYMBOL]
        for word in tokens
    ]
Example #19
Source File: cut_td_idf.py From nlp_xiaojiang with MIT License

def init_tfidf_chinese_or_pinyin(sources_path):
    """
    Build the TF-IDF model.
    :param path:
    :return:
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb')
    pickle.dump([dictionary, tfidf_model], file)
Example #20
Source File: dbscan_analysis.py From ns4_chatbot with Apache License 2.0

def get_dictionary(df):
    logger.debug("一共%d行文本数据", len(df))

    all_rows = []
    for one in df['html_cut']:
        # For some reason, some html_cut values loaded from CSV are NaN and get parsed as float,
        # so handle that case here
        if not isinstance(one, str):
            logger.error("当前行的html_cut数据类型不是Str:%r", one)
            continue
        cut_content_list = one.split(" ")
        all_rows.append(cut_content_list)

    # Build the dictionary
    dictionary = corpora.Dictionary(all_rows)
    # To keep the vocabulary stable, save a fixed vocabulary first;
    # the later classification/validation step will reuse it
    # dictionary.save("out/dictionary.dic")

    logger.debug("词袋一共%d个词", len(dictionary.keys()))
    return dictionary
Example #21
Source File: lex_sem_ft.py From DL-text with MIT License

def sum_bigram(sent, model):
    sent = sent.split()
    first = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None][sent[i]]
                first = False
            else:
                tot += model[sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Training Trigram Model [Returns Dictionary of Dictionaries]:
Example #22
Source File: rock_gensim.py From MusicTaster with MIT License

def prepare_song_artist_dict(tag=''):
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    print playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 5, '$lte': 1000}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000)
    # Combine the song names in each playlist into a sequence of song names
    total_song_artist_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # Save the song sequence of this playlist
        song_artist_seq = []
        for song in item['tracks']:
            sname = song['name']
            artist = song['artists'][0]['name'].lower()
            song_artist_seq.append((sname.lower(), artist))
        total_song_artist_set.append(song_artist_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    # song_dictionary = corpora.Dictionary(total_song_artist_set)
    # print u'歌单数', song_dictionary.num_docs
    # print u'歌曲数', song_dictionary.num_pos
    data_process_logger.info('start saving datas')
    # song_dictionary.save('../datas/song_artist_dictionary_%s.dict' % tag)
    pickle.dump(total_song_artist_set, open('../datas/songs_artists_seq_%s.dat' % tag, 'wb'))
    # return song_dictionary
Example #23
Source File: document_sequence.py From fake-news-detection-pipeline with Apache License 2.0

def _set_dictionary(self):
    """stores the dictionary of current corpus"""
    self._dictionary = Dictionary(self._tokenized)
Example #24
Source File: corpora.py From Topic_Disc with MIT License

def _build_vocab(self, max_vocab_cnt):
    all_words = []
    for dialog in self.train_corpus:
        for turn in dialog:
            all_words.append(turn.utt)
    self.vocab_bow = Dictionary(all_words)
    raw_vocab_size = len(self.vocab_bow)
    raw_wc = np.sum(list(self.vocab_bow.dfs.values()))

    # build useless stopwords vocab (e.g, very few words, single ascii words, some punctuation ,."')
    self.vocab_bow.filter_extremes(no_below=20)
    self.vocab_bow.filter_extremes(keep_n=max_vocab_cnt)
    bad_ids = [HT, MEN, URL] + TWITTER_STOPWORDS
    self.vocab_bow.filter_tokens(list(map(self.vocab_bow.token2id.get, bad_ids)))
    len_1_words = list(filter(
        lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w)
        and w not in ["?", "!", "\"", "i"] and True or False,
        self.vocab_bow.values()))
    self.vocab_bow.filter_tokens(list(map(self.vocab_bow.token2id.get, len_1_words)))
    self.vocab_bow.compactify()
    # here we keep stopwords and some meaningful punctuations
    non_stopwords = filter(
        lambda w: re.match(r"^(?=.*[a-zA-Z\d])[\w\d_-]*$", w)
        and w not in STOPWORDS and True or False,
        self.vocab_bow.values())
    self.vocab_bow_stopwords = copy.deepcopy(self.vocab_bow)
    self.vocab_bow_stopwords.filter_tokens(
        map(self.vocab_bow_stopwords.token2id.get, non_stopwords))
    self.vocab_bow_stopwords.compactify()
    self.vocab_bow_non_stopwords = copy.deepcopy(self.vocab_bow)
    self.vocab_bow_non_stopwords.filter_tokens(
        map(self.vocab_bow_non_stopwords.token2id.get, self.vocab_bow_stopwords.values()))
    self.vocab_bow_non_stopwords.compactify()
    remain_wc = np.sum(list(self.vocab_bow.dfs.values()))
    min_count = np.min(list(self.vocab_bow.dfs.values()))
    # create vocabulary list sorted by count
    print("Load corpus with train size %d, valid size %d, "
          "test size %d raw vocab size %d vocab size %d at cut_off %d OOV rate %f"
          % (len(self.train_corpus), len(self.valid_corpus), len(self.test_corpus),
             raw_vocab_size, len(self.vocab_bow), min_count, 1 - float(remain_wc) / raw_wc))
Example #25
Source File: models.py From coling2018_fake-news-challenge with Apache License 2.0

def tfidf_sim(self, train_data, body_dict, threshold):
    '''
    :param
        train_data : a list of training samples of type ['headline', 'bodyID', 'stance']
        body_dict : a dictionary of values containing {bodyID: 'bodyText'}
        threshold : used to distinguish between similar and not similar
    '''
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]

    vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [vocab.doc2bow(text) for text in bodyText_w]
    tfidf_model = models.TfidfModel(corporaBody_bow)

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        headline_bow = vocab.doc2bow(sent2stokens_wostop(headline))

        headlines_tfidf = tfidf_model[headline_bow]
        corporaBody_tfidf = tfidf_model[corporaBody_bow[bodyIds_index[bodyID]]]

        sim = cossim(headlines_tfidf, corporaBody_tfidf)
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
Example #26
Source File: lda_model_calculator.py From moviegeek with MIT License

def build_lda_model(self, data, docs, n_topics=5):
    texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    for d in tqdm(data):
        raw = d.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = self.remove_stopwords(tokens)

        stemmed_tokens = stopped_tokens
        #stemmer = PorterStemmer()
        #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

        texts.append(stemmed_tokens)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                         id2word=dictionary,
                                         num_topics=n_topics)

    index = similarities.MatrixSimilarity(corpus)

    self.save_lda_model(lda_model, corpus, dictionary, index)
    self.save_similarities(index, docs)

    return dictionary, texts, lda_model
Example #27
Source File: corpora.py From Document2Vec with MIT License

def __init__(self, series, vocab=None, stem=False, bigram=None, labels=True):
    """ Create a corpus that returns one row at a time out
        of a Pandas Series"""
    self.series = series
    self.metadata = False
    if vocab is not None:
        vocab = set(vocab)
    self.vocab = vocab
    self.labels = labels
    self.kwargs = dict(stem=stem, bigram=bigram)
    logging.info("Building SeriesCorpus")
    self.dictionary = Dictionary()
    self.dictionary.add_documents(self.get_texts())
Example #28
Source File: similarity.py From bugbug with Mozilla Public License 2.0

def __init__(
    self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
):
    super().__init__(
        cleanup_urls=cleanup_urls,
        nltk_tokenizer=nltk_tokenizer,
        confidence_threshold=confidence_threshold,
    )

    self.corpus = []

    for bug in bugzilla.get_bugs():
        textual_features = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], textual_features])

    # Assigning unique integer ids to all words
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)

    # Conversion to BoW
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

    # Initializing and applying the tfidf transformation model on the same corpus;
    # the resulting corpus has the same dimensions
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]

    # Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
    self.lsi = models.LsiModel(
        corpus_tfidf, id2word=self.dictionary, num_topics=300
    )
    corpus_lsi = self.lsi[corpus_tfidf]

    # Indexing the corpus
    self.index = similarities.Similarity(
        output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300
    )
Example #29
Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0

def build_statistic(self):
    self.sentences = self.train_df['splited_spn_1'].tolist() + \
                     self.train_df['splited_spn_2'].tolist() + \
                     self.test_df['splited_spn_1'].tolist() + \
                     self.test_df['splited_spn_2'].tolist() + \
                     self.unlabeled_df['splited_spn_1'].tolist()
    self.sentences = np.unique(np.array(self.sentences)).tolist()

    words = []
    for comment in self.sentences:
        for w in comment:
            words.append(w)
    counts = Counter(words)
    self.weights = {word: self._get_weight(count) for word, count in counts.items()}

    self.dictionary = corpora.Dictionary(self.sentences)
    self.dictionary.compactify()
    print("No of words in the dictionary = %s" % len(self.dictionary.token2id))
Example #30
Source File: tf_idf_helpers.py From coling2018_fake-news-challenge with Apache License 2.0

def generate_tf_idf_corpora(self):
    #data_path = myConstants.data_path
    #reader = CorpusReader(data_path)
    #body_dict = reader.load_body(myConstants.train_bodies)
    body_dict = myConstants.d.articles
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    bodyText_w = [sent2stokens_wostop(text) for text in bodyText_list]
    self.vocab = corpora.Dictionary(bodyText_w)
    corporaBody_bow = [self.vocab.doc2bow(text) for text in bodyText_w]
    self.tfidf_model = models.TfidfModel(corporaBody_bow)