Python gensim.corpora() Examples
The following are 3 code examples of gensim.corpora().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module gensim, or try the search function.
![](https://www.programcreek.com/common/static/images/search.png)
Example #1
Source File: train.py From topics with Apache License 2.0 | 5 votes |
def build(self):
    """Build, prune, persist and return the gensim dictionary for the reviews.

    Rewinds ``self.cursor``, feeds the ``"words"`` entry of every review it
    yields into ``corpora.Dictionary``, prunes the vocabulary with
    ``filter_extremes(keep_n=10000)``, compacts the ids, and saves the result
    to ``self.dictionary_path``.

    Returns:
        The ``corpora.Dictionary`` that was built and saved.
    """
    # Start from the first record so the whole cursor is consumed.
    self.cursor.rewind()

    # Lazy stream of per-review word lists; nothing is materialised here.
    token_lists = (review["words"] for review in self.cursor)
    vocab = corpora.Dictionary(token_lists)

    vocab.filter_extremes(keep_n=10000)
    vocab.compactify()

    vocab.save(self.dictionary_path)
    return vocab
Example #2
Source File: train.py From topics with Apache License 2.0 | 5 votes |
def run(lda_model_path, corpus_path, num_topics, id2word):
    """Train an LDA model on a Blei-format corpus and save it.

    Args:
        lda_model_path: Destination passed to the trained model's ``save``.
        corpus_path: Location handed to ``corpora.BleiCorpus``.
        num_topics: Forwarded to ``LdaModel`` as ``num_topics``.
        id2word: Forwarded to ``LdaModel`` as ``id2word``.

    Returns:
        The trained ``gensim.models.LdaModel`` (already saved).
    """
    blei_corpus = corpora.BleiCorpus(corpus_path)
    model = gensim.models.LdaModel(
        blei_corpus,
        num_topics=num_topics,
        id2word=id2word,
    )
    model.save(lda_model_path)
    return model
Example #3
Source File: similarity.py From bugbug with Mozilla Public License 2.0 | 4 votes |
def wmdistance(self, document1, document2, all_distances, distance_metric="cosine"):
    """Return the ``emd`` distance between two tokenised documents.

    A ``gensim.corpora.Dictionary`` is built over both documents, a square
    cost matrix over its ids is filled for every (token of ``document1``,
    token of ``document2``) pair, and each document is turned into a
    normalised bag-of-words vector before calling ``emd``.

    Args:
        document1: First document, a sized iterable of tokens.
        document2: Second document, a sized iterable of tokens.
        all_distances: Table read as
            ``all_distances[model.wv.vocab[t2].index, i]`` for the
            ``"cosine"`` metric, where ``i`` is the local dictionary id of
            the ``document1`` token.
            NOTE(review): presumably precomputed by the caller so that this
            indexing lines up -- confirm against the call site.
        distance_metric: ``"euclidean"`` or ``"cosine"``. Any other value
            leaves the cost matrix untouched (all zeros), which ends in the
            ``inf`` return below.

    Returns:
        ``float("inf")`` if either document is empty or the cost matrix sums
        to zero; otherwise the result of ``emd``.
    """
    model = self.w2vmodel

    if len(document1) == 0 or len(document2) == 0:
        print(
            "At least one of the documents had no words that were in the vocabulary. Aborting (returning inf)."
        )
        return float("inf")

    dictionary = gensim.corpora.Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    # Sets for faster look-up.
    tokens1 = set(document1)
    tokens2 = set(document2)

    costs = np.zeros((vocab_len, vocab_len), dtype=np.double)

    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            # Only (document1 token, document2 token) pairs get a cost.
            if t1 in tokens1 and t2 in tokens2:
                if distance_metric == "euclidean":
                    diff = model.wv[t1] - model.wv[t2]
                    costs[i, j] = np.sqrt(np.sum(diff ** 2))
                elif distance_metric == "cosine":
                    costs[i, j] = all_distances[model.wv.vocab[t2].index, i]

    if np.sum(costs) == 0.0:
        print("The distance matrix is all zeros. Aborting (returning inf).")
        return float("inf")

    def nbow(document):
        # Normalised bag-of-words: token frequency divided by document length.
        weights = np.zeros(vocab_len, dtype=np.double)
        bow = dictionary.doc2bow(document)
        doc_len = len(document)
        for token_id, count in bow:
            weights[token_id] = count / float(doc_len)
        return weights

    first = nbow(document1)
    second = nbow(document2)
    return emd(first, second, costs)