Python gensim.corpora() Examples
The following are 3 code examples showing how to use the gensim.corpora package.
Example #1
Source File: From topics with Apache License 2.0 | 5 votes |
def build(self):
    """Build, prune, persist and return the review-word dictionary.

    The cursor is rewound first so iteration starts from the beginning,
    then the ``"words"`` entry of every review it yields is fed into a
    ``corpora.Dictionary``.

    Returns:
        The pruned and compacted ``corpora.Dictionary``.
    """
    self.cursor.rewind()
    dictionary = corpora.Dictionary(review["words"] for review in self.cursor)

    # Only keep_n is overridden; filter_extremes' other thresholds are left
    # at their library defaults.
    dictionary.filter_extremes(keep_n=10000)

    # Re-assign token ids so they are contiguous after pruning.
    dictionary.compactify()

    # NOTE(review): this statement was truncated to
    # "dictionary.compactify(), self.dictionary_path)", which does not parse.
    # It is reconstructed as a save of the dictionary to
    # self.dictionary_path -- confirm against the upstream project.
    dictionary.save(self.dictionary_path)
    return dictionary
Example #2
Source File: From topics with Apache License 2.0 | 5 votes |
def run(lda_model_path, corpus_path, num_topics, id2word):
    """Train and return an LDA model for the corpus at ``corpus_path``.

    Args:
        lda_model_path: Not used by the visible code.
            NOTE(review): presumably the trained model was meant to be
            written here -- confirm against the caller.
        corpus_path: Path handed to ``corpora.BleiCorpus`` to open the
            corpus.
        num_topics: Number of topics, forwarded to ``LdaModel``.
        id2word: Id-to-token mapping, forwarded to ``LdaModel``.

    Returns:
        The trained ``gensim.models.LdaModel`` instance.
    """
    bow_corpus = corpora.BleiCorpus(corpus_path)
    return gensim.models.LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=id2word,
    )
Example #3
Source File: From bugbug with Mozilla Public License 2.0 | 4 votes |
def wmdistance(self, document1, document2, all_distances, distance_metric="cosine"):
    """Return the earth mover's distance between two tokenised documents.

    A joint dictionary is built over both documents, a pairwise token
    distance matrix is filled in, and each document is turned into a
    normalised bag-of-words vector. The result is ``emd`` applied to those
    two vectors and the matrix.

    Args:
        document1: First document, a sequence of tokens.
        document2: Second document, a sequence of tokens.
        all_distances: Precomputed distances, only read when
            ``distance_metric`` is ``"cosine"``. It is indexed as
            ``[model index of the document2 token, dictionary id of the
            document1 token]``.
            NOTE(review): the layout this indexing requires is set by the
            caller and is not visible here -- confirm.
        distance_metric: ``"euclidean"`` or ``"cosine"``. Any other value
            leaves the matrix at zero, which triggers the ``inf`` return
            below.

    Returns:
        The value returned by ``emd``, or ``float("inf")`` when either
        document is empty or the distance matrix sums to zero.
    """
    model = self.w2vmodel

    if len(document1) == 0 or len(document2) == 0:
        print(
            "At least one of the documents had no words that were in the vocabulary. Aborting (returning inf)."
        )
        return float("inf")

    dictionary = gensim.corpora.Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    # Membership tests run once per matrix cell, so use sets.
    tokens1 = set(document1)
    tokens2 = set(document2)

    # Rows correspond to document1 tokens and columns to document2 tokens;
    # cells for any other token pair stay at zero.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    for row, row_token in dictionary.items():
        if row_token not in tokens1:
            continue
        for col, col_token in dictionary.items():
            if col_token not in tokens2:
                continue
            if distance_metric == "euclidean":
                delta = model.wv[row_token] - model.wv[col_token]
                distance_matrix[row, col] = np.sqrt(np.sum(delta ** 2))
            elif distance_metric == "cosine":
                distance_matrix[row, col] = all_distances[
                    model.wv.vocab[col_token].index, row
                ]

    if np.sum(distance_matrix) == 0.0:
        print("The distance matrix is all zeros. Aborting (returning inf).")
        return float("inf")

    def nbow(document):
        """Return the document's token frequencies divided by its length."""
        weights = np.zeros(vocab_len, dtype=np.double)
        total = float(len(document))
        for token_id, count in dictionary.doc2bow(document):
            weights[token_id] = count / total
        return weights

    first_weights = nbow(document1)
    second_weights = nbow(document2)
    return emd(first_weights, second_weights, distance_matrix)