Python gensim.matutils.corpus2dense() Examples
The following are 11 code examples of gensim.matutils.corpus2dense().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module gensim.matutils, or try the search function.
Example #1
Source File: test_models.py From topical_word_embeddings with MIT License | 6 votes |
def testTransform(self):
    """Check that lsi[vector] agrees with a reference SVD of the dense corpus."""
    # Build a two-topic LSI model over the fixture corpus.
    lsi = lsimodel.LsiModel(self.corpus, num_topics=2)

    # Reference decomposition: the model's singular values must match the
    # leading singular values of a plain SVD of the dense corpus matrix.
    dense_corpus = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
    _, singular_values, _ = scipy.linalg.svd(dense_corpus, full_matrices=False)
    values_match = numpy.allclose(singular_values[:2], lsi.projection.s)
    self.assertTrue(values_match)

    # Fold the first document into topic space and densify the result so it
    # can be compared element-wise.
    first_doc = list(self.corpus)[0]
    actual = matutils.sparse2full(lsi[first_doc], 2)
    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # non-scaled LSI version would be [-0.1973928, 0.05591352]

    # The sign of each component is arbitrary, so compare magnitudes only.
    magnitudes_match = numpy.allclose(abs(actual), abs(expected))
    self.assertTrue(magnitudes_match)
Example #2
Source File: test_models.py From topical_word_embeddings with MIT License | 6 votes |
def testTransform(self):
    """Check that lsi[vector] agrees with a reference SVD of the dense corpus."""
    # Build a two-topic LSI model over the fixture corpus.
    lsi = lsimodel.LsiModel(self.corpus, num_topics=2)

    # Reference decomposition: the model's singular values must match the
    # leading singular values of a plain SVD of the dense corpus matrix.
    dense_corpus = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
    _, singular_values, _ = scipy.linalg.svd(dense_corpus, full_matrices=False)
    values_match = numpy.allclose(singular_values[:2], lsi.projection.s)
    self.assertTrue(values_match)

    # Fold the first document into topic space and densify the result so it
    # can be compared element-wise.
    first_doc = list(self.corpus)[0]
    actual = matutils.sparse2full(lsi[first_doc], 2)
    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # non-scaled LSI version would be [-0.1973928, 0.05591352]

    # The sign of each component is arbitrary, so compare magnitudes only.
    magnitudes_match = numpy.allclose(abs(actual), abs(expected))
    self.assertTrue(magnitudes_match)
Example #3
Source File: test_models.py From topical_word_embeddings with MIT License | 6 votes |
def testTransform(self):
    """Check that lsi[vector] agrees with a reference SVD of the dense corpus."""
    # Build a two-topic LSI model over the fixture corpus.
    lsi = lsimodel.LsiModel(self.corpus, num_topics=2)

    # Reference decomposition: the model's singular values must match the
    # leading singular values of a plain SVD of the dense corpus matrix.
    dense_corpus = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
    _, singular_values, _ = scipy.linalg.svd(dense_corpus, full_matrices=False)
    values_match = numpy.allclose(singular_values[:2], lsi.projection.s)
    self.assertTrue(values_match)

    # Fold the first document into topic space and densify the result so it
    # can be compared element-wise.
    first_doc = list(self.corpus)[0]
    actual = matutils.sparse2full(lsi[first_doc], 2)
    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # non-scaled LSI version would be [-0.1973928, 0.05591352]

    # The sign of each component is arbitrary, so compare magnitudes only.
    magnitudes_match = numpy.allclose(abs(actual), abs(expected))
    self.assertTrue(magnitudes_match)
Example #4
Source File: test_models.py From topical_word_embeddings with MIT License | 6 votes |
def testTransform(self):
    """Check that lsi[vector] agrees with a reference SVD of the dense corpus."""
    # Build a two-topic LSI model over the fixture corpus.
    lsi = lsimodel.LsiModel(self.corpus, num_topics=2)

    # Reference decomposition: the model's singular values must match the
    # leading singular values of a plain SVD of the dense corpus matrix.
    dense_corpus = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
    _, singular_values, _ = scipy.linalg.svd(dense_corpus, full_matrices=False)
    values_match = numpy.allclose(singular_values[:2], lsi.projection.s)
    self.assertTrue(values_match)

    # Fold the first document into topic space and densify the result so it
    # can be compared element-wise.
    first_doc = list(self.corpus)[0]
    actual = matutils.sparse2full(lsi[first_doc], 2)
    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # non-scaled LSI version would be [-0.1973928, 0.05591352]

    # The sign of each component is arbitrary, so compare magnitudes only.
    magnitudes_match = numpy.allclose(abs(actual), abs(expected))
    self.assertTrue(magnitudes_match)
Example #5
Source File: utils.py From HotPepperGourmetDialogue with MIT License | 5 votes |
def to_features(dictionary, words):
    """Turn a list of words into a dense feature list, one entry per dictionary term."""
    bow = dictionary.doc2bow(words)
    # A one-document corpus densifies to a (num_terms, 1) matrix; the single
    # column is this document's feature vector.
    term_doc_matrix = matutils.corpus2dense([bow], num_terms=len(dictionary))
    return list(term_doc_matrix[:, 0])
Example #6
Source File: lsimodel.py From topical_word_embeddings with MIT License | 4 votes |
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    `bow` may be a single document or a corpus (decided by `utils.is_corpus`).
    A corpus with a truthy `chunksize` is delegated to `self._apply`; otherwise
    the input is projected here in one matrix multiplication. With `scaled`
    set, each topic's weights are divided by that topic's singular value.

    Raises AssertionError if the projection has not been computed yet.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
    # (dense * dense via corpus2dense, or numpy advanced indexing over `u`, were
    # noted by the original author as having roughly the same speed; the dense
    # variant consumes more memory.)
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    if scaled:
        # topic_dist has one row per topic and one column per document, so the
        # inverse singular values must be a column to scale rows. A flat (k,)
        # vector would broadcast along the document axis instead, turning a
        # single document (k, 1) into a (k, k) matrix.
        # NOTE(review): assumes self.projection.s is a 1-D array of singular values.
        inv_s = (1.0 / self.projection.s[:self.num_topics]).reshape(-1, 1)
        topic_dist = inv_s * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
Example #7
Source File: lsimodel.py From topical_word_embeddings with MIT License | 4 votes |
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    `bow` may be a single document or a corpus (decided by `utils.is_corpus`).
    A corpus with a truthy `chunksize` is delegated to `self._apply`; otherwise
    the input is projected here in one matrix multiplication. With `scaled`
    set, each topic's weights are divided by that topic's singular value.

    Raises AssertionError if the projection has not been computed yet.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
    # (dense * dense via corpus2dense, or numpy advanced indexing over `u`, were
    # noted by the original author as having roughly the same speed; the dense
    # variant consumes more memory.)
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    if scaled:
        # topic_dist has one row per topic and one column per document, so the
        # inverse singular values must be a column to scale rows. A flat (k,)
        # vector would broadcast along the document axis instead, turning a
        # single document (k, 1) into a (k, k) matrix.
        # NOTE(review): assumes self.projection.s is a 1-D array of singular values.
        inv_s = (1.0 / self.projection.s[:self.num_topics]).reshape(-1, 1)
        topic_dist = inv_s * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
Example #8
Source File: lsimodel.py From topical_word_embeddings with MIT License | 4 votes |
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    `bow` may be a single document or a corpus (decided by `utils.is_corpus`).
    A corpus with a truthy `chunksize` is delegated to `self._apply`; otherwise
    the input is projected here in one matrix multiplication. With `scaled`
    set, each topic's weights are divided by that topic's singular value.

    Raises AssertionError if the projection has not been computed yet.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
    # (dense * dense via corpus2dense, or numpy advanced indexing over `u`, were
    # noted by the original author as having roughly the same speed; the dense
    # variant consumes more memory.)
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    if scaled:
        # topic_dist has one row per topic and one column per document, so the
        # inverse singular values must be a column to scale rows. A flat (k,)
        # vector would broadcast along the document axis instead, turning a
        # single document (k, 1) into a (k, k) matrix.
        # NOTE(review): assumes self.projection.s is a 1-D array of singular values.
        inv_s = (1.0 / self.projection.s[:self.num_topics]).reshape(-1, 1)
        topic_dist = inv_s * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
Example #9
Source File: lsimodel.py From topical_word_embeddings with MIT License | 4 votes |
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    `bow` may be a single document or a corpus (decided by `utils.is_corpus`).
    A corpus with a truthy `chunksize` is delegated to `self._apply`; otherwise
    the input is projected here in one matrix multiplication. With `scaled`
    set, each topic's weights are divided by that topic's singular value.

    Raises AssertionError if the projection has not been computed yet.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
    # (dense * dense via corpus2dense, or numpy advanced indexing over `u`, were
    # noted by the original author as having roughly the same speed; the dense
    # variant consumes more memory.)
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    if scaled:
        # topic_dist has one row per topic and one column per document, so the
        # inverse singular values must be a column to scale rows. A flat (k,)
        # vector would broadcast along the document axis instead, turning a
        # single document (k, 1) into a (k, k) matrix.
        # NOTE(review): assumes self.projection.s is a 1-D array of singular values.
        inv_s = (1.0 / self.projection.s[:self.num_topics]).reshape(-1, 1)
        topic_dist = inv_s * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
Example #10
Source File: lsimodel.py From topical_word_embeddings with MIT License | 4 votes |
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    `bow` may be a single document or a corpus (decided by `utils.is_corpus`).
    A corpus with a truthy `chunksize` is delegated to `self._apply`; otherwise
    the input is projected here in one matrix multiplication. With `scaled`
    set, each topic's weights are divided by that topic's singular value.

    Raises AssertionError if the projection has not been computed yet.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
    # (dense * dense via corpus2dense, or numpy advanced indexing over `u`, were
    # noted by the original author as having roughly the same speed; the dense
    # variant consumes more memory.)
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    if scaled:
        # topic_dist has one row per topic and one column per document, so the
        # inverse singular values must be a column to scale rows. A flat (k,)
        # vector would broadcast along the document axis instead, turning a
        # single document (k, 1) into a (k, k) matrix.
        # NOTE(review): assumes self.projection.s is a 1-D array of singular values.
        inv_s = (1.0 / self.projection.s[:self.num_topics]).reshape(-1, 1)
        topic_dist = inv_s * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
Example #11
Source File: lsimodel.py From topical_word_embeddings with MIT License | 4 votes |
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    `bow` may be a single document or a corpus (decided by `utils.is_corpus`).
    A corpus with a truthy `chunksize` is delegated to `self._apply`; otherwise
    the input is projected here in one matrix multiplication. With `scaled`
    set, each topic's weights are divided by that topic's singular value.

    Raises AssertionError if the projection has not been computed yet.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
    # (dense * dense via corpus2dense, or numpy advanced indexing over `u`, were
    # noted by the original author as having roughly the same speed; the dense
    # variant consumes more memory.)
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    if scaled:
        # topic_dist has one row per topic and one column per document, so the
        # inverse singular values must be a column to scale rows. A flat (k,)
        # vector would broadcast along the document axis instead, turning a
        # single document (k, 1) into a (k, k) matrix.
        # NOTE(review): assumes self.projection.s is a 1-D array of singular values.
        inv_s = (1.0 / self.projection.s[:self.num_topics]).reshape(-1, 1)
        topic_dist = inv_s * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result