Python gensim.matutils.sparse2full() Examples
The following are 16 code examples of gensim.matutils.sparse2full(), which converts a document in sparse gensim format (a list of (feature_id, value) 2-tuples) into a dense 1-D numpy array of a given length. You can go to the original project or source file by following the reference above each example. You may also want to check out all available functions/classes of the module gensim.matutils, or try the search function.
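Before the examples, a minimal, self-contained sketch of the conversion itself may help; the toy document and vector length below are made up for illustration:

from gensim import matutils

# a document in sparse gensim format: a list of (feature_id, value) 2-tuples
doc = [(0, 0.5), (3, 0.25), (7, 1.0)]

# densify it into a fixed-length 1-D float32 array
dense = matutils.sparse2full(doc, 10)
print(dense.shape, dense.dtype)  # (10,) float32 -- zeros except at indices 0, 3 and 7

# matutils.full2sparse() goes the other way, dropping (near-)zero entries
print(matutils.full2sparse(dense))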
Example #1
Source File: test_models.py From topical_word_embeddings with MIT License

def testTransform(self):
    passed = False
    # sometimes, LDA training gets stuck at a local minimum
    # in that case try re-training the model from scratch, hoping for a
    # better random initialization
    for i in range(5):  # restart at most 5 times
        # create the transformation model
        model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
        model.update(corpus)

        # transform one document
        doc = list(corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

        expected = [0.13, 0.87]
        passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2)  # must contain the same values, up to re-ordering
        if passed:
            break
        logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                        (i, sorted(vec), sorted(expected)))
    self.assertTrue(passed)
Example #2
Source File: test_models.py From topical_word_embeddings with MIT License

def testTransform(self):
    """Test lsi[vector] transformation."""
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, num_topics=2)

    # make sure the decomposition is accurate enough
    u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
    self.assertTrue(numpy.allclose(s[:2], model.projection.s))  # singular values must match

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
    expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
    # expected = numpy.array([-0.1973928, 0.05591352])  # non-scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign
Example #3
Source File: similarity.py From bugbug with Mozilla Public License 2.0

def search_similar_bugs(self, query):
    query = self.text_preprocess(self.get_text(query))
    dense1 = sparse2full(
        self.model[self.dictionary.doc2bow(query)], self.model.num_topics
    )

    distances = []
    for idx in range(len(self.corpus)):
        dense2 = sparse2full(
            self.model[self.dictionary.doc2bow(self.corpus[idx])],
            self.model.num_topics,
        )
        hellinger_distance = np.sqrt(
            0.5 * ((np.sqrt(dense1) - np.sqrt(dense2)) ** 2).sum()
        )

        distances.append((self.bug_ids[idx], hellinger_distance))

    distances.sort(key=lambda v: v[1])

    return [distance[0] for distance in distances[:10]]
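The Hellinger distance computed above can be sanity-checked against gensim's own helper; matutils.hellinger should give the same value for dense inputs (the toy topic distributions below are made up):

import numpy as np
from gensim import matutils

p = matutils.sparse2full([(0, 0.2), (1, 0.8)], 2)
q = matutils.sparse2full([(0, 0.6), (1, 0.4)], 2)

manual = np.sqrt(0.5 * ((np.sqrt(p) - np.sqrt(q)) ** 2).sum())
print(manual, matutils.hellinger(p, q))  # the two values should agree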
Example #4
Source File: docsim.py From topical_word_embeddings with MIT License

def add_documents(self, corpus):
    """
    Extend the index with new documents.

    Internally, documents are buffered and then spilled to disk when
    there's `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # the last shard was incomplete; load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
Example #5
Source File: test_models.py From topical_word_embeddings with MIT License

def testTransform(self):
    # create the transformation model
    numpy.random.seed(13)  # HACK; set fixed seed so that we always get the same random matrix (and can compare against expected results)
    model = rpmodel.RpModel(self.corpus, num_topics=2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests

    expected = numpy.array([-0.70710677, 0.70710677])
    self.assertTrue(numpy.allclose(vec, expected))  # transformed entries must be equal up to sign
Example #6
Source File: docsim.py From topical_word_embeddings with MIT License

def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """
    `num_features` is the number of features in the corpus (will be
    determined automatically by scanning the corpus if not specified).
    See `Similarity` class for description of the other parameters.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
        logger.info("creating matrix for %s documents and %i features" % (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            # individual documents in fact may be in numpy.scipy.sparse format as well.
            # it's not documented because otherwise it's not fully supported throughout.
            # the user better know what he's doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
Example #7
Source File: docsim.py From topical_word_embeddings with MIT License

def get_similarities(self, query):
    """
    Return similarity of sparse vector `query` to all documents in the corpus,
    as a numpy array.

    If `query` is a collection of documents, return a 2D array of similarities
    of each document in `query` to all documents in the corpus (=batch query,
    faster than processing each document in turn).

    **Do not use this function directly; use the self[query] syntax instead.**
    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = numpy.asarray(
            [matutils.sparse2full(vec, self.num_features) for vec in query],
            dtype=self.index.dtype)
    else:
        if scipy.sparse.issparse(query):
            query = query.toarray()  # convert sparse to dense
        elif isinstance(query, numpy.ndarray):
            pass
        else:
            # default case: query is a single vector in sparse gensim format
            query = matutils.sparse2full(query, self.num_features)
        query = numpy.asarray(query, dtype=self.index.dtype)

    # do a little transposition dance to stop numpy from making a copy of
    # self.index internally in numpy.dot (very slow).
    result = numpy.dot(self.index, query.T).T  # return #queries x #index
    return result  # XXX: removed casting the result from array to list; does anyone care?

#endclass MatrixSimilarity
Example #8
Source File: test_similarities.py From topical_word_embeddings with MIT License

def testFull(self, num_best=None, shardsize=100):
    if self.cls == similarities.Similarity:
        index = self.cls(None, corpus, num_features=len(dictionary), shardsize=shardsize)
    else:
        index = self.cls(corpus, num_features=len(dictionary))
    if isinstance(index, similarities.MatrixSimilarity):
        expected = numpy.array([
            [0.57735026, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            [0.40824831, 0.0, 0.0, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.40824831, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0],
            [0.0, 0.40824831, 0.0, 0.0, 0.0, 0.81649661, 0.0, 0.0, 0.40824831, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.57735026, 0.57735026, 0.0, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.70710677, 0.70710677, 0.0],
            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026, 0.57735026],
            [0.0, 0.0, 0.0, 0.0, 0.57735026, 0.0, 0.0, 0.0, 0.0, 0.0, 0.57735026, 0.57735026],
        ], dtype=numpy.float32)
        self.assertTrue(numpy.allclose(expected, index.index))
    index.num_best = num_best
    query = corpus[0]
    sims = index[query]
    expected = [(0, 0.99999994), (2, 0.28867513), (3, 0.23570226), (1, 0.23570226)][:num_best]

    # convert sims to full numpy arrays, so we can use allclose() and ignore
    # ordering of items with the same similarity value
    expected = matutils.sparse2full(expected, len(index))
    if num_best is not None:
        # when num_best is None, sims is already a numpy array
        sims = matutils.sparse2full(sims, len(index))

    self.assertTrue(numpy.allclose(expected, sims))
    if self.cls == similarities.Similarity:
        index.destroy()
Example #9
Source File: text2vec.py From text2vec with Apache License 2.0

def get_tfidf(self):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
    return docs_vecs

# Get Latent Semantic Indexing (LSI) vector for document list
Example #10
Source File: rpmodel.py From topical_word_embeddings with MIT License

def __getitem__(self, bow):
    """
    Return RP representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
    vec = numpy.asfortranarray(vec, dtype=numpy.float32)
    topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
    return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
            if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)]
Example #11
Source File: text2vec.py From text2vec with Apache License 2.0

def get_lsi(self, num_topics=300):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lsi = models.LsiModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lsi = model_lsi[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lsi])
    return docs_vecs

# Get Random Projections (RP) vector for document list
Example #12
Source File: text2vec.py From text2vec with Apache License 2.0

def get_rp(self):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_rp = models.RpModel(docs_corpus, id2word=self.docs_dict)
    docs_rp = model_rp[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_rp])
    return docs_vecs

# Get Latent Dirichlet Allocation (LDA) vector for document list
Example #13
Source File: test_models.py From topical_word_embeddings with MIT License

def testOnlineTransform(self):
    corpus = list(self.corpus)
    doc = corpus[0]  # use the corpus' first document for testing

    # create the transformation model
    model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5)  # compute everything at once
    model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5)  # start with no documents, we will add them later

    # train model on a single document
    model.add_documents([corpus[0]])

    # transform the testing document with this partial transformation
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
    expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

    # train on another 4 documents
    model.add_documents(corpus[1:5], chunksize=2)  # train on 4 extra docs, in chunks of 2 documents, for the lols

    # transform a document with this partial transformation
    transformed = model[doc]
    vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
    expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign

    # train on the rest of documents
    model.add_documents(corpus[5:])

    # make sure the final transformation is the same as if we had decomposed the whole corpus at once
    vec1 = matutils.sparse2full(model[doc], model.num_topics)
    vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
    self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5))  # the two LSI representations must equal up to sign
Example #14
Source File: test_models.py From topical_word_embeddings with MIT License

def testCorpusTransform(self):
    """Test lsi[corpus] transformation."""
    model = lsimodel.LsiModel(self.corpus, num_topics=2)
    got = numpy.vstack([matutils.sparse2full(doc, 2) for doc in model[corpus]])
    expected = numpy.array([
        [0.65946639, 0.14211544],
        [2.02454305, -0.42088759],
        [1.54655361, 0.32358921],
        [1.81114125, 0.5890525],
        [0.9336738, -0.27138939],
        [0.01274618, -0.49016181],
        [0.04888203, -1.11294699],
        [0.08063836, -1.56345594],
        [0.27381003, -1.34694159],
    ])
    self.assertTrue(numpy.allclose(abs(got), abs(expected)))  # must equal up to sign
Example #15
Source File: text2vec.py From text2vec with Apache License 2.0

def get_lda(self, num_topics=100):
    docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
    model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
    docs_lda = model_lda[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
    return docs_vecs

# Get Hierarchical Dirichlet Process (HDP) vector for document list
Example #16
Source File: text2vec.py From text2vec with Apache License 2.0

def _get_tfidf(self, docs, docs_dict):
    docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]
    docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
    return docs_vecs

# Get avg w2v for one document