Python gensim.matutils.unitvec() Examples
The following are 30 code examples of gensim.matutils.unitvec(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.matutils, or try the search function.
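Before the project examples, here is a minimal usage sketch (not taken from any of the projects below) illustrating what matutils.unitvec() does: it scales a vector to unit L2 length, accepts both dense NumPy arrays and sparse gensim bag-of-words lists of (id, weight) pairs, and returns the zero vector unchanged.

import numpy as np
from gensim import matutils

# Dense input: the vector is divided by its L2 norm.
dense = np.array([3.0, 4.0])
print(matutils.unitvec(dense))        # [0.6 0.8]

# Sparse gensim-style input: a list of (term_id, weight) pairs works too.
sparse = [(0, 3.0), (2, 4.0)]
print(matutils.unitvec(sparse))       # [(0, 0.6), (2, 0.8)]

# The zero vector is the only exception: it is returned unchanged.
print(matutils.unitvec(np.zeros(2)))  # [0. 0.]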
Example #1
Source File: NLP.py From Financial-NLP with Apache License 2.0 | 7 votes |
def similarity_label(self, words, normalization=True):
    """
    You can calculate more than one word at the same time.
    """
    if self.model==None:
        raise Exception('no model.')
    if isinstance(words, string_types):
        words=[words]
    vectors=np.transpose(self.model.wv.__getitem__(words))
    if normalization:
        unit_vector=unitvec(vectors,ax=0)  # about twice as fast as the original version below
        #unit_vector=np.zeros((len(vectors),len(words)))
        #for i in range(len(words)):
        #    unit_vector[:,i]=matutils.unitvec(vectors[:,i])
        dists=np.dot(self.Label_vec_u, unit_vector)
    else:
        dists=np.dot(self.Label_vec, vectors)
    return dists
Example #2
Source File: tfidfmodel.py From topical_word_embeddings with MIT License | 6 votes |
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
Example #3
Source File: create_sppmi.py From dutchembeddings with GNU General Public License v2.0 | 6 votes |
def shift_clip_pmi(pmimtr, k_shift=1.0):
    """
    Turns a pmi matrix into a PPMI matrix by setting all negative values to 0
    and then shifting by a factor of -log(k).

    :param pmimtr: The matrix of PMI values.
    :param k_shift: The shift factor.
    :return: A PPMI matrix.
    """
    logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
    pmimtr -= np.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

    logger.info("clipping PMI scores to be non-negative PPMI")
    pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

    logger.info("normalizing PPMI word vectors to unit length")
    for i, vec in enumerate(pmimtr):
        pmimtr[i] = matutils.unitvec(vec)

    return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T
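For reference (this note is not part of the dutchembeddings code), the quantity built above is the standard shifted positive PMI, SPPMI(w, c) = max(PMI(w, c) - log k, 0): the PMI matrix is first shifted down by log(k), then clipped at zero, and finally each row is scaled to unit length with matutils.unitvec() before being converted back to a sparse corpus.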
Example #4
Source File: tfidfmodel.py From topical_word_embeddings with MIT License | 6 votes |
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
Example #5
Source File: chatbot_sentence_vec_by_char.py From nlp_xiaojiang with MIT License | 6 votes |
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
    Find the most similar sentences: dot the sentence vector with the matrix.
    :param vec:
    :param matrix:
    :param keys:
    :param topn:
    :return:
    """
    # normalize the question vector (scale a vector to unit length; the only exception is the
    # zero vector, which is returned back unchanged)
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # dot the question vector with every question in the standard-question matrix
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # sort by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # collect the index and score of the most similar standard questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
Example #6
Source File: chatbot_sentence_vec_by_word.py From nlp_xiaojiang with MIT License | 6 votes |
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
    Find the most similar sentences: dot the sentence vector with the matrix.
    :param vec:
    :param matrix:
    :param keys:
    :param topn:
    :return:
    """
    # normalize the question vector (scale a vector to unit length; the only exception is the
    # zero vector, which is returned back unchanged)
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # dot the question vector with every question in the standard-question matrix
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # sort by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # collect the index and score of the most similar standard questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
Example #7
Source File: tfidfmodel.py From topical_word_embeddings with MIT License | 6 votes |
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
Example #8
Source File: tfidfmodel.py From topical_word_embeddings with MIT License | 6 votes |
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
Example #9
Source File: tfidfmodel.py From topical_word_embeddings with MIT License | 6 votes |
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
Example #10
Source File: sentencevectors.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 6 votes |
def similarity(self, d1: int, d2: int) -> float:
    """Compute cosine similarity between two sentences from the training set.

    Parameters
    ----------
    d1 : int
        index of sentence
    d2 : int
        index of sentence

    Returns
    -------
    float
        The cosine similarity between the vectors of the two sentences.

    """
    return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
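As a side note (not part of Fast_Sentence_Embeddings), the dot product of two unit vectors is exactly their cosine similarity, which is why the method above normalizes both sentence vectors with matutils.unitvec() before taking the dot product. A quick sanity check:

import numpy as np
from gensim import matutils

a, b = np.random.rand(50), np.random.rand(50)
cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
assert np.isclose(np.dot(matutils.unitvec(a), matutils.unitvec(b)), cosine)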
Example #11
Source File: breds-parallel.py From BREDS with GNU Lesser General Public License v3.0 | 6 votes |
def similarity_cluster(self, p1, p2):
    count = 0
    score = 0
    if self.config.alpha == 0 and self.config.gamma == 0:
        p1.merge_all_tuples_bet()
        p2.merge_all_tuples_bet()
        for v_bet1 in p1.bet_uniques_vectors:
            for v_bet2 in p2.bet_uniques_vectors:
                if v_bet1 is not None and v_bet2 is not None:
                    score += dot(
                        matutils.unitvec(asarray(v_bet1)),
                        matutils.unitvec(asarray(v_bet2))
                    )
                    count += 1
    else:
        for t1 in p1.tuples:
            for t2 in p2.tuples:
                score += self.similarity_3_contexts(t1, t2)
                count += 1

    return float(score) / float(count)
Example #12
Source File: breds-parallel.py From BREDS with GNU Lesser General Public License v3.0 | 6 votes |
def similarity_3_contexts(self, t, p):
    (bef, bet, aft) = (0, 0, 0)

    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(
            matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector)
        )

    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(
            matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector)
        )

    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(
            matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector)
        )

    return self.config.alpha*bef + \
           self.config.beta*bet + \
           self.config.gamma*aft
Example #13
Source File: tfidfmodel.py From topical_word_embeddings with MIT License | 6 votes |
def __getitem__(self, bow, eps=1e-12):
    """
    Return tf-idf representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
              for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    if self.normalize is True:
        vector = matutils.unitvec(vector)
    elif self.normalize:
        vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
#endclass TfidfModel
Example #14
Source File: docsim.py From topical_word_embeddings with MIT License | 5 votes |
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
        logger.info("creating matrix for %s documents and %i features" % (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            # individual documents may in fact be in numpy/scipy.sparse format as well.
            # it's not documented because it's not fully supported throughout.
            # the user better know what he's doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
Example #15
Source File: interfaces.py From topical_word_embeddings with MIT License | 5 votes |
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. Using this type of batch
    query is more efficient than computing the similarities one document after
    another.
    """
    is_corpus, query = utils.is_corpus(query)
    if self.normalize:
        # self.normalize only works if the input is a plain gensim vector/corpus (as
        # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
        # as well, but in that case assume tricks are happening and don't normalize
        # anything (self.normalize has no effect).
        if matutils.ismatrix(query):
            import warnings
            # warnings.warn("non-gensim input must already come normalized")
        else:
            if is_corpus:
                query = [matutils.unitvec(v) for v in query]
            else:
                query = matutils.unitvec(query)
    result = self.get_similarities(query)

    if self.num_best is None:
        return result

    # if the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
    else:
        # otherwise, return top-n of the single input document
        return matutils.full2sparse_clipped(result, self.num_best)
Example #16
Source File: logentropy_model.py From topical_word_embeddings with MIT License | 5 votes |
def __getitem__(self, bow):
    """
    Return log entropy representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge)
    vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
              for term_id, tf in bow if term_id in self.entr]
    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
Example #17
Source File: breds.py From BREDS with GNU Lesser General Public License v3.0 | 5 votes |
def similarity_3_contexts(self, p, t):
    (bef, bet, aft) = (0, 0, 0)

    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))

    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))

    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))

    return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft
Example #18
Source File: docsim.py From topical_word_embeddings with MIT License | 5 votes |
def add_documents(self, corpus):
    """
    Extend the index with new documents.

    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # the last shard was incomplete; load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
Example #19
Source File: docsim.py From topical_word_embeddings with MIT License | 5 votes |
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
        logger.info("creating matrix for %s documents and %i features" % (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            # individual documents may in fact be in numpy/scipy.sparse format as well.
            # it's not documented because it's not fully supported throughout.
            # the user better know what he's doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
Example #20
Source File: interfaces.py From topical_word_embeddings with MIT License | 5 votes |
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. Using this type of batch
    query is more efficient than computing the similarities one document after
    another.
    """
    is_corpus, query = utils.is_corpus(query)
    if self.normalize:
        # self.normalize only works if the input is a plain gensim vector/corpus (as
        # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
        # as well, but in that case assume tricks are happening and don't normalize
        # anything (self.normalize has no effect).
        if matutils.ismatrix(query):
            import warnings
            # warnings.warn("non-gensim input must already come normalized")
        else:
            if is_corpus:
                query = [matutils.unitvec(v) for v in query]
            else:
                query = matutils.unitvec(query)
    result = self.get_similarities(query)

    if self.num_best is None:
        return result

    # if the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
    else:
        # otherwise, return top-n of the single input document
        return matutils.full2sparse_clipped(result, self.num_best)
Example #21
Source File: logentropy_model.py From topical_word_embeddings with MIT License | 5 votes |
def __getitem__(self, bow):
    """
    Return log entropy representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge)
    vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
              for term_id, tf in bow if term_id in self.entr]
    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
Example #22
Source File: docsim.py From topical_word_embeddings with MIT License | 5 votes |
def add_documents(self, corpus):
    """
    Extend the index with new documents.

    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # the last shard was incomplete; load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
Example #23
Source File: docsim.py From topical_word_embeddings with MIT License | 5 votes |
def add_documents(self, corpus):
    """
    Extend the index with new documents.

    Internally, documents are buffered and then spilled to disk when there's
    `self.shardsize` of them (or when a query is issued).
    """
    min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
    if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
        # the last shard was incomplete; load it back and add the documents there, don't start a new shard
        self.reopen_shard()
    for doc in corpus:
        if isinstance(doc, numpy.ndarray):
            doclen = len(doc)
        elif scipy.sparse.issparse(doc):
            doclen = doc.nnz
        else:
            doclen = len(doc)
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
        self.fresh_docs.append(doc)
        self.fresh_nnz += doclen
        if len(self.fresh_docs) >= self.shardsize:
            self.close_shard()
        if len(self.fresh_docs) % 10000 == 0:
            logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs))
Example #24
Source File: multi_k_model.py From dna2vec with MIT License | 5 votes |
def unitvec(self, vec):
    return matutils.unitvec(vec)
Example #25
Source File: docsim.py From topical_word_embeddings with MIT License | 5 votes |
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
    """
    `num_features` is the number of features in the corpus (will be determined
    automatically by scanning the corpus if not specified). See `Similarity`
    class for description of the other parameters.
    """
    if num_features is None:
        logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
        num_features = 1 + utils.get_max_id(corpus)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize

    if corpus is not None:
        if self.num_features <= 0:
            raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
        logger.info("creating matrix for %s documents and %i features" % (len(corpus), num_features))
        self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
        # iterate over corpus, populating the numpy index matrix with (normalized)
        # document vectors
        for docno, vector in enumerate(corpus):
            if docno % 1000 == 0:
                logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
            # individual documents may in fact be in numpy/scipy.sparse format as well.
            # it's not documented because it's not fully supported throughout.
            # the user better know what he's doing (no normalization, must
            # explicitly supply num_features etc).
            if isinstance(vector, numpy.ndarray):
                pass
            elif scipy.sparse.issparse(vector):
                vector = vector.toarray().flatten()
            else:
                vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
            self.index[docno] = vector
Example #26
Source File: interfaces.py From topical_word_embeddings with MIT License | 5 votes |
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. Using this type of batch
    query is more efficient than computing the similarities one document after
    another.
    """
    is_corpus, query = utils.is_corpus(query)
    if self.normalize:
        # self.normalize only works if the input is a plain gensim vector/corpus (as
        # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
        # as well, but in that case assume tricks are happening and don't normalize
        # anything (self.normalize has no effect).
        if matutils.ismatrix(query):
            import warnings
            # warnings.warn("non-gensim input must already come normalized")
        else:
            if is_corpus:
                query = [matutils.unitvec(v) for v in query]
            else:
                query = matutils.unitvec(query)
    result = self.get_similarities(query)

    if self.num_best is None:
        return result

    # if the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
    else:
        # otherwise, return top-n of the single input document
        return matutils.full2sparse_clipped(result, self.num_best)
Example #27
Source File: logentropy_model.py From topical_word_embeddings with MIT License | 5 votes |
def __getitem__(self, bow):
    """
    Return log entropy representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge)
    vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
              for term_id, tf in bow if term_id in self.entr]
    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
Example #28
Source File: logentropy_model.py From topical_word_embeddings with MIT License | 5 votes |
def __getitem__(self, bow):
    """
    Return log entropy representation of the input vector and/or corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge)
    vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
              for term_id, tf in bow if term_id in self.entr]
    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
Example #29
Source File: NLP.py From Financial-NLP with Apache License 2.0 | 5 votes |
def renew_label_vec(self):
    """
    Initialize word vectors of the words in label_dict. Fast version (unstable).
    !Attention!: use it only when you are sure that every word in Label_index
    has a word vector.
    """
    self.Label_vec=self.model.wv.__getitem__(self.Label_index)
    self.Label_vec_u=unitvec(self.Label_vec)
Example #30
Source File: NLP.py From Financial-NLP with Apache License 2.0 | 5 votes |
def unitvec(vector, ax=1):
    v=vector*vector
    if len(vector.shape)==1:
        sqrtv=np.sqrt(np.sum(v))
    elif len(vector.shape)==2:
        sqrtv=np.sqrt([np.sum(v, axis=ax)])
    else:
        raise Exception('It\'s too large.')
    if ax==1:
        result=np.divide(vector,sqrtv.T)
    elif ax==0:
        result=np.divide(vector,sqrtv)
    return result
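A quick check (not part of the Financial-NLP repository), assuming the unitvec() defined just above is in scope: with ax=0 it normalizes every column of a 2-D matrix in one vectorized call, matching gensim's matutils.unitvec() applied to each column separately; this is why Example #1 calls it with ax=0 on the transposed word-vector matrix.

import numpy as np
from gensim import matutils

vectors = np.random.rand(100, 5)    # e.g. 100-dimensional vectors for 5 words, one per column
unit_cols = unitvec(vectors, ax=0)  # column-wise normalization in a single call

for i in range(vectors.shape[1]):
    assert np.allclose(unit_cols[:, i], matutils.unitvec(vectors[:, i]))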