Python gensim.matutils.unitvec() Examples

The following are 16 code examples of gensim.matutils.unitvec(), collected from open-source projects. The original project and source file are noted above each example.
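
Before the examples, here is a minimal sketch of what unitvec() does with the two input types that recur below, a dense numpy array and a sparse gensim bag-of-words list (the expected outputs in the comments assume the default L2 norm):

import numpy as np
from gensim import matutils

# dense input: the vector scaled to unit L2 length
dense = np.array([3.0, 4.0])
print(matutils.unitvec(dense))           # [0.6 0.8]

# sparse input: a gensim-style list of (term_id, weight) pairs
sparse = [(0, 3.0), (1, 4.0)]
print(matutils.unitvec(sparse))          # [(0, 0.6), (1, 0.8)]

# the zero vector is the one exception: it is returned unchanged
print(matutils.unitvec(np.zeros(2)))     # [0. 0.]
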
Example #1
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def similarity_label(self, words, normalization=True):
        """
        Compute similarity against the label vectors; more than one word can be passed at a time.
        """
        if self.model is None:
            raise Exception('no model.')
        if isinstance(words, string_types):
            words = [words]
        vectors = np.transpose(self.model.wv.__getitem__(words))
        if normalization:
            unit_vector = unitvec(vectors, ax=0)  # about twice as fast as the per-column loop below
            # unit_vector = np.zeros((len(vectors), len(words)))
            # for i in range(len(words)):
            #     unit_vector[:, i] = matutils.unitvec(vectors[:, i])
            dists = np.dot(self.Label_vec_u, unit_vector)
        else:
            dists = np.dot(self.Label_vec, vectors)
        return dists
Example #2
Source File: tfidfmodel.py    From topical_word_embeddings with MIT License
def __getitem__(self, bow, eps=1e-12):
        """
        Return tf-idf representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)
        vector = [(termid, self.wlocal(tf) * self.idfs.get(termid))
                  for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.normalize is True:
            vector = matutils.unitvec(vector)
        elif self.normalize:
            vector = self.normalize(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
        return vector
#endclass TfidfModel 
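
This appears to be a vendored copy of gensim's TfidfModel. As a usage sketch of the upstream class (toy corpus invented here), normalize=True is the path that hands each weighted vector to matutils.unitvec():

from gensim import corpora, models

# toy two-document corpus
texts = [["cat", "dog", "dog"], ["dog", "bird"]]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]

tfidf = models.TfidfModel(bow, normalize=True)   # True -> matutils.unitvec per vector
vec = tfidf[bow[0]]
print(vec)                                       # tf-idf weights, unit L2 length
print(sum(w * w for _, w in vec))                # ~1.0
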
Example #3
Source File: create_sppmi.py    From dutchembeddings with GNU General Public License v2.0
def shift_clip_pmi(pmimtr, k_shift=1.0):
        """
        Turns a pmi matrix into a PPMI matrix by setting all negative values to 0 and then shifting by a factor of
        -log(k).

        :param pmimtr: The matrix of PMI values.
        :param k_shift: The shift factor.
        :return: A PPMI matrix.
        """

        logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, ))
        pmimtr -= np.log(k_shift)  # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k)

        logger.info("clipping PMI scores to be non-negative PPMI")
        pmimtr.clip(0.0, out=pmimtr)  # SPPMI = max(0, log(#(w, c) * D / (#w * #c)) - log(k))

        logger.info("normalizing PPMI word vectors to unit length")
        for i, vec in enumerate(pmimtr):
            pmimtr[i] = matutils.unitvec(vec)

        return matutils.corpus2csc(matutils.Dense2Corpus(pmimtr, documents_columns=False)).T 
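
A toy invocation of the function above (assuming it has been copied to module level, with numpy, gensim.matutils and a configured logger in scope as in the original file; note that it modifies its input in place):

import logging
import numpy as np
from gensim import matutils

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# a tiny matrix of made-up PMI values
pmi = np.array([[0.5, -0.2, 1.0],
                [0.1,  0.8, -0.4]])

sppmi = shift_clip_pmi(pmi, k_shift=1.0)   # k=1 -> log(k)=0: clip and normalize only
print(sppmi.toarray())                     # negatives zeroed, rows unit length
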
Example #4
Source File: chatbot_sentence_vec_by_char.py    From nlp_xiaojiang with MIT License
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
      最相似的句子,句向量与矩阵点乘
    :param vec: 
    :param matrix: 
    :param keys: 
    :param topn: 
    :return: 
    """
    # 问句向量标准化, Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # 矩阵点乘, 即问句与标准问句库里边的问句点乘,
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # 相似度排序
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # 获取最相似标准问句的index和得分score
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score 
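
A sketch of how this function might be called; all inputs here are hypothetical stand-ins (random vectors for three standard questions and for the incoming question):

import numpy as np
from gensim import matutils

numpy_type = np.float32   # the module-level dtype the function assumes

# hypothetical normalized matrix of three standard-question vectors
matrix_org = np.random.rand(3, 100).astype(numpy_type)
matrix_org_norm = np.vstack([matutils.unitvec(v) for v in matrix_org])
matrix_org_index = ["q0", "q1", "q2"]

vec_ques = np.random.rand(100).astype(numpy_type)   # hypothetical question vector
print(calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec=2))
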
Example #5
Source File: chatbot_sentence_vec_by_word.py    From nlp_xiaojiang with MIT License
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
    """
      最相似的句子,句向量与矩阵点乘
    :param vec: 
    :param matrix: 
    :param keys: 
    :param topn: 
    :return: 
    """
    # 问句向量标准化, Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
    # 矩阵点乘, 即问句与标准问句库里边的问句点乘,
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # 相似度排序
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
    # 获取最相似标准问句的index和得分score
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score 
Example #6
Source File: sentencevectors.py    From Fast_Sentence_Embeddings with GNU General Public License v3.0
def similarity(self, d1: int, d2: int) -> float:
        """Compute cosine similarity between two sentences from the training set.

        Parameters
        ----------
        d1 : int
            Index of the first sentence.
        d2 : int
            Index of the second sentence.

        Returns
        -------
        float
            The cosine similarity between the vectors of the two sentences.

        """
        return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2])) 
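
The identity this method relies on: the dot product of two unitvec()-normalized vectors equals their cosine similarity. A quick check with made-up vectors:

import numpy as np
from numpy import dot
from numpy.linalg import norm
from gensim import matutils

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])

via_unitvec = dot(matutils.unitvec(a), matutils.unitvec(b))
via_definition = dot(a, b) / (norm(a) * norm(b))
print(np.isclose(via_unitvec, via_definition))   # True
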
Example #7
Source File: breds-parallel.py    From BREDS with GNU Lesser General Public License v3.0
def similarity_cluster(self, p1, p2):
        count = 0
        score = 0
        if self.config.alpha == 0 and self.config.gamma == 0:
            p1.merge_all_tuples_bet()
            p2.merge_all_tuples_bet()
            for v_bet1 in p1.bet_uniques_vectors:
                for v_bet2 in p2.bet_uniques_vectors:
                    if v_bet1 is not None and v_bet2 is not None:
                        score += dot(
                            matutils.unitvec(asarray(v_bet1)),
                            matutils.unitvec(asarray(v_bet2))
                        )
                        count += 1
        else:
            for t1 in p1.tuples:
                for t2 in p2.tuples:
                    score += self.similarity_3_contexts(t1, t2)
                    count += 1

        return float(score) / float(count) 
Example #8
Source File: breds-parallel.py    From BREDS with GNU Lesser General Public License v3.0
def similarity_3_contexts(self, t, p):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(
                matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector)
            )

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(
                matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector)
            )

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(
                matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector)
            )

        return self.config.alpha*bef + \
               self.config.beta*bet + \
               self.config.gamma*aft 
Example #9
Source File: docsim.py    From topical_word_embeddings with MIT License
def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256):
        """
        `num_features` is the number of features in the corpus (will be determined
        automatically by scanning the corpus if not specified). See `Similarity`
        class for description of the other parameters.

        """
        if num_features is None:
            logger.warning("scanning corpus to determine the number of features (consider setting `num_features` explicitly)")
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError("cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)")
            logger.info("creating matrix for %s documents and %i features" %
                         (len(corpus), num_features))
            self.index = numpy.empty(shape=(len(corpus), num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i" % (docno, len(corpus)))
                # individual documents may in fact be in numpy or scipy.sparse format as well.
                # this is not documented because it's not fully supported throughout;
                # the user had better know what they are doing (no normalization, must
                # explicitly supply num_features etc).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
                self.index[docno] = vector 
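
The point of normalizing every row at indexing time is that a whole-corpus query reduces to one matrix-vector product. A rough standalone sketch of that idea (toy sparse documents invented here, not the class's actual query path):

import numpy as np
from gensim import matutils

num_features = 4
docs = [[(0, 1.0), (2, 1.0)], [(1, 2.0)], [(0, 1.0), (3, 3.0)]]

# densify and unit-normalize each document, as the constructor above does
index = np.vstack([
    matutils.unitvec(matutils.sparse2full(doc, num_features)) for doc in docs
])

query = matutils.unitvec(matutils.sparse2full([(0, 1.0)], num_features))
print(index.dot(query))   # cosine similarity of the query to every indexed document
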
Example #10
Source File: interfaces.py    From topical_word_embeddings with MIT License
def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus documents. Using this type of batch
        query is more efficient than computing the similarities one document after
        another.
        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if matutils.ismatrix(query):
                import warnings
                # warnings.warn("non-gensim input must already come normalized")
            else:
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best) 
Example #11
Source File: logentropy_model.py    From topical_word_embeddings with MIT License
def __getitem__(self, bow):
        """
        Return log entropy representation of the input vector and/or corpus.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
                  for term_id, tf in bow if term_id in self.entr]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector 
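
This appears to be a vendored copy of gensim's LogEntropyModel; a toy usage sketch of the upstream class (corpus invented here):

from gensim import corpora, models

texts = [["cat", "dog", "dog"], ["dog", "bird", "cat"]]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]

logent = models.LogEntropyModel(bow, normalize=True)   # True -> matutils.unitvec
print(logent[bow[0]])   # log-entropy weights scaled to unit length
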
Example #12
Source File: breds.py    From BREDS with GNU Lesser General Public License v3.0
def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))

        return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft 
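
The returned score is a weighted sum of the three context cosines. A toy call (stub objects and weights are hypothetical; assumes the function and its imports, numpy dot and gensim matutils, are available at module level):

import numpy as np
from numpy import dot
from types import SimpleNamespace
from gensim import matutils

# hypothetical tuple/pattern stubs carrying before/between/after context vectors
t = SimpleNamespace(bef_vector=np.array([1.0, 0.0]),
                    bet_vector=np.array([0.0, 1.0]),
                    aft_vector=None)
p = SimpleNamespace(bef_vector=np.array([1.0, 1.0]),
                    bet_vector=np.array([0.0, 2.0]),
                    aft_vector=None)
stub = SimpleNamespace(config=SimpleNamespace(alpha=0.2, beta=0.6, gamma=0.2))

# aft stays 0 because one of the aft vectors is None
print(similarity_3_contexts(stub, p, t))   # 0.2/sqrt(2) + 0.6 ~= 0.741
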
Example #13
Source File: docsim.py    From topical_word_embeddings with MIT License
def add_documents(self, corpus):
        """
        Extend the index with new documents.

        Internally, documents are buffered and then spilled to disk when there's
        `self.shardsize` of them (or when a query is issued).
        """
        min_ratio = 1.0 # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # the last shard was incomplete (< min_ratio * shardsize); load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features))
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i" % len(self.fresh_docs)) 
Example #14
Source File: multi_k_model.py    From dna2vec with MIT License
def unitvec(self, vec):
        return matutils.unitvec(vec) 
Example #15
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def renew_label_vec(self):
        """
        initialize word vectors of words in label_dict.
        fast version(unstable)
        !Attention! : use it only when you make sure that all words in Label_index can calculate the word vector.
        """
        self.Label_vec=self.model.wv.__getitem__(self.Label_index)
        self.Label_vec_u=unitvec(self.Label_vec) 
Example #16
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def unitvec(vector, ax=1):
    """Scale a 1-D or 2-D array to unit length along the given axis."""
    v = vector * vector
    if len(vector.shape) == 1:
        sqrtv = np.sqrt(np.sum(v))
    elif len(vector.shape) == 2:
        sqrtv = np.sqrt([np.sum(v, axis=ax)])
    else:
        raise Exception('arrays with more than 2 dimensions are not supported.')
    if ax == 1:
        result = np.divide(vector, sqrtv.T)
    elif ax == 0:
        result = np.divide(vector, sqrtv)
    return result
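
A quick consistency check (with random data) that this vectorized helper matches gensim's matutils.unitvec applied column by column, which is the equivalence the speed comment in Example #1 relies on:

import numpy as np
from gensim import matutils

vectors = np.random.rand(5, 3)   # 5-dimensional vectors for 3 words, one per column

fast = unitvec(vectors, ax=0)    # the helper above, all columns at once
slow = np.zeros_like(vectors)
for i in range(vectors.shape[1]):
    slow[:, i] = matutils.unitvec(vectors[:, i])

print(np.allclose(fast, slow))   # True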