Python gensim.models.Doc2Vec() Examples

The following are 9 code examples of gensim.models.Doc2Vec(), drawn from open-source projects. You may also want to check out all available functions and classes of the module gensim.models.
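Before the project excerpts, here is a minimal, self-contained sketch of the basic Doc2Vec workflow for orientation. The corpus and hyperparameter values are illustrative assumptions, and the keyword names follow gensim 4.x; none of this comes from the projects below.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus: each document is a list of tokens wrapped in a TaggedDocument
# with a unique tag, so its trained vector can be looked up afterwards.
texts = [["human", "machine", "interface"],
         ["survey", "of", "user", "opinion"],
         ["graph", "of", "trees", "and", "minors"]]
documents = [TaggedDocument(words=words, tags=[i]) for i, words in enumerate(texts)]

# Illustrative hyperparameters; gensim >= 4.0 uses vector_size and epochs.
model = Doc2Vec(documents, vector_size=50, window=2, min_count=1, epochs=40, workers=2)

doc_vector = model.dv[0]                              # vector of the first training document
new_vector = model.infer_vector(["graph", "trees"])   # embed an unseen document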
Example #1
Source File: embedding_trainer.py    From kaggle-HomeDepot with MIT License
def train_word2vec_model(df, columns):
    model_param = {
        "alpha": config.EMBEDDING_ALPHA,
        "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY,
        "n_epoch": config.EMBEDDING_N_EPOCH,
        "sg": 1,
        "hs": 1,
        "min_count": config.EMBEDDING_MIN_COUNT,
        "size": config.EMBEDDING_DIM,
        "sample": 0.001,
        "window": config.EMBEDDING_WINDOW,
        "workers": config.EMBEDDING_WORKERS,
    }
    model_dir = config.WORD2VEC_MODEL_DIR
    model_name = "Homedepot-word2vec-D%d-min_count%d.model"%(
                    model_param["size"], model_param["min_count"])

    word2vec = DataFrameWord2Vec(df, columns, model_param)
    word2vec.train()
    word2vec.save(model_dir, model_name)


#---------------------- Doc2Vec ---------------------- 
Example #2
Source File: diffusion_2_vec.py    From diff2vec with GNU General Public License v3.0
def learn_non_pooled_embeddings(walks, counts, args):
    """
    Method to learn an embedding given the sequences and arguments.
    :param walks: Linear vertex sequences.
    :param counts: Number of nodes.
    :param args: Arguments.
    """
    walks = process_non_pooled_model_data(walks, counts, args)
    model = Doc2Vec(walks,
                    size=args.dimensions,
                    window=0,
                    dm=0,
                    alpha=args.alpha,
                    iter=args.iter,
                    workers=args.workers)

    save_embedding(args, model, counts) 
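Note that this example, like several others on this page, uses keyword names from gensim releases before 4.0. On a current gensim install, size is vector_size and iter is epochs; a rough equivalent of the call above (an assumption, not the diff2vec code) would be:

model = Doc2Vec(walks,
                vector_size=args.dimensions,  # formerly size
                window=0,
                dm=0,                         # PV-DBOW mode
                alpha=args.alpha,
                epochs=args.iter,             # formerly iter
                workers=args.workers)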
Example #3
Source File: embedding_trainer.py    From kaggle-HomeDepot with MIT License
def __init__(self, df, columns, model_param):
        super().__init__(df, columns, model_param)
        self.model = Doc2Vec(dm=self.model_param["dm"], 
                                hs=self.model_param["hs"], 
                                alpha=self.model_param["alpha"],
                                min_alpha=self.model_param["alpha"],
                                min_count=self.model_param["min_count"], 
                                size=self.model_param["size"], 
                                sample=self.model_param["sample"], 
                                window=self.model_param["window"], 
                                workers=self.model_param["workers"]) 
Example #4
Source File: sent_utils.py    From embedding with MIT License
def doc2vec(corpus_fname, output_fname):
    make_save_path(output_fname)
    corpus = Doc2VecInput(corpus_fname)
    model = Doc2Vec(corpus, vector_size=100)
    model.save(output_fname) 
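As a follow-up sketch (not part of the original project), the model saved above can be reloaded and used to embed unseen sentences; the token list here is a made-up example:

from gensim.models import Doc2Vec

model = Doc2Vec.load(output_fname)                 # reload the trained model
vector = model.infer_vector(["a", "new", "tokenized", "sentence"])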
Example #5
Source File: features_nn.py    From Semantic-Texual-Similarity-Toolkits with MIT License
def extract_instances(self, train_instances):
        sentences = []
        for idx, train_instance in enumerate(train_instances):
            sa, sb = train_instance.get_word(type='lemma', lower=True)
            sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
            sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

        model = Doc2Vec(sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

        features = []
        infos = []
        for idx in range(len(train_instances)):
            vec_a = model.docvecs['sa_%d' % idx]
            vec_b = model.docvecs['sb_%d' % idx]
            feature, info = vk.get_all_kernel(vec_a, vec_b)
            features.append(feature)
            infos.append([])
            # infos.append([vec_a, vec_b])

        return features, infos

    # def load_instances(self, train_instances):
    #     """
    #     extract cosine distance from an already trained feature file
    #     without modifying the feature_file
    #     this function's priority is higher than that of extract_instances above
    #     """
    #
    #     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
    #     features = []
    #     infos = []
    #     ''' get features from train instances'''
    #     for _feature in _features:
    #         feature = Feature._feat_string_to_list(_feature, _n_dim)
    #         features.append([feature[1]])
    #         infos.append(['cosine'])
    #
    #     features = [ Feature._feat_list_to_string(feature) for feature in features ]
    #
    #     return features, 1, _n_instance 
Example #6
Source File: doc2vec.py    From vec4ir with MIT License
def __init__(self,
                 analyzer=None, matching=None,
                 name=None,
                 verbose=0,
                 n_epochs=10,
                 alpha=0.25,
                 min_alpha=0.05,
                 n_jobs=4,
                 **kwargs):
        # self.model = model
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.verbose = verbose
        self.name = "paragraph-vectors" if name is None else name

        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))

        self.analyzer = analyzer
        self.model = Doc2Vec(alpha=alpha,
                             min_alpha=alpha,
                             size=500,
                             window=8,
                             min_count=1,
                             sample=1e-5,
                             workers=n_jobs,
                             negative=20,
                             dm=0, dbow_words=1,  # words only with dm!=0?
                             dm_mean=0,  # unused when in concat mode
                             dm_concat=1,
                             dm_tag_count=1
                             )
        self.n_epochs = n_epochs
        self._neighbors = NearestNeighbors(**kwargs) 
Example #7
Source File: doc2vec.py    From vec4ir with MIT License
def fit(self, docs, y):
        assert len(docs) == len(y)
        model = self.model
        n_epochs = self.n_epochs
        verbose = self.verbose
        decay = (self.alpha - self.min_alpha) / n_epochs
        X = [TaggedDocument(self.analyzer(doc), [label])
             for doc, label in zip(docs, y)]

        if verbose > 0:
            print("First 3 tagged documents:\n", X[:3])
            print("Training doc2vec model")
        # d2v = Doc2Vec()
        # d2v.build_vocab(X)
        # if self.intersect is not None:
        #     d2v.intersect_word2vec_format(self.intersect)
        model.build_vocab(X)
        for epoch in range(n_epochs):
            if verbose:
                print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
            model.train(X)
            model.alpha -= decay  # apply global decay
            model.min_alpha = model.alpha  # but no decay inside one epoch

        if verbose > 0:
            print("Finished.")
            print("model:", self.model)

        if self._matching:
            self._matching.fit(docs)
        else:
            # if we don't do matching, it's enough to fit a nearest-neighbors index on
            # all centroids before query time
            dvs = np.asarray([model.docvecs[tag] for tag in y])
            self._neighbors.fit(dvs)

        self._y = y

        return self 
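The per-epoch loop above calls model.train(X) without an explicit corpus size, which older gensim accepted; current releases require total_examples and epochs. A sketch of the same manual-decay loop against the newer API (an assumption, not the vec4ir source) is:

model.build_vocab(X)
for epoch in range(n_epochs):
    # one pass per iteration; total_examples and epochs are mandatory in current gensim
    model.train(X, total_examples=len(X), epochs=1)
    model.alpha -= decay           # apply global decay
    model.min_alpha = model.alpha  # but no decay inside one epoch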
Example #8
Source File: document_embedder.py    From fake-news-detection-pipeline with Apache License 2.0
def __init__(self, docs: DocumentSequence, pretrained_word2vec=None):
        """
        This class features interfaces to different methods of computing document embeddings.
        Supported embedding mechanisms are:
            Doc2Vec:                               see self.get_doc2vec()
            Naive Doc2Vec:                         see self.get_naive_doc2vec()
            One-Hot Sum:                           see self.get_onehot()
            Attention is all you need              To be implemented
            FastText                               To be implemented

        :param docs: a DocumentSequence instance
        :param pretrained_word2vec: path to a pretrained word2vec model, in .bin format
        """
        self.docs = docs
        self.pretrained = pretrained_word2vec 
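A hedged usage sketch of this class, based only on the accessor names mentioned in the docstring (the exact signatures of get_doc2vec, get_naive_doc2vec and get_onehot are not shown in this excerpt, and the pretrained path is a placeholder):

embedder = DocumentEmbedder(docs, pretrained_word2vec="GoogleNews-vectors-negative300.bin")
d2v_vectors = embedder.get_doc2vec()          # paragraph-vector embeddings
naive_vectors = embedder.get_naive_doc2vec()  # naive doc2vec (e.g. averaged word vectors)
onehot_vectors = embedder.get_onehot()        # one-hot sum representation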
Example #9
Source File: document_embedder.py    From fake-news-detection-pipeline with Apache License 2.0
def _set_doc2vec(self, vector_size=300, window=5, min_count=5, dm=1, epochs=20):
        # instantiate a Doc2Vec model, setting pretrained GoogleNews Vector
        self._d2v = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, dm=dm, epochs=epochs,
                            pretrained=self.pretrained)
        # build vocabulary from corpus
        self._d2v.build_vocab(self.docs.get_tagged())

        # training does not start automatically, so it must be invoked explicitly
        self._d2v.train(self.docs.get_tagged(), total_examples=self._d2v.corpus_count, epochs=epochs)

        # list document embeddings by order of their tags
        self._d2v_embedding = np.stack([self._d2v.docvecs[index] for index in range(len(self.docs.get_tagged()))])
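One more compatibility note: docvecs was renamed to dv in gensim 4.x, so under that assumption the embedding lookup in the last line becomes:

        # gensim >= 4.0 exposes document vectors on model.dv instead of model.docvecs
        self._d2v_embedding = np.stack([self._d2v.dv[index] for index in range(len(self.docs.get_tagged()))])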