Python gensim.models.Doc2Vec() Examples

The following are 9 code examples of gensim.models.Doc2Vec(), drawn from open-source projects. You may also want to check out all available functions and classes of the module gensim.models.
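Before the project excerpts, here is a minimal, self-contained sketch of the basic Doc2Vec workflow for orientation. The corpus and hyperparameter values are illustrative assumptions, and the keyword names follow gensim 4.x; none of this comes from the projects below.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus: each document is a list of tokens wrapped in a TaggedDocument
# with a unique tag, so its trained vector can be looked up afterwards.
texts = [["human", "machine", "interface"],
         ["survey", "of", "user", "opinion"],
         ["graph", "of", "trees", "and", "minors"]]
documents = [TaggedDocument(words=words, tags=[i]) for i, words in enumerate(texts)]

# Illustrative hyperparameters; gensim >= 4.0 uses vector_size and epochs.
model = Doc2Vec(documents, vector_size=50, window=2, min_count=1, epochs=40, workers=2)

doc_vector = model.dv[0]                              # vector of the first training document
new_vector = model.infer_vector(["graph", "trees"])   # embed an unseen document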
Example #1
Source File: embedding_trainer.py    From kaggle-HomeDepot with MIT License
def train_word2vec_model(df, columns):
    model_param = {
        "alpha": config.EMBEDDING_ALPHA,
        "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY,
        "n_epoch": config.EMBEDDING_N_EPOCH,
        "sg": 1,
        "hs": 1,
        "min_count": config.EMBEDDING_MIN_COUNT,
        "size": config.EMBEDDING_DIM,
        "sample": 0.001,
        "window": config.EMBEDDING_WINDOW,
        "workers": config.EMBEDDING_WORKERS,
    }
    model_dir = config.WORD2VEC_MODEL_DIR
    model_name = "Homedepot-word2vec-D%d-min_count%d.model"%(
                    model_param["size"], model_param["min_count"])

    word2vec = DataFrameWord2Vec(df, columns, model_param)
    word2vec.train()
    word2vec.save(model_dir, model_name)


#---------------------- Doc2Vec ---------------------- 
Example #2
Source File: diffusion_2_vec.py    From diff2vec with GNU General Public License v3.0
def learn_non_pooled_embeddings(walks, counts, args):
    """
    Method to learn an embedding given the sequences and arguments.
    :param walks: Linear vertex sequences.
    :param counts: Number of nodes.
    :param args: Arguments.
    """
    walks = process_non_pooled_model_data(walks, counts, args)
    model = Doc2Vec(walks,
                    size=args.dimensions,
                    window=0,
                    dm=0,
                    alpha=args.alpha,
                    iter=args.iter,
                    workers=args.workers)

    save_embedding(args, model, counts) 
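Note that this example, like several others on this page, uses keyword names from gensim releases before 4.0. On a current gensim install, size is vector_size and iter is epochs; a rough equivalent of the call above (an assumption, not the diff2vec code) would be:

model = Doc2Vec(walks,
                vector_size=args.dimensions,  # formerly size
                window=0,
                dm=0,                         # PV-DBOW mode
                alpha=args.alpha,
                epochs=args.iter,             # formerly iter
                workers=args.workers)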
Example #3
Source File: embedding_trainer.py    From kaggle-HomeDepot with MIT License
def __init__(self, df, columns, model_param):
        super().__init__(df, columns, model_param)
        self.model = Doc2Vec(dm=self.model_param["dm"], 
                                hs=self.model_param["hs"], 
                                alpha=self.model_param["alpha"],
                                min_alpha=self.model_param["alpha"],
                                min_count=self.model_param["min_count"], 
                                size=self.model_param["size"], 
                                sample=self.model_param["sample"], 
                                window=self.model_param["window"], 
                                workers=self.model_param["workers"]) 
Example #4
Source File: sent_utils.py    From embedding with MIT License
def doc2vec(corpus_fname, output_fname):
    make_save_path(output_fname)
    corpus = Doc2VecInput(corpus_fname)
    model = Doc2Vec(corpus, vector_size=100)
    model.save(output_fname) 
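As a follow-up sketch (not part of the original project), the model saved above can be reloaded and used to embed unseen sentences; the token list here is a made-up example:

from gensim.models import Doc2Vec

model = Doc2Vec.load(output_fname)                 # reload the trained model
vector = model.infer_vector(["a", "new", "tokenized", "sentence"])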
Example #5
Source File: features_nn.py    From Semantic-Texual-Similarity-Toolkits with MIT License
def extract_instances(self, train_instances):
        sentences = []
        for idx, train_instance in enumerate(train_instances):
            sa, sb = train_instance.get_word(type='lemma', lower=True)
            sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
            sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

        model = Doc2Vec(sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

        features = []
        infos = []
        for idx in range(len(train_instances)):
            vec_a = model.docvecs['sa_%d' % idx]
            vec_b = model.docvecs['sb_%d' % idx]
            feature, info = vk.get_all_kernel(vec_a, vec_b)
            features.append(feature)
            infos.append([])
            # infos.append([vec_a, vec_b])

        return features, infos

    # def load_instances(self, train_instances):
    #     """
    #     extract cosine distance from an already trained feature file
    #     without modifying the feature_file
    #     this function's priority is higher than that of extract_instances above
    #     """
    #
    #     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
    #     features = []
    #     infos = []
    #     ''' get features from train instances'''
    #     for _feature in _features:
    #         feature = Feature._feat_string_to_list(_feature, _n_dim)
    #         features.append([feature[1]])
    #         infos.append(['cosine'])
    #
    #     features = [ Feature._feat_list_to_string(feature) for feature in features ]
    #
    #     return features, 1, _n_instance 
Example #6
Source File: doc2vec.py    From vec4ir with MIT License
def __init__(self,
                 analyzer=None, matching=None,
                 name=None,
                 verbose=0,
                 n_epochs=10,
                 alpha=0.25,
                 min_alpha=0.05,
                 n_jobs=4,
                 **kwargs):
        # self.model = model
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.verbose = verbose
        self.name = "paragraph-vectors" if name is None else name

        if matching is True:
            self._matching = Matching()
        elif matching is False or matching is None:
            self._matching = None
        else:
            self._matching = Matching(**dict(matching))

        self.analyzer = analyzer
        self.model = Doc2Vec(alpha=alpha,
                             min_alpha=alpha,
                             size=500,
                             window=8,
                             min_count=1,
                             sample=1e-5,
                             workers=n_jobs,
                             negative=20,
                             dm=0, dbow_words=1,  # words only with dm!=0?
                             dm_mean=0,  # unused when in concat mode
                             dm_concat=1,
                             dm_tag_count=1
                             )
        self.n_epochs = n_epochs
        self._neighbors = NearestNeighbors(**kwargs) 
Example #7
Source File: doc2vec.py    From vec4ir with MIT License
def fit(self, docs, y):
        assert len(docs) == len(y)
        model = self.model
        n_epochs = self.n_epochs
        verbose = self.verbose
        decay = (self.alpha - self.min_alpha) / n_epochs
        X = [TaggedDocument(self.analyzer(doc), [label])
             for doc, label in zip(docs, y)]

        if verbose > 0:
            print("First 3 tagged documents:\n", X[:3])
            print("Training doc2vec model")
        # d2v = Doc2Vec()
        # d2v.build_vocab(X)
        # if self.intersect is not None:
        #     d2v.intersect_word2vec_format(self.intersect)
        model.build_vocab(X)
        for epoch in range(n_epochs):
            if verbose:
                print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
            model.train(X)
            model.alpha -= decay  # apply global decay
            model.min_alpha = model.alpha  # but no decay inside one epoch

        if verbose > 0:
            print("Finished.")
            print("model:", self.model)

        if self._matching:
            self._matching.fit(docs)
        else:
            # if we don't do matching, it's enough to fit a nearest-neighbors index on
            # all centroids before query time
            dvs = np.asarray([model.docvecs[tag] for tag in y])
            self._neighbors.fit(dvs)

        self._y = y

        return self 
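The per-epoch loop above calls model.train(X) without an explicit corpus size, which older gensim accepted; current releases require total_examples and epochs. A sketch of the same manual-decay loop against the newer API (an assumption, not the vec4ir source) is:

model.build_vocab(X)
for epoch in range(n_epochs):
    # one pass per iteration; total_examples and epochs are mandatory in current gensim
    model.train(X, total_examples=len(X), epochs=1)
    model.alpha -= decay           # apply global decay
    model.min_alpha = model.alpha  # but no decay inside one epoch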
Example #8
Source File: document_embedder.py    From fake-news-detection-pipeline with Apache License 2.0
def __init__(self, docs: DocumentSequence, pretrained_word2vec=None):
        """
        This class features interfaces to different methods of computing document embeddings.
        Supported embedding mechanisms are:
            Doc2Vec:                               see self.get_doc2vec()
            Naive Doc2Vec:                         see self.get_naive_doc2vec()
            One-Hot Sum:                           see self.get_onehot()
            Attention is all you need              To be implemented
            FastText                               To be implemented

        :param docs: a DocumentSequence instance
        :param pretrained_word2vec: path to a pretrained word2vec model, in .bin format
        """
        self.docs = docs
        self.pretrained = pretrained_word2vec 
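A hedged usage sketch of this class, based only on the accessor names mentioned in the docstring (the exact signatures of get_doc2vec, get_naive_doc2vec and get_onehot are not shown in this excerpt, and the pretrained path is a placeholder):

embedder = DocumentEmbedder(docs, pretrained_word2vec="GoogleNews-vectors-negative300.bin")
d2v_vectors = embedder.get_doc2vec()          # paragraph-vector embeddings
naive_vectors = embedder.get_naive_doc2vec()  # naive doc2vec (e.g. averaged word vectors)
onehot_vectors = embedder.get_onehot()        # one-hot sum representation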
Example #9
Source File: document_embedder.py    From fake-news-detection-pipeline with Apache License 2.0
def _set_doc2vec(self, vector_size=300, window=5, min_count=5, dm=1, epochs=20):
        # instantiate a Doc2Vec model, setting pretrained GoogleNews Vector
        self._d2v = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, dm=dm, epochs=epochs,
                            pretrained=self.pretrained)
        # build vocabulary from corpus
        self._d2v.build_vocab(self.docs.get_tagged())

        # training does not start automatically, so it must be invoked explicitly
        self._d2v.train(self.docs.get_tagged(), total_examples=self._d2v.corpus_count, epochs=epochs)

        # list document embeddings by order of their tags
        self._d2v_embedding = np.stack([self._d2v.docvecs[index] for index in range(len(self.docs.get_tagged()))])
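One more compatibility note: docvecs was renamed to dv in gensim 4.x, so under that assumption the embedding lookup in the last line becomes:

        # gensim >= 4.0 exposes document vectors on model.dv instead of model.docvecs
        self._d2v_embedding = np.stack([self._d2v.dv[index] for index in range(len(self.docs.get_tagged()))])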