Python gensim.models.Doc2Vec() Examples
The following are 9 code examples of gensim.models.Doc2Vec(), collected from open-source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the gensim.models module.
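Before the examples, a minimal sketch of typical gensim Doc2Vec usage may help; the toy corpus, tags, and hyperparameter values below are illustrative only:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus: each document is a list of tokens plus a unique tag.
corpus = [TaggedDocument(words=["machine", "learning", "is", "fun"], tags=["doc_0"]),
          TaggedDocument(words=["doc2vec", "learns", "document", "vectors"], tags=["doc_1"])]

# gensim 4.x parameter names; older releases used size/iter instead of vector_size/epochs.
model = Doc2Vec(corpus, vector_size=50, window=2, min_count=1, epochs=40)

vector = model.infer_vector(["document", "vectors", "are", "useful"])  # embed unseen text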
Example #1
Source File: embedding_trainer.py From kaggle-HomeDepot with MIT License
def train_word2vec_model(df, columns):
    model_param = {
        "alpha": config.EMBEDDING_ALPHA,
        "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY,
        "n_epoch": config.EMBEDDING_N_EPOCH,
        "sg": 1,
        "hs": 1,
        "min_count": config.EMBEDDING_MIN_COUNT,
        "size": config.EMBEDDING_DIM,
        "sample": 0.001,
        "window": config.EMBEDDING_WINDOW,
        "workers": config.EMBEDDING_WORKERS,
    }
    model_dir = config.WORD2VEC_MODEL_DIR
    model_name = "Homedepot-word2vec-D%d-min_count%d.model" % (
        model_param["size"], model_param["min_count"])

    word2vec = DataFrameWord2Vec(df, columns, model_param)
    word2vec.train()
    word2vec.save(model_dir, model_name)


#---------------------- Doc2Vec ----------------------
Example #2
Source File: diffusion_2_vec.py From diff2vec with GNU General Public License v3.0
def learn_non_pooled_embeddings(walks, counts, args):
    """
    Method to learn an embedding given the sequences and arguments.
    :param walks: Linear vertex sequences.
    :param counts: Number of nodes.
    :param args: Arguments.
    """
    walks = process_non_pooled_model_data(walks, counts, args)
    model = Doc2Vec(walks,
                    size=args.dimensions,
                    window=0,
                    dm=0,
                    alpha=args.alpha,
                    iter=args.iter,
                    workers=args.workers)
    save_embedding(args, model, counts)
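This example uses the pre-4.0 keyword arguments size and iter. Under gensim 4.x the same call would look roughly like the following sketch (argument values carried over from the example above):

model = Doc2Vec(walks,
                vector_size=args.dimensions,  # renamed from `size` in gensim 4.0
                window=0,
                dm=0,
                alpha=args.alpha,
                epochs=args.iter,             # renamed from `iter` in gensim 4.0
                workers=args.workers)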
Example #3
Source File: embedding_trainer.py From kaggle-HomeDepot with MIT License
def __init__(self, df, columns, model_param):
    super().__init__(df, columns, model_param)
    self.model = Doc2Vec(dm=self.model_param["dm"],
                         hs=self.model_param["hs"],
                         alpha=self.model_param["alpha"],
                         min_alpha=self.model_param["alpha"],
                         min_count=self.model_param["min_count"],
                         size=self.model_param["size"],
                         sample=self.model_param["sample"],
                         window=self.model_param["window"],
                         workers=self.model_param["workers"])
Example #4
Source File: sent_utils.py From embedding with MIT License
def doc2vec(corpus_fname, output_fname):
    make_save_path(output_fname)
    corpus = Doc2VecInput(corpus_fname)
    model = Doc2Vec(corpus, vector_size=100)
    model.save(output_fname)
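Doc2VecInput is defined elsewhere in the embedding project. As a rough sketch, such a streaming corpus class typically yields one TaggedDocument per input line; the whitespace-tokenized, line-per-document format here is an assumption, not the project's actual implementation:

from gensim.models.doc2vec import TaggedDocument

class Doc2VecInput:
    """Hypothetical corpus iterable: one whitespace-tokenized document per line."""
    def __init__(self, corpus_fname):
        self.corpus_fname = corpus_fname

    def __iter__(self):
        with open(self.corpus_fname, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield TaggedDocument(words=line.strip().split(), tags=[idx])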
Example #5
Source File: features_nn.py From Semantic-Texual-Similarity-Toolkits with MIT License
def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

    model = Doc2Vec(sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])

    return features, infos

# def load_instances(self, train_instances):
#     """
#     extract cosine distance from already trained feature file
#     without modify the feature_file
#     this function's priority is higher that the above extract_instances
#     """
#     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
#     features = []
#     infos = []
#     ''' get features from train instances'''
#     for _feature in _features:
#         feature = Feature._feat_string_to_list(_feature, _n_dim)
#         features.append([feature[1]])
#         infos.append(['cosine'])
#
#     features = [Feature._feat_list_to_string(feature) for feature in features]
#
#     return features, 1, _n_instance
Example #6
Source File: doc2vec.py From vec4ir with MIT License
def __init__(self, analyzer=None, matching=None, name=None, verbose=0, n_epochs=10,
             alpha=0.25, min_alpha=0.05, n_jobs=4, **kwargs):
    # self.model = model
    self.alpha = alpha
    self.min_alpha = min_alpha
    self.verbose = verbose
    self.name = "paragraph-vectors" if name is None else name

    if matching is True:
        self._matching = Matching()
    elif matching is False or matching is None:
        self._matching = None
    else:
        self._matching = Matching(**dict(matching))

    self.analyzer = analyzer
    self.model = Doc2Vec(alpha=alpha,
                         min_alpha=alpha,
                         size=500,
                         window=8,
                         min_count=1,
                         sample=1e-5,
                         workers=n_jobs,
                         negative=20,
                         dm=0, dbow_words=1,  # words only with dm!=0?
                         dm_mean=0,  # unused when in concat mode
                         dm_concat=1,
                         dm_tag_count=1)
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
Example #7
Source File: doc2vec.py From vec4ir with MIT License
def fit(self, docs, y):
    assert len(docs) == len(y)
    model = self.model
    n_epochs = self.n_epochs
    verbose = self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs
    X = [TaggedDocument(self.analyzer(doc), [label])
         for doc, label in zip(docs, y)]

    if verbose > 0:
        print("First 3 tagged documents:\n", X[:3])
        print("Training doc2vec model")
    # d2v = Doc2Vec()
    # d2v.build_vocab(X)
    # if self.intersect is not None:
    #     d2v.intersect_word2vec_format(self.intersect)
    model.build_vocab(X)
    for epoch in range(n_epochs):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
        model.train(X)
        model.alpha -= decay  # apply global decay
        model.min_alpha = model.alpha  # but no decay inside one epoch

    if verbose > 0:
        print("Finished.")
        print("model:", self.model)

    if self._matching:
        self._matching.fit(docs)
    else:
        # if we dont do matching, its enough to fit a nearest neighbors on
        # all centroids before query time
        dvs = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(dvs)

    self._y = y

    return self
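The manual epoch loop with explicit alpha decay above targets an older gensim API. In gensim 4.x, train() requires total_examples and epochs, and the learning-rate decay from alpha to min_alpha is handled internally, so the loop collapses to a sketch like:

model.build_vocab(X)
model.train(X, total_examples=len(X), epochs=n_epochs)  # alpha decay handled by gensim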
Example #8
Source File: document_embedder.py From fake-news-detection-pipeline with Apache License 2.0
def __init__(self, docs: DocumentSequence, pretrained_word2vec=None):
    """ This class features interfaces to different methods of computing document embeddings.
        Supported embedding mechanisms are:
            Doc2Vec: see self.get_doc2vec()
            Naive Doc2Vec: see self.get_naive_doc2vec()
            One-Hot Sum: see self.get_onehot()
            Attention is all you need: to be implemented
            FastText: to be implemented
    :param docs: a DocumentSequence instance
    :param pretrained_word2vec: path to a pretrained word2vec model, in .bin format
    """
    self.docs = docs
    self.pretrained = pretrained_word2vec
Example #9
Source File: document_embedder.py From fake-news-detection-pipeline with Apache License 2.0
def _set_doc2vec(self, vector_size=300, window=5, min_count=5, dm=1, epochs=20):
    # instantiate a Doc2Vec model, setting pretrained GoogleNews Vector
    self._d2v = Doc2Vec(vector_size=vector_size,
                        window=window,
                        min_count=min_count,
                        dm=dm,
                        epochs=epochs,
                        pretrained=self.pretrained)

    # build vocabulary from corpus
    self._d2v.build_vocab(self.docs.get_tagged())

    # somehow, the training won't start automatically, and must be manually started
    self._d2v.train(self.docs.get_tagged(),
                    total_examples=self._d2v.corpus_count,
                    epochs=epochs)

    # list document embeddings by order of their tags
    self._d2v_embedding = np.stack(self._d2v.docvecs[index]
                                   for index in range(len(self.docs.get_tagged())))
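Note that the pretrained keyword passed to Doc2Vec above is not part of stock gensim; the project appears to rely on a patched or extended gensim build to initialize from pretrained GoogleNews vectors. With stock gensim 4.x, training and collecting the document vectors might look like this sketch, where docs_tagged stands in for self.docs.get_tagged():

from gensim.models.doc2vec import Doc2Vec

# docs_tagged: a list of TaggedDocument objects, as produced by the corpus above.
model = Doc2Vec(docs_tagged, vector_size=300, window=5, min_count=5, dm=1, epochs=20)
embeddings = model.dv.vectors  # array of shape (n_docs, vector_size)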