Python gensim.models.doc2vec.Doc2Vec() Examples
The following are 18 code examples of gensim.models.doc2vec.Doc2Vec(), each taken from an open-source project. You can go to the original project by following the source file reference above each example. You may also want to check out the other functions and classes of the gensim.models.doc2vec module.
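Before the project excerpts, here is a minimal, self-contained sketch (not taken from any of the projects below) of training a Doc2Vec model on tagged documents and inferring a vector for unseen text. The toy corpus and all hyperparameter values are illustrative assumptions, not recommendations:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Toy corpus: each document is a token list wrapped in a TaggedDocument with a string tag.
corpus = [
    TaggedDocument(words=["machine", "learning", "with", "python"], tags=["0"]),
    TaggedDocument(words=["document", "embeddings", "for", "text"], tags=["1"]),
]

# Passing the documents to the constructor builds the vocabulary and trains the model.
# Tiny illustrative hyperparameters; real corpora need a larger vector_size and corpus.
model = Doc2Vec(corpus, vector_size=16, window=2, min_count=1, epochs=40)

# Look up a trained document vector by its tag, and infer a vector for new, unseen tokens.
trained_vec = model.docvecs["0"]
new_vec = model.infer_vector(["unseen", "python", "text"])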
Example #1
Source File: gl2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graphs): """ Fitting a GL2Vec model. Arg types: * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded. """ self._set_seed() self._check_graphs(graphs) graphs = [self._create_line_graph(graph) for graph in graphs] documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, False) for graph in graphs] documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)] model = Doc2Vec(documents, vector_size=self.dimensions, window=0, min_count=self.min_count, dm=0, sample=self.down_sampling, workers=self.workers, epochs=self.epochs, alpha=self.learning_rate, seed=self.seed) self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example #2
Source File: role2vec.py From role2vec with GNU General Public License v3.0 | 6 votes |
def create_embedding(self):
    """
    Fitting an embedding.
    """
    document_collections = create_documents(self.pooled_features)

    model = Doc2Vec(document_collections,
                    vector_size=self.args.dimensions,
                    window=0,
                    min_count=self.args.min_count,
                    alpha=self.args.alpha,
                    dm=0,
                    min_alpha=self.args.min_alpha,
                    sample=self.args.down_sampling,
                    workers=self.args.workers,
                    epochs=self.args.epochs)

    embedding = np.array([model.docvecs[str(node)] for node in self.graph.nodes()])
    return embedding
Example #3
Source File: doc2vec.py From broca with MIT License | 6 votes |
def train_doc2vec(paths, out='data/model.d2v', tokenizer=word_tokenize, sentences=False, **kwargs):
    """
    Train a doc2vec model on a list of files.
    """
    # Merge default parameters with any user-supplied kwargs.
    defaults = {
        'size': 400,
        'window': 8,
        'min_count': 2,
        'workers': 8
    }
    defaults.update(kwargs)
    kwargs = defaults

    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    print('Training doc2vec model...')
    m = Doc2Vec(_doc2vec_doc_stream(paths, n, tokenizer=tokenizer, sentences=sentences), **kwargs)

    print('Saving...')
    m.save(out)
Example #4
Source File: graph2vec.py From graph2vec with GNU General Public License v3.0 | 6 votes |
def main(args):
    """
    Main function to read the graph list, extract features, learn the embedding and save it.
    :param args: Object with the arguments.
    """
    graphs = glob.glob(args.input_path + "*.json")
    print("\nFeature extraction started.\n")
    document_collections = Parallel(n_jobs=args.workers)(delayed(feature_extractor)(g, args.wl_iterations) for g in tqdm(graphs))
    print("\nOptimization started.\n")

    model = Doc2Vec(document_collections,
                    vector_size=args.dimensions,
                    window=0,
                    min_count=args.min_count,
                    dm=0,
                    sample=args.down_sampling,
                    workers=args.workers,
                    epochs=args.epochs,
                    alpha=args.learning_rate)

    save_embedding(args.output_path, model, graphs, args.dimensions)
Example #5
Source File: graph2vec.py From cogdl with MIT License | 6 votes |
def forward(self, graphs, **kwargs):
    if self.doc_collections is None:
        self.doc_collections = Parallel(n_jobs=self.worker)(
            delayed(Graph2Vec.feature_extractor)(graph, self.rounds, str(i))
            for i, graph in enumerate(graphs)
        )
    self.model = Doc2Vec(
        self.doc_collections,
        vector_size=self.dimension,
        window=self.window_size,
        min_count=self.min_count,
        dm=self.dm,
        sample=self.sampling_rate,
        workers=self.worker,
        epochs=self.epoch,
        alpha=self.lr
    )
    vectors = np.array([self.model["g_" + str(i)] for i in range(len(graphs))])
    return vectors, None
Example #6
Source File: doc2vec_model.py From doc2vec with MIT License | 6 votes |
def initialize_model(self, corpus):
    logging.info("Building Doc2Vec vocabulary")
    self.corpus = corpus
    self.model = doc2vec.Doc2Vec(
        min_count=1,         # ignore all words with total frequency lower than this
        window=10,           # maximum distance between the current and predicted word within a sentence
        vector_size=300,     # dimensionality of the generated feature vectors
        workers=5,           # number of worker threads used to train the model
        alpha=0.025,         # initial learning rate
        min_alpha=0.00025,   # learning rate will linearly drop to min_alpha as training progresses
        dm=1)                # training algorithm: dm=1 is 'distributed memory' (PV-DM),
                             # dm=0 is 'distributed bag of words' (PV-DBOW)
    self.model.build_vocab(self.corpus)
Example #7
Source File: musae.py From MUSAE with GNU General Public License v3.0 | 6 votes |
def _create_single_embedding(self, features):
    """
    Learning an embedding from a feature hash table.
    :param features: A hash table with node keys and feature list values.
    :return embedding: Numpy array of embedding.
    """
    print("\nLearning the embedding.")
    document_collections = create_documents(features)

    model = Doc2Vec(document_collections,
                    vector_size=self.args.dimensions,
                    window=0,
                    min_count=self.args.min_count,
                    alpha=self.args.alpha,
                    dm=0,
                    negative=self.args.negative_samples,
                    ns_exponent=self.args.exponent,
                    min_alpha=self.args.min_alpha,
                    sample=self.args.down_sampling,
                    workers=self.args.workers,
                    epochs=self.args.epochs)

    emb = np.array([model.docvecs[str(n)] for n in range(self.graph.number_of_nodes())])
    return emb
Example #8
Source File: graph2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graphs): """ Fitting a Graph2Vec model. Arg types: * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded. """ self._set_seed() self._check_graphs(graphs) documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, self.attributed) for graph in graphs] documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)] model = Doc2Vec(documents, vector_size=self.dimensions, window=0, min_count=self.min_count, dm=0, sample=self.down_sampling, workers=self.workers, epochs=self.epochs, alpha=self.learning_rate, seed=self.seed) self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example #9
Source File: transformer.py From atap with Apache License 2.0 | 5 votes |
def transform(self, documents):
    docs = [
        TaggedDocument(words, ['d{}'.format(idx)])
        for idx, words in enumerate(documents)
    ]
    model = Doc2Vec(docs, size=self.size, min_count=self.min_count)
    return np.array(list(model.docvecs))
Example #10
Source File: build_doc2vec_model.py From altair with Apache License 2.0 | 5 votes |
def build_doc2vec_model(doc2vec_tagged_documents, training_algorithm=2, num_cores=1, epochs=5,
                        vector_size=300, window=5, min_count=10, alpha=0.05, negative=0):
    '''
    Doc2Vec parameters
    dm_mean - 0 uses sum, 1 uses mean. Only applies when dm is used in non-concatenative mode.
    dm - defines the training algorithm. By default (dm=1), 'distributed memory' (PV-DM) is used.
        Otherwise, distributed bag of words (PV-DBOW) is employed.
    dbow_words - if set to 1, trains word-vectors (in skip-gram fashion) simultaneously with DBOW
        doc-vector training; default is 0 (faster training of doc-vectors only).
    dm_concat - if 1, use concatenation of context vectors rather than sum/average; default is 0 (off).
        Note concatenation results in a much larger model, as the input is no longer the size of one
        (sampled or arithmetically combined) word vector, but the size of the tag(s) and all words
        in the context strung together.
    dm_tag_count - expected constant number of document tags per document when using dm_concat mode; default is 1.
    trim_rule - vocabulary trimming rule, specifies whether certain words should remain.
    size - the dimensionality of the feature vectors.
    window - the maximum distance between the predicted word and context words used for prediction within a document.
    alpha - the initial learning rate (will linearly drop to zero as training progresses).
    min_count - ignore all words with total frequency lower than this.
    max_vocab_size - limit RAM during vocabulary building.
    sample - threshold for configuring which higher-frequency words are randomly downsampled;
        default is 0 (off), useful value is 1e-5.
    iter - number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5,
        but values of 10 or 20 are common in published 'Paragraph Vector' experiments.
    hs - if 1 (default), hierarchical sampling will be used for model training (else set to 0).
    negative - if > 0, negative sampling will be used; the int for negative specifies how many
        "noise words" should be drawn (usually between 5-20).
    '''
    # build Doc2Vec's vocab
    doc2vec_model = doc2vec.Doc2Vec(dm=training_algorithm, size=vector_size, sample=1e-5, window=window,
                                    min_count=min_count, iter=20, dbow_words=1, workers=num_cores,
                                    alpha=0.05, min_alpha=0.001, negative=negative)
    doc2vec_model.build_vocab(doc2vec_tagged_documents)

    # run training epochs while shuffling data and lowering learning rate (alpha)
    for i in range(epochs):
        logger.info("starting code epoch %d" % int(i+1))
        doc2vec_model.train(doc2vec_tagged_documents)
        doc2vec_model.alpha -= 0.002
        shuffle(doc2vec_tagged_documents)

    return doc2vec_model
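The constructor call above uses the older gensim parameter names documented in its docstring. In more recent gensim releases, size and iter were renamed to vector_size and epochs; a hedged sketch of the same call under the newer names (keeping the example's own variables and values):

# Sketch only: same configuration expressed with the newer gensim parameter names.
doc2vec_model = doc2vec.Doc2Vec(dm=training_algorithm, vector_size=vector_size, sample=1e-5,
                                window=window, min_count=min_count, epochs=20, dbow_words=1,
                                workers=num_cores, alpha=0.05, min_alpha=0.001, negative=negative)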
Example #11
Source File: build_doc2vec_model.py From altair with Apache License 2.0 | 5 votes |
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size,
         window, min_count, alpha, max_script_count, min_script_len, negative):

    doc2vec_tagged_documents = list()
    counter = 0

    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count:
            break
        if counter % 100000 == 0:
            logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False,
                                                    return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents, training_algorithm, num_cores, epochs,
                                        vector_size, window, min_count, alpha, negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI
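As the comments in the example note, keep_inference=True preserves the ability to call infer_vector on the stripped-down model. A hedged sketch of how such a pickled model might later be loaded and queried; the file name and token list are placeholders, not part of the project:

import pickle

# Load the model that main() pickled above (placeholder file name).
with open("doc2vec_model.pkl", "rb") as f:
    doc2vec_model = pickle.load(f)

# infer_vector still works because the model kept its inference state.
vector = doc2vec_model.infer_vector(["def", "tokenize", "text", "return", "tokens"])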
Example #12
Source File: build_doc2vec_model_from_training_set.py From altair with Apache License 2.0 | 5 votes |
def main(trainingset_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size,
         window, min_count, alpha, negative):

    doc2vec_model = doc2vec.Doc2Vec(dm=training_algorithm, size=vector_size, sample=1e-5, window=window,
                                    min_count=min_count, iter=20, dbow_words=1, workers=num_cores,
                                    alpha=0.05, min_alpha=0.001, negative=negative)

    doc2vec_tagged_documents = list()
    for trainingset in os.listdir(trainingset_folder):
        logger.info("starting training set %s" % trainingset)
        doc2vec_tagged_documents += pickle.load(open(os.path.join(trainingset_folder, trainingset), "rb"))
        #doc2vec_model = train_doc2vec_model(doc2vec_model, doc2vec_tagged_documents, epochs)

    # build Doc2Vec's vocab
    logger.info("building vocabulary")
    doc2vec_model.build_vocab(doc2vec_tagged_documents)

    # run training epochs while shuffling data and lowering learning rate (alpha)
    for i in range(epochs):
        logger.info("starting code epoch %d" % int(i+1))
        doc2vec_model.train(doc2vec_tagged_documents)
        doc2vec_model.alpha -= 0.002
        shuffle(doc2vec_tagged_documents)

    #logger.info("saving model pickle for %s" % trainingset)
    #pickle.dump(doc2vec_model, open(model_pickle_filename[:-4]+"_"+str(int(time.time()))+os.path.splitext(model_pickle_filename)[1], "wb"))
    #doc2vec_model.alpha = 0.05
    #in_loop = True

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI
Example #13
Source File: vectorization.py From atap with Apache License 2.0 | 5 votes |
def gensim_doc2vec_vectorize(corpus):
    from gensim.models.doc2vec import TaggedDocument, Doc2Vec

    corpus = [list(tokenize(doc)) for doc in corpus]
    docs = [
        TaggedDocument(words, ['d{}'.format(idx)])
        for idx, words in enumerate(corpus)
    ]
    model = Doc2Vec(docs, size=5, min_count=0)
    return model.docvecs
Example #14
Source File: doc2vec_model.py From doc2vec with MIT License | 5 votes |
def train_model(self):
    logging.info("Training Doc2Vec model")
    # 10 epochs take around 10 minutes on my machine (i7);
    # if you have more time/computational power, make it 20
    for epoch in range(10):
        logging.info('Training iteration #{0}'.format(epoch))
        self.model.train(self.corpus,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)
        # shuffle the corpus
        random.shuffle(self.corpus)
        # decrease the learning rate
        self.model.alpha -= 0.0002
        # fix the learning rate, no decay
        self.model.min_alpha = self.model.alpha
Example #15
Source File: doc2vec_model.py From doc2vec with MIT License | 5 votes |
def get_vectors(self, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = self.model.docvecs[prefix]
    return vectors
Example #16
Source File: doc2vec_model.py From doc2vec with MIT License | 5 votes |
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i",
    where "i" is a dummy index of the review.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.LabeledSentence(v.split(), [label]))
    return labeled
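Note that doc2vec.LabeledSentence is a legacy name; later gensim releases expose the same structure as TaggedDocument. A hedged equivalent of the loop body above under the newer class would be:

# Same words/tags structure, using the newer TaggedDocument class.
labeled.append(doc2vec.TaggedDocument(words=v.split(), tags=[label]))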
Example #17
Source File: musae.py From karateclub with GNU General Public License v3.0 | 5 votes |
def _create_single_embedding(self, document_collections):
    model = Doc2Vec(document_collections,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    alpha=self.learning_rate,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    seed=self.seed)

    emb = np.array([model.docvecs[str(n)] for n in range(self.graph.number_of_nodes())])
    return emb
Example #18
Source File: role2vec.py From karateclub with GNU General Public License v3.0 | 5 votes |
def fit(self, graph): """ Fitting a Role2vec model. Arg types: * **graph** *(NetworkX graph)* - The graph to be embedded. """ self._set_seed() self._check_graph(graph) walker = RandomWalker(self.walk_length, self.walk_number) walker.do_walks(graph) hasher = WeisfeilerLehmanHashing(graph=graph, wl_iterations=self.wl_iterations, attributed=False) node_features = hasher.get_node_features() documents = self._create_documents(walker.walks, node_features) model = Doc2Vec(documents, vector_size=self.dimensions, window=0, min_count=self.min_count, dm=0, workers=self.workers, sample=self.down_sampling, epochs=self.epochs, alpha=self.learning_rate, seed=self.seed) self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]