Python gensim.models.doc2vec.TaggedDocument() Examples
The following are 27 code examples of gensim.models.doc2vec.TaggedDocument().
Each example comes from an open-source project; the line above each snippet names the source file, project, and license. You may also want to check out the other functions and classes available in the gensim.models.doc2vec module.
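Before the project examples, here is a minimal, self-contained sketch of how TaggedDocument is typically paired with Doc2Vec. The toy corpus and parameter values are purely illustrative and are not taken from any of the projects below.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# Illustrative toy corpus; each document gets a unique integer tag.
raw_texts = [
    "Doc2Vec learns a fixed-length vector for every tagged document.",
    "TaggedDocument pairs a token list with one or more tags.",
]
corpus = [TaggedDocument(simple_preprocess(text), [i])
          for i, text in enumerate(raw_texts)]

# Small parameters keep this runnable as a toy example.
model = Doc2Vec(corpus, vector_size=50, min_count=1, epochs=10)

# Trained vector for tag 0 (model.dv[0] in gensim 4.x), plus an
# inferred vector for unseen text.
print(model.docvecs[0])
print(model.infer_vector(simple_preprocess("an unseen document")))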
Example #1
Source File: paragraph_vectors.py From sato with Apache License 2.0 | 7 votes |
def tagcol_paragraph_embeddings_features(train_data):
    # Expects a dataframe with a 'values' column
    train_data_values = train_data['values']
    columns = [TaggedDocument(random.sample(col, min(1000, len(col))), [i])
               for i, col in enumerate(train_data_values.values)]
    return columns

# Input: returned tagged document collection from tagcol_paragraph_embeddings_features
# Only needed for training.
Example #2
Source File: doc2vec.py From asreview with Apache License 2.0 | 7 votes |
def fit(self, texts):
    model_param = {
        "vector_size": self.vector_size,
        "epochs": self.epochs,
        "min_count": self.min_count,
        "workers": self.n_jobs,
        "window": self.window,
        "dm_concat": self.dm_concat,
        "dbow_words": self.dbow_words,
    }
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    # If self.dm is 2, train both models and concatenate the feature
    # vectors later. Resulting vector size should be the same.
    if self.dm == 2:
        model_param["vector_size"] = int(model_param["vector_size"]/2)
        self.model_dm = _train_model(corpus, **model_param, dm=1)
        self.model_dbow = _train_model(corpus, **model_param, dm=0)
    else:
        self.model = _train_model(corpus, **model_param, dm=self.dm)
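The _train_model helper called above is not included in this snippet. A minimal sketch of what such a helper could look like follows; it is hypothetical and not the actual asreview implementation, assuming only that the keyword arguments map onto standard Doc2Vec parameters.

from gensim.models.doc2vec import Doc2Vec

def _train_model(corpus, **model_param):
    # Hypothetical helper: build a Doc2Vec model from the given
    # parameters, grow its vocabulary, and train it on the corpus.
    model = Doc2Vec(**model_param)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model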
Example #3
Source File: graph2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graphs):
    """
    Fitting a Graph2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, self.attributed) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example #4
Source File: parseundp.py From Semantic-Search-for-Sustainable-Development with Apache License 2.0 | 6 votes |
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:  # ensure file is an english txt file
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words:  # count the number of words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    if count < len(words)/2 and len(words) > 10:  # exclude lines in which 1/2 the words have less
                        yield(doc2vec.TaggedDocument(words, [i]))  # than 3 characters or have less than 10 words
                        i += 1
    if targets:
        for key, val in targets.items():
            yield(doc2vec.TaggedDocument(simple_preprocess(val), [i]))
            i += 1
Example #5
Source File: helper.py From diff2vec with GNU General Public License v3.0 | 6 votes |
def process_non_pooled_model_data(walks, counts, args):
    """
    Function to extract proximity statistics.
    :param walks: Diffusion lists.
    :param counts: Number of nodes.
    :param args: Arguments objects.
    :return docs: Processed walks.
    """
    print("Run feature extraction across windows.")
    features = {str(node): [] for node in range(counts)}
    for walk in tqdm(walks):
        for i in range(len(walk)-args.window_size):
            for j in range(1, args.window_size+1):
                features[walk[i]].append(["+"+str(j)+"_"+walk[i+j]])
                features[walk[i+j]].append(["_"+str(j)+"_"+walk[i]])

    docs = [TaggedDocument(words=[x[0] for x in v], tags=[str(k)]) for k, v in features.items()]
    return docs
Example #6
Source File: run_doc2vec.py From KATE with BSD 3-Clause "New" or "Revised" License | 6 votes |
def train(args):
    vocab = load_json(args.vocab)
    # import pdb;pdb.set_trace()
    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)
    d2v = MyDoc2Vec(args.n_dim, window=args.window_size, \
                    negative=args.negative, epoches=args.n_epoch, dm_concat=1)

    start = timeit.default_timer()
    d2v.train(corpus_iter)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    save_doc2vec(d2v.model, args.save_model)
    import pdb;pdb.set_trace()
Example #7
Source File: role2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def _create_documents(self, walks, features):
    """
    Accumulating the WL feature in neighbourhoods.

    Arg types:
        * **walks** *(list of lists)* - Random walks with string ids.

    Return types:
        * **new_features** *(list of TaggedDocument objects)* - The pooled features of nodes.
    """
    new_features = {node: [] for node, feature in features.items()}
    walks = self._transform_walks(walks)
    for walk in walks:
        for i in range(self.walk_length-self.window_size):
            for j in range(self.window_size):
                source = walk[i]
                target = walk[i+j]
                new_features[source].append(features[target])
                new_features[target].append(features[source])

    new_features = {node: [feature for features in new_features[node] for feature in features] for node, _ in new_features.items()}
    new_features = [TaggedDocument(words=feature, tags=[str(node)]) for node, feature in new_features.items()]
    return new_features
Example #8
Source File: gl2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graphs):
    """
    Fitting a GL2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    graphs = [self._create_line_graph(graph) for graph in graphs]
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, False) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example #9
Source File: doc2vec.py From asreview with Apache License 2.0 | 5 votes |
def transform(self, texts):
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    if self.dm == 2:
        X_dm = _transform_text(self.model_dm, corpus)
        X_dbow = _transform_text(self.model_dbow, corpus)
        X = np.concatenate((X_dm, X_dbow), axis=1)
    else:
        X = _transform_text(self.model, corpus)
    return X
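The _transform_text helper used here is also not part of this snippet. A plausible minimal sketch, hypothetical rather than the asreview code, would infer one vector per tagged document and stack the results into a feature matrix:

import numpy as np

def _transform_text(model, corpus):
    # Hypothetical helper: infer a vector for each TaggedDocument's
    # token list and return them as a 2-D array (n_docs x vector_size).
    return np.array([model.infer_vector(doc.words) for doc in corpus])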
Example #10
Source File: graph2vec.py From graph2vec with GNU General Public License v3.0 | 5 votes |
def feature_extractor(path, rounds):
    """
    Function to extract WL features from a graph.
    :param path: The path to the graph json.
    :param rounds: Number of WL iterations.
    :return doc: Document collection object.
    """
    graph, features, name = dataset_reader(path)
    machine = WeisfeilerLehmanMachine(graph, features, rounds)
    doc = TaggedDocument(words=machine.extracted_features, tags=["g_" + name])
    return doc
Example #11
Source File: echoDoc0.1.py From EchoBurst with MIT License | 5 votes |
def __iter__(self):
    deck = []
    for line in open(self.filename, encoding="utf-8"):
        deck.append(line)
        if len(deck) >= 10000000:
            shuffle(deck)
            for card in deck:
                csv = card.split(",")
                subreddit = csv[0]
                body = csv[1].split()
                yield TaggedDocument(words=body, tags=[subreddit, clusterLabel[subreddit]])
            deck = []
Example #12
Source File: document_sequence.py From fake-news-detection-pipeline with Apache License 2.0 | 5 votes |
def _set_tagged(self):
    """set self._set_tagged to list[TaggedDocument]
    each TaggedDocument has a tag of [index]"""
    print("listing tagged documents in memory")
    self._tagged = [TaggedDocument(doc, tags=[index]) for index, doc in enumerate(self._tokenized)]
Example #13
Source File: graph2vec.py From cogdl with MIT License | 5 votes |
def feature_extractor(data, rounds, name):
    graph = nx.from_edgelist(np.array(data.edge_index.T.cpu(), dtype=int))
    if data.x is not None:
        feature = {int(key): str(val) for key, val in enumerate(np.array(data.x.cpu()))}
    else:
        feature = dict(nx.degree(graph))
    graph_wl_features = Graph2Vec.wl_iterations(graph, feature, rounds)
    doc = TaggedDocument(words=graph_wl_features, tags=["g_" + name])
    return doc
Example #14
Source File: doc2vec_sentiment.py From textlytics with MIT License | 5 votes |
def to_array(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    TaggedDocument(utils.to_unicode(line).split(),
                                   [prefix + '_%s' % item_no]))
    return self.sentences
Example #15
Source File: utils.py From role2vec with GNU General Public License v3.0 | 5 votes |
def create_documents(features):
    """
    Created tagged documents object from a dictionary.
    :param features: Keys are document ids and values are strings of the document.
    :return docs: List of tagged documents.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs
Example #16
Source File: doc2vec_sentiment.py From textlytics with MIT License | 5 votes |
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield TaggedDocument(utils.to_unicode(line).split(),
                                     [prefix + '_%s' % item_no])
Example #17
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #18
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #19
Source File: doc2vec.py From vec4ir with MIT License | 5 votes |
def fit(self, docs, y):
    assert len(docs) == len(y)
    model = self.model
    n_epochs = self.n_epochs
    verbose = self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs
    X = [TaggedDocument(self.analyzer(doc), [label])
         for doc, label in zip(docs, y)]

    if verbose > 0:
        print("First 3 tagged documents:\n", X[:3])
        print("Training doc2vec model")
    # d2v = Doc2Vec()
    # d2v.build_vocab(X)
    # if self.intersect is not None:
    #     d2v.intersect_word2vec_format(self.intersect)
    model.build_vocab(X)
    for epoch in range(n_epochs):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
        model.train(X)
        model.alpha -= decay  # apply global decay
        model.min_alpha = model.alpha  # but no decay inside one epoch

    if verbose > 0:
        print("Finished.")
        print("model:", self.model)

    if self._matching:
        self._matching.fit(docs)
    else:
        # if we dont do matching, its enough to fit a nearest neighbors on
        # all centroids before query time
        dvs = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(dvs)

    self._y = y
    return self
Example #20
Source File: features_nn.py From Semantic-Texual-Similarity-Toolkits with MIT License | 5 votes |
def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

    model = Doc2Vec(sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])

    return features, infos

# def load_instances(self, train_instances):
#     """
#     extract cosine distance from already trained feature file
#     without modify the feature_file
#     this function's priority is higher that the above extract_instances
#     """
#     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
#     features = []
#     infos = []
#     ''' get features from train instances'''
#     for _feature in _features:
#         feature = Feature._feat_string_to_list(_feature, _n_dim)
#         features.append([feature[1]])
#         infos.append(['cosine'])
#
#     features = [Feature._feat_list_to_string(feature) for feature in features]
#
#     return features, 1, _n_instance
Example #21
Source File: build_doc2vec_trainingset.py From altair with Apache License 2.0 | 5 votes |
def main(script_folder, output_folder, min_script_len, max_total_files, max_per_pkl):

    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_total_files: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files: break
                if counter != 0 and counter % 50000 == 0:
                    logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info("Saving pickle file of tagged documents for size %d", max_per_pkl)
                    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False

    logger.info("Saving final pickle file of tagged documents for size %d", max_per_pkl)
    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))

# Run this when called from CLI
Example #22
Source File: build_doc2vec_model.py From altair with Apache License 2.0 | 5 votes |
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size, window,
         min_count, alpha, max_script_count, min_script_len, negative):

    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        if counter % 100000 == 0:
            logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI
Example #23
Source File: run_doc2vec.py From KATE with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test(args):
    vocab = load_json(args.vocab)
    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)
    d2v = load_doc2vec(args.load_model)
    doc_codes = predict(d2v, corpus_iter)
    dump_json(doc_codes, args.output)

    import pdb;pdb.set_trace()
Example #24
Source File: sent_utils.py From embedding with MIT License | 5 votes |
def __iter__(self):
    with open(self.fname, encoding='utf-8') as f:
        for line in f:
            try:
                sentence, movie_id = line.strip().split("\u241E")
                tokens = self.tokenizer.morphs(sentence)
                tagged_doc = TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id])
                yield tagged_doc
            except:
                continue
Example #25
Source File: utils.py From MUSAE with GNU General Public License v3.0 | 5 votes |
def create_documents(features):
    """
    From a feature hash create a list of TaggedDocuments.
    :param features: Feature hash table - keys are nodes, values are feature lists.
    :return docs: Tagged Documents list.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs
Example #26
Source File: musae.py From karateclub with GNU General Public License v3.0 | 5 votes |
def _create_base_docs(self):
    features_out = [TaggedDocument(words=[str(feature) for feature in features], tags=[str(node)])
                    for node, features in self.features.items()]
    return features_out
Example #27
Source File: musae.py From karateclub with GNU General Public License v3.0 | 5 votes |
def _create_documents(self, features):
    features_out = [TaggedDocument(words=[str(feat) for feat_elems in feature_set for feat in feat_elems], tags=[str(node)])
                    for node, feature_set in features.items()]
    return features_out