Python gensim.models.doc2vec.TaggedDocument() Examples
The following are 27 code examples of gensim.models.doc2vec.TaggedDocument().
Each example comes from an open-source project; the line above each snippet names the source file, project, and license. You may also want to check out the other functions and classes available in the gensim.models.doc2vec module.
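Before the project examples, here is a minimal, self-contained sketch of how TaggedDocument is typically paired with Doc2Vec. The toy corpus and parameter values are purely illustrative and are not taken from any of the projects below.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# Illustrative toy corpus; each document gets a unique integer tag.
raw_texts = [
    "Doc2Vec learns a fixed-length vector for every tagged document.",
    "TaggedDocument pairs a token list with one or more tags.",
]
corpus = [TaggedDocument(simple_preprocess(text), [i])
          for i, text in enumerate(raw_texts)]

# Small parameters keep this runnable as a toy example.
model = Doc2Vec(corpus, vector_size=50, min_count=1, epochs=10)

# Trained vector for tag 0 (model.dv[0] in gensim 4.x), plus an
# inferred vector for unseen text.
print(model.docvecs[0])
print(model.infer_vector(simple_preprocess("an unseen document")))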
Example #1
Source File: paragraph_vectors.py From sato with Apache License 2.0 | 7 votes |
def tagcol_paragraph_embeddings_features(train_data):
    # Expects a dataframe with a 'values' column
    train_data_values = train_data['values']
    columns = [TaggedDocument(random.sample(col, min(1000, len(col))), [i])
               for i, col in enumerate(train_data_values.values)]
    return columns

# Input: returned tagged document collection from tagcol_paragraph_embeddings_features
# Only needed for training.
Example #2
Source File: doc2vec.py From asreview with Apache License 2.0 | 7 votes |
def fit(self, texts):
    model_param = {
        "vector_size": self.vector_size,
        "epochs": self.epochs,
        "min_count": self.min_count,
        "workers": self.n_jobs,
        "window": self.window,
        "dm_concat": self.dm_concat,
        "dbow_words": self.dbow_words,
    }
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    # If self.dm is 2, train both models and concatenate the feature
    # vectors later. Resulting vector size should be the same.
    if self.dm == 2:
        model_param["vector_size"] = int(model_param["vector_size"]/2)
        self.model_dm = _train_model(corpus, **model_param, dm=1)
        self.model_dbow = _train_model(corpus, **model_param, dm=0)
    else:
        self.model = _train_model(corpus, **model_param, dm=self.dm)
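The _train_model helper called above is not included in this snippet. A minimal sketch of what such a helper could look like follows; it is hypothetical and not the actual asreview implementation, assuming only that the keyword arguments map onto standard Doc2Vec parameters.

from gensim.models.doc2vec import Doc2Vec

def _train_model(corpus, **model_param):
    # Hypothetical helper: build a Doc2Vec model from the given
    # parameters, grow its vocabulary, and train it on the corpus.
    model = Doc2Vec(**model_param)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model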
Example #3
Source File: graph2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graphs):
    """
    Fitting a Graph2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, self.attributed) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example #4
Source File: parseundp.py From Semantic-Search-for-Sustainable-Development with Apache License 2.0 | 6 votes |
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:  # ensure file is an english txt file
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words:  # count the number of words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    if count < len(words)/2 and len(words) > 10:  # exclude lines in which 1/2 the words have less
                        yield(doc2vec.TaggedDocument(words, [i]))  # than 3 characters or have less than 10 words
                        i += 1
    if targets:
        for key, val in targets.items():
            yield(doc2vec.TaggedDocument(simple_preprocess(val), [i]))
            i += 1
Example #5
Source File: helper.py From diff2vec with GNU General Public License v3.0 | 6 votes |
def process_non_pooled_model_data(walks, counts, args):
    """
    Function to extract proximity statistics.
    :param walks: Diffusion lists.
    :param counts: Number of nodes.
    :param args: Arguments objects.
    :return docs: Processed walks.
    """
    print("Run feature extraction across windows.")
    features = {str(node): [] for node in range(counts)}
    for walk in tqdm(walks):
        for i in range(len(walk)-args.window_size):
            for j in range(1, args.window_size+1):
                features[walk[i]].append(["+"+str(j)+"_"+walk[i+j]])
                features[walk[i+j]].append(["_"+str(j)+"_"+walk[i]])

    docs = [TaggedDocument(words=[x[0] for x in v], tags=[str(k)]) for k, v in features.items()]
    return docs
Example #6
Source File: run_doc2vec.py From KATE with BSD 3-Clause "New" or "Revised" License | 6 votes |
def train(args):
    vocab = load_json(args.vocab)
    # import pdb;pdb.set_trace()
    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)
    d2v = MyDoc2Vec(args.n_dim, window=args.window_size, \
                    negative=args.negative, epoches=args.n_epoch, dm_concat=1)

    start = timeit.default_timer()
    d2v.train(corpus_iter)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    save_doc2vec(d2v.model, args.save_model)
    import pdb;pdb.set_trace()
Example #7
Source File: role2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def _create_documents(self, walks, features):
    """
    Accumulating the WL feature in neighbourhoods.

    Arg types:
        * **walks** *(list of lists)* - Random walks with string ids.

    Return types:
        * **new_features** *(list of TaggedDocument objects)* - The pooled features of nodes.
    """
    new_features = {node: [] for node, feature in features.items()}
    walks = self._transform_walks(walks)
    for walk in walks:
        for i in range(self.walk_length-self.window_size):
            for j in range(self.window_size):
                source = walk[i]
                target = walk[i+j]
                new_features[source].append(features[target])
                new_features[target].append(features[source])

    new_features = {node: [feature for features in new_features[node] for feature in features] for node, _ in new_features.items()}
    new_features = [TaggedDocument(words=feature, tags=[str(node)]) for node, feature in new_features.items()]
    return new_features
Example #8
Source File: gl2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graphs):
    """
    Fitting a GL2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    graphs = [self._create_line_graph(graph) for graph in graphs]
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, False) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example #9
Source File: doc2vec.py From asreview with Apache License 2.0 | 5 votes |
def transform(self, texts):
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    if self.dm == 2:
        X_dm = _transform_text(self.model_dm, corpus)
        X_dbow = _transform_text(self.model_dbow, corpus)
        X = np.concatenate((X_dm, X_dbow), axis=1)
    else:
        X = _transform_text(self.model, corpus)
    return X
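The _transform_text helper used here is also not part of this snippet. A plausible minimal sketch, hypothetical rather than the asreview code, would infer one vector per tagged document and stack the results into a feature matrix:

import numpy as np

def _transform_text(model, corpus):
    # Hypothetical helper: infer a vector for each TaggedDocument's
    # token list and return them as a 2-D array (n_docs x vector_size).
    return np.array([model.infer_vector(doc.words) for doc in corpus])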
Example #10
Source File: graph2vec.py From graph2vec with GNU General Public License v3.0 | 5 votes |
def feature_extractor(path, rounds):
    """
    Function to extract WL features from a graph.
    :param path: The path to the graph json.
    :param rounds: Number of WL iterations.
    :return doc: Document collection object.
    """
    graph, features, name = dataset_reader(path)
    machine = WeisfeilerLehmanMachine(graph, features, rounds)
    doc = TaggedDocument(words=machine.extracted_features, tags=["g_" + name])
    return doc
Example #11
Source File: echoDoc0.1.py From EchoBurst with MIT License | 5 votes |
def __iter__(self):
    deck = []
    for line in open(self.filename, encoding="utf-8"):
        deck.append(line)
        if len(deck) >= 10000000:
            shuffle(deck)
            for card in deck:
                csv = card.split(",")
                subreddit = csv[0]
                body = csv[1].split()
                yield TaggedDocument(words=body, tags=[subreddit, clusterLabel[subreddit]])
            deck = []
Example #12
Source File: document_sequence.py From fake-news-detection-pipeline with Apache License 2.0 | 5 votes |
def _set_tagged(self):
    """set self._set_tagged to list[TaggedDocument]
    each TaggedDocument has a tag of [index]"""
    print("listing tagged documents in memory")
    self._tagged = [TaggedDocument(doc, tags=[index]) for index, doc in enumerate(self._tokenized)]
Example #13
Source File: graph2vec.py From cogdl with MIT License | 5 votes |
def feature_extractor(data, rounds, name):
    graph = nx.from_edgelist(np.array(data.edge_index.T.cpu(), dtype=int))
    if data.x is not None:
        feature = {int(key): str(val) for key, val in enumerate(np.array(data.x.cpu()))}
    else:
        feature = dict(nx.degree(graph))
    graph_wl_features = Graph2Vec.wl_iterations(graph, feature, rounds)
    doc = TaggedDocument(words=graph_wl_features, tags=["g_" + name])
    return doc
Example #14
Source File: doc2vec_sentiment.py From textlytics with MIT License | 5 votes |
def to_array(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    TaggedDocument(utils.to_unicode(line).split(),
                                   [prefix + '_%s' % item_no]))
    return self.sentences
Example #15
Source File: utils.py From role2vec with GNU General Public License v3.0 | 5 votes |
def create_documents(features):
    """
    Created tagged documents object from a dictionary.
    :param features: Keys are document ids and values are strings of the document.
    :return docs: List of tagged documents.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs
Example #16
Source File: doc2vec_sentiment.py From textlytics with MIT License | 5 votes |
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield TaggedDocument(utils.to_unicode(line).split(),
                                     [prefix + '_%s' % item_no])
Example #17
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #18
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #19
Source File: doc2vec.py From vec4ir with MIT License | 5 votes |
def fit(self, docs, y):
    assert len(docs) == len(y)
    model = self.model
    n_epochs = self.n_epochs
    verbose = self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs
    X = [TaggedDocument(self.analyzer(doc), [label])
         for doc, label in zip(docs, y)]

    if verbose > 0:
        print("First 3 tagged documents:\n", X[:3])
        print("Training doc2vec model")
    # d2v = Doc2Vec()
    # d2v.build_vocab(X)
    # if self.intersect is not None:
    #     d2v.intersect_word2vec_format(self.intersect)
    model.build_vocab(X)
    for epoch in range(n_epochs):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
        model.train(X)
        model.alpha -= decay  # apply global decay
        model.min_alpha = model.alpha  # but no decay inside one epoch

    if verbose > 0:
        print("Finished.")
        print("model:", self.model)

    if self._matching:
        self._matching.fit(docs)
    else:
        # if we dont do matching, its enough to fit a nearest neighbors on
        # all centroids before query time
        dvs = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(dvs)

    self._y = y
    return self
Example #20
Source File: features_nn.py From Semantic-Texual-Similarity-Toolkits with MIT License | 5 votes |
def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

    model = Doc2Vec(sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])

    return features, infos

# def load_instances(self, train_instances):
#     """
#     extract cosine distance from already trained feature file
#     without modify the feature_file
#     this function's priority is higher that the above extract_instances
#     """
#     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
#     features = []
#     infos = []
#     ''' get features from train instances'''
#     for _feature in _features:
#         feature = Feature._feat_string_to_list(_feature, _n_dim)
#         features.append([feature[1]])
#         infos.append(['cosine'])
#
#     features = [Feature._feat_list_to_string(feature) for feature in features]
#
#     return features, 1, _n_instance
Example #21
Source File: build_doc2vec_trainingset.py From altair with Apache License 2.0 | 5 votes |
def main(script_folder, output_folder, min_script_len, max_total_files, max_per_pkl):

    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_total_files: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files: break
                if counter != 0 and counter % 50000 == 0:
                    logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info("Saving pickle file of tagged documents for size %d", max_per_pkl)
                    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False

    logger.info("Saving final pickle file of tagged documents for size %d", max_per_pkl)
    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))

# Run this when called from CLI
Example #22
Source File: build_doc2vec_model.py From altair with Apache License 2.0 | 5 votes |
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size, window,
         min_count, alpha, max_script_count, min_script_len, negative):

    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        if counter % 100000 == 0:
            logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    #logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI
Example #23
Source File: run_doc2vec.py From KATE with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test(args):
    vocab = load_json(args.vocab)
    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)
    d2v = load_doc2vec(args.load_model)
    doc_codes = predict(d2v, corpus_iter)
    dump_json(doc_codes, args.output)

    import pdb;pdb.set_trace()
Example #24
Source File: sent_utils.py From embedding with MIT License | 5 votes |
def __iter__(self):
    with open(self.fname, encoding='utf-8') as f:
        for line in f:
            try:
                sentence, movie_id = line.strip().split("\u241E")
                tokens = self.tokenizer.morphs(sentence)
                tagged_doc = TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id])
                yield tagged_doc
            except:
                continue
Example #25
Source File: utils.py From MUSAE with GNU General Public License v3.0 | 5 votes |
def create_documents(features):
    """
    From a feature hash create a list of TaggedDocuments.
    :param features: Feature hash table - keys are nodes, values are feature lists.
    :return docs: Tagged Documents list.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs
Example #26
Source File: musae.py From karateclub with GNU General Public License v3.0 | 5 votes |
def _create_base_docs(self):
    features_out = [TaggedDocument(words=[str(feature) for feature in features], tags=[str(node)])
                    for node, features in self.features.items()]
    return features_out
Example #27
Source File: musae.py From karateclub with GNU General Public License v3.0 | 5 votes |
def _create_documents(self, features):
    features_out = [TaggedDocument(words=[str(feat) for feat_elems in feature_set for feat in feat_elems], tags=[str(node)])
                    for node, feature_set in features.items()]
    return features_out