Python gensim.models.Doc2Vec.load() Examples

The following are 15 code examples of gensim.models.Doc2Vec.load(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.Doc2Vec, or try the search function.
Example #1
Source File: dict_utils.py    From Semantic-Texual-Similarity-Toolkits with MIT License 6 votes vote down vote up
def load_idf_dict(self, dict_name='idf_dict'):
        """Return the cached IDF table, building it from disk on first use.

        The table is read from ``word-frequencies.txt`` under
        ``config.EX_DICT_DIR``.  The first line of that file is parsed as an
        integer total; every later line is split on whitespace into a
        ``word frequency`` pair.  Words whose frequency is below 10 are left
        out, and each remaining word maps to ``log2(total / frequency)``.

        :param dict_name: key under which the table is kept in
            ``self.dict_manager``.
        :return: the dict stored in ``self.dict_manager[dict_name]``.
        """
        # Fast path: the table has already been built and registered.
        if dict_name in self.dict_manager:
            return self.dict_manager[dict_name]

        file_name = config.EX_DICT_DIR + '/word-frequencies.txt'
        print('load dict from file %s \n' % file_name)

        idf_by_word = {}
        for line_no, raw_line in enumerate(utils.create_read_file(file_name)):
            if line_no == 0:
                # Header line: the total used as the numerator of every ratio.
                total = int(raw_line)
                continue
            word, count = raw_line.strip().split()
            count = float(count)
            if not count < 10:
                idf_by_word[word] = math.log(total / count) / math.log(2)

        self.dict_manager[dict_name] = idf_by_word
        return self.dict_manager[dict_name]
Example #2
Source File: document2vec.py    From Document2Vec with MIT License 5 votes vote down vote up
def load_from_pickle(self, filename):
        """
        Copy the state of a saved model onto this instance.

        The file is opened with ``Doc2Vec.load`` (the original note describes
        it as a pretrained Word2Vec file).  Every attribute name reported by
        ``dir()`` on the loaded model is assigned to ``self``, except
        ``__dict__`` and any callable whose name this instance already
        exposes.  Attributes that raise ``AttributeError`` while being copied
        are skipped.
        """
        loaded = Doc2Vec.load(filename)
        for name in dir(loaded):
            if name == '__dict__':
                continue
            # Keep this class's own methods rather than overwriting them.
            is_own_method = name in dir(self) and callable(getattr(loaded, name))
            if not is_own_method:
                try:
                    setattr(self, name, getattr(loaded, name))
                except AttributeError:
                    # Read-only or otherwise unassignable attribute: leave it.
                    pass
Example #3
Source File: sent_eval.py    From embedding with MIT License 5 votes vote down vote up
def __init__(self, model_fname="data/doc2vec.vecs", use_notebook=False):
        """Load a saved Doc2Vec model and index its document tags.

        :param model_fname: path handed to ``Doc2Vec.load``.
        :param use_notebook: flag stored unchanged on the instance.
        """
        self.model = Doc2Vec.load(model_fname)
        # Map each doctag key to its position in the mapping's iteration order.
        tags = self.model.docvecs.doctags.keys()
        self.doc2idx = dict((tag, position) for position, tag in enumerate(tags))
        self.use_notebook = use_notebook
Example #4
Source File: sent_eval.py    From embedding with MIT License 5 votes vote down vote up
def __init__(self, model_path="data/lda.results", tokenizer_name="mecab"):
        """Set up the tokenizer, the stored topic results and the LDA model.

        ``model_path`` is used as a prefix: results are read from
        ``model_path + ".results"`` and the model from
        ``model_path + ".model"``.

        NOTE(review): with the default prefix the results file resolves to
        ``data/lda.results.results`` -- confirm that this is intended.

        :param model_path: common path prefix of the two files.
        :param tokenizer_name: name passed to ``get_tokenizer``.
        """
        self.tokenizer = get_tokenizer(tokenizer_name)
        results_file = model_path + ".results"
        self.all_topics = self.load_results(results_file)
        model_file = model_path + ".model"
        self.model = LdaModel.load(model_file)
Example #5
Source File: doc2vec.py    From KATE with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def load_doc2vec(mod_file):
    """Load the Doc2Vec model stored at ``mod_file`` and return it."""
    model = Doc2Vec.load(mod_file)
    return model
Example #6
Source File: dict_utils.py    From Semantic-Texual-Similarity-Toolkits with MIT License 5 votes vote down vote up
def load_dict(self, dict_name, path=config.DICT_DIR):
        """
        Return the dictionary registered as ``dict_name``, reading it on first use.

        The file ``dict_<dict_name>.txt`` is read line by line and each line is
        split on tabs.  A line with one field maps that field to its 1-based
        line number; a line with two fields maps the first field to ``eval``
        of the second; any other field count raises ``NotImplementedError``.

        NOTE(review): the ``path`` argument is overwritten before it is read,
        so the file is always looked up in ``../resources`` relative to this
        module -- confirm whether callers expect ``path`` to be honoured.

        :param dict_name: cache key and file-name stem.
        :param path: currently unused (see note above).
        :return: the dict stored in ``self.dict_manager[dict_name]``.
        """
        # Fast path: already loaded.
        if dict_name in self.dict_manager:
            return self.dict_manager[dict_name]

        module_dir = os.path.dirname(__file__)
        path = os.path.join(module_dir, '../resources')

        # load dict from file
        file_name = path + ('/dict_%s.txt' % dict_name)
        print('load dict from file %s \n' % file_name)

        entries = {}
        for line_no, raw_line in enumerate(utils.create_read_file(file_name)):
            fields = raw_line.strip().split('\t')
            field_count = len(fields)
            if field_count == 1:
                entries[fields[0]] = line_no + 1
            elif field_count == 2:
                # NOTE(review): eval executes whatever expression the file
                # holds; only safe while the dictionary files are trusted.
                entries[fields[0]] = eval(fields[1])
            else:
                raise NotImplementedError

        self.dict_manager[dict_name] = entries
        return self.dict_manager[dict_name]
Example #7
Source File: dict_utils.py    From Semantic-Texual-Similarity-Toolkits with MIT License 5 votes vote down vote up
def load_doc2vec(self):
        """Return the Doc2Vec model cached under ``'doc2vec'``.

        On first use the model is loaded from ``doc2vec.model`` under
        ``config.EX_DICT_DIR`` and stored in ``self.dict_manager``; gensim is
        imported only at that point.
        """
        dict_name = 'doc2vec'
        if dict_name in self.dict_manager:
            return self.dict_manager[dict_name]

        # Deferred import: only needed when the model is not cached yet.
        from gensim.models import Doc2Vec
        model_file = config.EX_DICT_DIR + '/doc2vec.model'
        self.dict_manager[dict_name] = Doc2Vec.load(model_file)
        return self.dict_manager[dict_name]
Example #8
Source File: test_vec4ir.py    From vec4ir with MIT License 5 votes vote down vote up
def test_doc2vec_inference_saveload():
    """Check that a Doc2Vec model still works for retrieval after save/load.

    A small model is trained on ``documents``, written to ``TEST_FILE``,
    dropped, loaded back and the file removed.  The reloaded model then backs
    a ``Retrieval`` whose top hit for ``"scientists"`` must be document 1.
    """
    corpus = []
    for tag, text in enumerate(documents):
        corpus.append(TaggedDocument(simple_preprocess(text), [tag]))

    trained = Doc2Vec(corpus, epochs=1, min_count=1, vector_size=10)
    trained.save(TEST_FILE)
    # Drop the in-memory model so only the reloaded copy is exercised below.
    del trained

    reloaded = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)

    embedding = Doc2VecInference(reloaded, DEFAULT_ANALYZER)
    retrieval = Retrieval(embedding, matching=Matching())
    retrieval = retrieval.fit(documents)
    hits = retrieval.query("scientists")
    assert hits[0] == 1
Example #9
Source File: echoDoc0.1.py    From EchoBurst with MIT License 5 votes vote down vote up
def retrainModel(vectorFile, dataFile, outputFile, iterations):
    """Continue training a saved Doc2Vec model and save the result.

    :param vectorFile: name of the existing model inside ``Models``.
    :param dataFile: name of the training data file inside ``Data``.
    :param outputFile: name under which the retrained model is saved in
        ``Models``.
    :param iterations: number of times ``model.train`` is called on the data.
    """
    corpus_path = "Data\\" + dataFile
    corpus = LabeledLineSentence(corpus_path)
    source_path = "Models\\" + vectorFile
    model = Doc2Vec.load(source_path)

    # One full pass over the corpus per requested iteration.
    for _ in range(iterations):
        model.train(corpus)

    target_path = "Models\\" + outputFile
    model.save(target_path)
Example #10
Source File: echoDoc0.1.py    From EchoBurst with MIT License 5 votes vote down vote up
def testModel(inputFile):
    """Interactive console loop for probing a saved Doc2Vec model.

    The model is loaded from ``Models``; the menu is then shown repeatedly:

    * ``1`` -- print the documents most similar to an entered tag.
    * ``2`` -- print the words most similar to an entered (lower-cased) word.
    * ``3`` -- read ``testing.txt`` (tab-separated ``tag<TAB>body`` lines),
      infer a vector for each body and append the most similar documents to
      ``clusteredResults.txt``.
    * anything else -- leave the loop.

    :param inputFile: name of the model file inside ``Models``.
    """
    model = Doc2Vec.load("Models\\" + inputFile)
    menu = ("Press 1 to compare documents within the model to each other.\n"
            "Press 2 to run similarity tests on individual words.\n"
            "Press 3 to get the top related subreddits for an inferred new vector (comment).\n"
            "Hit any key to exit.\n")

    while True:
        choice = input(menu)

        if choice == "1":
            subreddit = input("Enter the subreddit you want to test.\n")
            print(model.docvecs.most_similar(subreddit))
            continue

        if choice == "2":
            word = input("Enter the word you wish to analyze.\n").lower()
            print(model.most_similar(word))
            continue

        if choice != "3":
            break

        with open("testing.txt") as source:
            report = []
            for record in source.readlines():
                fields = record.split("\t")
                tag = fields[0]
                body = fields[1]
                inferred = model.infer_vector(body.split())
                neighbours = model.docvecs.most_similar(positive=[inferred])
                report.append("The original category is {}: {}\n {}\n".
                              format(tag, body, neighbours))
            # Append so that results from earlier runs are kept.
            with open("clusteredResults.txt", "a") as sink:
                sink.writelines(report)
Example #11
Source File: echoDoc0.1.py    From EchoBurst with MIT License 5 votes vote down vote up
def newKMeansModel(vectorFile, outputFile, numClusters):
    """Fit k-means on a model's document vectors and persist the estimator.

    :param vectorFile: name of the Doc2Vec model file inside ``Models``.
    :param outputFile: path the fitted ``KMeans`` object is dumped to with
        ``joblib``.
    :param numClusters: value passed to ``KMeans`` as ``n_clusters``.
    """
    # https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering

    model_path = "Models\\" + vectorFile
    doc2vec_model = Doc2Vec.load(model_path)
    vectors = doc2vec_model.docvecs.doctag_syn0

    clusterer = KMeans(n_clusters=numClusters)
    print("Starting")
    clusterer.fit(vectors)
    print("Fitting Data")

    joblib.dump(clusterer, outputFile)
Example #12
Source File: echoDoc0.1.py    From EchoBurst with MIT License 5 votes vote down vote up
def loadKMeansModel(vectorFile, clusterFile, csvFile):
    """Tabulate the cluster assignment of every document tag and export it.

    A saved Doc2Vec model and a saved clustering estimator are loaded; their
    tags, per-tag ``doctags`` entries and cluster labels are combined into a
    DataFrame that is printed and written to ``csvFile``.

    :param vectorFile: name of the Doc2Vec model file inside ``Models``.
    :param clusterFile: path of the estimator previously dumped with joblib.
    :param csvFile: destination of the CSV export.
    """
    # https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering

    doc2vec_model = Doc2Vec.load("Models\\" + vectorFile)
    estimator = joblib.load(clusterFile)
    assignments = estimator.labels_.tolist()

    tags = doc2vec_model.docvecs.offset2doctag
    tag_details = [doc2vec_model.docvecs.doctags[tag]
                   for tag in doc2vec_model.docvecs.offset2doctag]
    table = {
        'labels': tags,
        "index, wordcount and repeated words": tag_details,
        'clusters': assignments,
    }
    column_order = ['labels', "index, wordcount and repeated words", 'clusters']

    # Rows are indexed by their cluster label.
    frame = pd.DataFrame(table, index=[assignments], columns=column_order)
    print(frame)
    frame.to_csv(csvFile)
Example #13
Source File: echoDoc0.1.py    From EchoBurst with MIT License 5 votes vote down vote up
def newDBSCANModel(vectorFile, outputFile):
    """Cluster a model's document vectors with DBSCAN and export the result.

    Every document vector of the saved Doc2Vec model is flattened to a 1-D
    float array (whatever the model's vector size is), clustered with
    cosine-distance DBSCAN, and the fitted estimator is dumped with joblib.
    A table of tags, per-tag ``doctags`` entries and cluster labels is then
    printed and written to ``DBSCAN.csv``, followed by the number of clusters
    found (the noise label ``-1`` is not counted).

    :param vectorFile: name of the Doc2Vec model file inside ``Models``.
    :param outputFile: path the fitted ``DBSCAN`` object is dumped to.
    """
    model = Doc2Vec.load("Models\\" + vectorFile)

    # Flatten each vector instead of assuming a fixed dimensionality of 300,
    # so models trained with any vector size can be clustered.
    docs = [np.array(model.docvecs[doc], dtype='float').reshape(-1)
            for doc in range(len(model.docvecs))]

    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)

    labels = db.labels_
    # Label -1 marks noise points and is not a cluster of its own.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in
                                                            model.docvecs.offset2doctag],
                    'clusters': clusters}
    # Rows are indexed by their cluster label.
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")

    print('Estimated number of clusters: %d' % n_clusters_)
Example #14
Source File: echoDoc0.1.py    From EchoBurst with MIT License 5 votes vote down vote up
def plotModel3D(vectorFile, numClusters):
    """Show a 3-D scatter of PCA-reduced document vectors coloured by k-means.

    The document vectors of the saved Doc2Vec model are reduced to 10
    principal components, clustered with k-means, and components 5, 2 and 3
    are plotted against each other with one colour per cluster label.

    :param vectorFile: name of the Doc2Vec model file inside ``Models``.
    :param numClusters: value passed to ``KMeans`` as ``n_clusters``.
    """
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html

    model = Doc2Vec.load("Models\\" + vectorFile)
    docVecs = model.docvecs.doctag_syn0
    reduced_data = PCA(n_components=10).fit_transform(docVecs)
    kmeans = KMeans(init='k-means++', n_clusters=numClusters, n_init=10)

    fig = plt.figure(1, figsize=(10, 10))
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    kmeans.fit(reduced_data)
    labels = kmeans.labels_

    # Builtin ``float`` replaces the ``np.float`` alias, which newer NumPy
    # releases no longer provide.  ``xaxis``/``yaxis``/``zaxis`` replace the
    # ``w_*axis`` aliases that newer Matplotlib releases no longer provide.
    ax.scatter(reduced_data[:, 5], reduced_data[:, 2], reduced_data[:, 3], c=labels.astype(float))
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    # Clear figure 1 and draw the same k-means-coloured scatter on fresh axes
    # (the colours are cluster labels, not ground-truth classes).
    fig = plt.figure(1, figsize=(10, 10))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
    plt.cla()
    ax.scatter(reduced_data[:, 5], reduced_data[:, 2], reduced_data[:, 3], c=labels.astype(float))
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
    plt.show()
Example #15
Source File: title2vec.py    From OAG with MIT License 5 votes vote down vote up
def load_model(self):
        """Load the Doc2Vec model named by this instance and return it.

        The file ``self.model_fname`` inside ``self.model_dir`` is loaded and
        kept on ``self.model``; a log line is written before and after.
        """
        model_name = self.model_fname
        logger.info('loading doc2vec model name %s', model_name)
        model_path = join(self.model_dir, model_name)
        self.model = Doc2Vec.load(model_path)
        logger.info('doc2vec model %s loaded', model_name)
        return self.model