Python gensim.models.word2vec.Word2Vec() Examples

The following are 30 code examples of gensim.models.word2vec.Word2Vec(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.word2vec, or try the search function.
Example #1
Source File: text2vec.py    From chatbot_by_similarity with MIT License 7 votes vote down vote up
def creat_dict(texts_cut=None,
               sg=1,
               size=128,
               window=5,
               min_count=1):
    """
    Train a word2vec model (and its vocabulary) on tokenized texts.

    :param texts_cut: word lists of the texts, passed to Word2Vec as the corpus
    :param sg: 0 for CBOW, 1 for skip-gram
    :param size: the dimensionality of the feature vectors
    :param window: the maximum distance between the current and predicted word within a sentence
    :param min_count: ignore all words with total frequency lower than this
    :return: the object returned by the word2vec.Word2Vec constructor
    """
    # Collect the hyper-parameters first so the constructor call stays short.
    training_options = {
        'sg': sg,
        'size': size,
        'window': window,
        'min_count': min_count,
    }
    return word2vec.Word2Vec(texts_cut, **training_options)
Example #2
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingSgNegative(self):
    """Check word2vec training with hs=0 and negative=2 (skip-gram, negative sampling).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, hs=0, negative=2)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1neg.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=0, negative=2)
    self.models_equal(staged, one_shot)
Example #3
Source File: data_utils.py    From CCKS2019-Chinese-Clinical-NER with MIT License 6 votes vote down vote up
def get_embedding_matrix(model_filepath, word2id):
    """
    Build the embedding matrix for a vocabulary from a saved word2vec model.

    Row ``idx`` of the result holds the model's vector for the word that
    ``word2id`` maps to ``idx``. Rows for words the model does not contain,
    and rows no word maps to, stay all-zero.

    :param model_filepath: the file path to the pre-built word2vec model
    :param word2id: the dictionary mapping from word to id
    :return: the embedding matrix, shaped (len(word2id) + 1, vector_size)
    """
    # Load the model a single time: both the vector size and the vectors
    # themselves are read from this one instance.
    word2vec_model = Word2Vec.load(model_filepath)
    keyed_vectors = word2vec_model.wv
    embedding_matrix = np.zeros((len(word2id) + 1, word2vec_model.vector_size))
    for word, idx in word2id.items():
        # Words unknown to the model keep their zero row.
        if word in keyed_vectors.vocab:
            embedding_matrix[idx] = keyed_vectors[word]

    return embedding_matrix
Example #4
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingCbowNegative(self):
    """Check word2vec training with sg=0, hs=0 and negative=2 (CBOW, negative sampling).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1neg.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2)
    self.models_equal(staged, one_shot)
Example #5
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingSgNegative(self):
    """Check word2vec training with hs=0 and negative=2 (skip-gram, negative sampling).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, hs=0, negative=2)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1neg.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=0, negative=2)
    self.models_equal(staged, one_shot)
Example #6
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingCbowNegative(self):
    """Check word2vec training with sg=0, hs=0 and negative=2 (CBOW, negative sampling).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1neg.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2)
    self.models_equal(staged, one_shot)
Example #7
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingSgNegative(self):
    """Check word2vec training with hs=0 and negative=2 (skip-gram, negative sampling).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, hs=0, negative=2)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1neg.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=0, negative=2)
    self.models_equal(staged, one_shot)
Example #8
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingCbow(self):
    """Check word2vec training with sg=0 (CBOW).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, sg=0)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0)
    self.models_equal(staged, one_shot)
Example #9
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingCbow(self):
    """Check word2vec training with sg=0 (CBOW).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, sg=0)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0)
    self.models_equal(staged, one_shot)
Example #10
Source File: textAnalysis.py    From deep_learning with MIT License 6 votes vote down vote up
def predictData():
    """
    Run the model on a few hard-coded sample texts and print the predicted label for each.
    """
    input_texts = ["很好很满意","不好不满意","质量有问题","商家态度很差","售后很渣,渣渣"]

    # Alternative kept from the original for reference:
    # word_model = word2vec.Word2Vec.load('./models/Word2vec_model.model')
    # w2indx, w2vec, texts = create_dictionaries(word_model, texts)
    # print(texts)

    # Convert the raw texts into the model's input representation.
    texts = predict_wordtoVect(input_texts)

    # Predict
    model = get_model()
    pred_result = model.predict_classes(texts)
    print(pred_result)

    label2word = {1: '正面', 0: '负面'}
    # Each prediction row carries its class in the first element.
    labels = [int(round(row[0])) for row in pred_result]
    for position, label in enumerate(labels):
        print('{0} -------- {1}'.format(label2word[label], input_texts[position]))
Example #11
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingCbow(self):
    """Check word2vec training with sg=0 (CBOW).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, sg=0)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0)
    self.models_equal(staged, one_shot)
Example #12
Source File: class_w2v.py    From 2016CCF-sougou with Apache License 2.0 6 votes vote down vote up
def load_trainsform(self,X):
    """
    Load the saved word2vec model and turn each document into one averaged vector.

    :param X: the documents, a list of UTF-8 encoded, whitespace-separated strings
    :return: np.array of shape (len(X), self.size); row i is the mean of the
        vectors of those terms of X[i] that the model contains, or all zeros
        when none of them is found
    """
    print('载入模型中')
    model = word2vec.Word2Vec.load('20w_size_win100_300.model')  # fill in your own model path
    print('加载成功')
    res = np.zeros((len(X), self.size))
    print('生成w2v向量中..')
    for i, line in enumerate(X):
        terms = line.decode('utf-8').split()
        count = 0
        for term in terms:
            try:
                vector = np.array(model[term])
            except KeyError:
                # The term is not in the model (e.g. it was dropped by
                # min_count during training); leave it out of the average.
                continue
            res[i] += vector
            # Count only terms that contributed a vector, so the mean is
            # taken over the vectors actually summed.
            count += 1
        if count != 0:
            res[i] = res[i] / float(count)  # mean of the found vectors
    return res
Example #13
Source File: test_word2vec.py    From topical_word_embeddings with MIT License 6 votes vote down vote up
def testTrainingCbowNegative(self):
    """Check word2vec training with sg=0, hs=0 and negative=2 (CBOW, negative sampling).

    Builds the vocabulary and trains in two separate steps, checks the weight
    shapes and the similarity queries, then verifies that a model built and
    trained in a single constructor call compares equal.
    """
    # Step 1: configure the model and build the vocabulary without training.
    staged = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
    staged.build_vocab(sentences)
    self.assertTrue(staged.syn0.shape == (len(staged.vocab), 2))
    self.assertTrue(staged.syn1neg.shape == (len(staged.vocab), 2))

    # Step 2: train, then look up the neighbours of 'graph' by word.
    staged.train(sentences)
    by_word = staged.most_similar('graph', topn=10)

    # Same query by the syn0norm row of 'graph'; ask for one extra hit because
    # the first result is 'graph' itself and is dropped before comparing.
    graph_row = staged.vocab['graph'].index
    by_vector = staged.most_similar(positive=[staged.syn0norm[graph_row]], topn=11)
    self.assertEqual(by_word, by_vector[1:])

    # Vocabulary build + training in one constructor call must match the above.
    one_shot = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2)
    self.models_equal(staged, one_shot)
Example #14
Source File: CategoryEmbeddings.py    From scattertext with Apache License 2.0 6 votes vote down vote up
def embed_category(self, category, model=None):
    '''
    Train category-specific embeddings and store them on this object.

    :param category: category to embed; checked with self._verify_category
    :param model: gensim word2vec.Word2Vec model, forwarded to
        CategorySpecificWord2VecFromParsedCorpus
    :return: self
    '''
    self._verify_category(category)

    # NOTE(review): acceptable_terms is computed but never used below —
    # confirm whether the embeddings were meant to be filtered by it.
    term_filter = self.term_acceptance_re
    if term_filter is None:
        acceptable_terms = set(self.corpus_.get_terms())
    else:
        acceptable_terms = {t for t in self.corpus_.get_terms() if term_filter.match(t)}

    w2v = CategorySpecificWord2VecFromParsedCorpus(self.corpus_, category, model).train()
    self.category_word2vec_model_[category] = w2v
    # Map every word in the trained vocabulary to its vector.
    self.category_embeddings_[category] = {
        word: w2v[word] for word in w2v.wv.vocab.keys()
    }
    return self
Example #15
Source File: Word2VecFromParsedCorpus.py    From scattertext with Apache License 2.0 6 votes vote down vote up
def __init__(self, corpus, word2vec_model=None):
    '''
    Parameters
    ----------
    corpus: ParsedCorpus
        corpus from which the word2vec model is built
    word2vec_model: word2vec.Word2Vec
        gensim instance used for training; passed on to
        self._get_word2vec_model
    '''
    try:
        from gensim.models import word2vec
        # A supplied model must be a gensim Word2Vec. Any failure here
        # (including gensim being absent) only produces the warning below.
        if word2vec_model is not None:
            assert isinstance(word2vec_model, word2vec.Word2Vec)
    except:
        warnings.warn("You should really install gensim, but we're going to duck-type your model and pray it works")
    assert isinstance(corpus, ParsedCorpus)
    self.corpus = corpus
    self.model = self._get_word2vec_model(word2vec_model)
Example #16
Source File: Word2VecFromParsedCorpus.py    From scattertext with Apache License 2.0 6 votes vote down vote up
def _default_word2vec_model(self):
    """Return a gensim Word2Vec built with the default settings below (no corpus is passed)."""
    from gensim.models import word2vec

    default_settings = {
        'size': 100,
        'alpha': 0.025,
        'window': 5,
        'min_count': 5,
        'max_vocab_size': None,
        'sample': 0,
        'seed': 1,
        'workers': 1,
        'min_alpha': 0.0001,
        'sg': 1,
        'hs': 1,
        'negative': 0,
        'cbow_mean': 0,
        'iter': 1,
        'null_word': 0,
        'trim_rule': None,
        'sorted_vocab': 1,
    }
    return word2vec.Word2Vec(**default_settings)
Example #17
Source File: walklets.py    From walklets with GNU General Public License v3.0 6 votes vote down vote up
def create_embedding(self):
    """
    Build the multi-scale embedding.

    For every scale from 1 to ``self.args.window_size`` a Word2Vec model is
    fitted on the documents from ``self.walk_extracts`` and its embedding is
    collected; the per-scale embeddings are then concatenated along axis 1
    and stored in ``self.embedding``.
    """
    total_rounds = self.args.window_size
    self.embedding = []
    for scale in range(1, total_rounds + 1):
        progress = str(scale) + "/" + str(total_rounds)
        print("\nOptimization round: " + progress + ".")
        print("Creating documents.")
        clean_documents = self.walk_extracts(scale)
        print("Fitting model.")

        # window=1 is fixed here; the scale enters through walk_extracts.
        fitted = Word2Vec(clean_documents,
                          size=self.args.dimensions,
                          window=1,
                          min_count=self.args.min_count,
                          sg=1,
                          workers=self.args.workers)

        self.embedding.append(self.get_embedding(fitted))
    self.embedding = np.concatenate(self.embedding, axis=1)
Example #18
Source File: diff2vec.py    From karateclub with GNU General Public License v3.0 6 votes vote down vote up
def fit(self, graph):
    """
    Fit a Diff2Vec model: run the diffusions on the graph, train Word2Vec on
    them and store one vector per node in ``self._embedding``.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
    """
    self._set_seed()
    self._check_graph(graph)

    diffuser = EulerianDiffuser(self.diffusion_number, self.diffusion_cover)
    diffuser.do_diffusions(graph)

    w2v_settings = {
        'hs': 1,
        'alpha': self.learning_rate,
        'iter': self.epochs,
        'size': self.dimensions,
        'window': self.window_size,
        'min_count': self.min_count,
        'workers': self.workers,
        'seed': self.seed,
    }
    model = Word2Vec(diffuser.diffusions, **w2v_settings)

    # Nodes are looked up by the string form of their index 0..n-1.
    node_total = graph.number_of_nodes()
    self._embedding = [model[str(node)] for node in range(node_total)]
Example #19
Source File: deepwalk.py    From karateclub with GNU General Public License v3.0 6 votes vote down vote up
def fit(self, graph):
    """
    Fit a DeepWalk model: run the random walks on the graph, train Word2Vec
    on them and store one vector per node in ``self._embedding``.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
    """
    self._set_seed()
    self._check_graph(graph)

    walker = RandomWalker(self.walk_length, self.walk_number)
    walker.do_walks(graph)

    w2v_settings = {
        'hs': 1,
        'alpha': self.learning_rate,
        'iter': self.epochs,
        'size': self.dimensions,
        'window': self.window_size,
        'min_count': self.min_count,
        'workers': self.workers,
        'seed': self.seed,
    }
    model = Word2Vec(walker.walks, **w2v_settings)

    # Nodes are looked up by the string form of their index 0..n-1.
    node_total = graph.number_of_nodes()
    self._embedding = [model[str(node)] for node in range(node_total)]
Example #20
Source File: MyWord2Vec.py    From SAIVS with Apache License 2.0 5 votes vote down vote up
def learn_sentense(self):
    """Train a Word2Vec model on DATA_PATH and save it to MODEL_NAME, unless that file already exists."""
    if not os.path.exists(MODEL_NAME):
        print('Learning sentense...')
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        corpus = word2vec.Text8Corpus(DATA_PATH)
        trained = word2vec.Word2Vec(corpus, size=200, min_count=20, window=15)
        trained.save(MODEL_NAME)
    # An existing model file is reused as-is; nothing is returned either way.
    return
Example #21
Source File: MyWord2Vec.py    From SAIVS with Apache License 2.0 5 votes vote down vote up
def get_candidate_word(self, str_target_word):
    """Make sure the model file exists, load it, and return self.cal_similarity's result for the encoded target word."""
    self.learn_sentense()
    loaded_model = word2vec.Word2Vec.load(MODEL_NAME)
    encoded_words = [str_target_word.encode()]
    no_negatives = []
    return self.cal_similarity(encoded_words, no_negatives, loaded_model)
Example #22
Source File: utils.py    From russe-evaluation with MIT License 5 votes vote down vote up
def load_vectors(fvec):
    """Load vectors from the file *fvec* via load_word2vec_format with binary=True."""
    # Earlier form: gs.models.Word2Vec.load_word2vec_format(fvec, binary=True)
    vectors = load_word2vec_format(fvec, binary=True)
    return vectors
Example #23
Source File: gensim_nlpir.py    From nlp_learning with MIT License 5 votes vote down vote up
def mode_training():
    """
    Train a word2vec model on the segmented corpus file and save it to disk.
    """
    # Word-segmented input data. Reading every file under a directory would be:
    # sentences = MySentences('/some/directory')
    corpus = word2vec.Text8Corpus('data/xuezhong_seg_1.txt')

    # size is the dimensionality of the word vectors; workers is the number
    # of training threads (per the original note, only effective when
    # Cython is installed).
    training_options = {
        'min_count': 20,
        'size': 4000,
        'window': 10,
        'workers': 4,
    }
    model = word2vec.Word2Vec(corpus, **training_options)

    # model.sort_vocab()

    # Similarity / relatedness of two words:
    # simil_1 = model.wv.similarity(u"王仙芝", u"老怪物")
    # simil_2 = model.wv.similarity(u"徐凤年", u"殿下")
    # print("【王仙芝】和【老怪物】相似度:", simil_1)
    # print("【徐凤年】和【世子】相似度:", simil_2)

    # The 20 words most related to a given word:
    # lar = model.wv.most_similar(u"徐凤年", topn=20)
    # print("【徐凤年】相关性:", lar)

    # Save the model so it can be reused later.
    model.save(u"models/xue.model")
    print("training finished")
Example #24
Source File: data_utils.py    From CCKS2019-Chinese-Clinical-NER with MIT License 5 votes vote down vote up
def __get_embedding_dict(model_filepath):
    """Load the word2vec model at *model_filepath* and return a dict mapping each vocabulary word to its vector."""
    keyed_vectors = Word2Vec.load(model_filepath).wv
    # One entry per vocabulary word, in the vocabulary's iteration order.
    return {word: keyed_vectors[word] for word in keyed_vectors.vocab}
Example #25
Source File: data_utils.py    From CCKS2019-Chinese-Clinical-NER with MIT License 5 votes vote down vote up
def train_word2vec_model(data_filepath, model_filepath, embedding_dim):
    """Train a word2vec model on the segmented text read from *data_filepath* and save it to *model_filepath*.

    :param data_filepath: path handed to __get_seg_txt_list to obtain the training corpus
    :param model_filepath: where the trained model is saved
    :param embedding_dim: value passed as Word2Vec's ``size``
    """
    training_corpus = __get_seg_txt_list(data_filepath)
    trained = word2vec.Word2Vec(training_corpus, size=embedding_dim, window=5, min_count=1)
    trained.save(model_filepath)
Example #26
Source File: word2vec_model.py    From SMPCUP2017 with MIT License 5 votes vote down vote up
def word2vec_model(blog_seg_path):
    """Train and return a Word2Vec model (4 workers) on the LineSentence corpus read from *blog_seg_path*."""
    line_corpus = word2vec.LineSentence(blog_seg_path)
    return word2vec.Word2Vec(line_corpus, workers=4)
Example #27
Source File: indexing.py    From faiss-server with MIT License 5 votes vote down vote up
def get_vector():
    """Return word vectors, training and exporting them first when MODEL_FILE_PATH does not exist yet."""
    if os.path.isfile(MODEL_FILE_PATH):
        # Vectors were exported earlier; read them back in text format.
        return KeyedVectors.load_word2vec_format(
            fname=MODEL_FILE_PATH, binary=False)

    # NOTE(review): this path returns the Word2Vec model itself while the
    # path above returns KeyedVectors — confirm callers accept both.
    corpus = word2vec.Text8Corpus(fname=DATASET_FILE_PATH)
    model = word2vec.Word2Vec(sentences=corpus)
    model.wv.save_word2vec_format(fname=MODEL_FILE_PATH)
    return model
Example #28
Source File: word2vec.py    From nlp-journey with Apache License 2.0 5 votes vote down vote up
def load(self):
    """Load the model file at self.model_path; return None when the file is not found."""
    try:
        return word2vec.Word2Vec.load(self.model_path)
    except FileNotFoundError:
        return None
Example #29
Source File: word2vec.py    From nlp-journey with Apache License 2.0 5 votes vote down vote up
def train(self):
    """Train and return a Word2Vec model on the data produced by process_data(self.train_file)."""
    training_options = {
        'min_count': 2,
        'window': 3,
        'size': self.embed_size,
        'workers': 4,
    }
    corpus = process_data(self.train_file)
    return word2vec.Word2Vec(corpus, **training_options)
Example #30
Source File: walklets.py    From walklets with GNU General Public License v3.0 5 votes vote down vote up
def get_embedding(self, model):
    """
    Collect the model's vectors in node order.

    :param model: A Word2Vec model after model fitting; indexed by the string form of each node index.
    :return embedding: A numpy array whose row i is the vector stored under str(i).
    """
    node_total = len(self.graph.nodes())
    rows = [list(model[str(node)]) for node in range(node_total)]
    return np.array(rows)