Python gensim.models.word2vec.Word2Vec() Examples
The following are 24 code examples of gensim.models.word2vec.Word2Vec(), collected from open-source projects.
You can vote up the ones you like or vote down the ones you don't,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
gensim.models.word2vec, or try the search function.
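Before the project-sourced examples below, here is a minimal, self-contained training sketch. It assumes the pre-4.0 gensim API used throughout this page (where the vector dimensionality is passed as size; gensim 4.0 renamed it to vector_size), and the toy corpus is invented purely for illustration:

from gensim.models import word2vec

# a toy corpus: each sentence is a list of pre-tokenized words
sentences = [["human", "interface", "computer"],
             ["graph", "trees", "interface"],
             ["graph", "minors", "trees"]]

# sg=1 selects skip-gram; sg=0 would select CBOW
model = word2vec.Word2Vec(sentences, size=50, window=5, min_count=1, sg=1)

vector = model.wv["graph"]                     # the learned 50-dimensional vector
print(model.wv.most_similar("graph", topn=2))  # nearest neighbours by cosine similarity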
Example #1
Source File: text2vec.py From chatbot_by_similarity with MIT License | 7 votes |
def creat_dict(texts_cut=None, sg=1, size=128, window=5, min_count=1):
    '''
    Train a word2vec model over the tokenized texts
    :param texts_cut: Word list of texts
    :param sg: 0 for CBOW, 1 for skip-gram
    :param size: The dimensionality of the feature vectors
    :param window: The maximum distance between the current and predicted word within a sentence
    :param min_count: Ignore all words with total frequency lower than this
    :return: the trained word2vec model
    '''
    model_word2vec = word2vec.Word2Vec(texts_cut,
                                       sg=sg,
                                       size=size,
                                       window=window,
                                       min_count=min_count)
    return model_word2vec
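For context, a call to the function above might look like the following; the tokenized texts are hypothetical placeholders, and the call assumes the same pre-4.0 gensim API as the function itself:

# hypothetical pre-tokenized texts (each text is a list of words)
texts_cut = [["今天", "天气", "很好"], ["我", "喜欢", "晴天"]]
model_word2vec = creat_dict(texts_cut=texts_cut, sg=1, size=128, window=5, min_count=1)
print(model_word2vec.wv.most_similar("天气", topn=2))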
Example #2
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 6 votes |
def testTrainingSgNegative(self):
    """Test skip-gram (negative sampling) word2vec training."""
    # to test training, make the corpus larger by repeating its sentences over and over
    # build vocabulary, don't train yet
    model = word2vec.Word2Vec(size=2, min_count=1, hs=0, negative=2)
    model.build_vocab(sentences)
    self.assertTrue(model.syn0.shape == (len(model.vocab), 2))
    self.assertTrue(model.syn1neg.shape == (len(model.vocab), 2))
    model.train(sentences)
    sims = model.most_similar('graph', topn=10)
    # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

    # test querying for "most similar" by vector
    graph_vector = model.syn0norm[model.vocab['graph'].index]
    sims2 = model.most_similar(positive=[graph_vector], topn=11)
    self.assertEqual(sims, sims2[1:])  # ignore first element of sims2, which is 'graph' itself

    # build vocab and train in one step; must be the same as above
    model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=0, negative=2)
    self.models_equal(model, model2)
Example #3
Source File: data_utils.py From CCKS2019-Chinese-Clinical-NER with MIT License | 6 votes |
def get_embedding_matrix(model_filepath, word2id):
    """
    Get the embedding matrix of the word2vec model
    :param model_filepath: the file path to the pre-built word2vec model
    :param word2id: the dictionary mapping from word to id
    :return: the embedding matrix of the word2vec model
    """
    word2vec_model = Word2Vec.load(model_filepath)
    embeddings_dict = __get_embedding_dict(model_filepath)
    embedding_matrix = np.zeros((len(word2id) + 1, word2vec_model.vector_size))
    for word, idx in word2id.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
    return embedding_matrix
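A matrix built this way is usually handed to an embedding layer downstream. The sketch below is not part of the original project; it shows one common pattern under the assumption that Keras is the consumer, using the Constant initializer to freeze the pre-trained rows:

from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

# hypothetical wiring: one row of embedding_matrix per word id, kept fixed during training
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)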
Example #4
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 6 votes |
def testTrainingCbowNegative(self):
    """Test CBOW (negative sampling) word2vec training."""
    # to test training, make the corpus larger by repeating its sentences over and over
    # build vocabulary, don't train yet
    model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
    model.build_vocab(sentences)
    self.assertTrue(model.syn0.shape == (len(model.vocab), 2))
    self.assertTrue(model.syn1neg.shape == (len(model.vocab), 2))
    model.train(sentences)
    sims = model.most_similar('graph', topn=10)
    # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

    # test querying for "most similar" by vector
    graph_vector = model.syn0norm[model.vocab['graph'].index]
    sims2 = model.most_similar(positive=[graph_vector], topn=11)
    self.assertEqual(sims, sims2[1:])  # ignore first element of sims2, which is 'graph' itself

    # build vocab and train in one step; must be the same as above
    model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2)
    self.models_equal(model, model2)
Example #5
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 6 votes |
def testTrainingCbow(self):
    """Test CBOW word2vec training."""
    # to test training, make the corpus larger by repeating its sentences over and over
    # build vocabulary, don't train yet
    model = word2vec.Word2Vec(size=2, min_count=1, sg=0)
    model.build_vocab(sentences)
    self.assertTrue(model.syn0.shape == (len(model.vocab), 2))
    self.assertTrue(model.syn1.shape == (len(model.vocab), 2))
    model.train(sentences)
    sims = model.most_similar('graph', topn=10)
    # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

    # test querying for "most similar" by vector
    graph_vector = model.syn0norm[model.vocab['graph'].index]
    sims2 = model.most_similar(positive=[graph_vector], topn=11)
    self.assertEqual(sims, sims2[1:])  # ignore first element of sims2, which is 'graph' itself

    # build vocab and train in one step; must be the same as above
    model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0)
    self.models_equal(model, model2)
Example #6
Source File: textAnalysis.py From deep_learning with MIT License | 6 votes |
def predictData():
    """Use the trained model to predict on real data."""
    input_texts = ["很好很满意", "不好不满意", "质量有问题", "商家态度很差", "售后很渣,渣渣"]
    # word_model = word2vec.Word2Vec.load('./models/Word2vec_model.model')
    # w2indx, w2vec, texts = create_dictionaries(word_model, texts)
    # print(texts)
    texts = predict_wordtoVect(input_texts)
    model = get_model()
    # predict
    pred_result = model.predict_classes(texts)
    print(pred_result)
    labels = [int(round(x[0])) for x in pred_result]
    label2word = {1: '正面', 0: '负面'}  # 1: positive (正面), 0: negative (负面)
    for i in range(len(pred_result)):
        print('{0} -------- {1}'.format(label2word[labels[i]], input_texts[i]))
Example #7
Source File: class_w2v.py From 2016CCF-sougou with Apache License 2.0 | 6 votes |
def load_trainsform(self, X):
    """
    Load the model and generate w2v vectors
    :param X: the input documents, as a list
    :return: np.array
    """
    print 'Loading the model'
    model = word2vec.Word2Vec.load('20w_size_win100_300.model')  # fill in your own model path
    print 'Model loaded'
    res = np.zeros((len(X), self.size))
    print 'Generating w2v vectors...'
    for i, line in enumerate(X):
        line = line.decode('utf-8')
        terms = line.split()
        count = 0
        for term in terms:
            try:
                # the lookup raises KeyError for words the model dropped via min_count at training time
                vec = np.array(model[term])
            except KeyError:
                continue
            res[i] += vec
            count += 1  # only count terms actually present in the model
        if count != 0:
            res[i] = res[i] / float(count)  # take the mean
    return res
Example #8
Source File: CategoryEmbeddings.py From scattertext with Apache License 2.0 | 6 votes |
def embed_category(self, category, model=None):
    '''
    :param model: gensim word2vec.Word2Vec model
    :param term_acceptance_re: SRE_Pattern, regular expression to identify valid terms, default re.compile('[a-z]{3,}')
    :return: EmbeddingsResolver
    '''
    self._verify_category(category)
    if self.term_acceptance_re is not None:
        acceptable_terms = set([t for t in self.corpus_.get_terms() if self.term_acceptance_re.match(t)])
    else:
        acceptable_terms = set(self.corpus_.get_terms())
    trained_model = CategorySpecificWord2VecFromParsedCorpus(self.corpus_, category, model).train()
    self.category_word2vec_model_[category] = trained_model
    word2dwe = {word: trained_model[word] for word in trained_model.wv.vocab.keys()}
    self.category_embeddings_[category] = word2dwe
    return self
Example #9
Source File: Word2VecFromParsedCorpus.py From scattertext with Apache License 2.0 | 6 votes |
def __init__(self, corpus, word2vec_model=None):
    '''
    Parameters
    ----------
    corpus: ParsedCorpus
        from which to build word2vec model
    word2vec_model: word2vec.Word2Vec
        Gensim instance to be used to train word2vec model
    '''
    try:
        from gensim.models import word2vec
        assert word2vec_model is None or isinstance(word2vec_model, word2vec.Word2Vec)
    except:
        warnings.warn("You should really install gensim, but we're going to duck-type your model and pray it works")
    assert isinstance(corpus, ParsedCorpus)
    self.corpus = corpus
    self.model = self._get_word2vec_model(word2vec_model)
Example #10
Source File: Word2VecFromParsedCorpus.py From scattertext with Apache License 2.0 | 6 votes |
def _default_word2vec_model(self):
    from gensim.models import word2vec
    return word2vec.Word2Vec(size=100,
                             alpha=0.025,
                             window=5,
                             min_count=5,
                             max_vocab_size=None,
                             sample=0,
                             seed=1,
                             workers=1,
                             min_alpha=0.0001,
                             sg=1,
                             hs=1,
                             negative=0,
                             cbow_mean=0,
                             iter=1,
                             null_word=0,
                             trim_rule=None,
                             sorted_vocab=1)
Example #11
Source File: walklets.py From walklets with GNU General Public License v3.0 | 6 votes |
def create_embedding(self):
    """Creating a multi-scale embedding."""
    self.embedding = []
    for index in range(1, self.args.window_size + 1):
        print("\nOptimization round: " + str(index) + "/" + str(self.args.window_size) + ".")
        print("Creating documents.")
        clean_documents = self.walk_extracts(index)
        print("Fitting model.")
        model = Word2Vec(clean_documents,
                         size=self.args.dimensions,
                         window=1,
                         min_count=self.args.min_count,
                         sg=1,
                         workers=self.args.workers)
        new_embedding = self.get_embedding(model)
        self.embedding = self.embedding + [new_embedding]
    self.embedding = np.concatenate(self.embedding, axis=1)
Example #12
Source File: diff2vec.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graph):
    """
    Fitting a Diff2Vec model.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
    """
    self._set_seed()
    self._check_graph(graph)
    diffuser = EulerianDiffuser(self.diffusion_number, self.diffusion_cover)
    diffuser.do_diffusions(graph)
    model = Word2Vec(diffuser.diffusions,
                     hs=1,
                     alpha=self.learning_rate,
                     iter=self.epochs,
                     size=self.dimensions,
                     window=self.window_size,
                     min_count=self.min_count,
                     workers=self.workers,
                     seed=self.seed)
    num_of_nodes = graph.number_of_nodes()
    self._embedding = [model[str(n)] for n in range(num_of_nodes)]
Example #13
Source File: deepwalk.py From karateclub with GNU General Public License v3.0 | 6 votes |
def fit(self, graph):
    """
    Fitting a DeepWalk model.

    Arg types:
        * **graph** *(NetworkX graph)* - The graph to be embedded.
    """
    self._set_seed()
    self._check_graph(graph)
    walker = RandomWalker(self.walk_length, self.walk_number)
    walker.do_walks(graph)
    model = Word2Vec(walker.walks,
                     hs=1,
                     alpha=self.learning_rate,
                     iter=self.epochs,
                     size=self.dimensions,
                     window=self.window_size,
                     min_count=self.min_count,
                     workers=self.workers,
                     seed=self.seed)
    num_of_nodes = graph.number_of_nodes()
    self._embedding = [model[str(n)] for n in range(num_of_nodes)]
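From the outside, the fit method above is driven through karateclub's estimator-style interface. A typical end-to-end call might look like the following sketch; the random graph and the default constructor arguments are illustrative assumptions, not part of the example above:

import networkx as nx
from karateclub import DeepWalk

# a small synthetic graph, chosen purely for illustration
graph = nx.newman_watts_strogatz_graph(100, 10, 0.05)

model = DeepWalk()  # default walk/window/dimension settings assumed
model.fit(graph)
embedding = model.get_embedding()  # one row per node, sorted by node ID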
Example #14
Source File: MyWord2Vec.py From SAIVS with Apache License 2.0 | 5 votes |
def learn_sentense(self):
    if os.path.exists(MODEL_NAME):
        # print('Using Word2Vec :', MODEL_NAME)
        return
    else:
        print('Learning sentense...')
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        obj_sentense = word2vec.Text8Corpus(DATA_PATH)
        obj_model = word2vec.Word2Vec(obj_sentense, size=200, min_count=20, window=15)
        obj_model.save(MODEL_NAME)
        return
Example #15
Source File: MyWord2Vec.py From SAIVS with Apache License 2.0 | 5 votes |
def get_candidate_word(self, str_target_word):
    self.learn_sentense()
    obj_model = word2vec.Word2Vec.load(MODEL_NAME)
    str_word = str_target_word
    lst_nega = []
    return self.cal_similarity([str_word.encode()], lst_nega, obj_model)
Example #16
Source File: utils.py From russe-evaluation with MIT License | 5 votes |
def load_vectors(fvec):
    # return gs.models.Word2Vec.load_word2vec_format(fvec, binary=True)
    return load_word2vec_format(fvec, binary=True)
Example #17
Source File: gensim_nlpir.py From nlp_learning with MIT License | 5 votes |
def mode_training():
    """Model training."""
    # read the files under a directory
    # sentences = MySentences('/some/directory')
    # pre-segmented word data
    sentences = word2vec.Text8Corpus('data/xuezhong_seg_1.txt')
    # train; size sets the dimensionality of the feature vectors,
    # workers sets the number of threads for parallel training (only effective when Cython is installed)
    model = word2vec.Word2Vec(
        sentences, min_count=20, size=4000, window=10, workers=4)
    # model.sort_vocab()
    # compute the similarity/relatedness of two words
    # simil_1 = model.wv.similarity(u"王仙芝", u"老怪物")
    # simil_2 = model.wv.similarity(u"徐凤年", u"殿下")
    # print("Similarity of 王仙芝 and 老怪物:", simil_1)
    # print("Similarity of 徐凤年 and 殿下:", simil_2)
    # # get the list of words most related to a given word
    # lar = model.wv.most_similar(u"徐凤年", topn=20)  # the 20 most related words
    # print("Words related to 徐凤年:", lar)
    # save the model for reuse
    model.save(u"models/xue.model")
    print("training finished")
Example #18
Source File: data_utils.py From CCKS2019-Chinese-Clinical-NER with MIT License | 5 votes |
def __get_embedding_dict(model_filepath):
    embedding_dict = {}
    word2vec_model = Word2Vec.load(model_filepath)
    vocab = [(word, word2vec_model.wv[word]) for word, vectors in word2vec_model.wv.vocab.items()]
    for i in range(len(vocab)):
        word = vocab[i][0]
        vectors = vocab[i][1]
        embedding_dict[word] = vectors
    return embedding_dict
Example #19
Source File: data_utils.py From CCKS2019-Chinese-Clinical-NER with MIT License | 5 votes |
def train_word2vec_model(data_filepath, model_filepath, embedding_dim):
    seg_txt_list = __get_seg_txt_list(data_filepath)
    model = word2vec.Word2Vec(seg_txt_list, size=embedding_dim, window=5, min_count=1)
    model.save(model_filepath)
Example #20
Source File: word2vec_model.py From SMPCUP2017 with MIT License | 5 votes |
def word2vec_model(blog_seg_path):
    sentences = word2vec.LineSentence(blog_seg_path)
    model = word2vec.Word2Vec(sentences, workers=4)
    return model
Example #21
Source File: indexing.py From faiss-server with MIT License | 5 votes |
def get_vector():
    if not os.path.isfile(MODEL_FILE_PATH):
        sentences = word2vec.Text8Corpus(fname=DATASET_FILE_PATH)
        model = word2vec.Word2Vec(sentences=sentences)
        model.wv.save_word2vec_format(fname=MODEL_FILE_PATH)
        return model
    else:
        return KeyedVectors.load_word2vec_format(
            fname=MODEL_FILE_PATH, binary=False)
Example #22
Source File: word2vec.py From nlp-journey with Apache License 2.0 | 5 votes |
def load(self):
    # load the model file
    try:
        model = word2vec.Word2Vec.load(self.model_path)
    except FileNotFoundError:
        model = None
    return model
Example #23
Source File: word2vec.py From nlp-journey with Apache License 2.0 | 5 votes |
def train(self):
    sentences = process_data(self.train_file)
    model = word2vec.Word2Vec(sentences,
                              min_count=2,
                              window=3,
                              size=self.embed_size,
                              workers=4)
    return model
Example #24
Source File: walklets.py From walklets with GNU General Public License v3.0 | 5 votes |
def get_embedding(self, model):
    """
    Extracting the embedding according to node order from the embedding model.

    :param model: A Word2Vec model after model fitting.
    :return embedding: A numpy array with the embedding sorted by node IDs.
    """
    embedding = []
    for node in range(len(self.graph.nodes())):
        embedding.append(list(model[str(node)]))
    embedding = np.array(embedding)
    return embedding