Python gensim.models.Word2Vec() Examples
The following are 30 code examples of gensim.models.Word2Vec(), drawn from open-source projects; the originating project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the gensim.models module.
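Note that the examples below use the pre-4.0 gensim keyword names (size, iter) and the old vocabulary attributes. If you run them against gensim 4.0 or later (an assumption about your environment; the original projects do not state version pins here), the main renames are shown in this minimal sketch with a toy corpus:

    from gensim.models import Word2Vec

    sentences = [["hello", "world"], ["word2vec", "example"]]  # illustrative toy corpus

    # gensim 3.x style, as used throughout the examples below:
    # model = Word2Vec(sentences, size=100, window=5, min_count=1, sg=1, iter=5)

    # gensim 4.x equivalent: size -> vector_size, iter -> epochs
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, epochs=5)

    # Further 4.x changes: model[word] -> model.wv[word],
    # model.wv.vocab -> model.wv.key_to_index, model.wv.index2word -> model.wv.index_to_key
    vector = model.wv["word2vec"]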
Example #1
Source File: build_w2v.py From text-classifier with Apache License 2.0 | 7 votes |
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True)
Example #2
Source File: node2vec.py From GraphEmbedding with MIT License | 6 votes |
def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

    kwargs["sentences"] = self.sentences
    kwargs["min_count"] = kwargs.get("min_count", 0)
    kwargs["size"] = embed_size
    kwargs["sg"] = 1
    kwargs["hs"] = 0  # node2vec does not use Hierarchical Softmax
    kwargs["workers"] = workers
    kwargs["window"] = window_size
    kwargs["iter"] = iter

    print("Learning embedding vectors...")
    model = Word2Vec(**kwargs)
    print("Learning embedding vectors done!")

    self.w2v_model = model
    return model
Example #3
Source File: pre_train.py From embeddings with Apache License 2.0 | 6 votes |
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_word2vec(args**) -> Takes the input file, the output file and the model
    hyperparameters as arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss Function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)
    model = Word2Vec(sentence, sg=skipgram, hs=loss, size=size, alpha=0.05,
                     window=5, min_count=5, workers=3, iter=epochs)
    model.save(output_file)
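A minimal usage sketch of train_word2vec(); the file paths are illustrative assumptions, not part of the original project, and the hyperparameter values simply follow the docstring above:

    # Illustrative call only; the paths are assumed, not taken from the project.
    train_word2vec(
        input_file="wiki_preprocessed.txt",  # pre-processed wiki dump (assumed path)
        output_file="wiki_w2v.model",        # where the model is saved (assumed path)
        skipgram=1,                          # 1 = skip-gram, 0 = CBOW
        loss=0,                              # 0 = negative sampling, 1 = hierarchical softmax
        size=300,                            # embedding size in the 100-300 range
        epochs=5,
    )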
Example #4
Source File: create_word2vec.py From dutchembeddings with GNU General Public License v2.0 | 6 votes |
def create(basedir, num_workers=12, size=320, threshold=5):
    """
    Creates a word2vec model using the Gensim word2vec implementation.

    :param basedir: the dir from which to get the documents.
    :param num_workers: the number of workers to use for training word2vec
    :param size: the size of the resulting vectors.
    :param threshold: the frequency threshold.
    :return: the model.
    """
    logging.basicConfig(level=logging.INFO)

    sentences = SentenceIter(root=basedir)
    model = Word2Vec(sentences=sentences,
                     sg=True,
                     size=size,
                     workers=num_workers,
                     min_count=threshold,
                     window=11,
                     negative=15)
    model.save_word2vec_format("{0}-{1}.wordvecs", "{0}-{1}.vocab")
    return model
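A hedged usage sketch of create(); the directory name is an assumption, and SentenceIter is expected to yield tokenised sentences from the documents under that directory:

    # Illustrative call; "corpus_dir" is an assumed path.
    model = create(basedir="corpus_dir", num_workers=4, size=320, threshold=5)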
Example #5
Source File: diffusion_2_vec.py From diff2vec with GNU General Public License v3.0 | 6 votes |
def learn_pooled_embeddings(walks, counts, args):
    """
    Method to learn an embedding given the sequences and arguments.

    :param walks: Linear vertex sequences.
    :param counts: Number of nodes.
    :param args: Arguments.
    """
    model = Word2Vec(walks,
                     size=args.dimensions,
                     window=args.window_size,
                     min_count=1,
                     sg=1,
                     workers=args.workers,
                     iter=args.iter,
                     alpha=args.alpha)
    save_embedding(args, model, counts)
Example #6
Source File: metapath2vec.py From cogdl with MIT License | 6 votes |
def train(self, G, node_type):
    self.G = G
    self.node_type = [str(a) for a in node_type]
    walks = self._simulate_walks(self.walk_length, self.walk_num, self.schema)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
    return embeddings
Example #7
Source File: lex_sem_ft.py From DL-text with MIT License | 6 votes |
def sum_trigram(sent, model):
    sent = sent.split()
    first = True
    second = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None, None][sent[i]]
                first = False
            elif second:
                tot += model[None, sent[i-1]][sent[i]]
                second = False
            else:
                tot += model[sent[i-2], sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Word2Vec Training (Returns Vector):
Example #8
Source File: node2vec.py From cogdl with MIT License | 6 votes |
def train(self, G):
    self.G = G
    is_directed = nx.is_directed(self.G)
    for i, j in G.edges():
        G[i][j]["weight"] = G[i][j].get("weight", 1.0)
        if not is_directed:
            G[j][i]["weight"] = G[j][i].get("weight", 1.0)
    self._preprocess_transition_probs()
    walks = self._simulate_walks(self.walk_num, self.walk_length)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    self.embeddings = np.asarray(
        [model.wv[str(id2node[i])] for i in range(len(id2node))]
    )
    return self.embeddings
Example #9
Source File: node2vec.py From entity2vec with Apache License 2.0 | 6 votes |
def learn_embeddings(self, output, output_format='binary'):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks

    model = Word2Vec(self._walks,
                     size=self.dimensions,
                     window=self.window_size,
                     min_count=0,
                     workers=self.workers,
                     iter=self.iter,
                     negative=25,
                     sg=1)

    print("defined model using w2v")

    is_binary = output_format != 'text'
    model.wv.save_word2vec_format(output, binary=is_binary)

    actual_format = 'text' if output_format == 'text' else 'binary'
    print("saved model in word2vec %s format" % actual_format)

    return
Example #10
Source File: deepwalk.py From cogdl with MIT License | 6 votes |
def train(self, G):
    self.G = G
    walks = self._simulate_walks(self.walk_length, self.walk_num)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
    return embeddings
Example #11
Source File: keyword_word2vec.py From nlg-yongzhuo with MIT License | 6 votes |
def train_word2vec_by_word():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"
    print(multiprocessing.cpu_count())
    model = Word2Vec(LineSentence(inp), size=300, window=10,
                     # skip-gram with hierarchical softmax is used here
                     min_count=1, sg=1, hs=1, iter=10,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
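As an illustrative follow-up (not part of the original project), the text-format vectors saved above can be reloaded with gensim's KeyedVectors and queried for nearest neighbours; the query word is an assumption about the trained vocabulary:

    from gensim.models import KeyedVectors

    # Load the plain-text vectors written by train_word2vec_by_word() above.
    wv = KeyedVectors.load_word2vec_format("w2v_model_wiki_word.vec", binary=False)
    print(wv.most_similar("中国", topn=5))  # example query word (assumed to be in the vocabulary)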
Example #12
Source File: walkers.py From Splitter with GNU General Public License v3.0 | 6 votes |
def learn_base_embedding(self):
    """
    Learning an embedding of nodes in the base graph.

    :return self.embedding: Embedding of nodes in the latent space.
    """
    self.paths = [[str(node) for node in walk] for walk in self.paths]

    model = Word2Vec(self.paths,
                     size=self.args.dimensions,
                     window=self.args.window_size,
                     min_count=1,
                     sg=1,
                     workers=self.args.workers,
                     iter=1)

    self.embedding = np.array([list(model[str(n)]) for n in self.graph.nodes()])
    return self.embedding
Example #13
Source File: node2vec.py From entity2rec with Apache License 2.0 | 6 votes |
def learn_embeddings(self, output):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    walks = self._simulate_walks()  # simulate random walks

    model = Word2Vec(walks,
                     size=self.dimensions,
                     window=self.window_size,
                     min_count=0,
                     workers=self.workers,
                     iter=self.iter,
                     negative=25,
                     sg=1)

    print("defined model using w2v")

    model.wv.save_word2vec_format(output, binary=True)

    # free memory
    del walks
    self.alias_nodes = None
    self.alias_edges = None
    self.G = None

    print("saved model in word2vec binary format")

    return
Example #14
Source File: deepwalk.py From GraphEmbedding with MIT License | 6 votes |
def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

    kwargs["sentences"] = self.sentences
    kwargs["min_count"] = kwargs.get("min_count", 0)
    kwargs["size"] = embed_size
    kwargs["sg"] = 1  # skip-gram
    kwargs["hs"] = 1  # DeepWalk uses Hierarchical Softmax
    kwargs["workers"] = workers
    kwargs["window"] = window_size
    kwargs["iter"] = iter

    print("Learning embedding vectors...")
    model = Word2Vec(**kwargs)
    print("Learning embedding vectors done!")

    self.w2v_model = model
    return model
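Stripped of the kwargs plumbing, the call above boils down to a direct Word2Vec invocation along these lines; this is a sketch, and walk_corpus is an assumed stand-in for self.sentences (the random-walk corpus):

    # Equivalent direct call (sketch): skip-gram with hierarchical softmax, as DeepWalk uses.
    model = Word2Vec(sentences=walk_corpus, size=128, window=5, min_count=0,
                     sg=1, hs=1, workers=3, iter=5)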
Example #15
Source File: graph2vec.py From PyTorchText with MIT License | 6 votes |
def train_save(self, list_csv):
    sentences = MySentences(list_csv)
    num_features = 256
    min_word_count = 1
    num_workers = 20
    context = 5
    epoch = 20
    sample = 1e-5

    model = Word2Vec(
        sentences,
        size=num_features,
        min_count=min_word_count,
        workers=num_workers,
        sample=sample,
        window=context,
        iter=epoch,
    )
    # model.save(model_fn)
    return model
Example #16
Source File: lex_sem_ft.py From DeepLearn with MIT License | 6 votes |
def sum_trigram(sent, model):
    sent = sent.split()
    first = True
    second = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None, None][sent[i]]
                first = False
            elif second:
                tot += model[None, sent[i-1]][sent[i]]
                second = False
            else:
                tot += model[sent[i-2], sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Word2Vec Training (Returns Vector):
Example #17
Source File: deepwalk.py From CogDL-TensorFlow with MIT License | 6 votes |
def train(self, G):
    self.G = G
    walks = self._simulate_walks(self.walk_length, self.walk_num)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    embeddings = np.asarray([model[str(id2node[i])] for i in range(len(id2node))])
    return embeddings
Example #18
Source File: baseline.py From HARP with MIT License | 6 votes |
def skipgram_baseline(graph, **kwargs):
    scale = kwargs.get('scale', -1)
    representation_size = kwargs.get('representation_size', 128)

    if scale == 1:
        edges, weights = graph.get_edges()
    else:
        path_length = kwargs.get('path_length', 40)
        num_paths = kwargs.get('num_paths', 80)
        output = kwargs.get('output', 'default')
        edges = graph_coarsening.build_deepwalk_corpus(graph, num_paths, path_length, output)

    if kwargs['hs'] == 0:
        print('Training the Negative Sampling Model...')
        model = Word2Vec(edges, size=representation_size, window=kwargs['window_size'],
                         min_count=0, sg=1, hs=0, iter=kwargs['iter_count'],
                         negative=5, workers=20)
    else:
        print('Training the Hierarchical Softmax Model...')
        model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'],
                         min_count=0, sg=1, hs=1, iter=kwargs['iter_count'], workers=20)

    print('Finish training the Skip-gram model.')
    return model
Example #19
Source File: node2vec.py From CogDL-TensorFlow with MIT License | 6 votes |
def train(self, G):
    self.G = G
    is_directed = nx.is_directed(self.G)
    for i, j in G.edges():
        G[i][j]["weight"] = G[i][j].get("weight", 1.0)
        if not is_directed:
            G[j][i]["weight"] = G[j][i].get("weight", 1.0)
    self._preprocess_transition_probs()
    walks = self._simulate_walks(self.walk_num, self.walk_length)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    self.embeddings = np.asarray(
        [model[str(id2node[i])] for i in range(len(id2node))]
    )
    return self.embeddings
Example #20
Source File: test_average.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 6 votes |
def test_cy_equal_np_w2v_random(self):
    w2v = Word2Vec(min_count=1, size=DIM)  # Random initialization
    w2v.build_vocab(SENTENCES)

    m1 = Average(w2v)
    m1.prep.prepare_vectors(
        sv=m1.sv, total_sentences=len(self.sentences), update=False
    )
    m1._pre_train_calls()
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

    m2 = Average(w2v)
    m2.prep.prepare_vectors(
        sv=m2.sv, total_sentences=len(self.sentences), update=False
    )
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()

    from fse.models.average_inner import train_average_cy
    o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

    self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
Example #21
Source File: postprocessing.py From vec4ir with MIT License | 6 votes |
def uptrain(corpus, model_path=None, binary=True, lockf=0.0,
            min_count=1, size=300, **word2vec_params):
    wv = Word2Vec(min_count=min_count, size=size, **word2vec_params)

    print("Building vocabulary...")
    wv.build_vocab(corpus)
    print("Found %d distinct words." % len(wv.index2word))

    if model_path is not None:
        print("Intersecting with", model_path, "...")
        wv.intersect_word2vec_format(model_path, binary=binary, lockf=lockf)
        print("Intersected vectors locked with", lockf)

    total_examples = len(corpus)
    print("Training on %d documents..." % total_examples)
    wv.train(corpus, total_examples=total_examples)

    return wv
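A hedged usage sketch of uptrain(); the pretrained model path is an assumption, and the corpus must be a list of token lists because len(corpus) is passed as total_examples. Setting lockf=1.0 leaves the intersected vectors free to be updated during training, whereas the default 0.0 keeps them frozen:

    # Illustrative only; "pretrained_vectors.bin" is an assumed path.
    corpus = [["the", "quick", "brown", "fox"], ["lazy", "dogs", "sleep"]]
    model = uptrain(corpus, model_path="pretrained_vectors.bin", binary=True,
                    lockf=1.0, min_count=1, size=300)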
Example #22
Source File: test_sif.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_broken_vocab(self):
    w2v = Word2Vec(min_count=1, size=DIM)
    w2v.build_vocab([l.split() for l in open(CORPUS, "r")])
    for k in w2v.wv.vocab:
        w2v.wv.vocab[k].count = np.nan

    model = SIF(w2v)
    with self.assertRaises(RuntimeError):
        model.train(self.sentences)
Example #23
Source File: pretrain_embedding.py From tf_CFO with MIT License | 5 votes |
def train(data_path, save_dir):
    sentences = []
    data_files = [os.path.join(os.path.dirname(data_path), file) for file in os.listdir(data_path)]
    for data_file in data_files:
        with open(data_file, 'r') as reader:
            for line in reader:
                question = line.strip().split('\t')[-1].lower()
                sentences.append(nltk.word_tokenize(question))

    model = Word2Vec(sentences, size=300, min_count=1, window=5, sg=1, iter=10)
    weights = model.wv.syn0
    d = dict([(k, v.index) for k, v in model.wv.vocab.items()])

    embeddings_index = {}
    for item in d:
        embeddings_index[item] = weights[d[item], :]
    pickle_save(embeddings_index, os.path.join(save_dir, 'fb_word2vec_300d.pkl'))

    word2idx = {}
    for idx, word in enumerate(embeddings_index.keys()):
        word2idx[word] = idx + 1  # index 0 refers to unknown token
    pickle_save(word2idx, os.path.join(save_dir, 'fb_word2idx.pkl'))

    char2idx = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9,
                'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17,
                'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25,
                'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33,
                '7': 34, '8': 35, '9': 36}
    pickle_save(char2idx, os.path.join(save_dir, 'fb_char2idx.pkl'))
Example #24
Source File: __main__.py From GraphEmbeddingRecommendationSystem with MIT License | 5 votes |
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # print(args)

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # YOUR CODE HERE
    # print(args.number_walks)
    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0, rand=random.Random(0))
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length,
                                       alpha=0, rand=random.Random(0))
    print len(walk)
    model = Word2Vec(walk, size=args.representation_size, window=args.window_size,
                     min_count=0, workers=args.workers)
    print model
    # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    # print(groundtruth)
    pr = [predict_rating(model, nodedict, "u" + g[0], "m" + g[1]) for g in groundtruth]
    # print(pr)
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print cm
Example #25
Source File: link_prediction.py From node2vec_linkprediction with MIT License | 5 votes |
def learn_embeddings(self, walks, dimensions, window_size=10, niter=5):
    '''
    Learn embeddings by optimizing the Skipgram objective using SGD.
    '''
    # TODO: Python27 only
    walks = [map(str, walk) for walk in walks]
    model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0,
                     sg=1, workers=self.workers, iter=niter)
    self.wvecs = model.wv
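The TODO above exists because map() returns a one-shot iterator on Python 3, so each walk would be exhausted after the vocabulary-building pass. If a Python 3 port were needed, the usual drop-in replacement for that line would be a list comprehension (a suggestion, not part of the original project):

    # Python-3-safe equivalent of the map(...) line above:
    walks = [[str(node) for node in walk] for walk in walks]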
Example #26
Source File: test_query_expansion.py From vec4ir with MIT License | 5 votes |
def test_embedded_query_expansion():
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    m = 2
    expansion = EmbeddedQueryExpansion(model.wv, m=m)
    expansion.fit(DOCUMENTS)
    query = "surf"
    expanded_query = expansion.transform(query)
    # surf => surf surf Surfing
    print(query, expanded_query, sep='=>')
    assert len(expanded_query.split()) == len(query.split()) + m
Example #27
Source File: test_query_expansion.py From vec4ir with MIT License | 5 votes |
def test_centroid_expansion():
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    m = 2
    expansion = CentroidExpansion(model.wv, m=m)
    expansion.fit(DOCUMENTS)
    query = "surf"
    expanded_query = expansion.transform(query)
    # surf => surf surf Surfing
    print(query, expanded_query, sep='=>')
    assert len(expanded_query.split()) == len(query.split()) + m
Example #28
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_combined():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    wcd = WordCentroidDistance(model.wv)
    tfidf = Tfidf()

    wcd.fit(documents)
    # they can operate on different fields
    tfidf.fit(['fox', 'scientists'])
    match_op = Matching().fit(documents)

    combined = wcd + tfidf ** 2

    retrieval = Retrieval(combined, matching=match_op, labels=[7, 42])
    result = retrieval.query('fox')
    assert result[0] == 7
    result = retrieval.query('scientists')
    assert result[0] == 42

# # PYEMD is required
# def test_wordmovers():
#     model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
#     match_op = Matching()
#     wmd = WordMoversDistance(model.wv)
#     retrieval = Retrieval(wmd, matching=match_op)
#     retrieval.fit(documents)
#     result = retrieval.query('dog')
#     assert result[0] == 0
Example #29
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_word2vec():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    match_op = Matching()
    with pytest.raises(ValueError):
        wcd = WordCentroidDistance(model)
    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(documents)
    result = retrieval.query('dog')
    assert result[0] == 0
Example #30
Source File: train.py From DeepNews with Apache License 2.0 | 5 votes |
def train_word_2_vec(self, model_save_file_name='../../temp_results/word2vec_hindi.txt'):
    model = Word2Vec(LineSentence(self.raw_file_name), size=300,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(model_save_file_name, binary=False)