Python gensim.models.word2vec.LineSentence() Examples
The following are 18 code examples of gensim.models.word2vec.LineSentence(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.word2vec.
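Before the project examples, here is a minimal, self-contained sketch of the pattern they all share. LineSentence streams a plain-text file that holds one pre-tokenized sentence per line, with tokens separated by whitespace. The corpus file and its contents below are invented for illustration; the size=/iter= keyword names match the pre-4.0 gensim API used throughout this page (gensim 4.0 renamed them to vector_size=/epochs=).

from gensim.models.word2vec import LineSentence, Word2Vec

# hypothetical corpus: one whitespace-tokenized sentence per line
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write("the quick brown fox\n")
    f.write("jumps over the lazy dog\n")

sentences = LineSentence("corpus.txt")  # lazily streams the file, line by line
for words in sentences:
    print(words)  # e.g. ['the', 'quick', 'brown', 'fox']

# the iterator can be passed straight to Word2Vec, as the examples below do
model = Word2Vec(sentences, size=50, min_count=1, iter=5)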
Example #1
Source File: build_w2v.py From text-classifier with Apache License 2.0 | 7 votes |
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True)
Example #2
Source File: helpers.py From webvectors with GNU General Public License v3.0 | 6 votes |
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    :param source_file:
    :param outfile:
    :param mincount:
    :param threshold:
    :param scoring:
    :param commonfile:
    :return:
    """
    common = set([word.strip() for word in open(commonfile, 'r').readlines()])
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000,
                                 delimiter=b':::', progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)
    tempfile = open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    tempfile.close()
    return len(bigrams.phrasegrams)
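A hedged usage sketch for bigrammer() above: the paths are placeholders, and common_tagged.txt is assumed to contain one word per line, as the function's open(...).readlines() call implies. Note that with scoring='npmi' the phrase scores lie in [-1, 1], so a threshold like 0.5 is a plausible choice.

# hypothetical invocation; all paths are placeholders
n_bigrams = bigrammer(
    source_file="corpus_tokenized.txt",  # one sentence per line
    outfile="corpus_bigrammed.txt",      # bigrammed text is appended here
    mincount=100,
    threshold=0.5,                       # npmi scores fall in [-1, 1]
    scoring="npmi",
    commonfile="common_tagged.txt",      # stop-words allowed inside phrases
)
print("learned %d bigrams" % n_bigrams)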
Example #3
Source File: pre_train.py From embeddings with Apache License 2.0 | 6 votes |
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_fasttext(args**) -> Takes the input file, the output file and the model
    hyperparameters as arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)
    model = FastText(sentence, sg=skipgram, hs=loss, size=size, alpha=0.05, window=5,
                     min_count=5, min_n=2, max_n=5, workers=3, iter=epochs)
    model.save(output_file)
Example #4
Source File: pre_train.py From embeddings with Apache License 2.0 | 6 votes |
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_word2vec(args**) -> Takes the input file, the output file and the model
    hyperparameters as arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)
    model = Word2Vec(sentence, sg=skipgram, hs=loss, size=size, alpha=0.05, window=5,
                     min_count=5, workers=3, iter=epochs)
    model.save(output_file)
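Given the flag conventions spelled out in the two docstrings above (0 = CBOW / negative sampling, 1 = skip-gram / hierarchical softmax), a call could look like the following sketch; the file names are placeholders:

# hypothetical call: skip-gram with hierarchical softmax, 300-dimensional vectors
train_word2vec(
    input_file="wiki_preprocessed.txt",  # one tokenized sentence per line
    output_file="wiki_word2vec.model",
    skipgram=1,  # 1 = skip-gram, 0 = CBOW
    loss=1,      # 1 = hierarchical softmax, 0 = negative sampling
    size=300,
    epochs=5,
)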
Example #5
Source File: keyword_word2vec.py From nlg-yongzhuo with MIT License | 6 votes |
def train_word2vec_by_word():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"
    print(multiprocessing.cpu_count())
    model = Word2Vec(LineSentence(inp), size=300, window=10,
                     min_count=1, sg=1, hs=1, iter=10,  # skip-gram with hierarchical softmax
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
Example #6
Source File: train.py From DeepNews with Apache License 2.0 | 5 votes |
def train_word_2_vec(self, model_save_file_name='../../temp_results/word2vec_hindi.txt'):
    model = Word2Vec(LineSentence(self.raw_file_name), size=300,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(model_save_file_name, binary=False)
Example #7
Source File: word2vec_vector.py From nlp_xiaojiang with MIT License | 5 votes |
def train_word2vec_by_char():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_char.vec"
    model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
Example #8
Source File: train_vectors.py From Blackstone with Apache License 2.0 | 5 votes |
def compute_vectors(input_path: Path, output_path: Path):
    """
    Builds word embeddings using gensim Word2Vec. This function takes
    a file containing single sentences per line and writes the computed
    vectors in text format to the specified output path.
    """
    print(f"Processing {input_path}")
    sentences = LineSentence(input_path)
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(
        bigram_transformer[sentences], size=150, window=5, min_count=5, workers=4
    )
    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(output_path, binary=False)
Example #9
Source File: word2vec_helpers.py From DetectMaliciousURL with Apache License 2.0 | 5 votes |
def generate_word2vec_files(input_file, output_model_file, output_vector_file,
                            size=128, window=5, min_count=5):
    start_time = time.time()

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model = Word2Vec(LineSentence(input_file), size=size, window=window,
                     min_count=min_count, workers=multiprocessing.cpu_count())
    model.save(output_model_file)
    model.wv.save_word2vec_format(output_vector_file, binary=False)

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))
Example #10
Source File: class_w2v.py From 2016_CCFsougou2 with MIT License | 5 votes |
def train_w2v(self, filename):
    """
    Train the w2v model.
    :param filename: path
    :return: none
    """
    sentences = word2vec.LineSentence(filename)  # load the corpus; expects one text per line
    print('training w2v on corpus:', str(filename))
    print('size is:', self.size)
    # train the model; the window parameter affects results, typical values are 5-100
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
    savepath = '20w_size_win100_' + str(self.size) + '.model'  # path where the model is saved
    print('training finished, saved to:', savepath)
    model.save(savepath)
Example #11
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    with utils.smart_open(datapath('lee_background.cor')) as orig:
        sentences = word2vec.LineSentence(datapath('lee_background.cor'))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #12
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #13
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
            sentences = word2vec.LineSentence(fin)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators
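Taken together, the three tests above show the input types LineSentence accepts: a filename, an open file object, or a compressed file object. A compact sketch of the three call styles, with a placeholder path:

import bz2
from gensim.models.word2vec import LineSentence

sentences = LineSentence("corpus.txt")                    # plain filename
with open("corpus.txt", "rb") as fin:
    sentences = LineSentence(fin)                         # open file object
sentences = LineSentence(bz2.BZ2File("corpus.txt.bz2"))   # compressed file object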
Example #14
Source File: class_w2v.py From 2016CCF_BDCI_Sougou with MIT License | 5 votes |
def train_w2v(self, filename):
    """
    Train the w2v model.
    :param filename: path
    :return: none
    """
    sentences = word2vec.LineSentence(filename)  # load the corpus; expects one text per line
    print('training w2v on corpus:', str(filename))
    print('size is:', self.size)
    # train the model; the window parameter affects results, typical values are 5-100
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
    savepath = '20w_size_win100_' + str(self.size) + '.model'  # path where the model is saved
    print('training finished, saved to:', savepath)
    model.save(savepath)
Example #15
Source File: preprocess.py From blstm-cws with MIT License | 5 votes |
def gen_embeddings(in_file, out_file, size=100):
    corpus = LineSentence(in_file)
    model = Word2Vec(
        sentences=corpus, size=size, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
        trim_rule=None, sorted_vocab=1
    )
    model.save_word2vec_format(out_file, binary=False)
Example #16
Source File: train_word2vec_model.py From linguistic-style-transfer with Apache License 2.0 | 5 votes |
def train_word2vec_model(text_file_path, model_file_path):
    # define training data and train the model
    logger.info("Loading input file and training model ...")
    model = Word2Vec(sentences=LineSentence(text_file_path), min_count=1,
                     size=global_config.embedding_size)
    # summarize the loaded model
    logger.info("Model Details: {}".format(model))
    # save model
    model.wv.save_word2vec_format(model_file_path, binary=True)
    logger.info("Model saved")
Example #17
Source File: train.py From word2vec-tutorial with MIT License | 5 votes |
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    sentences = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, size=250)

    # save the model for later use
    model.save("word2vec.model")

    # how to load the model later:
    # model = word2vec.Word2Vec.load("your_model_name")
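The commented-out last line above hints at reloading the saved model; a hedged sketch of loading and querying it (the query word is an arbitrary placeholder and must exist in the trained vocabulary):

from gensim.models import word2vec

model = word2vec.Word2Vec.load("word2vec.model")
# pre-4.0 gensim exposes similarity queries through model.wv
for candidate, score in model.wv.most_similar("電腦", topn=5):  # arbitrary query word
    print(candidate, score)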
Example #18
Source File: class_w2v.py From 2016CCF-sougou with Apache License 2.0 | 5 votes |
def train_w2v(self, filename):
    """
    Train the w2v model.
    :param filename: path
    :return: none
    """
    sentences = word2vec.LineSentence(filename)  # load the corpus; expects one text per line
    print('training w2v on corpus:', str(filename))
    print('size is:', self.size)
    # train the model; the window parameter affects results, typical values are 5-100
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
    savepath = '20w_size_win100_' + str(self.size) + '.model'  # path where the model is saved
    print('training finished, saved to:', savepath)
    model.save(savepath)