Python gensim.models.word2vec.LineSentence() Examples
The following are 18 code examples of gensim.models.word2vec.LineSentence(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.word2vec.
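Before the project examples, here is a minimal, self-contained sketch of the pattern they all share. LineSentence streams a plain-text file that holds one pre-tokenized sentence per line, with tokens separated by whitespace. The corpus file and its contents below are invented for illustration; the size=/iter= keyword names match the pre-4.0 gensim API used throughout this page (gensim 4.0 renamed them to vector_size=/epochs=).

from gensim.models.word2vec import LineSentence, Word2Vec

# hypothetical corpus: one whitespace-tokenized sentence per line
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write("the quick brown fox\n")
    f.write("jumps over the lazy dog\n")

sentences = LineSentence("corpus.txt")  # lazily streams the file, line by line
for words in sentences:
    print(words)  # e.g. ['the', 'quick', 'brown', 'fox']

# the iterator can be passed straight to Word2Vec, as the examples below do
model = Word2Vec(sentences, size=50, min_count=1, iter=5)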
Example #1
Source File: build_w2v.py From text-classifier with Apache License 2.0 | 7 votes |
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True)
Example #2
Source File: helpers.py From webvectors with GNU General Public License v3.0 | 6 votes |
def bigrammer(source_file, outfile, mincount=100, threshold=0.99, scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    :param source_file:
    :param outfile:
    :param mincount:
    :param threshold:
    :param scoring:
    :param commonfile:
    :return:
    """
    common = set([word.strip() for word in open(commonfile, 'r').readlines()])
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount, threshold=threshold,
                                 scoring=scoring, max_vocab_size=400000000,
                                 delimiter=b':::', progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)
    tempfile = open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    tempfile.close()
    return len(bigrams.phrasegrams)
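A hedged usage sketch for bigrammer() above: the paths are placeholders, and common_tagged.txt is assumed to contain one word per line, as the function's open(...).readlines() call implies. Note that with scoring='npmi' the phrase scores lie in [-1, 1], so a threshold like 0.5 is a plausible choice.

# hypothetical invocation; all paths are placeholders
n_bigrams = bigrammer(
    source_file="corpus_tokenized.txt",  # one sentence per line
    outfile="corpus_bigrammed.txt",      # bigrammed text is appended here
    mincount=100,
    threshold=0.5,                       # npmi scores fall in [-1, 1]
    scoring="npmi",
    commonfile="common_tagged.txt",      # stop-words allowed inside phrases
)
print("learned %d bigrams" % n_bigrams)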
Example #3
Source File: pre_train.py From embeddings with Apache License 2.0 | 6 votes |
def train_fasttext(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_fasttext(args**) -> Takes the input file, the output file and the model
    hyperparameters as arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)
    model = FastText(sentence, sg=skipgram, hs=loss, size=size, alpha=0.05, window=5,
                     min_count=5, min_n=2, max_n=5, workers=3, iter=epochs)
    model.save(output_file)
Example #4
Source File: pre_train.py From embeddings with Apache License 2.0 | 6 votes |
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_word2vec(args**) -> Takes the input file, the output file and the model
    hyperparameters as arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)
    model = Word2Vec(sentence, sg=skipgram, hs=loss, size=size, alpha=0.05, window=5,
                     min_count=5, workers=3, iter=epochs)
    model.save(output_file)
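Given the flag conventions spelled out in the two docstrings above (0 = CBOW / negative sampling, 1 = skip-gram / hierarchical softmax), a call could look like the following sketch; the file names are placeholders:

# hypothetical call: skip-gram with hierarchical softmax, 300-dimensional vectors
train_word2vec(
    input_file="wiki_preprocessed.txt",  # one tokenized sentence per line
    output_file="wiki_word2vec.model",
    skipgram=1,  # 1 = skip-gram, 0 = CBOW
    loss=1,      # 1 = hierarchical softmax, 0 = negative sampling
    size=300,
    epochs=5,
)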
Example #5
Source File: keyword_word2vec.py From nlg-yongzhuo with MIT License | 6 votes |
def train_word2vec_by_word():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"
    print(multiprocessing.cpu_count())
    model = Word2Vec(LineSentence(inp), size=300, window=10,
                     min_count=1, sg=1, hs=1, iter=10,  # skip-gram with hierarchical softmax
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
Example #6
Source File: train.py From DeepNews with Apache License 2.0 | 5 votes |
def train_word_2_vec(self, model_save_file_name='../../temp_results/word2vec_hindi.txt'):
    model = Word2Vec(LineSentence(self.raw_file_name), size=300,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(model_save_file_name, binary=False)
Example #7
Source File: word2vec_vector.py From nlp_xiaojiang with MIT License | 5 votes |
def train_word2vec_by_char():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_char.vec"
    model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
Example #8
Source File: train_vectors.py From Blackstone with Apache License 2.0 | 5 votes |
def compute_vectors(input_path: Path, output_path: Path):
    """
    Builds word embeddings using gensim Word2Vec. This function takes
    a file containing single sentences per line and writes the computed
    vectors in text format to the specified output path.
    """
    print(f"Processing {input_path}")
    sentences = LineSentence(input_path)
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(
        bigram_transformer[sentences], size=150, window=5, min_count=5, workers=4
    )
    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(output_path, binary=False)
Example #9
Source File: word2vec_helpers.py From DetectMaliciousURL with Apache License 2.0 | 5 votes |
def generate_word2vec_files(input_file, output_model_file, output_vector_file,
                            size=128, window=5, min_count=5):
    start_time = time.time()

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model = Word2Vec(LineSentence(input_file), size=size, window=window,
                     min_count=min_count, workers=multiprocessing.cpu_count())
    model.save(output_model_file)
    model.wv.save_word2vec_format(output_vector_file, binary=False)

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))
Example #10
Source File: class_w2v.py From 2016_CCFsougou2 with MIT License | 5 votes |
def train_w2v(self, filename):
    """
    Train the w2v model.
    :param filename: path
    :return: none
    """
    sentences = word2vec.LineSentence(filename)  # load the corpus; expects one text per line
    print('training w2v on corpus:', str(filename))
    print('size is:', self.size)
    # train the model; the window parameter affects results, typical values are 5-100
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
    savepath = '20w_size_win100_' + str(self.size) + '.model'  # path where the model is saved
    print('training finished, saved to:', savepath)
    model.save(savepath)
Example #11
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    with utils.smart_open(datapath('lee_background.cor')) as orig:
        sentences = word2vec.LineSentence(datapath('lee_background.cor'))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #12
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #13
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
            sentences = word2vec.LineSentence(fin)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators
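Taken together, the three tests above show the input types LineSentence accepts: a filename, an open file object, or a compressed file object. A compact sketch of the three call styles, with a placeholder path:

import bz2
from gensim.models.word2vec import LineSentence

sentences = LineSentence("corpus.txt")                    # plain filename
with open("corpus.txt", "rb") as fin:
    sentences = LineSentence(fin)                         # open file object
sentences = LineSentence(bz2.BZ2File("corpus.txt.bz2"))   # compressed file object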
Example #14
Source File: class_w2v.py From 2016CCF_BDCI_Sougou with MIT License | 5 votes |
def train_w2v(self, filename):
    """
    Train the w2v model.
    :param filename: path
    :return: none
    """
    sentences = word2vec.LineSentence(filename)  # load the corpus; expects one text per line
    print('training w2v on corpus:', str(filename))
    print('size is:', self.size)
    # train the model; the window parameter affects results, typical values are 5-100
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
    savepath = '20w_size_win100_' + str(self.size) + '.model'  # path where the model is saved
    print('training finished, saved to:', savepath)
    model.save(savepath)
Example #15
Source File: preprocess.py From blstm-cws with MIT License | 5 votes |
def gen_embeddings(in_file, out_file, size=100):
    corpus = LineSentence(in_file)
    model = Word2Vec(
        sentences=corpus, size=size, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
        trim_rule=None, sorted_vocab=1
    )
    model.save_word2vec_format(out_file, binary=False)
Example #16
Source File: train_word2vec_model.py From linguistic-style-transfer with Apache License 2.0 | 5 votes |
def train_word2vec_model(text_file_path, model_file_path):
    # define training data and train the model
    logger.info("Loading input file and training model ...")
    model = Word2Vec(sentences=LineSentence(text_file_path), min_count=1,
                     size=global_config.embedding_size)
    # summarize the loaded model
    logger.info("Model Details: {}".format(model))
    # save model
    model.wv.save_word2vec_format(model_file_path, binary=True)
    logger.info("Model saved")
Example #17
Source File: train.py From word2vec-tutorial with MIT License | 5 votes |
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    sentences = word2vec.LineSentence("wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, size=250)

    # save the model for later use
    model.save("word2vec.model")

    # how to load the model later:
    # model = word2vec.Word2Vec.load("your_model_name")
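The commented-out last line above hints at reloading the saved model; a hedged sketch of loading and querying it (the query word is an arbitrary placeholder and must exist in the trained vocabulary):

from gensim.models import word2vec

model = word2vec.Word2Vec.load("word2vec.model")
# pre-4.0 gensim exposes similarity queries through model.wv
for candidate, score in model.wv.most_similar("電腦", topn=5):  # arbitrary query word
    print(candidate, score)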
Example #18
Source File: class_w2v.py From 2016CCF-sougou with Apache License 2.0 | 5 votes |
def train_w2v(self, filename):
    """
    Train the w2v model.
    :param filename: path
    :return: none
    """
    sentences = word2vec.LineSentence(filename)  # load the corpus; expects one text per line
    print('training w2v on corpus:', str(filename))
    print('size is:', self.size)
    # train the model; the window parameter affects results, typical values are 5-100
    model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
    savepath = '20w_size_win100_' + str(self.size) + '.model'  # path where the model is saved
    print('training finished, saved to:', savepath)
    model.save(savepath)