Python gensim.corpora.WikiCorpus() Examples

The following are 8 code examples of gensim.corpora.WikiCorpus(), collected from open-source projects. Each example lists its source file, originating project, and license. You may also want to check out the other functions and classes available in the gensim.corpora module.
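For context, here is a minimal sketch of typical WikiCorpus usage, assuming a downloaded Wikipedia dump (the dump path below is a placeholder):

from gensim.corpora import WikiCorpus

# Parse a Wikipedia XML dump into a streamed corpus; passing dictionary={}
# skips the expensive vocabulary-building pass over the whole dump.
dump_path = "enwiki-latest-pages-articles.xml.bz2"  # placeholder path
wiki = WikiCorpus(dump_path, dictionary={})

# get_texts() streams one tokenized article at a time, so even very
# large dumps never have to fit in memory.
for tokens in wiki.get_texts():
    print(tokens[:10])  # first ten tokens of the first article
    break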
Example #1
Source File: wiki_extract.py    From TaskBot with GNU General Public License v3.0
def wiki_extract(input_file, output_file):
    """wiki下载文件提取文本内容脚本

    :param input_file: 原始文件路径
    :param output_file:  提取文件路径
    :return: None
    """
    # 原始文件是否存在
    assert Path(input_file).resolve().exists()
    # 提取文件路径不存在就新建
    output_file_path = Path(output_file).resolve()
    output_file_path.parent.mkdir(exist_ok=True)
    logger.info("Start extract wiki ..")
    wiki = WikiCorpus(input_file, lemmatize=False)
    with open(output_file, "w", encoding="utf8") as f:
        for i, text in enumerate(wiki.get_texts()):
            f.write(" ".join(text) + "\n")
            if i % 10000 == 0:
                logger.info("Saved %d articles" % i)
    logger.info("Finished extract wiki, Saved in %s" % output_file) 
Example #2
Source File: wiki_to_txt.py    From word2vec-tutorial with MIT License
def main():

    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0

    with io.open("wiki_texts.txt",'w',encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num) 
Example #3
Source File: wiki_to_txt.py    From word2vec-tutorial with MIT License
def main():

    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0

    with open("wiki_texts.txt",'w',encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num) 
Example #4
Source File: train_word2vec.py    From text-classification with Apache License 2.0
def to_text():
    # wiki_corpus = WikiCorpus(config['wiki_raw'], dictionary={})
    # texts_num = 0
    # with open(config['input_raw'], 'w', encoding='utf-8') as output:
    #     for text in wiki_corpus.get_texts():
    #         output.write(' '.join(text) + '\n')
    #         texts_num += 1
    #         if texts_num % 10000 == 0:
    #             logging.info("Parsed %d th articles" % texts_num)

    df = pd.read_csv(os.getcwd() + '/data/financenews/news.csv')
    title = list(df['Title'].values)
    content = list(df['NewsContent'].values)
    raw_text = title + content

    texts_num = 0
    with open(config['input_raw'], 'w', encoding='utf-8') as output:
        for text in raw_text:
            text = str(text)
            output.write(text.strip() + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Parsed %d th articles" % texts_num) 
Example #5
Source File: preprocess.py    From blstm-cws with MIT License
def zhwiki2chars(in_file, out_file):
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    i = 0
    out = open(out_file, 'w', encoding='utf-8')
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
    for article in wiki.get_texts():
        tokens = []
        for token in article:
            token = token.decode("utf-8").strip()
            if _isalpha(token):
                continue
            tokens.append(" ".join(token))  # split the token into space-separated characters
        out.write(" ".join(tokens) + "\n")
        i += 1
        if i % 10000 == 0:
            print("process %d articles" % i)
    out.close() 
Example #6
Source File: make_wikicorpus.py    From wiki-sim-search with MIT License
def formatTime(seconds):
    """
    Takes a number of elapsed seconds and returns a string in the format h:mm.
    """
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return "%d:%02d" % (h, m)
 

# ======== main ========
# Main entry point for the script.
# This little check has to do with the multiprocessing module (which is used
# by WikiCorpus). Without it, the code will spawn infinite processes and hang! 
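The snippet ends before the guard itself; a minimal sketch of the idiom the comment describes (main is a placeholder for the script's actual entry code):

if __name__ == '__main__':
    # Without this guard, each worker process spawned by WikiCorpus would
    # re-import the script and start building the corpus all over again.
    main()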
Example #7
Source File: dump.py    From embedding with MIT License
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(out_f, 'w', encoding="utf-8")
    wiki = WikiCorpus(in_f, tokenizer_func=tokenize, dictionary=Dictionary())
    i = 0
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')  # tokens are already str here
        i += 1
        if i % 10000 == 0:
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!') 
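The tokenize passed as tokenizer_func above is defined elsewhere in the source file. gensim expects a tokenizer_func with the signature (text, token_min_len, token_max_len, lower) returning a list of tokens; a minimal sketch (the whitespace split is an assumption):

def tokenize(text, token_min_len=2, token_max_len=15, lower=True):
    # gensim calls this with each article's plain text plus its
    # token-length bounds and lowercasing flag.
    if lower:
        text = text.lower()
    return [t for t in text.split()
            if token_min_len <= len(t) <= token_max_len]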
Example #8
Source File: preprocess.py    From BERT-pytorch with The Unlicense
def extract_articles_wiki(wiki_raw_path, raw_documents_path, **_):
    wiki_corpus = WikiCorpus(wiki_raw_path, lemmatize=False, dictionary={}, tokenizer_func=tokenize, lower=False)

    with open(raw_documents_path, 'w', encoding='utf-8') as raw_documents_file:
        for text in tqdm(wiki_corpus.get_texts()):
            document = ' '.join(text)
            raw_documents_file.write(document + '\n')