Python jieba.enable_parallel() Examples

The following are 7 code examples of jieba.enable_parallel(), drawn from open-source projects. The original project and source file are listed above each example. You may also want to check out the other available functions and classes of the jieba module.
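
Before the project examples, here is a minimal sketch of switching jieba's parallel mode on and off; the worker count and sample sentence are illustrative only, and parallel mode only pays off on large, multi-line inputs:

import jieba

jieba.enable_parallel(4)  # use 4 worker processes (POSIX systems only)
words = list(jieba.cut("我来到北京清华大学"))  # a common jieba demo sentence
jieba.disable_parallel()  # switch back to single-process segmentation
print(words)
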
Example #1
Source File: text_mining.py    From CloudMusic-Crawler with MIT License
import jieba


def word_segmentation(content, stop_words):

    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content, cut_all=False)

    seg_list = list(seg_list)

    # Remove stop words
    word_list = []
    for word in seg_list:
        if word not in stop_words:
            word_list.append(word)

    # Filter out leftover filler words and spaces
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in user_dict
    word_list = list(filter(filter_space, word_list))

    return word_list

# Word frequency statistics
# Return the top_N values; if top_N is not given, return all values
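
The excerpt stops right before the word-frequency helper that the trailing comment describes. A minimal sketch of such a helper using collections.Counter (the function name and signature are assumptions, not the original code):

from collections import Counter

def word_frequency(word_list, top_N=None):
    # Count occurrences; most_common(None) returns every (word, count) pair
    return Counter(word_list).most_common(top_N)
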
Example #2
Source File: tokenizer.py    From dialogbot with Apache License 2.0
import jieba
from jieba import posseg


def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    Segment an input file line by line and write the result to an output file.
    :param in_file: path of the input text file
    :param out_file: path of the segmented output file
    :param word_sep: separator inserted between tokens
    :param pos_sep: separator between a word and its part-of-speech tag
    :param is_pos: whether to add part-of-speech tags
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
    print("segment ok. input file count:", count) 
Example #3
Source File: tf_idf.py    From SMPCUP2017 with MIT License
import jieba
import jieba.analyse


def tf_idf(texts):
    # Load a custom dictionary, IDF weights, and stop words before keyword extraction
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    # extract_tags already returns a list of keywords for each text
    corpus = [jieba.analyse.extract_tags(s, topK=15) for s in texts]
    return corpus
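
A usage sketch, assuming the ./model resource files referenced above are in place; each entry of the result is the top-15 keyword list for the corresponding text:

keywords = tf_idf(["今天天气很好,适合出去散步", "机器学习是人工智能的一个分支"])
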
Example #4
Source File: Data_analysis.py    From Spider with MIT License
import jieba


def word_segmentation(content, stop_words):
    # Tokenize the text with jieba
    jieba.enable_parallel()
    seg_list = jieba.cut(content)

    seg_list = list(seg_list)

    # Remove stop words, filler words, and spaces
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in stop_words and w not in user_dict
    word_list = list(filter(filter_space, seg_list))

    return word_list

# Convert the Weibo posts stored in the database into a single string
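
The comment above introduces a helper that this excerpt cuts off. A minimal sketch of what it might look like, assuming the posts have already been fetched from the database as a list of strings (the name and signature are hypothetical):

def posts_to_string(posts):
    # Join the text of every Weibo post into one large string for segmentation
    return ' '.join(post.strip() for post in posts)
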
Example #5
Source File: data_helper.py    From AmusingPythonCodes with MIT License
import os
import re

import jieba
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    # Reuse the cached CSV if it already exists; otherwise parse the raw Excel file
    if os.path.exists(save_path):
        data = pd.read_csv(save_path, sep=",", header=0)
    else:
        data = pd.read_excel(file_path, sheet_name="sheet1")
        # Rename the Chinese column headers: 分类 (category) -> label, 正文 (body text) -> text
        data = data.rename(index=str, columns={"分类": "label", "正文": "text"})

        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(lambda x: jieba.cut(x.strip()))
        data["tokens"] = [" ".join(x) for x in data["tokens"]]
        data["tokens"] = data["tokens"].apply(
            lambda x: re.sub(" +", " ", x.strip().replace("\n", " ").replace("\t", " ")))
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")

    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit_transform(data.label.values)

    x_train, x_test, y_train, y_test = train_test_split(data.tokens.values, labels, stratify=labels, random_state=1234,
                                                        test_size=test_size, shuffle=True)

    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]), flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test 
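
A usage sketch with placeholder paths; the Excel file is assumed to have the 分类 and 正文 columns shown above:

x_train, x_test, y_train, y_test = load_chinese_data(
    "data/corpus.xlsx", "data/corpus_tokenized.csv", test_size=0.1)
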
Example #6
Source File: data_preprocess.py    From Neural-Headline-Generator-CN with GNU General Public License v3.0
import jieba


def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    # Register the placeholder tokens so jieba keeps them as single words
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
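
A brief usage sketch; the sentence is illustrative, and the placeholder tokens come back whole because they were added to the dictionary:

tokens = cut("股价上涨了FLOAT个百分点EOS")
print(tokens)  # 'FLOAT' and 'EOS' are kept as single tokens
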
Example #7
Source File: crawl.py    From MillionHeroAssistant with MIT License
import multiprocessing
import platform

import jieba


def jieba_initialize():
    # Parallel mode is not supported on Windows, so enable it only elsewhere
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
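
Note the platform guard around enable_parallel(): jieba's parallel mode is built on Python's multiprocessing and is documented as unsupported on Windows, so this initializer falls back to the default single-process tokenizer there. Calling jieba.initialize() explicitly loads the dictionary up front rather than lazily on the first cut, which keeps the first real query fast.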