Python jieba.enable_parallel() Examples
The following are 7 code examples of jieba.enable_parallel(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module jieba, or try the search function.
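Before the examples, here is a minimal sketch of the typical enable/disable pattern; the sample sentence and the choice of 4 worker processes are only for illustration.

import jieba

# Switch jieba into parallel mode with 4 worker processes.
# Parallel mode is built on multiprocessing and is not supported on Windows.
jieba.enable_parallel(4)

words = jieba.lcut("我来到北京清华大学")  # sample sentence, illustration only
print(words)

# Return to single-process segmentation.
jieba.disable_parallel()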
Example #1
Source File: text_mining.py (from CloudMusic-Crawler, MIT License) | 6 votes
def word_segmentation(content, stop_words):
    # Segment the text with jieba (parallel mode enabled)
    jieba.enable_parallel()
    seg_list = jieba.cut(content, cut_all=False)
    seg_list = list(seg_list)

    # Remove stop words
    word_list = []
    for word in seg_list:
        if word not in stop_words:
            word_list.append(word)

    # Filter out leftover words and spaces
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in user_dict
    word_list = list(filter(filter_space, word_list))

    return word_list

# Word frequency count
# Returns the top_N values; if top_N is not given, returns all values
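A possible way to call word_segmentation above; the stop-word set and the input text are made-up placeholders.

import jieba

stop_words = {'的', '了', '是'}   # hypothetical stop-word set
content = "今天的天气真的很不错"     # hypothetical input text

print(word_segmentation(content, stop_words))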
Example #2
Source File: tokenizer.py (from dialogbot, Apache License 2.0) | 5 votes
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    segment input file to output file
    :param in_file:
    :param out_file:
    :param word_sep:
    :param pos_sep:
    :param is_pos: whether part-of-speech tagging is needed
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
        print("segment ok. input file count:", count)
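A hypothetical round trip for segment_file above: write a one-line input file, segment it with part-of-speech tags, and read the result back (assumes import jieba and import jieba.posseg as posseg at module level).

with open("demo_in.txt", "w", encoding="utf-8") as f:
    f.write("我来到北京清华大学\n")

segment_file("demo_in.txt", "demo_out.txt", is_pos=True)

with open("demo_out.txt", encoding="utf-8") as f:
    print(f.read())  # e.g. 我/r 来到/v 北京/ns 清华大学/nt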
Example #3
Source File: tf_idf.py (from SMPCUP2017, MIT License) | 5 votes
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus
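The single-argument filter(...) call above presumably refers to a helper defined elsewhere in that project; the built-in filter takes a function and an iterable. A stripped-down keyword-extraction sketch that does not depend on the project's model files might look like this; the sample texts are made up.

import jieba
import jieba.analyse

texts = ["今天的天气真的很不错", "我来到北京清华大学"]  # hypothetical corpus

jieba.enable_parallel(8)  # parallel mode, as in the original example
corpus = [jieba.analyse.extract_tags(s, topK=15) for s in texts]
jieba.disable_parallel()

print(corpus)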
Example #4
Source File: Data_analysis.py (from Spider, MIT License) | 5 votes
def word_segmentation(content, stop_words):
    # Segment the text with jieba (parallel mode enabled)
    jieba.enable_parallel()
    seg_list = jieba.cut(content)
    seg_list = list(seg_list)

    # Remove stop words, leftover words, and spaces
    user_dict = [' ', '哒']
    filter_space = lambda w: w not in stop_words and w not in user_dict
    word_list = list(filter(filter_space, seg_list))

    return word_list

# Convert the Weibo posts stored in the database into a string
Example #5
Source File: data_helper.py (from AmusingPythonCodes, MIT License) | 5 votes
def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    if os.path.exists(save_path):
        data = pd.read_csv(save_path, sep=",", header=0)
    else:
        data = pd.read_excel(file_path, sheet_name="sheet1")
        data = data.rename(index=str, columns={"分类": "label", "正文": "text"})
        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(lambda x: jieba.cut(x.strip()))
        data["tokens"] = [" ".join(x) for x in data["tokens"]]
        data["tokens"] = data["tokens"].apply(
            lambda x: re.sub(" +", " ", x.strip().replace("\n", " ").replace("\t", " ")))
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")

    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit_transform(data.label.values)
    x_train, x_test, y_train, y_test = train_test_split(data.tokens.values, labels, stratify=labels,
                                                        random_state=1234, test_size=test_size, shuffle=True)

    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]), flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test
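A hypothetical invocation of load_chinese_data above; the file names are placeholders, and the raw Excel file is assumed to contain the 分类 and 正文 columns that the function renames.

x_train, x_test, y_train, y_test = load_chinese_data(
    file_path="corpus.xlsx",        # raw data, hypothetical path
    save_path="corpus_tokens.csv",  # cached tokenized copy, hypothetical path
    test_size=0.1,
)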
Example #6
Source File: data_preprocess.py (from Neural-Headline-Generator-CN, GNU General Public License v3.0) | 5 votes
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
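A hypothetical call to cut above; registering the custom words in the dictionary keeps placeholder tokens such as FLOAT and EOS as single tokens in the output.

tokens = cut("股价上涨了 FLOAT 个百分点 EOS")  # hypothetical input text
print(tokens)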
Example #7
Source File: crawl.py (from MillionHeroAssistant, MIT License) | 5 votes
def jieba_initialize():
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
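The platform check above reflects a documented limitation: jieba's parallel mode is built on the standard-library multiprocessing module and is not supported on Windows, so this example enables it only on other platforms and otherwise falls back to ordinary single-process segmentation.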