Python jieba.cut() Examples
The following are 30 code examples of jieba.cut(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all other available functions and classes of the jieba module.
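Before diving into the project code, here is a minimal, self-contained sketch of the API every example below relies on (the sentence is the one used in jieba's own README): jieba.cut() returns a generator of tokens, cut_all=True switches to full mode, and jieba.lcut() is a convenience wrapper that returns a list.

import jieba

sentence = "我来到北京清华大学"

# accurate mode (the default): a generator of non-overlapping segments
print("/".join(jieba.cut(sentence, cut_all=False)))

# full mode: every word the dictionary can find, segments may overlap
print("/".join(jieba.cut(sentence, cut_all=True)))

# lcut() returns a plain list instead of a generator
print(jieba.lcut(sentence))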
Example #1
Source File: run.py From TiebaTool with MIT License | 8 votes |
def calculate_similarity(text1, text2):
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word], 2)
        for word in raw2:
            mod2 += math.pow(raw2[word], 2)
        cos = dot_product / math.sqrt(mod1 * mod2)
    else:
        cos = 0
    return cos
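A minimal usage sketch for the function above. The two sentences are made up for illustration, and the snippet assumes the imports Example #1 relies on (jieba, math, collections.Counter) plus the calculate_similarity definition itself.

import math                      # assumed imports from Example #1
import jieba
from collections import Counter

# calculate_similarity as defined above must be in scope
score = calculate_similarity("今天天气很好", "今天的天气真不错")
print(score)  # bag-of-words cosine similarity, between 0 and 1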
Example #2
Source File: Bayes.py From weiboanalysis with Apache License 2.0 | 7 votes |
def loadDataSet(path):  # return the segmented words and label of each Weibo post
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()   # the post text
                label.append(int(temp[:2]))    # the annotation label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    # note: re.L with a str pattern raises ValueError on Python 3.6+;
                    # Example #4 below uses a bytes pattern to avoid this
                    p = re.compile(r'\w', re.L)
                    result = p.sub("", word)
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b') - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue
    return line_cut, label  # the segmented words and label of each post
Example #3
Source File: make_handcrafted_33_features.py From wsdm19cup with MIT License | 7 votes |
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
Example #4
Source File: Bayes.py From weiboanalysis with Apache License 2.0 | 7 votes |
def loadDataSet(path):  # return the segmented words and label of each Weibo post
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()   # the post text
                label.append(int(temp[:2]))    # the annotation label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    p = re.compile(b'\w', re.L)
                    result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b') - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue
    return line_cut, label  # the segmented words and label of each post
Example #5
Source File: engine_for_mlp.py From text-antispam with MIT License | 6 votes |
def text_tensor(text, wv):
    """Build the text vector.

    Args:
        text: the text to check
        wv: the word-vector model
    Returns:
        [[ 3.80905056   1.94315064  -0.20703495  -1.31589055   1.9627794
           ...
           2.16935492   2.95426321  -4.71534014  -3.25034237 -11.28901672]]
    """
    text = tr.extractWords(text)
    words = jieba.cut(text.strip())
    text_embedding = np.zeros(200, dtype=np.float32)
    for word in words:
        try:
            text_embedding += wv[word]
        except KeyError:
            text_embedding += wv['UNK']
    sample = text_embedding.reshape(1, 200)
    return sample
Example #6
Source File: get_topic.py From poem_generator with Apache License 2.0 | 6 votes |
def read_data(fin):
    poem_words = list()
    title_flag = False
    title = ''
    fd = codecs.open(fin, 'r', 'utf-8')
    for line in fd:
        line = line.strip()
        line = reg_sep.sub(' ', line)
        title_flag = not title_flag
        if title_flag:
            title = line
        else:
            words = ' '.join(jieba.cut(title + line))
            poem_words.append(words)
    fd.close()
    print('Read data done.')
    return poem_words
Example #7
Source File: qaData.py From QA with GNU General Public License v3.0 | 6 votes |
def sentenceToIndex(sentence, word2idx, maxLen):
    """Segment a sentence and convert it to a list of indices into the embedding table.

    :param sentence: the sentence
    :param word2idx: word-to-index mapping
    :param maxLen: maximum sentence length
    :return: the sentence as a list of word-vector indices
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match(r"\d+", word):
                index[i] = num
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index
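A small, hypothetical call to the function above with a toy word2idx; it assumes `import re` and `import jieba` as in Example #7. The exact indices depend on how jieba segments the sentence, so no specific output is shown.

import re
import jieba

# toy vocabulary; real code would build this from the embedding matrix
word2idx = {"UNKNOWN": 0, "NUM": 1, "天气": 2, "好": 3}

# in-vocabulary words map to their index, digit tokens map to NUM,
# everything else (and the padded tail) maps to UNKNOWN
print(sentenceToIndex("今天天气好100分", word2idx, maxLen=6))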
Example #8
Source File: analyze_data.py From Machine-Translation with Apache License 2.0 | 6 votes |
def analyze_zh():
    translation_path = os.path.join(train_translation_folder, train_translation_zh_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []
    for sentence in tqdm(sentences):
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'Chinese Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example #9
Source File: analyze_data.py From Machine-Translation with Apache License 2.0 | 6 votes |
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # note: `tokens` is computed but never used; lengths are taken from the jieba segmentation
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example #10
Source File: process_corpus.py From question-classification-cnn-rnn-attention with Apache License 2.0 | 6 votes |
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')
    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
Example #11
Source File: process_corpus.py From question-classification-cnn-rnn-attention with Apache License 2.0 | 6 votes |
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')
    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)
Example #12
Source File: tool.py From dudulu with MIT License | 6 votes |
def build_key_word(path):
    """Build keywords from word frequency.
    :param path:
    :return:
    """
    d = {}
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            for word in jieba.cut(line.strip()):
                if len(word) > 1:  # skip single characters to keep meaningless tokens out of the counts
                    d[word] = d.get(word, 0) + 1
    kw_list = sorted(d, key=lambda x: d[x], reverse=True)
    # keep only the top 20% most frequent words
    size = int(len(kw_list) * 0.2)
    return kw_list[:size]
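A hypothetical way to exercise build_key_word: write a toy corpus to disk and pass its path in. The file name and sentences are invented, and because only the top 20% of words are kept, a very small corpus can legitimately return an empty list.

import jieba  # assumed import from Example #12

# build_key_word as defined above must be in scope
with open("demo_corpus.txt", "w", encoding="utf-8") as fp:
    fp.write("今天北京天气很好\n明天上海天气也不错\n清华大学在北京\n北京的天气比上海的天气好\n")

print(build_key_word("demo_corpus.txt"))  # the most frequent multi-character words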
Example #13
Source File: norm_zh.py From mars with Apache License 2.0 | 6 votes |
def _zh_split(s):
    """
    Split text length in Chinese
    """
    import jieba
    try:
        s.encode('ascii')
        has_zh = False
    except ValueError:
        has_zh = True

    if has_zh:
        return list(jieba.cut(s))
    else:
        return pofile.WORD_SEP.split(s)


# code modified from babel.messages.pofile (hash 359ecffca479dfe032d0f7210d5cd8160599c816)
Example #14
Source File: AsianNLP.py From scattertext with Apache License 2.0 | 6 votes |
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        sent_splits = iter(re.split(r'(？|。|」|！)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent, ):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos, tok[:2].lower(), tok.lower(), tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
Example #15
Source File: dataset.py From atec-nlp with MIT License | 6 votes |
def __init__(self, data_file, sequence_length, word2idx, char_level=True):
    self.word2idx = word2idx
    self.seq_len = sequence_length
    x1, x2, y = [], [], []
    for line in open(data_file, 'r'):
        _, s1, s2, label = line.strip().split('\t')
        s1, s2 = map(self._clean_text, [s1, s2])
        if not char_level:
            s1 = list(jieba.cut(s1))
            s2 = list(jieba.cut(s2))
        x1.append(s1)
        x2.append(s2)
        y.append(1) if label == '1' else y.append(0)
    self.x1 = x1
    self.x2 = x2
    self.y = y
Example #16
Source File: dataset.py From atec-nlp with MIT License | 6 votes |
def _load_data(self, data_file):
    """Load origin train data and do text pre-processing (converting and cleaning)

    Returns:
        A generator.
        if self.is_training: train sentence pairs and labels (s1, s2, y).
        else: train sentence pairs and None (s1, s2, None).
    """
    for line in open(data_file):
        line = line.strip().decode('utf-8').split('\t')  # note: str.decode() requires Python 2
        s1, s2 = map(self._clean_text, map(self._tradition2simple, line[1:3]))
        if not self.char_level:
            s1 = list(jieba.cut(s1))
            s2 = list(jieba.cut(s2))
        if self.is_training:
            y = int(line[-1])  # 1 or [1]
            yield s1, s2, y
        else:
            yield s1, s2, None  # for consistency
Example #17
Source File: make_handcrafted_33_features.py From wsdm19cup with MIT License | 6 votes |
def __prepare__(self, q):
    q = self.__preprocess__(q)
    new_q = []
    surplus_q = []
    numbers_q = []
    new_xitrum = True
    for w in list(jieba.cut(q))[::-1]:
        if w not in self.STOP_WORDS:
            if new_xitrum:
                new_q = ["__xitrum__"] + new_q
                new_xitrum = False
            if self.__is_numeric__(w):
                numbers_q = [w] + numbers_q
            else:
                surplus_q = [w] + surplus_q
        else:
            new_xitrum = True
        if len(new_q) == self.MAX_SEQUENCE_LENGTH:
            break
    new_q = " ".join(new_q)
    return new_q, set(surplus_q), set(numbers_q)

### jaccard
Example #18
Source File: train_predict_trees_batch2.py From wsdm19cup with MIT License | 6 votes |
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
Example #19
Source File: train_predict_trees_batch2.py From wsdm19cup with MIT License | 6 votes |
def __prepare__(self, q):
    q = self.__preprocess__(q)
    new_q = []
    surplus_q = []
    numbers_q = []
    new_xitrum = True
    for w in list(jieba.cut(q))[::-1]:
        if w not in self.STOP_WORDS:
            if new_xitrum:
                new_q = ["__xitrum__"] + new_q
                new_xitrum = False
            if self.__is_numeric__(w):
                numbers_q = [w] + numbers_q
            else:
                surplus_q = [w] + surplus_q
        else:
            new_xitrum = True
        if len(new_q) == self.MAX_SEQUENCE_LENGTH:
            break
    new_q = " ".join(new_q)
    return new_q, set(surplus_q), set(numbers_q)

### jaccard
Example #20
Source File: train_predict_trees_batch1.py From wsdm19cup with MIT License | 6 votes |
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
Example #21
Source File: train_predict_trees_batch3.py From wsdm19cup with MIT License | 6 votes |
def __prepare__(self, q):
    q = self.__preprocess__(q)
    new_q = []
    surplus_q = []
    numbers_q = []
    new_xitrum = True
    for w in list(jieba.cut(q))[::-1]:
        if w not in self.STOP_WORDS:
            if new_xitrum:
                new_q = ["__xitrum__"] + new_q
                new_xitrum = False
            if self.__is_numeric__(w):
                numbers_q = [w] + numbers_q
            else:
                surplus_q = [w] + surplus_q
        else:
            new_xitrum = True
        if len(new_q) == self.MAX_SEQUENCE_LENGTH:
            break
    new_q = " ".join(new_q)
    return new_q, set(surplus_q), set(numbers_q)

### jaccard
Example #22
Source File: ptt_filter.py From justcopy-backend with MIT License | 6 votes |
def print2file(f, title, responses, marker='', separater=True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
    for word in title_cutted:
        f.write(word + ' ')
    f.write('\n')
    for response in responses:
        #print(response['Content'])
        #if response['Content'] not in count_response.keys():
        #    count_response[response['Content']] = 0
        #count_response[response['Content']] += 1
        if marker != '':
            f.write(marker + ' ')
        response_cutted = jieba.cut(response['Content'].strip(), cut_all=False)
        for word in response_cutted:
            f.write(word + ' ')
        f.write('\n')
    if separater:
        f.write('===\n')
Example #23
Source File: run_infer.py From BERT with Apache License 2.0 | 6 votes |
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    half_length = int(max_length / 2)
    if len(before_part) < half_length:  # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length:  # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else:  # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output
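cut_doc above is pure list slicing, so it can be exercised directly with a made-up list of token ids; the numbers are arbitrary.

# hypothetical token ids with the answer symbol sitting at position 7
tokens = list(range(20))
window = cut_doc(tokens, answer_symbol_id_pos=7, answer_symbol=None, max_length=10)
print(window)  # [5, 6, 7, 8, 9, 10, 11] -- a (max_length - 3)-wide window containing tokens[7]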
Example #24
Source File: out_script.py From BERT with Apache License 2.0 | 6 votes |
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    half_length = int(max_length / 2)
    if len(before_part) < half_length:  # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length:  # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else:  # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output
Example #25
Source File: output_script_test.py From BERT with Apache License 2.0 | 6 votes |
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    half_length = int(max_length / 2)
    if len(before_part) < half_length:  # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length:  # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else:  # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output
Example #26
Source File: engine.py From text-antispam with MIT License | 6 votes |
def text_tensor(text, wv):
    """Build the text tensor.

    Args:
        text: the text to check
        wv: the word-vector model
    Returns:
        [[[ 3.80905056   1.94315064  -0.20703495  -1.31589055   1.9627794
            ...
            2.16935492   2.95426321  -4.71534014  -3.25034237 -11.28901672]]]
    """
    text = tr.extractWords(text)
    words = jieba.cut(text.strip())
    text_sequence = []
    for word in words:
        try:
            text_sequence.append(wv[word])
        except KeyError:
            text_sequence.append(wv['UNK'])
    text_sequence = np.asarray(text_sequence)
    sample = text_sequence.reshape(1, len(text_sequence), 200)
    return sample
Example #27
Source File: NLP.py From Financial-NLP with Apache License 2.0 | 6 votes |
def title2wordbag(self, title, remove_stopwords=True):
    words = jieba.cut(title, cut_all=False)
    str_cut = ' '.join(words)
    for sym in zh_symbol:       # remove Chinese symbols
        str_cut = str_cut.replace(sym, '')
    for sym in number:          # remove numbers
        str_cut = str_cut.replace(sym, '')
    strlist_cut = str_cut.split(' ')
    strlist_new = []
    for word in strlist_cut:    # remove English letters
        if (not len(word)) or (word in self.stop_words):
            continue
        elif (word[0] >= 'A' and word[0] <= 'Z') or (word[0] >= 'a' and word[0] <= 'z'):
            continue
        elif ord(word[0]) < 1024:
            continue
        else:
            strlist_new.append(word)
    return strlist_new

################################ problem #####################################
Example #28
Source File: tool.py From weiboanalysis with Apache License 2.0 | 5 votes |
def getlinejieba(path):
    d = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = []
            line = str(line).replace('\u200b', '')
            for word in jieba.cut(line.strip()[2:]):
                temp.append(word)
            d.append(list(set(temp) - set(stop) - set(' ')))  # set difference: drop stopwords and blanks
    return d
Example #29
Source File: train_predict_trees_batch3.py From wsdm19cup with MIT License | 5 votes |
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
Example #30
Source File: tool.py From weiboanalysis with Apache License 2.0 | 5 votes |
def get_word_feature(sentence):
    wordlist = []
    sentence = str(sentence).replace('\u200b', '')
    for word in jieba.cut(sentence.strip()):
        # note: re.L with a str pattern raises ValueError on Python 3.6+
        p = re.compile(r'\w', re.L)
        result = p.sub("", word)
        if not result or result == ' ':  # empty string
            continue
        wordlist.append(word)
    return list(set(wordlist) - set(stop) - set(' '))