Python jieba.lcut() Examples
The following are 30 code examples of jieba.lcut().
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the jieba module, or try the search function.
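
Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself. The sample sentence is only an illustration, and the exact token boundaries depend on the dictionary bundled with your jieba version.

import jieba

sentence = "我来到北京清华大学"
print(jieba.lcut(sentence))                  # default (accurate) mode: a list of tokens
print(jieba.lcut(sentence, cut_all=True))    # full mode: every word the dictionary can find
print(jieba.lcut(sentence, HMM=False))       # disable HMM-based discovery of unseen words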
Example #1
Source File: similarity.py From sentence-similarity with MIT License | 6 votes |
def M_idf(self, s1, s2):
    v1, v2 = [], []
    s1_list = jieba.lcut(s1)
    s2_list = jieba.lcut(s2)
    for s in s1_list:
        idf_v = self.idf.get(s, 1)
        if s in self.voc:
            v1.append(1.0 * idf_v * self.voc[s])
    for s in s2_list:
        idf_v = self.idf.get(s, 1)
        if s in self.voc:
            v2.append(1.0 * idf_v * self.voc[s])
    v1 = np.array(v1).sum(axis=0)
    v2 = np.array(v2).sum(axis=0)
    sim = 1 - spatial.distance.cosine(v1, v2)
    return sim
Example #2
Source File: cut_td_idf.py From nlp_xiaojiang with MIT License | 6 votes |
def cut_td_idf(sources_path, target_path):
    """
    jieba word segmentation (Chinese)
    :param sources_path:
    :param target_path:
    :return:
    """
    print("cut_td_idf start! ")
    corpus = txtRead(sources_path)
    governments = []
    for corpus_one in corpus:
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        governments.append(ques_q2b_syboml.strip())
    government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))
    topic_ques_all = []
    for topic_ques_one in government_ques:
        top_ques_aqlq = topic_ques_one.replace('  ', ' ').replace('  ', ' ').strip() + '\n'
        topic_ques_all.append(top_ques_aqlq)
    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path)
Example #3
Source File: chatbot_sentence_vec_by_word.py From nlp_xiaojiang with MIT License | 6 votes |
def word_flag_cut(sentence):
    """
    jieba segmentation with POS flags
    :param sentence:
    :return:
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception as e:
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list
Example #4
Source File: segment.py From Chinese-Poetry-Generation with MIT License | 6 votes |
def segment(self, sentence):
    # TODO: try CRF-based segmentation.
    toks = []
    idx = 0
    while idx + 4 <= len(sentence):
        # Cut 2 chars each time.
        if sentence[idx : idx + 2] in self.sxhy_dict:
            toks.append(sentence[idx : idx + 2])
        else:
            for tok in jieba.lcut(sentence[idx : idx + 2]):
                toks.append(tok)
        idx += 2
    # Cut last 3 chars.
    if idx < len(sentence):
        if sentence[idx : ] in self.sxhy_dict:
            toks.append(sentence[idx : ])
        else:
            for tok in jieba.lcut(sentence[idx : ]):
                toks.append(tok)
    return toks
Example #5
Source File: segment.py From Chinese-Poetry-Generation with MIT License | 6 votes |
def _gen_sxhy_dict():
    print("Parsing shixuehanying dictionary ...")
    words = set()
    with open(_rawsxhy_path, 'r') as fin:
        for line in fin.readlines():
            if line[0] == '<':
                continue
            for phrase in line.strip().split()[1:]:
                if not is_cn_sentence(phrase):
                    continue
                idx = 0
                while idx + 4 <= len(phrase):
                    # Cut 2 chars each time.
                    words.add(phrase[idx : idx + 2])
                    idx += 2
                # Use jieba to cut the last 3 chars.
                if idx < len(phrase):
                    for word in jieba.lcut(phrase[idx:]):
                        words.add(word)
    with open(sxhy_path, 'w') as fout:
        fout.write(' '.join(words))
Example #6
Source File: chinese.py From Multi-Label-Text-Classification-for-Chinese with MIT License | 6 votes |
def __call__(self, sent):
    sent = ptxt.Text(sent, "whi").clean
    sent = self.clean_linkpic(sent)
    sent = self.clean_english(sent)
    sent = self.clean_date(sent)
    sent = self.clean_time(sent)
    sent = self.clean_money(sent)
    sent = self.clean_weight(sent)
    sent = self.clean_concentration(sent)
    sent = self.clean_entity(sent)
    sent = self.clean_nums(sent)
    wlist = jieba.lcut(sent)
    sent = self.clean_stopwords(wlist)
    sent = self.clean_punctuation(sent)
    return sent
Example #7
Source File: pre_process.py From nlp-journey with Apache License 2.0 | 6 votes |
def process_data(train_file, user_dict=None, stop_dict=None):
    # load a custom dictionary into jieba (it must follow jieba's user-dictionary format)
    if user_dict:
        jieba.load_userdict(user_dict)
    # load the stop-word list (one stop word per line)
    stop_words = []
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = [stop_word.strip() for stop_word in file.readlines()]
    # read the file contents, segment each sentence and remove stop words
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    sentences = [jieba.lcut(sentence.strip()) for sentence in sentences]
    sentences = [[s for s in sentence if s not in stop_words and s.strip() != ''] for sentence in sentences]
    return sentences
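
As a usage note for the example above: jieba.load_userdict expects a plain-text file with one entry per line in the form "word [frequency] [POS tag]". The file names in this sketch are hypothetical placeholders, not part of the original project.

# hypothetical files: train.txt (one sentence per line), user_dict.txt, stopwords.txt
sentences = process_data('train.txt', user_dict='user_dict.txt', stop_dict='stopwords.txt')
print(sentences[:2])  # first two tokenized sentences with stop words removed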
Example #8
Source File: textAnalysis.py From deep_learning with MIT License | 6 votes |
def train_wordtoVect(train_inputTexts):
    """
    Function for training word vectors
    """
    texts = []
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))
        d = " ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()  # tokenizer (MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)  # limited by num_words
    word_index = tokenizer.word_index  # word -> index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return word_index, data
Example #9
Source File: textAnalysis.py From deep_learning with MIT License | 6 votes |
def train():
    """
    Train the model and save it
    """
    print('Loading Data...')
    inputTexts, labels = load_data()
    print(inputTexts.shape, labels.shape)
    print('segment...')
    # seg_data = [jieba.lcut(document.replace('\n', '')) for document in inputTexts]
    # print('word2vec...')
    # index_dict, word_vectors, data = word2vec_train(seg_data)
    # n_symbols = len(index_dict) + 1
    # x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    # print(x_train.shape, y_train.shape)
    # train_model(n_symbols, x_train, y_train, x_test, y_test)
    word_index, data = train_wordtoVect(inputTexts)
    input_dim = len(word_index) + 1
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    print(x_train.shape, y_train.shape)
    train_model(input_dim, x_train, y_train, x_test, y_test)
Example #10
Source File: DataManager.py From ChID-Dataset with Apache License 2.0 | 6 votes |
def _prepare_data(self, temp_data):
    cans = temp_data["candidates"]
    cans = [self.vocab.tran2id(each, True) for each in cans]
    for text in temp_data["content"]:
        # split the passage on idiom placeholders of the form "#idiom<N>#"
        content = re.split(r'(#idiom\d+#)', text)
        doc = []
        loc = []
        labs = []
        tags = []
        for i, segment in enumerate(content):
            if re.match(r'#idiom\d+#', segment) is not None:
                tags.append(segment)
                if segment in self.ans:
                    labs.append(self.ans[segment])
                loc.append(len(doc))
                doc.append(self.vocab.tran2id('#idiom#'))
            else:
                doc += [self.vocab.tran2id(each) for each in jieba.lcut(segment)]
        yield doc, cans, labs, loc, tags
Example #11
Source File: nlp.py From open-entity-relation-extraction with MIT License | 6 votes |
def segment(self, sentence, entity_postag=dict()):
    """Word segmentation (originally NLPIR; the NLPIR calls are commented out and jieba is used)

    Args:
        sentence: string, the sentence to segment
        entity_postag: dict, entity POS dictionary, empty by default; produced while
            analysing the structured text of each case
    Returns:
        lemmas: list, the segmentation result
    """
    # add entity words to the user dictionary
    if entity_postag:
        for entity in entity_postag:
            # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
            jieba.add_word(entity)
    # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example of adding a single user word
    # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # example of adding a single user word
    # segment without POS tagging
    # lemmas = pynlpir.segment(sentence, pos_tagging=False)
    lemmas = jieba.lcut(sentence)
    # pynlpir.close()  # release resources
    return lemmas
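
The effect of jieba.add_word, which the example above relies on, can be seen in isolation with the sketch below; the entity name is a made-up illustration, not part of the original project.

import jieba

text = "小明任职于华有科技有限公司"
print(jieba.lcut(text))             # without a dictionary entry the company name is usually split up
jieba.add_word("华有科技有限公司")    # register the entity so jieba can keep it as one token
print(jieba.lcut(text))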
Example #12
Source File: cut_text.py From chatbot_by_similarity with MIT License | 6 votes |
def cut_texts(texts=None, need_cut=True, word_len=1):
    '''
    Use jieba to cut texts
    :param texts: list of texts
    :param need_cut: whether the texts still need to be cut
    :param word_len: minimum length of words to keep, used to drop stop words
    :return: list of tokenized texts
    '''
    if need_cut:
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len]
                         for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]
    else:
        if word_len > 1:
            texts_cut = [[word for word in text if len(word) >= word_len]
                         for text in texts]
        else:
            texts_cut = texts
    return texts_cut
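
A short, hypothetical call to cut_texts (assuming jieba is imported and the function above is defined); with word_len=2, single-character tokens are dropped, which acts as a rough stop-word filter.

texts = ["今天天气很好", "我们一起去公园散步"]
print(cut_texts(texts=texts, need_cut=True, word_len=2))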
Example #13
Source File: train_word2vec.py From text-cnn with MIT License | 6 votes |
def __iter__(self):
    for filename in self.filenames:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            for _, line in enumerate(f):
                try:
                    line = line.strip()
                    line = line.split('\t')
                    assert len(line) == 2
                    blocks = re_han.split(line[1])
                    word = []
                    for blk in blocks:
                        if re_han.match(blk):
                            word.extend(jieba.lcut(blk))
                    yield word
                except:
                    pass
Example #14
Source File: text_predict.py From text-cnn with MIT License | 6 votes |
def sentence_cut(sentences):
    """
    Args:
        sentences: a list of texts to segment
    Returns:
        seglist: a list of sentences cut by jieba
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # split the text on punctuation
    seglist = []
    for sentence in sentences:
        words = []
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                words.extend(jieba.lcut(blk))
        seglist.append(words)
    return seglist
Example #15
Source File: train_word2vec.py From text_rnn_attention with MIT License | 6 votes |
def __iter__(self):
    for filename in self.filenames:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            for _, line in enumerate(f):
                try:
                    line = line.strip()
                    line = line.split('\t')
                    assert len(line) == 2
                    blocks = re_han.split(line[1])
                    word = []
                    for blk in blocks:
                        if re_han.match(blk):
                            word.extend(jieba.lcut(blk))
                    yield word
                except:
                    pass
Example #16
Source File: text_predict.py From text_rnn_attention with MIT License | 6 votes |
def sentence_cut(sentences):
    """
    Args:
        sentences: a list of texts to segment
    Returns:
        contents: a list of sentences cut by jieba, with stop words removed
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # split the text on punctuation
    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    contents = []
    for sentence in sentences:
        words = []
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                seglist = jieba.lcut(blk)
                words.extend([w for w in seglist if w not in stopwords])
        contents.append(words)
    return contents
Example #17
Source File: create_pretraining_data.py From albert-chinese-ner with MIT License | 5 votes |
def get_new_segment(segment):  # newly added method ####
    """
    Take a sentence (as a list of characters) and return it with whole word mask markers:
    to support Chinese whole word masking, characters that were split out of the same word
    are marked with "##" so that downstream modules know which characters belong to one word.
    :param segment: a sentence, e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!']
    :return: the processed sentence, e.g. ['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!']
    """
    seq_cws = jieba.lcut("".join(segment))  # segment the sentence with jieba
    seq_cws_dict = {x: 1 for x in seq_cws}  # put the segmented words into a dict
    new_segment = []
    i = 0
    while i < len(segment):  # walk the sentence character by character until it is fully processed
        if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0:  # non-Chinese characters are appended unchanged
            new_segment.append(segment[i])
            i += 1
            continue
        has_add = False
        for length in range(3, 0, -1):
            if i + length > len(segment):
                continue
            if ''.join(segment[i:i + length]) in seq_cws_dict:
                new_segment.append(segment[i])
                for l in range(1, length):
                    new_segment.append('##' + segment[i + l])
                i += length
                has_add = True
                break
        if not has_add:
            new_segment.append(segment[i])
            i += 1
    # print("get_new_segment.wwm.get_new_segment:", new_segment)
    return new_segment
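
A quick way to exercise get_new_segment is to pass a sentence as a list of characters, as in the sketch below (assuming the module's imports, jieba and re); where the "##" markers end up depends on how jieba segments that particular sentence.

chars = list("悬灸技术培训专家教你艾灸降血糖")
print(get_new_segment(chars))  # characters inside a multi-character jieba word get a "##" prefix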
Example #18
Source File: test_spm_preprocessor.py From fancy-nlp with GNU General Public License v3.0 | 5 votes |
def test_get_word_ids(self):
    example_text = ''.join(self.x_train[0][0])
    word_cut = jieba.lcut(example_text)
    word_ids = self.preprocessor.get_word_ids(word_cut)
    assert len(word_ids) == len(word_cut)
Example #19
Source File: raw_data.py From textclf with MIT License | 5 votes |
def jieba_tokenizer(text: str):
    return jieba.lcut(text)
Example #20
Source File: preprocess.py From deep-siamese-text-similarity with MIT License | 5 votes |
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        yield list(jieba.lcut(sentence))
Example #21
Source File: loader.py From text_rnn_attention with MIT License | 5 votes |
def read_file(filename):
    """
    Args:
        filename: train_filename, test_filename or val_filename
    Returns:
        two lists: the first holds the labels, the second the contents cut by jieba
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # split the text on punctuation
    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    contents, labels = [], []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                line = line.rstrip()
                assert len(line.split('\t')) == 2
                label, content = line.split('\t')
                labels.append(label)
                blocks = re_han.split(content)
                word = []
                for blk in blocks:
                    if re_han.match(blk):
                        seglist = jieba.lcut(blk)
                        word.extend([w for w in seglist if w not in stopwords])
                contents.append(word)
            except:
                pass
    return labels, contents
Example #22
Source File: preprocess.py From deep-siamese-text-similarity with MIT License | 5 votes |
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        # Python 2 style: decode byte strings to unicode before cleaning and segmentation
        sentence = sentence.decode("utf8")
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"),
                          "".decode("utf8"), sentence)
        yield list(jieba.lcut(sentence))
Example #23
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def split_delex_sentence(sen):
    res_sen = ''
    pattern = re.compile(r'(\[[^\[^\]]+\])')
    slots = pattern.findall(sen)
    for slot in slots:
        sen = sen.replace(slot, '[slot]')
    sen = sen.split('[slot]')
    for part in sen:
        part = ' '.join(jieba.lcut(part))
        res_sen += part
        if slots:
            res_sen += ' ' + slots.pop(0) + ' '
    return res_sen
Example #24
Source File: LoadDemo.py From deep_learning with MIT License | 5 votes |
def input_fn(filenames="./data/knowledge.txt", batch_size=32, epoch_num=None, shuffle_size=256):
    dataset = tf.data.TextLineDataset(filenames)

    def clean_data(line):
        columns_data = tf.string_split([line], '\t')  # turn the line into tensors
        labels = tf.string_to_number(columns_data.values[1], out_type=tf.float32)
        splits_data = columns_data.values[2]

        def split_word(text):
            text = text.decode()
            print(text)
            text = rules.sub("", text)
            text = text.strip()
            tokens = jieba.lcut(text)
            print(tokens)
            if len(tokens) == 0:
                tokens = ["未知空"]  # the dataset needs elements of consistent length
            return tokens[:1]

        # tf.py_func applies an ordinary Python function to tensors
        result = tf.py_func(split_word, [splits_data], [tf.string])
        return {"context": result}, labels

    dataset = dataset.map(lambda line: clean_data(line))
    # shuffle the data; the larger shuffle_size is, the more thorough the shuffling
    if shuffle_size > 0:
        if epoch_num:
            # repeat the dataset the given number of times
            dataset = dataset.shuffle(shuffle_size).repeat(epoch_num)
        else:
            dataset = dataset.shuffle(shuffle_size).repeat()
    # take batch_size rows at a time; the last batch may be smaller than batch_size
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset
Example #25
Source File: tokenizer.py From text2vec with Apache License 2.0 | 5 votes |
def tokenize(self, sentence, cut_all=False, HMM=True):
    """
    Segment the sentence and return the token list
    :param sentence: the sentence to cut
    :param cut_all: full mode, off by default
    :param HMM: whether to enable HMM-based recognition of out-of-vocabulary words, on by default
    :return: A list of strings.
    """
    return self.model.lcut(sentence, cut_all=cut_all, HMM=HMM)
Example #26
Source File: tokenizer.py From text2vec with Apache License 2.0 | 5 votes |
def segment(sentence, cut_type='word', pos=False):
    """
    Segment a sentence into words or characters.
    :param sentence:
    :param cut_type: 'word' uses jieba.lcut; 'char' uses list(sentence)
    :param pos: enable POS tagging
    :return: list
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
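
A small sketch of calling segment in both modes (assuming jieba and jieba.posseg are imported as in the module above); the POS tags come from jieba.posseg, so the exact labels depend on its dictionary.

print(segment("我爱自然语言处理", cut_type='word', pos=False))    # plain word list
words, tags = segment("我爱自然语言处理", cut_type='char', pos=True)
print(list(zip(words, tags)))                                      # per-character POS tags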
Example #27
Source File: utils_test.py From text2vec with Apache License 2.0 | 5 votes |
def test_jieba():
    a = '我要办理特价机票,李浩然可以想办法'
    import jieba
    b = jieba.lcut(a, cut_all=False)
    print('cut_all=False', b)
    b = jieba.lcut(a, cut_all=True)
    print('cut_all=True', b)
    b = jieba.lcut(a, HMM=True)
    print('HMM=True', b)
    b = jieba.lcut(a, HMM=False)
    print('HMM=False', b)
Example #28
Source File: textAnalysis.py From deep_learning with MIT License | 5 votes |
def predict_wordtoVect(valid_inputTexts):
    """
    Function for building word vectors for prediction
    """
    train_texts, labels = load_data()
    train_texts = [" ".join(jieba.lcut(doc)) for doc in train_texts]
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(train_texts)
    pred_texts = [" ".join(jieba.lcut(doc)) for doc in valid_inputTexts]
    print(pred_texts)
    text_seq = tokenizer.texts_to_sequences(pred_texts)
    valid_data = sequence.pad_sequences(text_seq, maxlen=MAX_SEQUENCE_LENGTH)
    return valid_data
Example #29
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def split_delex_sentence(sen):
    res_sen = ''
    pattern = re.compile(r'(\[[^\[^\]]+\])')
    slots = pattern.findall(sen)
    for slot in slots:
        sen = sen.replace(slot, '[slot]')
    sen = sen.split('[slot]')
    for part in sen:
        part = ' '.join(jieba.lcut(part))
        res_sen += part
        if slots:
            res_sen += ' ' + slots.pop(0) + ' '
    return res_sen
Example #30
Source File: Bayes_classifier.py From python with Apache License 2.0 | 5 votes |
def cut_word(self, origin):
    """jieba word segmentation
    """
    origin = re.sub(r"[^\u4e00-\u9fa5]+", "", origin)  # remove all non-Chinese characters
    self.seg_list = jieba.lcut(origin)  # segment with jieba (default mode)