Python jieba.tokenize() Examples
The following are 30 code examples of jieba.tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the jieba module, or try the search function.
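Before the project-level examples, here is a minimal, hedged sketch of the API itself: jieba.tokenize() takes a unicode string and yields (word, start, end) tuples; an optional mode="search" produces finer-grained sub-words, and HMM=False turns off HMM-based discovery of out-of-vocabulary words. The sample sentence is an arbitrary illustration, not drawn from the projects below.

# Minimal sketch of jieba.tokenize(): it yields (word, start, end) tuples.
import jieba

sentence = u"我来到北京清华大学"

# Default mode: non-overlapping tokens that cover the whole string.
for word, start, end in jieba.tokenize(sentence):
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

# Search mode: also emits shorter sub-words inside long tokens.
for word, start, end in jieba.tokenize(sentence, mode="search"):
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

# HMM=False disables HMM-based discovery of out-of-vocabulary words.
tokens = list(jieba.tokenize(sentence, HMM=False))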
Example #1
Source File: tokenize_chinese.py From driverlessai-recipes with Apache License 2.0 | 6 votes |
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                              dt.Frame, List[dt.Frame],
                                              np.ndarray, List[np.ndarray],
                                              pd.DataFrame, List[pd.DataFrame]]:
    # exit gracefully if method is called as a data upload rather than data modify
    if X is None:
        return []
    # Tokenize the chinese text
    import jieba
    X = dt.Frame(X).to_pandas()
    # If no columns to tokenize, use the first column
    if len(cols_to_tokenize) == 0:
        cols_to_tokenize.append(X.columns[0])
    for col in cols_to_tokenize:
        X[col] = X[col].astype('unicode').fillna(u'NA')
        X[col] = X[col].apply(lambda x: " ".join([r[0] for r in jieba.tokenize(x)]))
    return dt.Frame(X)
Example #2
Source File: tokenizer_test.py From pycorrector with Apache License 2.0 | 6 votes |
def test_tokenizer():
    txts = ["我不要你花钱,这些路曲近通幽",
            "这个消息不胫儿走",
            "这个消息不径而走",
            "这个消息不胫而走",
            "复方甘草口服溶液限田基",
            "张老师经常背课到深夜,我们要体晾老师的心苦。",
            '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
            ",我遇到了问题怎么办",
            ",我遇到了问题",
            "问题",
            "北川景子参演了林诣彬导演的《速度与激情3》",
            "林志玲亮相网友:确定不是波多野结衣?",
            "龟山千广和近藤公园在龟山公园里喝酒赏花",
            "小牛曲清去蛋白提取物乙"]
    t = Tokenizer()
    for text in txts:
        print(text)
        print('default', t.tokenize(text, 'default'))
        print('search', t.tokenize(text, 'search'))
        print('ngram', t.tokenize(text, 'ngram'))
Example #3
Source File: tokenizer_test.py From pycorrector with Apache License 2.0 | 6 votes |
def test_detector_tokenizer():
    sents = ["我不要你花钱,这些路曲近通幽",
             "这个消息不胫儿走",
             "这个消息不径而走",
             "这个消息不胫而走",
             "复方甘草口服溶液限田基",
             "张老师经常背课到深夜,我们要体晾老师的心苦。",
             '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
             "北川景子参演了林诣彬导演的《速度与激情3》",
             "林志玲亮相网友:确定不是波多野结衣?",
             "龟山千广和近藤公园在龟山公园里喝酒赏花",
             "问题"]
    d = Detector()
    d.check_detector_initialized()
    detector_tokenizer = d.tokenizer
    for text in sents:
        print(text)
        print('default', detector_tokenizer.tokenize(text, 'default'))
        print('search', detector_tokenizer.tokenize(text, 'search'))
Example #4
Source File: jieba_tokenizer.py From rasa_bot with Apache License 2.0 | 5 votes |
def tokenize(self, text):
    # type: (Text) -> List[Token]
    import jieba

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
Example #5
Source File: jieba_pseg_extractor.py From rasa_nlu with Apache License 2.0 | 5 votes |
def posseg(text):
    # type: (Text) -> List[Token]
    import jieba
    import jieba.posseg as pseg

    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
Example #6
Source File: analyzer.py From QAbot_by_base_KG with MIT License | 5 votes |
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
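The __call__ method above is a Whoosh-style tokenizer built on jieba.tokenize(text, mode="search"). For context, a typical way to plug such an analyzer into a Whoosh index looks roughly like the sketch below, shown with jieba's bundled ChineseAnalyzer; the field names are illustrative and not taken from the QAbot_by_base_KG project.

# Hedged sketch: wiring a jieba-backed analyzer into a Whoosh schema.
from jieba.analyse import ChineseAnalyzer
from whoosh.fields import ID, TEXT, Schema

analyzer = ChineseAnalyzer()
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT(stored=True, analyzer=analyzer),  # Chinese-aware full-text field
)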
Example #7
Source File: analyzer.py From python-girlfriend-mood with MIT License | 5 votes |
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example #8
Source File: jieba_tokenizer.py From rasa-for-botfront with Apache License 2.0 | 5 votes |
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    import jieba

    text = message.get(attribute)

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
Example #9
Source File: analyzer.py From annotated_jieba with MIT License | 5 votes |
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example #10
Source File: jieba_test.py From annotated_jieba with MIT License | 5 votes |
def testTokenize(self):
    for content in test_contents:
        result = jieba.tokenize(content)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize", file=sys.stderr)
Example #11
Source File: jieba_test.py From annotated_jieba with MIT License | 5 votes |
def testTokenize_NOHMM(self):
    for content in test_contents:
        result = jieba.tokenize(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize_NOHMM", file=sys.stderr)
Example #12
Source File: test_tokenize_no_hmm.py From annotated_jieba with MIT License | 5 votes |
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example #13
Source File: test_tokenize.py From annotated_jieba with MIT License | 5 votes |
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example #14
Source File: analyzer.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example #15
Source File: jieba_test.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def testTokenize(self):
    for content in test_contents:
        result = jieba.tokenize(content)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize", file=sys.stderr)
Example #16
Source File: jieba_test.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def testTokenize_NOHMM(self):
    for content in test_contents:
        result = jieba.tokenize(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize_NOHMM", file=sys.stderr)
Example #17
Source File: test_tokenize_no_hmm.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example #18
Source File: test_tokenize.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example #19
Source File: jieba_tokenizer.py From rasa_bot with Apache License 2.0 | 5 votes |
def process(self, message, **kwargs):
    # type: (Message, **Any) -> None
    message.set("tokens", self.tokenize(message.text))
Example #20
Source File: jieba_tokenizer.py From rasa_bot with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example #21
Source File: analyzer.py From jieba_fast with MIT License | 5 votes |
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example #22
Source File: jieba_tokenizer.py From rasa_nlu with Apache License 2.0 | 5 votes |
def tokenize(text: Text) -> List[Token]:
    import jieba

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
Example #23
Source File: jieba_tokenizer.py From rasa_nlu with Apache License 2.0 | 5 votes |
def process(self, message: Message, **kwargs: Any) -> None:
    message.set("tokens", self.tokenize(message.text))
Example #24
Source File: jieba_tokenizer.py From rasa_nlu with Apache License 2.0 | 5 votes |
def train(self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any) -> None:
    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example #25
Source File: jieba_pseg_extractor.py From rasa_nlu_gq with Apache License 2.0 | 5 votes |
def posseg(text):
    # type: (Text) -> List[Token]
    result = []
    for (word, start, end) in jieba.tokenize(text):
        pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
        result.append((pseg_data, start, end))
    return result
Example #26
Source File: analyzer.py From Synonyms with MIT License | 5 votes |
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example #27
Source File: tokenizer_test.py From pycorrector with Apache License 2.0 | 5 votes |
def test_segment():
    """Test correction of misspelled disease/drug names."""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # correct form: 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))

    import jieba
    print(list(jieba.tokenize(error_sentence_1)))

    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)

    # jieba.enable_paddle()  # enable paddle mode; supported since version 0.40, not in earlier releases
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:', '%s %s' % (word, flag))
Example #28
Source File: analyzer.py From chinese-support-redux with GNU General Public License v3.0 | 5 votes |
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
Example #29
Source File: test_tokenize.py From jieba_fast with MIT License | 5 votes |
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
Example #30
Source File: test_tokenize_no_hmm.py From jieba_fast with MIT License | 5 votes |
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))