Python jieba.tokenize() Examples

The following are 30 code examples of jieba.tokenize(), drawn from open-source projects; the source file and originating project for each example are listed above it. You may also want to check out all available functions and classes of the module jieba, or try the search function.
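For quick reference, jieba.tokenize() takes a unicode string and yields (word, start, end) tuples, where start and end are character offsets into the input; the optional mode="search" argument produces finer-grained, overlapping tokens for indexing, and HMM=False disables HMM-based new-word discovery. Below is a minimal sketch using the sample sentence from jieba's own documentation (output omitted):

import jieba

sentence = u"永和服装饰品有限公司"

# Default mode: non-overlapping tokens with character offsets
for word, start, end in jieba.tokenize(sentence):
    print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))

# Search mode: additionally yields shorter sub-words, useful for building search indexes
for word, start, end in jieba.tokenize(sentence, mode='search'):
    print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))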
Example #1
Source File: tokenize_chinese.py    From driverlessai-recipes with Apache License 2.0
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        # exit gracefully if method is called as a data upload rather than data modify
        if X is None:
            return []
        # Tokenize the Chinese text
        import jieba
        X = dt.Frame(X).to_pandas()
        # If no columns to tokenize were specified, default to the first column
        # (cols_to_tokenize is defined elsewhere in the original recipe)
        if len(cols_to_tokenize) == 0:
            cols_to_tokenize.append(X.columns[0])
        for col in cols_to_tokenize:
            X[col] = X[col].astype('unicode').fillna(u'NA')
            X[col] = X[col].apply(lambda x: " ".join([r[0] for r in jieba.tokenize(x)]))
        return dt.Frame(X) 
Example #2
Source File: tokenizer_test.py    From pycorrector with Apache License 2.0
def test_tokenizer():
    txts = ["我不要你花钱,这些路曲近通幽",
            "这个消息不胫儿走",
            "这个消息不径而走",
            "这个消息不胫而走",
            "复方甘草口服溶液限田基",
            "张老师经常背课到深夜,我们要体晾老师的心苦。",
            '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
            ",我遇到了问题怎么办",
            ",我遇到了问题",
            "问题",
            "北川景子参演了林诣彬导演的《速度与激情3》",
            "林志玲亮相网友:确定不是波多野结衣?",
            "龟山千广和近藤公园在龟山公园里喝酒赏花",
            "小牛曲清去蛋白提取物乙"]
    t = Tokenizer()
    for text in txts:
        print(text)
        print('default', t.tokenize(text, 'default'))
        print('search', t.tokenize(text, 'search'))
        print('ngram', t.tokenize(text, 'ngram')) 
Example #3
Source File: tokenizer_test.py    From pycorrector with Apache License 2.0
def test_detector_tokenizer():
    sents = ["我不要你花钱,这些路曲近通幽",
             "这个消息不胫儿走",
             "这个消息不径而走",
             "这个消息不胫而走",
             "复方甘草口服溶液限田基",
             "张老师经常背课到深夜,我们要体晾老师的心苦。",
             '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
             "北川景子参演了林诣彬导演的《速度与激情3》",
             "林志玲亮相网友:确定不是波多野结衣?",
             "龟山千广和近藤公园在龟山公园里喝酒赏花",
             "问题"
             ]
    d = Detector()
    d.check_detector_initialized()
    detector_tokenizer = d.tokenizer
    for text in sents:
        print(text)
        print('default', detector_tokenizer.tokenize(text, 'default'))
        print('search', detector_tokenizer.tokenize(text, 'search')) 
Example #4
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens 
Example #5
Source File: jieba_pseg_extractor.py    From rasa_nlu with Apache License 2.0
def posseg(text):
        # type: (Text) -> List[Token]

        import jieba
        import jieba.posseg as pseg

        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result 
Example #6
Source File: analyzer.py    From QAbot_by_base_KG with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #7
Source File: analyzer.py    From python-girlfriend-mood with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #8
Source File: jieba_tokenizer.py    From rasa-for-botfront with Apache License 2.0
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import jieba

        text = message.get(attribute)

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens 
Example #9
Source File: analyzer.py    From annotated_jieba with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #10
Source File: jieba_test.py    From annotated_jieba with MIT License
def testTokenize(self):
        for content in test_contents:
            result = jieba.tokenize(content)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize", file=sys.stderr) 
Example #11
Source File: jieba_test.py    From annotated_jieba with MIT License
def testTokenize_NOHMM(self):
        for content in test_contents:
            result = jieba.tokenize(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize_NOHMM", file=sys.stderr) 
Example #12
Source File: test_tokenize_no_hmm.py    From annotated_jieba with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #13
Source File: test_tokenize.py    From annotated_jieba with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #14
Source File: analyzer.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #15
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def testTokenize(self):
        for content in test_contents:
            result = jieba.tokenize(content)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize", file=sys.stderr) 
Example #16
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def testTokenize_NOHMM(self):
        for content in test_contents:
            result = jieba.tokenize(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize_NOHMM", file=sys.stderr) 
Example #17
Source File: test_tokenize_no_hmm.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #18
Source File: test_tokenize.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #19
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        message.set("tokens", self.tokenize(message.text)) 
Example #20
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #21
Source File: analyzer.py    From jieba_fast with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #22
Source File: jieba_tokenizer.py    From rasa_nlu with Apache License 2.0
def tokenize(text: Text) -> List[Token]:
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens 
Example #23
Source File: jieba_tokenizer.py    From rasa_nlu with Apache License 2.0
def process(self, message: Message, **kwargs: Any) -> None:
        message.set("tokens", self.tokenize(message.text)) 
Example #24
Source File: jieba_tokenizer.py    From rasa_nlu with Apache License 2.0
def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #25
Source File: jieba_pseg_extractor.py    From rasa_nlu_gq with Apache License 2.0
def posseg(text):
        # type: (Text) -> List[Token]
        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result 
Example #26
Source File: analyzer.py    From Synonyms with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #27
Source File: tokenizer_test.py    From pycorrector with Apache License 2.0
def test_segment():
    """测试疾病名纠错"""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
    # jieba.enable_paddle()  # enable paddle mode (supported since jieba 0.40; not available in earlier versions)
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:','%s %s' % (word, flag)) 
Example #28
Source File: analyzer.py    From chinese-support-redux with GNU General Public License v3.0
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #29
Source File: test_tokenize.py    From jieba_fast with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #30
Source File: test_tokenize_no_hmm.py    From jieba_fast with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))