Python jieba.tokenize() Examples

The following are 30 code examples of jieba.tokenize(), drawn from open-source projects; the source file and originating project for each example are listed above it. You may also want to check out all available functions and classes of the module jieba, or try the search function.
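For quick reference, jieba.tokenize() takes a unicode string and yields (word, start, end) tuples, where start and end are character offsets into the input; the optional mode="search" argument produces finer-grained, overlapping tokens for indexing, and HMM=False disables HMM-based new-word discovery. Below is a minimal sketch using the sample sentence from jieba's own documentation (output omitted):

import jieba

sentence = u"永和服装饰品有限公司"

# Default mode: non-overlapping tokens with character offsets
for word, start, end in jieba.tokenize(sentence):
    print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))

# Search mode: additionally yields shorter sub-words, useful for building search indexes
for word, start, end in jieba.tokenize(sentence, mode='search'):
    print("word %s\t\t start: %d \t\t end: %d" % (word, start, end))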
Example #1
Source File: tokenize_chinese.py    From driverlessai-recipes with Apache License 2.0
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        # exit gracefully if method is called as a data upload rather than data modify
        if X is None:
            return []
        # Tokenize the Chinese text
        import jieba
        X = dt.Frame(X).to_pandas()
        # If no columns to tokenize were specified, default to the first column
        # (cols_to_tokenize is defined elsewhere in the original recipe)
        if len(cols_to_tokenize) == 0:
            cols_to_tokenize.append(X.columns[0])
        for col in cols_to_tokenize:
            X[col] = X[col].astype('unicode').fillna(u'NA')
            X[col] = X[col].apply(lambda x: " ".join([r[0] for r in jieba.tokenize(x)]))
        return dt.Frame(X) 
Example #2
Source File: tokenizer_test.py    From pycorrector with Apache License 2.0
def test_tokenizer():
    txts = ["我不要你花钱,这些路曲近通幽",
            "这个消息不胫儿走",
            "这个消息不径而走",
            "这个消息不胫而走",
            "复方甘草口服溶液限田基",
            "张老师经常背课到深夜,我们要体晾老师的心苦。",
            '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
            ",我遇到了问题怎么办",
            ",我遇到了问题",
            "问题",
            "北川景子参演了林诣彬导演的《速度与激情3》",
            "林志玲亮相网友:确定不是波多野结衣?",
            "龟山千广和近藤公园在龟山公园里喝酒赏花",
            "小牛曲清去蛋白提取物乙"]
    t = Tokenizer()
    for text in txts:
        print(text)
        print('default', t.tokenize(text, 'default'))
        print('search', t.tokenize(text, 'search'))
        print('ngram', t.tokenize(text, 'ngram')) 
Example #3
Source File: tokenizer_test.py    From pycorrector with Apache License 2.0
def test_detector_tokenizer():
    sents = ["我不要你花钱,这些路曲近通幽",
             "这个消息不胫儿走",
             "这个消息不径而走",
             "这个消息不胫而走",
             "复方甘草口服溶液限田基",
             "张老师经常背课到深夜,我们要体晾老师的心苦。",
             '新进人员时,知识当然还不过,可是人有很有精神,面对工作很认真的话,很快就学会、体会。',
             "北川景子参演了林诣彬导演的《速度与激情3》",
             "林志玲亮相网友:确定不是波多野结衣?",
             "龟山千广和近藤公园在龟山公园里喝酒赏花",
             "问题"
             ]
    d = Detector()
    d.check_detector_initialized()
    detector_tokenizer = d.tokenizer
    for text in sents:
        print(text)
        print('default', detector_tokenizer.tokenize(text, 'default'))
        print('search', detector_tokenizer.tokenize(text, 'search')) 
Example #4
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens 
Example #5
Source File: jieba_pseg_extractor.py    From rasa_nlu with Apache License 2.0
def posseg(text):
        # type: (Text) -> List[Token]

        import jieba
        import jieba.posseg as pseg

        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result 
Example #6
Source File: analyzer.py    From QAbot_by_base_KG with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #7
Source File: analyzer.py    From python-girlfriend-mood with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #8
Source File: jieba_tokenizer.py    From rasa-for-botfront with Apache License 2.0
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import jieba

        text = message.get(attribute)

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens 
Example #9
Source File: analyzer.py    From annotated_jieba with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #10
Source File: jieba_test.py    From annotated_jieba with MIT License
def testTokenize(self):
        for content in test_contents:
            result = jieba.tokenize(content)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize", file=sys.stderr) 
Example #11
Source File: jieba_test.py    From annotated_jieba with MIT License
def testTokenize_NOHMM(self):
        for content in test_contents:
            result = jieba.tokenize(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize_NOHMM", file=sys.stderr) 
Example #12
Source File: test_tokenize_no_hmm.py    From annotated_jieba with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #13
Source File: test_tokenize.py    From annotated_jieba with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #14
Source File: analyzer.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #15
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def testTokenize(self):
        for content in test_contents:
            result = jieba.tokenize(content)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize", file=sys.stderr) 
Example #16
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def testTokenize_NOHMM(self):
        for content in test_contents:
            result = jieba.tokenize(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
            result = list(result)
            assert isinstance(result, list), "Test Tokenize error on content: %s" % content
            for tk in result:
                print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]), file=sys.stderr)
        print("testTokenize_NOHMM", file=sys.stderr) 
Example #17
Source File: test_tokenize_no_hmm.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #18
Source File: test_tokenize.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #19
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        message.set("tokens", self.tokenize(message.text)) 
Example #20
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #21
Source File: analyzer.py    From jieba_fast with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #22
Source File: jieba_tokenizer.py    From rasa_nlu with Apache License 2.0
def tokenize(text: Text) -> List[Token]:
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens 
Example #23
Source File: jieba_tokenizer.py    From rasa_nlu with Apache License 2.0
def process(self, message: Message, **kwargs: Any) -> None:
        message.set("tokens", self.tokenize(message.text)) 
Example #24
Source File: jieba_tokenizer.py    From rasa_nlu with Apache License 2.0
def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #25
Source File: jieba_pseg_extractor.py    From rasa_nlu_gq with Apache License 2.0
def posseg(text):
        # type: (Text) -> List[Token]
        result = []
        for (word, start, end) in jieba.tokenize(text):
            pseg_data = [(w, f) for (w, f) in pseg.cut(word)]
            result.append((pseg_data, start, end))

        return result 
Example #26
Source File: analyzer.py    From Synonyms with MIT License
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #27
Source File: tokenizer_test.py    From pycorrector with Apache License 2.0
def test_segment():
    """测试疾病名纠错"""
    error_sentence_1 = '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以'  # 奥美沙坦酯片
    print(error_sentence_1)
    print(segment(error_sentence_1))
    import jieba
    print(list(jieba.tokenize(error_sentence_1)))
    import jieba.posseg as pseg
    words = pseg.lcut("我爱北京天安门")  # jieba default mode
    print('old:', words)
    # jieba.enable_paddle()  # enable paddle mode (supported since jieba 0.40; not available in earlier versions)
    # words = pseg.cut("我爱北京天安门", use_paddle=True)  # paddle mode
    # for word, flag in words:
    #     print('new:','%s %s' % (word, flag)) 
Example #28
Source File: analyzer.py    From chinese-support-redux with GNU General Public License v3.0
def __call__(self, text, **kargs):
        words = jieba.tokenize(text, mode="search")
        token = Token()
        for (w, start_pos, stop_pos) in words:
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token 
Example #29
Source File: test_tokenize.py    From jieba_fast with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2])) 
Example #30
Source File: test_tokenize_no_hmm.py    From jieba_fast with MIT License
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent,mode=g_mode,HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0],tk[1],tk[2]))