Python jieba.set_dictionary() Examples

The following are 12 code examples of jieba.set_dictionary(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module jieba, or try the search function.
Example #1
Source File: segment_word.py    From athena with Apache License 2.0 6 votes vote down vote up
def segment_trans(vocab_file, text_file):
    ''' Segment transcripts according to a vocab using the
        Maximum Matching Algorithm.
    Args:
      vocab_file: path to the vocab file loaded as jieba's dictionary
      text_file: path to the transcripts file (UTF-8, one transcript per line)
    Returns:
      sents: one space-joined segmented line per input line, each
        terminated by a newline, concatenated into a single string
    '''
    jieba.set_dictionary(vocab_file)
    with open(text_file, "r", encoding="utf-8") as text:
        # Iterate the file lazily instead of readlines(), and build the
        # result with join() instead of quadratic += concatenation.
        # HMM=False: pure dictionary (maximum matching) segmentation only.
        return ''.join(
            ' '.join(jieba.cut(line.strip(), HMM=False)) + '\n'
            for line in text
        )
Example #2
Source File: segment_word.py    From athena with Apache License 2.0 6 votes vote down vote up
def segment_trans(vocab_file, text_file):
    ''' Segment transcripts according to vocab
        using Maximum Matching Algorithm
    Args:
      vocab_file: vocab file
      text_file: transcripts file
    Returns:
      seg_trans: segment words
    '''
    # Use the vocab as jieba's dictionary so only in-vocab words are produced.
    jieba.set_dictionary(vocab_file)
    with open(text_file, "r", encoding="utf-8") as text:
        result = ''
        for raw_line in text.readlines():
            # HMM disabled: dictionary-only (maximum matching) segmentation.
            tokens = jieba.cut(raw_line.strip(), HMM=False)
            result = result + ' '.join(tokens) + '\n'
        return result
Example #3
Source File: jieba_test.py    From jieba_fast with MIT License 5 votes vote down vote up
def testSetDictionary(self):
    """After set_dictionary, jieba.cut must still yield a lazy generator
    that materializes into a list of tokens for every test sentence."""
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        tokens = jieba.cut(content)
        # cut() is expected to stay lazy even with a custom dictionary.
        assert isinstance(tokens, types.GeneratorType), "Test SetDictionary Generator error"
        tokens = list(tokens)
        assert isinstance(tokens, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(tokens), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
Example #4
Source File: segment.py    From word2vec-tutorial with MIT License 5 votes vote down vote up
def main():
    """Segment wiki_zh_tw.txt with jieba (custom big dictionary), drop
    stopwords, and write the space-separated result to wiki_seg.txt,
    logging progress every 10000 lines."""

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    stopword_set = set()
    with io.open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))

    # Open the output inside a context manager too: the original leaked the
    # handle (never reached output.close()) if segmentation raised.
    with io.open('wiki_seg.txt', 'w', encoding='utf-8') as output, \
         io.open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
        for texts_num, line in enumerate(content):
            line = line.strip('\n')
            # cut_all=False: accurate mode (no overlapping segmentations).
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')

            if (texts_num + 1) % 10000 == 0:
                logging.info("已完成前 %d 行的斷詞" % (texts_num + 1))
Example #5
Source File: segment.py    From word2vec-tutorial with MIT License 5 votes vote down vote up
def main():
    """Segment wiki_zh_tw.txt with jieba (custom big dictionary), drop
    stopwords, and write the space-separated result to wiki_seg.txt,
    logging progress every 10000 lines."""

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    stopword_set = set()
    with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))

    # Open the output inside a context manager too: the original leaked the
    # handle (never reached output.close()) if segmentation raised.
    with open('wiki_seg.txt', 'w', encoding='utf-8') as output, \
         open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
        for texts_num, line in enumerate(content):
            line = line.strip('\n')
            # cut_all=False: accurate mode (no overlapping segmentations).
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')

            if (texts_num + 1) % 10000 == 0:
                logging.info("已完成前 %d 行的斷詞" % (texts_num + 1))
Example #6
Source File: console.py    From Chatbot with GNU General Public License v3.0 5 votes vote down vote up
def init_jieba(self, seg_dic, userdic):

        """
        jieba custom setting: load the user dictionary, switch the main
        dictionary, then boost the frequency of every user word so jieba
        prefers keeping it whole during segmentation.

        Args:
          seg_dic: path to the main segmentation dictionary
          userdic: path to the user dictionary (one word per line, UTF-8)
        """

        jieba.load_userdict(userdic)
        jieba.set_dictionary(seg_dic)
        # Renamed from 'input': the original shadowed the builtin input().
        with open(userdic, 'r', encoding='utf-8') as user_words:
            for word in user_words:
                word = word.strip('\n')
                jieba.suggest_freq(word, True)
Example #7
Source File: matcher.py    From Chatbot with GNU General Public License v3.0 5 votes vote down vote up
def jiebaCustomSetting(self, dict_path, usr_dict_path):
        """Point jieba at the given main dictionary, then register every
        word from the user dictionary file (one word per line, UTF-8)."""
        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as user_dict:
            for entry in user_dict:
                jieba.add_word(entry.strip('\n'))
Example #8
Source File: plugins_test.py    From kim-voice-assistant with MIT License 5 votes vote down vote up
def setUp(self):
        # Point jieba at the app's Chinese word-segmentation dictionary.
        dict_path = APP_RESOURCES_DATA_PATH + 'jieba.dict'
        jieba.set_dictionary(dict_path)
Example #9
Source File: matcher.py    From PTT-Chat-Generator with MIT License 5 votes vote down vote up
def jiebaCustomSetting(self, dict_path, usr_dict_path):
        """Install dict_path as jieba's main dictionary and add each line of
        the user dictionary at usr_dict_path as an extra word."""
        jieba.set_dictionary(dict_path)
        with open(usr_dict_path, 'r', encoding='utf-8') as dictionary_file:
            words = (line.strip('\n') for line in dictionary_file)
            for word in words:
                jieba.add_word(word)
Example #10
Source File: jieba_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0 5 votes vote down vote up
def set_default_dict(tokenizer, path_default_dict):
        """Install path_default_dict as the tokenizer's default jieba
        dictionary and return the tokenizer for chaining."""
        # %s formatting produces the same text as the original str() + concat.
        message = "Setting Jieba Default Dictionary at %s" % (path_default_dict,)
        print(message)
        tokenizer.set_dictionary(path_default_dict)

        return tokenizer
Example #11
Source File: jieba_test.py    From annotated_jieba with MIT License 5 votes vote down vote up
def testSetDictionary(self):
    """Check that cutting still works after pointing jieba at a custom
    dictionary file: cut() stays a generator and yields listable tokens."""
    jieba.set_dictionary("foobar.txt")
    for sample in test_contents:
        cut_result = jieba.cut(sample)
        assert isinstance(cut_result, types.GeneratorType), "Test SetDictionary Generator error"
        cut_result = list(cut_result)
        assert isinstance(cut_result, list), "Test SetDictionary error on content: %s" % sample
        print(" , ".join(cut_result), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
Example #12
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0 5 votes vote down vote up
def testSetDictionary(self):
    """set_dictionary must leave jieba.cut returning a lazy generator whose
    contents can be materialized for every test sentence."""
    jieba.set_dictionary("foobar.txt")
    for text in test_contents:
        pieces = jieba.cut(text)
        # Laziness check first, then materialize and re-check the type.
        assert isinstance(pieces, types.GeneratorType), "Test SetDictionary Generator error"
        pieces = list(pieces)
        assert isinstance(pieces, list), "Test SetDictionary error on content: %s" % text
        print(" , ".join(pieces), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)