Python jieba.set_dictionary() Examples
The following are 12 code examples of jieba.set_dictionary().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module jieba, or try the search function.
Example #1
Source File: segment_word.py From athena with Apache License 2.0 | 6 votes |
def segment_trans(vocab_file, text_file):
    """Segment transcripts according to vocab using the Maximum Matching Algorithm.

    Args:
        vocab_file: path to the vocab file used as jieba's dictionary
        text_file: path to the transcripts file, one transcript per line
    Returns:
        seg_trans: segmented words as one string, one space-joined line
            per transcript, each terminated by '\n'
    """
    jieba.set_dictionary(vocab_file)
    with open(text_file, "r", encoding="utf-8") as text:
        lines = text.readlines()
    # Collect pieces in a list and join once: repeated `sents += ...`
    # string concatenation is quadratic in the number of lines.
    segmented = []
    for line in lines:
        seg_line = jieba.cut(line.strip(), HMM=False)
        segmented.append(' '.join(seg_line) + '\n')
    return ''.join(segmented)
Example #2
Source File: segment_word.py From athena with Apache License 2.0 | 6 votes |
def segment_trans(vocab_file, text_file):
    """Segment transcripts according to vocab using the Maximum Matching Algorithm.

    Args:
        vocab_file: path to the vocab file used as jieba's dictionary
        text_file: path to the transcripts file, one transcript per line
    Returns:
        seg_trans: segmented words as one string, one space-joined line
            per transcript, each terminated by '\n'
    """
    jieba.set_dictionary(vocab_file)
    with open(text_file, "r", encoding="utf-8") as text:
        lines = text.readlines()
    # Join a list of per-line results instead of `sents += ...`;
    # in-place string concatenation in a loop is quadratic.
    parts = [' '.join(jieba.cut(line.strip(), HMM=False)) + '\n' for line in lines]
    return ''.join(parts)
Example #3
Source File: jieba_test.py From jieba_fast with MIT License | 5 votes |
def testSetDictionary(self):
    """Check that jieba.cut still yields a generator after set_dictionary()."""
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        tokens = jieba.cut(content)
        assert isinstance(tokens, types.GeneratorType), "Test SetDictionary Generator error"
        words = list(tokens)
        assert isinstance(words, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(words), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
Example #4
Source File: segment.py From word2vec-tutorial with MIT License | 5 votes |
def main():
    """Segment wiki_zh_tw.txt with jieba (minus stopwords) into wiki_seg.txt."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')
    # Load the stopwords set, one word per line.
    stopword_set = set()
    with io.open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))
    # Open the output in a context manager so it is closed even if
    # segmentation raises part-way through (the original leaked the
    # handle on error because it relied on a trailing close()).
    with io.open('wiki_seg.txt', 'w', encoding='utf-8') as output:
        with io.open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
            for texts_num, line in enumerate(content):
                line = line.strip('\n')
                words = jieba.cut(line, cut_all=False)
                for word in words:
                    if word not in stopword_set:
                        output.write(word + ' ')
                output.write('\n')
                # Progress log every 10000 lines.
                if (texts_num + 1) % 10000 == 0:
                    logging.info("已完成前 %d 行的斷詞" % (texts_num + 1))
Example #5
Source File: segment.py From word2vec-tutorial with MIT License | 5 votes |
def main():
    """Segment wiki_zh_tw.txt with jieba (minus stopwords) into wiki_seg.txt."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')
    # Load the stopwords set, one word per line.
    stopword_set = set()
    with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))
    # Use a context manager for the output file so it is closed even if
    # an exception interrupts the loop (the original only closed it on
    # the success path via output.close()).
    with open('wiki_seg.txt', 'w', encoding='utf-8') as output:
        with open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
            for texts_num, line in enumerate(content):
                line = line.strip('\n')
                words = jieba.cut(line, cut_all=False)
                for word in words:
                    if word not in stopword_set:
                        output.write(word + ' ')
                output.write('\n')
                # Progress log every 10000 lines.
                if (texts_num + 1) % 10000 == 0:
                    logging.info("已完成前 %d 行的斷詞" % (texts_num + 1))
Example #6
Source File: console.py From Chatbot with GNU General Public License v3.0 | 5 votes |
def init_jieba(self, seg_dic, userdic):
    """Apply jieba custom settings: main dictionary plus user dictionary.

    Args:
        seg_dic: path to the main segmentation dictionary
        userdic: path to the user-defined dictionary; each word's
            frequency is also boosted via suggest_freq
    """
    jieba.load_userdict(userdic)
    jieba.set_dictionary(seg_dic)
    # Renamed the file handle: the original called it `input`,
    # shadowing the builtin of the same name.
    with open(userdic, 'r', encoding='utf-8') as dic_file:
        for word in dic_file:
            word = word.strip('\n')
            jieba.suggest_freq(word, True)
Example #7
Source File: matcher.py From Chatbot with GNU General Public License v3.0 | 5 votes |
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    """Point jieba at a custom main dictionary and register user words.

    Args:
        dict_path: path to the main segmentation dictionary
        usr_dict_path: path to a file of user words, one per line
    """
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, 'r', encoding='utf-8') as user_dict:
        for entry in user_dict:
            jieba.add_word(entry.strip('\n'))
Example #8
Source File: plugins_test.py From kim-voice-assistant with MIT License | 5 votes |
def setUp(self):
    """Configure the Chinese word-segmentation dictionary before each test."""
    dict_path = APP_RESOURCES_DATA_PATH + 'jieba.dict'
    jieba.set_dictionary(dict_path)
Example #9
Source File: matcher.py From PTT-Chat-Generator with MIT License | 5 votes |
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    """Load a custom jieba main dictionary, then add user-defined words.

    Args:
        dict_path: path to the main segmentation dictionary
        usr_dict_path: path to the user dictionary, one word per line
    """
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, 'r', encoding='utf-8') as dic:
        for line in dic:
            word = line.strip('\n')
            jieba.add_word(word)
Example #10
Source File: jieba_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def set_default_dict(tokenizer, path_default_dict):
    """Install *path_default_dict* as the tokenizer's dictionary.

    Args:
        tokenizer: jieba-style tokenizer exposing ``set_dictionary``.
        path_default_dict: path of the dictionary file to install.
    Returns:
        The same tokenizer, so the call can be chained.
    """
    print("Setting Jieba Default Dictionary at " + str(path_default_dict))
    tokenizer.set_dictionary(path_default_dict)
    return tokenizer
Example #11
Source File: jieba_test.py From annotated_jieba with MIT License | 5 votes |
def testSetDictionary(self):
    """Ensure cut() keeps returning a lazy generator after a dictionary swap."""
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        cut_iter = jieba.cut(content)
        assert isinstance(cut_iter, types.GeneratorType), "Test SetDictionary Generator error"
        cut_list = list(cut_iter)
        assert isinstance(cut_list, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(cut_list), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
Example #12
Source File: jieba_test.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def testSetDictionary(self):
    """After set_dictionary, each cut() call must yield a generator of words."""
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        segments = jieba.cut(content)
        assert isinstance(segments, types.GeneratorType), "Test SetDictionary Generator error"
        materialized = list(segments)
        assert isinstance(materialized, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(materialized), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)