Python jieba.load_userdict() Examples
The following are 30 code examples of jieba.load_userdict().
You can go to the original project or source file by following the links above each example. You may also want to check out all other available functions and classes of the jieba module.
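Before the examples, a quick note on the call itself: jieba.load_userdict() takes the path of a UTF-8 text file (or an open file object) in which each line holds one entry of the form word [frequency] [POS tag], the last two fields being optional; the entries are merged into the dictionary already in memory. A minimal sketch using a throwaway dictionary (the words and file name below are illustrative only, not taken from any of the projects that follow):

import os
import tempfile

import jieba

# Write a tiny throwaway user dictionary: one entry per line in the form
# "word [freq] [pos]"; freq and pos are optional. These entries are made up
# purely for illustration.
userdict_path = os.path.join(tempfile.gettempdir(), "demo_userdict.txt")
with open(userdict_path, "w", encoding="utf-8") as f:
    f.write("云计算 5\n")           # word plus an explicit frequency
    f.write("自然语言处理 10 n\n")   # word plus frequency and POS tag

jieba.load_userdict(userdict_path)  # merge the entries into the active dictionary
print("/".join(jieba.cut("我在学习自然语言处理和云计算")))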
Example #1
Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License | 6 votes |
def jieba_tokenize(self, documents):
    '''Cut the documents into a sequence of independent words.

    # Arguments:
        documents: List of news (articles).
    '''
    chnSTW = self.getchnSTW()
    corpora_documents = []
    jieba.load_userdict(self.finance_dict)
    for item_text in documents:
        outstr = []
        sentence_seged = list(jieba.cut(item_text))
        for word in sentence_seged:
            if word not in chnSTW and word != '\t' \
                    and word != ' ':
                outstr.append(word)
        corpora_documents.append(outstr)
    return corpora_documents
Example #2
Source File: text_analyzer.py From public-opinion-analysis with MIT License | 6 votes |
def do_text_analyze(text):
    load_dictionary_to_cache()
    jieba.load_userdict("./resources/dict_terminology.txt")
    article = domain.article.Article(text)
    raw_sentences = article.split_into_sentences(text)
    for raw_sentence in raw_sentences:
        sentence = domain.sentence.Sentence(article.article_id, raw_sentence)
        article.sentences.append(sentence)
    article.cache_raw_seg()
    article.generate_sentence_brief()
    article.generate_sentence_score()
    article.generate_article_score()
    print("Article total score:" + str(article.total_score))
    article.clean_up_cache()
    return article
Example #3
Source File: pre_process.py From nlp-journey with Apache License 2.0 | 6 votes |
def process_data(train_file, user_dict=None, stop_dict=None):
    # load a custom dictionary into jieba (it must follow jieba's user-dictionary format)
    if user_dict:
        jieba.load_userdict(user_dict)
    # load the stop-word list (one stop word per line)
    stop_words = []
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = [stop_word.strip() for stop_word in file.readlines()]
    # read the file, segment each line, and drop stop words
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    sentences = [jieba.lcut(sentence.strip()) for sentence in sentences]
    sentences = [[s for s in sentence if s not in stop_words and s.strip() != ''] for sentence in sentences]
    return sentences
Example #4
Source File: bilstm_crf_entity_extractor.py From rasa_nlu_gq with Apache License 2.0 | 6 votes |
def __init__(self, component_config=None, ent_tagger=None, session=None, char_to_id=None, id_to_tag=None):
    super(BilstmCRFEntityExtractor, self).__init__(component_config)
    self.ent_tagger = ent_tagger  # the trained model
    self.session = session
    self.char_to_id = char_to_id
    self.id_to_tag = id_to_tag
    dictionary_path = self.component_config.get('dictionary_path')
    if dictionary_path:
        jieba.load_userdict(dictionary_path)
    self.seg = jieba
Example #5
Source File: seg_words.py From tf-text-classification with Apache License 2.0 | 6 votes |
def load_dict(path):
    """ Load dictionary """
    jieba.load_userdict(path + 'default.dic')
    stop_words = open(path + 'stop.dic', 'r', encoding='utf-8').readlines()
    stop_dic = {}.fromkeys([line.strip() for line in stop_words])
    single_words = open(path + 'single.dic', 'r', encoding='utf-8').readlines()
    single_dic = {}.fromkeys([line.strip() for line in single_words])
    synonym_words = open(path + 'synonym.dic', 'r', encoding='UTF-8').readlines()
    synonym_dic = dict([line.strip().split(" ", 1) for line in synonym_words])
    return stop_dic, single_dic, synonym_dic
Example #6
Source File: data_preprocess2.py From Customer-Chatbot with MIT License | 5 votes |
def cut(sentence, stopwords, stopword=True, cut_all=False):
    # load the external user dictionary
    jieba.load_userdict('./userdict/userdict.txt')
    seg_list = jieba.cut(sentence, cut_all)
    results = []
    for seg in seg_list:
        if stopword and seg in stopwords:
            continue
        results.append(seg)
    return results
Example #7
Source File: train_word2vec.py From text-classification with Apache License 2.0 | 5 votes |
def segment():
    # jieba custom setting.
    DATA_DIR = os.getcwd() + '/data/user_dict'
    jieba.load_userdict(os.path.join(DATA_DIR, 'Company.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Concept.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Consumer.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Holder.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'HoldingCompany.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'MainComposition.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Manager.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Material.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'OtherCompetitor.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Supplier.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Finance.txt'))

    # load stopwords set
    stopword_set = set()
    with open(os.getcwd() + '/data/user_dict/stopWord.txt', 'r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))

    output = open(config['input_seg'], 'w', encoding='utf-8')
    with open(config['input_raw'], 'r', encoding='utf-8') as content:
        for texts_num, line in enumerate(content):
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')
            if (texts_num + 1) % 10000 == 0:
                logging.info("Segmented %d th articles" % (texts_num + 1))
    output.close()
Example #8
Source File: dbscan_analysis.py From ns4_chatbot with Apache License 2.0 | 5 votes |
def init(self):
    # logger.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logger.DEBUG, filename='log.txt')
    logger.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logger.DEBUG)
    logger.getLogger("gensim").setLevel(logger.WARNING)
    logger.getLogger("jieba").setLevel(logger.WARNING)

    jieba.initialize()
    jieba.load_userdict("data/addwords.txt")
    jieba.analyse.set_stop_words('data/stopwords.txt')
Example #9
Source File: segmentor.py From TextCluster with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, args):
    if args.lang == 'cn':
        import jieba
        if args.dict:
            if not os.path.exists(args.dict):
                print('Segmentor dictionary not found.')
                exit(1)
            jieba.load_userdict(args.dict)
        self.cut = jieba.cut
    else:  # en
        from spacy.tokenizer import Tokenizer
        from spacy.lang.en import English
        nlp = English()
        self.tokenizer = Tokenizer(nlp.vocab)
        self.cut = self.cut_en
Example #10
Source File: crawl.py From MillionHeroAssistant with MIT License | 5 votes |
def jieba_initialize():
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
Example #11
Source File: jiebaSegment.py From Customer-Chatbot with MIT License | 5 votes |
def load_userdict(self, file_name):
    jieba.load_userdict(file_name)
Example #12
Source File: text_segment.py From JiaYuan with Apache License 2.0 | 5 votes |
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))
Example #13
Source File: jiebaSegment.py From Customer-Chatbot with MIT License | 5 votes |
def load_userdict(self, file_name):
    jieba.load_userdict(file_name)
Example #14
Source File: data_preprocess2.py From Customer-Chatbot with MIT License | 5 votes |
def cut(sentence, stopwords, stopword=True, cut_all=False):
    # load the external user dictionary
    jieba.load_userdict('./userdict/userdict.txt')
    seg_list = jieba.cut(sentence, cut_all)
    results = []
    for seg in seg_list:
        if stopword and seg in stopwords:
            continue
        results.append(seg)
    return results
Example #15
Source File: preprocessing.py From seq2seq with Apache License 2.0 | 5 votes |
def __init__(self):
    # self.encoderFile = "/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_ask.txt"
    # self.decoderFile = '/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_answer.txt'
    # self.savePath = '/home/yanwii/Python/NLP/seq2seq/seq2seq_pytorch/data/'
    self.encoderFile = "./data/question.txt"
    self.decoderFile = "./data/answer.txt"
    self.savePath = './data/'
    jieba.load_userdict("./data/supplementvocab.txt")
Example #16
Source File: jieba_tokenizer.py From rasa-for-botfront with Apache License 2.0 | 5 votes |
def load_custom_dictionary(path: Text) -> None:
    """Load all the custom dictionaries stored in the path.

    More information about the dictionaries file format can
    be found in the documentation of jieba.
    https://github.com/fxsjy/jieba#load-dictionary
    """
    import jieba

    jieba_userdicts = glob.glob(f"{path}/*")
    for jieba_userdict in jieba_userdicts:
        logger.info(f"Loading Jieba User Dictionary at {jieba_userdict}")
        jieba.load_userdict(jieba_userdict)
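The docstring above points to jieba's documentation for the dictionary file format. A hedged usage sketch of this kind of directory-based loader, assuming the function above (together with its glob and logger imports) is in scope; the directory name and dictionary entries are invented for illustration:

import os
import tempfile

# Create a scratch directory holding two small dictionary files, each line
# following jieba's "word [freq] [pos]" format, then point the helper at it.
dict_dir = os.path.join(tempfile.gettempdir(), "jieba_userdicts")
os.makedirs(dict_dir, exist_ok=True)
for name, entries in [("products.txt", "云计算 5\n"), ("orgs.txt", "深度学习实验室 3 nt\n")]:
    with open(os.path.join(dict_dir, name), "w", encoding="utf-8") as f:
        f.write(entries)

# Every file found in the directory is passed to jieba.load_userdict.
load_custom_dictionary(dict_dir)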
Example #17
Source File: preprocess.py From deep-siamese-text-similarity with MIT License | 5 votes |
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        sentence = sentence.decode("utf8")
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"), "".decode("utf8"), sentence)
        yield list(jieba.lcut(sentence))
Example #18
Source File: preprocess.py From deep-siamese-text-similarity with MIT License | 5 votes |
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        yield list(jieba.lcut(sentence))
Example #19
Source File: jiebaSegment.py From QAmodel-for-Retrievalchatbot with MIT License | 5 votes |
def load_userdict(self, file_name):
    jieba.load_userdict(file_name)
Example #20
Source File: jieba_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def set_user_dicts(tokenizer, path_user_dicts):
    if len(path_user_dicts) > 0:
        for path_user_dict in path_user_dicts:
            print("Loading Jieba User Dictionary at " + str(path_user_dict))
            tokenizer.load_userdict(path_user_dict)
    else:
        print("No Jieba User Dictionary found")
    return tokenizer
Example #21
Source File: segment.py From nlp_toolkit with MIT License | 5 votes |
def __init__(self, user_dict='', model_name='word-rnn', mode='accurate', verbose=0):
    try:
        assert mode in ['accurate', 'fast']
    except:
        print('Only the following modes are supported: accurate, fast')
        sys.exit()
    self.pos = True
    self.mode = mode
    self.verbose = verbose
    self.path = os.path.abspath(os.path.dirname(__file__))
    if model_name != '':
        self.model_name = model_name
    else:
        try:
            self.model_name = read_line(Path(self.path) / 'data' / 'best_model.txt')[0]
        except Exception:
            self.model_name = model_name
    # initialize jieba
    base_dict = Path(self.path) / 'data' / 'dict' / 'jieba_base_supplyment.txt'
    jieba.load_userdict(str(base_dict))
    if mode == 'fast':
        global load_dict
        if not load_dict:
            if self.verbose:
                print('loading np dict to jieba cache')
            dict_path = Path(self.path) / 'data' / 'dict' / 'chunk_pos.txt'
            jieba.load_userdict(str(dict_path))
            load_dict = True
    if user_dict:
        jieba.load_userdict(user_dict)
    self.seg = pseg
    # model files
    self.weight_file = os.path.join(self.path, 'data/model/%s_weights.h5' % self.model_name)
    self.param_file = os.path.join(self.path, 'data/model/%s_parameters.json' % self.model_name)
    self.preprocess_file = os.path.join(self.path, 'data/model/%s_transformer.h5' % self.model_name)
    self.define_tagger()
Example #22
Source File: hot_words_generator.py From LagouJob with Apache License 2.0 | 5 votes |
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
    """
    calculate and show hot words of Job Impression
    :param interviewee_comments_dir:
    :return:
    """
    if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
        print('Error! No valid content in {0}'.format(interviewee_comments_dir))
        sys.exit(0)
    else:
        job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}

        for k, v in job_and_dir.items():
            text = self.concat_all_text(v)
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
            print(frequencies)

            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False, mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Example #23
Source File: jieba_tokenizer.py From rasa_bot with Apache License 2.0 | 5 votes |
def load_custom_dictionary(path):
    # type: (Text) -> None
    """Load all the custom dictionaries stored in the path.

    More information about the dictionaries file format can
    be found in the documentation of jieba.
    https://github.com/fxsjy/jieba#load-dictionary
    """
    import jieba

    jieba_userdicts = glob.glob("{}/*".format(path))
    for jieba_userdict in jieba_userdicts:
        logger.info("Loading Jieba User Dictionary at "
                    "{}".format(jieba_userdict))
        jieba.load_userdict(jieba_userdict)
Example #24
Source File: jieba_tokenizer.py From rasa_nlu with Apache License 2.0 | 5 votes |
def load_custom_dictionary(path: Text) -> None:
    """Load all the custom dictionaries stored in the path.

    More information about the dictionaries file format can
    be found in the documentation of jieba.
    https://github.com/fxsjy/jieba#load-dictionary
    """
    import jieba

    jieba_userdicts = glob.glob("{}/*".format(path))
    for jieba_userdict in jieba_userdicts:
        logger.info("Loading Jieba User Dictionary at "
                    "{}".format(jieba_userdict))
        jieba.load_userdict(jieba_userdict)
Example #25
Source File: jieba_pseg_extractor.py From rasa_nlu_gq with Apache License 2.0 | 5 votes |
def __init__(self, component_config=None):
    # type: (Optional[Dict[Text, Text]]) -> None
    super(JiebaPsegExtractor, self).__init__(component_config)
    dictionary_path = self.component_config.get('dictionary_path')
    if dictionary_path is not None:
        jieba.load_userdict(dictionary_path)
Example #26
Source File: cutWord.py From cn-text-classifier with GNU General Public License v3.0 | 5 votes |
def addDictionary(self, dict_list):
    """Add a list of user-defined dictionaries."""
    # map() is lazy in Python 3, so force evaluation to make sure every
    # dictionary in the list is actually loaded
    list(map(lambda x: jieba.load_userdict(x), dict_list))
Example #27
Source File: SentenceSegment.py From Distant-Supervised-Chinese-Relation-Extraction with MIT License | 5 votes |
def __init__(self, dict_file, stop_word_file, sentences_folder, process_num):
    self.process_num = process_num
    self.stop_word = read_txt(stop_word_file)
    jieba.load_userdict(dict_file)

    # sentence files
    self.sentence_files = []
    for root, dirs, files in os.walk(sentences_folder):
        for file in files:
            self.sentence_files.append(os.path.join(root, file))
Example #28
Source File: tf_idf.py From SMPCUP2017 with MIT License | 5 votes |
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus
Example #29
Source File: chinese.py From Multi-Label-Text-Classification-for-Chinese with MIT License | 5 votes |
def __init__(self, stopwords_path="", userdict_path="", *args, **kwargs):
    super().__init__(*args, **kwargs)
    if userdict_path and os.path.exists(userdict_path):
        jieba.load_userdict(str(userdict_path))
    self.reset(stopwords_path)
Example #30
Source File: similarity.py From sentence-similarity with MIT License | 5 votes |
def __init__(self):
    t1 = time.time()
    self.voc = load_voc(file_voc)
    print("Loading word2vec vector cost %.3f seconds...\n" % (time.time() - t1))
    t1 = time.time()
    self.idf = load_idf(file_idf)
    print("Loading idf data cost %.3f seconds...\n" % (time.time() - t1))
    jieba.load_userdict(file_userdict)