Python jieba.load_userdict() Examples

The following are 30 code examples of jieba.load_userdict(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module jieba, or try the search function.
Example #1
Source File: text_processing.py    From Listed-company-news-crawl-and-text-analysis with MIT License 6 votes vote down vote up
def jieba_tokenize(self, documents):
        """Segment every document into a list of words with jieba.

        Tokens found in the collection returned by ``self.getchnSTW()``, as
        well as tab and single-space tokens, are left out of the result.

        # Arguments:
            documents: List of news (articles).

        # Returns:
            A list holding one list of kept tokens per document.
        """
        stop_words = self.getchnSTW()
        jieba.load_userdict(self.finance_dict)
        return [
            [
                token
                for token in jieba.cut(text)
                if not (token in stop_words or token == '\t' or token == ' ')
            ]
            for text in documents
        ]
Example #2
Source File: text_analyzer.py    From public-opinion-analysis with MIT License 6 votes vote down vote up
def do_text_analyze(text):
    """Run the analysis pipeline on ``text`` and return the resulting article.

    Args:
        text: Raw article text.

    Returns:
        The ``domain.article.Article`` built from ``text`` after its
        sentences were attached and the brief/score steps were run.
    """
    load_dictionary_to_cache()
    # Load the terminology user dictionary into jieba.
    jieba.load_userdict("./resources/dict_terminology.txt")

    article = domain.article.Article(text)
    raw_sentences = article.split_into_sentences(text)
    # Wrap each raw sentence in a Sentence tied to this article's id.
    for raw_sentence in raw_sentences:
        sentence = domain.sentence.Sentence(article.article_id, raw_sentence)
        article.sentences.append(sentence)

    # The steps below run in this fixed order; presumably each one relies on
    # the results of the previous ones -- confirm before reordering.
    article.cache_raw_seg()
    article.generate_sentence_brief()
    article.generate_sentence_score()
    article.generate_article_score()
    print("Article total score:" + str(article.total_score))
    article.clean_up_cache()
    return article 
Example #3
Source File: pre_process.py    From nlp-journey with Apache License 2.0 6 votes vote down vote up
def process_data(train_file, user_dict=None, stop_dict=None):
    """Tokenize every line of ``train_file`` with jieba and drop stop words.

    Args:
        train_file: Path of a UTF-8 text file, one sentence per line.
        user_dict: Optional path of a jieba user dictionary (it must follow
            jieba's user-dictionary format); loaded before segmentation.
        stop_dict: Optional path of a UTF-8 stop-word file, one word per line.

    Returns:
        A list with one token list per input line; stop words and
        whitespace-only tokens are removed.
    """
    # Load the custom jieba dictionary, if one was given.
    if user_dict:
        jieba.load_userdict(user_dict)

    # Load the stop-word table (one stop word per line). A set keeps the
    # membership test in the filter below O(1) instead of scanning a list.
    stop_words = set()
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as stop_file:
            stop_words = {line.strip() for line in stop_file}

    # Read the file, segment each line and remove the stop words.
    sentences = []
    with open(train_file, 'r', encoding='utf-8') as train:
        for line in train:
            tokens = jieba.lcut(line.strip())
            sentences.append(
                [token for token in tokens
                 if token not in stop_words and token.strip() != '']
            )

    return sentences
Example #4
Source File: bilstm_crf_entity_extractor.py    From rasa_nlu_gq with Apache License 2.0 6 votes vote down vote up
def __init__(self,
                 component_config=None,
                 ent_tagger=None,
                 session=None,
                 char_to_id=None,
                 id_to_tag=None):
        """Store the tagger state and load an optional jieba user dictionary.

        Args:
            component_config: Configuration handed to the parent constructor;
                its ``dictionary_path`` entry is read afterwards.
            ent_tagger: The trained model.
            session: Stored as ``self.session``.
            char_to_id: Stored as ``self.char_to_id``.
            id_to_tag: Stored as ``self.id_to_tag``.
        """
        super(BilstmCRFEntityExtractor, self).__init__(component_config)

        self.ent_tagger = ent_tagger  # refers to the trained model
        self.session = session
        self.char_to_id = char_to_id
        self.id_to_tag = id_to_tag
        dictionary_path = self.component_config.get('dictionary_path')

        # Only load a user dictionary when a non-empty path is configured.
        if dictionary_path:
            jieba.load_userdict(dictionary_path)

        # The jieba module itself is kept as the segmenter.
        self.seg = jieba 
Example #5
Source File: seg_words.py    From tf-text-classification with Apache License 2.0 6 votes vote down vote up
def load_dict(path):
    """Load the jieba user dictionary and the auxiliary word tables.

    Args:
        path: Prefix that is concatenated directly with the file names
            (``default.dic``, ``stop.dic``, ``single.dic``, ``synonym.dic``),
            so it is expected to end with a path separator.

    Returns:
        A ``(stop_dic, single_dic, synonym_dic)`` tuple. The first two map
        every stripped line of their file to ``None``; ``synonym_dic`` maps
        the text before the first space of each ``synonym.dic`` line to the
        remainder of that line.

    Raises:
        ValueError: If a ``synonym.dic`` line contains no space.
    """
    jieba.load_userdict(path + 'default.dic')

    # Context managers make sure each file is closed, even if reading or
    # parsing fails part-way through.
    with open(path + 'stop.dic', 'r', encoding='utf-8') as stop_file:
        stop_dic = dict.fromkeys(line.strip() for line in stop_file)

    with open(path + 'single.dic', 'r', encoding='utf-8') as single_file:
        single_dic = dict.fromkeys(line.strip() for line in single_file)

    with open(path + 'synonym.dic', 'r', encoding='UTF-8') as synonym_file:
        synonym_dic = dict(line.strip().split(" ", 1) for line in synonym_file)

    return stop_dic, single_dic, synonym_dic
Example #6
Source File: data_preprocess2.py    From Customer-Chatbot with MIT License 5 votes vote down vote up
def cut(sentence, stopwords, stopword=True, cut_all=False):
    """Segment ``sentence`` with jieba, optionally dropping stop words.

    Args:
        sentence: Text to segment.
        stopwords: Collection of tokens to drop when ``stopword`` is truthy.
        stopword: Whether stop-word filtering is applied.
        cut_all: Passed to ``jieba.cut`` as its second positional argument.

    Returns:
        The list of kept tokens, in segmentation order.
    """
    # Load the external dictionary.
    jieba.load_userdict('./userdict/userdict.txt')
    segments = jieba.cut(sentence, cut_all)
    if not stopword:
        return list(segments)
    return [segment for segment in segments if segment not in stopwords]
Example #7
Source File: train_word2vec.py    From text-classification with Apache License 2.0 5 votes vote down vote up
def segment():
    """Segment the raw corpus with jieba and write the kept tokens to disk.

    Loads the user dictionaries under ``<cwd>/data/user_dict``, reads the stop
    words from ``stopWord.txt`` in that directory, then tokenizes
    ``config['input_raw']`` line by line. Every token that is not a stop word
    is written to ``config['input_seg']`` followed by one space, with one
    output line per input line. Progress is logged every 10000 lines.
    """
    # jieba custom setting.
    DATA_DIR = os.getcwd() + '/data/user_dict'
    dict_names = (
        'Company.txt',
        'Concept.txt',
        'Consumer.txt',
        'Holder.txt',
        'HoldingCompany.txt',
        'MainComposition.txt',
        'Manager.txt',
        'Material.txt',
        'OtherCompetitor.txt',
        'Supplier.txt',
        'Finance.txt',
    )
    for dict_name in dict_names:
        jieba.load_userdict(os.path.join(DATA_DIR, dict_name))

    # load stopwords set
    with open(os.getcwd()+'/data/user_dict/stopWord.txt', 'r', encoding='utf-8') as stopwords:
        stopword_set = {stopword.strip('\n') for stopword in stopwords}

    # Both files live in one ``with`` so the output is flushed and closed even
    # when segmentation raises part-way through.
    with open(config['input_seg'], 'w', encoding='utf-8') as output, \
            open(config['input_raw'], 'r', encoding='utf-8') as content:
        for texts_num, line in enumerate(content, start=1):
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            output.writelines(word + ' ' for word in words if word not in stopword_set)
            output.write('\n')

            if texts_num % 10000 == 0:
                logging.info("Segmented %d th articles", texts_num)
Example #8
Source File: dbscan_analysis.py    From ns4_chatbot with Apache License 2.0 5 votes vote down vote up
def init(self):
		"""Configure logging and prepare jieba (dictionary and stop words)."""
		# NOTE(review): ``logger`` is used like the standard ``logging`` module
		# here (basicConfig/getLogger/DEBUG) -- presumably an import alias; confirm.
		# logger.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logger.DEBUG,filename='log.txt')
		logger.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logger.DEBUG)
		# Raise the level of the gensim and jieba loggers to WARNING.
		logger.getLogger("gensim").setLevel(logger.WARNING)
		logger.getLogger("jieba").setLevel(logger.WARNING)

		jieba.initialize()
		jieba.load_userdict("data/addwords.txt")
		jieba.analyse.set_stop_words('data/stopwords.txt') 
Example #9
Source File: segmentor.py    From TextCluster with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, args):
        """Select the word-segmentation backend for ``args.lang``.

        Args:
            args: Options object. ``args.lang == 'cn'`` selects jieba, any
                other value the spaCy English tokenizer. ``args.dict`` is an
                optional jieba user-dictionary path, only read for ``'cn'``.

        Raises:
            SystemExit: With status 1 when ``args.dict`` is set but the file
                does not exist.
        """
        if args.lang == 'cn':
            import jieba
            if args.dict:
                if not os.path.exists(args.dict):
                    print('Segmentor dictionary not found.')
                    # ``exit`` is an interactive helper added by the ``site``
                    # module and is not guaranteed to exist (e.g. ``python -S``);
                    # raising SystemExit is the portable equivalent.
                    raise SystemExit(1)
                jieba.load_userdict(args.dict)
            self.cut = jieba.cut
        else:  # en
            from spacy.tokenizer import Tokenizer
            from spacy.lang.en import English
            nlp = English()
            self.tokenizer = Tokenizer(nlp.vocab)
            self.cut = self.cut_en
Example #10
Source File: crawl.py    From MillionHeroAssistant with MIT License 5 votes vote down vote up
def jieba_initialize():
    """Set up jieba: parallel mode off Windows, user dictionary, then initialize."""
    on_windows = platform.system().upper().startswith("WINDOWS")
    if not on_windows:
        # Parallel mode is only switched on for non-Windows platforms.
        worker_count = multiprocessing.cpu_count()
        jieba.enable_parallel(worker_count)
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
Example #11
Source File: jiebaSegment.py    From Customer-Chatbot with MIT License 5 votes vote down vote up
def load_userdict(self, file_name):
        """Load the jieba user dictionary given by ``file_name``."""
        jieba.load_userdict(file_name) 
Example #12
Source File: text_segment.py    From JiaYuan with Apache License 2.0 5 votes vote down vote up
def segment_text(text):
    """Print the top-20 weighted keywords that jieba extracts from ``text``.

    Each keyword goes on its own line as ``keyword<TAB>weight``.
    """
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    keywords = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for keyword, weight in keywords:
        print(str(keyword) + "\t" + str(weight))
Example #13
Source File: jiebaSegment.py    From Customer-Chatbot with MIT License 5 votes vote down vote up
def load_userdict(self, file_name):
        """Forward ``file_name`` to ``jieba.load_userdict``."""
        jieba.load_userdict(file_name) 
Example #14
Source File: data_preprocess2.py    From Customer-Chatbot with MIT License 5 votes vote down vote up
def cut(sentence, stopwords, stopword=True, cut_all=False):
    """Return the jieba segments of ``sentence``, minus stop words if requested.

    Args:
        sentence: Text to segment.
        stopwords: Collection of tokens removed when ``stopword`` is truthy.
        stopword: Enables stop-word removal.
        cut_all: Forwarded to ``jieba.cut`` as its second positional argument.

    Returns:
        List of the remaining tokens in their original order.
    """
    # Load the external dictionary.
    jieba.load_userdict('./userdict/userdict.txt')
    return [
        token
        for token in jieba.cut(sentence, cut_all)
        if not (stopword and token in stopwords)
    ]
Example #15
Source File: preprocessing.py    From seq2seq with Apache License 2.0 5 votes vote down vote up
def __init__(self):
        """Set the data file locations and load the supplementary jieba vocabulary."""
        # Earlier absolute-path settings, kept commented out for reference:
        #self.encoderFile = "/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_ask.txt"
        #self.decoderFile = '/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_answer.txt'
        #self.savePath = '/home/yanwii/Python/NLP/seq2seq/seq2seq_pytorch/data/'
        # All paths are relative to the current working directory.
        self.encoderFile = "./data/question.txt"
        self.decoderFile = "./data/answer.txt"
        self.savePath = './data/'
        
        jieba.load_userdict("./data/supplementvocab.txt") 
Example #16
Source File: jieba_tokenizer.py    From rasa-for-botfront with Apache License 2.0 5 votes vote down vote up
def load_custom_dictionary(path: Text) -> None:
        """Register every entry matched by ``<path>/*`` as a jieba user dictionary.

        The dictionary file format is described in the jieba documentation:
        https://github.com/fxsjy/jieba#load-dictionary
        """
        import jieba

        for dictionary_file in glob.glob(f"{path}/*"):
            logger.info(f"Loading Jieba User Dictionary at {dictionary_file}")
            jieba.load_userdict(dictionary_file)
Example #17
Source File: preprocess.py    From deep-siamese-text-similarity with MIT License 5 votes vote down vote up
def tokenizer_word(iterator):
    """Yield the jieba token list of each sentence after stripping punctuation.

    NOTE(review): the ``.decode("utf8")`` calls on the sentence and on string
    literals only work on Python 2 byte strings.

    Because this is a generator, the user dictionary is loaded when the first
    item is requested, not when the function is called.
    """
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        sentence = sentence.decode("utf8")
        # Remove every match of the punctuation/whitespace pattern.
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"), "".decode("utf8"),
                          sentence)
        yield list(jieba.lcut(sentence)) 
Example #18
Source File: preprocess.py    From deep-siamese-text-similarity with MIT License 5 votes vote down vote up
def tokenizer_word(iterator):
    """Yield the jieba token list of each sentence in ``iterator``.

    Because this is a generator, the user dictionary is loaded when the first
    item is requested, not when the function is called.
    """
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        yield list(jieba.lcut(sentence)) 
Example #19
Source File: jiebaSegment.py    From QAmodel-for-Retrievalchatbot with MIT License 5 votes vote down vote up
def load_userdict(self,file_name):
        """Pass ``file_name`` on to ``jieba.load_userdict``."""
        jieba.load_userdict(file_name) 
Example #20
Source File: jieba_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0 5 votes vote down vote up
def set_user_dicts(tokenizer, path_user_dicts):
        """Load each user dictionary into ``tokenizer`` and return it.

        Args:
            tokenizer: Object exposing ``load_userdict(path)``.
            path_user_dicts: Collection of dictionary paths. ``None`` or an
                empty collection loads nothing and only prints a notice.

        Returns:
            The same ``tokenizer`` object that was passed in.
        """
        # Truthiness also covers ``None``, which ``len()`` would reject.
        if not path_user_dicts:
            print("No Jieba User Dictionary found")
            return tokenizer

        for path_user_dict in path_user_dicts:
            print("Loading Jieba User Dictionary at " + str(path_user_dict))
            tokenizer.load_userdict(path_user_dict)

        return tokenizer
Example #21
Source File: segment.py    From nlp_toolkit with MIT License 5 votes vote down vote up
def __init__(self, user_dict='', model_name='word-rnn', mode='accurate', verbose=0):
        """Configure the segmenter: jieba dictionaries, model files and tagger.

        Args:
            user_dict: Optional path of an extra jieba user dictionary.
            model_name: Name used to build the weight/parameter/transformer
                file paths. When empty, the name is read from the first line
                of ``data/best_model.txt``; if that fails it stays empty.
            mode: ``'accurate'`` or ``'fast'``. ``'fast'`` additionally loads
                ``chunk_pos.txt`` into jieba, guarded by the module-level
                ``load_dict`` flag so it happens only once.
            verbose: Truthy to print a message when that dictionary is loaded.

        Raises:
            SystemExit: If ``mode`` is not a supported value.
        """
        global load_dict

        # Explicit check instead of ``assert`` inside a bare ``except``: an
        # assert is removed under ``python -O`` and would skip validation.
        if mode not in ('accurate', 'fast'):
            print('Only support the following modes: accurate, fast')
            sys.exit()
        self.pos = True
        self.mode = mode
        self.verbose = verbose
        self.path = os.path.abspath(os.path.dirname(__file__))
        if model_name != '':
            self.model_name = model_name
        else:
            # Best effort: fall back to the (empty) given name on any failure.
            try:
                self.model_name = read_line(Path(self.path) / 'data' / 'best_model.txt')[0]
            except Exception:
                self.model_name = model_name

        # jieba initialization
        base_dict = Path(self.path) / 'data' / 'dict' / 'jieba_base_supplyment.txt'
        jieba.load_userdict(str(base_dict))
        if mode == 'fast' and not load_dict:
            if self.verbose:
                print('loading np dict to jieba cache')
            dict_path = Path(self.path) / 'data' / 'dict' / 'chunk_pos.txt'
            jieba.load_userdict(str(dict_path))
            load_dict = True
        if user_dict:
            jieba.load_userdict(user_dict)
        self.seg = pseg

        # model variables
        self.weight_file = os.path.join(self.path, 'data/model/%s_weights.h5' % self.model_name)
        self.param_file = os.path.join(self.path, 'data/model/%s_parameters.json' % self.model_name)
        self.preprocess_file = os.path.join(self.path, 'data/model/%s_transformer.h5' % self.model_name)
        self.define_tagger()
Example #22
Source File: hot_words_generator.py    From LagouJob with Apache License 2.0 5 votes vote down vote up
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
        """
        calculate and show hot words of Job Impression

        For every entry of ``interviewee_comments_dir`` the text returned by
        ``self.concat_all_text`` is reduced to its top-30 weighted keywords,
        which are printed and rendered as a word cloud.

        :param interviewee_comments_dir: directory holding one entry per job
        :return: None; the process exits with status 0 when the directory is
            missing or empty
        """
        if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
            print('Error! No valid content in {0}'.format(interviewee_comments_dir))
            sys.exit(0)

        import matplotlib.pyplot as plt

        # The stop words and user dictionary are the same for every job, so
        # load them once instead of re-reading both files on each iteration.
        jieba.analyse.set_stop_words(STOPWORDS_PATH)
        jieba.load_userdict(USER_CORPUS)

        for job_name in os.listdir(interviewee_comments_dir):
            text = self.concat_all_text(os.path.join(interviewee_comments_dir, job_name))
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {word: weight for word, weight in hot_words_with_weights}

            print(frequencies)

            # 300x300 mask: 255 outside the circle of radius 130 centred at
            # (150, 150), 0 inside it.
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Example #23
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0 5 votes vote down vote up
def load_custom_dictionary(path):
        # type: (Text) -> None
        """Feed every entry matched by ``<path>/*`` to ``jieba.load_userdict``.

        See the jieba documentation for the dictionary file format:
        https://github.com/fxsjy/jieba#load-dictionary
        """
        import jieba

        pattern = "{}/*".format(path)
        for dictionary_file in glob.glob(pattern):
            message = "Loading Jieba User Dictionary at {}".format(dictionary_file)
            logger.info(message)
            jieba.load_userdict(dictionary_file)
Example #24
Source File: jieba_tokenizer.py    From rasa_nlu with Apache License 2.0 5 votes vote down vote up
def load_custom_dictionary(path: Text) -> None:
        """Load each entry matched by ``<path>/*`` as a jieba user dictionary.

        Details on the dictionary file format are in the jieba documentation:
        https://github.com/fxsjy/jieba#load-dictionary
        """
        import jieba

        for userdict_file in glob.glob("{}/*".format(path)):
            logger.info("Loading Jieba User Dictionary at {}".format(userdict_file))
            jieba.load_userdict(userdict_file)
Example #25
Source File: jieba_pseg_extractor.py    From rasa_nlu_gq with Apache License 2.0 5 votes vote down vote up
def __init__(self, component_config=None):
        # type: (Optional[Dict[Text, Text]]) -> None
        """Initialise the component and load an optional jieba user dictionary.

        Args:
            component_config: Configuration handed to the parent constructor;
                its ``dictionary_path`` entry, when non-empty, is loaded into
                jieba.
        """
        super(JiebaPsegExtractor, self).__init__(component_config)
        dictionary_path = self.component_config.get('dictionary_path')

        # Truthiness skips an empty-string path as well as a missing one;
        # jieba would otherwise try to open '' and fail.
        if dictionary_path:
            jieba.load_userdict(dictionary_path)
Example #26
Source File: cutWord.py    From cn-text-classifier with GNU General Public License v3.0 5 votes vote down vote up
def addDictionary(self, dict_list):
        """
        Add a list of user-defined dictionaries.

        :param dict_list: iterable of dictionary file paths; each one is
            passed to ``jieba.load_userdict``
        :return: None
        """
        # ``map`` returns a lazy iterator on Python 3, so a bare ``map(...)``
        # never called ``load_userdict``; loop explicitly instead.
        for dict_path in dict_list:
            jieba.load_userdict(dict_path)
Example #27
Source File: SentenceSegment.py    From Distant-Supervised-Chinese-Relation-Extraction with MIT License 5 votes vote down vote up
def __init__(self, dict_file, stop_word_file, sentences_folder, process_num):
        """Load stop words and the jieba dictionary, then index the sentence files.

        Args:
            dict_file: Path handed to ``jieba.load_userdict``.
            stop_word_file: Path handed to ``read_txt``; the result is kept as
                ``self.stop_word``.
            sentences_folder: Root folder walked recursively; every file path
                found is collected in ``self.sentence_files``.
            process_num: Stored as ``self.process_num``.
        """
        self.process_num = process_num
        self.stop_word = read_txt(stop_word_file)
        jieba.load_userdict(dict_file)
        # sentence files
        self.sentence_files = [
            os.path.join(folder, file_name)
            for folder, _subdirs, file_names in os.walk(sentences_folder)
            for file_name in file_names
        ]
Example #28
Source File: tf_idf.py    From SMPCUP2017 with MIT License 5 votes vote down vote up
def tf_idf(texts):
    """Extract the top-15 jieba keywords of every text and filter them.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one ``filter(...)`` result per text.
    """
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    # NOTE(review): ``filter`` is called with a single argument, which the
    # builtin rejects -- presumably a project-level ``filter`` helper shadows
    # it; confirm against the module's imports.
    return [filter(jieba.analyse.extract_tags(text, topK = 15)) for text in texts]
Example #29
Source File: chinese.py    From Multi-Label-Text-Classification-for-Chinese with MIT License 5 votes vote down vote up
def __init__(self, stopwords_path="", userdict_path="", *args, **kwargs):
        """Initialise the parent, load an optional jieba dictionary and reset.

        Args:
            stopwords_path: Passed to ``self.reset``.
            userdict_path: Optional jieba user-dictionary path; it is loaded
                only when it is non-empty and exists on disk.
            *args: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        # A missing or empty path is skipped silently.
        if userdict_path and os.path.exists(userdict_path):
            jieba.load_userdict(str(userdict_path))
        self.reset(stopwords_path) 
Example #30
Source File: similarity.py    From sentence-similarity with MIT License 5 votes vote down vote up
def __init__(self):
        """Load the vocabulary, the IDF data and the jieba user dictionary.

        The elapsed time of the vocabulary and IDF loading steps is printed.
        """
        t1 = time.time()
        self.voc=load_voc(file_voc)
        print("Loading  word2vec vector cost %.3f seconds...\n" % (time.time() - t1))
        # Restart the timer for the IDF loading step.
        t1 = time.time()
        self.idf=load_idf(file_idf)
        print("Loading  idf data cost %.3f seconds...\n" % (time.time() - t1))
        jieba.load_userdict(file_userdict)