Python Examples of jieba.load_userdict

Source File: text_processing.py From Listed-company-news-crawl-and-text-analysis with MIT License

6 votes

def jieba_tokenize(self, documents):
        '''Segment every document into a list of words with jieba.

        # Arguments:
            documents: List of news(articles).

        # Returns:
            One token list per document. Tokens found in the collection
            returned by self.getchnSTW(), as well as tab and single-space
            tokens, are left out.
        '''
        stop_words = self.getchnSTW()
        # The user dictionary at self.finance_dict is loaded before segmenting.
        jieba.load_userdict(self.finance_dict)
        return [
            [
                token
                for token in jieba.cut(text)
                if token not in stop_words and token != '\t' and token != ' '
            ]
            for text in documents
        ]

Source File: text_analyzer.py From public-opinion-analysis with MIT License

6 votes

def do_text_analyze(text):
    """Run the analysis pipeline on ``text`` and return the resulting Article.

    The dictionary cache and the jieba terminology dictionary are loaded, the
    text is split into sentences that are attached to the article, the
    article's segmentation/brief/score steps are run in order, the total
    score is printed, and the article's cache is cleaned up.
    """
    load_dictionary_to_cache()
    jieba.load_userdict("./resources/dict_terminology.txt")

    doc = domain.article.Article(text)
    for piece in doc.split_into_sentences(text):
        doc.sentences.append(domain.sentence.Sentence(doc.article_id, piece))

    # The steps below are invoked in the same order as before.
    doc.cache_raw_seg()
    doc.generate_sentence_brief()
    doc.generate_sentence_score()
    doc.generate_article_score()
    print("Article total score:" + str(doc.total_score))
    doc.clean_up_cache()
    return doc

Source File: pre_process.py From nlp-journey with Apache License 2.0

6 votes

def process_data(train_file, user_dict=None, stop_dict=None):
    """Segment every line of ``train_file`` with jieba and drop stop words.

    Args:
        train_file: Path of a UTF-8 text file; each line is segmented
            separately.
        user_dict: Optional path of a custom dictionary handed to
            ``jieba.load_userdict`` (it must follow jieba's user-dictionary
            format).
        stop_dict: Optional path of a UTF-8 stop-word file with one stop word
            per line.

    Returns:
        A list with one token list per input line. Stop words and tokens that
        are empty after stripping whitespace are removed.
    """
    # Load the custom dictionary into jieba, if one was given.
    if user_dict:
        jieba.load_userdict(user_dict)

    # Stop words are kept in a set so each membership test below is O(1)
    # instead of a linear scan over a list.
    stop_words = set()
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = {stop_word.strip() for stop_word in file}

    # Read the corpus line by line and segment each line.
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = [jieba.lcut(sentence.strip()) for sentence in file]

    # Remove stop words and whitespace-only tokens.
    return [
        [s for s in sentence if s not in stop_words and s.strip() != '']
        for sentence in sentences
    ]

Source File: bilstm_crf_entity_extractor.py From rasa_nlu_gq with Apache License 2.0

6 votes

def __init__(self,
                 component_config=None,
                 ent_tagger=None,
                 session=None,
                 char_to_id=None,
                 id_to_tag=None):
        """Store the tagger state and set up jieba as the segmenter.

        When the component configuration contains a truthy
        ``dictionary_path`` entry, that file is loaded into jieba as a user
        dictionary.
        """
        super(BilstmCRFEntityExtractor, self).__init__(component_config)

        # ent_tagger refers to the trained model.
        self.ent_tagger, self.session = ent_tagger, session
        self.char_to_id, self.id_to_tag = char_to_id, id_to_tag

        user_dict_path = self.component_config.get('dictionary_path')
        if user_dict_path:
            jieba.load_userdict(user_dict_path)

        # The jieba module itself is kept as the segmenter handle.
        self.seg = jieba

Source File: seg_words.py From tf-text-classification with Apache License 2.0

6 votes

def _read_stripped_lines(file_path):
    """Return every line of the UTF-8 file ``file_path`` with whitespace stripped."""
    # The context manager closes the handle even if reading fails.
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]


def load_dict(path):
    """
    Load dictionary

    ``path`` is used as a plain string prefix (the file names are appended
    with ``+``), so it must end with a path separator.

    Loads ``default.dic`` into jieba as a user dictionary and reads three
    more files:

    * ``stop.dic``    -> dict whose keys are the stripped lines (values None)
    * ``single.dic``  -> dict whose keys are the stripped lines (values None)
    * ``synonym.dic`` -> dict mapping the text before the first space of each
      line to the remainder of that line

    Returns:
        The tuple ``(stop_dic, single_dic, synonym_dic)``.

    Raises:
        ValueError: if a non-blank synonym line contains no space.
    """
    jieba.load_userdict(path + 'default.dic')

    stop_dic = dict.fromkeys(_read_stripped_lines(path + 'stop.dic'))
    single_dic = dict.fromkeys(_read_stripped_lines(path + 'single.dic'))

    # Blank lines (e.g. a trailing newline at the end of the file) cannot be
    # split into a key/value pair, so they are skipped instead of crashing.
    synonym_dic = dict(
        line.split(" ", 1)
        for line in _read_stripped_lines(path + 'synonym.dic')
        if line
    )

    return stop_dic, single_dic, synonym_dic

Source File: data_preprocess2.py From Customer-Chatbot with MIT License

5 votes

def cut(sentence, stopwords, stopword=True, cut_all=False):
    """Segment ``sentence`` with jieba and return the tokens as a list.

    When ``stopword`` is truthy, tokens contained in ``stopwords`` are
    dropped; otherwise every token is kept. ``cut_all`` is forwarded to
    ``jieba.cut``.
    """
    # Load the external user dictionary.
    jieba.load_userdict('./userdict/userdict.txt')
    tokens = jieba.cut(sentence, cut_all)
    if not stopword:
        return list(tokens)
    return [token for token in tokens if token not in stopwords]

Source File: train_word2vec.py From text-classification with Apache License 2.0

5 votes

def segment():
    """Segment the raw corpus with jieba and write it out space-separated.

    Every line of ``config['input_raw']`` is segmented; tokens that are not
    in the stop-word set are written to ``config['input_seg']``, each followed
    by a single space, with one output line per input line. Progress is
    logged every 10000 lines.
    """
    # jieba custom setting.
    DATA_DIR = os.getcwd() + '/data/user_dict'
    user_dict_names = (
        'Company.txt',
        'Concept.txt',
        'Consumer.txt',
        'Holder.txt',
        'HoldingCompany.txt',
        'MainComposition.txt',
        'Manager.txt',
        'Material.txt',
        'OtherCompetitor.txt',
        'Supplier.txt',
        'Finance.txt',
    )
    for dict_name in user_dict_names:
        jieba.load_userdict(os.path.join(DATA_DIR, dict_name))

    # load stopwords set
    with open(os.getcwd()+'/data/user_dict/stopWord.txt', 'r', encoding='utf-8') as stopwords:
        stopword_set = {stopword.strip('\n') for stopword in stopwords}

    # Both files are handled by the with statement so the output file is
    # closed (and flushed) even when segmentation fails part-way through.
    # The output is still opened first, as before.
    with open(config['input_seg'], 'w', encoding='utf-8') as output, \
            open(config['input_raw'], 'r', encoding='utf-8') as content:
        for texts_num, line in enumerate(content):
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            kept = ''.join(word + ' ' for word in words if word not in stopword_set)
            output.write(kept + '\n')

            if (texts_num + 1) % 10000 == 0:
                # Lazy %-style arguments: formatted only if the record is emitted.
                logging.info("Segmented %d th articles", texts_num + 1)

Source File: dbscan_analysis.py From ns4_chatbot with Apache License 2.0

5 votes

def init(self):
		"""Configure logging and prepare jieba for use.

		Logging is set to DEBUG with a timestamped format, while the
		"gensim" and "jieba" loggers are limited to WARNING. jieba is then
		initialized, data/addwords.txt is loaded as user dictionary and
		data/stopwords.txt is registered as the stop-word list for
		jieba.analyse.
		"""
		# Pass filename='log.txt' to basicConfig as well to log into a file.
		log_format = '%(asctime)s : %(levelname)s : %(message)s'
		logger.basicConfig(format=log_format, level=logger.DEBUG)
		for noisy_name in ("gensim", "jieba"):
			logger.getLogger(noisy_name).setLevel(logger.WARNING)

		jieba.initialize()
		jieba.load_userdict("data/addwords.txt")
		jieba.analyse.set_stop_words('data/stopwords.txt')

Source File: segmentor.py From TextCluster with BSD 3-Clause "New" or "Revised" License

5 votes

def __init__(self, args):
        """Choose the tokenizer backend according to ``args.lang``.

        For ``args.lang == 'cn'`` jieba is used, optionally extended with the
        user dictionary at ``args.dict``; ``self.cut`` is then ``jieba.cut``.
        For any other value a spaCy English tokenizer is created and
        ``self.cut`` is bound to ``self.cut_en``.

        Exits the process with status 1 (``SystemExit``) when ``args.dict``
        is set but the file does not exist.
        """
        if args.lang == 'cn':
            import jieba
            if args.dict:
                if not os.path.exists(args.dict):
                    import sys
                    print('Segmentor dictionary not found.')
                    # sys.exit() rather than the bare exit() helper: the
                    # latter is injected by the site module for interactive
                    # use and is not guaranteed to exist in every runtime.
                    sys.exit(1)
                jieba.load_userdict(args.dict)
            self.cut = jieba.cut
        else:  # en
            from spacy.tokenizer import Tokenizer
            from spacy.lang.en import English
            nlp = English()
            self.tokenizer = Tokenizer(nlp.vocab)
            self.cut = self.cut_en

Source File: crawl.py From MillionHeroAssistant with MIT License

5 votes

def jieba_initialize():
    """Prepare jieba: parallel mode, user dictionary, then initialization.

    Parallel mode (one worker per CPU reported by multiprocessing) is only
    enabled when the platform name does not start with "WINDOWS".
    """
    running_on_windows = platform.system().upper().startswith("WINDOWS")
    if not running_on_windows:
        worker_count = multiprocessing.cpu_count()
        jieba.enable_parallel(worker_count)
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()

Source File: jiebaSegment.py From Customer-Chatbot with MIT License

5 votes

def load_userdict(self, file_name):
        """Load the user dictionary ``file_name`` into jieba.

        Thin wrapper that forwards ``file_name`` unchanged to
        ``jieba.load_userdict``; nothing is returned.
        """
        jieba.load_userdict(file_name)

Source File: text_segment.py From JiaYuan with Apache License 2.0

5 votes

def segment_text(text):
    """Print the top 20 keywords of ``text`` together with their weights.

    The user dictionary ``user_dict`` and the stop-word file ``stop_words``
    are applied first; each keyword is printed as "<keyword>\\t<weight>".
    """
    # Apply the user dictionary and the stop-word list.
    jieba.load_userdict(user_dict)
    jieba.analyse.set_stop_words(stop_words)

    keywords = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for keyword, weight in keywords:
        print(str(keyword) + "\t" + str(weight))

Source File: jiebaSegment.py From Customer-Chatbot with MIT License

5 votes

def load_userdict(self, file_name):
        """Load the user dictionary ``file_name`` into jieba.

        Thin wrapper that forwards ``file_name`` unchanged to
        ``jieba.load_userdict``; nothing is returned.
        """
        jieba.load_userdict(file_name)

Source File: data_preprocess2.py From Customer-Chatbot with MIT License

5 votes

def cut(sentence, stopwords, stopword=True, cut_all=False):
    """Return the jieba tokens of ``sentence`` as a list.

    Tokens contained in ``stopwords`` are skipped when ``stopword`` is
    truthy. ``cut_all`` is passed through to ``jieba.cut``.
    """
    # Load the external user dictionary.
    jieba.load_userdict('./userdict/userdict.txt')
    return [
        piece
        for piece in jieba.cut(sentence, cut_all)
        if not (stopword and piece in stopwords)
    ]

Source File: preprocessing.py From seq2seq with Apache License 2.0

5 votes

def __init__(self):
        """Set the default data file locations and load the extra jieba vocabulary.

        ``encoderFile`` and ``decoderFile`` point at ./data/question.txt and
        ./data/answer.txt, ``savePath`` at ./data/. The file
        ./data/supplementvocab.txt is loaded into jieba as a user dictionary.
        """
        # Earlier absolute-path settings, kept for reference:
        #self.encoderFile = "/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_ask.txt"
        #self.decoderFile = '/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_answer.txt'
        #self.savePath = '/home/yanwii/Python/NLP/seq2seq/seq2seq_pytorch/data/'
        self.encoderFile = "./data/question.txt"
        self.decoderFile = "./data/answer.txt"
        self.savePath = './data/'
        
        jieba.load_userdict("./data/supplementvocab.txt")

Source File: jieba_tokenizer.py From rasa-for-botfront with Apache License 2.0

5 votes

def load_custom_dictionary(path: Text) -> None:
        """Load every entry found directly under ``path`` as a jieba user dictionary.

        Each match of the glob pattern ``<path>/*`` is logged and then passed
        to ``jieba.load_userdict``. The dictionary file format is described
        in the jieba documentation:
        https://github.com/fxsjy/jieba#load-dictionary
        """
        import jieba

        for dict_file in glob.glob(f"{path}/*"):
            logger.info(f"Loading Jieba User Dictionary at {dict_file}")
            jieba.load_userdict(dict_file)

Source File: preprocess.py From deep-siamese-text-similarity with MIT License

5 votes

def tokenizer_word(iterator):
    """Yield the jieba token list of every sentence in ``iterator``.

    Each sentence is decoded from UTF-8, stripped of the whitespace and
    punctuation characters matched by the regular expression below, and then
    segmented with ``jieba.lcut``.

    NOTE(review): calling ``.decode`` on the sentences and on string literals
    suggests this targets Python 2 byte strings - confirm before running it
    under Python 3.
    """
    # Because this is a generator, the user dictionary is loaded when the
    # first item is requested.
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        sentence = sentence.decode("utf8")
        # Remove runs of ASCII whitespace/punctuation and of full-width
        # (Chinese) punctuation by replacing them with the empty string.
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。：？?、~@#￥%……&*（）]+".decode("utf8"), "".decode("utf8"),
                          sentence)
        yield list(jieba.lcut(sentence))

Source File: preprocess.py From deep-siamese-text-similarity with MIT License

5 votes

def tokenizer_word(iterator):
    """Yield the jieba token list of every sentence in ``iterator``.

    This is a generator, so ./dict.txt is loaded as user dictionary when the
    first item is requested.
    """
    jieba.load_userdict('./dict.txt')
    for text in iterator:
        tokens = jieba.lcut(text)
        yield list(tokens)

Source File: jiebaSegment.py From QAmodel-for-Retrievalchatbot with MIT License

5 votes

def load_userdict(self,file_name):
        """Load the user dictionary ``file_name`` into jieba.

        Thin wrapper that forwards ``file_name`` unchanged to
        ``jieba.load_userdict``; nothing is returned.
        """
        jieba.load_userdict(file_name)

Source File: jieba_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0

5 votes

def set_user_dicts(tokenizer, path_user_dicts):
        """Load each path in ``path_user_dicts`` into ``tokenizer``.

        A message is printed for every dictionary that is loaded, or a single
        notice when the collection is empty. The same ``tokenizer`` object is
        returned in both cases.
        """
        if len(path_user_dicts) == 0:
            print("No Jieba User Dictionary found")
            return tokenizer

        for dict_path in path_user_dicts:
            print("Loading Jieba User Dictionary at " + str(dict_path))
            tokenizer.load_userdict(dict_path)
        return tokenizer

Source File: segment.py From nlp_toolkit with MIT License

5 votes

def __init__(self, user_dict='', model_name='word-rnn', mode='accurate', verbose=0):
        """Configure jieba and the tagger model for this segmenter.

        Args:
            user_dict: Optional path of an extra jieba user dictionary.
            model_name: Name used to build the model file paths. When it is
                the empty string, the first line of data/best_model.txt is
                used if it can be read; otherwise the empty name is kept.
            mode: Either 'accurate' or 'fast'. In 'fast' mode the
                chunk_pos.txt dictionary is additionally loaded into jieba,
                once per process (tracked by the module-level ``load_dict``
                flag).
            verbose: When truthy, a message is printed before chunk_pos.txt
                is loaded.

        Exits the process (``SystemExit``) when ``mode`` is not supported.
        """
        global load_dict

        # Explicit validation: an assert would be stripped under ``python -O``
        # and the former bare ``except`` also swallowed unrelated errors.
        if mode not in ('accurate', 'fast'):
            print('Only support the following two modes: accurate, fast')
            sys.exit()
        self.pos = True
        self.mode = mode
        self.verbose = verbose
        self.path = os.path.abspath(os.path.dirname(__file__))
        if model_name != '':
            self.model_name = model_name
        else:
            try:
                self.model_name = read_line(Path(self.path) / 'data' / 'best_model.txt')[0]
            except Exception:
                # Best effort: fall back to the (empty) name that was passed in.
                self.model_name = model_name

        # jieba initialization
        base_dict = Path(self.path) / 'data' / 'dict' / 'jieba_base_supplyment.txt'
        jieba.load_userdict(str(base_dict))
        if mode == 'fast':
            if not load_dict:
                if self.verbose:
                    print('loading np dict to jieba cache')
                dict_path = Path(self.path) / 'data' / 'dict' / 'chunk_pos.txt'
                jieba.load_userdict(str(dict_path))
                load_dict = True
        if user_dict:
            jieba.load_userdict(user_dict)
        self.seg = pseg

        # model variables
        self.weight_file = os.path.join(self.path, 'data/model/%s_weights.h5' % self.model_name)
        self.param_file = os.path.join(self.path, 'data/model/%s_parameters.json' % self.model_name)
        self.preprocess_file = os.path.join(self.path, 'data/model/%s_transformer.h5' % self.model_name)
        self.define_tagger()

Source File: hot_words_generator.py From LagouJob with Apache License 2.0

5 votes

def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
        """
        calculate and show hot words of Job Impression

        For every entry of ``interviewee_comments_dir`` the text returned by
        ``self.concat_all_text`` is reduced to its top 30 weighted keywords,
        which are printed and displayed as a word cloud.

        :param interviewee_comments_dir: directory with one entry per job
        :return: None; the process exits when the directory is missing or empty
        """
        if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
            print('Error! No valid content in {0}'.format(interviewee_comments_dir))
            # NOTE(review): exit status 0 signals success although this is an
            # error path; kept unchanged because callers may rely on it.
            sys.exit(0)

        import matplotlib.pyplot as plt

        # jieba configuration does not depend on the job, so it is applied
        # once instead of once per loop iteration.
        jieba.analyse.set_stop_words(STOPWORDS_PATH)
        jieba.load_userdict(USER_CORPUS)

        # 300x300 mask: 255 outside the circle of radius 130 centred at
        # (150, 150), 0 inside. It is identical for every job.
        x, y = np.ogrid[:300, :300]
        mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
        mask = 255 * mask.astype(int)

        for entry in os.listdir(interviewee_comments_dir):
            text = self.concat_all_text(os.path.join(interviewee_comments_dir, entry))
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

            print(frequencies)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                  repeat=False,
                                  mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()

Source File: jieba_tokenizer.py From rasa_bot with Apache License 2.0

5 votes

def load_custom_dictionary(path):
        # type: (Text) -> None
        """Load every entry found directly under ``path`` as a jieba user dictionary.

        Each match of the glob pattern ``<path>/*`` is logged and then passed
        to ``jieba.load_userdict``. The dictionary file format is described
        in the jieba documentation:
        https://github.com/fxsjy/jieba#load-dictionary
        """
        import jieba

        pattern = "{}/*".format(path)
        for dict_file in glob.glob(pattern):
            logger.info("Loading Jieba User Dictionary at {}".format(dict_file))
            jieba.load_userdict(dict_file)

Source File: jieba_tokenizer.py From rasa_nlu with Apache License 2.0

5 votes

def load_custom_dictionary(path: Text) -> None:
        """Load every entry found directly under ``path`` as a jieba user dictionary.

        Each match of the glob pattern ``<path>/*`` is logged and then passed
        to ``jieba.load_userdict``. The dictionary file format is described
        in the jieba documentation:
        https://github.com/fxsjy/jieba#load-dictionary
        """
        import jieba

        pattern = "{}/*".format(path)
        for dict_file in glob.glob(pattern):
            logger.info("Loading Jieba User Dictionary at {}".format(dict_file))
            jieba.load_userdict(dict_file)

Source File: jieba_pseg_extractor.py From rasa_nlu_gq with Apache License 2.0

5 votes

def __init__(self, component_config=None):
        # type: (Optional[Dict[Text, Text]]) -> None
        """Initialise the component and load the optional jieba user dictionary.

        The dictionary is loaded only when the component configuration has a
        ``dictionary_path`` entry that is not None.
        """
        super(JiebaPsegExtractor, self).__init__(component_config)

        user_dict_path = self.component_config.get('dictionary_path')
        if user_dict_path is None:
            return
        jieba.load_userdict(user_dict_path)

Source File: cutWord.py From cn-text-classifier with GNU General Public License v3.0

5 votes

def addDictionary(self, dict_list):
        """
        Load a list of user-defined dictionaries into jieba.

        :param dict_list: iterable of dictionary file paths; each one is
            passed to ``jieba.load_userdict`` in order.
        """
        # An explicit loop is required: on Python 3 ``map`` returns a lazy
        # iterator, so the former ``map(lambda ...)`` call was never consumed
        # and no dictionary was actually loaded.
        for dict_path in dict_list:
            jieba.load_userdict(dict_path)

Source File: SentenceSegment.py From Distant-Supervised-Chinese-Relation-Extraction with MIT License

5 votes

def __init__(self, dict_file, stop_word_file, sentences_folder, process_num):
        """Store the settings, load the jieba dictionary and index the sentence files.

        ``self.stop_word`` holds whatever ``read_txt(stop_word_file)``
        returns; ``self.sentence_files`` lists the path of every file found
        by walking ``sentences_folder`` recursively.
        """
        self.process_num = process_num
        self.stop_word = read_txt(stop_word_file)
        jieba.load_userdict(dict_file)
        # sentence files
        self.sentence_files = [
            os.path.join(folder, file_name)
            for folder, _subdirs, file_names in os.walk(sentences_folder)
            for file_name in file_names
        ]

Source File: tf_idf.py From SMPCUP2017 with MIT License

5 votes

def tf_idf(texts):
    """Extract up to 15 keywords per text with ``jieba.analyse.extract_tags``.

    Args:
        texts: iterable of strings to analyse.

    Returns:
        A list with one entry per text: the result of passing the extracted
        keywords through ``filter``.
    """
    # The jieba configuration below is re-applied on every call.
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    # The argument 8 passed to enable_parallel is hard-coded.
    jieba.enable_parallel(8)

    # NOTE(review): the built-in filter() requires two arguments, so this
    # one-argument call only works if the module defines its own ``filter``
    # helper - confirm against the rest of the file.
    corpus = [filter(jieba.analyse.extract_tags(s, topK = 15)) for s in texts]
    return corpus

Source File: chinese.py From Multi-Label-Text-Classification-for-Chinese with MIT License

5 votes

def __init__(self, stopwords_path="", userdict_path="", *args, **kwargs):
        """Initialise the base class, load the optional user dictionary, then reset.

        The jieba user dictionary is loaded only when ``userdict_path`` is
        non-empty and exists on disk. ``self.reset`` is always called with
        ``stopwords_path``.
        """
        super().__init__(*args, **kwargs)
        userdict_available = userdict_path and os.path.exists(userdict_path)
        if userdict_available:
            jieba.load_userdict(str(userdict_path))
        self.reset(stopwords_path)

Source File: similarity.py From sentence-similarity with MIT License

5 votes

def __init__(self):
        """Load the vocabulary and idf data, printing how long each load took.

        ``self.voc`` and ``self.idf`` are filled from ``file_voc`` and
        ``file_idf``; afterwards ``file_userdict`` is loaded into jieba.
        """
        started = time.time()
        self.voc = load_voc(file_voc)
        elapsed = time.time() - started
        print("Loading  word2vec vector cost %.3f seconds...\n" % elapsed)

        started = time.time()
        self.idf = load_idf(file_idf)
        elapsed = time.time() - started
        print("Loading  idf data cost %.3f seconds...\n" % elapsed)

        jieba.load_userdict(file_userdict)

Python jieba.load_userdict() Examples