Python jieba.analyse Examples

The following are 13 code examples of the jieba.analyse module, drawn from open-source projects. The source file, project, and license are noted above each example.
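For orientation, here is a minimal, self-contained sketch (not taken from any of the projects below) of the two keyword-extraction entry points that recur throughout these examples: TF-IDF via jieba.analyse.extract_tags and TextRank via jieba.analyse.textrank. The sample sentence is purely illustrative.

import jieba.analyse

text = "自然语言处理是人工智能领域的一个重要方向。"

# TF-IDF keyword extraction; withWeight=True returns (word, weight) pairs.
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))

# TextRank keyword extraction; defaults to allowPOS=('ns', 'n', 'vn', 'v').
print(jieba.analyse.textrank(text, topK=5, withWeight=True))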
Example #1
Source File: semantic.py    From chat with MIT License
def get_tag(sentence, config):
    """Get semantic tag of sentence. 获取句子语义标签。
    """
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # list of (word, POS tag) tuples
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag 
Example #2
Source File: hot_words_generator.py    From LagouJob with Apache License 2.0
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
        """
        calculate and show hot words of Job Impression
        :param interviewee_comments_dir:
        :return:
        """
        if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
            print('Error! No valid content in {0}'.format(interviewee_comments_dir))
            sys.exit(0)
        else:
            job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}

            for k, v in job_and_dir.items():
                text = self.concat_all_text(v)
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                # Build a circular mask so the word cloud is drawn inside a disc.
                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show() 
Example #3
Source File: hot_words_generator.py    From LagouJob with Apache License 2.0
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        calculate and show hot words of Job Description (JD)
        :param jd_dir:
        :return:
        """
        if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
            print('Error! No valid content in {0}'.format(jd_dir))
            sys.exit(0)
        else:
            jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

            for k, v in jd_and_dir.items():
                text = "".join(pd.read_excel(v)['详情描述'])
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show() 
Example #4
Source File: tokenizer.py    From dialogbot with Apache License 2.0
def get_keywords(cls, text, size=3):
        return jieba.analyse.textrank(text, topK=size) 
Example #5
Source File: console.py    From Chatbot with GNU General Public License v3.0
def jieba_textrank(self):
        """
        Use TextRank in jieba to extract keywords from a sentence.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.textrank(speech, withWeight=True, topK=20)
Example #6
Source File: console.py    From Chatbot with GNU General Public License v3.0
def jieba_tf_idf(self):
        """
        Use TF-IDF in jieba to extract keywords from a sentence.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.extract_tags(speech, topK=20, withWeight=True)
Example #7
Source File: engineering.py    From Multi-Label-Text-Classification-for-Chinese with MIT License
def tfidf(self) -> list:
        kw_with_weight = jieba.analyse.extract_tags(
            self.text, allowPOS=ALLOW_POS, withWeight=True)
        return self.standardize(kw_with_weight) 
Example #8
Source File: engineering.py    From Multi-Label-Text-Classification-for-Chinese with MIT License
def textrank(self) -> list:
        kw_with_weight = jieba.analyse.textrank(
            self.text, allowPOS=ALLOW_POS, withWeight=True)
        return self.standardize(kw_with_weight) 
Example #9
Source File: tf_idf.py    From SMPCUP2017 with MIT License
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    # NOTE: `filter` here is presumably a project-local helper rather than the
    # builtin (the builtin filter() would need a predicate as its first argument).
    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus 
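Note: jieba.enable_parallel(8) switches on jieba's multiprocessing-based parallel segmentation with 8 worker processes; according to the jieba documentation, parallel mode does not work on Windows.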
Example #10
Source File: text_segment.py    From JiaYuan with Apache License 2.0
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1])) 
Example #11
Source File: distance_text_or_vec.py    From nlp_xiaojiang with MIT License
def sim_hash(content):
    seg = jieba.cut(content)
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    # Results are sorted by weight first, then by word.
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    content_list = np.sum(np.array(keyList), axis=0)
    # Nothing could be extracted (e.g. the content could not be decoded); return a placeholder.
    if len(keyList) == 0:
        return '00'
    simhash = ''
    for c in content_list:
        if c > 0:
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash 
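As a follow-up (not part of the original file), a hypothetical usage sketch: two texts can be compared via the Hamming distance of their simhash strings. It assumes sim_hash and its helper string_hash are defined as above; the sample sentences are illustrative only.

h1 = sim_hash("机器学习工程师，负责推荐系统开发")
h2 = sim_hash("算法工程师，负责推荐系统研发")
# Count differing bits; a smaller distance suggests more similar texts.
hamming_distance = sum(a != b for a, b in zip(h1, h2))
print(hamming_distance)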
Example #12
Source File: tfidf_top.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def get_top_words(top, filename):
    topK = top
    content = open(filename, 'rb').read()
    tags = jieba.analyse.extract_tags(content, topK=topK)
    return tags
Example #13
Source File: semantic.py    From chat with MIT License
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence is not found in the synonym dictionary,
    it is marked with the default POS tag of the word segmentation tool.

    Args:
        pattern: 'w' - word segmentation, 'k' - single top keyword,
            't' - keyword list, 'wf' - segmentation with POS tags,
            'tf' - keywords with POS tags.
    """
    # Strip sentence-final punctuation.
    sentence = sentence.rstrip(''.join(punctuation_all))
    # Strip sentence-final modal (tone) words.
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        for item in result:
            if item.word not in filter_characters:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector