Python jieba.analyse Examples

The following are 13 code examples of the jieba.analyse module, drawn from open-source projects. The source file, project, and license are noted above each example.
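For orientation, here is a minimal, self-contained sketch (not taken from any of the projects below) of the two keyword-extraction entry points that recur throughout these examples: TF-IDF via jieba.analyse.extract_tags and TextRank via jieba.analyse.textrank. The sample sentence is purely illustrative.

import jieba.analyse

text = "自然语言处理是人工智能领域的一个重要方向。"

# TF-IDF keyword extraction; withWeight=True returns (word, weight) pairs.
print(jieba.analyse.extract_tags(text, topK=5, withWeight=True))

# TextRank keyword extraction; defaults to allowPOS=('ns', 'n', 'vn', 'v').
print(jieba.analyse.textrank(text, topK=5, withWeight=True))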
Example #1
Source File: semantic.py    From chat with MIT License
def get_tag(sentence, config):
    """Get semantic tag of sentence. 获取句子语义标签。
    """
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # list of (word, POS tag) tuples
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag 
Example #2
Source File: hot_words_generator.py    From LagouJob with Apache License 2.0
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
        """
        calculate and show hot words of Job Impression
        :param interviewee_comments_dir:
        :return:
        """
        if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
            print('Error! No valid content in {0}'.format(interviewee_comments_dir))
            sys.exit(0)
        else:
            job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}

            for k, v in job_and_dir.items():
                text = self.concat_all_text(v)
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                # Build a circular mask so the word cloud is drawn inside a disc.
                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show() 
Example #3
Source File: hot_words_generator.py    From LagouJob with Apache License 2.0
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        calculate and show hot words of Job Description (JD)
        :param jd_dir:
        :return:
        """
        if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
            print('Error! No valid content in {0}'.format(jd_dir))
            sys.exit(0)
        else:
            jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

            for k, v in jd_and_dir.items():
                text = "".join(pd.read_excel(v)['详情描述'])
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show() 
Example #4
Source File: tokenizer.py    From dialogbot with Apache License 2.0
def get_keywords(cls, text, size=3):
        return jieba.analyse.textrank(text, topK=size) 
Example #5
Source File: console.py    From Chatbot with GNU General Public License v3.0
def jieba_textrank(self):
        """
        Use TextRank in jieba to extract keywords from a sentence.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.textrank(speech, withWeight=True, topK=20)
Example #6
Source File: console.py    From Chatbot with GNU General Public License v3.0
def jieba_tf_idf(self):
        """
        Use TF-IDF in jieba to extract keywords from a sentence.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.extract_tags(speech, topK=20, withWeight=True)
Example #7
Source File: engineering.py    From Multi-Label-Text-Classification-for-Chinese with MIT License
def tfidf(self) -> list:
        kw_with_weight = jieba.analyse.extract_tags(
            self.text, allowPOS=ALLOW_POS, withWeight=True)
        return self.standardize(kw_with_weight) 
Example #8
Source File: engineering.py    From Multi-Label-Text-Classification-for-Chinese with MIT License
def textrank(self) -> list:
        kw_with_weight = jieba.analyse.textrank(
            self.text, allowPOS=ALLOW_POS, withWeight=True)
        return self.standardize(kw_with_weight) 
Example #9
Source File: tf_idf.py    From SMPCUP2017 with MIT License
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    # NOTE: `filter` here is presumably a project-local helper rather than the
    # builtin (the builtin filter() would need a predicate as its first argument).
    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus 
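Note: jieba.enable_parallel(8) switches on jieba's multiprocessing-based parallel segmentation with 8 worker processes; according to the jieba documentation, parallel mode does not work on Windows.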
Example #10
Source File: text_segment.py    From JiaYuan with Apache License 2.0
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1])) 
Example #11
Source File: distance_text_or_vec.py    From nlp_xiaojiang with MIT License
def sim_hash(content):
    seg = jieba.cut(content)
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    # Results are sorted by weight first, then by word.
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    content_list = np.sum(np.array(keyList), axis=0)
    # Nothing could be extracted (e.g. the content could not be decoded); return a placeholder.
    if len(keyList) == 0:
        return '00'
    simhash = ''
    for c in content_list:
        if c > 0:
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash 
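As a follow-up (not part of the original file), a hypothetical usage sketch: two texts can be compared via the Hamming distance of their simhash strings. It assumes sim_hash and its helper string_hash are defined as above; the sample sentences are illustrative only.

h1 = sim_hash("机器学习工程师，负责推荐系统开发")
h2 = sim_hash("算法工程师，负责推荐系统研发")
# Count differing bits; a smaller distance suggests more similar texts.
hamming_distance = sum(a != b for a, b in zip(h1, h2))
print(hamming_distance)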
Example #12
Source File: tfidf_top.py    From Malicious_Domain_Whois with GNU General Public License v3.0
def get_top_words(top, filename):
    topK = top
    content = open(filename, 'rb').read()
    tags = jieba.analyse.extract_tags(content, topK=topK)
    return tags
Example #13
Source File: semantic.py    From chat with MIT License
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence is not found in the synonym dictionary,
    it is marked with the default POS tag of the word segmentation tool.

    Args:
        pattern: 'w' - word segmentation, 'k' - single top keyword,
            't' - keyword list, 'wf' - segmentation with POS tags,
            'tf' - keywords with POS tags.
    """
    # Strip sentence-final punctuation.
    sentence = sentence.rstrip(''.join(punctuation_all))
    # Strip sentence-final modal (tone) words.
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        for item in result:
            if item.word not in filter_characters:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector