Python jieba.analyse() Examples
The following are 13 code examples of jieba.analyse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module jieba, or try the search function.
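As a quick orientation before the project examples, here is a minimal, self-contained sketch of the two keyword-extraction entry points in jieba.analyse: TF-IDF via extract_tags and TextRank via textrank. The sample sentence and topK values are illustrative only.

# Minimal sketch: TF-IDF and TextRank keyword extraction with jieba.analyse.
# The sample sentence and topK values are illustrative only.
import jieba.analyse

text = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"

# TF-IDF based keywords (top 5, with weights).
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, weight)

# TextRank based keywords; allowPOS restricts the part-of-speech tags considered.
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True,
                                           allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, weight)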
Example #1
Source File: semantic.py From chat with MIT License | 6 votes |
def get_tag(sentence, config):
    """Get the semantic tag of a sentence."""
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # list of (word, flag) tuples
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag
Example #2
Source File: hot_words_generator.py From LagouJob with Apache License 2.0 | 5 votes |
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
    """Calculate and show hot words of Job Impression.

    :param interviewee_comments_dir:
    :return:
    """
    if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
        print('Error! No valid content in {0}'.format(interviewee_comments_dir))
        sys.exit(0)
    else:
        job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}

        for k, v in job_and_dir.items():
            text = self.concat_all_text(v)
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
            print(frequencies)

            # Circular mask for the word cloud.
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300,
                                  background_color="white", repeat=False, mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Example #3
Source File: hot_words_generator.py From LagouJob with Apache License 2.0 | 5 votes |
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
    """Calculate and show hot words of Job Description (JD).

    :param jd_dir:
    :return:
    """
    if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
        print('Error! No valid content in {0}'.format(jd_dir))
        sys.exit(0)
    else:
        jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

        for k, v in jd_and_dir.items():
            # '详情描述' is the detail-description column of the spider's Excel output.
            text = "".join(pd.read_excel(v)['详情描述'])
            jieba.analyse.set_stop_words(STOPWORDS_PATH)
            jieba.load_userdict(USER_CORPUS)
            hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
            frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
            print(frequencies)

            # Circular mask for the word cloud.
            x, y = np.ogrid[:300, :300]
            mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
            mask = 255 * mask.astype(int)

            wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300,
                                  background_color="white", repeat=False, mask=mask)
            wordcloud.generate_from_frequencies(frequencies)

            import matplotlib.pyplot as plt
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.show()
Example #4
Source File: tokenizer.py From dialogbot with Apache License 2.0 | 5 votes |
def get_keywords(cls, text, size=3):
    return jieba.analyse.textrank(text, topK=size)
Example #5
Source File: console.py From Chatbot with GNU General Public License v3.0 | 5 votes |
def jieba_textrank(self):
    """Use textrank in jieba to extract keywords in a sentence."""
    speech = input('Input a sentence: ')
    return jieba.analyse.textrank(speech, withWeight=True, topK=20)
Example #6
Source File: console.py From Chatbot with GNU General Public License v3.0 | 5 votes |
def jieba_tf_idf(self):
    """Use tf/idf in jieba to extract keywords in a sentence."""
    speech = input('Input a sentence: ')
    return jieba.analyse.extract_tags(speech, topK=20, withWeight=True)
Example #7
Source File: engineering.py From Multi-Label-Text-Classification-for-Chinese with MIT License | 5 votes |
def tfidf(self) -> list:
    kw_with_weight = jieba.analyse.extract_tags(
        self.text, allowPOS=ALLOW_POS, withWeight=True)
    return self.standardize(kw_with_weight)
Example #8
Source File: engineering.py From Multi-Label-Text-Classification-for-Chinese with MIT License | 5 votes |
def textrank(self) -> list:
    kw_with_weight = jieba.analyse.textrank(
        self.text, allowPOS=ALLOW_POS, withWeight=True)
    return self.standardize(kw_with_weight)
Example #9
Source File: tf_idf.py From SMPCUP2017 with MIT License | 5 votes |
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    # Parallel segmentation; not supported on Windows.
    jieba.enable_parallel(8)
    # `filter` here presumably refers to a helper defined elsewhere in the
    # project; the built-in filter() would also require a predicate argument.
    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus
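For reference, set_stop_words expects a plain-text file with one stop word per line, and set_idf_path expects one "term idf_weight" pair per line separated by a space. The sketch below only illustrates those layouts; the file names and IDF values are made up.

# Hedged sketch: illustrative file names and IDF values, showing the layouts
# jieba expects for a custom IDF file and a stop-word file.
import jieba.analyse

# Custom IDF file: one "term idf_weight" pair per line.
with open("my_idf.txt", "w", encoding="utf-8") as f:
    f.write("机器学习 11.5\n")
    f.write("深度学习 12.1\n")

# Stop-word file: one word per line.
with open("my_stopwords.txt", "w", encoding="utf-8") as f:
    f.write("的\n了\n")

jieba.analyse.set_idf_path("my_idf.txt")
jieba.analyse.set_stop_words("my_stopwords.txt")
print(jieba.analyse.extract_tags("机器学习和深度学习是人工智能的核心", topK=5))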
Example #10
Source File: text_segment.py From JiaYuan with Apache License 2.0 | 5 votes |
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))
Example #11
Source File: distance_text_or_vec.py From nlp_xiaojiang with MIT License | 5 votes |
def sim_hash(content):
    seg = jieba.cut(content)
    # extract_tags sorts by weight first, then by term
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    content_list = np.sum(np.array(keyList), axis=0)
    # Nothing could be extracted (e.g. the content could not be decoded)
    if len(keyList) == 0:
        return '00'
    simhash = ''
    for c in content_list:
        if c > 0:
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash
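The example above depends on a string_hash helper defined elsewhere in nlp_xiaojiang; the loop over feature expects it to return a fixed-width string of '0'/'1' characters. Under that assumption, a minimal sketch of such a helper might look like the following (illustrative only, not the project's actual implementation).

# Illustrative sketch only: a 64-bit string hash that returns a binary string,
# matching what sim_hash() above expects from string_hash(). This is an
# assumption about the helper, not the nlp_xiaojiang implementation.
def string_hash(source):
    if source == "":
        return '0' * 64
    x = ord(source[0]) << 7
    m = 1000003
    mask = 2 ** 128 - 1
    for c in source:
        x = ((x * m) ^ ord(c)) & mask
    x ^= len(source)
    # Keep the lowest 64 bits and render them as a '0'/'1' string.
    return bin(x).replace('0b', '').zfill(64)[-64:]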
Example #12
Source File: tfidf_top.py From Malicious_Domain_Whois with GNU General Public License v3.0 | 5 votes |
def get_top_words(top, filename):
    topK = top
    content = open(filename, 'rb').read()
    tags = jieba.analyse.extract_tags(content, topK=topK)
    # items = str(tags).replace('u\'', '\'').decode("unicode-escape")
    return tags
Example #13
Source File: semantic.py From chat with MIT License | 5 votes |
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence was not found in the synonym dictionary,
    it is marked with the default flag of the word segmentation tool.

    Args:
        pattern: 'w'-word segmentation, 'k'-single keyword, 't'-keyword list,
            'wf'-(word, flag) pairs, 'tf'-(keyword, flag) pairs.
    """
    # Strip trailing punctuation
    sentence = sentence.rstrip(''.join(punctuation_all))
    # Strip trailing tone (modal) words
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result
        #                   if item.word not in filter_characters]
        # Modified 2017.4.27
        for item in result:
            if item.word not in filter_characters:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector