Python jieba.cut_for_search() Examples

The following are 24 code examples of jieba.cut_for_search(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module jieba, or try the search function.
Example #1
Source File: jieba_test.py    From annotated_jieba with MIT License 5 votes vote down vote up
def testCutForSearch_NOHMM(self):
        """Smoke-test search-mode segmentation with ``HMM=False``.

        For each entry of ``test_contents`` the call must hand back a
        generator; the generator is then drained into a list and the tokens
        are echoed to stderr, followed by the test name once all entries
        have been processed.
        """
        for sample in test_contents:
            segments = jieba.cut_for_search(sample, HMM=False)
            assert isinstance(segments, types.GeneratorType), "Test CutForSearch Generator error"
            tokens = list(segments)
            assert isinstance(tokens, list), "Test CutForSearch error on content: %s" % sample
            print(" , ".join(tokens), file=sys.stderr)
        print("testCutForSearch_NOHMM", file=sys.stderr)
Example #2
Source File: similar_doc.py    From Information_retrieva_Projectl- with MIT License 5 votes vote down vote up
def calculate(self,doc_id,Top_numbers=10,multiple=10):
        """Return the documents most similar to the document ``doc_id``.

        The document body is fetched from ``self.index``, segmented with
        ``jieba.cut_for_search`` and filtered; the surviving UTF-8 encoded
        tokens are handed to ``self.FastCos.calculate`` together with
        ``Top_numbers`` and ``multiple``. Whatever that call returns is
        returned unchanged.
        """
        _title, content, _url = self.index.get_data(doc_id)
        word_list = []
        for word in jieba.cut_for_search(content):
            # Skip punctuation and bare letters/digits.
            if word in self.punct or word in self.Letters_and_numbers:
                continue
            encoded = word.encode("utf-8")
            # Stop words must be removed before computing document
            # similarity, otherwise the computation is too slow.
            # ``in`` replaces ``has_key``, which no longer exists on
            # Python 3 dicts.
            if encoded not in self.stopword:
                word_list.append(encoded)
        return self.FastCos.calculate(word_list, Top_numbers, multiple)
Example #3
Source File: main.py    From Information_retrieva_Projectl- with MIT License 5 votes vote down vote up
def GET(self):
        """Render the page for a single news item with its recommendations.

        When the request carries input, ``data.id`` selects the document:
        its title, content and URL are decoded from UTF-8, its content is
        segmented with ``jieba.cut_for_search``, and the filtered tokens are
        passed to ``recommand.calculate`` to obtain recommended document
        ids. Without input a placeholder "No Such News" page is rendered.

        Returns the result of ``render.news(news)``.
        """
        data = web.input()
        news = dict()
        if data:
            # Convert once; the id is compared against every recommendation.
            doc_id = int(data.id)
            title, content, url = id_index.get_data(doc_id)
            news['content'] = content.decode("utf-8")
            news['title'] = title.decode("utf-8")
            news['url'] = url.decode("utf-8")

            # Online method: compute the recommendations per request.
            # (A precomputed table, ``recommand.dic``, was the offline
            # alternative noted in the original code.)
            word_list = []
            for word in jieba.cut_for_search(content):
                if word in punct or word in Letters_and_numbers:
                    continue
                encoded = word.encode("utf-8")
                # Stop words must be removed before computing document
                # similarity, otherwise the computation is too slow.
                if encoded not in recommand.stopword:
                    word_list.append(encoded)
            topk = recommand.calculate(word_list, config.recommand_numbers, 10)

            recomand = []
            for i in topk:
                # Never recommend the article that is being displayed.
                if i != doc_id:
                    rec_title, rec_content, rec_url = id_index.get_data(i)
                    recomand.append([rec_title.decode('utf-8'),
                                     rec_content.decode('utf-8'),
                                     rec_url.decode('utf-8')])
            news['recommand'] = recomand
        else:
            news['title'] = "No Such News"
            news['content'] = "Oh No!"
            news['url'] = "#"
            # One empty [title, content, url] triple per recommendation slot.
            news['recommand'] = [['', '', ''] for m in range(config.recommand_numbers)]
        return render.news(news)
Example #4
Source File: main.py    From Information_retrieva_Projectl- with MIT License 5 votes vote down vote up
def GET(self):
        """Render the search page for the submitted ``searchword``.

        The search word is segmented with ``jieba.cut_for_search``; the
        filtered, UTF-8 encoded tokens are passed to ``query.calculate`` and
        every returned document id is turned into a result dict with the
        keys ``id``, ``content`` (truncated snippet), ``title`` and ``url``.
        ``word2vec`` is then asked for related topics.

        Returns the result of ``render.index(searchword, news_list, topic)``.
        With no input or an empty search word both lists stay empty.
        """
        data = web.input()
        searchword = data.searchword if data else ''
        news_list = list()
        topic = list()
        if searchword:
            word_list = []
            for word in jieba.cut_for_search(searchword):
                if word not in punct and word not in Letters_and_numbers:
                    word_list.append(word.encode("utf-8"))

            topK = query.calculate(word_list, config.query_return_numbers)
            for k in topK:
                title, content, url = id_index.get_data(k)
                # Use a separate name so the request input is not rebound.
                item = dict()
                item['id'] = k
                item['content'] = content.decode("utf-8")[:config.query_return_snipper_size]
                item['title'] = title.decode("utf-8")
                item['url'] = url.decode("utf-8")
                news_list.append(item)
            # No explicit ``del`` of the loop variables here: they are
            # unbound when the query has no hits (or yields no tokens), and
            # deleting them then raises UnboundLocalError.

            # Recommend similar topics via word2vec.
            word2vec.cal(searchword.encode('utf-8'))
            # Debug output; the call form is valid on Python 2 and Python 3.
            print(word2vec.result.length)
            # NOTE(review): the original comment claimed the length is 1
            # when the word is unknown, yet the code tests for 0 - confirm
            # against the word2vec wrapper.
            if word2vec.result.length != 0:
                for i in range(config.recommand_topic_numbers):
                    topic.append(word2vec.result.word[i].char)
        return render.index(searchword, news_list, topic)
Example #5
Source File: test_multithread.py    From Malicious_Domain_Whois with GNU General Public License v3.0 5 votes vote down vote up
def run(self):
        """Print sample sentences segmented in each of jieba's modes.

        Demonstrates full mode, default (accurate) mode, the default call
        without ``cut_all``, and search-engine mode, printing one line per
        segmentation.
        """
        sentence = "我来到北京清华大学"

        # Full mode
        full_mode = jieba.cut(sentence, cut_all=True)
        print("Full Mode:" + "/ ".join(full_mode))

        # Default mode
        default_mode = jieba.cut(sentence, cut_all=False)
        print("Default Mode:" + "/ ".join(default_mode))

        plain = jieba.cut("他来到了网易杭研大厦")
        print(", ".join(plain))

        # Search-engine mode
        search_mode = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")
        print(", ".join(search_mode))
Example #6
Source File: test_cut_for_search.py    From Malicious_Domain_Whois with GNU General Public License v3.0 5 votes vote down vote up
def cuttest(test_sent):
    """Print the search-mode segmentation of ``test_sent`` on one line.

    Every token is followed by " / "; the line is terminated afterwards.
    """
    for token in jieba.cut_for_search(test_sent):
        print(token, "/", end=' ')
    print("")
Example #7
Source File: test_cut_for_search.py    From Malicious_Domain_Whois with GNU General Public License v3.0 5 votes vote down vote up
def cuttest(test_sent):
    """Segment ``test_sent`` in search-engine mode and echo the tokens.

    Tokens are written on a single line, each one trailed by a slash, and
    a newline is emitted once the segmentation is exhausted.
    """
    segments = jieba.cut_for_search(test_sent)
    for piece in segments:
        print(piece, "/", sep=" ", end=" ")
    print("")
Example #8
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0 5 votes vote down vote up
def testCutForSearch_NOHMM(self):
        """Check ``jieba.cut_for_search`` without the HMM on all samples.

        Each item of ``test_contents`` is segmented with ``HMM=False``; the
        result must be a generator, is converted to a list, and its tokens
        are written to stderr. The test name is written to stderr last.
        """
        for text in test_contents:
            generated = jieba.cut_for_search(text, HMM=False)
            assert isinstance(generated, types.GeneratorType), "Test CutForSearch Generator error"
            words = list(generated)
            assert isinstance(words, list), "Test CutForSearch error on content: %s" % text
            joined = " , ".join(words)
            print(joined, file=sys.stderr)
        print("testCutForSearch_NOHMM", file=sys.stderr)
Example #9
Source File: jieba_test.py    From Malicious_Domain_Whois with GNU General Public License v3.0 5 votes vote down vote up
def testCutForSearch(self):
        """Smoke-test search-mode segmentation with default settings.

        For each entry of ``test_contents`` the call must hand back a
        generator; the generator is drained into a list and the tokens are
        echoed to stderr, followed by the test name at the very end.
        """
        for sample in test_contents:
            segments = jieba.cut_for_search(sample)
            assert isinstance(segments, types.GeneratorType), "Test CutForSearch Generator error"
            tokens = list(segments)
            assert isinstance(tokens, list), "Test CutForSearch error on content: %s" % sample
            print(" , ".join(tokens), file=sys.stderr)
        print("testCutForSearch", file=sys.stderr)
Example #10
Source File: test_multithread.py    From annotated_jieba with MIT License 5 votes vote down vote up
def run(self):
        """Segment fixed sample sentences and print each result.

        Covers full mode, default mode, a call relying on the default
        arguments, and search-engine mode, in that order.
        """
        # Full mode
        print("Full Mode:" + "/ ".join(jieba.cut("我来到北京清华大学", cut_all=True)))

        # Default mode
        print("Default Mode:" + "/ ".join(jieba.cut("我来到北京清华大学", cut_all=False)))

        print(", ".join(jieba.cut("他来到了网易杭研大厦")))

        # Search-engine mode
        print(", ".join(jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")))
Example #11
Source File: test_cut_for_search.py    From annotated_jieba with MIT License 5 votes vote down vote up
def cuttest(test_sent):
    """Show how ``jieba.cut_for_search`` splits ``test_sent``.

    The tokens are printed on one line, each followed by " / ", and the
    line is closed with a newline afterwards.
    """
    for segment in jieba.cut_for_search(test_sent):
        print(segment, "/", end=" ")
    print("")
Example #12
Source File: test_cut_for_search.py    From annotated_jieba with MIT License 5 votes vote down vote up
def cuttest(test_sent):
    """Print every search-mode token of ``test_sent`` separated by slashes.

    Output stays on one line until all tokens are written; a final empty
    ``print`` terminates the line.
    """
    tokens = jieba.cut_for_search(test_sent)
    for tok in tokens:
        print(tok, "/", sep=" ", end=" ")
    print("")
Example #13
Source File: jieba_test.py    From jieba_fast with MIT License 5 votes vote down vote up
def testCutForSearch(self):
        """Check ``jieba.cut_for_search`` on every item of ``test_contents``.

        The result must be a generator; it is converted to a list and the
        joined tokens are written to stderr. The test name is written to
        stderr after the last item.
        """
        for text in test_contents:
            generated = jieba.cut_for_search(text)
            assert isinstance(generated, types.GeneratorType), "Test CutForSearch Generator error"
            words = list(generated)
            assert isinstance(words, list), "Test CutForSearch error on content: %s" % text
            joined = " , ".join(words)
            print(joined, file=sys.stderr)
        print("testCutForSearch", file=sys.stderr)
Example #14
Source File: jieba_test.py    From annotated_jieba with MIT License 5 votes vote down vote up
def testCutForSearch(self):
        """Exercise search-engine-mode segmentation over the test corpus.

        Verifies that each call yields a generator, materialises it, and
        logs the resulting tokens to stderr; the test name is logged once
        the whole corpus has been processed.
        """
        for entry in test_contents:
            lazy_result = jieba.cut_for_search(entry)
            assert isinstance(lazy_result, types.GeneratorType), "Test CutForSearch Generator error"
            pieces = list(lazy_result)
            assert isinstance(pieces, list), "Test CutForSearch error on content: %s" % entry
            print(" , ".join(pieces), file=sys.stderr)
        print("testCutForSearch", file=sys.stderr)
Example #15
Source File: min.py    From min with GNU General Public License v2.0 5 votes vote down vote up
def add_content(self, content, obj_key):
        """
        Add a document to the index.

        ``content`` is segmented with ``jieba.cut_for_search``, the segments
        are passed through ``min_nlp.get_weight``, and the result is handed
        to ``self.add_word_index`` together with ``obj_key``.
        """
        weighted_segments = min_nlp.get_weight(jieba.cut_for_search(content))
        self.add_word_index(weighted_segments, obj_key)
Example #16
Source File: min.py    From min with GNU General Public License v2.0 5 votes vote down vote up
def search(self, keywords, start=0, length=20):
        """
        Search the index for the given keywords.

        ``keywords`` is segmented with ``jieba.cut_for_search`` and the
        resulting list of words is forwarded, with ``start`` and ``length``,
        to ``self.search_by_words``, whose result is returned.
        """
        words = list(jieba.cut_for_search(keywords))
        return self.search_by_words(words, start, length)
Example #17
Source File: jiebaSegment.py    From QAmodel-for-Retrievalchatbot with MIT License 5 votes vote down vote up
def cut_for_search(self,sentence, stopword=True):
        """Segment ``sentence`` in search-engine mode and return a list.

        When ``stopword`` is truthy, segments contained in
        ``self.stopwords`` are left out; otherwise every segment is kept.
        """
        return [
            token
            for token in jieba.cut_for_search(sentence)
            if not stopword or token not in self.stopwords
        ]
Example #18
Source File: jiebaSegment.py    From Customer-Chatbot with MIT License 5 votes vote down vote up
def cut_for_search(self,sentence, stopword=True):
        """Return the search-mode segments of ``sentence`` as a list.

        With a truthy ``stopword`` flag, any segment found in
        ``self.stopwords`` is dropped from the result.
        """
        kept = []
        for piece in jieba.cut_for_search(sentence):
            is_filtered = stopword and piece in self.stopwords
            if not is_filtered:
                kept.append(piece)
        return kept
Example #19
Source File: jiebaSegment.py    From Customer-Chatbot with MIT License 5 votes vote down vote up
def cut_for_search(self,sentence, stopword=True):
        """Tokenise ``sentence`` with ``jieba.cut_for_search``.

        Returns the tokens as a list. Tokens present in ``self.stopwords``
        are removed unless ``stopword`` is falsy.
        """
        segments = jieba.cut_for_search(sentence)
        if not stopword:
            # No filtering requested: keep every segment.
            return list(segments)
        return [seg for seg in segments if seg not in self.stopwords]
Example #20
Source File: article.py    From public-opinion-analysis with MIT License 5 votes vote down vote up
def cache_raw_seg(self):
        """Store the search-mode segments of every sentence in Redis.

        Connection settings (``host``, ``port``, ``db``) are read from the
        ``redis`` section of ``./application.yml``. For the sentence at
        position ``i`` of ``self.sentences``, each segment of its
        ``raw_sentence`` is added to the Redis set named
        ``article:<article_id>:raw_seg:<i>``.
        """
        # Close the config file deterministically instead of leaking the
        # handle until garbage collection.
        with open("./application.yml") as config_file:
            config = yaml.safe_load(config_file)
        redis_conf = config['redis']
        r = redis.StrictRedis(host=redis_conf['host'], port=redis_conf['port'], db=redis_conf['db'])
        for i, sentence in enumerate(self.sentences):
            sentence_seg_id = 'article:' + self.article_id + ':raw_seg:' + str(i)
            for raw_word_seg in jieba.cut_for_search(sentence.raw_sentence):
                r.sadd(sentence_seg_id, raw_word_seg)
Example #21
Source File: test_multithread.py    From jieba_fast with MIT License 5 votes vote down vote up
def run(self):
        """Run jieba over a few sample sentences and print the tokens.

        One line is printed per segmentation: full mode, default mode, a
        call using default arguments only, and search-engine mode.
        """
        campus_sentence = "我来到北京清华大学"

        # Full mode
        tokens = jieba.cut(campus_sentence, cut_all=True)
        print("Full Mode:" + "/ ".join(tokens))

        # Default mode
        tokens = jieba.cut(campus_sentence, cut_all=False)
        print("Default Mode:" + "/ ".join(tokens))

        tokens = jieba.cut("他来到了网易杭研大厦")
        print(", ".join(tokens))

        # Search-engine mode
        tokens = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")
        print(", ".join(tokens))
Example #22
Source File: test_cut_for_search.py    From jieba_fast with MIT License 5 votes vote down vote up
def cuttest(test_sent):
    """Write the search-engine-mode tokens of ``test_sent`` to stdout.

    Each token is followed by " / " on a shared line, which is ended by a
    trailing empty ``print``.
    """
    for word in jieba.cut_for_search(test_sent):
        print(word, "/", end=" ")
    print("")
Example #23
Source File: test_cut_for_search.py    From jieba_fast with MIT License 5 votes vote down vote up
def cuttest(test_sent):
    """Segment ``test_sent`` for search and print the pieces inline.

    The pieces share one output line, each trailed by a slash; a newline
    follows the last piece.
    """
    pieces = jieba.cut_for_search(test_sent)
    for part in pieces:
        print(part, "/", sep=" ", end=" ")
    print("")
Example #24
Source File: jieba_test.py    From jieba_fast with MIT License 5 votes vote down vote up
def testCutForSearch_NOHMM(self):
        """Exercise search-mode segmentation with the HMM switched off.

        Verifies, for each item of ``test_contents``, that the call yields
        a generator, materialises it, and logs the resulting tokens to
        stderr; the test name is logged once all items are done.
        """
        for entry in test_contents:
            lazy_result = jieba.cut_for_search(entry, HMM=False)
            assert isinstance(lazy_result, types.GeneratorType), "Test CutForSearch Generator error"
            pieces = list(lazy_result)
            assert isinstance(pieces, list), "Test CutForSearch error on content: %s" % entry
            print(" , ".join(pieces), file=sys.stderr)
        print("testCutForSearch_NOHMM", file=sys.stderr)