Python jieba.cut() Examples

The following are 30 code examples of jieba.cut(), drawn from open-source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the jieba module.
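Before the project examples, here is a minimal sketch of the basic API; the sample sentence is illustrative only. jieba.cut() returns a generator of tokens and accepts an optional cut_all flag, jieba.lcut() returns a plain list, and jieba.cut_for_search() segments in search-engine mode.

# -*- coding: utf-8 -*-
import jieba

text = "我来到北京清华大学"  # illustrative sample sentence

print("/".join(jieba.cut(text)))                # accurate mode (default), e.g. 我/来到/北京/清华大学
print("/".join(jieba.cut(text, cut_all=True)))  # full mode: every word the dictionary can find
print("/".join(jieba.cut_for_search(text)))     # search-engine mode: finer-grained segments
print(jieba.lcut(text))                         # like cut(), but returns a list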
Example #1
Source File: run.py    From TiebaTool with MIT License
def calculate_similarity(text1,text2):
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word],2)
        for word in raw2:
            mod2 += math.pow(raw2[word],2)
        cos = dot_product/math.sqrt(mod1*mod2)
    else:
        cos = 0
    return cos 
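A quick usage sketch for calculate_similarity above; it assumes the module-level imports from run.py (math, jieba, collections.Counter), and the comments state only what the math guarantees:

import math
import jieba
from collections import Counter

# identical non-empty texts score 1.0; texts sharing only some (or no) words score below 1.0
print(calculate_similarity("今天天气不错", "今天天气不错"))  # 1.0
print(calculate_similarity("我喜欢北京", "我爱上海"))          # a value in [0, 1)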
Example #2
Source File: Bayes.py    From weiboanalysis with Apache License 2.0
def loadDataSet(path):  # returns each Weibo post's tokens and its label
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the Weibo text
                label.append(int(temp[:2]))  # get the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    # re.ASCII keeps \w at [A-Za-z0-9_]; a str pattern cannot use re.LOCALE in Python 3
                    p = re.compile(r'\w', re.ASCII)
                    result = p.sub("", word)
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue
    return line_cut, label  # tokens and labels for each Weibo post
Example #3
Source File: make_handcrafted_33_features.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #4
Source File: Bayes.py    From weiboanalysis with Apache License 2.0
def loadDataSet(path):  # returns each Weibo post's tokens and its label
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the Weibo text
                label.append(int(temp[:2]))  # get the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    p = re.compile(b'\w', re.L)
                    result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue
    return line_cut, label  # tokens and labels for each Weibo post
Example #5
Source File: engine_for_mlp.py    From text-antispam with MIT License
def text_tensor(text, wv):
    """获取文本向量
    Args:
        text: 待检测文本
        wv: 词向量模型
    Returns:
        [[ 3.80905056   1.94315064  -0.20703495  -1.31589055   1.9627794
           ...
           2.16935492   2.95426321  -4.71534014  -3.25034237 -11.28901672]]
    """
    text = tr.extractWords(text)
    words = jieba.cut(text.strip())
    text_embedding = np.zeros(200, dtype=np.float32)
    for word in words:
        try:
            text_embedding += wv[word]
        except KeyError:
            text_embedding += wv['UNK']

    sample = text_embedding.reshape(1, 200)
    return sample 
Example #6
Source File: get_topic.py    From poem_generator with Apache License 2.0
def read_data(fin):
    poem_words = list()
    title_flag = False
    title = ''
    fd = codecs.open(fin, 'r', 'utf-8')
    for line in fd:
        line = line.strip()
        line = reg_sep.sub(' ', line)
        title_flag = not title_flag
        if title_flag:
            title = line
        else:
            words = ' '.join(jieba.cut(title + line))
            poem_words.append(words)
    fd.close()
    print('Read data done.')
    return poem_words 
Example #7
Source File: qaData.py    From QA with GNU General Public License v3.0
def sentenceToIndex(sentence, word2idx, maxLen):
    """
    Tokenize the sentence and convert it to indices into the embeddings list

    :param sentence: the sentence
    :param word2idx: word-to-index mapping
    :param maxLen: maximum sentence length
    :return: the sentence as a sequence of word-vector indices
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match("\d+", word):
                index[i] = num
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index 
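A usage sketch for sentenceToIndex above, with a small hypothetical word2idx; the exact output depends on how jieba segments the sentence:

import re
import jieba

# hypothetical vocabulary; "UNKNOWN" and "NUM" are the special keys the function looks up
word2idx = {"UNKNOWN": 0, "我": 1, "喜欢": 2, "北京": 3, "NUM": 4}
print(sentenceToIndex("我喜欢北京2019", word2idx, maxLen=8))
# roughly [1, 2, 3, 4, 0, 0, 0, 0] if jieba yields 我 / 喜欢 / 北京 / 2019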
Example #8
Source File: analyze_data.py    From Machine-Translation with Apache License 2.0
def analyze_zh():
    translation_path = os.path.join(train_translation_folder, train_translation_zh_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'Chinese Sentence Lengths Distribution'
    plt.title(title)
    plt.show() 
Example #9
Source File: analyze_data.py    From Machine-Translation with Apache License 2.0
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Track the sentence length in NLTK tokens
        sent_lengths.append(len(tokens))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show() 
Example #10
Source File: process_corpus.py    From question-classification-cnn-rnn-attention with Apache License 2.0
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile) 
Example #11
Source File: process_corpus.py    From question-classification-cnn-rnn-attention with Apache License 2.0
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile) 
Example #12
Source File: tool.py    From dudulu with MIT License
def build_key_word(path):
    """
    Generate keywords by word frequency
    :param path:
    :return:
    """
    d = {}
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            for word in jieba.cut(line.strip()):
                if len(word) > 1:  # skip single-character tokens to keep meaningless words out of the count
                    d[word] = d.get(word, 0) + 1
    kw_list = sorted(d, key=lambda x: d[x], reverse=True)
    # keep the top 20% by frequency
    size = int(len(kw_list) * 0.2)
    return kw_list[:size] 
Example #13
Source File: norm_zh.py    From mars with Apache License 2.0
def _zh_split(s):
    """
    Split text into word units, using jieba when the text contains Chinese
    """
    import jieba
    try:
        s.encode('ascii')
        has_zh = False
    except ValueError:
        has_zh = True

    if has_zh:
        return list(jieba.cut(s))
    else:
        return pofile.WORD_SEP.split(s)


# code modified from babel.messages.pofile (hash 359ecffca479dfe032d0f7210d5cd8160599c816) 
Example #14
Source File: AsianNLP.py    From scattertext with Apache License 2.0
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
	sents = []
	for paragraph in doc.split('\n'):
		sent_splits = iter(re.split(r'(？|。|」|！)+', paragraph, flags=re.MULTILINE))
		for partial_sent in sent_splits:
			sent = partial_sent + next(sent_splits, '')
			if sent.strip() == '': continue
			toks = []
			# for tok in jieba.cut(sent, ):
			for tok in tokenizer(sent):
				pos = 'WORD'
				if tok.strip() == '':
					pos = 'SPACE'
				elif punct_re.match(tok):
					pos = 'PUNCT'
				toks.append(Tok(pos,
				                tok[:2].lower(),
				                tok.lower(),
				                tok,
				                ent_type='' if entity_type is None else entity_type.get(tok, ''),
				                tag='' if tag_type is None else tag_type.get(tok, '')))
			sents.append(Sentence(toks, sent))
	return Doc(sents, doc) 
Example #15
Source File: dataset.py    From atec-nlp with MIT License
def __init__(self, data_file, sequence_length, word2idx, char_level=True):
        self.word2idx = word2idx
        self.seq_len = sequence_length

        x1, x2, y = [], [], []
        for line in open(data_file, 'r'):
            _, s1, s2, label = line.strip().split('\t')
            s1, s2 = map(self._clean_text, [s1, s2])
            if not char_level:
                s1 = list(jieba.cut(s1))
                s2 = list(jieba.cut(s2))
            x1.append(s1)
            x2.append(s2)
            y.append(1) if label == '1' else y.append(0)
        self.x1 = x1
        self.x2 = x2
        self.y = y 
Example #16
Source File: dataset.py    From atec-nlp with MIT License
def _load_data(self, data_file):
        """Load origin train data and do text pre-processing (converting and cleaning)
        Returns:
            A generator
            if self.is_training:
                train sentence pairs and labels (s1, s2, y).
            else:
                train sentence pairs and None (s1, s2, None).
        """
        for line in open(data_file):
            line = line.strip().decode('utf-8').split('\t')
            s1, s2 = map(self._clean_text, map(self._tradition2simple, line[1:3]))
            if not self.char_level:
                s1 = list(jieba.cut(s1))
                s2 = list(jieba.cut(s2))
            if self.is_training:
                y = int(line[-1])  # 1 or [1]
                yield s1, s2, y
            else:
                yield s1, s2, None  # for consistency
Example #17
Source File: make_handcrafted_33_features.py    From wsdm19cup with MIT License
def __prepare__(self,q):
        q = self.__preprocess__(q)
        new_q = []
        surplus_q = []
        numbers_q = []
        new_xitrum = True
        for w in list(jieba.cut(q))[::-1]:
            if w not in self.STOP_WORDS:
                if new_xitrum:
                    new_q = ["__xitrum__"] + new_q
                    new_xitrum = False
                if self.__is_numeric__(w):
                    numbers_q = [w] + numbers_q
                else:
                    surplus_q = [w] + surplus_q
            else:
                new_xitrum = True
            if len(new_q) == self.MAX_SEQUENCE_LENGTH:
                break
        new_q = " ".join(new_q)
        return new_q, set(surplus_q), set(numbers_q)

    ### jaccard 
Example #18
Source File: train_predict_trees_batch2.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #19
Source File: train_predict_trees_batch2.py    From wsdm19cup with MIT License
def __prepare__(self,q):
        q = self.__preprocess__(q)
        new_q = []
        surplus_q = []
        numbers_q = []
        new_xitrum = True
        for w in list(jieba.cut(q))[::-1]:
            if w not in self.STOP_WORDS:
                if new_xitrum:
                    new_q = ["__xitrum__"] + new_q
                    new_xitrum = False
                if self.__is_numeric__(w):
                    numbers_q = [w] + numbers_q
                else:
                    surplus_q = [w] + surplus_q
            else:
                new_xitrum = True
            if len(new_q) == self.MAX_SEQUENCE_LENGTH:
                break
        new_q = " ".join(new_q)
        return new_q, set(surplus_q), set(numbers_q)

    ### jaccard 
Example #20
Source File: train_predict_trees_batch1.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #21
Source File: train_predict_trees_batch3.py    From wsdm19cup with MIT License
def __prepare__(self,q):
        q = self.__preprocess__(q)
        new_q = []
        surplus_q = []
        numbers_q = []
        new_xitrum = True
        for w in list(jieba.cut(q))[::-1]:
            if w not in self.STOP_WORDS:
                if new_xitrum:
                    new_q = ["__xitrum__"] + new_q
                    new_xitrum = False
                if self.__is_numeric__(w):
                    numbers_q = [w] + numbers_q
                else:
                    surplus_q = [w] + surplus_q
            else:
                new_xitrum = True
            if len(new_q) == self.MAX_SEQUENCE_LENGTH:
                break
        new_q = " ".join(new_q)
        return new_q, set(surplus_q), set(numbers_q)

    ### jaccard 
Example #22
Source File: ptt_filter.py    From justcopy-backend with MIT License
def print2file(f, title, responses, marker = '', separater = True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
    for word in title_cutted:
        f.write(word + ' ')
    f.write('\n')
    for response in responses:
        #print(response['Content'])
        #if response['Content'] not in count_response.keys():
        #    count_response[response['Content']] = 0
        #count_response[response['Content']] += 1
        if marker != '':
            f.write(marker + ' ')
        response_cutted = jieba.cut(response['Content'].strip(), cut_all=False)
        for word in response_cutted:
            f.write(word + ' ')
        f.write('\n')
    if separater:
        f.write('===\n') 
Example #23
Source File: run_infer.py    From BERT with Apache License 2.0
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
        
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    
    half_length = int(max_length / 2)
    if len(before_part) < half_length: # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length: # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else: # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output 
Example #24
Source File: out_script.py    From BERT with Apache License 2.0
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
        
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    
    half_length = int(max_length / 2)
    if len(before_part) < half_length: # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length: # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else: # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output 
Example #25
Source File: output_script_test.py    From BERT with Apache License 2.0
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
        
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    
    half_length = int(max_length / 2)
    if len(before_part) < half_length: # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length: # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else: # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output 
Example #26
Source File: engine.py    From text-antispam with MIT License
def text_tensor(text, wv):
    """获取文本向量
    Args:
        text: 待检测文本
        wv: 词向量模型
    Returns:
        [[[ 3.80905056   1.94315064  -0.20703495  -1.31589055   1.9627794
           ...
           2.16935492   2.95426321  -4.71534014  -3.25034237 -11.28901672]]]
    """
    text = tr.extractWords(text)
    words = jieba.cut(text.strip())
    text_sequence = []
    for word in words:
        try:
            text_sequence.append(wv[word])
        except KeyError:
            text_sequence.append(wv['UNK'])
    text_sequence = np.asarray(text_sequence)
    sample = text_sequence.reshape(1, len(text_sequence), 200)
    return sample 
Example #27
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def title2wordbag(self, title, remove_stopwords=True):
        words=jieba.cut(title,cut_all=False)
        str_cut=' '.join(words)
        for sym in zh_symbol: # remove chinese symbols
            str_cut=str_cut.replace(sym,'')
        for sym in number:    # remove number
            str_cut=str_cut.replace(sym,'')    
        strlist_cut=str_cut.split(' ')
        
        strlist_new=[]
        for word in strlist_cut: # remove english letter
            if (not len(word)) or (word in self.stop_words):
                continue
            elif (word[0]>='A' and word[0]<='Z') or(word[0]>='a' and word[0]<='z'):
                continue
            elif(ord(word[0])<1024):
                continue
            else:
                strlist_new.append(word)
        return strlist_new
    
    
################################ problem ##################################### 
Example #28
Source File: tool.py    From weiboanalysis with Apache License 2.0
def getlinejieba(path):
    d = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = []
            line = str(line).replace('\u200b', '')
            for word in jieba.cut(line.strip()[2:]):
                temp.append(word)
            d.append(list(set(temp) - set(stop) - set(' ')))  # remove stopwords and spaces via set difference
    return d 
Example #29
Source File: train_predict_trees_batch3.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #30
Source File: tool.py    From weiboanalysis with Apache License 2.0
def get_word_feature(sentence):
    wordlist = []
    sentence = str(sentence).replace('\u200b', '')
    for word in jieba.cut(sentence.strip()):
        # re.ASCII keeps \w at [A-Za-z0-9_]; a str pattern cannot use re.LOCALE in Python 3
        p = re.compile(r'\w', re.ASCII)
        result = p.sub("", word)
        if not result or result == ' ':  # empty string
            continue
        wordlist.append(word)
    return list(set(wordlist) - set(stop) - set(' '))