Python jieba.cut() Examples

The following are 30 code examples of jieba.cut(), drawn from open-source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the jieba module.
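Before the project examples, here is a minimal sketch of the basic API; the sample sentence is illustrative only. jieba.cut() returns a generator of tokens and accepts an optional cut_all flag, jieba.lcut() returns a plain list, and jieba.cut_for_search() segments in search-engine mode.

# -*- coding: utf-8 -*-
import jieba

text = "我来到北京清华大学"  # illustrative sample sentence

print("/".join(jieba.cut(text)))                # accurate mode (default), e.g. 我/来到/北京/清华大学
print("/".join(jieba.cut(text, cut_all=True)))  # full mode: every word the dictionary can find
print("/".join(jieba.cut_for_search(text)))     # search-engine mode: finer-grained segments
print(jieba.lcut(text))                         # like cut(), but returns a list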
Example #1
Source File: run.py    From TiebaTool with MIT License
def calculate_similarity(text1,text2):
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word],2)
        for word in raw2:
            mod2 += math.pow(raw2[word],2)
        cos = dot_product/math.sqrt(mod1*mod2)
    else:
        cos = 0
    return cos 
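A quick usage sketch for calculate_similarity above; it assumes the module-level imports from run.py (math, jieba, collections.Counter), and the comments state only what the math guarantees:

import math
import jieba
from collections import Counter

# identical non-empty texts score 1.0; texts sharing only some (or no) words score below 1.0
print(calculate_similarity("今天天气不错", "今天天气不错"))  # 1.0
print(calculate_similarity("我喜欢北京", "我爱上海"))          # a value in [0, 1)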
Example #2
Source File: Bayes.py    From weiboanalysis with Apache License 2.0
def loadDataSet(path):  # returns each Weibo post's tokens and its label
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the Weibo text
                label.append(int(temp[:2]))  # get the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    # re.ASCII keeps \w at [A-Za-z0-9_]; a str pattern cannot use re.LOCALE in Python 3
                    p = re.compile(r'\w', re.ASCII)
                    result = p.sub("", word)
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue
    return line_cut, label  # tokens and labels for each Weibo post
Example #3
Source File: make_handcrafted_33_features.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #4
Source File: Bayes.py    From weiboanalysis with Apache License 2.0
def loadDataSet(path):  # returns each Weibo post's tokens and its label
    line_cut = []
    label = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the Weibo text
                label.append(int(temp[:2]))  # get the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    p = re.compile(b'\w', re.L)
                    result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
                    if not result or result == ' ':  # empty string
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue
    return line_cut, label  # tokens and labels for each Weibo post
Example #5
Source File: engine_for_mlp.py    From text-antispam with MIT License
def text_tensor(text, wv):
    """获取文本向量
    Args:
        text: 待检测文本
        wv: 词向量模型
    Returns:
        [[ 3.80905056   1.94315064  -0.20703495  -1.31589055   1.9627794
           ...
           2.16935492   2.95426321  -4.71534014  -3.25034237 -11.28901672]]
    """
    text = tr.extractWords(text)
    words = jieba.cut(text.strip())
    text_embedding = np.zeros(200, dtype=np.float32)
    for word in words:
        try:
            text_embedding += wv[word]
        except KeyError:
            text_embedding += wv['UNK']

    sample = text_embedding.reshape(1, 200)
    return sample 
Example #6
Source File: get_topic.py    From poem_generator with Apache License 2.0
def read_data(fin):
    poem_words = list()
    title_flag = False
    title = ''
    fd = codecs.open(fin, 'r', 'utf-8')
    for line in fd:
        line = line.strip()
        line = reg_sep.sub(' ', line)
        title_flag = not title_flag
        if title_flag:
            title = line
        else:
            words = ' '.join(jieba.cut(title + line))
            poem_words.append(words)
    fd.close()
    print('Read data done.')
    return poem_words 
Example #7
Source File: qaData.py    From QA with GNU General Public License v3.0
def sentenceToIndex(sentence, word2idx, maxLen):
    """
    Tokenize the sentence and convert it to indices into the embeddings list

    :param sentence: the sentence
    :param word2idx: word-to-index mapping
    :param maxLen: maximum sentence length
    :return: the sentence as a sequence of word-vector indices
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match("\d+", word):
                index[i] = num
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index 
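A usage sketch for sentenceToIndex above, with a small hypothetical word2idx; the exact output depends on how jieba segments the sentence:

import re
import jieba

# hypothetical vocabulary; "UNKNOWN" and "NUM" are the special keys the function looks up
word2idx = {"UNKNOWN": 0, "我": 1, "喜欢": 2, "北京": 3, "NUM": 4}
print(sentenceToIndex("我喜欢北京2019", word2idx, maxLen=8))
# roughly [1, 2, 3, 4, 0, 0, 0, 0] if jieba yields 我 / 喜欢 / 北京 / 2019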
Example #8
Source File: analyze_data.py    From Machine-Translation with Apache License 2.0
def analyze_zh():
    translation_path = os.path.join(train_translation_folder, train_translation_zh_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'Chinese Sentence Lengths Distribution'
    plt.title(title)
    plt.show() 
Example #9
Source File: analyze_data.py    From Machine-Translation with Apache License 2.0
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Track the sentence length in NLTK tokens
        sent_lengths.append(len(tokens))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show() 
Example #10
Source File: process_corpus.py    From question-classification-cnn-rnn-attention with Apache License 2.0
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile) 
Example #11
Source File: process_corpus.py    From question-classification-cnn-rnn-attention with Apache License 2.0
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile) 
Example #12
Source File: tool.py    From dudulu with MIT License
def build_key_word(path):
    """
    Generate keywords by word frequency
    :param path:
    :return:
    """
    d = {}
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            for word in jieba.cut(line.strip()):
                if len(word) > 1:  # skip single-character tokens to keep meaningless words out of the count
                    d[word] = d.get(word, 0) + 1
    kw_list = sorted(d, key=lambda x: d[x], reverse=True)
    # keep the top 20% by frequency
    size = int(len(kw_list) * 0.2)
    return kw_list[:size] 
Example #13
Source File: norm_zh.py    From mars with Apache License 2.0
def _zh_split(s):
    """
    Split text into word units, using jieba when the text contains Chinese
    """
    import jieba
    try:
        s.encode('ascii')
        has_zh = False
    except ValueError:
        has_zh = True

    if has_zh:
        return list(jieba.cut(s))
    else:
        return pofile.WORD_SEP.split(s)


# code modified from babel.messages.pofile (hash 359ecffca479dfe032d0f7210d5cd8160599c816) 
Example #14
Source File: AsianNLP.py    From scattertext with Apache License 2.0
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
	sents = []
	for paragraph in doc.split('\n'):
		sent_splits = iter(re.split(r'(？|。|」|！)+', paragraph, flags=re.MULTILINE))
		for partial_sent in sent_splits:
			sent = partial_sent + next(sent_splits, '')
			if sent.strip() == '': continue
			toks = []
			# for tok in jieba.cut(sent, ):
			for tok in tokenizer(sent):
				pos = 'WORD'
				if tok.strip() == '':
					pos = 'SPACE'
				elif punct_re.match(tok):
					pos = 'PUNCT'
				toks.append(Tok(pos,
				                tok[:2].lower(),
				                tok.lower(),
				                tok,
				                ent_type='' if entity_type is None else entity_type.get(tok, ''),
				                tag='' if tag_type is None else tag_type.get(tok, '')))
			sents.append(Sentence(toks, sent))
	return Doc(sents, doc) 
Example #15
Source File: dataset.py    From atec-nlp with MIT License
def __init__(self, data_file, sequence_length, word2idx, char_level=True):
        self.word2idx = word2idx
        self.seq_len = sequence_length

        x1, x2, y = [], [], []
        for line in open(data_file, 'r'):
            _, s1, s2, label = line.strip().split('\t')
            s1, s2 = map(self._clean_text, [s1, s2])
            if not char_level:
                s1 = list(jieba.cut(s1))
                s2 = list(jieba.cut(s2))
            x1.append(s1)
            x2.append(s2)
            y.append(1) if label == '1' else y.append(0)
        self.x1 = x1
        self.x2 = x2
        self.y = y 
Example #16
Source File: dataset.py    From atec-nlp with MIT License
def _load_data(self, data_file):
        """Load origin train data and do text pre-processing (converting and cleaning)
        Returns:
            A generator
            if self.is_training:
                train sentence pairs and labels (s1, s2, y).
            else:
                train sentence pairs and None (s1, s2, None).
        """
        for line in open(data_file):
            line = line.strip().decode('utf-8').split('\t')
            s1, s2 = map(self._clean_text, map(self._tradition2simple, line[1:3]))
            if not self.char_level:
                s1 = list(jieba.cut(s1))
                s2 = list(jieba.cut(s2))
            if self.is_training:
                y = int(line[-1])  # 1 or [1]
                yield s1, s2, y
            else:
                yield s1, s2, None  # for consistency
Example #17
Source File: make_handcrafted_33_features.py    From wsdm19cup with MIT License
def __prepare__(self,q):
        q = self.__preprocess__(q)
        new_q = []
        surplus_q = []
        numbers_q = []
        new_xitrum = True
        for w in list(jieba.cut(q))[::-1]:
            if w not in self.STOP_WORDS:
                if new_xitrum:
                    new_q = ["__xitrum__"] + new_q
                    new_xitrum = False
                if self.__is_numeric__(w):
                    numbers_q = [w] + numbers_q
                else:
                    surplus_q = [w] + surplus_q
            else:
                new_xitrum = True
            if len(new_q) == self.MAX_SEQUENCE_LENGTH:
                break
        new_q = " ".join(new_q)
        return new_q, set(surplus_q), set(numbers_q)

    ### jaccard 
Example #18
Source File: train_predict_trees_batch2.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #19
Source File: train_predict_trees_batch2.py    From wsdm19cup with MIT License
def __prepare__(self,q):
        q = self.__preprocess__(q)
        new_q = []
        surplus_q = []
        numbers_q = []
        new_xitrum = True
        for w in list(jieba.cut(q))[::-1]:
            if w not in self.STOP_WORDS:
                if new_xitrum:
                    new_q = ["__xitrum__"] + new_q
                    new_xitrum = False
                if self.__is_numeric__(w):
                    numbers_q = [w] + numbers_q
                else:
                    surplus_q = [w] + surplus_q
            else:
                new_xitrum = True
            if len(new_q) == self.MAX_SEQUENCE_LENGTH:
                break
        new_q = " ".join(new_q)
        return new_q, set(surplus_q), set(numbers_q)

    ### jaccard 
Example #20
Source File: train_predict_trees_batch1.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #21
Source File: train_predict_trees_batch3.py    From wsdm19cup with MIT License
def __prepare__(self,q):
        q = self.__preprocess__(q)
        new_q = []
        surplus_q = []
        numbers_q = []
        new_xitrum = True
        for w in list(jieba.cut(q))[::-1]:
            if w not in self.STOP_WORDS:
                if new_xitrum:
                    new_q = ["__xitrum__"] + new_q
                    new_xitrum = False
                if self.__is_numeric__(w):
                    numbers_q = [w] + numbers_q
                else:
                    surplus_q = [w] + surplus_q
            else:
                new_xitrum = True
            if len(new_q) == self.MAX_SEQUENCE_LENGTH:
                break
        new_q = " ".join(new_q)
        return new_q, set(surplus_q), set(numbers_q)

    ### jaccard 
Example #22
Source File: ptt_filter.py    From justcopy-backend with MIT License
def print2file(f, title, responses, marker = '', separater = True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
    for word in title_cutted:
        f.write(word + ' ')
    f.write('\n')
    for response in responses:
        #print(response['Content'])
        #if response['Content'] not in count_response.keys():
        #    count_response[response['Content']] = 0
        #count_response[response['Content']] += 1
        if marker != '':
            f.write(marker + ' ')
        response_cutted = jieba.cut(response['Content'].strip(), cut_all=False)
        for word in response_cutted:
            f.write(word + ' ')
        f.write('\n')
    if separater:
        f.write('===\n') 
Example #23
Source File: run_infer.py    From BERT with Apache License 2.0
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
        
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    
    half_length = int(max_length / 2)
    if len(before_part) < half_length: # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length: # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else: # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output 
Example #24
Source File: out_script.py    From BERT with Apache License 2.0
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
        
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    
    half_length = int(max_length / 2)
    if len(before_part) < half_length: # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length: # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else: # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output 
Example #25
Source File: output_script_test.py    From BERT with Apache License 2.0
def cut_doc(tokens_a_id, answer_symbol_id_pos, answer_symbol, max_length):
        
    before_part = tokens_a_id[0:answer_symbol_id_pos]
    after_part = tokens_a_id[answer_symbol_id_pos:]
    
    half_length = int(max_length / 2)
    if len(before_part) < half_length: # cut at tail
        st = 0
        ed = min(len(before_part) + 1 + len(after_part), max_length - 3)
    elif len(after_part) < half_length: # cut at head
        ed = len(before_part) + 1 + len(after_part)
        st = max(0, ed - (max_length - 3))
    else: # cut at both sides
        st = len(before_part) + 3 - half_length
        ed = len(before_part) + half_length
    
    output = tokens_a_id[st:ed]
    assert tokens_a_id[answer_symbol_id_pos] in output
    return output 
Example #26
Source File: engine.py    From text-antispam with MIT License
def text_tensor(text, wv):
    """获取文本向量
    Args:
        text: 待检测文本
        wv: 词向量模型
    Returns:
        [[[ 3.80905056   1.94315064  -0.20703495  -1.31589055   1.9627794
           ...
           2.16935492   2.95426321  -4.71534014  -3.25034237 -11.28901672]]]
    """
    text = tr.extractWords(text)
    words = jieba.cut(text.strip())
    text_sequence = []
    for word in words:
        try:
            text_sequence.append(wv[word])
        except KeyError:
            text_sequence.append(wv['UNK'])
    text_sequence = np.asarray(text_sequence)
    sample = text_sequence.reshape(1, len(text_sequence), 200)
    return sample 
Example #27
Source File: NLP.py    From Financial-NLP with Apache License 2.0
def title2wordbag(self, title, remove_stopwords=True):
        words=jieba.cut(title,cut_all=False)
        str_cut=' '.join(words)
        for sym in zh_symbol: # remove chinese symbols
            str_cut=str_cut.replace(sym,'')
        for sym in number:    # remove number
            str_cut=str_cut.replace(sym,'')    
        strlist_cut=str_cut.split(' ')
        
        strlist_new=[]
        for word in strlist_cut: # remove english letter
            if (not len(word)) or (word in self.stop_words):
                continue
            elif (word[0]>='A' and word[0]<='Z') or(word[0]>='a' and word[0]<='z'):
                continue
            elif(ord(word[0])<1024):
                continue
            else:
                strlist_new.append(word)
        return strlist_new
    
    
################################ problem ##################################### 
Example #28
Source File: tool.py    From weiboanalysis with Apache License 2.0
def getlinejieba(path):
    d = []
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = []
            line = str(line).replace('\u200b', '')
            for word in jieba.cut(line.strip()[2:]):
                temp.append(word)
            d.append(list(set(temp) - set(stop) - set(' ')))  # remove stopwords and spaces via set difference
    return d 
Example #29
Source File: train_predict_trees_batch3.py    From wsdm19cup with MIT License
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char', ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() + df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
        
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
        
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features, use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return vect.transform(df_train.title1_zh),vect.transform(df_train.title2_zh), vect.transform(df_test.title1_zh),vect.transform(df_test.title2_zh), vect 
Example #30
Source File: tool.py    From weiboanalysis with Apache License 2.0
def get_word_feature(sentence):
    wordlist = []
    sentence = str(sentence).replace('\u200b', '')
    for word in jieba.cut(sentence.strip()):
        # re.ASCII keeps \w at [A-Za-z0-9_]; a str pattern cannot use re.LOCALE in Python 3
        p = re.compile(r'\w', re.ASCII)
        result = p.sub("", word)
        if not result or result == ' ':  # empty string
            continue
        wordlist.append(word)
    return list(set(wordlist) - set(stop) - set(' '))