Python jieba.lcut() Examples

The following are 30 code examples of jieba.lcut(), drawn from open-source projects. The source file, project, and license for each example are listed above it. You may also want to check out the other available functions and classes of the jieba module.
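Before the project examples, a minimal self-contained sketch of the basic call may help; the sample sentence and the tokenizations shown in the comments are illustrative assumptions, not taken from any project below.

import jieba

sentence = "我来到北京清华大学"
print(jieba.lcut(sentence))                 # accurate mode (default), e.g. ['我', '来到', '北京', '清华大学']
print(jieba.lcut(sentence, cut_all=True))   # full mode: every word the dictionary can match
print(jieba.lcut_for_search(sentence))      # search-engine mode: finer-grained cuts for indexing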
Example #1
Source File: similarity.py    From sentence-similarity with MIT License
def M_idf(self,s1, s2):
        v1, v2 = [], []
        s1_list = jieba.lcut(s1)
        s2_list = jieba.lcut(s2)

        for s in s1_list:
            idf_v = self.idf.get(s, 1)
            if s in self.voc:
                v1.append(1.0 * idf_v * self.voc[s])

        for s in s2_list:
            idf_v = self.idf.get(s, 1)
            if s in self.voc:
                v2.append(1.0 * idf_v * self.voc[s])

        v1 = np.array(v1).sum(axis=0)
        v2 = np.array(v2).sum(axis=0)

        sim = 1 - spatial.distance.cosine(v1, v2)

        return sim 
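Example #1 relies on module-level imports and on the class attributes self.idf (an IDF dictionary) and self.voc (a word-to-vector dictionary) that are not shown in the excerpt. A rough standalone sketch of the same idf-weighted cosine idea, with the dictionaries treated as hypothetical inputs:

import jieba
import numpy as np
from scipy import spatial

def idf_weighted_sim(s1, s2, idf, voc):
    # Sum the idf-weighted vectors of each sentence's in-vocabulary words,
    # then compare the two sentence vectors with cosine similarity.
    v1 = np.array([idf.get(w, 1) * voc[w] for w in jieba.lcut(s1) if w in voc]).sum(axis=0)
    v2 = np.array([idf.get(w, 1) * voc[w] for w in jieba.lcut(s2) if w in voc]).sum(axis=0)
    return 1 - spatial.distance.cosine(v1, v2)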
Example #2
Source File: cut_td_idf.py    From nlp_xiaojiang with MIT License
def cut_td_idf(sources_path, target_path):
    """
    结巴切词,汉语
    :param path: 
    :return: 
    """
    print("cut_td_idf start! ")
    corpus = txtRead(sources_path)
    governments = []
    for corpus_one in corpus:
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())  # full-width -> half-width characters
        ques_q2b_syboml = get_syboml(ques_q2b)       # symbol handling
        governments.append(ques_q2b_syboml.strip())

    government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))

    topic_ques_all = []
    for topic_ques_one in government_ques:
        top_ques_aqlq = topic_ques_one.replace('   ', ' ').replace('  ', ' ').strip() + '\n'
        topic_ques_all.append(top_ques_aqlq)

    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path) 
Example #3
Source File: chatbot_sentence_vec_by_word.py    From nlp_xiaojiang with MIT License
def word_flag_cut(sentence):
    """
        jieba切词词性
    :param sentence: 
    :return: 
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
                        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)  # jieba_seg: module-level POS segmenter yielding word/flag pairs
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception as e:
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list 
Example #4
Source File: segment.py    From Chinese-Poetry-Generation with MIT License
def segment(self, sentence):
        # TODO: try CRF-based segmentation.
        toks = []
        idx = 0
        while idx + 4 <= len(sentence):
            # Cut 2 chars each time.
            if sentence[idx : idx + 2] in self.sxhy_dict:
                toks.append(sentence[idx : idx + 2])
            else:
                for tok in jieba.lcut(sentence[idx : idx + 2]):
                    toks.append(tok)
            idx += 2
        # Cut last 3 chars.
        if idx < len(sentence):
            if sentence[idx : ] in self.sxhy_dict:
                toks.append(sentence[idx : ])
            else:
                for tok in jieba.lcut(sentence[idx : ]):
                    toks.append(tok)
        return toks


# For testing purposes. 
Example #5
Source File: segment.py    From Chinese-Poetry-Generation with MIT License
def _gen_sxhy_dict():
    print("Parsing shixuehanying dictionary ...")
    words = set()
    with open(_rawsxhy_path, 'r') as fin:
        for line in fin.readlines():
            if line[0] == '<':
                continue
            for phrase in line.strip().split()[1:]:
                if not is_cn_sentence(phrase):
                    continue
                idx = 0
                while idx + 4 <= len(phrase):
                    # Cut 2 chars each time.
                    words.add(phrase[idx : idx + 2])
                    idx += 2
                # Use jieba to cut the last 3 chars.
                if idx < len(phrase):
                    for word in jieba.lcut(phrase[idx:]):
                        words.add(word)
    with open(sxhy_path, 'w') as fout:
        fout.write(' '.join(words)) 
Example #6
Source File: chinese.py    From Multi-Label-Text-Classification-for-Chinese with MIT License
def __call__(self, sent):
        sent = ptxt.Text(sent, "whi").clean
        sent = self.clean_linkpic(sent)

        sent = self.clean_english(sent)

        sent = self.clean_date(sent)
        sent = self.clean_time(sent)

        sent = self.clean_money(sent)
        sent = self.clean_weight(sent)
        sent = self.clean_concentration(sent)

        sent = self.clean_entity(sent)

        sent = self.clean_nums(sent)

        wlist = jieba.lcut(sent)
        sent = self.clean_stopwords(wlist)
        sent = self.clean_punctuation(sent)

        return sent 
Example #7
Source File: pre_process.py    From nlp-journey with Apache License 2.0
def process_data(train_file, user_dict=None, stop_dict=None):
    # Load a custom user dictionary into jieba (must follow jieba's user-dict format)
    if user_dict:
        jieba.load_userdict(user_dict)

    # Load the stop-word list (one stop word per line)
    stop_words = []
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = [stop_word.strip() for stop_word in file.readlines()]

    # Read the file, segment each sentence, and remove stop words
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
        sentences = [jieba.lcut(sentence.strip()) for sentence in sentences]
        sentences = [[s for s in sentence if s not in stop_words and s.strip() != ''] for sentence in sentences]

    return sentences 
Example #8
Source File: textAnalysis.py    From deep_learning with MIT License
def train_wordtoVect(train_inputTexts):
    """
    训练词向量函数
    """
    texts=[]
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))
        d =" ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()                            # Keras tokenizer (MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)    # affected by num_words
    word_index = tokenizer.word_index                       # word -> index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    return word_index, data 
Example #9
Source File: textAnalysis.py    From deep_learning with MIT License
def train():
    """
    训练模型,并保存

    """
    print('Loading Data...')
    inputTexts, labels = load_data()
    print(inputTexts.shape, labels.shape)

    print('segment...')

    # seg_data = [jieba.lcut(document.replace('\n', ''))for document in inputTexts]
    # print('word2vec...')
    # index_dict, word_vectors, data = word2vec_train(seg_data)
    # n_symbols = len(index_dict) + 1   
    # x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    # print(x_train.shape, y_train.shape)
    # train_model(n_symbols, x_train, y_train, x_test, y_test)

    word_index, data = train_wordtoVect(inputTexts)
    input_dim=len(word_index) + 1
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    print(x_train.shape, y_train.shape)

    train_model(input_dim, x_train, y_train, x_test, y_test) 
Example #10
Source File: DataManager.py    From ChID-Dataset with Apache License 2.0
def _prepare_data(self, temp_data):
        cans = temp_data["candidates"]
        cans = [self.vocab.tran2id(each, True) for each in cans]

        for text in temp_data["content"]:
            content = re.split(r'(#idiom\d+#)', text)

            doc = []
            loc = []
            labs = []
            tags = []

            for i, segment in enumerate(content):
                if re.match(r'#idiom\d+#', segment) is not None:
                    tags.append(segment)
                    if segment in self.ans:
                        labs.append(self.ans[segment])
                    loc.append(len(doc))
                    doc.append(self.vocab.tran2id('#idiom#'))
                else:
                    doc += [self.vocab.tran2id(each) for each in jieba.lcut(segment)]

            yield doc, cans, labs, loc, tags 
Example #11
Source File: nlp.py    From open-entity-relation-extraction with MIT License
def segment(self, sentence, entity_postag=dict()):
        """采用NLPIR进行分词处理
        Args:
            sentence: string,句子
            entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生
        Returns:
            lemmas: list,分词结果
        """
        # Add the entity dictionary
        if entity_postag:
            for entity in entity_postag:
                # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
                jieba.add_word(entity)
        # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example: add a single user word
        # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # example: add a single user word
        # Segment without POS tagging
        # lemmas = pynlpir.segment(sentence, pos_tagging=False)
        lemmas = jieba.lcut(sentence)
        # pynlpir.close()  # release NLPIR resources
        return lemmas 
Example #12
Source File: cut_text.py    From chatbot_by_similarity with MIT License
def cut_texts(texts=None, need_cut=True, word_len=1):
    '''
    Use jieba to cut texts.
    :param texts: list of texts
    :param need_cut: whether the texts still need to be cut
    :param word_len: minimum word length to keep, used to drop stop words / single characters
    :return: list of tokenized texts
    '''
    if need_cut:
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len] for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]
    else:
        if word_len > 1:
            texts_cut = [[word for word in text if len(word) >= word_len] for text in texts]
        else:
            texts_cut = texts

    return texts_cut 
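A quick usage sketch for cut_texts; the sample inputs and the tokenizations in the comments are assumptions:

texts = ['今天天气不错', '我们去公园散步']
print(cut_texts(texts, need_cut=True, word_len=1))   # e.g. [['今天', '天气', '不错'], ['我们', '去', '公园', '散步']]
print(cut_texts(texts, need_cut=True, word_len=2))   # single-character tokens such as '去' are dropped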
Example #13
Source File: train_word2vec.py    From text-cnn with MIT License
def __iter__(self):
        for filename in self.filenames:
            with codecs.open(filename, 'r', encoding='utf-8') as f:
                for _,line in enumerate(f):
                    try:
                        line=line.strip()
                        line=line.split('\t')
                        assert len(line)==2
                        blocks=re_han.split(line[1])
                        word=[]
                        for blk in blocks:
                            if re_han.match(blk):
                                word.extend(jieba.lcut(blk))
                        yield word
                    except:
                        pass 
Example #14
Source File: text_predict.py    From text-cnn with MIT License
def sentence_cut(sentences):
    """
    Args:
        sentence: a list of text need to segment
    Returns:
        seglist:  a list of sentence cut by jieba 

    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # the method of cutting text by punctuation
    seglist=[]
    for sentence in sentences:
        words=[]
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                words.extend(jieba.lcut(blk))
        seglist.append(words)
    return  seglist 
Example #15
Source File: train_word2vec.py    From text_rnn_attention with MIT License
def __iter__(self):
        for filename in self.filenames:
            with codecs.open(filename, 'r', encoding='utf-8') as f:
                for _,line in enumerate(f):
                    try:
                        line=line.strip()
                        line=line.split('\t')
                        assert len(line)==2
                        blocks=re_han.split(line[1])
                        word=[]
                        for blk in blocks:
                            if re_han.match(blk):
                                word.extend(jieba.lcut(blk))
                        yield word
                    except:
                        pass 
Example #16
Source File: text_predict.py    From text_rnn_attention with MIT License
def sentence_cut(sentences):
    """
    Args:
        sentence: a list of text need to segment
    Returns:
        seglist:  a list of sentence cut by jieba

    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # the method of cutting text by punctuation
    with codecs.open('./data/stopwords.txt','r',encoding='utf-8') as f:
            stopwords=[line.strip() for line in f.readlines()]
    contents=[]
    for sentence in sentences:
        words=[]
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                seglist = jieba.lcut(blk)
                words.extend([w for w in seglist if w not in stopwords])
        contents.append(words)
    return  contents 
Example #17
Source File: create_pretraining_data.py    From albert-chinese-ner with MIT License
def get_new_segment(segment):  # newly added method ####
    """
    Given a sentence (as a list of characters), return a processed version: to support Chinese
    whole-word masking, every character of a multi-character word except the first is prefixed
    with "##", so downstream modules know which characters belong to the same word.
    :param segment: a sentence as a character list, e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!']
    :return: the processed sentence, e.g. ['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!']
    """
    seq_cws = jieba.lcut("".join(segment)) # segment the sentence with jieba
    seq_cws_dict = {x: 1 for x in seq_cws} # put the segmented words into a dict
    new_segment = []
    i = 0
    while i < len(segment): # walk the sentence character by character until it is fully processed
      if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0:  # non-Chinese characters are kept as-is, no special handling
        new_segment.append(segment[i])
        i += 1
        continue

      has_add = False
      for length in range(3, 0, -1):
        if i + length > len(segment):
          continue
        if ''.join(segment[i:i + length]) in seq_cws_dict:
          new_segment.append(segment[i])
          for l in range(1, length):
            new_segment.append('##' + segment[i + l])
          i += length
          has_add = True
          break
      if not has_add:
        new_segment.append(segment[i])
        i += 1
    # print("get_new_segment.wwm.get_new_segment:",new_segment)
    return new_segment 
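get_new_segment assumes that jieba and re are imported at module level. A quick check of the whole-word-mask markers, reusing a truncated form of the docstring's own example:

chars = list("悬灸技术培训专家教你艾灸降血糖")
print(get_new_segment(chars))
# roughly: ['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖']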
Example #18
Source File: test_spm_preprocessor.py    From fancy-nlp with GNU General Public License v3.0
def test_get_word_ids(self):
        example_text = ''.join(self.x_train[0][0])
        word_cut = jieba.lcut(example_text)
        word_ids = self.preprocessor.get_word_ids(word_cut)
        assert len(word_ids) == len(word_cut) 
Example #19
Source File: raw_data.py    From textclf with MIT License
def jieba_tokenizer(text: str):
    return jieba.lcut(text) 
Example #20
Source File: preprocess.py    From deep-siamese-text-similarity with MIT License
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        yield list(jieba.lcut(sentence)) 
Example #21
Source File: loader.py    From text_rnn_attention with MIT License
def read_file(filename):
    """
    Args:
        filename:trian_filename,test_filename,val_filename 
    Returns:
        two list where the first is lables and the second is contents cut by jieba
        
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # the method of cutting text by punctuation

    with codecs.open('./data/stopwords.txt','r',encoding='utf-8') as f:
            stopwords=[line.strip() for line in f.readlines()]

    contents,labels=[],[]
    with codecs.open(filename,'r',encoding='utf-8') as f:
        for line in f:
            try:
                line=line.rstrip()
                assert len(line.split('\t'))==2
                label,content=line.split('\t')
                labels.append(label)
                blocks = re_han.split(content)
                word = []
                for blk in blocks:
                    if re_han.match(blk):
                        seglist=jieba.lcut(blk)
                        word.extend([w for w in seglist if w not in stopwords])
                contents.append(word)
            except:
                pass
    return labels,contents 
Example #22
Source File: preprocess.py    From deep-siamese-text-similarity with MIT License
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        sentence = sentence.decode("utf8")
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"), "".decode("utf8"),
                          sentence)
        yield list(jieba.lcut(sentence)) 
Example #23
Source File: evaluate.py    From tatk with Apache License 2.0
def split_delex_sentence(sen):
    res_sen = ''
    pattern = re.compile(r'(\[[^\[^\]]+\])')
    slots = pattern.findall(sen)
    for slot in slots:
        sen = sen.replace(slot, '[slot]')
    sen = sen.split('[slot]')
    for part in sen:
        part = ' '.join(jieba.lcut(part))
        res_sen += part
        if slots:
            res_sen += ' ' + slots.pop(0) + ' '
    return res_sen 
Example #24
Source File: LoadDemo.py    From deep_learning with MIT License
def input_fn(filenames="./data/knowledge.txt", batch_size=32, epoch_num=None, shuffle_size=256):
    dataset = tf.data.TextLineDataset(filenames)

    def clean_data(line):
        columns_data = tf.string_split([line], '\t')
        # convert the fields to tensors
        labels = tf.string_to_number(columns_data.values[1], out_type=tf.float32)
        splits_data = columns_data.values[2]

        def split_word(text):
            text = text.decode()
            print(text)
            text = rules.sub("", text)
            text = text.strip()
            tokens = jieba.lcut(text)
            print(tokens)
            if len(tokens)==0:
                tokens = ["未知空"]
            # the dataset requires entries of a consistent length
            return tokens[:1]
        # tf.py_func applies an ordinary Python function to tensors
        result = tf.py_func(split_word, [splits_data], [tf.string])
        return {"context": result}, labels

    dataset = dataset.map(lambda line: clean_data(line))
    # shuffle the data; the larger the buffer size, the more thorough the shuffling
    if shuffle_size > 0:
        if epoch_num:
            # repeat the dataset the specified number of times
            dataset = dataset.shuffle(shuffle_size).repeat(epoch_num)
        else:
            dataset = dataset.shuffle(shuffle_size).repeat()

    # take batch_size rows in order; the final batch may be smaller than batch_size
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset 
Example #25
Source File: tokenizer.py    From text2vec with Apache License 2.0
def tokenize(self, sentence, cut_all=False, HMM=True):
        """
        切词并返回切词位置
        :param sentence: 句子
        :param cut_all: 全模式,默认关闭
        :param HMM: 是否打开NER识别,默认打开
        :return:  A list of strings.
        """
        return self.model.lcut(sentence, cut_all=cut_all, HMM=HMM) 
Example #26
Source File: tokenizer.py    From text2vec with Apache License 2.0
def segment(sentence, cut_type='word', pos=False):
    """
    切词
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :return: list
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence) 
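A usage sketch for the word/char and POS branches of segment(); the sample sentence and the outputs in the comments are assumptions, and the imports mirror what the original module presumably provides:

import jieba
from jieba import posseg

print(segment('我爱北京天安门', cut_type='word'))                # e.g. ['我', '爱', '北京', '天安门']
print(segment('我爱北京天安门', cut_type='char'))                # ['我', '爱', '北', '京', '天', '安', '门']
words, tags = segment('我爱北京天安门', cut_type='word', pos=True)  # parallel word and POS-tag lists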
Example #27
Source File: utils_test.py    From text2vec with Apache License 2.0
def test_jieba():
    a = '我要办理特价机票,李浩然可以想办法'
    import jieba
    b = jieba.lcut(a, cut_all=False)
    print('cut_all=False', b)
    b = jieba.lcut(a, cut_all=True)
    print('cut_all=True', b)

    b = jieba.lcut(a, HMM=True)
    print('HMM=True', b)

    b = jieba.lcut(a, HMM=False)
    print('HMM=False', b) 
Example #28
Source File: textAnalysis.py    From deep_learning with MIT License
def predict_wordtoVect(valid_inputTexts):
    """
    预测词向量函数
    """
    train_texts, labels = load_data()
    train_texts = [" ".join(jieba.lcut(doc)) for doc in train_texts]
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(train_texts)

    pred_texts = [" ".join(jieba.lcut(doc)) for doc in valid_inputTexts]
    print(pred_texts)
    text_seq = tokenizer.texts_to_sequences(pred_texts)
    valid_data = sequence.pad_sequences(text_seq, maxlen=MAX_SEQUENCE_LENGTH)

    return valid_data 
Example #29
Source File: evaluate.py    From tatk with Apache License 2.0
def split_delex_sentence(sen):
    res_sen = ''
    pattern = re.compile(r'(\[[^\[^\]]+\])')
    slots = pattern.findall(sen)
    for slot in slots:
        sen = sen.replace(slot, '[slot]')
    sen = sen.split('[slot]')
    for part in sen:
        part = ' '.join(jieba.lcut(part))
        res_sen += part
        if slots:
            res_sen += ' ' + slots.pop(0) + ' '
    return res_sen 
Example #30
Source File: Bayes_classifier.py    From python with Apache License 2.0
def cut_word(self, origin):
		"""结巴分词
		"""
		origin = re.sub(r"[^\u4e00-\u9fa5]+", "", origin) # 除去所有非中文的字符
		self.seg_list = jieba.lcut(origin) # jieba搜索引擎模式分词