Python jieba.lcut() Examples
The following are 30 code examples of jieba.lcut().
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the jieba module, or try the search function.
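
Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself. The sample sentence is only an illustration, and the exact token boundaries depend on the dictionary bundled with your jieba version.

import jieba

sentence = "我来到北京清华大学"
print(jieba.lcut(sentence))                  # default (accurate) mode: a list of tokens
print(jieba.lcut(sentence, cut_all=True))    # full mode: every word the dictionary can find
print(jieba.lcut(sentence, HMM=False))       # disable HMM-based discovery of unseen words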
Example #1
Source File: similarity.py From sentence-similarity with MIT License | 6 votes |
def M_idf(self, s1, s2):
    v1, v2 = [], []
    s1_list = jieba.lcut(s1)
    s2_list = jieba.lcut(s2)
    for s in s1_list:
        idf_v = self.idf.get(s, 1)
        if s in self.voc:
            v1.append(1.0 * idf_v * self.voc[s])
    for s in s2_list:
        idf_v = self.idf.get(s, 1)
        if s in self.voc:
            v2.append(1.0 * idf_v * self.voc[s])
    v1 = np.array(v1).sum(axis=0)
    v2 = np.array(v2).sum(axis=0)
    sim = 1 - spatial.distance.cosine(v1, v2)
    return sim
Example #2
Source File: cut_td_idf.py From nlp_xiaojiang with MIT License | 6 votes |
def cut_td_idf(sources_path, target_path):
    """
    jieba word segmentation (Chinese)
    :param sources_path:
    :param target_path:
    :return:
    """
    print("cut_td_idf start! ")
    corpus = txtRead(sources_path)
    governments = []
    for corpus_one in corpus:
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        governments.append(ques_q2b_syboml.strip())
    government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))
    topic_ques_all = []
    for topic_ques_one in government_ques:
        top_ques_aqlq = topic_ques_one.replace('  ', ' ').replace('  ', ' ').strip() + '\n'
        topic_ques_all.append(top_ques_aqlq)
    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path)
Example #3
Source File: chatbot_sentence_vec_by_word.py From nlp_xiaojiang with MIT License | 6 votes |
def word_flag_cut(sentence):
    """
    jieba segmentation with POS flags
    :param sentence:
    :return:
    """
    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
        replace(' ', '').replace('\t', '').upper().strip()
    word_list = []
    flag_list = []
    try:
        sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
        words = jieba_seg.cut(sentence_cut)
        for word in words:
            word_list.append(word.word)
            flag_list.append(word.flag)
    except Exception as e:
        word_list = [sentence]
        flag_list = ['nt']
    return word_list, flag_list
Example #4
Source File: segment.py From Chinese-Poetry-Generation with MIT License | 6 votes |
def segment(self, sentence):
    # TODO: try CRF-based segmentation.
    toks = []
    idx = 0
    while idx + 4 <= len(sentence):
        # Cut 2 chars each time.
        if sentence[idx : idx + 2] in self.sxhy_dict:
            toks.append(sentence[idx : idx + 2])
        else:
            for tok in jieba.lcut(sentence[idx : idx + 2]):
                toks.append(tok)
        idx += 2
    # Cut last 3 chars.
    if idx < len(sentence):
        if sentence[idx : ] in self.sxhy_dict:
            toks.append(sentence[idx : ])
        else:
            for tok in jieba.lcut(sentence[idx : ]):
                toks.append(tok)
    return toks
Example #5
Source File: segment.py From Chinese-Poetry-Generation with MIT License | 6 votes |
def _gen_sxhy_dict():
    print("Parsing shixuehanying dictionary ...")
    words = set()
    with open(_rawsxhy_path, 'r') as fin:
        for line in fin.readlines():
            if line[0] == '<':
                continue
            for phrase in line.strip().split()[1:]:
                if not is_cn_sentence(phrase):
                    continue
                idx = 0
                while idx + 4 <= len(phrase):
                    # Cut 2 chars each time.
                    words.add(phrase[idx : idx + 2])
                    idx += 2
                # Use jieba to cut the last 3 chars.
                if idx < len(phrase):
                    for word in jieba.lcut(phrase[idx:]):
                        words.add(word)
    with open(sxhy_path, 'w') as fout:
        fout.write(' '.join(words))
Example #6
Source File: chinese.py From Multi-Label-Text-Classification-for-Chinese with MIT License | 6 votes |
def __call__(self, sent):
    sent = ptxt.Text(sent, "whi").clean
    sent = self.clean_linkpic(sent)
    sent = self.clean_english(sent)
    sent = self.clean_date(sent)
    sent = self.clean_time(sent)
    sent = self.clean_money(sent)
    sent = self.clean_weight(sent)
    sent = self.clean_concentration(sent)
    sent = self.clean_entity(sent)
    sent = self.clean_nums(sent)
    wlist = jieba.lcut(sent)
    sent = self.clean_stopwords(wlist)
    sent = self.clean_punctuation(sent)
    return sent
Example #7
Source File: pre_process.py From nlp-journey with Apache License 2.0 | 6 votes |
def process_data(train_file, user_dict=None, stop_dict=None):
    # load a custom dictionary into jieba (it must follow jieba's user-dictionary format)
    if user_dict:
        jieba.load_userdict(user_dict)
    # load the stop-word list (one stop word per line)
    stop_words = []
    if stop_dict:
        with open(stop_dict, 'r', encoding='utf-8') as file:
            stop_words = [stop_word.strip() for stop_word in file.readlines()]
    # read the file contents, segment each sentence and remove stop words
    with open(train_file, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    sentences = [jieba.lcut(sentence.strip()) for sentence in sentences]
    sentences = [[s for s in sentence if s not in stop_words and s.strip() != ''] for sentence in sentences]
    return sentences
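
As a usage note for the example above: jieba.load_userdict expects a plain-text file with one entry per line in the form "word [frequency] [POS tag]". The file names in this sketch are hypothetical placeholders, not part of the original project.

# hypothetical files: train.txt (one sentence per line), user_dict.txt, stopwords.txt
sentences = process_data('train.txt', user_dict='user_dict.txt', stop_dict='stopwords.txt')
print(sentences[:2])  # first two tokenized sentences with stop words removed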
Example #8
Source File: textAnalysis.py From deep_learning with MIT License | 6 votes |
def train_wordtoVect(train_inputTexts):
    """
    Function for training word vectors
    """
    texts = []
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))
        d = " ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()  # tokenizer (MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)  # limited by num_words
    word_index = tokenizer.word_index  # word -> index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return word_index, data
Example #9
Source File: textAnalysis.py From deep_learning with MIT License | 6 votes |
def train():
    """
    Train the model and save it
    """
    print('Loading Data...')
    inputTexts, labels = load_data()
    print(inputTexts.shape, labels.shape)
    print('segment...')
    # seg_data = [jieba.lcut(document.replace('\n', '')) for document in inputTexts]
    # print('word2vec...')
    # index_dict, word_vectors, data = word2vec_train(seg_data)
    # n_symbols = len(index_dict) + 1
    # x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    # print(x_train.shape, y_train.shape)
    # train_model(n_symbols, x_train, y_train, x_test, y_test)
    word_index, data = train_wordtoVect(inputTexts)
    input_dim = len(word_index) + 1
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)
    print(x_train.shape, y_train.shape)
    train_model(input_dim, x_train, y_train, x_test, y_test)
Example #10
Source File: DataManager.py From ChID-Dataset with Apache License 2.0 | 6 votes |
def _prepare_data(self, temp_data):
    cans = temp_data["candidates"]
    cans = [self.vocab.tran2id(each, True) for each in cans]
    for text in temp_data["content"]:
        # split the passage on idiom placeholders of the form "#idiom<N>#"
        content = re.split(r'(#idiom\d+#)', text)
        doc = []
        loc = []
        labs = []
        tags = []
        for i, segment in enumerate(content):
            if re.match(r'#idiom\d+#', segment) is not None:
                tags.append(segment)
                if segment in self.ans:
                    labs.append(self.ans[segment])
                loc.append(len(doc))
                doc.append(self.vocab.tran2id('#idiom#'))
            else:
                doc += [self.vocab.tran2id(each) for each in jieba.lcut(segment)]
        yield doc, cans, labs, loc, tags
Example #11
Source File: nlp.py From open-entity-relation-extraction with MIT License | 6 votes |
def segment(self, sentence, entity_postag=dict()):
    """Word segmentation (originally NLPIR; the NLPIR calls are commented out and jieba is used)

    Args:
        sentence: string, the sentence to segment
        entity_postag: dict, entity POS dictionary, empty by default; produced while
            analysing the structured text of each case
    Returns:
        lemmas: list, the segmentation result
    """
    # add entity words to the user dictionary
    if entity_postag:
        for entity in entity_postag:
            # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode()))
            jieba.add_word(entity)
    # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode()))  # example of adding a single user word
    # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode()))  # example of adding a single user word
    # segment without POS tagging
    # lemmas = pynlpir.segment(sentence, pos_tagging=False)
    lemmas = jieba.lcut(sentence)
    # pynlpir.close()  # release resources
    return lemmas
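
The effect of jieba.add_word, which the example above relies on, can be seen in isolation with the sketch below; the entity name is a made-up illustration, not part of the original project.

import jieba

text = "小明任职于华有科技有限公司"
print(jieba.lcut(text))             # without a dictionary entry the company name is usually split up
jieba.add_word("华有科技有限公司")    # register the entity so jieba can keep it as one token
print(jieba.lcut(text))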
Example #12
Source File: cut_text.py From chatbot_by_similarity with MIT License | 6 votes |
def cut_texts(texts=None, need_cut=True, word_len=1):
    '''
    Use jieba to cut texts
    :param texts: list of texts
    :param need_cut: whether the texts still need to be cut
    :param word_len: minimum length of words to keep, used to drop stop words
    :return: list of tokenized texts
    '''
    if need_cut:
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len]
                         for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]
    else:
        if word_len > 1:
            texts_cut = [[word for word in text if len(word) >= word_len]
                         for text in texts]
        else:
            texts_cut = texts
    return texts_cut
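
A short, hypothetical call to cut_texts (assuming jieba is imported and the function above is defined); with word_len=2, single-character tokens are dropped, which acts as a rough stop-word filter.

texts = ["今天天气很好", "我们一起去公园散步"]
print(cut_texts(texts=texts, need_cut=True, word_len=2))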
Example #13
Source File: train_word2vec.py From text-cnn with MIT License | 6 votes |
def __iter__(self):
    for filename in self.filenames:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            for _, line in enumerate(f):
                try:
                    line = line.strip()
                    line = line.split('\t')
                    assert len(line) == 2
                    blocks = re_han.split(line[1])
                    word = []
                    for blk in blocks:
                        if re_han.match(blk):
                            word.extend(jieba.lcut(blk))
                    yield word
                except:
                    pass
Example #14
Source File: text_predict.py From text-cnn with MIT License | 6 votes |
def sentence_cut(sentences):
    """
    Args:
        sentences: a list of texts to segment
    Returns:
        seglist: a list of sentences cut by jieba
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # split the text on punctuation
    seglist = []
    for sentence in sentences:
        words = []
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                words.extend(jieba.lcut(blk))
        seglist.append(words)
    return seglist
Example #15
Source File: train_word2vec.py From text_rnn_attention with MIT License | 6 votes |
def __iter__(self):
    for filename in self.filenames:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            for _, line in enumerate(f):
                try:
                    line = line.strip()
                    line = line.split('\t')
                    assert len(line) == 2
                    blocks = re_han.split(line[1])
                    word = []
                    for blk in blocks:
                        if re_han.match(blk):
                            word.extend(jieba.lcut(blk))
                    yield word
                except:
                    pass
Example #16
Source File: text_predict.py From text_rnn_attention with MIT License | 6 votes |
def sentence_cut(sentences):
    """
    Args:
        sentences: a list of texts to segment
    Returns:
        contents: a list of sentences cut by jieba, with stop words removed
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # split the text on punctuation
    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    contents = []
    for sentence in sentences:
        words = []
        blocks = re_han.split(sentence)
        for blk in blocks:
            if re_han.match(blk):
                seglist = jieba.lcut(blk)
                words.extend([w for w in seglist if w not in stopwords])
        contents.append(words)
    return contents
Example #17
Source File: create_pretraining_data.py From albert-chinese-ner with MIT License | 5 votes |
def get_new_segment(segment):  # newly added method ####
    """
    Take a sentence (as a list of characters) and return it with whole word mask markers:
    to support Chinese whole word masking, characters that were split out of the same word
    are marked with "##" so that downstream modules know which characters belong to one word.
    :param segment: a sentence, e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!']
    :return: the processed sentence, e.g. ['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!']
    """
    seq_cws = jieba.lcut("".join(segment))  # segment the sentence with jieba
    seq_cws_dict = {x: 1 for x in seq_cws}  # put the segmented words into a dict
    new_segment = []
    i = 0
    while i < len(segment):  # walk the sentence character by character until it is fully processed
        if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0:  # non-Chinese characters are appended unchanged
            new_segment.append(segment[i])
            i += 1
            continue
        has_add = False
        for length in range(3, 0, -1):
            if i + length > len(segment):
                continue
            if ''.join(segment[i:i + length]) in seq_cws_dict:
                new_segment.append(segment[i])
                for l in range(1, length):
                    new_segment.append('##' + segment[i + l])
                i += length
                has_add = True
                break
        if not has_add:
            new_segment.append(segment[i])
            i += 1
    # print("get_new_segment.wwm.get_new_segment:", new_segment)
    return new_segment
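
A quick way to exercise get_new_segment is to pass a sentence as a list of characters, as in the sketch below (assuming the module's imports, jieba and re); where the "##" markers end up depends on how jieba segments that particular sentence.

chars = list("悬灸技术培训专家教你艾灸降血糖")
print(get_new_segment(chars))  # characters inside a multi-character jieba word get a "##" prefix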
Example #18
Source File: test_spm_preprocessor.py From fancy-nlp with GNU General Public License v3.0 | 5 votes |
def test_get_word_ids(self):
    example_text = ''.join(self.x_train[0][0])
    word_cut = jieba.lcut(example_text)
    word_ids = self.preprocessor.get_word_ids(word_cut)
    assert len(word_ids) == len(word_cut)
Example #19
Source File: raw_data.py From textclf with MIT License | 5 votes |
def jieba_tokenizer(text: str):
    return jieba.lcut(text)
Example #20
Source File: preprocess.py From deep-siamese-text-similarity with MIT License | 5 votes |
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        yield list(jieba.lcut(sentence))
Example #21
Source File: loader.py From text_rnn_attention with MIT License | 5 votes |
def read_file(filename):
    """
    Args:
        filename: train_filename, test_filename or val_filename
    Returns:
        two lists: the first holds the labels, the second the contents cut by jieba
    """
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")  # split the text on punctuation
    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    contents, labels = [], []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                line = line.rstrip()
                assert len(line.split('\t')) == 2
                label, content = line.split('\t')
                labels.append(label)
                blocks = re_han.split(content)
                word = []
                for blk in blocks:
                    if re_han.match(blk):
                        seglist = jieba.lcut(blk)
                        word.extend([w for w in seglist if w not in stopwords])
                contents.append(word)
            except:
                pass
    return labels, contents
Example #22
Source File: preprocess.py From deep-siamese-text-similarity with MIT License | 5 votes |
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        # Python 2 style: decode byte strings to unicode before cleaning and segmentation
        sentence = sentence.decode("utf8")
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"),
                          "".decode("utf8"), sentence)
        yield list(jieba.lcut(sentence))
Example #23
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def split_delex_sentence(sen):
    res_sen = ''
    pattern = re.compile(r'(\[[^\[^\]]+\])')
    slots = pattern.findall(sen)
    for slot in slots:
        sen = sen.replace(slot, '[slot]')
    sen = sen.split('[slot]')
    for part in sen:
        part = ' '.join(jieba.lcut(part))
        res_sen += part
        if slots:
            res_sen += ' ' + slots.pop(0) + ' '
    return res_sen
Example #24
Source File: LoadDemo.py From deep_learning with MIT License | 5 votes |
def input_fn(filenames="./data/knowledge.txt", batch_size=32, epoch_num=None, shuffle_size=256):
    dataset = tf.data.TextLineDataset(filenames)

    def clean_data(line):
        columns_data = tf.string_split([line], '\t')  # turn the line into tensors
        labels = tf.string_to_number(columns_data.values[1], out_type=tf.float32)
        splits_data = columns_data.values[2]

        def split_word(text):
            text = text.decode()
            print(text)
            text = rules.sub("", text)
            text = text.strip()
            tokens = jieba.lcut(text)
            print(tokens)
            if len(tokens) == 0:
                tokens = ["未知空"]  # the dataset needs elements of consistent length
            return tokens[:1]

        # tf.py_func applies an ordinary Python function to tensors
        result = tf.py_func(split_word, [splits_data], [tf.string])
        return {"context": result}, labels

    dataset = dataset.map(lambda line: clean_data(line))
    # shuffle the data; the larger shuffle_size is, the more thorough the shuffling
    if shuffle_size > 0:
        if epoch_num:
            # repeat the dataset the given number of times
            dataset = dataset.shuffle(shuffle_size).repeat(epoch_num)
        else:
            dataset = dataset.shuffle(shuffle_size).repeat()
    # take batch_size rows at a time; the last batch may be smaller than batch_size
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset
Example #25
Source File: tokenizer.py From text2vec with Apache License 2.0 | 5 votes |
def tokenize(self, sentence, cut_all=False, HMM=True):
    """
    Segment the sentence and return the token list
    :param sentence: the sentence to cut
    :param cut_all: full mode, off by default
    :param HMM: whether to enable HMM-based recognition of out-of-vocabulary words, on by default
    :return: A list of strings.
    """
    return self.model.lcut(sentence, cut_all=cut_all, HMM=HMM)
Example #26
Source File: tokenizer.py From text2vec with Apache License 2.0 | 5 votes |
def segment(sentence, cut_type='word', pos=False):
    """
    Segment a sentence into words or characters.
    :param sentence:
    :param cut_type: 'word' uses jieba.lcut; 'char' uses list(sentence)
    :param pos: enable POS tagging
    :return: list
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
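
A small sketch of calling segment in both modes (assuming jieba and jieba.posseg are imported as in the module above); the POS tags come from jieba.posseg, so the exact labels depend on its dictionary.

print(segment("我爱自然语言处理", cut_type='word', pos=False))    # plain word list
words, tags = segment("我爱自然语言处理", cut_type='char', pos=True)
print(list(zip(words, tags)))                                      # per-character POS tags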
Example #27
Source File: utils_test.py From text2vec with Apache License 2.0 | 5 votes |
def test_jieba():
    a = '我要办理特价机票,李浩然可以想办法'
    import jieba
    b = jieba.lcut(a, cut_all=False)
    print('cut_all=False', b)
    b = jieba.lcut(a, cut_all=True)
    print('cut_all=True', b)
    b = jieba.lcut(a, HMM=True)
    print('HMM=True', b)
    b = jieba.lcut(a, HMM=False)
    print('HMM=False', b)
Example #28
Source File: textAnalysis.py From deep_learning with MIT License | 5 votes |
def predict_wordtoVect(valid_inputTexts):
    """
    Function for building word vectors for prediction
    """
    train_texts, labels = load_data()
    train_texts = [" ".join(jieba.lcut(doc)) for doc in train_texts]
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(train_texts)
    pred_texts = [" ".join(jieba.lcut(doc)) for doc in valid_inputTexts]
    print(pred_texts)
    text_seq = tokenizer.texts_to_sequences(pred_texts)
    valid_data = sequence.pad_sequences(text_seq, maxlen=MAX_SEQUENCE_LENGTH)
    return valid_data
Example #29
Source File: evaluate.py From tatk with Apache License 2.0 | 5 votes |
def split_delex_sentence(sen):
    res_sen = ''
    pattern = re.compile(r'(\[[^\[^\]]+\])')
    slots = pattern.findall(sen)
    for slot in slots:
        sen = sen.replace(slot, '[slot]')
    sen = sen.split('[slot]')
    for part in sen:
        part = ' '.join(jieba.lcut(part))
        res_sen += part
        if slots:
            res_sen += ' ' + slots.pop(0) + ' '
    return res_sen
Example #30
Source File: Bayes_classifier.py From python with Apache License 2.0 | 5 votes |
def cut_word(self, origin):
    """jieba word segmentation
    """
    origin = re.sub(r"[^\u4e00-\u9fa5]+", "", origin)  # remove all non-Chinese characters
    self.seg_list = jieba.lcut(origin)  # segment with jieba (default mode)