Python Examples of pypinyin.pinyin

Source File: txt2pinyin.py From MTTS with MIT License

6 votes

def pinyinformat(syllable):
    '''Format a pinyin syllable into MTTS's format.

    A syllable that does not end in a digit gets the digit '5' appended.
    When the part before the final digit is a key of TRANSFORM_DICT, that
    part is swapped for the mapped spelling and the digit is kept.

    :param syllable: non-empty pinyin string, with or without a trailing
        tone digit (an empty string raises IndexError).
    :return: the syllable, always ending in a digit.
    '''
    if not syllable[-1].isdigit():
        syllable = syllable + '5'
    syl_no_tone, tone = syllable[:-1], syllable[-1]
    # Rebuild from the two parts instead of calling str.replace on the whole
    # syllable: replace() rewrites every occurrence of the toneless text,
    # whereas only the leading part is meant to be substituted.
    if syl_no_tone in TRANSFORM_DICT:
        syl_no_tone = TRANSFORM_DICT[syl_no_tone]
    return syl_no_tone + tone

Source File: gardener.py From Automatic_Speech_Recognition with MIT License

6 votes

def process_poetry(self, data_dir='/media/pony/DLdigest/data/languageModel/chinese-poetry/json'):
        """
        Turn the Tang and Song poems dataset into "text,pinyin" lines.

        Every entry of data_dir whose name starts with 'poet' is loaded as
        JSON. Each poem's 'paragraphs' are joined, converted with
        HanziConv.toSimplified, passed through filter_punctuation and split
        on spaces. Pieces longer than one character (after stripping) are
        appended, together with their space-joined pinyin, to numbered text
        files under <self.save_dir>/poem; a new file number starts every
        400000 written lines.
        """
        save_dir = os.path.join(self.save_dir, 'poem')
        check_path_exists(save_dir)
        count = 0
        for entry in os.scandir(data_dir):
            if not entry.name.startswith('poet'):
                continue
            with open(entry.path, 'r') as json_file:
                poems = json.load(json_file)
                for poem in poems:
                    joined = ''.join(poem['paragraphs']).replace('\n', '')
                    cleaned = filter_punctuation(HanziConv.toSimplified(joined))
                    for para in cleaned.split(' '):
                        if len(para.strip()) <= 1:
                            continue
                        pys = ' '.join(np.array(pinyin(para)).flatten())
                        # The counter is bumped after the write, so the
                        # first file receives lines 0..399999.
                        out_path = os.path.join(save_dir, str(count//400000+1)+'.txt')
                        with open(out_path, 'a') as out_file:
                            out_file.write(para+','+pys+'\n')
                        count += 1

Source File: utils.py From DimSim with Apache License 2.0

6 votes

def get_edit_distance_close_2d_code(a, b):
    """Return a distance between two pinyin objects using the 2D code maps.

    The consonant and vowel of each argument are looked up in
    consonantMap_TwoDCode / vowelMap_TwoDCode and compared with
    get_distance_2d_code. The smaller of (consonant distance + vowel
    distance) and get_sim_dis_from_hardcod_map(a, b) is taken, and one
    tenth of the absolute tone difference is added.

    :param a: object exposing .consonant, .vowel, .tone and .toString()
    :param b: same kind of object as ``a``
    :return: 0 when either argument is None (an error line is printed),
        otherwise the distance described above
    :raises Exception: when any lookup or distance computation fails
    """
    if a is None or b is None:
        # Never call .toString() on None: doing so used to raise inside the
        # guard itself, turning this documented "return 0" path into an error.
        print("Error:pinyin({},{})".format(
            a.toString() if a is not None else None,
            b.toString() if b is not None else None))
        return 0

    try:
        twoDcode_consonant_a = consonantMap_TwoDCode[a.consonant]
        twoDcode_consonant_b = consonantMap_TwoDCode[b.consonant]
        cDis = abs(get_distance_2d_code(twoDcode_consonant_a, twoDcode_consonant_b))

        twoDcode_vowel_a = vowelMap_TwoDCode[a.vowel]
        twoDcode_vowel_b = vowelMap_TwoDCode[b.vowel]
        vDis = abs(get_distance_2d_code(twoDcode_vowel_a, twoDcode_vowel_b))

        hcDis = get_sim_dis_from_hardcod_map(a, b)

        return min((cDis+vDis), hcDis) + 1.0*abs(a.tone-b.tone)/10
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit are not converted into a generic pinyin error.
        raise Exception("Error pinyin {}{}".format(a.toString(), b.toString()))

Source File: gardener.py From Automatic_Speech_Recognition with MIT License

6 votes

def process_audioLabels(self, data_dir='/media/pony/DLdigest/data/ASR_zh/'):
        """
        Turn the label files of the collected Chinese audio dataset into
        "text,pinyin" lines.

        data_dir is walked recursively (each visited directory is printed).
        Every file whose name ends with "label" is read in full, converted
        with HanziConv.toSimplified, passed through filter_punctuation and
        split on spaces. Pieces longer than one character (after stripping)
        are appended, with their space-joined pinyin, to numbered text files
        under <self.save_dir>/audioLabels.
        """
        save_dir = os.path.join(self.save_dir, 'audioLabels')
        check_path_exists(save_dir)
        count = 0
        for subdir, dirs, files in os.walk(data_dir):
            print(subdir)
            for filename in files:
                if not filename.endswith("label"):
                    continue
                label_path = os.path.join(subdir, filename)
                with open(label_path, 'r') as label_file:
                    text = label_file.read()
                    cleaned = filter_punctuation(HanziConv.toSimplified(text))
                    for chunk in cleaned.split(' '):
                        if len(chunk.strip()) <= 1:
                            continue
                        pys = ' '.join(np.array(pinyin(chunk)).flatten())
                        # The counter is bumped before the write, so the
                        # file number is derived from a 1-based count.
                        count += 1
                        out_path = os.path.join(save_dir, str(count//400000+1)+'.txt')
                        with open(out_path, 'a') as out_file:
                            out_file.write(chunk+','+pys+'\n')

Source File: format_util.py From betterlifepsi with MIT License

6 votes

def get_pinyin_first_letters(chinese_characters):
    """
    Get the first letters of the pinyin of some Chinese characters.

    When a character has several readings, every combination returned by
    _get_pinyin_all is included, separated by "|"; for example "调向"
    yields "dx|tx".
    :param chinese_characters: Chinese characters to get pinyin for.
    :return: "|"-separated first-letter strings, without a trailing "|"
    """
    combinations = _get_pinyin_all([], chinese_characters)
    # Terminate every combination with "|" and strip the trailing ones.
    joined = ''.join(''.join(letters) + "|" for letters in combinations)
    return joined.rstrip('|')

Source File: phoneme_tokenizer.py From espnet with Apache License 2.0

6 votes

def pypinyin_g2p_phone(text) -> List[str]:
    """Split the TONE3-style pinyin of ``text`` into initials and finals.

    For every item returned by ``pinyin`` the first reading is passed to
    ``get_initials`` and ``get_finals`` (both with ``strict=True``); empty
    results are dropped and the rest are returned in order.
    """
    from pypinyin import pinyin
    from pypinyin import Style
    from pypinyin.style._utils import get_finals
    from pypinyin.style._utils import get_initials

    phones = []
    for readings in pinyin(text, style=Style.TONE3):
        syllable_parts = [
            get_initials(readings[0], strict=True),
            get_finals(readings[0], strict=True),
        ]
        for part in syllable_parts:
            if len(part) != 0:
                phones.append(part)
    return phones

Source File: mtts.py From style-token_tacotron2 with MIT License

6 votes

def _add_lab(txtlines, wav_dir_path):
    '''Write one "<numstr>.lab" pinyin file per input line.

    Each line must have the form "<numstr> <text>" with exactly one space
    (anything else makes the unpacking raise ValueError). "#<digit>" marks
    are stripped from the text, the text is converted with
    pinyin(..., style=Style.TONE3), and every syllable that does not end in
    a digit gets '5' appended. The syllables are written, space-joined, to
    <wav_dir_path>/<numstr>.lab.

    :param txtlines: iterable of "<numstr> <text>" strings
    :param wav_dir_path: directory that receives the .lab files
    '''
    logger = logging.getLogger('mtts')
    for line in txtlines:
        numstr, txt = line.split(' ')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        txt = re.sub(r'#\d', '', txt)
        pinyin_list = pinyin(txt, style=Style.TONE3)
        new_pinyin_list = []
        for item in pinyin_list:
            if not item:
                # The placeholder is named, so the value must be passed by
                # keyword; the positional form raised KeyError instead of
                # logging.
                logger.warning(
                    '{file_num} do not generate right pinyin'.format(
                        file_num=numstr))
                # Nothing to index in an empty item; skip it.
                continue
            phone = item[0]
            if not phone[-1].isdigit():
                phone = phone + '5'
            new_pinyin_list.append(phone)
        lab_file = os.path.join(wav_dir_path, numstr + '.lab')
        with open(lab_file, 'w') as oid:
            oid.write(' '.join(new_pinyin_list))

Source File: mtts.py From style-token_tacotron2 with MIT License

6 votes

def _add_pinyin(txtlines, output_path):
    '''Convert all lines to pinyin and write them into one file.

    Each line must have the form "<numstr> <text>" with exactly one space
    (anything else makes the unpacking raise ValueError). "#<digit>" marks
    are stripped from the text and the text is converted with
    pinyin(..., style=Style.TONE3). The result is written to
    <output_path>/all_pinyin.lab, one "<numstr> <syllables>" line per input
    line.

    :param txtlines: iterable of "<numstr> <text>" strings
    :param output_path: directory that receives all_pinyin.lab
    '''
    logger = logging.getLogger('mtts')
    all_pinyin = []
    for line in txtlines:
        numstr, txt = line.split(' ')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        txt = re.sub(r'#\d', '', txt)
        pinyin_list = pinyin(txt, style=Style.TONE3)
        new_pinyin_list = []
        for item in pinyin_list:
            if not item:
                # The placeholder is named, so the value must be passed by
                # keyword; the positional form raised KeyError instead of
                # logging.
                logger.warning(
                    '{file_num} do not generate right pinyin'.format(
                        file_num=numstr))
                # Nothing to index in an empty item; skip it.
                continue
            if not item[0][-1].isdigit():
                phone = item[0] + '5'
            else:
                # NOTE(review): 'v' is rewritten to 'u' only for syllables
                # that already carry a tone digit, not for those that get
                # '5' appended - confirm this asymmetry is intended.
                phone = item[0].replace('v', 'u')
            new_pinyin_list.append(phone)
        all_pinyin.append(numstr + ' ' + ' '.join(new_pinyin_list))
    all_pinyin_file = os.path.join(output_path, 'all_pinyin.lab')
    with open(all_pinyin_file, 'w') as oid:
        for item in all_pinyin:
            oid.write(item + '\n')

Source File: components.py From glyce with Apache License 2.0

6 votes

def token_indexing(idx, encoding_type, return_type):
    """
    Map a word id to the character ids of that word's wubi or pinyin encoding.

    The word is read from dict_word['idx2word']; '<eos>' is replaced by '。'.
    The encoding (or the word itself when the encoder returns nothing) is
    padded with '。' up to 8 characters; a longer encoding fails the
    assertion.

    :param idx: key into dict_word['idx2word']
    :param encoding_type: 'wubi' or 'pinyin'; anything else raises
        NotImplementedError
    :param return_type: 'tokens' returns the id list, any other value the mask
    :return: the 8 ids looked up in the matching 'char2idx' table, or 8
        booleans that are True for positions inside the unpadded encoding
    """
    word = dict_word['idx2word'][idx]
    if word == '<eos>':
        word = '。'

    if encoding_type == 'wubi':
        codes = wubi(word)
        encoding = codes[0] if codes else word
        char_table = dict_wubi
    elif encoding_type == 'pinyin':
        codes = pinyin(word)
        encoding = codes[0][0] if codes else word
        char_table = dict_pinyin
    else:
        raise NotImplementedError

    missing = 8 - len(encoding)
    full_encoding = encoding + '。' * missing if missing else encoding
    assert len(full_encoding) == 8, full_encoding
    tokens = [char_table['char2idx'][ch] for ch in full_encoding]
    length = [pos < len(encoding) for pos in range(len(tokens))]
    if return_type == 'tokens':
        return tokens
    return length

Source File: rhyme.py From Chinese-Hip-pop-Generation with MIT License

6 votes

def rhyme(a, b):
    """Return the rhyme score of two lines as computed by n_rhyme.

    0 means the lines do not rhyme; identical lines also return 0.
    The strict comparison (finals and tones must both match) runs first
    and wins when its score exceeds 1. Otherwise only the finals of the
    last two syllables are compared, so the tones may differ.
    """
    if a == b:
        return 0
    # Strict pass: finals with tone, compared from the end of each line.
    tone_finals_a = pinyin(a, style=FINALS_TONE3)[::-1]
    tone_finals_b = pinyin(b, style=FINALS_TONE3)[::-1]
    strict_score = n_rhyme(tone_finals_a, tone_finals_b)
    if strict_score > 1:
        return strict_score
    # Loose pass: toneless finals of the last two syllables, last one first.
    finals_a = pinyin(a, style=FINALS)[-2:][::-1]
    finals_b = pinyin(b, style=FINALS)[-2:][::-1]
    return n_rhyme(finals_a, finals_b)


# index -> sentence

Source File: txt2pinyin.py From style-token_tacotron2 with MIT License

6 votes

def pinyinformat(syllable):
    '''Format a pinyin syllable into MTTS's format.

    A syllable that does not end in a digit gets the digit '5' appended.
    When the part before the final digit is a key of TRANSFORM_DICT, that
    part is swapped for the mapped spelling and the digit is kept.

    :param syllable: non-empty pinyin string, with or without a trailing
        tone digit (an empty string raises IndexError).
    :return: the syllable, always ending in a digit.
    '''
    if not syllable[-1].isdigit():
        syllable = syllable + '5'
    syl_no_tone, tone = syllable[:-1], syllable[-1]
    # Rebuild from the two parts instead of calling str.replace on the whole
    # syllable: replace() rewrites every occurrence of the toneless text,
    # whereas only the leading part is meant to be substituted.
    if syl_no_tone in TRANSFORM_DICT:
        syl_no_tone = TRANSFORM_DICT[syl_no_tone]
    return syl_no_tone + tone

Source File: mtts.py From MTTS with MIT License

6 votes

def _add_pinyin(txtlines, output_path):
    '''Convert all lines to pinyin and write them into one file.

    Each line must have the form "<numstr> <text>" with exactly one space
    (anything else makes the unpacking raise ValueError). "#<digit>" marks
    are stripped from the text and the text is converted with
    pinyin(..., style=Style.TONE3). The result is written to
    <output_path>/all_pinyin.lab, one "<numstr> <syllables>" line per input
    line.

    :param txtlines: iterable of "<numstr> <text>" strings
    :param output_path: directory that receives all_pinyin.lab
    '''
    logger = logging.getLogger('mtts')
    all_pinyin = []
    for line in txtlines:
        numstr, txt = line.split(' ')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        txt = re.sub(r'#\d', '', txt)
        pinyin_list = pinyin(txt, style=Style.TONE3)
        new_pinyin_list = []
        for item in pinyin_list:
            if not item:
                # The placeholder is named, so the value must be passed by
                # keyword; the positional form raised KeyError instead of
                # logging.
                logger.warning(
                    '{file_num} do not generate right pinyin'.format(
                        file_num=numstr))
                # Nothing to index in an empty item; skip it.
                continue
            if not item[0][-1].isdigit():
                phone = item[0] + '5'
            else:
                # NOTE(review): 'v' is rewritten to 'u' only for syllables
                # that already carry a tone digit, not for those that get
                # '5' appended - confirm this asymmetry is intended.
                phone = item[0].replace('v', 'u')
            new_pinyin_list.append(phone)
        all_pinyin.append(numstr + ' ' + ' '.join(new_pinyin_list))
    all_pinyin_file = os.path.join(output_path, 'all_pinyin.lab')
    with open(all_pinyin_file, 'w') as oid:
        for item in all_pinyin:
            oid.write(item + '\n')

Source File: mtts.py From MTTS with MIT License

6 votes

def _add_lab(txtlines, wav_dir_path):
    '''Write one "<numstr>.lab" pinyin file per input line.

    Each line must have the form "<numstr> <text>" with exactly one space
    (anything else makes the unpacking raise ValueError). "#<digit>" marks
    are stripped from the text, the text is converted with
    pinyin(..., style=Style.TONE3), and every syllable that does not end in
    a digit gets '5' appended. The syllables are written, space-joined, to
    <wav_dir_path>/<numstr>.lab.

    :param txtlines: iterable of "<numstr> <text>" strings
    :param wav_dir_path: directory that receives the .lab files
    '''
    logger = logging.getLogger('mtts')
    for line in txtlines:
        numstr, txt = line.split(' ')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        txt = re.sub(r'#\d', '', txt)
        pinyin_list = pinyin(txt, style=Style.TONE3)
        new_pinyin_list = []
        for item in pinyin_list:
            if not item:
                # The placeholder is named, so the value must be passed by
                # keyword; the positional form raised KeyError instead of
                # logging.
                logger.warning(
                    '{file_num} do not generate right pinyin'.format(
                        file_num=numstr))
                # Nothing to index in an empty item; skip it.
                continue
            phone = item[0]
            if not phone[-1].isdigit():
                phone = phone + '5'
            new_pinyin_list.append(phone)
        lab_file = os.path.join(wav_dir_path, numstr + '.lab')
        with open(lab_file, 'w') as oid:
            oid.write(' '.join(new_pinyin_list))

Source File: word2pinyin.py From chat with MIT License

6 votes

def match_pinyin(pinyin1, pinyin2):
    """Similarity score between two pinyin.

    Counts the positions, over the length of the shorter argument, at which
    both arguments hold the same element, and divides that count by the
    length of the longer argument.
    """
    assert pinyin1 != "", "pinyin1 can not be empty"
    assert pinyin2 != "", "pinyin2 can not be empty"
    # zip stops at the shorter input, which is exactly the compared prefix.
    same_positions = sum(1 for left, right in zip(pinyin1, pinyin2) if left == right)
    longest = max(len(pinyin1), len(pinyin2))
    return same_positions/longest

Source File: word2pinyin.py From chat with MIT License

6 votes

def jaccard_pinyin(pv1, pv2, threshold=0.7):
    """Similarity score between two pinyin vectors with jaccard.

    Builds a score matrix with one row per element of pv1 and one column
    per element of pv2 (each cell is match_pinyin of the pair), hands it to
    sum_cosine together with ``threshold`` and combines the returned
    "total", "total_dif" and "num_not_match" values as
    total / (total + num_not_match * (1 - total_dif)).

    The original author states the score lies in [0, 1].
    """
    score_rows = [
        [match_pinyin(pinyin1, pinyin2) for pinyin2 in pv2]
        for pinyin1 in pv1
    ]
    result = sum_cosine(mat(score_rows), threshold)
    total = result["total"]
    total_dif = result["total_dif"]
    not_matched = result["num_not_match"]
    return total/(total + not_matched*(1-total_dif))

Source File: poetize_plus.py From AI_Poet_Totoro with MIT License

6 votes

def match_tone(ci, tone_ci):
    '''Check whether each character of a word fits the required tone pattern.

    For every character the last character of its style-9 pinyin is
    compared with the pattern entry at the same position: 'x' accepts
    anything, '0' accepts '1' or '2', and '1' accepts '3', '4' or 'i'.
    Every position is examined even after a mismatch has been found.
    '''
    fits = True
    for position, char in enumerate(ci):
        diao = pinyin(char, style=9, errors='ignore')[0][0][-1]
        wanted = tone_ci[position]
        if wanted == 'x':
            continue
        accepted_for_0 = wanted == '0' and diao in ['1', '2']
        accepted_for_1 = wanted == '1' and diao in ['3', '4', 'i']
        if not (accepted_for_0 or accepted_for_1):
            fits = False
    return fits

    # yn:首行是否押韵，0押，1不押

Source File: text_utils.py From pycorrector with Apache License 2.0

6 votes

def get_homophones_by_pinyin(input_pinyin):
    """
    Collect the characters whose pinyin equals the given pinyin.

    Scans code points 0x4E00-0x9FA5 (the CJK Unified Ideographs range, the
    commonly cited 20902 Chinese characters) and keeps each character whose
    first TONE2 reading equals ``input_pinyin``.
    :param input_pinyin: pinyin in TONE2 form, e.g. "zho1ng" for 中
    :return: list of matching characters in code point order
    """
    return [
        chr(code_point)
        for code_point in range(0x4e00, 0x9fa6)
        if pinyin([chr(code_point)], style=pypinyin.TONE2)[0][0] == input_pinyin
    ]

Source File: components.py From glyce with Apache License 2.0

6 votes

def __init__(self, encoding_type, composing_func, embedding_size, hidden_size, num_layers=1):
        # Sets up the sub-character embedding and, for 'LSTM' / 'GRU', a
        # bidirectional recurrent composer. No composer attribute is created
        # for other composing_func values, and no embedding for encoding
        # types other than 'wubi' / 'pinyin'.
        super(SubCharComponent, self).__init__()
        self.encoding_type = encoding_type  # pinyin or wubi
        self.composing_func = composing_func  # composing function: lstm, cnn, avg, max
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Both recurrent variants are built with the same arguments.
        rnn_kwargs = dict(input_size=embedding_size,
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          bidirectional=True)
        if self.composing_func == 'LSTM':
            self.composing = nn.LSTM(**rnn_kwargs)
        elif self.composing_func == 'GRU':
            self.composing = nn.GRU(**rnn_kwargs)

        # One embedding row per entry of the encoding's 'idx2char' table.
        if self.encoding_type == 'wubi':
            vocab_size = len(dict_wubi['idx2char'])
            self.embedding = nn.Embedding(vocab_size, embedding_size)
        elif self.encoding_type == 'pinyin':
            vocab_size = len(dict_pinyin['idx2char'])
            self.embedding = nn.Embedding(vocab_size, embedding_size)

Source File: txt2pinyin.py From style-token_tacotron2 with MIT License

5 votes

def _pre_pinyin_setting():
    ''' Register a custom reading to fix a pinyin error. '''
    phrase_overrides = {'嗯': [['ēn']]}
    load_phrases_dict(phrase_overrides)

Source File: utils.py From DimSim with Apache License 2.0

5 votes

def to_pinyin(utterance):
    """Return put_tone_to_end applied to the first TONE2 reading of each item.

    Exactly len(utterance) items of the pinyin result are used.
    NOTE(review): this assumes pinyin() yields one item per element of
    ``utterance`` - confirm for input that mixes in non-Chinese text.
    """
    length = len(utterance)
    pinyin_encodings = pinyin(utterance, style=Style.TONE2)
    return [
        put_tone_to_end(pinyin_encodings[position][0])
        for position in range(length)
    ]

Source File: mtts.py From style-token_tacotron2 with MIT License

5 votes

def _pre_pinyin_setting():
    ''' Register custom phrase readings to fix pinyin errors. '''
    load_phrases_dict({'嗯': [['ēn']]})
    # The third syllable was registered as 'bià', which is not a pinyin
    # syllable; the reading of 变 in this phrase is 'biàn'.
    load_phrases_dict({'风云变幻': [['fēng'], ['yún'], ['biàn'], ['huàn']]})
    load_phrases_dict({'不破不立': [['bù'], ['pò'], ['bù'], ['lì']]})

Source File: word2pinyin.py From chat with MIT License

5 votes

def similarity_pinyin(sentence1, sentence2):
    """Similarity score between two sentences based on pinyin vectors.

    Each sentence is converted with pinyin_cut and the two results are
    scored with jaccard_pinyin using its default threshold.
    """
    return jaccard_pinyin(pinyin_cut(sentence1), pinyin_cut(sentence2))

Source File: match.py From kog-money with MIT License

5 votes

def to_pinyin(name):
    """Concatenate every reading returned by pinyin(name, 0) into one string."""
    pieces = []
    for readings in pinyin(name, 0):
        pieces.extend(readings)
    return ''.join(pieces)

Source File: pinyin.py From Mcx with GNU General Public License v2.0

5 votes

def checker_full(name, pattern):
    """Tell whether ``pattern`` occurs in the NORMAL-style pinyin of ``name``.

    The first reading of every item is concatenated without separators and
    the containment check ignores case.
    """
    readings = pypinyin.pinyin(name, style=pypinyin.NORMAL)
    needle = pattern.lower()
    haystack = u"".join(reading[0] for reading in readings).lower()
    return needle in haystack

Source File: pinyin.py From Mcx with GNU General Public License v2.0

5 votes

def checker_first_letters(name, pattern):
    """Tell whether ``pattern`` occurs in the FIRST_LETTER-style pinyin of ``name``.

    The first reading of every item is concatenated without separators and
    the containment check ignores case.
    """
    readings = pypinyin.pinyin(name, style=pypinyin.FIRST_LETTER)
    needle = pattern.lower()
    haystack = u"".join(reading[0] for reading in readings).lower()
    return needle in haystack

Source File: pinyin.py From Mcx with GNU General Public License v2.0

5 votes

def checker_initials(name, pattern):
    """Tell whether ``pattern`` occurs in the INITIALS-style pinyin of ``name``.

    The first reading of every item is concatenated without separators and
    the containment check ignores case.
    """
    readings = pypinyin.pinyin(name, style=pypinyin.INITIALS)
    needle = pattern.lower()
    haystack = u"".join(reading[0] for reading in readings).lower()
    return needle in haystack

Source File: gardener.py From Automatic_Speech_Recognition with MIT License

5 votes

def process_dureader(self, data_dir='/media/pony/DLdigest/data/languageModel/dureader-raw/'):
        """
        Turn the Baidu QA Reader dataset into "text,pinyin" lines.

        Every entry of data_dir whose name ends with 'json' is printed and
        read line by line, each line being one JSON record. The record's
        'answers', 'question' and every document's 'paragraphs' and 'title'
        are concatenated, converted with HanziConv.toSimplified, passed
        through filter_punctuation and split on spaces. Pieces longer than
        one character (after stripping) are appended, with their
        space-joined pinyin, to numbered text files under
        <self.save_dir>/dureader. A KeyError raised while handling a record
        abandons the rest of that record.
        """
        save_dir = os.path.join(self.save_dir, 'dureader')
        check_path_exists(save_dir)
        count = 0
        for entry in os.scandir(data_dir):
            if not entry.name.endswith('json'):
                continue
            print(entry.path)
            with open(entry.path, 'r') as records:
                for line in records:
                    contents = json.loads(line)
                    pieces = []
                    try:
                        pieces.append(''.join(contents['answers']))
                        pieces.append(contents['question'])
                        for doc in contents['documents']:
                            paragraphs = ''.join(doc['paragraphs'])
                            title = doc['title']
                            pieces.append(paragraphs)
                            pieces.append(title)
                        merged = HanziConv.toSimplified(''.join(pieces).replace('\n', ''))
                        cleaned = filter_punctuation(merged)
                        for chunk in cleaned.split(' '):
                            if len(chunk.strip()) <= 1:
                                continue
                            pys = ' '.join(np.array(pinyin(chunk)).flatten())
                            count += 1
                            out_path = os.path.join(save_dir, str(count//400000+1)+'.txt')
                            with open(out_path, 'a') as out_file:
                                out_file.write(chunk+','+pys+'\n')
                    except KeyError:
                        continue

Source File: harvesttext.py From HarvestText with MIT License

5 votes

def get_pinyin_correct_candidates(self, word, tolerance=1):  # by default tolerate at most one changed syllable
        # Returns the mentions registered in self.pinyin_mention_dict under
        # the pinyin of `word`, or (with tolerance == 1) under any variant
        # that swaps a single syllable for one of its neighbours in
        # self.pinyin_adjlist.
        assert tolerance in [0, 1]
        syllables = lazy_pinyin(word)
        base = syllables[:]
        candidate_keys = {tuple(syllables)}
        if tolerance == 1:
            for pos, syllable in enumerate(syllables):
                if syllable not in self.pinyin_adjlist:
                    continue
                # One variant per neighbour of the syllable at this position.
                candidate_keys |= {
                    tuple(base[:pos] + [neighbour] + base[pos + 1:])
                    for neighbour in self.pinyin_adjlist[syllable]
                }
        # Keep only the pinyin tuples that are actually registered.
        known_keys = candidate_keys & set(self.pinyin_mention_dict.keys())
        mentions = set()
        for key in known_keys:
            mentions |= self.pinyin_mention_dict[key]
        return list(mentions)

Source File: format_util.py From betterlifepsi with MIT License

5 votes

def _get_pinyin_all(existing_combinations, characters):
    """
    Build every combination of pinyin first letters for some characters.

    The characters are handled one at a time. pinyin() is called with
    heteronym=True and returns nested lists such as [['a'], ['b']], so two
    loop levels are needed to reach each first letter. Every first letter
    of the current character is appended to a copy of every combination
    built so far (or starts a new combination when there are none yet).
    :param existing_combinations: combinations for already handled characters
    :param characters: characters still to be handled
    :return: list of combinations, each a list of first letters; when
        ``characters`` is empty the ``existing_combinations`` object itself
    """
    combinations = existing_combinations
    remaining = characters
    while True:
        head, rest = remaining[0:1], remaining[1:]
        if len(head) == 0:
            return combinations
        readings = pinyin(head, style=pypinyin.FIRST_LETTER, heteronym=True)
        extended = []
        for group in readings:
            for letter in group:
                if len(combinations) > 0:
                    for combo in combinations:
                        grown = combo[:]
                        grown.append(letter)
                        extended.append(grown)
                else:
                    grown = combinations[:]
                    grown.append(letter)
                    extended.append(grown)
        combinations = extended
        remaining = rest

Source File: phoneme_tokenizer.py From espnet with Apache License 2.0

5 votes

def pypinyin_g2p(text) -> List[str]:
    """Return the first TONE3-style reading of every item pinyin() yields for ``text``."""
    from pypinyin import pinyin
    from pypinyin import Style

    phones = []
    for readings in pinyin(text, style=Style.TONE3):
        phones.append(readings[0])
    return phones

Python pypinyin.pinyin() Examples