Python Examples of pypinyin.lazy_pinyin

Source File: main.py From HanTTS with MIT License

7 votes

def speak(self, text):
    """Play ``text`` aloud, one pre-recorded syllable clip at a time.

    The text is converted to TONE3 pinyin and printed. Every character listed
    in ``TextToSpeech.punctuation`` is stripped from each token, and tokens
    made only of digits are spelled out via ``atc.num2chinese`` and converted
    to pinyin themselves. Each resulting syllable is handed to
    ``TextToSpeech._play_audio`` on its own thread, together with a delay
    value that grows by 0.355 per syllable.
    """
    syllables = lazy_pinyin(text, style=pypinyin.TONE3)
    print(syllables)

    def preprocess(tokens):
        # Strip punctuation and expand digit runs into spoken syllables.
        cleaned = []
        for token in tokens:
            for mark in TextToSpeech.punctuation:
                token = token.replace(mark, "")
            if not token.isdigit():
                cleaned.append(token)
                continue
            spoken = atc.num2chinese(token)
            cleaned.extend(lazy_pinyin(spoken, style=pypinyin.TONE3))
        return cleaned

    offset = 0
    for name in preprocess(syllables):
        clip = "syllables/" + name + ".wav"
        # NOTE(review): presumably _play_audio waits `offset` seconds before
        # playing the clip -- confirm against its definition.
        _thread.start_new_thread(TextToSpeech._play_audio, (clip, offset))
        offset += 0.355

Source File: main.py From HanTTS with MIT License

6 votes

def synthesize(self, text, src, dst):
    """
    Synthesize a .wav file from text.

    text is the text to convert (via TONE3 pinyin) into audio
    src is the folder that contains all syllables .wav files
    dst is the destination folder to save the synthesized file

    Both folders may be given with or without a trailing path separator.
    The result is written to "generated.wav" inside dst; nothing is returned.
    """
    def as_prefix(folder):
        # Leave "" and folders that already end in a separator untouched so
        # existing callers get exactly the same paths as before; otherwise
        # add the separator the plain string concatenation used to omit.
        if not folder or folder.endswith(("/", os.sep)):
            return folder
        return folder + os.sep

    print("Synthesizing ...")
    delay = 0
    increment = 355  # milliseconds between consecutive syllable onsets
    pause = 500  # milliseconds of silence used for punctuation
    syllables = lazy_pinyin(text, style=pypinyin.TONE3)
    src_prefix = as_prefix(src)

    # initialize to be complete silence, each character takes up ~500ms
    result = AudioSegment.silent(duration=500 * len(text))
    for syllable in syllables:
        # punctuation marks only advance the cursor (silence over silence)
        if syllable in TextToSpeech.punctuation:
            short_silence = AudioSegment.silent(duration=pause)
            result = result.overlay(short_silence, position=delay)
            delay += increment
            continue
        path = src_prefix + syllable + ".wav"
        # skip sound file that doesn't exist (the cursor is NOT advanced)
        if not Path(path).is_file():
            continue
        segment = AudioSegment.from_wav(path)
        result = result.overlay(segment, position=delay)
        delay += increment

    directory = as_prefix(dst)
    # An empty dst means "current directory": there is nothing to create.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)

    result.export(directory + "generated.wav", format="wav")
    print("Exported.")

Source File: generate.py From GST-Tacotron with MIT License

6 votes

def _pinyin(s):
    """Return the TONE2 pinyin of ``s`` as one space-separated string.

    Tokens equal to a single space are dropped, every other token keeps only
    digits, lowercase ASCII letters and spaces, and runs of consecutive
    spaces in the joined result are collapsed to a single space.
    """
    allowed = '0123456789abcdefghijklmnopqrstuvwxyz '
    cleaned = [
        ''.join(ch for ch in token if ch in allowed)
        for token in lazy_pinyin(s, style=Style.TONE2)
        if token != ' '
    ]
    joined = ' '.join(cleaned)
    # Pair every character with its successor (a non-space sentinel follows
    # the last one) and drop a space whenever the next character is a space.
    return ''.join(
        cur
        for cur, nxt in zip(joined, joined[1:] + '\n')
        if not (cur == ' ' and nxt == ' ')
    )

Source File: chinese_to_pinyin.py From style-token_tacotron2 with MIT License

6 votes

def transform_chinese_to_pinyin(data_path, output_path,type='corpus'):
    """Convert a UTF-8 text file of Chinese lines into TONE2 pinyin lines.

    data_path: input file, read as bytes and decoded as UTF-8.
    output_path: output file, always (re)created.
    type: 'corpus' converts each whole line; 'training_data' expects
        "index|text" lines and converts only the text part. Any other value
        leaves the output file empty.

    Blank lines are skipped. A 'training_data' line that does not contain
    exactly one '|' raises ValueError.
    """
    with open(data_path, 'rb') as fin, open(output_path, 'wb') as fout:
        if type not in ('corpus', 'training_data'):
            return
        for raw in fin:
            text = raw.decode('utf-8').strip('\r\n ')
            if not text:
                continue
            if type == 'corpus':
                transformed_line = ' '.join(lazy_pinyin(text, style=Style.TONE2))
                record = f'{transformed_line}\n'
            else:
                index, chinese_text = text.split('|')
                pinyin_text = ' '.join(lazy_pinyin(chinese_text, style=Style.TONE2))
                record = f'{index}|{pinyin_text}\n'
            fout.write(record.encode('utf-8'))

Source File: pg2.py From aca with MIT License

6 votes

def check_homepage_validity(name, res):
    """
    Check if the homepage satisfies the basic rules.

    Input: name - name of the expert; res - homepage info, a sequence of
    exactly four items (title, url, detail, cited).

    Returns False when the url ends with 'pdf' or 'doc', when it mentions
    linkedin / researchgate / citations, or when none of the space-separated
    parts of the name occurs in the pinyin form of the title. Returns True
    otherwise (including when the name has no usable parts).
    """
    title, url, detail, cited = res
    lowered_url = url.lower()
    blocked_sites = ('linkedin', 'researchgate', 'citations')
    if url.endswith(('pdf', 'doc')) or any(site in lowered_url for site in blocked_sites):
        return False

    # to check if the title contains the name
    title = ' '.join(lazy_pinyin(title))
    name = name.replace('?', '')
    # Escape every part so regex metacharacters in a name are matched
    # literally, and drop empty parts (e.g. from double spaces): an empty
    # alternative would match any title and defeat the check.
    parts = [re.escape(part) for part in name.lower().split(' ') if part]
    if not parts:
        # Nothing to look for; stay permissive as before.
        return True
    pattern = re.compile('|'.join(parts))
    if pattern.search(title.lower()) is None:
        return False
    return True

Source File: get_databaker_data.py From NeMo with Apache License 2.0

6 votes

def __convert_transcript(raw_transcript):
    """
    Converts a raw transcript record to a (wav file name, pinyin sequence) pair.

    The record is tab-separated; its first field is the wave id and its
    second field the Chinese transcript. Only characters accepted by
    ``__is_chinese`` and the symbols ",.!?" survive (after
    ``__replace_symbols``). Each TONE3 pinyin token that is neither found in
    the symbol string nor ends in a digit gets a "0" appended.
    """
    waveid, raw_trans = raw_transcript.split("\t")[:2]
    symbols = ",.!?"
    # For simplicity, we only retain the Chinese chars and symbols
    kept = []
    for ch in __replace_symbols(raw_trans):
        if __is_chinese(ch) or ch in symbols:
            kept.append(ch)
    tokens = lazy_pinyin(''.join(kept), style=Style.TONE3)
    pinyin_trans = [
        tok if (tok in symbols or tok[-1].isdigit()) else tok + "0"
        for tok in tokens
    ]
    return waveid + ".wav", " ".join(pinyin_trans)

Source File: word2pinyin.py From chat with MIT License

5 votes

def pinyin_cut(sentence, pattern=None):
    """Cut the sentence into phonetic vectors.

    Returns the result of ``lazy_pinyin(sentence)`` unchanged. ``pattern``
    is accepted but not used.
    """
    phonetic = lazy_pinyin(sentence)
    return phonetic

# @time_me()

Source File: tts.py From parrots with Apache License 2.0

5 votes

def synthesize(self, input_text='', output_wav_path=''):
    """
    Synthesize .wav from text
    input_text: the text to synthesize; syllable clips are looked up in
        self.syllables_dir
    output_wav_path: path of the .wav file to write ('out.wav' when empty)

    Returns the assembled audio segment after exporting it.
    """
    syllables = lazy_pinyin(input_text, style=pypinyin.TONE3)
    step_ms = 355  # milliseconds between consecutive syllable onsets
    pause_ms = 500  # milliseconds of silence for punctuation
    cursor = 0

    # start from pure silence, budgeting ~500ms per input character
    result = AudioSegment.silent(duration=500 * len(input_text))
    for syllable in syllables:
        wav = os.path.join(self.syllables_dir, syllable + ".wav")
        if syllable in self.punctuation:
            # punctuation: overlay a short silence and advance the cursor
            gap = AudioSegment.silent(duration=pause_ms)
            result = result.overlay(gap, position=cursor)
            cursor += step_ms
        elif Path(wav).is_file():
            # clips without a sound file are skipped without advancing
            result = result.overlay(AudioSegment.from_wav(wav), position=cursor)
            cursor += step_ms

    output_wav_path = output_wav_path or 'out.wav'
    result.export(output_wav_path, format="wav")
    default_logger.debug("Exported:" + output_wav_path)
    return result

Source File: tts.py From parrots with Apache License 2.0

5 votes

def speak(self, text):
    """Play ``text`` aloud from per-syllable .wav clips and wait until done.

    The text is converted to TONE3 pinyin. Characters in ``self.punctuation``
    are stripped from each token, and digit-only tokens are spelled out via
    ``num2chinese`` and converted to pinyin. For every syllable whose clip
    exists in ``self.syllables_dir`` a thread running ``self._play_audio``
    is created with a delay value that grows by 0.355 per clip. All threads
    are started and then all of them are joined.
    """
    syllables = lazy_pinyin(text, style=pypinyin.TONE3)
    default_logger.debug(syllables)
    delay = 0

    def preprocess(syllables):
        # Strip punctuation and expand digit runs into spoken syllables.
        temp = []
        for syllable in syllables:
            for p in self.punctuation:
                syllable = syllable.replace(p, '')
            if syllable.isdigit():
                syllable = num2chinese(syllable)
                temp.extend(lazy_pinyin(syllable, style=pypinyin.TONE3))
            else:
                temp.append(syllable)
        return temp

    threads = []
    for syllable in preprocess(syllables):
        path = os.path.join(self.syllables_dir, syllable + ".wav")
        if not os.path.exists(path):
            continue
        threads.append(threading.Thread(target=self._play_audio, args=(path, delay)))
        delay += 0.355
    for t in threads:
        t.start()
    # Join every thread: joining only the last one could return while
    # earlier clips were still playing, and failed outright when no clip
    # existed (the loop variable was never bound).
    for t in threads:
        t.join()

Source File: biaobei_cleaner.py From style-token_tacotron2 with MIT License

5 votes

def biaobei_cleaner(input_path, output_path):
    """Rewrite a UTF-8 text file as numbered TONE2 pinyin lines.

    Every input line is stripped of surrounding '\\r', '\\n' and spaces,
    converted to space-joined pinyin, passed through each (regex,
    replacement) pair of ``_replacement_expression`` and written as
    "NNNNN|text" with a 1-based, zero-padded five-digit line number.
    """
    with open(input_path, 'rb') as fin, open(output_path, 'wb') as fout:
        encoded = []
        for index, raw in enumerate(fin, start=1):
            text = raw.decode('utf-8').strip('\r\n ')
            text = ' '.join(lazy_pinyin(text, style=Style.TONE2))
            for regex, replacement in _replacement_expression:
                text = re.sub(regex, replacement, text)
            encoded.append('{:05d}|{}\n'.format(index, text).encode('utf-8'))
        # everything is buffered and written in one go after the last line
        fout.writelines(encoded)

Source File: pg2.py From aca with MIT License

5 votes

def check_name_in_text(name, text):
    """
    Return the fraction (0..1) of the parts of ``name`` found in ``text``.

    The name is split on spaces and hyphens; the text is converted with
    ``lazy_pinyin`` and compared case-insensitively. A name with no usable
    parts scores 0.0.

    Sample: for the name of "Bai Li",
    www.xx.com/li.jpg get 0.5
    www.xx.org/bai_li.jpg get 1
    www.xx.org.avatar.jpg get 0
    """
    haystack = ' '.join(lazy_pinyin(text)).lower()
    # Use the same split for the numerator and the denominator (a hyphenated
    # name used to be able to score above 1), and ignore empty parts, which
    # would otherwise count as always-found.
    parts = [part.lower() for part in re.split(r'[ -]', name) if part]
    if not parts:
        return 0.0
    hits = sum(1 for part in parts if part in haystack)
    return hits / len(parts)

Source File: fenbian.py From dayworkspace with GNU Lesser General Public License v3.0

5 votes

def get_pitch(hans):
    '''Get the tones of a piece of Chinese text.

        Args:    Chinese characters, a sentence or a phrase
        Returns: a tuple of tones, one per pinyin token. Values 1, 2, 3, 4
                 stand for the first to fourth tone and 0 for the neutral
                 tone. A token that does not end in a digit (including an
                 empty token) yields 0.
    '''
    finals = lazy_pinyin(hans, style=Style.FINALS_TONE3)
    # e.g. ['en2', 'uei3'] or ['uo1', 'i']
    # py[-1:] is '' for an empty token, so it is treated as neutral tone
    # instead of raising IndexError.
    return tuple(int(py[-1]) if py[-1:].isdigit() else 0 for py in finals)

Source File: BibTexEntries.py From CNKI_2_BibTeX with MIT License

5 votes

def generateIDInTitleFormat(self, cnkiNetEntry):
    """Set ``self.ID`` from the entry's title.

    Digits and the characters "_", "," and ";" are removed from
    ``cnkiNetEntry["Title"]`` first. When ``self.__isFullEnglish`` accepts
    the title, the ID is the first four space-separated words concatenated.
    Otherwise spaces (ASCII and U+3000) are removed, the title is segmented
    with jieba, and the first three segments are passed to ``pinyin`` and
    joined.
    """
    title = cnkiNetEntry["Title"]
    for pattern in (r"[0-9]", r"[_,;]"):
        title = re.sub(pattern, "", title)

    if not self.__isFullEnglish(title):
        jieba.setLogLevel(logging.INFO)
        compact = title.replace(" ", "").replace(u"\u3000", "")
        leading = list(jieba.cut(compact))[:3]
        self.ID = "".join(pinyin("".join(leading)))
        return

    self.ID = "".join(title.strip().split(" ")[:4])

Source File: BibTexEntries.py From CNKI_2_BibTeX with MIT License

5 votes

def generateIDInNameYearFormat(self, cnkiNetEntry):
    """Set ``self.ID`` to "<first author><year>".

    The first author is the text of ``cnkiNetEntry["Author"]`` before the
    first ";", "," or fullwidth "，", with ASCII and U+3000 spaces removed.
    When ``self.__isFullEnglish`` accepts the name it is used verbatim;
    otherwise each item returned by ``pinyin(name)`` is title-cased and the
    items are joined. ``cnkiNetEntry["Year"]`` is appended.
    """
    name = cnkiNetEntry["Author"]
    for separator in (";", ",", "，"):
        name = name.split(separator)[0]
    for blank in (" ", u"\u3000"):
        name = name.replace(blank, "")
    year = cnkiNetEntry["Year"]

    if self.__isFullEnglish(name):
        prefix = name
    else:
        prefix = "".join([i.title() for i in pinyin(name)])
    self.ID = prefix + year

Source File: harvesttext.py From HarvestText with MIT License

5 votes

def get_pinyin_correct_candidates(self, word, tolerance=1):  # by default tolerate at most one changed pinyin
    """Return known mentions whose pinyin equals, or nearly equals, ``word``'s.

    tolerance must be 0 or 1 (asserted). With 1, every variant obtained by
    replacing a single syllable with one of its entries in
    ``self.pinyin_adjlist`` is also considered. Only pinyin tuples present
    in ``self.pinyin_mention_dict`` contribute; their mention sets are
    merged and returned as a list.
    """
    assert tolerance in [0, 1]
    base = lazy_pinyin(word)
    variants = {tuple(base)}
    if tolerance == 1:
        for pos, syllable in enumerate(base):
            if syllable not in self.pinyin_adjlist:
                continue
            variants.update({
                tuple(base[:pos] + [neighbour] + base[pos + 1:])
                for neighbour in self.pinyin_adjlist[syllable]
            })
    known = variants & set(self.pinyin_mention_dict.keys())
    mentions = set().union(*(self.pinyin_mention_dict[key] for key in known))
    return list(mentions)

Source File: harvesttext.py From HarvestText with MIT License

5 votes

def build_trie(self, new_word, entity, entity_type):
    """Register ``new_word`` as a mention of ``entity`` in the trie.

    The type is stored as "#<entity_type>#". A type not seen before must not
    contain punctuation (an Exception is raised otherwise); adding it resets
    ``self.prepared`` and ``self.hanlp_prepared``. The word is added to
    ``self.mentions`` and to ``self.pinyin_mention_dict`` under its pinyin
    tuple, and then inserted character by character into ``self.trie_root``.
    The final node's 'leaf' set keeps at most one (entity, type) pair per
    entity.
    """
    type0 = "#%s#" % entity_type
    if type0 not in self.entity_types:
        punct_regex = r"[、！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏!\"\#$%&\'\(\)\*\+,-\./:;<=>?@\[\\\]\^_`{\|}~]"
        found = re.search(punct_regex, entity_type, re.MULTILINE | re.UNICODE)
        if found:
            punct0 = found.group()
            raise Exception("Your type input '{}' includes punctuation '{}', please remove them first".format(entity_type,punct0))
        self.entity_types.add(type0)
        self.prepared = False
        self.hanlp_prepared = False

    self.mentions.add(new_word)
    self.pinyin_mention_dict[tuple(lazy_pinyin(new_word))].add(new_word)

    # Walk down the trie, creating missing child nodes on the way.
    node = self.trie_root
    for ch in new_word:
        node = node.setdefault(ch, {})

    if 'leaf' not in node:
        node['leaf'] = {(entity, type0)}
        return

    leaf = node['leaf']
    # the same entity is not allowed to have different types
    stale = [pair for pair in leaf if pair[0] == entity]
    for pair in stale:
        leaf.remove(pair)
    leaf.add((entity, type0))

Source File: entity_discoverer.py From HarvestText with MIT License

5 votes

def get_pinyin_correct_candidates(self, word, tolerance):  # tolerate at most one changed pinyin
    """Collect mentions whose pinyin matches ``word``'s within ``tolerance``.

    tolerance must be 0 or 1 (asserted). With 1, each syllable may be
    swapped for any of its entries in ``self.pinyin_adjlist``, one swap at a
    time. Candidate pinyin tuples that are keys of
    ``self.pinyin_mention_dict`` contribute their mention sets; the union is
    returned as a list.
    """
    assert tolerance in [0, 1]
    syllables = lazy_pinyin(word)
    candidates = {tuple(syllables)}
    if tolerance == 1:
        for idx, syl in enumerate(syllables):
            if syl in self.pinyin_adjlist:
                head, tail = syllables[:idx], syllables[idx + 1:]
                candidates |= {tuple(head + [alt] + tail) for alt in self.pinyin_adjlist[syl]}
    matched = candidates & set(self.pinyin_mention_dict.keys())
    result = set()
    for key in matched:
        result |= self.pinyin_mention_dict[key]
    return list(result)

Source File: common.py From lufly-im with Apache License 2.0

5 votes

def get_full(word: str) -> List[str]:
    """Return the pinyin tokens of ``word``.

    Raises RuntimeError at the first character of any token that is not a
    lowercase ASCII letter.
    """
    lowercase = "abcdefghijklmnopqrstuvwxyz"
    fulls = []
    for full in lazy_pinyin(word):
        offending = next((ch for ch in full if ch not in lowercase), None)
        if offending is not None:
            e = offending
            raise RuntimeError(f"{e} not alphe, word is: {word}")
        fulls.append(full)
    return fulls

Source File: utils.py From slack_bot with MIT License

5 votes

def to_pinyin(word):
    """Return the pinyin of ``word`` concatenated without separators.

    Byte strings are decoded as UTF-8 first; text strings are used as is.
    """
    # `bytes` exists on Python 2 (as an alias of str) and Python 3, whereas
    # the `unicode` builtin is a NameError on Python 3.
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    return ''.join(lazy_pinyin(word))

Source File: corrector.py From pycorrector with Apache License 2.0

5 votes

def _confusion_word_set(self, word):
    """Return the known words that share ``word``'s pinyin.

    Candidates come from ``self.known(edit_distance_word(word,
    self.cn_char_set))``; a candidate is kept when its ``lazy_pinyin``
    result equals that of ``word``.
    """
    candidate_words = list(self.known(edit_distance_word(word, self.cn_char_set)))
    # The target pinyin does not depend on the candidate: compute it once
    # instead of once per candidate.
    target_pinyin = lazy_pinyin(word)
    return {
        candidate_word
        for candidate_word in candidate_words
        if lazy_pinyin(candidate_word) == target_pinyin  # same pinyin
    }

Source File: parser.py From chinese-rhymer with MIT License

5 votes

def word_parser(word: str) -> List[Tuple[str, List[str]]]:
    """Convert ``word`` to pinyin and return ``pinyin_parser``'s result for it."""
    return pinyin_parser(lazy_pinyin(word))

Source File: pipelines.py From poi_spider with Apache License 2.0

4 votes

def process_item(self, item, spider):
    """Store the POI results carried by ``item`` in a SQL table.

    item: mapping with at least 'poly' and 'results'; when 'results' is
        non-empty it must also provide 'search_word', 'region' and
        'requests_url'.
    spider: unused here.

    Each result becomes one row of strings. Rows get an 'isin_region' column
    telling whether the result's location lies inside ``item['poly']`` (999
    when that check fails) and are appended to the table named by the global
    ``MYSQL_TableName``, which is initialised from the region on first use.

    Returns the item so that later pipeline stages receive it.
    """
    global MYSQL_TableName
    poly = item['poly']
    if item['results']:
        keys1 = ['name', 'province', 'city', 'area', 'address', 'telephone', 'uid', 'street_id', 'detail',
                 'detail_info', 'location']
        keys2 = ['detail_url', 'tag', 'type']
        keys3 = ['search_word', 'region', 'requests_url']

        rows = []
        for result in item['results']:
            row = [result.get(key) for key in keys1]

            # keys2 live inside the optional nested 'detail_info' mapping
            detail_info = result.get('detail_info')
            for key in keys2:
                row.append(None if detail_info is None else detail_info.get(key))

            row.extend(item[key] for key in keys3)
            rows.append([str(x) for x in row])
            print('获取到的pois:%s' % row[0])

        df = DataFrame(rows, columns=keys1 + keys2 + keys3)
        # the region is used verbatim; pinyin conversion is not applied
        region_pinyin = str(item['region'])

        # Check whether each point lies inside the given poly region, using
        # the polygon's contains() with a Point.
        try:
            # SECURITY NOTE(review): eval() runs on the stringified
            # 'location' value, which originates from scraped data. Consider
            # a literal-only parser (e.g. ast.literal_eval) once the value
            # format is confirmed.
            df['isin_region'] = df['location'].apply(
                lambda x: poly.contains(Point(float(eval(x)['lng']), float(eval(x)['lat']))))
        except Exception as e:
            logging.info(e)
            df['isin_region'] = 999

        if MYSQL_TableName == "":
            MYSQL_TableName = '{region}_bd_map_pois'.format(region=region_pinyin)
        df.to_sql(MYSQL_TableName, engine, if_exists='append',
                  index=False)
    # An item pipeline has to hand the item on; returning None would drop it
    # for every later pipeline stage.
    return item

Source File: __init__.py From TWchat with MIT License

4 votes

def start():
    """Register the itchat message handlers, build the UI, log in and run it.

    The handlers and click callbacks below are closures over ``wechatMain``,
    which is only assigned further down, after the banner is printed.
    """
    # Handler for non-group messages of any of the listed content types.
    @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING,PICTURE, RECORDING, ATTACHMENT, VIDEO,FRIENDS])
    def recive_contact_msg(msg):
        contact_name = get_contact_name(msg)
        try:
            wechatMain.recive_message(msg,contact_name)
            notify('TWchat',"new message from: "+contact_name)
        except AttributeError:
            # NOTE(review): AttributeError is silently ignored -- presumably
            # to tolerate messages arriving before the UI is ready; confirm.
            pass
    
    # Handler for TEXT messages received in group chats.
    @itchat.msg_register(TEXT, isGroupChat=True)
    def recive_group_msg(msg):
        group_name = get_group_name(msg)
        try:
            wechatMain.recive_message(msg,group_name)
            notify('TWchat',"new message from: "+group_name)
        except AttributeError:
            # Same best-effort handling as for contact messages.
            pass
        return   

    # Click callback for an entry of the contact list: info[0] and info[1]
    # are forwarded as-is; the chat is added, made current and shown.
    def on_contact_item_click(button,info):
        wechatMain.chatListBox.addNewChat(info[0],info[1])
        wechatMain.set_current_chat(info[0],info[1])
        wechatMain.chatListBox.show_chat()
        return 
    # Click callback for an entry of the chat list: only switches the
    # current chat.
    def on_chat_item_click(button,info):
        wechatMain.set_current_chat(info[0],info[1])
        return
    # Colour palette passed to the UI: (name, foreground, background).
    palette = [
        ('left', 'black', 'light gray'),
        ('right', 'black', 'dark cyan'),
        ('button', 'dark green','black'),
        ('mybg', 'black','dark cyan'),
        ('tobg', 'dark blue','light gray'),
        ('edit', 'dark cyan','black'),
        ('bg', 'dark green', 'black'),]
    # Start-up banner.
    print ('''
 _____  _    _  _____  _   _   ___   _____ 
|_   _|| |  | |/  __ \| | | | / _ \ |_   _|
  | |  | |  | || /  \/| |_| |/ /_\ \  | |  
  | |  | |/\| || |    |  _  ||  _  |  | |  
  | |  \  /\  /| \__/\| | | || | | |  | |  
  \_/   \/  \/  \____/\_| |_/\_| |_/  \_/  
            ''')

    wechatMain = wegui.WechatMain(palette)
    itchat.auto_login(enableCmdQR=2,hotReload=True)
    # NOTE(review): blockThread=False presumably keeps itchat's receive loop
    # from blocking this thread -- confirm against the itchat documentation.
    itchat.run(blockThread=False)
    userInfo =itchat.web_init()['User']
    owner_id = userInfo['UserName']
    owner_name = userInfo['NickName']
    contactlist= itchat.get_friends(update=True)
    chatlist = itchat.get_chatrooms()
    #contactlist = sorted(contactlist,key=lambda x:(x['RemarkPYInitial'],x['PYInitial']))
    # Sort contacts by the pinyin syllable list of their display name.
    contactlist = sorted(contactlist,key=lambda x:(lazy_pinyin(get_name(x))))
    wechatMain.initUserInfo(owner_id,owner_name,on_contact_item_click,on_chat_item_click,contactlist,chatlist)
    wechatMain.bind_itchat(itchat)
    wechatMain.createLoop()

Source File: entity_discoverer.py From HarvestText with MIT License

4 votes

def postprocessing(self, partition, pinyin_tolerance, pop_words_cnt):
        """Apply patterns to fix some minor issues.

        Entries of ``self.id2word`` have the form "<entity>_<type>" (split at
        the last underscore). For each entry, ``partition`` is updated in
        place so that the entry shares the partition value of its trimmed
        form (pattern matching) and of same-type entries with matching
        pinyin (pinyin recheck).

        :param partition: mapping from entity id to partition value; mutated
        :param pinyin_tolerance: None to skip the pinyin recheck, otherwise
            the tolerance passed to ``get_pinyin_correct_candidates``
        :param pop_words_cnt: container of popular words, indexed to obtain
            a count -- presumably a word -> count mapping; confirm
        :return: partition, pattern_entity2mentions
        """
        # simple postfix like removing parenthesis
        # the "+?" pattern matches lazily so that the longer suffix
        # (e.g. "新区") can be matched instead of only the shorter one
        re_patterns = {
            "parenthesis": (None, re.compile(r"[\[{\(<#【（《](\S+?)[\]}\)>#】）》]")),
            "person_postfix": ({"人名"}, re.compile(r"^(\S+?)(哥|姐|先生|女士|小姐|同志|同学|老师|教授)$")),
            "district": ({"地名"}, re.compile(r"^(\S+?)(国|省|市|区|县|村|镇|古镇|新区|特区|自治区|特别行政区|帝国|王国|共和国)$")),
            "organization": ({"地名", "机构名"}, re.compile(r"^(\S+?)(厂|公司|有限公司|协会|基金会|俱乐部|队|国家队|集团|联盟)$")),
        }
        pattern_entity2mentions = defaultdict(set)
        if pinyin_tolerance is not None:
            # Rebuild the pinyin -> entity-name index used by the recheck.
            self.pinyin_mention_dict = defaultdict(set)
            for entity_type in self.id2word:
                new_word = entity_type[:entity_type.rfind("_")]
                self.pinyin_mention_dict[tuple(lazy_pinyin(new_word))].add(new_word)

        for eid1, entity_type in enumerate(self.id2word):
            tmp = entity_type.rfind("_")
            entity, etype = entity_type[:tmp], entity_type[tmp + 1:]
            # pattern_matching
            for pname, (allow_types, pat) in re_patterns.items():
                # allow_types None means the pattern applies to every type
                if (allow_types is None or (etype in allow_types)) and re.match(pat, entity):
                    # keep only the first capture group (the trimmed name)
                    trim_entity = re.sub(pat, r"\1", entity)
                    entity2 = trim_entity + "_" + etype
                    if entity2 in self.word2id:
                        eid2 = self.word2id[entity2]
                        partition[eid1] = partition[eid2]
                    if (pname in ["district", "organization"]) and len(trim_entity) > 1:
                        if trim_entity in self.mentions or trim_entity in pop_words_cnt:
                            pattern_entity2mentions[entity_type].add(trim_entity)
                            if trim_entity not in self.mention_count:
                                # NOTE(review): reached with trim_entity in
                                # self.mentions but possibly absent from
                                # pop_words_cnt -- confirm that indexing is
                                # safe for missing keys.
                                self.mention_count[trim_entity] = pop_words_cnt[trim_entity]

            # pinyin recheck
            if pinyin_tolerance is not None:
                candidates = self.get_pinyin_correct_candidates(entity, pinyin_tolerance)
                for cand in candidates:
                    entity2 = cand + "_" + etype
                    if entity2 in self.word2id:
                        eid2 = self.word2id[entity2]
                        partition[eid1] = partition[eid2]

        return partition, pattern_entity2mentions

Python pypinyin.lazy_pinyin() Examples