Python pypinyin.lazy_pinyin() Examples

The following are 24 code examples of pypinyin.lazy_pinyin(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pypinyin, or try the search function.
Example #1
Source File: main.py    From HanTTS with MIT License 7 votes vote down vote up
def speak(self, text):
        """Play ``text`` by scheduling one playback thread per pinyin syllable.

        The text is converted to TONE3 pinyin and printed.  Punctuation is then
        stripped from every syllable, all-digit syllables are spelled out via
        ``atc.num2chinese`` and converted to pinyin again, and each resulting
        syllable's ``syllables/<syllable>.wav`` path is handed to
        ``TextToSpeech._play_audio`` together with a start delay that grows by
        0.355 per syllable.
        """
        syllables = lazy_pinyin(text, style=pypinyin.TONE3)
        print(syllables)

        def preprocess(syllables):
            """Strip punctuation and expand numeric syllables into pinyin."""
            cleaned = []
            for raw in syllables:
                for mark in TextToSpeech.punctuation:
                    raw = raw.replace(mark, "")
                if not raw.isdigit():
                    cleaned.append(raw)
                    continue
                spoken = atc.num2chinese(raw)
                cleaned.extend(lazy_pinyin(spoken, style=pypinyin.TONE3))
            return cleaned

        offset = 0
        for name in preprocess(syllables):
            wav_path = "syllables/" + name + ".wav"
            _thread.start_new_thread(TextToSpeech._play_audio, (wav_path, offset))
            offset += 0.355
Example #2
Source File: main.py    From HanTTS with MIT License 6 votes vote down vote up
def synthesize(self, text, src, dst):
        """
        Synthesize a .wav file from text.

        text is the text to convert (through TONE3 pinyin syllables)
        src is the folder that contains all syllables .wav files
        dst is the destination folder; the result is written to
        <dst>/generated.wav and the folder is created when it is missing

        Syllables without a matching .wav file in src are skipped.
        """
        print("Synthesizing ...")
        delay = 0
        increment = 355 # milliseconds
        pause = 500 # pause for punctuation
        syllables = lazy_pinyin(text, style=pypinyin.TONE3)

        # initialize to be complete silence, each character takes up ~500ms
        result = AudioSegment.silent(duration=500*len(text))
        for syllable in syllables:
            # os.path.join keeps this correct whether or not src ends with a
            # path separator (plain concatenation silently required one).
            path = os.path.join(src, syllable + ".wav")
            sound_file = Path(path)
            # overlay a silent segment for punctuation marks
            # NOTE(review): the cursor advances by `increment`, not `pause`,
            # exactly as before - confirm whether a longer gap was intended.
            if syllable in TextToSpeech.punctuation:
                short_silence = AudioSegment.silent(duration=pause)
                result = result.overlay(short_silence, position=delay)
                delay += increment
                continue
            # skip sound file that doesn't exist
            if not sound_file.is_file():
                continue
            segment = AudioSegment.from_wav(path)
            result = result.overlay(segment, position=delay)
            delay += increment

        directory = dst
        if directory:
            # exist_ok avoids the check-then-create race of exists()/makedirs()
            os.makedirs(directory, exist_ok=True)

        # Joining (instead of concatenating) stops dst="out" from producing
        # "outgenerated.wav" next to the folder that was just created.
        result.export(os.path.join(directory, "generated.wav"), format="wav")
        print("Exported.")
Example #3
Source File: generate.py    From GST-Tacotron with MIT License 6 votes vote down vote up
def _pinyin(s):
    """Return ``s`` as a space-separated TONE2 pinyin string.

    Only digits, lowercase ASCII letters and spaces survive inside each
    token; tokens that are exactly one space are dropped, and runs of
    spaces in the joined result are collapsed to a single space.
    """
    symbols = '0123456789abcdefghijklmnopqrstuvwxyz '
    cleaned_tokens = [
        ''.join(ch for ch in token if ch in symbols)
        for token in lazy_pinyin(s, style=Style.TONE2)
        if token != ' '
    ]
    joined = ' '.join(cleaned_tokens)

    kept = []
    for pos, ch in enumerate(joined):
        # A space is dropped whenever another space follows it, so only the
        # last space of each run is kept.
        if ch == ' ' and joined[pos + 1:pos + 2] == ' ':
            continue
        kept.append(ch)
    return ''.join(kept)
Example #4
Source File: chinese_to_pinyin.py    From style-token_tacotron2 with MIT License 6 votes vote down vote up
def transform_chinese_to_pinyin(data_path, output_path,type='corpus'):
    """Convert a UTF-8 text file of Chinese lines into TONE2 pinyin lines.

    data_path: input file, read as bytes and decoded as UTF-8.
    output_path: output file, written as UTF-8 bytes (always created).
    type: 'corpus' converts each whole line; 'training_data' expects
        ``index|text`` lines and converts only the text, keeping the index.
        Any other value writes nothing.

    Blank lines are skipped.  In 'training_data' mode a line without any
    ``|`` raises ValueError.
    """
    with open(data_path, 'rb') as fin, open(output_path, 'wb') as fout:
        if type == 'corpus':
            indexed = False
        elif type == 'training_data':
            indexed = True
        else:
            # Unknown mode: keep the historical behaviour of leaving the
            # output file empty.
            return
        for raw in fin:
            line = raw.decode('utf-8').strip('\r\n ')
            if not line:
                continue
            if indexed:
                # Split on the first '|' only, so a '|' inside the text no
                # longer breaks the two-value unpacking.
                index, chinese_text = line.split('|', 1)
                pinyin_text = ' '.join(lazy_pinyin(chinese_text, style=Style.TONE2))
                converted = f'{index}|{pinyin_text}\n'
            else:
                transformed_line = ' '.join(lazy_pinyin(line, style=Style.TONE2))
                converted = f'{transformed_line}\n'
            fout.write(converted.encode('utf-8'))
Example #5
Source File: pg2.py    From aca with MIT License 6 votes vote down vote up
def check_homepage_validity(name, res):
    """
    Check if the homepage satisfies the basic rules.
    Input: name - name of the expert
           res  - homepage info, a 4-item sequence (title, url, detail, cited)
    Returns False when the url ends with 'pdf'/'doc' or mentions linkedin,
    researchgate or citations, or when no part of the name occurs in the
    pinyin form of the title; True otherwise.
    """
    title, url, _detail, _cited = res
    lowered_url = url.lower()
    if url.endswith(('pdf', 'doc')) or any(
            marker in lowered_url for marker in ('linkedin', 'researchgate', 'citations')):
        return False

    # Each name part is matched literally: escaping keeps regex
    # metacharacters in a name from being interpreted, and dropping empty
    # parts keeps a double space (or an empty name) from producing an empty
    # alternative that would match every title.
    parts = [re.escape(part)
             for part in name.replace('?', '').lower().split(' ')
             if part]
    if not parts:
        return False

    # to check if the title contains the name
    title = ' '.join(lazy_pinyin(title))
    return re.search('|'.join(parts), title.lower()) is not None
Example #6
Source File: get_databaker_data.py    From NeMo with Apache License 2.0 6 votes vote down vote up
def __convert_transcript(raw_transcript):
    """
    Converts a Chinese transcript to a Chinese pinyin sequence.

    raw_transcript: a tab-separated record whose first two fields are the
        wave id and the raw transcript text.
    Returns (wavename, pinyin) where wavename is "<wave id>.wav" and pinyin
    is the space-joined TONE3 sequence; syllables that carry no tone digit
    get an explicit "0" appended.
    """
    waveid, raw_trans = raw_transcript.split("\t")[:2]
    wavename = waveid + ".wav"
    symbols = ",.!?"
    # For simplicity, we only retain the Chinese chars and symbols
    trans = ''.join(
        _char for _char in __replace_symbols(raw_trans)
        if __is_chinese(_char) or _char in symbols
    )
    pinyin_trans = []
    for pinyin in lazy_pinyin(trans, style=Style.TONE3):
        # Consecutive symbols can come back as one token (e.g. "!!").  A
        # substring test against `symbols` misses such runs and would tag
        # them with a tone digit, so every character is checked instead.
        only_symbols = all(_char in symbols for _char in pinyin)
        if only_symbols or pinyin[-1].isdigit():
            pinyin_trans.append(pinyin)
        else:
            pinyin_trans.append(pinyin + "0")
    return wavename, " ".join(pinyin_trans)
Example #7
Source File: word2pinyin.py    From chat with MIT License 5 votes vote down vote up
def pinyin_cut(sentence, pattern=None):
    """Split a sentence into its list of pinyin syllables.

    The result is exactly what ``lazy_pinyin(sentence)`` returns.
    ``pattern`` is accepted but not used.
    """
    syllables = lazy_pinyin(sentence)
    return syllables

# @time_me() 
Example #8
Source File: tts.py    From parrots with Apache License 2.0 5 votes vote down vote up
def synthesize(self, input_text='', output_wav_path=''):
        """
        Synthesize a .wav file from text and return the audio segment.

        input_text: the text to convert (through TONE3 pinyin syllables)
        output_wav_path: path of the .wav file to write; 'out.wav' is used
            when it is empty

        Syllable recordings are looked up as <self.syllables_dir>/<syllable>.wav;
        syllables without a file are skipped.
        """
        step_ms = 355  # cursor advance per placed syllable / punctuation mark
        punctuation_pause_ms = 500  # length of the silence overlaid for punctuation
        cursor = 0
        syllables = lazy_pinyin(input_text, style=pypinyin.TONE3)

        # start from pure silence, budgeting ~500 ms per input character
        result = AudioSegment.silent(duration=500 * len(input_text))
        for syllable in syllables:
            wav_path = os.path.join(self.syllables_dir, syllable + ".wav")
            if syllable in self.punctuation:
                # punctuation: overlay a silent segment and move on
                result = result.overlay(
                    AudioSegment.silent(duration=punctuation_pause_ms), position=cursor)
                cursor += step_ms
                continue
            if Path(wav_path).is_file():
                # only syllables with a recording are placed on the track
                result = result.overlay(AudioSegment.from_wav(wav_path), position=cursor)
                cursor += step_ms

        output_wav_path = output_wav_path or 'out.wav'
        result.export(output_wav_path, format="wav")
        default_logger.debug("Exported:" + output_wav_path)
        return result
Example #9
Source File: tts.py    From parrots with Apache License 2.0 5 votes vote down vote up
def speak(self, text):
        """Play ``text`` aloud, one playback thread per available syllable.

        The text is converted to TONE3 pinyin, punctuation is stripped,
        all-digit syllables are spelled out via ``num2chinese`` and converted
        again.  Every syllable that has a .wav file under
        ``self.syllables_dir`` gets a thread running ``self._play_audio`` with
        a start delay that grows by 0.355 per syllable.  The call returns once
        all playback threads have finished.
        """
        syllables = lazy_pinyin(text, style=pypinyin.TONE3)
        default_logger.debug(syllables)
        delay = 0

        def preprocess(syllables):
            """Strip punctuation and expand numeric syllables into pinyin."""
            temp = []
            for syllable in syllables:
                for p in self.punctuation:
                    syllable = syllable.replace(p, '')
                if syllable.isdigit():
                    syllable = num2chinese(syllable)
                    temp.extend(lazy_pinyin(syllable, style=pypinyin.TONE3))
                else:
                    temp.append(syllable)
            return temp

        syllables = preprocess(syllables)
        threads = []
        for syllable in syllables:
            path = os.path.join(self.syllables_dir, syllable + ".wav")
            if not os.path.exists(path):
                continue
            threads.append(threading.Thread(target=self._play_audio, args=(path, delay)))
            delay += 0.355
        for t in threads:
            t.start()
        # Join every thread: joining only the loop variable waited for the
        # last thread alone and raised UnboundLocalError when no syllable
        # file existed (the loops never bound `t`).
        for t in threads:
            t.join()
Example #10
Source File: biaobei_cleaner.py    From style-token_tacotron2 with MIT License 5 votes vote down vote up
def biaobei_cleaner(input_path, output_path):
    """Rewrite each input line as ``NNNNN|<TONE2 pinyin>`` in the output file.

    Every line (blank ones included) is decoded as UTF-8, stripped of
    surrounding CR/LF/space, converted to space-joined pinyin, passed
    through each (regex, replacement) pair of ``_replacement_expression``
    and numbered from 1 with five zero-padded digits.  All lines are
    written at the end in one call.
    """
    with open(input_path, 'rb') as fin, open(output_path, 'wb') as fout:
        encoded_lines = []
        for index, raw in enumerate(fin, start=1):
            text = raw.decode('utf-8').strip('\r\n ')
            text = ' '.join(lazy_pinyin(text, style=Style.TONE2))
            for regex, replacement in _replacement_expression:
                text = re.sub(regex, replacement, text)
            encoded_lines.append('{:05d}|{}\n'.format(index, text).encode('utf-8'))
        fout.writelines(encoded_lines)
Example #11
Source File: pg2.py    From aca with MIT License 5 votes vote down vote up
def check_name_in_text(name, text):
    """
    Score how much of ``name`` occurs in the pinyin form of ``text``.

    Sample, for the name "Bai Li":
    www.xx.com/li.jpg gets 0.5
    www.xx.org/bai_li.jpg gets 1
    www.xx.org.avatar.jpg gets 0

    The name is split on spaces and hyphens; each part found
    (case-insensitively) in the text counts one point, and the total is
    divided by the number of space-separated words in the name.
    """
    haystack = ' '.join(lazy_pinyin(text)).lower()
    hits = sum(1 for part in re.split(r'[ -]', name) if part.lower() in haystack)
    return hits / len(name.split(' '))
Example #12
Source File: fenbian.py    From dayworkspace with GNU Lesser General Public License v3.0 5 votes vote down vote up
def get_pitch(hans):
    '''Return the tone of every syllable.

        Parameter: hans - Chinese characters, a phrase or a sentence
        Returns: a tuple of tones, one per syllable, taken from the trailing
                 digit of each FINALS_TONE3 item; items without a trailing
                 digit count as 0
    '''
    pinyin = lazy_pinyin(hans, style=Style.FINALS_TONE3)
    # e.g. ['en2', 'uei3'] or ['uo1', 'i']
    # py[-1:] (rather than py[-1]) keeps an empty item from raising
    # IndexError; it is simply reported as 0.
    return tuple(int(py[-1]) if py[-1:].isdigit() else 0 for py in pinyin)
Example #13
Source File: BibTexEntries.py    From CNKI_2_BibTeX with MIT License 5 votes vote down vote up
def generateIDInTitleFormat(self, cnkiNetEntry):
        """Set ``self.ID`` from the entry's title.

        Digits and the characters ``_ , ;`` are removed first.  A title that
        ``__isFullEnglish`` accepts contributes its first (up to) four
        space-separated words, concatenated.  Any other title has its spaces
        and ideographic spaces removed, is segmented with jieba, and the
        first (up to) three segments are passed through ``pinyin`` and joined.
        """
        title = re.sub(r"[_,;]", "", re.sub(r"[0-9]", "", cnkiNetEntry["Title"]))
        if not self.__isFullEnglish(title):
            jieba.setLogLevel(logging.INFO)
            compact = title.replace(" ", "").replace(u"\u3000", "")
            leading_segments = list(jieba.cut(compact))[:3]
            self.ID = "".join(pinyin("".join(leading_segments)))
            return
        words = title.strip().split(" ")
        self.ID = "".join(words[:4])
Example #14
Source File: BibTexEntries.py    From CNKI_2_BibTeX with MIT License 5 votes vote down vote up
def generateIDInNameYearFormat(self, cnkiNetEntry):
        """Set ``self.ID`` to the first author's name followed by the year.

        The first author is the text before the first ``;`` / ``,`` of the
        "Author" field, with spaces and ideographic spaces removed.  A name
        that ``__isFullEnglish`` accepts is used as is; otherwise each item
        returned by ``pinyin`` is title-cased and the items are concatenated.
        """
        first_author = cnkiNetEntry["Author"]
        # NOTE(review): "," is applied twice, exactly as before; the second
        # pass is a no-op - possibly a different separator was intended.
        for separator in (";", ",", ","):
            first_author = first_author.split(separator)[0]
        first_author = first_author.replace(" ", "").replace(u"\u3000", "")
        year = cnkiNetEntry["Year"]
        if self.__isFullEnglish(first_author):
            self.ID = first_author + year
        else:
            self.ID = "".join(part.title() for part in pinyin(first_author)) + year
Example #15
Source File: harvesttext.py    From HarvestText with MIT License 5 votes vote down vote up
def get_pinyin_correct_candidates(self, word, tolerance=1):  # by default tolerate at most one changed syllable
        """Return the known mentions whose pinyin matches ``word``.

        With ``tolerance == 0`` only the exact syllable sequence is looked up.
        With ``tolerance == 1`` every sequence that differs in one syllable,
        replaced by one of its neighbours from ``self.pinyin_adjlist``, is
        looked up as well.  Only sequences that are keys of
        ``self.pinyin_mention_dict`` contribute; the result is a list in no
        particular order.
        """
        assert tolerance in [0, 1]
        base = lazy_pinyin(word)
        variants = {tuple(base)}
        if tolerance == 1:
            for pos, syllable in enumerate(base):
                if syllable not in self.pinyin_adjlist:
                    continue
                head, tail = base[:pos], base[pos + 1:]
                variants.update(
                    {tuple(head + [neighbour] + tail) for neighbour in self.pinyin_adjlist[syllable]})
        known_variants = variants & set(self.pinyin_mention_dict.keys())
        mentions = set()
        for key in known_variants:
            mentions.update(self.pinyin_mention_dict[key])
        return list(mentions)
Example #16
Source File: harvesttext.py    From HarvestText with MIT License 5 votes vote down vote up
def build_trie(self, new_word, entity, entity_type):
        """Register ``new_word`` as a mention of ``entity`` in the trie.

        new_word: the surface form to index, one trie level per character.
        entity: the entity the mention refers to.
        entity_type: the entity's type name; stored as "#<entity_type>#".

        A type seen for the first time is validated (it must not contain
        punctuation), added to ``self.entity_types``, and the ``prepared`` /
        ``hanlp_prepared`` flags are reset.  The word is also recorded in
        ``self.mentions`` and in ``self.pinyin_mention_dict`` under its
        pinyin tuple.  At the word's trie node the 'leaf' set holds
        (entity, type) pairs, with at most one pair per entity.

        Raises Exception when a new ``entity_type`` contains punctuation.
        """
        type0 = "#%s#" % entity_type
        if type0 not in self.entity_types:
            # One character class covering CJK / full-width punctuation and
            # ASCII punctuation.  The pattern is split across adjacent raw
            # strings and every ASCII double quote is escaped, so the literal
            # cannot terminate early and leave an unterminated "[..." set.
            punct_regex = (
                r"[、。｡！？＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠⦅⦆｢｣､"
                r"〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏"
                r"!\"\#$%&\'\(\)\*\+,-\./:;<=>?@\[\\\]\^_`{\|}~]"
            )
            matched = re.search(punct_regex, entity_type, re.MULTILINE | re.UNICODE)
            if matched:
                punct0 = matched.group()
                raise Exception("Your type input '{}' includes punctuation '{}', please remove them first".format(entity_type,punct0))
            self.entity_types.add(type0)
            self.prepared = False
            self.hanlp_prepared = False
        self.mentions.add(new_word)
        self.pinyin_mention_dict[tuple(lazy_pinyin(new_word))].add(new_word)

        # Walk down the trie, creating missing nodes on the way.
        trie_node = self.trie_root
        for ch in new_word:
            trie_node = trie_node.setdefault(ch, {})

        leaf = trie_node.setdefault('leaf', set())
        # The same entity must not carry different types: drop any earlier
        # pair for this entity before adding the current one.
        leaf.difference_update({pair for pair in leaf if pair[0] == entity})
        leaf.add((entity, type0))
Example #17
Source File: entity_discoverer.py    From HarvestText with MIT License 5 votes vote down vote up
def get_pinyin_correct_candidates(self, word, tolerance):  # tolerate at most one changed syllable
        """Collect the known mentions that sound like ``word``.

        ``tolerance`` must be 0 (exact pinyin only) or 1 (additionally allow
        one syllable to be swapped for a neighbour listed in
        ``self.pinyin_adjlist``).  Candidate syllable tuples that are keys of
        ``self.pinyin_mention_dict`` are resolved to their mentions, which are
        returned as a list in no particular order.
        """
        assert tolerance in [0, 1]
        syllables = lazy_pinyin(word)
        candidates = {tuple(syllables)}
        if tolerance == 1:
            for idx, current in enumerate(syllables):
                if current in self.pinyin_adjlist:
                    before = syllables[:idx]
                    after = syllables[idx + 1:]
                    swapped = {tuple(before + [alt] + after) for alt in self.pinyin_adjlist[current]}
                    candidates = candidates | swapped
        candidates = candidates & set(self.pinyin_mention_dict.keys())
        found = set()
        for candidate in candidates:
            found = found | self.pinyin_mention_dict[candidate]
        return list(found)
Example #18
Source File: common.py    From lufly-im with Apache License 2.0 5 votes vote down vote up
def get_full(word: str) -> List[str]:
    """Return the pinyin syllables of ``word``.

    Raises RuntimeError at the first character of any syllable that is not
    a lowercase ASCII letter.
    """
    lowercase = "abcdefghijklmnopqrstuvwxyz"
    checked: List[str] = []
    for syllable in lazy_pinyin(word):
        offender = next((ch for ch in syllable if ch not in lowercase), None)
        if offender is not None:
            raise RuntimeError(f"{offender} not alphe, word is: {word}")
        checked.append(syllable)
    return checked
Example #19
Source File: utils.py    From slack_bot with MIT License 5 votes vote down vote up
def to_pinyin(word):
    """Return the pinyin of ``word`` as one concatenated string.

    ``word`` may be text or UTF-8 encoded bytes; bytes are decoded first.
    """
    # `bytes` is an alias of `str` on Python 2 and the real byte type on
    # Python 3, so this check behaves like the old `unicode` test on
    # Python 2 without raising NameError on Python 3.
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    return ''.join(lazy_pinyin(word))
Example #20
Source File: corrector.py    From pycorrector with Apache License 2.0 5 votes vote down vote up
def _confusion_word_set(self, word):
        """Return the known edit-distance candidates that share ``word``'s pinyin.

        Candidates come from ``edit_distance_word(word, self.cn_char_set)``
        filtered through ``self.known``; a candidate is kept when its
        ``lazy_pinyin`` result equals that of ``word``.
        """
        candidate_words = list(self.known(edit_distance_word(word, self.cn_char_set)))
        # The target pinyin does not change inside the loop, so compute it
        # once instead of once per candidate.
        target_pinyin = lazy_pinyin(word)
        return {
            candidate_word
            for candidate_word in candidate_words
            if lazy_pinyin(candidate_word) == target_pinyin
        }
Example #21
Source File: parser.py    From chinese-rhymer with MIT License 5 votes vote down vote up
def word_parser(word: str) -> List[Tuple[str, List[str]]]:
    """Convert ``word`` to pinyin syllables and return ``pinyin_parser``'s result for them."""
    return pinyin_parser(lazy_pinyin(word))
Example #22
Source File: pipelines.py    From poi_spider with Apache License 2.0 4 votes vote down vote up
def process_item(self, item, spider):
        """Flatten the POI results of ``item`` and append them to a SQL table.

        item: mapping with 'poly', 'results', 'search_word', 'region' and
            'requests_url'; each entry of 'results' is a mapping that may
            carry a nested 'detail_info' mapping.
        spider: unused.

        Every cell is stored as ``str(value)``.  An 'isin_region' column
        records whether each row's location lies inside ``item['poly']``
        (999 when that check fails).  The table name is the module-level
        ``MYSQL_TableName``; when that is empty it is set to
        "<region>_bd_map_pois" on first use.

        Returns the item so that later pipeline stages still receive it.
        """
        global MYSQL_TableName
        poly = item['poly']
        results = item['results']
        if not results:
            return item

        keys1 = ['name', 'province', 'city', 'area', 'address', 'telephone', 'uid', 'street_id', 'detail',
                 'detail_info', 'location']
        keys2 = ['detail_url', 'tag', 'type']
        keys3 = ['search_word', 'region', 'requests_url']

        rows = []
        for result in results:
            row = [result.get(key) for key in keys1]
            # keys2 live inside the optional nested 'detail_info' mapping
            detail_info = result.get('detail_info')
            row.extend(None if detail_info is None else detail_info.get(key) for key in keys2)
            row.extend(item[key] for key in keys3)
            rows.append([str(x) for x in row])
            print('获取到的pois:%s' % row[0])

        df = DataFrame(rows, columns=keys1 + keys2 + keys3)
        region_pinyin = str(item['region'])

        def _in_region(location_repr):
            # SECURITY NOTE(review): eval() runs on text derived from scraped
            # data. The cell is str() of the original 'location' value;
            # ast.literal_eval looks like a safe replacement - confirm and
            # switch. It is evaluated once per row here.
            location = eval(location_repr)
            return poly.contains(Point(float(location['lng']), float(location['lat'])))

        # Check whether each point lies inside the given polygon
        # (poly.contains); any failure marks the whole column with 999.
        try:
            df['isin_region'] = df['location'].apply(_in_region)
        except Exception as e:
            logging.info(e)
            df['isin_region'] = 999

        if MYSQL_TableName == "":
            MYSQL_TableName = '{region}_bd_map_pois'.format(region=region_pinyin)
        df.to_sql(MYSQL_TableName, engine, if_exists='append',
                  index=False)
        return item
Example #23
Source File: __init__.py    From TWchat with MIT License 4 votes vote down vote up
def start():
    """Wire the itchat message handlers to the terminal UI and run it.

    Registers handlers for direct and group messages, builds the
    ``wegui.WechatMain`` window, logs in through itchat, loads the contact
    and chat-room lists (contacts sorted by the pinyin of their name) and
    finally enters the UI loop.
    """
    # Handler for the listed message types: forward the message to the UI
    # and raise a notification.
    @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING,PICTURE, RECORDING, ATTACHMENT, VIDEO,FRIENDS])
    def recive_contact_msg(msg):
        contact_name = get_contact_name(msg)
        try:
            wechatMain.recive_message(msg,contact_name)
            notify('TWchat',"new message from: "+contact_name)
        except AttributeError:
            # NOTE(review): AttributeError is silently ignored here -
            # presumably raised while the UI is not ready; confirm.
            pass
    
    # Same as above, for text messages sent in group chats.
    @itchat.msg_register(TEXT, isGroupChat=True)
    def recive_group_msg(msg):
        group_name = get_group_name(msg)
        try:
            wechatMain.recive_message(msg,group_name)
            notify('TWchat',"new message from: "+group_name)
        except AttributeError:
            pass
        return   

    # UI callback: clicking a contact adds a chat for it, makes it the
    # current chat and shows the chat list. `info` is indexed as a pair.
    def on_contact_item_click(button,info):
        wechatMain.chatListBox.addNewChat(info[0],info[1])
        wechatMain.set_current_chat(info[0],info[1])
        wechatMain.chatListBox.show_chat()
        return 
    # UI callback: clicking an existing chat only switches the current chat.
    def on_chat_item_click(button,info):
        wechatMain.set_current_chat(info[0],info[1])
        return
    # Display attributes handed to the UI; presumably
    # (name, foreground, background) entries - TODO confirm against wegui.
    palette = [
        ('left', 'black', 'light gray'),
        ('right', 'black', 'dark cyan'),
        ('button', 'dark green','black'),
        ('mybg', 'black','dark cyan'),
        ('tobg', 'dark blue','light gray'),
        ('edit', 'dark cyan','black'),
        ('bg', 'dark green', 'black'),]
    # Start-up banner.
    print ('''
 _____  _    _  _____  _   _   ___   _____ 
|_   _|| |  | |/  __ \| | | | / _ \ |_   _|
  | |  | |  | || /  \/| |_| |/ /_\ \  | |  
  | |  | |/\| || |    |  _  ||  _  |  | |  
  | |  \  /\  /| \__/\| | | || | | |  | |  
  \_/   \/  \/  \____/\_| |_/\_| |_/  \_/  
            ''')

    # wechatMain is assigned here; the handlers and callbacks defined above
    # refer to it through the enclosing scope.
    wechatMain = wegui.WechatMain(palette)
    itchat.auto_login(enableCmdQR=2,hotReload=True)
    itchat.run(blockThread=False)
    userInfo =itchat.web_init()['User']
    owner_id = userInfo['UserName']
    owner_name = userInfo['NickName']
    contactlist= itchat.get_friends(update=True)
    chatlist = itchat.get_chatrooms()
    #contactlist = sorted(contactlist,key=lambda x:(x['RemarkPYInitial'],x['PYInitial']))
    # Sort contacts by the list of pinyin syllables of get_name(contact).
    contactlist = sorted(contactlist,key=lambda x:(lazy_pinyin(get_name(x))))
    wechatMain.initUserInfo(owner_id,owner_name,on_contact_item_click,on_chat_item_click,contactlist,chatlist)
    wechatMain.bind_itchat(itchat)
    wechatMain.createLoop() 
Example #24
Source File: entity_discoverer.py    From HarvestText with MIT License 4 votes vote down vote up
def postprocessing(self, partition, pinyin_tolerance, pop_words_cnt):
        """Apply pattern rules to fix some small problems.

        Entries of ``self.id2word`` are strings of the form
        "<entity>_<type>".  For each entry, pattern rules strip brackets or
        a known suffix; when the trimmed form is itself a known entry, this
        entry is assigned the trimmed entry's partition.  Optionally the same
        merge is done for entries whose pinyin matches.

        :param partition: mapping/sequence from entry id to partition id;
            modified in place and returned
        :param pinyin_tolerance: None to skip the pinyin step, otherwise the
            tolerance passed to ``get_pinyin_correct_candidates``
        :param pop_words_cnt: container queried with ``in`` and indexed by
            word to obtain a count
        :return: partition, pattern_entity2mentions
        """
        # simple postfix like removing parenthesis
        # the "+?" pattern is a lazy match, so that a longer suffix such as
        # "新区" can be matched instead of only its last character
        # Each value is (allowed entity types or None for any type, pattern);
        # group 1 of every pattern is the part that is kept.
        re_patterns = {
            "parenthesis": (None, re.compile(r"[\[{\(<#【(《](\S+?)[\]}\)>#】)》]")),
            "person_postfix": ({"人名"}, re.compile(r"^(\S+?)(哥|姐|先生|女士|小姐|同志|同学|老师|教授)$")),
            "district": ({"地名"}, re.compile(r"^(\S+?)(国|省|市|区|县|村|镇|古镇|新区|特区|自治区|特别行政区|帝国|王国|共和国)$")),
            "organization": ({"地名", "机构名"}, re.compile(r"^(\S+?)(厂|公司|有限公司|协会|基金会|俱乐部|队|国家队|集团|联盟)$")),
        }
        pattern_entity2mentions = defaultdict(set)
        if pinyin_tolerance is not None:
            # Rebuild the pinyin -> entity-name index used by the pinyin
            # recheck below, from the entity part of every entry.
            self.pinyin_mention_dict = defaultdict(set)
            for entity_type in self.id2word:
                new_word = entity_type[:entity_type.rfind("_")]
                self.pinyin_mention_dict[tuple(lazy_pinyin(new_word))].add(new_word)

        for eid1, entity_type in enumerate(self.id2word):
            # Split "<entity>_<type>" at the last underscore.
            tmp = entity_type.rfind("_")
            entity, etype = entity_type[:tmp], entity_type[tmp + 1:]
            # pattern_matching
            for pname, (allow_types, pat) in re_patterns.items():
                if (allow_types is None or (etype in allow_types)) and re.match(pat, entity):
                    trim_entity = re.sub(pat, r"\1", entity)
                    entity2 = trim_entity + "_" + etype
                    if entity2 in self.word2id:
                        # The trimmed form is a known entry: merge into it.
                        eid2 = self.word2id[entity2]
                        partition[eid1] = partition[eid2]
                    if (pname in ["district", "organization"]) and len(trim_entity) > 1:
                        if trim_entity in self.mentions or trim_entity in pop_words_cnt:
                            pattern_entity2mentions[entity_type].add(trim_entity)
                            if trim_entity not in self.mention_count:
                                # NOTE(review): reached also when trim_entity is only in
                                # self.mentions; this lookup then relies on pop_words_cnt
                                # tolerating missing keys - confirm its type.
                                self.mention_count[trim_entity] = pop_words_cnt[trim_entity]

            # pinyin recheck
            if pinyin_tolerance is not None:
                candidates = self.get_pinyin_correct_candidates(entity, pinyin_tolerance)
                for cand in candidates:
                    entity2 = cand + "_" + etype
                    if entity2 in self.word2id:
                        eid2 = self.word2id[entity2]
                        partition[eid1] = partition[eid2]

        return partition, pattern_entity2mentions