Python pypinyin.lazy_pinyin() Examples
The following are 24 code examples of pypinyin.lazy_pinyin().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pypinyin, or try the search function.
Example #1
Source File: main.py From HanTTS with MIT License | 7 votes |
def speak(self, text):
    """Play *text* aloud, one syllable .wav at a time.

    The text is converted to TONE3 pinyin, punctuation is stripped,
    digit runs are expanded into Chinese numerals, and one playback
    thread is launched per syllable with a staggered start delay.
    """
    syllables = lazy_pinyin(text, style=pypinyin.TONE3)
    print(syllables)

    def preprocess(items):
        # Strip punctuation and expand digit groups into pinyin syllables.
        cleaned = []
        for syllable in items:
            for mark in TextToSpeech.punctuation:
                syllable = syllable.replace(mark, "")
            if syllable.isdigit():
                syllable = atc.num2chinese(syllable)
                cleaned.extend(lazy_pinyin(syllable, style=pypinyin.TONE3))
            else:
                cleaned.append(syllable)
        return cleaned

    delay = 0
    for syllable in preprocess(syllables):
        path = "syllables/" + syllable + ".wav"
        _thread.start_new_thread(TextToSpeech._play_audio, (path, delay))
        delay += 0.355
Example #2
Source File: main.py From HanTTS with MIT License | 6 votes |
def synthesize(self, text, src, dst):
    """Synthesize one .wav for *text* by overlaying per-syllable clips.

    src is the folder that contains the individual syllable .wav files;
    dst is the destination folder for the synthesized "generated.wav".
    """
    print("Synthesizing ...")
    step_ms = 355    # spacing between successive syllable onsets (ms)
    pause_ms = 500   # silence inserted for each punctuation mark (ms)
    offset = 0
    syllables = lazy_pinyin(text, style=pypinyin.TONE3)
    # Base track is pure silence; each character takes up ~500ms.
    result = AudioSegment.silent(duration=500 * len(text))
    for syllable in syllables:
        if syllable in TextToSpeech.punctuation:
            # Punctuation contributes a fixed pause instead of audio.
            result = result.overlay(AudioSegment.silent(duration=pause_ms), position=offset)
            offset += step_ms
            continue
        path = src + syllable + ".wav"
        if not Path(path).is_file():
            continue  # silently skip syllables with no recording
        result = result.overlay(AudioSegment.from_wav(path), position=offset)
        offset += step_ms
    if not os.path.exists(dst):
        os.makedirs(dst)
    result.export(dst + "generated.wav", format="wav")
    print("Exported.")
Example #3
Source File: generate.py From GST-Tacotron with MIT License | 6 votes |
def _pinyin(s):
    """Convert *s* to a space-separated TONE2 pinyin string.

    Only lowercase letters, digits (tone numbers) and spaces are kept
    from each syllable; runs of spaces in the joined result are
    collapsed to a single space.
    """
    symbols = '0123456789abcdefghijklmnopqrstuvwxyz '
    s = lazy_pinyin(s, style=Style.TONE2)
    yin = []
    for token in s:
        if token != ' ':
            # Keep only the allowed characters of this syllable.
            yin.append(''.join(c for c in token if c in symbols))
    s = ' '.join(yin)
    # Collapse runs of spaces: skip a space when the next char is also one.
    # Bug fix: `a` was previously left uninitialized when no token survived
    # the first loop, raising NameError on degenerate input.
    a = ''
    for i in range(len(s)):
        if s[i] == ' ' and i < len(s) - 1 and s[i + 1] == ' ':
            continue
        a += s[i]
    return a
Example #4
Source File: chinese_to_pinyin.py From style-token_tacotron2 with MIT License | 6 votes |
def transform_chinese_to_pinyin(data_path, output_path, type='corpus'):
    """Rewrite a UTF-8 Chinese text file as TONE2 pinyin, line by line.

    With type='corpus' each line is plain text; with type='training_data'
    each line is "<index>|<chinese>" and the index column is preserved.
    (The parameter name `type` shadows the builtin but is kept for
    caller compatibility.)
    """
    with open(data_path, 'rb') as fin, open(output_path, 'wb') as fout:
        for raw in fin:
            text = raw.decode('utf-8').strip('\r\n ')
            if not text:
                continue  # skip blank lines
            if type == 'corpus':
                converted = ' '.join(lazy_pinyin(text, style=Style.TONE2))
                fout.write(f'{converted}\n'.encode('utf-8'))
            elif type == 'training_data':
                index, chinese_text = text.split('|')
                converted = ' '.join(lazy_pinyin(chinese_text, style=Style.TONE2))
                fout.write(f'{index}|{converted}\n'.encode('utf-8'))
Example #5
Source File: pg2.py From aca with MIT License | 6 votes |
def check_homepage_validity(name, res):
    """Check whether a candidate homepage satisfies basic rules.

    name -- name of the expert
    res  -- homepage info list (title, url, detail, cited)
    Returns False for document/profile-site links or titles that do not
    mention any part of the (romanized) name.
    """
    title, url, detail, cited = res
    lowered = url.lower()
    if url.endswith('pdf') or url.endswith('doc'):
        return False
    if 'linkedin' in lowered or 'researchgate' in lowered or 'citations' in lowered:
        return False
    # The title must contain at least one part of the name (as pinyin).
    title = ' '.join(lazy_pinyin(title))
    name = name.replace('?', '')
    pattern = re.compile(r'|'.join(name.lower().split(' ')))
    return len(pattern.findall(title.lower())) != 0
Example #6
Source File: get_databaker_data.py From NeMo with Apache License 2.0 | 6 votes |
def __convert_transcript(raw_transcript):
    """Convert one tab-separated transcript line to (wavename, pinyin).

    Keeps only Chinese characters plus the symbols ",.!?" from the text,
    then emits TONE3 pinyin, padding toneless syllables with a trailing
    "0" so every syllable carries an explicit tone digit.
    """
    waveid, raw_trans = raw_transcript.split("\t")[:2]
    wavename = waveid + ".wav"
    symbols = ",.!?"
    # For simplicity, retain only the Chinese chars and symbols.
    kept = (c for c in __replace_symbols(raw_trans) if __is_chinese(c) or c in symbols)
    trans = ''.join(kept)
    pinyin_trans = [
        py if py in symbols or py[-1].isdigit() else py + "0"
        for py in lazy_pinyin(trans, style=Style.TONE3)
    ]
    return wavename, " ".join(pinyin_trans)
Example #7
Source File: word2pinyin.py From chat with MIT License | 5 votes |
def pinyin_cut(sentence, pattern=None):
    """Cut the sentence into a list of pinyin syllables.

    The *pattern* argument is accepted for API compatibility but unused.
    """
    return lazy_pinyin(sentence)
Example #8
Source File: tts.py From parrots with Apache License 2.0 | 5 votes |
def synthesize(self, input_text='', output_wav_path=''):
    """Synthesize a .wav for *input_text* from per-syllable recordings.

    Syllable clips are looked up under self.syllables_dir; the result is
    exported to output_wav_path (default 'out.wav') and also returned.
    """
    step_ms = 355    # onset spacing between successive syllables (ms)
    pause_ms = 500   # extra silence for each punctuation mark (ms)
    position = 0
    syllables = lazy_pinyin(input_text, style=pypinyin.TONE3)
    # Base track: complete silence, ~500ms per input character.
    result = AudioSegment.silent(duration=500 * len(input_text))
    for syllable in syllables:
        if syllable in self.punctuation:
            result = result.overlay(AudioSegment.silent(duration=pause_ms), position=position)
            position += step_ms
            continue
        path = os.path.join(self.syllables_dir, syllable + ".wav")
        if not Path(path).is_file():
            continue  # no recording for this syllable; skip it
        result = result.overlay(AudioSegment.from_wav(path), position=position)
        position += step_ms
    if not output_wav_path:
        output_wav_path = 'out.wav'
    result.export(output_wav_path, format="wav")
    default_logger.debug("Exported:" + output_wav_path)
    return result
Example #9
Source File: tts.py From parrots with Apache License 2.0 | 5 votes |
def speak(self, text):
    """Speak *text* by playing one .wav per pinyin syllable in order.

    Punctuation is stripped, digit runs are read as Chinese numerals,
    and each playback runs in its own thread (joined immediately, so
    playback is effectively serial with staggered delays).
    """
    syllables = lazy_pinyin(text, style=pypinyin.TONE3)
    default_logger.debug(syllables)

    def preprocess(items):
        # Strip punctuation and expand digit groups into pinyin syllables.
        cleaned = []
        for syllable in items:
            for mark in self.punctuation:
                syllable = syllable.replace(mark, '')
            if syllable.isdigit():
                cleaned.extend(
                    lazy_pinyin(num2chinese(syllable), style=pypinyin.TONE3))
            else:
                cleaned.append(syllable)
        return cleaned

    delay = 0
    workers = []
    for syllable in preprocess(syllables):
        path = os.path.join(self.syllables_dir, syllable + ".wav")
        if not os.path.exists(path):
            continue  # no recording for this syllable
        workers.append(threading.Thread(target=self._play_audio, args=(path, delay)))
        delay += 0.355
    for worker in workers:
        worker.start()
        worker.join()
Example #10
Source File: biaobei_cleaner.py From style-token_tacotron2 with MIT License | 5 votes |
def biaobei_cleaner(input_path, output_path):
    """Convert a BiaoBei transcript file to numbered "<index>|<pinyin>" lines.

    Each input line becomes TONE2 pinyin, is passed through the module's
    _replacement_expression substitutions, and is numbered from 00001.
    """
    with open(input_path, 'rb') as fin, open(output_path, 'wb') as fout:
        encoded = []
        for index, raw in enumerate(fin, start=1):
            text = raw.decode('utf-8').strip('\r\n ')
            text = ' '.join(lazy_pinyin(text, style=Style.TONE2))
            for regex, replacement in _replacement_expression:
                text = re.sub(regex, replacement, text)
            encoded.append('{:05d}|{}\n'.format(index, text).encode('utf-8'))
        fout.writelines(encoded)
Example #11
Source File: pg2.py From aca with MIT License | 5 votes |
def check_name_in_text(name, text):
    """Score how much of *name* appears (as pinyin) inside *text*.

    Sample: for the name "Bai Li",
    www.xx.com/li.jpg -> 0.5, www.xx.org/bai_li.jpg -> 1,
    www.xx.org.avatar.jpg -> 0.
    """
    romanized = ' '.join(lazy_pinyin(text)).lower()
    parts = re.split(r'[ -]', name)
    hits = sum(1 for part in parts if part.lower() in romanized)
    # Denominator deliberately counts space-separated parts only.
    return hits / len(name.split(' '))
Example #12
Source File: fenbian.py From dayworkspace with GNU Lesser General Public License v3.0 | 5 votes |
def get_pitch(hans):
    """Return the tone of each syllable of *hans* as a tuple.

    hans -- Chinese characters, a sentence or a phrase.
    Values are 1-4 for the four Mandarin tones and 0 for the neutral
    tone (a TONE3 final with no trailing tone digit).
    """
    finals = lazy_pinyin(hans, style=Style.FINALS_TONE3)  # e.g. ['en2', 'uei3'] or ['uo1', 'i']
    return tuple(int(py[-1]) if py[-1].isdigit() else 0 for py in finals)
Example #13
Source File: BibTexEntries.py From CNKI_2_BibTeX with MIT License | 5 votes |
def generateIDInTitleFormat(self, cnkiNetEntry):
    """Set self.ID from the entry title.

    English titles contribute up to their first 4 words joined together;
    Chinese titles are segmented with jieba and up to 3 tokens are
    converted to pinyin. Digits and _,; are removed first.
    """
    title = re.sub(r"[_,;]", "", re.sub(r"[0-9]", "", cnkiNetEntry["Title"]))
    if self.__isFullEnglish(title):
        words = title.strip().split(" ")
        self.ID = "".join(words[0:min(len(words), 4)])
    else:
        jieba.setLogLevel(logging.INFO)  # silence jieba's startup chatter
        compact = title.replace(" ", "").replace(u"\u3000", "")
        tokens = list(jieba.cut(compact))
        head = "".join(tokens[0:min(len(tokens), 3)])
        self.ID = "".join(pinyin(head))
Example #14
Source File: BibTexEntries.py From CNKI_2_BibTeX with MIT License | 5 votes |
def generateIDInNameYearFormat(self, cnkiNetEntry):
    """Set self.ID to "<FirstAuthor><Year>", romanizing Chinese names.

    The first author is cut at the first separator (fullwidth or ASCII);
    Chinese names become capitalized pinyin syllables joined together.
    """
    author = cnkiNetEntry["Author"].split(";")[0].split(",")[0].split(",")[0]
    author = author.replace(" ", "").replace(u"\u3000", "")
    year = cnkiNetEntry["Year"]
    if self.__isFullEnglish(author):
        self.ID = author + year
    else:
        self.ID = "".join(syllable.title() for syllable in pinyin(author)) + year
Example #15
Source File: harvesttext.py From HarvestText with MIT License | 5 votes |
def get_pinyin_correct_candidates(self, word, tolerance=1):
    """Find known mentions whose pinyin is within *tolerance* of *word*'s.

    tolerance 0 requires an exact pinyin match; tolerance 1 (default)
    additionally allows one syllable to be swapped for a confusable
    neighbour from self.pinyin_adjlist.
    """
    assert tolerance in [0, 1]
    base = lazy_pinyin(word)
    candidates = {tuple(base)}
    if tolerance == 1:
        # Substitute each syllable with its acoustically-similar neighbours.
        for i, syllable in enumerate(base):
            if syllable in self.pinyin_adjlist:
                for neighbour in self.pinyin_adjlist[syllable]:
                    candidates.add(tuple(base[:i] + [neighbour] + base[i + 1:]))
    matched = candidates & set(self.pinyin_mention_dict.keys())
    mentions = set()
    for key in matched:
        mentions |= self.pinyin_mention_dict[key]
    return list(mentions)
Example #16
Source File: harvesttext.py From HarvestText with MIT License | 5 votes |
def build_trie(self, new_word, entity, entity_type):
    # Register new_word as a mention of (entity, #entity_type#): records it
    # in the mention/pinyin indexes and inserts it into the character trie.
    type0 = "#%s#" % entity_type
    if not type0 in self.entity_types:
        # Entity types are wrapped in '#' markers, so the type name itself
        # must not contain any punctuation (ASCII or CJK).
        punct_regex = r"[、!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏!\"\#$%&\'\(\)\*\+,-\./:;<=>?@\[\\\]\^_`{\|}~]"
        matched = re.search(punct_regex, entity_type, re.MULTILINE | re.UNICODE)
        if matched:
            punct0 = matched.group()
            raise Exception("Your type input '{}' includes punctuation '{}', please remove them first".format(entity_type,punct0))
        self.entity_types.add(type0)
        # New type invalidates any previously prepared segmenter state.
        self.prepared = False
        self.hanlp_prepared = False
    self.mentions.add(new_word)
    # Index the mention by its pinyin tuple for sound-alike lookup.
    self.pinyin_mention_dict[tuple(lazy_pinyin(new_word))].add(new_word)
    trie_node = self.trie_root
    # Walk/extend the trie one character at a time.
    for ch in new_word:
        if not ch in trie_node:
            trie_node[ch] = {}
        trie_node = trie_node[ch]
    if not 'leaf' in trie_node:
        trie_node['leaf'] = {(entity, type0)}
    else:
        for (entity_orig, type_orig) in trie_node['leaf'].copy():
            if entity_orig == entity:  # the same entity may not carry two different types
                trie_node['leaf'].remove((entity_orig, type_orig))
        trie_node['leaf'].add((entity, type0))
Example #17
Source File: entity_discoverer.py From HarvestText with MIT License | 5 votes |
def get_pinyin_correct_candidates(self, word, tolerance):
    """Return known mentions reachable from *word* within *tolerance* pinyin edits.

    Only tolerance 0 (exact pinyin match) and 1 (one syllable replaced
    by a confusable neighbour) are supported.
    """
    assert tolerance in [0, 1]
    pinyins = lazy_pinyin(word)
    variants = {tuple(pinyins)}
    if tolerance == 1:
        for idx, syl in enumerate(pinyins):
            if syl in self.pinyin_adjlist:
                variants.update(
                    tuple(pinyins[:idx] + [neibr] + pinyins[idx + 1:])
                    for neibr in self.pinyin_adjlist[syl])
    hits = variants & set(self.pinyin_mention_dict.keys())
    result = set()
    for key in hits:
        result |= self.pinyin_mention_dict[key]
    return list(result)
Example #18
Source File: common.py From lufly-im with Apache License 2.0 | 5 votes |
def get_full(word: str) -> List[str]:
    """Return the full pinyin syllable of each character in *word*.

    Raises RuntimeError as soon as a produced syllable contains a
    character outside a-z (e.g. punctuation or untranslatable input).
    """
    fulls = []
    for syllable in lazy_pinyin(word):
        for ch in syllable:
            if ch not in "abcdefghijklmnopqrstuvwxyz":
                raise RuntimeError(f"{ch} not alphe, word is: {word}")
        fulls.append(syllable)
    return fulls
Example #19
Source File: utils.py From slack_bot with MIT License | 5 votes |
def to_pinyin(word):
    """Join the pinyin of *word* into a single string.

    NOTE(review): this is Python 2 code — `unicode` does not exist on
    Python 3. Byte strings are decoded as UTF-8 first.
    """
    text = word if isinstance(word, unicode) else word.decode('utf-8')
    return ''.join(lazy_pinyin(text))
Example #20
Source File: corrector.py From pycorrector with Apache License 2.0 | 5 votes |
def _confusion_word_set(self, word):
    """Return known edit-distance-1 variants of *word* with identical pinyin.

    Candidates come from self.known(edit_distance_word(...)); a candidate
    qualifies when its full lazy_pinyin sequence equals that of *word*.
    """
    # Hoisted out of the loop: lazy_pinyin(word) is loop-invariant and was
    # previously recomputed for every candidate.
    target = lazy_pinyin(word)
    candidates = self.known(edit_distance_word(word, self.cn_char_set))
    return {cand for cand in candidates if lazy_pinyin(cand) == target}
Example #21
Source File: parser.py From chinese-rhymer with MIT License | 5 votes |
def word_parser(word: str) -> List[Tuple[str, List[str]]]:
    """Parse *word* into rhyme structures via its lazy-pinyin sequence."""
    return pinyin_parser(lazy_pinyin(word))
Example #22
Source File: pipelines.py From poi_spider with Apache License 2.0 | 4 votes |
def process_item(self, item, spider):
    # Flatten one Baidu-map POI result page into rows, tag each point with
    # whether it lies inside the item's polygon, and append to MySQL.
    global MYSQL_TableName
    poly = item['poly']
    if item['results']:
        results = item['results']
        rows = []
        for result in results:
            row = []
            # Top-level fields copied straight off the result dict.
            keys1 = ['name', 'province', 'city', 'area', 'address', 'telephone', 'uid', 'street_id', 'detail', 'detail_info', 'location']
            for key in keys1:
                row.append(result.get(key))
            # Fields nested under 'detail_info' (may be absent entirely).
            keys2 = ['detail_url', 'tag', 'type']
            for key in keys2:
                detail_info = result.get('detail_info')
                if detail_info is None:
                    row.append(None)
                else:
                    row.append(detail_info.get(key))
            # Request metadata carried on the item itself.
            keys3 = ['search_word', 'region', 'requests_url']
            for key in keys3:
                row.append(item[key])
            rows.append([str(x) for x in row])
            print('获取到的pois:%s' % row[0])
        df = DataFrame(rows, columns=keys1 + keys2 + keys3)
        # region_pinyin = ''.join(lazy_pinyin(item['region']))
        region_pinyin = str(item['region'])
        # Test whether each point lies inside the target polygon, using
        # shapely's polygon.contains. HACK: location was stringified above,
        # so eval() re-parses it — eval on scraped data is unsafe; confirm
        # the location field is trusted or switch to ast.literal_eval.
        try:
            df['isin_region'] = df['location'].apply(
                lambda x: poly.contains(Point(float(eval(x)['lng']), float(eval(x)['lat']))))
        except Exception as e:
            logging.info(e)
            df['isin_region'] = 999  # sentinel: containment test failed
        # Lazily derive the table name from the region on first use.
        if MYSQL_TableName == "":
            MYSQL_TableName = '{region}_bd_map_pois'.format(region=region_pinyin)
        else:
            pass
        df.to_sql(MYSQL_TableName, engine, if_exists='append', index=False)
Example #23
Source File: __init__.py From TWchat with MIT License | 4 votes |
def start():
    # Boot the TWchat terminal UI: register itchat message handlers, log in
    # to WeChat, then hand contacts/chatrooms to the urwid main loop.
    @itchat.msg_register([TEXT, MAP, CARD, NOTE, SHARING, PICTURE, RECORDING, ATTACHMENT, VIDEO, FRIENDS])
    def recive_contact_msg(msg):
        # Direct (non-group) messages of any supported kind.
        contact_name = get_contact_name(msg)
        try:
            wechatMain.recive_message(msg, contact_name)
            notify('TWchat', "new message from: " + contact_name)
        except AttributeError:
            # UI may not be fully constructed yet; drop the message.
            pass

    @itchat.msg_register(TEXT, isGroupChat=True)
    def recive_group_msg(msg):
        # Text messages arriving in group chats.
        group_name = get_group_name(msg)
        try:
            wechatMain.recive_message(msg, group_name)
            notify('TWchat', "new message from: " + group_name)
        except AttributeError:
            pass
        return

    def on_contact_item_click(button, info):
        # info is (id, name): open (or create) the chat and focus it.
        wechatMain.chatListBox.addNewChat(info[0], info[1])
        wechatMain.set_current_chat(info[0], info[1])
        wechatMain.chatListBox.show_chat()
        return

    def on_chat_item_click(button, info):
        wechatMain.set_current_chat(info[0], info[1])
        return

    # urwid color palette: (name, foreground, background).
    palette = [
        ('left', 'black', 'light gray'),
        ('right', 'black', 'dark cyan'),
        ('button', 'dark green', 'black'),
        ('mybg', 'black', 'dark cyan'),
        ('tobg', 'dark blue', 'light gray'),
        ('edit', 'dark cyan', 'black'),
        ('bg', 'dark green', 'black'), ]
    # ASCII-art banner (NOTE(review): the banner's original line breaks were
    # lost when this file was copied; string kept exactly as found).
    print (''' _____ _ _ _____ _ _ ___ _____ |_ _|| | | |/ __ \| | | | / _ \ |_ _| | | | | | || / \/| |_| |/ /_\ \ | | | | | |/\| || | | _ || _ | | | | | \ /\ /| \__/\| | | || | | | | | \_/ \/ \/ \____/\_| |_/\_| |_/ \_/ ''')
    wechatMain = wegui.WechatMain(palette)
    itchat.auto_login(enableCmdQR=2, hotReload=True)
    itchat.run(blockThread=False)
    userInfo = itchat.web_init()['User']
    owner_id = userInfo['UserName']
    owner_name = userInfo['NickName']
    contactlist = itchat.get_friends(update=True)
    chatlist = itchat.get_chatrooms()
    # contactlist = sorted(contactlist,key=lambda x:(x['RemarkPYInitial'],x['PYInitial']))
    # Sort contacts by the pinyin of their display name.
    contactlist = sorted(contactlist, key=lambda x: (lazy_pinyin(get_name(x))))
    wechatMain.initUserInfo(owner_id, owner_name, on_contact_item_click, on_chat_item_click, contactlist, chatlist)
    wechatMain.bind_itchat(itchat)
    wechatMain.createLoop()
Example #24
Source File: entity_discoverer.py From HarvestText with MIT License | 4 votes |
def postprocessing(self, partition, pinyin_tolerance, pop_words_cnt): """应用模式修复一些小问题 :return: partition, pattern_entity2mentions """ # simple postfix like removing parenthesis # “+?” parttern for lazy match so that "新区" can be matched instead of match re_patterns = { "parenthesis": (None, re.compile(r"[\[{\(<#【(《](\S+?)[\]}\)>#】)》]")), "person_postfix": ({"人名"}, re.compile(r"^(\S+?)(哥|姐|先生|女士|小姐|同志|同学|老师|教授)$")), "district": ({"地名"}, re.compile(r"^(\S+?)(国|省|市|区|县|村|镇|古镇|新区|特区|自治区|特别行政区|帝国|王国|共和国)$")), "organization": ({"地名", "机构名"}, re.compile(r"^(\S+?)(厂|公司|有限公司|协会|基金会|俱乐部|队|国家队|集团|联盟)$")), } pattern_entity2mentions = defaultdict(set) if pinyin_tolerance is not None: self.pinyin_mention_dict = defaultdict(set) for entity_type in self.id2word: new_word = entity_type[:entity_type.rfind("_")] self.pinyin_mention_dict[tuple(lazy_pinyin(new_word))].add(new_word) for eid1, entity_type in enumerate(self.id2word): tmp = entity_type.rfind("_") entity, etype = entity_type[:tmp], entity_type[tmp + 1:] # pattern_matching for pname, (allow_types, pat) in re_patterns.items(): if (allow_types is None or (etype in allow_types)) and re.match(pat, entity): trim_entity = re.sub(pat, r"\1", entity) entity2 = trim_entity + "_" + etype if entity2 in self.word2id: eid2 = self.word2id[entity2] partition[eid1] = partition[eid2] if (pname in ["district", "organization"]) and len(trim_entity) > 1: if trim_entity in self.mentions or trim_entity in pop_words_cnt: pattern_entity2mentions[entity_type].add(trim_entity) if trim_entity not in self.mention_count: self.mention_count[trim_entity] = pop_words_cnt[trim_entity] # pinyin recheck if pinyin_tolerance is not None: candidates = self.get_pinyin_correct_candidates(entity, pinyin_tolerance) for cand in candidates: entity2 = cand + "_" + etype if entity2 in self.word2id: eid2 = self.word2id[entity2] partition[eid1] = partition[eid2] return partition, pattern_entity2mentions