Path: blob/master/tensorflow_tts/processor/baker.py
1558 views
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perform preprocessing and raw feature extraction for Baker dataset."""

import os
import re
from dataclasses import dataclass, field
from typing import Dict, List, Union, Tuple, Any, Optional

import librosa
import numpy as np
import soundfile as sf
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
from tensorflow_tts.processor import BaseProcessor
from tensorflow_tts.utils.utils import PROCESSOR_FILE_NAME

# Special symbols: padding, end-of-sequence and pause/prosody markers.
_pad = ["pad"]
_eos = ["eos"]
_pause = ["sil", "#0", "#1", "#2", "#3"]

# Syllable initials; "^" is the placeholder used for syllables without one.
_initials = [
    "^",
    "b",
    "c",
    "ch",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "x",
    "z",
    "zh",
]

_tones = ["1", "2", "3", "4", "5"]

# Syllable finals; every final is combined with every tone in BAKER_SYMBOLS.
_finals = [
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "i",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "ii",
    "iii",
    "in",
    "ing",
    "iong",
    "iou",
    "o",
    "ong",
    "ou",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "uei",
    "uen",
    "ueng",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
]

# Full symbol inventory: pad, pauses, initials, every final+tone pair, eos.
BAKER_SYMBOLS = _pad + _pause + _initials + [i + j for i in _finals for j in _tones] + _eos


# Maps a toneless pinyin syllable to its (initial, final) pair.
PINYIN_DICT = {
    "a": ("^", "a"),
    "ai": ("^", "ai"),
    "an": ("^", "an"),
    "ang": ("^", "ang"),
    "ao": ("^", "ao"),
    "ba": ("b", "a"),
    "bai": ("b", "ai"),
    "ban": ("b", "an"),
    "bang": ("b", "ang"),
    "bao": ("b", "ao"),
    "be": ("b", "e"),
    "bei": ("b", "ei"),
    "ben": ("b", "en"),
    "beng": ("b", "eng"),
    "bi": ("b", "i"),
    "bian": ("b", "ian"),
    "biao": ("b", "iao"),
    "bie": ("b", "ie"),
    "bin": ("b", "in"),
    "bing": ("b", "ing"),
    "bo": ("b", "o"),
    "bu": ("b", "u"),
    "ca": ("c", "a"),
    "cai": ("c", "ai"),
    "can": ("c", "an"),
    "cang": ("c", "ang"),
    "cao": ("c", "ao"),
    "ce": ("c", "e"),
    "cen": ("c", "en"),
    "ceng": ("c", "eng"),
    "cha": ("ch", "a"),
    "chai": ("ch", "ai"),
    "chan": ("ch", "an"),
    "chang": ("ch", "ang"),
    "chao": ("ch", "ao"),
    "che": ("ch", "e"),
    "chen": ("ch", "en"),
    "cheng": ("ch", "eng"),
    "chi": ("ch", "iii"),
    "chong": ("ch", "ong"),
    "chou": ("ch", "ou"),
    "chu": ("ch", "u"),
    "chua": ("ch", "ua"),
    "chuai": ("ch", "uai"),
    "chuan": ("ch", "uan"),
    "chuang": ("ch", "uang"),
    "chui": ("ch", "uei"),
    "chun": ("ch", "uen"),
    "chuo": ("ch", "uo"),
    "ci": ("c", "ii"),
    "cong": ("c", "ong"),
    "cou": ("c", "ou"),
    "cu": ("c", "u"),
    "cuan": ("c", "uan"),
    "cui": ("c", "uei"),
    "cun": ("c", "uen"),
    "cuo": ("c", "uo"),
    "da": ("d", "a"),
    "dai": ("d", "ai"),
    "dan": ("d", "an"),
    "dang": ("d", "ang"),
    "dao": ("d", "ao"),
    "de": ("d", "e"),
    "dei": ("d", "ei"),
    "den": ("d", "en"),
    "deng": ("d", "eng"),
    "di": ("d", "i"),
    "dia": ("d", "ia"),
    "dian": ("d", "ian"),
    "diao": ("d", "iao"),
    "die": ("d", "ie"),
    "ding": ("d", "ing"),
    "diu": ("d", "iou"),
    "dong": ("d", "ong"),
    "dou": ("d", "ou"),
    "du": ("d", "u"),
    "duan": ("d", "uan"),
    "dui": ("d", "uei"),
    "dun": ("d", "uen"),
    "duo": ("d", "uo"),
    "e": ("^", "e"),
    "ei": ("^", "ei"),
    "en": ("^", "en"),
    "ng": ("^", "en"),
    "eng": ("^", "eng"),
    "er": ("^", "er"),
    "fa": ("f", "a"),
    "fan": ("f", "an"),
    "fang": ("f", "ang"),
    "fei": ("f", "ei"),
    "fen": ("f", "en"),
    "feng": ("f", "eng"),
    "fo": ("f", "o"),
    "fou": ("f", "ou"),
    "fu": ("f", "u"),
    "ga": ("g", "a"),
    "gai": ("g", "ai"),
    "gan": ("g", "an"),
    "gang": ("g", "ang"),
    "gao": ("g", "ao"),
    "ge": ("g", "e"),
    "gei": ("g", "ei"),
    "gen": ("g", "en"),
    "geng": ("g", "eng"),
    "gong": ("g", "ong"),
    "gou": ("g", "ou"),
    "gu": ("g", "u"),
    "gua": ("g", "ua"),
    "guai": ("g", "uai"),
    "guan": ("g", "uan"),
    "guang": ("g", "uang"),
    "gui": ("g", "uei"),
    "gun": ("g", "uen"),
    "guo": ("g", "uo"),
    "ha": ("h", "a"),
    "hai": ("h", "ai"),
    "han": ("h", "an"),
    "hang": ("h", "ang"),
    "hao": ("h", "ao"),
    "he": ("h", "e"),
    "hei": ("h", "ei"),
    "hen": ("h", "en"),
    "heng": ("h", "eng"),
    "hong": ("h", "ong"),
    "hou": ("h", "ou"),
    "hu": ("h", "u"),
    "hua": ("h", "ua"),
    "huai": ("h", "uai"),
    "huan": ("h", "uan"),
    "huang": ("h", "uang"),
    "hui": ("h", "uei"),
    "hun": ("h", "uen"),
    "huo": ("h", "uo"),
    "ji": ("j", "i"),
    "jia": ("j", "ia"),
    "jian": ("j", "ian"),
    "jiang": ("j", "iang"),
    "jiao": ("j", "iao"),
    "jie": ("j", "ie"),
    "jin": ("j", "in"),
    "jing": ("j", "ing"),
    "jiong": ("j", "iong"),
    "jiu": ("j", "iou"),
    "ju": ("j", "v"),
    "juan": ("j", "van"),
    "jue": ("j", "ve"),
    "jun": ("j", "vn"),
    "ka": ("k", "a"),
    "kai": ("k", "ai"),
    "kan": ("k", "an"),
    "kang": ("k", "ang"),
    "kao": ("k", "ao"),
    "ke": ("k", "e"),
    "kei": ("k", "ei"),
    "ken": ("k", "en"),
    "keng": ("k", "eng"),
    "kong": ("k", "ong"),
    "kou": ("k", "ou"),
    "ku": ("k", "u"),
    "kua": ("k", "ua"),
    "kuai": ("k", "uai"),
    "kuan": ("k", "uan"),
    "kuang": ("k", "uang"),
    "kui": ("k", "uei"),
    "kun": ("k", "uen"),
    "kuo": ("k", "uo"),
    "la": ("l", "a"),
    "lai": ("l", "ai"),
    "lan": ("l", "an"),
    "lang": ("l", "ang"),
    "lao": ("l", "ao"),
    "le": ("l", "e"),
    "lei": ("l", "ei"),
    "leng": ("l", "eng"),
    "li": ("l", "i"),
    "lia": ("l", "ia"),
    "lian": ("l", "ian"),
    "liang": ("l", "iang"),
    "liao": ("l", "iao"),
    "lie": ("l", "ie"),
    "lin": ("l", "in"),
    "ling": ("l", "ing"),
    "liu": ("l", "iou"),
    "lo": ("l", "o"),
    "long": ("l", "ong"),
    "lou": ("l", "ou"),
    "lu": ("l", "u"),
    "lv": ("l", "v"),
    "luan": ("l", "uan"),
    "lve": ("l", "ve"),
    "lue": ("l", "ve"),
    "lun": ("l", "uen"),
    "luo": ("l", "uo"),
    "ma": ("m", "a"),
    "mai": ("m", "ai"),
    "man": ("m", "an"),
    "mang": ("m", "ang"),
    "mao": ("m", "ao"),
    "me": ("m", "e"),
    "mei": ("m", "ei"),
    "men": ("m", "en"),
    "meng": ("m", "eng"),
    "mi": ("m", "i"),
    "mian": ("m", "ian"),
    "miao": ("m", "iao"),
    "mie": ("m", "ie"),
    "min": ("m", "in"),
    "ming": ("m", "ing"),
    "miu": ("m", "iou"),
    "mo": ("m", "o"),
    "mou": ("m", "ou"),
    "mu": ("m", "u"),
    "na": ("n", "a"),
    "nai": ("n", "ai"),
    "nan": ("n", "an"),
    "nang": ("n", "ang"),
    "nao": ("n", "ao"),
    "ne": ("n", "e"),
    "nei": ("n", "ei"),
    "nen": ("n", "en"),
    "neng": ("n", "eng"),
    "ni": ("n", "i"),
    "nia": ("n", "ia"),
    "nian": ("n", "ian"),
    "niang": ("n", "iang"),
    "niao": ("n", "iao"),
    "nie": ("n", "ie"),
    "nin": ("n", "in"),
    "ning": ("n", "ing"),
    "niu": ("n", "iou"),
    "nong": ("n", "ong"),
    "nou": ("n", "ou"),
    "nu": ("n", "u"),
    "nv": ("n", "v"),
    "nuan": ("n", "uan"),
    "nve": ("n", "ve"),
    "nue": ("n", "ve"),
    "nuo": ("n", "uo"),
    "o": ("^", "o"),
    "ou": ("^", "ou"),
    "pa": ("p", "a"),
    "pai": ("p", "ai"),
    "pan": ("p", "an"),
    "pang": ("p", "ang"),
    "pao": ("p", "ao"),
    "pe": ("p", "e"),
    "pei": ("p", "ei"),
    "pen": ("p", "en"),
    "peng": ("p", "eng"),
    "pi": ("p", "i"),
    "pian": ("p", "ian"),
    "piao": ("p", "iao"),
    "pie": ("p", "ie"),
    "pin": ("p", "in"),
    "ping": ("p", "ing"),
    "po": ("p", "o"),
    "pou": ("p", "ou"),
    "pu": ("p", "u"),
    "qi": ("q", "i"),
    "qia": ("q", "ia"),
    "qian": ("q", "ian"),
    "qiang": ("q", "iang"),
    "qiao": ("q", "iao"),
    "qie": ("q", "ie"),
    "qin": ("q", "in"),
    "qing": ("q", "ing"),
    "qiong": ("q", "iong"),
    "qiu": ("q", "iou"),
    "qu": ("q", "v"),
    "quan": ("q", "van"),
    "que": ("q", "ve"),
    "qun": ("q", "vn"),
    "ran": ("r", "an"),
    "rang": ("r", "ang"),
    "rao": ("r", "ao"),
    "re": ("r", "e"),
    "ren": ("r", "en"),
    "reng": ("r", "eng"),
    "ri": ("r", "iii"),
    "rong": ("r", "ong"),
    "rou": ("r", "ou"),
    "ru": ("r", "u"),
    "rua": ("r", "ua"),
    "ruan": ("r", "uan"),
    "rui": ("r", "uei"),
    "run": ("r", "uen"),
    "ruo": ("r", "uo"),
    "sa": ("s", "a"),
    "sai": ("s", "ai"),
    "san": ("s", "an"),
    "sang": ("s", "ang"),
    "sao": ("s", "ao"),
    "se": ("s", "e"),
    "sen": ("s", "en"),
    "seng": ("s", "eng"),
    "sha": ("sh", "a"),
    "shai": ("sh", "ai"),
    "shan": ("sh", "an"),
    "shang": ("sh", "ang"),
    "shao": ("sh", "ao"),
    "she": ("sh", "e"),
    "shei": ("sh", "ei"),
    "shen": ("sh", "en"),
    "sheng": ("sh", "eng"),
    "shi": ("sh", "iii"),
    "shou": ("sh", "ou"),
    "shu": ("sh", "u"),
    "shua": ("sh", "ua"),
    "shuai": ("sh", "uai"),
    "shuan": ("sh", "uan"),
    "shuang": ("sh", "uang"),
    "shui": ("sh", "uei"),
    "shun": ("sh", "uen"),
    "shuo": ("sh", "uo"),
    "si": ("s", "ii"),
    "song": ("s", "ong"),
    "sou": ("s", "ou"),
    "su": ("s", "u"),
    "suan": ("s", "uan"),
    "sui": ("s", "uei"),
    "sun": ("s", "uen"),
    "suo": ("s", "uo"),
    "ta": ("t", "a"),
    "tai": ("t", "ai"),
    "tan": ("t", "an"),
    "tang": ("t", "ang"),
    "tao": ("t", "ao"),
    "te": ("t", "e"),
    "tei": ("t", "ei"),
    "teng": ("t", "eng"),
    "ti": ("t", "i"),
    "tian": ("t", "ian"),
    "tiao": ("t", "iao"),
    "tie": ("t", "ie"),
    "ting": ("t", "ing"),
    "tong": ("t", "ong"),
    "tou": ("t", "ou"),
    "tu": ("t", "u"),
    "tuan": ("t", "uan"),
    "tui": ("t", "uei"),
    "tun": ("t", "uen"),
    "tuo": ("t", "uo"),
    "wa": ("^", "ua"),
    "wai": ("^", "uai"),
    "wan": ("^", "uan"),
    "wang": ("^", "uang"),
    "wei": ("^", "uei"),
    "wen": ("^", "uen"),
    "weng": ("^", "ueng"),
    "wo": ("^", "uo"),
    "wu": ("^", "u"),
    "xi": ("x", "i"),
    "xia": ("x", "ia"),
    "xian": ("x", "ian"),
    "xiang": ("x", "iang"),
    "xiao": ("x", "iao"),
    "xie": ("x", "ie"),
    "xin": ("x", "in"),
    "xing": ("x", "ing"),
    "xiong": ("x", "iong"),
    "xiu": ("x", "iou"),
    "xu": ("x", "v"),
    "xuan": ("x", "van"),
    "xue": ("x", "ve"),
    "xun": ("x", "vn"),
    "ya": ("^", "ia"),
    "yan": ("^", "ian"),
    "yang": ("^", "iang"),
    "yao": ("^", "iao"),
    "ye": ("^", "ie"),
    "yi": ("^", "i"),
    "yin": ("^", "in"),
    "ying": ("^", "ing"),
    "yo": ("^", "iou"),
    "yong": ("^", "iong"),
    "you": ("^", "iou"),
    "yu": ("^", "v"),
    "yuan": ("^", "van"),
    "yue": ("^", "ve"),
    "yun": ("^", "vn"),
    "za": ("z", "a"),
    "zai": ("z", "ai"),
    "zan": ("z", "an"),
    "zang": ("z", "ang"),
    "zao": ("z", "ao"),
    "ze": ("z", "e"),
    "zei": ("z", "ei"),
    "zen": ("z", "en"),
    "zeng": ("z", "eng"),
    "zha": ("zh", "a"),
    "zhai": ("zh", "ai"),
    "zhan": ("zh", "an"),
    "zhang": ("zh", "ang"),
    "zhao": ("zh", "ao"),
    "zhe": ("zh", "e"),
    "zhei": ("zh", "ei"),
    "zhen": ("zh", "en"),
    "zheng": ("zh", "eng"),
    "zhi": ("zh", "iii"),
    "zhong": ("zh", "ong"),
    "zhou": ("zh", "ou"),
    "zhu": ("zh", "u"),
    "zhua": ("zh", "ua"),
    "zhuai": ("zh", "uai"),
    "zhuan": ("zh", "uan"),
    "zhuang": ("zh", "uang"),
    "zhui": ("zh", "uei"),
    "zhun": ("zh", "uen"),
    "zhuo": ("zh", "uo"),
    "zi": ("z", "ii"),
    "zong": ("z", "ong"),
    "zou": ("z", "ou"),
    "zu": ("z", "u"),
    "zuan": ("z", "uan"),
    "zui": ("z", "uei"),
    "zun": ("z", "uen"),
    "zuo": ("z", "uo"),
}


# Matches a single character in the range U+4E00..U+9FA5.
zh_pattern = re.compile("[\u4e00-\u9fa5]")


def is_zh(word):
    """Return True if ``word`` contains at least one char matched by ``zh_pattern``."""
    # The pattern is only read here, so no ``global`` declaration is needed.
    return zh_pattern.search(word) is not None


class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin converter combining the default converter with the tone-5 mixin."""


@dataclass
class BakerProcessor(BaseProcessor):
    """Processor turning Baker transcripts and audio into training samples."""

    pinyin_dict: Dict[str, Tuple[str, str]] = field(default_factory=lambda: PINYIN_DICT)
    cleaner_names: Optional[str] = None
    target_rate: int = 24000
    speaker_name: str = "baker"

    def __post_init__(self):
        """Run the base-class initialisation, then build the pinyin parser."""
        super().__post_init__()
        self.pinyin_parser = self.get_pinyin_parser()

    def setup_eos_token(self):
        """Return the end-of-sequence symbol."""
        return _eos[0]

    def save_pretrained(self, saved_path):
        """Create ``saved_path`` if needed and save the mapper with ``pinyin_dict``."""
        os.makedirs(saved_path, exist_ok=True)
        self._save_mapper(
            os.path.join(saved_path, PROCESSOR_FILE_NAME),
            {"pinyin_dict": self.pinyin_dict},
        )

    def create_items(self):
        """Populate ``self.items`` from the prosody labeling file in ``data_dir``.

        The labeling file is read as pairs of lines: ``<utt_id> <chars>``
        followed by the space-separated pinyin for that utterance. Each kept
        utterance becomes ``[phoneme_string, wav_path, utt_id, speaker_name]``.
        Nothing is done when ``data_dir`` is not set.
        """
        if not self.data_dir:
            return

        items = []
        label_path = os.path.join(self.data_dir, "ProsodyLabeling/000001-010000.txt")
        with open(label_path, encoding="utf-8") as ttf:
            lines = ttf.readlines()

        for idx in range(0, len(lines), 2):
            utt_id, chn_char = lines[idx].strip().split()
            pinyin = lines[idx + 1].strip().split()
            # Entries carrying these markers are excluded from the item list.
            if "IY1" in pinyin or "B" in chn_char:
                print(f"Skip this: {utt_id} {chn_char} {pinyin}")
                continue
            phonemes = self.get_phoneme_from_char_and_pinyin(chn_char, pinyin)
            wav_path = os.path.join(self.data_dir, "Wave", "%s.wav" % utt_id)
            items.append([" ".join(phonemes), wav_path, utt_id, self.speaker_name])
        self.items = items

    def get_phoneme_from_char_and_pinyin(self, chn_char, pinyin):
        """Align characters with pinyin syllables and emit the phoneme sequence.

        Args:
            chn_char: text that may contain ``#N`` prosody marks.
            pinyin: list of pinyin syllables, each ending with its tone digit.

        Returns:
            List of symbols starting and ending with ``"sil"``; every syllable
            contributes its initial and its final+tone, with ``"#0"`` inserted
            between syllables that are not already followed by a ``#`` mark.

        Raises:
            AssertionError: if an unknown syllable is not a valid erhua form,
                or if not every entry of ``pinyin`` was consumed.
        """
        # we do not need #4, use sil to replace it
        chn_char = chn_char.replace("#4", "")
        char_len = len(chn_char)
        i, j = 0, 0
        result = ["sil"]
        while i < char_len:
            cur_char = chn_char[i]
            if is_zh(cur_char):
                syllable = pinyin[j]
                tone = syllable[-1]
                if syllable[:-1] not in self.pinyin_dict:
                    # Erhua: one pinyin entry ("...r" + tone) covers this
                    # character plus the following "儿". Slicing keeps the
                    # check safe when this is the final character.
                    assert (
                        chn_char[i + 1 : i + 2] == "儿"
                    ), f"unknown pinyin {syllable!r} not followed by '儿'"
                    assert (
                        syllable[-2] == "r"
                    ), f"unknown pinyin {syllable!r} is not an erhua syllable"
                    a1, a2 = self.pinyin_dict[syllable[:-2]]
                    result += [a1, a2 + tone, "er5"]
                    consumed = 2
                else:
                    a1, a2 = self.pinyin_dict[syllable[:-1]]
                    result += [a1, a2 + tone]
                    consumed = 1

                # Insert the default "#0" boundary unless an explicit mark follows.
                nxt = i + consumed
                if nxt < char_len and chn_char[nxt] != "#":
                    result.append("#0")

                i += consumed
                j += 1
            elif cur_char == "#":
                # A prosody mark is "#" plus one digit; copy both characters.
                result.append(chn_char[i : i + 2])
                i += 2
            else:
                # ignore the unknown char and punctuation
                i += 1
        if result[-1] == "#0":
            result = result[:-1]
        result.append("sil")
        assert j == len(pinyin), f"consumed {j} of {len(pinyin)} pinyin syllables"
        return result

    def get_one_sample(self, item):
        """Load one item's audio and convert its text to ids.

        Args:
            item: ``[text, wav_file, utt_id, speaker_name]`` as built by
                ``create_items``.

        Returns:
            Dict with ``raw_text``, ``text_ids``, ``audio``, ``utt_id``,
            ``speaker_name`` and ``rate``; ``None`` if text conversion fails.
        """
        text, wav_file, utt_id, speaker_name = item

        # normalize audio signal to be [-1, 1], soundfile already norm.
        audio, rate = sf.read(wav_file)
        audio = audio.astype(np.float32)
        if rate != self.target_rate:
            assert rate > self.target_rate, "upsampling is not supported"
            # Keyword arguments: librosa >= 0.10 made orig_sr/target_sr
            # keyword-only, and older releases accept the same names.
            audio = librosa.resample(
                audio, orig_sr=rate, target_sr=self.target_rate
            )

        # convert text to ids
        try:
            text_ids = np.asarray(self.text_to_sequence(text), np.int32)
        except Exception as e:
            # Best effort: report the failing utterance and skip it.
            print(e, utt_id, text)
            return None

        sample = {
            "raw_text": text,
            "text_ids": text_ids,
            "audio": audio,
            "utt_id": str(int(utt_id)),
            "speaker_name": speaker_name,
            "rate": self.target_rate,
        }

        return sample

    def get_pinyin_parser(self):
        """Return the ``pinyin`` method of a ``Pinyin`` built on ``MyConverter``."""
        my_pinyin = Pinyin(MyConverter())
        pinyin = my_pinyin.pinyin
        return pinyin

    def text_to_sequence(self, text, inference=False):
        """Convert text to a list of symbol ids terminated by the eos id.

        Args:
            text: space-separated phoneme symbols, or raw text when
                ``inference`` is True.
            inference: when True, ``text`` is first run through the pinyin
                parser and converted to phonemes.

        Returns:
            List of integer ids with ``self.eos_id`` appended.

        Raises:
            KeyError: if a symbol is missing from ``self.symbol_to_id``.
        """
        if inference:
            pinyin = self.pinyin_parser(text, style=Style.TONE3, errors="ignore")
            new_pinyin = []
            for x in pinyin:
                x = "".join(x)
                # Entries containing "#" are dropped from the pinyin list.
                if "#" not in x:
                    new_pinyin.append(x)
            phonemes = self.get_phoneme_from_char_and_pinyin(text, new_pinyin)
            text = " ".join(phonemes)
            print(f"phoneme seq: {text}")

        sequence = [self.symbol_to_id[symbol] for symbol in text.split()]

        # add eos tokens
        sequence += [self.eos_id]
        return sequence