Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
TensorSpeech
GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/tensorflow_tts/processor/baker.py
1558 views
1
# -*- coding: utf-8 -*-
2
# Copyright 2020 TensorFlowTTS Team.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
# http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
"""Perform preprocessing and raw feature extraction for Baker dataset."""
16
17
import os
18
import re
19
from typing import Dict, List, Union, Tuple, Any
20
21
import librosa
22
import numpy as np
23
import soundfile as sf
24
from dataclasses import dataclass, field
25
from pypinyin import Style
26
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
27
from pypinyin.converter import DefaultConverter
28
from pypinyin.core import Pinyin
29
from tensorflow_tts.processor import BaseProcessor
30
from tensorflow_tts.utils.utils import PROCESSOR_FILE_NAME
31
32
_pad = ["pad"]
33
_eos = ["eos"]
34
_pause = ["sil", "#0", "#1", "#2", "#3"]
35
36
_initials = [
37
"^",
38
"b",
39
"c",
40
"ch",
41
"d",
42
"f",
43
"g",
44
"h",
45
"j",
46
"k",
47
"l",
48
"m",
49
"n",
50
"p",
51
"q",
52
"r",
53
"s",
54
"sh",
55
"t",
56
"x",
57
"z",
58
"zh",
59
]
60
61
_tones = ["1", "2", "3", "4", "5"]
62
63
_finals = [
64
"a",
65
"ai",
66
"an",
67
"ang",
68
"ao",
69
"e",
70
"ei",
71
"en",
72
"eng",
73
"er",
74
"i",
75
"ia",
76
"ian",
77
"iang",
78
"iao",
79
"ie",
80
"ii",
81
"iii",
82
"in",
83
"ing",
84
"iong",
85
"iou",
86
"o",
87
"ong",
88
"ou",
89
"u",
90
"ua",
91
"uai",
92
"uan",
93
"uang",
94
"uei",
95
"uen",
96
"ueng",
97
"uo",
98
"v",
99
"van",
100
"ve",
101
"vn",
102
]
103
104
BAKER_SYMBOLS = _pad + _pause + _initials + [i + j for i in _finals for j in _tones] + _eos
105
106
107
# Maps a toneless pinyin syllable to its (initial, final) pair.
# - Syllables without an initial consonant use the placeholder initial "^".
# - Finals use a normalized spelling: "ui" -> "uei", "un" -> "uen",
#   "iu" -> "iou", the "u" after j/q/x/y -> "v", the "i" of zhi/chi/shi/ri ->
#   "iii" and the "i" of zi/ci/si -> "ii".
PINYIN_DICT = {
    "a": ("^", "a"),
    "ai": ("^", "ai"),
    "an": ("^", "an"),
    "ang": ("^", "ang"),
    "ao": ("^", "ao"),
    "ba": ("b", "a"),
    "bai": ("b", "ai"),
    "ban": ("b", "an"),
    "bang": ("b", "ang"),
    "bao": ("b", "ao"),
    "be": ("b", "e"),
    "bei": ("b", "ei"),
    "ben": ("b", "en"),
    "beng": ("b", "eng"),
    "bi": ("b", "i"),
    "bian": ("b", "ian"),
    "biao": ("b", "iao"),
    "bie": ("b", "ie"),
    "bin": ("b", "in"),
    "bing": ("b", "ing"),
    "bo": ("b", "o"),
    "bu": ("b", "u"),
    "ca": ("c", "a"),
    "cai": ("c", "ai"),
    "can": ("c", "an"),
    "cang": ("c", "ang"),
    "cao": ("c", "ao"),
    "ce": ("c", "e"),
    "cen": ("c", "en"),
    "ceng": ("c", "eng"),
    "cha": ("ch", "a"),
    "chai": ("ch", "ai"),
    "chan": ("ch", "an"),
    "chang": ("ch", "ang"),
    "chao": ("ch", "ao"),
    "che": ("ch", "e"),
    "chen": ("ch", "en"),
    "cheng": ("ch", "eng"),
    "chi": ("ch", "iii"),
    "chong": ("ch", "ong"),
    "chou": ("ch", "ou"),
    "chu": ("ch", "u"),
    "chua": ("ch", "ua"),
    "chuai": ("ch", "uai"),
    "chuan": ("ch", "uan"),
    "chuang": ("ch", "uang"),
    "chui": ("ch", "uei"),
    "chun": ("ch", "uen"),
    "chuo": ("ch", "uo"),
    "ci": ("c", "ii"),
    "cong": ("c", "ong"),
    "cou": ("c", "ou"),
    "cu": ("c", "u"),
    "cuan": ("c", "uan"),
    "cui": ("c", "uei"),
    "cun": ("c", "uen"),
    "cuo": ("c", "uo"),
    "da": ("d", "a"),
    "dai": ("d", "ai"),
    "dan": ("d", "an"),
    "dang": ("d", "ang"),
    "dao": ("d", "ao"),
    "de": ("d", "e"),
    "dei": ("d", "ei"),
    "den": ("d", "en"),
    "deng": ("d", "eng"),
    "di": ("d", "i"),
    "dia": ("d", "ia"),
    "dian": ("d", "ian"),
    "diao": ("d", "iao"),
    "die": ("d", "ie"),
    "ding": ("d", "ing"),
    "diu": ("d", "iou"),
    "dong": ("d", "ong"),
    "dou": ("d", "ou"),
    "du": ("d", "u"),
    "duan": ("d", "uan"),
    "dui": ("d", "uei"),
    "dun": ("d", "uen"),
    "duo": ("d", "uo"),
    "e": ("^", "e"),
    "ei": ("^", "ei"),
    "en": ("^", "en"),
    "ng": ("^", "en"),
    "eng": ("^", "eng"),
    "er": ("^", "er"),
    "fa": ("f", "a"),
    "fan": ("f", "an"),
    "fang": ("f", "ang"),
    "fei": ("f", "ei"),
    "fen": ("f", "en"),
    "feng": ("f", "eng"),
    "fo": ("f", "o"),
    "fou": ("f", "ou"),
    "fu": ("f", "u"),
    "ga": ("g", "a"),
    "gai": ("g", "ai"),
    "gan": ("g", "an"),
    "gang": ("g", "ang"),
    "gao": ("g", "ao"),
    "ge": ("g", "e"),
    "gei": ("g", "ei"),
    "gen": ("g", "en"),
    "geng": ("g", "eng"),
    "gong": ("g", "ong"),
    "gou": ("g", "ou"),
    "gu": ("g", "u"),
    "gua": ("g", "ua"),
    "guai": ("g", "uai"),
    "guan": ("g", "uan"),
    "guang": ("g", "uang"),
    "gui": ("g", "uei"),
    "gun": ("g", "uen"),
    "guo": ("g", "uo"),
    "ha": ("h", "a"),
    "hai": ("h", "ai"),
    "han": ("h", "an"),
    "hang": ("h", "ang"),
    "hao": ("h", "ao"),
    "he": ("h", "e"),
    "hei": ("h", "ei"),
    "hen": ("h", "en"),
    "heng": ("h", "eng"),
    "hong": ("h", "ong"),
    "hou": ("h", "ou"),
    "hu": ("h", "u"),
    "hua": ("h", "ua"),
    "huai": ("h", "uai"),
    "huan": ("h", "uan"),
    "huang": ("h", "uang"),
    "hui": ("h", "uei"),
    "hun": ("h", "uen"),
    "huo": ("h", "uo"),
    "ji": ("j", "i"),
    "jia": ("j", "ia"),
    "jian": ("j", "ian"),
    "jiang": ("j", "iang"),
    "jiao": ("j", "iao"),
    "jie": ("j", "ie"),
    "jin": ("j", "in"),
    "jing": ("j", "ing"),
    "jiong": ("j", "iong"),
    "jiu": ("j", "iou"),
    "ju": ("j", "v"),
    "juan": ("j", "van"),
    "jue": ("j", "ve"),
    "jun": ("j", "vn"),
    "ka": ("k", "a"),
    "kai": ("k", "ai"),
    "kan": ("k", "an"),
    "kang": ("k", "ang"),
    "kao": ("k", "ao"),
    "ke": ("k", "e"),
    "kei": ("k", "ei"),
    "ken": ("k", "en"),
    "keng": ("k", "eng"),
    "kong": ("k", "ong"),
    "kou": ("k", "ou"),
    "ku": ("k", "u"),
    "kua": ("k", "ua"),
    "kuai": ("k", "uai"),
    "kuan": ("k", "uan"),
    "kuang": ("k", "uang"),
    "kui": ("k", "uei"),
    "kun": ("k", "uen"),
    "kuo": ("k", "uo"),
    "la": ("l", "a"),
    "lai": ("l", "ai"),
    "lan": ("l", "an"),
    "lang": ("l", "ang"),
    "lao": ("l", "ao"),
    "le": ("l", "e"),
    "lei": ("l", "ei"),
    "leng": ("l", "eng"),
    "li": ("l", "i"),
    "lia": ("l", "ia"),
    "lian": ("l", "ian"),
    "liang": ("l", "iang"),
    "liao": ("l", "iao"),
    "lie": ("l", "ie"),
    "lin": ("l", "in"),
    "ling": ("l", "ing"),
    "liu": ("l", "iou"),
    "lo": ("l", "o"),
    "long": ("l", "ong"),
    "lou": ("l", "ou"),
    "lu": ("l", "u"),
    "lv": ("l", "v"),
    "luan": ("l", "uan"),
    "lve": ("l", "ve"),
    "lue": ("l", "ve"),
    "lun": ("l", "uen"),
    "luo": ("l", "uo"),
    "ma": ("m", "a"),
    "mai": ("m", "ai"),
    "man": ("m", "an"),
    "mang": ("m", "ang"),
    "mao": ("m", "ao"),
    "me": ("m", "e"),
    "mei": ("m", "ei"),
    "men": ("m", "en"),
    "meng": ("m", "eng"),
    "mi": ("m", "i"),
    "mian": ("m", "ian"),
    "miao": ("m", "iao"),
    "mie": ("m", "ie"),
    "min": ("m", "in"),
    "ming": ("m", "ing"),
    "miu": ("m", "iou"),
    "mo": ("m", "o"),
    "mou": ("m", "ou"),
    "mu": ("m", "u"),
    "na": ("n", "a"),
    "nai": ("n", "ai"),
    "nan": ("n", "an"),
    "nang": ("n", "ang"),
    "nao": ("n", "ao"),
    "ne": ("n", "e"),
    "nei": ("n", "ei"),
    "nen": ("n", "en"),
    "neng": ("n", "eng"),
    "ni": ("n", "i"),
    "nia": ("n", "ia"),
    "nian": ("n", "ian"),
    "niang": ("n", "iang"),
    "niao": ("n", "iao"),
    "nie": ("n", "ie"),
    "nin": ("n", "in"),
    "ning": ("n", "ing"),
    "niu": ("n", "iou"),
    "nong": ("n", "ong"),
    "nou": ("n", "ou"),
    "nu": ("n", "u"),
    "nv": ("n", "v"),
    "nuan": ("n", "uan"),
    "nve": ("n", "ve"),
    "nue": ("n", "ve"),
    "nuo": ("n", "uo"),
    "o": ("^", "o"),
    "ou": ("^", "ou"),
    "pa": ("p", "a"),
    "pai": ("p", "ai"),
    "pan": ("p", "an"),
    "pang": ("p", "ang"),
    "pao": ("p", "ao"),
    "pe": ("p", "e"),
    "pei": ("p", "ei"),
    "pen": ("p", "en"),
    "peng": ("p", "eng"),
    "pi": ("p", "i"),
    "pian": ("p", "ian"),
    "piao": ("p", "iao"),
    "pie": ("p", "ie"),
    "pin": ("p", "in"),
    "ping": ("p", "ing"),
    "po": ("p", "o"),
    "pou": ("p", "ou"),
    "pu": ("p", "u"),
    "qi": ("q", "i"),
    "qia": ("q", "ia"),
    "qian": ("q", "ian"),
    "qiang": ("q", "iang"),
    "qiao": ("q", "iao"),
    "qie": ("q", "ie"),
    "qin": ("q", "in"),
    "qing": ("q", "ing"),
    "qiong": ("q", "iong"),
    "qiu": ("q", "iou"),
    "qu": ("q", "v"),
    "quan": ("q", "van"),
    "que": ("q", "ve"),
    "qun": ("q", "vn"),
    "ran": ("r", "an"),
    "rang": ("r", "ang"),
    "rao": ("r", "ao"),
    "re": ("r", "e"),
    "ren": ("r", "en"),
    "reng": ("r", "eng"),
    "ri": ("r", "iii"),
    "rong": ("r", "ong"),
    "rou": ("r", "ou"),
    "ru": ("r", "u"),
    "rua": ("r", "ua"),
    "ruan": ("r", "uan"),
    "rui": ("r", "uei"),
    "run": ("r", "uen"),
    "ruo": ("r", "uo"),
    "sa": ("s", "a"),
    "sai": ("s", "ai"),
    "san": ("s", "an"),
    "sang": ("s", "ang"),
    "sao": ("s", "ao"),
    "se": ("s", "e"),
    "sen": ("s", "en"),
    "seng": ("s", "eng"),
    "sha": ("sh", "a"),
    "shai": ("sh", "ai"),
    "shan": ("sh", "an"),
    "shang": ("sh", "ang"),
    "shao": ("sh", "ao"),
    "she": ("sh", "e"),
    "shei": ("sh", "ei"),
    "shen": ("sh", "en"),
    "sheng": ("sh", "eng"),
    "shi": ("sh", "iii"),
    "shou": ("sh", "ou"),
    "shu": ("sh", "u"),
    "shua": ("sh", "ua"),
    "shuai": ("sh", "uai"),
    "shuan": ("sh", "uan"),
    "shuang": ("sh", "uang"),
    "shui": ("sh", "uei"),
    "shun": ("sh", "uen"),
    "shuo": ("sh", "uo"),
    "si": ("s", "ii"),
    "song": ("s", "ong"),
    "sou": ("s", "ou"),
    "su": ("s", "u"),
    "suan": ("s", "uan"),
    "sui": ("s", "uei"),
    "sun": ("s", "uen"),
    "suo": ("s", "uo"),
    "ta": ("t", "a"),
    "tai": ("t", "ai"),
    "tan": ("t", "an"),
    "tang": ("t", "ang"),
    "tao": ("t", "ao"),
    "te": ("t", "e"),
    "tei": ("t", "ei"),
    "teng": ("t", "eng"),
    "ti": ("t", "i"),
    "tian": ("t", "ian"),
    "tiao": ("t", "iao"),
    "tie": ("t", "ie"),
    "ting": ("t", "ing"),
    "tong": ("t", "ong"),
    "tou": ("t", "ou"),
    "tu": ("t", "u"),
    "tuan": ("t", "uan"),
    "tui": ("t", "uei"),
    "tun": ("t", "uen"),
    "tuo": ("t", "uo"),
    "wa": ("^", "ua"),
    "wai": ("^", "uai"),
    "wan": ("^", "uan"),
    "wang": ("^", "uang"),
    "wei": ("^", "uei"),
    "wen": ("^", "uen"),
    "weng": ("^", "ueng"),
    "wo": ("^", "uo"),
    "wu": ("^", "u"),
    "xi": ("x", "i"),
    "xia": ("x", "ia"),
    "xian": ("x", "ian"),
    "xiang": ("x", "iang"),
    "xiao": ("x", "iao"),
    "xie": ("x", "ie"),
    "xin": ("x", "in"),
    "xing": ("x", "ing"),
    "xiong": ("x", "iong"),
    "xiu": ("x", "iou"),
    "xu": ("x", "v"),
    "xuan": ("x", "van"),
    "xue": ("x", "ve"),
    "xun": ("x", "vn"),
    "ya": ("^", "ia"),
    "yan": ("^", "ian"),
    "yang": ("^", "iang"),
    "yao": ("^", "iao"),
    "ye": ("^", "ie"),
    "yi": ("^", "i"),
    "yin": ("^", "in"),
    "ying": ("^", "ing"),
    "yo": ("^", "iou"),
    "yong": ("^", "iong"),
    "you": ("^", "iou"),
    "yu": ("^", "v"),
    "yuan": ("^", "van"),
    "yue": ("^", "ve"),
    "yun": ("^", "vn"),
    "za": ("z", "a"),
    "zai": ("z", "ai"),
    "zan": ("z", "an"),
    "zang": ("z", "ang"),
    "zao": ("z", "ao"),
    "ze": ("z", "e"),
    "zei": ("z", "ei"),
    "zen": ("z", "en"),
    "zeng": ("z", "eng"),
    "zha": ("zh", "a"),
    "zhai": ("zh", "ai"),
    "zhan": ("zh", "an"),
    "zhang": ("zh", "ang"),
    "zhao": ("zh", "ao"),
    "zhe": ("zh", "e"),
    "zhei": ("zh", "ei"),
    "zhen": ("zh", "en"),
    "zheng": ("zh", "eng"),
    "zhi": ("zh", "iii"),
    "zhong": ("zh", "ong"),
    "zhou": ("zh", "ou"),
    "zhu": ("zh", "u"),
    "zhua": ("zh", "ua"),
    "zhuai": ("zh", "uai"),
    "zhuan": ("zh", "uan"),
    "zhuang": ("zh", "uang"),
    "zhui": ("zh", "uei"),
    "zhun": ("zh", "uen"),
    "zhuo": ("zh", "uo"),
    "zi": ("z", "ii"),
    "zong": ("z", "ong"),
    "zou": ("z", "ou"),
    "zu": ("z", "u"),
    "zuan": ("z", "uan"),
    "zui": ("z", "uei"),
    "zun": ("z", "uen"),
    "zuo": ("z", "uo"),
}
526
527
528
# Matches a single character in the range U+4E00..U+9FA5.
zh_pattern = re.compile("[\u4e00-\u9fa5]")


def is_zh(word):
    """Return True if ``word`` contains at least one character in U+4E00..U+9FA5.

    Args:
        word: String to inspect; it may be a single character or longer text.

    Returns:
        bool: True when the pattern matches anywhere in ``word``, else False.
    """
    # No ``global`` statement is needed: the module-level pattern is only read.
    return zh_pattern.search(word) is not None
535
536
537
class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin converter combining ``NeutralToneWith5Mixin`` with ``DefaultConverter``.

    It adds no behavior of its own. Presumably the mixin makes neutral-tone
    syllables carry the tone digit "5", matching the "5" entry in ``_tones``
    -- TODO confirm against the pypinyin documentation.
    """
539
540
541
@dataclass
class BakerProcessor(BaseProcessor):
    """Processor for the Baker dataset.

    Builds (phoneme text, wav path, utterance id, speaker) items from the
    dataset's label file, loads audio, and converts phoneme text (or raw
    Chinese text at inference time) into symbol-id sequences.
    """

    # Toneless pinyin syllable -> (initial, final). Each instance gets its own
    # copy so that mutating it cannot leak into the module-level PINYIN_DICT.
    pinyin_dict: Dict[str, Tuple[str, str]] = field(
        default_factory=lambda: dict(PINYIN_DICT)
    )
    cleaner_names: Union[str, None] = None
    # Sample rate (Hz) that get_one_sample resamples audio to.
    target_rate: int = 24000
    speaker_name: str = "baker"

    def __post_init__(self):
        """Run the base-class initialization, then build the pinyin parser."""
        super().__post_init__()
        self.pinyin_parser = self.get_pinyin_parser()

    def setup_eos_token(self):
        """Return the end-of-sequence symbol ("eos")."""
        return _eos[0]

    def save_pretrained(self, saved_path):
        """Save the processor mapper, including ``pinyin_dict``, under ``saved_path``.

        Args:
            saved_path: Target directory; it is created if it does not exist.
        """
        os.makedirs(saved_path, exist_ok=True)
        self._save_mapper(
            os.path.join(saved_path, PROCESSOR_FILE_NAME),
            {"pinyin_dict": self.pinyin_dict},
        )

    def create_items(self):
        """Populate ``self.items`` from the prosody label file in ``self.data_dir``.

        The label file is read as pairs of lines: "<utt_id> <text>" followed by
        the space-separated pinyin of that text. Each kept utterance becomes
        ``[phoneme_text, wav_path, utt_id, speaker_name]``. Nothing is done
        when ``self.data_dir`` is empty.
        """
        if not self.data_dir:
            return

        label_path = os.path.join(self.data_dir, "ProsodyLabeling/000001-010000.txt")
        with open(label_path, encoding="utf-8") as ttf:
            # Skip blank lines (e.g. a trailing one) so they cannot break the
            # two-lines-per-utterance pairing below.
            lines = [line for line in ttf if line.strip()]

        items = []
        for idx in range(0, len(lines), 2):
            utt_id, chn_char = lines[idx].strip().split()
            pinyin = lines[idx + 1].strip().split()
            # Utterances containing the "IY1" token or a Latin "B" cannot be
            # mapped through pinyin_dict, so they are skipped.
            if "IY1" in pinyin or "B" in chn_char:
                print(f"Skip this: {utt_id} {chn_char} {pinyin}")
                continue
            phonemes = self.get_phoneme_from_char_and_pinyin(chn_char, pinyin)
            wav_path = os.path.join(self.data_dir, "Wave", "%s.wav" % utt_id)
            items.append([" ".join(phonemes), wav_path, utt_id, self.speaker_name])
        self.items = items

    def get_phoneme_from_char_and_pinyin(self, chn_char, pinyin):
        """Align Chinese text with its pinyin and return the phoneme sequence.

        Args:
            chn_char: Text that may contain "#<digit>" markers and punctuation.
            pinyin: Tone-numbered syllables (e.g. "zhong1"), one per Chinese
                character, except that an erhua syllable (e.g. "huar1") covers
                a character together with the following "儿".

        Returns:
            list of str: starts and ends with "sil"; each syllable contributes
            its initial and its final+tone, followed by "#0" unless a "#"
            marker comes next in the text.

        Raises:
            AssertionError: if an unknown syllable is not an erhua form, or if
                not every pinyin syllable was consumed.
            KeyError: if a syllable's base form is missing from ``pinyin_dict``.
            IndexError: if the text has more Chinese characters than ``pinyin``
                has syllables.
        """
        # we do not need #4, use sil to replace it
        chn_char = chn_char.replace("#4", "")
        char_len = len(chn_char)
        i, j = 0, 0
        result = ["sil"]
        while i < char_len:
            cur_char = chn_char[i]
            if is_zh(cur_char):
                syllable = pinyin[j]
                if syllable[:-1] not in self.pinyin_dict:
                    # Erhua: one syllable such as "huar1" spans this character
                    # and the following "儿".
                    assert chn_char[i + 1] == "儿", (
                        f"unknown pinyin {syllable!r} is not followed by an erhua character"
                    )
                    assert syllable[-2] == "r", (
                        f"unknown pinyin {syllable!r} is not an erhua syllable"
                    )
                    tone = syllable[-1]
                    initial, final = self.pinyin_dict[syllable[:-2]]
                    result += [initial, final + tone, "er5"]
                    consumed = 2
                else:
                    tone = syllable[-1]
                    initial, final = self.pinyin_dict[syllable[:-1]]
                    result += [initial, final + tone]
                    consumed = 1

                # Insert the default "#0" marker unless the text supplies an
                # explicit "#<digit>" marker right after this syllable.
                if i + consumed < char_len and chn_char[i + consumed] != "#":
                    result.append("#0")

                i += consumed
                j += 1
            elif cur_char == "#":
                # Copy the two-character marker ("#" plus its digit) verbatim.
                result.append(chn_char[i : i + 2])
                i += 2
            else:
                # ignore the unknown char and punctuation
                i += 1
        # A trailing "#0" is dropped before the closing "sil".
        if result[-1] == "#0":
            result = result[:-1]
        result.append("sil")
        assert j == len(pinyin), (
            f"consumed {j} pinyin syllables but {len(pinyin)} were given"
        )
        return result

    def get_one_sample(self, item):
        """Load the audio and symbol ids for one item produced by ``create_items``.

        Args:
            item: ``[text, wav_file, utt_id, speaker_name]``.

        Returns:
            dict with keys "raw_text", "text_ids", "audio", "utt_id",
            "speaker_name" and "rate", or None if the text could not be
            converted to ids.
        """
        text, wav_file, utt_id, speaker_name = item

        # normalize audio signal to be [-1, 1], soundfile already norm.
        audio, rate = sf.read(wav_file)
        audio = audio.astype(np.float32)
        if rate != self.target_rate:
            # Only downsampling is supported.
            assert rate > self.target_rate, (
                f"sample rate {rate} is below target rate {self.target_rate}"
            )
            # Keyword arguments work on both old and new librosa releases;
            # librosa >= 0.10 rejects the positional form.
            audio = librosa.resample(
                audio, orig_sr=rate, target_sr=self.target_rate
            )

        # convert text to ids
        try:
            text_ids = np.asarray(self.text_to_sequence(text), np.int32)
        except Exception as e:
            # Best effort: report the failing utterance and let the caller skip it.
            print(e, utt_id, text)
            return None

        sample = {
            "raw_text": text,
            "text_ids": text_ids,
            "audio": audio,
            # int() strips leading zeros from the id, e.g. "000001" -> "1".
            "utt_id": str(int(utt_id)),
            "speaker_name": speaker_name,
            "rate": self.target_rate,
        }

        return sample

    def get_pinyin_parser(self):
        """Return the ``pinyin`` method of a ``Pinyin`` object built on ``MyConverter``."""
        my_pinyin = Pinyin(MyConverter())
        pinyin = my_pinyin.pinyin
        return pinyin

    def text_to_sequence(self, text, inference=False):
        """Convert text to a list of symbol ids terminated by the eos id.

        Args:
            text: Space-separated phoneme symbols, or raw Chinese text when
                ``inference`` is True.
            inference: When True, ``text`` is first converted to phonemes with
                the pinyin parser and the phoneme sequence is printed.

        Returns:
            list of int: one id per symbol, followed by ``self.eos_id``.

        Raises:
            KeyError: if a symbol is not in ``self.symbol_to_id``.
        """
        if inference:
            pinyin = self.pinyin_parser(text, style=Style.TONE3, errors="ignore")
            new_pinyin = []
            for x in pinyin:
                x = "".join(x)
                # Entries containing "#" are not syllables; drop them.
                if "#" not in x:
                    new_pinyin.append(x)
            phonemes = self.get_phoneme_from_char_and_pinyin(text, new_pinyin)
            text = " ".join(phonemes)
            print(f"phoneme seq: {text}")

        sequence = [self.symbol_to_id[symbol] for symbol in text.split()]

        # add eos tokens
        sequence.append(self.eos_id)
        return sequence
683
684