Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
TensorSpeech
GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/tensorflow_tts/bin/preprocess.py
1558 views
1
# -*- coding: utf-8 -*-
2
# Copyright 2020 Minh Nguyen (@dathudeptrai)
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
# http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
"""Perform preprocessing, with raw feature extraction and normalization of train/valid split."""
16
17
import argparse
18
import glob
19
import logging
20
import os
21
import yaml
22
23
import librosa
24
import numpy as np
25
import pyworld as pw
26
27
from functools import partial
28
from multiprocessing import Pool
29
from sklearn.model_selection import train_test_split
30
from sklearn.preprocessing import StandardScaler
31
from tqdm import tqdm
32
33
from tensorflow_tts.processor import LJSpeechProcessor
34
from tensorflow_tts.processor import BakerProcessor
35
from tensorflow_tts.processor import KSSProcessor
36
from tensorflow_tts.processor import LibriTTSProcessor
37
from tensorflow_tts.processor import ThorstenProcessor
38
from tensorflow_tts.processor import LJSpeechUltimateProcessor
39
from tensorflow_tts.processor import SynpaflexProcessor
40
from tensorflow_tts.processor import JSUTProcessor
41
from tensorflow_tts.processor.ljspeech import LJSPEECH_SYMBOLS
42
from tensorflow_tts.processor.baker import BAKER_SYMBOLS
43
from tensorflow_tts.processor.kss import KSS_SYMBOLS
44
from tensorflow_tts.processor.libritts import LIBRITTS_SYMBOLS
45
from tensorflow_tts.processor.thorsten import THORSTEN_SYMBOLS
46
from tensorflow_tts.processor.ljspeechu import LJSPEECH_U_SYMBOLS
47
from tensorflow_tts.processor.synpaflex import SYNPAFLEX_SYMBOLS
48
from tensorflow_tts.processor.jsut import JSUT_SYMBOLS
49
50
from tensorflow_tts.utils import remove_outlier
51
52
os.environ["CUDA_VISIBLE_DEVICES"] = ""
53
54
55
def parse_and_config():
    """Parse command-line arguments, merge them into the YAML config and return it.

    Returns:
        Dict: configuration dictionary combining the YAML file contents with
            the parsed command-line arguments (CLI values take precedence).

    Raises:
        AssertionError: if the configured feature format is not 'npy'.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess audio and text features "
        "(See detail in tensorflow_tts/bin/preprocess_dataset.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="Directory containing the dataset files.",
    )
    parser.add_argument(
        "--outdir",
        default=None,
        type=str,
        required=True,
        help="Output directory where features will be saved.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="ljspeech",
        choices=["ljspeech", "kss", "libritts", "baker", "thorsten", "ljspeechu", "synpaflex", "jsut"],
        help="Dataset to preprocess.",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="YAML format configuration file."
    )
    parser.add_argument(
        "--n_cpus",
        type=int,
        default=4,
        required=False,
        help="Number of CPUs to use in parallel.",
    )
    parser.add_argument(
        "--test_size",
        type=float,
        default=0.05,
        required=False,
        help="Proportion of files to use as test dataset.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=0,
        choices=[0, 1, 2],
        # help text matches the log_level mapping below: level WARNING hides
        # INFO, level ERROR hides WARNING (previous text claimed otherwise)
        help="Logging level. 0: DEBUG and above, 1: WARNING and ERROR, 2: ERROR only",
    )
    args = parser.parse_args()

    # set logger
    FORMAT = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = {0: logging.DEBUG, 1: logging.WARNING, 2: logging.ERROR}
    logging.basicConfig(level=log_level[args.verbose], format=FORMAT)

    # load config; context manager ensures the file handle is closed
    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.SafeLoader)
    # CLI arguments override/extend keys from the YAML file
    config.update(vars(args))
    # config checks
    assert config["format"] == "npy", "'npy' is the only supported format."
    return config
119
120
121
def ph_based_trim(
    config,
    utt_id: str,
    text_ids: np.array,
    raw_text: str,
    audio: np.array,
    hop_size: int,
) -> (bool, np.array, np.array):
    """Trim leading/trailing silence tokens using MFA-extracted durations.

    Args:
        config: Parsed yaml config (uses "rootdir" and optionally
            "duration_path" / "duration_fixed_path").
        utt_id: file name
        text_ids: array with text ids
        raw_text: raw text of file
        audio: parsed wav file
        hop_size: Hop size
    Returns: (bool, np.array, np.array) => if trimmed return True, new text_ids, new audio_array
    """
    os.makedirs(os.path.join(config["rootdir"], "trimmed-durations"), exist_ok=True)
    duration_path = config.get(
        "duration_path", os.path.join(config["rootdir"], "durations")
    )
    duration_fixed_path = config.get(
        "duration_fixed_path", os.path.join(config["rootdir"], "trimmed-durations")
    )
    sil_ph = ["SIL", "END"]  # TODO FIX hardcoded values
    text = raw_text.split(" ")

    trim_start, trim_end = False, False

    if text[0] in sil_ph:
        trim_start = True

    if text[-1] in sil_ph:
        trim_end = True

    # nothing to trim: return the sample unchanged
    if not trim_start and not trim_end:
        return False, text_ids, audio

    # slice bounds: drop first token if trimming start, last if trimming end
    idx_start, idx_end = (
        0 if not trim_start else 1,
        len(text_ids) if not trim_end else -1,
    )
    text_ids = text_ids[idx_start:idx_end]
    durations = np.load(os.path.join(duration_path, f"{utt_id}-durations.npy"))
    if trim_start:
        # remove the audio span covered by the leading silence token
        s_trim = int(durations[0] * hop_size)
        audio = audio[s_trim:]
    if trim_end:
        # remove the audio span covered by the trailing silence token
        e_trim = int(durations[-1] * hop_size)
        audio = audio[:-e_trim]

    # persist durations aligned with the trimmed token sequence
    durations = durations[idx_start:idx_end]
    np.save(os.path.join(duration_fixed_path, f"{utt_id}-durations.npy"), durations)
    return True, text_ids, audio
177
178
179
def gen_audio_features(item, config):
    """Generate audio features and transformations
    Args:
        item (Dict): dictionary containing the attributes to encode
            (expects "audio", "utt_id", "rate", and for MFA trimming
            "text_ids" / "raw_text").
        config (Dict): configuration dictionary.
    Returns:
        (bool): keep this sample or not.
        mel (ndarray): mel matrix in np.float32.
        energy (ndarray): energy audio profile.
        f0 (ndarray): fundamental frequency.
        item (Dict): dictionary containing the updated attributes.
    """
    # get info from sample.
    audio = item["audio"]
    utt_id = item["utt_id"]
    rate = item["rate"]

    # check audio properties
    assert len(audio.shape) == 1, f"{utt_id} seems to be multi-channel signal."
    assert np.abs(audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM."

    # check sample rate
    # NOTE(review): positional librosa.resample args only work on librosa < 0.10;
    # newer versions require keyword arguments — confirm pinned librosa version.
    if rate != config["sampling_rate"]:
        audio = librosa.resample(audio, rate, config["sampling_rate"])
        logging.info(f"{utt_id} sampling rate is {rate}, not {config['sampling_rate']}, we resample it.")

    # trim silence
    if config["trim_silence"]:
        if "trim_mfa" in config and config["trim_mfa"]:
            # trim using MFA-extracted durations of silence tokens
            _, item["text_ids"], audio = ph_based_trim(
                config,
                utt_id,
                item["text_ids"],
                item["raw_text"],
                audio,
                config["hop_size"],
            )
            if (
                len(audio) < 1
            ):  # very short files can get trimmed fully if mfa didnt extract any tokens LibriTTS maybe take only longer files?
                logging.warning(
                    f"File have only silence or MFA didnt extract any token {utt_id}"
                )
                # sample is dropped: no features computed
                return False, None, None, None, item
        else:
            # energy-based trimming
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"],
            )

    # resample audio if necessary
    if "sampling_rate_for_feats" in config:
        audio = librosa.resample(audio, rate, config["sampling_rate_for_feats"])
        sampling_rate = config["sampling_rate_for_feats"]
        assert (
            config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0
        ), "'hop_size' must be 'int' value. Please check if 'sampling_rate_for_feats' is correct."
        hop_size = config["hop_size"] * config["sampling_rate_for_feats"] // rate
    else:
        sampling_rate = config["sampling_rate"]
        hop_size = config["hop_size"]

    # get spectrogram
    D = librosa.stft(
        audio,
        n_fft=config["fft_size"],
        hop_length=hop_size,
        win_length=config["win_length"],
        window=config["window"],
        pad_mode="reflect",
    )
    S, _ = librosa.magphase(D)  # (#bins, #frames)

    # get mel basis
    fmin = 0 if config["fmin"] is None else config["fmin"]
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    # log10 mel spectrogram, floored at 1e-10 to avoid log(0)
    mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T  # (#frames, #bins)

    # check audio and feature length
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[: len(mel) * hop_size]
    assert len(mel) * hop_size == len(audio)

    # extract raw pitch
    _f0, t = pw.dio(
        audio.astype(np.double),
        fs=sampling_rate,
        f0_ceil=fmax,
        frame_period=1000 * hop_size / sampling_rate,
    )
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, sampling_rate)
    # force f0 to the same frame count as mel
    if len(f0) >= len(mel):
        f0 = f0[: len(mel)]
    else:
        f0 = np.pad(f0, (0, len(mel) - len(f0)))

    # extract energy (per-frame L2 norm of the magnitude spectrum)
    energy = np.sqrt(np.sum(S ** 2, axis=0))
    assert len(mel) == len(f0) == len(energy)

    # remove outlier f0/energy
    f0 = remove_outlier(f0)
    energy = remove_outlier(energy)

    # apply global gain
    if config["global_gain_scale"] > 0.0:
        audio *= config["global_gain_scale"]
    if np.abs(audio).max() >= 1.0:
        # logging.warn is deprecated; use logging.warning
        logging.warning(
            f"{utt_id} causes clipping. It is better to reconsider global gain scale value."
        )
    item["audio"] = audio
    item["mel"] = mel
    item["f0"] = f0
    item["energy"] = energy
    return True, mel, energy, f0, item
304
305
306
def save_statistics_to_file(scaler_list, config):
    """Write mean/scale statistics of each fitted scaler to the output dir.

    Args:
        scaler_list (List): (scaler, suffix) pairs; each scaler exposes
            ``mean_`` and ``scale_`` attributes.
        config (Dict): configuration dictionary (uses "outdir").
    """
    out_dir = config["outdir"]
    for fitted_scaler, suffix in scaler_list:
        # row 0: mean, row 1: scale (std)
        mean_and_scale = np.stack((fitted_scaler.mean_, fitted_scaler.scale_))
        target_path = os.path.join(out_dir, f"stats{suffix}.npy")
        np.save(target_path, mean_and_scale.astype(np.float32), allow_pickle=False)
319
320
321
def save_features_to_file(features, subdir, config):
    """Persist one utterance's feature arrays as ``.npy`` files.

    Args:
        features (Dict): dictionary containing the attributes to save
            ("utt_id", "audio", "mel", "text_ids", "f0", "energy").
        subdir (str): data split folder where features will be saved.
        config (Dict): configuration dictionary (uses "format" and "outdir").

    Raises:
        ValueError: if the configured format is anything but 'npy'.
    """
    utt_id = features["utt_id"]

    if config["format"] != "npy":
        raise ValueError("'npy' is the only supported format.")

    # (feature key, target directory, file-name tag, output dtype)
    feature_specs = (
        ("audio", "wavs", "wave", np.float32),
        ("mel", "raw-feats", "raw-feats", np.float32),
        ("text_ids", "ids", "ids", np.int32),
        ("f0", "raw-f0", "raw-f0", np.float32),
        ("energy", "raw-energies", "raw-energy", np.float32),
    )
    for key, dir_name, file_tag, dtype in feature_specs:
        target_path = os.path.join(
            config["outdir"], subdir, dir_name, f"{utt_id}-{file_tag}.npy"
        )
        np.save(target_path, features[key].astype(dtype), allow_pickle=False)
348
349
350
def preprocess():
    """Run preprocessing process and compute statistics for normalizing.

    Builds a dataset processor, splits items into train/valid, extracts
    features in a worker pool, fits normalization scalers on the train
    split, and saves features plus statistics to ``config["outdir"]``.
    """
    config = parse_and_config()

    dataset_processor = {
        "ljspeech": LJSpeechProcessor,
        "kss": KSSProcessor,
        "libritts": LibriTTSProcessor,
        "baker": BakerProcessor,
        "thorsten": ThorstenProcessor,
        "ljspeechu": LJSpeechUltimateProcessor,
        "synpaflex": SynpaflexProcessor,
        "jsut": JSUTProcessor,
    }

    dataset_symbol = {
        "ljspeech": LJSPEECH_SYMBOLS,
        "kss": KSS_SYMBOLS,
        "libritts": LIBRITTS_SYMBOLS,
        "baker": BAKER_SYMBOLS,
        "thorsten": THORSTEN_SYMBOLS,
        "ljspeechu": LJSPEECH_U_SYMBOLS,
        "synpaflex": SYNPAFLEX_SYMBOLS,
        "jsut": JSUT_SYMBOLS,
    }

    dataset_cleaner = {
        "ljspeech": "english_cleaners",
        "kss": "korean_cleaners",
        "libritts": None,
        "baker": None,
        "thorsten": "german_cleaners",
        "ljspeechu": "english_cleaners",
        "synpaflex": "basic_cleaners",
        "jsut": None,
    }

    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"],
        symbols=dataset_symbol[config["dataset"]],
        cleaner_names=dataset_cleaner[config["dataset"]],
    )

    # check output directories
    build_dir = lambda x: [
        os.makedirs(os.path.join(config["outdir"], x, y), exist_ok=True)
        for y in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]
    ]
    build_dir("train")
    build_dir("valid")

    # save pretrained-processor to feature dir
    processor._save_mapper(
        os.path.join(config["outdir"], f"{config['dataset']}_mapper.json"),
        extra_attrs_to_save={"pinyin_dict": processor.pinyin_dict}
        if config["dataset"] == "baker"
        else {},
    )

    # build train test split
    if config["dataset"] == "libritts":
        # stratify libritts by speaker id (last element of each item)
        train_split, valid_split, _, _ = train_test_split(
            processor.items,
            [i[-1] for i in processor.items],
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    else:
        train_split, valid_split = train_test_split(
            processor.items,
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    # utterance id = file name without extension
    get_utt_id = lambda x: os.path.split(x[1])[-1].split(".")[0]
    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    p = Pool(config["n_cpus"])

    # preprocess train files and get statistics for normalizing
    partial_fn = partial(gen_audio_features, config=config)
    train_map = p.imap_unordered(
        partial_fn,
        tqdm(train_iterator_data, total=len(train_split), desc="[Preprocessing train]"),
        chunksize=10,
    )
    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    id_to_remove = []
    for result, mel, energy, f0, features in train_map:
        if not result:
            id_to_remove.append(features["utt_id"])
            continue
        save_features_to_file(features, "train", config)
        # skip samples whose energy or f0 is all zeros before partial fitting
        # (this guard was previously duplicated verbatim)
        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            id_to_remove.append(features["utt_id"])
            continue
        # partial fitting of scalers; zero entries are excluded for energy/f0
        scaler_mel.partial_fit(mel)
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    if len(id_to_remove) > 0:
        np.save(
            os.path.join(config["outdir"], "train_utt_ids.npy"),
            [i for i in train_utt_ids if i not in id_to_remove],
        )
        logging.info(
            f"removed {len(id_to_remove)} cause of too many outliers or bad mfa extraction"
        )

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)

    # preprocess valid files
    partial_fn = partial(gen_audio_features, config=config)
    valid_map = p.imap_unordered(
        partial_fn,
        tqdm(valid_iterator_data, total=len(valid_split), desc="[Preprocessing valid]"),
        chunksize=10,
    )
    for result, *_, features in valid_map:
        # skip dropped samples: gen_audio_features never attaches mel/f0/energy
        # to them, so saving would raise KeyError
        if not result:
            continue
        save_features_to_file(features, "valid", config)

    # release worker processes
    p.close()
    p.join()
500
501
502
def gen_normal_mel(mel_path, scaler, config):
    """Normalize one raw mel spectrogram file and save the normalized copy.

    Args:
        mel_path (string): path of the mel spectrogram to normalize.
        scaler (sklearn.base.BaseEstimator): fitted scaler whose ``transform``
            is applied to the loaded mel matrix.
        config (Dict): configuration dictionary (uses "outdir").
    """
    normalized = scaler.transform(np.load(mel_path))
    parent_dir, file_name = os.path.split(mel_path)
    # parent dir layout: .../<split>/<feature-tag>, e.g. .../train/raw-feats
    *_, split_name, feature_tag = parent_dir.split(os.sep)

    # file names follow the "<utt_id>-<feature_tag>.npy" convention
    utt_id = file_name.split(f"-{feature_tag}.npy")[0]
    out_path = os.path.join(
        config["outdir"], split_name, "norm-feats", f"{utt_id}-norm-feats.npy"
    )
    np.save(out_path, normalized.astype(np.float32), allow_pickle=False)
522
523
524
def normalize():
    """Normalize mel spectrogram with pre-computed statistics.

    Loads the saved mean/scale from ``stats.npy``, finds every raw mel file
    under ``config["rootdir"]`` and writes normalized copies in parallel.

    Raises:
        ValueError: if the configured feature format is not 'npy'.
    """
    config = parse_and_config()
    if config["format"] == "npy":
        # init scaler with saved values
        scaler = StandardScaler()
        scaler.mean_, scaler.scale_ = np.load(
            os.path.join(config["outdir"], "stats.npy")
        )
        scaler.n_features_in_ = config["num_mels"]
    else:
        raise ValueError("'npy' is the only supported format.")

    # find all "raw-feats" files in both train and valid folders
    glob_path = os.path.join(config["rootdir"], "**", "raw-feats", "*.npy")
    mel_raw_feats = glob.glob(glob_path, recursive=True)
    logging.info(f"Files to normalize: {len(mel_raw_feats)}")

    # check for output directories
    os.makedirs(os.path.join(config["outdir"], "train", "norm-feats"), exist_ok=True)
    os.makedirs(os.path.join(config["outdir"], "valid", "norm-feats"), exist_ok=True)

    # context manager ensures worker processes are released (the pool was
    # previously never closed); p.map blocks until all files are done
    partial_fn = partial(gen_normal_mel, scaler=scaler, config=config)
    with Pool(config["n_cpus"]) as p:
        p.map(partial_fn, tqdm(mel_raw_feats, desc="[Normalizing]"))
549
550
551
def compute_statistics():
    """Compute mean / std statistics of some features for later normalization."""
    config = parse_and_config()

    # collect per-feature file lists from the training split
    find_files = lambda subdir: glob.glob(
        os.path.join(config["rootdir"], "train", subdir, "*.npy")
    )
    mel_files = find_files("raw-feats")
    f0_files = find_files("raw-f0")
    energy_files = find_files("raw-energies")
    assert (
        len(mel_files) == len(f0_files) == len(energy_files)
    ), "Features, f0 and energies have different files in training split."

    logging.info(f"Computing statistics for {len(mel_files)} files.")
    # one incremental scaler per feature stream
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    progress = tqdm(zip(mel_files, f0_files, energy_files), total=len(mel_files))
    for mel_file, f0_file, energy_file in progress:
        energy_values = np.load(energy_file)
        f0_values = np.load(f0_file)
        # partial fitting of scalers; zero entries are excluded for energy/f0
        scaler_mel.partial_fit(np.load(mel_file))
        scaler_energy.partial_fit(energy_values[energy_values != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0_values[f0_values != 0].reshape(-1, 1))

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)
585
586
587
# Script entry point: run the full preprocessing pipeline when executed
# directly; CLI arguments are parsed inside preprocess() via parse_and_config().
if __name__ == "__main__":
    preprocess()
589
590