# Path: blob/master/tensorflow_tts/bin/preprocess.py
# -*- coding: utf-8 -*-
# Copyright 2020 Minh Nguyen (@dathudeptrai)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perform preprocessing, with raw feature extraction and normalization of train/valid split."""

import argparse
import glob
import logging
import os
import yaml

import librosa
import numpy as np
import pyworld as pw

from functools import partial
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from tensorflow_tts.processor import LJSpeechProcessor
from tensorflow_tts.processor import BakerProcessor
from tensorflow_tts.processor import KSSProcessor
from tensorflow_tts.processor import LibriTTSProcessor
from tensorflow_tts.processor import ThorstenProcessor
from tensorflow_tts.processor import LJSpeechUltimateProcessor
from tensorflow_tts.processor import SynpaflexProcessor
from tensorflow_tts.processor import JSUTProcessor
from tensorflow_tts.processor.ljspeech import LJSPEECH_SYMBOLS
from tensorflow_tts.processor.baker import BAKER_SYMBOLS
from tensorflow_tts.processor.kss import KSS_SYMBOLS
from tensorflow_tts.processor.libritts import LIBRITTS_SYMBOLS
from tensorflow_tts.processor.thorsten import THORSTEN_SYMBOLS
from tensorflow_tts.processor.ljspeechu import LJSPEECH_U_SYMBOLS
from tensorflow_tts.processor.synpaflex import SYNPAFLEX_SYMBOLS
from tensorflow_tts.processor.jsut import JSUT_SYMBOLS

from tensorflow_tts.utils import remove_outlier

# feature extraction is CPU-only; hide GPUs from any TF import downstream
os.environ["CUDA_VISIBLE_DEVICES"] = ""


def parse_and_config():
    """Parse arguments and set configuration parameters.

    Returns:
        Dict: YAML config merged with the parsed CLI arguments.
    Raises:
        AssertionError: if the config "format" field is not "npy".
    """
    parser = argparse.ArgumentParser(
        description="Preprocess audio and text features "
        "(See detail in tensorflow_tts/bin/preprocess_dataset.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="Directory containing the dataset files.",
    )
    parser.add_argument(
        "--outdir",
        default=None,
        type=str,
        required=True,
        help="Output directory where features will be saved.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="ljspeech",
        choices=["ljspeech", "kss", "libritts", "baker", "thorsten", "ljspeechu", "synpaflex", "jsut"],
        help="Dataset to preprocess.",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="YAML format configuration file."
    )
    parser.add_argument(
        "--n_cpus",
        type=int,
        default=4,
        required=False,
        help="Number of CPUs to use in parallel.",
    )
    parser.add_argument(
        "--test_size",
        type=float,
        default=0.05,
        required=False,
        help="Proportion of files to use as test dataset.",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=0,
        choices=[0, 1, 2],
        # help text matches the log_level mapping below
        help="Logging level. 0: DEBUG, 1: WARNING, 2: ERROR",
    )
    args = parser.parse_args()

    # set logger
    FORMAT = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = {0: logging.DEBUG, 1: logging.WARNING, 2: logging.ERROR}
    logging.basicConfig(level=log_level[args.verbose], format=FORMAT)

    # load config; CLI arguments override/extend YAML keys
    config = yaml.load(open(args.config), Loader=yaml.SafeLoader)
    config.update(vars(args))
    # config checks
    assert config["format"] == "npy", "'npy' is the only supported format."
    return config


def ph_based_trim(
    config,
    utt_id: str,
    text_ids: np.array,
    raw_text: str,
    audio: np.array,
    hop_size: int,
) -> (bool, np.array, np.array):
    """Trim leading/trailing silence using MFA phoneme durations.

    Args:
        config: Parsed yaml config
        utt_id: file name
        text_ids: array with text ids
        raw_text: raw text of file
        audio: parsed wav file
        hop_size: Hop size
    Returns: (bool, np.array, np.array) => if trimmed return True, new text_ids, new audio_array
    """

    os.makedirs(os.path.join(config["rootdir"], "trimmed-durations"), exist_ok=True)
    duration_path = config.get(
        "duration_path", os.path.join(config["rootdir"], "durations")
    )
    duration_fixed_path = config.get(
        "duration_fixed_path", os.path.join(config["rootdir"], "trimmed-durations")
    )
    sil_ph = ["SIL", "END"]  # TODO FIX hardcoded values
    text = raw_text.split(" ")

    trim_start, trim_end = False, False

    if text[0] in sil_ph:
        trim_start = True

    if text[-1] in sil_ph:
        trim_end = True

    if not trim_start and not trim_end:
        return False, text_ids, audio

    # drop the silence token at either end of the id sequence
    idx_start, idx_end = (
        0 if not trim_start else 1,
        text_ids.__len__() if not trim_end else -1,
    )
    text_ids = text_ids[idx_start:idx_end]
    durations = np.load(os.path.join(duration_path, f"{utt_id}-durations.npy"))
    if trim_start:
        # durations are in frames; convert to samples with hop_size
        s_trim = int(durations[0] * hop_size)
        audio = audio[s_trim:]
    if trim_end:
        e_trim = int(durations[-1] * hop_size)
        audio = audio[:-e_trim]

    durations = durations[idx_start:idx_end]
    np.save(os.path.join(duration_fixed_path, f"{utt_id}-durations.npy"), durations)
    return True, text_ids, audio


def gen_audio_features(item, config):
    """Generate audio features and transformations
    Args:
        item (Dict): dictionary containing the attributes to encode.
        config (Dict): configuration dictionary.
    Returns:
        (bool): keep this sample or not.
        mel (ndarray): mel matrix in np.float32.
        energy (ndarray): energy audio profile.
        f0 (ndarray): fundamental frequency.
        item (Dict): dictionary containing the updated attributes.
    """
    # get info from sample.
    audio = item["audio"]
    utt_id = item["utt_id"]
    rate = item["rate"]

    # check audio properties
    assert len(audio.shape) == 1, f"{utt_id} seems to be multi-channel signal."
    assert np.abs(audio).max() <= 1.0, f"{utt_id} is different from 16 bit PCM."

    # check sample rate
    if rate != config["sampling_rate"]:
        audio = librosa.resample(audio, rate, config["sampling_rate"])
        logging.info(f"{utt_id} sampling rate is {rate}, not {config['sampling_rate']}, we resample it.")

    # trim silence
    if config["trim_silence"]:
        if "trim_mfa" in config and config["trim_mfa"]:
            _, item["text_ids"], audio = ph_based_trim(
                config,
                utt_id,
                item["text_ids"],
                item["raw_text"],
                audio,
                config["hop_size"],
            )
            if (
                audio.__len__() < 1
            ):  # very short files can get trimmed fully if mfa didnt extract any tokens LibriTTS maybe take only longer files?
                logging.warning(
                    f"File have only silence or MFA didnt extract any token {utt_id}"
                )
                return False, None, None, None, item
        else:
            audio, _ = librosa.effects.trim(
                audio,
                top_db=config["trim_threshold_in_db"],
                frame_length=config["trim_frame_size"],
                hop_length=config["trim_hop_size"],
            )

    # resample audio if necessary
    if "sampling_rate_for_feats" in config:
        audio = librosa.resample(audio, rate, config["sampling_rate_for_feats"])
        sampling_rate = config["sampling_rate_for_feats"]
        assert (
            config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0
        ), "'hop_size' must be 'int' value. Please check if 'sampling_rate_for_feats' is correct."
        hop_size = config["hop_size"] * config["sampling_rate_for_feats"] // rate
    else:
        sampling_rate = config["sampling_rate"]
        hop_size = config["hop_size"]

    # get spectrogram
    D = librosa.stft(
        audio,
        n_fft=config["fft_size"],
        hop_length=hop_size,
        win_length=config["win_length"],
        window=config["window"],
        pad_mode="reflect",
    )
    S, _ = librosa.magphase(D)  # (#bins, #frames)

    # get mel basis
    fmin = 0 if config["fmin"] is None else config["fmin"]
    fmax = sampling_rate // 2 if config["fmax"] is None else config["fmax"]
    mel_basis = librosa.filters.mel(
        sr=sampling_rate,
        n_fft=config["fft_size"],
        n_mels=config["num_mels"],
        fmin=fmin,
        fmax=fmax,
    )
    # log10 mel spectrogram, floored at 1e-10 to avoid log(0)
    mel = np.log10(np.maximum(np.dot(mel_basis, S), 1e-10)).T  # (#frames, #bins)

    # check audio and feature length
    audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
    audio = audio[: len(mel) * hop_size]
    assert len(mel) * hop_size == len(audio)

    # extract raw pitch
    _f0, t = pw.dio(
        audio.astype(np.double),
        fs=sampling_rate,
        f0_ceil=fmax,
        frame_period=1000 * hop_size / sampling_rate,
    )
    f0 = pw.stonemask(audio.astype(np.double), _f0, t, sampling_rate)
    # align f0 length with mel frames (truncate or zero-pad)
    if len(f0) >= len(mel):
        f0 = f0[: len(mel)]
    else:
        f0 = np.pad(f0, (0, len(mel) - len(f0)))

    # extract energy
    energy = np.sqrt(np.sum(S ** 2, axis=0))
    assert len(mel) == len(f0) == len(energy)

    # remove outlier f0/energy
    f0 = remove_outlier(f0)
    energy = remove_outlier(energy)

    # apply global gain
    if config["global_gain_scale"] > 0.0:
        audio *= config["global_gain_scale"]
    if np.abs(audio).max() >= 1.0:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning(
            f"{utt_id} causes clipping. It is better to reconsider global gain scale value."
        )
    item["audio"] = audio
    item["mel"] = mel
    item["f0"] = f0
    item["energy"] = energy
    return True, mel, energy, f0, item


def save_statistics_to_file(scaler_list, config):
    """Save computed statistics to disk.
    Args:
        scaler_list (List): List of scalers containing statistics to save.
        config (Dict): configuration dictionary.
    """
    for scaler, name in scaler_list:
        # row 0 = mean, row 1 = scale (std) per feature dimension
        stats = np.stack((scaler.mean_, scaler.scale_))
        np.save(
            os.path.join(config["outdir"], f"stats{name}.npy"),
            stats.astype(np.float32),
            allow_pickle=False,
        )


def save_features_to_file(features, subdir, config):
    """Save transformed dataset features in disk.
    Args:
        features (Dict): dictionary containing the attributes to save.
        subdir (str): data split folder where features will be saved.
        config (Dict): configuration dictionary.
    """
    utt_id = features["utt_id"]

    if config["format"] == "npy":
        save_list = [
            (features["audio"], "wavs", "wave", np.float32),
            (features["mel"], "raw-feats", "raw-feats", np.float32),
            (features["text_ids"], "ids", "ids", np.int32),
            (features["f0"], "raw-f0", "raw-f0", np.float32),
            (features["energy"], "raw-energies", "raw-energy", np.float32),
        ]
        for item, name_dir, name_file, fmt in save_list:
            np.save(
                os.path.join(
                    config["outdir"], subdir, name_dir, f"{utt_id}-{name_file}.npy"
                ),
                item.astype(fmt),
                allow_pickle=False,
            )
    else:
        raise ValueError("'npy' is the only supported format.")


def preprocess():
    """Run preprocessing process and compute statistics for normalizing."""
    config = parse_and_config()

    dataset_processor = {
        "ljspeech": LJSpeechProcessor,
        "kss": KSSProcessor,
        "libritts": LibriTTSProcessor,
        "baker": BakerProcessor,
        "thorsten": ThorstenProcessor,
        "ljspeechu": LJSpeechUltimateProcessor,
        "synpaflex": SynpaflexProcessor,
        "jsut": JSUTProcessor,
    }

    dataset_symbol = {
        "ljspeech": LJSPEECH_SYMBOLS,
        "kss": KSS_SYMBOLS,
        "libritts": LIBRITTS_SYMBOLS,
        "baker": BAKER_SYMBOLS,
        "thorsten": THORSTEN_SYMBOLS,
        "ljspeechu": LJSPEECH_U_SYMBOLS,
        "synpaflex": SYNPAFLEX_SYMBOLS,
        "jsut": JSUT_SYMBOLS,
    }

    dataset_cleaner = {
        "ljspeech": "english_cleaners",
        "kss": "korean_cleaners",
        "libritts": None,
        "baker": None,
        "thorsten": "german_cleaners",
        "ljspeechu": "english_cleaners",
        "synpaflex": "basic_cleaners",
        "jsut": None,
    }

    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"],
        symbols=dataset_symbol[config["dataset"]],
        cleaner_names=dataset_cleaner[config["dataset"]],
    )

    # check output directories
    build_dir = lambda x: [
        os.makedirs(os.path.join(config["outdir"], x, y), exist_ok=True)
        for y in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]
    ]
    build_dir("train")
    build_dir("valid")

    # save pretrained-processor to feature dir
    processor._save_mapper(
        os.path.join(config["outdir"], f"{config['dataset']}_mapper.json"),
        extra_attrs_to_save={"pinyin_dict": processor.pinyin_dict}
        if config["dataset"] == "baker"
        else {},
    )

    # build train test split; libritts is stratified by speaker id (last item field)
    if config["dataset"] == "libritts":
        train_split, valid_split, _, _ = train_test_split(
            processor.items,
            [i[-1] for i in processor.items],
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    else:
        train_split, valid_split = train_test_split(
            processor.items,
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    get_utt_id = lambda x: os.path.split(x[1])[-1].split(".")[0]
    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    p = Pool(config["n_cpus"])

    # preprocess train files and get statistics for normalizing
    partial_fn = partial(gen_audio_features, config=config)
    train_map = p.imap_unordered(
        partial_fn,
        tqdm(train_iterator_data, total=len(train_split), desc="[Preprocessing train]"),
        chunksize=10,
    )
    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    id_to_remove = []
    for result, mel, energy, f0, features in train_map:
        if not result:
            id_to_remove.append(features["utt_id"])
            continue
        save_features_to_file(features, "train", config)
        # partial fitting of scalers; skip samples whose f0/energy are all
        # zero after outlier removal, since they carry no usable statistics
        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            id_to_remove.append(features["utt_id"])
            continue
        scaler_mel.partial_fit(mel)
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    if len(id_to_remove) > 0:
        # rewrite the utt_id index without the dropped samples
        np.save(
            os.path.join(config["outdir"], "train_utt_ids.npy"),
            [i for i in train_utt_ids if i not in id_to_remove],
        )
        logging.info(
            f"removed {len(id_to_remove)} cause of too many outliers or bad mfa extraction"
        )

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)

    # preprocess valid files
    partial_fn = partial(gen_audio_features, config=config)
    valid_map = p.imap_unordered(
        partial_fn,
        tqdm(valid_iterator_data, total=len(valid_split), desc="[Preprocessing valid]"),
        chunksize=10,
    )
    for result, *_, features in valid_map:
        # skip failed/fully-trimmed samples: their "mel" key is never set,
        # so saving them would raise KeyError
        if not result:
            continue
        save_features_to_file(features, "valid", config)

    p.close()
    p.join()


def gen_normal_mel(mel_path, scaler, config):
    """Normalize the mel spectrogram and save it to the corresponding path.
    Args:
        mel_path (string): path of the mel spectrogram to normalize.
        scaler (sklearn.base.BaseEstimator): scaling function to use for normalize.
        config (Dict): configuration dictionary.
    """
    mel = np.load(mel_path)
    mel_norm = scaler.transform(mel)
    path, file_name = os.path.split(mel_path)
    # path ends with <subdir>/<suffix>, e.g. train/raw-feats
    *_, subdir, suffix = path.split(os.sep)

    utt_id = file_name.split(f"-{suffix}.npy")[0]
    np.save(
        os.path.join(
            config["outdir"], subdir, "norm-feats", f"{utt_id}-norm-feats.npy"
        ),
        mel_norm.astype(np.float32),
        allow_pickle=False,
    )


def normalize():
    """Normalize mel spectrogram with pre-computed statistics."""
    config = parse_and_config()
    if config["format"] == "npy":
        # init scaler with saved values
        scaler = StandardScaler()
        scaler.mean_, scaler.scale_ = np.load(
            os.path.join(config["outdir"], "stats.npy")
        )
        scaler.n_features_in_ = config["num_mels"]
    else:
        raise ValueError("'npy' is the only supported format.")

    # find all "raw-feats" files in both train and valid folders
    glob_path = os.path.join(config["rootdir"], "**", "raw-feats", "*.npy")
    mel_raw_feats = glob.glob(glob_path, recursive=True)
    logging.info(f"Files to normalize: {len(mel_raw_feats)}")

    # check for output directories
    os.makedirs(os.path.join(config["outdir"], "train", "norm-feats"), exist_ok=True)
    os.makedirs(os.path.join(config["outdir"], "valid", "norm-feats"), exist_ok=True)

    p = Pool(config["n_cpus"])
    partial_fn = partial(gen_normal_mel, scaler=scaler, config=config)
    list(p.map(partial_fn, tqdm(mel_raw_feats, desc="[Normalizing]")))
    p.close()
    p.join()


def compute_statistics():
    """Compute mean / std statistics of some features for later normalization."""
    config = parse_and_config()

    # find features files for the train split
    glob_fn = lambda x: glob.glob(os.path.join(config["rootdir"], "train", x, "*.npy"))
    glob_mel = glob_fn("raw-feats")
    glob_f0 = glob_fn("raw-f0")
    glob_energy = glob_fn("raw-energies")
    assert (
        len(glob_mel) == len(glob_f0) == len(glob_energy)
    ), "Features, f0 and energies have different files in training split."

    logging.info(f"Computing statistics for {len(glob_mel)} files.")
    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    for mel, f0, energy in tqdm(
        zip(glob_mel, glob_f0, glob_energy), total=len(glob_mel)
    ):
        # load saved features (f0/energy zeros are excluded from fitting below)
        energy = np.load(energy)
        f0 = np.load(f0)
        # partial fitting of scalers
        scaler_mel.partial_fit(np.load(mel))
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)


if __name__ == "__main__":
    preprocess()