# Source: tensorflow_tts/processor/ljspeechu.py
# -*- coding: utf-8 -*-1# Copyright 2020 TensorFlowTTS Team.2#3# Licensed under the Apache License, Version 2.0 (the "License");4# you may not use this file except in compliance with the License.5# You may obtain a copy of the License at6#7# http://www.apache.org/licenses/LICENSE-2.08#9# Unless required by applicable law or agreed to in writing, software10# distributed under the License is distributed on an "AS IS" BASIS,11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.12# See the License for the specific language governing permissions and13# limitations under the License.14"""Perform preprocessing and raw feature extraction for LJSpeech Ultimate dataset."""1516import os17import re1819import numpy as np20import soundfile as sf21from dataclasses import dataclass22from tensorflow_tts.processor import BaseProcessor23from tensorflow_tts.utils import cleaners24from tensorflow_tts.utils.utils import PROCESSOR_FILE_NAME25from g2p_en import G2p as grapheme_to_phn2627valid_symbols = [28"AA",29"AA0",30"AA1",31"AA2",32"AE",33"AE0",34"AE1",35"AE2",36"AH",37"AH0",38"AH1",39"AH2",40"AO",41"AO0",42"AO1",43"AO2",44"AW",45"AW0",46"AW1",47"AW2",48"AY",49"AY0",50"AY1",51"AY2",52"B",53"CH",54"D",55"DH",56"EH",57"EH0",58"EH1",59"EH2",60"ER",61"ER0",62"ER1",63"ER2",64"EY",65"EY0",66"EY1",67"EY2",68"F",69"G",70"HH",71"IH",72"IH0",73"IH1",74"IH2",75"IY",76"IY0",77"IY1",78"IY2",79"JH",80"K",81"L",82"M",83"N",84"NG",85"OW",86"OW0",87"OW1",88"OW2",89"OY",90"OY0",91"OY1",92"OY2",93"P",94"R",95"S",96"SH",97"T",98"TH",99"UH",100"UH0",101"UH1",102"UH2",103"UW",104"UW0",105"UW1",106"UW2",107"V",108"W",109"Y",110"Z",111"ZH",112]113114_pad = "pad"115_eos = "eos"116_punctuation = "!'(),.:;?" 
# Unlike LJSpeech, we do not use spaces since we are phoneme only and spaces
# lead to very bad attention performance with phonetic input.
_special = "-"

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as
# uppercase letters):
_arpabet = ["@" + s for s in valid_symbols]

# Export all symbols:
LJSPEECH_U_SYMBOLS = [_pad] + list(_special) + list(_punctuation) + _arpabet + [_eos]

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

# Symbols that stay OUTSIDE the "{...}" phoneme chains of an ARPA string.
_arpa_exempt = _punctuation + _special

# Shared grapheme-to-phoneme converter, instantiated once at import time.
arpa_g2p = grapheme_to_phn()


@dataclass
class LJSpeechUltimateProcessor(BaseProcessor):
    """LJSpeech Ultimate processor."""

    cleaner_names: str = "english_cleaners"
    # Column positions inside each "|"-separated filelist record.
    positions = {
        "wave_file": 0,
        "text_norm": 1,
    }
    train_f_name: str = "filelist.txt"

    def create_items(self):
        """Read the train filelist and populate ``self.items``."""
        if self.data_dir:
            with open(
                os.path.join(self.data_dir, self.train_f_name), encoding="utf-8"
            ) as f:
                # Skip blank lines (e.g. a trailing newline) so split_line
                # does not raise IndexError on an empty record.
                self.items = [
                    self.split_line(self.data_dir, line, "|")
                    for line in f
                    if line.strip()
                ]

    def split_line(self, data_dir, line, split):
        """Split one filelist line into ``(text_norm, wav_path, speaker_name)``."""
        parts = line.strip().split(split)
        wave_file = parts[self.positions["wave_file"]]
        text_norm = parts[self.positions["text_norm"]]
        wav_path = os.path.join(data_dir, wave_file)
        speaker_name = "ljspeech"  # single-speaker dataset
        return text_norm, wav_path, speaker_name

    def setup_eos_token(self):
        """Return the end-of-sentence token appended to every id sequence."""
        return _eos

    def save_pretrained(self, saved_path):
        """Persist the processor's symbol mapper to ``saved_path``."""
        os.makedirs(saved_path, exist_ok=True)
        self._save_mapper(os.path.join(saved_path, PROCESSOR_FILE_NAME), {})

    def to_arpa(self, in_str):
        """Convert plain text to an ARPA string.

        Phonemes are wrapped in curly braces while exempt symbols
        (punctuation and "-") stay outside them.
        """
        phn_arr = arpa_g2p(in_str)
        # g2p_en separates words with " "; spaces are dropped entirely because
        # they hurt attention performance with purely phonetic input.
        phn_arr = [x for x in phn_arr if x != " "]

        arpa_str = "{"
        in_chain = True

        # Iterative array-traverse approach to build ARPA string.
        # Phonemes must be in curly braces, but not punctuation.
        for token in phn_arr:
            if token in _arpa_exempt and in_chain:
                arpa_str += " }"
                in_chain = False

            if token not in _arpa_exempt and not in_chain:
                arpa_str += " {"
                in_chain = True

            arpa_str += " " + token

        # Close a still-open phoneme chain.
        if in_chain:
            arpa_str += " }"

        return arpa_str

    def get_one_sample(self, item):
        """Load one training sample (text ids + audio) from an item tuple."""
        text, wav_path, speaker_name = item

        # Check if this line is already an ARPA string by searching for the
        # trademark curly brace. If not, we apply g2p conversion.
        if "{" not in text:
            text = self.to_arpa(text)

        # normalize audio signal to be [-1, 1], soundfile already norm.
        audio, rate = sf.read(wav_path)
        audio = audio.astype(np.float32)

        # convert text to ids
        text_ids = np.asarray(self.text_to_sequence(text), np.int32)

        sample = {
            "raw_text": text,
            "text_ids": text_ids,
            "audio": audio,
            "utt_id": os.path.split(wav_path)[-1].split(".")[0],
            "speaker_name": speaker_name,
            "rate": rate,
        }

        return sample

    def text_to_sequence(self, text):
        """Convert mixed text/ARPA input to a list of symbol ids (eos appended)."""
        sequence = []
        # Check for curly braces and treat their contents as ARPAbet:
        while text:
            m = _curly_re.match(text)
            if not m:
                sequence += self._symbols_to_sequence(
                    self._clean_text(text, [self.cleaner_names])
                )
                break
            sequence += self._symbols_to_sequence(
                self._clean_text(m.group(1), [self.cleaner_names])
            )
            sequence += self._arpabet_to_sequence(m.group(2))
            text = m.group(3)

        # add eos tokens
        sequence += [self.eos_id]
        return sequence

    def _clean_text(self, text, cleaner_names):
        """Apply each named cleaner from tensorflow_tts.utils.cleaners in turn."""
        for name in cleaner_names:
            # Default of None makes the "Unknown cleaner" error reachable;
            # a bare getattr would raise AttributeError first.
            cleaner = getattr(cleaners, name, None)
            if not cleaner:
                raise Exception("Unknown cleaner: %s" % name)
            text = cleaner(text)
        return text

    def _symbols_to_sequence(self, symbols):
        """Map symbols to ids, silently dropping anything not in the table."""
        return [self.symbol_to_id[s] for s in symbols if self._should_keep_symbol(s)]

    def _arpabet_to_sequence(self, text):
        """Map a space-separated ARPAbet string to ids ("@"-prefixed symbols)."""
        return self._symbols_to_sequence(["@" + s for s in text.split()])

    def _should_keep_symbol(self, s):
        # "_" and "~" were the pad/eos markers of the classic LJSpeech symbol
        # set; they are excluded here as well for safety.
        return s in self.symbol_to_id and s != "_" and s != "~"