# Source: tensorflow_tts/processor/jsut.py
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perform preprocessing and raw feature extraction for JSUT dataset."""

import os
import re
from dataclasses import dataclass

import librosa  # noqa: F401 — retained for optional resampling support
import numpy as np
import pyopenjtalk
import soundfile as sf
import yaml

from tensorflow_tts.processor import BaseProcessor
from tensorflow_tts.utils.utils import PROCESSOR_FILE_NAME

# Phoneme inventory used by the JSUT kana transcriptions (OpenJTalk-style
# symbols: consonants, vowels, 'cl' for the sokuon, 'pau' for pauses, 'N'
# for the moraic nasal).
valid_symbols = [
    'N',
    'a',
    'b',
    'by',
    'ch',
    'cl',
    'd',
    'dy',
    'e',
    'f',
    'g',
    'gy',
    'h',
    'hy',
    'i',
    'j',
    'k',
    'ky',
    'm',
    'my',
    'n',
    'ny',
    'o',
    'p',
    'pau',
    'py',
    'r',
    'ry',
    's',
    'sh',
    't',
    'ts',
    'u',
    'v',
    'w',
    'y',
    'z',
]

# Special symbols: padding, end-of-sequence, and silence markers.
_pad = "pad"
_eos = "eos"
_sil = "sil"

# Export all symbols; the index of each symbol in this list is its id.
JSUT_SYMBOLS = [_pad] + [_sil] + valid_symbols + [_eos]

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


@dataclass
class JSUTProcessor(BaseProcessor):
    """JSUT processor.

    Reads the JSUT ``text_kana`` YAML transcription file, converts the
    phoneme annotations to id sequences, and pairs them with the
    corresponding wav files for feature extraction.
    """

    cleaner_names: str = None
    speaker_name: str = "jsut"
    train_f_name: str = "text_kana/basic5000.yaml"

    def create_items(self):
        """Populate ``self.items`` with [phoneme_text, wav_path, utt_id, speaker] entries.

        Each utterance's ``phone_level3`` annotation ("a-i-u-..." format) is
        split on "-" and wrapped in leading/trailing silence markers.
        If ``self.data_dir`` is unset, ``self.items`` is left empty.
        """
        items = []
        if self.data_dir:
            with open(
                os.path.join(self.data_dir, self.train_f_name), encoding="utf-8"
            ) as f:
                # NOTE(review): FullLoader can instantiate arbitrary Python
                # objects; acceptable for the trusted JSUT annotation file,
                # but never point this at untrusted YAML.
                data_json = yaml.load(f, Loader=yaml.FullLoader)

            for utt_id, annotation in data_json.items():
                phones = [_sil] + annotation['phone_level3'].split("-") + [_sil]
                wav_path = os.path.join(self.data_dir, "wav", f"{utt_id}.wav")
                items.append(
                    [" ".join(phones), wav_path, utt_id, self.speaker_name]
                )
        self.items = items

    def setup_eos_token(self):
        """Return the end-of-sequence token appended to every id sequence."""
        return _eos

    def save_pretrained(self, saved_path):
        """Save the symbol mapper to ``saved_path`` for later reloading."""
        os.makedirs(saved_path, exist_ok=True)
        self._save_mapper(os.path.join(saved_path, PROCESSOR_FILE_NAME), {})

    def get_one_sample(self, item):
        """Load one utterance: decode audio, convert phonemes to ids.

        Args:
            item: [text, wav_path, utt_id, speaker_name] as built by
                ``create_items``.

        Returns:
            dict with raw_text, text_ids (int32 array), audio (float32,
            already in [-1, 1] — soundfile normalizes on read), utt_id,
            speaker_name, and sample rate.
        """
        text, wav_path, utt_id, speaker_name = item

        # normalize audio signal to be [-1, 1], soundfile already norm.
        audio, rate = sf.read(wav_path)
        audio = audio.astype(np.float32)

        # convert text to ids
        text_ids = np.asarray(self.text_to_sequence(text), np.int32)

        return {
            "raw_text": text,
            "text_ids": text_ids,
            "audio": audio,
            "utt_id": utt_id,
            "speaker_name": speaker_name,
            "rate": rate,
        }

    def text_to_sequence(self, text, inference=False):
        """Convert a space-separated phoneme string into a list of symbol ids.

        When ``inference`` is True, ``text`` is raw Japanese and is first
        run through OpenJTalk grapheme-to-phoneme conversion; the devoiced
        vowels "I"/"U" are folded back to "i"/"u" since the symbol table
        only contains the voiced forms. The eos id is always appended.
        """
        if inference:
            text = pyopenjtalk.g2p(text)
            text = text.replace("I", "i").replace("U", "u")
            print(f"phoneme seq: {text}")

        sequence = [self.symbol_to_id[symbol] for symbol in text.split()]

        # add eos tokens
        sequence += [self.eos_id]
        return sequence