# Source: tensorflow_tts/processor/jsut.py
# -*- coding: utf-8 -*-
# Copyright 2020 TensorFlowTTS Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perform preprocessing and raw feature extraction for JSUT dataset."""

import os
import re
from dataclasses import dataclass

import librosa  # noqa: F401 — retained for optional resampling support
import numpy as np
import pyopenjtalk
import soundfile as sf
import yaml

from tensorflow_tts.processor import BaseProcessor
from tensorflow_tts.utils.utils import PROCESSOR_FILE_NAME

# Phoneme inventory used by the JSUT kana transcriptions (OpenJTalk-style
# symbols: consonants, vowels, 'cl' for the sokuon, 'pau' for pauses, 'N'
# for the moraic nasal).
valid_symbols = [
    'N',
    'a',
    'b',
    'by',
    'ch',
    'cl',
    'd',
    'dy',
    'e',
    'f',
    'g',
    'gy',
    'h',
    'hy',
    'i',
    'j',
    'k',
    'ky',
    'm',
    'my',
    'n',
    'ny',
    'o',
    'p',
    'pau',
    'py',
    'r',
    'ry',
    's',
    'sh',
    't',
    'ts',
    'u',
    'v',
    'w',
    'y',
    'z',
]

# Special symbols: padding, end-of-sequence, and silence markers.
_pad = "pad"
_eos = "eos"
_sil = "sil"

# Export all symbols; the index of each symbol in this list is its id.
JSUT_SYMBOLS = [_pad] + [_sil] + valid_symbols + [_eos]

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


@dataclass
class JSUTProcessor(BaseProcessor):
    """JSUT processor.

    Reads the JSUT ``text_kana`` YAML transcription file, converts the
    phoneme annotations to id sequences, and pairs them with the
    corresponding wav files for feature extraction.
    """

    cleaner_names: str = None
    speaker_name: str = "jsut"
    train_f_name: str = "text_kana/basic5000.yaml"

    def create_items(self):
        """Populate ``self.items`` with [phoneme_text, wav_path, utt_id, speaker] entries.

        Each utterance's ``phone_level3`` annotation ("a-i-u-..." format) is
        split on "-" and wrapped in leading/trailing silence markers.
        If ``self.data_dir`` is unset, ``self.items`` is left empty.
        """
        items = []
        if self.data_dir:
            with open(
                os.path.join(self.data_dir, self.train_f_name), encoding="utf-8"
            ) as f:
                # NOTE(review): FullLoader can instantiate arbitrary Python
                # objects; acceptable for the trusted JSUT annotation file,
                # but never point this at untrusted YAML.
                data_json = yaml.load(f, Loader=yaml.FullLoader)

            for utt_id, annotation in data_json.items():
                phones = [_sil] + annotation['phone_level3'].split("-") + [_sil]
                wav_path = os.path.join(self.data_dir, "wav", f"{utt_id}.wav")
                items.append(
                    [" ".join(phones), wav_path, utt_id, self.speaker_name]
                )
        self.items = items

    def setup_eos_token(self):
        """Return the end-of-sequence token appended to every id sequence."""
        return _eos

    def save_pretrained(self, saved_path):
        """Save the symbol mapper to ``saved_path`` for later reloading."""
        os.makedirs(saved_path, exist_ok=True)
        self._save_mapper(os.path.join(saved_path, PROCESSOR_FILE_NAME), {})

    def get_one_sample(self, item):
        """Load one utterance: decode audio, convert phonemes to ids.

        Args:
            item: [text, wav_path, utt_id, speaker_name] as built by
                ``create_items``.

        Returns:
            dict with raw_text, text_ids (int32 array), audio (float32,
            already in [-1, 1] — soundfile normalizes on read), utt_id,
            speaker_name, and sample rate.
        """
        text, wav_path, utt_id, speaker_name = item

        # normalize audio signal to be [-1, 1], soundfile already norm.
        audio, rate = sf.read(wav_path)
        audio = audio.astype(np.float32)

        # convert text to ids
        text_ids = np.asarray(self.text_to_sequence(text), np.int32)

        return {
            "raw_text": text,
            "text_ids": text_ids,
            "audio": audio,
            "utt_id": utt_id,
            "speaker_name": speaker_name,
            "rate": rate,
        }

    def text_to_sequence(self, text, inference=False):
        """Convert a space-separated phoneme string into a list of symbol ids.

        When ``inference`` is True, ``text`` is raw Japanese and is first
        run through OpenJTalk grapheme-to-phoneme conversion; the devoiced
        vowels "I"/"U" are folded back to "i"/"u" since the symbol table
        only contains the voiced forms. The eos id is always appended.
        """
        if inference:
            text = pyopenjtalk.g2p(text)
            text = text.replace("I", "i").replace("U", "u")
            print(f"phoneme seq: {text}")

        sequence = [self.symbol_to_id[symbol] for symbol in text.split()]

        # add eos tokens
        sequence += [self.eos_id]
        return sequence