Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
TensorSpeech
GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/tensorflow_tts/models/fastspeech2.py
1558 views
1
# -*- coding: utf-8 -*-
2
# Copyright 2020 The FastSpeech2 Authors and Minh Nguyen (@dathudeptrai)
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
# http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
"""Tensorflow Model modules for FastSpeech2."""
16
17
import tensorflow as tf
18
19
from tensorflow_tts.models.fastspeech import TFFastSpeech, get_initializer
20
21
22
class TFFastSpeechVariantPredictor(tf.keras.layers.Layer):
    """Variance predictor module shared by FastSpeech2's duration, f0 and energy heads.

    Structure: N blocks of [Conv1D -> ReLU -> LayerNorm -> Dropout] wrapped in a
    Sequential, followed by a Dense(1) projection squeezed to [batch, length].
    When the config declares multiple speakers, a learned speaker embedding
    (passed through a softplus-activated Dense) is broadcast-added to the
    encoder hidden states before prediction.
    """

    def __init__(self, config, **kwargs):
        """Init variables.

        Args:
            config: model configuration object; must expose
                variant_prediction_num_conv_layers, variant_predictor_filter,
                variant_predictor_kernel_size, variant_predictor_dropout_rate,
                layer_norm_eps, n_speakers and
                encoder_self_attention_params.hidden_size.
        """
        super().__init__(**kwargs)
        self.conv_layers = []
        for i in range(config.variant_prediction_num_conv_layers):
            # NOTE: layer names ("conv_._{i}", "LayerNorm_._{i}") are part of
            # the checkpoint variable paths — do not rename.
            self.conv_layers.append(
                tf.keras.layers.Conv1D(
                    config.variant_predictor_filter,
                    config.variant_predictor_kernel_size,
                    padding="same",
                    name="conv_._{}".format(i),
                )
            )
            self.conv_layers.append(tf.keras.layers.Activation(tf.nn.relu))
            self.conv_layers.append(
                tf.keras.layers.LayerNormalization(
                    epsilon=config.layer_norm_eps, name="LayerNorm_._{}".format(i)
                )
            )
            self.conv_layers.append(
                tf.keras.layers.Dropout(config.variant_predictor_dropout_rate)
            )
        self.conv_layers_sequence = tf.keras.Sequential(self.conv_layers)
        # Final projection to a single scalar per timestep.
        self.output_layer = tf.keras.layers.Dense(1)

        if config.n_speakers > 1:
            # Per-speaker embedding + projection, only built for multi-speaker models.
            self.decoder_speaker_embeddings = tf.keras.layers.Embedding(
                config.n_speakers,
                config.encoder_self_attention_params.hidden_size,
                embeddings_initializer=get_initializer(config.initializer_range),
                name="speaker_embeddings",
            )
            self.speaker_fc = tf.keras.layers.Dense(
                units=config.encoder_self_attention_params.hidden_size,
                name="speaker_fc",
            )

        self.config = config

    def call(self, inputs, training=False):
        """Call logic.

        Args:
            inputs: tuple of (encoder_hidden_states [batch, length, hidden],
                speaker_ids [batch], attention_mask [batch, length]).
                Shapes assumed from usage in TFFastSpeech2 — TODO confirm.
            training: Python boolean, standard Keras training flag.

        Returns:
            Tensor of shape [batch, length] with one predicted value per
            (masked) input position; masked positions are zeroed.
        """
        encoder_hidden_states, speaker_ids, attention_mask = inputs
        attention_mask = tf.cast(
            tf.expand_dims(attention_mask, 2), encoder_hidden_states.dtype
        )

        if self.config.n_speakers > 1:
            speaker_embeddings = self.decoder_speaker_embeddings(speaker_ids)
            speaker_features = tf.math.softplus(self.speaker_fc(speaker_embeddings))
            # extended speaker embeddings: [batch, 1, hidden] broadcasts over length
            extended_speaker_features = speaker_features[:, tf.newaxis, :]
            encoder_hidden_states += extended_speaker_features

        # mask encoder hidden states so padded positions contribute nothing
        masked_encoder_hidden_states = encoder_hidden_states * attention_mask

        # pass through the conv stack
        # NOTE(review): `training` is not forwarded explicitly here; the inner
        # Dropout layers rely on Keras' call-context training propagation — verify
        # this behaves as intended on the targeted TF version.
        outputs = self.conv_layers_sequence(masked_encoder_hidden_states)
        outputs = self.output_layer(outputs)
        # re-mask after the conv/dense stack ("same" padding leaks into masked slots)
        masked_outputs = outputs * attention_mask

        outputs = tf.squeeze(masked_outputs, -1)
        return outputs
88
89
90
class TFFastSpeech2(TFFastSpeech):
    """TF FastSpeech2 module.

    Extends TFFastSpeech with variance adaptors: duration, f0 (pitch) and
    energy predictors, plus Conv1D embeddings that inject ground-truth (train)
    or predicted (inference) f0/energy back into the encoder hidden states
    before length regulation and decoding.
    """

    def __init__(self, config, **kwargs):
        """Init layers for fastspeech.

        Args:
            config: model configuration; also consumed by the parent
                TFFastSpeech constructor (defined outside this file).
        """
        super().__init__(config, **kwargs)
        # Variance predictors are forced to float32 so their outputs stay
        # numerically stable even under mixed-precision policies.
        self.f0_predictor = TFFastSpeechVariantPredictor(
            config, dtype=tf.float32, name="f0_predictor"
        )
        self.energy_predictor = TFFastSpeechVariantPredictor(
            config, dtype=tf.float32, name="energy_predictor",
        )
        self.duration_predictor = TFFastSpeechVariantPredictor(
            config, dtype=tf.float32, name="duration_predictor"
        )

        # define f0_embeddings and energy_embeddings: project the scalar
        # f0/energy track back into the encoder hidden size.
        self.f0_embeddings = tf.keras.layers.Conv1D(
            filters=config.encoder_self_attention_params.hidden_size,
            kernel_size=9,
            padding="same",
            name="f0_embeddings",
        )
        self.f0_dropout = tf.keras.layers.Dropout(0.5)
        self.energy_embeddings = tf.keras.layers.Conv1D(
            filters=config.encoder_self_attention_params.hidden_size,
            kernel_size=9,
            padding="same",
            name="energy_embeddings",
        )
        self.energy_dropout = tf.keras.layers.Dropout(0.5)

    def _build(self):
        """Build all weights by running one dummy forward pass."""
        # fake inputs
        input_ids = tf.convert_to_tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32)
        speaker_ids = tf.convert_to_tensor([0], tf.int32)
        duration_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
        f0_gts = tf.convert_to_tensor(
            [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]], tf.float32
        )
        energy_gts = tf.convert_to_tensor(
            [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]], tf.float32
        )
        self(
            input_ids=input_ids,
            speaker_ids=speaker_ids,
            duration_gts=duration_gts,
            f0_gts=f0_gts,
            energy_gts=energy_gts,
        )

    def call(
        self,
        input_ids,
        speaker_ids,
        duration_gts,
        f0_gts,
        energy_gts,
        training=False,
        **kwargs,
    ):
        """Training-time forward pass using ground-truth duration/f0/energy.

        Args:
            input_ids: int32 [batch, char_length] token ids; 0 is padding.
            speaker_ids: int32 [batch] speaker indices.
            duration_gts: int32 [batch, char_length] ground-truth durations.
            f0_gts: float32 [batch, char_length] ground-truth f0 values.
            energy_gts: float32 [batch, char_length] ground-truth energy values.
            training: standard Keras training flag.

        Returns:
            Tuple (mels_before, mels_after, duration_outputs, f0_outputs,
            energy_outputs); mels_after adds the postnet residual.
        """
        # padding positions (id == 0) are masked out everywhere downstream
        attention_mask = tf.math.not_equal(input_ids, 0)
        embedding_output = self.embeddings([input_ids, speaker_ids], training=training)
        encoder_output = self.encoder(
            [embedding_output, attention_mask], training=training
        )
        last_encoder_hidden_states = encoder_output[0]

        # energy predictor, here use last_encoder_hidden_states, u can use more hidden_states layers
        # rather than just use last_hidden_states of encoder for energy_predictor.
        # NOTE(review): unlike the f0/energy calls below, `training` is not
        # passed here; Keras call-context propagation is being relied on — confirm.
        duration_outputs = self.duration_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask]
        )  # [batch_size, length]

        f0_outputs = self.f0_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=training
        )
        energy_outputs = self.energy_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=training
        )

        # Ground-truth tracks (not predictions) are embedded during training
        # (teacher forcing).
        f0_embedding = self.f0_embeddings(
            tf.expand_dims(f0_gts, 2)
        )  # [batch_size, mel_length, feature]
        energy_embedding = self.energy_embeddings(
            tf.expand_dims(energy_gts, 2)
        )  # [batch_size, mel_length, feature]

        # apply dropout both training/inference (deliberate: training=True always)
        f0_embedding = self.f0_dropout(f0_embedding, training=True)
        energy_embedding = self.energy_dropout(energy_embedding, training=True)

        # sum features into the encoder output before length regulation
        last_encoder_hidden_states += f0_embedding + energy_embedding

        # expand each encoder frame by its ground-truth duration
        length_regulator_outputs, encoder_masks = self.length_regulator(
            [last_encoder_hidden_states, duration_gts], training=training
        )

        # create decoder positional embedding (1-based; 0 reserved for padding)
        decoder_pos = tf.range(
            1, tf.shape(length_regulator_outputs)[1] + 1, dtype=tf.int32
        )
        masked_decoder_pos = tf.expand_dims(decoder_pos, 0) * encoder_masks

        decoder_output = self.decoder(
            [length_regulator_outputs, speaker_ids, encoder_masks, masked_decoder_pos],
            training=training,
        )
        last_decoder_hidden_states = decoder_output[0]

        # here u can use sum or concat more than 1 hidden states layers from decoder.
        mels_before = self.mel_dense(last_decoder_hidden_states)
        mels_after = (
            self.postnet([mels_before, encoder_masks], training=training) + mels_before
        )

        outputs = (
            mels_before,
            mels_after,
            duration_outputs,
            f0_outputs,
            energy_outputs,
        )
        return outputs

    def _inference(
        self, input_ids, speaker_ids, speed_ratios, f0_ratios, energy_ratios, **kwargs,
    ):
        """Inference forward pass driven entirely by predicted variances.

        Args:
            input_ids: int32 [batch, char_length] token ids; 0 is padding.
            speaker_ids: int32 [batch] speaker indices.
            speed_ratios: float32 [batch] duration multiplier (>1 = slower).
            f0_ratios: float32 [batch] multiplier on predicted f0.
            energy_ratios: float32 [batch] multiplier on predicted energy.

        Returns:
            Tuple (mel_before, mel_after, duration_outputs, f0_outputs,
            energy_outputs).
        """
        attention_mask = tf.math.not_equal(input_ids, 0)
        embedding_output = self.embeddings([input_ids, speaker_ids], training=False)
        encoder_output = self.encoder(
            [embedding_output, attention_mask], training=False
        )
        last_encoder_hidden_states = encoder_output[0]

        # expand ratios so they broadcast over the length dimension
        speed_ratios = tf.expand_dims(speed_ratios, 1)  # [B, 1]
        f0_ratios = tf.expand_dims(f0_ratios, 1)  # [B, 1]
        energy_ratios = tf.expand_dims(energy_ratios, 1)  # [B, 1]

        # energy predictor, here use last_encoder_hidden_states, u can use more hidden_states layers
        # rather than just use last_hidden_states of encoder for energy_predictor.
        duration_outputs = self.duration_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask]
        )  # [batch_size, length]
        # predictor outputs log-domain durations: invert with exp(x) - 1,
        # clamp negatives to 0, scale by speed, and round to integer frames
        duration_outputs = tf.nn.relu(tf.math.exp(duration_outputs) - 1.0)
        duration_outputs = tf.cast(
            tf.math.round(duration_outputs * speed_ratios), tf.int32
        )

        f0_outputs = self.f0_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=False
        )
        f0_outputs *= f0_ratios

        energy_outputs = self.energy_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=False
        )
        energy_outputs *= energy_ratios

        # dropout stays active at inference (training=True), mirroring call()
        f0_embedding = self.f0_dropout(
            self.f0_embeddings(tf.expand_dims(f0_outputs, 2)), training=True
        )
        energy_embedding = self.energy_dropout(
            self.energy_embeddings(tf.expand_dims(energy_outputs, 2)), training=True
        )

        # sum features
        last_encoder_hidden_states += f0_embedding + energy_embedding

        length_regulator_outputs, encoder_masks = self.length_regulator(
            [last_encoder_hidden_states, duration_outputs], training=False
        )

        # create decoder positional embedding (1-based; 0 reserved for padding)
        decoder_pos = tf.range(
            1, tf.shape(length_regulator_outputs)[1] + 1, dtype=tf.int32
        )
        masked_decoder_pos = tf.expand_dims(decoder_pos, 0) * encoder_masks

        decoder_output = self.decoder(
            [length_regulator_outputs, speaker_ids, encoder_masks, masked_decoder_pos],
            training=False,
        )
        last_decoder_hidden_states = decoder_output[0]

        # here u can use sum or concat more than 1 hidden states layers from decoder.
        mel_before = self.mel_dense(last_decoder_hidden_states)
        mel_after = (
            self.postnet([mel_before, encoder_masks], training=False) + mel_before
        )

        outputs = (mel_before, mel_after, duration_outputs, f0_outputs, energy_outputs)
        return outputs

    def setup_inference_fn(self):
        """Wrap _inference in tf.functions with fixed input signatures.

        `inference` accepts any batch size; `inference_tflite` pins batch
        size to 1 for TFLite conversion, which requires static batch dims.
        """
        self.inference = tf.function(
            self._inference,
            experimental_relax_shapes=True,
            input_signature=[
                tf.TensorSpec(shape=[None, None], dtype=tf.int32, name="input_ids"),
                tf.TensorSpec(shape=[None,], dtype=tf.int32, name="speaker_ids"),
                tf.TensorSpec(shape=[None,], dtype=tf.float32, name="speed_ratios"),
                tf.TensorSpec(shape=[None,], dtype=tf.float32, name="f0_ratios"),
                tf.TensorSpec(shape=[None,], dtype=tf.float32, name="energy_ratios"),
            ],
        )

        self.inference_tflite = tf.function(
            self._inference,
            experimental_relax_shapes=True,
            input_signature=[
                tf.TensorSpec(shape=[1, None], dtype=tf.int32, name="input_ids"),
                tf.TensorSpec(shape=[1,], dtype=tf.int32, name="speaker_ids"),
                tf.TensorSpec(shape=[1,], dtype=tf.float32, name="speed_ratios"),
                tf.TensorSpec(shape=[1,], dtype=tf.float32, name="f0_ratios"),
                tf.TensorSpec(shape=[1,], dtype=tf.float32, name="energy_ratios"),
            ],
        )
313
314