# Path: blob/master/tensorflow_tts/configs/fastspeech.py (1558 views)
# -*- coding: utf-8 -*-
# Copyright 2020 Minh Nguyen (@dathudeptrai)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FastSpeech Config object."""

import collections

from tensorflow_tts.configs import BaseConfig
from tensorflow_tts.processor.ljspeech import LJSPEECH_SYMBOLS as lj_symbols
from tensorflow_tts.processor.kss import KSS_SYMBOLS as kss_symbols
from tensorflow_tts.processor.baker import BAKER_SYMBOLS as bk_symbols
from tensorflow_tts.processor.libritts import LIBRITTS_SYMBOLS as lbri_symbols
from tensorflow_tts.processor.jsut import JSUT_SYMBOLS as jsut_symbols


# Hyper-parameter bundle shared by the encoder and decoder self-attention
# stacks. Field order is part of the public contract (namedtuple) — do not
# reorder.
SelfAttentionParams = collections.namedtuple(
    "SelfAttentionParams",
    [
        "n_speakers",
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "attention_head_size",
        "intermediate_size",
        "intermediate_kernel_size",
        "hidden_act",
        "output_attentions",
        "output_hidden_states",
        "initializer_range",
        "hidden_dropout_prob",
        "attention_probs_dropout_prob",
        "layer_norm_eps",
        "max_position_embeddings",
    ],
)


class FastSpeechConfig(BaseConfig):
    """Initialize FastSpeech Config."""

    def __init__(
        self,
        dataset="ljspeech",
        vocab_size=len(lj_symbols),
        n_speakers=1,
        encoder_hidden_size=384,
        encoder_num_hidden_layers=4,
        encoder_num_attention_heads=2,
        encoder_attention_head_size=192,
        encoder_intermediate_size=1024,
        encoder_intermediate_kernel_size=3,
        encoder_hidden_act="mish",
        decoder_hidden_size=384,
        decoder_num_hidden_layers=4,
        decoder_num_attention_heads=2,
        decoder_attention_head_size=192,
        decoder_intermediate_size=1024,
        decoder_intermediate_kernel_size=3,
        decoder_hidden_act="mish",
        output_attentions=True,
        output_hidden_states=True,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        max_position_embeddings=2048,
        num_duration_conv_layers=2,
        duration_predictor_filters=256,
        duration_predictor_kernel_sizes=3,
        num_mels=80,
        duration_predictor_dropout_probs=0.1,
        n_conv_postnet=5,
        postnet_conv_filters=512,
        postnet_conv_kernel_sizes=5,
        postnet_dropout_rate=0.1,
        **kwargs
    ):
        """Init parameters for Fastspeech model.

        The vocabulary size is derived from the selected dataset's symbol
        table; for ``ljspeech`` the caller-supplied ``vocab_size`` is used
        directly (its default is ``len(lj_symbols)``).

        Raises:
            ValueError: if ``dataset`` is not one of the supported names.
        """
        # Symbol tables for every dataset whose vocabulary size is derived
        # rather than taken from the `vocab_size` argument.
        derived_vocab_symbols = {
            "kss": kss_symbols,
            "baker": bk_symbols,
            "libritts": lbri_symbols,
            "jsut": jsut_symbols,
        }
        if dataset == "ljspeech":
            self.vocab_size = vocab_size
        elif dataset in derived_vocab_symbols:
            self.vocab_size = len(derived_vocab_symbols[dataset])
        else:
            raise ValueError("No such dataset: {}".format(dataset))

        self.initializer_range = initializer_range
        self.max_position_embeddings = max_position_embeddings
        self.n_speakers = n_speakers
        self.layer_norm_eps = layer_norm_eps

        # Parameters identical for the encoder and decoder attention stacks;
        # built once and splatted into both SelfAttentionParams below.
        shared_attention_kwargs = dict(
            n_speakers=n_speakers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            initializer_range=initializer_range,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            layer_norm_eps=layer_norm_eps,
            max_position_embeddings=max_position_embeddings,
        )

        # encoder params
        self.encoder_self_attention_params = SelfAttentionParams(
            hidden_size=encoder_hidden_size,
            num_hidden_layers=encoder_num_hidden_layers,
            num_attention_heads=encoder_num_attention_heads,
            attention_head_size=encoder_attention_head_size,
            hidden_act=encoder_hidden_act,
            intermediate_size=encoder_intermediate_size,
            intermediate_kernel_size=encoder_intermediate_kernel_size,
            **shared_attention_kwargs
        )

        # decoder params
        self.decoder_self_attention_params = SelfAttentionParams(
            hidden_size=decoder_hidden_size,
            num_hidden_layers=decoder_num_hidden_layers,
            num_attention_heads=decoder_num_attention_heads,
            attention_head_size=decoder_attention_head_size,
            hidden_act=decoder_hidden_act,
            intermediate_size=decoder_intermediate_size,
            intermediate_kernel_size=decoder_intermediate_kernel_size,
            **shared_attention_kwargs
        )

        # duration-predictor params
        self.duration_predictor_dropout_probs = duration_predictor_dropout_probs
        self.num_duration_conv_layers = num_duration_conv_layers
        self.duration_predictor_filters = duration_predictor_filters
        self.duration_predictor_kernel_sizes = duration_predictor_kernel_sizes
        self.num_mels = num_mels

        # postnet
        self.n_conv_postnet = n_conv_postnet
        self.postnet_conv_filters = postnet_conv_filters
        self.postnet_conv_kernel_sizes = postnet_conv_kernel_sizes
        self.postnet_dropout_rate = postnet_dropout_rate