GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/notebooks/TensorFlowTTS_FastSpeech_with_TFLite.ipynb
¹⁵⁵⁸ views

Kernel: Python 3

Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");

In [ ]:

#@title Licensed under the Apache License, Version 2.0 (the "License"); { display-mode: "form" }
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Authors : jaeyoo@, khanhlvg@, abattery@, thaink@ (Google Research) (refactored by sayakpaul (PyImageSearch))

Created : 2020-07-03 KST

Last updated : 2020-07-04 KST

Change logs

2020-07-04 KST : Update notebook with the latest repo.
https://github.com/TensorSpeech/TensorflowTTS/pull/84 merged.
2020-07-03 KST : First implementation (outputs : fastspeech_quant.tflite)
varied-length input tensor, varied-length output tensor
Inference on tflite works well.
2020-12-22 IST: Notebook runs end-to-end on Colab.

Status : successfully converted (fastspeech_quant.tflite)

Disclaimer

This Colab does not optimize for latency; it only shrinks the model with quantization (112 MB -> 28 MB).
The TFLite file does not include the LJSpeechProcessor, so you need to run the processor on your text yourself to obtain the input vectors before feeding them to the model.
Requires tf-nightly>=2.4.0-dev20200630.

Generate voice with FastSpeech

In [ ]:

!git clone https://github.com/TensorSpeech/TensorFlowTTS.git
!cd TensorFlowTTS
!pip install /content/TensorFlowTTS/

In [ ]:

# Install the nightly TensorFlow build (-q keeps pip quiet); the disclaimer at
# the top of this notebook lists tf-nightly>=2.4.0-dev20200630.
!pip install -q tf-nightly

Restart the runtime again after this install so that the newly installed tf-nightly is the version that gets imported.

In [ ]:

import numpy as np
import yaml
import tensorflow as tf

import sys
sys.path.append('/content/TensorFlowTTS')

from tensorflow_tts.inference import AutoProcessor
from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import TFAutoModel

from IPython.display import Audio
print(tf.__version__) # check if >= 2.4.0

In [ ]:

# Initialize the MelGAN model from its pretrained checkpoint. Later cells call
# it on mel outputs to obtain the waveforms that are played back.
melgan = TFAutoModel.from_pretrained("tensorspeech/tts-melgan-ljspeech-en")

In [ ]:

# Initialize the FastSpeech model from its pretrained checkpoint.
# (The method is `from_pretrained`; the misspelled `from_pretraned` raised
# AttributeError.)
fastspeech = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech-ljspeech-en")

In [ ]:

# Sentence to synthesize. Adjacent string literals are concatenated, and every
# piece ends with an explicit space so that words do not run together (the
# previous backslash continuation produced "meditatingfor").
input_text = (
    "Recent research at Harvard has shown meditating "
    "for as little as 8 weeks, can actually increase the grey matter in the "
    "parts of the brain responsible for emotional regulation, and learning."
)

# Convert the lower-cased text into the ID sequence the model consumes.
processor = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech-ljspeech-en")
input_ids = processor.text_to_sequence(input_text.lower())

# Run inference on a batch of one: `expand_dims(..., 0)` adds the leading axis,
# and the remaining arguments are one-element vectors.
# NOTE(review): the ratios of 1.0 presumably leave speed / f0 / energy
# unchanged - confirm against the model's documentation.
mel_before, mel_after, duration_outputs, _, _ = fastspeech.inference(
    input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
    speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
    speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
)

# Vocode both mel outputs with MelGAN and keep index 0 of the first and last
# axes (presumably batch and channel) to get a 1-D waveform for playback.
audio_before = melgan(mel_before)[0, :, 0]
audio_after = melgan(mel_after)[0, :, 0]

In [ ]:

# Play the waveform vocoded from `mel_before`, using a rate of 22050.
Audio(data=audio_before, rate=22050)

In [ ]:

# Play the waveform vocoded from `mel_after`, using a rate of 22050.
Audio(data=audio_after, rate=22050)

Convert to TFLite

In [ ]:

# Concrete Function
# Obtain a concrete function from the model's `inference_tflite` method; the
# TFLite converter in the next cell is built from it.
fastspeech_concrete_function = fastspeech.inference_tflite.get_concrete_function()

In [ ]:

# Build a TFLite converter from the concrete function obtained above.
converter = tf.lite.TFLiteConverter.from_concrete_functions(
    [fastspeech_concrete_function]
)
# Turn on the converter's default optimizations; the disclaimer at the top of
# this notebook attributes the 112 MB -> 28 MB size reduction to quantization.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Allow both TFLite builtin ops and selected TensorFlow ops in the output model.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                       tf.lite.OpsSet.SELECT_TF_OPS]
# Run the conversion; the result is written to disk in the next cell.
tflite_model = converter.convert()

In [ ]:

# Write the converted model to disk in binary mode.
with open('fastspeech_quant.tflite', 'wb') as tflite_file:
  tflite_file.write(tflite_model)

# Report the size of the serialized model, converting bytes to mebibytes.
model_size_mb = len(tflite_model) / 1024 / 1024.0
print('Model size is %f MBs.' % model_size_mb)

In [ ]:

## Download the TF Lite model
#from google.colab import files
#files.download('fastspeech_quant.tflite')

Inference from TFLite

In [ ]:

import numpy as np
import tensorflow as tf

# Create an interpreter for the converted model. Tensors are not allocated
# here; infer() calls allocate_tensors() after resizing the inputs.
interpreter = tf.lite.Interpreter(model_path='fastspeech_quant.tflite')

# Get the input and output tensor descriptions; infer() uses their 'index'
# entries to address the tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Prepare input data.
def prepare_input(input_ids):
  """Bundle the token IDs with fixed control values for the TFLite model.

  Args:
    input_ids: Sequence of integer IDs, convertible to an int32 tensor.

  Returns:
    A 5-tuple of tensors: the IDs with a leading axis of size 1 added, an
    int32 tensor `[0]`, and three separate float32 tensors `[1.0]`.
    NOTE(review): the last four presumably mirror the speaker_ids /
    speed_ratios / f0_ratios / energy_ratios arguments of the
    `fastspeech.inference` call earlier in this notebook - confirm the order
    against the model's TFLite signature.
  """
  ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
  batched_ids = tf.expand_dims(ids_tensor, 0)

  zero_id = tf.convert_to_tensor([0], tf.int32)
  first_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)
  second_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)
  third_ratio = tf.convert_to_tensor([1.0], dtype=tf.float32)

  return (batched_ids, zero_id, first_ratio, second_ratio, third_ratio)

# Run text-to-mel inference on the given text through the TFLite interpreter.
def infer(input_text):
  """Run the TFLite FastSpeech model on `input_text`.

  The text is lower-cased and converted to an ID sequence by the processor,
  the interpreter inputs are resized for this utterance, and the model is
  invoked once.

  Args:
    input_text: Text to synthesize.

  Returns:
    A 2-tuple with copies of the interpreter's first and second output
    tensors, in the order reported by `get_output_details()`. The cells that
    call this function unpack them as (decoder_output, mel_output).
  """
  # Load the processor from the same Hub checkpoint used by the FastSpeech
  # synthesis cell; no step in this notebook creates a local
  # "ljspeech_mapper.json", which the previous path pointed at.
  processor = AutoProcessor.from_pretrained(
      pretrained_path="tensorspeech/tts-fastspeech-ljspeech-en")
  input_ids = processor.text_to_sequence(input_text.lower())

  # The first input carries the ID sequence, so its length changes per call.
  interpreter.resize_tensor_input(input_details[0]['index'],
                                  [1, len(input_ids)])
  # Every remaining input is a one-element vector (see prepare_input).
  for detail in input_details[1:]:
    interpreter.resize_tensor_input(detail['index'], [1])
  # Allocate after resizing so the tensors have the new shapes before they
  # are written to.
  interpreter.allocate_tensors()

  input_data = prepare_input(input_ids)
  for i, detail in enumerate(input_details):
    interpreter.set_tensor(detail['index'], input_data[i])

  interpreter.invoke()

  # The function `get_tensor()` returns a copy of the tensor data.
  # Use `tensor()` in order to get a pointer to the tensor.
  return (interpreter.get_tensor(output_details[0]['index']),
          interpreter.get_tensor(output_details[1]['index']))

In [ ]:

# Sentence to synthesize with the TFLite model. Adjacent string literals are
# concatenated, and every piece ends with an explicit space so that words do
# not run together (the previous backslash continuation produced
# "meditatingfor").
input_text = (
    "Recent research at Harvard has shown meditating "
    "for as little as 8 weeks, can actually increase the grey matter in the "
    "parts of the brain responsible for emotional regulation, and learning."
)

# infer() returns two mel outputs; vocode each with MelGAN and keep index 0 of
# the first and last axes (presumably batch and channel) as the waveform.
decoder_output_tflite, mel_output_tflite = infer(input_text)
audio_before_tflite = melgan(decoder_output_tflite)[0, :, 0]
audio_after_tflite = melgan(mel_output_tflite)[0, :, 0]

In [ ]:

# Play the waveform vocoded from the TFLite model's first output, using a rate of 22050.
Audio(data=audio_before_tflite, rate=22050)

In [ ]:

# Play the waveform vocoded from the TFLite model's second output, using a rate of 22050.
Audio(data=audio_after_tflite, rate=22050)

In [ ]:

# A second sample sentence for the TFLite model, built from adjacent string
# literals instead of a backslash continuation (the resulting text is the same).
input_text = (
    "I love TensorFlow Lite converted FastSpeech with quantization. "
    "The converted model file is of 28.6 Mega bytes."
)

# Run the TFLite model, then vocode both of its mel outputs with MelGAN,
# keeping index 0 of the first and last axes as the waveform.
decoder_output_tflite, mel_output_tflite = infer(input_text)
audio_before_tflite = melgan(decoder_output_tflite)[0, :, 0]
audio_after_tflite = melgan(mel_output_tflite)[0, :, 0]

In [ ]:

# Play the waveform vocoded from the TFLite model's first output for the second sentence.
Audio(data=audio_before_tflite, rate=22050)

In [ ]:

# Play the waveform vocoded from the TFLite model's second output for the second sentence.
Audio(data=audio_after_tflite, rate=22050)

Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Generate voice with FastSpeech

Convert to TFLite

Inference from TFLite

Product

Resources

Company