# Path: blob/master/tensorflow_tts/utils/cleaners.py
# 1558 views
# -*- coding: utf-8 -*-
# Copyright (c) 2017 Keith Ito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import re

from tensorflow_tts.utils.korean import tokenize as ko_tokenize
from tensorflow_tts.utils.number_norm import normalize_numbers
from unidecode import unidecode

try:
    from german_transliterate.core import GermanTransliterate
except ImportError:
    # Optional dependency: only german_cleaners() uses it, so a missing package
    # must not prevent this module from being imported. The None sentinel lets
    # german_cleaners() report the problem explicitly.
    GermanTransliterate = None

# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations.
# Each pattern matches the abbreviation at a word boundary followed by a
# literal period, case-insensitively.
# NOTE(review): "misess" looks like a phonetic or misspelled form of "missus";
# it is left unchanged here because it is runtime output.
_abbreviations = [
    (re.compile(r"\b%s\." % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in [
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    ]
]


def expand_abbreviations(text):
    """Replace every abbreviation listed in ``_abbreviations`` with its expansion.

    The patterns are applied one after another in list order; the trailing
    period of a matched abbreviation is consumed by the replacement.
    """
    for regex, replacement in _abbreviations:
        text = regex.sub(replacement, text)
    return text


def expand_numbers(text):
    """Return ``text`` passed through ``normalize_numbers``."""
    return normalize_numbers(text)


def lowercase(text):
    """Return ``text`` converted to lower case."""
    return text.lower()


def collapse_whitespace(text):
    """Replace each run of whitespace in ``text`` with a single space."""
    return _whitespace_re.sub(" ", text)


def convert_to_ascii(text):
    """Return ``text`` passed through ``unidecode``."""
    return unidecode(text)


def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    """Pipeline for non-English text that transliterates to ASCII."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    """Pipeline for English text, including number and abbreviation expansion."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text


def korean_cleaners(text):
    """Pipeline for Korean text: return the result of ``ko_tokenize(text)``.

    No other cleaning step is applied in this function.
    """
    # Example carried over from the original comment:
    # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ']
    return ko_tokenize(text)


def german_cleaners(text):
    """Pipeline for German text, delegating to ``GermanTransliterate``.

    The transliterator is built with ``replace={';': ',', ':': ' '}`` and
    ``sep_abbreviation=' -- '`` and its ``transliterate`` result is returned.

    Raises:
        ModuleNotFoundError: if the optional ``german_transliterate`` package
            could not be imported when this module was loaded.
    """
    # Check the import sentinel explicitly rather than catching NameError
    # around the call, which would also mislabel an unrelated NameError raised
    # inside the library as a missing package.
    if GermanTransliterate is None:
        raise ModuleNotFoundError(
            "Install german_transliterate package to use german_cleaners"
        )
    transliterator = GermanTransliterate(
        replace={";": ",", ":": " "}, sep_abbreviation=" -- "
    )
    return transliterator.transliterate(text)