Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
TensorSpeech
GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/tensorflow_tts/utils/cleaners.py
1558 views
1
# -*- coding: utf-8 -*-
2
# Copyright (c) 2017 Keith Ito
3
#
4
# Permission is hereby granted, free of charge, to any person obtaining a copy
5
# of this software and associated documentation files (the "Software"), to deal
6
# in the Software without restriction, including without limitation the rights
7
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
# copies of the Software, and to permit persons to whom the Software is
9
# furnished to do so, subject to the following conditions:
10
11
# The above copyright notice and this permission notice shall be included in
12
# all copies or substantial portions of the Software.
13
14
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
# THE SOFTWARE.
21
22
import re
23
24
from tensorflow_tts.utils.korean import tokenize as ko_tokenize
25
from tensorflow_tts.utils.number_norm import normalize_numbers
26
from unidecode import unidecode
27
28
# german_transliterate is an optional dependency that only german_cleaners
# needs. When it cannot be imported, the name GermanTransliterate is left
# undefined and german_cleaners reports the missing package when called.
try:
    from german_transliterate.core import GermanTransliterate
except ImportError:
    # Only a failed import is tolerated here; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit raised while importing.
    pass
32
33
# Regular expression matching whitespace:
34
# Compiled once at import time so collapse_whitespace can reuse it on every call.
_whitespace_re = re.compile(r"\s+")
35
36
# (compiled pattern, spoken form) pairs used by expand_abbreviations.
# Each pattern matches the abbreviation at a word boundary followed by a
# literal period, case-insensitively; the period is consumed by the match.
_abbreviations = [
    (re.compile("\\b%s\\." % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    )
]
60
61
62
def expand_abbreviations(text):
    """Replace each known abbreviation in ``text`` with its spelled-out form.

    Every (pattern, expansion) pair in ``_abbreviations`` is applied in list
    order, each substitution operating on the result of the previous one.
    """
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
66
67
68
def expand_numbers(text):
    """Return ``text`` after passing it through ``normalize_numbers``."""
    normalized = normalize_numbers(text)
    return normalized
70
71
72
def lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered
74
75
76
def collapse_whitespace(text):
    """Replace every run of whitespace in ``text`` with a single space."""
    return _whitespace_re.sub(" ", text)
78
79
80
def convert_to_ascii(text):
    """Return ``unidecode(text)``, the unidecode transliteration of the input."""
    transliterated = unidecode(text)
    return transliterated
82
83
84
def basic_cleaners(text):
    """Lowercase ``text`` and then collapse its whitespace runs.

    No transliteration is performed, so non-ASCII characters are kept.
    """
    return collapse_whitespace(lowercase(text))
89
90
91
def transliteration_cleaners(text):
    """Clean non-English text: transliterate, lowercase, collapse whitespace.

    The steps run in exactly that order, each on the previous step's output.
    """
    ascii_text = convert_to_ascii(text)
    lowered = lowercase(ascii_text)
    return collapse_whitespace(lowered)
97
98
99
def english_cleaners(text):
    """Clean English text, expanding numbers and abbreviations.

    The steps below are applied in order, each receiving the output of the
    previous one.
    """
    pipeline = (
        convert_to_ascii,
        lowercase,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
107
108
109
def korean_cleaners(text):
    """Clean Korean text by delegating entirely to ``ko_tokenize``.

    No other cleaning step is applied here; whatever ``ko_tokenize`` returns
    is returned unchanged.
    """
    # Example: '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ']
    return ko_tokenize(text)
115
116
def german_cleaners(text):
    """Clean German text with ``GermanTransliterate`` from german_transliterate.

    The transliterator is configured to replace ';' with ',' and ':' with a
    space, and to use ' -- ' as its ``sep_abbreviation`` value; the result of
    its ``transliterate`` method is returned as-is.

    Raises:
        ModuleNotFoundError: if the optional german_transliterate package
            could not be imported, so ``GermanTransliterate`` is undefined.
    """
    # Only the name lookup is guarded: a NameError raised inside the library
    # itself must propagate instead of being reported as a missing package.
    try:
        transliterator_cls = GermanTransliterate
    except NameError:
        # The NameError is an implementation detail of the optional import,
        # so it is suppressed from the traceback.
        raise ModuleNotFoundError("Install german_transliterate package to use german_cleaners") from None
    transliterator = transliterator_cls(
        replace={";": ",", ":": " "}, sep_abbreviation=" -- "
    )
    return transliterator.transliterate(text)
123