CoCalc -- cleaners.py

GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/tensorflow_tts/utils/cleaners.py
¹⁵⁵⁸ views
1
# -*- coding: utf-8 -*-
2
# Copyright (c) 2017 Keith Ito
3
#
4
# Permission is hereby granted, free of charge, to any person obtaining a copy
5
# of this software and associated documentation files (the "Software"), to deal
6
# in the Software without restriction, including without limitation the rights
7
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
# copies of the Software, and to permit persons to whom the Software is
9
# furnished to do so, subject to the following conditions:
10

11
# The above copyright notice and this permission notice shall be included in
12
# all copies or substantial portions of the Software.
13

14
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
# THE SOFTWARE.
21

22
import re
23

24
from tensorflow_tts.utils.korean import tokenize as ko_tokenize
25
from tensorflow_tts.utils.number_norm import normalize_numbers
26
from unidecode import unidecode
27

28
try:
    # Optional dependency: only german_cleaners uses GermanTransliterate, so
    # the rest of this module must stay importable without the package.
    from german_transliterate.core import GermanTransliterate
except ImportError:
    # Catch only the "package is not available" case (ModuleNotFoundError is
    # a subclass of ImportError). The name is left undefined on purpose;
    # german_cleaners turns the resulting NameError into an install hint.
    pass
32

33
# Matches any run of one or more whitespace characters.
_whitespace_re = re.compile(r"\s+")

# Compiled (pattern, expansion) pairs. Each pattern matches the abbreviation
# at a word boundary followed by a literal period, ignoring case.
# NOTE(review): "misess" looks like a misspelling; it is reproduced unchanged
# because it is runtime text.
_abbreviations = [
    (re.compile(r"\b%s\." % short_form, re.IGNORECASE), long_form)
    for short_form, long_form in (
        ("mrs", "misess"),
        ("mr", "mister"),
        ("dr", "doctor"),
        ("st", "saint"),
        ("co", "company"),
        ("jr", "junior"),
        ("maj", "major"),
        ("gen", "general"),
        ("drs", "doctors"),
        ("rev", "reverend"),
        ("lt", "lieutenant"),
        ("hon", "honorable"),
        ("sgt", "sergeant"),
        ("capt", "captain"),
        ("esq", "esquire"),
        ("ltd", "limited"),
        ("col", "colonel"),
        ("ft", "fort"),
    )
]
60

61

62
def expand_abbreviations(text):
    """Replace every abbreviation listed in ``_abbreviations`` with its expansion.

    The pairs are applied in list order, each one to the output of the
    previous substitution, and the fully substituted string is returned.
    """
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
66

67

68
def expand_numbers(text):
    """Return ``text`` after passing it through ``normalize_numbers``."""
    normalized = normalize_numbers(text)
    return normalized
70

71

72
def lowercase(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
74

75

76
def collapse_whitespace(text):
    """Replace each run of whitespace in ``text`` with a single space."""
    return _whitespace_re.sub(" ", text)
78

79

80
def convert_to_ascii(text):
    """Return the result of running ``unidecode`` on ``text``."""
    ascii_text = unidecode(text)
    return ascii_text
82

83

84
def basic_cleaners(text):
    """Minimal pipeline: lowercase, then collapse whitespace.

    No transliteration is performed, so non-ASCII characters are not
    converted by this pipeline.
    """
    return collapse_whitespace(lowercase(text))
89

90

91
def transliteration_cleaners(text):
    """Pipeline for non-English text.

    Steps, in order: ``convert_to_ascii``, ``lowercase``,
    ``collapse_whitespace``.
    """
    ascii_text = convert_to_ascii(text)
    lowered = lowercase(ascii_text)
    return collapse_whitespace(lowered)
97

98

99
def english_cleaners(text):
    """Pipeline for English text.

    Applies, in this order: ASCII conversion, lowercasing, number
    expansion, abbreviation expansion, whitespace collapsing.
    """
    # Order matters: each step receives the previous step's output.
    pipeline = (
        convert_to_ascii,
        lowercase,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
107

108

109
def korean_cleaners(text):
    """Pipeline for Korean text: delegates entirely to ``ko_tokenize``.

    No other cleaning step is applied here; whatever the tokenizer returns
    is handed back unchanged.
    """
    # Example carried over from the original comment (not verified here):
    # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ']
    tokens = ko_tokenize(text)
    return tokens
115

116
def german_cleaners(text):
    """Pipeline for German text, delegating to ``GermanTransliterate``.

    The transliterator is built with ``replace={';': ',', ':': ' '}`` and
    ``sep_abbreviation=' -- '``; the result of its ``transliterate(text)``
    call is returned unchanged.
    NOTE(review): which normalizations (numbers, abbreviations, ...) the
    library performs is not visible from this file.

    Raises:
        ModuleNotFoundError: if ``GermanTransliterate`` was never imported
            because the optional ``german_transliterate`` package is missing.
    """
    # Guard only the name lookup. Wrapping the whole call would relabel any
    # NameError raised inside the library as "package not installed".
    try:
        transliterator_cls = GermanTransliterate
    except NameError:
        raise ModuleNotFoundError(
            "Install german_transliterate package to use german_cleaners"
        ) from None

    transliterator = transliterator_cls(
        replace={";": ",", ":": " "}, sep_abbreviation=" -- "
    )
    return transliterator.transliterate(text)
123
Product

Resources

Company