Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
TensorSpeech
GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/examples/cppwin/TensorflowTTSCppInference/TextTokenizer.cpp
1559 views
1
#include "TextTokenizer.h"
2
#include "ext/ZCharScanner.h"
3
#include <algorithm>
4
#include <cassert>
5
#include <cctype>
6
#include <iostream>
7
// Number words indexed by value. Despite the name this holds fifteen entries: 0 through 14.
const std::vector<std::string> first14 = {
    "zero", "one", "two", "three", "four", "five", "six", "seven",
    "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen"
};

// Word stems shared by the teens and the tens:
//   15..19 -> prefixes[value - 12] + "teen"
//   20..99 -> prefixes[value / 10 - 2] + "ty"
const std::vector<std::string> prefixes = {
    "twen", "thir", "for", "fif", "six", "seven", "eigh", "nine"
};

// Punctuation, this gets auto-converted to SIL
const std::string punctuation = ",.-;";

using namespace std;
15
16
void TextTokenizer::SetAllowedChars(const std::string &value)
17
{
18
AllowedChars = value;
19
}
20
21
/*
 Spells out an integer as lowercase English words separated by single spaces,
 e.g. 125 -> "one hundred twenty five", -7 -> "minus seven".

 number: value to spell; the supported range is -999,999,999 .. 999,999,999.
 Returns the spelled-out string.
 Throws std::out_of_range when the magnitude is 1,000,000,000 or more.
*/
string TextTokenizer::IntToStr(int number)
{
    // Range-check BEFORE negating: the most negative int has no positive counterpart,
    // so negating it first would be signed overflow instead of a clean exception.
    if (number <= -1000000000 || number >= 1000000000)
        throw std::out_of_range("inttostr() value too large");

    if (number < 0)
        return "minus " + IntToStr(-number);

    // 0..14 are irregular and come straight from the table.
    if (number <= 14)
        return first14.at(number);

    // 15..19: stem + "teen".
    if (number < 20)
        return prefixes.at(number - 12) + "teen";

    // 20..99: stem + "ty", followed by the units if there are any.
    if (number < 100) {
        const int remainder = number % 10;
        return prefixes.at(number / 10 - 2) + (0 != remainder ? "ty " + IntToStr(remainder) : "ty");
    }

    if (number < 1000) {
        const int remainder = number % 100;
        return first14.at(number / 100) + (0 != remainder ? " hundred " + IntToStr(remainder) : " hundred");
    }

    if (number < 1000000) {
        const int thousands = number / 1000;
        const int remainder = number % 1000;
        return IntToStr(thousands) + (0 != remainder ? " thousand " + IntToStr(remainder) : " thousand");
    }

    // Only 1,000,000 .. 999,999,999 can reach this point thanks to the range check above.
    const int millions = number / 1000000;
    const int remainder = number % 1000000;
    return IntToStr(millions) + (0 != remainder ? " million " + IntToStr(remainder) : " million");
}
51
52
53
/*
 Replaces every token that is entirely a base-10 integer with its spelled-out form
 from IntToStr(), one output token per word. All other tokens are copied unchanged.

 SpaceTokens: tokens to scan.
 Returns the expanded token list. Integers whose magnitude is 1,000,000,000 or more
 are outside IntToStr()'s range and are dropped from the output.
*/
vector<string> TextTokenizer::ExpandNumbers(const std::vector<std::string>& SpaceTokens)
{
    vector<string> RetVec;
    RetVec.reserve(SpaceTokens.size());

    for (const auto& Token : SpaceTokens) {
        char* ParseEnd = nullptr;
        const long converted = strtol(Token.c_str(), &ParseEnd, 10);

        // Not a pure integer: either nothing was consumed (an empty token would
        // otherwise be read as 0 and spoken as "zero") or something follows the digits.
        if (ParseEnd == Token.c_str() || *ParseEnd) {
            RetVec.push_back(Token);
            continue;
        }

        // Skip anything IntToStr() cannot spell. The check is symmetric so that large
        // negative values are neither truncated by the int cast nor allowed to throw.
        if (converted >= 1000000000 || converted <= -1000000000)
            continue;

        string IntStr = IntToStr(static_cast<int>(converted));
        ZStringDelimiter DelInt(IntStr);
        DelInt.AddDelimiter(" ");

        std::vector<std::string> NumToks = DelInt.GetTokens();

        // If a number results in one word the delimiter may not add it.
        if (NumToks.empty())
            NumToks.push_back(IntStr);

        RetVec.insert(RetVec.end(), NumToks.begin(), NumToks.end());
    }

    return RetVec;
}
88
89
// Nothing to set up explicitly; members are default-initialized.
TextTokenizer::TextTokenizer() = default;
92
93
// No explicit cleanup is performed here.
TextTokenizer::~TextTokenizer() = default;
96
97
/*
 Splits InTxt on spaces and produces the cleaned-up token list.

 Processing steps:
  - When Language is ETTSLanguage::English, integer tokens are spelled out through ExpandNumbers().
  - A token containing '@' is passed through untouched.
  - For every other token only the characters present in AllowedChars are kept.
  - Each character found in `punctuation` flushes the word assembled so far and emits an
    "@SIL" token, unless it is the very last character of the very last token.
  - A trailing "@SIL" is removed, provided it is not the only token.

 InTxt: text to tokenize.
 Language: language of the text; only English triggers number expansion here.
 Returns the processed tokens in input order.
*/
vector<string> TextTokenizer::Tokenize(const std::string & InTxt,ETTSLanguage::Enum Language)
{
    ZStringDelimiter Delim(InTxt);
    Delim.AddDelimiter(" ");

    vector<string> DelimitedTokens = Delim.GetTokens();

    // Single word handler: when the delimiter reports no tokens, treat the whole input as one.
    if (!Delim.szTokens())
        DelimitedTokens.push_back(InTxt);

    if (Language == ETTSLanguage::English)
        DelimitedTokens = ExpandNumbers(DelimitedTokens);

    // We know that the new vector is going to be at least this size so we reserve
    vector<string> ProcessedTokens;
    ProcessedTokens.reserve(DelimitedTokens.size());

    // In this step we go through each token and only allow qualified characters to pass through.
    for (size_t TokCtr = 0; TokCtr < DelimitedTokens.size(); TokCtr++)
    {
        const auto& tok = DelimitedTokens[TokCtr];

        // Tokens carrying '@' are forwarded verbatim, without filtering.
        if (tok.find('@') != string::npos)
        {
            ProcessedTokens.push_back(tok);
            continue;
        }

        const bool IsLastToken = TokCtr == DelimitedTokens.size() - 1;
        string AppTok;

        for (size_t s = 0; s < tok.size(); s++)
        {
            const char Ch = tok[s];

            if (AllowedChars.find(Ch) != std::string::npos)
                AppTok += Ch;

            // Prevent an ending period from adding another SIL
            const bool LastElem = IsLastToken && s == tok.size() - 1;

            // Punctuation handler
            // This time we explicitly add a token to the vector
            if (punctuation.find(Ch) != string::npos && !LastElem) {
                // First, if the assembled string isn't empty, we add it in its current state
                // Otherwise, the SIL could end up appearing before the word.
                if (!AppTok.empty()) {
                    ProcessedTokens.push_back(AppTok);
                    AppTok.clear();
                }
                ProcessedTokens.push_back("@SIL");
            }
        }

        if (!AppTok.empty())
            ProcessedTokens.push_back(AppTok);
    }

    // Drop a trailing silence marker. The marker emitted above is "@SIL", so that is the
    // literal to compare against. The size guard keeps a lone token from being removed.
    if (ProcessedTokens.size() > 1 && ProcessedTokens.back() == "@SIL")
        ProcessedTokens.pop_back();

    return ProcessedTokens;
}
179
180