CoCalc -- TextTokenizer.cpp

GitHub Repository: TensorSpeech/TensorFlowTTS
Path: blob/master/examples/cppwin/TensorflowTTSCppInference/TextTokenizer.cpp
¹⁵⁵⁹ views
1
#include "TextTokenizer.h"
2
#include "ext/ZCharScanner.h"
3
#include <algorithm>
4
#include <cassert>
5
#include <cctype>
6
#include <iostream>
7
// Spelled-out words for 0 through 14, indexed directly by value (looked up by TextTokenizer::IntToStr).
const std::vector<std::string> first14 = { "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen" };
8
// Word stems for twenty ("twen") through ninety ("nine"). IntToStr appends "ty" to build the tens
// (index = number / 10 - 2) and "teen" to build 15-19 (index = number - 12).
const std::vector<std::string> prefixes = { "twen", "thir", "for", "fif", "six", "seven", "eigh", "nine" };
9

10
// Punctuation characters: Tokenize emits an "@SIL" token for each one it meets, except for the very last character of the input.
11
const std::string punctuation = ",.-;"; // comma, period, hyphen, semicolon
12

13

14
using namespace std;
15

16
void TextTokenizer::SetAllowedChars(const std::string &value)
17
{
18
    AllowedChars = value;
19
}
20

21
/// Spells out an integer as lowercase English words separated by single spaces.
///
/// Words are assembled from the file-level `first14` and `prefixes` tables,
/// e.g. 42 -> "forty two", 115 -> "one hundred fifteen". Negative values are
/// prefixed with "minus ".
///
/// @param number Value to spell out; its magnitude must be below 1,000,000,000.
/// @return The spelled-out number.
/// @throws std::out_of_range if the magnitude of `number` is 1,000,000,000 or more.
string TextTokenizer::IntToStr(int number)
{
	if (number < 0)
	{
		// Range-check before negating: -number overflows (undefined behaviour) for the
		// most negative int, and anything this small would be rejected below anyway.
		if (number <= -1000000000)
			throw std::out_of_range("inttostr() value too large");

		return "minus " + IntToStr(-number);
	}

	// 0..14 have dedicated words.
	if (number <= 14)
		return first14.at(number);

	// 15..19: stem + "teen" ("fif" + "teen", "eigh" + "teen", ...).
	if (number < 20)
		return prefixes.at(number - 12) + "teen";

	// 20..99: stem + "ty", followed by the units when present.
	if (number < 100) {
		const int remainder = number % 10;
		const string tens = prefixes.at(number / 10 - 2) + "ty";
		if (remainder == 0)
			return tens;
		return tens + " " + IntToStr(remainder);
	}

	if (number < 1000) {
		const int remainder = number % 100;
		const string hundreds = first14.at(number / 100) + " hundred";
		if (remainder == 0)
			return hundreds;
		return hundreds + " " + IntToStr(remainder);
	}

	if (number < 1000000) {
		const int remainder = number % 1000;
		const string thousands = IntToStr(number / 1000) + " thousand";
		if (remainder == 0)
			return thousands;
		return thousands + " " + IntToStr(remainder);
	}

	if (number < 1000000000) {
		const int remainder = number % 1000000;
		const string millions = IntToStr(number / 1000000) + " million";
		if (remainder == 0)
			return millions;
		return millions + " " + IntToStr(remainder);
	}

	throw std::out_of_range("inttostr() value too large");
}
51

52

53
/// Replaces every token that is a plain base-10 integer with its spelled-out words.
///
/// Tokens that are not entirely an integer (including empty tokens) are copied
/// through unchanged. Integers whose magnitude is 1,000,000,000 or more cannot
/// be spelled by IntToStr and are dropped from the output.
///
/// @param SpaceTokens Input tokens, already split on spaces.
/// @return The tokens with each number expanded into one token per word.
vector<string> TextTokenizer::ExpandNumbers(const std::vector<std::string>& SpaceTokens)
{
	vector<string> RetVec;
	RetVec.reserve(SpaceTokens.size());

	for (const auto& Token : SpaceTokens) {
		char* ParseEnd = nullptr;
		const long converted = strtol(Token.c_str(), &ParseEnd, 10);

		// Not a number: strtol consumed nothing (an empty token would otherwise look
		// like a clean parse of 0) or stopped before the end of the token.
		if (ParseEnd == Token.c_str() || *ParseEnd != '\0') {
			RetVec.push_back(Token);
			continue;
		}

		// IntToStr throws for magnitudes of one billion or more, and the narrowing
		// to int below is only safe inside this range, so such numbers are skipped.
		if (converted >= 1000000000 || converted <= -1000000000)
			continue;

		string IntStr = IntToStr(static_cast<int>(converted));
		ZStringDelimiter DelInt(IntStr);
		DelInt.AddDelimiter(" ");

		std::vector<std::string> NumToks = DelInt.GetTokens();

		// If a number results in one word the delimiter may not add it.
		if (NumToks.empty())
			NumToks.push_back(IntStr);

		RetVec.insert(RetVec.end(), NumToks.begin(), NumToks.end());
	}

	return RetVec;
}
88

89
/// Default constructor; performs no work beyond default-initializing the members.
TextTokenizer::TextTokenizer() = default;
92

93
/// Destructor; there is nothing to release beyond what the members clean up themselves.
TextTokenizer::~TextTokenizer() = default;
96

97
/// Splits text into word tokens, filtering characters and marking pauses.
///
/// The text is split on spaces and, for English, numeric tokens are expanded
/// into words via ExpandNumbers. Each token is then processed as follows:
///  - a token containing '@' is passed through untouched;
///  - otherwise only characters present in AllowedChars are kept;
///  - each character found in `punctuation` ends the current word and emits an
///    "@SIL" token, except when it is the very last character of the input.
/// A trailing "@SIL" is removed when the result holds more than one token.
///
/// @param InTxt    Text to tokenize.
/// @param Language Language of the text; number expansion only runs for English.
/// @return The processed tokens in order.
vector<string> TextTokenizer::Tokenize(const std::string & InTxt,ETTSLanguage::Enum Language)
{
	ZStringDelimiter Delim(InTxt);
	Delim.AddDelimiter(" ");

	vector<string> DelimitedTokens = Delim.GetTokens();

	// Single word handler
	if (!Delim.szTokens())
		DelimitedTokens.push_back(InTxt);

	if (Language == ETTSLanguage::English)
		DelimitedTokens = ExpandNumbers(DelimitedTokens);

	// We know that the new vector is going to be at least this size so we reserve
	vector<string> ProcessedTokens;
	ProcessedTokens.reserve(DelimitedTokens.size());

	// Go through every token and only let qualified characters pass through.
	for (size_t TokCtr = 0; TokCtr < DelimitedTokens.size(); TokCtr++)
	{
		const auto& tok = DelimitedTokens[TokCtr];

		// Tokens carrying an '@' marker bypass filtering entirely.
		if (tok.find('@') != string::npos)
		{
			ProcessedTokens.push_back(tok);
			continue;
		}

		const bool LastToken = TokCtr + 1 == DelimitedTokens.size();
		string AppTok;

		for (size_t s = 0; s < tok.size(); s++)
		{
			const char Ch = tok[s];

			if (AllowedChars.find(Ch) != std::string::npos)
				AppTok += Ch;

			// Prevent an ending period from adding another SIL
			const bool LastElem = LastToken && s + 1 == tok.size();

			// Punctuation handler: this time we explicitly add a token to the vector
			if (punctuation.find(Ch) != string::npos && !LastElem) {
				// First, if the assembled string isn't empty, we add it in its current state.
				// Otherwise, the SIL could end up appearing before the word.
				if (!AppTok.empty()) {
					ProcessedTokens.push_back(AppTok);
					AppTok.clear();
				}
				ProcessedTokens.push_back("@SIL");
			}
		}

		if (!AppTok.empty())
			ProcessedTokens.push_back(AppTok);
	}

	// Drop a trailing silence marker. The marker pushed above is "@SIL", so that is
	// what must be compared against; a lone token is always kept.
	if (ProcessedTokens.size() > 1 && ProcessedTokens.back() == "@SIL")
		ProcessedTokens.pop_back();

	return ProcessedTokens;
}
179

180
Product

Resources

Company