// Path: blob/master/examples/cppwin/TensorflowTTSCppInference/TextTokenizer.cpp
// (code-viewer residue: 1559 views)
#include "TextTokenizer.h"
#include "ext/ZCharScanner.h"
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

namespace {

// Spelled-out words for 0..14; indexed directly by the number.
const std::vector<std::string> first14 = { "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen" };

// Stems shared by the "-teen" (15..19) and "-ty" (20..90) forms.
const std::vector<std::string> prefixes = { "twen", "thir", "for", "fif", "six", "seven", "eigh", "nine" };

// Punctuation, this gets auto-converted to SIL
const std::string punctuation = ",.-;";

// Token emitted in place of punctuation.
const std::string silenceToken = "@SIL";

// IntToStr() only supports magnitudes strictly below this value.
constexpr long spokenNumberLimit = 1000000000L;

} // namespace

// Replaces the set of characters that Tokenize() lets through its filter.
void TextTokenizer::SetAllowedChars(const std::string& value)
{
    AllowedChars = value;
}

// Converts an integer to its English words ("forty two", "minus one hundred five").
//
// number: value to spell out; its magnitude must be below 1,000,000,000.
// Returns the words separated by single spaces.
// Throws std::out_of_range when the magnitude is 1,000,000,000 or more.
std::string TextTokenizer::IntToStr(int number)
{
    if (number < 0)
    {
        // Negating the most negative int overflows (undefined behaviour), and the
        // value is outside the supported range anyway, so reject it up front.
        if (number == std::numeric_limits<int>::min())
            throw std::out_of_range("inttostr() value too large");

        return "minus " + IntToStr(-number);
    }

    if (number <= 14)
        return first14.at(static_cast<std::size_t>(number));

    // 15..19 map onto prefixes[3..7] ("fif" .. "nine").
    if (number < 20)
        return prefixes.at(static_cast<std::size_t>(number - 12)) + "teen";

    if (number < 100)
    {
        const int remainder = number % 10;
        std::string words = prefixes.at(static_cast<std::size_t>(number / 10 - 2)) + "ty";
        if (remainder != 0)
            words += " " + IntToStr(remainder);
        return words;
    }

    if (number < 1000)
    {
        const int remainder = number % 100;
        std::string words = first14.at(static_cast<std::size_t>(number / 100)) + " hundred";
        if (remainder != 0)
            words += " " + IntToStr(remainder);
        return words;
    }

    if (number < 1000000)
    {
        const int remainder = number % 1000;
        std::string words = IntToStr(number / 1000) + " thousand";
        if (remainder != 0)
            words += " " + IntToStr(remainder);
        return words;
    }

    if (number < 1000000000)
    {
        const int remainder = number % 1000000;
        std::string words = IntToStr(number / 1000000) + " million";
        if (remainder != 0)
            words += " " + IntToStr(remainder);
        return words;
    }

    throw std::out_of_range("inttostr() value too large");
}

// Expands every token that is a plain base-10 integer into its spoken words,
// one word per output token. All other tokens are copied through unchanged.
// Integers whose magnitude is 1,000,000,000 or more are dropped, because
// IntToStr() cannot spell them.
std::vector<std::string> TextTokenizer::ExpandNumbers(const std::vector<std::string>& SpaceTokens)
{
    std::vector<std::string> RetVec;
    RetVec.reserve(SpaceTokens.size());

    for (const auto& Token : SpaceTokens)
    {
        char* End = nullptr;
        const long Converted = std::strtol(Token.c_str(), &End, 10);

        // Not a number when nothing was consumed (e.g. an empty token, which
        // would otherwise be read as 0 and spoken as "zero") or when trailing
        // characters follow the digits.
        const bool NothingParsed = (End == Token.c_str());
        if (NothingParsed || *End != '\0')
        {
            RetVec.push_back(Token);
            continue;
        }

        // Skip values IntToStr() would reject. Checking the magnitude in both
        // directions also keeps the narrowing cast below safe where long is
        // wider than int.
        if (Converted <= -spokenNumberLimit || Converted >= spokenNumberLimit)
            continue;

        const std::string IntStr = IntToStr(static_cast<int>(Converted));
        ZStringDelimiter DelInt(IntStr);
        DelInt.AddDelimiter(" ");

        std::vector<std::string> NumToks = DelInt.GetTokens();

        // If a number results in one word the delimiter may not add it.
        if (NumToks.empty())
            NumToks.push_back(IntStr);

        RetVec.insert(RetVec.end(), NumToks.begin(), NumToks.end());
    }

    return RetVec;
}

TextTokenizer::TextTokenizer() = default;

TextTokenizer::~TextTokenizer() = default;

// Splits InTxt on spaces and filters each token down to the allowed characters.
//
// - For ETTSLanguage::English, integer tokens are first expanded into words.
// - Tokens containing '@' are passed through untouched.
// - Each punctuation character (",.-;") emits an "@SIL" token, except when it
//   is the very last character of the input.
// - A trailing "@SIL" is removed when more than one token was produced.
//
// Returns the processed tokens in input order.
std::vector<std::string> TextTokenizer::Tokenize(const std::string& InTxt, ETTSLanguage::Enum Language)
{
    ZStringDelimiter Delim(InTxt);
    Delim.AddDelimiter(" ");

    std::vector<std::string> DelimitedTokens = Delim.GetTokens();

    // Single word handler
    if (!Delim.szTokens())
        DelimitedTokens.push_back(InTxt);

    if (Language == ETTSLanguage::English)
        DelimitedTokens = ExpandNumbers(DelimitedTokens);

    std::vector<std::string> ProcessedTokens;
    // We know that the new vector is going to be at least this size so we reserve
    ProcessedTokens.reserve(DelimitedTokens.size());

    // In this step we go through the string and only allow qualified characters to pass through.
    for (std::size_t TokCtr = 0; TokCtr < DelimitedTokens.size(); ++TokCtr)
    {
        const std::string& Tok = DelimitedTokens[TokCtr];

        if (Tok.find('@') != std::string::npos)
        {
            ProcessedTokens.push_back(Tok);
            continue;
        }

        const bool IsLastToken = (TokCtr + 1 == DelimitedTokens.size());
        std::string AppTok;

        for (std::size_t s = 0; s < Tok.size(); ++s)
        {
            const char Ch = Tok[s];

            if (AllowedChars.find(Ch) != std::string::npos)
                AppTok += Ch;

            // Prevent an ending period from adding another SIL
            const bool LastElem = IsLastToken && (s + 1 == Tok.size());

            // Punctuation handler
            // This time we explicitly add a token to the vector
            if (punctuation.find(Ch) != std::string::npos && !LastElem)
            {
                // First, if the assembled string isn't empty, we add it in its current state
                // Otherwise, the SIL could end up appearing before the word.
                if (!AppTok.empty())
                {
                    ProcessedTokens.push_back(AppTok);
                    AppTok.clear();
                }
                ProcessedTokens.push_back(silenceToken);
            }
        }

        if (!AppTok.empty())
            ProcessedTokens.push_back(AppTok);
    }

    // Drop a trailing silence marker, but never empty a single-token result.
    // The comparison must use the same "@SIL" spelling that is pushed above.
    if (ProcessedTokens.size() > 1 && ProcessedTokens.back() == silenceToken)
        ProcessedTokens.pop_back();

    return ProcessedTokens;
}