// Path: blob/master/examples/cppwin/TensorflowTTSCppInference/Voice.cpp
// 1559 views
#include "Voice.h"
#include "ext/ZCharScanner.h"

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

/// @brief Converts a space-separated phoneme string into model input IDs.
///
/// Each token is looked up in `Phonemes`; on a hit the entry at the same
/// index of `PhonemeIDs` is appended to the result. Unknown tokens are
/// reported on stdout and skipped, so the result may be shorter than the
/// number of tokens.
///
/// @param InTxt Phoneme tokens separated by single spaces.
/// @return IDs of the recognised phonemes, in input order.
std::vector<int32_t> Voice::PhonemesToID(const std::string& InTxt)
{
    ZStringDelimiter Delim(InTxt);
    Delim.AddDelimiter(" ");

    std::vector<int32_t> VecPhones;
    VecPhones.reserve(Delim.szTokens());

    for (const auto& Pho : Delim.GetTokens())
    {
        size_t ArrID = 0;

        if (VoxUtil::FindInVec<std::string>(Pho, Phonemes, ArrID))
            VecPhones.push_back(PhonemeIDs[ArrID]);
        else
            std::cout << "Voice::PhonemesToID() WARNING: Unknown phoneme " << Pho << std::endl;
    }

    return VecPhones;
}

/// @brief Loads the phoneme table from a tab-separated file.
///
/// Every line containing a tab is split on tabs; the first field is appended
/// to `Phonemes` and the second, parsed as an integer, to `PhonemeIDs`.
/// Lines without a tab are ignored. A file that cannot be opened yields no
/// entries (the read loop simply never runs).
///
/// @param PhonemePath Path of the phoneme table file.
/// @throws std::invalid_argument / std::out_of_range propagated from
///         std::stoi when the ID field is not a valid int.
void Voice::ReadPhonemes(const std::string& PhonemePath)
{
    std::ifstream Phone(PhonemePath);

    std::string Line;
    while (std::getline(Phone, Line))
    {
        if (Line.find('\t') == std::string::npos)
            continue;

        ZStringDelimiter Deline(Line);
        Deline.AddDelimiter("\t");

        // A tab alone does not guarantee two usable fields; never index past
        // the tokens that were actually produced.
        if (Deline.szTokens() < 2)
        {
            std::cout << "Voice::ReadPhonemes() WARNING: Malformed line " << Line << std::endl;
            continue;
        }

        // Parse before touching either vector: if std::stoi throws, Phonemes
        // and PhonemeIDs stay the same length and index-aligned.
        const int PhonemeID = std::stoi(Deline[1]);

        Phonemes.push_back(Deline[0]);
        PhonemeIDs.push_back(PhonemeID);
    }
}

/// @brief Loads the speaker list (one entry per line) into `Speakers`.
/// @param SpeakerPath Path of the speakers file; a missing file gives an empty list.
void Voice::ReadSpeakers(const std::string& SpeakerPath)
{
    Speakers = GetLinedFile(SpeakerPath);
}

/// @brief Loads the emotion list (one entry per line) into `Emotions`.
/// @param EmotionPath Path of the emotions file; a missing file gives an empty list.
void Voice::ReadEmotions(const std::string& EmotionPath)
{
    Emotions = GetLinedFile(EmotionPath);
}

/// @brief Rebuilds `ModelInfo` from a text file.
///
/// `ModelInfo` is reset and then filled with every line returned by
/// GetLinedFile(), each followed by a newline.
///
/// @param ModelInfoPath Path of the model info text file.
void Voice::ReadModelInfo(const std::string& ModelInfoPath)
{
    ModelInfo = "";

    const std::vector<std::string> MiLines = GetLinedFile(ModelInfoPath);
    for (const std::string& ss : MiLines)
        ModelInfo += ss + "\n";
}

/// @brief Reads a text file into a vector of lines.
///
/// Lines of length 0 or 1 are dropped.
///
/// @param Path File to read.
/// @return The kept lines in file order; empty if the file cannot be opened.
std::vector<std::string> Voice::GetLinedFile(const std::string& Path)
{
    std::vector<std::string> RetLines;
    std::ifstream Fi(Path);

    if (!Fi.good()) // File does not exist: return the empty vector.
        return RetLines;

    std::string Line;
    while (std::getline(Fi, Line))
    {
        if (Line.size() > 1)
            RetLines.push_back(Line);
    }

    return RetLines;
}

/// @brief Loads a voice from a model directory.
///
/// Initializes the mel predictor from `<VoxPath>/melgen` and the vocoder from
/// `<VoxPath>/vocoder`, then reads `info.json`, `phonemes.txt`,
/// `speakers.txt`, `emotions.txt` and `info.txt` from the same directory.
///
/// @param VoxPath Directory containing the voice's models and metadata.
/// @param inName  Name stored in `Name`.
/// @param InPhn   Phonemizer handed to the text processor; when null the
///                processor is left uninitialized (see AddPhonemizer()).
Voice::Voice(const std::string& VoxPath, const std::string& inName, Phonemizer* InPhn)
{
    MelPredictor.Initialize(VoxPath + "/melgen");
    Vocoder.Initialize(VoxPath + "/vocoder");

    if (InPhn)
        Processor.Initialize(InPhn);

    VoxInfo = VoxUtil::ReadModelJSON(VoxPath + "/info.json");
    Name = inName;

    ReadPhonemes(VoxPath + "/phonemes.txt");
    ReadSpeakers(VoxPath + "/speakers.txt");
    ReadEmotions(VoxPath + "/emotions.txt");

    ReadModelInfo(VoxPath + "/info.txt");
}

/// @brief Initializes the text processor with the given phonemizer.
/// @param InPhn Phonemizer forwarded to `Processor.Initialize`.
void Voice::AddPhonemizer(Phonemizer* InPhn)
{
    Processor.Initialize(InPhn);
}

/// @brief Synthesizes audio for a text prompt.
///
/// The prompt is turned into phonemes by the text processor, mapped to IDs
/// with PhonemesToID(), passed through the mel predictor and finally through
/// the vocoder.
///
/// @param Prompt    Text to speak.
/// @param Speed     Forwarded to the mel predictor.
/// @param SpeakerID Forwarded to the mel predictor.
/// @param Energy    Forwarded to the mel predictor.
/// @param F0        Forwarded to the mel predictor.
/// @param EmotionID Forwarded to the mel predictor.
/// @return `Shape[1]` float samples taken from the vocoder output.
std::vector<float> Voice::Vocalize(const std::string& Prompt, float Speed, int32_t SpeakerID, float Energy, float F0, int32_t EmotionID)
{
    const std::string PhoneticTxt =
        Processor.ProcessTextPhonetic(Prompt, Phonemes, static_cast<ETTSLanguage::Enum>(VoxInfo.Language));

    TFTensor<float> Mel = MelPredictor.DoInference(PhonemesToID(PhoneticTxt), SpeakerID, Speed, Energy, F0, EmotionID);

    TFTensor<float> AuData = Vocoder.DoInference(Mel);

    // NOTE(review): the output is read as a rank-3 tensor; presumably
    // [1, samples, 1] — confirm against the vocoder model.
    const int64_t Width = AuData.Shape[0];
    const int64_t Height = AuData.Shape[1];
    const int64_t Depth = AuData.Shape[2];

    std::vector<float> AudioData;
    AudioData.resize(Height);

    // Walk the flat buffer as if it were 3D. Every (x, z) pass writes the same
    // AudioData[y] slots, so the values of the last pass are the ones kept;
    // when Width and Depth are both 1 this is a plain copy.
    for (int64_t x = 0; x < Width; x++)
    {
        for (int64_t z = 0; z < Depth; z++)
        {
            for (int64_t y = 0; y < Height; y++)
            {
                const int64_t Index = x * Height * Depth + y * Depth + z;
                AudioData[static_cast<size_t>(y)] = AuData.Data[static_cast<size_t>(Index)];
            }
        }
    }

    return AudioData;
}

/// @brief Destructor; nothing to release explicitly here.
Voice::~Voice()
{
}