Path: blob/master/examples/cpp_libtorch/silero_torch.cc
//Author : Nathan Lee
//Created On : 2024-11-18
//Description : silero 5.1 system for torch-script(c++).
//Version : 1.0

#include "silero_torch.h"

namespace silero {

VadIterator::VadIterator(const std::string &model_path, float threshold, int sample_rate, int window_size_ms,
                         int speech_pad_ms, int min_silence_duration_ms, int min_speech_duration_ms,
                         int max_duration_merge_ms, bool print_as_samples)
    : sample_rate(sample_rate), threshold(threshold), window_size_ms(window_size_ms), speech_pad_ms(speech_pad_ms),
      min_silence_duration_ms(min_silence_duration_ms), min_speech_duration_ms(min_speech_duration_ms),
      max_duration_merge_ms(max_duration_merge_ms), print_as_samples(print_as_samples)
{
    init_torch_model(model_path);
    //init_engine(window_size_ms);
}

VadIterator::~VadIterator() {
}

void VadIterator::SpeechProbs(std::vector<float>& input_wav) {
    // The sample rate must match the model's expected sample rate.
    // Process the waveform in chunks of window_size_samples (e.g. 512 samples).
    int num_samples = static_cast<int>(input_wav.size());
    int num_chunks = num_samples / window_size_samples;
    int remainder_samples = num_samples % window_size_samples;

    total_sample_size += num_samples;

    std::vector<torch::Tensor> chunks;

    for (int i = 0; i < num_chunks; i++) {
        float* chunk_start = input_wav.data() + i * window_size_samples;
        torch::Tensor chunk = torch::from_blob(chunk_start, {1, window_size_samples}, torch::kFloat32);
        chunks.push_back(chunk);
    }

    // Handle the trailing partial chunk outside the loop, so that inputs shorter
    // than one window are not silently dropped.
    if (remainder_samples > 0) {
        float* chunk_start_remainder = input_wav.data() + num_chunks * window_size_samples;
        torch::Tensor remainder_chunk = torch::from_blob(chunk_start_remainder, {1, remainder_samples}, torch::kFloat32);
        // Zero-pad the remainder so it matches window_size_samples.
        torch::Tensor padded_chunk = torch::cat({remainder_chunk,
                                                 torch::zeros({1, window_size_samples - remainder_samples}, torch::kFloat32)}, 1);
        chunks.push_back(padded_chunk);
    }

    if (!chunks.empty()) {

#ifdef USE_BATCH
        // Stack all chunks into a single (num_chunks, window_size_samples) tensor.
        torch::Tensor batched_chunks = torch::stack(chunks).squeeze(1);
#ifdef USE_GPU
        batched_chunks = batched_chunks.to(at::kCUDA); // Move the entire batch to GPU once
#endif
        // Prepare input for the model.
        std::vector<torch::jit::IValue> inputs;
        inputs.push_back(batched_chunks); // Batch of chunks
        inputs.push_back(sample_rate);    // Assuming sample_rate is a valid input for the model

        // Run inference on the whole batch at once.
        torch::NoGradGuard no_grad;
        torch::Tensor output = model.forward(inputs).toTensor();
#ifdef USE_GPU
        output = output.to(at::kCPU); // Move the output back to CPU once
#endif
        // Collect output probabilities.
        for (size_t i = 0; i < chunks.size(); i++) {
            outputs_prob.push_back(output[i].item<float>());
        }
#else
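        // Per-chunk path: the model keeps internal state between forward calls
        // (cleared via reset_states below), so feeding chunks one at a time lets
        // that state carry across chunk boundaries, unlike the batched path above.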
        std::vector<torch::Tensor> outputs;
        torch::Tensor batched_chunks = torch::stack(chunks);
#ifdef USE_GPU
        batched_chunks = batched_chunks.to(at::kCUDA);
#endif
        for (size_t i = 0; i < chunks.size(); i++) {
            torch::NoGradGuard no_grad;
            std::vector<torch::jit::IValue> inputs;
            inputs.push_back(batched_chunks[i]);
            inputs.push_back(sample_rate);

            torch::Tensor output = model.forward(inputs).toTensor();
            outputs.push_back(output);
        }
        torch::Tensor all_outputs = torch::stack(outputs);
#ifdef USE_GPU
        all_outputs = all_outputs.to(at::kCPU);
#endif
        for (size_t i = 0; i < chunks.size(); i++) {
            outputs_prob.push_back(all_outputs[i].item<float>());
        }
#endif

    }
}

std::vector<SpeechSegment> VadIterator::GetSpeechTimestamps() {
    std::vector<SpeechSegment> speeches = DoVad();

#ifdef USE_BATCH
    // With batch inference the per-chunk probabilities can be distorted, so the
    // raw segments should be arranged with 'mergeSpeeches' to obtain more
    // reasonable timestamps.
    duration_merge_samples = sample_rate * max_duration_merge_ms / 1000;
    std::vector<SpeechSegment> speeches_merge = mergeSpeeches(speeches, duration_merge_samples);
    if (!print_as_samples) {
        for (auto& speech : speeches_merge) { // samples to seconds
            speech.start /= sample_rate;
            speech.end /= sample_rate;
        }
    }
    return speeches_merge;
#else
    if (!print_as_samples) {
        for (auto& speech : speeches) { // samples to seconds
            speech.start /= sample_rate;
            speech.end /= sample_rate;
        }
    }
    return speeches;
#endif
}

void VadIterator::SetVariables() {
    init_engine(window_size_ms);
}

void VadIterator::init_engine(int window_size_ms) {
    min_silence_samples = sample_rate * min_silence_duration_ms / 1000;
    speech_pad_samples = sample_rate * speech_pad_ms / 1000;
    window_size_samples = sample_rate / 1000 * window_size_ms;
    min_speech_samples = sample_rate * min_speech_duration_ms / 1000;
}
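
// Worked example for the sizes above, with illustrative values (assumed, not
// fixed by this file): at sample_rate = 16000 and window_size_ms = 32,
// window_size_samples = 16000 / 1000 * 32 = 512 samples per chunk; with
// min_speech_duration_ms = 250, min_speech_samples = 16000 * 250 / 1000 = 4000
// samples, i.e. 0.25 seconds.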

void VadIterator::init_torch_model(const std::string& model_path) {
    at::set_num_threads(1);
    model = torch::jit::load(model_path);

#ifdef USE_GPU
    if (!torch::cuda::is_available()) {
        std::cout << "CUDA is not available! Please check your GPU settings" << std::endl;
        throw std::runtime_error("CUDA is not available!");
    } else {
        std::cout << "CUDA available! Running on 0th GPU" << std::endl;
        model.to(at::Device(at::kCUDA, 0)); // select the 0th device
    }
#endif

    model.eval();
    torch::NoGradGuard no_grad;
    std::cout << "Model loaded successfully" << std::endl;
}

void VadIterator::reset_states() {
    triggered = false;
    current_sample = 0;
    temp_end = 0;
    outputs_prob.clear();
    model.run_method("reset_states");
    total_sample_size = 0;
}

std::vector<SpeechSegment> VadIterator::DoVad() {
    std::vector<SpeechSegment> speeches;

    for (size_t i = 0; i < outputs_prob.size(); ++i) {
        float speech_prob = outputs_prob[i];
        current_sample += window_size_samples;

        if (speech_prob >= threshold && temp_end != 0) {
            temp_end = 0;
        }

        if (speech_prob >= threshold && !triggered) {
            triggered = true;
            SpeechSegment segment;
            segment.start = std::max(0, current_sample - speech_pad_samples - window_size_samples);
            speeches.push_back(segment);
            continue;
        }

        if (speech_prob < threshold - 0.15f && triggered) {
            if (temp_end == 0) {
                temp_end = current_sample;
            }

            if (current_sample - temp_end < min_silence_samples) {
                continue;
            } else {
                SpeechSegment& segment = speeches.back();
                segment.end = temp_end + speech_pad_samples - window_size_samples;
                temp_end = 0;
                triggered = false;
            }
        }
    }

    // If speech is still active after the last frame, close the final segment at
    // the end of the input. Note: if only the very last frame crosses the
    // threshold, the segment is opened and closed at the same sample
    // (start == end); the minimum-length filter below removes such segments.
    if (triggered) {
        std::cout << "speech still active at the last frame; closing final segment" << std::endl;
        SpeechSegment& segment = speeches.back();
        segment.end = total_sample_size; // use the end of the input as the segment end
        triggered = false; // reset the VAD state
    }

    // Drop segments shorter than min_speech_samples (e.g. 4000 samples = 0.25 sec
    // at 16 kHz). Key point: the length is measured after 'speech_pad_samples'
    // has been applied to both the start and the end.
    speeches.erase(
        std::remove_if(
            speeches.begin(),
            speeches.end(),
            [this](const SpeechSegment& speech) {
                return ((speech.end - this->speech_pad_samples) - (speech.start + this->speech_pad_samples) < min_speech_samples);
            }
        ),
        speeches.end()
    );

    reset_states();
    return speeches;
}

std::vector<SpeechSegment> VadIterator::mergeSpeeches(const std::vector<SpeechSegment>& speeches, int duration_merge_samples) {
    std::vector<SpeechSegment> mergedSpeeches;

    if (speeches.empty()) {
        return mergedSpeeches; // return an empty result
    }

    // Initialize with the first segment.
    SpeechSegment currentSegment = speeches[0];

    // Start from i = 1: the first segment already seeds currentSegment.
    for (size_t i = 1; i < speeches.size(); ++i) {
        if (speeches[i].start - currentSegment.end < duration_merge_samples) {
            // Gap below the merge threshold: extend the current segment.
            currentSegment.end = speeches[i].end;
        } else {
            // Gap at or above the merge threshold: store the current segment and start a new one.
            mergedSpeeches.push_back(currentSegment);
            currentSegment = speeches[i];
        }
    }

    // Add the last segment.
    mergedSpeeches.push_back(currentSegment);

    return mergedSpeeches;
}

} // namespace silero
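
// ---------------------------------------------------------------------------
// Usage sketch (hypothetical; not part of the original file). It shows one way
// this class could be driven, assuming "silero_torch.h" declares VadIterator as
// defined above. The model filename, the parameter values, and the
// LoadWavAsFloat helper are illustrative assumptions, not project APIs.
//
//   #include <iostream>
//   #include <vector>
//   #include "silero_torch.h"
//
//   int main() {
//       silero::VadIterator vad("silero_vad.jit", // hypothetical model path
//                               0.5f,   // threshold
//                               16000,  // sample_rate
//                               32,     // window_size_ms
//                               30,     // speech_pad_ms
//                               100,    // min_silence_duration_ms
//                               250,    // min_speech_duration_ms
//                               300,    // max_duration_merge_ms
//                               false); // print_as_samples: false -> seconds
//       vad.SetVariables(); // required before SpeechProbs: sets window_size_samples
//
//       std::vector<float> wav = LoadWavAsFloat("sample.wav"); // hypothetical helper
//       vad.SpeechProbs(wav); // run the model and collect per-chunk probabilities
//       for (const auto& s : vad.GetSpeechTimestamps()) {
//           std::cout << "speech: " << s.start << " - " << s.end << std::endl;
//       }
//       return 0;
//   }
// ---------------------------------------------------------------------------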