Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
snakers4
GitHub Repository: snakers4/silero-vad
Path: blob/master/examples/rust-example/src/vad_iter.rs
1171 views
1
use crate::{silero, utils};
2
3
// Compile-time switch for diagnostics: when `true`, `State::debug` prints one
// line to stdout for each "start" / "speaking" / "silence" / "end" event.
const DEBUG_SPEECH_PROB: bool = true;
4
#[derive(Debug)]
5
pub struct VadIter {
6
silero: silero::Silero,
7
params: Params,
8
state: State,
9
}
10
11
impl VadIter {
12
pub fn new(silero: silero::Silero, params: utils::VadParams) -> Self {
13
Self {
14
silero,
15
params: Params::from(params),
16
state: State::new(),
17
}
18
}
19
20
pub fn process(&mut self, samples: &[i16]) -> Result<(), ort::Error> {
21
self.reset_states();
22
for audio_frame in samples.chunks_exact(self.params.frame_size_samples) {
23
let speech_prob: f32 = self.silero.calc_level(audio_frame)?;
24
self.state.update(&self.params, speech_prob);
25
}
26
self.state.check_for_last_speech(samples.len());
27
Ok(())
28
}
29
30
pub fn speeches(&self) -> &[utils::TimeStamp] {
31
&self.state.speeches
32
}
33
}
34
35
impl VadIter {
36
fn reset_states(&mut self) {
37
self.silero.reset();
38
self.state = State::new()
39
}
40
}
41
42
/// Detection settings in the form the state machine consumes.
///
/// The first group of fields is copied verbatim from `utils::VadParams`; the
/// second group is derived from them once, in sample units.
// Several of the copied fields are stored but never read by the code in this
// file, hence the `allow(unused)`.
#[allow(unused)]
#[derive(Debug)]
struct Params {
    // --- copied from the user-facing parameters ---
    /// Frame length; multiplied by samples-per-millisecond to get `frame_size_samples`.
    frame_size: usize,
    /// Probability above which a frame counts as speech.
    threshold: f32,
    /// Silence needed to close a segment, in milliseconds.
    min_silence_duration_ms: usize,
    /// Padding around a segment, in milliseconds.
    speech_pad_ms: usize,
    /// Minimum length of a segment, in milliseconds.
    min_speech_duration_ms: usize,
    /// Maximum length of a segment, in seconds.
    max_speech_duration_s: f32,
    /// Sampling rate in samples per second.
    sample_rate: usize,

    // --- derived values ---
    /// Whole samples per millisecond (`sample_rate / 1000`).
    sr_per_ms: usize,
    /// Number of samples in one frame.
    frame_size_samples: usize,
    /// `min_speech_duration_ms` expressed in samples.
    min_speech_samples: usize,
    /// `speech_pad_ms` expressed in samples.
    speech_pad_samples: usize,
    /// Segment length limit in samples, less one frame and padding on both sides.
    max_speech_samples: f32,
    /// `min_silence_duration_ms` expressed in samples.
    min_silence_samples: usize,
    /// Silence length, in samples, after which a pause is remembered as a
    /// possible split point for over-long segments.
    min_silence_samples_at_max_speech: usize,
}
60
61
impl From<utils::VadParams> for Params {
    /// Copies the user-facing settings and pre-computes their sample-based
    /// equivalents so the per-frame code never has to convert units.
    fn from(value: utils::VadParams) -> Self {
        /// Pause length, in milliseconds, that qualifies as a split point
        /// when a segment later exceeds the maximum length.
        const SILENCE_AT_MAX_SPEECH_MS: usize = 98;

        // Integer division: only whole samples per millisecond are kept.
        let sr_per_ms = value.sample_rate / 1000;
        let frame_size_samples = value.frame_size * sr_per_ms;
        let speech_pad_samples = sr_per_ms * value.speech_pad_ms;

        // Leave room for one frame plus padding on both sides of a segment.
        let max_speech_samples = value.sample_rate as f32 * value.max_speech_duration_s
            - frame_size_samples as f32
            - 2.0 * speech_pad_samples as f32;

        Self {
            frame_size: value.frame_size,
            threshold: value.threshold,
            min_silence_duration_ms: value.min_silence_duration_ms,
            speech_pad_ms: value.speech_pad_ms,
            min_speech_duration_ms: value.min_speech_duration_ms,
            max_speech_duration_s: value.max_speech_duration_s,
            sample_rate: value.sample_rate,
            sr_per_ms,
            frame_size_samples,
            min_speech_samples: sr_per_ms * value.min_speech_duration_ms,
            speech_pad_samples,
            max_speech_samples,
            min_silence_samples: sr_per_ms * value.min_silence_duration_ms,
            min_silence_samples_at_max_speech: sr_per_ms * SILENCE_AT_MAX_SPEECH_MS,
        }
    }
}
97
98
#[derive(Debug, Default)]
99
struct State {
100
current_sample: usize,
101
temp_end: usize,
102
next_start: usize,
103
prev_end: usize,
104
triggered: bool,
105
current_speech: utils::TimeStamp,
106
speeches: Vec<utils::TimeStamp>,
107
}
108
109
impl State {
110
fn new() -> Self {
111
Default::default()
112
}
113
114
fn update(&mut self, params: &Params, speech_prob: f32) {
115
self.current_sample += params.frame_size_samples;
116
if speech_prob > params.threshold {
117
if self.temp_end != 0 {
118
self.temp_end = 0;
119
if self.next_start < self.prev_end {
120
self.next_start = self
121
.current_sample
122
.saturating_sub(params.frame_size_samples)
123
}
124
}
125
if !self.triggered {
126
self.debug(speech_prob, params, "start");
127
self.triggered = true;
128
self.current_speech.start =
129
self.current_sample as i64 - params.frame_size_samples as i64;
130
}
131
return;
132
}
133
if self.triggered
134
&& (self.current_sample as i64 - self.current_speech.start) as f32
135
> params.max_speech_samples
136
{
137
if self.prev_end > 0 {
138
self.current_speech.end = self.prev_end as _;
139
self.take_speech();
140
if self.next_start < self.prev_end {
141
self.triggered = false
142
} else {
143
self.current_speech.start = self.next_start as _;
144
}
145
self.prev_end = 0;
146
self.next_start = 0;
147
self.temp_end = 0;
148
} else {
149
self.current_speech.end = self.current_sample as _;
150
self.take_speech();
151
self.prev_end = 0;
152
self.next_start = 0;
153
self.temp_end = 0;
154
self.triggered = false;
155
}
156
return;
157
}
158
if speech_prob >= (params.threshold - 0.15) && (speech_prob < params.threshold) {
159
if self.triggered {
160
self.debug(speech_prob, params, "speaking")
161
} else {
162
self.debug(speech_prob, params, "silence")
163
}
164
}
165
if self.triggered && speech_prob < (params.threshold - 0.15) {
166
self.debug(speech_prob, params, "end");
167
if self.temp_end == 0 {
168
self.temp_end = self.current_sample;
169
}
170
if self.current_sample.saturating_sub(self.temp_end)
171
> params.min_silence_samples_at_max_speech
172
{
173
self.prev_end = self.temp_end;
174
}
175
if self.current_sample.saturating_sub(self.temp_end) >= params.min_silence_samples {
176
self.current_speech.end = self.temp_end as _;
177
if self.current_speech.end - self.current_speech.start
178
> params.min_speech_samples as _
179
{
180
self.take_speech();
181
self.prev_end = 0;
182
self.next_start = 0;
183
self.temp_end = 0;
184
self.triggered = false;
185
}
186
}
187
}
188
}
189
190
fn take_speech(&mut self) {
191
self.speeches.push(std::mem::take(&mut self.current_speech)); // current speech becomes TimeStamp::default() due to take()
192
}
193
194
fn check_for_last_speech(&mut self, last_sample: usize) {
195
if self.current_speech.start > 0 {
196
self.current_speech.end = last_sample as _;
197
self.take_speech();
198
self.prev_end = 0;
199
self.next_start = 0;
200
self.temp_end = 0;
201
self.triggered = false;
202
}
203
}
204
205
fn debug(&self, speech_prob: f32, params: &Params, title: &str) {
206
if DEBUG_SPEECH_PROB {
207
let speech = self.current_sample as f32
208
- params.frame_size_samples as f32
209
- if title == "end" {
210
params.speech_pad_samples
211
} else {
212
0
213
} as f32; // minus window_size_samples to get precise start time point.
214
println!(
215
"[{:10}: {:.3} s ({:.3}) {:8}]",
216
title,
217
speech / params.sample_rate as f32,
218
speech_prob,
219
self.current_sample - params.frame_size_samples,
220
);
221
}
222
}
223
}
224
225