Path: blob/master/examples/rust-example/src/vad_iter.rs
1171 views
use crate::{silero, utils};12const DEBUG_SPEECH_PROB: bool = true;3#[derive(Debug)]4pub struct VadIter {5silero: silero::Silero,6params: Params,7state: State,8}910impl VadIter {11pub fn new(silero: silero::Silero, params: utils::VadParams) -> Self {12Self {13silero,14params: Params::from(params),15state: State::new(),16}17}1819pub fn process(&mut self, samples: &[i16]) -> Result<(), ort::Error> {20self.reset_states();21for audio_frame in samples.chunks_exact(self.params.frame_size_samples) {22let speech_prob: f32 = self.silero.calc_level(audio_frame)?;23self.state.update(&self.params, speech_prob);24}25self.state.check_for_last_speech(samples.len());26Ok(())27}2829pub fn speeches(&self) -> &[utils::TimeStamp] {30&self.state.speeches31}32}3334impl VadIter {35fn reset_states(&mut self) {36self.silero.reset();37self.state = State::new()38}39}4041#[allow(unused)]42#[derive(Debug)]43struct Params {44frame_size: usize,45threshold: f32,46min_silence_duration_ms: usize,47speech_pad_ms: usize,48min_speech_duration_ms: usize,49max_speech_duration_s: f32,50sample_rate: usize,51sr_per_ms: usize,52frame_size_samples: usize,53min_speech_samples: usize,54speech_pad_samples: usize,55max_speech_samples: f32,56min_silence_samples: usize,57min_silence_samples_at_max_speech: usize,58}5960impl From<utils::VadParams> for Params {61fn from(value: utils::VadParams) -> Self {62let frame_size = value.frame_size;63let threshold = value.threshold;64let min_silence_duration_ms = value.min_silence_duration_ms;65let speech_pad_ms = value.speech_pad_ms;66let min_speech_duration_ms = value.min_speech_duration_ms;67let max_speech_duration_s = value.max_speech_duration_s;68let sample_rate = value.sample_rate;69let sr_per_ms = sample_rate / 1000;70let frame_size_samples = frame_size * sr_per_ms;71let min_speech_samples = sr_per_ms * min_speech_duration_ms;72let speech_pad_samples = sr_per_ms * speech_pad_ms;73let max_speech_samples = sample_rate as f32 * max_speech_duration_s74- frame_size_samples as f3275- 2.0 * speech_pad_samples as f32;76let min_silence_samples = sr_per_ms * min_silence_duration_ms;77let min_silence_samples_at_max_speech = sr_per_ms * 98;78Self {79frame_size,80threshold,81min_silence_duration_ms,82speech_pad_ms,83min_speech_duration_ms,84max_speech_duration_s,85sample_rate,86sr_per_ms,87frame_size_samples,88min_speech_samples,89speech_pad_samples,90max_speech_samples,91min_silence_samples,92min_silence_samples_at_max_speech,93}94}95}9697#[derive(Debug, Default)]98struct State {99current_sample: usize,100temp_end: usize,101next_start: usize,102prev_end: usize,103triggered: bool,104current_speech: utils::TimeStamp,105speeches: Vec<utils::TimeStamp>,106}107108impl State {109fn new() -> Self {110Default::default()111}112113fn update(&mut self, params: &Params, speech_prob: f32) {114self.current_sample += params.frame_size_samples;115if speech_prob > params.threshold {116if self.temp_end != 0 {117self.temp_end = 0;118if self.next_start < self.prev_end {119self.next_start = self120.current_sample121.saturating_sub(params.frame_size_samples)122}123}124if !self.triggered {125self.debug(speech_prob, params, "start");126self.triggered = true;127self.current_speech.start =128self.current_sample as i64 - params.frame_size_samples as i64;129}130return;131}132if self.triggered133&& (self.current_sample as i64 - self.current_speech.start) as f32134> params.max_speech_samples135{136if self.prev_end > 0 {137self.current_speech.end = self.prev_end as _;138self.take_speech();139if self.next_start < self.prev_end {140self.triggered = false141} else {142self.current_speech.start = self.next_start as _;143}144self.prev_end = 0;145self.next_start = 0;146self.temp_end = 0;147} else {148self.current_speech.end = self.current_sample as _;149self.take_speech();150self.prev_end = 0;151self.next_start = 0;152self.temp_end = 0;153self.triggered = false;154}155return;156}157if speech_prob >= (params.threshold - 0.15) && (speech_prob < params.threshold) {158if self.triggered {159self.debug(speech_prob, params, "speaking")160} else {161self.debug(speech_prob, params, "silence")162}163}164if self.triggered && speech_prob < (params.threshold - 0.15) {165self.debug(speech_prob, params, "end");166if self.temp_end == 0 {167self.temp_end = self.current_sample;168}169if self.current_sample.saturating_sub(self.temp_end)170> params.min_silence_samples_at_max_speech171{172self.prev_end = self.temp_end;173}174if self.current_sample.saturating_sub(self.temp_end) >= params.min_silence_samples {175self.current_speech.end = self.temp_end as _;176if self.current_speech.end - self.current_speech.start177> params.min_speech_samples as _178{179self.take_speech();180self.prev_end = 0;181self.next_start = 0;182self.temp_end = 0;183self.triggered = false;184}185}186}187}188189fn take_speech(&mut self) {190self.speeches.push(std::mem::take(&mut self.current_speech)); // current speech becomes TimeStamp::default() due to take()191}192193fn check_for_last_speech(&mut self, last_sample: usize) {194if self.current_speech.start > 0 {195self.current_speech.end = last_sample as _;196self.take_speech();197self.prev_end = 0;198self.next_start = 0;199self.temp_end = 0;200self.triggered = false;201}202}203204fn debug(&self, speech_prob: f32, params: &Params, title: &str) {205if DEBUG_SPEECH_PROB {206let speech = self.current_sample as f32207- params.frame_size_samples as f32208- if title == "end" {209params.speech_pad_samples210} else {2110212} as f32; // minus window_size_samples to get precise start time point.213println!(214"[{:10}: {:.3} s ({:.3}) {:8}]",215title,216speech / params.sample_rate as f32,217speech_prob,218self.current_sample - params.frame_size_samples,219);220}221}222}223224225