Path: blob/master/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py
import collections, queue
import numpy as np
import pyaudio
import webrtcvad
from halo import Halo
import torch
import torchaudio


class Audio(object):
    """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""

    FORMAT = pyaudio.paInt16
    # Network/VAD rate-space
    RATE_PROCESS = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS):
        def proxy_callback(in_data, frame_count, time_info, status):
            #pylint: disable=unused-argument
            callback(in_data)
            return (None, pyaudio.paContinue)
        if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
        self.buffer_queue = queue.Queue()
        self.device = device
        self.input_rate = input_rate
        self.sample_rate = self.RATE_PROCESS
        self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND))
        self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND))
        self.pa = pyaudio.PyAudio()

        kwargs = {
            'format': self.FORMAT,
            'channels': self.CHANNELS,
            'rate': self.input_rate,
            'input': True,
            'frames_per_buffer': self.block_size_input,
            'stream_callback': proxy_callback,
        }

        self.chunk = None
        # if not default device
        if self.device:
            kwargs['input_device_index'] = self.device

        self.stream = self.pa.open(**kwargs)
        self.stream.start_stream()

    def read(self):
        """Return a block of audio data, blocking if necessary."""
        return self.buffer_queue.get()

    def destroy(self):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)


class VADAudio(Audio):
    """Filter & segment audio with voice activity detection."""

    def __init__(self, aggressiveness=3, device=None, input_rate=None):
        super().__init__(device=device, input_rate=input_rate)
        self.vad = webrtcvad.Vad(aggressiveness)

    def frame_generator(self):
        """Generator that yields all audio frames from microphone."""
        if self.input_rate == self.RATE_PROCESS:
            while True:
                yield self.read()
        else:
            raise Exception("Resampling required")

    def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
        """Generator that yields series of consecutive audio frames comprising each utterance, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms.
        Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |---utterance---|        |---utterance---|
        """
        if frames is None: frames = self.frame_generator()
        num_padding_frames = padding_ms // self.frame_duration_ms
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False

        for frame in frames:
            if len(frame) < 640:
                return

            is_speech = self.vad.is_speech(frame, self.sample_rate)

            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        yield f
                    ring_buffer.clear()

            else:
                yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len([f for f, speech in ring_buffer if not speech])
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()


def main(ARGS):
    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)

    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # Load silero VAD
    torchaudio.set_audio_backend("soundfile")
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model=ARGS.silaro_model_name,
                                  force_reload=ARGS.reload)
    (get_speech_ts, _, _, _, _, _, _) = utils

    # Stream from microphone, segment with webRTC VAD, then re-check each segment with silero VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()

            wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            print("webRTC has detected possible speech")

            newsound = np.frombuffer(wav_data, np.int16)
            audio_float32 = Int2Float(newsound)
            time_stamps = get_speech_ts(audio_float32, model,
                                        num_steps=ARGS.num_steps,
                                        trig_sum=ARGS.trig_sum,
                                        neg_trig_sum=ARGS.neg_trig_sum,
                                        num_samples_per_window=ARGS.num_samples_per_window,
                                        min_speech_samples=ARGS.min_speech_samples,
                                        min_silence_samples=ARGS.min_silence_samples)

            if len(time_stamps) > 0:
                print("silero VAD has detected possible speech")
            else:
                print("silero VAD has detected noise")
            print()
            wav_data = bytearray()


def Int2Float(sound):
    """Convert int16 PCM samples to a normalized float32 torch tensor."""
    _sound = np.copy(sound)
    abs_max = np.abs(_sound).max()
    _sound = _sound.astype('float32')
    if abs_max > 0:
        _sound *= 1 / abs_max
    audio_float32 = torch.from_numpy(_sound.squeeze())
    return audio_float32


if __name__ == '__main__':
    DEFAULT_SAMPLE_RATE = 16000

    import argparse
    parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD")

    parser.add_argument('-v', '--webRTC_aggressiveness', type=int, default=3,
                        help="Set aggressiveness of webRTC: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
    parser.add_argument('--nospinner', action='store_true',
                        help="Disable spinner")
    parser.add_argument('-d', '--device', type=int, default=None,
                        help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")

    parser.add_argument('-name', '--silaro_model_name', type=str, default="silero_vad",
                        help="Select the name of the model. You can choose between 'silero_vad', 'silero_vad_micro', 'silero_vad_micro_8k', 'silero_vad_mini', 'silero_vad_mini_8k'")
    parser.add_argument('--reload', action='store_true',
                        help="Download the latest version of the silero VAD")

    parser.add_argument('-ts', '--trig_sum', type=float, default=0.25,
                        help="Overlapping windows are used for each audio chunk; trig_sum defines the average probability among those windows for switching into the triggered (speech) state")

    parser.add_argument('-nts', '--neg_trig_sum', type=float, default=0.07,
                        help="Same as trig_sum, but for switching from the triggered to the non-triggered (non-speech) state")

    parser.add_argument('-N', '--num_steps', type=int, default=8,
                        help="Number of overlapping windows to split the audio chunk into (4 or 8 is recommended)")

    parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
                        help="Number of samples in each window; the models were trained using 4000 samples (250 ms) per window, so this is the preferred value (smaller values reduce quality)")

    parser.add_argument('-msps', '--min_speech_samples', type=int, default=10000,
                        help="Minimum speech chunk duration in samples")

    parser.add_argument('-msis', '--min_silence_samples', type=int, default=500,
                        help="Minimum silence duration in samples between two separate speech chunks")

    ARGS = parser.parse_args()
    ARGS.rate = DEFAULT_SAMPLE_RATE
    main(ARGS)
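# Example invocation (a minimal sketch, assuming pyaudio, webrtcvad, halo, torch,
# torchaudio and numpy are installed and a microphone is available; the device
# index 1 below is only an illustration -- list your input devices with
# pyaudio.PyAudio().get_device_info_by_index() and pass the matching index):
#
#   python microphone_and_webRTC_integration.py --webRTC_aggressiveness 2 --nospinner -d 1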