Python webrtcvad.Vad() Examples
The following are 21 code examples of webrtcvad.Vad().
You can go to the original project or source file by following the link above each example.
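
Before digging into the examples, here is a minimal sketch of the webrtcvad.Vad() API itself (not taken from any project below). Vad() takes an aggressiveness mode from 0 (least aggressive about filtering out non-speech) to 3 (most aggressive), and is_speech() expects 16-bit mono PCM frames of exactly 10, 20, or 30 ms at 8000, 16000, 32000, or 48000 Hz; the silent frame here is just a placeholder.

import webrtcvad

vad = webrtcvad.Vad(2)  # aggressiveness 0..3; equivalently Vad() then vad.set_mode(2)

sample_rate = 16000
frame_ms = 30
frame = b'\x00\x00' * (sample_rate * frame_ms // 1000)  # 30 ms of 16-bit silence

print(vad.is_speech(frame, sample_rate))  # prints False for a silent frame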
Example #1
Source File: VAD_segments.py From PyTorch_Speaker_Verification with BSD 3-Clause "New" or "Revised" License

def VAD_chunk(aggressiveness, path):
    audio, byte_audio = read_wave(path, hp.data.sr)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = frame_generator(20, byte_audio, hp.data.sr)
    frames = list(frames)
    times = vad_collector(hp.data.sr, 20, 200, vad, frames)
    speech_times = []
    speech_segs = []
    for i, time in enumerate(times):
        start = np.round(time[0], decimals=2)
        end = np.round(time[1], decimals=2)
        j = start
        while j + .4 < end:
            end_j = np.round(j + .4, decimals=2)
            speech_times.append((j, end_j))
            speech_segs.append(audio[int(j * hp.data.sr):int(end_j * hp.data.sr)])
            j = end_j
        else:
            speech_times.append((j, end))
            speech_segs.append(audio[int(j * hp.data.sr):int(end * hp.data.sr)])
    return speech_times, speech_segs
Example #2
Source File: vad.py From respeaker_python_library with Apache License 2.0

def __init__(self, sample_rate=16000, level=0):
    """
    Args:
        sample_rate: audio sample rate
        level: between 0 and 3. 0 is the least aggressive about filtering out
               non-speech, 3 is the most aggressive.
    """
    self.sample_rate = sample_rate
    self.frame_ms = 30
    self.frame_bytes = int(2 * self.frame_ms * self.sample_rate / 1000)  # S16_LE, 2 bytes width
    self.vad = webrtcvad.Vad(level)
    self.active = False
    self.data = b''
    self.history = collections.deque(maxlen=128)
Example #3
Source File: recorder.py From hermes-audio-server with MIT License

def initialize(self):
    """Initialize a Hermes audio recorder."""
    self.logger.debug('Probing for available input devices...')
    for index in range(self.audio.get_device_count()):
        device = self.audio.get_device_info_by_index(index)
        name = device['name']
        channels = device['maxInputChannels']
        if channels:
            self.logger.debug('[%d] %s', index, name)

    try:
        self.audio_in = self.audio.get_default_input_device_info()['name']
    except OSError:
        raise NoDefaultAudioDeviceError('input')
    self.logger.info('Connected to audio input %s.', self.audio_in)

    if self.config.vad.enabled:
        self.logger.info('Voice Activity Detection enabled with mode %s.',
                         self.config.vad.mode)
        self.vad = webrtcvad.Vad(self.config.vad.mode)
Example #4
Source File: vad.py From py-nltools with Apache License 2.0

def __init__(self, aggressiveness=2, sample_rate=SAMPLE_RATE,
             min_utt_length=MIN_UTT_LENGTH, max_utt_length=MAX_UTT_LENGTH,
             max_utt_gap=MAX_UTT_GAP):
    self.sample_rate = sample_rate
    self.vad = webrtcvad.Vad()
    self.vad.set_mode(aggressiveness)
    self.state = STATE_IDLE
    self.buf = []
    self.buf_sent = 0
    self.min_buf_entries = int(min_utt_length * 1000) / BUFFER_DURATION
    self.max_buf_entries = int(max_utt_length * 1000) / BUFFER_DURATION
    self.max_gap = int(max_utt_gap * 1000) / BUFFER_DURATION
    self.frame_cnt = 0
    self.avg_vol_sum = 0.0
    self.avg_vol_cnt = 0
Example #5
Source File: audio.py From Resemblyzer with Apache License 2.0

def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(np.bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask == True]
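
The struct.pack conversion above is the step that trips people up most often: webrtcvad only accepts 16-bit PCM bytes, not float arrays. A small equivalent sketch using numpy instead of struct (an alternative formulation, not Resemblyzer's code; int16_max is assumed to be (2 ** 15) - 1 as in the excerpt):

import numpy as np

int16_max = (2 ** 15) - 1  # assumed, matching the constant used above

def float_to_pcm16(wav):
    """Convert a float waveform in [-1, 1] to 16-bit PCM bytes, equivalent to
    struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))."""
    return np.round(wav * int16_max).astype(np.int16).tobytes()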
Example #6
Source File: vad.py From tdoa with Apache License 2.0

def __init__(self, sample_rate=16000, level=3):
    """
    Args:
        sample_rate: audio sample rate
        level: between 0 and 3. 0 is the least aggressive about filtering out
               non-speech, 3 is the most aggressive.
    """
    self.sample_rate = sample_rate
    self.frame_ms = 30
    self.frame_bytes = int(2 * self.frame_ms * self.sample_rate / 1000)  # S16_LE, 2 bytes width
    self.vad = webrtcvad.Vad(level)
    self.active = False
    self.data = b''
    self.history = collections.deque(maxlen=128)
Example #7
Source File: vad_doa.py From mic_array with Apache License 2.0

def main():
    vad = webrtcvad.Vad(3)

    speech_count = 0
    chunks = []
    doa_chunks = int(DOA_FRAMES / VAD_FRAMES)

    try:
        with MicArray(RATE, CHANNELS, RATE * VAD_FRAMES / 1000) as mic:
            for chunk in mic.read_chunks():
                # Use single channel audio to detect voice activity
                if vad.is_speech(chunk[0::CHANNELS].tobytes(), RATE):
                    speech_count += 1
                    sys.stdout.write('1')
                else:
                    sys.stdout.write('0')
                sys.stdout.flush()

                chunks.append(chunk)
                if len(chunks) == doa_chunks:
                    if speech_count > (doa_chunks / 2):
                        frames = np.concatenate(chunks)
                        direction = mic.get_direction(frames)
                        pixel_ring.set_direction(direction)
                        print('\n{}'.format(int(direction)))
                    speech_count = 0
                    chunks = []
    except KeyboardInterrupt:
        pass

    pixel_ring.off()
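
The chunk[0::CHANNELS] slice above exploits the interleaved layout of multi-channel PCM: samples for channel 0 sit at indices 0, CHANNELS, 2 * CHANNELS, and so on. A standalone sketch of the same idea (the array contents and channel count are placeholders):

import numpy as np

CHANNELS = 4
chunk = np.zeros(CHANNELS * 160, dtype=np.int16)  # one interleaved 10 ms frame at 16 kHz
mono = chunk[0::CHANNELS].tobytes()               # channel 0 only, ready for vad.is_speech()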
Example #8
Source File: capture.py From AlexaPi with MIT License

def setup(self, state_callback):
    self._vad = webrtcvad.Vad(2)
    self._state_callback = state_callback
Example #9
Source File: do_vad.py From setk with Apache License 2.0

def __init__(self, mode, cache_size, fs=16000):
    # args.mode refers to the script's global argparse namespace, so the mode
    # parameter itself goes unused here
    self.pyvad = vad.Vad(mode=args.mode)
    self.fs = fs
    self.cache_size = cache_size
    self.reset()
Example #10
Source File: vad.py From 3D-convolutional-speaker-recognition-pytorch with Apache License 2.0

def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    for i, segment in enumerate(segments):
        path = 'chunk-%002d.wav' % (i,)
        print(' Writing %s' % (path,))
        write_wave(path, segment, sample_rate)
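
This example relies on read_wave, write_wave, frame_generator, and vad_collector helpers defined elsewhere in the same file; they originate in py-webrtcvad's example.py. For context, a sketch of what frame_generator looks like there, with the Frame class reduced to a namedtuple (an approximation, not the project's exact code):

import collections

# A frame of PCM bytes plus its start time and duration in seconds.
Frame = collections.namedtuple('Frame', ['bytes', 'timestamp', 'duration'])

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield successive frames of frame_duration_ms from 16-bit mono PCM bytes."""
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # bytes per frame
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0  # seconds per frame
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n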
Example #11
Source File: audio.py From Speech_emotion_recognition_BLSTM with MIT License

def __init__(self, sr=16000, chunk_duration_ms=30, video_path='', out_path=''):
    self._sr = sr
    self._chunk_duration_ms = chunk_duration_ms
    self._chunk_size = int(sr * chunk_duration_ms / 1000)  # chunk to read in samples
    self._nb_window_chunks = int(400 / chunk_duration_ms)  # 400ms / 30ms frame
    self._nb_window_chunks_end = self._nb_window_chunks * 2
    self._vad = webrtcvad.Vad(mode=3)
    self._video_path = video_path
    self._out_path = out_path
Example #12
Source File: client.py From deepspeech-websocket-server with Mozilla Public License 2.0

def __init__(self, aggressiveness=3):
    super().__init__()
    self.vad = webrtcvad.Vad(aggressiveness)
Example #13
Source File: transforms.py From pase with MIT License

def __init__(self, chop_factors=[(0.05, 0.025), (0.1, 0.05)],
             max_chops=2, force_regions=False, report=False):
    # chop factors in seconds (mean, std) per possible chop
    import webrtcvad
    self.chop_factors = chop_factors
    self.max_chops = max_chops
    self.force_regions = force_regions
    # create VAD to get speech chunks
    self.vad = webrtcvad.Vad(2)
    # make scalers to norm/denorm
    self.denormalizer = Scale(1. / ((2 ** 15) - 1))
    self.normalizer = Scale((2 ** 15) - 1)
    self.report = report

# @profile
Example #14
Source File: speech_transformers.py From ffsubsync with MIT License

def _make_webrtcvad_detector(sample_rate, frame_rate):
    import webrtcvad
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
    window_duration = 1. / sample_rate  # duration in seconds
    frames_per_window = int(window_duration * frame_rate + 0.5)
    bytes_per_frame = 2

    def _detect(asegment):
        media_bstring = []
        failures = 0
        for start in range(0, len(asegment) // bytes_per_frame,
                           frames_per_window):
            stop = min(start + frames_per_window,
                       len(asegment) // bytes_per_frame)
            try:
                is_speech = vad.is_speech(
                    asegment[start * bytes_per_frame: stop * bytes_per_frame],
                    sample_rate=frame_rate)
            except:
                is_speech = False
                failures += 1
            # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
            media_bstring.append(1. if is_speech else 0.5)
        return np.array(media_bstring)

    return _detect
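
A hypothetical call to the factory above, going by the excerpt's own comments: sample_rate is the detector's output rate in windows per second (window_duration = 1. / sample_rate) and frame_rate is the audio sample rate. One second of silence should yield roughly 100 scores of 0.5:

detect = _make_webrtcvad_detector(sample_rate=100, frame_rate=16000)
pcm = b'\x00\x00' * 16000  # 1 s of 16-bit mono silence at 16 kHz
scores = detect(pcm)       # numpy array: 0.5 per window for silence, 1.0 for speech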
Example #15
Source File: microphone.py From ada with Apache License 2.0

def __init__(self, frame_length: int, sample_rate: int) -> None:
    """Initialize Microphone processing."""
    self.audio = pyaudio.PyAudio()
    self.vad = webrtcvad.Vad(1)
    self.stream: Optional[pyaudio.Stream] = None

    self._frame_length = frame_length
    self._sample_rate = sample_rate
    self._last_frame: Optional[np.ndarray] = None
Example #16
Source File: audio.py From dragonfly with GNU Lesser General Public License v3.0

def __init__(self, aggressiveness=3, **kwargs):
    super(VADAudio, self).__init__(**kwargs)
    self.vad = webrtcvad.Vad(aggressiveness)
Example #17
Source File: vad.py From voice-engine with GNU General Public License v3.0

def __init__(self, rate=16000, mode=0, duration=1000, on_inactive=None):
    super(VAD, self).__init__()

    self.rate = rate
    self.vad = Vad(mode)
    self.on_inactive = on_inactive
    self.limit_inactive_cnt = duration / 10  # a frame is 10 ms
    self.current_inactive_cnt = 0
Example #18
Source File: vad.py From cloud-asr with Apache License 2.0

def __init__(self, sample_rate=16000, level=0):
    self.vad = webrtcvad.Vad(level)
    self.sample_rate = int(sample_rate)
    self.num_padding_frames = 10
    self.reset()
Example #19
Source File: audiosegment.py From Speech_emotion_recognition_BLSTM with MIT License

def detect_voice(self, prob_detect_voice=0.5):
    """
    Returns self as a list of tuples:
    [('v', voiced segment), ('u', unvoiced segment), (etc.)]

    The overall order of the AudioSegment is preserved.

    :param prob_detect_voice: The raw probability that any random 20ms window
                              of the audio file contains voice.
    :returns: The described list.
    """
    assert self.frame_rate in (48000, 32000, 16000, 8000), "Try resampling to one of the allowed frame rates."
    assert self.sample_width == 2, "Try resampling to 16 bit."
    assert self.channels == 1, "Try resampling to one channel."

    class model_class:
        def __init__(self, aggressiveness):
            self.v = webrtcvad.Vad(int(aggressiveness))

        def predict(self, vector):
            if self.v.is_speech(vector.raw_data, vector.frame_rate):
                return 1
            else:
                return 0

    model = model_class(aggressiveness=1)
    pyesno = 0.3  # Probability of the next 20 ms being unvoiced given that this 20 ms was voiced
    pnoyes = 0.2  # Probability of the next 20 ms being voiced given that this 20 ms was unvoiced
    p_realyes_outputyes = 0.4  # WebRTCVAD has a very high FP rate - just because it says yes, doesn't mean much
    p_realyes_outputno = 0.05  # If it says no, we can be very certain that it really is a no
    p_yes_raw = prob_detect_voice
    filtered = self.detect_event(model=model,
                                 ms_per_input=20,
                                 transition_matrix=(pyesno, pnoyes),
                                 model_stats=(p_realyes_outputyes, p_realyes_outputno),
                                 event_length_s=0.25,
                                 prob_raw_yes=p_yes_raw)
    ret = []
    for tup in filtered:
        t = ('v', tup[1]) if tup[0] == 'y' else ('u', tup[1])
        ret.append(t)
    return ret
Example #20
Source File: audiosegment.py From AudioSegment with MIT License

def detect_voice(self, prob_detect_voice=0.5):
    """
    Returns self as a list of tuples:
    [('v', voiced segment), ('u', unvoiced segment), (etc.)]

    The overall order of the AudioSegment is preserved.

    :param prob_detect_voice: The raw probability that any random 20ms window
                              of the audio file contains voice.
    :returns: The described list.
    """
    assert self.frame_rate in (48000, 32000, 16000, 8000), "Try resampling to one of the allowed frame rates."
    assert self.sample_width == 2, "Try resampling to 16 bit."
    assert self.channels == 1, "Try resampling to one channel."

    class model_class:
        def __init__(self, aggressiveness):
            self.v = webrtcvad.Vad(int(aggressiveness))

        def predict(self, vector):
            if self.v.is_speech(vector.raw_data, vector.frame_rate):
                return 1
            else:
                return 0

    model = model_class(aggressiveness=2)
    pyesno = 0.3  # Probability of the next 20 ms being unvoiced given that this 20 ms was voiced
    pnoyes = 0.2  # Probability of the next 20 ms being voiced given that this 20 ms was unvoiced
    p_realyes_outputyes = 0.4  # WebRTCVAD has a very high FP rate - just because it says yes, doesn't mean much
    p_realyes_outputno = 0.05  # If it says no, we can be very certain that it really is a no
    p_yes_raw = prob_detect_voice
    filtered = self.detect_event(model=model,
                                 ms_per_input=20,
                                 transition_matrix=(pyesno, pnoyes),
                                 model_stats=(p_realyes_outputyes, p_realyes_outputno),
                                 event_length_s=0.25,
                                 prob_raw_yes=p_yes_raw)
    ret = []
    for tup in filtered:
        t = ('v', tup[1]) if tup[0] == 'y' else ('u', tup[1])
        ret.append(t)
    return ret
Example #21
Source File: audio.py From DSAlign with Mozilla Public License 2.0

def vad_split(audio_frames,
              audio_format=DEFAULT_FORMAT,
              num_padding_frames=10,
              threshold=0.5,
              aggressiveness=3):
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError('VAD-splitting requires mono samples')
    if width != 2:
        raise ValueError('VAD-splitting requires 16 bit samples')
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError('VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000')
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError('VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3')
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError('VAD-splitting only supported for frame durations 10, 20, or 30 ms')
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b''.join(voiced_frames), \
                      frame_duration_ms * max(0, frame_index - len(voiced_frames)), \
                      frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b''.join(voiced_frames), \
              frame_duration_ms * (frame_index - len(voiced_frames)), \
              frame_duration_ms * (frame_index + 1)
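
A hypothetical driver for vad_split, assuming DSAlign's audio_format is a (sample_rate, channels, width) triple and that get_pcm_duration is available alongside the function; the frames here are fabricated 30 ms chunks of silence, so the generator would yield nothing, but the calling convention is the point:

audio_frames = [b'\x00\x00' * 480] * 100  # 100 frames of 30 ms silence at 16 kHz
for pcm, start_ms, end_ms in vad_split(audio_frames, audio_format=(16000, 1, 2)):
    print('voiced segment: %.0f ms to %.0f ms (%d bytes)' % (start_ms, end_ms, len(pcm)))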