Python webrtcvad.Vad() Examples

The following are 22 code examples of webrtcvad.Vad(), drawn from open source projects. You can go to the original project or source file by following the link above each example, or check out the other available functions and classes of the webrtcvad module.
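For orientation, here is a minimal, self-contained sketch of the API the examples use: webrtcvad.Vad() takes an aggressiveness mode from 0 (least aggressive at filtering out non-speech) to 3 (most aggressive), and Vad.is_speech() classifies one frame of 16-bit mono PCM lasting 10, 20, or 30 ms at 8000, 16000, 32000, or 48000 Hz. The silent frame below is synthetic; a real application would read frames from a capture device or a WAV file.

import webrtcvad

vad = webrtcvad.Vad(2)  # aggressiveness mode 0 (least) to 3 (most aggressive)
vad.set_mode(3)         # the mode can also be changed after construction

sample_rate = 16000
frame_ms = 30
frame = b'\x00\x00' * (sample_rate * frame_ms // 1000)  # 30 ms of silence as 16-bit mono PCM

print(vad.is_speech(frame, sample_rate))  # True if the frame is classified as speech
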
Example #1
Source File: VAD_segments.py    From PyTorch_Speaker_Verification with BSD 3-Clause "New" or "Revised" License
def VAD_chunk(aggressiveness, path):
    audio, byte_audio = read_wave(path, hp.data.sr)
    vad = webrtcvad.Vad(int(aggressiveness))
    frames = frame_generator(20, byte_audio, hp.data.sr)
    frames = list(frames)
    times = vad_collector(hp.data.sr, 20, 200, vad, frames)
    speech_times = []
    speech_segs = []
    for i, time in enumerate(times):
        start = np.round(time[0],decimals=2)
        end = np.round(time[1],decimals=2)
        j = start
        while j + .4 < end:
            end_j = np.round(j+.4,decimals=2)
            speech_times.append((j, end_j))
            speech_segs.append(audio[int(j*hp.data.sr):int(end_j*hp.data.sr)])
            j = end_j
        else:  # no break occurs above, so this always runs and keeps the final, shorter tail segment
            speech_times.append((j, end))
            speech_segs.append(audio[int(j*hp.data.sr):int(end*hp.data.sr)])
    return speech_times, speech_segs 
Example #2
Source File: vad.py    From respeaker_python_library with Apache License 2.0
def __init__(self, sample_rate=16000, level=0):
        """

        Args:
            sample_rate: audio sample rate
            level: between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.
        """
        self.sample_rate = sample_rate

        self.frame_ms = 30
        self.frame_bytes = int(2 * self.frame_ms * self.sample_rate / 1000)   # S16_LE, 2 bytes width

        self.vad = webrtcvad.Vad(level)
        self.active = False
        self.data = b''
        self.history = collections.deque(maxlen=128) 
Example #3
Source File: recorder.py    From hermes-audio-server with MIT License
def initialize(self):
        """Initialize a Hermes audio recorder."""
        self.logger.debug('Probing for available input devices...')
        for index in range(self.audio.get_device_count()):
            device = self.audio.get_device_info_by_index(index)
            name = device['name']
            channels = device['maxInputChannels']
            if channels:
                self.logger.debug('[%d] %s', index, name)
        try:
            self.audio_in = self.audio.get_default_input_device_info()['name']
        except OSError:
            raise NoDefaultAudioDeviceError('input')
        self.logger.info('Connected to audio input %s.', self.audio_in)

        if self.config.vad.enabled:
            self.logger.info('Voice Activity Detection enabled with mode %s.',
                             self.config.vad.mode)
            self.vad = webrtcvad.Vad(self.config.vad.mode) 
Example #4
Source File: vad.py    From py-nltools with Apache License 2.0
def __init__(self, aggressiveness=2, sample_rate=SAMPLE_RATE,
                 min_utt_length = MIN_UTT_LENGTH,
                 max_utt_length = MAX_UTT_LENGTH,
                 max_utt_gap    = MAX_UTT_GAP):


        self.sample_rate = sample_rate

        self.vad = webrtcvad.Vad()
        self.vad.set_mode(aggressiveness)

        self.state          = STATE_IDLE
        self.buf            = []
        self.buf_sent       = 0

        self.min_buf_entries = int(min_utt_length * 1000) / BUFFER_DURATION 
        self.max_buf_entries = int(max_utt_length * 1000) / BUFFER_DURATION
        self.max_gap         = int(max_utt_gap    * 1000) / BUFFER_DURATION

        self.frame_cnt       = 0
        self.avg_vol_sum     = 0.0
        self.avg_vol_cnt     = 0 
Example #5
Source File: audio.py    From Resemblyzer with Apache License 2.0
def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a 
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats 
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000
    
    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
    
    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
    
    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)
    
    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width
    
    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)  # builtin bool; np.bool was removed in recent NumPy
    
    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    
    return wav[audio_mask == True] 
Example #6
Source File: vad.py    From tdoa with Apache License 2.0
def __init__(self, sample_rate=16000, level=3):
        """

        Args:
            sample_rate: audio sample rate
            level: between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.
        """
        self.sample_rate = sample_rate

        self.frame_ms = 30
        self.frame_bytes = int(2 * self.frame_ms * self.sample_rate / 1000)   # S16_LE, 2 bytes width

        self.vad = webrtcvad.Vad(level)
        self.active = False
        self.data = b''
        self.history = collections.deque(maxlen=128) 
Example #7
Source File: vad_doa.py    From mic_array with Apache License 2.0
def main():
    vad = webrtcvad.Vad(3)

    speech_count = 0
    chunks = []
    doa_chunks = int(DOA_FRAMES / VAD_FRAMES)

    try:
        with MicArray(RATE, CHANNELS, RATE * VAD_FRAMES / 1000) as mic:
            for chunk in mic.read_chunks():
                # Use single channel audio to detect voice activity
                if vad.is_speech(chunk[0::CHANNELS].tobytes(), RATE):
                    speech_count += 1
                    sys.stdout.write('1')
                else:
                    sys.stdout.write('0')

                sys.stdout.flush()

                chunks.append(chunk)
                if len(chunks) == doa_chunks:
                    if speech_count > (doa_chunks / 2):
                        frames = np.concatenate(chunks)
                        direction = mic.get_direction(frames)
                        pixel_ring.set_direction(direction)
                        print('\n{}'.format(int(direction)))

                    speech_count = 0
                    chunks = []

    except KeyboardInterrupt:
        pass
        
    pixel_ring.off() 
Example #8
Source File: capture.py    From AlexaPi with MIT License
def setup(self, state_callback):
		self._vad = webrtcvad.Vad(2)
		self._state_callback = state_callback 
Example #9
Source File: do_vad.py    From setk with Apache License 2.0
def __init__(self, mode, cache_size, fs=16000):
        self.pyvad = vad.Vad(mode=mode)  # use the constructor argument; the original referenced a module-level args
        self.fs = fs
        self.cache_size = cache_size
        self.reset() 
Example #10
Source File: vad.py    From 3D-convolutional-speaker-recognition-pytorch with Apache License 2.0
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    for i, segment in enumerate(segments):
        path = 'chunk-%002d.wav' % (i,)
        print(' Writing %s' % (path,))
        write_wave(path, segment, sample_rate) 
Example #11
Source File: audio.py    From Speech_emotion_recognition_BLSTM with MIT License
def __init__(self, sr=16000, chunk_duration_ms=30, video_path='', out_path=''):
        self._sr = sr
        self._chunk_duration_ms = chunk_duration_ms
        self._chunk_size = int(sr * chunk_duration_ms / 1000)  # chunk to read in samples
        self._nb_window_chunks = int(400 / chunk_duration_ms)  # 400ms / 30ms frame
        self._nb_window_chunks_end = self._nb_window_chunks * 2
        self._vad = webrtcvad.Vad(mode=3)

        self._video_path = video_path
        self._out_path = out_path 
Example #12
Source File: client.py    From deepspeech-websocket-server with Mozilla Public License 2.0
def __init__(self, aggressiveness=3):
        super().__init__()
        self.vad = webrtcvad.Vad(aggressiveness) 
Example #13
Source File: transforms.py    From pase with MIT License
def __init__(self, chop_factors=[(0.05, 0.025), (0.1, 0.05)],
                 max_chops=2, force_regions=False, report=False):
        # chop factors in seconds (mean, std) per possible chop
        import webrtcvad
        self.chop_factors = chop_factors
        self.max_chops = max_chops
        self.force_regions = force_regions
        # create VAD to get speech chunks
        self.vad = webrtcvad.Vad(2)
        # make scalers to norm/denorm
        self.denormalizer = Scale(1. / ((2 ** 15) - 1))
        self.normalizer = Scale((2 ** 15) - 1)
        self.report = report

    # @profile 
Example #14
Source File: speech_transformers.py    From ffsubsync with MIT License
def _make_webrtcvad_detector(sample_rate, frame_rate):
    import webrtcvad
    vad = webrtcvad.Vad()
    vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
    window_duration = 1. / sample_rate  # duration in seconds
    frames_per_window = int(window_duration * frame_rate + 0.5)
    bytes_per_frame = 2

    def _detect(asegment):
        media_bstring = []
        failures = 0
        for start in range(0, len(asegment) // bytes_per_frame,
                           frames_per_window):
            stop = min(start + frames_per_window,
                       len(asegment) // bytes_per_frame)
            try:
                is_speech = vad.is_speech(
                    asegment[start * bytes_per_frame: stop * bytes_per_frame],
                    sample_rate=frame_rate)
            except:
                is_speech = False
                failures += 1
            # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
            media_bstring.append(1. if is_speech else 0.5)
        return np.array(media_bstring)

    return _detect 
Example #15
Source File: microphone.py    From ada with Apache License 2.0
def __init__(self, frame_length: int, sample_rate: int) -> None:
        """Initialize Microphone processing."""
        self.audio = pyaudio.PyAudio()
        self.vad = webrtcvad.Vad(1)
        self.stream: Optional[pyaudio.Stream] = None

        self._frame_length = frame_length
        self._sample_rate = sample_rate
        self._last_frame: Optional[np.ndarray] = None 
Example #16
Source File: audio.py    From dragonfly with GNU Lesser General Public License v3.0
def __init__(self, aggressiveness=3, **kwargs):
        super(VADAudio, self).__init__(**kwargs)
        self.vad = webrtcvad.Vad(aggressiveness) 
Example #17
Source File: vad.py    From voice-engine with GNU General Public License v3.0
def __init__(self, rate=16000, mode=0, duration=1000, on_inactive=None):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)
        self.on_inactive = on_inactive
        self.limit_inactive_cnt = duration / 10  # a frame is 10 ms
        self.current_inactive_cnt = 0 
Example #18
Source File: vad.py    From cloud-asr with Apache License 2.0
def __init__(self, sample_rate=16000, level=0):
        self.vad = webrtcvad.Vad(level)
        self.sample_rate = int(sample_rate)
        self.num_padding_frames = 10
        self.reset() 
Example #19
Source File: audiosegment.py    From Speech_emotion_recognition_BLSTM with MIT License
def detect_voice(self, prob_detect_voice=0.5):
        """
        Returns self as a list of tuples:
        [('v', voiced segment), ('u', unvoiced segment), (etc.)]

        The overall order of the AudioSegment is preserved.

        :param prob_detect_voice: The raw probability that any random 20ms window of the audio file
                                  contains voice.
        :returns: The described list.
        """
        assert self.frame_rate in (48000, 32000, 16000, 8000), "Try resampling to one of the allowed frame rates."
        assert self.sample_width == 2, "Try resampling to 16 bit."
        assert self.channels == 1, "Try resampling to one channel."

        class model_class:
            def __init__(self, aggressiveness):
                self.v = webrtcvad.Vad(int(aggressiveness))

            def predict(self, vector):
                if self.v.is_speech(vector.raw_data, vector.frame_rate):
                    return 1
                else:
                    return 0

        model = model_class(aggressiveness=1)
        pyesno = 0.3  # Probability of the next 20 ms being unvoiced given that this 20 ms was voiced
        pnoyes = 0.2  # Probability of the next 20 ms being voiced given that this 20 ms was unvoiced
        p_realyes_outputyes = 0.4  # WebRTCVAD has a very high FP rate - just because it says yes, doesn't mean much
        p_realyes_outputno  = 0.05  # If it says no, we can be very certain that it really is a no
        p_yes_raw = prob_detect_voice
        filtered = self.detect_event(model=model,
                                     ms_per_input=20,
                                     transition_matrix=(pyesno, pnoyes),
                                     model_stats=(p_realyes_outputyes, p_realyes_outputno),
                                     event_length_s=0.25,
                                     prob_raw_yes=p_yes_raw)
        ret = []
        for tup in filtered:
            t = ('v', tup[1]) if tup[0] == 'y' else ('u', tup[1])
            ret.append(t)
        return ret 
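
A hedged usage sketch for detect_voice() above; the input file name is hypothetical, and from_file()/resample() are assumed to be the audiosegment package's helpers for loading audio and converting it to the 16 kHz, 16-bit, mono format required by the asserts.

import audiosegment

seg = audiosegment.from_file('speech.wav')  # hypothetical input file
seg = seg.resample(sample_rate_Hz=16000, sample_width=2, channels=1)  # satisfy the asserts above
for label, piece in seg.detect_voice(prob_detect_voice=0.5):
    print(label, len(piece))  # 'v' for voiced segments, 'u' for unvoiced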
Example #20
Source File: audiosegment.py    From AudioSegment with MIT License
def detect_voice(self, prob_detect_voice=0.5):
        """
        Returns self as a list of tuples:
        [('v', voiced segment), ('u', unvoiced segment), (etc.)]

        The overall order of the AudioSegment is preserved.

        :param prob_detect_voice: The raw probability that any random 20ms window of the audio file
                                  contains voice.
        :returns: The described list.
        """
        assert self.frame_rate in (48000, 32000, 16000, 8000), "Try resampling to one of the allowed frame rates."
        assert self.sample_width == 2, "Try resampling to 16 bit."
        assert self.channels == 1, "Try resampling to one channel."

        class model_class:
            def __init__(self, aggressiveness):
                self.v = webrtcvad.Vad(int(aggressiveness))

            def predict(self, vector):
                if self.v.is_speech(vector.raw_data, vector.frame_rate):
                    return 1
                else:
                    return 0

        model = model_class(aggressiveness=2)
        pyesno = 0.3  # Probability of the next 20 ms being unvoiced given that this 20 ms was voiced
        pnoyes = 0.2  # Probability of the next 20 ms being voiced given that this 20 ms was unvoiced
        p_realyes_outputyes = 0.4  # WebRTCVAD has a very high FP rate - just because it says yes, doesn't mean much
        p_realyes_outputno  = 0.05  # If it says no, we can be very certain that it really is a no
        p_yes_raw = prob_detect_voice
        filtered = self.detect_event(model=model,
                                     ms_per_input=20,
                                     transition_matrix=(pyesno, pnoyes),
                                     model_stats=(p_realyes_outputyes, p_realyes_outputno),
                                     event_length_s=0.25,
                                     prob_raw_yes=p_yes_raw)
        ret = []
        for tup in filtered:
            t = ('v', tup[1]) if tup[0] == 'y' else ('u', tup[1])
            ret.append(t)
        return ret 
Example #21
Source File: audio.py    From DSAlign with Mozilla Public License 2.0
def vad_split(audio_frames,
              audio_format=DEFAULT_FORMAT,
              num_padding_frames=10,
              threshold=0.5,
              aggressiveness=3):
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError('VAD-splitting requires mono samples')
    if width != 2:
        raise ValueError('VAD-splitting requires 16 bit samples')
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError('VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000')
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError('VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3')
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError('VAD-splitting only supported for frame durations 10, 20, or 30 ms')
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b''.join(voiced_frames), \
                      frame_duration_ms * max(0, frame_index - len(voiced_frames)), \
                      frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b''.join(voiced_frames), \
              frame_duration_ms * (frame_index - len(voiced_frames)), \
              frame_duration_ms * (frame_index + 1) 
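
A hedged sketch of driving the vad_split() generator above; it assumes the DSAlign module's get_pcm_duration() and DEFAULT_FORMAT (16 kHz, mono, 16-bit PCM) are available alongside it, and the raw PCM file name and the 30 ms frame slicer are hypothetical.

def frames_30ms(pcm_bytes, frame_bytes=960):  # 30 ms at 16 kHz, 2 bytes per sample
    for offset in range(0, len(pcm_bytes) - frame_bytes + 1, frame_bytes):
        yield pcm_bytes[offset:offset + frame_bytes]

with open('utterance.raw', 'rb') as f:  # hypothetical raw PCM file
    pcm = f.read()

for voiced_bytes, start_ms, end_ms in vad_split(frames_30ms(pcm), audio_format=(16000, 1, 2)):
    print('voiced: %d-%d ms, %d bytes' % (int(start_ms), int(end_ms), len(voiced_bytes)))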
Example #22
Source File: audiosegment.py    From AudioSegment with MIT License
def detect_voice(self, prob_detect_voice=0.5):
        """
        Returns self as a list of tuples:
        [('v', voiced segment), ('u', unvoiced segment), (etc.)]

        The overall order of the AudioSegment is preserved.

        :param prob_detect_voice: The raw probability that any random 20ms window of the audio file
                                  contains voice.
        :returns: The described list.
        """
        assert self.frame_rate in (48000, 32000, 16000, 8000), "Try resampling to one of the allowed frame rates."
        assert self.sample_width == 2, "Try resampling to 16 bit."
        assert self.channels == 1, "Try resampling to one channel."

        class model_class:
            def __init__(self, aggressiveness):
                self.v = webrtcvad.Vad(int(aggressiveness))

            def predict(self, vector):
                if self.v.is_speech(vector.raw_data, vector.frame_rate):
                    return 1
                else:
                    return 0

        model = model_class(aggressiveness=2)
        pyesno = 0.3  # Probability of the next 20 ms being unvoiced given that this 20 ms was voiced
        pnoyes = 0.2  # Probability of the next 20 ms being voiced given that this 20 ms was unvoiced
        p_realyes_outputyes = 0.4  # WebRTCVAD has a very high FP rate - just because it says yes, doesn't mean much
        p_realyes_outputno  = 0.05  # If it says no, we can be very certain that it really is a no
        p_yes_raw = prob_detect_voice
        filtered = self.detect_event(model=model,
                                     ms_per_input=20,
                                     transition_matrix=(pyesno, pnoyes),
                                     model_stats=(p_realyes_outputyes, p_realyes_outputno),
                                     event_length_s=0.25,
                                     prob_raw_yes=p_yes_raw)
        ret = []
        for tup in filtered:
            t = ('v', tup[1]) if tup[0] == 'y' else ('u', tup[1])
            ret.append(t)
        return ret