Python resampy.resample() Examples
The following are 30 code examples of resampy.resample().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module resampy, or try the search function.
Example #1
Source File: test_matlab_python.py From pystoi with MIT License | 6 votes |
def test_nnresample():
    """Check nnresample against MATLAB's ``resample`` (marked as FAILING).

    For a set of common sample rates, a random two-second signal is
    resampled with ``nnresample.resample(x, FS, fs)`` and with MATLAB's
    ``resample(x, FS, fs)``; both results must agree within ``ATOL`` /
    ``RTOL``. ``ATOL`` and ``assert_allclose`` come from the enclosing
    module.
    """
    from nnresample import resample
    from pystoi.stoi import FS
    import matlab_wrapper

    RTOL = 1e-4
    rates = (8000, 11025, 16000, 22050, 32000, 44100, 48000)

    session = matlab_wrapper.MatlabSession()
    session.put('FS', float(FS))
    for fs in rates:
        noise = np.random.randn(2*fs,)
        python_out = resample(noise, FS, fs)
        # Ship the same input to MATLAB and run the reference resampler.
        session.put('x', noise)
        session.put('fs', float(fs))
        session.eval('x_r = resample(x, FS, fs)')
        matlab_out = session.get('x_r')
        assert_allclose(python_out, matlab_out, atol=ATOL, rtol=RTOL)
Example #2
Source File: test_matlab_python.py From pystoi with MIT License | 6 votes |
def test_resampy():
    """Check resampy against MATLAB's ``resample`` (marked as FAILING).

    For a set of common sample rates, a random two-second signal sampled at
    ``fs`` is converted to ``FS`` with ``resampy.resample(x, fs, FS)`` and
    compared, within ``ATOL`` / ``RTOL``, to MATLAB's
    ``resample(x, FS, fs)``. ``ATOL`` and ``assert_allclose`` come from the
    enclosing module.
    """
    from resampy import resample
    from pystoi.stoi import FS
    import matlab_wrapper

    RTOL = 1e-4
    rates = (8000, 11025, 16000, 22050, 32000, 44100, 48000)

    session = matlab_wrapper.MatlabSession()
    session.put('FS', float(FS))
    for fs in rates:
        noise = np.random.randn(2*fs,)
        python_out = resample(noise, fs, FS)
        # Ship the same input to MATLAB and run the reference resampler.
        session.put('x', noise)
        session.put('fs', float(fs))
        session.eval('x_r = resample(x, FS, fs)')
        matlab_out = session.get('x_r')
        assert_allclose(python_out, matlab_out, atol=ATOL, rtol=RTOL)
Example #3
Source File: audio.py From ZASR_tensorflow with Apache License 2.0 | 6 votes |
def convolve(self, impulse_segment, allow_resample=False):
    """Convolve this segment's samples with an impulse response, in place.

    :param impulse_segment: Segment holding the impulse response. When
                            resampling is allowed and the rates differ, it
                            is resampled in place to this segment's rate.
    :type impulse_segment: AudioSegment
    :param allow_resample: Whether the impulse segment may be resampled when
                           its sample rate differs from this segment's.
    :type allow_resample: bool
    :raises ValueError: If the two sample rates still differ after the
                        optional resampling step.
    """
    rates_differ = self.sample_rate != impulse_segment.sample_rate
    if allow_resample and rates_differ:
        impulse_segment.resample(self.sample_rate)
    # Re-read both rates: the resample call above may have changed one.
    if self.sample_rate != impulse_segment.sample_rate:
        raise ValueError("Impulse segment's sample rate (%d Hz) is not "
                         "equal to base signal sample rate (%d Hz)." %
                         (impulse_segment.sample_rate, self.sample_rate))
    self._samples = signal.fftconvolve(
        self.samples, impulse_segment.samples, "full")
Example #4
Source File: reEncodeAudio.py From 2.5D-Visual-Sound with Creative Commons Attribution 4.0 International | 5 votes |
def load_wav(fname, rate=None):
    """Read every frame of an audio file and optionally resample it.

    Args:
        fname: Path passed to ``Sndfile`` for reading.
        rate: Desired sample rate. When ``None`` or equal to the file's own
            rate, the samples are returned without resampling.

    Returns:
        A ``(signal, rate)`` tuple. ``signal`` is reshaped to
        ``(-1, channels)``; ``rate`` is the sample rate of the returned
        samples.
    """
    fp = Sndfile(fname, 'r')
    try:
        _signal = fp.read_frames(fp.nframes)
        channels = fp.channels
        _rate = fp.samplerate
    finally:
        # The original never released the handle; close it even on error.
        fp.close()

    # After this reshape the array is always 2-D, so the former
    # ``ndim == 1`` branch (whose reshape result was discarded anyway)
    # could never run and has been removed.
    _signal = _signal.reshape((-1, channels))

    if rate is not None and rate != _rate:
        signal = resampy.resample(_signal, _rate, rate, axis=0,
                                  filter='kaiser_best')
    else:
        signal = _signal
        rate = _rate
    return signal, rate
Example #5
Source File: test_quality.py From resampy with ISC License | 5 votes |
def test_quality_sine(sr_orig, sr_new, fil, rms):
    """Resample a pure tone and bound its mean absolute error.

    A 512 Hz, 2 second tone built with ``make_tone`` at ``sr_orig`` is
    resampled to ``sr_new`` using filter ``fil`` and compared against a tone
    built directly at ``sr_new``. The start and end of the signal are
    excluded from the comparison; the error must not exceed ``rms``.
    """
    FREQ = 512.0
    DURATION = 2.0

    source = make_tone(FREQ, sr_orig, DURATION)
    reference = make_tone(FREQ, sr_new, DURATION)
    resampled = resampy.resample(source, sr_orig, sr_new, filter=fil)

    # NOTE: the stop index is (-sr_new) // 2, i.e. it floors toward -inf.
    interior = slice(sr_new // 2, -sr_new // 2)
    err = np.abs(reference[interior] - resampled[interior]).mean()
    assert err <= rms, '{:g} > {:g}'.format(err, rms)
Example #6
Source File: test_quality.py From resampy with ISC License | 5 votes |
def test_quality_sweep(sr_orig, sr_new, fil, rms):
    """Resample a sweep signal and bound its mean absolute error.

    A 5 second signal built with ``make_sweep(8192, ...)`` at ``sr_orig`` is
    resampled to ``sr_new`` using filter ``fil`` and compared against one
    built directly at ``sr_new``. The start and end of the signal are
    excluded from the comparison; the error must not exceed ``rms``.
    """
    FREQ = 8192
    DURATION = 5.0

    source = make_sweep(FREQ, sr_orig, DURATION)
    reference = make_sweep(FREQ, sr_new, DURATION)
    resampled = resampy.resample(source, sr_orig, sr_new, filter=fil)

    # NOTE: the stop index is (-sr_new) // 2, i.e. it floors toward -inf.
    interior = slice(sr_new // 2, -sr_new // 2)
    err = np.abs(reference[interior] - resampled[interior]).mean()
    assert err <= rms, '{:g} > {:g}'.format(err, rms)
Example #7
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_shape(axis):
    """Halving the sample rate must halve only the resampled axis.

    A random 100 x 100 x 100 array is resampled from 100 to 50 along
    ``axis``; every other dimension must keep its length.
    """
    sr_orig = 100
    sr_new = sr_orig // 2

    X = np.random.randn(sr_orig, sr_orig, sr_orig)
    Y = resampy.resample(X, sr_orig, sr_new, axis=axis)

    expected = list(X.shape)
    expected[axis] = expected[axis] * sr_new // sr_orig
    assert expected == list(Y.shape)
Example #8
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_bad_sr(sr_orig, sr_new):
    """Call ``resampy.resample`` on 100 zeros with the given sample rates.

    NOTE(review): presumably parametrized with invalid rates and wrapped in
    an expected-failure marker that is not visible here -- confirm against
    the original test module.
    """
    silence = np.zeros(100)
    resampy.resample(silence, sr_orig, sr_new)
Example #9
Source File: vggish_input.py From edusense with BSD 3-Clause "New" or "Revised" License | 5 votes |
def waveform_to_examples_subtract_bg(data, sample_rate, bg):
    """Turn a waveform into framed log-mel examples with background removal.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.
      bg: Passed through unchanged to
        ``mel_features.log_mel_spectrogram_subtract_bg``.

    Returns:
      The output of ``mel_features.frame`` applied to the log mel
      spectrogram (presumably [num_examples, num_frames, num_bands] --
      confirm against mel_features).
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features with the background subtracted.
    spectrogram = mel_features.log_mel_spectrogram_subtract_bg(
        data,
        bg,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #10
Source File: vggish_input.py From edusense with BSD 3-Clause "New" or "Revised" License | 5 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      The output of ``mel_features.frame`` applied to the log mel
      spectrogram (presumably [num_examples, num_frames, num_bands] --
      confirm against mel_features).
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #11
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_bad_rolloff(rolloff):
    """Downsample 100 zeros from 100 to 50 with the given ``rolloff``.

    NOTE(review): presumably parametrized with invalid roll-off values and
    wrapped in an expected-failure marker that is not visible here.
    """
    silence = np.zeros(100)
    resampy.resample(silence, 100, 50, filter='sinc_window', rolloff=rolloff)
Example #12
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_bad_num_zeros():
    """Downsample 100 zeros from 100 to 50 with ``num_zeros=0``.

    NOTE(review): presumably wrapped in an expected-failure marker that is
    not visible here -- confirm against the original test module.
    """
    silence = np.zeros(100)
    resampy.resample(silence, 100, 50, filter='sinc_window', num_zeros=0)
Example #13
Source File: compute-fbank-feats.py From espnet with Apache License 2.0 | 5 votes |
def main():
    """Compute log-mel filterbank features for every utterance read.

    Parses the command line, configures logging from ``--verbose``, then
    reads ``(rate, array)`` pairs from ``args.rspecifier``. Each array is
    cast to float32, resampled to ``args.fs`` when that is set and differs
    from the input rate, optionally scaled by ``1 << (normalize - 1)``, and
    passed to ``logmelspectrogram``. The result is written under the same
    utterance id.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(
        args.rspecifier, segments=args.segments
    ) as reader, file_writer_helper(
        args.wspecifier,
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    ) as writer:
        for utt_id, (rate, array) in reader:
            array = array.astype(numpy.float32)
            if args.fs is not None and rate != args.fs:
                array = resampy.resample(array, rate, args.fs, axis=0)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))

            # Without --fs the audio keeps its native rate, so the mel
            # filterbank must be built for that rate instead of None.
            fs = args.fs if args.fs is not None else rate
            lmspc = logmelspectrogram(
                x=array,
                fs=fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
                fmin=args.fmin,
                fmax=args.fmax,
            )
            writer[utt_id] = lmspc
Example #14
Source File: compute-stft-feats.py From espnet with Apache License 2.0 | 5 votes |
def main():
    """Compute STFT features for every utterance read.

    Parses the command line, configures logging from ``--verbose``, then
    reads ``(rate, array)`` pairs from ``args.rspecifier``. Each array is
    cast to float32, resampled to ``args.fs`` when that is set and differs
    from the input rate, optionally scaled by ``1 << (normalize - 1)``, and
    passed to ``spectrogram``. The result is written under the same
    utterance id.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=log_level, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(
        args.rspecifier, segments=args.segments
    ) as reader, file_writer_helper(
        args.wspecifier,
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
    ) as writer:
        for utt_id, (rate, array) in reader:
            array = array.astype(numpy.float32)

            needs_resampling = args.fs is not None and rate != args.fs
            if needs_resampling:
                array = resampy.resample(array, rate, args.fs, axis=0)

            needs_scaling = args.normalize is not None and args.normalize != 1
            if needs_scaling:
                array = array / (1 << (args.normalize - 1))

            writer[utt_id] = spectrogram(
                x=array,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
            )
Example #15
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_dtype(dtype):
    """Upsampling from 100 to 200 must keep the input's dtype."""
    samples = np.random.randn(100).astype(dtype)
    upsampled = resampy.resample(samples, 100, 200)
    assert samples.dtype == upsampled.dtype
Example #16
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_bad_window():
    """Call resampy with an array of 50 ones as the ``window`` argument.

    NOTE(review): presumably wrapped in an expected-failure marker that is
    not visible here -- confirm against the original test module.
    """
    silence = np.zeros(100)
    window = np.ones(50)
    resampy.resample(silence, 100, 200, filter='sinc_window', window=window)
Example #17
Source File: audio.py From ZASR_tensorflow with Apache License 2.0 | 5 votes |
def resample(self, target_sample_rate, filter='kaiser_best'):
    """Resample this segment's audio to a new sample rate, in place.

    :param target_sample_rate: Sample rate to convert to.
    :type target_sample_rate: int
    :param filter: Resampling filter name handed to resampy, one of
                   {'kaiser_best', 'kaiser_fast'}.
    :type filter: str
    """
    resampled = resampy.resample(
        self.samples, self.sample_rate, target_sample_rate, filter=filter)
    self._samples = resampled
    self._sample_rate = target_sample_rate
Example #18
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_short_signal():
    """Resample a two-sample signal from rate 4 to rate 1.

    NOTE(review): presumably wrapped in an expected-failure marker that is
    not visible here -- confirm against the original test module.
    """
    tiny = np.zeros(2)
    resampy.resample(tiny, 4, 1)
Example #19
Source File: test_core.py From resampy with ISC License | 5 votes |
def test_good_window():
    """A callable window is accepted and doubling the rate doubles length.

    Uses ``scipy.signal.blackman`` as the window for the ``sinc_window``
    filter while upsampling 500 random samples from 100 to 200.
    """
    sr_orig, sr_new = 100, 200
    samples = np.random.randn(500)
    upsampled = resampy.resample(samples, sr_orig, sr_new,
                                 filter='sinc_window',
                                 window=scipy.signal.blackman)
    assert len(upsampled) == 2 * len(samples)
Example #20
Source File: inference.py From models with Apache License 2.0 | 5 votes |
def main(argv):
    """Classify each WAV file in ``argv`` with YAMNet and print the top 5.

    The model is built inside a dedicated ``tf.Graph`` and loads its weights
    from 'yamnet.h5'; class names come from 'yamnet_class_map.csv'. Each file
    is read as int16, scaled by 1/32768, down-mixed to mono, resampled to
    ``params.SAMPLE_RATE`` if needed, and scored. Scores are averaged over
    time before the five best classes are printed.
    """
    assert argv

    graph = tf.Graph()
    with graph.as_default():
        yamnet = yamnet_model.yamnet_frames_model(params)
        yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Decode the WAV file and scale it to [-1.0, +1.0].
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        waveform = wav_data / 32768.0

        # Mono, at the sample rate YAMNet expects.
        if len(waveform.shape) >= 2:
            waveform = np.mean(waveform, axis=1)
        if sr != params.SAMPLE_RATE:
            waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE)

        # The second output is the log-mel spectrogram, unused here.
        # steps=1 works around Keras batching limitations.
        batch = np.reshape(waveform, [1, -1])
        with graph.as_default():
            scores, _ = yamnet.predict(batch, steps=1)

        # scores is (time_frames, num_classes); average over time to get one
        # score per class for the whole clip.
        prediction = np.mean(scores, axis=0)

        # Indices of the five highest-scoring classes, best first.
        top5_i = np.argsort(prediction)[::-1][:5]
        report = '\n'.join(
            ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
            for i in top5_i)
        print(file_name, ':\n' + report)
Example #21
Source File: resample.py From deep_complex_networks with MIT License | 5 votes |
def resample_musicnet(file_in, file_out, frame_rate, frame_rate_out):
    """Resample every recording in a MusicNet-style archive and save it.

    Args:
        file_in: Path of the archive to read with ``numpy.load``.
        file_out: Path the resampled archive is written to with
            ``numpy.savez``.
        frame_rate: Sample rate of the audio stored in ``file_in``.
        frame_rate_out: Sample rate to convert the audio to.

    For each entry, element 0 (the audio) is resampled and element 1 (an
    iterable of intervals) is rebuilt as an ``IntervalTree`` with ``begin``
    and ``end`` scaled by ``frame_rate_out / frame_rate`` and truncated to
    int.
    """
    ratio = frame_rate_out / float(frame_rate)
    print('.. resampling {} ({}Hz) into {} ({}Hz)'.format(
        file_in, frame_rate, file_out, frame_rate_out))
    print('.. sampling with ratio {}'.format(ratio))

    resampled_data = {}
    with open(file_in, 'rb') as f_in:
        # Load from the handle opened above rather than from the path. The
        # original opened the file twice and never closed the archive that
        # numpy.load opened itself.
        data_in = numpy.load(f_in)
        n_files = len(data_in.keys())
        for i, key in enumerate(data_in):
            print('.. aggregating {} ({} / {})'.format(key, i, n_files))
            data = data_in[key]
            data[0] = resample(data[0], frame_rate, frame_rate_out)

            resampled_intervals = [
                Interval(int(interval.begin * ratio),
                         int(interval.end * ratio),
                         interval.data)
                for interval in data[1]
            ]
            data[1] = IntervalTree(resampled_intervals)
            resampled_data[key] = data

    print('.. saving output')
    with open(file_out, 'wb') as f_out:
        numpy.savez(f_out, **resampled_data)
Example #22
Source File: vggish_input.py From multilabel-image-classification-tensorflow with MIT License | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #23
Source File: vggish_input.py From g-tensorflow-models with Apache License 2.0 | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #24
Source File: vggish_input.py From models with Apache License 2.0 | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #25
Source File: vggish_input.py From sklearn-audio-transfer-learning with ISC License | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #26
Source File: vggish_input.py From audioset_classification with MIT License | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #27
Source File: vggish_input.py From object_detection_with_tensorflow with MIT License | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #28
Source File: core.py From openl3 with MIT License | 4 votes |
def _preprocess_audio_batch(audio, sr, center=True, hop_size=0.1):
    """Validate, resample and frame audio for the embedding model.

    Args:
        audio: 1-D or 2-D np.ndarray of samples; 2-D input is down-mixed by
            averaging over axis 1.
        sr: Sample rate of ``audio``; must be a positive real number.
        center: Whether to pass the audio through ``_center_audio`` before
            padding.
        hop_size: Hop between frames in seconds; must be a positive real
            number.

    Returns:
        Array of shape ``(n_frames, 1, TARGET_SR)``: overlapping
        one-second frames with an added channel dimension.

    Raises:
        OpenL3Error: For empty audio, more than two dimensions, or an
            invalid ``sr``, ``hop_size`` or ``center``.
    """
    if audio.size == 0:
        raise OpenL3Error('Got empty audio')

    # All-zero input is legal but almost certainly a mistake; warn only.
    if np.all(audio == 0):
        warnings.warn('Provided audio is all zeros', OpenL3Warning)

    # Only mono or (samples, channels) input is supported.
    if audio.ndim > 2:
        raise OpenL3Error('Audio array can only be be 1D or 2D')
    if audio.ndim == 2:
        # Down-mix multichannel audio.
        audio = np.mean(audio, axis=1)

    sr_is_valid = isinstance(sr, Real) and sr > 0
    if not sr_is_valid:
        raise OpenL3Error('Invalid sample rate {}'.format(sr))

    hop_is_valid = isinstance(hop_size, Real) and hop_size > 0
    if not hop_is_valid:
        raise OpenL3Error('Invalid hop size {}'.format(hop_size))

    if center not in (True, False):
        raise OpenL3Error('Invalid center value {}'.format(center))

    # Bring the audio to the model's sample rate when it differs.
    if sr != TARGET_SR:
        audio = resampy.resample(audio, sr_orig=sr, sr_new=TARGET_SR,
                                 filter='kaiser_best')

    audio_len = audio.size
    frame_len = TARGET_SR
    hop_len = int(hop_size * TARGET_SR)

    if audio_len < frame_len:
        warnings.warn('Duration of provided audio is shorter than window size (1 second). Audio will be padded.', OpenL3Warning)

    if center:
        audio = _center_audio(audio, frame_len)

    # Pad so that every sample ends up in some frame.
    audio = _pad_audio(audio, frame_len, hop_len)

    # Overlapping frames via a strided view (after librosa.util.frame).
    n_frames = 1 + int((len(audio) - frame_len) / float(hop_len))
    sample_stride = audio.itemsize
    frames = np.lib.stride_tricks.as_strided(
        audio,
        shape=(frame_len, n_frames),
        strides=(sample_stride, hop_len * sample_stride)).T

    # Insert the channel dimension between frame index and samples.
    return frames.reshape((frames.shape[0], 1, frames.shape[-1]))
Example #29
Source File: inputs.py From openmic-2018 with MIT License | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Example #30
Source File: vggish_input.py From object_detection_kitti with Apache License 2.0 | 4 votes |
def waveform_to_examples(data, sample_rate):
    """Turn a waveform into framed log-mel examples for VGGish.

    Args:
      data: np.array holding the waveform. Input with more than one
        dimension is reduced to mono by averaging over axis 1. Samples are
        generally expected to lie in [-1.0, +1.0], but this is not required.
      sample_rate: Sample rate of ``data``; audio at any rate other than
        vggish_params.SAMPLE_RATE is resampled first.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a
      sequence of examples, each a patch of log mel spectrogram covering
      num_frames frames of audio and num_bands mel frequency bands, where the
      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    target_rate = vggish_params.SAMPLE_RATE

    # Down-mix to mono, then bring the audio to the rate VGGish assumes.
    if len(data.shape) >= 2:
        data = np.mean(data, axis=1)
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Log mel spectrogram features.
    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)