Python librosa.resample() Examples

The following are 25 code examples of librosa.resample(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module librosa, or try the search function.
Example #1
Source File: rapSpeaker.py    From rapLyrics with MIT License 11 votes vote down vote up
def doFileStuff(line,isSlow):
    """Synthesize ``line`` to speech and write two resampled WAV renditions.

    The text is rendered with gTTS into ``placeholder.mp3``. The decoded audio
    is resampled to ``SAMPLE_RATE`` and to ``SAMPLE_RATE*LOW_FACTOR``; both
    results are written (with a header rate of ``SAMPLE_RATE``) to
    ``placeholder.wav`` and ``lowPlaceholder.wav`` respectively, and each file
    is then re-read and re-written through soundfile.

    Args:
        line: Text to synthesize.
        isSlow: Forwarded to gTTS as its ``slow`` flag.

    Returns:
        The samples resampled to ``SAMPLE_RATE`` (the content of
        ``placeholder.wav``).
    """
    myobj = gTTS(text=line, lang='en', slow=isSlow)
    myobj.save("placeholder.mp3")

    # Decode the MP3 once; both renditions are derived from the same samples.
    y, sr = librosa.load("placeholder.mp3")

    # The rates are passed by keyword: recent librosa releases no longer
    # accept them positionally, and the keyword names work on old releases too.
    data = librosa.resample(y, orig_sr=sr, target_sr=SAMPLE_RATE)
    lowData = librosa.resample(y, orig_sr=sr, target_sr=SAMPLE_RATE*LOW_FACTOR)

    # NOTE(review): lowData is written with SAMPLE_RATE in the header although
    # it was resampled to SAMPLE_RATE*LOW_FACTOR -- presumably a deliberate
    # pitch/speed shift; confirm with the caller.
    # NOTE(review): librosa.output.write_wav is not available in
    # librosa >= 0.8 -- confirm the pinned librosa version.
    for wav_path, samples in (('placeholder.wav', data),
                              ('lowPlaceholder.wav', lowData)):
        librosa.output.write_wav(wav_path, samples, SAMPLE_RATE)
        # Round-trip through soundfile so the file is re-encoded with
        # soundfile's defaults.
        d, file_sr = sf.read(wav_path)
        sf.write(wav_path, d, file_sr)

    return data
Example #2
Source File: Input.py    From vimss with GNU General Public License v3.0 6 votes vote down vote up
def readWave(audio_path, start_frame, end_frame, mono=True, sample_rate=None, clip=True):
    """Read a frame range from an audio file, zero-padding out-of-range parts.

    Args:
        audio_path: Path of the audio file to open.
        start_frame: First frame to read; negative values are zero-padded at
            the front.
        end_frame: One past the last frame to read; frames beyond the end of
            the file are zero-padded at the back.
        mono: If true, average all channels into one (the channel axis is kept).
        sample_rate: If not None and different from the file's rate, resample
            the excerpt to this rate.
        clip: If true, limit the samples to [-1, 1].

    Returns:
        Tuple ``(audio, audio_sr)``: ``audio`` is a float32-read array of shape
        (num_frames, channels); ``audio_sr`` is the sampling rate stored in the
        file (it is NOT updated when the audio is resampled).
    """
    # The context manager guarantees the file is closed even if seek/read fails.
    with SoundFile(audio_path, mode='r') as snd_file:
        inf = snd_file._info
        audio_sr = inf.samplerate
        total_frames = inf.frames

        start_read = max(start_frame, 0)
        end_read = min(end_frame, total_frames)

        snd_file.seek(start_read)
        audio = snd_file.read(end_read - start_read, dtype='float32', always_2d=True) # (num_frames, channels)

    # Pad if necessary (start_frame or end_frame out of bounds)
    pad_front = -min(start_frame, 0)
    pad_back = max(end_frame - total_frames, 0)
    audio = np.pad(audio, [(pad_front, pad_back), (0, 0)], mode="constant", constant_values=0.0)

    # Convert to mono if desired
    if mono:
        audio = np.mean(audio, axis=1, keepdims=True)

    # Resample if needed
    if sample_rate is not None and sample_rate != audio_sr:
        res_length = int(np.ceil(float(audio.shape[0]) * float(sample_rate) / float(audio_sr)))
        audio = np.pad(audio, [(1, 1), (0,0)], mode="reflect")  # Pad audio first
        # Rates are passed by keyword; recent librosa releases reject positional rates.
        audio = librosa.resample(audio.T, orig_sr=audio_sr, target_sr=sample_rate, res_type="kaiser_fast").T
        # Trim the resampled, padded signal symmetrically back to the expected length.
        skip = (audio.shape[0] - res_length) // 2
        audio = audio[skip:skip+res_length,:]

    # Clip to [-1,1] if desired
    if clip:
        audio = np.minimum(np.maximum(audio, -1.0), 1.0)

    return audio, audio_sr
Example #3
Source File: audio_signal.py    From nussl with MIT License 6 votes vote down vote up
def sample_rate(self):
        """
        ``int``
            Sample rate associated with this object. If audio was read from a file, the sample
            rate will be set to the sample rate associated with the file. If this object was
            initialized from an array then the sample rate is set upon init. This property is
            read-only. To change the sample rate, use :func:`resample`.

        Notes:
            This property is read-only and cannot be set directly. To change
            the sample rate, use :func:`resample`.

        See Also:
            * :func:`resample` to change the sample rate and resample data in :attr:`sample_rate`.

            * :func:`load_audio_from_array` to read audio from an array and set the sample rate.

            * :var:`nussl.constants.DEFAULT_SAMPLE_RATE` the default sample rate for *nussl*
                if not specified
        """
        # Simply exposes the privately stored value.
        return self._sample_rate 
Example #4
Source File: base.py    From pumpp with ISC License 6 votes vote down vote up
def transform(self, y, sr):
        '''Extract features from an audio signal.

        The signal is first brought to this object's sampling rate
        (``self.sr``) when it differs from `sr`, then handed to
        `transform_audio`; the result is passed through `merge`.

        Parameters
        ----------
        y : np.ndarray
            The audio signal

        sr : number > 0
            The native sampling rate of y

        Returns
        -------
        dict
            Data dictionary containing features extracted from y

        See Also
        --------
        transform_audio
        '''
        needs_resampling = sr != self.sr
        signal = resample(y, sr, self.sr) if needs_resampling else y

        features = self.transform_audio(signal)
        return self.merge([features])
Example #5
Source File: prepare_data.py    From music_transcription_MAPS with MIT License 6 votes vote down vote up
def read_audio(path, target_fs=None):
    """Load an audio file as a single sequence, optionally resampled.

    Args:
      path: string, path of audio.
      target_fs: int or None, resampling rate. When None, or equal to the
        rate of the file, no resampling takes place.

    Returns:
      audio: 1 dimension audio sequence (multi-dimensional reads are averaged
        over axis 1).
      fs: sampling rate of the returned audio.
    """
    samples, native_fs = soundfile.read(path)

    # Collapse multi-channel data by averaging over the second axis.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Guard clause: nothing to resample.
    if target_fs is None or native_fs == target_fs:
        return samples, native_fs

    resampled = librosa.resample(samples, orig_sr=native_fs, target_sr=target_fs)
    return resampled, target_fs
Example #6
Source File: perturb.py    From espnet with Apache License 2.0 5 votes vote down vote up
def __call__(self, x, uttid=None, train=True):
        """Resample ``x`` by a per-call ratio (a no-op outside training).

        Args:
            x: Input signal; it is cast to float32 before resampling.
            uttid: Key into ``self.utt2ratio``; only used when
                ``self.accept_uttid`` is true.
            train: When false, ``x`` is returned untouched.

        Returns:
            The resampled signal. When ``self.keep_length`` is true it is
            centre-cropped or zero-padded along the first axis to ``len(x)``.
        """
        if not train:
            return x

        x = x.astype(numpy.float32)
        if self.accept_uttid:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)

        # Note1: resample requires the sampling-rate of input and output,
        #        but actually only the ratio is used.
        # Note2: the rates are passed by keyword because recent librosa
        #        releases no longer accept them positionally.
        y = librosa.resample(x, orig_sr=ratio, target_sr=1, res_type=self.res_type)

        if self.keep_length:
            diff = abs(len(x) - len(y))
            if len(y) > len(x):
                # Output is longer: drop the surplus evenly from both ends
                y = y[diff // 2 : -((diff + 1) // 2)]
            elif len(y) < len(x):
                # Assume the time-axis is the first: (Time, Channel)
                pad_width = [(diff // 2, (diff + 1) // 2)] + [
                    (0, 0) for _ in range(y.ndim - 1)
                ]
                y = numpy.pad(
                    y, pad_width=pad_width, constant_values=0, mode="constant"
                )
        return y
Example #7
Source File: prepare_data.py    From dcase2017_task4_cvssp with MIT License 5 votes vote down vote up
def read_audio(path, target_fs=None):
    """Read audio from ``path``, average it to one channel and optionally resample.

    Returns a tuple ``(audio, fs)`` where ``fs`` is ``target_fs`` if
    resampling happened and the rate of the file otherwise.
    """
    audio, fs = soundfile.read(path)

    # Multi-dimensional reads are averaged over axis 1.
    audio = np.mean(audio, axis=1) if audio.ndim > 1 else audio

    wants_resample = target_fs is not None and fs != target_fs
    if not wants_resample:
        return audio, fs

    return librosa.resample(audio, orig_sr=fs, target_sr=target_fs), target_fs
    
# Write wav 
Example #8
Source File: extract_audioset_embedding.py    From audioset_classification with MIT License 5 votes vote down vote up
def read_audio(path, target_fs=None):
    """Load ``path`` with soundfile, down-mix by averaging, resample on request.

    Args:
        path: Audio file to read.
        target_fs: Desired sampling rate, or None to keep the file's rate.

    Returns:
        ``(audio, fs)`` -- the samples and the sampling rate they are at.
    """
    waveform, file_fs = soundfile.read(path)

    if waveform.ndim > 1:
        # Average over axis 1 to obtain a single sequence.
        waveform = np.mean(waveform, axis=1)

    if target_fs is None or file_fs == target_fs:
        return waveform, file_fs

    waveform = librosa.resample(waveform, orig_sr=file_fs, target_sr=target_fs)
    return waveform, target_fs
    

### Feature extraction. 
Example #9
Source File: utils.py    From Wave-U-Net-Pytorch with MIT License 5 votes vote down vote up
def resample(audio, orig_sr, new_sr, mode="numpy"):
    """Resample ``audio`` from ``orig_sr`` to ``new_sr`` with librosa.

    Args:
        audio: Signal to resample; a ``torch.Tensor`` is detached, moved to
            the CPU and converted to a NumPy array first.
        orig_sr: Sampling rate of ``audio``.
        new_sr: Desired sampling rate.
        mode: ``"pytorch"`` wraps the result in a ``torch.tensor``; any other
            value returns librosa's output as is.

    Returns:
        ``audio`` itself (unconverted) when both rates are equal, otherwise
        the resampled signal.
    """
    if orig_sr == new_sr:
        return audio

    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().numpy()

    # Rates are passed by keyword: recent librosa releases reject positional
    # rates, and the keyword names are valid on older releases as well.
    out = librosa.resample(audio, orig_sr=orig_sr, target_sr=new_sr, res_type='kaiser_fast')

    if mode == "pytorch":
        out = torch.tensor(out)
    return out
Example #10
Source File: audio.py    From Resemblyzer with Apache License 2.0 5 votes vote down vote up
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int]=None):
    """
    Applies preprocessing operations to a waveform, either on disk or in memory: the waveform
    is resampled to match the data hyperparameters, its volume is normalized and long
    silences are shortened.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored. If None (and a waveform is passed), no resampling is done.
    :return: the preprocessed waveform.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav. Rates are passed by keyword because recent librosa
    # releases no longer accept them positionally.
    if source_sr is not None:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    wav = trim_long_silences(wav)

    return wav
Example #11
Source File: utilities.py    From dcase2018_task1 with MIT License 5 votes vote down vote up
def read_audio(path, target_fs=None):
    """Return ``(audio, fs)`` for the file at ``path``.

    Multi-dimensional data is averaged over axis 1. If ``target_fs`` is given
    and differs from the file's sampling rate, the audio is resampled and the
    returned ``fs`` equals ``target_fs``.
    """
    data, rate = soundfile.read(path)

    is_multichannel = data.ndim > 1
    if is_multichannel:
        data = np.mean(data, axis=1)

    if target_fs is not None and rate != target_fs:
        data, rate = librosa.resample(data, orig_sr=rate, target_sr=target_fs), target_fs

    return data, rate
Example #12
Source File: audio.py    From dcase_util with MIT License 5 votes vote down vote up
def resample(self, target_fs, scale=True, res_type='kaiser_best'):
        """Resample the stored audio data to a new sampling rate, in place.

        Nothing happens when `target_fs` already equals the current rate.

        Parameters
        ----------
        target_fs : int
            Target sampling rate

        scale : bool
            Scale the resampled signal to have approximately equal total energy (see `librosa.core.resample`).
            Default value True

        res_type : str
            Resample type (see `librosa.core.resample`)
            Default value 'kaiser_best'

        Returns
        -------
        self

        """

        if target_fs == self.fs:
            # Already at the requested rate.
            return self

        # The data is stored back in Fortran order before it is resampled.
        self._data = numpy.asfortranarray(self._data)
        resample_args = dict(
            y=self._data,
            orig_sr=self.fs,
            target_sr=target_fs,
            scale=scale,
            res_type=res_type,
        )
        self._data = librosa.resample(**resample_args)
        self.fs = target_fs

        return self
Example #13
Source File: Utils.py    From Wave-U-Net with MIT License 5 votes vote down vote up
def resample(audio, orig_sr, new_sr):
    """Resample ``audio`` from ``orig_sr`` to ``new_sr``.

    The input is transposed before and after the librosa call, so the result
    has the same axis order as ``audio``.
    """
    # Rates are passed by keyword: recent librosa releases reject positional
    # rates, and the keyword names are valid on older releases as well.
    resampled = librosa.resample(audio.T, orig_sr=orig_sr, target_sr=new_sr)
    return resampled.T
Example #14
Source File: base.py    From Sound-of-Pixels with MIT License 5 votes vote down vote up
def _load_audio(self, path, center_timestamp, nearest_resample=False):
        """Load a fixed-length audio window centred on ``center_timestamp``.

        Args:
            path: Audio file to load. A path ending in ``'silent'`` yields an
                all-zero window without touching the disk.
            center_timestamp: Centre of the window, multiplied by
                ``self.audRate`` to obtain a sample index.
            nearest_resample: If true, down-sample by plain decimation
                (every ``rate // self.audRate``-th sample) instead of librosa.

        Returns:
            float32 array of length ``self.audLen`` with values limited to
            [-1, 1]. For the ``'train'`` split a random gain in [0.5, 1.5) is
            applied before limiting.
        """
        audio = np.zeros(self.audLen, dtype=np.float32)

        # silent
        if path.endswith('silent'):
            return audio

        # load audio
        audio_raw, rate = self._load_audio_file(path)

        # repeat if audio is too short
        if audio_raw.shape[0] < rate * self.audSec:
            n = int(rate * self.audSec / audio_raw.shape[0]) + 1
            audio_raw = np.tile(audio_raw, n)

        # resample
        # NOTE(review): only down-sampling is handled; audio with
        # rate < self.audRate is used as is -- confirm this is intended.
        if rate > self.audRate:
            if nearest_resample:
                audio_raw = audio_raw[::rate//self.audRate]
            else:
                # Rates are passed by keyword; recent librosa releases
                # reject positional rates.
                audio_raw = librosa.resample(audio_raw, orig_sr=rate, target_sr=self.audRate)

        # crop N seconds
        len_raw = audio_raw.shape[0]
        half = self.audLen // 2
        center = int(center_timestamp * self.audRate)
        start = max(0, center - half)
        end = min(len_raw, center + half)

        # Place the excerpt so that `center` lands in the middle of the window;
        # whatever falls outside the source stays zero.
        audio[half-(center-start): half+(end-center)] = \
            audio_raw[start:end]

        # randomize volume
        if self.split == 'train':
            scale = random.random() + 0.5     # 0.5-1.5
            audio *= scale
        audio[audio > 1.] = 1.
        audio[audio < -1.] = -1.

        return audio
Example #15
Source File: 02-compute-mel-specs.py    From kaggle-freesound-audio-tagging with MIT License 5 votes vote down vote up
def compute_melspec(filename, indir, outdir):
    """Convert one stored waveform into a log-mel spectrogram file.

    Loads ``indir + filename + '.npy'``, resamples it from 44100 to 22050,
    computes a 64-band mel spectrogram, converts it with ``power_to_db`` and
    saves the result to ``outdir + filename + '.npy'``.

    Args:
        filename: File name without the ``.npy`` extension.
        indir: Prefix of the input path (concatenated as is, so it must end
            with a path separator).
        outdir: Prefix of the output path (concatenated as is).
    """
    wav = np.load(indir + filename + '.npy')
    # The signal and the rates are passed by keyword: recent librosa releases
    # no longer accept them positionally.
    wav = librosa.resample(wav, orig_sr=44100, target_sr=22050)
    melspec = librosa.feature.melspectrogram(y=wav,
                                             sr=22050,
                                             n_fft=1764,
                                             hop_length=220,
                                             n_mels=64)
    logmel = librosa.core.power_to_db(melspec)
    np.save(outdir + filename + '.npy', logmel)
Example #16
Source File: feature_description.py    From Audio-Vision with MIT License 5 votes vote down vote up
def mel(features,path,dataset=None):
    
    """
    This function extracts mel-spectrogram from audio.
    Make sure, you pass a dictionary containing all attributes
    and a path to audio.

    Each entry of ``features`` is indexed with ``[0]``; the keys read are
    'fs', 'n_mels', 'fmin', 'fmax', 'mono', 'hamming_window', 'noverlap',
    'detrend', 'return_onesided' and 'mode'.

    Raises an ``Exception`` when the sampling rate of the audio differs from
    ``features['fs'][0]``. Returns the normalized mel features.
    """
    fsx=features['fs'][0]
    n_mels=features['n_mels'][0]
    fmin=features['fmin'][0]
    fmax=features['fmax'][0]
    mono=features['mono'][0]
    hamming_window=features['hamming_window'][0]
    noverlap=features['noverlap'][0]
    detrend=features['detrend'][0]
    return_onesided=features['return_onesided'][0]
    mode=features['mode'][0]
    wav, fs = read_audio('librosa',path,dataset)
    wav=convert_mono(wav,mono)
    if fs != fsx:
        raise Exception("Assertion Error. Sampling rate Found {} Expected {}".format(fs,fsx))
    ham_win = np.hamming(hamming_window)
    # Use the public scipy.signal.spectrogram entry point rather than the
    # private `signal.spectral` namespace; only the spectrogram itself is used.
    _, _, X = signal.spectrogram(wav,fs, window=ham_win, nperseg=hamming_window, noverlap=noverlap, detrend=detrend, return_onesided=return_onesided, mode=mode )
    X = X.T

    # define global melW, avoid init melW every time, to speed up.
    # NOTE(review): the filterbank is built from the parameters of the first
    # call only; later calls with different parameters reuse it.
    if globals().get('melW') is None:
        global melW
        # `sr` is passed by keyword; recent librosa releases reject it positionally.
        melW = librosa.filters.mel( sr=fs, n_fft=hamming_window, n_mels=n_mels, fmin=fmin, fmax=fmax )
        melW /= np.max(melW, axis=-1)[:,None]
    
    X = np.dot( X, melW.T )
    X=feature_normalize(X)
    return X
Example #17
Source File: utilities.py    From dcase2019_task2 with MIT License 5 votes vote down vote up
def read_audio(audio_path, target_fs=None):
    """Read ``audio_path`` and return ``(audio, fs)``.

    Data with more than one dimension is averaged over axis 1. When
    ``target_fs`` is set and differs from the rate of the file, the audio is
    resampled to ``target_fs`` and that rate is returned.
    """
    signal_data, signal_fs = soundfile.read(audio_path)

    if signal_data.ndim > 1:
        signal_data = np.mean(signal_data, axis=1)

    # Early exit when the caller did not ask for a different rate.
    if target_fs is None or signal_fs == target_fs:
        return signal_data, signal_fs

    converted = librosa.resample(signal_data, orig_sr=signal_fs, target_sr=target_fs)
    return converted, target_fs
Example #18
Source File: audio_signal.py    From nussl with MIT License 5 votes vote down vote up
def resample(self, new_sample_rate, **kwargs):
        """
        Resample the data in :attr:`audio_data` to ``new_sample_rate``, channel by channel.
        If ``new_sample_rate`` is the same as :attr:`sample_rate`, a warning is issued and
        nothing else happens.

        Args:
            new_sample_rate (int): The new sample rate of :attr:`audio_data`.
            kwargs: Keyword arguments to librosa.resample.

        """

        if new_sample_rate == self.sample_rate:
            warnings.warn('Cannot resample to the same sample rate.')
            return

        # Rates are passed by keyword: recent librosa releases reject
        # positional rates, and the keyword names work on older releases too.
        resampled_signal = [
            librosa.resample(
                channel, orig_sr=self.sample_rate, target_sr=new_sample_rate, **kwargs)
            for channel in self.get_channels()
        ]

        self.audio_data = np.array(resampled_signal)
        self.original_signal_length = self.signal_length
        self._sample_rate = new_sample_rate

    ##################################################
    #              Channel Utilities
    ################################################## 
Example #19
Source File: audio.py    From signaltrain with GNU General Public License v3.0 4 votes vote down vote up
def read_audio_file(filename, sr=44100, mono=True, norm=False, device='cpu', dtype=np.float32, warn=True, fix_and_overwrite=False):
    """
    Generic wrapper for reading an audio file.
    Different libraries offer different speeds for this, so this routine is the
    'catch-all' for whatever read routine happens to work best

    Tries a fast method via scipy first, reverts to slower librosa when scipy
    raises a ``WavFileWarning``.

    Args:
        filename: Path of the audio file.
        sr: Requested sample rate; audio at another rate is resampled.
        mono: If true, keep a single channel (scipy path: the first channel).
        norm: If true, divide by the absolute maximum (when it is non-zero).
        device: Unused by this function.
        dtype: dtype of the returned array.
        warn: If true, print a note when falling back to librosa.
        fix_and_overwrite: If true, write the signal back to ``filename``
            whenever it had to be resampled or read through librosa.

    Returns:
        Tuple ``(signal, sr)``.
    """
    # first try to read via scipy, because it's fast
    scipy_ok = False
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("error")    # scipy throws warnings which should be errors
        try:
            read_sr, signal = wavfile.read(filename)
            scipy_ok = True
        except wavfile.WavFileWarning:
            if warn:
                print("read_audio_file: Warning raised by scipy. ",end="")

    might_want_overwrite = False
    if scipy_ok:
        if mono and (len(signal.shape) > 1):     # convert to mono
            signal = signal[:,0]

        # Check the array dtype rather than the type of the first element:
        # for multi-channel data the first element is a row (never an int16
        # scalar), and an empty signal has no first element at all.
        if signal.dtype == np.int16:             # convert from ints to floats if necessary
            signal = np.array(signal/32767.0, dtype=dtype)   # change from [-32767..32767] to [-1..1]

        if read_sr != int(sr):
            print(f"read_audio_file: Got sample rate of {read_sr} Hz instead of {sr} Hz requested. Resampling.")
            # Rates are passed by keyword; recent librosa releases reject positional rates.
            signal = librosa.resample(signal, orig_sr=read_sr*1.0, target_sr=sr*1.0, res_type='kaiser_fast')
            might_want_overwrite = True
    else:                                         # try librosa; it's slower but general
        if warn:
            print("Trying librosa.")
        signal, read_sr = librosa.core.load(filename, mono=mono, sr=sr, res_type='kaiser_fast')
        might_want_overwrite = True

    if fix_and_overwrite and might_want_overwrite:
        print(f"    Overwriting {filename} (so we don't have to use process as much again)")
        write_audio_file(filename, signal, sr)

    if signal.dtype != dtype:
        signal = signal.astype(dtype, copy=False)

    if norm:
        absmax = np.max(np.abs(signal))
        signal = signal/absmax if absmax > 0 else signal

    return signal, sr
Example #20
Source File: audio_io.py    From synvae with MIT License 4 votes vote down vote up
def wav_data_to_samples(wav_data, sample_rate):
  """Read PCM-formatted WAV data and return a NumPy array of samples.

  Uses scipy to read and librosa to process WAV data. Two-channel audio is
  converted to mono.

  Args:
    wav_data: WAV audio data to read.
    sample_rate: The number of samples per second at which the audio will be
        returned. Resampling will be performed if necessary.

  Returns:
    A numpy array of audio samples, single-channel (mono) and sampled at the
    specified rate, in float32 format.

  Raises:
    AudioIOReadError: If scipy is unable to read the WAV data.
    AudioIOError: If the data is neither int16 nor float32, or if audio
        processing fails.
  """
  try:
    # Read the wav file; conversion of rate and channels happens below.
    native_sr, y = scipy.io.wavfile.read(six.BytesIO(wav_data))
  except Exception as e:  # pylint: disable=broad-except
    raise AudioIOReadError(e)

  if y.dtype == np.int16:
    # Convert to float32.
    y = int16_samples_to_float32(y)
  elif y.dtype == np.float32:
    # Already float32.
    pass
  else:
    raise AudioIOError(
        'WAV file not 16-bit or 32-bit float PCM, unsupported')
  try:
    # Convert to mono and the desired sample rate.
    if y.ndim == 2 and y.shape[1] == 2:
      y = y.T
      y = librosa.to_mono(y)
    if native_sr != sample_rate:
      # Rates are passed by keyword: recent librosa releases reject
      # positional rates, and that TypeError would be re-raised below as an
      # AudioIOError for every input that needs resampling.
      y = librosa.resample(y, orig_sr=native_sr, target_sr=sample_rate)
  except Exception as e:  # pylint: disable=broad-except
    raise AudioIOError(e)
  return y
Example #21
Source File: prepare_spectrograms.py    From dcase_task2 with MIT License 4 votes vote down vote up
def process(self, file_path, **kwargs):
        """Compute one mel spectrogram per channel of an audio file.

        The file is loaded with librosa at 32 kHz (as mono with the settings
        hard-coded below). For each channel the STFT magnitude is either
        log-compressed or perceptually weighted, then projected onto a mel
        filterbank.

        Args:
            file_path: Audio file to load.
            **kwargs: Accepted but not used.

        Returns:
            Array stacking the spectrogram of every channel.
        """
        # analysis settings
        n_fft = 1024
        hop_length = 192
        n_mels = 128
        fmax = None
        mono = True
        log_spec = False

        sig, sr = librosa.load(file_path, sr=32000, mono=mono)
        if mono:
            # add a leading channel axis so the loop below sees one channel
            sig = sig[np.newaxis]

        spectrograms = []
        for channel in sig:
            # magnitude of the short-time Fourier transform
            magnitude = np.abs(
                librosa.stft(channel, n_fft=n_fft, hop_length=hop_length, win_length=None,
                             window='hann', center=True, pad_mode='reflect')
            )

            # spectrogram weighting
            if log_spec:
                weighted = np.log10(magnitude + 1)
            else:
                freqs = librosa.core.fft_frequencies(sr=sr, n_fft=n_fft)
                weighted = librosa.perceptual_weighting(magnitude**2, freqs, ref=1.0, amin=1e-10, top_db=99.0)

            # apply mel filterbank and collect the result
            mel_spec = librosa.feature.melspectrogram(S=weighted, sr=sr, n_mels=n_mels, fmax=fmax)
            spectrograms.append(np.asarray(mel_spec))

        return np.asarray(spectrograms)
Example #22
Source File: background.py    From muda with ISC License 4 votes vote down vote up
def slice_clip(filename, start, stop, n_samples, sr, mono=True):
    """Slice a fragment of audio from a file.

    This uses pysoundfile to efficiently seek without
    loading the entire stream.

    Parameters
    ----------
    filename : str
        Path to the input file

    start : int
        The sample index of `filename` at which the audio fragment should start

    stop : int
        The sample index of `filename` at which the audio fragment should stop (e.g. y = audio[start:stop])

    n_samples : int > 0
        The number of samples to load

    sr : int > 0
        The target sampling rate

    mono : bool
        Ensure monophonic audio

    Returns
    -------
    y : np.ndarray [shape=(n_samples,)]
        A fragment of audio sampled from `filename`

    Notes
    -----
    This function performs no explicit length check of its own; the
    fragment is forced to `n_samples` by `librosa.util.fix_length` after
    resampling.

    """

    with psf.SoundFile(str(filename), mode="r") as soundf:
        n_target = stop - start

        soundf.seek(start)

        y = soundf.read(n_target).T

        if mono:
            y = librosa.to_mono(y)

        # Resample from the file's rate to the target rate. The rates are
        # passed by keyword; recent librosa releases reject positional rates.
        y = librosa.resample(y, orig_sr=soundf.samplerate, target_sr=sr)

        # Clip to the target length exactly (`size` by keyword for the same
        # reason as above).
        y = librosa.util.fix_length(y, size=n_samples)

        return y
Example #23
Source File: music_processor.py    From aurora-sdk-mac with Apache License 2.0 4 votes vote down vote up
def process_music_data(data_in, is_fft, is_mel, n_out_bins, n_fft, n_mel, is_energy, is_visual):
    """Turn a raw float32 audio buffer into energy and spectrum outputs.

    Args:
        data_in: Raw buffer of float32 samples (length is len(data_in)/4).
        is_fft: Produce FFT-bin output.
        is_mel: Produce mel-bin output (used when ``is_fft`` is false).
        n_out_bins: Number of output bins.
        n_fft: FFT size; also used as the hop length.
        n_mel: Number of mel bands.
        is_energy: Compute the summed energy of the buffer.
        is_visual: Pass the samples to ``visualizer``.

    Returns:
        Tuple ``(fft_output, energy_output)``: a uint8 array of bins (zeros
        when neither FFT nor mel is requested) and a uint16 energy value
        (a two-element zero array when energy is not requested).
    """
    # np.frombuffer replaces the deprecated binary mode of np.fromstring and
    # the removed 'Float32' dtype alias; the copy keeps the array writable
    # and independent of `data_in`, as before.
    data_np = np.frombuffer(data_in, dtype=np.float32).copy()

    # visualizer
    if is_visual:
        visualizer(data_np)

    # energy
    if is_energy:
        energy = np.abs(data_np) ** 2
        energy = energy.sum()
        energy *= 2**5
        energy_output = energy.astype(np.uint16)
    else:
        energy_output = np.zeros(2).astype(np.uint16)

    # fft or mel
    if is_fft or is_mel:
        global sample_rate

        # down-sample by 4, with filtering, energy not scaled
        # (rates by keyword: recent librosa releases reject positional rates)
        data_np = librosa.resample(data_np,
                                   orig_sr=sample_rate,
                                   target_sr=sample_rate/4,
                                   res_type='kaiser_fast')

        # short time fft over n_fft samples, without overlap
        fft_data = librosa.stft(data_np, n_fft=n_fft,
                                hop_length=n_fft,
                                center=False)

        # calculate FFT or Mel
        if is_fft:
            fft_data_mag = np.abs(fft_data[0:n_fft // 2]) ** 2
            fft_data_mag *= 2**3
            fft_output = get_output_fft_bins(fft_data_mag, n_out_bins)
        else:
            fft_data_mag = np.abs(fft_data)**2
            fft_data_mag *= 2**2
            mel_data = librosa.feature.melspectrogram(S=fft_data_mag, sr=sample_rate / 4, n_mels=n_mel)
            fft_output = get_output_fft_bins(mel_data, n_out_bins)

        # output uint8_t
        fft_output = fft_output.astype(np.uint8)

    else:
        fft_output = np.zeros(n_out_bins).astype(np.uint8)

    return fft_output, energy_output
Example #24
Source File: Evaluate.py    From vimss with GNU General Public License v3.0 4 votes vote down vote up
def predict_track(model_config, sess, mix_audio, mix_sr, sep_input_shape, sep_output_shape, separator_sources, mix_context):
    '''
    Outputs source estimates for a given input mixture signal mix_audio [n_frames, n_channels] and a given Tensorflow session and placeholders belonging to the prediction network.
    It iterates through the track, collecting segment-wise predictions to form the output.
    :param model_config: Model configuration dictionary
    :param sess: Tensorflow session used to run the network inference
    :param mix_audio: [n_frames, n_channels] audio signal (numpy array). Can have higher sampling rate or channels than the model supports, will be downsampled correspondingly.
    :param mix_sr: Sampling rate of mix_audio
    :param sep_input_shape: Input shape of separator ([batch_size, num_samples, num_channels])
    :param sep_output_shape: Output shape of separator ([batch_size, num_samples, num_channels])
    :param separator_sources: List of Tensorflow tensors that represent the output of the separator network
    :param mix_context: Input tensor of the network
    :return: List with one [n_frames, n_channels] float32 prediction array per source
    '''
    # Load mixture, convert to mono and downsample then
    assert(len(mix_audio.shape) == 2)
    if model_config["mono_downmix"]:
        mix_audio = np.mean(mix_audio, axis=1, keepdims=True)
    else:
        if mix_audio.shape[1] == 1:# Duplicate channels if input is mono but model is stereo
            mix_audio = np.tile(mix_audio, [1, 2])
    # Rates are passed by keyword; recent librosa releases reject positional rates.
    mix_audio = librosa.resample(mix_audio.T, orig_sr=mix_sr, target_sr=model_config["expected_sr"], res_type="kaiser_fast").T

    # Preallocate source predictions (same shape as input mixture)
    source_time_frames = mix_audio.shape[0]
    source_preds = [np.zeros(mix_audio.shape, np.float32) for _ in range(model_config["num_sources"])]

    input_time_frames = sep_input_shape[1]
    output_time_frames = sep_output_shape[1]

    # Pad mixture across time at beginning and end so that neural network can make prediction at the beginning and end of signal.
    # Floor division keeps the pad width an integer: true division yields a float on Python 3, which np.pad rejects.
    pad_time_frames = (input_time_frames - output_time_frames) // 2
    mix_audio_padded = np.pad(mix_audio, [(pad_time_frames, pad_time_frames), (0,0)], mode="constant", constant_values=0.0)

    # Iterate over the mixture in output-sized steps, fetch network prediction
    for source_pos in range(0, source_time_frames, output_time_frames):
        # If this output patch would reach over the end of the source signal, set it so we predict the very end of the output, then stop
        if source_pos + output_time_frames > source_time_frames:
            source_pos = source_time_frames - output_time_frames

        # Prepare mixture excerpt by selecting time interval
        mix_part = mix_audio_padded[source_pos:source_pos + input_time_frames,:]
        mix_part = np.expand_dims(mix_part, axis=0)

        source_parts = sess.run(separator_sources, feed_dict={mix_context: mix_part})

        # Save predictions (first batch entry of every source)
        for i in range(model_config["num_sources"]):
            source_preds[i][source_pos:source_pos + output_time_frames] = source_parts[i][0, :, :]

    return source_preds
Example #25
Source File: audio.py    From amen with BSD 2-Clause "Simplified" License 4 votes vote down vote up
def __init__(
        self,
        file_path=None,
        raw_samples=None,
        convert_to_mono=False,
        sample_rate=44100,
        analysis_sample_rate=22050,
    ):
        """
        Audio constructor.
        Opens a file path, loads the audio with librosa, and prepares the features

        Parameters
        ----------

        file_path: string
            path to the audio file to load

        raw_samples: np.array
            samples to use for audio output; only used when no file_path is given

        convert_to_mono: boolean
            (optional) converts the file to mono on loading

        sample_rate: number > 0 [scalar]
            (optional) sample rate to pass to librosa.

        analysis_sample_rate: number > 0 [scalar]
            (optional) sample rate of the mono signal stored in
            ``analysis_samples``.

        Raises
        ------
        ValueError
            If neither file_path nor raw_samples is provided.

        Returns
        ------
        An Audio object
        """

        if file_path:
            y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
        elif raw_samples is not None:
            # This assumes that we're passing in raw_samples
            # directly from another Audio's raw_samples.
            y = raw_samples
            sr = sample_rate
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError("Either file_path or raw_samples must be provided.")

        self.file_path = file_path
        self.sample_rate = float(sr)
        self.analysis_sample_rate = float(analysis_sample_rate)
        self.num_channels = y.ndim
        self.duration = librosa.get_duration(y=y, sr=sr)

        # Rates are passed by keyword; recent librosa releases reject
        # positional rates.
        self.analysis_samples = librosa.resample(
            librosa.to_mono(y),
            orig_sr=sr,
            target_sr=self.analysis_sample_rate,
            res_type='kaiser_best',
        )
        self.raw_samples = np.atleast_2d(y)

        self.zero_indexes = self._create_zero_indexes()
        self.features = self._create_features()
        self.timings = self._create_timings()