Python hparams.hparams.fft_size() Examples

The following are 10 code examples of hparams.hparams.fft_size, accessed as an attribute of the shared hparams object. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module hparams.hparams, or try the search function.
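In these projects, hparams is a module that exposes a shared hparams object whose attributes hold the audio-processing settings. The stand-in below is a minimal sketch so the snippets read on their own; the attribute names come from the examples on this page, while the values are illustrative defaults, not any listed project's actual configuration.

# hparams.py -- minimal, illustrative stand-in for the hparams module below.
class _HParams:
    fft_size = 1024       # FFT window size (n_fft) for STFT / mel computation
    hop_size = 256        # hop length, in samples, between analysis frames
    sample_rate = 22050   # audio sample rate in Hz
    num_mels = 80         # number of mel filterbank channels
    fmin = 125            # lowest mel filter frequency (Hz)
    fmax = 7600           # highest mel filter frequency (Hz)

hparams = _HParams()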
Example #1
Source File: audio.py    From vae_tacotron2 with MIT License
def _stft(y):
	# Short-time Fourier transform with the configured FFT size and hop length.
	return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size())
Example #2
Source File: audio.py    From representation_mixing with BSD 3-Clause "New" or "Revised" License
def _build_mel_basis():
    # The highest mel band must not exceed the Nyquist frequency.
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #3
Source File: audio.py    From representation_mixing with BSD 3-Clause "New" or "Revised" License
def _lws_processor():
    # lws computes the STFT and reconstructs phase; "speech" mode selects
    # analysis settings tuned for speech signals.
    return lws.lws(hparams.fft_size, get_hop_size(), mode="speech")
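Together, the helpers above form the usual magnitude-spectrogram pipeline. The sketch below shows one way they compose; _build_mel_basis and _lws_processor are the functions defined in Examples #2 and #3, and the stft / run_lws / istft calls follow the lws package's documented usage.

import numpy as np

y = np.random.randn(hparams.sample_rate)  # one second of synthetic audio

processor = _lws_processor()
D = processor.stft(y)                     # complex spectrogram, (frames, 1 + fft_size // 2)
S = np.abs(D)                             # magnitude spectrogram
mel = np.dot(S, _build_mel_basis().T)     # project frames onto num_mels mel bands

# lws re-estimates phase from the magnitudes, allowing inversion back to audio:
y_hat = processor.istft(processor.run_lws(S))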
Example #4
Source File: audio.py    From gmvae_tacotron with MIT License
def _build_mel_basis():
	assert hparams.fmax <= hparams.sample_rate // 2
	return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels,
							   fmin=hparams.fmin, fmax=hparams.fmax) 
Example #5
Source File: audio.py    From gmvae_tacotron with MIT License
def _stft(y):
	return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size()) 
Example #6
Source File: audio.py    From WaveRNN-Pytorch with MIT License
def _build_mel_basis():
    # fmax may be None, in which case librosa defaults it to sample_rate / 2.
    if hparams.fmax is not None:
        assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #7
Source File: audio.py    From WaveRNN-Pytorch with MIT License
def _lws_processor():
    return lws.lws(hparams.fft_size, hparams.hop_size, mode="speech")
Example #8
Source File: audio.py    From vae_tacotron2 with MIT License
def _build_mel_basis():
	assert hparams.fmax <= hparams.sample_rate // 2
	return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels,
							   fmin=hparams.fmin, fmax=hparams.fmax) 
Example #9
Source File: audio.py    From Griffin_lim with MIT License
def _stft(y):
    return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size()) 
Example #10
Source File: ljspeech.py    From representation_mixing with BSD 3-Clause "New" or "Revised" License
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the audio and mel-spectrogram arrays to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text) 
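The padding and trimming above enforce a simple alignment invariant: for N mel frames, the saved audio is exactly N * hop_size samples long, so a vocoder can upsample mel frames to waveform samples with an integer stride (for example, transposed convolutions with stride hop_size). A quick check of the arithmetic, with hop_size = 256 assumed for illustration:

N = 500                   # mel frames produced by melspectrogram()
hop_size = 256            # assumed hop size; the real value comes from hparams
timesteps = N * hop_size  # length the padded audio is trimmed to above
assert timesteps == 128000
assert timesteps % hop_size == 0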