Python hparams.hparams.sample_rate() Examples

The following are 30 code examples of hparams.hparams.sample_rate(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module hparams.hparams, or try the search function.
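All of these snippets read sample_rate (and a handful of related fields such as frame_shift_ms, hop_size, num_mels) as plain attributes of a module-level hparams object. If you want to run an excerpt in isolation, a minimal stand-in can be sketched as below; the field values are illustrative assumptions, not any project's defaults:

# Hypothetical hparams stand-in for experimenting with the snippets on this page.
from types import SimpleNamespace

hparams = SimpleNamespace(
    sample_rate=22050,     # audio sample rate in Hz
    frame_shift_ms=12.5,   # hop between analysis frames, in milliseconds
    frame_length_ms=50,    # analysis window length, in milliseconds
    hop_size=None,         # if None, derived from frame_shift_ms (see get_hop_size below)
    fft_size=1024,
    num_freq=513,          # (num_freq - 1) * 2 == fft_size
    num_mels=80,
    fmin=0,
    fmax=8000,
)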
Example #1
Source File: audio.py    From Griffin_lim with MIT License 6 votes
def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size 
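A quick sanity check of the fallback branch, with illustrative numbers rather than any project's settings: a 12.5 ms frame shift at 22050 Hz corresponds to a 275-sample hop.

frame_shift_ms = 12.5
sample_rate = 22050
hop_size = int(frame_shift_ms / 1000 * sample_rate)  # int(275.625) -> 275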
Example #2
Source File: LWS.py    From Griffin_lim with MIT License 6 votes
def main():
    data_folder = "data"
    wavs = [os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder) if file.endswith(".wav")]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs]

    lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift
    i = 0
    for x in wavs:
        X = lws_processor.stft(x)  # where x is a single-channel waveform
        X0 = np.abs(X)  # Magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))
        # reconstruction from magnitude (in general, one can reconstruct from an initial complex spectrogram)
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))
        print(X1.shape)
        wav = lws_processor.istft(X1).astype(np.float32)

        audio.save_wav(wav, outputs_lws[i])
        i += 1 
Example #3
Source File: train_tacotron.py    From Tacotron-Wavenet-Vocoder-Korean with MIT License 6 votes
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True), isKorean=True)
    else:
        log('Training non-Korean: do not use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False), isKorean=False)
Example #4
Source File: blizzard.py    From libfaceid with MIT License 6 votes
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text) 
Example #5
Source File: train.py    From WaveRNN-Pytorch with MIT License 6 votes
def evaluate_model(model, data_loader, checkpoint_dir, limit_eval_to=5):
    """evaluate model and save generated wav and plot

    """
    test_path = data_loader.dataset.test_path
    test_files = os.listdir(test_path)
    counter = 0
    output_dir = os.path.join(checkpoint_dir,'eval')
    for f in test_files:
        if f[-7:] == "mel.npy":
            mel = np.load(os.path.join(test_path,f))
            wav = model.generate(mel)
            # save wav
            wav_path = os.path.join(output_dir,"checkpoint_step{:09d}_wav_{}.wav".format(global_step,counter))
            librosa.output.write_wav(wav_path, wav, sr=hp.sample_rate)
            # save wav plot
            fig_path = os.path.join(output_dir,"checkpoint_step{:09d}_wav_{}.png".format(global_step,counter))
            fig = plt.plot(wav.reshape(-1))
            plt.savefig(fig_path)
            # clear the figure to avoid drawing onto the same plot
            plt.clf()
            counter += 1
        # stop evaluation early via limit_eval_to
        if counter >= limit_eval_to:
            break 
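Note that librosa.output.write_wav was removed in librosa 0.8, so this snippet only runs against older librosa versions. A drop-in replacement using the soundfile package (an assumption, not what the project ships) would be:

import soundfile as sf

# equivalent to librosa.output.write_wav(wav_path, wav, sr=hp.sample_rate)
sf.write(wav_path, wav, hp.sample_rate)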
Example #6
Source File: generate.py    From Tacotron-Wavenet-Vocoder-Korean with MIT License 6 votes
def create_seed(filename,sample_rate,quantization_channels,window_size,scalar_input):
    # Use only the beginning of the seed audio.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else: return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)

        # If the seed is shorter than window_size it is returned as-is -- shouldn't it at least be padded?
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size), lambda: tf.size(quantized), lambda: tf.constant(window_size))

        return quantized[:cut_index]
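The translated comment above asks whether a short seed should be padded instead of being returned as-is. A hedged sketch for the scalar-input branch, where zero samples are silence (zero-padding is an assumption, not the project's behaviour):

import numpy as np

# Hypothetical zero-padding of a short scalar-input seed (not in the original code).
if len(seed_audio) < window_size:
    seed_audio = np.pad(seed_audio, (0, window_size - len(seed_audio)), mode='constant')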
Example #7
Source File: synthesis_mel.py    From style-token_tacotron2 with MIT License 6 votes
def mel_synthesis(out_dir='wav_griffi_syn',in_dir='mel'):
    os.makedirs(out_dir, exist_ok=True)

    #mel_file = os.path.join(mel_folder, mel_file)
    mel_filenames=[x.split('.')[0] for x in os.listdir(in_dir)]
    start_time=time.time()
    for mel_file in mel_filenames:
        try:
            print('process {}'.format(mel_file))
            mel_file_path = os.path.join('training_data/mels', 'mel-{}.wav.npy'.format(mel_file))
            mel_spectro = np.load(mel_file_path)
            wav = inv_mel_spectrogram(mel_spectro.T, hparams)
            # save the wav under test_<folder>_<file>
            save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(
                mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
                     sr=hparams.sample_rate)
        except Exception as e:
            print('{} error: {}'.format(mel_file, e))

    print('griffin-lim :{}'.format(time.time()-start_time)) 
Example #8
Source File: generate.py    From Tacotron2-Wavenet-Korean-TTS with MIT License 6 votes
def create_seed(filename,sample_rate,quantization_channels,window_size,scalar_input):
    # Use only the beginning of the seed audio.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else: return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)

        # If the seed is shorter than window_size it is returned as-is -- shouldn't it at least be padded?
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size), lambda: tf.size(quantized), lambda: tf.constant(window_size))

        return quantized[:cut_index]
Example #9
Source File: blizzard.py    From vae_tacotron with MIT License 6 votes
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text) 
Example #10
Source File: train_tacotron2.py    From Tacotron2-Wavenet-Korean-TTS with MIT License 6 votes
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True), isKorean=True)
    else:
        log('Training non-Korean: do not use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False), isKorean=False)
Example #11
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def save_wav(wav, path):
	wav *= 32767 / max(0.01, np.max(np.abs(wav))) 
	#proposed by @dsmiller
	wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example #12
Source File: preprocess.py    From Tacotron2-Wavenet-Korean-TTS with MIT License 5 votes
def write_metadata(metadata, out_dir):
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sr = hparams.sample_rate
    hours = timesteps / sr / 3600
    print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(len(metadata), mel_frames, timesteps, hours))
    print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
    print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
    print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 
Example #13
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def get_hop_size():
	hop_size = hparams.hop_size
	if hop_size is None:
		assert hparams.frame_shift_ms is not None
		hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
	return hop_size 
Example #14
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def _build_mel_basis():
	assert hparams.fmax <= hparams.sample_rate // 2
	return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels,
							   fmin=hparams.fmin, fmax=hparams.fmax) 
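In librosa 0.10 and later, librosa.filters.mel only accepts keyword arguments, so the positional call above raises a TypeError. An equivalent call for newer librosa versions (same hparams fields assumed) is:

mel_basis = librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.fft_size,
                                n_mels=hparams.num_mels,
                                fmin=hparams.fmin, fmax=hparams.fmax)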
Example #15
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def load_wav(path):
	return librosa.core.load(path, sr=hparams.sample_rate)[0] 
Example #16
Source File: wavenet_preprocess.py    From style-token_tacotron2 with MIT License 5 votes
def write_metadata(metadata, out_dir):
	with open(os.path.join(out_dir, 'map.txt'), 'w', encoding='utf-8') as f:
		for m in metadata:
			f.write('|'.join([str(x) for x in m]) + '\n')
	mel_frames = sum([int(m[5]) for m in metadata])
	timesteps = sum([int(m[4]) for m in metadata])
	sr = hparams.sample_rate
	hours = timesteps / sr / 3600
	print('Write {} utterances, {} audio timesteps, ({:.2f} hours)'.format(
		len(metadata), timesteps, hours))
	print('Max mel frames length: {}'.format(max(int(m[5]) for m in metadata)))
	print('Max audio timesteps length: {}'.format(max(m[4] for m in metadata))) 
Example #17
Source File: check_slience_trim.py    From style-token_tacotron2 with MIT License 5 votes
def get_some_inversed_samples(training_data_path='training_data',output_inversed_path='tmp_inverse_wav_out',n_samples=5):
    mel_files=glob.glob(os.path.join(training_data_path,'mels','*.npy'))
    assert len(mel_files) >= n_samples, 'not enough .npy files to invert...'
    if os.path.exists(output_inversed_path):
        shutil.rmtree(output_inversed_path)
    os.makedirs(output_inversed_path,exist_ok=False)
    random.seed(2018)
    mel_files=random.sample(mel_files,n_samples)
    for mel_file in mel_files:
        mel_file_basename=os.path.basename(mel_file)
        mel_spectro=np.load(mel_file)
        wav=inv_mel_spectrogram(mel_spectro.T, hparams)
        save_wav(wav, os.path.join(output_inversed_path, '{}.wav'.format(mel_file_basename)),
                 sr=hparams.sample_rate)
Example #18
Source File: test_wavenet_feeder.py    From style-token_tacotron2 with MIT License 5 votes
def _limit_time(hparams):
	'''Limit time resolution to save GPU memory.
	'''
	if hparams.max_time_sec is not None:
		return int(hparams.max_time_sec * hparams.sample_rate)
	elif hparams.max_time_steps is not None:
		return hparams.max_time_steps
	else:
		return None 
Example #19
Source File: preprocess.py    From style-token_tacotron2 with MIT License 5 votes
def write_metadata(metadata, out_dir):
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sr = hparams.sample_rate
    hours = timesteps / sr / 3600
    print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
        len(metadata), mel_frames, timesteps, hours))
    print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
    print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
    print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 
Example #20
Source File: audio.py    From tacotron with MIT License 5 votes
def load_wav(path):
  return librosa.core.load(path, sr=hparams.sample_rate)[0] 
Example #21
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def load_wav(path):
	return librosa.core.load(path, sr=hparams.sample_rate)[0] 
Example #22
Source File: preprocess.py    From gmvae_tacotron with MIT License 5 votes
def write_metadata(metadata, out_dir):
	with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
		for m in metadata:
			f.write('|'.join([str(x) for x in m]) + '\n')
	mel_frames = sum([int(m[4]) for m in metadata])
	timesteps = sum([int(m[3]) for m in metadata])
	sr = hparams.sample_rate
	hours = timesteps / sr / 3600
	print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
		len(metadata), mel_frames, timesteps, hours))
	print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
	print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
	print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 
Example #23
Source File: audio.py    From WaveRNN-Pytorch with MIT License 5 votes
def _build_mel_basis():
    if hparams.fmax is not None:
        assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels) 
Example #24
Source File: audio.py    From WaveRNN-Pytorch with MIT License 5 votes
def save_wav(wav, path):
    wav = wav * 32767 / max(0.01, np.max(np.abs(wav)))
    wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example #25
Source File: audio.py    From WaveRNN-Pytorch with MIT License 5 votes
def load_wav(path):
    return librosa.load(path, sr=hparams.sample_rate)[0] 
Example #26
Source File: audio.py    From tacotron with MIT License 5 votes
def _build_mel_basis():
  n_fft = (hparams.num_freq - 1) * 2
  return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) 
Example #27
Source File: audio.py    From tacotron with MIT License 5 votes
def _stft_parameters():
  n_fft = (hparams.num_freq - 1) * 2
  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
  return n_fft, hop_length, win_length


# Conversions: 
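With assumed Tacotron-style values (num_freq=1025, frame_shift_ms=12.5, frame_length_ms=50, sample_rate=20000; illustrative, not necessarily this repository's defaults), the three parameters work out to:

n_fft = (1025 - 1) * 2                 # 2048
hop_length = int(12.5 / 1000 * 20000)  # 250
win_length = int(50 / 1000 * 20000)    # 1000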
Example #28
Source File: audio.py    From tacotron with MIT License 5 votes
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
  window_length = int(hparams.sample_rate * min_silence_sec)
  hop_length = int(window_length / 4)
  threshold = _db_to_amp(threshold_db)
  for x in range(hop_length, len(wav) - window_length, hop_length):
    if np.max(wav[x:x+window_length]) < threshold:
      return x + hop_length
  return len(wav) 
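A hedged usage sketch combining this with the load_wav/save_wav helpers from the same audio.py (the file names are placeholders): trim everything after the detected endpoint before saving.

wav = load_wav('sample.wav')
wav = wav[:find_endpoint(wav)]   # drop trailing silence
save_wav(wav, 'sample_trimmed.wav')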
Example #29
Source File: audio.py    From tacotron with MIT License 5 votes
def save_wav(wav, path):
  wav *= 32767 / max(0.01, np.max(np.abs(wav)))
  scipy.io.wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example #30
Source File: stft.py    From cnn_vocoder with MIT License 5 votes
def _build_mel_basis(n_fft, n_mels=80):
    return torch.FloatTensor(librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=n_mels)).transpose(0, 1)