Python hparams.hparams.sample_rate() Examples

The following are 30 code examples of hparams.hparams.sample_rate(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module hparams.hparams, or try the search function.
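All of these snippets read sample_rate (and a handful of related fields such as frame_shift_ms, hop_size, num_mels) as plain attributes of a module-level hparams object. If you want to run an excerpt in isolation, a minimal stand-in can be sketched as below; the field values are illustrative assumptions, not any project's defaults:

# Hypothetical hparams stand-in for experimenting with the snippets on this page.
from types import SimpleNamespace

hparams = SimpleNamespace(
    sample_rate=22050,     # audio sample rate in Hz
    frame_shift_ms=12.5,   # hop between analysis frames, in milliseconds
    frame_length_ms=50,    # analysis window length, in milliseconds
    hop_size=None,         # if None, derived from frame_shift_ms (see get_hop_size below)
    fft_size=1024,
    num_freq=513,          # (num_freq - 1) * 2 == fft_size
    num_mels=80,
    fmin=0,
    fmax=8000,
)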
Example #1
Source File: audio.py    From Griffin_lim with MIT License 6 votes
def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size 
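A quick sanity check of the fallback branch, with illustrative numbers rather than any project's settings: a 12.5 ms frame shift at 22050 Hz corresponds to a 275-sample hop.

frame_shift_ms = 12.5
sample_rate = 22050
hop_size = int(frame_shift_ms / 1000 * sample_rate)  # int(275.625) -> 275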
Example #2
Source File: LWS.py    From Griffin_lim with MIT License 6 votes
def main():
    data_folder = "data"
    wavs = [os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder) if file.endswith(".wav")]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs]

    lws_processor = lws.lws(512, 128, mode="speech")  # 512: window length; 128: window shift
    i = 0
    for x in wavs:
        X = lws_processor.stft(x)  # where x is a single-channel waveform
        X0 = np.abs(X)  # Magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))
        # reconstruction from magnitude (in general, one can reconstruct from an initial complex spectrogram)
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))
        print(X1.shape)
        wav = lws_processor.istft(X1).astype(np.float32)

        audio.save_wav(wav, outputs_lws[i])
        i += 1 
Example #3
Source File: train_tacotron.py    From Tacotron-Wavenet-Vocoder-Korean with MIT License 6 votes
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True), isKorean=True)
    else:
        log('Training non-Korean: do not use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False), isKorean=False)
Example #4
Source File: blizzard.py    From libfaceid with MIT License 6 votes
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text) 
Example #5
Source File: train.py    From WaveRNN-Pytorch with MIT License 6 votes
def evaluate_model(model, data_loader, checkpoint_dir, limit_eval_to=5):
    """evaluate model and save generated wav and plot

    """
    test_path = data_loader.dataset.test_path
    test_files = os.listdir(test_path)
    counter = 0
    output_dir = os.path.join(checkpoint_dir,'eval')
    for f in test_files:
        if f[-7:] == "mel.npy":
            mel = np.load(os.path.join(test_path,f))
            wav = model.generate(mel)
            # save wav
            wav_path = os.path.join(output_dir,"checkpoint_step{:09d}_wav_{}.wav".format(global_step,counter))
            librosa.output.write_wav(wav_path, wav, sr=hp.sample_rate)
            # save wav plot
            fig_path = os.path.join(output_dir,"checkpoint_step{:09d}_wav_{}.png".format(global_step,counter))
            fig = plt.plot(wav.reshape(-1))
            plt.savefig(fig_path)
            # clear the figure to avoid drawing onto the same plot
            plt.clf()
            counter += 1
        # stop evaluation early via limit_eval_to
        if counter >= limit_eval_to:
            break 
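Note that librosa.output.write_wav was removed in librosa 0.8, so this snippet only runs against older librosa versions. A drop-in replacement using the soundfile package (an assumption, not what the project ships) would be:

import soundfile as sf

# equivalent to librosa.output.write_wav(wav_path, wav, sr=hp.sample_rate)
sf.write(wav_path, wav, hp.sample_rate)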
Example #6
Source File: generate.py    From Tacotron-Wavenet-Vocoder-Korean with MIT License 6 votes
def create_seed(filename,sample_rate,quantization_channels,window_size,scalar_input):
    # Use only the beginning of the seed audio.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else: return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)

        # If the seed is shorter than window_size it is returned as-is -- shouldn't it at least be padded?
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size), lambda: tf.size(quantized), lambda: tf.constant(window_size))

        return quantized[:cut_index]
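The translated comment above asks whether a short seed should be padded instead of being returned as-is. A hedged sketch for the scalar-input branch, where zero samples are silence (zero-padding is an assumption, not the project's behaviour):

import numpy as np

# Hypothetical zero-padding of a short scalar-input seed (not in the original code).
if len(seed_audio) < window_size:
    seed_audio = np.pad(seed_audio, (0, window_size - len(seed_audio)), mode='constant')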
Example #7
Source File: synthesis_mel.py    From style-token_tacotron2 with MIT License 6 votes
def mel_synthesis(out_dir='wav_griffi_syn',in_dir='mel'):
    os.makedirs(out_dir, exist_ok=True)

    #mel_file = os.path.join(mel_folder, mel_file)
    mel_filenames=[x.split('.')[0] for x in os.listdir(in_dir)]
    start_time=time.time()
    for mel_file in mel_filenames:
        try:
            print('process {}'.format(mel_file))
            mel_file_path = os.path.join('training_data/mels', 'mel-{}.wav.npy'.format(mel_file))
            mel_spectro = np.load(mel_file_path)
            wav = inv_mel_spectrogram(mel_spectro.T, hparams)
            # save the wav under test_<folder>_<file>
            save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(
                mel_file.replace('/', '_').replace('\\', '_').replace('.npy', ''))),
                     sr=hparams.sample_rate)
        except Exception as e:
            print('{} error: {}'.format(mel_file, e))

    print('griffin-lim :{}'.format(time.time()-start_time)) 
Example #8
Source File: generate.py    From Tacotron2-Wavenet-Korean-TTS with MIT License 6 votes
def create_seed(filename,sample_rate,quantization_channels,window_size,scalar_input):
    # Use only the beginning of the seed audio.
    seed_audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    seed_audio = audio.trim_silence(seed_audio, hparams)
    if scalar_input:
        if len(seed_audio) < window_size:
            return seed_audio
        else: return seed_audio[:window_size]
    else:
        quantized = mu_law_encode(seed_audio, quantization_channels)

        # If the seed is shorter than window_size it is returned as-is -- shouldn't it at least be padded?
        cut_index = tf.cond(tf.size(quantized) < tf.constant(window_size), lambda: tf.size(quantized), lambda: tf.constant(window_size))

        return quantized[:cut_index]
Example #9
Source File: blizzard.py    From vae_tacotron with MIT License 6 votes
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text) 
Example #10
Source File: train_tacotron2.py    From Tacotron2-Wavenet-Korean-TTS with MIT License 6 votes
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True), isKorean=True)
    else:
        log('Training non-Korean: do not use jamo')
        plot.plot_alignment(align, align_path, info=info_text, text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False), isKorean=False)
Example #11
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def save_wav(wav, path):
	wav *= 32767 / max(0.01, np.max(np.abs(wav))) 
	#proposed by @dsmiller
	wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example #12
Source File: preprocess.py    From Tacotron2-Wavenet-Korean-TTS with MIT License 5 votes
def write_metadata(metadata, out_dir):
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sr = hparams.sample_rate
    hours = timesteps / sr / 3600
    print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(len(metadata), mel_frames, timesteps, hours))
    print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
    print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
    print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 
Example #13
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def get_hop_size():
	hop_size = hparams.hop_size
	if hop_size is None:
		assert hparams.frame_shift_ms is not None
		hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
	return hop_size 
Example #14
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def _build_mel_basis():
	assert hparams.fmax <= hparams.sample_rate // 2
	return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels,
							   fmin=hparams.fmin, fmax=hparams.fmax) 
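In librosa 0.10 and later, librosa.filters.mel only accepts keyword arguments, so the positional call above raises a TypeError. An equivalent call for newer librosa versions (same hparams fields assumed) is:

mel_basis = librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.fft_size,
                                n_mels=hparams.num_mels,
                                fmin=hparams.fmin, fmax=hparams.fmax)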
Example #15
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def load_wav(path):
	return librosa.core.load(path, sr=hparams.sample_rate)[0] 
Example #16
Source File: wavenet_preprocess.py    From style-token_tacotron2 with MIT License 5 votes
def write_metadata(metadata, out_dir):
	with open(os.path.join(out_dir, 'map.txt'), 'w', encoding='utf-8') as f:
		for m in metadata:
			f.write('|'.join([str(x) for x in m]) + '\n')
	mel_frames = sum([int(m[5]) for m in metadata])
	timesteps = sum([int(m[4]) for m in metadata])
	sr = hparams.sample_rate
	hours = timesteps / sr / 3600
	print('Write {} utterances, {} audio timesteps, ({:.2f} hours)'.format(
		len(metadata), timesteps, hours))
	print('Max mel frames length: {}'.format(max(int(m[5]) for m in metadata)))
	print('Max audio timesteps length: {}'.format(max(m[4] for m in metadata))) 
Example #17
Source File: check_slience_trim.py    From style-token_tacotron2 with MIT License 5 votes
def get_some_inversed_samples(training_data_path='training_data',output_inversed_path='tmp_inverse_wav_out',n_samples=5):
    mel_files=glob.glob(os.path.join(training_data_path,'mels','*.npy'))
    assert len(mel_files) >= n_samples, 'not enough .npy files to invert...'
    if os.path.exists(output_inversed_path):
        shutil.rmtree(output_inversed_path)
    os.makedirs(output_inversed_path,exist_ok=False)
    random.seed(2018)
    mel_files=random.sample(mel_files,n_samples)
    for mel_file in mel_files:
        mel_file_basename=os.path.basename(mel_file)
        mel_spectro=np.load(mel_file)
        wav=inv_mel_spectrogram(mel_spectro.T, hparams)
        save_wav(wav, os.path.join(output_inversed_path, '{}.wav'.format(mel_file_basename)),
                 sr=hparams.sample_rate)
Example #18
Source File: test_wavenet_feeder.py    From style-token_tacotron2 with MIT License 5 votes
def _limit_time(hparams):
	'''Limit time resolution to save GPU memory.
	'''
	if hparams.max_time_sec is not None:
		return int(hparams.max_time_sec * hparams.sample_rate)
	elif hparams.max_time_steps is not None:
		return hparams.max_time_steps
	else:
		return None 
Example #19
Source File: preprocess.py    From style-token_tacotron2 with MIT License 5 votes
def write_metadata(metadata, out_dir):
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sr = hparams.sample_rate
    hours = timesteps / sr / 3600
    print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
        len(metadata), mel_frames, timesteps, hours))
    print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
    print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
    print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 
Example #20
Source File: audio.py    From tacotron with MIT License 5 votes
def load_wav(path):
  return librosa.core.load(path, sr=hparams.sample_rate)[0] 
Example #21
Source File: audio.py    From gmvae_tacotron with MIT License 5 votes
def load_wav(path):
	return librosa.core.load(path, sr=hparams.sample_rate)[0] 
Example #22
Source File: preprocess.py    From gmvae_tacotron with MIT License 5 votes
def write_metadata(metadata, out_dir):
	with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
		for m in metadata:
			f.write('|'.join([str(x) for x in m]) + '\n')
	mel_frames = sum([int(m[4]) for m in metadata])
	timesteps = sum([int(m[3]) for m in metadata])
	sr = hparams.sample_rate
	hours = timesteps / sr / 3600
	print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
		len(metadata), mel_frames, timesteps, hours))
	print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
	print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
	print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 
Example #23
Source File: audio.py    From WaveRNN-Pytorch with MIT License 5 votes
def _build_mel_basis():
    if hparams.fmax is not None:
        assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels) 
Example #24
Source File: audio.py    From WaveRNN-Pytorch with MIT License 5 votes
def save_wav(wav, path):
    wav = wav * 32767 / max(0.01, np.max(np.abs(wav)))
    wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example #25
Source File: audio.py    From WaveRNN-Pytorch with MIT License 5 votes
def load_wav(path):
    return librosa.load(path, sr=hparams.sample_rate)[0] 
Example #26
Source File: audio.py    From tacotron with MIT License 5 votes
def _build_mel_basis():
  n_fft = (hparams.num_freq - 1) * 2
  return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) 
Example #27
Source File: audio.py    From tacotron with MIT License 5 votes
def _stft_parameters():
  n_fft = (hparams.num_freq - 1) * 2
  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
  return n_fft, hop_length, win_length


# Conversions: 
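With assumed Tacotron-style values (num_freq=1025, frame_shift_ms=12.5, frame_length_ms=50, sample_rate=20000; illustrative, not necessarily this repository's defaults), the three parameters work out to:

n_fft = (1025 - 1) * 2                 # 2048
hop_length = int(12.5 / 1000 * 20000)  # 250
win_length = int(50 / 1000 * 20000)    # 1000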
Example #28
Source File: audio.py    From tacotron with MIT License 5 votes
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
  window_length = int(hparams.sample_rate * min_silence_sec)
  hop_length = int(window_length / 4)
  threshold = _db_to_amp(threshold_db)
  for x in range(hop_length, len(wav) - window_length, hop_length):
    if np.max(wav[x:x+window_length]) < threshold:
      return x + hop_length
  return len(wav) 
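A hedged usage sketch combining this with the load_wav/save_wav helpers from the same audio.py (the file names are placeholders): trim everything after the detected endpoint before saving.

wav = load_wav('sample.wav')
wav = wav[:find_endpoint(wav)]   # drop trailing silence
save_wav(wav, 'sample_trimmed.wav')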
Example #29
Source File: audio.py    From tacotron with MIT License 5 votes
def save_wav(wav, path):
  wav *= 32767 / max(0.01, np.max(np.abs(wav)))
  scipy.io.wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 
Example #30
Source File: stft.py    From cnn_vocoder with MIT License 5 votes
def _build_mel_basis(n_fft, n_mels=80):
    return torch.FloatTensor(librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=n_mels)).transpose(0, 1)