Python Examples of hparams.hparams.frame_shift

Source File: blizzard.py From vae_tacotron with MIT License

6 votes

def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)

Source File: blizzard.py From libfaceid with MIT License

6 votes

def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)

Source File: blizzard.py From tacotron with MIT License

6 votes

def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)

Source File: audio.py From Griffin_lim with MIT License

6 votes

def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

Source File: audio.py From arabic-tacotron-tts with MIT License

5 votes

def _stft_parameters():
  n_fft = (hparams.num_freq - 1) * 2
  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
  return n_fft, hop_length, win_length


# Conversions:

Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

5 votes

def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

5 votes

def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

5 votes

def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

5 votes

def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

Source File: audio.py From Tacotron2-PyTorch with MIT License

5 votes

def _stft_parameters():
	n_fft = (hps.num_freq - 1) * 2
	hop_length = int(hps.frame_shift_ms / 1000 * hps.sample_rate)
	win_length = int(hps.frame_length_ms / 1000 * hps.sample_rate)
	return n_fft, hop_length, win_length


# Conversions:

Source File: audio.py From gmvae_tacotron with MIT License

5 votes

def get_hop_size():
	hop_size = hparams.hop_size
	if hop_size is None:
		assert hparams.frame_shift_ms is not None
		hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
	return hop_size

Source File: audio.py From gmvae_tacotron with MIT License

5 votes

def get_hop_size():
	hop_size = hparams.hop_size
	if hop_size is None:
		assert hparams.frame_shift_ms is not None
		hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
	return hop_size

Source File: audio.py From tacotron with MIT License

5 votes

def _stft_parameters():
  n_fft = (hparams.num_freq - 1) * 2
  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
  return n_fft, hop_length, win_length


# Conversions:

Source File: preprocess.py From tacotron with MIT License

5 votes

def write_metadata(metadata, out_dir):
  with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
    for m in metadata:
      f.write('|'.join([str(x) for x in m]) + '\n')
  frames = sum([m[2] for m in metadata])
  hours = frames * hparams.frame_shift_ms / (3600 * 1000)
  print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
  print('Max input length:  %d' % max(len(m[3]) for m in metadata))
  print('Max output length: %d' % max(m[2] for m in metadata))

Source File: audio.py From cnn_vocoder with MIT License

5 votes

def get_hop_size():
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

# Conversions:

Source File: preprocess.py From arabic-tacotron-tts with MIT License

5 votes

def write_metadata(metadata, out_dir):
  with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
    for m in metadata:
      f.write('|'.join([str(x) for x in m]) + '\n')
  frames = sum([m[2] for m in metadata])
  hours = frames * hparams.frame_shift_ms / (3600 * 1000)
  print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
  print('Max input length:  %d' % max(len(m[3]) for m in metadata))
  print('Max output length: %d' % max(m[2] for m in metadata))

Source File: preprocess.py From libfaceid with MIT License

5 votes

def write_metadata(metadata, out_dir):
  with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
    for m in metadata:
      f.write('|'.join([str(x) for x in m]) + '\n')
  frames = sum([m[2] for m in metadata])
  hours = frames * hparams.frame_shift_ms / (3600 * 1000)
  print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
  print('Max input length:  %d' % max(len(m[3]) for m in metadata))
  print('Max output length: %d' % max(m[2] for m in metadata))

Source File: audio.py From vae_tacotron2 with MIT License

5 votes

def get_hop_size():
	hop_size = hparams.hop_size
	if hop_size is None:
		assert hparams.frame_shift_ms is not None
		hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
	return hop_size

Source File: audio.py From vae_tacotron2 with MIT License

5 votes

def get_hop_size():
	hop_size = hparams.hop_size
	if hop_size is None:
		assert hparams.frame_shift_ms is not None
		hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
	return hop_size

Source File: blizzard2013.py From vae_tacotron with MIT License

5 votes

def _process_utterance(out_dir, index, wav_path, text):
  '''Preprocesses a single utterance audio/text pair.
  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.
  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file
  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

  # Load the audio to a numpy array:
  wav = audio.load_wav(wav_path)
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples and _max_out_length is not None:
    return None

  # Compute the linear-scale spectrogram from the wav:
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]

  # Compute a mel-scale spectrogram from the wav:
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

  # Write the spectrograms to disk:
  spectrogram_filename = 'blizzard2013-spec-%05d.npy' % index
  mel_filename = 'blizzard2013-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

  # Return a tuple describing this training example:
  return (spectrogram_filename, mel_filename, n_frames, text)

Source File: audio.py From vae_tacotron with MIT License

5 votes

def _stft_parameters():
  n_fft = (hparams.num_freq - 1) * 2
  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
  return n_fft, hop_length, win_length


# Conversions:

Source File: preprocess.py From vae_tacotron with MIT License

5 votes

def write_metadata(metadata, out_dir):
  with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
    for m in metadata:
      f.write('|'.join([str(x) for x in m]) + '\n')
  frames = sum([m[2] for m in metadata])
  hours = frames * hparams.frame_shift_ms / (3600 * 1000)
  print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
  print('Max input length:  %d' % max(len(m[3]) for m in metadata))
  print('Max output length: %d' % max(m[2] for m in metadata))

Source File: griffin_lim.py From Griffin_lim with MIT License

5 votes

def spectrogram2wav(spectrogram, n_iter=hparams.griffin_lim_iters, n_fft=(hparams.num_freq - 1) * 2,
                    win_length=int(hparams.frame_length_ms / 1000 * hparams.sample_rate),
                    hop_length=int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)):
    '''Converts spectrogram into a waveform using Griffin-lim's raw.
    '''

    def invert_spectrogram(spectrogram):
        '''
        spectrogram: [t, f]
        '''
        spectrogram = tf.expand_dims(spectrogram, 0)
        inversed = tf.contrib.signal.inverse_stft(spectrogram, win_length, hop_length, n_fft)
        squeezed = tf.squeeze(inversed, 0)
        return squeezed

    spectrogram = tf.transpose(spectrogram)

    spectrogram = tf.cast(spectrogram, dtype=tf.complex64)  # [t, f]
    X_best = tf.identity(spectrogram)
    for i in range(n_iter):
        X_t = invert_spectrogram(X_best)
        est = tf.contrib.signal.stft(X_t, win_length, hop_length, n_fft, pad_end=False)  # (1, T, n_fft/2+1)
        phase = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)  # [t, f]
        X_best = spectrogram * phase  # [t, t]
    X_t = invert_spectrogram(X_best)
    y = tf.real(X_t)

    return y

Python hparams.hparams.frame_shift_ms() Examples