Python hparams.hparams.silence_threshold() Examples

The following are 7 code examples of hparams.hparams.silence_threshold(), collected from open-source projects. The originating project, source file, and license are noted above each example. You may also want to check out all available functions/classes of the module hparams.hparams.
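All of the examples below read attributes such as silence_threshold off a shared hparams object. As a minimal, hypothetical stand-in (the attribute names are taken from the examples below, but the values here are illustrative defaults, not the projects' actual settings), such a module might look like:

# hparams.py -- minimal hypothetical stand-in, defining only attributes
# that the examples below actually read. Values are illustrative.
class HParams(object):
    def __init__(self, **kwargs):
        for name, value in kwargs.items():
            setattr(self, name, value)

hparams = HParams(
    silence_threshold=2,    # mu-law levels around the 8-bit midpoint treated as silence
    quantize_channels=256,  # mu-law quantization channels (assumed)
    rescaling=False,
    rescaling_max=0.999,
    input_type="mulaw-quantize",
    fft_size=1024,
)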
Example #1
Source File: audio.py From cnn_vocoder with MIT License

def adjust_time_resolution(quantized, mel):
    """Adjust time resolution by repeating features

    Args:
        quantized (ndarray): (T,)
        mel (ndarray): (N, D)

    Returns:
        tuple: Tuple of (T,) and (T, D)
    """
    assert len(quantized.shape) == 1
    assert len(mel.shape) == 2

    upsample_factor = quantized.size // mel.shape[0]
    mel = np.repeat(mel, upsample_factor, axis=0)
    n_pad = quantized.size - mel.shape[0]
    if n_pad != 0:
        assert n_pad > 0
        mel = np.pad(mel, [(0, n_pad), (0, 0)], mode="constant", constant_values=0)

    # trim
    start, end = start_and_end_indices(quantized, hparams.silence_threshold)
    return quantized[start:end], mel[start:end, :]
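A quick sketch of the upsampling step at the heart of the example above; the array shapes are invented for the demo and are not from the project:

import numpy as np

# Hypothetical sizes: 20 quantized samples and 4 mel frames of 3 dims each.
quantized = np.full(20, 127, dtype=np.int16)
mel = np.random.rand(4, 3).astype(np.float32)

# Core of adjust_time_resolution: repeat each mel frame so both arrays
# share the same time axis (20 // 4 == 5 repeats per frame).
upsample_factor = quantized.size // mel.shape[0]
mel_up = np.repeat(mel, upsample_factor, axis=0)
print(quantized.shape, mel_up.shape)  # (20,) (20, 3)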
Example #2
Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

def adjust_time_resolution(quantized, mel):
    """Adjust time resolution by repeating features

    Args:
        quantized (ndarray): (T,)
        mel (ndarray): (N, D)

    Returns:
        tuple: Tuple of (T,) and (T, D)
    """
    assert len(quantized.shape) == 1
    assert len(mel.shape) == 2

    upsample_factor = quantized.size // mel.shape[0]
    mel = np.repeat(mel, upsample_factor, axis=0)
    n_pad = quantized.size - mel.shape[0]
    if n_pad != 0:
        assert n_pad > 0
        mel = np.pad(mel, [(0, n_pad), (0, 0)], mode="constant", constant_values=0)

    # trim
    start, end = start_and_end_indices(quantized, hparams.silence_threshold)
    return quantized[start:end], mel[start:end, :]
Example #3
Source File: audio.py From cnn_vocoder with MIT License

def trim(quantized):
    start, end = start_and_end_indices(quantized, hparams.silence_threshold)
    return quantized[start:end]
Example #4
Source File: audio.py From cnn_vocoder with MIT License

def start_and_end_indices(quantized, silence_threshold=2):
    for start in range(quantized.size):
        if abs(quantized[start] - 127) > silence_threshold:
            break
    for end in range(quantized.size - 1, 1, -1):
        if abs(quantized[end] - 127) > silence_threshold:
            break

    assert abs(quantized[start] - 127) > silence_threshold
    assert abs(quantized[end] - 127) > silence_threshold

    return start, end
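A hedged demo of the function above on a made-up 8-bit mu-law signal (assuming start_and_end_indices from Example #4 is in scope); 127 is the code's silence level, so with silence_threshold=2 only indices 3 through 6 count as audible:

import numpy as np

quantized = np.array([127, 128, 126, 180, 60, 200, 40, 127, 127], dtype=np.int16)

start, end = start_and_end_indices(quantized, silence_threshold=2)
print(start, end)            # 3 6
print(quantized[start:end])  # [180  60 200]; quantized[end] itself is audible,
                             # but the exclusive slice used by trim() drops it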
Example #5
Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

def trim(quantized):
    start, end = start_and_end_indices(quantized, hparams.silence_threshold)
    return quantized[start:end]
Example #6
Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

def start_and_end_indices(quantized, silence_threshold=2):
    for start in range(quantized.size):
        if abs(quantized[start] - 127) > silence_threshold:
            break
    for end in range(quantized.size - 1, 1, -1):
        if abs(quantized[end] - 127) > silence_threshold:
            break

    assert abs(quantized[start] - 127) > silence_threshold
    assert abs(quantized[end] - 127) > silence_threshold

    return start, end
Example #7
Source File: ljspeech.py From representation_mixing with BSD 3-Clause "New" or "Revised" License

def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
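A hedged sketch of how _process_utterance might be driven from an LJSpeech-style metadata file. The directory layout and '|'-separated column order follow the LJSpeech-1.1 release, but this driver (preprocess_ljspeech) is hypothetical and not code from the project above; it also assumes _process_utterance and its audio/P/hparams dependencies are importable.

import os

def preprocess_ljspeech(in_dir, out_dir):
    # LJSpeech-1.1 ships metadata.csv with '|'-separated fields:
    # file id | raw transcript | normalized transcript.
    os.makedirs(out_dir, exist_ok=True)
    results = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            # _process_utterance is the function from Example #7 above.
            results.append(_process_utterance(out_dir, index, wav_path, text))
    return results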