Python hparams.hparams.num_mels() Examples
The following are 30 code examples of hparams.hparams.num_mels().
Each example is listed with its original project, source file, and license.
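For context: in these projects, num_mels is typically a plain attribute on a shared hyperparameter object imported as hparams (or hp/hps). Below is a minimal sketch of such a module; the class name and values are illustrative assumptions, not taken from any of the projects listed here.

# hparams.py -- hypothetical minimal sketch of the shared hyperparameter module.
class HParams:
    num_mels = 80        # number of mel filterbank channels
    num_freq = 1025      # linear-spectrogram bins; n_fft = (num_freq - 1) * 2
    sample_rate = 22050  # audio sampling rate in Hz
    fmin = 0             # lowest mel filter frequency (Hz)
    fmax = 8000          # highest mel filter frequency (Hz)

# Module-level instance, so callers can write `from hparams import hparams`
# and read `hparams.num_mels`.
hparams = HParams()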
Example #1
Source File: model.py From Tacotron2-PyTorch with MIT License | 6 votes |
def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
    '''
    PARAMS
    ------
    query: decoder output (batch, num_mels * n_frames_per_step)
    processed_memory: processed encoder outputs (B, T_in, attention_dim)
    attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)

    RETURNS
    -------
    alignment (batch, max_time)
    '''
    processed_query = self.query_layer(query.unsqueeze(1))
    processed_attention_weights = self.location_layer(attention_weights_cat)
    energies = self.v(torch.tanh(
        processed_query + processed_attention_weights + processed_memory))
    energies = energies.squeeze(-1)
    return energies
Example #2
Source File: train_vocoder.py From Tacotron2-Wavenet-Korean-TTS with MIT License | 6 votes |
def create_network(hp, batch_size, num_speakers, is_training):
    net = WaveNetModel(
        batch_size=batch_size,
        dilations=hp.dilations,
        filter_width=hp.filter_width,
        residual_channels=hp.residual_channels,
        dilation_channels=hp.dilation_channels,
        quantization_channels=hp.quantization_channels,
        out_channels=hp.out_channels,
        skip_channels=hp.skip_channels,
        use_biases=hp.use_biases,  # True
        scalar_input=hp.scalar_input,
        global_condition_channels=hp.gc_channels,
        global_condition_cardinality=num_speakers,
        local_condition_channels=hp.num_mels,
        upsample_factor=hp.upsample_factor,
        legacy=hp.legacy,
        residual_legacy=hp.residual_legacy,
        drop_rate=hp.wavenet_dropout,
        train_mode=is_training)
    return net
Example #3
Source File: model.py From WaveRNN-Pytorch with MIT License | 6 votes |
def build_model():
    """Build the model with hparams settings."""
    if hp.input_type == 'raw':
        print('building model with Beta distribution output')
    elif hp.input_type == 'mixture':
        print("building model with mixture of logistic output")
    elif hp.input_type == 'bits':
        print("building model with quantized bit audio")
    elif hp.input_type == 'mulaw':
        print("building model with quantized mulaw encoding")
    else:
        raise ValueError('input_type provided not supported')
    model = Model(hp.rnn_dims, hp.fc_dims, hp.bits, hp.pad,
                  hp.upsample_factors, hp.num_mels,
                  hp.compute_dims, hp.res_out_dims, hp.res_blocks)
    return model
Example #4
Source File: model.py From Tacotron2-PyTorch with MIT License | 5 votes |
def __init__(self):
    super(Tacotron2, self).__init__()
    self.num_mels = hps.num_mels
    self.mask_padding = hps.mask_padding
    self.n_frames_per_step = hps.n_frames_per_step
    self.embedding = nn.Embedding(
        hps.n_symbols, hps.symbols_embedding_dim)
    std = sqrt(2.0 / (hps.n_symbols + hps.symbols_embedding_dim))
    val = sqrt(3.0) * std  # uniform bounds for std
    self.embedding.weight.data.uniform_(-val, val)
    self.encoder = Encoder()
    self.decoder = Decoder()
    self.postnet = Postnet()
Example #5
Source File: audio.py From Griffin_lim with MIT License | 5 votes |
def _build_mel_basis():
    n_fft = (hparams.num_freq - 1) * 2
    return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
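The basis returned above has shape (num_mels, num_freq) and is usually applied to a magnitude spectrogram with a matrix product. A hedged sketch follows; the helper name _linear_to_mel is an assumption, not shown on this page.

import numpy as np

def _linear_to_mel(spectrogram, mel_basis):
    # (num_mels, num_freq) @ (num_freq, T) -> (num_mels, T)
    return np.dot(mel_basis, spectrogram)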
Example #6
Source File: audio.py From Tacotron2-PyTorch with MIT License | 5 votes |
def _build_mel_basis():
    n_fft = (hps.num_freq - 1) * 2
    return librosa.filters.mel(hps.sample_rate, n_fft, n_mels=hps.num_mels)
Example #7
Source File: model.py From Tacotron2-PyTorch with MIT License | 5 votes |
def __init__(self):
    super(Postnet, self).__init__()
    self.convolutions = nn.ModuleList()

    self.convolutions.append(
        nn.Sequential(
            ConvNorm(hps.num_mels, hps.postnet_embedding_dim,
                     kernel_size=hps.postnet_kernel_size, stride=1,
                     padding=int((hps.postnet_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='tanh'),
            nn.BatchNorm1d(hps.postnet_embedding_dim))
    )

    for i in range(1, hps.postnet_n_convolutions - 1):
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(hps.postnet_embedding_dim,
                         hps.postnet_embedding_dim,
                         kernel_size=hps.postnet_kernel_size, stride=1,
                         padding=int((hps.postnet_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(hps.postnet_embedding_dim))
        )

    self.convolutions.append(
        nn.Sequential(
            ConvNorm(hps.postnet_embedding_dim, hps.num_mels,
                     kernel_size=hps.postnet_kernel_size, stride=1,
                     padding=int((hps.postnet_kernel_size - 1) / 2),
                     dilation=1, w_init_gain='linear'),
            nn.BatchNorm1d(hps.num_mels))
    )
Example #8
Source File: model.py From Tacotron2-PyTorch with MIT License | 5 votes |
def __init__(self):
    super(Decoder, self).__init__()
    self.num_mels = hps.num_mels
    self.n_frames_per_step = hps.n_frames_per_step
    self.encoder_embedding_dim = hps.encoder_embedding_dim
    self.attention_rnn_dim = hps.attention_rnn_dim
    self.decoder_rnn_dim = hps.decoder_rnn_dim
    self.prenet_dim = hps.prenet_dim
    self.max_decoder_steps = hps.max_decoder_steps
    self.gate_threshold = hps.gate_threshold
    self.p_attention_dropout = hps.p_attention_dropout
    self.p_decoder_dropout = hps.p_decoder_dropout

    self.prenet = Prenet(
        hps.num_mels * hps.n_frames_per_step,
        [hps.prenet_dim, hps.prenet_dim])

    self.attention_rnn = nn.LSTMCell(
        hps.prenet_dim + hps.encoder_embedding_dim,
        hps.attention_rnn_dim)

    self.attention_layer = Attention(
        hps.attention_rnn_dim, hps.encoder_embedding_dim,
        hps.attention_dim, hps.attention_location_n_filters,
        hps.attention_location_kernel_size)

    self.decoder_rnn = nn.LSTMCell(
        hps.attention_rnn_dim + hps.encoder_embedding_dim,
        hps.decoder_rnn_dim, 1)

    self.linear_projection = LinearNorm(
        hps.decoder_rnn_dim + hps.encoder_embedding_dim,
        hps.num_mels * hps.n_frames_per_step)

    self.gate_layer = LinearNorm(
        hps.decoder_rnn_dim + hps.encoder_embedding_dim, 1,
        bias=True, w_init_gain='sigmoid')
Example #9
Source File: model.py From Tacotron2-PyTorch with MIT License | 5 votes |
def get_go_frame(self, memory):
    '''
    Gets all zeros frames to use as first decoder input

    PARAMS
    ------
    memory: decoder outputs

    RETURNS
    -------
    decoder_input: all zeros frames
    '''
    B = memory.size(0)
    # Variable is a no-op wrapper in modern PyTorch; a plain tensor works too.
    decoder_input = Variable(memory.data.new(
        B, self.num_mels * self.n_frames_per_step).zero_())
    return decoder_input
Example #10
Source File: model.py From Tacotron2-PyTorch with MIT License | 5 votes |
def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
    '''
    Prepares decoder outputs for output

    PARAMS
    ------
    mel_outputs: mel spectrogram frames
    gate_outputs: gate output energies
    alignments: attention weights

    RETURNS
    -------
    mel_outputs:
    gate_outputs: gate output energies
    alignments:
    '''
    # (T_out, B) -> (B, T_out)
    alignments = torch.stack(alignments).transpose(0, 1)
    # (T_out, B) -> (B, T_out)
    gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
    gate_outputs = gate_outputs.contiguous()
    # (T_out, B, num_mels) -> (B, T_out, num_mels)
    mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
    # decouple frames per step
    mel_outputs = mel_outputs.view(
        mel_outputs.size(0), -1, self.num_mels)
    # (B, T_out, num_mels) -> (B, num_mels, T_out)
    mel_outputs = mel_outputs.transpose(1, 2)
    return mel_outputs, gate_outputs, alignments
Example #11
Source File: synthesizer.py From gmvae_tacotron with MIT License | 5 votes |
def load(self, checkpoint_path, gta=False, model_name='Tacotron'):
    print('Constructing model: %s' % model_name)
    inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
    with tf.variable_scope('model') as scope:
        self.model = create_model(model_name, hparams)
        if hparams.use_vae:
            ref_targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'ref_targets')
        if gta:
            targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
            if hparams.use_vae:
                self.model.initialize(inputs, input_lengths, targets, gta=gta, reference_mel=ref_targets)
            else:
                self.model.initialize(inputs, input_lengths, targets, gta=gta)
        else:
            if hparams.use_vae:
                self.model.initialize(inputs, input_lengths, reference_mel=ref_targets)
            else:
                self.model.initialize(inputs, input_lengths)
        self.mel_outputs = self.model.mel_outputs
        self.alignment = self.model.alignments[0]

    self.gta = gta
    print('Loading checkpoint: %s' % checkpoint_path)
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
Example #12
Source File: model.py From Tacotron2-PyTorch with MIT License | 5 votes |
def parse_output(self, outputs, output_lengths=None):
    if self.mask_padding and output_lengths is not None:
        mask = ~get_mask_from_lengths(output_lengths, True)  # (B, T)
        mask = mask.expand(self.num_mels, mask.size(0), mask.size(1))  # (80, B, T)
        mask = mask.permute(1, 0, 2)  # (B, 80, T)

        outputs[0].data.masked_fill_(mask, 0.0)  # (B, 80, T)
        outputs[1].data.masked_fill_(mask, 0.0)  # (B, 80, T)

        slice = torch.arange(0, mask.size(2), self.n_frames_per_step)
        outputs[2].data.masked_fill_(
            mask[:, 0, slice], 1e3)  # gate energies (B, T//n_frames_per_step)
    return outputs
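The get_mask_from_lengths helper used above is not shown on this page. A plausible reconstruction, inferred only from how the mask is used here, is:

import torch

def get_mask_from_lengths(lengths, pad=False):
    # Hypothetical sketch: True for valid (non-padded) time steps, so the
    # caller's `~` flips it to mark padding. The real project's `pad` flag
    # presumably rounds T up for frame grouping; that detail is omitted here.
    max_len = int(torch.max(lengths).item())
    ids = torch.arange(max_len, device=lengths.device)
    return ids.unsqueeze(0) < lengths.unsqueeze(1)  # (B, T) bool mask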
Example #13
Source File: datafeeder_tacotron2.py From Tacotron2-Wavenet-Korean-TTS with MIT License | 5 votes |
def _pad_target(t, length):
    # t: 2-dim array (xx, num_mels) ==> (length, num_mels)
    return np.pad(t, [(0, length - t.shape[0]), (0, 0)],
                  mode='constant', constant_values=_pad)  # (169, 80) ==> (length, 80)
Example #14
Source File: predict_mel.py From self-attention-tacotron with BSD 3-Clause "New" or "Revised" License | 5 votes |
def predict(hparams, model_dir, checkpoint_path, output_dir, test_source_files, test_target_files):
    def predict_input_fn():
        source = tf.data.TFRecordDataset(list(test_source_files))
        target = tf.data.TFRecordDataset(list(test_target_files))
        dataset = dataset_factory(source, target, hparams)
        batched = dataset.prepare_and_zip().group_by_batch(
            batch_size=1).merge_target_to_source()
        return batched.dataset

    estimator = tacotron_model_factory(hparams, model_dir, None)

    predictions = map(
        lambda p: PredictedMel(p["id"], p["key"], p["mel"], p.get("mel_postnet"),
                               p["mel"].shape[1], p["mel"].shape[0],
                               p["ground_truth_mel"], p["alignment"],
                               p.get("alignment2"), p.get("alignment3"),
                               p.get("alignment4"), p.get("alignment5"),
                               p.get("alignment6"), p["source"], p["text"],
                               p.get("accent_type")),
        estimator.predict(predict_input_fn, checkpoint_path=checkpoint_path))

    for v in predictions:
        key = v.key.decode('utf-8')
        mel_filename = f"{key}.{hparams.predicted_mel_extension}"
        mel_filepath = os.path.join(output_dir, mel_filename)
        mel = v.predicted_mel_postnet if hparams.use_postnet_v2 else v.predicted_mel
        assert mel.shape[1] == hparams.num_mels
        mel.tofile(mel_filepath, format='<f4')
        text = v.text.decode("utf-8")
        plot_filename = f"{key}.png"
        plot_filepath = os.path.join(output_dir, plot_filename)
        alignments = list(filter(lambda x: x is not None,
                                 [v.alignment, v.alignment2, v.alignment3,
                                  v.alignment4, v.alignment5, v.alignment6]))
        plot_predictions(alignments, v.ground_truth_mel, v.predicted_mel,
                         v.predicted_mel_postnet, text, v.key, plot_filepath)
        prediction_filename = f"{key}.tfrecord"
        prediction_filepath = os.path.join(output_dir, prediction_filename)
        write_prediction_result(v.id, key, alignments, mel, v.ground_truth_mel,
                                text, v.source, v.accent_type, prediction_filepath)
Example #15
Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _build_mel_basis():
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #16
Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _build_mel_basis():
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #17
Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _build_mel_basis():
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #18
Source File: audio.py From representation_mixing with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _build_mel_basis():
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #19
Source File: datafeeder_wavenet.py From Tacotron-Wavenet-Vocoder-Korean with MIT License | 5 votes |
def __init__(self, coord, data_dirs, batch_size, receptive_field,
             gc_enable=False, queue_size=8):
    super(DataFeederWavenet, self).__init__()
    self.data_dirs = data_dirs
    self.coord = coord
    self.batch_size = batch_size
    self.receptive_field = receptive_field
    self.hop_size = audio.get_hop_size(hparams)
    self.sample_size = ensure_divisible(hparams.sample_size, self.hop_size, True)
    self.max_frames = self.sample_size // self.hop_size  # to guarantee a full sample_size chunk
    self.queue_size = queue_size
    self.gc_enable = gc_enable
    self.skip_path_filter = hparams.skip_path_filter

    self.rng = np.random.RandomState(123)
    self._offset = defaultdict(lambda: 2)  # missing keys default to 2
    self.data_dir_to_id = {data_dir: idx for idx, data_dir in enumerate(self.data_dirs)}  # maps data_dir <-> speaker_id
    self.path_dict = get_path_dict(self.data_dirs, np.max([self.sample_size, receptive_field]))  # drops clips shorter than receptive_field and returns the rest

    self._placeholders = [
        tf.placeholder(tf.float32, shape=[None, None, 1], name='input_wav'),
        tf.placeholder(tf.float32, shape=[None, None, hparams.num_mels], name='local_condition')
    ]
    dtypes = [tf.float32, tf.float32]

    if self.gc_enable:
        self._placeholders.append(tf.placeholder(tf.int32, shape=[None], name='speaker_id'))
        dtypes.append(tf.int32)

    queue = tf.FIFOQueue(self.queue_size, dtypes, name='input_queue')
    self.enqueue = queue.enqueue(self._placeholders)

    if self.gc_enable:
        self.inputs_wav, self.local_condition, self.speaker_id = queue.dequeue()
    else:
        self.inputs_wav, self.local_condition = queue.dequeue()

    self.inputs_wav.set_shape(self._placeholders[0].shape)
    self.local_condition.set_shape(self._placeholders[1].shape)
    if self.gc_enable:
        self.speaker_id.set_shape(self._placeholders[2].shape)
Example #20
Source File: audio.py From vae_tacotron with MIT License | 5 votes |
def _build_mel_basis():
    n_fft = (hparams.num_freq - 1) * 2
    return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
Example #21
Source File: audio.py From vae_tacotron2 with MIT License | 5 votes |
def _build_mel_basis():
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               n_mels=hparams.num_mels,
                               fmin=hparams.fmin, fmax=hparams.fmax)
Example #22
Source File: audio.py From vae_tacotron2 with MIT License | 5 votes |
def _build_mel_basis():
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               n_mels=hparams.num_mels,
                               fmin=hparams.fmin, fmax=hparams.fmax)
Example #23
Source File: feeder.py From vae_tacotron2 with MIT License | 5 votes |
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
    self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want
    # to be able to feed different batch sizes at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
        tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
        tf.placeholder(tf.int32, [None], 'mel_lengths'),
        tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
    ]

    # Create queue for buffering data
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths, self.token_targets, self.linear_targets = queue.dequeue()

    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.mel_lengths.set_shape(self._placeholders[3].shape)
    self.token_targets.set_shape(self._placeholders[4].shape)
    self.linear_targets.set_shape(self._placeholders[5].shape)
Example #24
Source File: synthesizer.py From vae_tacotron2 with MIT License | 5 votes |
def load(self, checkpoint_path, gta=False, model_name='Tacotron'):
    print('Constructing model: %s' % model_name)
    inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
    with tf.variable_scope('model') as scope:
        self.model = create_model(model_name, hparams)
        if hparams.use_vae:
            ref_targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'ref_targets')
        if gta:
            targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
            if hparams.use_vae:
                self.model.initialize(inputs, input_lengths, targets, gta=gta, reference_mel=ref_targets)
            else:
                self.model.initialize(inputs, input_lengths, targets, gta=gta)
        else:
            if hparams.use_vae:
                self.model.initialize(inputs, input_lengths, reference_mel=ref_targets)
            else:
                self.model.initialize(inputs, input_lengths)
        self.mel_outputs = self.model.mel_outputs
        self.alignment = self.model.alignments[0]

    self.gta = gta
    print('Loading checkpoint: %s' % checkpoint_path)
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
Example #25
Source File: audio.py From arabic-tacotron-tts with MIT License | 5 votes |
def _build_mel_basis():
    n_fft = (hparams.num_freq - 1) * 2
    return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
Example #26
Source File: feeder.py From gmvae_tacotron with MIT License | 5 votes |
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata
    self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
    self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want
    # to be able to feed different batch sizes at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
        tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
        tf.placeholder(tf.int32, [None], 'mel_lengths'),
        tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
    ]

    # Create queue for buffering data
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths, self.token_targets, self.linear_targets = queue.dequeue()

    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.mel_lengths.set_shape(self._placeholders[3].shape)
    self.token_targets.set_shape(self._placeholders[4].shape)
    self.linear_targets.set_shape(self._placeholders[5].shape)
Example #27
Source File: audio.py From cnn_vocoder with MIT License | 5 votes |
def _build_mel_basis():
    assert hparams.fmax <= hparams.sample_rate // 2
    # Note: n_fft is not defined inside this function; it is presumably a
    # module-level constant in the original source file.
    return librosa.filters.mel(hparams.sample_rate, n_fft,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #28
Source File: audio.py From tacotron with MIT License | 5 votes |
def _build_mel_basis():
    n_fft = (hparams.num_freq - 1) * 2
    return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
Example #29
Source File: audio.py From WaveRNN-Pytorch with MIT License | 5 votes |
def _build_mel_basis():
    if hparams.fmax is not None:
        assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate, hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)
Example #30
Source File: model.py From WaveRNN-Pytorch with MIT License | 5 votes |
def no_test_build_model():
    model = Model(hp.rnn_dims, hp.fc_dims, hp.bits, hp.pad,
                  hp.upsample_factors, hp.num_mels,
                  hp.compute_dims, hp.res_out_dims, hp.res_blocks).cuda()
    print(vars(model))