Python torch.hann_window() Examples
The following are 30 code examples of torch.hann_window(), collected from open-source projects. The source file, project, and license for each example are noted above its code. You may also want to check out the other available functions and classes of the torch module.
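Before the examples, a quick orientation: torch.hann_window(window_length) returns a 1-D tensor holding the Hann window. The sketch below (not from any of the projects that follow) checks the two behaviors that matter most in practice, the textbook formula and the periodic flag:

import math
import torch

N = 8
w = torch.hann_window(N, periodic=False)  # symmetric window of length N

# matches the textbook definition w[n] = 0.5 * (1 - cos(2*pi*n / (N - 1)))
n = torch.arange(N, dtype=w.dtype)
assert torch.allclose(w, 0.5 * (1 - torch.cos(2 * math.pi * n / (N - 1))))

# periodic=True (the default) is the form meant for STFT analysis: it equals
# the symmetric window of length N + 1 with the last sample dropped
assert torch.allclose(torch.hann_window(N),
                      torch.hann_window(N + 1, periodic=False)[:-1])

Most of the examples below call it with a single argument, i.e. with the periodic default, which is appropriate when the window feeds torch.stft.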
Example #1
Source File: transforms.py From audio with BSD 2-Clause "Simplified" License

def __init__(self, n_fft: int = 400, win_length: Optional[int] = None,
             hop_length: Optional[int] = None, pad: int = 0,
             window_fn: Callable[..., Tensor] = torch.hann_window,
             power: Optional[float] = 2., normalized: bool = False,
             wkwargs: Optional[dict] = None) -> None:
    super(Spectrogram, self).__init__()
    self.n_fft = n_fft
    # number of FFT bins. the returned STFT result will have n_fft // 2 + 1
    # number of frequencies due to onesided=True in torch.stft
    self.win_length = win_length if win_length is not None else n_fft
    self.hop_length = hop_length if hop_length is not None else self.win_length // 2
    window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
    self.register_buffer('window', window)
    self.pad = pad
    self.power = power
    self.normalized = normalized
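The comment about n_fft // 2 + 1 frequency bins is easy to verify directly with torch.stft; a minimal sketch (independent of torchaudio, and assuming a PyTorch recent enough to support return_complex):

import torch

n_fft = 400
waveform = torch.randn(1, 2000)
window = torch.hann_window(n_fft)
spec = torch.stft(waveform, n_fft=n_fft, hop_length=n_fft // 2,
                  window=window, return_complex=True)
assert spec.shape[-2] == n_fft // 2 + 1  # onesided defaults to True for real input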
Example #2
Source File: kaldi.py From audio with BSD 2-Clause "Simplified" License

def _feature_window_function(window_type: str,
                             window_size: int,
                             blackman_coeff: float,
                             device: torch.device,
                             dtype: int,
                             ) -> Tensor:
    r"""Returns a window function with the given type and size
    """
    if window_type == HANNING:
        return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype)
    elif window_type == HAMMING:
        return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype)
    elif window_type == POVEY:
        # like hanning but goes to zero at edges
        return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85)
    elif window_type == RECTANGULAR:
        return torch.ones(window_size, device=device, dtype=dtype)
    elif window_type == BLACKMAN:
        a = 2 * math.pi / (window_size - 1)
        window_function = torch.arange(window_size, device=device, dtype=dtype)
        # can't use torch.blackman_window as they use different coefficients
        return (blackman_coeff - 0.5 * torch.cos(a * window_function)
                + (0.5 - blackman_coeff) * torch.cos(2 * a * window_function)).to(device=device, dtype=dtype)
    else:
        raise Exception('Invalid window type ' + window_type)
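On the "different coefficients" comment: torch.blackman_window uses the classic coefficients (0.42, 0.5, 0.08), which corresponds exactly to blackman_coeff = 0.42 in the Kaldi-style formula above; the explicit formula is needed for other values of blackman_coeff. A sketch of that equivalence, assuming the coefficients documented for torch.blackman_window:

import math
import torch

window_size = 16
blackman_coeff = 0.42  # the classic Blackman value
a = 2 * math.pi / (window_size - 1)
n = torch.arange(window_size, dtype=torch.float64)
kaldi_style = (blackman_coeff - 0.5 * torch.cos(a * n)
               + (0.5 - blackman_coeff) * torch.cos(2 * a * n))
builtin = torch.blackman_window(window_size, periodic=False, dtype=torch.float64)
assert torch.allclose(kaldi_style, builtin)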
Example #3
Source File: audio_preprocessing.py From NeMo with Apache License 2.0

def __init__(self, win_length, hop_length):
    super().__init__()
    self.win_length = win_length
    self.hop_length = hop_length
    self.disable_casts = self._opt_level == Optimization.mxprO1
    self.torch_windows = {
        'hann': torch.hann_window,
        'hamming': torch.hamming_window,
        'blackman': torch.blackman_window,
        'bartlett': torch.bartlett_window,
        'ones': torch.ones,
        None: torch.ones,
    }
Example #4
Source File: stft.py From cnn_vocoder with MIT License

def __init__(self, win_length=1024, hop_length=256, n_fft=2048, n_mels=80, preemp=True):
    super(MelSpectrogram, self).__init__()
    if preemp:
        self.preemp = nn.Conv1d(1, 1, 2, bias=False, padding=1)
        self.preemp.weight.data[0][0][0] = -0.97
        self.preemp.weight.data[0][0][1] = 1.0
        self.preemp.weight.requires_grad = False
    else:
        self.preemp = None
    self.register_buffer('mel_basis', _build_mel_basis(n_fft, n_mels))
    win = torch.hann_window(win_length)
    self.register_buffer('win', win)
    self.win_length = win_length
    self.hop_length = hop_length
    self.n_fft = n_fft
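The two-tap Conv1d above is a pre-emphasis filter, y[t] = x[t] - 0.97 * x[t-1]. A small sketch (separate from cnn_vocoder) showing why the kernel [-0.97, 1.0] with padding=1 computes exactly that:

import torch
import torch.nn as nn

x = torch.randn(1, 1, 10)  # (batch, channels, time)
preemp = nn.Conv1d(1, 1, 2, bias=False, padding=1)
preemp.weight.data[0][0][0] = -0.97  # tap applied to x[t-1]
preemp.weight.data[0][0][1] = 1.0    # tap applied to x[t]
y = preemp(x)  # length T + 1 because of the symmetric zero padding

delayed = torch.nn.functional.pad(x, (1, 0))[..., :-1]  # x shifted right by one sample
assert torch.allclose(y[..., :x.shape[-1]], x - 0.97 * delayed)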
Example #5
Source File: test_librosa_compatibility.py From audio with BSD 2-Clause "Simplified" License

def test_griffinlim(self):
    # NOTE: This test is flaky without a fixed random seed
    # See https://github.com/pytorch/audio/issues/382
    torch.random.manual_seed(42)
    tensor = torch.rand((1, 1000))

    n_fft = 400
    ws = 400
    hop = 100
    window = torch.hann_window(ws)
    normalize = False
    momentum = 0.99
    n_iter = 8
    length = 1000
    rand_init = False
    init = 'random' if rand_init else None

    specgram = F.spectrogram(tensor, 0, window, n_fft, hop, ws, 2, normalize).sqrt()
    ta_out = F.griffinlim(specgram, window, n_fft, hop, ws, 1, normalize,
                          n_iter, momentum, length, rand_init)
    lr_out = librosa.griffinlim(specgram.squeeze(0).numpy(), n_iter=n_iter,
                                hop_length=hop, momentum=momentum, init=init,
                                length=length)
    lr_out = torch.from_numpy(lr_out).unsqueeze(0)

    self.assertEqual(ta_out, lr_out, atol=5e-5, rtol=1e-5)
Example #6
Source File: torchscript_consistency_impl.py From audio with BSD 2-Clause "Simplified" License

def test_griffinlim(self):
    def func(tensor):
        n_fft = 400
        ws = 400
        hop = 200
        window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
        power = 2.
        normalize = False
        momentum = 0.99
        n_iter = 32
        length = 1000
        rand_int = False
        return F.griffinlim(tensor, window, n_fft, hop, ws, power, normalize,
                            n_iter, momentum, length, rand_int)

    tensor = torch.rand((1, 201, 6))
    self._assert_consistency(func, tensor)
Example #7
Source File: utils.py From freesound-classification with Apache License 2.0

def compute_torch_stft(audio, descriptor):
    name, *args = descriptor.split("_")
    n_fft, hop_size, *rest = args
    n_fft = int(n_fft)
    hop_size = int(hop_size)

    stft = torch.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_size,
        window=torch.hann_window(n_fft, device=audio.device)
    )

    stft = torch.sqrt((stft ** 2).sum(-1))

    return stft
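The descriptor is a compact string encoding of the transform, e.g. "stft_1024_256" (a hypothetical value; the real naming scheme comes from the project's configuration), parsed into n_fft=1024 and hop_size=256. Note the snippet assumes the older torch.stft behavior of returning real and imaginary parts stacked in the last dimension, which is why the magnitude is taken as the square root of the squared sum over that axis:

# hypothetical usage; `audio` is a waveform tensor on any device
magnitudes = compute_torch_stft(audio, "stft_1024_256")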
Example #8
Source File: modules.py From ddsp_pytorch with GNU General Public License v3.0

def get_window(name, window_length, squared=False):
    """
    Returns a windowing function.

    Arguments:
    ----------
        name (str)              : name of the window, one of 'hann', 'hamming' or 'blackman'
        window_length (int)     : length of the window
        squared (bool)          : if true, square the window

    Returns:
    ----------
        torch.FloatTensor       : window of size `window_length`
    """
    if name == "hann":
        window = torch.hann_window(window_length)
    elif name == "hamming":
        window = torch.hamming_window(window_length)
    elif name == "blackman":
        window = torch.blackman_window(window_length)
    else:
        raise ValueError("Invalid window name {}".format(name))
    if squared:
        window *= window
    return window
Example #9
Source File: model.py From open-unmix-pytorch with MIT License

def __init__(
    self,
    n_fft=4096,
    n_hop=1024,
    center=False
):
    super(STFT, self).__init__()
    self.window = nn.Parameter(
        torch.hann_window(n_fft),
        requires_grad=False
    )
    self.n_fft = n_fft
    self.n_hop = n_hop
    self.center = center
Example #10
Source File: modules.py From melgan-neurips with MIT License

def __init__(
    self,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    sampling_rate=22050,
    n_mel_channels=80,
    mel_fmin=0.0,
    mel_fmax=None,
):
    super().__init__()
    ##############################################
    # FFT Parameters                             #
    ##############################################
    window = torch.hann_window(win_length).float()
    mel_basis = librosa_mel_fn(
        sampling_rate, n_fft, n_mel_channels, mel_fmin, mel_fmax
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer("mel_basis", mel_basis)
    self.register_buffer("window", window)
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.win_length = win_length
    self.sampling_rate = sampling_rate
    self.n_mel_channels = n_mel_channels
Example #11
Source File: features.py From training with Apache License 2.0

def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
             n_fft=None, window="hamming", normalize="per_feature", log=True,
             center=True, dither=constant, pad_to=8, max_duration=16.7,
             frame_splicing=1):
    super(SpectrogramFeatures, self).__init__()
    torch_windows = {
        'hann': torch.hann_window,
        'hamming': torch.hamming_window,
        'blackman': torch.blackman_window,
        'bartlett': torch.bartlett_window,
        'none': None,
    }

    self.win_length = int(sample_rate * window_size)
    self.hop_length = int(sample_rate * window_stride)
    self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))

    window_fn = torch_windows.get(window, None)
    window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None
    self.window = window_tensor

    self.normalize = normalize
    self.log = log
    self.center = center
    self.dither = dither
    self.pad_to = pad_to
    self.frame_splicing = frame_splicing

    max_length = 1 + math.ceil(
        (max_duration * sample_rate - self.win_length) / self.hop_length
    )
    max_pad = 16 - (max_length % 16)
    self.max_length = max_length + max_pad
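With these defaults the sizes work out as follows (a quick arithmetic check of the power-of-two rounding):

import math

sample_rate, window_size, window_stride = 8000, 0.02, 0.01
win_length = int(sample_rate * window_size)    # 160 samples (20 ms)
hop_length = int(sample_rate * window_stride)  # 80 samples (10 ms)
n_fft = 2 ** math.ceil(math.log2(win_length))  # 256, next power of two >= 160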
Example #12
Source File: modules.py From ClariNet with MIT License

def stft(y, scale='linear'):
    D = torch.stft(y, n_fft=1024, hop_length=256, win_length=1024)  # , window=torch.hann_window(1024).cuda())
    D = torch.sqrt(D.pow(2).sum(-1) + 1e-10)
    # D = torch.sqrt(torch.clamp(D.pow(2).sum(-1), min=1e-10))
    if scale == 'linear':
        return D
    elif scale == 'log':
        S = 2 * torch.log(torch.clamp(D, 1e-10, float("inf")))
        return S
    else:
        pass
Example #13
Source File: modules.py From WaveVAE with MIT License

def stft(y):
    D = torch.stft(y, n_fft=1024, hop_length=256, win_length=1024,
                   window=torch.hann_window(1024).cuda())
    D = torch.sqrt(D.pow(2).sum(-1) + 1e-10)
    S = 2 * torch.log(torch.clamp(D, 1e-10, float("inf")))
    return D, S
Example #14
Source File: stft.py From cnn_vocoder with MIT License

def __init__(self, win_length=1024, hop_length=256, n_fft=2048, preemp=True):
    super(Spectrogram, self).__init__()
    if preemp:
        self.preemp = nn.Conv1d(1, 1, 2, bias=False, padding=1)
        self.preemp.weight.data[0][0][0] = -0.97
        self.preemp.weight.data[0][0][1] = 1.0
        self.preemp.weight.requires_grad = False
    else:
        self.preemp = None
    win = torch.hann_window(win_length)
    self.register_buffer('win', win)
    self.win_length = win_length
    self.hop_length = hop_length
    self.n_fft = n_fft
Example #15
Source File: loss.py From cnn_vocoder with MIT License

def compute_stft(audio, n_fft=1024, win_length=1024, hop_length=256):
    """
    Computes STFT transformation of given audio

    Args:
        audio (Tensor): B x T, batch of audio

    Returns:
        mag (Tensor): STFT magnitudes
        real (Tensor): Real part of STFT transformation result
        im (Tensor): Imaginary part of STFT transformation result
    """
    win = torch.hann_window(win_length).cuda()

    # add some padding because torch 0.4's stft doesn't pad the signal itself
    signal_dim = audio.dim()
    extended_shape = [1] * (3 - signal_dim) + list(audio.size())
    # pad = int(self.n_fft // 2)
    pad = win_length
    audio = F.pad(audio.view(extended_shape), (pad, pad), 'constant')
    audio = audio.view(audio.shape[-signal_dim:])

    stft = torch.stft(audio, win_length, hop_length, fft_size=n_fft, window=win)
    real = stft[:, :, :, 0]
    im = stft[:, :, :, 1]
    power = torch.sqrt(torch.pow(real, 2) + torch.pow(im, 2))
    return power, real, im
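This snippet targets the old torch 0.4 signature of torch.stft (note the fft_size keyword and the manual padding). On a recent PyTorch the same magnitudes could be computed roughly as follows; a sketch, not the project's code:

import torch

def compute_stft_modern(audio, n_fft=1024, win_length=1024, hop_length=256):
    win = torch.hann_window(win_length, device=audio.device)
    spec = torch.stft(audio, n_fft=n_fft, hop_length=hop_length,
                      win_length=win_length, window=win,
                      center=True, return_complex=True)
    return spec.abs(), spec.real, spec.imag  # magnitude, real part, imaginary part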
Example #16
Source File: generators.py From ddsp_pytorch with GNU General Public License v3.0

def __init__(self, filter_size, block_size):
    super(Generator, self).__init__()
    self.apply(self.init_parameters)
    self.block_size = block_size
    self.filter_size = filter_size
    self.noise_att = 1e-4
    self.filter_window = nn.Parameter(torch.hann_window(filter_size).roll(filter_size // 2, -1),
                                      requires_grad=False)
    self.filter_coef = None
Example #17
Source File: vocoder_old2.py From TTS-Cube with Apache License 2.0

def _power_loss(self, p_y, t_y):
    fft_orig = torch.stft(t_y.reshape(t_y.shape[0]), n_fft=512,
                          window=torch.hann_window(window_length=512).to(device))
    fft_pred = torch.stft(p_y.reshape(p_y.shape[0]), n_fft=512,
                          window=torch.hann_window(window_length=512).to(device))
    real_orig = fft_orig[:, :, 0]
    im_org = fft_orig[:, :, 1]
    power_orig = torch.sqrt(torch.pow(real_orig, 2) + torch.pow(im_org, 2))
    real_pred = fft_pred[:, :, 0]
    im_pred = fft_pred[:, :, 1]
    power_pred = torch.sqrt(torch.pow(real_pred, 2) + torch.pow(im_pred, 2))
    return torch.sum(torch.pow(torch.norm(torch.abs(power_pred) - torch.abs(power_orig), p=2, dim=1), 2)) / (
        power_pred.shape[0] * power_pred.shape[1])
Example #18
Source File: modules.py From TTS-Cube with Apache License 2.0

def stft(y, scale='linear'):
    D = torch.stft(y, n_fft=1024, hop_length=256, win_length=1024,
                   window=torch.hann_window(1024).cuda())
    D = torch.sqrt(D.pow(2).sum(-1) + 1e-10)
    # D = torch.sqrt(torch.clamp(D.pow(2).sum(-1), min=1e-10))
    if scale == 'linear':
        return D
    elif scale == 'log':
        S = 2 * torch.log(torch.clamp(D, 1e-10, float("inf")))
        return S
    else:
        pass

# STFT code is adapted from: https://github.com/pseeth/pytorch-stft
Example #19
Source File: reconstruct.py From MelNet with MIT License

def __init__(self, hp):
    self.hp = hp
    self.window = torch.hann_window(window_length=hp.audio.win_length).cuda()
    self.mel_basis = librosa.filters.mel(
        sr=hp.audio.sr,
        n_fft=hp.audio.n_fft,
        n_mels=hp.audio.n_mels
    )
    self.mel_basis = torch.from_numpy(self.mel_basis).cuda()  # [n_mels, n_fft//2+1]
    self.criterion = torch.nn.MSELoss()
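Neither the STFT nor the mel projection happens in this __init__; a hedged sketch of how the two buffers would typically be combined downstream (not necessarily MelNet's exact code, and hp.audio.hop_length is an assumed config field):

# audio: (batch, time) tensor on the same CUDA device as the buffers
spec = torch.stft(audio, n_fft=self.hp.audio.n_fft,
                  hop_length=self.hp.audio.hop_length,
                  win_length=self.hp.audio.win_length,
                  window=self.window, return_complex=True).abs()
mel = torch.matmul(self.mel_basis, spec)  # (batch, n_mels, frames)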
Example #20
Source File: torchscript_consistency_impl.py From audio with BSD 2-Clause "Simplified" License

def test_spectrogram(self):
    def func(tensor):
        n_fft = 400
        ws = 400
        hop = 200
        pad = 0
        window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
        power = 2.
        normalize = False
        return F.spectrogram(tensor, pad, window, n_fft, hop, ws, power, normalize)

    tensor = common_utils.get_whitenoise()
    self._assert_consistency(func, tensor)
Example #21
Source File: functional_cpu_test.py From audio with BSD 2-Clause "Simplified" License

def test_linearity_of_istft2(self):
    # hann_window, centered, not normalized, not onesided
    kwargs2 = {
        'n_fft': 12,
        'window': torch.hann_window(12),
        'center': True,
        'pad_mode': 'reflect',
        'normalized': False,
        'onesided': False,
    }
    data_size = (2, 12, 7, 2)
    self._test_linearity_of_istft(data_size, kwargs2)
Example #22
Source File: functional_cpu_test.py From audio with BSD 2-Clause "Simplified" License

def test_linearity_of_istft1(self):
    # hann_window, centered, normalized, onesided
    kwargs1 = {
        'n_fft': 12,
        'window': torch.hann_window(12),
        'center': True,
        'pad_mode': 'reflect',
        'normalized': True,
        'onesided': True,
    }
    data_size = (2, 7, 7, 2)
    self._test_linearity_of_istft(data_size, kwargs1)
Example #23
Source File: functional_cpu_test.py From audio with BSD 2-Clause "Simplified" License

def test_istft_is_inverse_of_stft2(self):
    # hann_window, centered, not normalized, not onesided
    kwargs2 = {
        'n_fft': 12,
        'hop_length': 2,
        'win_length': 8,
        'window': torch.hann_window(8),
        'center': True,
        'pad_mode': 'reflect',
        'normalized': False,
        'onesided': False,
    }
    _test_istft_is_inverse_of_stft(kwargs2)
Example #24
Source File: functional_cpu_test.py From audio with BSD 2-Clause "Simplified" License

def test_istft_is_inverse_of_stft1(self):
    # hann_window, centered, normalized, onesided
    kwargs1 = {
        'n_fft': 12,
        'hop_length': 4,
        'win_length': 12,
        'window': torch.hann_window(12),
        'center': True,
        'pad_mode': 'reflect',
        'normalized': True,
        'onesided': True,
    }
    _test_istft_is_inverse_of_stft(kwargs1)
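The tests above check istft/stft inversion under different window and normalization settings; the same round-trip property can be seen with the core torch.stft/torch.istft pair whenever the Hann window and hop satisfy the NOLA constraint, as in this sketch:

import torch

x = torch.randn(2, 1000)
window = torch.hann_window(400)
spec = torch.stft(x, n_fft=400, hop_length=100, window=window, return_complex=True)
x_rec = torch.istft(spec, n_fft=400, hop_length=100, window=window, length=x.shape[-1])
assert torch.allclose(x, x_rec, atol=1e-5)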
Example #25
Source File: test_batch_consistency.py From audio with BSD 2-Clause "Simplified" License

def test_griffinlim(self):
    n_fft = 400
    ws = 400
    hop = 200
    window = torch.hann_window(ws)
    power = 2
    normalize = False
    momentum = 0.99
    n_iter = 32
    length = 1000
    tensor = torch.rand((1, 201, 6))
    self.assert_batch_consistencies(
        F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize,
        n_iter, momentum, length, 0, atol=5e-5
    )
Example #26
Source File: transforms.py From audio with BSD 2-Clause "Simplified" License

def __init__(self, sample_rate: int = 16000, n_fft: int = 400,
             win_length: Optional[int] = None, hop_length: Optional[int] = None,
             f_min: float = 0., f_max: Optional[float] = None, pad: int = 0,
             n_mels: int = 128, window_fn: Callable[..., Tensor] = torch.hann_window,
             power: Optional[float] = 2., normalized: bool = False,
             wkwargs: Optional[dict] = None) -> None:
    super(MelSpectrogram, self).__init__()
    self.sample_rate = sample_rate
    self.n_fft = n_fft
    self.win_length = win_length if win_length is not None else n_fft
    self.hop_length = hop_length if hop_length is not None else self.win_length // 2
    self.pad = pad
    self.power = power
    self.normalized = normalized
    self.n_mels = n_mels  # number of mel frequency bins
    self.f_max = f_max
    self.f_min = f_min
    self.spectrogram = Spectrogram(n_fft=self.n_fft, win_length=self.win_length,
                                   hop_length=self.hop_length, pad=self.pad,
                                   window_fn=window_fn, power=self.power,
                                   normalized=self.normalized, wkwargs=wkwargs)
    self.mel_scale = MelScale(self.n_mels, self.sample_rate, self.f_min, self.f_max,
                              self.n_fft // 2 + 1)
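In use, the transform chains the Spectrogram and MelScale stages; a typical invocation looks like this (shapes follow the torchaudio convention of (channel, n_mels, time)):

import torch
import torchaudio

waveform = torch.randn(1, 16000)  # one second of audio at 16 kHz
melspec = torchaudio.transforms.MelSpectrogram(sample_rate=16000)(waveform)
# melspec.shape == (1, 128, frames)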
Example #27
Source File: transforms.py From audio with BSD 2-Clause "Simplified" License

def __init__(self, n_fft: int = 400, n_iter: int = 32,
             win_length: Optional[int] = None, hop_length: Optional[int] = None,
             window_fn: Callable[..., Tensor] = torch.hann_window, power: float = 2.,
             normalized: bool = False, wkwargs: Optional[dict] = None,
             momentum: float = 0.99, length: Optional[int] = None,
             rand_init: bool = True) -> None:
    super(GriffinLim, self).__init__()

    assert momentum < 1, 'momentum=%s > 1 can be unstable' % momentum
    assert momentum > 0, 'momentum=%s < 0' % momentum

    self.n_fft = n_fft
    self.n_iter = n_iter
    self.win_length = win_length if win_length is not None else n_fft
    self.hop_length = hop_length if hop_length is not None else self.win_length // 2
    window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
    self.register_buffer('window', window)
    self.normalized = normalized
    self.length = length
    self.power = power
    self.momentum = momentum / (1 + momentum)
    self.rand_init = rand_init
Example #28
Source File: features.py From inference with Apache License 2.0

def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
             window="hamming", normalize="per_feature", n_fft=None,
             preemph=0.97, nfilt=64, lowfreq=0, highfreq=None, log=True,
             dither=constant, pad_to=8, max_duration=16.7, frame_splicing=1):
    super(FilterbankFeatures, self).__init__()
    # print("PADDING: {}".format(pad_to))

    torch_windows = {
        'hann': torch.hann_window,
        'hamming': torch.hamming_window,
        'blackman': torch.blackman_window,
        'bartlett': torch.bartlett_window,
        'none': None,
    }

    self.win_length = int(sample_rate * window_size)  # frame size
    self.hop_length = int(sample_rate * window_stride)
    self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))

    self.normalize = normalize
    self.log = log
    self.dither = dither
    self.frame_splicing = frame_splicing
    self.nfilt = nfilt
    self.preemph = preemph
    self.pad_to = pad_to
    # For now, always enable this.
    # See https://docs.google.com/presentation/d/1IVC3J-pHB-ipJpKsJox_SqmDHYdkIaoCXTbKmJmV2-I/edit?usp=sharing
    # for elaboration
    self.use_deterministic_dithering = True

    highfreq = highfreq or sample_rate / 2
    window_fn = torch_windows.get(window, None)
    window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None
    filterbanks = torch.tensor(
        librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt,
                            fmin=lowfreq, fmax=highfreq),
        dtype=torch.float).unsqueeze(0)
    # self.fb = filterbanks
    # self.window = window_tensor
    self.register_buffer("fb", filterbanks)
    self.register_buffer("window", window_tensor)

    # Calculate maximum sequence length (# frames)
    max_length = 1 + math.ceil(
        (max_duration * sample_rate - self.win_length) / self.hop_length
    )
    max_pad = 16 - (max_length % 16)
    self.max_length = max_length + max_pad
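For the defaults above, the maximum-frame computation works out as follows (an arithmetic sketch; the exact count can shift by one with floating-point rounding of max_duration * sample_rate):

import math

sample_rate, max_duration = 8000, 16.7
win_length, hop_length = 160, 80  # 20 ms window, 10 ms hop at 8 kHz
max_length = 1 + math.ceil((max_duration * sample_rate - win_length) / hop_length)
# = 1 + ceil((133600 - 160) / 80) = 1669 frames
max_pad = 16 - (max_length % 16)  # 11, rounding the frame count up to 1680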
Example #29
Source File: test_librosa_compatibility.py From audio with BSD 2-Clause "Simplified" License

def test_InverseMelScale(self):
    """InverseMelScale transform is comparable to that of librosa"""
    n_fft = 2048
    n_mels = 256
    n_stft = n_fft // 2 + 1
    hop_length = n_fft // 4

    # Prepare mel spectrogram input. We use torchaudio to compute one.
    common_utils.set_audio_backend('default')
    sound, sample_rate = _load_audio_asset(
        'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
    sound = sound.mean(dim=0, keepdim=True)
    spec_orig = F.spectrogram(
        sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
        hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
    melspec_ta = torchaudio.transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_orig)
    melspec_lr = melspec_ta.cpu().numpy().squeeze()

    # Perform InverseMelScale with torchaudio and librosa
    spec_ta = torchaudio.transforms.InverseMelScale(
        n_stft, n_mels=n_mels, sample_rate=sample_rate)(melspec_ta)
    spec_lr = librosa.feature.inverse.mel_to_stft(
        melspec_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True, norm=None)
    spec_lr = torch.from_numpy(spec_lr[None, ...])

    # Align dimensions:
    # librosa does not return a power spectrogram, while torchaudio does
    spec_orig = spec_orig.sqrt()
    spec_ta = spec_ta.sqrt()

    threshold = 2.0
    # This threshold was chosen empirically, based on the following observation
    #
    # torch.dist(spec_lr, spec_ta, p=float('inf'))
    # >>> tensor(1.9666)
    #
    # The spectrograms reconstructed by librosa and torchaudio are not comparable elementwise.
    # This is because they use different approximation algorithms and the resulting values can
    # live in different magnitudes (although most of them are very close).
    # See
    # https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
    # https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
    # distance over frequencies.
    self.assertEqual(spec_ta, spec_lr, atol=threshold, rtol=1e-5)

    threshold = 1700.0
    # This threshold was chosen empirically, based on the following observations
    #
    # torch.dist(spec_orig, spec_ta, p=1)
    # >>> tensor(1644.3516)
    # torch.dist(spec_orig, spec_lr, p=1)
    # >>> tensor(1420.7103)
    # torch.dist(spec_lr, spec_ta, p=1)
    # >>> tensor(943.2759)
    assert torch.dist(spec_orig, spec_ta, p=1) < threshold
Example #30
Source File: features.py From training with Apache License 2.0

def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
             window="hamming", normalize="per_feature", n_fft=None,
             preemph=0.97, nfilt=64, lowfreq=0, highfreq=None, log=True,
             dither=constant, pad_to=8, max_duration=16.7, frame_splicing=1):
    super(FilterbankFeatures, self).__init__()
    # print("PADDING: {}".format(pad_to))

    torch_windows = {
        'hann': torch.hann_window,
        'hamming': torch.hamming_window,
        'blackman': torch.blackman_window,
        'bartlett': torch.bartlett_window,
        'none': None,
    }

    self.win_length = int(sample_rate * window_size)  # frame size
    self.hop_length = int(sample_rate * window_stride)
    self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))

    self.normalize = normalize
    self.log = log
    self.dither = dither
    self.frame_splicing = frame_splicing
    self.nfilt = nfilt
    self.preemph = preemph
    self.pad_to = pad_to

    highfreq = highfreq or sample_rate / 2
    window_fn = torch_windows.get(window, None)
    window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None
    filterbanks = torch.tensor(
        librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt,
                            fmin=lowfreq, fmax=highfreq),
        dtype=torch.float).unsqueeze(0)
    # self.fb = filterbanks
    # self.window = window_tensor
    self.register_buffer("fb", filterbanks)
    self.register_buffer("window", window_tensor)

    # Calculate maximum sequence length (# frames)
    max_length = 1 + math.ceil(
        (max_duration * sample_rate - self.win_length) / self.hop_length
    )
    max_pad = 16 - (max_length % 16)
    self.max_length = max_length + max_pad