Python torch.hann_window() Examples

The following are 30 code examples of torch.hann_window(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch , or try the search function .
Example #1
Source File: transforms.py    From audio with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def __init__(self,
                 n_fft: int = 400,
                 win_length: Optional[int] = None,
                 hop_length: Optional[int] = None,
                 pad: int = 0,
                 window_fn: Callable[..., Tensor] = torch.hann_window,
                 power: Optional[float] = 2.,
                 normalized: bool = False,
                 wkwargs: Optional[dict] = None) -> None:
        super(Spectrogram, self).__init__()
        self.n_fft = n_fft
        # number of FFT bins. the returned STFT result will have n_fft // 2 + 1
        # number of frequecies due to onesided=True in torch.stft
        self.win_length = win_length if win_length is not None else n_fft
        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
        window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
        self.register_buffer('window', window)
        self.pad = pad
        self.power = power
        self.normalized = normalized 
Example #2
Source File: kaldi.py    From audio with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def _feature_window_function(window_type: str,
                             window_size: int,
                             blackman_coeff: float,
                             device: torch.device,
                             dtype: int,
                             ) -> Tensor:
    r"""Returns a window function with the given type and size
    """
    if window_type == HANNING:
        return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype)
    elif window_type == HAMMING:
        return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype)
    elif window_type == POVEY:
        # like hanning but goes to zero at edges
        return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85)
    elif window_type == RECTANGULAR:
        return torch.ones(window_size, device=device, dtype=dtype)
    elif window_type == BLACKMAN:
        a = 2 * math.pi / (window_size - 1)
        window_function = torch.arange(window_size, device=device, dtype=dtype)
        # can't use torch.blackman_window as they use different coefficients
        return (blackman_coeff - 0.5 * torch.cos(a * window_function) +
                (0.5 - blackman_coeff) * torch.cos(2 * a * window_function)).to(device=device, dtype=dtype)
    else:
        raise Exception('Invalid window type ' + window_type) 
Example #3
Source File: audio_preprocessing.py    From NeMo with Apache License 2.0 6 votes vote down vote up
def __init__(self, win_length, hop_length):
        super().__init__()

        self.win_length = win_length
        self.hop_length = hop_length

        self.disable_casts = self._opt_level == Optimization.mxprO1

        self.torch_windows = {
            'hann': torch.hann_window,
            'hamming': torch.hamming_window,
            'blackman': torch.blackman_window,
            'bartlett': torch.bartlett_window,
            'ones': torch.ones,
            None: torch.ones,
        } 
Example #4
Source File: stft.py    From cnn_vocoder with MIT License 6 votes vote down vote up
def __init__(self, win_length=1024, hop_length=256, n_fft=2048, n_mels=80, preemp=True):
        super(MelSpectrogram, self).__init__()
        if preemp:
            self.preemp = nn.Conv1d(1, 1, 2, bias=False, padding=1)
            self.preemp.weight.data[0][0][0] = -0.97
            self.preemp.weight.data[0][0][1] = 1.0
            self.preemp.weight.requires_grad = False
        else:
            self.preemp = None

        self.register_buffer('mel_basis', _build_mel_basis(n_fft, n_mels))

        win = torch.hann_window(win_length)
        self.register_buffer('win', win)

        self.win_length = win_length
        self.hop_length = hop_length
        self.n_fft = n_fft 
Example #5
Source File: test_librosa_compatibility.py    From audio with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def test_griffinlim(self):
        # NOTE: This test is flaky without a fixed random seed
        # See https://github.com/pytorch/audio/issues/382
        torch.random.manual_seed(42)
        tensor = torch.rand((1, 1000))

        n_fft = 400
        ws = 400
        hop = 100
        window = torch.hann_window(ws)
        normalize = False
        momentum = 0.99
        n_iter = 8
        length = 1000
        rand_init = False
        init = 'random' if rand_init else None

        specgram = F.spectrogram(tensor, 0, window, n_fft, hop, ws, 2, normalize).sqrt()
        ta_out = F.griffinlim(specgram, window, n_fft, hop, ws, 1, normalize,
                              n_iter, momentum, length, rand_init)
        lr_out = librosa.griffinlim(specgram.squeeze(0).numpy(), n_iter=n_iter, hop_length=hop,
                                    momentum=momentum, init=init, length=length)
        lr_out = torch.from_numpy(lr_out).unsqueeze(0)

        self.assertEqual(ta_out, lr_out, atol=5e-5, rtol=1e-5) 
Example #6
Source File: torchscript_consistency_impl.py    From audio with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def test_griffinlim(self):
        def func(tensor):
            n_fft = 400
            ws = 400
            hop = 200
            window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
            power = 2.
            normalize = False
            momentum = 0.99
            n_iter = 32
            length = 1000
            rand_int = False
            return F.griffinlim(tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, rand_int)

        tensor = torch.rand((1, 201, 6))
        self._assert_consistency(func, tensor) 
Example #7
Source File: utils.py    From freesound-classification with Apache License 2.0 6 votes vote down vote up
def compute_torch_stft(audio, descriptor):

    name, *args = descriptor.split("_")

    n_fft, hop_size, *rest = args
    n_fft = int(n_fft)
    hop_size = int(hop_size)

    stft = torch.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_size,
        window=torch.hann_window(n_fft, device=audio.device)
    )

    stft = torch.sqrt((stft ** 2).sum(-1))

    return stft 
Example #8
Source File: modules.py    From ddsp_pytorch with GNU General Public License v3.0 6 votes vote down vote up
def get_window(name, window_length, squared=False):
    """
    Returns a windowing function.
    
    Arguments:
    ----------
        window (str)                : name of the window, currently only 'hann' is available
        window_length (int)         : length of the window
        squared (bool)              : if true, square the window
        
    Returns:
    ----------
        torch.FloatTensor           : window of size `window_length`
    """
    if name == "hann":
        window = torch.hann_window(window_length)
    elif name == "hamming":
        window = torch.hamming_window(window_length)
    elif name == "blackman":
        window = torch.blackman_window(window_length)
    else:
        raise ValueError("Invalid window name {}".format(name))
    if squared:
        window *= window
    return window 
Example #9
Source File: model.py    From open-unmix-pytorch with MIT License 5 votes vote down vote up
def __init__(
        self,
        n_fft=4096,
        n_hop=1024,
        center=False
    ):
        super(STFT, self).__init__()
        self.window = nn.Parameter(
            torch.hann_window(n_fft),
            requires_grad=False
        )
        self.n_fft = n_fft
        self.n_hop = n_hop
        self.center = center 
Example #10
Source File: modules.py    From melgan-neurips with MIT License 5 votes vote down vote up
def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        sampling_rate=22050,
        n_mel_channels=80,
        mel_fmin=0.0,
        mel_fmax=None,
    ):
        super().__init__()
        ##############################################
        # FFT Parameters                              #
        ##############################################
        window = torch.hann_window(win_length).float()
        mel_basis = librosa_mel_fn(
            sampling_rate, n_fft, n_mel_channels, mel_fmin, mel_fmax
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.register_buffer("window", window)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels 
Example #11
Source File: features.py    From training with Apache License 2.0 5 votes vote down vote up
def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
                       n_fft=None,
                       window="hamming", normalize="per_feature", log=True, center=True,
                       dither=constant, pad_to=8, max_duration=16.7,
                       frame_splicing=1):
        super(SpectrogramFeatures, self).__init__()
        torch_windows = {
            'hann': torch.hann_window,
            'hamming': torch.hamming_window,
            'blackman': torch.blackman_window,
            'bartlett': torch.bartlett_window,
            'none': None,
        }
        self.win_length = int(sample_rate * window_size)
        self.hop_length = int(sample_rate * window_stride)
        self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))

        window_fn = torch_windows.get(window, None)
        window_tensor = window_fn(self.win_length,
                                  periodic=False) if window_fn else None
        self.window = window_tensor

        self.normalize = normalize
        self.log = log
        self.center = center
        self.dither = dither
        self.pad_to = pad_to
        self.frame_splicing = frame_splicing

        max_length = 1 + math.ceil(
                (max_duration * sample_rate - self.win_length) / self.hop_length
        )
        max_pad = 16 - (max_length % 16)
        self.max_length = max_length + max_pad 
Example #12
Source File: modules.py    From ClariNet with MIT License 5 votes vote down vote up
def stft(y, scale='linear'):
    D = torch.stft(y, n_fft=1024, hop_length=256, win_length=1024)#, window=torch.hann_window(1024).cuda())
    D = torch.sqrt(D.pow(2).sum(-1) + 1e-10)
    # D = torch.sqrt(torch.clamp(D.pow(2).sum(-1), min=1e-10))
    if scale == 'linear':
        return D
    elif scale == 'log':
        S = 2 * torch.log(torch.clamp(D, 1e-10, float("inf")))
        return S
    else:
        pass 
Example #13
Source File: modules.py    From WaveVAE with MIT License 5 votes vote down vote up
def stft(y):
    D = torch.stft(y, n_fft=1024, hop_length=256, win_length=1024, window=torch.hann_window(1024).cuda())
    D = torch.sqrt(D.pow(2).sum(-1) + 1e-10)
    S = 2 * torch.log(torch.clamp(D, 1e-10, float("inf")))
    return D, S 
Example #14
Source File: stft.py    From cnn_vocoder with MIT License 5 votes vote down vote up
def __init__(self, win_length=1024, hop_length=256, n_fft=2048, preemp=True):
        super(Spectrogram, self).__init__()
        if preemp:
            self.preemp = nn.Conv1d(1, 1, 2, bias=False, padding=1)
            self.preemp.weight.data[0][0][0] = -0.97
            self.preemp.weight.data[0][0][1] = 1.0
            self.preemp.weight.requires_grad = False
        else:
            self.preemp = None

        win = torch.hann_window(win_length)
        self.register_buffer('win', win)
        self.win_length = win_length
        self.hop_length = hop_length
        self.n_fft = n_fft 
Example #15
Source File: loss.py    From cnn_vocoder with MIT License 5 votes vote down vote up
def compute_stft(audio, n_fft=1024, win_length=1024, hop_length=256):
    """
    Computes STFT transformation of given audio

    Args:
        audio (Tensor): B x T, batch of audio

    Returns:
        mag (Tensor): STFT magnitudes
        real (Tensor): Real part of STFT transformation result
        im (Tensor): Imagine part of STFT transformation result
    """
    win = torch.hann_window(win_length).cuda()

    # add some padding because torch 4.0 doesn't
    signal_dim = audio.dim()
    extended_shape = [1] * (3 - signal_dim) + list(audio.size())
    # pad = int(self.n_fft // 2)
    pad = win_length
    audio = F.pad(audio.view(extended_shape), (pad, pad), 'constant')
    audio = audio.view(audio.shape[-signal_dim:])

    stft = torch.stft(audio, win_length, hop_length, fft_size=n_fft, window=win)
    real = stft[:, :, :, 0]
    im = stft[:, :, :, 1]
    power = torch.sqrt(torch.pow(real, 2) + torch.pow(im, 2))
    return power, real, im 
Example #16
Source File: generators.py    From ddsp_pytorch with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, filter_size, block_size):
        super(Generator, self).__init__()
        self.apply(self.init_parameters)
        self.block_size = block_size
        self.filter_size = filter_size
        self.noise_att = 1e-4
        self.filter_window = nn.Parameter(torch.hann_window(filter_size).roll(filter_size//2,-1),requires_grad=False)
        self.filter_coef = None 
Example #17
Source File: vocoder_old2.py    From TTS-Cube with Apache License 2.0 5 votes vote down vote up
def _power_loss(self, p_y, t_y):
        fft_orig = torch.stft(t_y.reshape(t_y.shape[0]), n_fft=512,
                              window=torch.hann_window(window_length=512).to(device))
        fft_pred = torch.stft(p_y.reshape(p_y.shape[0]), n_fft=512,
                              window=torch.hann_window(window_length=512).to(device))
        real_orig = fft_orig[:, :, 0]
        im_org = fft_orig[:, :, 1]
        power_orig = torch.sqrt(torch.pow(real_orig, 2) + torch.pow(im_org, 2))
        real_pred = fft_pred[:, :, 0]
        im_pred = fft_pred[:, :, 1]
        power_pred = torch.sqrt(torch.pow(real_pred, 2) + torch.pow(im_pred, 2))
        return torch.sum(torch.pow(torch.norm(torch.abs(power_pred) - torch.abs(power_orig), p=2, dim=1), 2)) / (
                power_pred.shape[0] * power_pred.shape[1]) 
Example #18
Source File: modules.py    From TTS-Cube with Apache License 2.0 5 votes vote down vote up
def stft(y, scale='linear'):
    D = torch.stft(y, n_fft=1024, hop_length=256, win_length=1024, window=torch.hann_window(1024).cuda())
    D = torch.sqrt(D.pow(2).sum(-1) + 1e-10)
    # D = torch.sqrt(torch.clamp(D.pow(2).sum(-1), min=1e-10))
    if scale == 'linear':
        return D
    elif scale == 'log':
        S = 2 * torch.log(torch.clamp(D, 1e-10, float("inf")))
        return S
    else:
        pass

# STFT code is adapted from: https://github.com/pseeth/pytorch-stft 
Example #19
Source File: reconstruct.py    From MelNet with MIT License 5 votes vote down vote up
def __init__(self, hp):
        self.hp = hp
        self.window = torch.hann_window(window_length=hp.audio.win_length).cuda()
        self.mel_basis = librosa.filters.mel(
            sr=hp.audio.sr,
            n_fft=hp.audio.n_fft,
            n_mels=hp.audio.n_mels
        )
        self.mel_basis = torch.from_numpy(self.mel_basis).cuda() # [n_mels, n_fft//2+1]
        self.criterion = torch.nn.MSELoss() 
Example #20
Source File: torchscript_consistency_impl.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_spectrogram(self):
        def func(tensor):
            n_fft = 400
            ws = 400
            hop = 200
            pad = 0
            window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
            power = 2.
            normalize = False
            return F.spectrogram(tensor, pad, window, n_fft, hop, ws, power, normalize)

        tensor = common_utils.get_whitenoise()
        self._assert_consistency(func, tensor) 
Example #21
Source File: functional_cpu_test.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_linearity_of_istft2(self):
        # hann_window, centered, not normalized, not onesided
        kwargs2 = {
            'n_fft': 12,
            'window': torch.hann_window(12),
            'center': True,
            'pad_mode': 'reflect',
            'normalized': False,
            'onesided': False,
        }
        data_size = (2, 12, 7, 2)
        self._test_linearity_of_istft(data_size, kwargs2) 
Example #22
Source File: functional_cpu_test.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_linearity_of_istft1(self):
        # hann_window, centered, normalized, onesided
        kwargs1 = {
            'n_fft': 12,
            'window': torch.hann_window(12),
            'center': True,
            'pad_mode': 'reflect',
            'normalized': True,
            'onesided': True,
        }
        data_size = (2, 7, 7, 2)
        self._test_linearity_of_istft(data_size, kwargs1) 
Example #23
Source File: functional_cpu_test.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_istft_is_inverse_of_stft2(self):
        # hann_window, centered, not normalized, not onesided
        kwargs2 = {
            'n_fft': 12,
            'hop_length': 2,
            'win_length': 8,
            'window': torch.hann_window(8),
            'center': True,
            'pad_mode': 'reflect',
            'normalized': False,
            'onesided': False,
        }
        _test_istft_is_inverse_of_stft(kwargs2) 
Example #24
Source File: functional_cpu_test.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_istft_is_inverse_of_stft1(self):
        # hann_window, centered, normalized, onesided
        kwargs1 = {
            'n_fft': 12,
            'hop_length': 4,
            'win_length': 12,
            'window': torch.hann_window(12),
            'center': True,
            'pad_mode': 'reflect',
            'normalized': True,
            'onesided': True,
        }
        _test_istft_is_inverse_of_stft(kwargs1) 
Example #25
Source File: test_batch_consistency.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_griffinlim(self):
        n_fft = 400
        ws = 400
        hop = 200
        window = torch.hann_window(ws)
        power = 2
        normalize = False
        momentum = 0.99
        n_iter = 32
        length = 1000
        tensor = torch.rand((1, 201, 6))
        self.assert_batch_consistencies(
            F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, 0, atol=5e-5
        ) 
Example #26
Source File: transforms.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def __init__(self,
                 sample_rate: int = 16000,
                 n_fft: int = 400,
                 win_length: Optional[int] = None,
                 hop_length: Optional[int] = None,
                 f_min: float = 0.,
                 f_max: Optional[float] = None,
                 pad: int = 0,
                 n_mels: int = 128,
                 window_fn: Callable[..., Tensor] = torch.hann_window,
                 power: Optional[float] = 2.,
                 normalized: bool = False,
                 wkwargs: Optional[dict] = None) -> None:
        super(MelSpectrogram, self).__init__()
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length if win_length is not None else n_fft
        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
        self.pad = pad
        self.power = power
        self.normalized = normalized
        self.n_mels = n_mels  # number of mel frequency bins
        self.f_max = f_max
        self.f_min = f_min
        self.spectrogram = Spectrogram(n_fft=self.n_fft, win_length=self.win_length,
                                       hop_length=self.hop_length,
                                       pad=self.pad, window_fn=window_fn, power=self.power,
                                       normalized=self.normalized, wkwargs=wkwargs)
        self.mel_scale = MelScale(self.n_mels, self.sample_rate, self.f_min, self.f_max, self.n_fft // 2 + 1) 
Example #27
Source File: transforms.py    From audio with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def __init__(self,
                 n_fft: int = 400,
                 n_iter: int = 32,
                 win_length: Optional[int] = None,
                 hop_length: Optional[int] = None,
                 window_fn: Callable[..., Tensor] = torch.hann_window,
                 power: float = 2.,
                 normalized: bool = False,
                 wkwargs: Optional[dict] = None,
                 momentum: float = 0.99,
                 length: Optional[int] = None,
                 rand_init: bool = True) -> None:
        super(GriffinLim, self).__init__()

        assert momentum < 1, 'momentum=%s > 1 can be unstable' % momentum
        assert momentum > 0, 'momentum=%s < 0' % momentum

        self.n_fft = n_fft
        self.n_iter = n_iter
        self.win_length = win_length if win_length is not None else n_fft
        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
        window = window_fn(self.win_length) if wkwargs is None else window_fn(self.win_length, **wkwargs)
        self.register_buffer('window', window)
        self.normalized = normalized
        self.length = length
        self.power = power
        self.momentum = momentum / (1 + momentum)
        self.rand_init = rand_init 
Example #28
Source File: features.py    From inference with Apache License 2.0 4 votes vote down vote up
def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
                 window="hamming", normalize="per_feature", n_fft=None,
                 preemph=0.97,
                 nfilt=64, lowfreq=0, highfreq=None, log=True, dither=constant,
                 pad_to=8,
                 max_duration=16.7,
                 frame_splicing=1):
        super(FilterbankFeatures, self).__init__()
#        print("PADDING: {}".format(pad_to))

        torch_windows = {
            'hann': torch.hann_window,
            'hamming': torch.hamming_window,
            'blackman': torch.blackman_window,
            'bartlett': torch.bartlett_window,
            'none': None,
        }

        self.win_length = int(sample_rate * window_size)  # frame size
        self.hop_length = int(sample_rate * window_stride)
        self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))

        self.normalize = normalize
        self.log = log
        self.dither = dither
        self.frame_splicing = frame_splicing
        self.nfilt = nfilt
        self.preemph = preemph
        self.pad_to = pad_to
        # For now, always enable this.
        # See https://docs.google.com/presentation/d/1IVC3J-pHB-ipJpKsJox_SqmDHYdkIaoCXTbKmJmV2-I/edit?usp=sharing for elaboration
        self.use_deterministic_dithering = True
        highfreq = highfreq or sample_rate / 2
        window_fn = torch_windows.get(window, None)
        window_tensor = window_fn(self.win_length,
                                  periodic=False) if window_fn else None
        filterbanks = torch.tensor(
            librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq,
                                fmax=highfreq), dtype=torch.float).unsqueeze(0)
        # self.fb = filterbanks
        # self.window = window_tensor
        self.register_buffer("fb", filterbanks)
        self.register_buffer("window", window_tensor)
        # Calculate maximum sequence length (# frames)
        max_length = 1 + math.ceil(
            (max_duration * sample_rate - self.win_length) / self.hop_length
        )
        max_pad = 16 - (max_length % 16)
        self.max_length = max_length + max_pad 
Example #29
Source File: test_librosa_compatibility.py    From audio with BSD 2-Clause "Simplified" License 4 votes vote down vote up
def test_InverseMelScale(self):
        """InverseMelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        n_stft = n_fft // 2 + 1
        hop_length = n_fft // 4

        # Prepare mel spectrogram input. We use torchaudio to compute one.
        common_utils.set_audio_backend('default')
        sound, sample_rate = _load_audio_asset(
            'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
        sound = sound.mean(dim=0, keepdim=True)
        spec_orig = F.spectrogram(
            sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
            hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
        melspec_ta = torchaudio.transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_orig)
        melspec_lr = melspec_ta.cpu().numpy().squeeze()
        # Perform InverseMelScale with torch audio and librosa
        spec_ta = torchaudio.transforms.InverseMelScale(
            n_stft, n_mels=n_mels, sample_rate=sample_rate)(melspec_ta)
        spec_lr = librosa.feature.inverse.mel_to_stft(
            melspec_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True, norm=None)
        spec_lr = torch.from_numpy(spec_lr[None, ...])

        # Align dimensions
        # librosa does not return power spectrogram while torchaudio returns power spectrogram
        spec_orig = spec_orig.sqrt()
        spec_ta = spec_ta.sqrt()

        threshold = 2.0
        # This threshold was choosen empirically, based on the following observation
        #
        # torch.dist(spec_lr, spec_ta, p=float('inf'))
        # >>> tensor(1.9666)
        #
        # The spectrograms reconstructed by librosa and torchaudio are not comparable elementwise.
        # This is because they use different approximation algorithms and resulting values can live
        # in different magnitude. (although most of them are very close)
        # See
        # https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
        # https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
        # distance over frequencies.
        self.assertEqual(spec_ta, spec_lr, atol=threshold, rtol=1e-5)

        threshold = 1700.0
        # This threshold was choosen empirically, based on the following observations
        #
        # torch.dist(spec_orig, spec_ta, p=1)
        # >>> tensor(1644.3516)
        # torch.dist(spec_orig, spec_lr, p=1)
        # >>> tensor(1420.7103)
        # torch.dist(spec_lr, spec_ta, p=1)
        # >>> tensor(943.2759)
        assert torch.dist(spec_orig, spec_ta, p=1) < threshold 
Example #30
Source File: features.py    From training with Apache License 2.0 4 votes vote down vote up
def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01,
                       window="hamming", normalize="per_feature", n_fft=None,
                       preemph=0.97,
                       nfilt=64, lowfreq=0, highfreq=None, log=True, dither=constant,
                       pad_to=8,
                       max_duration=16.7,
                       frame_splicing=1):
        super(FilterbankFeatures, self).__init__()
#        print("PADDING: {}".format(pad_to))

        torch_windows = {
            'hann': torch.hann_window,
            'hamming': torch.hamming_window,
            'blackman': torch.blackman_window,
            'bartlett': torch.bartlett_window,
            'none': None,
        }

        self.win_length = int(sample_rate * window_size) # frame size
        self.hop_length = int(sample_rate * window_stride)
        self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))

        self.normalize = normalize
        self.log = log
        self.dither = dither
        self.frame_splicing = frame_splicing
        self.nfilt = nfilt
        self.preemph = preemph
        self.pad_to = pad_to
        highfreq = highfreq or sample_rate / 2
        window_fn = torch_windows.get(window, None)
        window_tensor = window_fn(self.win_length,
                                  periodic=False) if window_fn else None
        filterbanks = torch.tensor(
            librosa.filters.mel(sample_rate, self.n_fft, n_mels=nfilt, fmin=lowfreq,
                                                    fmax=highfreq), dtype=torch.float).unsqueeze(0)
        # self.fb = filterbanks
        # self.window = window_tensor
        self.register_buffer("fb", filterbanks)
        self.register_buffer("window", window_tensor)
        # Calculate maximum sequence length (# frames)
        max_length = 1 + math.ceil(
                (max_duration * sample_rate - self.win_length) / self.hop_length
        )
        max_pad = 16 - (max_length % 16)
        self.max_length = max_length + max_pad