Python librosa.get_duration() Examples
The following are 30 code examples of librosa.get_duration().
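All of the snippets below revolve around the same small API, so here is a minimal sketch of the three common ways to call librosa.get_duration(): from an in-memory buffer, from a precomputed spectrogram, and straight from a file header. The file path is hypothetical, and keyword names vary slightly across librosa versions (newer releases rename filename= to path=).

import librosa
import numpy as np

# 1) From an audio buffer: duration = len(y) / sr
y, sr = librosa.load("example.wav", sr=None)   # hypothetical file
dur_buf = librosa.get_duration(y=y, sr=sr)

# 2) From a precomputed spectrogram: the frame count is mapped back to
#    seconds, so n_fft and hop_length must match the STFT parameters
S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
dur_spec = librosa.get_duration(S=S, sr=sr, n_fft=2048, hop_length=512)

# 3) From the file itself, without keeping the decoded signal around
dur_file = librosa.get_duration(filename="example.wav")  # path= in librosa >= 0.10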
Example #1
Source File: Audio.py From mugen with MIT License
def __init__(self, file: str, *, sample_rate: int = 44100):
    """
    Parameters
    ----------
    file
        Audio file to load
    """
    self.file = file
    self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
    self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate)
Example #2
Source File: data_tools.py From Speech-enhancement with MIT License
def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate,
                         frame_length, hop_length_frame, min_duration):
    """This function takes the audio files of a directory and merges them
    into a numpy matrix of size (nb_frame, frame_length) for a sliding
    window of size hop_length_frame"""

    list_sound_array = []

    for file in list_audio_files:
        # open the audio file
        y, sr = librosa.load(os.path.join(audio_dir, file), sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sr)

        if total_duration >= min_duration:
            list_sound_array.append(audio_to_audio_frame_stack(
                y, frame_length, hop_length_frame))
        else:
            print(
                f"The following file {os.path.join(audio_dir, file)} is below the min duration")

    return np.vstack(list_sound_array)
Example #3
Source File: test_core.py From muda with ISC License
def test_save(jam_in, audio_file, strict, fmt):

    jam = muda.load_jam_audio(jam_in, audio_file)

    _, jamfile = tempfile.mkstemp(suffix='.jams')
    _, audfile = tempfile.mkstemp(suffix='.wav')

    muda.save(audfile, jamfile, jam, strict=strict, fmt=fmt)

    jam2 = muda.load_jam_audio(jamfile, audfile, fmt=fmt)
    jam2_raw = jams.load(jamfile, fmt=fmt)

    os.unlink(audfile)
    os.unlink(jamfile)

    assert hasattr(jam2.sandbox, 'muda')
    assert '_audio' in jam2.sandbox.muda
    assert '_audio' not in jam2_raw.sandbox.muda

    duration = librosa.get_duration(**jam2.sandbox.muda['_audio'])
    assert jam2.file_metadata.duration == duration
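The muda tests here and below all read the track length with get_duration(**jam.sandbox.muda['_audio']). muda stashes the decoded audio on the JAMS sandbox as a dict whose keys ('y' and 'sr') line up with get_duration()'s keyword arguments, so the ** unpacking is just get_duration(y=..., sr=...). A standalone sketch of the idiom, with a synthetic buffer standing in for muda's sandbox:

import librosa
import numpy as np

# Stand-in for jam.sandbox.muda['_audio']: one second of silence at 22.05 kHz
audio = {'y': np.zeros(22050, dtype=np.float32), 'sr': 22050}

# Equivalent to librosa.get_duration(y=audio['y'], sr=audio['sr'])
duration = librosa.get_duration(**audio)
print(duration)  # -> 1.0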
Example #4
Source File: test_deformers.py From muda with ISC License
def test_ir_convolution(ir_files, jam_fixture, n_fft, rolloff_value):
    D = muda.deformers.IRConvolution(ir_files=ir_files, n_fft=n_fft,
                                     rolloff_value=rolloff_value)
    jam_orig = deepcopy(jam_fixture)
    orig_duration = librosa.get_duration(**jam_orig.sandbox.muda['_audio'])

    for jam_new in D.transform(jam_orig):
        # Verify that the original jam reference hasn't changed
        assert jam_new is not jam_orig

        # Testing with shifted impulse
        __test_shifted_impulse(jam_orig, jam_new, ir_files, orig_duration,
                               n_fft=n_fft, rolloff_value=rolloff_value)

        # Verify that the state and history objects are intact
        __test_deformer_history(D, jam_new.sandbox.muda.history[-1])

    # Serialization test
    D2 = muda.deserialize(muda.serialize(D))
    __test_params(D, D2)
Example #5
Source File: rhythm.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the tempogram

    Parameters
    ----------
    y : np.ndarray
        Audio buffer

    Returns
    -------
    data : dict
        data['tempogram'] : np.ndarray, shape=(n_frames, win_length)
            The tempogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    tgram = tempogram(y=y, sr=self.sr,
                      hop_length=self.hop_length,
                      win_length=self.win_length)

    tgram = to_dtype(fix_length(tgram, n_frames), self.dtype)
    return {'tempogram': tgram.T[self.idx]}
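The pumpp extractors in this and several later examples share one pattern: get_duration() fixes a target frame count up front, and fix_length() pads or trims every feature matrix to it so all features stay time-aligned. A rough sketch of that duration-to-frames step, using public librosa helpers in place of pumpp's internal n_frames() (whose exact rounding may differ):

import librosa
import numpy as np

sr, hop_length = 22050, 512
y = np.zeros(5 * sr, dtype=np.float32)  # five seconds of silence as a stand-in signal

duration = librosa.get_duration(y=y, sr=sr)
# Convert seconds to a frame count for the given hop length
n_frames = librosa.time_to_frames(duration, sr=sr, hop_length=hop_length)

S = np.abs(librosa.stft(y, hop_length=hop_length))
# size= is keyword-only in recent librosa; older versions also accept it positionally
S = librosa.util.fix_length(S, size=n_frames)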
Example #6
Source File: test_deformers.py From muda with ISC License
def test_colorednoise(n_samples, color, weight_min, weight_max, jam_test_silence):
    D = muda.deformers.ColoredNoise(n_samples=n_samples, color=color,
                                    weight_min=weight_min,
                                    weight_max=weight_max, rng=0)
    jam_orig = deepcopy(jam_test_silence)
    orig_duration = librosa.get_duration(**jam_orig.sandbox.muda['_audio'])

    n_out = 0
    for jam_new in D.transform(jam_orig):
        assert jam_new is not jam_test_silence
        __test_effect(jam_orig, jam_test_silence)

        assert not np.allclose(jam_orig.sandbox.muda['_audio']['y'],
                               jam_new.sandbox.muda['_audio']['y'])

        # Verify that duration hasn't changed
        assert librosa.get_duration(**jam_new.sandbox.muda['_audio']) == orig_duration

        # Verify that the state and history objects are intact
        __test_deformer_history(D, jam_new.sandbox.muda.history[-1])
        __test_effect(jam_orig, jam_new)

        # Verify the colored noise has the desired slope for its log-log
        # scale power spectrum
        color = jam_new.sandbox.muda.history[-1]['state']['color']
        __test_color_slope(jam_orig, jam_new, color)
        n_out += 1

    assert n_out == n_samples

    # Serialization test
    D2 = muda.deserialize(muda.serialize(D))
    __test_params(D, D2)
Example #7
Source File: labeled_example.py From speechless with MIT License
def duration_in_s(self) -> float:
    try:
        return librosa.get_duration(filename=str(self.audio_file))
    except Exception as e:
        log("Failed to get duration of {}: {}".format(self.audio_file, e))
        return 0
Example #8
Source File: speech_cls_task.py From delta with Apache License 2.0
def get_duration(self, filename, sr):  # pylint: disable=invalid-name
    ''' time in seconds '''
    if filename.endswith('.npy'):
        nframe = np.load(filename).shape[0]
        return librosa.frames_to_time(
            nframe, hop_length=self._winstep * sr, sr=sr)
    if filename.endswith('.wav'):
        return librosa.get_duration(filename=filename)
    raise ValueError("filename suffix not .npy or .wav: {}".format(
        os.path.splitext(filename)[-1]))
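The .npy branch above recovers a duration from a saved feature matrix rather than from audio: frames_to_time() multiplies the frame count by the hop length (here, a window step in seconds scaled by the sample rate to get samples) and divides by the sample rate. A self-contained sketch with illustrative numbers (winstep is a hypothetical stand-in for self._winstep):

import librosa

sr = 16000
winstep = 0.010   # hypothetical 10 ms frame step
n_frames = 500    # e.g., rows of a saved feature matrix

# frames_to_time(n, sr, hop_length) == n * hop_length / sr
duration = librosa.frames_to_time(n_frames, sr=sr, hop_length=int(winstep * sr))
print(duration)   # -> 5.0 seconds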
Example #9
Source File: get_hi-mia_data.py From NeMo with Apache License 2.0
def process_single_line(line: str):
    line = line.strip()
    y, sr = librosa.load(line, sr=None)
    if sr != 16000:
        y, sr = librosa.load(line, sr=16000)
        librosa.output.write_wav(line, y, sr)
    dur = librosa.get_duration(y=y, sr=sr)
    if 'test' in line.split("/"):
        speaker = line.split('/')[-1].split('.')[0].split('_')[0]
    else:
        speaker = line.split('/')[-2]
    speaker = list(speaker)
    speaker = ''.join(speaker)
    meta = {"audio_filepath": line, "duration": float(dur), "label": speaker}
    return meta
Example #10
Source File: scp_to_manifest.py From NeMo with Apache License 2.0
def main(scp, id, out, split=False):
    if os.path.exists(out):
        os.remove(out)
    scp_file = open(scp, 'r').readlines()

    lines = []
    speakers = []
    with open(out, 'w') as outfile:
        for line in tqdm(scp_file):
            line = line.strip()
            y, sr = l.load(line, sr=None)
            dur = l.get_duration(y=y, sr=sr)
            speaker = line.split('/')[id]
            speaker = list(speaker)
            speaker = ''.join(speaker)
            speakers.append(speaker)
            meta = {"audio_filepath": line, "duration": float(dur), "label": speaker}
            lines.append(meta)
            json.dump(meta, outfile)
            outfile.write("\n")

    path = os.path.dirname(out)
    if split:
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
        for train_idx, test_idx in sss.split(speakers, speakers):
            logging.info(len(train_idx))

        out = os.path.join(path, 'train.json')
        write_file(out, lines, train_idx)
        out = os.path.join(path, 'dev.json')
        write_file(out, lines, test_idx)
Example #11
Source File: get_databaker_data.py From NeMo with Apache License 2.0
def __convert_waves(wavedir, converted_wavedir, wavename, sr):
    """
    Converts a wav file to target sample rate.
    """
    wavepath = os.path.join(wavedir, wavename)
    converted_wavepath = os.path.join(converted_wavedir, wavename)
    y, sr = librosa.load(wavepath, sr=sr)
    duration = librosa.get_duration(y=y, sr=sr)
    librosa.output.write_wav(converted_wavepath, y, sr)
    return wavename, round(duration, 2)
Example #12
Source File: test_deformers.py From muda with ISC License
def __test_time(jam_orig, jam_new, rate):
    # Test the track length
    ap_(librosa.get_duration(**jam_orig.sandbox.muda['_audio']),
        rate * librosa.get_duration(**jam_new.sandbox.muda['_audio']))

    # Test the metadata
    ap_(jam_orig.file_metadata.duration, rate * jam_new.file_metadata.duration)

    # Test each annotation
    for ann_orig, ann_new in zip(jam_orig.annotations, jam_new.annotations):
        # JAMS 0.2.1 support
        if hasattr(ann_orig, 'time'):
            ap_(ann_orig.time, rate * ann_new.time)
            ap_(ann_orig.duration, rate * ann_new.duration)

        assert len(ann_orig.data) == len(ann_new.data)
        for obs1, obs2 in zip(ann_orig, ann_new):
            ap_(obs1.time, rate * obs2.time)
            ap_(obs1.duration, rate * obs2.duration)

            if ann_orig.namespace == 'tempo':
                ap_(rate * obs1.value, obs2.value)
Example #13
Source File: test_deformers.py From muda with ISC License
def test_background(noise, n_samples, weight_min, weight_max, jam_fixture):
    D = muda.deformers.BackgroundNoise(files=noise,
                                       n_samples=n_samples,
                                       weight_min=weight_min,
                                       weight_max=weight_max)
    jam_orig = deepcopy(jam_fixture)
    orig_duration = librosa.get_duration(**jam_orig.sandbox.muda['_audio'])

    n_out = 0
    for jam_new in D.transform(jam_orig):
        assert jam_new is not jam_fixture
        __test_effect(jam_orig, jam_fixture)

        assert not np.allclose(jam_orig.sandbox.muda['_audio']['y'],
                               jam_new.sandbox.muda['_audio']['y'])

        d_state = jam_new.sandbox.muda.history[-1]['state']
        filename = d_state['filename']
        start = d_state['start']
        stop = d_state['stop']

        with psf.SoundFile(str(filename), mode='r') as soundf:
            max_index = len(soundf)
            noise_sr = soundf.samplerate

        assert 0 <= start < stop
        assert start < stop <= max_index
        assert ((stop - start) / float(noise_sr)) == orig_duration

        __test_effect(jam_orig, jam_new)
        n_out += 1

    assert n_out == n_samples

    # Serialization test
    D2 = muda.deserialize(muda.serialize(D))
    __test_params(D, D2)
Example #14
Source File: base.py From crema with BSD 2-Clause "Simplified" License
def predict(self, filename=None, y=None, sr=None, outputs=None):
    '''Predict annotations

    Parameters
    ----------
    filename : str (optional)
        Path to audio file

    y, sr : (optional)
        Audio buffer and sample rate

    outputs : (optional)
        Pre-computed model outputs as produced by `CremaModel.outputs`.
        If provided, then predictions are derived from these instead of
        `filename` or `(y, sr)`.

    .. note:: At least one of `filename`, `y, sr` must be provided.

    Returns
    -------
    jams.Annotation
        The predicted annotation
    '''
    # Pump the input features
    output_key = self.model.output_names[0]

    if outputs is None:
        outputs = self.outputs(filename=filename, y=y, sr=sr)

    # Invert the prediction.  This is always the first output layer.
    ann = self.pump[output_key].inverse(outputs[output_key])

    # Populate the metadata
    ann.annotation_metadata.version = self.version
    ann.annotation_metadata.annotation_tools = 'CREMA {}'.format(version)
    ann.annotation_metadata.data_source = 'program'
    ann.duration = librosa.get_duration(y=y, sr=sr, filename=filename)

    return ann
Example #15
Source File: test_deformers.py From muda with ISC License
def __test_duration(jam_orig, jam_shifted, orig_duration):
    # Verify that the total duration hasn't changed
    assert librosa.get_duration(**jam_shifted.sandbox.muda['_audio']) == orig_duration

    shifted_data = jam_shifted.search(namespace='chord')[0].data

    # Verify that the duration of the last delayed annotation is in the valid range:
    # expected duration of last annotation = duration - onset of last annotation
    ref_duration = orig_duration - shifted_data[-1][0]  # [-1][0] is the 'time' of the last observation

    # Deformed duration:
    deformed_duration = shifted_data[-1][1]  # [-1][1] is the 'duration' of the last observation

    isclose_(ref_duration, deformed_duration, rtol=1e-5, atol=1e-1)
Example #16
Source File: pre_processing.py From audio-source-separation with MIT License
def process(file_path, direc, destination_path, phase_bool, destination_phase_path):
    t1, t2 = librosa.load(file_path, sr=None)
    # (y, sr) passed positionally; newer librosa versions expect
    # get_duration(y=..., sr=...) as keyword arguments
    duration = librosa.get_duration(t1, t2)

    regex = re.compile(r'\d+')
    index = regex.findall(direc)
    # print(index)

    num_segments = 0
    # mean = np.zeros((513, 52))
    # var = np.zeros((513, 52))

    for start in range(30, int(200)):
        wave_array, fs = librosa.load(file_path, sr=44100,
                                      offset=start * 0.3, duration=0.3)
        # note: center='True' is a (truthy) string, so it behaves like center=True
        mag, phase = librosa.magphase(librosa.stft(wave_array, n_fft=1024,
                                                   hop_length=256,
                                                   window='hann', center='True'))
        # mean += mag
        # num_segments += 1

        if not os.path.exists(destination_path):
            os.makedirs(destination_path)

        # print(mag.shape)
        # print(torch.from_numpy(np.expand_dims(mag, axis=0)).shape)
        # magnitude stored as tensor, phase as np array
        # pickle.dump(torch.from_numpy(np.expand_dims(mag, axis=2)),
        #             open(os.path.join(destination_path, (index[0] + "_" + str(start) + '_m.pt')), 'wb'))
        torch.save(torch.from_numpy(np.expand_dims(mag, axis=0)),
                   os.path.join(destination_path, (index[0] + "_" + str(start) + '_m.pt')))

        if phase_bool:
            if not os.path.exists(destination_phase_path):
                os.makedirs(destination_phase_path)
            np.save(os.path.join(destination_phase_path,
                                 (index[0] + "_" + str(start) + '_p.npy')), phase)
    return

# --------- training data -------------------------------------
Example #17
Source File: audio.py From Multilingual_Text_to_Speech with MIT License
def duration(data):
    """Return duration of an audio signal in seconds."""
    # note: newer librosa versions expect the buffer as a keyword argument,
    # i.e. get_duration(y=data, sr=hp.sample_rate)
    return librosa.get_duration(data, sr=hp.sample_rate)
Example #18
Source File: utils.py From vadnet with GNU Lesser General Public License v3.0
def audio_dur(path, ext='', root=''):
    path = os.path.join(root, '{}{}'.format(path, ext))
    try:
        return lr.get_duration(filename=path)
    except Exception as ex:
        print_err('could not read {}\n{}'.format(path, ex))
        return 0
Example #19
Source File: eda_vlsp.py From automatic_speech_recognition with GNU General Public License v3.0
def stat_acoustic():
    print("\nAcoustic Data:")
    wav_folder = join(ROOT_FOLDER, "data", "vlsp", "wav")
    files = listdir(wav_folder)
    files = [join(wav_folder, file) for file in files]
    durations = [librosa.get_duration(filename=file) for file in files]
    durations = pd.Series(durations)
    print(f"Total: {durations.sum():.2f} seconds ({durations.sum() / 3600:.2f} hours)")
    print(durations.describe())
Example #20
Source File: mel.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the Mel spectrogram

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, n_mels)
            The Mel spectrogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    mel = np.sqrt(melspectrogram(y=y, sr=self.sr,
                                 n_fft=self.n_fft,
                                 hop_length=self.hop_length,
                                 n_mels=self.n_mels,
                                 fmax=self.fmax))

    mel = fix_length(mel, n_frames)

    if self.log:
        mel = amplitude_to_db(mel, ref=np.max)

    # Type convert
    mel = to_dtype(mel, self.dtype)

    return {'mag': mel.T[self.idx]}
Example #21
Source File: cqt.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the CQT

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins)
            The CQT magnitude

        data['phase']: np.ndarray, shape = mag.shape
            The CQT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
            fmin=self.fmin,
            n_bins=(self.n_octaves * self.over_sample * 12),
            bins_per_octave=(self.over_sample * 12))

    C = fix_length(C, n_frames)

    cqtm, phase = magphase(C)
    if self.log:
        cqtm = amplitude_to_db(cqtm, ref=np.max)

    dphase = phase_diff(np.angle(phase).T[self.idx], self.conv)

    return {'mag': to_dtype(cqtm.T[self.idx], self.dtype),
            'dphase': to_dtype(dphase, self.dtype)}
Example #22
Source File: cqt.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the CQT

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins)
            The CQT magnitude

        data['phase']: np.ndarray, shape = mag.shape
            The CQT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
            fmin=self.fmin,
            n_bins=(self.n_octaves * self.over_sample * 12),
            bins_per_octave=(self.over_sample * 12))

    C = fix_length(C, n_frames)

    cqtm, phase = magphase(C)
    if self.log:
        cqtm = amplitude_to_db(cqtm, ref=np.max)

    return {'mag': to_dtype(cqtm.T[self.idx], self.dtype),
            'phase': to_dtype(np.angle(phase).T[self.idx], self.dtype)}
Example #23
Source File: fft.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the STFT magnitude and phase differential.

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT magnitude

        data['dphase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    D = stft(y, hop_length=self.hop_length, n_fft=self.n_fft)

    D = fix_length(D, n_frames)

    mag, phase = magphase(D)
    if self.log:
        mag = amplitude_to_db(mag, ref=np.max)

    phase = phase_diff(np.angle(phase.T)[self.idx], self.conv)

    return {'mag': to_dtype(mag.T[self.idx], self.dtype),
            'dphase': to_dtype(phase, self.dtype)}
Example #24
Source File: fft.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the STFT magnitude and phase.

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT magnitude

        data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    D = stft(y, hop_length=self.hop_length, n_fft=self.n_fft)

    D = fix_length(D, n_frames)

    mag, phase = magphase(D)
    if self.log:
        mag = amplitude_to_db(mag, ref=np.max)

    return {'mag': to_dtype(mag.T[self.idx], self.dtype),
            'phase': to_dtype(np.angle(phase.T)[self.idx], self.dtype)}
Example #25
Source File: time.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the time position encoding

    Parameters
    ----------
    y : np.ndarray
        Audio buffer

    Returns
    -------
    data : dict
        data['relative'] = np.ndarray, shape=(n_frames, 2)
        data['absolute'] = np.ndarray, shape=(n_frames, 2)

        Relative and absolute time positional encodings.
    '''
    duration = get_duration(y=y, sr=self.sr)
    n_frames = self.n_frames(duration)

    relative = np.zeros((n_frames, 2), dtype=np.float32)
    relative[:, 0] = np.cos(np.pi * np.linspace(0, 1, num=n_frames))
    relative[:, 1] = np.sin(np.pi * np.linspace(0, 1, num=n_frames))

    absolute = relative * np.sqrt(duration)

    return {'relative': to_dtype(relative[self.idx], self.dtype),
            'absolute': to_dtype(absolute[self.idx], self.dtype)}
Example #26
Source File: Input.py From vimss with GNU General Public License v3.0
def randomPositionInAudio(audio_path, duration):
    length = librosa.get_duration(filename=audio_path)
    if duration >= length:
        return 0.0, None
    else:
        offset = np.random.uniform() * (length - duration)
        return offset, duration
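The (offset, duration) pair produced above is shaped to feed librosa.load(), which can decode just an excerpt of a file; a duration of None means "read to the end", which is the right behavior when the requested crop is longer than the track. A brief usage sketch (the file path is hypothetical):

import librosa

offset, dur = randomPositionInAudio("example.wav", duration=3.0)
# dur is None for the too-short-file case, so load() reads the whole track
y, sr = librosa.load("example.wav", sr=None, offset=offset, duration=dur)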
Example #27
Source File: analyze.py From crema with BSD 2-Clause "Simplified" License
def analyze(filename=None, y=None, sr=None):
    '''Analyze a recording for all tasks.

    Parameters
    ----------
    filename : str, optional
        Path to audio file

    y : np.ndarray, optional
    sr : number > 0, optional
        Audio buffer and sampling rate

    .. note:: At least one of `filename` or `y, sr` must be provided.

    Returns
    -------
    jam : jams.JAMS
        a JAMS object containing all estimated annotations

    Examples
    --------
    >>> from crema.analyze import analyze
    >>> import librosa
    >>> jam = analyze(filename=librosa.util.example_audio_file())
    >>> jam
    <JAMS(file_metadata=<FileMetadata(...)>,
          annotations=[1 annotation],
          sandbox=<Sandbox(...)>)>
    >>> # Get the chord estimates
    >>> chords = jam.annotations['chord', 0]
    >>> chords.to_dataframe().head(5)
           time  duration  value  confidence
    0  0.000000  0.092880  E:maj    0.336977
    1  0.092880  0.464399    E:7    0.324255
    2  0.557279  1.021678  E:min    0.448759
    3  1.578957  2.693515  E:maj    0.501462
    4  4.272472  1.486077  E:min    0.287264
    '''
    _load_models()
    jam = jams.JAMS()

    # populate file metadata
    jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr,
                                                      filename=filename)

    for model in __MODELS__:
        jam.annotations.append(model.predict(filename=filename, y=y, sr=sr))

    return jam
Example #28
Source File: speech_cls_task.py From delta with Apache License 2.0
def get_class_files_duration(self):
    ''' dirnames under the dataset are class names;
        all data_path entries have the same dirnames '''
    classes = None
    for root, dirnames, filenames in os.walk(self._data_path[0]):
        classes = dirnames
        break
    assert classes, 'can not access {}'.format(self._data_path[0])
    assert set(classes) == set(self._classes.keys()), '{} {}'.format(
        classes, self._classes.keys())

    def _get_class(path):
        ret = None
        for cls in self._classes:
            if cls in path:
                ret = cls
        return ret

    # to exclude some data under some dir
    excludes = []

    #pylint: disable=too-many-nested-blocks
    for data_path in self._data_path:
        logging.debug("data path: {}".format(data_path))
        for root, dirname, filenames in os.walk(data_path):
            del dirname
            for filename in filenames:
                if filename.endswith(self._file_suffix):
                    class_name = _get_class(root)  # 'conflict' or 'normal' str
                    assert class_name is not None

                    filename = os.path.join(root, filename)
                    if excludes:
                        for exclude in excludes:
                            if exclude in filename:
                                pass

                    duration = self.get_duration(
                        filename=filename, sr=self._sample_rate)
                    self._class_file[class_name].append(
                        (filename, duration, class_name))
                else:
                    pass

    if not self._class_file:
        logging.debug("class file: {}".format(self._class_file))
        logging.warn("maybe no files with suffix {} exist".format(
            self._file_suffix))
Example #29
Source File: convert.py From ZeroSpeech-TTS-without-T with MIT License
def encode_for_tacotron(target, trainer, seg_len, multi2idx_path, wav_path, result_path):
    wavs = sorted(glob.glob(os.path.join(wav_path, '*.wav')))
    print('[Converter] - Number of wav files to encode: ', len(wavs))

    names = []
    enc_outputs = []
    for wav_path in tqdm(wavs):
        name = wav_path.split('/')[-1].split('.')[0]
        s_id = name.split('_')[0]
        u_id = name.split('_')[1]
        if s_id != target:
            continue
        y, sr = librosa.load(wav_path)
        d = librosa.get_duration(y=y, sr=sr)
        if d > 25:
            # --> this filters out too-long utts, 3523/3533 for V001 and V002
            #     together in the english dataset
            continue
        _, spec = get_spectrograms(wav_path)
        encodings = encode(spec, trainer, seg_len, save=False)
        encodings = parse_encodings(encodings)
        enc_outputs.append(encodings)
        names.append((s_id, u_id))

    # build encodings-to-character mapping
    idx = 0
    multi2idx = {}
    print('[Converter] - Building encoding to symbol mapping...')
    for encodings in tqdm(enc_outputs):
        for encoding in encodings:
            if str(encoding) not in multi2idx:
                multi2idx[str(encoding)] = symbols[idx]
                idx += 1

    print('[Converter] - Number of unique discrete units: ', len(multi2idx))
    with open(multi2idx_path, 'w') as file:
        file.write(json.dumps(multi2idx))

    result_path = result_path.replace('target', target)
    print('[Converter] - Writing to meta file...')
    with open(result_path, 'w') as file:
        for i, encodings in enumerate(enc_outputs):
            file.write(str(names[i][0]) + '_' + str(names[i][1]) + '|')
            for encoding in encodings:
                file.write(multi2idx[str(encoding)])
            file.write('\n')
Example #30
Source File: cqt.py From pumpp with ISC License
def transform_audio(self, y):
    '''Compute the HCQT

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins, n_harmonics)
            The CQT magnitude

        data['phase']: np.ndarray, shape = mag.shape
            The CQT phase
    '''
    cqtm, phase = [], []

    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    for h in self.harmonics:
        C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
                fmin=self.fmin * h,
                n_bins=(self.n_octaves * self.over_sample * 12),
                bins_per_octave=(self.over_sample * 12))

        C = fix_length(C, n_frames)

        C, P = magphase(C)
        if self.log:
            C = amplitude_to_db(C, ref=np.max)

        cqtm.append(C)
        phase.append(P)

    cqtm = to_dtype(np.asarray(cqtm), self.dtype)
    phase = np.angle(np.asarray(phase))

    dphase = to_dtype(phase_diff(self._index(phase), self.conv),
                      self.dtype)

    return {'mag': self._index(cqtm),
            'dphase': dphase}