Python librosa.time_to_frames() Examples
The following are 17 code examples of librosa.time_to_frames(), drawn from open-source projects. Each example notes its source file, originating project, and license. You may also want to check out the other available functions and classes of the librosa module.
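Before diving into the project code, a quick orientation: librosa.time_to_frames() converts timestamps in seconds to spectrogram frame indices, essentially floor(times * sr / hop_length). A minimal sketch, with the sample rate and hop length chosen arbitrarily for illustration:

import numpy as np
import librosa

# Map timestamps (seconds) to frame indices: floor(t * sr / hop_length)
times = np.array([0.0, 1.0, 2.5])
frames = librosa.time_to_frames(times, sr=22050, hop_length=512)
print(frames)  # -> [  0  43 107]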
Example #1
Source File: base.py From pumpp with ISC License
def n_frames(self, duration):
    '''Get the number of frames for a given duration

    Parameters
    ----------
    duration : number >= 0
        The duration, in seconds

    Returns
    -------
    n_frames : int >= 0
        The number of frames at this extractor's sampling rate and
        hop length
    '''
    return int(time_to_frames(duration, sr=self.sr,
                              hop_length=self.hop_length))
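A usage sketch with assumed values (not pumpp defaults): for an extractor with sr=22050 and hop_length=512, n_frames(5.0) returns int(time_to_frames(5.0, sr=22050, hop_length=512)) == 215, i.e. floor(5.0 * 22050 / 512); the int() cast turns librosa's integer result into a plain Python int.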
Example #2
Source File: audio.py From amen with BSD 2-Clause "Simplified" License
def _get_beats(self):
    """
    Gets beats using librosa's beat tracker.
    """
    _, beat_frames = librosa.beat.beat_track(
        y=self.analysis_samples, sr=self.analysis_sample_rate, trim=False)

    # pad beat times to full duration
    f_max = librosa.time_to_frames(self.duration, sr=self.analysis_sample_rate)
    beat_frames = librosa.util.fix_frames(beat_frames, x_min=0, x_max=f_max)

    # convert frames to times
    beat_times = librosa.frames_to_time(beat_frames, sr=self.analysis_sample_rate)

    # make the list of (start, duration) tuples that TimingList expects
    starts_durs = [(s, t - s) for (s, t) in zip(beat_times, beat_times[1:])]

    return starts_durs
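Design note: librosa.util.fix_frames pads the detected beat frames with frame 0 and the frame index of the full duration (obtained via time_to_frames), so the pairwise zip over consecutive beat times produces (start, duration) tuples that span the whole track rather than just the region between the first and last detected beats.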
Example #3
Source File: tags.py From pumpp with ISC License
def inverse(self, encoded, duration=None):
    '''Inverse transformation'''

    ann = jams.Annotation(namespace=self.namespace, duration=duration)
    for start, end, value in self.decode_intervals(encoded,
                                                   duration=duration,
                                                   transition=self.transition,
                                                   p_init=self.p_init,
                                                   p_state=self.p_state):
        # Map start:end to frames
        f_start, f_end = time_to_frames([start, end],
                                        sr=self.sr,
                                        hop_length=self.hop_length)

        confidence = np.mean(encoded[f_start:f_end+1, value])

        value_dec = self.encoder.inverse_transform(np.atleast_2d(value))[0]

        for vd in value_dec:
            ann.append(time=start,
                       duration=end - start,
                       value=vd,
                       confidence=confidence)

    return ann
Example #4
Source File: speech_cls_task.py From delta with Apache License 2.0
def feat_output_shape(config):
    ''' without batch_size'''
    if 'feature_shape' in config['task']['audio'] and \
       config['task']['audio']['feature_shape']:
        return config['task']['audio']['feature_shape']

    if config['task']['suffix'] == '.npy':
        input_channels = 3 if config['task']['audio']['add_delta_deltas'] else 1
        nframe = librosa.time_to_frames(
            config['task']['audio']['clip_size'],
            sr=config['task']['audio']['sr'],
            hop_length=config['task']['audio']['winstep'] *
            config['task']['audio']['sr'])
        feature_shape = [
            nframe, config['task']['audio']['feature_size'], input_channels
        ]
    else:
        feature_shape = [
            config['task']['audio']['sr'] * config['task']['audio']['clip_size']
        ]
    config['task']['audio']['feature_shape'] = feature_shape
    return feature_shape
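Note on the hop_length argument: winstep here is presumably a window step in seconds, so winstep * sr converts it to a hop length in samples, which is the unit time_to_frames expects. The same pattern recurs in Examples #6 and #7 below.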
Example #5
Source File: audio.py From Speech_emotion_recognition_BLSTM with MIT License
def split_vocal(self, y):
    S_full, phase = librosa.magphase(librosa.stft(y))

    # To avoid being biased by local continuity, we constrain similar frames
    # to be separated by at least 1.2 seconds.
    S_filter = librosa.decompose.nn_filter(
        S_full, aggregate=np.median, metric='cosine',
        width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))
    S_filter = np.minimum(S_full, S_filter)

    margin_v = 10
    power = 2

    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)
    S_foreground = mask_v * S_full

    foreground = griffinlim(S_foreground)
    return foreground
Example #6
Source File: emotion_solver.py From delta with Apache License 2.0
def process_config(self, config):
    ''' preprocess config '''
    data_conf = config['data']

    class_vocab = data_conf['task']['classes']['vocab']
    assert len(class_vocab) == data_conf['task']['classes']['num']

    # add reverse_vocab, positive_id
    reverse_vocab = {val: key for key, val in class_vocab.items()}
    data_conf['task']['classes']['reverse_vocab'] = reverse_vocab

    # binary class
    pos_id = config['solver']['metrics']['pos_label']
    data_conf['task']['classes']['positive_id'] = pos_id
    data_conf['task']['classes']['positive'] = reverse_vocab[pos_id]

    # add feature shape, without batch_size
    if data_conf['task']['suffix'] == '.npy':
        input_channels = 3 if data_conf['task']['audio']['add_delta_deltas'] else 1
        nframe = librosa.time_to_frames(
            data_conf['task']['audio']['clip_size'],
            sr=data_conf['task']['audio']['sr'],
            hop_length=data_conf['task']['audio']['winstep'] *
            data_conf['task']['audio']['sr'])
        feature_shape = [
            nframe, data_conf['task']['audio']['feature_size'], input_channels
        ]
    else:
        feature_shape = [
            data_conf['task']['audio']['sr'] * data_conf['task']['audio']['clip_size']
        ]
    data_conf['task']['audio']['feature_shape'] = feature_shape

    return config
Example #7
Source File: speaker_solver.py From delta with Apache License 2.0
def process_config(self, config):
    data_conf = config['data']

    feature_shape = data_conf['task']['audio'].get('feature_shape', None)
    if not feature_shape:
        # add feature shape, without batch_size
        if data_conf['task']['suffix'] == '.npy':
            input_channels = 3 if data_conf['task']['audio']['add_delta_deltas'] else 1
            nframe = librosa.time_to_frames(
                data_conf['task']['audio']['clip_size'],
                sr=data_conf['task']['audio']['sr'],
                hop_length=data_conf['task']['audio']['winstep'] *
                data_conf['task']['audio']['sr'])
            feature_shape = [
                nframe, data_conf['task']['audio']['feature_size'], input_channels
            ]
        else:
            feature_shape = [
                data_conf['task']['audio']['sr'] * data_conf['task']['audio']['clip_size']
            ]
        data_conf['task']['audio']['feature_shape'] = feature_shape

    logging.info(f"FEATURE SHAPE: {feature_shape}")
    return config
Example #8
Source File: audio.py From Speech_emotion_recognition_BLSTM with MIT License
def split_vocal_to_wav(self, filename, fp_foreground, fp_background=None):
    print(filename.split('/')[-1])

    y, sr = librosa.load(filename, sr=self._sr)
    S_full, phase = librosa.magphase(librosa.stft(y))

    # To avoid being biased by local continuity, we constrain similar frames
    # to be separated by at least 1.2 seconds.
    S_filter = librosa.decompose.nn_filter(
        S_full, aggregate=np.median, metric='cosine',
        width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))
    S_filter = np.minimum(S_full, S_filter)

    margin_i, margin_v = 2, 10
    power = 2

    mask_i = librosa.util.softmask(S_filter,
                                   margin_i * (S_full - S_filter),
                                   power=power)
    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)

    S_foreground = mask_v * S_full
    S_background = mask_i * S_full

    foreground = griffinlim(S_foreground)
    fp_foreground += filename.split('/')[-1]
    sf.write(fp_foreground, foreground, sr, 'PCM_16')

    if fp_background is not None:
        background = griffinlim(S_background)
        fp_background += filename.split('/')[-1]
        sf.write(fp_background, background, sr, 'PCM_16')
Example #9
Source File: 02-train.py From crema with BSD 2-Clause "Simplified" License
def val_sampler(max_duration, pump, seed):
    '''validation sampler'''
    n_frames = librosa.time_to_frames(max_duration,
                                      sr=pump['cqt'].sr,
                                      hop_length=pump['cqt'].hop_length)

    return pumpp.sampler.VariableLengthSampler(None, 32, n_frames,
                                               *pump.ops,
                                               random_state=seed)
Example #10
Source File: 02-train.py From crema with BSD 2-Clause "Simplified" License
def make_sampler(max_samples, duration, pump, seed):
    '''stochastic training sampler'''
    n_frames = librosa.time_to_frames(duration,
                                      sr=pump['cqt'].sr,
                                      hop_length=pump['cqt'].hop_length)

    return pump.sampler(max_samples, n_frames, random_state=seed)
Example #11
Source File: 02-train.py From crema with BSD 2-Clause "Simplified" License
def make_sampler(max_samples, duration, pump, seed):
    n_frames = librosa.time_to_frames(duration,
                                      sr=pump['mel'].sr,
                                      hop_length=pump['mel'].hop_length)[0]

    return pump.sampler(max_samples, n_frames, random_state=seed)
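Unlike the two samplers above, this variant indexes the result with [0], which suggests duration arrives here as a one-element sequence: time_to_frames vectorizes over array-like input and returns an array of frame indices.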
Example #12
Source File: base.py From msaf with MIT License
def read_ann_beats(self):
    """Reads the annotated beats if available.

    Returns
    -------
    times : np.array
        Times of annotated beats in seconds.
    frames : np.array
        Frame indices of annotated beats.
    """
    times, frames = (None, None)

    # Read annotations if they exist in correct folder
    if os.path.isfile(self.file_struct.ref_file):
        try:
            jam = jams.load(self.file_struct.ref_file)
        except TypeError:
            logging.warning("Can't read JAMS file %s. Maybe it's not "
                            "compatible with current JAMS version?" %
                            self.file_struct.ref_file)
            return times, frames
        beat_annot = jam.search(namespace="beat.*")

        # If beat annotations exist, get times and frames
        if len(beat_annot) > 0:
            beats_inters, _ = beat_annot[0].to_interval_values()
            times = beats_inters[:, 0]
            frames = librosa.time_to_frames(times, sr=self.sr,
                                            hop_length=self.hop_length)
    return times, frames
Example #13
Source File: key.py From pumpp with ISC License
def inverse(self, encoded, duration=None):
    '''Inverse transformation'''

    ann = jams.Annotation(self.namespace, duration=duration)
    for start, end, value in self.decode_intervals(encoded,
                                                   duration=duration,
                                                   multi=False,
                                                   sparse=self.sparse,
                                                   transition=self.transition,
                                                   p_init=self.p_init,
                                                   p_state=self.p_state):
        # Map start:end to frames
        f_start, f_end = time_to_frames([start, end],
                                        sr=self.sr,
                                        hop_length=self.hop_length)

        # Reverse the index
        if self.sparse:
            # Compute the confidence
            if encoded.shape[1] == 1:
                # This case is for full-confidence prediction (just the index)
                confidence = 1.
            else:
                confidence = np.mean(encoded[f_start:f_end+1, value])

            value_dec = self.encoder.inverse_transform(value)
        else:
            confidence = np.mean(encoded[f_start:f_end+1, np.argmax(value)])
            value_dec = self.encoder.inverse_transform(np.atleast_2d(value))

        for vd in value_dec:
            ann.append(time=start,
                       duration=end - start,
                       value=vd,
                       confidence=float(confidence))

    return ann
Example #14
Source File: beat.py From pumpp with ISC License
def inverse(self, encoded, downbeat=None, duration=None):
    '''Inverse transformation for beats and optional downbeats'''

    ann = jams.Annotation(namespace=self.namespace, duration=duration)

    beat_times = np.asarray([t for t, _ in self.decode_events(encoded,
                                                              transition=self.beat_transition,
                                                              p_init=self.beat_p_init,
                                                              p_state=self.beat_p_state) if _])
    beat_frames = time_to_frames(beat_times,
                                 sr=self.sr,
                                 hop_length=self.hop_length)

    if downbeat is not None:
        downbeat_times = set([t for t, _ in self.decode_events(downbeat,
                                                               transition=self.down_transition,
                                                               p_init=self.down_p_init,
                                                               p_state=self.down_p_state)
                              if _])
        pickup_beats = len([t for t in beat_times
                            if t < min(downbeat_times)])
    else:
        downbeat_times = set()
        pickup_beats = 0

    value = - pickup_beats - 1
    for beat_t, beat_f in zip(beat_times, beat_frames):
        if beat_t in downbeat_times:
            value = 1
        else:
            value += 1

        confidence = encoded[beat_f]
        ann.append(time=beat_t,
                   duration=0,
                   value=value,
                   confidence=confidence)

    return ann
Example #15
Source File: base.py From pumpp with ISC License
def encode_events(self, duration, events, values, dtype=np.bool):
    '''Encode labeled events as a time-series matrix.

    Parameters
    ----------
    duration : number
        The duration of the track

    events : ndarray, shape=(n,)
        Time index of the events

    values : ndarray, shape=(n, m)
        Values array. Must have the same first index as `events`.

    dtype : numpy data type

    Returns
    -------
    target : ndarray, shape=(n_frames, n_values)
    '''
    frames = time_to_frames(events, sr=self.sr,
                            hop_length=self.hop_length)

    n_total = int(time_to_frames(duration, sr=self.sr,
                                 hop_length=self.hop_length))

    n_alloc = n_total
    if np.any(frames):
        n_alloc = max(n_total, 1 + int(frames.max()))

    target = np.empty((n_alloc, values.shape[1]), dtype=dtype)
    target.fill(fill_value(dtype))
    values = values.astype(dtype)
    for column, event in zip(values, frames):
        target[event] += column

    return target[:n_total]
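Design note: the target matrix is allocated with n_alloc = max(n_total, 1 + frames.max()) rows so that an event whose frame index lands at or past the nominal track length still has a row to accumulate into, and the matrix is truncated back to n_total frames on return. Also note that dtype=np.bool reflects older NumPy; the np.bool alias was removed in NumPy 1.24, where plain bool (or np.bool_) is the equivalent.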
Example #16
Source File: chord.py From pumpp with ISC License
def inverse(self, encoded, duration=None):
    '''Inverse transformation'''

    ann = jams.Annotation(self.namespace, duration=duration)
    for start, end, value in self.decode_intervals(encoded,
                                                   duration=duration,
                                                   multi=False,
                                                   sparse=self.sparse,
                                                   transition=self.transition,
                                                   p_init=self.p_init,
                                                   p_state=self.p_state):
        # Map start:end to frames
        f_start, f_end = time_to_frames([start, end],
                                        sr=self.sr,
                                        hop_length=self.hop_length)

        # Reverse the index
        if self.sparse:
            # Compute the confidence
            if encoded.shape[1] == 1:
                # This case is for full-confidence prediction (just the index)
                confidence = 1.
            else:
                confidence = np.mean(encoded[f_start:f_end+1, value])

            value_dec = self.encoder.inverse_transform(value)
        else:
            confidence = np.mean(encoded[f_start:f_end+1, np.argmax(value)])
            value_dec = self.encoder.inverse_transform(np.atleast_2d(value))

        for vd in value_dec:
            ann.append(time=start,
                       duration=end - start,
                       value=vd,
                       confidence=float(confidence))

    return ann
Example #17
Source File: base.py From pumpp with ISC License
def transform(self, jam, query=None):
    '''Transform jam object to make data for this task

    Parameters
    ----------
    jam : jams.JAMS
        The jams container object

    query : string, dict, or callable [optional]
        An optional query to narrow the elements of `jam.annotations`
        to be considered.

        If not provided, all annotations are considered.

    Returns
    -------
    data : dict
        A dictionary of transformed annotations.

        All annotations which can be converted to the target namespace
        will be converted.
    '''
    anns = []
    if query:
        results = jam.search(**query)
    else:
        results = jam.annotations

    # Find annotations that can be coerced to our target namespace
    for ann in results:
        try:
            anns.append(jams.nsconvert.convert(ann, self.namespace))
        except jams.NamespaceError:
            pass

    duration = jam.file_metadata.duration

    # If none, make a fake one
    if not anns:
        anns = [self.empty(duration)]

    # Apply transformations
    results = []
    for ann in anns:
        results.append(self.transform_annotation(ann, duration))
        # If the annotation range is None, it spans the entire track
        if ann.time is None or ann.duration is None:
            valid = [0, duration]
        else:
            valid = [ann.time, ann.time + ann.duration]

        results[-1]['_valid'] = time_to_frames(valid, sr=self.sr,
                                               hop_length=self.hop_length)

    # Prefix and collect
    return self.merge(results)