Python librosa.time_to_frames() Examples

The following are 17 code examples of librosa.time_to_frames(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the librosa module, or try the search function.
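
As a quick orientation before the examples: librosa.time_to_frames() converts a time (or array of times) in seconds into the corresponding STFT frame index, essentially frame = floor(time * sr / hop_length). A minimal sketch, where the sr and hop_length values are illustrative defaults rather than values taken from any example below:

import librosa

# 1.5 seconds at sr=22050 and hop_length=512:
# floor(1.5 * 22050 / 512) = floor(64.6) = 64
frame = librosa.time_to_frames(1.5, sr=22050, hop_length=512)
print(frame)  # 64
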
Example #1
Source File: base.py    From pumpp with ISC License 6 votes
def n_frames(self, duration):
        '''Get the number of frames for a given duration

        Parameters
        ----------
        duration : number >= 0
            The duration, in seconds

        Returns
        -------
        n_frames : int >= 0
            The number of frames at this extractor's sampling rate and
            hop length
        '''

        return int(time_to_frames(duration, sr=self.sr,
                                  hop_length=self.hop_length)) 
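
This helper simply wraps the conversion in an int() cast so callers get a plain integer frame count. Outside of pumpp, the same thing can be written directly against librosa; a minimal sketch, with sr and hop_length as assumed values rather than pumpp's defaults:

import librosa

def n_frames(duration, sr=22050, hop_length=512):
    '''Number of frames spanned by `duration` seconds.'''
    return int(librosa.time_to_frames(duration, sr=sr, hop_length=hop_length))

print(n_frames(30.0))  # floor(30 * 22050 / 512) = 1291
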
Example #2
Source File: audio.py    From amen with BSD 2-Clause "Simplified" License 6 votes
def _get_beats(self):
        """
        Gets beats using librosa's beat tracker.
        """
        _, beat_frames = librosa.beat.beat_track(
            y=self.analysis_samples, sr=self.analysis_sample_rate, trim=False
        )

        # pad beat times to full duration
        f_max = librosa.time_to_frames(self.duration, sr=self.analysis_sample_rate)
        beat_frames = librosa.util.fix_frames(beat_frames, x_min=0, x_max=f_max)

        # convert frames to times
        beat_times = librosa.frames_to_time(beat_frames, sr=self.analysis_sample_rate)

        # make the list of (start, duration) tuples that TimingList expects
        starts_durs = [(s, t - s) for (s, t) in zip(beat_times, beat_times[1:])]

        return starts_durs 
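
Here time_to_frames computes the final frame of the track so that librosa.util.fix_frames can pin the detected beats to the full [0, duration] range before they are converted back to times. A stripped-down sketch of that padding pattern, with made-up beat frames:

import numpy as np
import librosa

sr, duration = 22050, 10.0
beat_frames = np.array([38, 81, 124])            # hypothetical tracker output

f_max = librosa.time_to_frames(duration, sr=sr)  # default hop_length=512
beat_frames = librosa.util.fix_frames(beat_frames, x_min=0, x_max=f_max)
# frame 0 and frame f_max are now prepended/appended if missing
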
Example #3
Source File: tags.py    From pumpp with ISC License 6 votes
def inverse(self, encoded, duration=None):
        '''Inverse transformation'''

        ann = jams.Annotation(namespace=self.namespace, duration=duration)
        for start, end, value in self.decode_intervals(encoded,
                                                       duration=duration,
                                                       transition=self.transition,
                                                       p_init=self.p_init,
                                                       p_state=self.p_state):
            # Map start:end to frames
            f_start, f_end = time_to_frames([start, end],
                                            sr=self.sr,
                                            hop_length=self.hop_length)

            confidence = np.mean(encoded[f_start:f_end+1, value])

            value_dec = self.encoder.inverse_transform(np.atleast_2d(value))[0]

            for vd in value_dec:
                ann.append(time=start,
                           duration=end-start,
                           value=vd,
                           confidence=confidence)

        return ann 
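
Passing a [start, end] pair, as this example does, turns an interval in seconds into a frame slice, which is then used to average the per-frame scores into a single confidence. A minimal sketch of that slicing idea, with a hypothetical encoded matrix:

import numpy as np
import librosa

sr, hop_length = 22050, 512
encoded = np.random.rand(1000, 5)  # hypothetical (n_frames, n_classes) scores

f_start, f_end = librosa.time_to_frames([3.0, 4.5], sr=sr, hop_length=hop_length)
confidence = np.mean(encoded[f_start:f_end + 1, 2])  # mean score for class 2
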
Example #4
Source File: speech_cls_task.py    From delta with Apache License 2.0 6 votes
def feat_output_shape(config):
    '''Feature output shape, without batch_size.'''
    if 'feature_shape' in config['task']['audio'] and config['task']['audio'][
        'feature_shape']:
      return config['task']['audio']['feature_shape']

    if config['task']['suffix'] == '.npy':
      input_channels = 3 if config['task']['audio']['add_delta_deltas'] else 1
      nframe = librosa.time_to_frames(
          config['task']['audio']['clip_size'],
          sr=config['task']['audio']['sr'],
          hop_length=config['task']['audio']['winstep'] *
          config['task']['audio']['sr'])
      feature_shape = [
          nframe, config['task']['audio']['feature_size'], input_channels
      ]
    else:
      feature_shape = [
          config['task']['audio']['sr'] * config['task']['audio']['clip_size']
      ]
    config['task']['audio']['feature_shape'] = feature_shape
    return feature_shape 
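
Note the hop_length argument: winstep is a window step in seconds, so multiplying by sr converts it to samples, which is the unit time_to_frames expects. A minimal sketch of the same computation with hypothetical config values (cast to int for safety, where the original passes a float):

import librosa

clip_size = 3.0  # seconds (hypothetical)
sr = 16000       # Hz (hypothetical)
winstep = 0.010  # seconds per hop (hypothetical)

nframe = librosa.time_to_frames(clip_size, sr=sr, hop_length=int(winstep * sr))
print(nframe)  # floor(3.0 * 16000 / 160) = 300

Examples #6 and #7 below reuse this same pattern when filling in the feature shape.
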
Example #5
Source File: audio.py    From Speech_emotion_recognition_BLSTM with MIT License 6 votes
def split_vocal(self, y):
        S_full, phase = librosa.magphase(librosa.stft(y))

        # To avoid being biased by local continuity, we constrain similar frames to be
        # separated by at least 1.2 seconds.
        S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric='cosine',
                                               width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))

        S_filter = np.minimum(S_full, S_filter)

        margin_v = 10
        power = 2

        mask_v = librosa.util.softmask(S_full - S_filter,
                                       margin_v * S_filter,
                                       power=power)

        S_foreground = mask_v * S_full

        foreground = griffinlim(S_foreground)

        return foreground 
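
In this example time_to_frames expresses the 1.2-second continuity constraint as a frame count for nn_filter's width parameter; the call falls back to librosa's default hop_length of 512, which matches the default-parameter STFT above. A minimal sketch of just that conversion:

import librosa

sr = 22050
min_separation = 1.2  # seconds between similar frames (as in self._constrained)

width = int(librosa.time_to_frames(min_separation, sr=sr))  # default hop_length=512
print(width)  # floor(1.2 * 22050 / 512) = 51
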
Example #6
Source File: emotion_solver.py    From delta with Apache License 2.0 5 votes
def process_config(self, config):
    ''' preprocess config '''
    data_conf = config['data']
    class_vocab = data_conf['task']['classes']['vocab']
    assert len(class_vocab) == data_conf['task']['classes']['num']

    # add reverse_vocab, positive_id
    reverse_vocab = {val: key for key, val in class_vocab.items()}
    data_conf['task']['classes']['reverse_vocab'] = reverse_vocab

    # binary class
    pos_id = config['solver']['metrics']['pos_label']
    data_conf['task']['classes']['positive_id'] = pos_id
    data_conf['task']['classes']['positive'] = reverse_vocab[pos_id]

    # add feature shape, without batch_size
    if data_conf['task']['suffix'] == '.npy':
      input_channels = 3 if data_conf['task']['audio']['add_delta_deltas'] else 1
      nframe = librosa.time_to_frames(
          data_conf['task']['audio']['clip_size'],
          sr=data_conf['task']['audio']['sr'],
          hop_length=data_conf['task']['audio']['winstep'] *
          data_conf['task']['audio']['sr'])
      feature_shape = [
          nframe, data_conf['task']['audio']['feature_size'], input_channels
      ]
    else:
      feature_shape = [
          data_conf['task']['audio']['sr'] *
          data_conf['task']['audio']['clip_size']
      ]
    data_conf['task']['audio']['feature_shape'] = feature_shape
    return config 
Example #7
Source File: speaker_solver.py    From delta with Apache License 2.0 5 votes
def process_config(self, config):
    data_conf = config['data']

    feature_shape = data_conf['task']['audio'].get('feature_shape', None)

    if not feature_shape:
      # add feature shape, without batch_size
      if data_conf['task']['suffix'] == '.npy':
        input_channels = 3 if data_conf['task']['audio'][
            'add_delta_deltas'] else 1
        nframe = librosa.time_to_frames(
            data_conf['task']['audio']['clip_size'],
            sr=data_conf['task']['audio']['sr'],
            hop_length=data_conf['task']['audio']['winstep'] *
            data_conf['task']['audio']['sr'])
        feature_shape = [
            nframe, data_conf['task']['audio']['feature_size'], input_channels
        ]
      else:
        feature_shape = [
            data_conf['task']['audio']['sr'] *
            data_conf['task']['audio']['clip_size']
        ]
      data_conf['task']['audio']['feature_shape'] = feature_shape
    logging.info(f"FEATURE SHAPE: {feature_shape}")
    return config 
Example #8
Source File: audio.py    From Speech_emotion_recognition_BLSTM with MIT License 5 votes
def split_vocal_to_wav(self, filename, fp_foreground, fp_background=None):
        print(filename.split('/')[-1])

        y, sr = librosa.load(filename, sr=self._sr)

        S_full, phase = librosa.magphase(librosa.stft(y))

        # To avoid being biased by local continuity, we constrain similar frames to be
        # separated by at least 1.2 seconds.
        S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric='cosine',
                                               width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))

        S_filter = np.minimum(S_full, S_filter)

        margin_i, margin_v = 2, 10
        power = 2

        mask_i = librosa.util.softmask(S_filter,
                                       margin_i * (S_full - S_filter),
                                       power=power)

        mask_v = librosa.util.softmask(S_full - S_filter,
                                       margin_v * S_filter,
                                       power=power)

        S_foreground = mask_v * S_full
        S_background = mask_i * S_full

        foreground = griffinlim(S_foreground)
        fp_foreground += filename.split('/')[-1]
        sf.write(fp_foreground, foreground, sr, 'PCM_16')

        if fp_background is not None:
            background = griffinlim(S_background)
            fp_background += filename.split('/')[-1]
            sf.write(fp_background, background, sr, 'PCM_16') 
Example #9
Source File: 02-train.py    From crema with BSD 2-Clause "Simplified" License 5 votes
def val_sampler(max_duration, pump, seed):
    '''validation sampler'''
    n_frames = librosa.time_to_frames(max_duration,
                                      sr=pump['cqt'].sr,
                                      hop_length=pump['cqt'].hop_length)

    return pumpp.sampler.VariableLengthSampler(None, 32, n_frames,
                                               *pump.ops,
                                               random_state=seed) 
Example #10
Source File: 02-train.py    From crema with BSD 2-Clause "Simplified" License 5 votes
def make_sampler(max_samples, duration, pump, seed):
    '''stochastic training sampler'''
    n_frames = librosa.time_to_frames(duration,
                                      sr=pump['cqt'].sr,
                                      hop_length=pump['cqt'].hop_length)

    return pump.sampler(max_samples, n_frames, random_state=seed) 
Example #11
Source File: 02-train.py    From crema with BSD 2-Clause "Simplified" License 5 votes
def make_sampler(max_samples, duration, pump, seed):

    n_frames = librosa.time_to_frames(duration,
                                      sr=pump['mel'].sr,
                                      hop_length=pump['mel'].hop_length)[0]

    return pump.sampler(max_samples, n_frames, random_state=seed) 
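
The trailing [0] in this variant suggests duration arrives as a one-element sequence: given array-like input, time_to_frames returns an array of frame indices, so the first element must be extracted. A minimal sketch of that behavior:

import librosa

frames = librosa.time_to_frames([8.0], sr=22050, hop_length=512)
n_frames = frames[0]  # floor(8 * 22050 / 512) = 344
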
Example #12
Source File: base.py    From msaf with MIT License 5 votes
def read_ann_beats(self):
        """Reads the annotated beats if available.

        Returns
        -------
        times: np.array
            Times of annotated beats in seconds.
        frames: np.array
            Frame indices of annotated beats.
        """
        times, frames = (None, None)

        # Read annotations if they exist in correct folder
        if os.path.isfile(self.file_struct.ref_file):
            try:
                jam = jams.load(self.file_struct.ref_file)
            except TypeError:
                logging.warning(
                    "Can't read JAMS file %s. Maybe it's not "
                    "compatible with current JAMS version?" %
                    self.file_struct.ref_file)
                return times, frames
            beat_annot = jam.search(namespace="beat.*")

            # If beat annotations exist, get times and frames
            if len(beat_annot) > 0:
                beats_inters, _ = beat_annot[0].to_interval_values()
                times = beats_inters[:, 0]
                frames = librosa.time_to_frames(times, sr=self.sr,
                                                hop_length=self.hop_length)
        return times, frames 
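
Converting annotated beat times to frames snaps them onto the hop grid, so a round trip through frames_to_time quantizes the times to multiples of hop_length / sr. A short sketch of that effect, with assumed sr and hop_length:

import numpy as np
import librosa

sr, hop_length = 22050, 512
times = np.array([0.50, 1.01, 1.49])

frames = librosa.time_to_frames(times, sr=sr, hop_length=hop_length)
snapped = librosa.frames_to_time(frames, sr=sr, hop_length=hop_length)
# snapped times lie on multiples of hop_length / sr (about 23.2 ms)
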
Example #13
Source File: key.py    From pumpp with ISC License 5 votes
def inverse(self, encoded, duration=None):
        '''Inverse transformation'''

        ann = jams.Annotation(self.namespace, duration=duration)
            
        for start, end, value in self.decode_intervals(encoded,
                                                       duration=duration,
                                                       multi=False,
                                                       sparse=self.sparse,
                                                       transition=self.transition,
                                                       p_init=self.p_init,
                                                       p_state=self.p_state):

            # Map start:end to frames
            f_start, f_end = time_to_frames([start, end],
                                            sr=self.sr,
                                            hop_length=self.hop_length)

            # Reverse the index
            if self.sparse:
                # Compute the confidence
                if encoded.shape[1] == 1:
                    # This case is for full-confidence prediction (just the index)
                    confidence = 1.
                else:
                    confidence = np.mean(encoded[f_start:f_end+1, value])

                value_dec = self.encoder.inverse_transform(value)
            else:
                confidence = np.mean(encoded[f_start:f_end+1, np.argmax(value)])
                value_dec = self.encoder.inverse_transform(np.atleast_2d(value))

            for vd in value_dec:
                ann.append(time=start,
                           duration=end-start,
                           value=vd,
                           confidence=float(confidence))

        return ann 
Example #14
Source File: beat.py    From pumpp with ISC License 5 votes
def inverse(self, encoded, downbeat=None, duration=None):
        '''Inverse transformation for beats and optional downbeats'''

        ann = jams.Annotation(namespace=self.namespace, duration=duration)

        beat_times = np.asarray([t for t, _ in self.decode_events(encoded,
                                                                  transition=self.beat_transition,
                                                                  p_init=self.beat_p_init,
                                                                  p_state=self.beat_p_state) if _])
        beat_frames = time_to_frames(beat_times,
                                     sr=self.sr,
                                     hop_length=self.hop_length)

        if downbeat is not None:
            downbeat_times = set([t for t, _ in self.decode_events(downbeat,
                                                                   transition=self.down_transition,
                                                                   p_init=self.down_p_init,
                                                                   p_state=self.down_p_state) if _])
            pickup_beats = len([t for t in beat_times
                                if t < min(downbeat_times)])
        else:
            downbeat_times = set()
            pickup_beats = 0

        value = - pickup_beats - 1
        for beat_t, beat_f in zip(beat_times, beat_frames):
            if beat_t in downbeat_times:
                value = 1
            else:
                value += 1
            confidence = encoded[beat_f]
            ann.append(time=beat_t,
                       duration=0,
                       value=value,
                       confidence=confidence)

        return ann 
Example #15
Source File: base.py    From pumpp with ISC License 5 votes
def encode_events(self, duration, events, values, dtype=bool):
        '''Encode labeled events as a time-series matrix.

        Parameters
        ----------
        duration : number
            The duration of the track

        events : ndarray, shape=(n,)
            Time index of the events

        values : ndarray, shape=(n, m)
            Values array.  Must have the same first index as `events`.

        dtype : numpy data type

        Returns
        -------
        target : ndarray, shape=(n_frames, n_values)
        '''

        frames = time_to_frames(events, sr=self.sr,
                                hop_length=self.hop_length)

        n_total = int(time_to_frames(duration, sr=self.sr,
                                     hop_length=self.hop_length))

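        # Rounding can place an event at or beyond frame n_total, so
        # over-allocate to cover the largest event frame and trim below.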
        n_alloc = n_total
        if np.any(frames):
            n_alloc = max(n_total, 1 + int(frames.max()))

        target = np.empty((n_alloc, values.shape[1]),
                          dtype=dtype)

        target.fill(fill_value(dtype))
        values = values.astype(dtype)
        for column, event in zip(values, frames):
            target[event] += column

        return target[:n_total] 
Example #16
Source File: chord.py    From pumpp with ISC License 5 votes
def inverse(self, encoded, duration=None):
        '''Inverse transformation'''

        ann = jams.Annotation(self.namespace, duration=duration)

        for start, end, value in self.decode_intervals(encoded,
                                                       duration=duration,
                                                       multi=False,
                                                       sparse=self.sparse,
                                                       transition=self.transition,
                                                       p_init=self.p_init,
                                                       p_state=self.p_state):

            # Map start:end to frames
            f_start, f_end = time_to_frames([start, end],
                                            sr=self.sr,
                                            hop_length=self.hop_length)

            # Reverse the index
            if self.sparse:
                # Compute the confidence
                if encoded.shape[1] == 1:
                    # This case is for full-confidence prediction (just the index)
                    confidence = 1.
                else:
                    confidence = np.mean(encoded[f_start:f_end+1, value])

                value_dec = self.encoder.inverse_transform(value)
            else:
                confidence = np.mean(encoded[f_start:f_end+1, np.argmax(value)])
                value_dec = self.encoder.inverse_transform(np.atleast_2d(value))

            for vd in value_dec:
                ann.append(time=start,
                           duration=end-start,
                           value=vd,
                           confidence=float(confidence))

        return ann 
Example #17
Source File: base.py    From pumpp with ISC License 4 votes
def transform(self, jam, query=None):
        '''Transform jam object to make data for this task

        Parameters
        ----------
        jam : jams.JAMS
            The jams container object

        query : string, dict, or callable [optional]
            An optional query to narrow the elements of `jam.annotations`
            to be considered.

            If not provided, all annotations are considered.

        Returns
        -------
        data : dict
            A dictionary of transformed annotations.
            All annotations which can be converted to the target namespace
            will be converted.
        '''
        anns = []
        if query:
            results = jam.search(**query)
        else:
            results = jam.annotations

        # Find annotations that can be coerced to our target namespace
        for ann in results:
            try:
                anns.append(jams.nsconvert.convert(ann, self.namespace))
            except jams.NamespaceError:
                pass

        duration = jam.file_metadata.duration

        # If none, make a fake one
        if not anns:
            anns = [self.empty(duration)]

        # Apply transformations
        results = []
        for ann in anns:

            results.append(self.transform_annotation(ann, duration))
            # If the annotation range is None, it spans the entire track
            if ann.time is None or ann.duration is None:
                valid = [0, duration]
            else:
                valid = [ann.time, ann.time + ann.duration]

            results[-1]['_valid'] = time_to_frames(valid, sr=self.sr,
                                                   hop_length=self.hop_length)

        # Prefix and collect
        return self.merge(results)
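
The _valid field here is simply the annotation's [time, time + duration] interval mapped onto the frame grid, marking which frames carry reliable labels. A minimal sketch of the same conversion with assumed parameters:

import librosa

sr, hop_length = 22050, 512
ann_time, ann_duration = 2.0, 5.0  # hypothetical annotation span

valid = librosa.time_to_frames([ann_time, ann_time + ann_duration],
                               sr=sr, hop_length=hop_length)
# valid == array([ 86, 301]): the frame span this annotation covers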