Python Examples of tables.Int32Atom

Source File: svhn.py From batchup with MIT License

6 votes

def fetch_svhn_extra(source_paths, target_path):
    """Convert the SVHN 'extra' Matlab file into a compressed HDF5 file.

    Parameters
    ----------
    source_paths : sequence
        Source file paths; only the first entry is used, as the path of
        the Matlab file to convert.
    target_path : str
        Path of the HDF5 file to create. It is opened in write mode.

    Returns
    -------
    The `target_path` argument, unchanged.
    """
    extra_path = source_paths[0]

    print('Converting {} to HDF5 (compressed)...'.format(extra_path))
    f_out = tables.open_file(target_path, mode='w')
    # try/finally guarantees the HDF5 handle is released even when array
    # creation or the Matlab import raises.
    try:
        g_out = f_out.create_group(f_out.root, 'svhn', 'SVHN data')
        filters = tables.Filters(complevel=9, complib='blosc')
        # Extendable arrays: the leading 0 marks the axis that grows as
        # rows are appended.
        X_u8_arr = f_out.create_earray(
            g_out, 'extra_X_u8', tables.UInt8Atom(), (0, 3, 32, 32),
            filters=filters)
        y_arr = f_out.create_earray(
            g_out, 'extra_y', tables.Int32Atom(), (0,), filters=filters)

        # Load in the extra data Matlab file
        # (presumably appends the images and labels to the two arrays;
        # the helper is defined elsewhere -- verify there).
        _insert_svhn_matlab_to_h5(X_u8_arr, y_arr, extra_path)
    finally:
        f_out.close()

    return target_path

Source File: dense_design_matrix.py From TextDetector with GNU General Public License v3.0

5 votes

def init_hdf5(self, path, shapes,
                  title="Pytables Dataset",
                  y_dtype='float'):
        """
        Initializes the hdf5 file into which the data will be stored. This must
        be called before calling fill_hdf5.

        Parameters
        ----------
        path : string
            The name of the hdf5 file.
        shapes : tuple
            The shapes of X and y.
        title : string, optional
            Name of the dataset. e.g. For SVHN, set this to "SVHN Dataset".
            "Pytables Dataset" is used as title, by default.
        y_dtype : string, optional
            Either 'float' or 'int'. Decides the type of pytables atom
            used to store the y data. By default 'float' type is used.

        Returns
        -------
        h5file : hdf5 file handle
            The newly created file, left open in write mode.
        gcolumns : pytables group
            The "Data" group holding the 'X' and 'y' arrays.
        """
        assert y_dtype in ['float', 'int'], (
            "y_dtype can be 'float' or 'int' only"
        )

        x_shape, y_shape = shapes
        # make pytables
        ensure_tables()
        h5file = tables.openFile(path, mode="w", title=title)
        try:
            gcolumns = h5file.createGroup(h5file.root, "Data", "Data")
            # The float width follows the configured floatX setting.
            atom = (tables.Float32Atom() if config.floatX == 'float32'
                    else tables.Float64Atom())
            h5file.createCArray(gcolumns, 'X', atom=atom, shape=x_shape,
                                title="Data values", filters=self.filters)
            if y_dtype != 'float':
                # For 1D ndarray of int labels, override the atom to integer
                atom = (tables.Int32Atom() if config.floatX == 'float32'
                        else tables.Int64Atom())
            h5file.createCArray(gcolumns, 'y', atom=atom, shape=y_shape,
                                title="Data targets", filters=self.filters)
        except BaseException:
            # The handle never reaches the caller on failure, so close it
            # here rather than leaking an open file in write mode.
            h5file.close()
            raise
        return h5file, gcolumns

Source File: dense_design_matrix.py From TextDetector with GNU General Public License v3.0

4 votes

def resize(self, h5file, start, stop):
        """
        Resizes the X and y tables. This must be called before calling
        fill_hdf5.

        The rows [start, stop) of the existing "Data" group are copied into
        a temporary "Data_" group, which then replaces "Data".

        Parameters
        ----------
        h5file : hdf5 file handle
            Handle to an hdf5 object.
        start : int
            The start index to write data. None means 0.
        stop : int
            The index of the record following the last record to be written.
            None means the current number of rows of X.

        Returns
        -------
        h5file : hdf5 file handle
            The same handle that was passed in.
        gcolumns : pytables group
            The group holding the resized 'X' and 'y' arrays.
        """
        ensure_tables()
        # TODO is there any smarter and more efficient way to this?

        data = h5file.getNode('/', "Data")
        try:
            gcolumns = h5file.createGroup('/', "Data_", "Data")
        except tables.exceptions.NodeError:
            # A stale temporary group exists; drop it recursively and retry.
            h5file.removeNode('/', "Data_", 1)
            gcolumns = h5file.createGroup('/', "Data_", "Data")

        start = 0 if start is None else start
        # The default must come from the existing data: the freshly created
        # "Data_" group has no X array yet.
        stop = data.X.nrows if stop is None else stop
        n_rows = stop - start

        atom = (tables.Float32Atom() if config.floatX == 'float32'
                else tables.Float64Atom())
        # Keep every trailing dimension so that arrays of any rank
        # (including 1D label vectors) can be resized.
        x = h5file.createCArray(gcolumns,
                                'X',
                                atom=atom,
                                shape=(n_rows,) + tuple(data.X.shape[1:]),
                                title="Data values",
                                filters=self.filters)
        # Test the stored dtype against the abstract integer type so that
        # every integer width is recognised.
        if np.issubdtype(data.y.dtype, np.integer):
            # For 1D ndarray of int labels, override the atom to integer
            atom = (tables.Int32Atom() if config.floatX == 'float32'
                    else tables.Int64Atom())
        y = h5file.createCArray(gcolumns,
                                'y',
                                atom=atom,
                                shape=(n_rows,) + tuple(data.y.shape[1:]),
                                title="Data targets",
                                filters=self.filters)
        x[:] = data.X[start:stop]
        y[:] = data.y[start:stop]

        # Replace the old group: remove "Data", then rename "Data_" to "Data".
        h5file.removeNode('/', "Data", 1)
        h5file.renameNode('/', "Data", "Data_")
        h5file.flush()
        return h5file, gcolumns

Source File: voice.py From voice-corpus-tool with Mozilla Public License 2.0

4 votes

def _hdf5(self, alphabet_path, hdf5_path, ninput=26, ncontext=9):
        """
        Write the features and transcripts of self.samples to an HDF5 file.

        Parameters
        ----------
        alphabet_path : string
            UTF-8 text file with one symbol per line. Lines starting with
            '#' are skipped as comments; a line starting with a backslash
            followed by '#' stands for the literal '#' symbol. Each symbol
            is mapped to its zero-based position among the kept lines.
        hdf5_path : string
            Path of the HDF5 file to write.
        ninput : int, optional
            Passed to mfcc as numcep; also the width of the padding rows.
        ncontext : int, optional
            Number of all-zero rows added before and after each sample's
            features.

        Returns
        -------
        None. Nothing is written when no sample could be processed.
        """
        skipped = []
        str_to_label = {}
        alphabet_size = 0
        with codecs.open(alphabet_path, 'r', 'utf-8') as fin:
            for line in fin:
                if line.startswith('\\#'):
                    line = '#\n'
                elif line.startswith('#'):
                    continue
                # Strip only a trailing newline, so a final line without
                # one keeps its last character.
                symbol = line[:-1] if line.endswith('\n') else line
                str_to_label[symbol] = alphabet_size
                alphabet_size += 1

        def process_sample(sample):
            # Returns (features, len(features), transcript, len(transcript))
            # or None when the sample has to be skipped.
            if len(sample.transcript) == 0:
                skipped.append(sample.original_name)
                return None
            sample.write()
            try:
                samplerate, audio = wav.read(sample.file.filename)
                transcript = np.asarray([str_to_label[c] for c in sample.transcript])
            except Exception:
                # Best effort: unreadable audio or a character missing from
                # the alphabet skips the sample. KeyboardInterrupt and
                # SystemExit are deliberately not swallowed.
                skipped.append(sample.original_name)
                return None
            # Keep every second row of the mfcc output.
            features = mfcc(audio, samplerate=samplerate, numcep=ninput)[::2]
            empty_context = np.zeros((ncontext, ninput), dtype=features.dtype)
            features = np.concatenate((empty_context, features, empty_context))
            if (2*ncontext + len(features)) < len(transcript):
                skipped.append(sample.original_name)
                return None
            return features, len(features), transcript, len(transcript)

        out_data = self._map('Computing MFCC features...', self.samples, process_sample)
        out_data = [s for s in out_data if s is not None]
        if skipped:
            log('WARNING - Skipped %d samples that had no transcription, had been too short for their transcription or had been missed:' % len(skipped))
            for s in skipped:
                log(' - Sample origin: "%s".' % s)
        if not out_data:
            log('No samples written to feature DB "%s".' % hdf5_path)
            return
        # list of tuples -> tuple of lists
        features, features_len, transcript, transcript_len = zip(*out_data)

        log('Writing feature DB...')
        with tables.open_file(hdf5_path, 'w') as h5:
            features_dset = h5.create_vlarray(h5.root, 'features', tables.Float32Atom(), filters=tables.Filters(complevel=1))
            # VLArray atoms need to be 1D, so flatten feature array
            for f in features:
                features_dset.append(np.reshape(f, -1))
            h5.create_array(h5.root, 'features_len', features_len)

            transcript_dset = h5.create_vlarray(h5.root, 'transcript', tables.Int32Atom(), filters=tables.Filters(complevel=1))
            for t in transcript:
                transcript_dset.append(t)

            h5.create_array(h5.root, 'transcript_len', transcript_len)
        log('Wrote features of %d samples to feature DB "%s".' % (len(features), hdf5_path))

Python tables.Int32Atom() Examples