Python tables.Filters() Examples

The following are 30 code examples of tables.Filters(), drawn from open-source projects. You can vote examples up or down, and follow the links above each example to the original project or source file. You may also want to check out all available functions/classes of the module tables, or try the search function.
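As a quick orientation before the project examples: a tables.Filters instance bundles compression settings (complevel, complib, shuffle, ...) and is passed either to tables.open_file(), where it becomes the file-wide default, or to individual node constructors such as create_carray(). A minimal sketch:

import numpy as np
import tables

# zlib at level 5 with the shuffle filter; 'blosc', 'lzo' and 'bzip2'
# are other commonly available complib values.
filters = tables.Filters(complevel=5, complib='zlib', shuffle=True)

with tables.open_file('example.h5', mode='w') as h5f:
    arr = h5f.create_carray(h5f.root, 'x',
                            obj=np.arange(1000, dtype='int32'),
                            filters=filters)
    print(arr.filters)  # shows the compression settings in effect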
Example #1
Source File: svhn.py    From batchup with MIT License 6 votes
def fetch_svhn_extra(source_paths, target_path):
    extra_path = source_paths[0]

    print('Converting {} to HDF5 (compressed)...'.format(extra_path))
    f_out = tables.open_file(target_path, mode='w')
    g_out = f_out.create_group(f_out.root, 'svhn', 'SVHN data')
    filters = tables.Filters(complevel=9, complib='blosc')
    X_u8_arr = f_out.create_earray(
        g_out, 'extra_X_u8', tables.UInt8Atom(), (0, 3, 32, 32),
        filters=filters)
    y_arr = f_out.create_earray(
        g_out, 'extra_y', tables.Int32Atom(), (0,), filters=filters)

    # Load in the extra data Matlab file
    _insert_svhn_matlab_to_h5(X_u8_arr, y_arr, extra_path)

    f_out.close()

    return target_path 
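Once written, the extendable arrays above can be read back lazily; a short sketch (the file path is illustrative, node names follow the code above):

import tables

with tables.open_file('svhn.h5', mode='r') as f:
    X = f.root.svhn.extra_X_u8[:100]  # only the sliced rows are decompressed
    y = f.root.svhn.extra_y[:100]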
Example #2
Source File: hdf5io.py    From deepdish with BSD 3-Clause "New" or "Revised" License 6 votes
def _get_compression_filters(compression='default'):
    if compression == 'default':
        config = conf.config()
        compression = config.get('io', 'compression')
    elif compression is True:
        compression = 'zlib'

    if (compression is False or compression is None or
            compression == 'none' or compression == 'None'):
        ff = None
    else:
        if isinstance(compression, (tuple, list)):
            compression, level = compression
        else:
            level = 9

        try:
            ff = tables.Filters(complevel=level, complib=compression,
                                shuffle=True)
        except Exception:
            warnings.warn(("(deepdish.io.save) Missing compression method {}: "
                           "no compression will be used.").format(compression))
            ff = None
    return ff 
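For illustration, this is how the helper behaves for a few inputs (it is private to deepdish.io, so these calls assume module-internal access; return values follow the code above):

_get_compression_filters('zlib')        # Filters(complevel=9, complib='zlib', shuffle=True)
_get_compression_filters(('blosc', 4))  # a (complib, complevel) pair
_get_compression_filters(None)          # no compression -> returns None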
Example #3
Source File: dense_design_matrix.py    From TextDetector with GNU General Public License v3.0 6 votes
def __init__(self,
                 X=None,
                 topo_view=None,
                 y=None,
                 view_converter=None,
                 axes=('b', 0, 1, 'c'),
                 rng=_default_seed,
                 X_labels=None,
                 y_labels=None):
        super_self = super(DenseDesignMatrixPyTables, self)
        super_self.__init__(X=X,
                            topo_view=topo_view,
                            y=y,
                            view_converter=view_converter,
                            axes=axes,
                            rng=rng,
                            X_labels=X_labels,
                            y_labels=y_labels)
        self._check_labels()
        ensure_tables()
        if not hasattr(self, 'filters'):
            self.filters = tables.Filters(complib='blosc', complevel=5) 
Example #4
Source File: Sparse3DMatrix.py    From emase with GNU General Public License v3.0 6 votes
def save(self, h5file, title=None, index_dtype='uint32', data_dtype=float, incidence_only=True, complib='zlib'):
        if self.finalized:
            h5fh = tables.open_file(h5file, 'w', title=title)
            fil  = tables.Filters(complevel=1, complib=complib)
            h5fh.set_node_attr(h5fh.root, 'incidence_only', incidence_only)
            h5fh.set_node_attr(h5fh.root, 'mtype', 'csc_matrix')
            h5fh.set_node_attr(h5fh.root, 'shape', self.shape)
            for hid in range(self.shape[1]):
                hgroup = h5fh.create_group(h5fh.root, 'h%d' % hid, 'Sparse matrix components for Haplotype %d' % hid)
                spmat = self.data[hid]
                i1 = h5fh.create_carray(hgroup, 'indptr', obj=spmat.indptr.astype(index_dtype), filters=fil)
                i2 = h5fh.create_carray(hgroup, 'indices', obj=spmat.indices.astype(index_dtype), filters=fil)
                if not incidence_only:
                    d = h5fh.create_carray(hgroup, 'data', obj=spmat.data.astype(data_dtype), filters=fil)
            h5fh.flush()
            h5fh.close()
        else:
            raise RuntimeError('The matrix is not finalized.') 
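A matrix saved this way can be rebuilt with scipy; a sketch assuming incidence_only=False (so the 'data' array exists) and the (loci, haplotypes, reads) shape convention used elsewhere in emase:

import tables
from scipy.sparse import csc_matrix

with tables.open_file('matrix.h5', 'r') as h5fh:
    shape = h5fh.get_node_attr(h5fh.root, 'shape')
    g = h5fh.root.h0  # components for haplotype 0
    mat = csc_matrix((g.data[:], g.indices[:], g.indptr[:]),
                     shape=(shape[2], shape[0]))  # (reads, loci)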
Example #5
Source File: bam2h5.py    From WASP with Apache License 2.0 6 votes
def create_carray(h5f, chrom, data_type):
    if data_type == "uint8":
        atom = tables.UInt8Atom(dflt=0)
    elif data_type == "uint16":
        atom = tables.UInt16Atom(dflt=0)
    else:
        raise NotImplementedError("unsupported datatype %s" % data_type)

    zlib_filter = tables.Filters(complevel=1, complib="zlib")

    # create CArray for this chromosome
    shape = [chrom.length]
    carray = h5f.create_carray(h5f.root, chrom.name,
                              atom, shape, filters=zlib_filter)

    return carray 
Example #6
Source File: tables_utils.py    From mmvt with GNU General Public License v3.0 6 votes
def create_hdf5_arr_table(hdf_file, group, array_name,
        dtype=np.dtype('float64'), shape=(), arr=None,
        complib='blosc', complevel=5):
    atom = tables.Atom.from_dtype(dtype)
    if arr is not None:
        shape = arr.shape
    # filters = tables.Filters(complib=complib, complevel=complevel)
    if not is_table_in_group(group, array_name):
        try:
            ds = hdf_file.create_carray(group, array_name, atom, shape)
        except AttributeError:  # fall back to the pre-PyTables-3.0 camelCase API
            ds = hdf_file.createCArray(group, array_name, atom, shape)
    else:
        ds = group._v_children[array_name]

    if arr is not None:
        ds[:] = arr
    return ds 
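Note that the tables.Filters(...) line above is commented out and never passed, so the array is written uncompressed. A hedged usage sketch, assuming is_table_in_group is imported from the same tables_utils module:

import numpy as np
import tables

hdf_file = tables.open_file('out.h5', mode='w')
group = hdf_file.create_group(hdf_file.root, 'results')
ds = create_hdf5_arr_table(hdf_file, group, 'weights',
                           arr=np.random.rand(10, 10))
hdf_file.close()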
Example #7
Source File: test_hdf5.py    From ctapipe with BSD 3-Clause "New" or "Revised" License 6 votes
def test_write_container(temp_h5_file):
    r0tel = R0CameraContainer()
    mc = MCEventContainer()
    mc.reset()
    r0tel.waveform = np.random.uniform(size=(50, 10))
    r0tel.meta["test_attribute"] = 3.14159
    r0tel.meta["date"] = "2020-10-10"

    with HDF5TableWriter(
        temp_h5_file, group_name="R0", filters=tables.Filters(complevel=7)
    ) as writer:
        writer.exclude("tel_002", ".*samples")  # test exclusion of columns

        for ii in range(100):
            r0tel.waveform[:] = np.random.uniform(size=(50, 10))
            mc.energy = 10 ** np.random.uniform(1, 2) * u.TeV
            mc.core_x = np.random.uniform(-1, 1) * u.m
            mc.core_y = np.random.uniform(-1, 1) * u.m

            writer.write("tel_001", r0tel)
            writer.write("tel_002", r0tel)  # write a second table too
            writer.write("MC", mc) 
Example #8
Source File: AlignmentMatrixFactory.py    From emase with GNU General Public License v3.0 5 votes
def produce(self, h5file, title='Alignments', index_dtype='uint32', data_dtype=float, complib='zlib', incidence_only=True):
        h5fh = tables.open_file(h5file, 'w', title=title)
        fil  = tables.Filters(complevel=1, complib=complib)
        h5fh.set_node_attr(h5fh.root, 'incidence_only', incidence_only)
        h5fh.set_node_attr(h5fh.root, 'mtype', 'csc_matrix')
        h5fh.set_node_attr(h5fh.root, 'shape', (len(self.lname), len(self.hname), len(self.rname)))
        h5fh.set_node_attr(h5fh.root, 'hname', self.hname)
        h5fh.create_carray(h5fh.root, 'lname', obj=self.lname, title='Locus Names', filters=fil)
        h5fh.create_carray(h5fh.root, 'rname', obj=self.rname, title='Read Names', filters=fil)
        for hid in range(len(self.hname)):
            hap = self.hname[hid]
            infile = self.tmpfiles[hap]
            dmat = np.fromfile(open(infile, 'rb'), dtype='>I')
            dmat = dmat.reshape((len(dmat) // 2, 2)).T
            if dmat.shape[0] > 2:
                dvec = dmat[2]
            else:
                dvec = np.ones(dmat.shape[1])
            spmat = coo_matrix((dvec, dmat[:2]), shape=(len(self.rname), len(self.lname)))
            spmat = spmat.tocsc()
            hgroup = h5fh.create_group(h5fh.root, 'h%d' % hid, 'Sparse matrix components for Haplotype %d' % hid)
            i1 = h5fh.create_carray(hgroup, 'indptr', obj=spmat.indptr.astype(index_dtype), filters=fil)
            i2 = h5fh.create_carray(hgroup, 'indices', obj=spmat.indices.astype(index_dtype), filters=fil)
            if not incidence_only:
                d = h5fh.create_carray(hgroup, 'data', obj=spmat.data.astype(data_dtype), filters=fil)
        h5fh.flush()
        h5fh.close() 
Example #9
Source File: read_array.py    From seqc with GNU General Public License v2.0 5 votes
def initial_filtering(self, required_poly_t=1):
        """Apply different filters to the read array. If a read fails a filter, it is not
        passed to the others and so the counts for each filter is the number of reads that
        failed that one but passed all previous ones Filters are not ordered in any
        particular way.

        :param required_poly_t: minimum number of T nucleotides in the primer tail of a
          valid read
        :return None: method sets the status vector according to the result of filtering.
        """

        failing = np.zeros(self.data.shape[0], dtype=np.int8)

        # genes are dealt with differently depending on the state of the array
        if self._ambiguous_genes:
            nnz = self.genes.getnnz(axis=1)
            failing[nnz == 0] |= self.filter_codes['no_gene']
            failing[nnz > 1] |= self.filter_codes['gene_not_unique']
        else:  # multiple gene filter is empty
            failing[self.genes == 0] |= self.filter_codes['no_gene']

        # todo add logic for "primer_missing"
        failing[self.data['rmt'] == 0] |= self.filter_codes['primer_missing']
        failing[self.data['cell'] == 0] |= self.filter_codes['primer_missing']
        failing[self.data['n_poly_t'] < required_poly_t] |= self.filter_codes['low_polyt']

        self.data['status'] = np.bitwise_or(self.data['status'], failing) 
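The status column is a per-read bitmask, so several failure reasons can be recorded on one read and tested independently. A small sketch of the pattern, with illustrative filter codes:

import numpy as np

filter_codes = {'no_gene': 1, 'gene_not_unique': 2, 'low_polyt': 4}

status = np.zeros(4, dtype=np.int8)
status[0] |= filter_codes['no_gene']
status[0] |= filter_codes['low_polyt']

(status & filter_codes['low_polyt']) > 0  # -> array([ True, False, False, False])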
Example #10
Source File: read_array.py    From seqc with GNU General Public License v2.0 5 votes
def save(self, archive_name):
        """save a ReadArray object as an hdf5 archive

        :param str archive_name: filestem for the new archive
        :return None:
        """

        def store_carray(archive, array, name):
            atom = tb.Atom.from_dtype(array.dtype)
            store = archive.create_carray(archive.root, name, atom, array.shape)
            store[:] = array
            store.flush()

        if not archive_name.endswith('.h5'):
            archive_name += '.h5'

        # construct container
        blosc5 = tb.Filters(complevel=5, complib='blosc')
        f = tb.open_file(archive_name, mode='w', title='Data for seqc.ReadArray',
                         filters=blosc5)

        f.create_table(f.root, 'data', self.data)

        if self._ambiguous_genes:
            # each array is data, indices, indptr
            store_carray(f, self.genes.indices, 'indices')
            store_carray(f, self.genes.indptr, 'indptr')
            store_carray(f, self.genes.data, 'gene_data')
            store_carray(f, self.positions.data, 'positions_data')
        else:
            store_carray(f, self.genes, 'genes')
            store_carray(f, self.positions, 'positions')

        f.close() 
Example #11
Source File: data.py    From 3D-CNNs-for-Liver-Classification with Apache License 2.0 5 votes
def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data', tables.Float32Atom(), shape=data_shape,
                                           filters=filters, expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth', tables.UInt8Atom(), shape=truth_shape,
                                            filters=filters, expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine', tables.Float32Atom(), shape=(0, 4, 4),
                                             filters=filters, expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage 
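Typical use of this helper is to append one sample at a time along the extendable first axis and close the file when done; a sketch with illustrative shapes:

import numpy as np

hdf5_file, data_storage, truth_storage, affine_storage = create_data_file(
    'dataset.h5', n_channels=4, n_samples=100, image_shape=(64, 64, 64))

data_storage.append(np.zeros((1, 4, 64, 64, 64), dtype=np.float32))
truth_storage.append(np.zeros((1, 1, 64, 64, 64), dtype=np.uint8))
affine_storage.append(np.eye(4, dtype=np.float32)[None])

hdf5_file.close()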
Example #12
Source File: preprocess.py    From 3D-CNNs-for-Liver-Classification with Apache License 2.0 5 votes
def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1])
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data', tables.Float32Atom(), shape=data_shape,
                                           filters=filters, expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth', tables.UInt8Atom(), shape=truth_shape,
                                            filters=filters, expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage 
Example #13
Source File: AlignmentPropertyMatrix.py    From emase with GNU General Public License v3.0 5 votes
def save(self, h5file, title=None, index_dtype='uint32', data_dtype=float, incidence_only=True, complib='zlib', shallow=False):
        Sparse3DMatrix.save(self, h5file=h5file, title=title, index_dtype=index_dtype, data_dtype=data_dtype, incidence_only=incidence_only, complib=complib)
        h5fh = tables.open_file(h5file, 'a')
        fil  = tables.Filters(complevel=1, complib=complib)
        if self.count is not None:
            h5fh.create_carray(h5fh.root, 'count', obj=self.count, title='Equivalence Class Counts', filters=fil)
        if not shallow:
            h5fh.set_node_attr(h5fh.root, 'hname', self.hname)
            h5fh.create_carray(h5fh.root, 'lname', obj=self.lname, title='Locus Names', filters=fil)
            if self.rname is not None:
                h5fh.create_carray(h5fh.root, 'rname', obj=self.rname, title='Read Names', filters=fil)
        h5fh.flush()
        h5fh.close() 
Example #14
Source File: data_loader.py    From deepAPI with MIT License 5 votes
def save_vecs(vecs, fout):
    fvec = tables.open_file(fout, 'w')
    atom = tables.Atom.from_dtype(vecs.dtype)
    filters = tables.Filters(complib='blosc', complevel=5)
    ds = fvec.create_carray(fvec.root,'vecs', atom, vecs.shape,filters=filters)
    ds[:] = vecs
    print('done')
    fvec.close() 
Example #15
Source File: data_loader.py    From deep-code-search with MIT License 5 votes
def save_code_reprs(vecs, path):
    npvecs=np.array(vecs)
    fvec = tables.open_file(path, 'w')
    atom = tables.Atom.from_dtype(npvecs.dtype)
    filters = tables.Filters(complib='blosc', complevel=5)
    ds = fvec.create_carray(fvec.root, 'vecs', atom, npvecs.shape,filters=filters)
    ds[:] = npvecs
    fvec.close() 
Example #16
Source File: data_loader.py    From deep-code-search with MIT License 5 votes
def save_vecs(vecs, fout):
    fvec = tables.open_file(fout, 'w')
    atom = tables.Atom.from_dtype(vecs.dtype)
    filters = tables.Filters(complib='blosc', complevel=5)
    ds = fvec.create_carray(fvec.root,'vecs', atom, vecs.shape,filters=filters)
    ds[:] = vecs
    print('done')
    fvec.close() 
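Vectors written by the save helpers above can be loaded back with a plain slice; a sketch:

import tables

with tables.open_file('vecs.h5', mode='r') as fvec:
    vecs = fvec.root.vecs[:]  # decompressed into an in-memory NumPy array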
Example #17
Source File: data.py    From Keras-Brats-Improved-Unet3d with MIT License 5 votes
def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data', tables.Float32Atom(), shape=data_shape,
                                           filters=filters, expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth', tables.UInt8Atom(), shape=truth_shape,
                                            filters=filters, expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine', tables.Float32Atom(), shape=(0, 4, 4),
                                             filters=filters, expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage 
Example #18
Source File: moving_mnist.py    From RATM with MIT License 5 votes
def dump_test_set(self, h5filepath, nframes, framesize):
        # set rng to a hardcoded state, so we always have the same test set!
        self.numpy_rng.seed(1)
        with tables.open_file(h5filepath, 'w') as h5file:

            h5file.create_array(h5file.root, 'test_targets',
                                self.partitions['test']['targets'])

            vids = h5file.create_carray(
                h5file.root,
                'test_images',
                tables.Float32Atom(),
                shape=(10000,
                       nframes, framesize, framesize),
                filters=tables.Filters(complevel=5, complib='zlib'))

            pos = h5file.create_carray(
                h5file.root,
                'test_pos',
                tables.UInt16Atom(),
                shape=(10000,
                       nframes, 2),
                filters=tables.Filters(complevel=5, complib='zlib'))
            for i in range(100):
                print(i)
                (vids[i*100:(i+1)*100],
                 pos[i*100:(i+1)*100], _) = self.get_batch(
                     'test', 100, nframes, framesize,
                     idx=np.arange(i*100,(i+1)*100))
                h5file.flush() 
Example #19
Source File: h5tools.py    From pywr with GNU General Public License v3.0 5 votes
def __init__(self, filename, filter_kwds=None, mode="r", title='', metadata=None, create_directories=False):
        self._opened = False
        if isinstance(filename, (str, os.PathLike)):
            # filename is a path to open
            self.filename = filename
            # Not sure how else to deal with str / unicode requirements in pytables
            # See this issue: https://github.com/PyTables/PyTables/issues/522
            import sys
            if filter_kwds:
                if sys.version_info[0] == 2 and 'complib' in filter_kwds:
                    filter_kwds['complib'] = filter_kwds['complib'].encode()
                filters = tables.Filters(**filter_kwds)
            else:
                filters = None

            # Create directories for the filename if required
            if create_directories:
                try:
                    os.makedirs(os.path.dirname(filename))
                except OSError as exception:
                    import errno
                    if exception.errno != errno.EEXIST:
                        raise

            self.file = tables.open_file(filename, mode=mode, filters=filters, title=title)
            self._opened = True
        elif isinstance(filename, tables.File):
            # filename is a pytables file
            self.file = filename
            assert(self.file.isopen)
            self.filename = self.file.filename
            self._opened = False
        else:
            raise TypeError("{} must be initialised with a filename to open or an open tables.File".format(self.__class__.__name__))

        # now update metadata if given
        if metadata is not None and self.file.mode != 'r':
            for k, v in metadata.items():
                setattr(self.file.root._v_attrs, k, v) 
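A hedged usage sketch; H5Store is the assumed name of the pywr class this __init__ belongs to:

from pywr.h5tools import H5Store  # assumed import path and class name

store = H5Store('results.h5',
                filter_kwds={'complevel': 5, 'complib': 'zlib'},
                mode='w', title='Model results',
                metadata={'author': 'me'}, create_directories=True)
store.file.close()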
Example #20
Source File: get_target_regions.py    From WASP with Apache License 2.0 5 votes
def create_carray(self, h5f, chrom, atom):
        zlib_filter = tables.Filters(complevel=1, complib="zlib")

        # create CArray for this chromosome
        shape = [chrom.length]
        carray = h5f.create_carray(h5f.root, chrom.name,
                                  atom, shape, filters=zlib_filter)

        return carray 
Example #21
Source File: pytables_array_list.py    From FRETBursts with GNU General Public License v2.0 5 votes
def append(self, ndarray):
        name = self.get_name()
        comp_filter = tables.Filters(**self.compression)
        tarray = self.data_file.create_carray(self.group, name, obj=ndarray,
                                             filters=comp_filter)
        self.data_file.flush()
        super(PyTablesList, self).append(tarray)
        #print(self.prefix+str(self.size), ndarray)
        self.size += 1
        self.group._v_attrs.size = self.size 
Example #22
Source File: preprocess.py    From LV_groundhog with BSD 3-Clause "New" or "Revised" License 5 votes
def safe_hdf(array, name):
    if os.path.isfile(name + '.hdf') and not args.overwrite:
        logger.warning("Not saving %s, already exists." % (name + '.hdf'))
    else:
        if os.path.isfile(name + '.hdf'):
            logger.info("Overwriting %s." % (name + '.hdf'))
        else:
            logger.info("Saving to %s." % (name + '.hdf'))
        with tables.open_file(name + '.hdf', 'w') as f:
            atom = tables.Atom.from_dtype(array.dtype)
            filters = tables.Filters(complib='blosc', complevel=5)
            ds = f.create_carray(f.root, name.replace('.', ''), atom,
                                 array.shape, filters=filters)
            ds[:] = array 
Example #23
Source File: test_hdf5.py    From attention-lvcsr with MIT License 5 votes
def setUp(self):
        num_rows = 500
        filters = tables.Filters(complib='blosc', complevel=5)
        h5file = tables.open_file(
            'tmp.h5', mode='w', title='Test', filters=filters)
        group = h5file.create_group("/", 'Data')
        atom = tables.UInt8Atom()
        y = h5file.create_carray(group, 'y', atom=atom, title='Data targets',
                                 shape=(num_rows, 1), filters=filters)
        for i in range(num_rows):
            y[i] = i
        h5file.flush()
        h5file.close()
        self.dataset = PytablesDataset('tmp.h5', ('y',), 20, 500)
        self.dataset_default = PytablesDataset('tmp.h5', ('y',)) 
Example #24
Source File: test_find_intersecting_snps.py    From WASP with Apache License 2.0 5 votes
def write_snp_index_h5(self):
        atom = tables.Int16Atom(dflt=0)
    
        zlib_filter = tables.Filters(complevel=1, complib="zlib")
        
        snp_index_h5 = tables.open_file(self.snp_index_filename, "w")    

        snp_index = 0

        chrom_arrays = {}
        chrom_lengths = self.get_chrom_lengths()
        
        for snp in self.snp_list:
            if snp[0] in chrom_arrays:
                carray = chrom_arrays[snp[0]]
            else:
                # create CArray for this chromosome
                shape = [chrom_lengths[snp[0]]]
                carray = snp_index_h5.create_carray(snp_index_h5.root,
                                                   snp[0], atom, shape,
                                                   filters=zlib_filter)
                carray[:] = -1
                chrom_arrays[snp[0]] = carray

            pos = snp[1]
            carray[pos-1] = snp_index
            snp_index += 1
            
        self.write_hap_samples(snp_index_h5)

        snp_index_h5.close() 
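Reading the index back is a direct positional lookup into the per-chromosome CArray; a sketch (the chromosome name is illustrative):

import tables

with tables.open_file('snp_index.h5', 'r') as h5:
    snp_idx = h5.root.chr1[12344]  # 1-based SNP position minus 1; -1 means no SNP here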
Example #25
Source File: data.py    From 3DUnetCNN with MIT License 5 votes
def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data', tables.Float32Atom(), shape=data_shape,
                                           filters=filters, expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth', tables.UInt8Atom(), shape=truth_shape,
                                            filters=filters, expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine', tables.Float32Atom(), shape=(0, 4, 4),
                                             filters=filters, expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage 
Example #26
Source File: test_hdf5.py    From ctapipe with BSD 3-Clause "New" or "Revised" License 5 votes
def test_filters():
    from tables import Filters, open_file

    class TestContainer(Container):
        value = Field(-1, "test")

    no_comp = Filters(complevel=0)
    zstd = Filters(complevel=5, complib="blosc:zstd")

    with tempfile.NamedTemporaryFile(suffix=".hdf5") as f:
        with HDF5TableWriter(
            f.name, group_name="data", mode="w", filters=no_comp
        ) as writer:
            assert writer._h5file.filters.complevel == 0

            c = TestContainer(value=5)
            writer.write("default", c)

            writer.filters = zstd
            writer.write("zstd", c)

            writer.filters = no_comp
            writer.write("nocomp", c)

        with open_file(f.name) as h5file:
            assert h5file.root.data.default.filters.complevel == 0
            assert h5file.root.data.zstd.filters.complevel == 5
            assert h5file.root.data.zstd.filters.complib == "blosc:zstd"
            assert h5file.root.data.nocomp.filters.complevel == 0 
Example #27
Source File: data_source_tables_gen.py    From zipline-chinese with Apache License 2.0 5 votes
def merge_all_files_into_pytables(file_dir, file_out):
    """
    process each file into pytables
    """
    start = datetime.datetime.now()
    out_h5 = tables.open_file(file_out,
                              mode="w",
                              title="bars",
                              filters=tables.Filters(complevel=9,
                                                     complib='zlib'))
    table = None
    for file_in in glob.glob(file_dir + "/*.gz"):
        gzip_file = gzip.open(file_in, mode='rt')
        expected_header = ["dt", "sid", "open", "high", "low", "close",
                           "volume"]
        csv_reader = csv.DictReader(gzip_file)
        header = csv_reader.fieldnames
        if header != expected_header:
            logging.warn("expected header %s\n" % (expected_header))
            logging.warn("header_found %s" % (header))
            return

        for current_date, rows in parse_csv(csv_reader):
            table = out_h5.create_table("/TD", "date_" + current_date,
                                        OHLCTableDescription,
                                        expectedrows=len(rows),
                                        createparents=True)
            table.append(rows)
            table.flush()
        if table is not None:
            table.flush()
    end = datetime.datetime.now()
    diff = (end - start).seconds
    logging.debug("finished  it took %d." % (diff)) 
Example #28
Source File: test_hdf5.py    From fuel with MIT License 5 votes
def setUp(self):
        num_rows = 500
        filters = tables.Filters(complib='blosc', complevel=5)
        h5file = tables.open_file(
            'tmp.h5', mode='w', title='Test', filters=filters)
        group = h5file.create_group("/", 'Data')
        atom = tables.UInt8Atom()
        y = h5file.create_carray(group, 'y', atom=atom, title='Data targets',
                                 shape=(num_rows, 1), filters=filters)
        for i in range(num_rows):
            y[i] = i
        h5file.flush()
        h5file.close()
        self.dataset = PytablesDataset('tmp.h5', ('y',), 20, 500)
        self.dataset_default = PytablesDataset('tmp.h5', ('y',)) 
Example #29
Source File: preprocess.py    From NMT-Coverage with BSD 3-Clause "New" or "Revised" License 5 votes
def safe_hdf(array, name):
    if os.path.isfile(name + '.hdf') and not args.overwrite:
        logger.warning("Not saving %s, already exists." % (name + '.hdf'))
    else:
        if os.path.isfile(name + '.hdf'):
            logger.info("Overwriting %s." % (name + '.hdf'))
        else:
            logger.info("Saving to %s." % (name + '.hdf'))
        with tables.open_file(name + '.hdf', 'w') as f:
            atom = tables.Atom.from_dtype(array.dtype)
            filters = tables.Filters(complib='blosc', complevel=5)
            ds = f.create_carray(f.root, name.replace('.', ''), atom,
                                 array.shape, filters=filters)
            ds[:] = array 
Example #30
Source File: blizzard_data.py    From stcn with GNU General Public License v3.0 4 votes
def fetch_blizzard_tbptt(data_path, sz=8000, batch_size=100, file_name="blizzard_tbptt.h5"):

    hdf5_path = os.path.join(data_path, file_name)

    if not os.path.exists(hdf5_path):
        data_matches = []

        for root, dir_names, file_names in os.walk(data_path):
            for filename in fnmatch.filter(file_names, 'data_*.npy'):
                data_matches.append(os.path.join(root, filename))

        # sort in proper order
        data_matches = sorted(data_matches,
                              key=lambda x: int(
                                  x.split("/")[-1].split("_")[-1][0]))

        # setup tables
        compression_filter = tables.Filters(complevel=5, complib='blosc')
        hdf5_file = tables.open_file(hdf5_path, mode='w')
        data = hdf5_file.create_earray(hdf5_file.root, 'data',
                                      tables.Int16Atom(),
                                      shape=(0, sz),
                                      filters=compression_filter,)

        for n, f in enumerate(data_matches):
            print("Reading file %s" % (f))

            # with open(f) as fp:
            # Array of arrays, ragged
            #    d = np.load(fp)
            d = np.load(f)
            large_d = d[0]

            for i in range(1, len(d)):
                print("Processing line %i of %i" % (i+1, len(d)))
                di = d[i]

                if len(di.shape) > 1:
                    di = di[:, 0]

                large_d = np.concatenate([large_d, di])

            chunk_size = len(large_d) // batch_size
            seg_d = segment_axis(large_d, chunk_size, 0)
            num_batch = int((seg_d.shape[-1] - 1) / sz)

            for i in range(num_batch):
                batch = seg_d[:, i*sz:(i+1)*sz]

                for j in range(batch_size):
                    data.append(batch[j][None])

        hdf5_file.close()

    hdf5_file = tables.open_file(hdf5_path, mode='r')

    return hdf5_file.root.data
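The function returns the on-disk EArray node (the file handle it opens is deliberately left open), so callers can slice mini-batches directly; a sketch with an illustrative path:

data = fetch_blizzard_tbptt('/path/to/blizzard', sz=8000, batch_size=100)
batch = data[0:100]  # shape (100, 8000), dtype int16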