Python tables.Filters() Examples
The following are 30 code examples of tables.Filters(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tables, or try the search function.
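All of the examples below follow the same basic pattern: build a tables.Filters instance describing the desired compression (complevel ranges from 0, no compression, to 9; complib names the compressor, e.g. 'zlib' or 'blosc'), then pass it to tables.open_file() or to a node constructor such as create_carray(). As a warm-up, here is a minimal, self-contained sketch of that pattern; the file name and data are placeholders:

import numpy as np
import tables

# A Filters object only describes compression settings; the work happens
# when it is attached to a file or to an individual array.
filters = tables.Filters(complevel=5, complib='blosc', shuffle=True)

with tables.open_file('example.h5', mode='w') as h5f:
    arr = np.arange(1000, dtype=np.float32)
    # Compressed, chunked array built directly from an existing array.
    h5f.create_carray(h5f.root, 'values', obj=arr, filters=filters)

with tables.open_file('example.h5', mode='r') as h5f:
    values = h5f.root.values[:]  # decompression is transparent on read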
Example #1
Source File: svhn.py From batchup with MIT License | 6 votes

def fetch_svhn_extra(source_paths, target_path):
    extra_path = source_paths[0]

    print('Converting {} to HDF5 (compressed)...'.format(extra_path))
    f_out = tables.open_file(target_path, mode='w')
    g_out = f_out.create_group(f_out.root, 'svhn', 'SVHN data')
    filters = tables.Filters(complevel=9, complib='blosc')
    X_u8_arr = f_out.create_earray(
        g_out, 'extra_X_u8', tables.UInt8Atom(), (0, 3, 32, 32),
        filters=filters)
    y_arr = f_out.create_earray(
        g_out, 'extra_y', tables.Int32Atom(), (0,), filters=filters)

    # Load in the extra data Matlab file
    _insert_svhn_matlab_to_h5(X_u8_arr, y_arr, extra_path)

    f_out.close()

    return target_path
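The zero-length leading dimension in the shapes above is what makes the two EArrays extendable. The helper _insert_svhn_matlab_to_h5 is defined elsewhere in batchup; a hedged sketch of the append pattern it presumably relies on (the helper name and block shapes here are assumptions, not batchup code):

def _append_block(X_u8_arr, y_arr, X_block, y_block):
    # Hypothetical stand-in: EArray.append() grows the array along its
    # extendable (first) axis, compressing each new chunk with `filters`.
    X_u8_arr.append(X_block)  # X_block: (n, 3, 32, 32) uint8
    y_arr.append(y_block)     # y_block: (n,) int32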
Example #2
Source File: hdf5io.py From deepdish with BSD 3-Clause "New" or "Revised" License | 6 votes

def _get_compression_filters(compression='default'):
    if compression == 'default':
        config = conf.config()
        compression = config.get('io', 'compression')
    elif compression is True:
        compression = 'zlib'

    if (compression is False or compression is None or
            compression == 'none' or compression == 'None'):
        ff = None
    else:
        if isinstance(compression, (tuple, list)):
            compression, level = compression
        else:
            level = 9

        try:
            ff = tables.Filters(complevel=level, complib=compression,
                                shuffle=True)
        except Exception:
            warnings.warn(("(deepdish.io.save) Missing compression method {}: "
                           "no compression will be used.").format(compression))
            ff = None
    return ff
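A few illustrative calls to the helper above, with the results each branch produces (the 'default' path is skipped here because it requires deepdish's configuration machinery):

ff = _get_compression_filters(('blosc', 5))  # Filters(complevel=5, complib='blosc', shuffle=True)
ff = _get_compression_filters(True)          # zlib at the maximum level, 9
ff = _get_compression_filters(None)          # None, i.e. no compression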
Example #3
Source File: dense_design_matrix.py From TextDetector with GNU General Public License v3.0 | 6 votes

def __init__(self, X=None, topo_view=None, y=None,
             view_converter=None, axes=('b', 0, 1, 'c'),
             rng=_default_seed, X_labels=None, y_labels=None):
    super_self = super(DenseDesignMatrixPyTables, self)
    super_self.__init__(X=X,
                        topo_view=topo_view,
                        y=y,
                        view_converter=view_converter,
                        axes=axes,
                        rng=rng,
                        X_labels=X_labels,
                        y_labels=y_labels)
    self._check_labels()
    ensure_tables()
    if not hasattr(self, 'filters'):
        self.filters = tables.Filters(complib='blosc', complevel=5)
Example #4
Source File: Sparse3DMatrix.py From emase with GNU General Public License v3.0 | 6 votes

def save(self, h5file, title=None, index_dtype='uint32', data_dtype=float,
         incidence_only=True, complib='zlib'):
    if self.finalized:
        h5fh = tables.open_file(h5file, 'w', title=title)
        fil = tables.Filters(complevel=1, complib=complib)
        h5fh.set_node_attr(h5fh.root, 'incidence_only', incidence_only)
        h5fh.set_node_attr(h5fh.root, 'mtype', 'csc_matrix')
        h5fh.set_node_attr(h5fh.root, 'shape', self.shape)
        for hid in xrange(self.shape[1]):
            hgroup = h5fh.create_group(h5fh.root, 'h%d' % hid,
                                       'Sparse matrix components for Haplotype %d' % hid)
            spmat = self.data[hid]
            i1 = h5fh.create_carray(hgroup, 'indptr',
                                    obj=spmat.indptr.astype(index_dtype),
                                    filters=fil)
            i2 = h5fh.create_carray(hgroup, 'indices',
                                    obj=spmat.indices.astype(index_dtype),
                                    filters=fil)
            if not incidence_only:
                d = h5fh.create_carray(hgroup, 'data',
                                       obj=spmat.data.astype(data_dtype),
                                       filters=fil)
        h5fh.flush()
        h5fh.close()
    else:
        raise RuntimeError('The matrix is not finalized.')
Example #5
Source File: bam2h5.py From WASP with Apache License 2.0 | 6 votes

def create_carray(h5f, chrom, data_type):
    if data_type == "uint8":
        atom = tables.UInt8Atom(dflt=0)
    elif data_type == "uint16":
        atom = tables.UInt16Atom(dflt=0)
    else:
        raise NotImplementedError("unsupported datatype %s" % data_type)

    zlib_filter = tables.Filters(complevel=1, complib="zlib")

    # create CArray for this chromosome
    shape = [chrom.length]
    carray = h5f.create_carray(h5f.root, chrom.name,
                               atom, shape, filters=zlib_filter)

    return carray
Example #6
Source File: tables_utils.py From mmvt with GNU General Public License v3.0 | 6 votes

def create_hdf5_arr_table(hdf_file, group, array_name,
                          dtype=np.dtype('float64'), shape=(), arr=None,
                          complib='blosc', complevel=5):
    atom = tables.Atom.from_dtype(dtype)
    if arr is not None:
        shape = arr.shape
    # filters = tables.Filters(complib=complib, complevel=complevel)
    if not is_table_in_group(group, array_name):
        try:
            ds = hdf_file.create_carray(group, array_name, atom, shape)
        except:
            ds = hdf_file.createCArray(group, array_name, atom, shape)
    else:
        ds = group._v_children[array_name]
    if arr is not None:
        ds[:] = arr
    return ds
Example #7
Source File: test_hdf5.py From ctapipe with BSD 3-Clause "New" or "Revised" License | 6 votes

def test_write_container(temp_h5_file):
    r0tel = R0CameraContainer()
    mc = MCEventContainer()
    mc.reset()
    r0tel.waveform = np.random.uniform(size=(50, 10))
    r0tel.meta["test_attribute"] = 3.14159
    r0tel.meta["date"] = "2020-10-10"

    with HDF5TableWriter(
        temp_h5_file, group_name="R0", filters=tables.Filters(complevel=7)
    ) as writer:
        writer.exclude("tel_002", ".*samples")  # test exclusion of columns
        for ii in range(100):
            r0tel.waveform[:] = np.random.uniform(size=(50, 10))
            mc.energy = 10 ** np.random.uniform(1, 2) * u.TeV
            mc.core_x = np.random.uniform(-1, 1) * u.m
            mc.core_y = np.random.uniform(-1, 1) * u.m
            writer.write("tel_001", r0tel)
            writer.write("tel_002", r0tel)  # write a second table too
            writer.write("MC", mc)
Example #8
Source File: AlignmentMatrixFactory.py From emase with GNU General Public License v3.0 | 5 votes

def produce(self, h5file, title='Alignments', index_dtype='uint32',
            data_dtype=float, complib='zlib', incidence_only=True):
    h5fh = tables.open_file(h5file, 'w', title=title)
    fil = tables.Filters(complevel=1, complib=complib)
    h5fh.set_node_attr(h5fh.root, 'incidence_only', incidence_only)
    h5fh.set_node_attr(h5fh.root, 'mtype', 'csc_matrix')
    h5fh.set_node_attr(h5fh.root, 'shape',
                       (len(self.lname), len(self.hname), len(self.rname)))
    h5fh.set_node_attr(h5fh.root, 'hname', self.hname)
    h5fh.create_carray(h5fh.root, 'lname', obj=self.lname,
                       title='Locus Names', filters=fil)
    h5fh.create_carray(h5fh.root, 'rname', obj=self.rname,
                       title='Read Names', filters=fil)
    for hid in xrange(len(self.hname)):
        hap = self.hname[hid]
        infile = self.tmpfiles[hap]
        dmat = np.fromfile(open(infile, 'rb'), dtype='>I')
        dmat = dmat.reshape((len(dmat)/2, 2)).T
        if dmat.shape[0] > 2:
            dvec = dmat[2]
        else:
            dvec = np.ones(dmat.shape[1])
        spmat = coo_matrix((dvec, dmat[:2]),
                           shape=(len(self.rname), len(self.lname)))
        spmat = spmat.tocsc()
        hgroup = h5fh.create_group(h5fh.root, 'h%d' % hid,
                                   'Sparse matrix components for Haplotype %d' % hid)
        i1 = h5fh.create_carray(hgroup, 'indptr',
                                obj=spmat.indptr.astype(index_dtype),
                                filters=fil)
        i2 = h5fh.create_carray(hgroup, 'indices',
                                obj=spmat.indices.astype(index_dtype),
                                filters=fil)
        if not incidence_only:
            d = h5fh.create_carray(hgroup, 'data',
                                   obj=spmat.data.astype(data_dtype),
                                   filters=fil)
    h5fh.flush()
    h5fh.close()
Example #9
Source File: read_array.py From seqc with GNU General Public License v2.0 | 5 votes

def initial_filtering(self, required_poly_t=1):
    """Apply different filters to the read array. If a read fails a filter,
    it is not passed to the others, so the count for each filter is the
    number of reads that failed that one but passed all previous ones.
    Filters are not ordered in any particular way.

    :param required_poly_t: minimum number of T nucleotides in the primer
      tail of a valid read
    :return None: method sets the status vector according to the result of
      filtering.
    """
    failing = np.zeros(self.data.shape[0], dtype=np.int8)

    # genes are dealt with differently depending on the state of the array
    if self._ambiguous_genes:
        nnz = self.genes.getnnz(axis=1)
        failing[nnz == 0] |= self.filter_codes['no_gene']
        failing[nnz > 1] |= self.filter_codes['gene_not_unique']
    else:  # multiple gene filter is empty
        failing[self.genes == 0] |= self.filter_codes['no_gene']

    # todo add logic for "primer_missing"
    failing[self.data['rmt'] == 0] |= self.filter_codes['primer_missing']
    failing[self.data['cell'] == 0] |= self.filter_codes['primer_missing']
    failing[self.data['n_poly_t'] < required_poly_t] |= self.filter_codes['low_polyt']

    self.data['status'] = np.bitwise_or(self.data['status'], failing)
Example #10
Source File: read_array.py From seqc with GNU General Public License v2.0 | 5 votes

def save(self, archive_name):
    """save a ReadArray object as an hdf5 archive

    :param str archive_name: filestem for the new archive
    :return None:
    """

    def store_carray(archive, array, name):
        atom = tb.Atom.from_dtype(array.dtype)
        store = archive.create_carray(archive.root, name, atom, array.shape)
        store[:] = array
        store.flush()

    if not archive_name.endswith('.h5'):
        archive_name += '.h5'

    # construct container
    blosc5 = tb.Filters(complevel=5, complib='blosc')
    f = tb.open_file(archive_name, mode='w', title='Data for seqc.ReadArray',
                     filters=blosc5)

    f.create_table(f.root, 'data', self.data)

    if self._ambiguous_genes:
        # each array is data, indices, indptr
        store_carray(f, self.genes.indices, 'indices')
        store_carray(f, self.genes.indptr, 'indptr')
        store_carray(f, self.genes.data, 'gene_data')
        store_carray(f, self.positions.data, 'positions_data')
    else:
        store_carray(f, self.genes, 'genes')
        store_carray(f, self.positions, 'positions')

    f.close()
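A sketch of how such an archive might be read back in the ambiguous-genes case. The sparse format is an assumption here, since the (data, indices, indptr) triplet stored above fits either csr_matrix or csc_matrix, and the shape is left to scipy's inference; the file name is a placeholder:

import tables as tb
from scipy.sparse import csr_matrix  # format assumed; could equally be csc

with tb.open_file('reads.h5', mode='r') as f:
    data = f.root.data.read()  # the structured read table
    # Reassemble the gene matrix from its stored components.
    genes = csr_matrix((f.root.gene_data.read(),
                        f.root.indices.read(),
                        f.root.indptr.read()))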
Example #11
Source File: data.py From 3D-CNNs-for-Liver-Classification with Apache License 2.0 | 5 votes

def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                           tables.Float32Atom(),
                                           shape=data_shape, filters=filters,
                                           expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                            tables.UInt8Atom(),
                                            shape=truth_shape, filters=filters,
                                            expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine',
                                             tables.Float32Atom(),
                                             shape=(0, 4, 4), filters=filters,
                                             expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage
Example #12
Source File: preprocess.py From 3D-CNNs-for-Liver-Classification with Apache License 2.0 | 5 votes

def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1])
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                           tables.Float32Atom(),
                                           shape=data_shape, filters=filters,
                                           expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                            tables.UInt8Atom(),
                                            shape=truth_shape, filters=filters,
                                            expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage
Example #13
Source File: AlignmentPropertyMatrix.py From emase with GNU General Public License v3.0 | 5 votes

def save(self, h5file, title=None, index_dtype='uint32', data_dtype=float,
         incidence_only=True, complib='zlib', shallow=False):
    Sparse3DMatrix.save(self, h5file=h5file, title=title,
                        index_dtype=index_dtype, data_dtype=data_dtype,
                        incidence_only=incidence_only, complib=complib)
    h5fh = tables.open_file(h5file, 'a')
    fil = tables.Filters(complevel=1, complib=complib)
    if self.count is not None:
        h5fh.create_carray(h5fh.root, 'count', obj=self.count,
                           title='Equivalence Class Counts', filters=fil)
    if not shallow:
        h5fh.set_node_attr(h5fh.root, 'hname', self.hname)
        h5fh.create_carray(h5fh.root, 'lname', obj=self.lname,
                           title='Locus Names', filters=fil)
        if self.rname is not None:
            h5fh.create_carray(h5fh.root, 'rname', obj=self.rname,
                               title='Read Names', filters=fil)
    h5fh.flush()
    h5fh.close()
Example #14
Source File: data_loader.py From deepAPI with MIT License | 5 votes

def save_vecs(vecs, fout):
    fvec = tables.open_file(fout, 'w')
    atom = tables.Atom.from_dtype(vecs.dtype)
    filters = tables.Filters(complib='blosc', complevel=5)
    ds = fvec.create_carray(fvec.root, 'vecs', atom, vecs.shape,
                            filters=filters)
    ds[:] = vecs
    print('done')
    fvec.close()
Example #15
Source File: data_loader.py From deep-code-search with MIT License | 5 votes

def save_code_reprs(vecs, path):
    npvecs = np.array(vecs)
    fvec = tables.open_file(path, 'w')
    atom = tables.Atom.from_dtype(npvecs.dtype)
    filters = tables.Filters(complib='blosc', complevel=5)
    ds = fvec.create_carray(fvec.root, 'vecs', atom, npvecs.shape,
                            filters=filters)
    ds[:] = npvecs
    fvec.close()
Example #16
Source File: data_loader.py From deep-code-search with MIT License | 5 votes

def save_vecs(vecs, fout):
    fvec = tables.open_file(fout, 'w')
    atom = tables.Atom.from_dtype(vecs.dtype)
    filters = tables.Filters(complib='blosc', complevel=5)
    ds = fvec.create_carray(fvec.root, 'vecs', atom, vecs.shape,
                            filters=filters)
    ds[:] = vecs
    print('done')
    fvec.close()
Example #17
Source File: data.py From Keras-Brats-Improved-Unet3d with MIT License | 5 votes

def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                           tables.Float32Atom(),
                                           shape=data_shape, filters=filters,
                                           expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                            tables.UInt8Atom(),
                                            shape=truth_shape, filters=filters,
                                            expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine',
                                             tables.Float32Atom(),
                                             shape=(0, 4, 4), filters=filters,
                                             expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage
Example #18
Source File: moving_mnist.py From RATM with MIT License | 5 votes

def dump_test_set(self, h5filepath, nframes, framesize):
    # set rng to a hardcoded state, so we always have the same test set!
    self.numpy_rng.seed(1)
    with tables.openFile(h5filepath, 'w') as h5file:
        h5file.createArray(h5file.root, 'test_targets',
                           self.partitions['test']['targets'])
        vids = h5file.createCArray(
            h5file.root, 'test_images', tables.Float32Atom(),
            shape=(10000, nframes, framesize, framesize),
            filters=tables.Filters(complevel=5, complib='zlib'))
        pos = h5file.createCArray(
            h5file.root, 'test_pos', tables.UInt16Atom(),
            shape=(10000, nframes, 2),
            filters=tables.Filters(complevel=5, complib='zlib'))
        for i in range(100):
            print i
            (vids[i*100:(i+1)*100],
             pos[i*100:(i+1)*100], _) = self.get_batch(
                'test', 100, nframes, framesize,
                idx=np.arange(i*100, (i+1)*100))
        h5file.flush()
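This example uses the legacy camelCase PyTables API (openFile, createArray, createCArray), deprecated since PyTables 3.0, along with Python 2's print statement. A minimal sketch of the opening calls in the modern spelling; targets here stands in for self.partitions['test']['targets']:

with tables.open_file(h5filepath, 'w') as h5file:
    h5file.create_array(h5file.root, 'test_targets', targets)
    vids = h5file.create_carray(
        h5file.root, 'test_images', tables.Float32Atom(),
        shape=(10000, nframes, framesize, framesize),
        filters=tables.Filters(complevel=5, complib='zlib'))
    # ... and print(i) replaces "print i" under Python 3.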
Example #19
Source File: h5tools.py From pywr with GNU General Public License v3.0 | 5 votes

def __init__(self, filename, filter_kwds=None, mode="r", title='',
             metadata=None, create_directories=False):
    self._opened = False
    if isinstance(filename, (str, os.PathLike)):
        # filename is a path to open
        self.filename = filename
        # Not sure how else to deal with str / unicode requirements in pytables
        # See this issue: https://github.com/PyTables/PyTables/issues/522
        import sys
        if filter_kwds:
            if sys.version_info[0] == 2 and 'complib' in filter_kwds:
                filter_kwds['complib'] = filter_kwds['complib'].encode()
            filters = tables.Filters(**filter_kwds)
        else:
            filters = None

        # Create directories for the filename if required
        if create_directories:
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exception:
                import errno
                if exception.errno != errno.EEXIST:
                    raise

        self.file = tables.open_file(filename, mode=mode, filters=filters,
                                     title=title)
        self._opened = True
    elif isinstance(filename, tables.File):
        # filename is a pytables file
        self.file = filename
        assert(self.file.isopen)
        self.filename = self.file.filename
        self._opened = False
    else:
        raise TypeError("{} must be initialised with a filename to open or an "
                        "open tables.File".format(self.__class__.__name__))

    # now update metadata if given
    if metadata is not None and self.file.mode != 'r':
        for k, v in metadata.items():
            setattr(self.file.root._v_attrs, k, v)
Example #20
Source File: get_target_regions.py From WASP with Apache License 2.0 | 5 votes

def create_carray(self, h5f, chrom, atom):
    zlib_filter = tables.Filters(complevel=1, complib="zlib")

    # create CArray for this chromosome
    shape = [chrom.length]
    carray = h5f.create_carray(h5f.root, chrom.name,
                               atom, shape, filters=zlib_filter)

    return carray
Example #21
Source File: pytables_array_list.py From FRETBursts with GNU General Public License v2.0 | 5 votes

def append(self, ndarray):
    name = self.get_name()
    comp_filter = tables.Filters(**self.compression)
    tarray = self.data_file.create_carray(self.group, name, obj=ndarray,
                                          filters=comp_filter)
    self.data_file.flush()
    super(PyTablesList, self).append(tarray)
    # print(self.prefix + str(self.size), ndarray)
    self.size += 1
    self.group._v_attrs.size = self.size
Example #22
Source File: preprocess.py From LV_groundhog with BSD 3-Clause "New" or "Revised" License | 5 votes

def safe_hdf(array, name):
    if os.path.isfile(name + '.hdf') and not args.overwrite:
        logger.warning("Not saving %s, already exists." % (name + '.hdf'))
    else:
        if os.path.isfile(name + '.hdf'):
            logger.info("Overwriting %s." % (name + '.hdf'))
        else:
            logger.info("Saving to %s." % (name + '.hdf'))

        with tables.openFile(name + '.hdf', 'w') as f:
            atom = tables.Atom.from_dtype(array.dtype)
            filters = tables.Filters(complib='blosc', complevel=5)
            ds = f.createCArray(f.root, name.replace('.', ''), atom,
                                array.shape, filters=filters)
            ds[:] = array
Example #23
Source File: test_hdf5.py From attention-lvcsr with MIT License | 5 votes

def setUp(self):
    num_rows = 500
    filters = tables.Filters(complib='blosc', complevel=5)

    h5file = tables.open_file('tmp.h5', mode='w', title='Test',
                              filters=filters)
    group = h5file.create_group("/", 'Data')
    atom = tables.UInt8Atom()
    y = h5file.create_carray(group, 'y', atom=atom, title='Data targets',
                             shape=(num_rows, 1), filters=filters)
    for i in range(num_rows):
        y[i] = i
    h5file.flush()
    h5file.close()

    self.dataset = PytablesDataset('tmp.h5', ('y',), 20, 500)
    self.dataset_default = PytablesDataset('tmp.h5', ('y',))
Example #24
Source File: test_find_intersecting_snps.py From WASP with Apache License 2.0 | 5 votes

def write_snp_index_h5(self):
    atom = tables.Int16Atom(dflt=0)
    zlib_filter = tables.Filters(complevel=1, complib="zlib")

    snp_index_h5 = tables.open_file(self.snp_index_filename, "w")

    snp_index = 0
    chrom_arrays = {}
    chrom_lengths = self.get_chrom_lengths()

    for snp in self.snp_list:
        if snp[0] in chrom_arrays:
            carray = chrom_arrays[snp[0]]
        else:
            # create CArray for this chromosome
            shape = [chrom_lengths[snp[0]]]
            carray = snp_index_h5.create_carray(snp_index_h5.root, snp[0],
                                                atom, shape,
                                                filters=zlib_filter)
            carray[:] = -1
            chrom_arrays[snp[0]] = carray

        pos = snp[1]
        carray[pos-1] = snp_index
        snp_index += 1

    self.write_hap_samples(snp_index_h5)

    snp_index_h5.close()
Example #25
Source File: data.py From 3DUnetCNN with MIT License | 5 votes

def create_data_file(out_file, n_channels, n_samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, n_channels] + list(image_shape))
    truth_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                           tables.Float32Atom(),
                                           shape=data_shape, filters=filters,
                                           expectedrows=n_samples)
    truth_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                            tables.UInt8Atom(),
                                            shape=truth_shape, filters=filters,
                                            expectedrows=n_samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine',
                                             tables.Float32Atom(),
                                             shape=(0, 4, 4), filters=filters,
                                             expectedrows=n_samples)
    return hdf5_file, data_storage, truth_storage, affine_storage
Example #26
Source File: test_hdf5.py From ctapipe with BSD 3-Clause "New" or "Revised" License | 5 votes

def test_filters():
    from tables import Filters, open_file

    class TestContainer(Container):
        value = Field(-1, "test")

    no_comp = Filters(complevel=0)
    zstd = Filters(complevel=5, complib="blosc:zstd")

    with tempfile.NamedTemporaryFile(suffix=".hdf5") as f:
        with HDF5TableWriter(
            f.name, group_name="data", mode="w", filters=no_comp
        ) as writer:
            assert writer._h5file.filters.complevel == 0

            c = TestContainer(value=5)
            writer.write("default", c)

            writer.filters = zstd
            writer.write("zstd", c)

            writer.filters = no_comp
            writer.write("nocomp", c)

        with open_file(f.name) as h5file:
            assert h5file.root.data.default.filters.complevel == 0
            assert h5file.root.data.zstd.filters.complevel == 5
            assert h5file.root.data.zstd.filters.complib == "blosc:zstd"
            assert h5file.root.data.nocomp.filters.complevel == 0
Example #27
Source File: data_source_tables_gen.py From zipline-chinese with Apache License 2.0 | 5 votes

def merge_all_files_into_pytables(file_dir, file_out):
    """
    process each file into pytables
    """
    start = None
    start = datetime.datetime.now()
    out_h5 = tables.openFile(file_out,
                             mode="w",
                             title="bars",
                             filters=tables.Filters(complevel=9,
                                                    complib='zlib'))
    table = None
    for file_in in glob.glob(file_dir + "/*.gz"):
        gzip_file = gzip.open(file_in)
        expected_header = ["dt", "sid", "open", "high", "low", "close",
                           "volume"]
        csv_reader = csv.DictReader(gzip_file)
        header = csv_reader.fieldnames
        if header != expected_header:
            logging.warn("expected header %s\n" % (expected_header))
            logging.warn("header_found %s" % (header))
            return

        for current_date, rows in parse_csv(csv_reader):
            table = out_h5.createTable("/TD", "date_" + current_date,
                                       OHLCTableDescription,
                                       expectedrows=len(rows),
                                       createparents=True)
            table.append(rows)
            table.flush()
    if table is not None:
        table.flush()
    end = datetime.datetime.now()
    diff = (end - start).seconds
    logging.debug("finished it took %d." % (diff))
Example #28
Source File: test_hdf5.py From fuel with MIT License | 5 votes

def setUp(self):
    num_rows = 500
    filters = tables.Filters(complib='blosc', complevel=5)

    h5file = tables.open_file('tmp.h5', mode='w', title='Test',
                              filters=filters)
    group = h5file.create_group("/", 'Data')
    atom = tables.UInt8Atom()
    y = h5file.create_carray(group, 'y', atom=atom, title='Data targets',
                             shape=(num_rows, 1), filters=filters)
    for i in range(num_rows):
        y[i] = i
    h5file.flush()
    h5file.close()

    self.dataset = PytablesDataset('tmp.h5', ('y',), 20, 500)
    self.dataset_default = PytablesDataset('tmp.h5', ('y',))
Example #29
Source File: preprocess.py From NMT-Coverage with BSD 3-Clause "New" or "Revised" License | 5 votes

def safe_hdf(array, name):
    if os.path.isfile(name + '.hdf') and not args.overwrite:
        logger.warning("Not saving %s, already exists." % (name + '.hdf'))
    else:
        if os.path.isfile(name + '.hdf'):
            logger.info("Overwriting %s." % (name + '.hdf'))
        else:
            logger.info("Saving to %s." % (name + '.hdf'))

        with tables.openFile(name + '.hdf', 'w') as f:
            atom = tables.Atom.from_dtype(array.dtype)
            filters = tables.Filters(complib='blosc', complevel=5)
            ds = f.createCArray(f.root, name.replace('.', ''), atom,
                                array.shape, filters=filters)
            ds[:] = array
Example #30
Source File: blizzard_data.py From stcn with GNU General Public License v3.0 | 4 votes

def fetch_blizzard_tbptt(data_path, sz=8000, batch_size=100,
                         file_name="blizzard_tbptt.h5"):
    hdf5_path = os.path.join(data_path, file_name)

    if not os.path.exists(hdf5_path):
        data_matches = []

        for root, dir_names, file_names in os.walk(data_path):
            for filename in fnmatch.filter(file_names, 'data_*.npy'):
                data_matches.append(os.path.join(root, filename))

        # sort in proper order
        data_matches = sorted(data_matches,
                              key=lambda x: int(
                                  x.split("/")[-1].split("_")[-1][0]))

        # setup tables
        compression_filter = tables.Filters(complevel=5, complib='blosc')
        hdf5_file = tables.open_file(hdf5_path, mode='w')
        data = hdf5_file.create_earray(hdf5_file.root, 'data',
                                       tables.Int16Atom(),
                                       shape=(0, sz),
                                       filters=compression_filter)

        for n, f in enumerate(data_matches):
            print("Reading file %s" % (f))

            # with open(f) as fp:
            #     # Array of arrays, ragged
            #     d = np.load(fp)
            d = np.load(f)
            large_d = d[0]

            for i in range(1, len(d)):
                print("Processing line %i of %i" % (i+1, len(d)))
                di = d[i]

                if len(di.shape) > 1:
                    di = di[:, 0]

                large_d = np.concatenate([large_d, di])

            chunk_size = int(np.float(len(large_d) / batch_size))
            seg_d = segment_axis(large_d, chunk_size, 0)
            num_batch = int(np.float((seg_d.shape[-1] - 1) / float(sz)))

            for i in range(num_batch):
                batch = seg_d[:, i*sz:(i+1)*sz]

                for j in range(batch_size):
                    data.append(batch[j][None])

        hdf5_file.close()

    hdf5_file = tables.open_file(hdf5_path, mode='r')

    return hdf5_file.root.data
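Note that the function returns the open EArray node rather than an in-memory array, so slices are read and decompressed on demand; a short usage sketch (the path is a placeholder, and closing the underlying file is left to the caller, as in the original):

data = fetch_blizzard_tbptt('/path/to/blizzard')
minibatch = data[:100]  # reads and decompresses only the first 100 rows
n_rows = data.nrows     # total number of stored rows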