Python h5py.special_dtype() Examples
The following are 30 code examples of h5py.special_dtype(), collected from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the h5py module, or try the search function.
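As a quick orientation before the examples, here is a minimal, self-contained sketch of the three common uses of h5py.special_dtype(): variable-length strings, variable-length numeric arrays, and enumerated integers. The file name is arbitrary, and on h5py 2.10 and later the named helpers h5py.string_dtype(), h5py.vlen_dtype() and h5py.enum_dtype() are the preferred spellings for the same types.

import h5py
import numpy as np

str_dt = h5py.special_dtype(vlen=str)                               # variable-length strings
vlen_dt = h5py.special_dtype(vlen=np.dtype('uint8'))                # variable-length uint8 arrays
enum_dt = h5py.special_dtype(enum=(np.uint8, {'OFF': 0, 'ON': 1}))  # enumerated integers

with h5py.File('example.h5', 'w') as f:
    names = f.create_dataset('names', (2,), dtype=str_dt)
    names[:] = ['foo', 'bar']

    blobs = f.create_dataset('blobs', (2,), dtype=vlen_dt)
    blobs[0] = np.arange(3, dtype=np.uint8)
    blobs[1] = np.arange(5, dtype=np.uint8)

    switches = f.create_dataset('switches', (2,), dtype=enum_dt)
    switches[:] = [0, 1]

# check_dtype recovers the special type information from a dtype
assert h5py.check_dtype(vlen=str_dt) is str
assert h5py.check_dtype(enum=enum_dt) == {'OFF': 0, 'ON': 1}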
Example #1
Source File: ilsvrc2010.py From attention-lvcsr with MIT License

def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,), dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_total, 1), dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')
Example #2
Source File: million_song_dataset.py From implicit with MIT License

def _hfd5_from_dataframe(data, track_info, outputfilename):
    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(np.float32),
                        (data['track'].cat.codes.copy(),
                         data['user'].cat.codes.copy()))).tocsr()

    with h5py.File(outputfilename, "w") as f:
        g = f.create_group('track_user_plays')
        g.create_dataset("data", data=plays.data)
        g.create_dataset("indptr", data=plays.indptr)
        g.create_dataset("indices", data=plays.indices)

        dt = h5py.special_dtype(vlen=str)
        dset = f.create_dataset('track', track_info.shape, dtype=dt)
        dset[:] = track_info

        user = list(data['user'].cat.categories)
        dset = f.create_dataset('user', (len(user),), dtype=dt)
        dset[:] = user
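Reading one of these variable-length string datasets back is just a matter of opening the file and slicing. A small read-back sketch (the file name is hypothetical; the dataset names follow the example above, and on h5py 3.x variable-length string data is returned as bytes unless .asstr() is used):

import h5py

with h5py.File('msd_plays.h5', 'r') as f:        # hypothetical file name
    plays_data = f['track_user_plays/data'][:]   # CSR components written above
    tracks = f['track'][:]                       # object array of track strings
    # On h5py >= 3.0 these come back as bytes; f['track'].asstr()[:] gives str.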
Example #3
Source File: sketchfab.py From implicit with MIT License

def _hfd5_from_dataframe(data, outputfilename):
    items = data['mid'].cat.codes.copy()
    users = data['uid'].cat.codes.copy()
    values = np.ones(len(items)).astype(np.float32)

    # create a sparse matrix of all the item/users/likes
    likes = coo_matrix((values, (items, users))).astype(np.float32).tocsr()

    with h5py.File(outputfilename, "w") as f:
        g = f.create_group('item_user_likes')
        g.create_dataset("data", data=likes.data)
        g.create_dataset("indptr", data=likes.indptr)
        g.create_dataset("indices", data=likes.indices)

        dt = h5py.special_dtype(vlen=str)
        item = list(data['mid'].cat.categories)
        dset = f.create_dataset('item', (len(item),), dtype=dt)
        dset[:] = item

        user = list(data['uid'].cat.categories)
        dset = f.create_dataset('user', (len(user),), dtype=dt)
        dset[:] = user
Example #4
Source File: lastfm.py From implicit with MIT License

def _hfd5_from_dataframe(data, outputfilename):
    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(np.float32),
                        (data['artist'].cat.codes.copy(),
                         data['user'].cat.codes.copy()))).tocsr()

    with h5py.File(outputfilename, "w") as f:
        g = f.create_group('artist_user_plays')
        g.create_dataset("data", data=plays.data)
        g.create_dataset("indptr", data=plays.indptr)
        g.create_dataset("indices", data=plays.indices)

        dt = h5py.special_dtype(vlen=str)
        artist = list(data['artist'].cat.categories)
        dset = f.create_dataset('artist', (len(artist),), dtype=dt)
        dset[:] = artist

        user = list(data['user'].cat.categories)
        dset = f.create_dataset('user', (len(user),), dtype=dt)
        dset[:] = user
Example #5
Source File: h5_test.py From keras-image-segmentation with MIT License

def write_data(h5py_file, mode, x_paths, y_paths):
    num_data = len(x_paths)

    uint8_dt = h5py.special_dtype(vlen=np.uint8)
    string_dt = h5py.special_dtype(vlen=str)

    group = h5py_file.create_group(mode)
    h5_name = group.create_dataset('name', shape=(num_data,), dtype=string_dt)
    h5_image = group.create_dataset('image', shape=(num_data,), dtype=uint8_dt)
    h5_label = group.create_dataset('label', shape=(num_data,), dtype=uint8_dt)

    h5_image.attrs['size'] = [256, 512, 3]
    h5_label.attrs['size'] = [256, 512, 1]

    for i in range(num_data):
        x_img = cv2.imread(x_paths[i], 1)
        y_img = cv2.imread(y_paths[i], 0)

        x_img = cv2.resize(x_img, None, fx=0.25, fy=0.25, interpolation=cv2.INTER_LINEAR)
        y_img = cv2.resize(y_img, None, fx=0.25, fy=0.25, interpolation=cv2.INTER_NEAREST)

        h5_image[i] = x_img.flatten()
        h5_label[i] = y_img.flatten()
        h5_name[i] = os.path.basename(x_paths[i])
        # break
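The matching read side is not shown in this example; below is a hedged sketch of how the flattened variable-length images could be restored using the 'size' attribute stored above (the file name is hypothetical, the group and dataset names follow the example):

import h5py
import numpy as np

with h5py.File('segmentation.h5', 'r') as f:   # hypothetical file name
    group = f['train']                         # the `mode` used when writing
    size = group['image'].attrs['size']        # e.g. [256, 512, 3]
    flat = group['image'][0]                   # one variable-length uint8 vector
    image = flat.reshape(size)                 # back to height x width x channels
    name = group['name'][0]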
Example #6
Source File: pyMtsf.py From PySimulator with GNU Lesser General Public License v3.0

def WriteUnits(self):
    if len(self.units) == 0:
        return
    # maxLenTypeName = self._getMaxLength([x.name for x in self.units])
    numpyDataType = numpy.dtype({'names': ['name', 'factor', 'offset', 'mode'],
                                 'formats': [h5py.special_dtype(vlen=unicode),
                                             'double',
                                             'double',
                                             h5py.special_dtype(enum=(numpy.uint8, {'BaseUnit': 0, 'Unit': 1, 'DefaultDisplayUnit': 2}))]})  # 'uint8']})
    dataset = self.description.create_dataset('Units', (len(self.units), 1),
                                              dtype=numpyDataType,
                                              maxshape=(len(self.units), 1),
                                              compression='gzip')
    allData = []
    for unit in self.units:
        allData.append((unit.name, unit.factor, unit.offset, unit.mode))
    dataset[:, 0] = allData
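If the enum mapping ever needs to be recovered (for instance to translate the stored 'mode' integers back to their names), h5py.check_dtype() extracts it from the dtype. A minimal sketch using the same enum definition as above:

import h5py
import numpy

enum_dt = h5py.special_dtype(
    enum=(numpy.uint8, {'BaseUnit': 0, 'Unit': 1, 'DefaultDisplayUnit': 2}))
mapping = h5py.check_dtype(enum=enum_dt)      # {'BaseUnit': 0, 'Unit': 1, 'DefaultDisplayUnit': 2}
names = {v: k for k, v in mapping.items()}    # names[2] == 'DefaultDisplayUnit'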
Example #7
Source File: movielens.py From implicit with MIT License

def _hfd5_from_dataframe(ratings, movies, outputfilename):
    # transform ratings dataframe into a sparse matrix
    m = coo_matrix((ratings['rating'].astype(np.float32),
                    (ratings['movieId'], ratings['userId']))).tocsr()

    with h5py.File(outputfilename, "w") as f:
        # write out the ratings matrix
        g = f.create_group('movie_user_ratings')
        g.create_dataset("data", data=m.data)
        g.create_dataset("indptr", data=m.indptr)
        g.create_dataset("indices", data=m.indices)

        # write out the titles as a numpy array
        titles = np.empty(shape=(movies.movieId.max() + 1,), dtype=np.object)
        titles[movies.movieId] = movies.title
        dt = h5py.special_dtype(vlen=str)
        dset = f.create_dataset('movie', (len(titles),), dtype=dt)
        dset[:] = titles
Example #8
Source File: data_generator.py From GroundedTranslation with BSD 3-Clause "New" or "Revised" License

def set_predicted_description(self, split, data_key, sentence):
    '''
    Set the predicted sentence tokens in the data_key group,
    creating the group if necessary, or erasing the current value if necessary.
    '''

    if self.openmode != "r+":
        # forcefully quit when trying to write to a read-only file
        raise RuntimeError("Dataset is read-only, try again with --h5_writable")

    dataset_key = 'predicted_description'

    try:
        predicted_text = self.dataset[split][data_key].create_dataset(
            dataset_key, (1,), dtype=h5py.special_dtype(vlen=unicode))
    except RuntimeError:
        # the dataset already exists, erase it and create an empty space
        del self.dataset[split][data_key][dataset_key]
        predicted_text = self.dataset[split][data_key].create_dataset(
            dataset_key, (1,), dtype=h5py.special_dtype(vlen=unicode))

    predicted_text[0] = " ".join([x for x in sentence])
Example #9
Source File: vectors.py From text_embedding with MIT License

def text2hdf5(textfile, hdf5file, **kwargs):
    '''converts word embeddings file from text to HDF5 format
    Args:
        textfile: word embeddings file in format "word float ... float\n"
        hdf5file: output file ; will have keys 'words' and 'vectors'
        kwargs: passed to load
    Returns:
        None
    '''

    words, vectors = zip(*load(textfile, **kwargs))
    f = h5py.File(hdf5file)
    f.create_dataset('words', (len(words),), dtype=h5py.special_dtype(vlen=str))
    for i, word in enumerate(words):
        f['words'][i] = word
    f.create_dataset('vectors', data=np.vstack(vectors))
    f.close()
Example #10
Source File: hdf5.py From pyPESTO with BSD 3-Clause "New" or "Revised" License

def write_string_array(f: h5py.Group, path: str, strings: Collection) -> None:
    """
    Write string array to hdf5

    Parameters
    -------------
    f:
        h5py.Group where dataset should be created
    path:
        path of the dataset to create
    strings:
        list of strings to be written to f
    """
    dt = h5py.special_dtype(vlen=str)
    dset = f.create_dataset(path, (len(strings),), dtype=dt)
    dset[:] = [s.encode('utf8') for s in strings]
Example #11
Source File: ilsvrc2012.py From fuel with MIT License

def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    n_labeled = n_train + n_valid
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,), dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_labeled, 1), dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')
Example #12
Source File: ilsvrc2010.py From fuel with MIT License

def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,), dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_total, 1), dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')
Example #13
Source File: hdf5_dataset_writer.py From calamari with Apache License 2.0

def finish_chunck(self):
    if len(self.text) == 0:
        return

    codec = self.compute_codec()

    filename = "{}_{:03d}{}".format(self.output_filename, self.current_chunk,
                                    DataSetType.gt_extension(DataSetType.HDF5))
    self.files.append(filename)
    file = h5py.File(filename, 'w')
    dti32 = h5py.special_dtype(vlen=np.dtype('int32'))
    dtui8 = h5py.special_dtype(vlen=np.dtype('uint8'))
    file.create_dataset('transcripts', (len(self.text),), dtype=dti32, compression='gzip')
    file.create_dataset('images_dims', data=[d.shape for d in self.data], dtype=int)
    file.create_dataset('images', (len(self.text),), dtype=dtui8, compression='gzip')
    file.create_dataset('codec', data=list(map(ord, codec)))
    file['transcripts'][...] = [list(map(codec.index, d)) for d in self.text]
    file['images'][...] = [d.reshape(-1) for d in self.data]
    file.close()

    self.current_chunk += 1
    self.data = []
    self.text = []
Example #14
Source File: test_datatype.py From GraphicDesignPatternByPython with MIT License

def test_compound_vlen(self):
    vidt = h5py.special_dtype(vlen=np.uint8)
    eidt = h5py.special_dtype(enum=(np.uint8, {'OFF': 0, 'ON': 1}))

    for np_align in (False, True):
        dt = np.dtype([
            ('a', eidt),
            ('foo', vidt),
            ('bar', vidt),
            ('switch', eidt)], align=np_align)
        np_offsets = [dt.fields[i][1] for i in dt.names]

        for logical in (False, True):
            if logical and np_align:
                # Vlen types have different size in the numpy struct
                self.assertRaises(TypeError, h5py.h5t.py_create, dt,
                                  logical=logical)
            else:
                ht = h5py.h5t.py_create(dt, logical=logical)
                offsets = [ht.get_member_offset(i)
                           for i in range(ht.get_nmembers())]
                if np_align:
                    self.assertEqual(np_offsets, offsets)
Example #15
Source File: test_datatype.py From GraphicDesignPatternByPython with MIT License

def test_vlen_enum(self):
    fname = self.mktemp()
    arr1 = [[1], [1, 2]]
    dt1 = h5py.special_dtype(vlen=h5py.special_dtype(
        enum=('i', dict(foo=1, bar=2))))

    with h5py.File(fname, 'w') as f:
        df1 = f.create_dataset('test', (len(arr1),), dtype=dt1)
        df1[:] = np.array(arr1)

    with h5py.File(fname, 'r') as f:
        df2 = f['test']
        dt2 = df2.dtype
        arr2 = [e.tolist() for e in df2[:]]

    self.assertEqual(arr1, arr2)
    self.assertEqual(h5py.check_dtype(enum=h5py.check_dtype(vlen=dt1)),
                     h5py.check_dtype(enum=h5py.check_dtype(vlen=dt2)))
Example #16
Source File: test_datatype.py From GraphicDesignPatternByPython with MIT License

def test_compound_vlen_enum(self):
    eidt = h5py.special_dtype(enum=(np.uint8, {'OFF': 0, 'ON': 1}))
    vidt = h5py.special_dtype(vlen=np.uint8)

    def a(items):
        return np.array(items, dtype=np.uint8)

    f = self.f

    dt_vve = np.dtype([
        ('foo', vidt),
        ('bar', vidt),
        ('switch', eidt)])
    vve = f.create_dataset('dt_vve', shape=(2,), dtype=dt_vve)
    data = np.array([(a([1, 2, 3]), a([1, 2]), 1),
                     (a([]), a([2, 4, 6]), 0), ], dtype=dt_vve)
    vve[:] = data
    actual = vve[:]
    self.assertVlenArrayEqual(data['foo'], actual['foo'])
    self.assertVlenArrayEqual(data['bar'], actual['bar'])
    self.assertArrayEqual(data['switch'], actual['switch'])
Example #17
Source File: _utils.py From PyINT with GNU General Public License v3.0

def write_h5(datasetDict, out_file, metadata=None, ref_file=None, compression=None):
    if os.path.isfile(out_file):
        print('delete exsited file: {}'.format(out_file))
        os.remove(out_file)

    print('create HDF5 file: {} with w mode'.format(out_file))
    dt = h5py.special_dtype(vlen=np.dtype('float64'))
    with h5py.File(out_file, 'w') as f:
        for dsName in datasetDict.keys():
            data = datasetDict[dsName]
            ds = f.create_dataset(dsName, data=data, compression=compression)

        for key, value in metadata.items():
            f.attrs[key] = str(value)
            # print(key + ': ' + value)

    print('finished writing to {}'.format(out_file))
    return out_file

######################################################################
Example #18
Source File: h5f.py From costar_plan with Apache License 2.0

def write(self, example, filename, image_types=[]):
    '''
    Write an example out to disk.

    status: success, failure or error.failure
    '''
    filename = os.path.join(self.name, filename)
    f = h5f.File(filename, 'w')
    if image_types != []:
        dt = h5f.special_dtype(vlen=bytes)
        for (img_type_str, img_format_str) in image_types:
            f.create_dataset("type_" + img_type_str, data=[img_format_str])
    for key, value in example.items():
        if self.verbose > 0:
            print('H5fDataset writing key: ' + str(key))
        f.create_dataset(key, data=value)
    f.close()
Example #19
Source File: h5ad.py From anndata with BSD 3-Clause "New" or "Revised" License

def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    # group here is an h5py type, otherwise categoricals won’t write
    if series.dtype == object:  # Assuming it’s string
        group.create_dataset(
            key,
            data=series.values,
            dtype=h5py.special_dtype(vlen=str),
            **dataset_kwargs,
        )
    elif is_categorical_dtype(series):
        # This should work for categorical Index and Series
        categorical: pd.Categorical = series.values
        categories: np.ndarray = categorical.categories.values
        codes: np.ndarray = categorical.codes
        category_key = f"__categories/{key}"

        write_array(group, category_key, categories, dataset_kwargs=dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs=dataset_kwargs)

        group[key].attrs["categories"] = group[category_key].ref
        group[category_key].attrs["ordered"] = categorical.ordered
    else:
        group[key] = series.values
Example #20
Source File: legacyapi.py From h5netcdf with BSD 3-Clause "New" or "Revised" License

def createVariable(self, varname, datatype, dimensions=(), zlib=False,
                   complevel=4, shuffle=True, fletcher32=False,
                   chunksizes=None, fill_value=None):
    if len(dimensions) == 0:  # it's a scalar
        # rip off chunk and filter options for consistency with netCDF4-python
        chunksizes = None
        zlib = False
        fletcher32 = False
        shuffle = False

    if datatype is str:
        datatype = h5py.special_dtype(vlen=unicode)

    kwds = {}
    if zlib:
        # only add compression related keyword arguments if relevant (h5py
        # chokes otherwise)
        kwds['compression'] = 'gzip'
        kwds['compression_opts'] = complevel
        kwds['shuffle'] = shuffle

    return super(Group, self).create_variable(
        varname, dimensions, dtype=datatype, fletcher32=fletcher32,
        chunks=chunksizes, fillvalue=fill_value, **kwds)
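Note that `unicode` only exists on Python 2; under Python 3 the same variable-length string type is requested with `str`. A minimal sketch of the equivalent fallback (the helper name is made up for illustration):

import h5py

def vlen_unicode_dtype():
    # Python 3 spelling of the `special_dtype(vlen=unicode)` call used above;
    # on h5py >= 2.10, h5py.string_dtype() is the preferred equivalent.
    return h5py.special_dtype(vlen=str)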
Example #21
Source File: test_dataset.py From GraphicDesignPatternByPython with MIT License

def test_convert(self):
    dt = h5py.special_dtype(vlen=int)
    ds = self.f.create_dataset('vlen', (3,), dtype=dt)
    ds[0] = np.array([1.4, 1.2])
    ds[1] = np.array([1.2])
    ds[2] = [1.2, 2, 3]
    self.assertArrayEqual(ds[0], np.array([1, 1]))
    self.assertArrayEqual(ds[1], np.array([1]))
    self.assertArrayEqual(ds[2], np.array([1, 2, 3]))

    ds[0:2] = np.array([[0.1, 1.1, 2.1, 3.1, 4], np.arange(4)])
    self.assertArrayEqual(ds[0], np.arange(5))
    self.assertArrayEqual(ds[1], np.arange(4))

    ds[0:2] = np.array([np.array([0.1, 1.2, 2.2]),
                        np.array([0.2, 1.2, 2.2])])
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(3))
Example #22
Source File: test_dataset.py From GraphicDesignPatternByPython with MIT License

def test_int(self):
    dt = h5py.special_dtype(vlen=int)
    ds = self.f.create_dataset('vlen', (4,), dtype=dt)
    ds[0] = np.arange(3)
    ds[1] = np.arange(0)
    ds[2] = [1, 2, 3]
    ds[3] = np.arange(1)
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(0))
    self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
    self.assertArrayEqual(ds[1], np.arange(0))

    ds[0:2] = np.array([np.arange(5), np.arange(4)])
    self.assertArrayEqual(ds[0], np.arange(5))
    self.assertArrayEqual(ds[1], np.arange(4))

    ds[0:2] = np.array([np.arange(3), np.arange(3)])
    self.assertArrayEqual(ds[0], np.arange(3))
    self.assertArrayEqual(ds[1], np.arange(3))
Example #23
Source File: utils.py From acerta-abide with GNU General Public License v2.0

def hdf5_handler(filename, mode="r"):
    h5py.File(filename, "a").close()
    propfaid = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
    settings = list(propfaid.get_cache())
    settings[1] = 0
    settings[2] = 0
    propfaid.set_cache(*settings)
    with contextlib.closing(h5py.h5f.open(filename, fapl=propfaid)) as fid:
        f = h5py.File(fid, mode)
        # f.attrs.create(dtype=h5py.special_dtype(vlen=str))
        return f
Example #24
Source File: test_dataset.py From keras-lambda with MIT License

def test_vlen_bytes(self):
    """ Vlen bytes dataset maps to vlen ascii in the file """
    dt = h5py.special_dtype(vlen=bytes)
    ds = self.f.create_dataset('x', (100,), dtype=dt)
    tid = ds.id.get_type()
    self.assertEqual(type(tid), h5py.h5t.TypeStringID)
    self.assertEqual(tid.get_cset(), h5py.h5t.CSET_ASCII)
Example #25
Source File: global_attribute_manager.py From loompy with BSD 2-Clause "Simplified" License

def __setattr__(self, name: str, val: Any) -> None:
    if name.startswith("!"):
        super(GlobalAttributeManager, self).__setattr__(name[1:], val)
    elif "/" in name:
        raise KeyError("Attribute name cannot contain slash (/)")
    else:
        if self.f is not None:
            if loompy.compare_loom_spec_version(self.f, "3.0.0") < 0 and "attrs" not in self.f["/"]:
                normalized = loompy.normalize_attr_values(val, False)
                self.f.attrs[name] = normalized
                self.f.flush()
                val = self.f.attrs[name]  # Read it back in to ensure it's synced and normalized
                normalized = loompy.materialize_attr_values(val)
                self.__dict__["storage"][name] = normalized
            else:
                normalized = loompy.normalize_attr_values(val, True)
                if name in self.f["attrs"]:
                    del self.f["attrs"][name]
                if not np.isscalar(normalized) and normalized.dtype == np.object_:
                    self.ds._file.create_dataset("attrs/" + name, data=normalized, dtype=h5py.special_dtype(vlen=str))
                else:
                    self.f["attrs"][name] = normalized
                self.f.flush()
                val = self.f["attrs"][name][()]  # Read it back in to ensure it's synced and normalized
                normalized = loompy.materialize_attr_values(val)
                self.__dict__["storage"][name] = normalized
Example #26
Source File: datasets.py From pysaliency with MIT License

def to_hdf5(self, target):
    """ Write FileStimuli to hdf5 file or hdf5 group """

    target.attrs['type'] = np.string_('FileStimuli')
    target.attrs['version'] = np.string_('2.1')

    import h5py
    # make sure everything is unicode
    hdf5_filename = target.file.filename
    hdf5_directory = os.path.dirname(hdf5_filename)
    relative_filenames = [os.path.relpath(filename, hdf5_directory) for filename in self.filenames]
    decoded_filenames = [decode_string(filename) for filename in relative_filenames]
    encoded_filenames = [filename.encode('utf8') for filename in decoded_filenames]

    target.create_dataset(
        'filenames',
        data=np.array(encoded_filenames),
        dtype=h5py.special_dtype(vlen=str)
    )

    shape_dataset = target.create_dataset(
        'shapes',
        (len(self), ),
        dtype=h5py.special_dtype(vlen=np.dtype('int64'))
    )
    for n, shape in enumerate(self.shapes):
        shape_dataset[n] = np.array(shape)

    for attribute_name, attribute_value in self.attributes.items():
        target.create_dataset(attribute_name, data=attribute_value)

    target.attrs['__attributes__'] = np.string_(json.dumps(self.__attributes__))

    target.attrs['size'] = len(self)
Example #27
Source File: hdf5_dataset_writer.py From aiexamples with Apache License 2.0

def store_class_labels(self, class_labels):
    dt = h5py.special_dtype(vlen=str)
    labelset = self.db.create_dataset("label_names", (len(class_labels),), dtype=dt)
    labelset[:] = class_labels
Example #28
Source File: lm_utils.py From espnet with Apache License 2.0

def load_dataset(path, label_dict, outdir=None):
    """Load and save HDF5 that contains a dataset and stats for LM

    Args:
        path (str): The path of an input text dataset file
        label_dict (dict[str, int]): dictionary that maps token label string to its ID number
        outdir (str): The path of an output dir

    Returns:
        tuple[list[np.ndarray], int, int]: Tuple of token IDs in np.int32 converted by `read_tokens`
        the number of tokens by `count_tokens`, and the number of OOVs by `count_tokens`
    """
    if outdir is not None:
        os.makedirs(outdir, exist_ok=True)
        filename = outdir + "/" + os.path.basename(path) + ".h5"
        if os.path.exists(filename):
            logging.info(f"loading binary dataset: {filename}")
            f = h5py.File(filename, "r")
            return f["data"][:], f["n_tokens"][()], f["n_oovs"][()]
    else:
        logging.info("skip dump/load HDF5 because the output dir is not specified")

    logging.info(f"reading text dataset: {path}")
    ret = read_tokens(path, label_dict)
    n_tokens, n_oovs = count_tokens(ret, label_dict["<unk>"])
    if outdir is not None:
        logging.info(f"saving binary dataset: {filename}")
        with h5py.File(filename, "w") as f:
            # http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data
            data = f.create_dataset(
                "data", (len(ret),), dtype=h5py.special_dtype(vlen=np.int32)
            )
            data[:] = ret
            f["n_tokens"] = n_tokens
            f["n_oovs"] = n_oovs
    return ret, n_tokens, n_oovs
Example #29
Source File: dbfun_lookuptable.py From ABXpy with MIT License

def get_dtype(data):
    str_dtype = h5py.special_dtype(vlen=unicode)
    # allow for the use of strings
    if isinstance(data[0], str):
        dtype = str_dtype
    # could add some checks that the dtype is one of those supported by h5 ?
    elif hasattr(data, 'dtype'):
        dtype = data.dtype
    else:
        dtype = numpy.array(data).dtype
    return dtype


# item_size given in bytes, size_in_mem given in kilobytes
Example #30
Source File: embed.py From triplet-reid-pytorch with MIT License

def write_to_h5(output_file, model, endpoints, num_augmentations, dataloader, dataset, keys=["emb"]):
    """ Writes model to h5 """
    print(len(dataloader), len(dataset))
    print("Model dimension is {}".format(model.module.dim))
    if len(keys) == 0:
        raise RuntimeError("Plase specify at least one key that should be written to file.")

    with h5py.File(output_file) as f_out:
        # Dataparallel class!
        datasets = {}
        for key in keys:
            datasets[key] = f_out.create_dataset(
                key,
                shape=(len(dataset), num_augmentations) + model.module.dimensions[key],
                dtype=np.float32)

        for key in dataset.header:
            datasets[key] = f_out.create_dataset(
                key, shape=(len(dataset),), dtype=h5py.special_dtype(vlen=str))

        start_idx = 0
        for endpoints, rows in run_forward_pass(dataloader, model, endpoints):
            # TODO this will not work if for some reason some endpoints are shorter than others
            for key in keys:
                end_idx = start_idx + len(endpoints[key])
                datasets[key][start_idx:end_idx] = endpoints[key]

            for key, values in rows.items():
                end_idx = start_idx + len(values)
                datasets[key][start_idx:end_idx] = np.asarray(values)

            start_idx = end_idx

    return output_file