Python Examples of pandas.HDFStore

Source File: minute_bars.py From catalyst with Apache License 2.0

6 votes

def write(self, frames):
        """
        Persist per-sid frames into the target HDF5 file, laid out the way
        ``pd.Panel.to_hdf`` lays them out.

        The file at ``self._path`` is opened in write mode, so any previous
        content is replaced. After the data is written the file is reopened
        to stamp a ``version`` attribute of 0 on the root node.

        Parameters
        ----------
        frames : iter[(int, DataFrame)] or dict[int -> DataFrame]
            An iterable or other mapping of sid to the corresponding OHLCV
            pricing data.
        """
        store_options = {
            'complevel': self._complevel,
            'complib': self._complib,
        }
        with HDFStore(self._path, 'w', **store_options) as store:
            # All frames are combined into one Panel stored under 'updates'.
            pd.Panel.from_dict(dict(frames)).to_hdf(store, 'updates')

        # The version marker is written through PyTables directly.
        with tables.open_file(self._path, mode='r+') as h5file:
            h5file.set_node_attr('/', 'version', 0)

Source File: eq_loc.py From pykonal with GNU General Public License v3.0

6 votes

def load_stations(input_file):
    """
    Read the network geometry stored in an HDF5 file.

    The file is expected to have been written with pandas.HDFStore and to
    hold a "stations" table with "network", "station", "latitude",
    "longitude", and "elevation" fields. "latitude" and "longitude" are
    assumed to be in degrees and "elevation" in kilometers.

    Returns: pandas.DataFrame object with "network", "station",
    "latitude", "longitude", and "depth" fields, where "depth" is the
    negated "elevation" (kilometers).
    """
    output_fields = ["network", "station", "latitude", "longitude", "depth"]

    with pd.HDFStore(input_file, mode="r") as hdf:
        network = hdf["stations"]

    # Depth is positive downwards, i.e. the opposite sign of elevation.
    network["depth"] = -network["elevation"]

    return network[output_fields]

Source File: Omlette.py From OpenTrader with GNU Lesser General Public License v3.0

6 votes

def iMain():
    """
    Read an hdf file generated by us to make sure
    we can recover its content and structure.
    Give the name of an hdf5 file as a command-line argument.
    """
    # sys.argv always holds at least the script name, so only its length
    # tells us whether a file argument was actually supplied.
    assert len(sys.argv) > 1, iMain.__doc__
    sFile = sys.argv[1]
    assert os.path.isfile(sFile)
    oHdfStore = pandas.HDFStore(sFile, mode='r')
    try:
        print(oHdfStore.groups())
        # bug - no return value
        # oSignals = pandas.read_hdf(oHdfStore, '/servings/signals')
        mSignals = oHdfStore.select('/recipe/servings/mSignals', auto_close=False)
        print(mSignals)
        print(oHdfStore.get_node('/recipe')._v_attrs.metadata[0]['sUrl'])
    finally:
        # Release the file handle even when one of the reads above fails.
        oHdfStore.close()

Source File: dataset.py From avocado with MIT License

6 votes

def write_models(self, tag=None):
        """Write the models of the light curves to disk.

        The models will be stored in the features directory using the dataset's
        name and the given features tag. Note that for now the models are
        stored as individual tables in the HDF5 file because there doesn't
        appear to be a good way to store fixed length arrays in pandas.

        WARNING: This is not the best way to implement this, and there are
        definitely much better ways. This also isn't thread-safe at all.

        Parameters
        ----------
        tag : str (optional)
            The tag for this version of the features. By default, this will use
            settings['features_tag'].
        """
        models_path = self.get_models_path(tag=tag)

        # The context manager closes the store even if one of the writes
        # fails part-way through.
        with pd.HDFStore(models_path, "a") as store:
            for model_name, model in self.models.items():
                model.to_hdf(store, key=model_name, mode="a")

Source File: utils.py From avocado with MIT License

6 votes

def _create_csi_index(store, key, column_name):
    """Ensure a column of a stored table carries a CSI index.

    The column must have been listed in the ``data_columns`` argument of the
    ``to_hdf`` call that wrote the table, otherwise it won't be stored
    correctly in the HDF5 file.

    Parameters
    ----------
    store : :class:`pandas.HDFStore`
        An HDF5 file opened as an instance of a :class:`pandas.HDFStore`
        object.
    key : str
        The key of the DataFrame to use.
    column_name : str
        The column to add a CSI index to.
    """
    storer = store.get_storer(key)
    mapped_name = _map_column_name(storer, column_name)
    column = storer.table.colinstances[mapped_name]

    # Nothing to do when the existing index is already a CSI one.
    if column.index.is_csi:
        return

    # Otherwise replace whatever index is there with a CSI index.
    column.remove_index()
    column.create_csindex()

Source File: process_ow.py From tierpsy-tracker with MIT License

6 votes

def ow_plate_summary(fname):
    """Summarise the features stored in `fname` with one set of statistics.

    The event features returned by `read_feat_events` are extended with every
    column of the '/features_timeseries' table, then reduced with
    `WormStats.getWormStats` using `np.nanmean`.

    Returns a one-element list holding the summary DataFrame, restricted to
    the columns that are not listed in `WormStats.extra_fields`.
    """
    feats = read_feat_events(fname)

    with pd.HDFStore(fname, 'r') as store:
        timeseries = store['/features_timeseries']
    for col_name in timeseries:
        feats[col_name] = timeseries[col_name].values

    stats = WormStats()
    summary = pd.DataFrame(stats.getWormStats(feats, np.nanmean))

    keep = [col for col in summary.columns if col not in stats.extra_fields]
    return [summary.loc[:, keep]]
#%%

Source File: hdf.py From vivarium with GNU General Public License v3.0

6 votes

def _write_pandas_data(path: Path, entity_key: EntityKey, data: Union[PandasObj]):
    """Write data in a pandas format to an HDF file.

    This method currently supports :class:`pandas DataFrame` objects, with
    or without columns, and :class:`pandas.Series` objects.

    The data is stored in table format under ``entity_key.path`` and a
    ``metadata`` attribute recording whether the original data was empty is
    attached to the stored node.

    Raises
    ------
    ValueError
        If the data is still empty after its index has been reset.

    """
    was_empty = bool(data.empty)
    if was_empty:
        # Our data is indexed, sometimes with no other columns. This leaves an
        # empty dataframe that store.put will silently fail to write in table
        # format, so the index is turned into regular columns first.
        data = data.reset_index()
        if data.empty:
            raise ValueError("Cannot write an empty dataframe that does not have an index.")

    metadata = {'is_empty': was_empty}
    data_columns = True if was_empty else None

    with pd.HDFStore(str(path), complevel=9) as store:
        store.put(entity_key.path, data, format="table", data_columns=data_columns)
        # NOTE: must use attrs. write this up
        store.get_storer(entity_key.path).attrs.metadata = metadata

Source File: burstlib_ext.py From FRETBursts with GNU General Public License v2.0

6 votes

def _store_bg_data(store, base_name, min_ph_delays_us, best_bg, best_th,
                   BG_data, BG_data_e):
    """Save background-fit results under the group `base_name`.

    `store` must provide `filename`, `create_carray` and `close`
    (presumably an open PyTables file -- confirm against the caller). The
    arrays are written through `store`, which is then closed; the same file
    is reopened with pandas to write `best_bg` and `best_th`.

    Arguments:
        store: open file handle; it is CLOSED by this function.
        base_name (str): group path, with or without a trailing '/'.
        min_ph_delays_us: array saved as 'min_ph_delays_us'.
        best_bg, best_th: objects saved through `pd.HDFStore` under
            `base_name + 'best_bg'` and `base_name + 'best_th'`.
        BG_data (dict): values saved under `str(ph_sel)`.
        BG_data_e (dict): values saved under `str(ph_sel) + '_err'`.
    """
    if not base_name.endswith('/'):
        base_name = base_name + '/'
    store_name = store.filename
    group_name = '/' + base_name[:-1]
    store.create_carray(group_name, 'min_ph_delays_us', obj=min_ph_delays_us,
                        createparents=True)
    for ph_sel, values in BG_data.items():
        store.create_carray(group_name, str(ph_sel), obj=values)
    for ph_sel, values in BG_data_e.items():
        store.create_carray(group_name, str(ph_sel) + '_err', obj=values)
    store.close()

    # Reopen with pandas; the context manager guarantees the file is closed
    # even if one of the two writes raises.
    with pd.HDFStore(store_name) as pd_store:
        pd_store[base_name + 'best_bg'] = best_bg
        pd_store[base_name + 'best_th'] = best_th

Source File: panda.py From twint with MIT License

6 votes

def save(_filename, _dataframe, **options):
    """Save a DataFrame to disk as HDF5 (default) or as a pickle.

    Parameters
    ----------
    _filename : str
        Output path WITHOUT extension; ".h5" or ".pkl" is appended.
    _dataframe : pandas.DataFrame
        The data to save.
    **options
        dataname : key used inside the HDF5 file (defaults to "twint").
        type : leave unset for HDF5, "Pickle" for a pickle file. Any other
            value only prints a usage message and writes nothing.
    """
    _dataname = options.get("dataname") or "twint"
    _type = options.get("type")

    if not _type:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # The context manager closes the file even if the write fails.
            with pd.HDFStore(_filename + ".h5") as _store:
                _store[_dataname] = _dataframe
    elif _type == "Pickle":
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _dataframe.to_pickle(_filename + ".pkl")
    else:
        print("""Please specify: filename, DataFrame, DataFrame name and type
              (HDF5, default, or Pickle)""")

Source File: panda.py From twint with MIT License

6 votes

def read(_filename, **options):
    """Load a DataFrame previously written as HDF5 (default) or pickle.

    Parameters
    ----------
    _filename : str
        Input path WITHOUT extension; ".h5" or ".pkl" is appended.
    **options
        dataname : key to read from the HDF5 file (defaults to "twint").
        type : leave unset for HDF5, "Pickle" for a pickle file.

    Returns
    -------
    The loaded DataFrame, or None (after printing a usage message) when
    `type` is neither unset nor "Pickle".
    """
    _dataname = options.get("dataname") or "twint"
    _type = options.get("type")

    if not _type:
        # Close the store once the frame has been loaded instead of leaving
        # the file handle open.
        with pd.HDFStore(_filename + ".h5") as _store:
            return _store[_dataname]
    elif _type == "Pickle":
        return pd.read_pickle(_filename + ".pkl")
    else:
        print("""Please specify: DataFrame, DataFrame name (twint as default),
              filename and type (HDF5, default, or Pickle""")

Source File: dc2_object.py From gcr-catalogs with BSD 3-Clause "New" or "Revised" License

6 votes

def _open_hdf5(self, file_path):
        """Return a cached, open pd.HDFStore for the file at <file_path>

        A new read-only store is opened (and cached) only when there is no
        cached handle for the path or the cached one is no longer open.

        Args:
            file_path (str): The path of the desired file

        Return:
            The cached file handle
        """
        cache = self._file_handles
        reusable = file_path in cache and cache[file_path].is_open
        if not reusable:
            cache[file_path] = pd.HDFStore(file_path, 'r')

        return cache[file_path]

Source File: burstlib_ext.py From FRETBursts with GNU General Public License v2.0

6 votes

def _load_bg_data(store, base_name, ph_streams):
    """Load background-fit results saved under the group `base_name`.

    `store` must provide `filename`, `get_node` and `close` (presumably an
    open PyTables file -- confirm against the caller). The arrays are read
    through `store`, which is then closed; the same file is reopened with
    pandas to read 'best_bg' and 'best_th'.

    Arguments:
        store: open file handle; it is CLOSED by this function.
        base_name (str): group path, with or without a trailing '/'.
        ph_streams: iterable of photon selections; `str(ph_sel)` and
            `str(ph_sel) + '_err'` are the node names that are read.

    Returns:
        Tuple `(best_th, best_bg, BG_data, BG_data_e, min_ph_delays)`.
    """
    if not base_name.endswith('/'):
        base_name = base_name + '/'
    store_name = store.filename
    group_name = '/' + base_name[:-1]
    min_ph_delays = store.get_node(group_name, 'min_ph_delays_us')[:]
    BG_data = {}
    for ph_sel in ph_streams:
        BG_data[ph_sel] = store.get_node(group_name, str(ph_sel))[:]
    BG_data_e = {}
    for ph_sel in ph_streams:
        BG_data_e[ph_sel] = store.get_node(group_name, str(ph_sel) + '_err')[:]
    store.close()

    # Reopen with pandas; the context manager guarantees the file is closed
    # even if one of the two reads raises.
    with pd.HDFStore(store_name) as pd_store:
        best_bg = pd_store[base_name + 'best_bg']
        best_th = pd_store[base_name + 'best_th']
    return best_th, best_bg, BG_data, BG_data_e, min_ph_delays

Source File: hlatyper.py From OptiType with BSD 3-Clause "New" or "Revised" License

6 votes

def store_dataframes(out_hdf, **kwargs):
    """Serialize DataFrames into the HDF file `out_hdf`.

    DataFrames to serialize have to be passed by keyword arguments. An
    argument matrix1=DataFrame(...) will be written into table 'matrix1' in
    the HDF file.

    Two keywords are consumed as settings rather than stored:
    `complevel` (default 9) and `complib` (default 'zlib').
    """
    complevel = kwargs.pop('complevel', 9)   # default complevel & complib values if
    complib = kwargs.pop('complib', 'zlib')  # not explicitly asked for as arguments

    if VERBOSE:
        print(now(), 'Storing %d DataFrames in file %s with compression settings %d %s...' % (len(kwargs), out_hdf, complevel, complib))

    # TODO: WRITE ONLY? it probably appends now
    # The context manager closes the file even if one of the writes fails.
    with pd.HDFStore(out_hdf, complevel=complevel, complib=complib) as store:
        for table_name, dataframe in kwargs.items():
            store[table_name] = dataframe

    if VERBOSE:
        print(now(), 'DataFrames stored in file.')

Source File: test_orca.py From orca with BSD 3-Clause "New" or "Revised" License

6 votes

def test_write_tables(df, store_name):
    """write_tables stores a step's tables, with and without a prefix."""
    orca.add_table('table', df)

    @orca.step()
    def step(table):
        pass

    step_tables = orca.get_step_table_names(['step'])

    # First without a prefix, then with 1969 as the prefix; the prefix is
    # expected to show up as the leading component of the stored key.
    for prefix, expected_key in ((None, 'table'), (1969, '1969/table')):
        orca.write_tables(store_name, step_tables, prefix)

        with pd.HDFStore(store_name, mode='r') as store:
            assert expected_key in store
            pdt.assert_frame_equal(store[expected_key], df)

Source File: test_orca.py From orca with BSD 3-Clause "New" or "Revised" License

6 votes

def test_run_and_write_tables_out_tables_provided(df, store_name):
    """orca.run writes exactly the base/run tables that were asked for."""
    table_names = ['table', 'table2', 'table3']
    for name in table_names:
        orca.add_table(name, df)

    @orca.step()
    def step(iter_var, table, table2):
        return

    orca.run(
        ['step'],
        iter_vars=range(1),
        data_out=store_name,
        out_base_tables=table_names,
        out_run_tables=['table'])

    with pd.HDFStore(store_name, mode='r') as store:

        # Every requested base table must be present ...
        for name in table_names:
            assert 'base/{}'.format(name) in store

        # ... but only 'table' was requested for the single iteration.
        assert '0/table' in store
        for unexpected in ('0/table2', '0/table3'):
            assert unexpected not in store

Source File: helper.py From tierpsy-tracker with MIT License

6 votes

def calculate_bgnd_from_masked_fulldata(masked_image_file):
    """
    - Opens the masked_image_file hdf5 file, reads the /full_data node and
      builds a "background" image from the maximum of each pixel over the
      first axis.
    - Gets a camera serial number via parse_camera_serial(masked_image_file)
    - Reads the microns-per-pixel conversion from the masked_image_file

    Returns the tuple (img, camera_serial, microns_per_pixel).
    """
    import numpy as np
    from tierpsy.helper.params import read_unit_conversions

    # read attributes of masked_image_file
    conversions = read_unit_conversions(masked_image_file)
    _, xy_conversion, is_light_background = conversions
    microns_per_pixel, xy_units = xy_conversion

    # get "background" and px2um
    with pd.HDFStore(masked_image_file, 'r') as store:
        assert is_light_background, \
        'MultiWell recognition is only available for brightfield at the moment'
        full_data = store.get_node('/full_data')
        img = np.max(full_data, axis=0)

    camera_serial = parse_camera_serial(masked_image_file)

    return img, camera_serial, microns_per_pixel

Source File: process_ow.py From tierpsy-tracker with MIT License

5 votes

def ow_trajectories_summary(fname):
    """Summarise the features stored in `fname` separately for each worm.

    The '/features_timeseries' table is grouped by 'worm_index'. For each
    group the event features of node 'worm_<index>' are combined with the
    group's timeseries columns, reduced with `WormStats.getWormStats` using
    `np.nanmean`, and extended by `add_trajectory_info`.

    Returns a one-element list holding the concatenation of all per-worm
    summaries.
    """
    fps = read_fps(fname)
    with pd.HDFStore(fname, 'r') as store:
        timeseries = store['/features_timeseries']

    stats = WormStats()
    keep_cols = None
    per_worm = []
    for worm_id, worm_ts in timeseries.groupby('worm_index'):
        node_names = ['worm_{}'.format(int(worm_id))]
        feats = read_feat_events(fname, node_names)
        for col_name in worm_ts:
            feats[col_name] = worm_ts[col_name].values

        summary = pd.DataFrame(stats.getWormStats(feats, np.nanmean))

        if keep_cols is None:
            # only calculate this the first time...
            keep_cols = [col for col in summary.columns
                         if col not in stats.extra_fields]

        # remove uncalculated indexes from wStats
        summary = summary.loc[:, keep_cols]
        assert 'worm_index' not in summary

        per_worm.append(add_trajectory_info(summary, worm_id, worm_ts, fps))

    return [pd.concat(per_worm, ignore_index=True)]
#%%

Source File: getIntensityProfile.py From tierpsy-tracker with MIT License

5 votes

def setIntMapIndexes(skeletons_file, min_num_skel):
    """Assign an 'int_map_id' to every valid row of '/trajectories_data'.

    A row is valid when its skeleton flag equals 1 ('is_good_skel' when that
    column exists, otherwise 'has_skeleton') and it belongs to a
    'worm_index_joined' group whose summed 'has_skeleton' over the flagged
    rows is greater than `min_num_skel`. Valid rows are numbered
    consecutively from 0, all other rows get -1, and the updated table is
    written back through `save_modified_table`.

    Returns the rows of the table whose 'int_map_id' is not -1.
    """
    # pandas makes the row selection below easier to express.
    with pd.HDFStore(skeletons_file, 'r') as store:
        traj = store['/trajectories_data']

    # prefer the filtered-skeleton flag, fall back to "has a skeleton"
    flag_col = 'is_good_skel' if 'is_good_skel' in traj else 'has_skeleton'
    candidates = traj[traj[flag_col] == 1]

    # keep only trajectories with more than min_num_skel valid skeletons
    skel_counts = candidates.groupby(
        'worm_index_joined').agg({'has_skeleton': np.nansum})
    skel_counts = skel_counts[skel_counts > min_num_skel].dropna()
    in_long_track = candidates['worm_index_joined'].isin(skel_counts.index)
    valid_rows = candidates.loc[in_long_track]

    # number the valid rows; everything else is marked with -1
    traj['int_map_id'] = -1
    traj.loc[valid_rows.index, 'int_map_id'] = np.arange(len(valid_rows))

    # let's save this data into the skeletons file
    save_modified_table(skeletons_file, traj, 'trajectories_data')

    # Re-select from the updated table so the returned rows carry the new
    # column. There is probably a faster way, but this is less error prone.
    return traj[traj['int_map_id'] != -1]

Source File: eq_loc.py From pykonal with GNU General Public License v3.0

5 votes

def write_events(dataframe, output_file):
    """
    Save event locations to an HDF5 file through pandas.HDFStore.

    Every field listed in DTYPES is cast in place on `dataframe` to the
    dtype DTYPES gives for it, then the frame is written under the
    "events" key of a freshly created `output_file`.

    Returns: True.
    """

    logger.debug("Saving event locations to disk.")

    # Convert dtypes before saving event locations.
    for column in DTYPES:
        dataframe[column] = dataframe[column].astype(DTYPES[column])

    with pd.HDFStore(output_file, mode="w") as hdf:
        hdf.put("events", dataframe)

    return True

Source File: io.py From PyPSA with GNU General Public License v3.0

5 votes

def __init__(self, path, **kwargs):
        """Create the HDF5 store at `path` in write mode.

        Extra keyword arguments are forwarded to `pd.HDFStore`.
        """
        self.index = {}
        self.ds = pd.HDFStore(path, mode='w', **kwargs)

Source File: io.py From PyPSA with GNU General Public License v3.0

5 votes

def __init__(self, path):
        """Open the HDF5 store at `path` in read-only mode."""
        self.index = {}
        self.ds = pd.HDFStore(path, mode='r')

Source File: seqlib.py From Enrich2 with BSD 3-Clause "New" or "Revised" License

5 votes

def counts_from_file_h5(self, fname):
        """
        If an HDF store containing raw counts has been specified, open the
        store, copy those counts into this store, and close the counts store.

        Copies all tables in the ``'/raw'`` group along with their metadata.

        Raises ``ValueError`` if the file has no keys under ``'/raw/'``. The
        counts store is closed in every case, including when an error is
        raised.
        """
        with pd.HDFStore(fname) as store:
            self.logger.info(
                "Using existing HDF5 data store '{}' for raw data" "".format(fname)
            )
            # this could probably be much more efficient, but the PyTables docs
            # don't explain copying subsets of files adequately
            raw_keys = [key for key in store.keys() if key.startswith("/raw/")]
            if not raw_keys:
                raise ValueError(
                    "No raw counts found in '{}' [{}]" "".format(fname, self.name)
                )
            for k in raw_keys:
                # copy the data table
                raw = store[k]
                self.store.put(k, raw, format="table", data_columns=raw.columns)
                # copy the metadata
                self.set_metadata(k, self.get_metadata(k, store=store), update=False)
                self.logger.info("Copied raw data '{}'".format(k))

Source File: storemanager.py From Enrich2 with BSD 3-Clause "New" or "Revised" License

5 votes

def get_metadata(self, key, store=None):
        """
        Look up the Enrich2 metadata dictionary of a node in an HDF5 store.

        *key* is the name of the group or node in the HDF5 data store.

        *store* can be an external open HDFStore (used when copying metadata
        from raw counts). If it is ``None``, use this object's store.

        Returns the value of the ``"enrich2"`` attribute of *key*, or
        ``None`` when no such attribute has been set.

        Raises ``AttributeError`` with a descriptive message when looking up
        the node's attributes fails with an ``AttributeError``.

        """
        source = self.store if store is None else store
        try:
            return source.get_storer(key).attrs["enrich2"]
        except KeyError:  # no enrich2 metadata
            return None
        except AttributeError:
            if source is self.store:  # store parameter was None
                message = "Invalid HDF store node '{}' [{}]".format(key, self.name)
            else:
                message = (
                    "Invalid external HDF store node '{}' in "
                    "'{}' [{}]".format(key, source.filename, self.name)
                )
            raise AttributeError(message)

Source File: dataio.py From tribeflow with BSD 3-Clause "New" or "Revised" License

5 votes

def save_model(out_fpath, model):
    '''
    Saves the given model to out_fpath. The model is simply a map of string
    keys, numpy array or dict values. Nothing else is supported.

    key -> array
    key -> dict

    Only.

    Arrays are stored as a DataFrame built directly from the array; dicts
    are stored as a two-column DataFrame ('Name', 'Id') with one row per
    item. Any existing file at out_fpath is overwritten.

    Parameters
    ----------
    out_fpath : string
        Where to save the model
    model : dict
        The actual model
    '''
    # The context manager closes the file even if one of the writes fails.
    with pd.HDFStore(out_fpath, 'w') as store:
        for model_key, model_val in model.items():
            if isinstance(model_val, np.ndarray):
                store[model_key] = pd.DataFrame(model_val)
            else:
                # Materialise the items so the DataFrame constructor gets a
                # plain list of pairs rather than a dict view.
                store[model_key] = pd.DataFrame(list(model_val.items()),
                                                columns=['Name', 'Id'])

Source File: storemanager.py From Enrich2 with BSD 3-Clause "New" or "Revised" License

5 votes

def store_open(self, children=False, force_delete=True):
        """
        Open the HDF5 file associated with this object, if it has one.

        Unless ``store_cfg`` is set, the store path is derived from the
        output directory, the object's name and its store suffix. If the
        ``force_recalculate`` option is selected and ``force_delete`` is
        ``True``, the existing tables under ``'/main'`` will be deleted upon
        opening.

        With ``children=True`` the call is repeated for every child (and
        their children), using the default ``force_delete``.

        This method needs a lot more error checking.
        """
        if self.has_store:
            if not self.store_cfg:
                file_name = "{}_{}.h5".format(
                    fix_filename(self.name), self.store_suffix
                )
                self.store_path = os.path.join(self.output_dir, file_name)
                if os.path.exists(self.store_path):
                    template = 'Found existing HDF5 data store "{}"'
                else:
                    template = 'Creating new HDF5 data store "{}"'
                self.logger.info(template.format(self.store_path))

            self.store = pd.HDFStore(self.store_path)

            if self.force_recalculate and force_delete:
                if "/main" not in self.store:
                    self.logger.warning("No existing calculated values in file")
                else:
                    self.logger.info("Deleting existing calculated values")
                    self.store.remove("/main")

        if not children or self.children is None:
            return
        for child in self.children:
            child.store_open(children=True)

Source File: network.py From teneto with GNU General Public License v3.0

5 votes

def hdf5_setup(self, hdf5path):
        """
        Move the network dataframe into an HDF5 file.

        ``self.network`` is written to ``hdf5path`` under the key
        ``'network'`` in table format with all columns as data columns.
        Afterwards ``self.hdf5`` is set to ``True`` and ``self.network`` is
        replaced by the path of the file.

        Parameters
        ----------

        hdf5path : str
            Path of the HDF5 file to write the network to.
        """
        # The context manager closes the file even if the write fails, in
        # which case the in-memory network is left untouched.
        with pd.HDFStore(hdf5path) as hdf:
            hdf.put('network', self.network, format='table', data_columns=True)
        self.hdf5 = True
        self.network = hdf5path

Source File: bic.py From tribeflow with BSD 3-Clause "New" or "Revised" License

5 votes

def main(model):
    """Print a score computed from a stored model and its trace file.

    The HDF5 file `model` must contain the keys 'from_', 'to',
    'trace_fpath', 'Psi_sz', 'count_z' and 'source2id'. The first `to`
    lines of the trace file are scanned; each line must hold four
    tab-separated fields, of which the last two are looked up in
    'source2id'. The printed value is
    ``-2 * ll + n * log(count_z.shape[0] + sum(Psi_oz.shape))`` where `ll`
    is the accumulated log probability and `n` the number of lines read
    (presumably a BIC-style criterion, going by the file name -- confirm).

    Everything needed is loaded up front so the store is closed before the
    trace file is scanned, and it is closed even if loading fails.
    """
    with pd.HDFStore(model) as store:
        from_ = store['from_'][0][0]
        to = store['to'][0][0]
        assert from_ == 0

        trace_fpath = store['trace_fpath'][0][0]
        Psi_oz = store['Psi_sz'].values
        count_z = store['count_z'].values[:, 0]

        obj2id = dict(store['source2id'].values)

    Psi_oz = Psi_oz / Psi_oz.sum(axis=0)
    Psi_zo = (Psi_oz * count_z).T
    Psi_zo = Psi_zo / Psi_zo.sum(axis=0)

    probs = {}
    ll = 0.0
    n = 0.0
    with open(trace_fpath) as trace_file:
        for i, l in enumerate(trace_file):
            if i >= to:
                break

            n += 1
            spl = l.strip().split('\t')
            _, _, s, d = spl
            # Cache the probability of each (destination, source) pair.
            pair = (obj2id[d], obj2id[s])
            if pair not in probs:
                probs[pair] = (Psi_oz[pair[0]] * Psi_zo[:, pair[1]]).sum()
            ll += np.log(probs[pair])

    print(-2 * ll + n * np.log(count_z.shape[0] + sum(Psi_oz.shape)))

Source File: network.py From teneto with GNU General Public License v3.0

5 votes

def drop_edge(self, edgelist):
        """
        Removes one or more edges from the network.

        Parameters
        ----------

        edgelist : list
            a list (or list of lists) containing the i, j and t indices of
            the edges to be removed.

        Returns
        --------
            Updates TenetoBIDS.network dataframe
        """
        # A single edge is wrapped so that both input forms are handled alike.
        if not isinstance(edgelist[0], list):
            edgelist = [edgelist]
        self._check_input(edgelist, 'edgelist')

        if not self.hdf5:
            for edge in edgelist:
                net = self.network
                matches = (
                    (net['i'] == edge[0])
                    & (net['j'] == edge[1])
                    & (net['t'] == edge[2])
                )
                net.drop(net[matches].index, inplace=True)
            self.network.reset_index(inplace=True, drop=True)
            self._update_network()
            return

        # In HDF5 mode self.network is the path of the file.
        with pd.HDFStore(self.network) as hdf:
            for edge in edgelist:
                where = 'i == {} & j == {} & t == {}'.format(
                    str(edge[0]), str(edge[1]), str(edge[2]))
                hdf.remove('network', where)
        print('HDF5 delete warning. This will not reduce the size of the file.')

Source File: network.py From teneto with GNU General Public License v3.0

5 votes

def add_edge(self, edgelist):
        """
        Adds one or more edges to the network.

        Parameters
        ----------

        edgelist : list
            a list (or list of lists) containing the i, j and t indices of
            the edges to be added. For weighted networks each edge should
            also contain the weight as a fourth entry.

        Returns
        --------
            Updates TenetoBIDS.network dataframe with new edge
        """
        if not self.sparse:
            raise ValueError('Add edge not compatible with dense network')
        # A single edge is wrapped so that both input forms are handled alike.
        if not isinstance(edgelist[0], list):
            edgelist = [edgelist]
        self._check_input(edgelist, 'edgelist')

        n_fields = len(edgelist[0])
        if n_fields == 3:
            colnames = ['i', 'j', 't']
        elif n_fields == 4:
            colnames = ['i', 'j', 't', 'weight']

        if not self.hdf5:
            appended = pd.DataFrame(edgelist, columns=colnames)
            self.network = pd.concat(
                [self.network, appended], ignore_index=True, sort=True)
            self._update_network()
            return

        # In HDF5 mode self.network is the path of the file; new rows continue
        # the index after the rows that are already stored.
        with pd.HDFStore(self.network) as hdf:
            first_row = hdf.get_storer('network').nrows
            new_index = np.arange(first_row, first_row + len(edgelist))
            new_rows = pd.DataFrame(edgelist, columns=colnames, index=new_index)
            hdf.append('network', new_rows, format='table', data_columns=True)

        # Grow the recorded shape if the new edges exceed it.
        edge_array = np.array(edgelist)
        largest_node = np.max(edge_array[:, :2])
        if largest_node > self.netshape[0]:
            self.netshape[0] = largest_node
        largest_time = np.max(edge_array[:, 2])
        if largest_time > self.netshape[1]:
            self.netshape[1] = largest_time

Source File: Omlette.py From OpenTrader with GNU Lesser General Public License v3.0

5 votes

def __init__(self, sHdfStore="", oFd=sys.stdout):
        """Set up the instance, optionally creating an HDF store.

        sHdfStore: path of an HDF file to create in write mode; when empty
            no store is opened and self.oHdfStore stays None.
        oFd: file-like object that receives the informational message.
        """
        self.oHdfStore = None
        self.oFd = oFd
        self.oRecipe = None
        self.oChefModule = None

        if not sHdfStore:
            return

        # ugly - active
        self.oHdfStore = pandas.HDFStore(sHdfStore, mode='w')
        self.oFd.write("INFO: hdf store" + self.oHdfStore.filename + '\n')

Python pandas.HDFStore() Examples