Python pandas.read_hdf() Examples
The following are 30 code examples of pandas.read_hdf(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
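Before diving into the project examples, here is a minimal round-trip sketch of the pattern most of them rely on: write a DataFrame with DataFrame.to_hdf() and read it back with pandas.read_hdf(). The file name example.h5 and the key df below are illustrative only; both calls require the optional PyTables (tables) package to be installed.

import pandas as pd

# Write a small DataFrame to an HDF5 file under the key 'df', then read it back.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
df.to_hdf("example.h5", key="df", mode="w")

round_tripped = pd.read_hdf("example.h5", key="df")
assert round_tripped.equals(df)

Storing with format='table' instead of the default fixed format additionally allows where filters on read, as used in Examples #7 and #11 below.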
Example #1
Source File: demand_writers.py From CityEnergyAnalyst with MIT License | 6 votes |
def write_to_hdf5(self, list_buildings, locator):
    """read in the temporary results files and append them to the Totals.csv file."""
    df = None
    for name in list_buildings:
        temporary_file = locator.get_temporary_file('%(name)sT.hdf' % locals())
        if df is None:
            df = pd.read_hdf(temporary_file, key='dataset')
        else:
            # DataFrame.append is removed in pandas 2.0; pd.concat is the modern equivalent
            df = df.append(pd.read_hdf(temporary_file, key='dataset'))
    df.to_hdf(locator.get_total_demand('hdf'), key='dataset')

    """read saved data of monthly values and return as totals"""
    monthly_data_buildings = [pd.read_hdf(locator.get_demand_results_file(building_name, 'hdf'), key=building_name)
                              for building_name in list_buildings]
    return df, monthly_data_buildings
Example #2
Source File: runwrapper2017mpi_r3.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    indf = indf.assign(ret=100*indf.price.pct_change())
    indf = indf.assign(abs_ret=np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1, 51):
        ac = indf.ret.autocorr(lag=i)
        aac = indf.abs_ret.autocorr(lag=i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    # 'j' is the Monte Carlo run counter defined at module scope in the original script
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice,
                    'ClusteringConstant': clustering_constant, 'MeanRet': indf.ret.mean(),
                    'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict)
Example #3
Source File: runwrapper2017mpi_r4.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation=100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    # DataFrame.ix is deprecated (removed in pandas 1.0); .loc does the same job here
    ltsum_df = ltsum_df.ix[market_makers]
    # 'j' is the Monte Carlo run counter defined at module scope in the original script
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict)
Example #4
Source File: runwrapper2017mpi_r3.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow=buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol=buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF=buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                   )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow=-sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol=sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF=sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                     )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow=cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    # selecting multiple groupby columns with a bare tuple is deprecated in newer pandas;
    # a list (groupby('trader_id')[['NetCashFlow', 'BuyVol']]) is the current form
    temp_df = cash_flow.groupby('trader_id')['NetCashFlow', 'BuyVol'].last()
    temp_df = temp_df.assign(NetCFPerShare=temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df)
Example #5
Source File: photoz.py From gcr-catalogs with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _iter_native_dataset(self, native_filters=None):
    current_fname = None
    for meta_tract in self._metadata:
        for meta_patch in meta_tract['patches']:
            tract_patch = {'tract': meta_tract['tract'], 'patch': meta_patch['patch']}
            if native_filters and not native_filters.check_scalar(tract_patch):
                continue
            if current_fname != meta_tract['filename']:
                current_fname = meta_tract['filename']
                df = pd.read_hdf(os.path.join(self.base_dir, current_fname), 'df')
            slice_this = slice(*meta_patch['slice'])

            def native_quantity_getter(native_quantity):
                # pylint: disable=W0640
                # variables (df and slice_this) intentionally defined in loop
                if native_quantity == '_FULL_PDF':
                    return df.iloc[slice_this, :self._n_pdf_bins].values
                return df[native_quantity].values[slice_this]

            yield native_quantity_getter

# Native quantity names in the photo-z catalog are too uninformative
# Since native quantities will become regular quantities in composite catalog,
# let us hide them all.
Example #6
Source File: runwrapper2017mpi_r3.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation=100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    ltsum_df = ltsum_df.ix[market_makers]
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict)
Example #7
Source File: data_utils.py From RGAN with MIT License | 6 votes |
def resample_eICU_patient(pid, resample_factor_in_min, variables, upto_in_minutes):
    """ Resample a *single* patient. """
    pat_df = pd.read_hdf(paths.eICU_hdf_dir + '/vitalPeriodic.h5',
                         where='patientunitstayid = ' + str(pid),
                         columns=['observationoffset', 'patientunitstayid'] + variables,
                         mode='r')
    # sometimes it's empty
    if pat_df.empty:
        return None
    if not upto_in_minutes is None:
        pat_df = pat_df.loc[0:upto_in_minutes*60]
    # convert the offset to a TimedeltaIndex (necessary for resampling)
    pat_df.observationoffset = pd.TimedeltaIndex(pat_df.observationoffset, unit='m')
    pat_df.set_index('observationoffset', inplace=True)
    pat_df.sort_index(inplace=True)
    # resample by time
    pat_df_resampled = pat_df.resample(str(resample_factor_in_min) + 'T').median()  # pandas ignores NA in median by default
    # rename pid, cast to int
    pat_df_resampled.rename(columns={'patientunitstayid': 'pid'}, inplace=True)
    pat_df_resampled['pid'] = np.int32(pat_df_resampled['pid'])
    # get offsets in minutes from index
    pat_df_resampled['offset'] = np.int32(pat_df_resampled.index.total_seconds()/60)
    return pat_df_resampled
Example #8
Source File: runwrapper2017mpi_r4.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow=buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol=buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF=buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                   )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow=-sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol=sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF=sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                     )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow=cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    temp_df = cash_flow.groupby('trader_id')['NetCashFlow', 'BuyVol'].last()
    temp_df = temp_df.assign(NetCFPerShare=temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df)
Example #9
Source File: runwrapper2017mpi_r4.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    indf = indf.assign(ret=100*indf.price.pct_change())
    indf = indf.assign(abs_ret=np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1, 51):
        ac = indf.ret.autocorr(lag=i)
        aac = indf.abs_ret.autocorr(lag=i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice,
                    'ClusteringConstant': clustering_constant, 'MeanRet': indf.ret.mean(),
                    'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict)
Example #10
Source File: runwrapper2017mpi_r3x.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation=100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    ltsum_df = ltsum_df.ix[market_makers]
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict)
Example #11
Source File: test_hdf.py From vivarium with GNU General Public License v3.0 | 6 votes |
def test_write_data_frame(hdf_file_path):
    key = hdf.EntityKey('cause.test.prevalence')
    data = build_table([lambda *args, **kwargs: random.choice([0, 1]), "Kenya", 1], 2005, 2010,
                       columns=('age', 'year', 'sex', 'draw', 'location', 'value'))

    non_val_columns = data.columns.difference({'value'})
    data = data.set_index(list(non_val_columns))

    hdf._write_pandas_data(hdf_file_path, key, data)

    written_data = pd.read_hdf(hdf_file_path, key.path)
    assert written_data.equals(data)

    filter_terms = ['draw == 0']
    written_data = pd.read_hdf(hdf_file_path, key.path, where=filter_terms)
    assert written_data.equals(data.xs(0, level='draw', drop_level=False))
Example #12
Source File: test_common.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_write_fspath_hdf5(self):
    # Same test as write_fspath_all, except HDF5 files aren't
    # necessarily byte-for-byte identical for a given dataframe, so we'll
    # have to read and compare equality
    pytest.importorskip('tables')

    df = pd.DataFrame({"A": [1, 2]})
    p1 = tm.ensure_clean('string')
    p2 = tm.ensure_clean('fspath')

    with p1 as string, p2 as fspath:
        mypath = CustomFSPath(fspath)
        df.to_hdf(mypath, key='bar')
        df.to_hdf(string, key='bar')

        result = pd.read_hdf(fspath, key='bar')
        expected = pd.read_hdf(string, key='bar')
        tm.assert_frame_equal(result, expected)
Example #13
Source File: utils_testing.py From auto_ml with MIT License | 6 votes |
def get_twitter_sentiment_multilabel_classification_dataset():
    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates
        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset- runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Example #14
Source File: test_temporalnetwork.py From teneto with GNU General Public License v3.0 | 6 votes |
def test_hdf5():
    df = pd.DataFrame({'i': [0, 0], 'j': [1, 2], 't': [0, 1]})
    tnet = teneto.TemporalNetwork(from_df=df, hdf5=True)
    if not tnet.network == './teneto_temporalnetwork.h5':
        raise AssertionError()
    df2 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df == df2).all().all():
        raise AssertionError()
    tnet.add_edge([0, 2, 2])
    df3 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df3.iloc[2].values == [0, 2, 2]).all():
        raise AssertionError()
    tnet.drop_edge([0, 2, 2])
    df4 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df == df4).all().all():
        raise AssertionError()
Example #15
Source File: __main__.py From picasso with MIT License | 6 votes |
def _hdf2csv(path):
    from glob import glob
    import pandas as pd
    from tqdm import tqdm as _tqdm
    from os.path import isdir

    if isdir(path):
        paths = glob(path + "/*.hdf5")
    else:
        paths = glob(path)
    if paths:
        import os.path
        for path in _tqdm(paths):
            base, ext = os.path.splitext(path)
            if ext == ".hdf5":
                print("Converting {}".format(path))
                out_path = base + ".csv"
                locs = pd.read_hdf(path)
                print("A total of {} rows loaded".format(len(locs)))
                locs.to_csv(out_path, sep=",", encoding="utf-8")
    print("Complete.")
Example #16
Source File: main.py From predictatops with MIT License | 6 votes |
def load_prev_results_at_path(full_path_to_results_file, key="df"):
    """
    A function used to return a dataframe of wells stored in an h5 file at a given path with a given key.

    Parameters
    ----------
    full_path_to_results_file: string
        A path to a .h5 file that contains a wells dataframe.
    key: string
        A string representation of a key used to find the dataframe in the h5 file whose path
        is defined by the full_path_to_results_file argument.

    Returns
    -------
    wells_df_from_wellsKNN: dataframe
        Returns a dataframe of wells that existed at the path defined in the
        full_path_to_results_file argument.
    """
    wells_df_from_wellsKNN = pd.read_hdf(full_path_to_results_file, key=key)
    return wells_df_from_wellsKNN
Example #17
Source File: runwrapper2017mpi_r3x.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow=buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol=buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF=buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                   )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow=-sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol=sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF=sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                     )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow=cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    temp_df = cash_flow.groupby('trader_id')['NetCashFlow', 'BuyVol'].last()
    temp_df = temp_df.assign(NetCFPerShare=temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df)
Example #18
Source File: runwrapper2017mpi_r3x.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    indf = indf.assign(ret=100*indf.price.pct_change())
    indf = indf.assign(abs_ret=np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1, 51):
        ac = indf.ret.autocorr(lag=i)
        aac = indf.abs_ret.autocorr(lag=i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice,
                    'ClusteringConstant': clustering_constant, 'MeanRet': indf.ret.mean(),
                    'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict)
Example #19
Source File: plot_history.py From age-gender-estimator-keras with MIT License | 6 votes |
def main():
    args = get_args()
    input_path = args.input
    df = pd.read_hdf(input_path, "history")
    input_dir = os.path.dirname(input_path)

    plt.plot(df["gender_loss"], label="loss (gender)")
    plt.plot(df["age_loss"], label="loss (age)")
    plt.plot(df["val_gender_loss"], label="val_loss (gender)")
    plt.plot(df["val_age_loss"], label="val_loss (age)")
    plt.xlabel("number of epochs")
    plt.ylabel("loss")
    plt.legend()
    plt.savefig(os.path.join(input_dir, "loss.png"))
    plt.cla()

    plt.plot(df["gender_acc"], label="accuracy (gender)")
    plt.plot(df["age_acc"], label="accuracy (age)")
    plt.plot(df["val_gender_acc"], label="val_accuracy (gender)")
    plt.plot(df["val_age_acc"], label="val_accuracy (age)")
    plt.xlabel("number of epochs")
    plt.ylabel("accuracy")
    plt.legend()
    plt.savefig(os.path.join(input_dir, "accuracy.png"))
Example #20
Source File: batch.py From batchflow with Apache License 2.0 | 6 votes |
def _load_table(self, src, fmt, dst=None, post=None, *args, **kwargs):
    """ Load a data frame from table formats: csv, hdf5, feather """
    if fmt == 'csv':
        _data = pd.read_csv(src, *args, **kwargs)
    elif fmt == 'feather':
        _data = feather.read_dataframe(src, *args, **kwargs)
    elif fmt == 'hdf5':
        _data = pd.read_hdf(src, *args, **kwargs)

    # Put into this batch only part of it (defined by index)
    if isinstance(_data, pd.DataFrame):
        _data = _data.loc[self.indices]
    elif isinstance(_data, dd.DataFrame):
        # dask.DataFrame.loc supports advanced indexing only with lists
        _data = _data.loc[list(self.indices)].compute()

    if callable(post):
        _data = post(_data, src=src, fmt=fmt, dst=dst, **kwargs)

    self.load(src=_data, dst=dst)
Example #21
Source File: Omlette.py From OpenTrader with GNU Lesser General Public License v3.0 | 6 votes |
def iMain():
    """
    Read an hdf file generated by us to make sure we can recover its content and structure.
    Give the name of an hdf5 file as a command-line argument.
    """
    assert sys.argv, __doc__
    sFile = sys.argv[1]
    assert os.path.isfile(sFile)
    oHdfStore = pandas.HDFStore(sFile, mode='r')
    # note: this project targets Python 2, hence the bare print statements below
    print oHdfStore.groups()  # bug - no return value
    # oSignals = pandas.read_hdf(oHdfStore, '/servings/signals')
    mSignals = oHdfStore.select('/recipe/servings/mSignals', auto_close=False)
    print mSignals
    print oHdfStore.get_node('/recipe')._v_attrs.metadata[0]['sUrl']
Example #22
Source File: test_common.py From vnpy_crypto with MIT License | 6 votes |
def test_write_fspath_hdf5(self):
    # Same test as write_fspath_all, except HDF5 files aren't
    # necessarily byte-for-byte identical for a given dataframe, so we'll
    # have to read and compare equality
    pytest.importorskip('tables')

    df = pd.DataFrame({"A": [1, 2]})
    p1 = tm.ensure_clean('string')
    p2 = tm.ensure_clean('fspath')

    with p1 as string, p2 as fspath:
        mypath = CustomFSPath(fspath)
        df.to_hdf(mypath, key='bar')
        df.to_hdf(string, key='bar')

        result = pd.read_hdf(fspath, key='bar')
        expected = pd.read_hdf(string, key='bar')
        tm.assert_frame_equal(result, expected)
Example #23
Source File: data_utils.py From autodeepnet with MIT License | 6 votes |
def load_hdf5_data(file_path, **kwargs):
    key = kwargs.get('key', None)
    pandas_format = kwargs.get('pandas_format', True)
    mode = kwargs.get('mode', 'r')
    logger.info("Opening HDF5 file {} to read...".format(file_path))
    try:
        if pandas_format:
            data = pd.read_hdf(file_path, key=key, mode=mode)
        else:
            with h5py.File(file_path, mode) as f:
                data = f[key][()]
    except KeyError as e:
        # NOTE: 'dataset' is undefined in the original source; 'key' is presumably what was meant
        logger.exception("Dataset {} does not exist".format(dataset))
        raise exceptions.FileLoadError("Dataset does not exist")
    except Exception as e:
        logger.exception("Problem loading dataset: {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully loaded HDF5 data")
    return data
Example #24
Source File: test_common.py From recruit with Apache License 2.0 | 6 votes |
def test_write_fspath_hdf5(self):
    # Same test as write_fspath_all, except HDF5 files aren't
    # necessarily byte-for-byte identical for a given dataframe, so we'll
    # have to read and compare equality
    pytest.importorskip('tables')

    df = pd.DataFrame({"A": [1, 2]})
    p1 = tm.ensure_clean('string')
    p2 = tm.ensure_clean('fspath')

    with p1 as string, p2 as fspath:
        mypath = CustomFSPath(fspath)
        df.to_hdf(mypath, key='bar')
        df.to_hdf(string, key='bar')

        result = pd.read_hdf(fspath, key='bar')
        expected = pd.read_hdf(string, key='bar')
        tm.assert_frame_equal(result, expected)
Example #25
Source File: hdfstore.py From PyTrendFollow with MIT License | 5 votes |
def read_symbol(symbol, q_type, provider):
    """ Read data from the corresponding HDF file """
    if os.path.exists(fname(symbol, q_type, provider)):
        data = pd.read_hdf(fname(symbol, q_type, provider), 'quotes')
        return data
    else:
        # if symbol doesn't exist, an empty df is returned with only index columns
        c = ['contract', 'date'] if q_type == 'futures' else ['date']
        return pd.DataFrame(columns=c).set_index(c)
Example #26
Source File: photoz.py From gcr-catalogs with BSD 3-Clause "New" or "Revised" License | 5 votes |
def generate_metadata(self, write_to_yaml=False):
    """ generate metadata """
    meta = list()
    for fname in sorted(os.listdir(self.base_dir)):
        if not self._filename_re.match(fname):
            continue

        file_path = os.path.join(self.base_dir, fname)
        try:
            df = pd.read_hdf(file_path, 'df')
        except (IOError, OSError):
            warnings.warn('Cannot access {}; skipped'.format(file_path))
            continue

        meta_tract = {
            'tract': int(df['tract'].iloc[0]),
            'filename': fname,
        }

        # Each file contains all patches in one tract,
        # but we want to be able to iterate over patches as well.
        # Here, we find the indices where the adjacent patch values differ,
        # and we record the slice indices for each patch.
        patches = df['patch'].values.astype('<U')
        indices = np.flatnonzero(np.concatenate(([True], patches[1:] != patches[:-1], [True])))
        indices = np.vstack((indices[:-1], indices[1:])).T
        meta_tract['patches'] = [{'patch': str(patches[i]), 'slice': [int(i), int(j)]} for i, j in indices]

        meta.append(meta_tract)

    if write_to_yaml:
        if self._metadata_path and os.path.isfile(self._metadata_path):
            warnings.warn('Overwriting metadata file `{0}`, which is backed up at `{0}.bak`'.format(self._metadata_path))
            shutil.copyfile(self._metadata_path, self._metadata_path + '.bak')
        with open(self._metadata_path, 'w') as meta_stream:
            yaml.dump(meta, meta_stream)

    return meta
Example #27
Source File: hdfstore.py From PyTrendFollow with MIT License | 5 votes |
def read_contract(symbol, contract, provider):
    """ Read a single contract for a future instrument """
    if os.path.exists(fname(symbol, 'futures', provider)):
        data = pd.read_hdf(fname(symbol, 'futures', provider), 'quotes')
        return data.loc[int(contract), :]
    else:
        c = ['contract', 'date']
        return pd.DataFrame(columns=c).set_index(c)
Example #28
Source File: utils.py From ladder with MIT License | 5 votes |
def load_df(dirpath, filename, varname=None):
    varname = filename if varname is None else varname
    fn = os.path.join(dirpath, filename)
    return read_hdf(fn, varname)
Example #29
Source File: mixins.py From traffic with MIT License | 5 votes |
def from_file(
    cls: Type[T], filename: Union[Path, str], **kwargs
) -> Optional[T]:
    """Read data from various formats.

    This class method dispatches the loading of data in various format to
    the proper ``pandas.read_*`` method based on the extension of the filename.

    - .pkl and .pkl.gz dispatch to ``pandas.read_pickle``;
    - .parquet and .parquet.gz dispatch to ``pandas.read_parquet``;
    - .json and .json.gz dispatch to ``pandas.read_json``;
    - .csv and .csv.gz dispatch to ``pandas.read_csv``;
    - .h5 dispatch to ``pandas.read_hdf``.

    Other extensions return ``None``.

    Specific arguments may be passed to the underlying ``pandas.read_*``
    method with the kwargs argument.

    Example usage:

    >>> t = Traffic.from_file("data/sample_opensky.pkl")
    """
    path = Path(filename)
    if path.suffixes in [[".pkl"], [".pkl", ".gz"]]:
        return cls(pd.read_pickle(path, **kwargs))
    if path.suffixes in [[".parquet"], [".parquet", ".gz"]]:
        return cls(pd.read_parquet(path, **kwargs))
    if path.suffixes in [[".json"], [".json", ".gz"]]:
        return cls(pd.read_json(path, **kwargs))
    if path.suffixes in [[".csv"], [".csv", ".gz"]]:
        return cls(pd.read_csv(path, **kwargs))
    if path.suffixes == [".h5"]:
        return cls(pd.read_hdf(path, **kwargs))
    return None

# --- Special methods ---
Example #30
Source File: eval_baseline_methods.py From DCRNN with MIT License | 5 votes |
def main(args):
    traffic_reading_df = pd.read_hdf(args.traffic_reading_filename)
    eval_static(traffic_reading_df)
    eval_historical_average(traffic_reading_df, period=7 * 24 * 12)
    eval_var(traffic_reading_df, n_lags=3)