Python pandas.read_hdf() Examples

The following are 30 code examples of pandas.read_hdf(). You can go to the original project or source file by following the reference above each example, or check out all available functions and classes of the pandas module.
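Before the project examples, here is a minimal round-trip sketch of pandas.read_hdf(). The file name example.h5 and the key 'df' are illustrative assumptions rather than values taken from any project below; HDF5 support also requires the optional PyTables ('tables') package.

import pandas as pd

# Illustrative file name and key; pandas HDF5 I/O needs the 'tables' (PyTables) package installed.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
df.to_hdf('example.h5', key='df', mode='w')      # write the DataFrame to an HDF5 store
roundtrip = pd.read_hdf('example.h5', key='df')  # read it back by key
assert roundtrip.equals(df)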
Example #1
Source File: demand_writers.py    From CityEnergyAnalyst with MIT License
def write_to_hdf5(self, list_buildings, locator):
        """read in the temporary results files and append them to the Totals.csv file."""
        df = None
        for name in list_buildings:
            temporary_file = locator.get_temporary_file('%(name)sT.hdf' % locals())
            if df is None:
                df = pd.read_hdf(temporary_file, key='dataset')
            else:
                df = pd.concat([df, pd.read_hdf(temporary_file, key='dataset')])
        df.to_hdf(locator.get_total_demand('hdf'), key='dataset')

        """read saved data of monthly values and return as totals"""
        monthly_data_buildings = [pd.read_hdf(locator.get_demand_results_file(building_name, 'hdf'), key=building_name)
                                  for building_name in
                                  list_buildings]
        return df, monthly_data_buildings 
Example #2
Source File: runwrapper2017mpi_r3.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    
    indf = indf.assign(ret = 100*indf.price.pct_change())
    indf = indf.assign(abs_ret = np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1,51):
        ac = indf.ret.autocorr(lag = i)
        aac = indf.abs_ret.autocorr(lag = i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice, 'ClusteringConstant': clustering_constant,
                    'MeanRet': indf.ret.mean(), 'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict) 
Example #3
Source File: runwrapper2017mpi_r4.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id = trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation = 100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    ltsum_df = ltsum_df.reindex(market_makers)  # select market makers; rows for missing ids come back as NaN
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict) 
Example #4
Source File: runwrapper2017mpi_r3.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id = trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow = buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol = buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF = buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                  )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow = -sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol = sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF = sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                    )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow = cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    temp_df = cash_flow.groupby('trader_id')[['NetCashFlow', 'BuyVol']].last()
    temp_df = temp_df.assign(NetCFPerShare = temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df) 
Example #5
Source File: photoz.py    From gcr-catalogs with BSD 3-Clause "New" or "Revised" License
def _iter_native_dataset(self, native_filters=None):
        current_fname = None
        for meta_tract in self._metadata:
            for meta_patch in meta_tract['patches']:
                tract_patch = {'tract': meta_tract['tract'], 'patch': meta_patch['patch']}
                if native_filters and not native_filters.check_scalar(tract_patch):
                    continue

                if current_fname != meta_tract['filename']:
                    current_fname = meta_tract['filename']
                    df = pd.read_hdf(os.path.join(self.base_dir, current_fname), 'df')

                slice_this = slice(*meta_patch['slice'])
                def native_quantity_getter(native_quantity):
                    # pylint: disable=W0640
                    # variables (df and slice_this) intentionally defined in loop
                    if native_quantity == '_FULL_PDF':
                        return df.iloc[slice_this, :self._n_pdf_bins].values
                    return df[native_quantity].values[slice_this]
                yield native_quantity_getter

    # Native quantity names in the photo-z catalog are too uninformative
    # Since native quantities will become regular quantities in composite catalog,
    # let us hide them all. 
Example #6
Source File: runwrapper2017mpi_r3.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id = trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation = 100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    ltsum_df = ltsum_df.reindex(market_makers)  # select market makers; rows for missing ids come back as NaN
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict) 
Example #7
Source File: data_utils.py    From RGAN with MIT License
def resample_eICU_patient(pid, resample_factor_in_min, variables, upto_in_minutes):
    """
    Resample a *single* patient.
    """
    pat_df = pd.read_hdf(paths.eICU_hdf_dir + '/vitalPeriodic.h5',
                         where='patientunitstayid = ' + str(pid),
                         columns=['observationoffset', 'patientunitstayid'] + variables,
                         mode='r')
    # sometimes it's empty
    if pat_df.empty:
        return None
    if upto_in_minutes is not None:
        pat_df = pat_df.loc[0:upto_in_minutes*60]
    # convert the offset to a TimedeltaIndex (necessary for resampling)
    pat_df.observationoffset = pd.TimedeltaIndex(pat_df.observationoffset, unit='m')
    pat_df.set_index('observationoffset', inplace=True)
    pat_df.sort_index(inplace=True)
    # resample by time
    pat_df_resampled = pat_df.resample(str(resample_factor_in_min) + 'T').median()  # pandas ignores NA in median by default
    # rename pid, cast to int
    pat_df_resampled.rename(columns={'patientunitstayid': 'pid'}, inplace=True)
    pat_df_resampled['pid'] = np.int32(pat_df_resampled['pid'])
    # get offsets in minutes from index
    pat_df_resampled['offset'] = np.int32(pat_df_resampled.index.total_seconds()/60)
    return pat_df_resampled 
Example #8
Source File: runwrapper2017mpi_r4.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id = trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow = buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol = buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF = buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                  )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow = -sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol = sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF = sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                    )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow = cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    temp_df = cash_flow.groupby('trader_id')[['NetCashFlow', 'BuyVol']].last()
    temp_df = temp_df.assign(NetCFPerShare = temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df) 
Example #9
Source File: runwrapper2017mpi_r4.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    
    indf = indf.assign(ret = 100*indf.price.pct_change())
    indf = indf.assign(abs_ret = np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1,51):
        ac = indf.ret.autocorr(lag = i)
        aac = indf.abs_ret.autocorr(lag = i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice, 'ClusteringConstant': clustering_constant,
                    'MeanRet': indf.ret.mean(), 'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict) 
Example #10
Source File: runwrapper2017mpi_r3x.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id = trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation = 100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    ltsum_df = ltsum_df.reindex(market_makers)  # select market makers; rows for missing ids come back as NaN
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict) 
Example #11
Source File: test_hdf.py    From vivarium with GNU General Public License v3.0
def test_write_data_frame(hdf_file_path):
    key = hdf.EntityKey('cause.test.prevalence')
    data = build_table([lambda *args, **kwargs: random.choice([0, 1]), "Kenya", 1],
                       2005, 2010, columns=('age', 'year', 'sex', 'draw', 'location', 'value'))

    non_val_columns = data.columns.difference({'value'})
    data = data.set_index(list(non_val_columns))

    hdf._write_pandas_data(hdf_file_path, key, data)

    written_data = pd.read_hdf(hdf_file_path, key.path)
    assert written_data.equals(data)

    filter_terms = ['draw == 0']
    written_data = pd.read_hdf(hdf_file_path, key.path, where=filter_terms)
    assert written_data.equals(data.xs(0, level='draw', drop_level=False)) 
Example #12
Source File: test_common.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def test_write_fspath_hdf5(self):
        # Same test as write_fspath_all, except HDF5 files aren't
        # necessarily byte-for-byte identical for a given dataframe, so we'll
        # have to read and compare equality
        pytest.importorskip('tables')

        df = pd.DataFrame({"A": [1, 2]})
        p1 = tm.ensure_clean('string')
        p2 = tm.ensure_clean('fspath')

        with p1 as string, p2 as fspath:
            mypath = CustomFSPath(fspath)
            df.to_hdf(mypath, key='bar')
            df.to_hdf(string, key='bar')

            result = pd.read_hdf(fspath, key='bar')
            expected = pd.read_hdf(string, key='bar')

        tm.assert_frame_equal(result, expected) 
Example #13
Source File: utils_testing.py    From auto_ml with MIT License
def get_twitter_sentiment_multilabel_classification_dataset():

    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates

        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset- runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test 
Example #14
Source File: test_temporalnetwork.py    From teneto with GNU General Public License v3.0
def test_hdf5():
    df = pd.DataFrame({'i': [0, 0], 'j': [1, 2], 't': [0, 1]})
    tnet = teneto.TemporalNetwork(from_df=df, hdf5=True)
    if not tnet.network == './teneto_temporalnetwork.h5':
        raise AssertionError()
    df2 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df == df2).all().all():
        raise AssertionError()
    tnet.add_edge([0, 2, 2])
    df3 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df3.iloc[2].values == [0, 2, 2]).all():
        raise AssertionError()
    tnet.drop_edge([0, 2, 2])
    df4 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df == df4).all().all():
        raise AssertionError() 
Example #15
Source File: __main__.py    From picasso with MIT License
def _hdf2csv(path):
    from glob import glob
    import pandas as pd
    from tqdm import tqdm as _tqdm
    from os.path import isdir

    if isdir(path):
        paths = glob(path + "/*.hdf5")
    else:
        paths = glob(path)
    if paths:
        import os.path

        for path in _tqdm(paths):
            base, ext = os.path.splitext(path)
            if ext == ".hdf5":
                print("Converting {}".format(path))
                out_path = base + ".csv"
                locs = pd.read_hdf(path)
                print("A total of {} rows loaded".format(len(locs)))
                locs.to_csv(out_path, sep=",", encoding="utf-8")
    print("Complete.") 
Example #16
Source File: main.py    From predictatops with MIT License
def load_prev_results_at_path(full_path_to_results_file, key="df"):
    """
    A function used to return a dataframe of wells stored in an h5 file at a given path with a given key.
    
    Parameters
    ----------
    full_path_to_results_file: string
        A path to a .h5 file that contains a wells dataframe.

    key: string
        A string representation of a key used to find the dataframe in the h5 file whose path is defined by the full_path_to_results_file argument.

    Returns
    -------
    wells_df_from_wellsKNN: dataframe
        Returns a dataframe of wells that existed at the path defined in the full_path_to_results_file argument.
    """
    wells_df_from_wellsKNN = pd.read_hdf(full_path_to_results_file, key=key)
    return wells_df_from_wellsKNN 
Example #17
Source File: runwrapper2017mpi_r3x.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id = trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow = buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol = buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF = buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                  )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow = -sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol = sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF = sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                    )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow = cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    temp_df = cash_flow.groupby('trader_id')[['NetCashFlow', 'BuyVol']].last()
    temp_df = temp_df.assign(NetCFPerShare = temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df) 
Example #18
Source File: runwrapper2017mpi_r3x.py    From pyziabm with BSD 3-Clause "New" or "Revised" License
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    indf = indf.assign(ret = 100*indf.price.pct_change())
    indf = indf.assign(abs_ret = np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1,51):
        ac = indf.ret.autocorr(lag = i)
        aac = indf.abs_ret.autocorr(lag = i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice, 'ClusteringConstant': clustering_constant,
                    'MeanRet': indf.ret.mean(), 'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict) 
Example #19
Source File: plot_history.py    From age-gender-estimator-keras with MIT License
def main():
    args = get_args()
    input_path = args.input

    df = pd.read_hdf(input_path, "history")
    input_dir = os.path.dirname(input_path)
    plt.plot(df["gender_loss"], label="loss (gender)")
    plt.plot(df["age_loss"], label="loss (age)")
    plt.plot(df["val_gender_loss"], label="val_loss (gender)")
    plt.plot(df["val_age_loss"], label="val_loss (age)")
    plt.xlabel("number of epochs")
    plt.ylabel("loss")
    plt.legend()
    plt.savefig(os.path.join(input_dir, "loss.png"))
    plt.cla()

    plt.plot(df["gender_acc"], label="accuracy (gender)")
    plt.plot(df["age_acc"], label="accuracy (age)")
    plt.plot(df["val_gender_acc"], label="val_accuracy (gender)")
    plt.plot(df["val_age_acc"], label="val_accuracy (age)")
    plt.xlabel("number of epochs")
    plt.ylabel("accuracy")
    plt.legend()
    plt.savefig(os.path.join(input_dir, "accuracy.png")) 
Example #20
Source File: batch.py    From batchflow with Apache License 2.0
def _load_table(self, src, fmt, dst=None, post=None, *args, **kwargs):
        """ Load a data frame from table formats: csv, hdf5, feather """
        if fmt == 'csv':
            _data = pd.read_csv(src, *args, **kwargs)
        elif fmt == 'feather':
            _data = feather.read_dataframe(src, *args, **kwargs)
        elif fmt == 'hdf5':
            _data = pd.read_hdf(src, *args, **kwargs)

        # Put into this batch only part of it (defined by index)
        if isinstance(_data, pd.DataFrame):
            _data = _data.loc[self.indices]
        elif isinstance(_data, dd.DataFrame):
            # dask.DataFrame.loc supports advanced indexing only with lists
            _data = _data.loc[list(self.indices)].compute()

        if callable(post):
            _data = post(_data, src=src, fmt=fmt, dst=dst, **kwargs)

        self.load(src=_data, dst=dst) 
Example #21
Source File: Omlette.py    From OpenTrader with GNU Lesser General Public License v3.0
def iMain():
    """
    Read an hdf file generated by us to make sure
    we can recover its content and structure.
    Give the name of an hdf5 file as a command-line argument.
    """
    assert sys.argv, __doc__
    sFile = sys.argv[1]
    assert os.path.isfile(sFile)
    oHdfStore = pandas.HDFStore(sFile, mode='r')
    print(oHdfStore.groups())
    # bug - no return value
    # oSignals = pandas.read_hdf(oHdfStore, '/servings/signals')
    mSignals = oHdfStore.select('/recipe/servings/mSignals', auto_close=False)    
    print(mSignals)
    print(oHdfStore.get_node('/recipe')._v_attrs.metadata[0]['sUrl'])
Example #22
Source File: test_common.py    From vnpy_crypto with MIT License
def test_write_fspath_hdf5(self):
        # Same test as write_fspath_all, except HDF5 files aren't
        # necessarily byte-for-byte identical for a given dataframe, so we'll
        # have to read and compare equality
        pytest.importorskip('tables')

        df = pd.DataFrame({"A": [1, 2]})
        p1 = tm.ensure_clean('string')
        p2 = tm.ensure_clean('fspath')

        with p1 as string, p2 as fspath:
            mypath = CustomFSPath(fspath)
            df.to_hdf(mypath, key='bar')
            df.to_hdf(string, key='bar')

            result = pd.read_hdf(fspath, key='bar')
            expected = pd.read_hdf(string, key='bar')

        tm.assert_frame_equal(result, expected) 
Example #23
Source File: data_utils.py    From autodeepnet with MIT License
def load_hdf5_data(file_path, **kwargs):
    key = kwargs.get('key', None)
    pandas_format = kwargs.get('pandas_format', True)
    mode = kwargs.get('mode', 'r')
    logger.info("Opening HDF5 file {} to read...".format(file_path))
    try:
        if pandas_format:
            data = pd.read_hdf(file_path, key=key, mode=mode)
        else:
            with h5py.File(file_path, mode) as f:
                data = f[key][()]
    except KeyError as e:
        logger.exception("Dataset {} does not exist".format(dataset))
        raise exceptions.FileLoadError("Dataset does not exist")
    except Exception as e:
        logger.exception("Problem loading dataset: {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully loaded HDF5 data")
    return data 
Example #24
Source File: test_common.py    From recruit with Apache License 2.0
def test_write_fspath_hdf5(self):
        # Same test as write_fspath_all, except HDF5 files aren't
        # necessarily byte-for-byte identical for a given dataframe, so we'll
        # have to read and compare equality
        pytest.importorskip('tables')

        df = pd.DataFrame({"A": [1, 2]})
        p1 = tm.ensure_clean('string')
        p2 = tm.ensure_clean('fspath')

        with p1 as string, p2 as fspath:
            mypath = CustomFSPath(fspath)
            df.to_hdf(mypath, key='bar')
            df.to_hdf(string, key='bar')

            result = pd.read_hdf(fspath, key='bar')
            expected = pd.read_hdf(string, key='bar')

        tm.assert_frame_equal(result, expected) 
Example #25
Source File: hdfstore.py    From PyTrendFollow with MIT License
def read_symbol(symbol, q_type, provider):
    """
    Read data from the corresponding HDF file
    """
    if os.path.exists(fname(symbol, q_type, provider)):
        data = pd.read_hdf(fname(symbol, q_type, provider), 'quotes')
        return data
    else:  # if symbol doesn't exist, an empty df is returned with only index columns
        c = ['contract', 'date'] if q_type == 'futures' else ['date']
        return pd.DataFrame(columns=c).set_index(c) 
Example #26
Source File: photoz.py    From gcr-catalogs with BSD 3-Clause "New" or "Revised" License
def generate_metadata(self, write_to_yaml=False):
        """
        generate metadata
        """
        meta = list()
        for fname in sorted(os.listdir(self.base_dir)):
            if not self._filename_re.match(fname):
                continue

            file_path = os.path.join(self.base_dir, fname)
            try:
                df = pd.read_hdf(file_path, 'df')

            except (IOError, OSError):
                warnings.warn('Cannot access {}; skipped'.format(file_path))
                continue

            meta_tract = {
                'tract': int(df['tract'].iloc[0]),
                'filename': fname,
            }

            # Each file contains all patches in one tract,
            # but we want to be able to iterate over patches as well.
            # Here, we find the indices where the adjacent patch values differ,
            # and we record the slice indices for each patch.
            patches = df['patch'].values.astype('<U')
            indices = np.flatnonzero(np.concatenate(([True], patches[1:] != patches[:-1], [True])))
            indices = np.vstack((indices[:-1], indices[1:])).T
            meta_tract['patches'] = [{'patch': str(patches[i]), 'slice': [int(i), int(j)]} for i, j in indices]

            meta.append(meta_tract)

        if write_to_yaml:
            if self._metadata_path and os.path.isfile(self._metadata_path):
                warnings.warn('Overwriting metadata file `{0}`, which is backed up at `{0}.bak`'.format(self._metadata_path))
                shutil.copyfile(self._metadata_path, self._metadata_path + '.bak')
            with open(self._metadata_path, 'w') as meta_stream:
                yaml.dump(meta, meta_stream)

        return meta 
Example #27
Source File: hdfstore.py    From PyTrendFollow with MIT License
def read_contract(symbol, contract, provider):
    """
    Read a single contract for a future instrument
    """
    if os.path.exists(fname(symbol, 'futures', provider)):
        data = pd.read_hdf(fname(symbol, 'futures', provider), 'quotes')
        return data.loc[int(contract), :]
    else:
        c = ['contract', 'date']
        return pd.DataFrame(columns=c).set_index(c) 
Example #28
Source File: utils.py    From ladder with MIT License
def load_df(dirpath, filename, varname=None):
    varname = filename if varname is None else varname
    fn = os.path.join(dirpath, filename)
    return read_hdf(fn, varname) 
Example #29
Source File: mixins.py    From traffic with MIT License
def from_file(
        cls: Type[T], filename: Union[Path, str], **kwargs
    ) -> Optional[T]:
        """Read data from various formats.

        This class method dispatches the loading of data in various format to
        the proper ``pandas.read_*`` method based on the extension of the
        filename.

        - .pkl and .pkl.gz dispatch to ``pandas.read_pickle``;
        - .parquet and .parquet.gz dispatch to ``pandas.read_parquet``;
        - .json and .json.gz dispatch to ``pandas.read_json``;
        - .csv and .csv.gz dispatch to ``pandas.read_csv``;
        - .h5 dispatch to ``pandas.read_hdf``.

        Other extensions return ``None``.
        Specific arguments may be passed to the underlying ``pandas.read_*``
        method with the kwargs argument.

        Example usage:

        >>> t = Traffic.from_file("data/sample_opensky.pkl")
        """
        path = Path(filename)
        if path.suffixes in [[".pkl"], [".pkl", ".gz"]]:
            return cls(pd.read_pickle(path, **kwargs))
        if path.suffixes in [[".parquet"], [".parquet", ".gz"]]:
            return cls(pd.read_parquet(path, **kwargs))
        if path.suffixes in [[".json"], [".json", ".gz"]]:
            return cls(pd.read_json(path, **kwargs))
        if path.suffixes in [[".csv"], [".csv", ".gz"]]:
            return cls(pd.read_csv(path, **kwargs))
        if path.suffixes == [".h5"]:
            return cls(pd.read_hdf(path, **kwargs))
        return None

    # --- Special methods --- 
Example #30
Source File: eval_baseline_methods.py    From DCRNN with MIT License
def main(args):
    traffic_reading_df = pd.read_hdf(args.traffic_reading_filename)
    eval_static(traffic_reading_df)
    eval_historical_average(traffic_reading_df, period=7 * 24 * 12)
    eval_var(traffic_reading_df, n_lags=3)