Python pandas.read_hdf() Examples
The following are 30 code examples of pandas.read_hdf(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
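Before diving into the project examples, here is a minimal round-trip sketch of the pattern most of them rely on: write a DataFrame with DataFrame.to_hdf() and read it back with pandas.read_hdf(). The file name example.h5 and the key df below are illustrative only; both calls require the optional PyTables (tables) package to be installed.

import pandas as pd

# Write a small DataFrame to an HDF5 file under the key 'df', then read it back.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
df.to_hdf("example.h5", key="df", mode="w")

round_tripped = pd.read_hdf("example.h5", key="df")
assert round_tripped.equals(df)

Storing with format='table' instead of the default fixed format additionally allows where filters on read, as used in Examples #7 and #11 below.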
Example #1
Source File: demand_writers.py From CityEnergyAnalyst with MIT License | 6 votes |
def write_to_hdf5(self, list_buildings, locator):
    """read in the temporary results files and append them to the Totals.csv file."""
    df = None
    for name in list_buildings:
        temporary_file = locator.get_temporary_file('%(name)sT.hdf' % locals())
        if df is None:
            df = pd.read_hdf(temporary_file, key='dataset')
        else:
            # DataFrame.append is removed in pandas 2.0; pd.concat is the modern equivalent
            df = df.append(pd.read_hdf(temporary_file, key='dataset'))
    df.to_hdf(locator.get_total_demand('hdf'), key='dataset')

    """read saved data of monthly values and return as totals"""
    monthly_data_buildings = [pd.read_hdf(locator.get_demand_results_file(building_name, 'hdf'), key=building_name)
                              for building_name in list_buildings]
    return df, monthly_data_buildings
Example #2
Source File: runwrapper2017mpi_r3.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    indf = indf.assign(ret=100*indf.price.pct_change())
    indf = indf.assign(abs_ret=np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1, 51):
        ac = indf.ret.autocorr(lag=i)
        aac = indf.abs_ret.autocorr(lag=i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    # 'j' is the Monte Carlo run counter defined at module scope in the original script
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice,
                    'ClusteringConstant': clustering_constant, 'MeanRet': indf.ret.mean(),
                    'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict)
Example #3
Source File: runwrapper2017mpi_r4.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation=100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    # DataFrame.ix is deprecated (removed in pandas 1.0); .loc does the same job here
    ltsum_df = ltsum_df.ix[market_makers]
    # 'j' is the Monte Carlo run counter defined at module scope in the original script
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict)
Example #4
Source File: runwrapper2017mpi_r3.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow=buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol=buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF=buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                   )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow=-sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol=sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF=sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                     )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow=cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    # selecting multiple groupby columns with a bare tuple is deprecated in newer pandas;
    # a list (groupby('trader_id')[['NetCashFlow', 'BuyVol']]) is the current form
    temp_df = cash_flow.groupby('trader_id')['NetCashFlow', 'BuyVol'].last()
    temp_df = temp_df.assign(NetCFPerShare=temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df)
Example #5
Source File: photoz.py From gcr-catalogs with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _iter_native_dataset(self, native_filters=None):
    current_fname = None
    for meta_tract in self._metadata:
        for meta_patch in meta_tract['patches']:
            tract_patch = {'tract': meta_tract['tract'], 'patch': meta_patch['patch']}
            if native_filters and not native_filters.check_scalar(tract_patch):
                continue
            if current_fname != meta_tract['filename']:
                current_fname = meta_tract['filename']
                df = pd.read_hdf(os.path.join(self.base_dir, current_fname), 'df')
            slice_this = slice(*meta_patch['slice'])

            def native_quantity_getter(native_quantity):
                # pylint: disable=W0640
                # variables (df and slice_this) intentionally defined in loop
                if native_quantity == '_FULL_PDF':
                    return df.iloc[slice_this, :self._n_pdf_bins].values
                return df[native_quantity].values[slice_this]

            yield native_quantity_getter

# Native quantity names in the photo-z catalog are too uninformative
# Since native quantities will become regular quantities in composite catalog,
# let us hide them all.
Example #6
Source File: runwrapper2017mpi_r3.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation=100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    ltsum_df = ltsum_df.ix[market_makers]
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict)
Example #7
Source File: data_utils.py From RGAN with MIT License | 6 votes |
def resample_eICU_patient(pid, resample_factor_in_min, variables, upto_in_minutes):
    """ Resample a *single* patient. """
    pat_df = pd.read_hdf(paths.eICU_hdf_dir + '/vitalPeriodic.h5',
                         where='patientunitstayid = ' + str(pid),
                         columns=['observationoffset', 'patientunitstayid'] + variables,
                         mode='r')
    # sometimes it's empty
    if pat_df.empty:
        return None
    if not upto_in_minutes is None:
        pat_df = pat_df.loc[0:upto_in_minutes*60]
    # convert the offset to a TimedeltaIndex (necessary for resampling)
    pat_df.observationoffset = pd.TimedeltaIndex(pat_df.observationoffset, unit='m')
    pat_df.set_index('observationoffset', inplace=True)
    pat_df.sort_index(inplace=True)
    # resample by time
    pat_df_resampled = pat_df.resample(str(resample_factor_in_min) + 'T').median()  # pandas ignores NA in median by default
    # rename pid, cast to int
    pat_df_resampled.rename(columns={'patientunitstayid': 'pid'}, inplace=True)
    pat_df_resampled['pid'] = np.int32(pat_df_resampled['pid'])
    # get offsets in minutes from index
    pat_df_resampled['offset'] = np.int32(pat_df_resampled.index.total_seconds()/60)
    return pat_df_resampled
Example #8
Source File: runwrapper2017mpi_r4.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow=buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol=buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF=buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                   )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow=-sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol=sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF=sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                     )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow=cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    temp_df = cash_flow.groupby('trader_id')['NetCashFlow', 'BuyVol'].last()
    temp_df = temp_df.assign(NetCFPerShare=temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df)
Example #9
Source File: runwrapper2017mpi_r4.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    indf = indf.assign(ret=100*indf.price.pct_change())
    indf = indf.assign(abs_ret=np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1, 51):
        ac = indf.ret.autocorr(lag=i)
        aac = indf.abs_ret.autocorr(lag=i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice,
                    'ClusteringConstant': clustering_constant, 'MeanRet': indf.ret.mean(),
                    'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict)
Example #10
Source File: runwrapper2017mpi_r3x.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def participation_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    lt_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.count())
    lt_df.rename(columns={'quantity': 'trade'}, inplace=True)
    if 'p999999' in lt_df.index:
        lt_df.drop('p999999', inplace=True)
    ltsum_df = pd.DataFrame(trade_df.groupby(['trader_id']).quantity.sum())
    ltsum_df.rename(columns={'quantity': 'trade_vol'}, inplace=True)
    ltsum_df = ltsum_df.assign(Participation=100*ltsum_df.trade_vol/ltsum_df.trade_vol.sum())
    providers = ltsum_df.index.unique()
    market_makers = [x for x in providers if x.startswith('m')]
    market_makers.append('j0')
    ltsum_df = ltsum_df.ix[market_makers]
    part_dict = {'MCRun': j, 'MM_Participation': ltsum_df.loc['m0', 'Participation']}
    if 'j0' in providers:
        part_dict.update({'PJ_Participation': ltsum_df.loc['j0', 'Participation']})
    outlist.append(part_dict)
Example #11
Source File: test_hdf.py From vivarium with GNU General Public License v3.0 | 6 votes |
def test_write_data_frame(hdf_file_path):
    key = hdf.EntityKey('cause.test.prevalence')
    data = build_table([lambda *args, **kwargs: random.choice([0, 1]), "Kenya", 1], 2005, 2010,
                       columns=('age', 'year', 'sex', 'draw', 'location', 'value'))

    non_val_columns = data.columns.difference({'value'})
    data = data.set_index(list(non_val_columns))

    hdf._write_pandas_data(hdf_file_path, key, data)

    written_data = pd.read_hdf(hdf_file_path, key.path)
    assert written_data.equals(data)

    filter_terms = ['draw == 0']
    written_data = pd.read_hdf(hdf_file_path, key.path, where=filter_terms)
    assert written_data.equals(data.xs(0, level='draw', drop_level=False))
Example #12
Source File: test_common.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_write_fspath_hdf5(self):
    # Same test as write_fspath_all, except HDF5 files aren't
    # necessarily byte-for-byte identical for a given dataframe, so we'll
    # have to read and compare equality
    pytest.importorskip('tables')

    df = pd.DataFrame({"A": [1, 2]})
    p1 = tm.ensure_clean('string')
    p2 = tm.ensure_clean('fspath')

    with p1 as string, p2 as fspath:
        mypath = CustomFSPath(fspath)
        df.to_hdf(mypath, key='bar')
        df.to_hdf(string, key='bar')

        result = pd.read_hdf(fspath, key='bar')
        expected = pd.read_hdf(string, key='bar')
        tm.assert_frame_equal(result, expected)
Example #13
Source File: utils_testing.py From auto_ml with MIT License | 6 votes |
def get_twitter_sentiment_multilabel_classification_dataset():
    file_name = os.path.join('tests', 'twitter_sentiment.h5')

    try:
        df_twitter = pd.read_hdf(file_name)
    except Exception as e:
        print('Error')
        print(e)
        dataset_url = 'https://raw.githubusercontent.com/ClimbsRocks/sample_datasets/master/twitter_airline_sentiment.csv'
        df_twitter = pd.read_csv(dataset_url, encoding='latin-1')
        # Do not write the index that pandas automatically creates
        df_twitter.to_hdf(file_name, key='df', format='fixed')

    # Grab only 10% of the dataset- runs much faster this way
    df_twitter = df_twitter.sample(frac=0.1)

    df_twitter['tweet_created'] = pd.to_datetime(df_twitter.tweet_created)

    df_twitter_train, df_twitter_test = train_test_split(df_twitter, test_size=0.33, random_state=42)
    return df_twitter_train, df_twitter_test
Example #14
Source File: test_temporalnetwork.py From teneto with GNU General Public License v3.0 | 6 votes |
def test_hdf5():
    df = pd.DataFrame({'i': [0, 0], 'j': [1, 2], 't': [0, 1]})
    tnet = teneto.TemporalNetwork(from_df=df, hdf5=True)
    if not tnet.network == './teneto_temporalnetwork.h5':
        raise AssertionError()
    df2 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df == df2).all().all():
        raise AssertionError()
    tnet.add_edge([0, 2, 2])
    df3 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df3.iloc[2].values == [0, 2, 2]).all():
        raise AssertionError()
    tnet.drop_edge([0, 2, 2])
    df4 = pd.read_hdf('./teneto_temporalnetwork.h5')
    if not (df == df4).all().all():
        raise AssertionError()
Example #15
Source File: __main__.py From picasso with MIT License | 6 votes |
def _hdf2csv(path):
    from glob import glob
    import pandas as pd
    from tqdm import tqdm as _tqdm
    from os.path import isdir

    if isdir(path):
        paths = glob(path + "/*.hdf5")
    else:
        paths = glob(path)
    if paths:
        import os.path
        for path in _tqdm(paths):
            base, ext = os.path.splitext(path)
            if ext == ".hdf5":
                print("Converting {}".format(path))
                out_path = base + ".csv"
                locs = pd.read_hdf(path)
                print("A total of {} rows loaded".format(len(locs)))
                locs.to_csv(out_path, sep=",", encoding="utf-8")
    print("Complete.")
Example #16
Source File: main.py From predictatops with MIT License | 6 votes |
def load_prev_results_at_path(full_path_to_results_file, key="df"):
    """
    A function used to return a dataframe of wells stored in an h5 file at a given path with a given key.

    Parameters
    ----------
    full_path_to_results_file: string
        A path to a .h5 file that contains a wells dataframe.
    key: string
        A string representation of a key used to find the dataframe in the h5 file whose path
        is defined by the full_path_to_results_file argument.

    Returns
    -------
    wells_df_from_wellsKNN: dataframe
        Returns a dataframe of wells that existed at the path defined in the
        full_path_to_results_file argument.
    """
    wells_df_from_wellsKNN = pd.read_hdf(full_path_to_results_file, key=key)
    return wells_df_from_wellsKNN
Example #17
Source File: runwrapper2017mpi_r3x.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def profit_to_list(h5in, outlist):
    trade_df = pd.read_hdf(h5in, 'trades')
    trade_df = trade_df.assign(trader_id=trade_df.resting_order_id.str.split('_').str[0])
    buy_trades = trade_df[trade_df.side=='buy']
    buy_trades = buy_trades.assign(BuyCashFlow=buy_trades.price*buy_trades.quantity)
    buy_trades = buy_trades.assign(BuyVol=buy_trades.groupby('trader_id').quantity.cumsum(),
                                   CumulBuyCF=buy_trades.groupby('trader_id').BuyCashFlow.cumsum()
                                   )
    buy_trades.rename(columns={'timestamp': 'buytimestamp'}, inplace=True)
    sell_trades = trade_df[trade_df.side=='sell']
    sell_trades = sell_trades.assign(SellCashFlow=-sell_trades.price*sell_trades.quantity)
    sell_trades = sell_trades.assign(SellVol=sell_trades.groupby('trader_id').quantity.cumsum(),
                                     CumulSellCF=sell_trades.groupby('trader_id').SellCashFlow.cumsum()
                                     )
    sell_trades.rename(columns={'timestamp': 'selltimestamp'}, inplace=True)
    buy_trades = buy_trades[['trader_id', 'BuyVol', 'CumulBuyCF', 'buytimestamp']]
    sell_trades = sell_trades[['trader_id', 'SellVol', 'CumulSellCF', 'selltimestamp']]
    cash_flow = pd.merge(buy_trades, sell_trades, left_on=['trader_id', 'BuyVol'], right_on=['trader_id', 'SellVol'])
    cash_flow = cash_flow.assign(NetCashFlow=cash_flow.CumulBuyCF + cash_flow.CumulSellCF)
    temp_df = cash_flow.groupby('trader_id')['NetCashFlow', 'BuyVol'].last()
    temp_df = temp_df.assign(NetCFPerShare=temp_df.NetCashFlow/temp_df.BuyVol)
    temp_df = temp_df[['NetCashFlow', 'NetCFPerShare']]
    outlist.append(temp_df)
Example #18
Source File: runwrapper2017mpi_r3x.py From pyziabm with BSD 3-Clause "New" or "Revised" License | 6 votes |
def tradesrets_to_list(h5in, outlist):
    indf = pd.read_hdf(h5in, 'trades')
    trades = indf.price.count()
    minprice = indf.price.min()
    maxprice = indf.price.max()
    indf = indf.assign(ret=100*indf.price.pct_change())
    indf = indf.assign(abs_ret=np.abs(indf.ret))
    lags = []
    autocorr = []
    abs_autocorr = []
    for i in range(1, 51):
        ac = indf.ret.autocorr(lag=i)
        aac = indf.abs_ret.autocorr(lag=i)
        lags.append(i)
        autocorr.append(ac)
        abs_autocorr.append(aac)
    ar_df = pd.DataFrame({'lag': lags, 'autocorrelation': autocorr, 'autocorrelation_abs': abs_autocorr})
    ar_df.set_index('lag', inplace=True)
    clustering_constant = np.abs(ar_df.autocorrelation_abs.sum()/ar_df.autocorrelation.sum())
    returns_dict = {'Trades': trades, 'MinPrice': minprice, 'MaxPrice': maxprice,
                    'ClusteringConstant': clustering_constant, 'MeanRet': indf.ret.mean(),
                    'StdRet': indf.ret.std(), 'SkewRet': indf.ret.skew(),
                    'KurtosisRet': indf.ret.kurtosis(), 'MCRun': j}
    outlist.append(returns_dict)
Example #19
Source File: plot_history.py From age-gender-estimator-keras with MIT License | 6 votes |
def main():
    args = get_args()
    input_path = args.input
    df = pd.read_hdf(input_path, "history")
    input_dir = os.path.dirname(input_path)

    plt.plot(df["gender_loss"], label="loss (gender)")
    plt.plot(df["age_loss"], label="loss (age)")
    plt.plot(df["val_gender_loss"], label="val_loss (gender)")
    plt.plot(df["val_age_loss"], label="val_loss (age)")
    plt.xlabel("number of epochs")
    plt.ylabel("loss")
    plt.legend()
    plt.savefig(os.path.join(input_dir, "loss.png"))
    plt.cla()

    plt.plot(df["gender_acc"], label="accuracy (gender)")
    plt.plot(df["age_acc"], label="accuracy (age)")
    plt.plot(df["val_gender_acc"], label="val_accuracy (gender)")
    plt.plot(df["val_age_acc"], label="val_accuracy (age)")
    plt.xlabel("number of epochs")
    plt.ylabel("accuracy")
    plt.legend()
    plt.savefig(os.path.join(input_dir, "accuracy.png"))
Example #20
Source File: batch.py From batchflow with Apache License 2.0 | 6 votes |
def _load_table(self, src, fmt, dst=None, post=None, *args, **kwargs):
    """ Load a data frame from table formats: csv, hdf5, feather """
    if fmt == 'csv':
        _data = pd.read_csv(src, *args, **kwargs)
    elif fmt == 'feather':
        _data = feather.read_dataframe(src, *args, **kwargs)
    elif fmt == 'hdf5':
        _data = pd.read_hdf(src, *args, **kwargs)

    # Put into this batch only part of it (defined by index)
    if isinstance(_data, pd.DataFrame):
        _data = _data.loc[self.indices]
    elif isinstance(_data, dd.DataFrame):
        # dask.DataFrame.loc supports advanced indexing only with lists
        _data = _data.loc[list(self.indices)].compute()

    if callable(post):
        _data = post(_data, src=src, fmt=fmt, dst=dst, **kwargs)

    self.load(src=_data, dst=dst)
Example #21
Source File: Omlette.py From OpenTrader with GNU Lesser General Public License v3.0 | 6 votes |
def iMain():
    """
    Read an hdf file generated by us to make sure we can recover its content and structure.
    Give the name of an hdf5 file as a command-line argument.
    """
    assert sys.argv, __doc__
    sFile = sys.argv[1]
    assert os.path.isfile(sFile)
    oHdfStore = pandas.HDFStore(sFile, mode='r')
    # note: this project targets Python 2, hence the bare print statements below
    print oHdfStore.groups()  # bug - no return value
    # oSignals = pandas.read_hdf(oHdfStore, '/servings/signals')
    mSignals = oHdfStore.select('/recipe/servings/mSignals', auto_close=False)
    print mSignals
    print oHdfStore.get_node('/recipe')._v_attrs.metadata[0]['sUrl']
Example #22
Source File: test_common.py From vnpy_crypto with MIT License | 6 votes |
def test_write_fspath_hdf5(self):
    # Same test as write_fspath_all, except HDF5 files aren't
    # necessarily byte-for-byte identical for a given dataframe, so we'll
    # have to read and compare equality
    pytest.importorskip('tables')

    df = pd.DataFrame({"A": [1, 2]})
    p1 = tm.ensure_clean('string')
    p2 = tm.ensure_clean('fspath')

    with p1 as string, p2 as fspath:
        mypath = CustomFSPath(fspath)
        df.to_hdf(mypath, key='bar')
        df.to_hdf(string, key='bar')

        result = pd.read_hdf(fspath, key='bar')
        expected = pd.read_hdf(string, key='bar')
        tm.assert_frame_equal(result, expected)
Example #23
Source File: data_utils.py From autodeepnet with MIT License | 6 votes |
def load_hdf5_data(file_path, **kwargs):
    key = kwargs.get('key', None)
    pandas_format = kwargs.get('pandas_format', True)
    mode = kwargs.get('mode', 'r')
    logger.info("Opening HDF5 file {} to read...".format(file_path))
    try:
        if pandas_format:
            data = pd.read_hdf(file_path, key=key, mode=mode)
        else:
            with h5py.File(file_path, mode) as f:
                data = f[key][()]
    except KeyError as e:
        # NOTE: 'dataset' is undefined in the original source; 'key' is presumably what was meant
        logger.exception("Dataset {} does not exist".format(dataset))
        raise exceptions.FileLoadError("Dataset does not exist")
    except Exception as e:
        logger.exception("Problem loading dataset: {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully loaded HDF5 data")
    return data
Example #24
Source File: test_common.py From recruit with Apache License 2.0 | 6 votes |
def test_write_fspath_hdf5(self):
    # Same test as write_fspath_all, except HDF5 files aren't
    # necessarily byte-for-byte identical for a given dataframe, so we'll
    # have to read and compare equality
    pytest.importorskip('tables')

    df = pd.DataFrame({"A": [1, 2]})
    p1 = tm.ensure_clean('string')
    p2 = tm.ensure_clean('fspath')

    with p1 as string, p2 as fspath:
        mypath = CustomFSPath(fspath)
        df.to_hdf(mypath, key='bar')
        df.to_hdf(string, key='bar')

        result = pd.read_hdf(fspath, key='bar')
        expected = pd.read_hdf(string, key='bar')
        tm.assert_frame_equal(result, expected)
Example #25
Source File: hdfstore.py From PyTrendFollow with MIT License | 5 votes |
def read_symbol(symbol, q_type, provider):
    """ Read data from the corresponding HDF file """
    if os.path.exists(fname(symbol, q_type, provider)):
        data = pd.read_hdf(fname(symbol, q_type, provider), 'quotes')
        return data
    else:
        # if symbol doesn't exist, an empty df is returned with only index columns
        c = ['contract', 'date'] if q_type == 'futures' else ['date']
        return pd.DataFrame(columns=c).set_index(c)
Example #26
Source File: photoz.py From gcr-catalogs with BSD 3-Clause "New" or "Revised" License | 5 votes |
def generate_metadata(self, write_to_yaml=False):
    """ generate metadata """
    meta = list()
    for fname in sorted(os.listdir(self.base_dir)):
        if not self._filename_re.match(fname):
            continue

        file_path = os.path.join(self.base_dir, fname)
        try:
            df = pd.read_hdf(file_path, 'df')
        except (IOError, OSError):
            warnings.warn('Cannot access {}; skipped'.format(file_path))
            continue

        meta_tract = {
            'tract': int(df['tract'].iloc[0]),
            'filename': fname,
        }

        # Each file contains all patches in one tract,
        # but we want to be able to iterate over patches as well.
        # Here, we find the indices where the adjacent patch values differ,
        # and we record the slice indices for each patch.
        patches = df['patch'].values.astype('<U')
        indices = np.flatnonzero(np.concatenate(([True], patches[1:] != patches[:-1], [True])))
        indices = np.vstack((indices[:-1], indices[1:])).T
        meta_tract['patches'] = [{'patch': str(patches[i]), 'slice': [int(i), int(j)]} for i, j in indices]

        meta.append(meta_tract)

    if write_to_yaml:
        if self._metadata_path and os.path.isfile(self._metadata_path):
            warnings.warn('Overwriting metadata file `{0}`, which is backed up at `{0}.bak`'.format(self._metadata_path))
            shutil.copyfile(self._metadata_path, self._metadata_path + '.bak')
        with open(self._metadata_path, 'w') as meta_stream:
            yaml.dump(meta, meta_stream)

    return meta
Example #27
Source File: hdfstore.py From PyTrendFollow with MIT License | 5 votes |
def read_contract(symbol, contract, provider):
    """ Read a single contract for a future instrument """
    if os.path.exists(fname(symbol, 'futures', provider)):
        data = pd.read_hdf(fname(symbol, 'futures', provider), 'quotes')
        return data.loc[int(contract), :]
    else:
        c = ['contract', 'date']
        return pd.DataFrame(columns=c).set_index(c)
Example #28
Source File: utils.py From ladder with MIT License | 5 votes |
def load_df(dirpath, filename, varname=None):
    varname = filename if varname is None else varname
    fn = os.path.join(dirpath, filename)
    return read_hdf(fn, varname)
Example #29
Source File: mixins.py From traffic with MIT License | 5 votes |
def from_file(
    cls: Type[T], filename: Union[Path, str], **kwargs
) -> Optional[T]:
    """Read data from various formats.

    This class method dispatches the loading of data in various format to
    the proper ``pandas.read_*`` method based on the extension of the filename.

    - .pkl and .pkl.gz dispatch to ``pandas.read_pickle``;
    - .parquet and .parquet.gz dispatch to ``pandas.read_parquet``;
    - .json and .json.gz dispatch to ``pandas.read_json``;
    - .csv and .csv.gz dispatch to ``pandas.read_csv``;
    - .h5 dispatch to ``pandas.read_hdf``.

    Other extensions return ``None``.

    Specific arguments may be passed to the underlying ``pandas.read_*``
    method with the kwargs argument.

    Example usage:

    >>> t = Traffic.from_file("data/sample_opensky.pkl")
    """
    path = Path(filename)
    if path.suffixes in [[".pkl"], [".pkl", ".gz"]]:
        return cls(pd.read_pickle(path, **kwargs))
    if path.suffixes in [[".parquet"], [".parquet", ".gz"]]:
        return cls(pd.read_parquet(path, **kwargs))
    if path.suffixes in [[".json"], [".json", ".gz"]]:
        return cls(pd.read_json(path, **kwargs))
    if path.suffixes in [[".csv"], [".csv", ".gz"]]:
        return cls(pd.read_csv(path, **kwargs))
    if path.suffixes == [".h5"]:
        return cls(pd.read_hdf(path, **kwargs))
    return None

# --- Special methods ---
Example #30
Source File: eval_baseline_methods.py From DCRNN with MIT License | 5 votes |
def main(args):
    traffic_reading_df = pd.read_hdf(args.traffic_reading_filename)
    eval_static(traffic_reading_df)
    eval_historical_average(traffic_reading_df, period=7 * 24 * 12)
    eval_var(traffic_reading_df, n_lags=3)