Python Examples of pandas.read_feather

Source File: arrow.py From spectre with Apache License 2.0

6 votes

def __init__(self, path: str = None, keep_in_memory: bool = True) -> None:
        """Initialise the loader from the feather metadata stored next to ``path``.

        The metadata file is expected at ``path + '.meta'``. Its ``ohlcv``
        column values and the first two ``adjustments`` values are handed to
        the parent initialiser; the adjustments are replaced by ``None`` when
        their first entry is ``None``.

        Args:
            path: Base path of the dataset; ``'.meta'`` is appended to locate
                the metadata file.
            keep_in_memory: Stored on the instance as ``keep_in_memory``.

        Raises:
            FileNotFoundError: If the ``.meta`` file does not exist. The
                absolute path of the missing file is the error argument.
        """
        meta_path = path + '.meta'
        if not os.path.exists(meta_path):
            raise FileNotFoundError(os.path.abspath(meta_path))

        # pandas 0.22 has the fastest MultiIndex; on that version the
        # standalone ``feather`` package is used instead of ``pd.read_feather``
        on_pandas_022 = pd.__version__.startswith('0.22')
        if on_pandas_022:
            import feather
            meta = feather.read_dataframe(meta_path)
        else:
            meta = pd.read_feather(meta_path)

        ohlcv_cols = meta.ohlcv.values
        adjustment_cols = meta.adjustments.values[:2]
        # A leading None means no adjustments are passed to the parent
        if adjustment_cols[0] is None:
            adjustment_cols = None

        super().__init__(path, ohlcv_cols, adjustment_cols)
        self.keep_in_memory = keep_in_memory
        self._cache = None

Source File: from_kkbox.py From pycox with BSD 2-Clause "Simplified" License

6 votes

def _make_train_test_split(self, seed=1234):
        """Split ``covariates.feather`` by customer into train/test/val files.

        Seeds NumPy's global random state with ``seed``, holds out 25% of the
        unique ``msno`` values as the test set, then 10% of the remaining
        ``msno`` values as the validation set. The three frames are written to
        ``train.feather``, ``test.feather`` and ``val.feather`` under
        ``self._path_dir``.

        Args:
            seed: Value passed to ``np.random.seed`` before splitting.
        """
        from sklearn.model_selection import train_test_split
        np.random.seed(seed)
        data_dir = self._path_dir
        covariates = pd.read_feather(data_dir / 'covariates.feather')

        def train_test_split_customer(df, col_customer, test_size):
            # Split the unique customer ids, then pull each side's rows back
            # in with a right merge on the customer column.
            customers = df[[col_customer]].drop_duplicates()
            kept_ids, held_out_ids = train_test_split(customers, test_size=test_size)
            kept_rows = df.merge(kept_ids, how='right', on=col_customer)
            held_out_rows = df.merge(held_out_ids, how='right', on=col_customer)
            return kept_rows, held_out_rows

        train, test = train_test_split_customer(covariates, 'msno', 0.25)
        train, val = train_test_split_customer(train, 'msno', 0.1)

        # Sanity check: no ``msno`` value may appear in two different splits.
        for left, right in ((train, test), (train, val), (test, val)):
            assert left.merge(right, how='inner', on='msno').shape[0] == 0

        for file_name, split in (('train.feather', train),
                                 ('test.feather', test),
                                 ('val.feather', val)):
            split.to_feather(data_dir / file_name)

Source File: run.py From talkingdata-adtracking-fraud-detection with MIT License

6 votes

def load_dataset(paths, index=None) -> pd.DataFrame:
    """Read several feather files and concatenate them column-wise.

    Args:
        paths: Non-empty sequence of feather file paths.
        index: Optional row selector; when given, each loaded frame is
            reduced with ``.loc[index]`` before being kept.

    Returns:
        The loaded frames concatenated along ``axis=1``.

    Raises:
        AssertionError: If ``paths`` is empty, or if the loaded frames do not
            all share the same index.
    """
    assert len(paths) > 0

    feature_datasets = []
    for path in paths:
        frame = pd.read_feather(path)
        if index is not None:
            frame = frame.loc[index]
        feature_datasets.append(frame)
        gc.collect()

    # check if all of feature dataset share the same index
    reference_index = feature_datasets[0].index
    for other in feature_datasets[1:]:
        pandas.testing.assert_index_equal(reference_index, other.index)

    return pd.concat(feature_datasets, axis=1)

Source File: feature_store.py From nyaggle with MIT License

6 votes

def load_feature(feature_name: Union[int, str], directory: str = './features/',
                 ignore_columns: List[str] = None) -> pd.DataFrame:
    """
    Load feature as pandas DataFrame.

    The feature is read from ``<directory>/<feature_name>.f`` in feather format.

    Args:
        feature_name:
            The name of the feature (used in ``save_feature``).
        directory:
            The directory where the feature is stored.
        ignore_columns:
            The list of columns that will be dropped from the loaded dataframe.
            Names that are not present in the dataframe are skipped.
    Returns:
        The feature dataframe
    """
    feature_path = os.path.join(directory, str(feature_name) + '.f')
    feature_df = pd.read_feather(feature_path)

    if not ignore_columns:
        return feature_df

    # Only drop the columns that actually exist in the loaded frame
    present = [column for column in ignore_columns if column in feature_df.columns]
    return feature_df.drop(present, axis=1)

Source File: atlas3.py From ssbio with MIT License

6 votes

def get_proteome_percentages(counts_df, outpath, force_rerun=False):
    """Convert per-strain counts into fractions of their matching totals.

    When ``ssbio.utils.force_rerun`` says the output must be (re)built, every
    index label ending in ``'total'`` is treated as a denominator. Each other
    label that starts with the same prefix (the total label minus its last
    ``_``-separated piece) is divided by that total and stored under the label
    with ``'count'`` replaced by ``'%'``. The result is written to ``outpath``
    as feather. Otherwise the previously written feather file is loaded.

    Args:
        counts_df: DataFrame with one column per strain and count labels as
            the index.
        outpath: Feather file to write to, or to read from when not re-running.
        force_rerun: Forwarded as ``flag`` to ``ssbio.utils.force_rerun``.

    Returns:
        DataFrame of the computed ratios, with an unnamed index.
    """
    if not ssbio.utils.force_rerun(flag=force_rerun, outfile=outpath):
        big_strain_percents_df = pd.read_feather(outpath).set_index('index')
    else:
        big_strain_percents_df = pd.DataFrame(columns=counts_df.columns)
        for strain in counts_df.columns:
            strain_counts = counts_df[strain]
            totals = [label for label in strain_counts.index if label.endswith('total')]
            for total_label in totals:
                prefix = total_label.rsplit('_', 1)[0]
                members = [
                    label for label in strain_counts.index
                    if label.startswith(prefix) and label not in totals
                ]
                for member in members:
                    ratio = strain_counts[member] / strain_counts[total_label]
                    big_strain_percents_df.at[member.replace('count', '%'), strain] = ratio

        # The index is moved into a column before writing and restored on load
        big_strain_percents_df.astype(float).reset_index().to_feather(outpath)

    big_strain_percents_df.index.name = None
    return big_strain_percents_df

Source File: parsers.py From modin with Apache License 2.0

5 votes

def parse(fname, **kwargs):
        """Read a feather file, optionally splitting the result.

        ``num_splits`` is popped from ``kwargs``; the remaining keyword
        arguments are forwarded to the reader.

        Returns:
            ``pandas.read_feather(fname, **kwargs)`` when ``num_splits`` is
            missing or ``None``. Otherwise the output of
            ``_split_result_for_readers(0, num_splits, df)`` (presumably a
            list) followed by the index length and the dtypes of the frame.
        """
        from pyarrow import feather

        num_splits = kwargs.pop("num_splits", None)
        if num_splits is not None:
            df = feather.read_feather(fname, **kwargs)
            partitions = _split_result_for_readers(0, num_splits, df)
            # Append the length of the index here to build it externally
            return partitions + [len(df.index), df.dtypes]
        return pandas.read_feather(fname, **kwargs)

Source File: test_feather.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_path_localpath(self):
        """Round-trip a frame through ``tm.round_trip_localpath`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_localpath(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: arrow.py From spectre with Apache License 2.0

5 votes

def _load(self) -> pd.DataFrame:
        """Return the dataset indexed by ``['date', 'asset']``.

        A previously cached frame is returned as is. Otherwise the feather
        file at ``self._path`` is read and indexed, and the result is cached
        on the instance when ``self.keep_in_memory`` is truthy.
        """
        cached = self._cache
        if cached is not None:
            return cached

        # Same reader selection as on load of the metadata: the standalone
        # ``feather`` package is used only on pandas 0.22.
        if pd.__version__.startswith('0.22'):
            import feather
            reader = feather.read_dataframe
        else:
            reader = pd.read_feather
        df = reader(self._path)
        df.set_index(['date', 'asset'], inplace=True)

        if self.keep_in_memory:
            self._cache = df
        return df

Source File: protocols.py From bionic with Apache License 2.0

5 votes

def read(self, path):
        """Read the feather file at ``path`` and return the resulting frame.

        ``path`` must provide ``open("rb")`` returning a context-managed
        binary file object (e.g. a ``pathlib.Path`` - TODO confirm caller).
        """
        with path.open("rb") as file_:
            frame = pd.read_feather(file_)
        return frame

Source File: test_feather.py From elasticintel with GNU General Public License v3.0

5 votes

def check_round_trip(self, df, **kwargs):
        """Write ``df`` to a temporary feather file and check it reads back equal.

        Args:
            df: Frame to round-trip.
            **kwargs: Forwarded to ``read_feather``.
        """
        with ensure_clean() as tmp_path:
            to_feather(df, tmp_path)
            round_tripped = read_feather(tmp_path, **kwargs)
            assert_frame_equal(round_tripped, df)

Source File: test_feather.py From elasticintel with GNU General Public License v3.0

5 votes

def test_path_pathlib(self):
        """Round-trip a frame through ``tm.round_trip_pathlib`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_pathlib(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: test_feather.py From elasticintel with GNU General Public License v3.0

5 votes

def test_path_localpath(self):
        """Round-trip a frame through ``tm.round_trip_localpath`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_localpath(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: io.py From modin with Apache License 2.0

5 votes

def read_feather(cls, path, columns=None, use_threads=True):
        """Read a feather file with pandas and wrap it via ``cls.from_pandas``.

        ``ErrorMessage.default_to_pandas`` is called first with the name of
        this operation.

        Args:
            path: Forwarded to ``pandas.read_feather``.
            columns: Forwarded to ``pandas.read_feather``.
            use_threads: Forwarded to ``pandas.read_feather``.

        Returns:
            Whatever ``cls.from_pandas`` returns for the loaded frame.
        """
        ErrorMessage.default_to_pandas("`read_feather`")
        from_pandas = cls.from_pandas
        pandas_frame = pandas.read_feather(path, columns=columns, use_threads=use_threads)
        return from_pandas(pandas_frame)

Source File: test_feather.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def check_round_trip(self, df, expected=None, **kwargs):
        """Write ``df`` to a temporary feather file and compare what reads back.

        Args:
            df: Frame to write.
            expected: Frame the read result must equal; defaults to ``df``.
            **kwargs: Forwarded to ``read_feather``.
        """
        reference = df if expected is None else expected

        with ensure_clean() as tmp_path:
            to_feather(df, tmp_path)
            round_tripped = read_feather(tmp_path, **kwargs)
            assert_frame_equal(round_tripped, reference)

Source File: test_io.py From modin with Apache License 2.0

5 votes

def test_from_feather():
    """Check that both feather readers produce equal frames for the same file.

    The fixture file is created with ``setup_feather_file`` and read through
    ``pandas.read_feather`` and ``pd.read_feather`` (``pd`` is presumably the
    modin namespace here - TODO confirm against the module imports). The two
    results are compared with ``df_equals``.
    """
    setup_feather_file(SMALL_ROW_SIZE)
    try:
        pandas_df = pandas.read_feather(TEST_FEATHER_FILENAME)
        modin_df = pd.read_feather(TEST_FEATHER_FILENAME)

        df_equals(modin_df, pandas_df)
    finally:
        # Remove the fixture file even when reading or the comparison fails,
        # so a failing run does not leave it behind for later tests.
        teardown_feather_file()

Source File: Utils.py From Kaggle-Competition-Favorita with MIT License

5 votes

def load_data():
    """Load the sales CSVs and pivot them into wide, per-date frames.

    Reads ``train.csv``, ``test.csv``, ``items.csv`` and ``stores.csv`` from
    the current working directory. Training rows dated before 2016-01-01 are
    discarded. ``unit_sales`` is transformed with ``log1p`` while parsing;
    non-positive values become ``0``.

    Returns:
        A tuple ``(df_2017, promo_2017, items, stores)``:

        * ``df_2017``: transformed unit sales, indexed by
          ``(store_nbr, item_nbr)`` with one column per date, missing
          entries filled with ``0``.
        * ``promo_2017``: ``onpromotion`` flags for the training dates
          followed by the test dates, on the same row index, missing
          entries filled with ``False``.
        * ``items``: ``items.csv`` indexed by ``item_nbr``.
        * ``stores``: ``stores.csv`` indexed by ``store_nbr``.
    """
    # df_train = pd.read_feather('train_after1608_raw')
    df_train = pd.read_csv('train.csv', usecols=[1, 2, 3, 4, 5], dtype={'onpromotion': bool},
                           converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
                           parse_dates=["date"])
    df_test = pd.read_csv("test.csv", usecols=[0, 1, 2, 3, 4], dtype={'onpromotion': bool},
                          parse_dates=["date"]).set_index(['store_nbr', 'item_nbr', 'date'])

    # subset data
    # pd.Timestamp is used because the ``pd.datetime`` alias is no longer
    # available in current pandas releases.
    df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2016, 1, 1)]

    # promo
    promo_2017_train = df_2017.set_index(
        ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
            level=-1).fillna(False)
    promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
    promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
    promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
    # Align the test promotions with the (store, item) pairs seen in training
    promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
    promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
    del promo_2017_test, promo_2017_train

    df_2017 = df_2017.set_index(
        ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
            level=-1).fillna(0)
    df_2017.columns = df_2017.columns.get_level_values(1)

    # items
    items = pd.read_csv("items.csv").set_index("item_nbr")
    stores = pd.read_csv("stores.csv").set_index("store_nbr")
    # items = items.reindex(df_2017.index.get_level_values(1))

    return df_2017, promo_2017, items, stores

Source File: Utils.py From Kaggle-Competition-Favorita with MIT License

5 votes

def load_unstack(filename):
    """Load previously saved wide sales and promotion frames.

    Reads the feather files ``df_<filename>_raw`` and ``promo_<filename>_raw``
    plus ``items.csv`` and ``stores.csv`` from the current working directory.
    Both feather frames are indexed by ``(store_nbr, item_nbr)`` and their
    remaining column labels are converted to datetimes.

    Args:
        filename: Middle part of the two feather file names.

    Returns:
        A tuple ``(df_2017, promo_2017, items, stores)``.
    """
    def _read_wide(feather_name):
        # Restore the row index and turn the column labels back into dates
        wide = pd.read_feather(feather_name).set_index(['store_nbr','item_nbr'])
        wide.columns = pd.to_datetime(wide.columns)
        return wide

    sales = _read_wide('df_' + filename + '_raw')
    promo = _read_wide('promo_' + filename + '_raw')
    items = pd.read_csv("items.csv").set_index("item_nbr")
    stores = pd.read_csv("stores.csv").set_index("store_nbr")

    return sales, promo, items, stores

# Create validation and test data

Source File: _dataset_loader.py From pycox with BSD 2-Clause "Simplified" License

5 votes

def read_df(self):
        """Return the dataset as a DataFrame, downloading it first if needed.

        When ``self.path`` does not exist, ``self._download()`` is called and
        progress messages are printed. The feather file at ``self.path`` is
        then read and passed through ``self._label_cols_at_end``.
        """
        already_local = self.path.exists()
        if not already_local:
            print(f"Dataset '{self.name}' not locally available. Downloading...")
            self._download()
            print(f"Done")
        raw = pd.read_feather(self.path)
        return self._label_cols_at_end(raw)

Source File: dataset_view.py From QCFractal with BSD 3-Clause "New" or "Revised" License

5 votes

def _deserialize(data: bytes, msgpacked_cols: List[str]) -> pd.DataFrame:
        """
        Data are returned as feather-packed pandas DataFrames.
        Due to limitations in pyarrow, some objects are msgpacked inside the DataFrame.

        Args:
            data: Feather-encoded bytes of the frame.
            msgpacked_cols: Columns whose elements are each decoded with
                ``deserialize(element, "msgpack-ext")``.

        Returns:
            The decoded frame, indexed by its ``"index"`` column when present.
        """
        import pyarrow

        def _unpack(element):
            return deserialize(element, "msgpack-ext")

        frame = pd.read_feather(pyarrow.BufferReader(data))
        for column in msgpacked_cols:
            frame[column] = frame[column].apply(_unpack)

        # pandas.to_feather does not support indexes, so we have to send
        # indexless frames over the wire, and set the index here.
        if "index" in frame.columns:
            frame.set_index("index", inplace=True)
        return frame

Source File: test_feather.py From twitter-stock-recommendation with MIT License

5 votes

def check_round_trip(self, df, **kwargs):
        """Write ``df`` to a temporary feather file and check it reads back equal.

        Args:
            df: Frame to round-trip.
            **kwargs: Forwarded to ``read_feather``.
        """
        with ensure_clean() as tmp_path:
            to_feather(df, tmp_path)

            # Only the read happens inside ``catch_warnings(record=True)``
            # (presumably ``warnings.catch_warnings`` - confirm the import).
            with catch_warnings(record=True):
                round_tripped = read_feather(tmp_path, **kwargs)
            assert_frame_equal(round_tripped, df)

Source File: test_feather.py From twitter-stock-recommendation with MIT License

5 votes

def test_path_pathlib(self):
        """Round-trip a frame through ``tm.round_trip_pathlib`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_pathlib(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: test_feather.py From twitter-stock-recommendation with MIT License

5 votes

def test_path_localpath(self):
        """Round-trip a frame through ``tm.round_trip_localpath`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_localpath(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: test_pandas.py From docker-python with Apache License 2.0

5 votes

def test_read_feather(self):
        """Read the bundled feather fixture and check its total element count.

        ``DataFrame.size`` of the loaded frame is expected to be 10.
        """
        fixture_path = "/input/tests/data/feather-0_3_1.feather"
        frame = pd.read_feather(fixture_path)

        self.assertEqual(10, frame.size)

Source File: test_feather.py From recruit with Apache License 2.0

5 votes

def test_path_pathlib(self):
        """Round-trip a frame through ``tm.round_trip_pathlib`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_pathlib(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: test_feather.py From recruit with Apache License 2.0

5 votes

def test_path_localpath(self):
        """Round-trip a frame through ``tm.round_trip_localpath`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_localpath(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: base.py From ml-competition-template-titanic with MIT License

5 votes

def load(self):
        """Read the train and test feather files onto the instance.

        ``self.train_path`` and ``self.test_path`` are converted to ``str``
        before reading; the frames are stored as ``self.train`` and
        ``self.test``.
        """
        train_file = str(self.train_path)
        self.train = pd.read_feather(train_file)

        test_file = str(self.test_path)
        self.test = pd.read_feather(test_file)

Source File: __init__.py From ml-competition-template-titanic with MIT License

5 votes

def load_datasets(feats):
    """Load and column-concatenate the train and test frames of each feature.

    For every name in ``feats`` the files ``features/<name>_train.feather``
    and ``features/<name>_test.feather`` are read.

    Args:
        feats: Feature names used to build the file paths.

    Returns:
        A tuple ``(X_train, X_test)`` of the concatenated frames.
    """
    train_frames = []
    for name in feats:
        train_frames.append(pd.read_feather(f'features/{name}_train.feather'))
    X_train = pd.concat(train_frames, axis=1, sort=False)

    test_frames = []
    for name in feats:
        test_frames.append(pd.read_feather(f'features/{name}_test.feather'))
    X_test = pd.concat(test_frames, axis=1, sort=False)

    return X_train, X_test

Source File: test_feather.py From vnpy_crypto with MIT License

5 votes

def check_round_trip(self, df, **kwargs):
        """Write ``df`` to a temporary feather file and check it reads back equal.

        Args:
            df: Frame to round-trip.
            **kwargs: Forwarded to ``read_feather``.
        """
        with ensure_clean() as tmp_path:
            to_feather(df, tmp_path)

            # Only the read happens inside ``catch_warnings(record=True)``
            # (presumably ``warnings.catch_warnings`` - confirm the import).
            with catch_warnings(record=True):
                round_tripped = read_feather(tmp_path, **kwargs)
            assert_frame_equal(round_tripped, df)

Source File: test_feather.py From vnpy_crypto with MIT License

5 votes

def test_path_pathlib(self):
        """Round-trip a frame through ``tm.round_trip_pathlib`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_pathlib(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Source File: test_feather.py From vnpy_crypto with MIT License

5 votes

def test_path_localpath(self):
        """Round-trip a frame through ``tm.round_trip_localpath`` and compare.

        The frame is written with ``to_feather`` and read back with
        ``pd.read_feather``; the result must equal the original.
        """
        expected = tm.makeDataFrame().reset_index()
        round_tripped = tm.round_trip_localpath(expected.to_feather, pd.read_feather)
        tm.assert_frame_equal(expected, round_tripped)

Python pandas.read_feather() Examples