Python pandas.read_parquet() Examples
The following are 30 code examples of pandas.read_parquet().
Each example is taken from an open-source project; the source file, project, and license are noted above each example.
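As a quick orientation before the project examples, here is a minimal, self-contained sketch of the basic call pattern. The file name and column names are hypothetical, and a parquet engine (pyarrow or fastparquet) is assumed to be installed.

import pandas as pd

# Hypothetical data and file name, for illustration only.
pd.DataFrame({"points": [1.0, 2.5, 3.0], "label": ["a", "b", "c"]}).to_parquet("example.parquet")

# Read the whole file back, or only a subset of columns.
full = pd.read_parquet("example.parquet")
subset = pd.read_parquet("example.parquet", columns=["points"])
print(subset["points"].sum())  # 6.5

Most of the examples below follow this pattern, differing mainly in where the file path comes from and in the columns and engine arguments they pass.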
Example #1
Source File: validator_kfold.py From mljar-supervised with MIT License | 7 votes |
def get_split(self, k):
    train_index_file = os.path.join(
        self._results_path, "folds", f"fold_{k}_train_indices.npy"
    )
    validation_index_file = os.path.join(
        self._results_path, "folds", f"fold_{k}_validation_indices.npy"
    )

    train_index = np.load(train_index_file)
    validation_index = np.load(validation_index_file)

    X = pd.read_parquet(self._X_train_path)
    y = pd.read_parquet(self._y_train_path)
    y = y["target"]

    return (
        {"X": X.loc[train_index], "y": y.loc[train_index]},
        {"X": X.loc[validation_index], "y": y.loc[validation_index]},
    )
Example #2
Source File: test_dataframe.py From sdc with BSD 2-Clause "Simplified" License | 6 votes |
def test_sort_parallel(self):
    # create `kde.parquet` file
    ParquetGenerator.gen_kde_pq()

    # TODO: better parallel sort test
    def test_impl():
        df = pd.read_parquet('kde.parquet')
        df['A'] = df.points.astype(np.float64)
        df.sort_values('points', inplace=True)
        res = df.A.values
        return res

    hpat_func = self.jit(locals={'res:return': 'distributed'})(test_impl)

    save_min_samples = sdc.hiframes.sort.MIN_SAMPLES
    try:
        sdc.hiframes.sort.MIN_SAMPLES = 10
        res = hpat_func()
        self.assertTrue((np.diff(res) >= 0).all())
    finally:
        # restore global val
        sdc.hiframes.sort.MIN_SAMPLES = save_min_samples
Example #3
Source File: parquet.py From timeserio with MIT License | 6 votes |
def make_subgen(self, chunk):
    filename = chunk
    subgen = single_sequence.SequenceForecastBatchGenerator(
        df=pd.read_parquet(filename).reset_index(),
        batch_size=self.batch_size,
        sequence_length=self.sequence_length,
        id_column=self.id_column,
        sequence_columns=self.sequence_columns,
        sequence_prefix=self.sequence_prefix,
        last_step_columns=self.last_step_columns,
        last_step_prefix=self.last_step_prefix,
        forecast_steps_min=self.forecast_steps_min,
        forecast_steps_max=self.forecast_steps_max,
        batch_offset=self.batch_offset,
        batch_offset_period=self.batch_offset_period,
        dt_column=self.dt_column,
        start_time=self.start_time,
    )
    return subgen
Example #4
Source File: data.py From lkpy with MIT License | 6 votes |
def read_df_detect(path):
    """
    Read a Pandas data frame, auto-detecting the file format based on filename suffix.
    The following file types are supported:

    CSV
        File has suffix ``.csv``, read with :py:func:`pandas.read_csv`.
    Parquet
        File has suffix ``.parquet``, ``.parq``, or ``.pq``, read with
        :py:func:`pandas.read_parquet`.
    """
    import pandas as pd
    if not isinstance(path, pathlib.Path):
        path = pathlib.Path(path)

    if path.suffix == '.csv':
        return pd.read_csv(path)
    elif path.suffix in ('.parquet', '.parq', '.pq'):
        return pd.read_parquet(path)
Example #5
Source File: io.py From modin with Apache License 2.0 | 6 votes |
def read_parquet(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a Modin DataFrame.

    Modin only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Modin only supports pyarrow reader.
                This argument doesn't do anything for now.
        kwargs: Pass into parquet's read_pandas function.

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
    """
    ErrorMessage.default_to_pandas("`read_parquet`")
    return cls.from_pandas(pandas.read_parquet(path, engine, columns, **kwargs))
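The Modin docstring above mentions pyarrow's ParquetFile API, which the pyarrow engine builds on. As an aside (not part of the Modin source), a minimal pyarrow-only sketch with a hypothetical file name might look like this, assuming pyarrow is installed:

import pandas as pd
import pyarrow.parquet as pq

# Hypothetical file, created here so the sketch is self-contained.
pd.DataFrame({"points": [1.0, 2.5, 3.0]}).to_parquet("example.parquet", engine="pyarrow")

# Lower-level pyarrow route: open the file, read selected columns as an
# Arrow Table, then convert to a pandas DataFrame.
pf = pq.ParquetFile("example.parquet")
table = pf.read(columns=["points"], use_pandas_metadata=True)
df = table.to_pandas()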
Example #6
Source File: parsers.py From modin with Apache License 2.0 | 6 votes |
def parse(fname, **kwargs):
    num_splits = kwargs.pop("num_splits", None)
    columns = kwargs.get("columns", None)
    if num_splits is None:
        return pandas.read_parquet(fname, **kwargs)
    kwargs["use_pandas_metadata"] = True
    df = pandas.read_parquet(fname, **kwargs)
    if isinstance(df.index, pandas.RangeIndex):
        idx = len(df.index)
    else:
        idx = df.index
    columns = [c for c in columns if c not in df.index.names and c in df.columns]
    if columns is not None:
        df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [idx, df.dtypes]
Example #7
Source File: test_dataframe_spark_io.py From koalas with Apache License 2.0 | 6 votes |
def test_parquet_write(self):
    with self.temp_dir() as tmp:
        pdf = self.test_pdf
        expected = ks.DataFrame(pdf)

        # Write out partitioned by one column
        expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )

        # Write out partitioned by two columns
        expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )
Example #8
Source File: test_config_driven_df.py From dagster with Apache License 2.0 | 6 votes |
def test_dataframe_parquet_materialization():
    check_parquet_support()

    @solid(output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):
        return pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    @pipeline
    def return_df_pipeline():
        return_df()

    with get_temp_file_name() as filename:
        result = execute_pipeline(
            return_df_pipeline,
            {'solids': {'return_df': {'outputs': [{'result': {'parquet': {'path': filename}}}]}}},
        )

        assert result.success

        df = pd.read_parquet(filename)
        assert df.to_dict('list') == {'num1': [1, 3], 'num2': [2, 4]}
Example #9
Source File: automl.py From mljar-supervised with MIT License | 6 votes |
def _load_data_variables(self, X_train):
    if X_train.shape[1] == 0:
        X = pd.read_parquet(self._X_train_path)
        for c in X.columns:
            X_train.insert(loc=X_train.shape[1], column=c, value=X[c])

    os.remove(self._X_train_path)
    os.remove(self._y_train_path)
Example #10
Source File: validator_split.py From mljar-supervised with MIT License | 6 votes |
def get_split(self, k=0):
    X = pd.read_parquet(self._X_train_path)
    y = pd.read_parquet(self._y_train_path)
    y = y["target"]

    stratify = None
    if self.stratify:
        stratify = y
    if self.shuffle == False:
        stratify = None

    X_train, X_validation, y_train, y_validation = train_test_split(
        X,
        y,
        train_size=self.train_ratio,
        test_size=1.0 - self.train_ratio,
        shuffle=self.shuffle,
        stratify=stratify,
        random_state=self.random_seed,
    )

    return {"X": X_train, "y": y_train}, {"X": X_validation, "y": y_validation}
Example #11
Source File: test_schema.py From flytekit with Apache License 2.0 | 6 votes |
def test_datetime_coercion_explicitly():
    """
    Sanity check that we're using a version of pyarrow that allows us to truncate timestamps
    """
    dt = _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1, microsecond=1)
    values = [(dt,)]
    df = _pd.DataFrame.from_records(values, columns=['testname'])
    assert df['testname'][0] == dt

    with _utils.AutoDeletingTempDir('test') as tmpdir:
        tmpfile = tmpdir.get_named_tempfile('repro.parquet')
        df.to_parquet(tmpfile, coerce_timestamps='ms', allow_truncated_timestamps=True)
        df2 = _pd.read_parquet(tmpfile)

        dt2 = _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1)
        assert df2['testname'][0] == dt2
Example #12
Source File: invoker.py From anomalydetector with MIT License | 6 votes |
def read_as_dataframe(input_path: str):
    if os.path.isfile(input_path):
        if input_path.endswith(".csv"):
            return pd.read_csv(input_path)
        elif input_path.endswith(".parquet"):
            return pd.read_parquet(input_path)
    else:
        dir_path = pathlib.Path(input_path)
        csv_files = list(dir_path.glob("**/*.csv"))
        if csv_files:
            df_from_csv_files = (pd.read_csv(f) for f in csv_files)
            return pd.concat(df_from_csv_files, ignore_index=True)
        parquet_files = list(dir_path.glob("**/*.parquet"))
        if parquet_files:
            df_from_parquet_files = (pd.read_parquet(f) for f in parquet_files)
            return pd.concat(df_from_parquet_files, ignore_index=True)
    raise ValueError(f"Failed to read path: {input_path}")
Example #13
Source File: file_type.py From gordo with GNU Affero General Public License v3.0 | 6 votes |
def read_df(self, f: BinaryIO) -> pd.DataFrame:
    columns = self.time_series_columns.columns
    datetime_column = self.time_series_columns.datetime_column
    df = pd.read_parquet(f, engine="pyarrow", columns=columns).set_index(
        datetime_column
    )
    df.index = pd.to_datetime(df.index, utc=True)
    return df
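Example #13 passes an already-open binary file object rather than a path; pandas.read_parquet accepts file-like objects as well as paths. As an aside (not part of the gordo source), a minimal sketch of that pattern with a hypothetical file name:

import io
import pandas as pd

# Hypothetical file; the point is that read_parquet accepts a file-like
# object (here an in-memory buffer) in place of a path.
pd.DataFrame({"t": pd.date_range("2020-01-01", periods=3), "v": [1, 2, 3]}).to_parquet(
    "ts.parquet", engine="pyarrow"
)

with open("ts.parquet", "rb") as f:
    buf = io.BytesIO(f.read())

df = pd.read_parquet(buf, engine="pyarrow", columns=["t", "v"]).set_index("t")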
Example #14
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_partition(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, directory=True)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
Example #15
Source File: message_collector.py From aki with GNU Affero General Public License v3.0 | 5 votes |
def load(dt_: datetime) -> Optional[DataFrame]:
    filename = make_filename(dt_)
    if path.isfile(filename):
        return read_parquet(filename)
Example #16
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_hdfs():
    path = "modin/pandas/test/data/hdfs.parquet"
    pandas_df = pandas.read_parquet(path)
    modin_df = pd.read_parquet(path)
    df_equals(modin_df, pandas_df)
Example #17
Source File: core.py From tensorqtl with BSD 3-Clause "New" or "Revised" License | 5 votes |
def read_phenotype_bed(phenotype_bed):
    """Load phenotype BED file as phenotype and TSS DataFrames"""
    if phenotype_bed.endswith('.bed.gz'):
        phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, dtype={'#chr':str, '#Chr':str})
    elif phenotype_bed.endswith('.parquet'):
        phenotype_df = pd.read_parquet(phenotype_bed)
        phenotype_df.set_index(phenotype_df.columns[3], inplace=True)
    else:
        raise ValueError('Unsupported file type.')
    phenotype_df.rename(columns={i:i.lower().replace('#chr','chr') for i in phenotype_df.columns[:3]}, inplace=True)

    phenotype_pos_df = phenotype_df[['chr', 'end']].rename(columns={'end':'tss'})
    phenotype_df.drop(['chr', 'start', 'end'], axis=1, inplace=True)
    return phenotype_df, phenotype_pos_df
Example #18
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_partitioned_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
Example #19
Source File: io.py From mlcomp with Apache License 2.0 | 5 votes |
def read_pandas(file):
    if file.endswith('.csv'):
        df = pd.read_csv(file)
    elif file.endswith('.parquet'):
        df = pd.read_parquet(file)
    else:
        raise Exception('Unknown file type')
    return df
Example #20
Source File: eqtl_prepare_expression.py From gtex-pipeline with BSD 3-Clause "New" or "Revised" License | 5 votes |
def read_gct(gct_file, sample_ids=None, dtype=None):
    """
    Load GCT as DataFrame. The first two columns must be 'Name' and 'Description'.
    """
    if sample_ids is not None:
        sample_ids = ['Name']+list(sample_ids)

    if gct_file.endswith('.gct.gz') or gct_file.endswith('.gct'):
        if dtype is not None:
            with gzip.open(gct_file, 'rt') as gct:
                gct.readline()
                gct.readline()
                sample_ids = gct.readline().strip().split()
            dtypes = {i:dtype for i in sample_ids[2:]}
            dtypes['Name'] = str
            dtypes['Description'] = str
            df = pd.read_csv(gct_file, sep='\t', skiprows=2, usecols=sample_ids, index_col=0, dtype=dtypes)
        else:
            df = pd.read_csv(gct_file, sep='\t', skiprows=2, usecols=sample_ids, index_col=0)
    elif gct_file.endswith('.parquet'):
        df = pd.read_parquet(gct_file, columns=sample_ids)
    elif gct_file.endswith('.ft'):  # feather format
        df = feather.read_dataframe(gct_file, columns=sample_ids)
        df = df.set_index('Name')
    else:
        raise ValueError('Unsupported input format.')
    df.index.name = 'gene_id'
    if 'Description' in df.columns:
        df = df.drop('Description', axis=1)
    return df
Example #21
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    df_equals(modin_df, pandas_df)
Example #22
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pq_read_freevar_str1(self):
    kde_file2 = 'kde.parquet'

    def test_impl():
        df = pd.read_parquet(kde_file2)
        X = df['points']
        return X.sum()

    hpat_func = self.jit(test_impl)
    np.testing.assert_almost_equal(hpat_func(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
Example #23
Source File: parquet.py From timeserio with MIT License | 5 votes |
def make_subgen(self, chunk):
    filename = chunk
    subgen = single_row.RowBatchGenerator(
        df=pd.read_parquet(filename).reset_index(),
        batch_size=self.batch_size,
        columns=self.columns
    )
    return subgen
Example #24
Source File: test_groupby.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_crosstab_parallel1(self):
    def test_impl():
        df = pd.read_parquet("pivot2.pq")
        pt = pd.crosstab(df.A, df.C)
        res = pt.small.values.sum()
        return res

    hpat_func = self.jit(
        pivots={'pt': ['small', 'large']})(test_impl)
    self.assertEqual(hpat_func(), test_impl())
Example #25
Source File: test_groupby.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pivot_parallel(self):
    def test_impl():
        df = pd.read_parquet("pivot2.pq")
        pt = df.pivot_table(index='A', columns='C', values='D', aggfunc='sum')
        res = pt.small.values.sum()
        return res

    hpat_func = self.jit(
        pivots={'pt': ['small', 'large']})(test_impl)
    self.assertEqual(hpat_func(), test_impl())
Example #26
Source File: test_join.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_merge_asof_parallel1(self):
    def test_impl():
        df1 = pd.read_parquet('asof1.pq')
        df2 = pd.read_parquet('asof2.pq')
        df3 = pd.merge_asof(df1, df2, on='time')
        return (df3.A.sum(), df3.time.max(), df3.B.sum())

    hpat_func = self.jit(test_impl)
    self.assertEqual(hpat_func(), test_impl())
Example #27
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pq_spark_date(self):
    def test_impl():
        df = pd.read_parquet('sdf_dt.pq')
        return pd.DataFrame({'DT64': df.DT64, 'col2': df.DATE})

    hpat_func = self.jit(test_impl)
    pd.testing.assert_frame_equal(hpat_func(), test_impl())
Example #28
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pq_pandas_date(self):
    def test_impl():
        df = pd.read_parquet('pandas_dt.pq')
        return pd.DataFrame({'DT64': df.DT64, 'col2': df.DATE})

    hpat_func = self.jit(test_impl)
    pd.testing.assert_frame_equal(hpat_func(), test_impl())
Example #29
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pd_read_parquet(self):
    def test_impl():
        df = pd.read_parquet('kde.parquet')
        X = df['points']
        return X.sum()

    hpat_func = self.jit(test_impl)
    np.testing.assert_almost_equal(hpat_func(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
Example #30
Source File: brute_force_plotter.py From brute-force-plotter with MIT License | 5 votes |
def plot_numeric_numeric(input_file, col1, col2, path):
    df = pd.read_parquet(input_file, columns=[col1, col2])
    file_name = os.path.join(path, f"{col1}-{col2}-scatter-plot.png")
    scatter_plot(df, col1, col2, file_name=file_name)