Python pandas.read_parquet() Examples
The following are 30 code examples of pandas.read_parquet().
Each example is taken from an open-source project; the source file, project, and license are noted above each example.
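As a quick orientation before the project examples, here is a minimal, self-contained sketch of the basic call pattern. The file name and column names are hypothetical, and a parquet engine (pyarrow or fastparquet) is assumed to be installed.

import pandas as pd

# Hypothetical data and file name, for illustration only.
pd.DataFrame({"points": [1.0, 2.5, 3.0], "label": ["a", "b", "c"]}).to_parquet("example.parquet")

# Read the whole file back, or only a subset of columns.
full = pd.read_parquet("example.parquet")
subset = pd.read_parquet("example.parquet", columns=["points"])
print(subset["points"].sum())  # 6.5

Most of the examples below follow this pattern, differing mainly in where the file path comes from and in the columns and engine arguments they pass.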
Example #1
Source File: validator_kfold.py From mljar-supervised with MIT License | 7 votes |
def get_split(self, k):
    train_index_file = os.path.join(
        self._results_path, "folds", f"fold_{k}_train_indices.npy"
    )
    validation_index_file = os.path.join(
        self._results_path, "folds", f"fold_{k}_validation_indices.npy"
    )

    train_index = np.load(train_index_file)
    validation_index = np.load(validation_index_file)

    X = pd.read_parquet(self._X_train_path)
    y = pd.read_parquet(self._y_train_path)
    y = y["target"]

    return (
        {"X": X.loc[train_index], "y": y.loc[train_index]},
        {"X": X.loc[validation_index], "y": y.loc[validation_index]},
    )
Example #2
Source File: test_dataframe.py From sdc with BSD 2-Clause "Simplified" License | 6 votes |
def test_sort_parallel(self):
    # create `kde.parquet` file
    ParquetGenerator.gen_kde_pq()

    # TODO: better parallel sort test
    def test_impl():
        df = pd.read_parquet('kde.parquet')
        df['A'] = df.points.astype(np.float64)
        df.sort_values('points', inplace=True)
        res = df.A.values
        return res

    hpat_func = self.jit(locals={'res:return': 'distributed'})(test_impl)

    save_min_samples = sdc.hiframes.sort.MIN_SAMPLES
    try:
        sdc.hiframes.sort.MIN_SAMPLES = 10
        res = hpat_func()
        self.assertTrue((np.diff(res) >= 0).all())
    finally:
        # restore global val
        sdc.hiframes.sort.MIN_SAMPLES = save_min_samples
Example #3
Source File: parquet.py From timeserio with MIT License | 6 votes |
def make_subgen(self, chunk):
    filename = chunk
    subgen = single_sequence.SequenceForecastBatchGenerator(
        df=pd.read_parquet(filename).reset_index(),
        batch_size=self.batch_size,
        sequence_length=self.sequence_length,
        id_column=self.id_column,
        sequence_columns=self.sequence_columns,
        sequence_prefix=self.sequence_prefix,
        last_step_columns=self.last_step_columns,
        last_step_prefix=self.last_step_prefix,
        forecast_steps_min=self.forecast_steps_min,
        forecast_steps_max=self.forecast_steps_max,
        batch_offset=self.batch_offset,
        batch_offset_period=self.batch_offset_period,
        dt_column=self.dt_column,
        start_time=self.start_time,
    )
    return subgen
Example #4
Source File: data.py From lkpy with MIT License | 6 votes |
def read_df_detect(path):
    """
    Read a Pandas data frame, auto-detecting the file format based on filename suffix.
    The following file types are supported:

    CSV
        File has suffix ``.csv``, read with :py:func:`pandas.read_csv`.
    Parquet
        File has suffix ``.parquet``, ``.parq``, or ``.pq``, read with
        :py:func:`pandas.read_parquet`.
    """
    import pandas as pd
    if not isinstance(path, pathlib.Path):
        path = pathlib.Path(path)

    if path.suffix == '.csv':
        return pd.read_csv(path)
    elif path.suffix in ('.parquet', '.parq', '.pq'):
        return pd.read_parquet(path)
Example #5
Source File: io.py From modin with Apache License 2.0 | 6 votes |
def read_parquet(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a Modin DataFrame.

    Modin only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Modin only supports pyarrow reader.
                This argument doesn't do anything for now.
        kwargs: Pass into parquet's read_pandas function.

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
    """
    ErrorMessage.default_to_pandas("`read_parquet`")
    return cls.from_pandas(pandas.read_parquet(path, engine, columns, **kwargs))
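The Modin docstring above mentions pyarrow's ParquetFile API, which the pyarrow engine builds on. As an aside (not part of the Modin source), a minimal pyarrow-only sketch with a hypothetical file name might look like this, assuming pyarrow is installed:

import pandas as pd
import pyarrow.parquet as pq

# Hypothetical file, created here so the sketch is self-contained.
pd.DataFrame({"points": [1.0, 2.5, 3.0]}).to_parquet("example.parquet", engine="pyarrow")

# Lower-level pyarrow route: open the file, read selected columns as an
# Arrow Table, then convert to a pandas DataFrame.
pf = pq.ParquetFile("example.parquet")
table = pf.read(columns=["points"], use_pandas_metadata=True)
df = table.to_pandas()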
Example #6
Source File: parsers.py From modin with Apache License 2.0 | 6 votes |
def parse(fname, **kwargs):
    num_splits = kwargs.pop("num_splits", None)
    columns = kwargs.get("columns", None)
    if num_splits is None:
        return pandas.read_parquet(fname, **kwargs)
    kwargs["use_pandas_metadata"] = True
    df = pandas.read_parquet(fname, **kwargs)
    if isinstance(df.index, pandas.RangeIndex):
        idx = len(df.index)
    else:
        idx = df.index
    columns = [c for c in columns if c not in df.index.names and c in df.columns]
    if columns is not None:
        df = df[columns]
    # Append the length of the index here to build it externally
    return _split_result_for_readers(0, num_splits, df) + [idx, df.dtypes]
Example #7
Source File: test_dataframe_spark_io.py From koalas with Apache License 2.0 | 6 votes |
def test_parquet_write(self):
    with self.temp_dir() as tmp:
        pdf = self.test_pdf
        expected = ks.DataFrame(pdf)

        # Write out partitioned by one column
        expected.to_parquet(tmp, mode="overwrite", partition_cols="i32")
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )

        # Write out partitioned by two columns
        expected.to_parquet(tmp, mode="overwrite", partition_cols=["i32", "bhello"])
        # Reset column order, as once the data is written out, Spark rearranges partition
        # columns to appear first.
        actual = ks.read_parquet(tmp)[self.test_column_order]
        self.assert_eq(
            actual.sort_values(by="f").to_spark().toPandas(),
            expected.sort_values(by="f").to_spark().toPandas(),
        )
Example #8
Source File: test_config_driven_df.py From dagster with Apache License 2.0 | 6 votes |
def test_dataframe_parquet_materialization():
    check_parquet_support()

    @solid(output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):
        return pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    @pipeline
    def return_df_pipeline():
        return_df()

    with get_temp_file_name() as filename:
        result = execute_pipeline(
            return_df_pipeline,
            {'solids': {'return_df': {'outputs': [{'result': {'parquet': {'path': filename}}}]}}},
        )

        assert result.success

        df = pd.read_parquet(filename)
        assert df.to_dict('list') == {'num1': [1, 3], 'num2': [2, 4]}
Example #9
Source File: automl.py From mljar-supervised with MIT License | 6 votes |
def _load_data_variables(self, X_train):
    if X_train.shape[1] == 0:
        X = pd.read_parquet(self._X_train_path)
        for c in X.columns:
            X_train.insert(loc=X_train.shape[1], column=c, value=X[c])

    os.remove(self._X_train_path)
    os.remove(self._y_train_path)
Example #10
Source File: validator_split.py From mljar-supervised with MIT License | 6 votes |
def get_split(self, k=0):
    X = pd.read_parquet(self._X_train_path)
    y = pd.read_parquet(self._y_train_path)
    y = y["target"]

    stratify = None
    if self.stratify:
        stratify = y
    if self.shuffle == False:
        stratify = None

    X_train, X_validation, y_train, y_validation = train_test_split(
        X,
        y,
        train_size=self.train_ratio,
        test_size=1.0 - self.train_ratio,
        shuffle=self.shuffle,
        stratify=stratify,
        random_state=self.random_seed,
    )

    return {"X": X_train, "y": y_train}, {"X": X_validation, "y": y_validation}
Example #11
Source File: test_schema.py From flytekit with Apache License 2.0 | 6 votes |
def test_datetime_coercion_explicitly():
    """
    Sanity check that we're using a version of pyarrow that allows us to truncate timestamps
    """
    dt = _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1, microsecond=1)
    values = [(dt,)]
    df = _pd.DataFrame.from_records(values, columns=['testname'])
    assert df['testname'][0] == dt

    with _utils.AutoDeletingTempDir('test') as tmpdir:
        tmpfile = tmpdir.get_named_tempfile('repro.parquet')
        df.to_parquet(tmpfile, coerce_timestamps='ms', allow_truncated_timestamps=True)
        df2 = _pd.read_parquet(tmpfile)

        dt2 = _datetime.datetime(day=1, month=1, year=2017, hour=1, minute=1, second=1)
        assert df2['testname'][0] == dt2
Example #12
Source File: invoker.py From anomalydetector with MIT License | 6 votes |
def read_as_dataframe(input_path: str):
    if os.path.isfile(input_path):
        if input_path.endswith(".csv"):
            return pd.read_csv(input_path)
        elif input_path.endswith(".parquet"):
            return pd.read_parquet(input_path)
    else:
        dir_path = pathlib.Path(input_path)
        csv_files = list(dir_path.glob("**/*.csv"))
        if csv_files:
            df_from_csv_files = (pd.read_csv(f) for f in csv_files)
            return pd.concat(df_from_csv_files, ignore_index=True)
        parquet_files = list(dir_path.glob("**/*.parquet"))
        if parquet_files:
            df_from_parquet_files = (pd.read_parquet(f) for f in parquet_files)
            return pd.concat(df_from_parquet_files, ignore_index=True)
    raise ValueError(f"Failed to read path: {input_path}")
Example #13
Source File: file_type.py From gordo with GNU Affero General Public License v3.0 | 6 votes |
def read_df(self, f: BinaryIO) -> pd.DataFrame:
    columns = self.time_series_columns.columns
    datetime_column = self.time_series_columns.datetime_column
    df = pd.read_parquet(f, engine="pyarrow", columns=columns).set_index(
        datetime_column
    )
    df.index = pd.to_datetime(df.index, utc=True)
    return df
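Example #13 passes an already-open binary file object rather than a path; pandas.read_parquet accepts file-like objects as well as paths. As an aside (not part of the gordo source), a minimal sketch of that pattern with a hypothetical file name:

import io
import pandas as pd

# Hypothetical file; the point is that read_parquet accepts a file-like
# object (here an in-memory buffer) in place of a path.
pd.DataFrame({"t": pd.date_range("2020-01-01", periods=3), "v": [1, 2, 3]}).to_parquet(
    "ts.parquet", engine="pyarrow"
)

with open("ts.parquet", "rb") as f:
    buf = io.BytesIO(f.read())

df = pd.read_parquet(buf, engine="pyarrow", columns=["t", "v"]).set_index("t")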
Example #14
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_partition(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, directory=True)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
Example #15
Source File: message_collector.py From aki with GNU Affero General Public License v3.0 | 5 votes |
def load(dt_: datetime) -> Optional[DataFrame]:
    filename = make_filename(dt_)
    if path.isfile(filename):
        return read_parquet(filename)
Example #16
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_hdfs():
    path = "modin/pandas/test/data/hdfs.parquet"
    pandas_df = pandas.read_parquet(path)
    modin_df = pd.read_parquet(path)
    df_equals(modin_df, pandas_df)
Example #17
Source File: core.py From tensorqtl with BSD 3-Clause "New" or "Revised" License | 5 votes |
def read_phenotype_bed(phenotype_bed):
    """Load phenotype BED file as phenotype and TSS DataFrames"""
    if phenotype_bed.endswith('.bed.gz'):
        phenotype_df = pd.read_csv(phenotype_bed, sep='\t', index_col=3, dtype={'#chr':str, '#Chr':str})
    elif phenotype_bed.endswith('.parquet'):
        phenotype_df = pd.read_parquet(phenotype_bed)
        phenotype_df.set_index(phenotype_df.columns[3], inplace=True)
    else:
        raise ValueError('Unsupported file type.')
    phenotype_df.rename(columns={i:i.lower().replace('#chr','chr') for i in phenotype_df.columns[:3]}, inplace=True)

    phenotype_pos_df = phenotype_df[['chr', 'end']].rename(columns={'end':'tss'})
    phenotype_df.drop(['chr', 'start', 'end'], axis=1, inplace=True)
    return phenotype_df, phenotype_pos_df
Example #18
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_partitioned_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
Example #19
Source File: io.py From mlcomp with Apache License 2.0 | 5 votes |
def read_pandas(file):
    if file.endswith('.csv'):
        df = pd.read_csv(file)
    elif file.endswith('.parquet'):
        df = pd.read_parquet(file)
    else:
        raise Exception('Unknown file type')
    return df
Example #20
Source File: eqtl_prepare_expression.py From gtex-pipeline with BSD 3-Clause "New" or "Revised" License | 5 votes |
def read_gct(gct_file, sample_ids=None, dtype=None):
    """
    Load GCT as DataFrame. The first two columns must be 'Name' and 'Description'.
    """
    if sample_ids is not None:
        sample_ids = ['Name']+list(sample_ids)

    if gct_file.endswith('.gct.gz') or gct_file.endswith('.gct'):
        if dtype is not None:
            with gzip.open(gct_file, 'rt') as gct:
                gct.readline()
                gct.readline()
                sample_ids = gct.readline().strip().split()
            dtypes = {i:dtype for i in sample_ids[2:]}
            dtypes['Name'] = str
            dtypes['Description'] = str
            df = pd.read_csv(gct_file, sep='\t', skiprows=2, usecols=sample_ids, index_col=0, dtype=dtypes)
        else:
            df = pd.read_csv(gct_file, sep='\t', skiprows=2, usecols=sample_ids, index_col=0)
    elif gct_file.endswith('.parquet'):
        df = pd.read_parquet(gct_file, columns=sample_ids)
    elif gct_file.endswith('.ft'):  # feather format
        df = feather.read_dataframe(gct_file, columns=sample_ids)
        df = df.set_index('Name')
    else:
        raise ValueError('Unsupported input format.')
    df.index.name = 'gene_id'
    if 'Description' in df.columns:
        df = df.drop('Description', axis=1)
    return df
Example #21
Source File: test_io.py From modin with Apache License 2.0 | 5 votes |
def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    df_equals(modin_df, pandas_df)
Example #22
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pq_read_freevar_str1(self):
    kde_file2 = 'kde.parquet'

    def test_impl():
        df = pd.read_parquet(kde_file2)
        X = df['points']
        return X.sum()

    hpat_func = self.jit(test_impl)
    np.testing.assert_almost_equal(hpat_func(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
Example #23
Source File: parquet.py From timeserio with MIT License | 5 votes |
def make_subgen(self, chunk):
    filename = chunk
    subgen = single_row.RowBatchGenerator(
        df=pd.read_parquet(filename).reset_index(),
        batch_size=self.batch_size,
        columns=self.columns
    )
    return subgen
Example #24
Source File: test_groupby.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_crosstab_parallel1(self):
    def test_impl():
        df = pd.read_parquet("pivot2.pq")
        pt = pd.crosstab(df.A, df.C)
        res = pt.small.values.sum()
        return res

    hpat_func = self.jit(
        pivots={'pt': ['small', 'large']})(test_impl)
    self.assertEqual(hpat_func(), test_impl())
Example #25
Source File: test_groupby.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pivot_parallel(self):
    def test_impl():
        df = pd.read_parquet("pivot2.pq")
        pt = df.pivot_table(index='A', columns='C', values='D', aggfunc='sum')
        res = pt.small.values.sum()
        return res

    hpat_func = self.jit(
        pivots={'pt': ['small', 'large']})(test_impl)
    self.assertEqual(hpat_func(), test_impl())
Example #26
Source File: test_join.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_merge_asof_parallel1(self):
    def test_impl():
        df1 = pd.read_parquet('asof1.pq')
        df2 = pd.read_parquet('asof2.pq')
        df3 = pd.merge_asof(df1, df2, on='time')
        return (df3.A.sum(), df3.time.max(), df3.B.sum())

    hpat_func = self.jit(test_impl)
    self.assertEqual(hpat_func(), test_impl())
Example #27
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pq_spark_date(self):
    def test_impl():
        df = pd.read_parquet('sdf_dt.pq')
        return pd.DataFrame({'DT64': df.DT64, 'col2': df.DATE})

    hpat_func = self.jit(test_impl)
    pd.testing.assert_frame_equal(hpat_func(), test_impl())
Example #28
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pq_pandas_date(self):
    def test_impl():
        df = pd.read_parquet('pandas_dt.pq')
        return pd.DataFrame({'DT64': df.DT64, 'col2': df.DATE})

    hpat_func = self.jit(test_impl)
    pd.testing.assert_frame_equal(hpat_func(), test_impl())
Example #29
Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def test_pd_read_parquet(self):
    def test_impl():
        df = pd.read_parquet('kde.parquet')
        X = df['points']
        return X.sum()

    hpat_func = self.jit(test_impl)
    np.testing.assert_almost_equal(hpat_func(), test_impl())
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
Example #30
Source File: brute_force_plotter.py From brute-force-plotter with MIT License | 5 votes |
def plot_numeric_numeric(input_file, col1, col2, path):
    df = pd.read_parquet(input_file, columns=[col1, col2])
    file_name = os.path.join(path, f"{col1}-{col2}-scatter-plot.png")
    scatter_plot(df, col1, col2, file_name=file_name)