Python pyarrow.parquet Examples

The following are 30 code examples of the pyarrow.parquet module. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the pyarrow module, or try the search function.
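Before the project code, here is a minimal, self-contained sketch of the module in action; the file name and data are illustrative and not taken from any project below.

# Minimal pyarrow.parquet round trip (illustrative file name and data).
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
pq.write_table(table, "example.parquet", compression="snappy")

restored = pq.read_table("example.parquet", columns=["id"])
print(restored.to_pandas())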
Example #1
Source File: parquet.py From recruit with Apache License 2.0 | 6 votes |
def validate_dataframe(df):
    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")

    # must have value column names (strings only)
    if df.columns.inferred_type not in {'string', 'unicode'}:
        raise ValueError("parquet must have string column names")

    # index level names must be strings
    valid_names = all(
        isinstance(name, string_types)
        for name in df.index.names
        if name is not None
    )
    if not valid_names:
        raise ValueError("Index level names must be strings")
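For context, a hedged sketch of what this check rejects from the caller's side; the frame, file name, and exact message are illustrative and assume a pandas build with pyarrow or fastparquet installed.

# Illustrative: integer column labels fail the string-column check above.
import pandas as pd

bad = pd.DataFrame([[1, 2]], columns=[0, 1])
try:
    bad.to_parquet("bad.parquet")
except ValueError as err:
    print(err)  # e.g. "parquet must have string column names"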
Example #2
Source File: dataframe_bytes_storage.py From pyABC with BSD 3-Clause "New" or "Revised" License | 6 votes |
def df_from_bytes_parquet_(bytes_: bytes) -> pd.DataFrame:
    """
    Since pyabc 0.9.14, pandas DataFrames are converted using
    pyarrow parquet. If the conversion to DataFrame fails, then
    `df_from_bytes_msgpack_` is tried, which was the formerly used method.
    This is in particular useful for databases that still employ the old
    format. In case errors occur here, it may be necessary to use a pandas
    version prior to 0.25.0.
    """
    try:
        b = BytesIO(bytes_)
        table = parquet.read_table(b)
        df = table.to_pandas()
    except pyarrow.lib.ArrowIOError:
        df = df_from_bytes_msgpack_(bytes_)
    return df
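A self-contained round trip through Parquet bytes, sketching what the helper above does on the read side; the data is made up and the msgpack fallback is omitted.

# DataFrame -> Parquet bytes -> DataFrame, entirely in memory (illustrative).
from io import BytesIO

import pandas as pd
import pyarrow
import pyarrow.parquet as parquet

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

buf = BytesIO()
parquet.write_table(pyarrow.Table.from_pandas(df), buf)
payload = buf.getvalue()

restored = parquet.read_table(BytesIO(payload)).to_pandas()
print(restored.equals(df))  # True for a plain RangeIndex frame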
Example #3
Source File: parquet.py From elasticintel with GNU General Public License v3.0 | 6 votes |
def read_parquet(path, engine='auto', **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet reader library to use. If 'auto', then the option
        'io.parquet.engine' is used. If 'auto', then the first
        library to be installed is used.
    kwargs are passed to the engine

    Returns
    -------
    DataFrame
    """
    impl = get_engine(engine)
    return impl.read(path)
Example #4
Source File: parquet.py From elasticintel with GNU General Public License v3.0 | 6 votes |
def __init__(self):
    # since pandas is a dependency of fastparquet
    # we need to import on first use
    try:
        import fastparquet
    except ImportError:
        raise ImportError("fastparquet is required for parquet support\n\n"
                          "you can install via conda\n"
                          "conda install fastparquet -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U fastparquet")

    if LooseVersion(fastparquet.__version__) < '0.1.0':
        raise ImportError("fastparquet >= 0.1.0 is required for parquet "
                          "support\n\n"
                          "you can install via conda\n"
                          "conda install fastparquet -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U fastparquet")

    self.api = fastparquet
Example #5
Source File: parquet.py From elasticintel with GNU General Public License v3.0 | 6 votes |
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError("pyarrow is required for parquet support\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U pyarrow\n")

    if LooseVersion(pyarrow.__version__) < '0.4.1':
        raise ImportError("pyarrow >= 0.4.1 is required for parquet "
                          "support\n\n"
                          "you can install via conda\n"
                          "conda install pyarrow -c conda-forge\n"
                          "\nor via pip\n"
                          "pip install -U pyarrow\n")

    self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
    self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
    self.api = pyarrow
Example #6
Source File: parquet.py From elasticintel with GNU General Public License v3.0 | 6 votes |
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()
Example #7
Source File: util.py From cjworkbench with GNU Affero General Public License v3.0 | 6 votes |
def parquet_file(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[pathlib.Path]:
    """
    Yield a filename with `table` written to a Parquet file.
    """
    if isinstance(table, dict):
        table = pyarrow.table(table)
    with tempfile_context(dir=dir) as parquet_path:
        pyarrow.parquet.write_table(
            table,
            parquet_path,
            version="2.0",
            compression="SNAPPY",
            use_dictionary=[
                name.encode("utf-8")
                for name, column in zip(table.column_names, table.columns)
                if pyarrow.types.is_dictionary(column.type)
            ],
        )
        yield parquet_path
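The notable design choice above is passing use_dictionary a list of column names, so dictionary encoding is kept only for columns that are already dictionary-typed. A standalone sketch of that idea, with made-up data and file name:

# Enable dictionary encoding only for dictionary-typed columns (illustrative).
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({
    "id": [1, 2, 3],
    "category": pa.array(["a", "b", "a"]).dictionary_encode(),
})
dict_cols = [
    name
    for name, column in zip(table.column_names, table.columns)
    if pa.types.is_dictionary(column.type)
]
pq.write_table(table, "dictionary_example.parquet",
               compression="snappy", use_dictionary=dict_cols)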
Example #8
Source File: parquet.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet reader library to use. If 'auto', then the option
        'io.parquet.engine' is used. If 'auto', then the first
        library to be installed is used.
    compression : str, optional, default 'snappy'
        compression method, includes {'gzip', 'snappy', 'brotli'}
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs)
Example #9
Source File: parquet.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    self.validate_dataframe(df)
    if self._pyarrow_lt_070:
        self._validate_write_lt_070(df)
    path, _, _ = get_filepath_or_buffer(path)

    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example #10
Source File: parquet.py From Splunking-Crime with GNU Affero General Public License v3.0 | 6 votes |
def validate_dataframe(df):
    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")

    # must have value column names (strings only)
    if df.columns.inferred_type not in {'string', 'unicode'}:
        raise ValueError("parquet must have string column names")

    # index level names must be strings
    valid_names = all(
        isinstance(name, string_types)
        for name in df.index.names
        if name is not None
    )
    if not valid_names:
        raise ValueError("Index level names must be strings")
Example #11
Source File: parquet.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', index=None, partition_cols=None,
          **kwargs):
    self.validate_dataframe(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if index is None:
        from_pandas_kwargs = {}
    else:
        from_pandas_kwargs = {'preserve_index': index}

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
    if partition_cols is not None:
        self.api.parquet.write_to_dataset(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps,
            partition_cols=partition_cols, **kwargs)
    else:
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example #12
Source File: parquet.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )
    if LooseVersion(pyarrow.__version__) < '0.9.0':
        raise ImportError(
            "pyarrow >= 0.9.0 is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )

    self.api = pyarrow
Example #13
Source File: parquet.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def validate_dataframe(df):
    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")

    # must have value column names (strings only)
    if df.columns.inferred_type not in {'string', 'unicode'}:
        raise ValueError("parquet must have string column names")

    # index level names must be strings
    valid_names = all(
        isinstance(name, string_types)
        for name in df.index.names
        if name is not None
    )
    if not valid_names:
        raise ValueError("Index level names must be strings")
Example #14
Source File: parquet.py From vnpy_crypto with MIT License | 6 votes |
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs)
Example #15
Source File: parquet.py From vnpy_crypto with MIT License | 6 votes |
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)
    if self._pyarrow_lt_070:
        result = self.api.parquet.read_pandas(path, columns=columns,
                                              **kwargs).to_pandas()
    else:
        kwargs['use_pandas_metadata'] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
    if should_close:
        try:
            path.close()
        except:  # noqa: flake8
            pass

    return result
Example #16
Source File: parquet.py From vnpy_crypto with MIT License | 6 votes |
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    self.validate_dataframe(df)
    if self._pyarrow_lt_070:
        self._validate_write_lt_070(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example #17
Source File: parquet.py From recruit with Apache License 2.0 | 6 votes |
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )
    if LooseVersion(pyarrow.__version__) < '0.9.0':
        raise ImportError(
            "pyarrow >= 0.9.0 is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )

    self.api = pyarrow
Example #18
Source File: parquet.py From recruit with Apache License 2.0 | 6 votes |
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', index=None, partition_cols=None,
          **kwargs):
    self.validate_dataframe(df)
    path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

    if index is None:
        from_pandas_kwargs = {}
    else:
        from_pandas_kwargs = {'preserve_index': index}

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
    if partition_cols is not None:
        self.api.parquet.write_to_dataset(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps,
            partition_cols=partition_cols, **kwargs)
    else:
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example #19
Source File: parquet.py From vnpy_crypto with MIT License | 6 votes |
def validate_dataframe(df):
    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")

    # must have value column names (strings only)
    if df.columns.inferred_type not in {'string', 'unicode'}:
        raise ValueError("parquet must have string column names")

    # index level names must be strings
    valid_names = all(
        isinstance(name, string_types)
        for name in df.index.names
        if name is not None
    )
    if not valid_names:
        raise ValueError("Index level names must be strings")
Example #20
Source File: parquet.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self):
    # since pandas is a dependency of pyarrow
    # we need to import on first use
    try:
        import pyarrow
        import pyarrow.parquet
    except ImportError:
        raise ImportError(
            "pyarrow is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )
    if LooseVersion(pyarrow.__version__) < '0.4.1':
        raise ImportError(
            "pyarrow >= 0.4.1 is required for parquet support\n\n"
            "you can install via conda\n"
            "conda install pyarrow -c conda-forge\n"
            "\nor via pip\n"
            "pip install -U pyarrow\n"
        )

    self._pyarrow_lt_060 = (
        LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
    self._pyarrow_lt_070 = (
        LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))

    self.api = pyarrow
Example #21
Source File: parquet.py From recruit with Apache License 2.0 | 5 votes |
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

        raise ImportError("Unable to find a usable engine; "
                          "tried using: 'pyarrow', 'fastparquet'.\n"
                          "pyarrow or fastparquet is required for parquet "
                          "support")

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()
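From the user's side, the selection above is driven by the engine argument and the io.parquet.engine option. A hedged sketch of how that surfaces in pandas; the file name is illustrative.

# How engine resolution shows up in the pandas API (illustrative file name).
import pandas as pd

pd.set_option("io.parquet.engine", "auto")  # try pyarrow, then fastparquet

df = pd.DataFrame({"x": [1, 2, 3]})
df.to_parquet("engine_demo.parquet", engine="pyarrow")  # force pyarrow
round_tripped = pd.read_parquet("engine_demo.parquet")  # engine='auto'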
Example #22
Source File: dataframe_bytes_storage.py From pyABC with BSD 3-Clause "New" or "Revised" License | 5 votes |
def df_to_bytes_parquet_(df: pd.DataFrame) -> bytes:
    """
    pyarrow parquet is the standard conversion method of pandas
    DataFrames since pyabc 0.9.14, because msgpack became
    deprecated in pandas 0.25.0.
    """
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    b.seek(0)
    return b.read()
Example #23
Source File: parquet.py From elasticintel with GNU General Public License v3.0 | 5 votes |
def read(self, path):
    path, _, _ = get_filepath_or_buffer(path)
    return self.api.parquet.read_table(path).to_pandas()
Example #24
Source File: parquet.py From elasticintel with GNU General Public License v3.0 | 5 votes |
def write(self, df, path, compression='snappy',
          coerce_timestamps='ms', **kwargs):
    path, _, _ = get_filepath_or_buffer(path)

    if self._pyarrow_lt_060:
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)
    else:
        table = self.api.Table.from_pandas(df)
        self.api.parquet.write_table(
            table, path, compression=compression,
            coerce_timestamps=coerce_timestamps, **kwargs)
Example #25
Source File: parquet.py From recruit with Apache License 2.0 | 5 votes |
def read(self, path, columns=None, **kwargs):
    path, _, _, should_close = get_filepath_or_buffer(path)

    kwargs['use_pandas_metadata'] = True
    result = self.api.parquet.read_table(path, columns=columns,
                                         **kwargs).to_pandas()
    if should_close:
        try:
            path.close()
        except:  # noqa: flake8
            pass

    return result
Example #26
Source File: parquet.py From recruit with Apache License 2.0 | 5 votes |
def to_parquet(df, path, engine='auto', compression='snappy', index=None,
               partition_cols=None, **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    path : str
        File path or Root Directory path. Will be used as Root Directory
        path while writing a partitioned dataset.

        .. versionchanged:: 0.24.0

    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output.
        If ``False``, they will not be written to the file. If ``None``,
        the engine's default behavior will be used.

        .. versionadded:: 0.24.0

    partition_cols : list, optional, default None
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.

        .. versionadded:: 0.24.0

    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, index=index,
                      partition_cols=partition_cols, **kwargs)
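A short usage sketch of the partition_cols path, assuming pandas 0.24+ with pyarrow installed; the directory name and data are illustrative.

# Partitioned write: one subdirectory per distinct 'year' value (illustrative).
import pandas as pd

df = pd.DataFrame({
    "year": [2023, 2023, 2024],
    "value": [1.0, 2.0, 3.0],
})
df.to_parquet("sales_dataset", engine="pyarrow",
              partition_cols=["year"], index=False)

back = pd.read_parquet("sales_dataset")  # reads every partition back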
Example #27
Source File: parquet.py From recruit with Apache License 2.0 | 5 votes |
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

        .. versionadded:: 0.21.1

    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    kwargs are passed to the engine

    Returns
    -------
    DataFrame
    """
    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs)
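Column pruning is the main lever in this reader. A brief, self-contained sketch; the file name and data are illustrative.

# Read only the columns you need; the selection is pushed down to the
# Parquet reader (illustrative file name).
import pandas as pd

pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}).to_parquet("subset_demo.parquet")
subset = pd.read_parquet("subset_demo.parquet", columns=["id"], engine="pyarrow")
print(subset)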
Example #28
Source File: parquet.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

        .. versionadded:: 0.21.1

    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet reader library to use. If 'auto', then the option
        'io.parquet.engine' is used. If 'auto', then the first
        library to be installed is used.
    kwargs are passed to the engine

    Returns
    -------
    DataFrame
    """
    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs)
Example #29
Source File: parquet.py From vnpy_crypto with MIT License | 5 votes |
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

        raise ImportError("Unable to find a usable engine; "
                          "tried using: 'pyarrow', 'fastparquet'.\n"
                          "pyarrow or fastparquet is required for parquet "
                          "support")

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()
Example #30
Source File: parquet.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def read(self, path, columns=None, **kwargs):
    path, _, _ = get_filepath_or_buffer(path)
    if self._pyarrow_lt_070:
        return self.api.parquet.read_pandas(path, columns=columns,
                                            **kwargs).to_pandas()

    kwargs['use_pandas_metadata'] = True
    return self.api.parquet.read_table(path, columns=columns,
                                       **kwargs).to_pandas()