Python pyarrow.parquet Examples

The following are 30 code examples of the pyarrow.parquet module, drawn from open-source projects. The originating project and source file are noted above each example. You may also want to check out all available functions and classes of the pyarrow module.
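
As a quick orientation before the examples: a minimal sketch of the module's core round trip, writing a pandas DataFrame to a Parquet file and reading it back (the file name is arbitrary):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Convert a pandas DataFrame to an Arrow table and write it as Parquet.
df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
pq.write_table(pa.Table.from_pandas(df), "example.parquet")

# Read the file back into an Arrow table, then convert to a DataFrame.
restored = pq.read_table("example.parquet").to_pandas()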
Example #1
Source File: parquet.py    From recruit with Apache License 2.0
def validate_dataframe(df):

        if not isinstance(df, DataFrame):
            raise ValueError("to_parquet only supports IO with DataFrames")

        # must have value column names (strings only)
        if df.columns.inferred_type not in {'string', 'unicode'}:
            raise ValueError("parquet must have string column names")

        # index level names must be strings
        valid_names = all(
            isinstance(name, string_types)
            for name in df.index.names
            if name is not None
        )
        if not valid_names:
            raise ValueError("Index level names must be strings") 
Example #2
Source File: dataframe_bytes_storage.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def df_from_bytes_parquet_(bytes_: bytes) -> pd.DataFrame:
    """
    Since pyabc 0.9.14, pandas DataFrames are converted using
    pyarrow parquet. If the conversion to DataFrame fails,
    then `df_from_bytes_msgpack_` is tried, which was the formerly
    used method. This is in particular useful for databases that
    still employ the old format. In case errors occur here, it may
    be necessary to use a pandas version prior to 0.25.0.
    """
    try:
        b = BytesIO(bytes_)
        table = parquet.read_table(b)
        df = table.to_pandas()
    except pyarrow.lib.ArrowIOError:
        df = df_from_bytes_msgpack_(bytes_)
    return df 
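
For reference, a self-contained sketch of the same in-memory round trip outside pyABC (the msgpack fallback above only matters for databases written before pyabc 0.9.14):

from io import BytesIO

import pandas as pd
import pyarrow
from pyarrow import parquet

df = pd.DataFrame({"x": [1.0, 2.0]})

# Serialize: DataFrame -> Arrow table -> Parquet bytes in memory.
buf = BytesIO()
parquet.write_table(pyarrow.Table.from_pandas(df), buf)

# Deserialize: Parquet bytes -> Arrow table -> DataFrame.
restored = parquet.read_table(BytesIO(buf.getvalue())).to_pandas()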
Example #3
Source File: parquet.py    From elasticintel with GNU General Public License v3.0
def read_parquet(path, engine='auto', **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet reader library to use. If 'auto', then the option
        ``io.parquet.engine`` is used; if that option is also 'auto',
        the first of 'pyarrow' and 'fastparquet' found installed is used.
    kwargs are passed to the engine

    Returns
    -------
    DataFrame

    """

    impl = get_engine(engine)
    return impl.read(path) 
Example #4
Source File: parquet.py    From elasticintel with GNU General Public License v3.0
def __init__(self):
        # since pandas is a dependency of fastparquet
        # we need to import on first use

        try:
            import fastparquet
        except ImportError:
            raise ImportError("fastparquet is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install -U fastparquet")

        if LooseVersion(fastparquet.__version__) < '0.1.0':
            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
                              "support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install -U fastparquet")

        self.api = fastparquet 
Example #5
Source File: parquet.py    From elasticintel with GNU General Public License v3.0
def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use

        try:
            import pyarrow
            import pyarrow.parquet
        except ImportError:
            raise ImportError("pyarrow is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install -U pyarrow\n")

        if LooseVersion(pyarrow.__version__) < '0.4.1':
            raise ImportError("pyarrow >= 0.4.1 is required for parquet"
                              "support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install -U pyarrow\n")

        self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
        self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
        self.api = pyarrow 
Example #6
Source File: parquet.py    From elasticintel with GNU General Public License v3.0
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl() 
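
These engine shims are exercised through the pandas entry points; a short sketch of how the selection plays out in practice (assuming at least one engine is installed):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

# 'auto' resolves via the io.parquet.engine option, trying pyarrow first
# and falling back to fastparquet.
df.to_parquet("data.parquet", engine="auto")

# Naming an engine explicitly raises ImportError if it is not installed.
back = pd.read_parquet("data.parquet", engine="pyarrow")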
Example #7
Source File: util.py    From cjworkbench with GNU Affero General Public License v3.0
@contextlib.contextmanager
def parquet_file(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[pathlib.Path]:
    """
    Yield a filename with `table` written to a Parquet file.
    """
    if isinstance(table, dict):
        table = pyarrow.table(table)

    with tempfile_context(dir=dir) as parquet_path:
        pyarrow.parquet.write_table(
            table,
            parquet_path,
            version="2.0",
            compression="SNAPPY",
            use_dictionary=[
                name.encode("utf-8")
                for name, column in zip(table.column_names, table.columns)
                if pyarrow.types.is_dictionary(column.type)
            ],
        )
        yield parquet_path 
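
Usage might look like the following sketch; note that `tempfile_context` is a cjworkbench helper, so the exact cleanup behavior on exit is an assumption here:

import pyarrow.parquet

# A plain dict is converted to an Arrow table by the helper itself.
with parquet_file({"name": ["Alice", "Bob"], "age": [30, 25]}) as path:
    table = pyarrow.parquet.read_table(path)
    print(table.num_rows)  # 2
# The temporary file is cleaned up when the context exits.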
Example #8
Source File: parquet.py    From Splunking-Crime with GNU Affero General Public License v3.0
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used; if that option is also 'auto',
        the first of 'pyarrow' and 'fastparquet' found installed is used.
    compression : str, optional, default 'snappy'
        compression method, includes {'gzip', 'snappy', 'brotli'}
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs) 
Example #9
Source File: parquet.py    From Splunking-Crime with GNU Affero General Public License v3.0
def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', **kwargs):
        self.validate_dataframe(df)
        if self._pyarrow_lt_070:
            self._validate_write_lt_070(df)
        path, _, _ = get_filepath_or_buffer(path)

        if self._pyarrow_lt_060:
            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
            self.api.parquet.write_table(
                table, path, compression=compression, **kwargs)

        else:
            table = self.api.Table.from_pandas(df)
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs) 
Example #10
Source File: parquet.py    From Splunking-Crime with GNU Affero General Public License v3.0
def validate_dataframe(df):

        if not isinstance(df, DataFrame):
            raise ValueError("to_parquet only supports IO with DataFrames")

        # must have value column names (strings only)
        if df.columns.inferred_type not in {'string', 'unicode'}:
            raise ValueError("parquet must have string column names")

        # index level names must be strings
        valid_names = all(
            isinstance(name, string_types)
            for name in df.index.names
            if name is not None
        )
        if not valid_names:
            raise ValueError("Index level names must be strings") 
Example #11
Source File: parquet.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', index=None, partition_cols=None,
              **kwargs):
        self.validate_dataframe(df)
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

        if index is None:
            from_pandas_kwargs = {}
        else:
            from_pandas_kwargs = {'preserve_index': index}
        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        if partition_cols is not None:
            self.api.parquet.write_to_dataset(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps,
                partition_cols=partition_cols, **kwargs)
        else:
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs) 
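
The partitioned branch maps directly onto pyarrow; a minimal sketch of what the engine does under the hood (the directory name is arbitrary):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]})
table = pa.Table.from_pandas(df)

# Writes one subdirectory per distinct value: dataset/year=2019/..., etc.
pq.write_to_dataset(table, "dataset", partition_cols=["year"])

# Reading the directory root reassembles the partitions into one table
# (recent pyarrow versions accept a directory path here).
restored = pq.read_table("dataset").to_pandas()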
Example #12
Source File: parquet.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use
        try:
            import pyarrow
            import pyarrow.parquet
        except ImportError:
            raise ImportError(
                "pyarrow is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )
        if LooseVersion(pyarrow.__version__) < '0.9.0':
            raise ImportError(
                "pyarrow >= 0.9.0 is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )

        self.api = pyarrow 
Example #13
Source File: parquet.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def validate_dataframe(df):

        if not isinstance(df, DataFrame):
            raise ValueError("to_parquet only supports IO with DataFrames")

        # must have value column names (strings only)
        if df.columns.inferred_type not in {'string', 'unicode'}:
            raise ValueError("parquet must have string column names")

        # index level names must be strings
        valid_names = all(
            isinstance(name, string_types)
            for name in df.index.names
            if name is not None
        )
        if not valid_names:
            raise ValueError("Index level names must be strings") 
Example #14
Source File: parquet.py    From vnpy_crypto with MIT License
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, **kwargs) 
Example #15
Source File: parquet.py    From vnpy_crypto with MIT License
def read(self, path, columns=None, **kwargs):
        path, _, _, should_close = get_filepath_or_buffer(path)
        if self._pyarrow_lt_070:
            result = self.api.parquet.read_pandas(path, columns=columns,
                                                  **kwargs).to_pandas()
        else:
            kwargs['use_pandas_metadata'] = True
            result = self.api.parquet.read_table(path, columns=columns,
                                                 **kwargs).to_pandas()
        if should_close:
            try:
                path.close()
            except:  # noqa: flake8
                pass

        return result 
Example #16
Source File: parquet.py    From vnpy_crypto with MIT License
def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', **kwargs):
        self.validate_dataframe(df)
        if self._pyarrow_lt_070:
            self._validate_write_lt_070(df)
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

        if self._pyarrow_lt_060:
            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
            self.api.parquet.write_table(
                table, path, compression=compression, **kwargs)

        else:
            table = self.api.Table.from_pandas(df)
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs) 
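
The `coerce_timestamps='ms'` default exists because pandas timestamps carry nanosecond resolution, which older Parquet writers could not store; a sketch of the effect:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"ts": pd.to_datetime(["2021-01-01 00:00:00.123456789"])})
table = pa.Table.from_pandas(df)

# Sub-millisecond precision is dropped; without allow_truncated_timestamps
# pyarrow raises an error instead of silently truncating.
pq.write_table(table, "ts.parquet", coerce_timestamps="ms",
               allow_truncated_timestamps=True)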
Example #17
Source File: parquet.py    From recruit with Apache License 2.0
def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use
        try:
            import pyarrow
            import pyarrow.parquet
        except ImportError:
            raise ImportError(
                "pyarrow is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )
        if LooseVersion(pyarrow.__version__) < '0.9.0':
            raise ImportError(
                "pyarrow >= 0.9.0 is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )

        self.api = pyarrow 
Example #18
Source File: parquet.py    From recruit with Apache License 2.0
def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', index=None, partition_cols=None,
              **kwargs):
        self.validate_dataframe(df)
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

        if index is None:
            from_pandas_kwargs = {}
        else:
            from_pandas_kwargs = {'preserve_index': index}
        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        if partition_cols is not None:
            self.api.parquet.write_to_dataset(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps,
                partition_cols=partition_cols, **kwargs)
        else:
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs) 
Example #19
Source File: parquet.py    From vnpy_crypto with MIT License
def validate_dataframe(df):

        if not isinstance(df, DataFrame):
            raise ValueError("to_parquet only supports IO with DataFrames")

        # must have value column names (strings only)
        if df.columns.inferred_type not in {'string', 'unicode'}:
            raise ValueError("parquet must have string column names")

        # index level names must be strings
        valid_names = all(
            isinstance(name, string_types)
            for name in df.index.names
            if name is not None
        )
        if not valid_names:
            raise ValueError("Index level names must be strings") 
Example #20
Source File: parquet.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use
        try:
            import pyarrow
            import pyarrow.parquet
        except ImportError:
            raise ImportError(
                "pyarrow is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )
        if LooseVersion(pyarrow.__version__) < '0.4.1':
            raise ImportError(
                "pyarrow >= 0.4.1 is required for parquet support\n\n"
                "you can install via conda\n"
                "conda install pyarrow -c conda-forge\n"
                "\nor via pip\n"
                "pip install -U pyarrow\n"
            )

        self._pyarrow_lt_060 = (
            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
        self._pyarrow_lt_070 = (
            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))

        self.api = pyarrow 
Example #21
Source File: parquet.py    From recruit with Apache License 2.0
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

        raise ImportError("Unable to find a usable engine; "
                          "tried using: 'pyarrow', 'fastparquet'.\n"
                          "pyarrow or fastparquet is required for parquet "
                          "support")

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl() 
Example #22
Source File: dataframe_bytes_storage.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def df_to_bytes_parquet_(df: pd.DataFrame) -> bytes:
    """
    pyarrow parquet is the standard conversion method of pandas
    DataFrames since pyabc 0.9.14, because msgpack became
    deprecated in pandas 0.25.0.
    """
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    b.seek(0)
    return b.read() 
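
The final `seek`/`read` pair is equivalent to `b.getvalue()`. Since the returned bytes form a complete Parquet file, they can also be inspected without decoding back to pandas; a sketch (`ParquetFile.schema_arrow` assumes a reasonably recent pyarrow):

from io import BytesIO

import pandas as pd
from pyarrow import parquet

payload = df_to_bytes_parquet_(pd.DataFrame({"x": [1, 2, 3]}))

# Parquet bytes carry their own schema and row counts in the file footer.
pf = parquet.ParquetFile(BytesIO(payload))
print(pf.schema_arrow)       # Arrow schema of the stored table
print(pf.metadata.num_rows)  # 3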
Example #23
Source File: parquet.py    From elasticintel with GNU General Public License v3.0
def read(self, path):
        path, _, _ = get_filepath_or_buffer(path)
        return self.api.parquet.read_table(path).to_pandas() 
Example #24
Source File: parquet.py    From elasticintel with GNU General Public License v3.0
def write(self, df, path, compression='snappy',
              coerce_timestamps='ms', **kwargs):
        path, _, _ = get_filepath_or_buffer(path)
        if self._pyarrow_lt_060:
            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
            self.api.parquet.write_table(
                table, path, compression=compression, **kwargs)

        else:
            table = self.api.Table.from_pandas(df)
            self.api.parquet.write_table(
                table, path, compression=compression,
                coerce_timestamps=coerce_timestamps, **kwargs) 
Example #25
Source File: parquet.py    From recruit with Apache License 2.0
def read(self, path, columns=None, **kwargs):
        path, _, _, should_close = get_filepath_or_buffer(path)

        kwargs['use_pandas_metadata'] = True
        result = self.api.parquet.read_table(path, columns=columns,
                                             **kwargs).to_pandas()
        if should_close:
            try:
                path.close()
            except:  # noqa: flake8
                pass

        return result 
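
`use_pandas_metadata=True` asks pyarrow to also load index columns recorded in the file's pandas metadata, which is how round trips preserve a DataFrame index; a sketch:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"v": [10, 20]}, index=pd.Index(["a", "b"], name="key"))
pq.write_table(pa.Table.from_pandas(df), "indexed.parquet")

# With the pandas metadata applied, to_pandas() restores the named index.
restored = pq.read_table("indexed.parquet",
                         use_pandas_metadata=True).to_pandas()
print(restored.index.name)  # 'key'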
Example #26
Source File: parquet.py    From recruit with Apache License 2.0
def to_parquet(df, path, engine='auto', compression='snappy', index=None,
               partition_cols=None, **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : str
        File path or Root Directory path. Will be used as Root Directory path
        while writing a partitioned dataset.

        .. versionchanged:: 0.24.0

    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file. If ``None``, the
        engine's default behavior will be used.

        .. versionadded:: 0.24.0

    partition_cols : list, optional, default None
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.

        .. versionadded:: 0.24.0

    kwargs
        Additional keyword arguments passed to the engine
    """
    impl = get_engine(engine)
    return impl.write(df, path, compression=compression, index=index,
                      partition_cols=partition_cols, **kwargs) 
Example #27
Source File: parquet.py    From recruit with Apache License 2.0
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

        .. versionadded:: 0.21.1
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    kwargs are passed to the engine

    Returns
    -------
    DataFrame
    """

    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs) 
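
Column projection is one of Parquet's main draws: the format is columnar, so unrequested columns are never read or decoded. A sketch:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
df.to_parquet("wide.parquet")

# Only the listed columns are loaded from disk.
subset = pd.read_parquet("wide.parquet", columns=["a", "c"])
print(list(subset.columns))  # ['a', 'c']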
Example #28
Source File: parquet.py    From Splunking-Crime with GNU Affero General Public License v3.0
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

        .. versionadded:: 0.21.1
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet reader library to use. If 'auto', then the option
        ``io.parquet.engine`` is used; if that option is also 'auto',
        the first of 'pyarrow' and 'fastparquet' found installed is used.
    kwargs are passed to the engine

    Returns
    -------
    DataFrame

    """

    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs) 
Example #29
Source File: parquet.py    From vnpy_crypto with MIT License
def get_engine(engine):
    """ return our implementation """

    if engine == 'auto':
        engine = get_option('io.parquet.engine')

    if engine == 'auto':
        # try engines in this order
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        try:
            return FastParquetImpl()
        except ImportError:
            pass

        raise ImportError("Unable to find a usable engine; "
                          "tried using: 'pyarrow', 'fastparquet'.\n"
                          "pyarrow or fastparquet is required for parquet "
                          "support")

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl() 
Example #30
Source File: parquet.py    From Splunking-Crime with GNU Affero General Public License v3.0
def read(self, path, columns=None, **kwargs):
        path, _, _ = get_filepath_or_buffer(path)
        if self._pyarrow_lt_070:
            return self.api.parquet.read_pandas(path, columns=columns,
                                                **kwargs).to_pandas()
        kwargs['use_pandas_metadata'] = True
        return self.api.parquet.read_table(path, columns=columns,
                                           **kwargs).to_pandas()