Python pyarrow.parquet.write_table() Examples

The following are 21 code examples of pyarrow.parquet.write_table(), drawn from open-source projects; each one notes its source file, project, and license. You may also want to check out the other functions and classes available in the pyarrow.parquet module.
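Before the project examples, here is a minimal, self-contained sketch of the typical write/read round trip (the file name and column names are illustrative):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Convert a pandas DataFrame to an Arrow table and write it to Parquet.
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
table = pa.Table.from_pandas(df)
pq.write_table(table, "example.parquet", compression="snappy")

# Read the file back to verify the round trip.
restored = pq.read_table("example.parquet").to_pandas()
assert restored["name"].tolist() == ["a", "b", "c"]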
Example #1
Source File: utils.py    From gordo with GNU Affero General Public License v3.0
def dataframe_into_parquet_bytes(
    df: pd.DataFrame, compression: str = "snappy"
) -> bytes:
    """
    Convert a dataframe into bytes representing a parquet table.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame to serialize into parquet bytes
    compression: str
        Compression to use, passed to :func:`pyarrow.parquet.write_table`

    Returns
    -------
    bytes
    """
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression)
    return buf.getvalue().to_pybytes() 
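The bytes produced above can be turned back into a DataFrame by wrapping them in a BufferReader; a minimal sketch (this inverse path is an assumption for illustration, not part of the gordo code shown here):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"sensor": [0.1, 0.2, 0.3]})
blob = dataframe_into_parquet_bytes(df)

# Assumed inverse path: read the parquet bytes back into a DataFrame.
restored = pq.read_table(pa.BufferReader(blob)).to_pandas()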
Example #2
Source File: test_parquet.py    From kartothek with MIT License
def test_index_metadata(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df) 
Example #3
Source File: component.py    From pipelines with Apache License 2.0
def convert_apache_arrow_feather_to_apache_parquet(
    data_path: InputPath('ApacheArrowFeather'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts Apache Arrow Feather to Apache Parquet.

    [Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import feather, parquet

    table = feather.read_table(data_path)
    parquet.write_table(table, output_data_path) 
Example #4
Source File: test_parquet.py    From kartothek with MIT License
def test_pyarrow_07992(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"},
            {"metadata": null, "name": null, "numpy_type": "int64", "pandas_type": "int64"}
        ],
        "column_indexes": [
            {"metadata": null, "name": null, "numpy_type": "object", "pandas_type": "string"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df) 
Example #5
Source File: test_advanced_3.py    From ray with Apache License 2.0
def test_pandas_parquet_serialization():
    # Only test this if pandas is installed
    pytest.importorskip("pandas")

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    tempdir = tempfile.mkdtemp()
    filename = os.path.join(tempdir, "parquet-test")
    pd.DataFrame({"col1": [0, 1], "col2": [0, 1]}).to_parquet(filename)
    with open(os.path.join(tempdir, "parquet-compression"), "wb") as f:
        table = pa.Table.from_arrays([pa.array([1, 2, 3])], ["hello"])
        pq.write_table(table, f, compression="lz4")
    # Clean up
    shutil.rmtree(tempdir) 
Example #6
Source File: _parquet.py    From kartothek with MIT License
def store(self, store, key_prefix, df):
    key = "{}.parquet".format(key_prefix)
    if isinstance(df, pa.Table):
        table = df
    else:
        table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    # For Arrow versions below 0.15, undo dictionary encoding before
    # writing the table in chunks.
    if (
        self.chunk_size
        and self.chunk_size < len(table)
        and not ARROW_LARGER_EQ_0150
    ):
        table = _reset_dictionary_columns(table)
    pq.write_table(
        table,
        buf,
        version=self._PARQUET_VERSION,
        chunk_size=self.chunk_size,
        compression=self.compression,
        coerce_timestamps="us",
    )
    store.put(key, buf.getvalue().to_pybytes())
    return key
Example #7
Source File: test_pyarrow_roundtrip.py    From fletcher with MIT License 5 votes vote down vote up
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas()) 
Example #8
Source File: dataframe_bytes_storage.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def df_to_bytes_parquet_(df: pd.DataFrame) -> bytes:
    """
    pyarrow parquet is the standard conversion method of pandas
    DataFrames since pyabc 0.9.14, because msgpack became
    deprecated in pandas 0.25.0.
    """
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    b.seek(0)
    return b.read() 
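The matching deserialization direction can be sketched as follows (a hypothetical helper written for illustration; pyABC's actual reader may differ):

from io import BytesIO

import pandas as pd
from pyarrow import parquet


def df_from_bytes_parquet_(b: bytes) -> pd.DataFrame:
    # Hypothetical inverse of df_to_bytes_parquet_: wrap the raw bytes in a
    # file-like object and let pyarrow rebuild the DataFrame.
    return parquet.read_table(BytesIO(b)).to_pandas()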
Example #9
Source File: client.py    From json2parquet with MIT License
def write_parquet_dataset(data, destination, **kwargs):
    """
    data: PyArrow record batch
    destination: Output directory

    **kwargs: defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html

    This adds support for writing with partitions, which 'write_table' alone does not provide (see the usage sketch after this example).
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])
    pq.write_to_dataset(table, destination, **kwargs) 
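A usage sketch of the partitioned path (the column names and output directory are illustrative, not taken from json2parquet):

import pyarrow as pa

# One output subdirectory is created per distinct value of the partition
# column, e.g. out/year=2019/...; partition_cols is forwarded to
# pyarrow.parquet.write_to_dataset.
batch = pa.RecordBatch.from_pydict(
    {"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]}
)
write_parquet_dataset(batch, "out", partition_cols=["year"])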
Example #10
Source File: client.py    From json2parquet with MIT License
def write_parquet(data, destination, **kwargs):
    """
    data: PyArrow record batch
    destination: Output file name

    **kwargs: defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])
    pq.write_table(table, destination, **kwargs) 
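Usage sketch: keyword arguments are forwarded verbatim to pyarrow.parquet.write_table, so options such as compression can be set per call (names are illustrative):

import pyarrow as pa

batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})
# 'compression' is passed straight through to write_table.
write_parquet(batch, "data.parquet", compression="snappy")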
Example #11
Source File: protocols.py    From bionic with Apache License 2.0
def write(self, df, path):
    self._check_no_duplicate_cols(df)
    if self._check_dtypes:
        self._check_no_categorical_cols(df)
    with path.open("wb") as file_:
        parquet.write_table(Table.from_pandas(df), file_)
Example #12
Source File: index.py    From kartothek with MIT License
def __getstate__(self):
    if not self.loaded:
        return (self.column, self.index_storage_key, self.dtype, None)

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    parquet_bytes = buf.getvalue().to_pybytes()
    # Since `self.dtype` can be inferred from the parquet bytes, do not
    # return it during serialization; this avoids unnecessary memory use.
    return (self.column, self.index_storage_key, None, parquet_bytes)
Example #13
Source File: index.py    From kartothek with MIT License
def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file.

    If compatible, the new key name will be the name stored under the
    attribute `index_storage_key`. If this attribute is None, a new key
    will be generated of the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp has nanosecond accuracy and is created upon Index
    object initialization.

    Parameters
    ----------
    store:
    dataset_uuid:
    """
    storage_key = None

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
Example #14
Source File: gen_kde_pq.py    From sdc with BSD 2-Clause "Simplified" License
def gen_kde(N, file_name):
    # np.random.seed(0)
    df = pd.DataFrame({'points': np.random.random(N)})
    table = pa.Table.from_pandas(df)
    # Limit each row group to 128 rows.
    row_group_size = 128
    pq.write_table(table, file_name, row_group_size=row_group_size)
Example #15
Source File: test_io.py    From fletcher with MIT License
def test_read_parquet(tmpdir, continuous):
    str_arr = pa.array(["a", None, "c"], pa.string())
    int_arr = pa.array([1, None, -2], pa.int32())
    bool_arr = pa.array([True, None, False], pa.bool_())
    table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])

    # Write into the pytest-provided temporary directory rather than the CWD.
    path = str(tmpdir.join("df.parquet"))
    pq.write_table(table, path)
    result = fr.read_parquet(path, continuous=continuous)
    expected = fr.pandas_from_arrow(table, continuous=continuous)
    tm.assert_frame_equal(result, expected)
Example #16
Source File: component.py    From pipelines with Apache License 2.0
def convert_csv_to_apache_parquet(
    data_path: InputPath('CSV'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts CSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import csv, parquet

    table = csv.read_csv(data_path)
    parquet.write_table(table, output_data_path) 
Example #17
Source File: component.py    From pipelines with Apache License 2.0
def convert_tsv_to_apache_parquet(
    data_path: InputPath('TSV'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts TSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import csv, parquet

    table = csv.read_csv(data_path, parse_options=csv.ParseOptions(delimiter='\t'))
    parquet.write_table(table, output_data_path) 
Example #18
Source File: hydrofunctions.py    From hydrofunctions with MIT License
def save_parquet(filename, dataframe, hf_meta):
    table = pa.Table.from_pandas(dataframe, preserve_index=True)
    meta_dict = table.schema.metadata
    hf_string = json.dumps(hf_meta).encode()
    meta_dict[b"hydrofunctions_meta"] = hf_string
    table = table.replace_schema_metadata(meta_dict)
    pq.write_table(table, filename) 
Example #19
Source File: util.py    From PyAthena with MIT License
def to_parquet(
    df,
    bucket_name,
    prefix,
    retry_config,
    session_kwargs,
    client_kwargs,
    compression=None,
    flavor="spark",
):
    import pyarrow as pa
    import pyarrow.parquet as pq

    session = Session(**session_kwargs)
    client = session.resource("s3", **client_kwargs)
    bucket = client.Bucket(bucket_name)
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression, flavor=flavor)
    response = retry_api_call(
        bucket.put_object,
        config=retry_config,
        Body=buf.getvalue().to_pybytes(),
        Key=prefix + str(uuid.uuid4()),
    )
    return "s3://{0}/{1}".format(response.bucket_name, response.key) 
Example #20
Source File: parquet.py    From ibis with Apache License 2.0
def insert(self, path, expr, **kwargs):
    path = self.root / path
    df = execute(expr)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, str(path))
Example #21
Source File: test_schema.py    From ibis with Apache License 2.0
def parquet_schema():
    np.random.seed(0)
    size = 100
    df = pd.DataFrame(
        {
            'uint8': np.arange(size, dtype=np.uint8),
            'uint16': np.arange(size, dtype=np.uint16),
            'uint32': np.arange(size, dtype=np.uint32),
            'uint64': np.arange(size, dtype=np.uint64),
            'int8': np.arange(size, dtype=np.int8),
            'int16': np.arange(size, dtype=np.int16),
            'int32': np.arange(size, dtype=np.int32),
            'int64': np.arange(size, dtype=np.int64),
            'float32': np.arange(size, dtype=np.float32),
            'float64': np.arange(size, dtype=np.float64),
            'bool': np.random.randn(size) > 0,
            # TODO(wesm): Test other timestamp resolutions now that arrow
            # supports them
            'datetime': np.arange(
                "2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'
            ),
            'str': [str(x) for x in range(size)],
            'str_with_nulls': [None]
            + [str(x) for x in range(size - 2)]
            + [None],
            'empty_str': [''] * size,
            'bytes': [b'foo'] * size,
        },
        columns=[
            'uint8',
            'uint16',
            'uint32',
            'uint64',
            'int8',
            'int16',
            'int32',
            'int64',
            'float32',
            'float64',
            'bool',
            'datetime',
            'str',
            'str_with_nulls',
            'empty_str',
            'bytes',
        ],
    )

    with tempfile.TemporaryFile() as path:
        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)
        parquet_file = pq.ParquetFile(path)
        return parquet_file.schema