Python pyarrow.parquet.write_table() Examples

The following are 21 code examples of pyarrow.parquet.write_table(), drawn from open-source projects; each one notes its source file, project, and license. You may also want to check out the other functions and classes available in the pyarrow.parquet module.
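Before the project examples, here is a minimal, self-contained sketch of the typical write/read round trip (the file name and column names are illustrative):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Convert a pandas DataFrame to an Arrow table and write it to Parquet.
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
table = pa.Table.from_pandas(df)
pq.write_table(table, "example.parquet", compression="snappy")

# Read the file back to verify the round trip.
restored = pq.read_table("example.parquet").to_pandas()
assert restored["name"].tolist() == ["a", "b", "c"]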
Example #1
Source File: utils.py    From gordo with GNU Affero General Public License v3.0
def dataframe_into_parquet_bytes(
    df: pd.DataFrame, compression: str = "snappy"
) -> bytes:
    """
    Convert a dataframe into bytes representing a parquet table.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame to serialize into parquet bytes
    compression: str
        Compression to use, passed to :func:`pyarrow.parquet.write_table`

    Returns
    -------
    bytes
    """
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression)
    return buf.getvalue().to_pybytes() 
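The bytes produced above can be turned back into a DataFrame by wrapping them in a BufferReader; a minimal sketch (this inverse path is an assumption for illustration, not part of the gordo code shown here):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({"sensor": [0.1, 0.2, 0.3]})
blob = dataframe_into_parquet_bytes(df)

# Assumed inverse path: read the parquet bytes back into a DataFrame.
restored = pq.read_table(pa.BufferReader(blob)).to_pandas()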
Example #2
Source File: test_parquet.py    From kartothek with MIT License
def test_index_metadata(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df) 
Example #3
Source File: component.py    From pipelines with Apache License 2.0
def convert_apache_arrow_feather_to_apache_parquet(
    data_path: InputPath('ApacheArrowFeather'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts Apache Arrow Feather to Apache Parquet.

    [Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import feather, parquet

    table = feather.read_table(data_path)
    parquet.write_table(table, output_data_path) 
Example #4
Source File: test_parquet.py    From kartothek with MIT License
def test_pyarrow_07992(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"},
            {"metadata": null, "name": null, "numpy_type": "int64", "pandas_type": "int64"}
        ],
        "column_indexes": [
            {"metadata": null, "name": null, "numpy_type": "object", "pandas_type": "string"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df) 
Example #5
Source File: test_advanced_3.py    From ray with Apache License 2.0
def test_pandas_parquet_serialization():
    # Only test this if pandas is installed
    pytest.importorskip("pandas")

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    tempdir = tempfile.mkdtemp()
    filename = os.path.join(tempdir, "parquet-test")
    pd.DataFrame({"col1": [0, 1], "col2": [0, 1]}).to_parquet(filename)
    with open(os.path.join(tempdir, "parquet-compression"), "wb") as f:
        table = pa.Table.from_arrays([pa.array([1, 2, 3])], ["hello"])
        pq.write_table(table, f, compression="lz4")
    # Clean up
    shutil.rmtree(tempdir) 
Example #6
Source File: _parquet.py    From kartothek with MIT License
def store(self, store, key_prefix, df):
    key = "{}.parquet".format(key_prefix)
    if isinstance(df, pa.Table):
        table = df
    else:
        table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    # For Arrow versions below 0.15, undo dictionary encoding before
    # writing the table in chunks.
    if (
        self.chunk_size
        and self.chunk_size < len(table)
        and not ARROW_LARGER_EQ_0150
    ):
        table = _reset_dictionary_columns(table)
    pq.write_table(
        table,
        buf,
        version=self._PARQUET_VERSION,
        chunk_size=self.chunk_size,
        compression=self.compression,
        coerce_timestamps="us",
    )
    store.put(key, buf.getvalue().to_pybytes())
    return key
Example #7
Source File: test_pyarrow_roundtrip.py    From fletcher with MIT License 5 votes vote down vote up
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas()) 
Example #8
Source File: dataframe_bytes_storage.py    From pyABC with BSD 3-Clause "New" or "Revised" License
def df_to_bytes_parquet_(df: pd.DataFrame) -> bytes:
    """
    pyarrow parquet is the standard conversion method of pandas
    DataFrames since pyabc 0.9.14, because msgpack became
    deprecated in pandas 0.25.0.
    """
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    b.seek(0)
    return b.read() 
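The matching deserialization direction can be sketched as follows (a hypothetical helper written for illustration; pyABC's actual reader may differ):

from io import BytesIO

import pandas as pd
from pyarrow import parquet


def df_from_bytes_parquet_(b: bytes) -> pd.DataFrame:
    # Hypothetical inverse of df_to_bytes_parquet_: wrap the raw bytes in a
    # file-like object and let pyarrow rebuild the DataFrame.
    return parquet.read_table(BytesIO(b)).to_pandas()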
Example #9
Source File: client.py    From json2parquet with MIT License
def write_parquet_dataset(data, destination, **kwargs):
    """
    data: PyArrow record batch
    destination: Output directory

    **kwargs: defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html

    This adds support for writing with partitions, which 'write_table' alone does not provide (see the usage sketch after this example).
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])
    pq.write_to_dataset(table, destination, **kwargs) 
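A usage sketch of the partitioned path (the column names and output directory are illustrative, not taken from json2parquet):

import pyarrow as pa

# One output subdirectory is created per distinct value of the partition
# column, e.g. out/year=2019/...; partition_cols is forwarded to
# pyarrow.parquet.write_to_dataset.
batch = pa.RecordBatch.from_pydict(
    {"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]}
)
write_parquet_dataset(batch, "out", partition_cols=["year"])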
Example #10
Source File: client.py    From json2parquet with MIT License
def write_parquet(data, destination, **kwargs):
    """
    data: PyArrow record batch
    destination: Output file name

    **kwargs: defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])
    pq.write_table(table, destination, **kwargs) 
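Usage sketch: keyword arguments are forwarded verbatim to pyarrow.parquet.write_table, so options such as compression can be set per call (names are illustrative):

import pyarrow as pa

batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})
# 'compression' is passed straight through to write_table.
write_parquet(batch, "data.parquet", compression="snappy")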
Example #11
Source File: protocols.py    From bionic with Apache License 2.0
def write(self, df, path):
    self._check_no_duplicate_cols(df)
    if self._check_dtypes:
        self._check_no_categorical_cols(df)
    with path.open("wb") as file_:
        parquet.write_table(Table.from_pandas(df), file_)
Example #12
Source File: index.py    From kartothek with MIT License
def __getstate__(self):
    if not self.loaded:
        return (self.column, self.index_storage_key, self.dtype, None)

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    parquet_bytes = buf.getvalue().to_pybytes()
    # Since `self.dtype` can be inferred from the parquet bytes, do not
    # return it during serialization; this avoids unnecessary memory use.
    return (self.column, self.index_storage_key, None, parquet_bytes)
Example #13
Source File: index.py    From kartothek with MIT License
def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file.

    If compatible, the new key name will be the name stored under the
    attribute `index_storage_key`. If this attribute is None, a new key
    will be generated of the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp has nanosecond accuracy and is created upon Index
    object initialization.

    Parameters
    ----------
    store:
    dataset_uuid:
    """
    storage_key = None

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
Example #14
Source File: gen_kde_pq.py    From sdc with BSD 2-Clause "Simplified" License
def gen_kde(N, file_name):
    # np.random.seed(0)
    df = pd.DataFrame({'points': np.random.random(N)})
    table = pa.Table.from_pandas(df)
    # Limit each row group to 128 rows.
    row_group_size = 128
    pq.write_table(table, file_name, row_group_size=row_group_size)
Example #15
Source File: test_io.py    From fletcher with MIT License
def test_read_parquet(tmpdir, continuous):
    str_arr = pa.array(["a", None, "c"], pa.string())
    int_arr = pa.array([1, None, -2], pa.int32())
    bool_arr = pa.array([True, None, False], pa.bool_())
    table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])

    # Write into the pytest-provided temporary directory rather than the CWD.
    path = str(tmpdir.join("df.parquet"))
    pq.write_table(table, path)
    result = fr.read_parquet(path, continuous=continuous)
    expected = fr.pandas_from_arrow(table, continuous=continuous)
    tm.assert_frame_equal(result, expected)
Example #16
Source File: component.py    From pipelines with Apache License 2.0
def convert_csv_to_apache_parquet(
    data_path: InputPath('CSV'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts CSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import csv, parquet

    table = csv.read_csv(data_path)
    parquet.write_table(table, output_data_path) 
Example #17
Source File: component.py    From pipelines with Apache License 2.0
def convert_tsv_to_apache_parquet(
    data_path: InputPath('TSV'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts TSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import csv, parquet

    table = csv.read_csv(data_path, parse_options=csv.ParseOptions(delimiter='\t'))
    parquet.write_table(table, output_data_path) 
Example #18
Source File: hydrofunctions.py    From hydrofunctions with MIT License
def save_parquet(filename, dataframe, hf_meta):
    table = pa.Table.from_pandas(dataframe, preserve_index=True)
    meta_dict = table.schema.metadata
    hf_string = json.dumps(hf_meta).encode()
    meta_dict[b"hydrofunctions_meta"] = hf_string
    table = table.replace_schema_metadata(meta_dict)
    pq.write_table(table, filename) 
Example #19
Source File: util.py    From PyAthena with MIT License
def to_parquet(
    df,
    bucket_name,
    prefix,
    retry_config,
    session_kwargs,
    client_kwargs,
    compression=None,
    flavor="spark",
):
    import pyarrow as pa
    import pyarrow.parquet as pq

    session = Session(**session_kwargs)
    client = session.resource("s3", **client_kwargs)
    bucket = client.Bucket(bucket_name)
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression, flavor=flavor)
    response = retry_api_call(
        bucket.put_object,
        config=retry_config,
        Body=buf.getvalue().to_pybytes(),
        Key=prefix + str(uuid.uuid4()),
    )
    return "s3://{0}/{1}".format(response.bucket_name, response.key) 
Example #20
Source File: parquet.py    From ibis with Apache License 2.0
def insert(self, path, expr, **kwargs):
    path = self.root / path
    df = execute(expr)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, str(path))
Example #21
Source File: test_schema.py    From ibis with Apache License 2.0
def parquet_schema():
    np.random.seed(0)
    size = 100
    df = pd.DataFrame(
        {
            'uint8': np.arange(size, dtype=np.uint8),
            'uint16': np.arange(size, dtype=np.uint16),
            'uint32': np.arange(size, dtype=np.uint32),
            'uint64': np.arange(size, dtype=np.uint64),
            'int8': np.arange(size, dtype=np.int8),
            'int16': np.arange(size, dtype=np.int16),
            'int32': np.arange(size, dtype=np.int32),
            'int64': np.arange(size, dtype=np.int64),
            'float32': np.arange(size, dtype=np.float32),
            'float64': np.arange(size, dtype=np.float64),
            'bool': np.random.randn(size) > 0,
            # TODO(wesm): Test other timestamp resolutions now that arrow
            # supports them
            'datetime': np.arange(
                "2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'
            ),
            'str': [str(x) for x in range(size)],
            'str_with_nulls': [None]
            + [str(x) for x in range(size - 2)]
            + [None],
            'empty_str': [''] * size,
            'bytes': [b'foo'] * size,
        },
        columns=[
            'uint8',
            'uint16',
            'uint32',
            'uint64',
            'int8',
            'int16',
            'int32',
            'int64',
            'float32',
            'float64',
            'bool',
            'datetime',
            'str',
            'str_with_nulls',
            'empty_str',
            'bytes',
        ],
    )

    with tempfile.TemporaryFile() as path:
        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)
        parquet_file = pq.ParquetFile(path)
        return parquet_file.schema