Python pyarrow.parquet.write_table() Examples
The following are 21 code examples of pyarrow.parquet.write_table(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyarrow.parquet, or try the search function.
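Before the project examples, here is a minimal, self-contained sketch of the basic call. The file name, column names, and values are illustrative placeholders, not taken from any project below.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Convert a pandas DataFrame to an Arrow table and write it as a Parquet file.
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
table = pa.Table.from_pandas(df)
pq.write_table(table, "example.parquet", compression="snappy")

# Read it back for a quick round-trip check.
restored = pq.read_table("example.parquet").to_pandas()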
Example #1
Source File: utils.py From gordo with GNU Affero General Public License v3.0 | 10 votes |
def dataframe_into_parquet_bytes(
    df: pd.DataFrame, compression: str = "snappy"
) -> bytes:
    """
    Convert a dataframe into bytes representing a parquet table.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame to be compressed
    compression: str
        Compression to use, passed to :func:`pyarrow.parquet.write_table`

    Returns
    -------
    bytes
    """
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression)
    return buf.getvalue().to_pybytes()
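As a usage note that is not part of the gordo source: the bytes produced by this helper can be read back into a DataFrame with pyarrow.BufferReader, for example:

# Hypothetical round trip of the bytes returned by the helper above.
parquet_bytes = dataframe_into_parquet_bytes(pd.DataFrame({"x": [1, 2, 3]}))
restored = pq.read_table(pa.BufferReader(parquet_bytes)).to_pandas()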
Example #2
Source File: test_parquet.py From kartothek with MIT License | 8 votes |
def test_index_metadata(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
Example #3
Source File: component.py From pipelines with Apache License 2.0 | 6 votes |
def convert_apache_arrow_feather_to_apache_parquet(
    data_path: InputPath('ApacheArrowFeather'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts Apache Arrow Feather to Apache Parquet.

    [Apache Arrow Feather](https://arrow.apache.org/docs/python/feather.html)
    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import feather, parquet

    table = feather.read_table(data_path)
    parquet.write_table(table, output_data_path)
Example #4
Source File: test_parquet.py From kartothek with MIT License | 6 votes |
def test_pyarrow_07992(store):
    key = "test.parquet"
    df = pd.DataFrame({"a": [1]})
    table = pa.Table.from_pandas(df)
    meta = b"""{
        "pandas_version": "0.20.3",
        "index_columns": ["__index_level_0__"],
        "columns": [
            {"metadata": null, "name": "a", "numpy_type": "int64", "pandas_type": "int64"},
            {"metadata": null, "name": null, "numpy_type": "int64", "pandas_type": "int64"}
        ],
        "column_indexes": [
            {"metadata": null, "name": null, "numpy_type": "object", "pandas_type": "string"}
        ]
    }"""
    table = table.replace_schema_metadata({b"pandas": meta})
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(key, buf.getvalue().to_pybytes())
    pdt.assert_frame_equal(DataFrameSerializer.restore_dataframe(store, key), df)
Example #5
Source File: test_advanced_3.py From ray with Apache License 2.0 | 6 votes |
def test_pandas_parquet_serialization():
    # Only test this if pandas is installed
    pytest.importorskip("pandas")

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    tempdir = tempfile.mkdtemp()
    filename = os.path.join(tempdir, "parquet-test")
    pd.DataFrame({"col1": [0, 1], "col2": [0, 1]}).to_parquet(filename)
    with open(os.path.join(tempdir, "parquet-compression"), "wb") as f:
        table = pa.Table.from_arrays([pa.array([1, 2, 3])], ["hello"])
        pq.write_table(table, f, compression="lz4")
    # Clean up
    shutil.rmtree(tempdir)
Example #6
Source File: _parquet.py From kartothek with MIT License | 6 votes |
def store(self, store, key_prefix, df):
    key = "{}.parquet".format(key_prefix)
    if isinstance(df, pa.Table):
        table = df
    else:
        table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()

    if (
        self.chunk_size
        and self.chunk_size < len(table)
        and not ARROW_LARGER_EQ_0150
    ):
        table = _reset_dictionary_columns(table)
    pq.write_table(
        table,
        buf,
        version=self._PARQUET_VERSION,
        chunk_size=self.chunk_size,
        compression=self.compression,
        coerce_timestamps="us",
    )
    store.put(key, buf.getvalue().to_pybytes())
    return key
Example #7
Source File: test_pyarrow_roundtrip.py From fletcher with MIT License | 5 votes |
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas())
Example #8
Source File: dataframe_bytes_storage.py From pyABC with BSD 3-Clause "New" or "Revised" License | 5 votes |
def df_to_bytes_parquet_(df: pd.DataFrame) -> bytes:
    """
    pyarrow parquet is the standard conversion method of pandas
    DataFrames since pyabc 0.9.14, because msgpack became
    deprecated in pandas 0.25.0.
    """
    b = BytesIO()
    table = pyarrow.Table.from_pandas(df)
    parquet.write_table(table, b)
    b.seek(0)
    return b.read()
Example #9
Source File: client.py From json2parquet with MIT License | 5 votes |
def write_parquet_dataset(data, destination, **kwargs):
    """
    data: PyArrow record batch
    destination: Output directory
    **kwargs: defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html

    This adds support for writing with partitions, compared with 'write_table'.
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])
    pq.write_to_dataset(table, destination, **kwargs)
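The docstring above names the key difference from write_table(): pq.write_to_dataset() can split the output into a directory tree keyed by column values. Below is a minimal sketch of that behaviour; the column name and output path are illustrative and not part of json2parquet.

# Partition the rows by the "year" column into output_dir/year=<value>/ files.
table = pa.Table.from_pandas(
    pd.DataFrame({"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]})
)
pq.write_to_dataset(table, "output_dir", partition_cols=["year"])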
Example #10
Source File: client.py From json2parquet with MIT License | 5 votes |
def write_parquet(data, destination, **kwargs):
    """
    data: PyArrow record batch
    destination: Output file name
    **kwargs: defined at https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html
    """
    try:
        table = pa.Table.from_batches(data)
    except TypeError:
        table = pa.Table.from_batches([data])
    pq.write_table(table, destination, **kwargs)
Example #11
Source File: protocols.py From bionic with Apache License 2.0 | 5 votes |
def write(self, df, path):
    self._check_no_duplicate_cols(df)
    if self._check_dtypes:
        self._check_no_categorical_cols(df)
    with path.open("wb") as file_:
        parquet.write_table(Table.from_pandas(df), file_)
Example #12
Source File: index.py From kartothek with MIT License | 5 votes |
def __getstate__(self):
    if not self.loaded:
        return (self.column, self.index_storage_key, self.dtype, None)

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    parquet_bytes = buf.getvalue().to_pybytes()
    # Since `self.dtype` will be inferred by parquet bytes, do not return
    # this argument during serialization to avoid unnecessary memory consumption
    return (self.column, self.index_storage_key, None, parquet_bytes)
Example #13
Source File: index.py From kartothek with MIT License | 5 votes |
def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file

    If compatible, the new keyname will be the name stored under the attribute
    `index_storage_key`. If this attribute is None, a new key will be generated of
    the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp is in nanosecond accuracy and is created upon Index object
    initialization

    Parameters
    ----------
    store:
    dataset_uuid:
    """
    storage_key = None

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
Example #14
Source File: gen_kde_pq.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def gen_kde(N, file_name):
    # np.random.seed(0)
    df = pd.DataFrame({'points': np.random.random(N)})
    table = pa.Table.from_pandas(df)
    row_group_size = 128
    pq.write_table(table, 'kde.parquet', row_group_size)
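Note that this snippet hard-codes the output path and leaves the file_name parameter unused; that is how it appears in the source. When calling write_table() directly, the row group size is usually passed by keyword, e.g.:

# Illustrative variant: write to the requested path, row group size as a keyword.
pq.write_table(table, file_name, row_group_size=128)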
Example #15
Source File: test_io.py From fletcher with MIT License | 5 votes |
def test_read_parquet(tmpdir, continuous):
    str_arr = pa.array(["a", None, "c"], pa.string())
    int_arr = pa.array([1, None, -2], pa.int32())
    bool_arr = pa.array([True, None, False], pa.bool_())
    table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])
    pq.write_table(table, "df.parquet")
    result = fr.read_parquet("df.parquet", continuous=continuous)
    expected = fr.pandas_from_arrow(table, continuous=continuous)
    tm.assert_frame_equal(result, expected)
Example #16
Source File: component.py From pipelines with Apache License 2.0 | 5 votes |
def convert_csv_to_apache_parquet(
    data_path: InputPath('CSV'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts CSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import csv, parquet

    table = csv.read_csv(data_path)
    parquet.write_table(table, output_data_path)
Example #17
Source File: component.py From pipelines with Apache License 2.0 | 5 votes |
def convert_tsv_to_apache_parquet(
    data_path: InputPath('TSV'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts TSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <alexey.volkov@ark-kun.com>
    '''
    from pyarrow import csv, parquet

    table = csv.read_csv(data_path, parse_options=csv.ParseOptions(delimiter='\t'))
    parquet.write_table(table, output_data_path)
Example #18
Source File: hydrofunctions.py From hydrofunctions with MIT License | 5 votes |
def save_parquet(filename, dataframe, hf_meta):
    table = pa.Table.from_pandas(dataframe, preserve_index=True)
    meta_dict = table.schema.metadata
    hf_string = json.dumps(hf_meta).encode()
    meta_dict[b"hydrofunctions_meta"] = hf_string
    table = table.replace_schema_metadata(meta_dict)
    pq.write_table(table, filename)
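To recover the embedded metadata later, the schema metadata can be read back from the file. A minimal sketch, not part of the hydrofunctions excerpt:

# Read the table and decode the custom metadata entry written above.
table = pq.read_table(filename)
hf_meta = json.loads(table.schema.metadata[b"hydrofunctions_meta"].decode())
dataframe = table.to_pandas()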
Example #19
Source File: util.py From PyAthena with MIT License | 5 votes |
def to_parquet(
    df,
    bucket_name,
    prefix,
    retry_config,
    session_kwargs,
    client_kwargs,
    compression=None,
    flavor="spark",
):
    import pyarrow as pa
    import pyarrow.parquet as pq

    session = Session(**session_kwargs)
    client = session.resource("s3", **client_kwargs)
    bucket = client.Bucket(bucket_name)
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf, compression=compression, flavor=flavor)
    response = retry_api_call(
        bucket.put_object,
        config=retry_config,
        Body=buf.getvalue().to_pybytes(),
        Key=prefix + str(uuid.uuid4()),
    )
    return "s3://{0}/{1}".format(response.bucket_name, response.key)
Example #20
Source File: parquet.py From ibis with Apache License 2.0 | 5 votes |
def insert(self, path, expr, **kwargs):
    path = self.root / path
    df = execute(expr)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, str(path))
Example #21
Source File: test_schema.py From ibis with Apache License 2.0 | 4 votes |
def parquet_schema():
    np.random.seed(0)
    size = 100
    df = pd.DataFrame(
        {
            'uint8': np.arange(size, dtype=np.uint8),
            'uint16': np.arange(size, dtype=np.uint16),
            'uint32': np.arange(size, dtype=np.uint32),
            'uint64': np.arange(size, dtype=np.uint64),
            'int8': np.arange(size, dtype=np.int16),
            'int16': np.arange(size, dtype=np.int16),
            'int32': np.arange(size, dtype=np.int32),
            'int64': np.arange(size, dtype=np.int64),
            'float32': np.arange(size, dtype=np.float32),
            'float64': np.arange(size, dtype=np.float64),
            'bool': np.random.randn(size) > 0,
            # TODO(wesm): Test other timestamp resolutions now that arrow
            # supports them
            'datetime': np.arange(
                "2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'
            ),
            'str': [str(x) for x in range(size)],
            'str_with_nulls': [None]
            + [str(x) for x in range(size - 2)]
            + [None],
            'empty_str': [''] * size,
            'bytes': [b'foo'] * size,
        },
        columns=[
            'uint8',
            'uint16',
            'uint32',
            'uint64',
            'int8',
            'int16',
            'int32',
            'int64',
            'float32',
            'float64',
            'bool',
            'datetime',
            'str',
            'str_with_nulls',
            'empty_str',
            'bytes',
        ],
    )

    with tempfile.TemporaryFile() as path:
        table = pa.Table.from_pandas(df)
        pq.write_table(table, path)
        parquet_file = pq.ParquetFile(path)
        return parquet_file.schema