Python pyarrow.BufferReader() Examples
The following are 6 code examples of pyarrow.BufferReader(), drawn from open-source projects. The source file and project for each example are noted above it.
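pa.BufferReader wraps a pyarrow.Buffer or any bytes-like object as a seekable, file-like input stream, so any pyarrow reader that expects a file can consume in-memory data. A minimal sketch:

import pyarrow as pa

# BufferReader turns bytes (or a pyarrow.Buffer) into a seekable,
# file-like stream.
reader = pa.BufferReader(b"hello world")
print(reader.read(5))   # b'hello'
reader.seek(0)          # rewind, like any file object
print(reader.size())    # 11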
Example #1
Source File: index.py From kartothek with MIT License
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This can be done much more efficiently but would take a lot more
    # time to implement, so this will only be done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only,
    # so arrow parses the type to `pa.timestamp("us")`. Since the values are
    # normalized to `numpy.datetime64[ns]` anyway, we do not care about this
    # and load the column type as `pa.timestamp("ns")`.
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out

    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type
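The ARROW_LARGER_EQ_0150 branch accounts for pyarrow 0.15 deprecating schema.field_by_name() in favor of schema.field(). For context, here is a minimal, self-contained sketch of how a buffer like index_buffer could be produced and read back entirely in memory; the column names are illustrative, not kartothek's actual internals:

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical stand-in for kartothek's index table: one column of index
# values and one list column of partition labels.
table = pa.table({"col": ["a", "b"], "partitions": [["p1"], ["p1", "p2"]]})
sink = pa.BufferOutputStream()
pq.write_table(table, sink)
index_buffer = sink.getvalue().to_pybytes()

# Read it back the same way _parquet_bytes_to_dict does.
restored = pq.read_table(pa.BufferReader(index_buffer))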
Example #2
Source File: server.py From mars with Apache License 2.0
def write_mutable_tensor(self, session_id, name, payload_type, body):
    import pyarrow
    from ..serialize import dataserializer
    from ..tensor.core import Indexes

    session_uid = SessionActor.gen_uid(session_id)
    session_ref = self.get_actor_ref(session_uid)

    index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
    index_json = json.loads(body[8:8 + index_json_size].decode('ascii'))
    index = Indexes.from_json(index_json).indexes

    if payload_type is None:
        value = dataserializer.loads(body[8 + index_json_size:])
    elif payload_type == 'tensor':
        tensor_chunk_offset = 8 + index_json_size
        with pyarrow.BufferReader(body[tensor_chunk_offset:]) as reader:
            value = pyarrow.read_tensor(reader).to_numpy()
    elif payload_type == 'record_batch':
        schema_size = np.frombuffer(
            body[8 + index_json_size:8 + index_json_size + 8], dtype=np.int64).item()
        schema_offset = 8 + index_json_size + 8
        with pyarrow.BufferReader(body[schema_offset:schema_offset + schema_size]) as reader:
            schema = pyarrow.read_schema(reader)
        record_batch_offset = schema_offset + schema_size
        with pyarrow.BufferReader(body[record_batch_offset:]) as reader:
            record_batch = pyarrow.read_record_batch(reader, schema)
        value = record_batch.to_pandas().to_records(index=False)
    else:
        raise ValueError('Not supported payload type: %s' % payload_type)

    return session_ref.write_mutable_tensor(name, index, value)
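The 'record_batch' branch reads a serialized schema and a serialized batch from adjacent slices of the body. A minimal sketch of producing and consuming such messages; newer pyarrow exposes these readers under pa.ipc rather than the top-level pyarrow.read_schema/read_record_batch used above:

import pyarrow as pa

# Serialize a schema and a record batch as separate IPC messages, the
# wire layout this handler expects.
batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])
schema_buf = batch.schema.serialize()  # IPC schema message
batch_buf = batch.serialize()          # IPC record-batch message (no schema)

with pa.BufferReader(schema_buf) as reader:
    schema = pa.ipc.read_schema(reader)
restored = pa.ipc.read_record_batch(batch_buf, schema)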
Example #3
Source File: dataset_view.py From QCPortal with BSD 3-Clause "New" or "Revised" License
def _deserialize(data: bytes, msgpacked_cols: List[str]) -> pd.DataFrame:
    """
    Data are returned as feather-packed pandas DataFrames.
    Due to limitations in pyarrow, some objects are msgpacked inside the DataFrame.
    """
    import pyarrow

    df = pd.read_feather(pyarrow.BufferReader(data))
    for col in msgpacked_cols:
        df[col] = df[col].apply(lambda element: deserialize(element, "msgpack-ext"))

    if "index" in df.columns:
        # pandas.to_feather does not support indexes, so we have to send
        # indexless frames over the wire, and set the index here.
        df.set_index("index", inplace=True)

    return df
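A sketch of producing the feather payload this helper consumes, using pyarrow's feather module to write into an in-memory sink; the column names are illustrative:

import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather

# Demote the index to a plain "index" column, as the comment in
# _deserialize describes, then feather-pack the frame in memory.
df = pd.DataFrame({"index": ["a", "b"], "value": [1, 2]})
sink = pa.BufferOutputStream()
feather.write_feather(df, sink)
payload = sink.getvalue().to_pybytes()

restored = pd.read_feather(pa.BufferReader(payload))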
Example #4
Source File: test_pyarrow_roundtrip.py From fletcher with MIT License
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas())
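Note that the to_pybytes() call above copies the buffer into a Python bytes object; BufferReader also accepts the pyarrow Buffer directly, which avoids that copy:

reader = pa.BufferReader(buf.getvalue())  # wraps the Buffer itself, no bytes copy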
Example #5
Source File: common_metadata.py From kartothek with MIT License
def _bytes2schema(data):
    reader = pa.BufferReader(data)
    schema = pq.read_schema(reader)
    fields = []
    for idx in range(len(schema)):
        f = schema[idx]
        # Schema data recovered from parquet always contains timestamp data
        # in us-granularity, but pandas will use ns-granularity, so we
        # re-align the two different worlds here.
        if f.type == pa.timestamp("us"):
            f = pa.field(f.name, pa.timestamp("ns"))
        fields.append(f)
    return pa.schema(fields, schema.metadata)
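One way the data bytes this helper parses could be produced is pq.write_metadata, which serializes just a schema's Parquet metadata into a sink; this is a sketch under that assumption, with an illustrative field name:

import pyarrow as pa
import pyarrow.parquet as pq

# Serialize only the Parquet metadata for a schema into memory.
schema = pa.schema([pa.field("ts", pa.timestamp("us"))])
sink = pa.BufferOutputStream()
pq.write_metadata(schema, sink)
data = sink.getvalue().to_pybytes()

# Recover it the same way _bytes2schema does.
recovered = pq.read_schema(pa.BufferReader(data))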
Example #6
Source File: dataset_view.py From QCFractal with BSD 3-Clause "New" or "Revised" License
def _deserialize(data: bytes, msgpacked_cols: List[str]) -> pd.DataFrame:
    """
    Data are returned as feather-packed pandas DataFrames.
    Due to limitations in pyarrow, some objects are msgpacked inside the DataFrame.
    """
    import pyarrow

    df = pd.read_feather(pyarrow.BufferReader(data))
    for col in msgpacked_cols:
        df[col] = df[col].apply(lambda element: deserialize(element, "msgpack-ext"))

    if "index" in df.columns:
        # pandas.to_feather does not support indexes, so we have to send
        # indexless frames over the wire, and set the index here.
        df.set_index("index", inplace=True)

    return df