Python pyarrow.BufferReader() Examples

The following are 6 code examples of pyarrow.BufferReader(), taken from open-source projects. The source file, project, and license for each example are noted above it. You may also want to check out the other available functions and classes of the pyarrow module.
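Before the project examples, a minimal sketch of the API itself: pa.BufferReader wraps an in-memory bytes-like object (bytes, memoryview, or a pyarrow Buffer) as a readable, seekable file, so any pyarrow reader that expects a file can consume data that is already in memory. The snippet below is illustrative and not taken from any of the projects.

import pyarrow as pa

data = b"hello arrow"
reader = pa.BufferReader(data)  # zero-copy, file-like view over the bytes
assert reader.read(5) == b"hello"
reader.seek(0)                  # the reader is seekable
assert reader.read() == b"hello arrow"
reader.close()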
Example #1
Source File: index.py    From kartothek with MIT License
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This could be done much more efficiently, but that would take a lot
    # more time to implement, so it will only be done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only,
    # so arrow parses the type to `pa.timestamp("us")`. Since the values are
    # normalized to `numpy.datetime64[ns]` anyway, we do not care about this
    # and load the column type as `pa.timestamp("ns")`.
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out

    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type 
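For context, a hedged sketch of the inverse operation, assuming the index buffer is simply a Parquet-serialized two-column table (kartothek has its own writer; the function name and the "partition" column name here stand in for the module's _PARTITION_COLUMN_NAME):

import pyarrow as pa
import pyarrow.parquet as pq

def _dict_to_parquet_bytes(column, index_dct):
    # Hypothetical inverse of _parquet_bytes_to_dict, for illustration only.
    table = pa.table({
        column: list(index_dct.keys()),
        "partition": [list(v) for v in index_dct.values()],
    })
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    return sink.getvalue().to_pybytes()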
Example #2
Source File: server.py    From mars with Apache License 2.0
def write_mutable_tensor(self, session_id, name, payload_type, body):
    import pyarrow

    from ..serialize import dataserializer
    from ..tensor.core import Indexes
    session_uid = SessionActor.gen_uid(session_id)
    session_ref = self.get_actor_ref(session_uid)

    # The first 8 bytes of the body hold the size of the JSON-encoded indexes.
    index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
    index_json = json.loads(body[8:8+index_json_size].decode('ascii'))
    index = Indexes.from_json(index_json).indexes
    if payload_type is None:
        value = dataserializer.loads(body[8+index_json_size:])
    elif payload_type == 'tensor':
        tensor_chunk_offset = 8 + index_json_size
        with pyarrow.BufferReader(body[tensor_chunk_offset:]) as reader:
            value = pyarrow.read_tensor(reader).to_numpy()
    elif payload_type == 'record_batch':
        # An 8-byte schema size precedes the serialized schema and record batch.
        schema_size = np.frombuffer(body[8+index_json_size:8+index_json_size+8], dtype=np.int64).item()
        schema_offset = 8 + index_json_size + 8
        with pyarrow.BufferReader(body[schema_offset:schema_offset+schema_size]) as reader:
            schema = pyarrow.read_schema(reader)
        record_batch_offset = schema_offset + schema_size
        with pyarrow.BufferReader(body[record_batch_offset:]) as reader:
            record_batch = pyarrow.read_record_batch(reader, schema)
            value = record_batch.to_pandas().to_records(index=False)
    else:
        raise ValueError('Unsupported payload type: %s' % payload_type)
    return session_ref.write_mutable_tensor(name, index, value)
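To make the byte layout explicit, here is a hedged sketch of how a sender might assemble the 'record_batch' body parsed above (the real Mars client code may differ): an int64 with the JSON size, the JSON-encoded indexes, an int64 with the schema size, the IPC-serialized schema, and finally the serialized record batch.

import json
import numpy as np
import pyarrow

def build_record_batch_body(index_json, record_batch):
    # Illustrative only: mirrors the layout parsed in write_mutable_tensor.
    index_bytes = json.dumps(index_json).encode('ascii')
    schema_bytes = record_batch.schema.serialize().to_pybytes()
    batch_bytes = record_batch.serialize().to_pybytes()
    return (np.int64(len(index_bytes)).tobytes()
            + index_bytes
            + np.int64(len(schema_bytes)).tobytes()
            + schema_bytes
            + batch_bytes)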
Example #3
Source File: dataset_view.py    From QCPortal with BSD 3-Clause "New" or "Revised" License
def _deserialize(data: bytes, msgpacked_cols: List[str]) -> pd.DataFrame:
    """
    Data are returned as feather-packed pandas DataFrames.
    Due to limitations in pyarrow, some objects are msgpacked inside the DataFrame.
    """
    import pyarrow

    df = pd.read_feather(pyarrow.BufferReader(data))
    for col in msgpacked_cols:
        df[col] = df[col].apply(lambda element: deserialize(element, "msgpack-ext"))

    # pandas.to_feather does not support indexes, so we have to send
    # indexless frames over the wire and set the index here.
    if "index" in df.columns:
        df.set_index("index", inplace=True)
    return df
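A hedged sketch of the matching serializer, assuming the writer side uses pyarrow's feather support (the real QCPortal server code may differ, and serialize_msgpack is a hypothetical stand-in for the project's msgpack helper):

import pyarrow
import pyarrow.feather

def _serialize(df, msgpacked_cols):
    # Illustrative inverse of _deserialize.
    df = df.reset_index()  # feather cannot store an index; see the comment above
    for col in msgpacked_cols:
        df[col] = df[col].apply(serialize_msgpack)  # hypothetical msgpack helper
    sink = pyarrow.BufferOutputStream()
    pyarrow.feather.write_feather(df, sink)
    return sink.getvalue().to_pybytes()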
Example #4
Source File: test_pyarrow_roundtrip.py    From fletcher with MIT License
def test_parquet_roundtrip(array_type):
    df = pd.DataFrame({"col": array_type(["A", "B"])})
    table = pa.Table.from_pandas(df)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    reader = pa.BufferReader(buf.getvalue().to_pybytes())
    table = pq.read_table(reader)
    pdt.assert_frame_equal(df, table.to_pandas()) 
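One design note on this round trip: buf.getvalue() already returns a pyarrow Buffer, and pa.BufferReader accepts a Buffer directly, so the to_pybytes() call (which copies the data into a Python bytes object) could likely be dropped: reader = pa.BufferReader(buf.getvalue()).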
Example #5
Source File: common_metadata.py    From kartothek with MIT License
def _bytes2schema(data):
    reader = pa.BufferReader(data)
    schema = pq.read_schema(reader)
    fields = []
    for idx in range(len(schema)):
        f = schema[idx]

        # Schema data recovered from Parquet always contains timestamps in
        # us granularity, but pandas uses ns granularity, so we re-align the
        # two worlds here.
        if f.type == pa.timestamp("us"):
            f = pa.field(f.name, pa.timestamp("ns"))

        fields.append(f)
    return pa.schema(fields, schema.metadata) 
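Note that pq.read_schema expects a Parquet file, so the bytes handed to _bytes2schema must be a complete (typically metadata-only) Parquet file rather than a bare IPC schema message. A hedged sketch of the inverse (kartothek ships its own _schema2bytes; this is only an illustration):

def _schema2bytes(schema):
    # Illustrative inverse of _bytes2schema: write a metadata-only Parquet
    # file so that pq.read_schema can recover the schema from its footer.
    buf = pa.BufferOutputStream()
    pq.write_metadata(schema, buf)
    return buf.getvalue().to_pybytes()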
Example #6
Source File: dataset_view.py    From QCFractal with BSD 3-Clause "New" or "Revised" License
def _deserialize(data: bytes, msgpacked_cols: List[str]) -> pd.DataFrame:
    """
    Data are returned as feather-packed pandas DataFrames.
    Due to limitations in pyarrow, some objects are msgpacked inside the DataFrame.
    """
    import pyarrow

    df = pd.read_feather(pyarrow.BufferReader(data))
    for col in msgpacked_cols:
        df[col] = df[col].apply(lambda element: deserialize(element, "msgpack-ext"))

    # pandas.to_feather does not support indexes, so we have to send
    # indexless frames over the wire and set the index here.
    if "index" in df.columns:
        df.set_index("index", inplace=True)
    return df