Python pyarrow.py_buffer() Examples

The following are 10 code examples of pyarrow.py_buffer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyarrow, or try the search function.
Example #1
Source File: dataserializer.py    From mars with Apache License 2.0 6 votes vote down vote up
def loads(buf):
    """Deserialize an object from a buffer that begins with a file header.

    The header is parsed with ``read_file_header``. Everything after the
    first ``HEADER_LENGTH`` bytes is the payload, which is passed through the
    matching entry of ``decompressors`` unless the header's compression is
    ``CompressType.NONE``.

    :param buf: header followed by payload; must support ``memoryview()`` and
        slicing.
    :return: the deserialized object, produced by pyarrow when the header
        type is ``SerialType.ARROW`` and by ``pickle`` otherwise.
    """
    view = memoryview(buf)
    header = read_file_header(view)
    codec = header.compress

    if codec == CompressType.NONE:
        # Uncompressed payloads are sliced from the caller's object directly.
        payload = buf[HEADER_LENGTH:]
    else:
        payload = decompressors[codec](view[HEADER_LENGTH:])

    is_arrow = header.type == SerialType.ARROW
    if not is_arrow:
        # NOTE(review): pickle executes arbitrary code on load; callers are
        # presumably expected to pass trusted data only - confirm.
        return pickle.loads(payload)

    try:
        return pyarrow.deserialize(memoryview(payload), mars_serialize_context())
    except pyarrow.lib.ArrowInvalid:  # pragma: no cover
        # Fallback layout: a 4-byte int32 holding the size of a pickled
        # metadata block, the metadata itself, then the component buffers
        # back to back, with their sizes stored under 'buffer_sizes'.
        payload_view = memoryview(payload)
        meta_size = np.frombuffer(payload_view[:4], dtype='int32').item()
        meta_end = 4 + meta_size
        meta = pickle.loads(payload_view[4:meta_end])  # nosec
        sizes = meta.pop('buffer_sizes')
        # Cumulative sum gives the start/end position of every buffer.
        edges = np.cumsum([meta_end] + sizes)
        meta['data'] = [pyarrow.py_buffer(payload_view[lo:hi])
                        for lo, hi in zip(edges[:-1], edges[1:])]
        return pyarrow.deserialize_components(meta, mars_serialize_context())
Example #2
Source File: string.py    From fletcher with MIT License 6 votes vote down vote up
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    """Build a string array from the merged string data of ``a`` and ``b``.

    The validity bitmap comes from ``_merge_valid_bitmaps`` and the offsets
    and data are filled in by ``_merge_string_data`` (presumably an
    element-wise concatenation - confirm against that helper).

    Parameters
    ----------
    a, b
        Arrays of identical length.

    Returns
    -------
    pa.Array
        A new ``pa.string()`` array, or ``a`` itself when the inputs are empty.

    Raises
    ------
    ValueError
        If the two arrays differ in length.
    """
    n = len(a)
    if n != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)
    if not n:
        # Nothing to merge; hand the empty input back unchanged.
        return a

    valid = _merge_valid_bitmaps(a, b)

    out_offsets = np.empty(n + 1, dtype=np.int32)
    out_offsets[0] = 0

    # The output holds exactly the bytes referenced by both inputs.
    span_a = offsets_a[-1] - offsets_a[0]
    span_b = offsets_b[-1] - offsets_b[0]
    out_data = np.empty(span_a + span_b, dtype=np.uint8)

    _merge_string_data(
        n, valid, offsets_a, data_a, offsets_b, data_b, out_offsets, out_data
    )

    buffers = list(map(pa.py_buffer, (valid, out_offsets, out_data)))
    return pa.Array.from_buffers(pa.string(), n, buffers)
Example #3
Source File: bool.py    From fletcher with MIT License 6 votes vote down vote up
def or_na(arr: pa.Array) -> pa.Array:
    """Apply ``array | NA`` with a boolean pyarrow.Array.

    Without nulls in ``arr`` the data bitmap is reused as both the validity
    and the data buffer of the result. With nulls, ``_or_na`` fills a fresh
    bitmap that is likewise used for both buffers, and its return value is
    passed on as the null count.
    """
    n = len(arr)

    if arr.null_count != 0:
        # One bit per element, rounded up to whole bytes.
        bitmap = np.zeros((n + 7) // 8, dtype=np.uint8)
        bufs = arr.buffers()
        null_count = _or_na(n, arr.offset, bufs[0], bufs[1], bitmap)
        shared = pa.py_buffer(bitmap)
        return pa.Array.from_buffers(pa.bool_(), n, [shared, shared], null_count)

    values = arr.buffers()[1]
    return pa.Array.from_buffers(
        pa.bool_(),
        n,
        [values, values],
        null_count=-1,
        offset=arr.offset,
    )
Example #4
Source File: bool.py    From fletcher with MIT License 6 votes vote down vote up
def all_true_like(arr: pa.Array) -> pa.Array:
    """Return an all-True boolean array shaped like ``arr`` with the same valid bitmap.

    The validity buffer of ``arr`` is reused, sliced at the whole-byte part
    of ``arr.offset``. The remaining bit offset is carried over to the result
    so that validity and data bits stay aligned.
    """
    byte_offset, bit_offset = divmod(arr.offset, 8)

    validity = arr.buffers()[0]
    if validity:
        validity = validity.slice(byte_offset)

    # The data bitmap must cover the leading offset bits plus every element.
    n_bytes = (len(arr) + bit_offset + 7) // 8
    all_ones = np.full(n_bytes, fill_value=255, dtype=np.uint8)

    return pa.Array.from_buffers(
        pa.bool_(),
        len(arr),
        [validity, pa.py_buffer(all_ones)],
        null_count=arr.null_count,
        offset=bit_offset,
    )
Example #5
Source File: string.py    From fletcher with MIT License 5 votes vote down vote up
def shift_unaligned_bitmap(
    valid_buffer: pa.Buffer, offset: int, length: int
) -> pa.Buffer:
    """Shift an unaligned bitmap so that it starts at bit offset 0.

    A zeroed byte array large enough for ``length`` bits is allocated, filled
    by ``_shift_unaligned_bitmap`` and returned wrapped as a pyarrow buffer.
    """
    n_bytes = (length + 7) // 8  # bits rounded up to whole bytes
    shifted = np.zeros(n_bytes, dtype=np.uint8)
    _shift_unaligned_bitmap(valid_buffer, offset, length, shifted)
    return pa.py_buffer(shifted)
Example #6
Source File: string.py    From fletcher with MIT License 5 votes vote down vote up
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Test every element of ``data`` for the presence of the pattern ``pat``.

    The comparison is done byte by byte on the encoded pattern, so it does
    not depend on any locale or encoding.

    Returns a boolean array of the same length as ``data``. When ``data`` has
    nulls its validity bitmap is carried over, re-aligned to bit offset 0.
    """
    needle: bytes = pat.encode()
    n = len(data)

    # Bit-packed output: one bit per element, rounded up to whole bytes.
    n_bytes, tail_bits = divmod(n, 8)
    if tail_bits:
        n_bytes += 1
    output = np.empty(n_bytes, dtype=np.uint8)
    if tail_bits:
        # The buffer is uninitialised; clear the last byte so its unused
        # trailing bits are zero.
        output[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count == 0:
        valid_buffer = None
        _text_contains_case_sensitive_nonnull(
            n, offsets, data_buffer, needle, output
        )
    else:
        validity = data.buffers()[0]
        _text_contains_case_sensitive_nulls(
            n,
            _buffer_to_view(validity),
            data.offset,
            offsets,
            data_buffer,
            needle,
            output,
        )
        # Drop whole leading bytes by slicing; a leftover bit offset needs
        # the bitmap to be shifted explicitly.
        byte_offset, bit_offset = divmod(data.offset, 8)
        valid_buffer = validity.slice(byte_offset)
        if bit_offset != 0:
            valid_buffer = shift_unaligned_bitmap(valid_buffer, bit_offset, n)

    result_buffers = [valid_buffer, pa.py_buffer(output)]
    return pa.Array.from_buffers(pa.bool_(), n, result_buffers, data.null_count)
Example #7
Source File: bool.py    From fletcher with MIT License 5 votes vote down vote up
def or_array_nparray(a: pa.Array, b: np.ndarray) -> pa.Array:
    """Perform ``pa.Array | np.ndarray``.

    Returns a boolean array of ``len(a)`` elements. When ``a`` has no nulls
    the result has no validity buffer and a null count of 0. Otherwise the
    validity bitmap and null count are produced by
    ``bitmap_or_unaligned_with_numpy``.
    """
    n = len(a)
    n_bytes = (n + 7) // 8  # one bit per element, rounded up
    result = np.zeros(n_bytes, dtype=np.uint8)

    if a.null_count == 0:
        bitmap_or_unaligned_with_numpy_nonnull(
            n, a.buffers()[1], a.offset, b, result
        )
        validity = None
        null_count = 0
    else:
        valid_bits = np.zeros(n_bytes, dtype=np.uint8)
        null_count = bitmap_or_unaligned_with_numpy(
            n, a.buffers()[0], a.buffers()[1], a.offset, b, result, valid_bits
        )
        validity = pa.py_buffer(valid_bits)

    return pa.Array.from_buffers(
        pa.bool_(), n, [validity, pa.py_buffer(result)], null_count
    )
Example #8
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_arrow_file_does_not_validate(self):
    """A string column holding invalid UTF-8 must fail validation."""
    # value_offsets: the single item spans buffer offsets 0 to 1
    value_offsets = pyarrow.py_buffer(struct.pack("II", 0, 1))
    # data: one byte that is not valid UTF-8 on its own
    raw_data = pyarrow.py_buffer(b"\xc9")
    array = pyarrow.StringArray.from_buffers(1, value_offsets, raw_data)

    expected_message = "arrow-validate: --check-utf8 failed on column A"
    with arrow_file({"A": array}) as path:
        with self.assertRaisesRegex(InvalidArrowFile, expected_message):
            validate_arrow_file(path)
Example #9
Source File: test_dataio.py    From mars with Apache License 2.0 4 votes vote down vote up
def testArrowBufferIO(self):
    """Round-trip a random array through ``ArrowBufferIO`` for each codec.

    For every supported compression the test checks a complete read, a
    split read, a read in 128-byte chunks and a write in 128-byte chunks.
    """
    if not np:
        return
    from numpy.testing import assert_array_equal

    chunk_size = 128

    def open_reader(source, codec):
        # Fresh reader over ``source`` that compresses what it hands out.
        return ArrowBufferIO(pyarrow.py_buffer(source), 'r', compress_out=codec)

    codecs = (dataserializer.CompressType.LZ4, dataserializer.CompressType.GZIP)
    for compress in codecs:
        if compress not in dataserializer.get_supported_compressions():
            continue

        data = np.random.random((1000, 100))
        serialized = pyarrow.serialize(data).to_buffer()

        # complete read in one call
        reader = open_reader(serialized, compress)
        assert_array_equal(data, dataserializer.loads(reader.read()))

        # partial read: a first block followed by the remainder
        reader = open_reader(serialized, compress)
        head = reader.read(chunk_size)
        tail = reader.read()
        assert_array_equal(data, dataserializer.loads(head + tail))

        # read chunk by chunk until the reader is exhausted
        bio = BytesIO()
        reader = open_reader(serialized, compress)
        block = reader.read(chunk_size)
        while block:
            bio.write(block)
            block = reader.read(chunk_size)

        compressed = bio.getvalue()
        assert_array_equal(data, dataserializer.loads(compressed))

        # write the compressed bytes chunk by chunk into a pre-sized sink
        data_sink = bytearray(len(serialized))
        compressed_mv = memoryview(compressed)
        writer = ArrowBufferIO(pyarrow.py_buffer(data_sink), 'w')
        for start in range(0, len(compressed), chunk_size):
            # Slicing clamps at the end, so the last chunk may be shorter.
            writer.write(compressed_mv[start:start + chunk_size])

        assert_array_equal(data, pyarrow.deserialize(data_sink))
Example #10
Source File: bool.py    From fletcher with MIT License 4 votes vote down vote up
def or_array_array(a: pa.Array, b: pa.Array) -> pa.Array:
    """Perform ``pyarrow.Array | pyarrow.Array``.

    Four cases are handled, depending on which inputs contain nulls:

    * neither: plain bitmap OR, with no validity buffer and a null count of 0;
    * only ``b``: data is the bitmap OR, validity is ``a.data | b.valid_bits``;
    * only ``a``: delegated to the previous case with the arguments swapped;
    * both: ``masked_bitmap_or_unaligned`` fills data and validity and
      returns the null count.
    """
    n = len(a)
    n_bytes = (n + 7) // 8  # one bit per element, rounded up
    a_has_no_nulls = a.null_count == 0
    b_has_no_nulls = b.null_count == 0

    if not a_has_no_nulls:
        if b_has_no_nulls:
            # Reuse the "only the second operand has nulls" path.
            return or_array_array(b, a)

        result = np.zeros(n_bytes, dtype=np.uint8)
        valid_bits = np.zeros(n_bytes, dtype=np.uint8)
        null_count = masked_bitmap_or_unaligned(
            n,
            a.buffers()[0],
            a.buffers()[1],
            a.offset,
            b.buffers()[0],
            b.buffers()[1],
            b.offset,
            result,
            valid_bits,
        )
        return pa.Array.from_buffers(
            pa.bool_(),
            n,
            [pa.py_buffer(valid_bits), pa.py_buffer(result)],
            null_count,
        )

    # From here on ``a`` has no nulls; the data bitmap is the same either way.
    result = np.zeros(n_bytes, dtype=np.uint8)
    bitmap_or_unaligned(
        n, a.buffers()[1], a.offset, b.buffers()[1], b.offset, result
    )
    if b_has_no_nulls:
        return pa.Array.from_buffers(
            pa.bool_(), n, [None, pa.py_buffer(result)], 0
        )

    # b has nulls: every b(None) & a(True) position is valid and True, so
    # valid_bits = a.data or b.valid_bits
    valid_bits = np.zeros(n_bytes, dtype=np.uint8)
    bitmap_or_unaligned(
        n, a.buffers()[1], a.offset, b.buffers()[0], b.offset, valid_bits
    )
    return pa.Array.from_buffers(
        pa.bool_(), n, [pa.py_buffer(valid_bits), pa.py_buffer(result)]
    )