Python pyarrow.py_buffer() Examples
The following are 10 code examples of pyarrow.py_buffer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
pyarrow, or try the search function.
Example #1
Source File: dataserializer.py From mars with Apache License 2.0 | 6 votes |
def loads(buf):
    """Deserialize a buffer produced by the matching serializer in this module.

    The buffer begins with a fixed-size file header (``HEADER_LENGTH`` bytes)
    describing the compression scheme and payload type; the rest is the
    (possibly compressed) payload.

    Parameters
    ----------
    buf : bytes-like
        Serialized buffer, including the file header.

    Returns
    -------
    object
        The deserialized value.
    """
    mv = memoryview(buf)  # zero-copy view over the raw input
    header = read_file_header(mv)
    compress = header.compress
    if compress == CompressType.NONE:
        # Payload stored verbatim right after the header.
        data = buf[HEADER_LENGTH:]
    else:
        # Decompress payload with the codec named in the header.
        data = decompressors[compress](mv[HEADER_LENGTH:])
    if header.type == SerialType.ARROW:
        try:
            return pyarrow.deserialize(memoryview(data), mars_serialize_context())
        except pyarrow.lib.ArrowInvalid:  # pragma: no cover
            # reconstruct value from buffers of arrow components; the fallback
            # layout appears to be:
            #   [4-byte int32 meta size][pickled meta][buffer 0][buffer 1]...
            data_view = memoryview(data)
            meta_block_size = np.frombuffer(data_view[0:4], dtype='int32').item()
            # NOTE(review): pickle.loads on serialized data — safe only if the
            # buffer comes from a trusted producer (hence the `nosec` marker).
            meta = pickle.loads(data_view[4:4 + meta_block_size])  # nosec
            buffer_sizes = meta.pop('buffer_sizes')
            # Cumulative byte bounds of each component buffer inside data_view.
            bounds = np.cumsum([4 + meta_block_size] + buffer_sizes)
            meta['data'] = [pyarrow.py_buffer(data_view[bounds[idx]:bounds[idx + 1]])
                            for idx in range(len(buffer_sizes))]
            return pyarrow.deserialize_components(meta, mars_serialize_context())
    else:
        return pickle.loads(data)
Example #2
Source File: string.py From fletcher with MIT License | 6 votes |
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    """Concatenate the strings of two equal-length arrays element-wise."""
    length = len(a)
    if length != len(b):
        raise ValueError("Lengths of arrays don't match")

    left_offsets, left_data = _extract_string_buffers(a)
    right_offsets, right_data = _extract_string_buffers(b)

    if length == 0:
        # Nothing to concatenate; the (empty) left operand is the result.
        return a

    merged_valid = _merge_valid_bitmaps(a, b)
    out_offsets = np.empty(length + 1, dtype=np.int32)
    out_offsets[0] = 0
    # Total output bytes = bytes used by a's strings + bytes used by b's.
    combined_size = (left_offsets[-1] - left_offsets[0]) + (
        right_offsets[-1] - right_offsets[0]
    )
    out_data = np.empty(combined_size, dtype=np.uint8)
    _merge_string_data(
        length,
        merged_valid,
        left_offsets,
        left_data,
        right_offsets,
        right_data,
        out_offsets,
        out_data,
    )
    result_buffers = [
        pa.py_buffer(part) for part in (merged_valid, out_offsets, out_data)
    ]
    return pa.Array.from_buffers(pa.string(), length, result_buffers)
Example #3
Source File: bool.py From fletcher with MIT License | 6 votes |
def or_na(arr: pa.Array) -> pa.Array:
    """Apply ``array | NA`` with a boolean pyarrow.Array."""
    n = len(arr)
    n_bytes = (n + 7) // 8  # bytes needed for a bit-packed buffer of n bits

    if arr.null_count == 0:
        # The data bitmap doubles as the validity bitmap here; null_count=-1
        # lets pyarrow compute the actual null count lazily.
        data_buf = arr.buffers()[1]
        return pa.Array.from_buffers(
            pa.bool_(),
            n,
            [data_buf, data_buf],
            null_count=-1,
            offset=arr.offset,
        )

    out = np.zeros(n_bytes, dtype=np.uint8)
    nulls = _or_na(n, arr.offset, arr.buffers()[0], arr.buffers()[1], out)
    shared = pa.py_buffer(out)
    # The same freshly computed buffer serves as both validity and data bitmap.
    return pa.Array.from_buffers(pa.bool_(), n, [shared, shared], nulls)
Example #4
Source File: bool.py From fletcher with MIT License | 6 votes |
def all_true_like(arr: pa.Array) -> pa.Array:
    """Return a boolean array with all-True with the same size as the input and the same valid bitmap."""
    validity = arr.buffers()[0]
    if validity:
        # Re-anchor the validity bitmap at the byte containing element 0.
        validity = validity.slice(arr.offset // 8)

    bit_offset = arr.offset % 8
    total_bits = len(arr) + bit_offset
    n_bytes = (total_bits + 7) // 8  # ceil(total_bits / 8)
    ones = np.full(n_bytes, fill_value=255, dtype=np.uint8)

    return pa.Array.from_buffers(
        pa.bool_(),
        len(arr),
        [validity, pa.py_buffer(ones)],
        arr.null_count,
        bit_offset,
    )
Example #5
Source File: string.py From fletcher with MIT License | 5 votes |
def shift_unaligned_bitmap(
    valid_buffer: pa.Buffer, offset: int, length: int
) -> pa.Buffer:
    """Shift an unaligned bitmap to be offsetted at 0."""
    num_bytes = (length + 7) // 8  # ceil(length / 8) output bytes
    shifted = np.zeros(num_bytes, dtype=np.uint8)
    _shift_unaligned_bitmap(valid_buffer, offset, length, shifted)
    return pa.py_buffer(shifted)
Example #6
Source File: string.py From fletcher with MIT License | 5 votes |
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array: """ Check for each element in the data whether it contains the pattern ``pat``. This implementation does basic byte-by-byte comparison and is independent of any locales or encodings. """ # Convert to UTF-8 bytes pat_bytes: bytes = pat.encode() # Initialise boolean (bit-packaed) output array. output_size = len(data) // 8 if len(data) % 8 > 0: output_size += 1 output = np.empty(output_size, dtype=np.uint8) if len(data) % 8 > 0: # Zero trailing bits output[-1] = 0 offsets, data_buffer = _extract_string_buffers(data) if data.null_count == 0: valid_buffer = None _text_contains_case_sensitive_nonnull( len(data), offsets, data_buffer, pat_bytes, output ) else: valid = _buffer_to_view(data.buffers()[0]) _text_contains_case_sensitive_nulls( len(data), valid, data.offset, offsets, data_buffer, pat_bytes, output ) valid_buffer = data.buffers()[0].slice(data.offset // 8) if data.offset % 8 != 0: valid_buffer = shift_unaligned_bitmap( valid_buffer, data.offset % 8, len(data) ) return pa.Array.from_buffers( pa.bool_(), len(data), [valid_buffer, pa.py_buffer(output)], data.null_count )
Example #7
Source File: bool.py From fletcher with MIT License | 5 votes |
def or_array_nparray(a: pa.Array, b: np.ndarray) -> pa.Array:
    """Perform ``pa.Array | np.ndarray``."""
    n = len(a)
    n_bytes = (n + 7) // 8  # bit-packed output size in bytes

    if a.null_count == 0:
        out = np.zeros(n_bytes, dtype=np.uint8)
        bitmap_or_unaligned_with_numpy_nonnull(n, a.buffers()[1], a.offset, b, out)
        # Every element is valid, so no validity buffer is required.
        return pa.Array.from_buffers(pa.bool_(), n, [None, pa.py_buffer(out)], 0)

    out = np.zeros(n_bytes, dtype=np.uint8)
    validity = np.zeros(n_bytes, dtype=np.uint8)
    nulls = bitmap_or_unaligned_with_numpy(
        n, a.buffers()[0], a.buffers()[1], a.offset, b, out, validity
    )
    return pa.Array.from_buffers(
        pa.bool_(), n, [pa.py_buffer(validity), pa.py_buffer(out)], nulls
    )
Example #8
Source File: test_validate.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def test_arrow_file_does_not_validate(self):
    """A string column containing non-UTF-8 bytes must fail validation."""
    # Build a one-element string array whose single value is the raw byte
    # 0xC9, which is not valid UTF-8.
    bad_array = pyarrow.StringArray.from_buffers(
        1,
        # value_offsets: first item spans buffer offsets 0 to 1
        pyarrow.py_buffer(struct.pack("II", 0, 1)),
        # data: a not-UTF8-safe character
        pyarrow.py_buffer(b"\xc9"),
    )
    with arrow_file({"A": bad_array}) as path, self.assertRaisesRegex(
        InvalidArrowFile, "arrow-validate: --check-utf8 failed on column A"
    ):
        validate_arrow_file(path)
Example #9
Source File: test_dataio.py From mars with Apache License 2.0 | 4 votes |
def testArrowBufferIO(self):
    """Exercise ArrowBufferIO read/write paths against dataserializer."""
    # numpy is optional in this test environment; skip silently if absent.
    if not np:
        return
    from numpy.testing import assert_array_equal

    for compress in [dataserializer.CompressType.LZ4,
                     dataserializer.CompressType.GZIP]:
        if compress not in dataserializer.get_supported_compressions():
            continue

        data = np.random.random((1000, 100))
        serialized = pyarrow.serialize(data).to_buffer()

        # test complete read
        reader = ArrowBufferIO(
            pyarrow.py_buffer(serialized), 'r', compress_out=compress)
        assert_array_equal(data, dataserializer.loads(reader.read()))

        # test partial read: a 128-byte chunk followed by the remainder
        # must round-trip to the same array.
        reader = ArrowBufferIO(
            pyarrow.py_buffer(serialized), 'r', compress_out=compress)
        block = reader.read(128)
        data_left = reader.read()
        assert_array_equal(data, dataserializer.loads(block + data_left))

        # test read by chunks
        bio = BytesIO()
        reader = ArrowBufferIO(
            pyarrow.py_buffer(serialized), 'r', compress_out=compress)
        while True:
            block = reader.read(128)
            if not block:
                break
            bio.write(block)

        # `compressed` is reused below by the write test.
        compressed = bio.getvalue()
        assert_array_equal(data, dataserializer.loads(compressed))

        # test write by chunks: feed the compressed stream 128 bytes at a
        # time into a writer backed by a pre-sized bytearray sink.
        data_sink = bytearray(len(serialized))
        compressed_mv = memoryview(compressed)
        writer = ArrowBufferIO(pyarrow.py_buffer(data_sink), 'w')
        pos = 0
        while pos < len(compressed):
            endpos = min(pos + 128, len(compressed))
            writer.write(compressed_mv[pos:endpos])
            pos = endpos

        # The sink should now contain the uncompressed serialized form.
        assert_array_equal(data, pyarrow.deserialize(data_sink))
Example #10
Source File: bool.py From fletcher with MIT License | 4 votes |
def or_array_array(a: pa.Array, b: pa.Array) -> pa.Array:
    """Perform ``pyarrow.Array | pyarrow.Array`` on boolean arrays.

    Null handling follows the comment in the original code: ``True | NA``
    yields ``True`` (positions where the non-null side is True stay valid),
    otherwise a null on either side propagates.

    Parameters
    ----------
    a, b
        Bit-packed boolean arrays; assumed to be of equal length
        (only ``len(a)`` is used for sizing — TODO confirm callers guarantee this).

    Returns
    -------
    pa.Array
        Element-wise logical OR of ``a`` and ``b``.
    """
    # Number of bytes needed for a bit-packed buffer of len(a) bits.
    output_length = len(a) // 8
    if len(a) % 8 != 0:
        output_length += 1

    if a.null_count == 0 and b.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(
            len(a), a.buffers()[1], a.offset, b.buffers()[1], b.offset, result
        )
        # No nulls on either side -> no validity buffer required.
        return pa.Array.from_buffers(
            pa.bool_(), len(a), [None, pa.py_buffer(result)], 0
        )
    elif a.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(
            len(a), a.buffers()[1], a.offset, b.buffers()[1], b.offset, result
        )
        # b has nulls, mark all occasions of b(None) & a(True) as True
        # -> valid_bits = a.data or b.valid_bits
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(
            len(a), a.buffers()[1], a.offset, b.buffers()[0], b.offset, valid_bits
        )
        # null_count omitted: pyarrow computes it lazily from valid_bits.
        # (An unreachable ``pass`` that followed this return was removed.)
        return pa.Array.from_buffers(
            pa.bool_(), len(a), [pa.py_buffer(valid_bits), pa.py_buffer(result)]
        )
    elif b.null_count == 0:
        # OR is commutative; reuse the "left side has no nulls" branch.
        return or_array_array(b, a)
    else:
        result = np.zeros(output_length, dtype=np.uint8)
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        null_count = masked_bitmap_or_unaligned(
            len(a),
            a.buffers()[0],
            a.buffers()[1],
            a.offset,
            b.buffers()[0],
            b.buffers()[1],
            b.offset,
            result,
            valid_bits,
        )
        return pa.Array.from_buffers(
            pa.bool_(),
            len(a),
            [pa.py_buffer(valid_bits), pa.py_buffer(result)],
            null_count,
        )