Python pyarrow.Array() Examples
The following are 30 code examples of pyarrow.Array().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyarrow, or try the search function.
Example #1
Source File: string.py From fletcher with MIT License | 6 votes |
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    """Build a new string array from the string buffers of ``a`` and ``b``.

    The validity bitmap, offsets and data of the result are filled in by the
    ``_merge_valid_bitmaps`` / ``_merge_string_data`` helpers.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same length.

    Returns
    -------
    pa.Array
        A ``pa.string()`` array of the same length as the inputs. When the
        inputs are empty, ``a`` itself is returned.
    """
    length = len(a)
    if length != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)

    # Nothing to merge for empty inputs.
    if not length > 0:
        return a

    valid = _merge_valid_bitmaps(a, b)

    # One offset per element plus the leading zero.
    result_offsets = np.empty(length + 1, dtype=np.int32)
    result_offsets[0] = 0

    # The output holds exactly the bytes referenced by both inputs.
    size_a = offsets_a[-1] - offsets_a[0]
    size_b = offsets_b[-1] - offsets_b[0]
    result_data = np.empty(size_a + size_b, dtype=np.uint8)

    _merge_string_data(
        length,
        valid,
        offsets_a,
        data_a,
        offsets_b,
        data_b,
        result_offsets,
        result_data,
    )

    buffers = [
        pa.py_buffer(valid),
        pa.py_buffer(result_offsets),
        pa.py_buffer(result_data),
    ]
    return pa.Array.from_buffers(pa.string(), length, buffers)
Example #2
Source File: base.py From fletcher with MIT License | 6 votes |
def __init__(self, array, dtype=None, copy: Optional[bool] = None):
    """Store ``array`` as a single ``pa.Array`` in ``self.data``.

    Parameters
    ----------
    array
        An array-like object or ``list`` (converted with ``pa.array``), a
        ``pa.Array`` (stored as-is) or a ``pa.ChunkedArray`` (its only chunk
        is used, or all chunks are concatenated).
    dtype
        Passed as ``type`` to ``pa.array`` for array-like / list input only.
    copy
        Currently unused.

    Raises
    ------
    ValueError
        If ``array`` is of none of the supported kinds.
    """
    # ``copy`` is not used at the moment. Its only effect will be when we
    # allow ``array`` to be a FletcherContinuousArray.
    if is_array_like(array) or isinstance(array, list):
        data = pa.array(array, type=dtype)
    elif isinstance(array, pa.Array):
        # TODO: Assert dtype
        data = array
    elif isinstance(array, pa.ChunkedArray):
        # TODO: Assert dtype
        single_chunk = array.num_chunks == 1
        data = (
            array.chunk(0)
            if single_chunk
            else pa.concat_arrays(array.iterchunks())
        )
    else:
        message = "Unsupported type passed for {}: {}".format(
            self.__class__.__name__, type(array)
        )
        raise ValueError(message)

    self.data = data
    self._dtype = FletcherContinuousDtype(self.data.type)
Example #3
Source File: bool.py From fletcher with MIT License | 6 votes |
def or_na(arr: pa.Array) -> pa.Array:
    """Apply ``array | NA`` with a boolean pyarrow.Array.

    Without nulls in the input, the value bitmap is reused as the validity
    bitmap of the result. Otherwise the ``_or_na`` helper fills a fresh bitmap
    that serves as both validity and value buffer.
    """
    size = len(arr)

    # Bytes needed to hold ``size`` bits, rounded up.
    output_length, leftover_bits = divmod(size, 8)
    if leftover_bits != 0:
        output_length += 1

    if arr.null_count != 0:
        output = np.zeros(output_length, dtype=np.uint8)
        null_count = _or_na(
            size, arr.offset, arr.buffers()[0], arr.buffers()[1], output
        )
        buf = pa.py_buffer(output)
        return pa.Array.from_buffers(pa.bool_(), size, [buf, buf], null_count)

    # null_count=-1 is passed so the count is not fixed here.
    return pa.Array.from_buffers(
        pa.bool_(),
        size,
        [arr.buffers()[1], arr.buffers()[1]],
        null_count=-1,
        offset=arr.offset,
    )
Example #4
Source File: chunking.py From fletcher with MIT License | 6 votes |
def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure.

    ``a`` is a plain ``pa.Array``. Depending on ``b`` the work is delegated to
    the ``"array_array"``, ``"array_scalar"`` or ``"array_nparray"`` entry of
    ``ops`` (falling back to ``_not_implemented_path``), or applied chunk by
    chunk when ``b`` is a ``pa.ChunkedArray``.

    Raises
    ------
    ValueError
        If ``b`` is not a scalar and its length differs from ``a``.
    """
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        offsets = _calculate_chunk_offsets(b)
        # Pair every chunk of ``b`` with the matching window of ``a``.
        return pa.chunked_array(
            [
                dispatch_chunked_binary_map(
                    a[offset : offset + len(chunk)], chunk, ops
                )
                for chunk, offset in zip(b.iterchunks(), offsets)
            ]
        )

    if isinstance(b, pa.Array):
        op_key = "array_array"
    elif np.isscalar(b):
        # Scalars have no length to validate.
        return ops.get("array_scalar", _not_implemented_path)(a, b)
    else:
        op_key = "array_nparray"

    if len(a) != len(b):
        raise ValueError("Inputs don't have the same length.")
    return ops.get(op_key, _not_implemented_path)(a, b)
Example #5
Source File: _algorithms.py From fletcher with MIT License | 6 votes |
def np_ufunc_array_array(a: pa.Array, b: pa.Array, op: Callable):
    """Apply ``op`` to the data buffers of ``a`` and ``b`` and wrap the result.

    The result is masked wherever either input reports a null; when neither
    input has nulls no mask is passed to ``pa.array``.
    """
    values_a = _extract_data_buffer_as_np_array(a)
    values_b = _extract_data_buffer_as_np_array(b)

    a_has_nulls = a.null_count > 0
    if a_has_nulls and b.null_count > 0:
        # TODO: Combine them before extracting
        nulls_a = extract_isnull_bytemap(a)
        nulls_b = extract_isnull_bytemap(b)
        mask = nulls_a | nulls_b
    elif a_has_nulls:
        mask = extract_isnull_bytemap(a)
    elif b.null_count > 0:
        mask = extract_isnull_bytemap(b)
    else:
        mask = None

    result = op(values_a, values_b)
    # Don't set type as we might have valid casts like int->float in truediv
    return pa.array(result, mask=mask)
Example #6
Source File: basic_stats_generator.py From data-validation with Apache License 2.0 | 6 votes |
def update(self, feature_array: pa.Array) -> None:
    """Update the partial string statistics using the input value.

    Adds to ``self.total_bytes_length``. Arrays of null type are ignored.
    """
    if pa.types.is_null(feature_array.type):
        return

    # Work on the flattened values of the (possibly nested) array.
    flat_values, _ = arrow_util.flatten_nested(feature_array)

    if arrow_util.is_binary_like(flat_values.type):
        # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
        # with Python3). To make sure we do cheaper integer arithmetic in
        # Python2, we first convert it to int.
        byte_size = array_util.GetBinaryArrayTotalByteSize(flat_values)
        self.total_bytes_length += int(byte_size)
        return

    # We can only convert flat_values to numpy when it's not empty.
    if not flat_values:
        return

    # This could be computed faster by taking log10 of the integer.
    string_length = np.vectorize(lambda s: len(str(s)), otypes=[np.int32])
    self.total_bytes_length += np.sum(string_length(np.asarray(flat_values)))
Example #7
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _ListArrayToTensor(
    self, list_array: pa.Array,
    produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
    """Converts a ListArray to a dense tensor.

    Args:
      list_array: the ListArray to convert. Its flattened values must contain
        exactly `len(list_array) * self._unbatched_flat_len` elements.
      produce_eager_tensors: if True, the result is passed through
        `tf.convert_to_tensor`; otherwise the numpy array is returned.

    Returns:
      The values reshaped to `self._shape` with the first dimension replaced
      by the batch size.

    Raises:
      ValueError: if the number of flattened values does not match.
    """
    batch_size = len(list_array)
    values = list_array.flatten()
    expected_num_elements = batch_size * self._unbatched_flat_len
    num_elements = len(values)
    if num_elements != expected_num_elements:
        raise ValueError(
            "Unable to convert ListArray {} to {}: size mismatch. expected {} "
            "elements but got {}".format(
                list_array, self.type_spec, expected_num_elements,
                num_elements))

    # The leading dimension of the stored shape is replaced by the batch size.
    actual_shape = list(self._shape)
    actual_shape[0] = batch_size

    convert_fn = self._convert_to_binary_fn
    if convert_fn is not None:
        values = convert_fn(values)

    values_np = np.asarray(values).reshape(actual_shape)
    return tf.convert_to_tensor(values_np) if produce_eager_tensors else values_np
Example #8
Source File: basic_stats_generator.py From data-validation with Apache License 2.0 | 6 votes |
def update(self,
           feature_array: pa.Array,
           presence_mask: np.ndarray,
           num_values: np.ndarray,
           num_values_not_none: np.ndarray,
           weights: Optional[np.ndarray]) -> None:
    """Updates the stats with a feature array.

    Args:
      feature_array: the array whose non-null entries are counted.
      presence_mask: index/mask applied to `weights` when accumulating the
        weighted non-missing count.
      num_values: per-entry value counts used for the weighted total.
      num_values_not_none: value counts used for min / max / total.
      weights: optional weights; must have the same size as `num_values`.

    Raises:
      ValueError: if `weights` is given and its size differs from `num_values`.
    """
    non_missing = len(feature_array) - feature_array.null_count
    self.num_non_missing += non_missing

    # Fold the new counts into the running extrema.
    self.max_num_values = np.maximum.reduce(
        num_values_not_none, initial=self.max_num_values)
    self.min_num_values = np.minimum.reduce(
        num_values_not_none, initial=self.min_num_values)
    self.total_num_values += np.sum(num_values_not_none)

    if weights is None:
        return
    if weights.size != num_values.size:
        raise ValueError('Weight feature must not be missing.')
    self.weighted_total_num_values += np.sum(num_values * weights)
    self.weighted_num_non_missing += np.sum(weights[presence_mask])
Example #9
Source File: chunking.py From fletcher with MIT License | 5 votes |
def apply_per_chunk(func):
    """Apply a function to each chunk if the input is chunked.

    The returned wrapper calls ``func`` directly for non-chunked input. For a
    ``pa.ChunkedArray`` it calls ``func`` once per chunk (forwarding the extra
    arguments) and reassembles the results with ``pa.chunked_array``.
    """

    @wraps(func)
    def wrapper(arr: Union[pa.Array, pa.ChunkedArray], *args, **kwargs):
        if not isinstance(arr, pa.ChunkedArray):
            return func(arr, *args, **kwargs)
        per_chunk_results = [
            func(chunk, *args, **kwargs) for chunk in arr.chunks
        ]
        return pa.chunked_array(per_chunk_results)

    return wrapper
Example #10
Source File: string.py From fletcher with MIT License | 5 votes |
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Check for each element in the data whether it contains the pattern ``pat``.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.

    Returns a boolean ``pa.Array`` of the same length and null count as
    ``data``.
    """
    length = len(data)

    # Convert to UTF-8 bytes
    pat_bytes: bytes = pat.encode()

    # Initialise boolean (bit-packed) output array: one bit per element,
    # rounded up to whole bytes.
    whole_bytes, trailing_bits = divmod(length, 8)
    output_size = (whole_bytes + 1) if trailing_bits > 0 else whole_bytes
    output = np.empty(output_size, dtype=np.uint8)
    if trailing_bits > 0:
        # Zero trailing bits
        output[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count != 0:
        valid = _buffer_to_view(data.buffers()[0])
        _text_contains_case_sensitive_nulls(
            length, valid, data.offset, offsets, data_buffer, pat_bytes, output
        )
        # Re-base the validity bitmap so that it starts at the array's offset:
        # whole bytes are skipped by slicing, remaining bits by shifting.
        valid_buffer = data.buffers()[0].slice(data.offset // 8)
        bit_offset = data.offset % 8
        if bit_offset != 0:
            valid_buffer = shift_unaligned_bitmap(
                valid_buffer, bit_offset, length
            )
    else:
        valid_buffer = None
        _text_contains_case_sensitive_nonnull(
            length, offsets, data_buffer, pat_bytes, output
        )

    result_buffers = [valid_buffer, pa.py_buffer(output)]
    return pa.Array.from_buffers(
        pa.bool_(), length, result_buffers, data.null_count
    )
Example #11
Source File: bool.py From fletcher with MIT License | 5 votes |
def any_op(arr: Union[pa.ChunkedArray, pa.Array], skipna: bool) -> bool:
    """Perform any() on a boolean Arrow structure.

    Chunked input is reduced chunk by chunk. For a plain array the work is
    delegated to a helper chosen by whether nulls are present and by
    ``skipna``.
    """
    if isinstance(arr, pa.ChunkedArray):
        return any(any_op(chunk, skipna) for chunk in arr.chunks)

    length = len(arr)
    if arr.null_count == 0:
        # Only the value bitmap is needed when nothing is null.
        return _any_op_nonnull(length, arr.buffers()[1])

    reducer = _any_op_skipna if skipna else _any_op
    return reducer(length, *arr.buffers())
Example #12
Source File: bool.py From fletcher with MIT License | 5 votes |
def all_op(arr: Union[pa.ChunkedArray, pa.Array], skipna: bool) -> bool:
    """Perform all() on a boolean Arrow structure.

    Chunked input is reduced chunk by chunk. ``skipna`` is only forwarded to
    the per-chunk calls; it does not influence the result for a plain array.
    """
    if isinstance(arr, pa.ChunkedArray):
        return all(all_op(chunk, skipna) for chunk in arr.chunks)

    length = len(arr)
    has_nulls = arr.null_count != 0
    if has_nulls:
        # skipna is not relevant in the Pandas behaviour
        return _all_op(length, *arr.buffers())
    return _all_op_nonnull(length, arr.buffers()[1])
Example #13
Source File: string.py From fletcher with MIT License | 5 votes |
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Apply ``_text_cat`` between ``a`` and every chunk of ``b``.

    ``a`` is sliced to the window matching each chunk of ``b``, using the
    offsets from ``_calculate_chunk_offsets``. The result has one chunk per
    chunk of ``b``.
    """
    offsets = _calculate_chunk_offsets(b)
    return pa.chunked_array(
        [
            _text_cat(a[offset : offset + len(chunk)], chunk)
            for chunk, offset in zip(b.iterchunks(), offsets)
        ]
    )
Example #14
Source File: string.py From fletcher with MIT License | 5 votes |
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Apply ``_text_cat`` to aligned slices of two chunked arrays.

    ``_combined_in_chunk_offsets`` yields, for each input, entries that are
    indexed here as ``(chunk index, start within chunk, slice length)``. The
    matching slices are passed to ``_text_cat`` pairwise.
    """

    def take_slice(chunked: pa.ChunkedArray, location) -> pa.Array:
        # location[0]: chunk index, location[1]: start, location[2]: length
        start = location[1]
        return chunked.chunk(location[0])[start : start + location[2]]

    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
    new_chunks: List[pa.Array] = [
        _text_cat(take_slice(a, a_offset), take_slice(b, b_offset))
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets)
    ]
    return pa.chunked_array(new_chunks)
Example #15
Source File: bool.py From fletcher with MIT License | 5 votes |
def all_true(arr: pa.Array) -> pa.Array:
    """Return an all-True, all-valid boolean array of the same size as ``arr``.

    A single buffer with every bit set is used as both the validity and the
    value bitmap; the null count is given as 0.
    """
    size = len(arr)
    # Bytes needed to hold ``size`` bits, rounded up.
    n_bytes = size // 8 + (1 if size % 8 != 0 else 0)
    all_bits_set = np.full(n_bytes, 255, dtype=np.uint8)
    buf = pa.py_buffer(all_bits_set)
    return pa.Array.from_buffers(pa.bool_(), size, [buf, buf], 0)
Example #16
Source File: bool.py From fletcher with MIT License | 5 votes |
def or_array_nparray(a: pa.Array, b: np.ndarray) -> pa.Array:
    """Perform ``pa.Array | np.ndarray``.

    Without nulls in ``a`` the result carries no validity buffer and a null
    count of 0. Otherwise a validity bitmap is filled by the helper, which
    also returns the null count of the result.
    """
    size = len(a)
    # Bytes needed to hold ``size`` bits, rounded up.
    output_length, leftover_bits = divmod(size, 8)
    if leftover_bits != 0:
        output_length += 1

    result = np.zeros(output_length, dtype=np.uint8)

    if a.null_count != 0:
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        null_count = bitmap_or_unaligned_with_numpy(
            size, a.buffers()[0], a.buffers()[1], a.offset, b, result, valid_bits
        )
        buffers = [pa.py_buffer(valid_bits), pa.py_buffer(result)]
        return pa.Array.from_buffers(pa.bool_(), size, buffers, null_count)

    bitmap_or_unaligned_with_numpy_nonnull(
        size, a.buffers()[1], a.offset, b, result
    )
    return pa.Array.from_buffers(
        pa.bool_(), size, [None, pa.py_buffer(result)], 0
    )
Example #17
Source File: string.py From fletcher with MIT License | 5 votes |
def _text_cat_chunked(a: Any, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Reject unsupported first arguments.

    This variant always raises ``NotImplementedError``.
    """
    message = (
        "_text_cat_chunked is only implemented for pa.Array and pa.ChunkedArray"
    )
    raise NotImplementedError(message)
Example #18
Source File: bool.py From fletcher with MIT License | 5 votes |
def or_vectorised(a: Union[pa.Array, pa.ChunkedArray], b: Any):
    """Perform OR on a boolean Arrow structure and a second operand.

    Only array/array and array/NumPy-array implementations are registered
    here; the result is whatever ``dispatch_chunked_binary_map`` returns.
    """
    # Scalar should be handled by or_na or all_true
    implementations = dict(
        array_array=or_array_array,
        array_nparray=or_array_nparray,
    )
    return dispatch_chunked_binary_map(a, b, implementations)
Example #19
Source File: _algorithms.py From fletcher with MIT License | 5 votes |
def np_ufunc_scalar_array(a: Any, b: pa.Array, op: Callable):
    """Apply ``op(a, b)`` where only ``b`` is an Arrow array.

    ``a`` is non-masked, either array-like or scalar; a scalar is wrapped with
    ``np.array`` first. The null bytemap of ``b`` becomes the mask of the
    result.
    """
    # numpy can handle all types of b from here
    values_b = _extract_data_buffer_as_np_array(b)
    mask = extract_isnull_bytemap(b)

    left = np.array(a) if np.isscalar(a) else a

    # Don't set type as we might have valid casts like int->float in truediv
    return pa.array(op(left, values_b), mask=mask)
Example #20
Source File: chunking.py From fletcher with MIT License | 5 votes |
def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure.

    ``a`` is a ``pa.ChunkedArray``. The operation is dispatched per chunk (or
    per aligned slice when ``b`` is chunked as well) and the partial results
    are reassembled into a new chunked array.

    Raises
    ------
    ValueError
        If ``b`` is not a scalar and its length differs from ``a``.
    """
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)
        new_chunks: List[pa.Array] = []
        for a_loc, b_loc in zip(in_a_offsets, in_b_offsets):
            # Each entry is indexed as (chunk index, start, slice length).
            a_start, b_start = a_loc[1], b_loc[1]
            a_slice = a.chunk(a_loc[0])[a_start : a_start + a_loc[2]]
            b_slice = b.chunk(b_loc[0])[b_start : b_start + b_loc[2]]
            new_chunks.append(dispatch_chunked_binary_map(a_slice, b_slice, ops))
        return pa.chunked_array(new_chunks)

    if np.isscalar(b):
        # The same scalar is combined with every chunk.
        return pa.chunked_array(
            [dispatch_chunked_binary_map(chunk, b, ops) for chunk in a.iterchunks()]
        )

    if len(a) != len(b):
        raise ValueError("Inputs don't have the same length.")
    offsets = _calculate_chunk_offsets(a)
    # Pair every chunk of ``a`` with the matching window of ``b``.
    return pa.chunked_array(
        [
            dispatch_chunked_binary_map(chunk, b[offset : offset + len(chunk)], ops)
            for chunk, offset in zip(a.iterchunks(), offsets)
        ]
    )
Example #21
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def series_to_arrow_array(series: pd.Series) -> pyarrow.Array:
    """
    Convert a Pandas series to an in-memory Arrow array.

    A series exposing the ``cat`` accessor becomes a dictionary array built
    from its codes and (recursively converted) categories. Any other series is
    converted with the Arrow type derived from its dtype.
    """
    if not hasattr(series, "cat"):
        arrow_type = _dtype_to_arrow_type(series.dtype)
        return pyarrow.array(series, type=arrow_type)

    # Pandas categorical value "-1" means None
    indices = pyarrow.Array.from_pandas(
        series.cat.codes, mask=(series.cat.codes == -1)
    )
    dictionary = series_to_arrow_array(series.cat.categories)
    return pyarrow.DictionaryArray.from_arrays(indices, dictionary)
Example #22
Source File: array_util.py From tfx-bsl with Apache License 2.0 | 5 votes |
def ToSingletonListArray(array: pa.Array):
    """Converts an array of `type` to a `ListArray<type>`.

    Where result[i] is null if array[i] is null; [array[i]] otherwise.

    Args:
      array: an arrow Array.

    Returns:
      a ListArray.
    """
    array_size = len(array)
    # fast path: values are not copied.
    if array.null_count == 0:
        return pa.ListArray.from_arrays(
            pa.array(np.arange(0, array_size + 1, dtype=np.int32)), array)

    # null_mask[i] = 1 iff array[i] is null.
    null_mask = np.asarray(GetArrayNullBitmapAsByteArray(array))
    # presence_mask[i] = 0 iff array[i] is null
    presence_mask = np.subtract(1, null_mask, dtype=np.uint8)
    # Each present element contributes one value, so the running sum of the
    # presence mask gives the list offsets.
    offsets_np = np.zeros((array_size + 1,), np.int32)
    np.cumsum(presence_mask, out=offsets_np[1:])

    # This is the null mask over offsets (but ListArray.from_arrays() uses it as
    # the null mask for the ListArray), so its length is array_size +1, but the
    # last element is always False.
    # NOTE: `np.bool_` is used instead of the `np.bool` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    list_array_null_mask = np.zeros((array_size + 1,), np.bool_)
    list_array_null_mask[:array_size] = null_mask.view(np.bool_)
    values_non_null = array.take(pa.array(np.flatnonzero(presence_mask)))
    return pa.ListArray.from_arrays(
        pa.array(offsets_np, mask=list_array_null_mask), values_non_null)
Example #23
Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0 | 5 votes |
def convert(self, tensor: TensorAlike) -> List[pa.Array]:
    """Converts `tensor` after checking it against this handler's type spec.

    Args:
      tensor: the TensorAlike to be converted.

    Returns:
      The result of `self._convert_internal(tensor)`.

    Raises:
      TypeError: if `self._type_spec` is not compatible with `tensor`.
    """
    if self._type_spec.is_compatible_with(tensor):
        return self._convert_internal(tensor)
    raise TypeError("Expected {} but got {}".format(
        self._type_spec, _get_type_spec(tensor)))
Example #24
Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _convert_internal(self, tensor: TensorAlike) -> List[pa.Array]:
    """Converts the given TensorAlike to a list of pa.Arrays.

    Each element in the list should correspond to one in `arrow_fields()`.

    This version has no body beyond its docstring and therefore returns None;
    presumably concrete handlers override it (TODO confirm).

    Args:
      tensor: the TensorAlike to be converted.
    """
Example #25
Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _convert_internal(self, tensor: TensorAlike) -> List[pa.Array]:
    """Converts a sparse tensor to a single-element list holding a ListArray.

    The tensor is first turned into a ragged tensor. Its row splits become the
    int32 list offsets and its values become the list values, typed as
    `self._values_arrow_type`.
    """
    ragged = tf.RaggedTensor.from_sparse(tensor)
    offsets = pa.array(ragged.row_splits.numpy(), type=pa.int32())
    values = pa.array(ragged.values.numpy(), type=self._values_arrow_type)
    return [pa.ListArray.from_arrays(offsets, values)]
Example #26
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _GetConvertToBinaryFn(
    array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
    """Returns a function that converts a StringArray to BinaryArray.

    Args:
      array_type: the Arrow type to inspect.

    Returns:
      A callable viewing its argument as `binary` (for string types) or
      `large_binary` (for large string types); None for any other type.
    """

    def _view_as_binary(array):
        return array.view(pa.binary())

    def _view_as_large_binary(array):
        return array.view(pa.large_binary())

    if pa.types.is_string(array_type):
        return _view_as_binary
    if pa.types.is_large_string(array_type):
        return _view_as_large_binary
    return None
Example #27
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0 | 5 votes |
def CreateRawRecordColumn(
    raw_records: List[bytes], produce_large_types: bool) -> pa.Array:
    """Returns an Array that satisfies the requirement of a raw record column.

    Every record is wrapped in its own single-element list.

    Args:
      raw_records: the serialized records.
      produce_large_types: if True, a LargeListArray of large_binary is built;
        otherwise a ListArray of binary.
    """
    if produce_large_types:
        list_array_factory = pa.LargeListArray.from_arrays
        binary_type = pa.large_binary()
    else:
        list_array_factory = pa.ListArray.from_arrays
        binary_type = pa.binary()

    # Offsets 0, 1, ..., len(raw_records): one value per list.
    offsets = np.arange(0, len(raw_records) + 1, dtype=np.int64)
    values = pa.array(raw_records, type=binary_type)
    return list_array_factory(offsets, values)
Example #28
Source File: bool.py From coffeegrindsize with MIT License | 5 votes |
def from_array(cls, arr):
    """Build an instance of ``cls`` from a ``pa.Array``.

    The array is wrapped in a one-chunk chunked array before being passed to
    ``cls``. ``arr`` must be a ``pa.Array`` (checked with ``assert``).
    """
    assert isinstance(arr, pa.Array)
    single_chunk = pa.chunked_array([arr])
    return cls(single_chunk)
Example #29
Source File: arrow_util.py From data-validation with Apache License 2.0 | 5 votes |
def is_binary_like(data_type: pa.DataType) -> bool:
    """Returns true if an Arrow type is binary-like.

    Qualified types are the types of {Large,}BinaryArray and
    {Large,}StringArray.

    Args:
      data_type: a pa.DataType.

    Returns:
      bool.
    """
    binary_like_checks = (
        pa.types.is_binary,
        pa.types.is_large_binary,
        pa.types.is_unicode,
        pa.types.is_large_unicode,
    )
    return any(check(data_type) for check in binary_like_checks)
Example #30
Source File: basic_stats_generator.py From data-validation with Apache License 2.0 | 5 votes |
def update(self, feature_array: pa.Array) -> None:
    """Update the partial bytes statistics using the input value.

    Updates `self.min_num_bytes`, `self.max_num_bytes` and
    `self.total_num_bytes`. Arrays of null type and empty arrays leave the
    stats untouched.

    Raises:
      ValueError: if the flattened values are of integer or floating type.
    """
    if pa.types.is_null(feature_array.type):
        return

    # Work on the flattened values of the (possibly nested) array.
    flat_values, _ = arrow_util.flatten_nested(feature_array)
    value_type = flat_values.type
    if pa.types.is_floating(value_type) or pa.types.is_integer(value_type):
        raise ValueError('Bytes stats cannot be computed on INT/FLOAT features.')

    if not flat_values:
        return

    num_bytes = array_util.GetElementLengths(flat_values).to_numpy()
    self.min_num_bytes = min(self.min_num_bytes, np.min(num_bytes))
    self.max_num_bytes = max(self.max_num_bytes, np.max(num_bytes))
    self.total_num_bytes += np.sum(num_bytes)