Python pyarrow.ChunkedArray() Examples
The following are 27 code examples of pyarrow.ChunkedArray(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the pyarrow module.
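Before the examples, here is a minimal sketch of the ChunkedArray basics that the snippets below rely on: constructing a chunked array from plain Python lists and inspecting its chunks. Nothing here is specific to any of the projects listed.

import pyarrow as pa

# A ChunkedArray presents a list of pa.Array chunks as one logical array.
arr = pa.chunked_array([[1, 2, 3], [4, 5]])
assert arr.num_chunks == 2
assert len(arr) == 5
assert arr.chunk(1).equals(pa.array([4, 5]))

# An explicit type is required when there are no chunks to infer it from.
empty = pa.chunked_array([], type=pa.int64())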
Example #1
Source File: chunking.py From fletcher with MIT License
def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(a[offset : offset + len(chunk)], chunk, ops)
            )
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        return ops.get("array_array", _not_implemented_path)(a, b)
    else:
        if np.isscalar(b):
            return ops.get("array_scalar", _not_implemented_path)(a, b)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            return ops.get("array_nparray", _not_implemented_path)(a, b)
Example #2
Source File: base.py From fletcher with MIT License
def __init__(self, array, dtype=None, copy=None):
    # Copy is not used at the moment. Its only effect will be when we
    # allow array to be a FletcherChunkedArray.
    if is_array_like(array) or isinstance(array, list):
        self.data = pa.chunked_array([pa.array(array, type=dtype)])
    elif isinstance(array, pa.Array):
        # ARROW-7008: pyarrow.chunked_array([array]) fails on array with all-None buffers
        if len(array) == 0 and all(b is None for b in array.buffers()):
            array = pa.array([], type=array.type)
        # TODO: Assert dtype
        self.data = pa.chunked_array([array])
    elif isinstance(array, pa.ChunkedArray):
        # TODO: Assert dtype
        self.data = array
    else:
        raise ValueError(
            "Unsupported type passed for {}: {}".format(
                self.__class__.__name__, type(array)
            )
        )
    self._dtype = FletcherChunkedDtype(self.data.type)
    self.offsets = self._calculate_chunk_offsets()
Example #3
Source File: string_array.py From fletcher with MIT License
def _call_x_with(self, impl, needle, na=None):
    needle = NumbaString.make(needle)  # type: ignore
    result = np.zeros(len(self.data), dtype=np.uint8)

    if isinstance(self.data, pa.ChunkedArray):
        offset = 0
        for chunk in self.data.chunks:
            str_arr = NumbaStringArray.make(chunk)  # type: ignore
            impl(str_arr, needle, 2, offset, result)
            offset += len(chunk)
    else:
        str_arr = NumbaStringArray.make(self.data)  # type: ignore
        impl(str_arr, needle, 2, 0, result)

    return pd.Series(
        type(self.obj.values)(pa.array(result.astype(bool), mask=(result == 2)))
    )
Example #4
Source File: string.py From fletcher with MIT License
def _text_cat_chunked(a: Any, b: pa.ChunkedArray) -> pa.ChunkedArray:
    raise NotImplementedError(
        "_text_cat_chunked is only implemented for pa.Array and pa.ChunkedArray"
    )
Example #5
Source File: bool.py From coffeegrindsize with MIT License
def __init__(self, values):
    if not isinstance(values, pa.ChunkedArray):
        raise ValueError

    assert values.type == pa.bool_()
    self._data = values
    self._dtype = ArrowBoolDtype()
Example #6
Source File: util.py From cjworkbench with GNU Affero General Public License v3.0
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    if pyarrow.types.is_floating(column.type) or pyarrow.types.is_integer(column.type):
        column_type = ColumnType.Number("{:,}")
    elif pyarrow.types.is_timestamp(column.type):
        column_type = ColumnType.Datetime()
    elif pyarrow.types.is_string(column.type) or pyarrow.types.is_dictionary(
        column.type
    ):
        column_type = ColumnType.Text()
    else:
        raise RuntimeError("Unknown column type %r" % column.type)
    return Column(name, column_type)
Example #7
Source File: chunking.py From fletcher with MIT License
def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
            new_chunks.append(dispatch_chunked_binary_map(a_slice, b_slice, ops))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(dispatch_chunked_binary_map(chunk, b, ops))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(chunk, b[offset : offset + len(chunk)], ops)
            )
        return pa.chunked_array(new_chunks)
Example #8
Source File: chunking.py From fletcher with MIT License
def dispatch_chunked_binary_map(a: Any, b: Any, ops: Dict[str, Callable]):
    """
    Apply a map-like binary function where at least one of the arguments is an Arrow structure.

    This will yield a pyarrow.Array or pyarrow.ChunkedArray as an output.

    Parameters
    ----------
    a: scalar or np.ndarray or pa.Array or pa.ChunkedArray
    b: scalar or np.ndarray or pa.Array or pa.ChunkedArray
    ops: dict
        Dictionary with the keys ('array_array', 'array_nparray',
        'nparray_array', 'array_scalar', 'scalar_array')
    """
    # a is neither a pa.Array nor a pa.ChunkedArray; we expect only np.ndarray or scalars.
    if isinstance(b, pa.ChunkedArray):
        if np.isscalar(a):
            new_chunks = []
            for chunk in b.iterchunks():
                new_chunks.append(dispatch_chunked_binary_map(a, chunk, ops))
            return pa.chunked_array(new_chunks)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            new_chunks = []
            offsets = _calculate_chunk_offsets(b)
            for chunk, offset in zip(b.iterchunks(), offsets):
                new_chunks.append(
                    dispatch_chunked_binary_map(
                        a[offset : offset + len(chunk)], chunk, ops
                    )
                )
            return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if np.isscalar(a):
            return ops.get("scalar_array", _not_implemented_path)(a, b)
        else:
            return ops.get("nparray_array", _not_implemented_path)(a, b)
    else:
        # Should never be reached, add a safe-guard
        raise NotImplementedError(f"Cannot apply ufunc on {type(a)} and {type(b)}")
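A usage sketch for the dispatcher: the ops mapping below is a hypothetical stand-in for fletcher's real kernels (such as or_array_array in Example #12). With a NumPy array on the left and a ChunkedArray on the right, the NumPy input is split at the chunk boundaries and the 'nparray_array' kernel runs once per chunk.

import numpy as np
import pyarrow as pa

# Hypothetical kernel: add a NumPy slice to an Arrow chunk, returning a pa.Array.
ops = {
    "nparray_array": lambda np_arr, arr: pa.array(
        np_arr + arr.to_numpy(zero_copy_only=False)
    )
}

a = np.array([10, 20, 30, 40])
b = pa.chunked_array([[1, 2], [3, 4]])

result = dispatch_chunked_binary_map(a, b, ops)
# -> pa.chunked_array([[11, 22], [33, 44]]), chunked like b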
Example #9
Source File: chunking.py From fletcher with MIT License
def _combined_in_chunk_offsets(
    a: pa.ChunkedArray, b: pa.ChunkedArray
) -> Tuple[List[Tuple[int, int, int]], List[Tuple[int, int, int]]]:
    offsets_a = _calculate_chunk_offsets(a)
    offsets_b = _calculate_chunk_offsets(b)
    offsets = sorted(set(list(offsets_a) + list(offsets_b)))
    in_a_offsets = _in_chunk_offsets(a, offsets)
    in_b_offsets = _in_chunk_offsets(b, offsets)
    return in_a_offsets, in_b_offsets
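A worked example with two differently chunked arrays of the same length (the chunkings are arbitrary; this is a sketch of the expected output). Matching positions in the two result lists describe ranges of equal length, so callers can walk both arrays in lockstep without materialising a re-chunked copy:

import pyarrow as pa

a = pa.chunked_array([[1, 2, 3], [4, 5]])    # chunk lengths 3, 2
b = pa.chunked_array([[1], [2, 3], [4, 5]])  # chunk lengths 1, 2, 2

in_a, in_b = _combined_in_chunk_offsets(a, b)
# in_a == [(0, 0, 1), (0, 1, 2), (1, 0, 2)]
# in_b == [(0, 0, 1), (1, 0, 2), (2, 0, 2)]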
Example #10
Source File: chunking.py From fletcher with MIT License
def _in_chunk_offsets(
    arr: pa.ChunkedArray, offsets: List[int]
) -> List[Tuple[int, int, int]]:
    """Calculate the access ranges for a given list of offsets.

    All chunk start indices must be included as offsets and the offsets must be
    unique.

    Returns a list of tuples that contain:
     * The index of the given chunk
     * The position inside the chunk
     * The length of the current range
    """
    new_offsets = []
    pos = 0
    chunk = 0
    chunk_pos = 0
    for offset, offset_next in zip(offsets, offsets[1:] + [len(arr)]):
        diff = offset - pos
        chunk_remains = len(arr.chunk(chunk)) - chunk_pos
        step = offset_next - offset
        if diff == 0:  # The first offset
            new_offsets.append((chunk, chunk_pos, step))
        elif diff == chunk_remains:
            chunk += 1
            chunk_pos = 0
            pos += chunk_remains
            new_offsets.append((chunk, chunk_pos, step))
        else:  # diff < chunk_remains
            chunk_pos += diff
            pos += diff
            new_offsets.append((chunk, chunk_pos, step))
    return new_offsets
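A small worked example of the (chunk index, position in chunk, range length) triples, assuming a two-chunk array:

import pyarrow as pa

arr = pa.chunked_array([[1, 2], [3, 4, 5]])  # chunk lengths 2 and 3
_in_chunk_offsets(arr, [0, 2, 3])
# -> [(0, 0, 2), (1, 0, 1), (1, 1, 2)]
# i.e. chunk 0 from position 0 for 2 values, then chunk 1 from position 0
# for 1 value, then chunk 1 from position 1 for the remaining 2 values.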
Example #11
Source File: chunking.py From fletcher with MIT License
def _calculate_chunk_offsets(chunked_array: pa.ChunkedArray) -> np.ndarray:
    """Return an array holding the indices pointing to the first element of each chunk."""
    offset = 0
    offsets = []
    for chunk in chunked_array.iterchunks():
        offsets.append(offset)
        offset += len(chunk)
    return np.array(offsets)
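A quick check of this helper on a three-chunk array:

import pyarrow as pa

arr = pa.chunked_array([[1, 2, 3], [4, 5], [6]])
_calculate_chunk_offsets(arr)  # -> array([0, 3, 5])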
Example #12
Source File: bool.py From fletcher with MIT License
def or_vectorised(a: Union[pa.Array, pa.ChunkedArray], b: Any):
    """Perform OR on a boolean Arrow structure and a second operator."""
    # Scalar should be handled by or_na or all_true
    ops = {"array_array": or_array_array, "array_nparray": or_array_nparray}
    return dispatch_chunked_binary_map(a, b, ops)
Example #13
Source File: bool.py From fletcher with MIT License
def all_op(arr: Union[pa.ChunkedArray, pa.Array], skipna: bool) -> bool:
    """Perform all() on a boolean Arrow structure."""
    if isinstance(arr, pa.ChunkedArray):
        return all(all_op(chunk, skipna) for chunk in arr.chunks)

    if arr.null_count == 0:
        return _all_op_nonnull(len(arr), arr.buffers()[1])
    # skipna is not relevant in the Pandas behaviour
    return _all_op(len(arr), *arr.buffers())
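Usage is straightforward; for comparison, recent pyarrow releases also ship a built-in aggregation kernel, which is an alternative to the Numba kernels (_all_op, _all_op_nonnull) fletcher uses here. A hedged sketch, assuming a modern pyarrow version:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([[True, True], [True]])
all_op(arr, skipna=True)  # -> True

# Alternative using pyarrow's own compute kernel (not fletcher's code path).
pc.all(arr).as_py()       # -> True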
Example #14
Source File: string.py From fletcher with MIT License
def _text_cat_chunked_mixed(a: pa.ChunkedArray, b: pa.Array) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(a)
    for chunk, offset in zip(a.iterchunks(), offsets):
        new_chunks.append(_text_cat(chunk, b[offset : offset + len(chunk)]))
    return pa.chunked_array(new_chunks)
Example #15
Source File: string.py From fletcher with MIT License
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(b)
    for chunk, offset in zip(b.iterchunks(), offsets):
        new_chunks.append(_text_cat(a[offset : offset + len(chunk)], chunk))
    return pa.chunked_array(new_chunks)
Example #16
Source File: string.py From fletcher with MIT License
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

    new_chunks: List[pa.Array] = []
    for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
        a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
        b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
        new_chunks.append(_text_cat(a_slice, b_slice))
    return pa.chunked_array(new_chunks)
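To see the alignment in action, a hypothetical sketch: _text_cat itself is not reproduced in this listing, so assume it concatenates two equally long string arrays element-wise.

import pyarrow as pa

a = pa.chunked_array([["a", "b", "c"], ["d"]])  # chunk lengths 3, 1
b = pa.chunked_array([["x"], ["y", "z", "w"]])  # chunk lengths 1, 3

result = _text_cat_chunked_1(a, b)
# Both inputs are sliced on the union of their chunk boundaries, so the
# result is chunked as [1, 2, 1]: ["ax"], ["by", "cz"], ["dw"].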
Example #17
Source File: bool.py From recruit with Apache License 2.0
def __init__(self, values):
    if not isinstance(values, pa.ChunkedArray):
        raise ValueError

    assert values.type == pa.bool_()
    self._data = values
    self._dtype = ArrowBoolDtype()
Example #18
Source File: _algorithms.py From fletcher with MIT License
def pd_nanop(nanop: Callable, arr: Union[pa.ChunkedArray, pa.Array], skipna: bool):
    """Use pandas.core.nanops to provide a reduction."""
    if isinstance(arr, pa.ChunkedArray):
        data = pa.concat_arrays(arr.iterchunks())
    else:
        data = arr

    np_arr = _extract_data_buffer_as_np_array(data)
    mask = extract_isnull_bytemap(data)

    return nanop(np_arr, skipna=skipna, mask=mask)
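A usage sketch: pandas.core.nanops is a private pandas module, so this mirrors how fletcher itself calls it rather than a public API, and it relies on fletcher helpers (_extract_data_buffer_as_np_array) not reproduced in this listing.

import pyarrow as pa
from pandas.core import nanops

arr = pa.chunked_array([[1.0, 2.0], [4.0]])
pd_nanop(nanops.nansum, arr, skipna=True)  # -> 7.0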
Example #19
Source File: _algorithms.py From fletcher with MIT License
def extract_isnull_bytemap(array: Union[pa.ChunkedArray, pa.Array]) -> np.ndarray:
    """
    Extract the valid bitmaps of a (chunked) array into numpy isnull bytemaps.

    Parameters
    ----------
    array
        Array from which we extract the validity bits as bytes

    Returns
    -------
    valid_bytemap
    """
    if array.null_count == len(array):
        return np.ones(len(array), dtype=bool)

    if isinstance(array, pa.ChunkedArray):
        result = np.zeros(len(array), dtype=bool)
        if array.null_count == 0:
            return result

        offset = 0
        for chunk in array.chunks:
            if chunk.null_count > 0:
                _extract_isnull_bytemap(
                    chunk.buffers()[0], len(chunk), chunk.offset, offset, result
                )
            offset += len(chunk)
    else:
        valid_bitmap = array.buffers()[0]
        if valid_bitmap:
            # TODO: Can we use np.empty here to improve performance?
            result = np.zeros(len(array), dtype=bool)
            # TODO(ARROW-2664): We only need the following line to support
            # executing the code in disabled-JIT mode.
            buf = memoryview(valid_bitmap)
            _extract_isnull_bytemap(buf, len(array), array.offset, 0, result)
        else:
            result = np.full(len(array), False)

    return result
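For reference, a slower but dependency-free equivalent exists via a pyarrow compute kernel (a sketch, assuming a modern pyarrow version); fletcher's version instead reads the validity bitmaps directly with a Numba kernel to avoid materialising an intermediate boolean array:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([[1, None], [3]])
extract_isnull_bytemap(arr)  # -> array([False,  True, False])
np.asarray(pc.is_null(arr))  # same bytemap, via the compute kernel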
Example #20
Source File: string_array.py From fletcher with MIT License
def _series_like(self, array: Union[pa.Array, pa.ChunkedArray]) -> pd.Series:
    """Return an Arrow result as a series with the same base classes as the input."""
    return pd.Series(
        type(self.obj.values)(array),
        dtype=type(self.obj.dtype)(array.type),
        index=self.obj.index,
    )
Example #21
Source File: base.py From fletcher with MIT License
def pandas_from_arrow(
    arrow_object: Union[pa.RecordBatch, pa.Table, pa.Array, pa.ChunkedArray],
    continuous: bool = False,
):
    """
    Convert an Arrow object instance to its Pandas equivalent by using Fletcher.

    The conversion rules are:
     * {RecordBatch, Table} -> DataFrame
     * {Array, ChunkedArray} -> Series

    Parameters
    ----------
    arrow_object : RecordBatch, Table, Array or ChunkedArray
        object to be converted
    continuous : bool
        Use FletcherContinuousArray instead of FletcherChunkedArray
    """
    if continuous:
        array_type = FletcherContinuousArray
    else:
        array_type = FletcherChunkedArray
    if isinstance(arrow_object, pa.RecordBatch):
        data: OrderedDict = OrderedDict()
        for ix, arr in enumerate(arrow_object):
            col_name = arrow_object.schema.names[ix]
            data[col_name] = array_type(arr)
        return pd.DataFrame(data)
    elif isinstance(arrow_object, pa.Table):
        data = OrderedDict()
        for name, col in zip(arrow_object.column_names, arrow_object.itercolumns()):
            data[name] = array_type(col)
        return pd.DataFrame(data)
    elif isinstance(arrow_object, (pa.ChunkedArray, pa.Array)):
        return pd.Series(array_type(arrow_object))
    else:
        raise NotImplementedError(
            "Objects of type {} are not supported".format(type(arrow_object))
        )
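A minimal usage sketch, assuming fletcher is installed:

import pyarrow as pa

table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
df = pandas_from_arrow(table)  # columns backed by FletcherChunkedArray
df_cont = pandas_from_arrow(table, continuous=True)  # FletcherContinuousArray instead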
Example #22
Source File: base.py From fletcher with MIT License
def unique(self):
    """
    Compute the ExtensionArray of unique values.

    This relies on pyarrow.ChunkedArray.unique and, if that fails, falls
    back to the naive implementation.

    Returns
    -------
    uniques : ExtensionArray
    """
    try:
        return type(self)(self.data.unique())
    except NotImplementedError:
        return super().unique()
Example #23
Source File: base.py From fletcher with MIT License
def base(self) -> Union[pa.Array, pa.ChunkedArray]:
    """Return base object of the underlying data."""
    return self.data
Example #24
Source File: base.py From fletcher with MIT License
def __arrow_array__(self, type=None):
    """Convert myself to a pyarrow Array or ChunkedArray."""
    return self.data
Example #25
Source File: test_algorithms.py From fletcher with MIT License
def assert_content_equals_array(result, expected):
    """Assert that the result is an Arrow structure and the content matches an array."""
    assert isinstance(result, (pa.Array, pa.ChunkedArray))
    if isinstance(result, pa.ChunkedArray):
        result = pa.concat_arrays(result.iterchunks())
    assert result.equals(expected)
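The helper deliberately ignores chunk boundaries and compares only the flattened content, so both of these calls pass:

import pyarrow as pa

assert_content_equals_array(pa.array([1, 2, 3]), pa.array([1, 2, 3]))
assert_content_equals_array(pa.chunked_array([[1, 2], [3]]), pa.array([1, 2, 3]))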
Example #26
Source File: test_algorithms.py From fletcher with MIT License
def check_valid_in_offsets(
    arr: pa.ChunkedArray, in_offsets: List[Tuple[int, int, int]]
) -> None:
    if arr.num_chunks == 0:
        assert in_offsets == []
        return

    # We always start at the beginning
    assert in_offsets[0][0] == 0
    assert in_offsets[0][1] == 0

    # Overall, the chunk offsets must have the same length as the array
    assert sum(x[2] for x in in_offsets) == len(arr)
Example #27
Source File: bool.py From predictive-maintenance-using-machine-learning with Apache License 2.0
def __init__(self, values):
    if not isinstance(values, pa.ChunkedArray):
        raise ValueError

    assert values.type == pa.bool_()
    self._data = values
    self._dtype = ArrowBoolDtype()