Python pyarrow.ChunkedArray() Examples

The following are 27 code examples of pyarrow.ChunkedArray(), collected from open source projects. The source file, project, and license are noted above each example.
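Note that pa.ChunkedArray is not instantiated directly; the pa.chunked_array() factory builds one from a list of chunks. A quick refresher on the basic API used throughout the examples below:

import pyarrow as pa

# Build a ChunkedArray from two chunks; the factory infers a common type.
arr = pa.chunked_array([[1, 2, 3], [4, 5]])
print(arr.num_chunks)  # 2
print(len(arr))        # 5 (total length across all chunks)
print(arr.chunk(0))    # first chunk, as a pa.Array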
Example #1
Source File: chunking.py    From fletcher with MIT License
def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(a[offset : offset + len(chunk)], chunk, ops)
            )
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        return ops.get("array_array", _not_implemented_path)(a, b)
    else:
        if np.isscalar(b):
            return ops.get("array_scalar", _not_implemented_path)(a, b)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            return ops.get("array_nparray", _not_implemented_path)(a, b) 
Example #2
Source File: base.py    From fletcher with MIT License
def __init__(self, array, dtype=None, copy=None):
        # Copy is not used at the moment. Its only effect will be when we
        # allow array to be a FletcherChunkedArray
        if is_array_like(array) or isinstance(array, list):
            self.data = pa.chunked_array([pa.array(array, type=dtype)])
        elif isinstance(array, pa.Array):
            # ARROW-7008: pyarrow.chunked_array([array]) fails on array with all-None buffers
            if len(array) == 0 and all(b is None for b in array.buffers()):
                array = pa.array([], type=array.type)
            # TODO: Assert dtype
            self.data = pa.chunked_array([array])
        elif isinstance(array, pa.ChunkedArray):
            # TODO: Assert dtype
            self.data = array
        else:
            raise ValueError(
                "Unsupported type passed for {}: {}".format(
                    self.__class__.__name__, type(array)
                )
            )
        self._dtype = FletcherChunkedDtype(self.data.type)
        self.offsets = self._calculate_chunk_offsets() 
Example #3
Source File: string_array.py    From fletcher with MIT License
def _call_x_with(self, impl, needle, na=None):
        needle = NumbaString.make(needle)  # type: ignore
        result = np.zeros(len(self.data), dtype=np.uint8)

        if isinstance(self.data, pa.ChunkedArray):
            offset = 0
            for chunk in self.data.chunks:
                str_arr = NumbaStringArray.make(chunk)  # type: ignore
                impl(str_arr, needle, 2, offset, result)
                offset += len(chunk)
        else:
            str_arr = NumbaStringArray.make(self.data)  # type: ignore
            impl(str_arr, needle, 2, 0, result)

        return pd.Series(
            type(self.obj.values)(pa.array(result.astype(bool), mask=(result == 2)))
        ) 
Example #4
Source File: string.py    From fletcher with MIT License
def _text_cat_chunked(a: Any, b: pa.ChunkedArray) -> pa.ChunkedArray:
    raise NotImplementedError(
        "_text_cat_chunked is only implemented for pa.Array and pa.ChunkedArray"
    ) 
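This stub reads like the fallback of a functools.singledispatch function, with the overloads for pa.Array and pa.ChunkedArray (see Examples #14 to #16) registered on it. A minimal sketch of that pattern, with illustrative names:

from functools import singledispatch

import pyarrow as pa

@singledispatch
def _concat_texts(a, b):
    # Fallback when no overload matches the type of the first argument
    raise NotImplementedError("only implemented for pa.Array and pa.ChunkedArray")

@_concat_texts.register(pa.Array)
def _(a, b):
    # Overload selected when `a` is a plain pa.Array: append a scalar suffix
    return pa.array([None if v is None else v + b for v in a.to_pylist()])

print(_concat_texts(pa.array(["x", "y", None]), "!"))  # ["x!", "y!", null]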
Example #5
Source File: bool.py    From coffeegrindsize with MIT License
def __init__(self, values):
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError

        assert values.type == pa.bool_()
        self._data = values
        self._dtype = ArrowBoolDtype() 
Example #6
Source File: util.py    From cjworkbench with GNU Affero General Public License v3.0
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    if pyarrow.types.is_floating(column.type) or pyarrow.types.is_integer(column.type):
        column_type = ColumnType.Number("{:,}")
    elif pyarrow.types.is_timestamp(column.type):
        column_type = ColumnType.Datetime()
    elif pyarrow.types.is_string(column.type) or pyarrow.types.is_dictionary(
        column.type
    ):
        column_type = ColumnType.Text()
    else:
        raise RuntimeError("Unknown column type %r" % column.type)
    return Column(name, column_type) 
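The pyarrow.types predicates used for this mapping are public API and can be checked directly against type objects:

import pyarrow

print(pyarrow.types.is_integer(pyarrow.int64()))           # True
print(pyarrow.types.is_floating(pyarrow.float32()))        # True
print(pyarrow.types.is_timestamp(pyarrow.timestamp("s")))  # True
print(pyarrow.types.is_string(pyarrow.string()))           # True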
Example #7
Source File: chunking.py    From fletcher with MIT License
def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
            new_chunks.append(dispatch_chunked_binary_map(a_slice, b_slice, ops))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(dispatch_chunked_binary_map(chunk, b, ops))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(chunk, b[offset : offset + len(chunk)], ops)
            )
        return pa.chunked_array(new_chunks) 
Example #8
Source File: chunking.py    From fletcher with MIT License
def dispatch_chunked_binary_map(a: Any, b: Any, ops: Dict[str, Callable]):
    """
    Apply a map-like binary function where at least one of the arguments is an Arrow structure.

    This will yield a pyarrow.Array or pyarrow.ChunkedArray as output.

    Parameters
    ----------
    a: scalar or np.ndarray or pa.Array or pa.ChunkedArray
    b: scalar or np.ndarray or pa.Array or pa.ChunkedArray
    ops: dict
        Dictionary with the keys ('array_array', 'array_nparray', 'nparray_array',
        'array_scalar', 'scalar_array')
    """
    # Fallback: `a` is neither a pa.Array nor a pa.ChunkedArray, so only
    # numpy.ndarray or scalars are expected here.
    if isinstance(b, pa.ChunkedArray):
        if np.isscalar(a):
            new_chunks = []
            for chunk in b.iterchunks():
                new_chunks.append(dispatch_chunked_binary_map(a, chunk, ops))
            return pa.chunked_array(new_chunks)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            new_chunks = []
            offsets = _calculate_chunk_offsets(b)
            for chunk, offset in zip(b.iterchunks(), offsets):
                new_chunks.append(
                    dispatch_chunked_binary_map(
                        a[offset : offset + len(chunk)], chunk, ops
                    )
                )
            return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if np.isscalar(a):
            return ops.get("scalar_array", _not_implemented_path)(a, b)
        else:
            return ops.get("nparray_array", _not_implemented_path)(a, b)
    else:
        # Should never be reached, add a safe-guard
        raise NotImplementedError(f"Cannot apply ufunc on {type(a)} and {type(b)}") 
Example #9
Source File: chunking.py    From fletcher with MIT License
def _combined_in_chunk_offsets(
    a: pa.ChunkedArray, b: pa.ChunkedArray
) -> Tuple[List[Tuple[int, int, int]], List[Tuple[int, int, int]]]:
    offsets_a = _calculate_chunk_offsets(a)
    offsets_b = _calculate_chunk_offsets(b)
    offsets = sorted(set(list(offsets_a) + list(offsets_b)))
    in_a_offsets = _in_chunk_offsets(a, offsets)
    in_b_offsets = _in_chunk_offsets(b, offsets)
    return in_a_offsets, in_b_offsets 
Example #10
Source File: chunking.py    From fletcher with MIT License
def _in_chunk_offsets(
    arr: pa.ChunkedArray, offsets: List[int]
) -> List[Tuple[int, int, int]]:
    """Calculate the access ranges for a given list of offsets.

    All chunk start indices must be included as offsets and the offsets must be
    unique.

    Returns a list of tuples that contain:
     * The index of the given chunk
     * The position inside the chunk
     * The length of the current range
    """
    new_offsets = []
    pos = 0
    chunk = 0
    chunk_pos = 0
    for offset, offset_next in zip(offsets, offsets[1:] + [len(arr)]):
        diff = offset - pos
        chunk_remains = len(arr.chunk(chunk)) - chunk_pos
        step = offset_next - offset
        if diff == 0:  # The first offset
            new_offsets.append((chunk, chunk_pos, step))
        elif diff == chunk_remains:
            chunk += 1
            chunk_pos = 0
            pos += chunk_remains
            new_offsets.append((chunk, chunk_pos, step))
        else:  # diff < chunk_remains
            chunk_pos += diff
            pos += diff
            new_offsets.append((chunk, chunk_pos, step))
    return new_offsets 
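A worked example of the access ranges this computes, with the function above in scope; the expected tuples follow from tracing the loop:

import pyarrow as pa

arr = pa.chunked_array([[1, 2, 3], [4, 5]])  # chunk lengths 3 and 2
# Offsets must be unique and include each chunk start (0 and 3 here).
print(_in_chunk_offsets(arr, [0, 2, 3]))
# [(0, 0, 2), (0, 2, 1), (1, 0, 2)]
# i.e. chunk 0 positions 0-1, chunk 0 position 2, chunk 1 positions 0-1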
Example #11
Source File: chunking.py    From fletcher with MIT License
def _calculate_chunk_offsets(chunked_array: pa.ChunkedArray) -> np.ndarray:
    """Return an array holding the indices pointing to the first element of each chunk."""
    offset = 0
    offsets = []
    for chunk in chunked_array.iterchunks():
        offsets.append(offset)
        offset += len(chunk)
    return np.array(offsets) 
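With the helper above in scope, the chunk offsets of a three-chunk array:

import pyarrow as pa

chunked = pa.chunked_array([[1, 2, 3], [4, 5], [6]])
print(_calculate_chunk_offsets(chunked))  # [0 3 5]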
Example #12
Source File: bool.py    From fletcher with MIT License
def or_vectorised(a: Union[pa.Array, pa.ChunkedArray], b: Any):
    """Perform OR on a boolean Arrow structure and a second operator."""
    # Scalar should be handled by or_na or all_true
    ops = {"array_array": or_array_array, "array_nparray": or_array_nparray}
    return dispatch_chunked_binary_map(a, b, ops) 
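The values of the ops mapping are the concrete kernels; in fletcher these are Numba-compiled. A pure-Python stand-in for the "array_nparray" slot, for illustration only (null handling is simplified to plain propagation):

import numpy as np
import pyarrow as pa

def or_array_nparray(a: pa.Array, b: np.ndarray) -> pa.Array:
    # Illustrative kernel: elementwise OR, propagating nulls as None.
    return pa.array(
        [None if v is None else bool(v or w) for v, w in zip(a.to_pylist(), b)]
    )

# Assuming the dispatch machinery above is wired up via singledispatch:
# or_vectorised(pa.chunked_array([[True, None], [False]]),
#               np.array([False, False, True]))
# -> ChunkedArray [[True, None], [True]]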
Example #13
Source File: bool.py    From fletcher with MIT License
def all_op(arr: Union[pa.ChunkedArray, pa.Array], skipna: bool) -> bool:
    """Perform all() on a boolean Arrow structure."""
    if isinstance(arr, pa.ChunkedArray):
        return all(all_op(chunk, skipna) for chunk in arr.chunks)

    if arr.null_count == 0:
        return _all_op_nonnull(len(arr), arr.buffers()[1])
    # skipna is not relevant in the Pandas behaviour
    return _all_op(len(arr), *arr.buffers()) 
Example #14
Source File: string.py    From fletcher with MIT License
def _text_cat_chunked_mixed(a: pa.ChunkedArray, b: pa.Array) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(a)
    for chunk, offset in zip(a.iterchunks(), offsets):
        new_chunks.append(_text_cat(chunk, b[offset : offset + len(chunk)]))
    return pa.chunked_array(new_chunks) 
Example #15
Source File: string.py    From fletcher with MIT License
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(b)
    for chunk, offset in zip(b.iterchunks(), offsets):
        new_chunks.append(_text_cat(a[offset : offset + len(chunk)], chunk))
    return pa.chunked_array(new_chunks) 
Example #16
Source File: string.py    From fletcher with MIT License
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

    new_chunks: List[pa.Array] = []
    for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
        a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
        b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
        new_chunks.append(_text_cat(a_slice, b_slice))
    return pa.chunked_array(new_chunks) 
Example #17
Source File: bool.py    From recruit with Apache License 2.0
def __init__(self, values):
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError

        assert values.type == pa.bool_()
        self._data = values
        self._dtype = ArrowBoolDtype() 
Example #18
Source File: _algorithms.py    From fletcher with MIT License
def pd_nanop(nanop: Callable, arr: Union[pa.ChunkedArray, pa.Array], skipna: bool):
    """Use pandas.core.nanops to provide a reduction."""
    if isinstance(arr, pa.ChunkedArray):
        data = pa.concat_arrays(arr.iterchunks())
    else:
        data = arr
    np_arr = _extract_data_buffer_as_np_array(data)
    mask = extract_isnull_bytemap(data)

    return nanop(np_arr, skipna=skipna, mask=mask) 
Example #19
Source File: _algorithms.py    From fletcher with MIT License
def extract_isnull_bytemap(array: Union[pa.ChunkedArray, pa.Array]) -> np.ndarray:
    """
    Extract the valid bitmaps of a (chunked) array into numpy isnull bytemaps.

    Parameters
    ----------
    array
        Array from which we extract the validity bits as bytes

    Returns
    -------
    valid_bytemap
    """
    if array.null_count == len(array):
        return np.ones(len(array), dtype=bool)

    if isinstance(array, pa.ChunkedArray):
        result = np.zeros(len(array), dtype=bool)
        if array.null_count == 0:
            return result

        offset = 0
        for chunk in array.chunks:
            if chunk.null_count > 0:
                _extract_isnull_bytemap(
                    chunk.buffers()[0], len(chunk), chunk.offset, offset, result
                )
            offset += len(chunk)
    else:
        valid_bitmap = array.buffers()[0]
        if valid_bitmap:
            # TODO: Can we use np.empty here to improve performance?
            result = np.zeros(len(array), dtype=bool)
            # TODO(ARROW-2664): We only need the following line to support
            #   executing the code in disabled-JIT mode.
            buf = memoryview(valid_bitmap)
            _extract_isnull_bytemap(buf, len(array), array.offset, 0, result)
        else:
            result = np.full(len(array), False)

    return result 
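For comparison, a rough public-API equivalent that materialises the intermediate arrays the compiled helper avoids:

import numpy as np
import pyarrow as pa

arr = pa.chunked_array([[1, None], [3]])
isnull = np.concatenate(
    [chunk.is_null().to_numpy(zero_copy_only=False) for chunk in arr.chunks]
)
print(isnull)  # [False  True False]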
Example #20
Source File: string_array.py    From fletcher with MIT License
def _series_like(self, array: Union[pa.Array, pa.ChunkedArray]) -> pd.Series:
        """Return an Arrow result as a series with the same base classes as the input."""
        return pd.Series(
            type(self.obj.values)(array),
            dtype=type(self.obj.dtype)(array.type),
            index=self.obj.index,
        ) 
Example #21
Source File: base.py    From fletcher with MIT License
def pandas_from_arrow(
    arrow_object: Union[pa.RecordBatch, pa.Table, pa.Array, pa.ChunkedArray],
    continuous: bool = False,
):
    """
    Convert an Arrow object instance to its Pandas equivalent by using Fletcher.

    The conversion rules are:
      * {RecordBatch, Table} -> DataFrame
      * {Array, ChunkedArray} -> Series

    Parameters
    ----------
    arrow_object : RecordBatch, Table, Array or ChunkedArray
        object to be converted
    continuous : bool
        Use FletcherContinuousArray instead of FletcherChunkedArray
    """
    if continuous:
        array_type = FletcherContinuousArray
    else:
        array_type = FletcherChunkedArray
    if isinstance(arrow_object, pa.RecordBatch):
        data: OrderedDict = OrderedDict()
        for ix, arr in enumerate(arrow_object):
            col_name = arrow_object.schema.names[ix]
            data[col_name] = array_type(arr)
        return pd.DataFrame(data)
    elif isinstance(arrow_object, pa.Table):
        data = OrderedDict()
        for name, col in zip(arrow_object.column_names, arrow_object.itercolumns()):
            data[name] = array_type(col)
        return pd.DataFrame(data)
    elif isinstance(arrow_object, (pa.ChunkedArray, pa.Array)):
        return pd.Series(array_type(arrow_object))
    else:
        raise NotImplementedError(
            "Objects of type {} are not supported".format(type(arrow_object))
        ) 
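A usage sketch, assuming pandas_from_arrow is importable from fletcher's top level:

import pyarrow as pa
# from fletcher import pandas_from_arrow  # import path is an assumption

table = pa.table({"x": [1, 2, 3], "y": ["a", "b", None]})
# pandas_from_arrow(table)             -> DataFrame with Fletcher-backed columns
# pandas_from_arrow(table.column("y")) -> Series wrapping the ChunkedArray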
Example #22
Source File: base.py    From fletcher with MIT License
def unique(self):
        """
        Compute the ExtensionArray of unique values.

        It relies on pyarrow.ChunkedArray.unique and, if that raises
        NotImplementedError, falls back to the naive implementation.

        Returns
        -------
        uniques : ExtensionArray
        """
        try:
            return type(self)(self.data.unique())
        except NotImplementedError:
            return super().unique() 
Example #23
Source File: base.py    From fletcher with MIT License
def base(self) -> Union[pa.Array, pa.ChunkedArray]:
        """Return base object of the underlying data."""
        return self.data 
Example #24
Source File: base.py    From fletcher with MIT License
def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow Array or ChunkedArray."""
        return self.data 
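pa.array() honours the __arrow_array__ protocol, so objects that implement it convert transparently:

import pyarrow as pa

class Wrapped:
    # Minimal object implementing the same protocol as the example above
    def __init__(self, data):
        self.data = data

    def __arrow_array__(self, type=None):
        return self.data

print(pa.array(Wrapped(pa.array([1, 2, 3]))))  # yields the wrapped pa.Array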
Example #25
Source File: test_algorithms.py    From fletcher with MIT License
def assert_content_equals_array(result, expected):
    """Assert that the result is an Arrow structure and the content matches an array."""
    assert isinstance(result, (pa.Array, pa.ChunkedArray))
    if isinstance(result, pa.ChunkedArray):
        result = pa.concat_arrays(result.iterchunks())
    assert result.equals(expected) 
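The concat step flattens a ChunkedArray into one contiguous pa.Array:

import pyarrow as pa

chunked = pa.chunked_array([[1, 2], [3]])
flat = pa.concat_arrays(chunked.chunks)
print(flat.equals(pa.array([1, 2, 3])))  # True
# Recent pyarrow versions also offer chunked.combine_chunks() for this.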
Example #26
Source File: test_algorithms.py    From fletcher with MIT License
def check_valid_in_offsets(
    arr: pa.ChunkedArray, in_offsets: List[Tuple[int, int, int]]
) -> None:
    if arr.num_chunks == 0:
        assert in_offsets == []
        return

    # We always start at the beginning
    assert in_offsets[0][0] == 0
    assert in_offsets[0][1] == 0

    # Overall, the chunk offsets must have the same length as the array
    assert sum(x[2] for x in in_offsets) == len(arr) 
Example #27
Source File: bool.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def __init__(self, values):
        if not isinstance(values, pa.ChunkedArray):
            raise ValueError

        assert values.type == pa.bool_()
        self._data = values
        self._dtype = ArrowBoolDtype()