Python pyarrow.ChunkedArray() Examples
The following are 27 code examples of pyarrow.ChunkedArray(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the pyarrow module.
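Before the examples, here is a minimal sketch of the ChunkedArray basics that the snippets below rely on: constructing a chunked array from plain Python lists and inspecting its chunks. Nothing here is specific to any of the projects listed.

import pyarrow as pa

# A ChunkedArray presents a list of pa.Array chunks as one logical array.
arr = pa.chunked_array([[1, 2, 3], [4, 5]])
assert arr.num_chunks == 2
assert len(arr) == 5
assert arr.chunk(1).equals(pa.array([4, 5]))

# An explicit type is required when there are no chunks to infer it from.
empty = pa.chunked_array([], type=pa.int64())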
Example #1
Source File: chunking.py From fletcher with MIT License
def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(b)
        for chunk, offset in zip(b.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(a[offset : offset + len(chunk)], chunk, ops)
            )
        return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        return ops.get("array_array", _not_implemented_path)(a, b)
    else:
        if np.isscalar(b):
            return ops.get("array_scalar", _not_implemented_path)(a, b)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            return ops.get("array_nparray", _not_implemented_path)(a, b)
Example #2
Source File: base.py From fletcher with MIT License
def __init__(self, array, dtype=None, copy=None):
    # Copy is not used at the moment. Its only effect will be when we
    # allow array to be a FletcherChunkedArray.
    if is_array_like(array) or isinstance(array, list):
        self.data = pa.chunked_array([pa.array(array, type=dtype)])
    elif isinstance(array, pa.Array):
        # ARROW-7008: pyarrow.chunked_array([array]) fails on array with all-None buffers
        if len(array) == 0 and all(b is None for b in array.buffers()):
            array = pa.array([], type=array.type)
        # TODO: Assert dtype
        self.data = pa.chunked_array([array])
    elif isinstance(array, pa.ChunkedArray):
        # TODO: Assert dtype
        self.data = array
    else:
        raise ValueError(
            "Unsupported type passed for {}: {}".format(
                self.__class__.__name__, type(array)
            )
        )
    self._dtype = FletcherChunkedDtype(self.data.type)
    self.offsets = self._calculate_chunk_offsets()
Example #3
Source File: string_array.py From fletcher with MIT License
def _call_x_with(self, impl, needle, na=None):
    needle = NumbaString.make(needle)  # type: ignore
    result = np.zeros(len(self.data), dtype=np.uint8)

    if isinstance(self.data, pa.ChunkedArray):
        offset = 0
        for chunk in self.data.chunks:
            str_arr = NumbaStringArray.make(chunk)  # type: ignore
            impl(str_arr, needle, 2, offset, result)
            offset += len(chunk)
    else:
        str_arr = NumbaStringArray.make(self.data)  # type: ignore
        impl(str_arr, needle, 2, 0, result)

    return pd.Series(
        type(self.obj.values)(pa.array(result.astype(bool), mask=(result == 2)))
    )
Example #4
Source File: string.py From fletcher with MIT License
def _text_cat_chunked(a: Any, b: pa.ChunkedArray) -> pa.ChunkedArray:
    raise NotImplementedError(
        "_text_cat_chunked is only implemented for pa.Array and pa.ChunkedArray"
    )
Example #5
Source File: bool.py From coffeegrindsize with MIT License
def __init__(self, values):
    if not isinstance(values, pa.ChunkedArray):
        raise ValueError

    assert values.type == pa.bool_()
    self._data = values
    self._dtype = ArrowBoolDtype()
Example #6
Source File: util.py From cjworkbench with GNU Affero General Public License v3.0
def _arrow_column_to_column(name: str, column: pyarrow.ChunkedArray) -> Column:
    if pyarrow.types.is_floating(column.type) or pyarrow.types.is_integer(column.type):
        column_type = ColumnType.Number("{:,}")
    elif pyarrow.types.is_timestamp(column.type):
        column_type = ColumnType.Datetime()
    elif pyarrow.types.is_string(column.type) or pyarrow.types.is_dictionary(
        column.type
    ):
        column_type = ColumnType.Text()
    else:
        raise RuntimeError("Unknown column type %r" % column.type)
    return Column(name, column_type)
Example #7
Source File: chunking.py From fletcher with MIT License
def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure."""
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        new_chunks: List[pa.Array] = []
        for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
            a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
            b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
            new_chunks.append(dispatch_chunked_binary_map(a_slice, b_slice, ops))
        return pa.chunked_array(new_chunks)
    elif np.isscalar(b):
        new_chunks = []
        for chunk in a.iterchunks():
            new_chunks.append(dispatch_chunked_binary_map(chunk, b, ops))
        return pa.chunked_array(new_chunks)
    else:
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        new_chunks = []
        offsets = _calculate_chunk_offsets(a)
        for chunk, offset in zip(a.iterchunks(), offsets):
            new_chunks.append(
                dispatch_chunked_binary_map(chunk, b[offset : offset + len(chunk)], ops)
            )
        return pa.chunked_array(new_chunks)
Example #8
Source File: chunking.py From fletcher with MIT License
def dispatch_chunked_binary_map(a: Any, b: Any, ops: Dict[str, Callable]):
    """
    Apply a map-like binary function where at least one of the arguments is an Arrow structure.

    This will yield a pyarrow.Array or pyarrow.ChunkedArray as an output.

    Parameters
    ----------
    a: scalar or np.ndarray or pa.Array or pa.ChunkedArray
    b: scalar or np.ndarray or pa.Array or pa.ChunkedArray
    ops: dict
        Dictionary with the keys ('array_array', 'array_nparray',
        'nparray_array', 'array_scalar', 'scalar_array')
    """
    # a is neither a pa.Array nor a pa.ChunkedArray; we expect only np.ndarray or scalars.
    if isinstance(b, pa.ChunkedArray):
        if np.isscalar(a):
            new_chunks = []
            for chunk in b.iterchunks():
                new_chunks.append(dispatch_chunked_binary_map(a, chunk, ops))
            return pa.chunked_array(new_chunks)
        else:
            if len(a) != len(b):
                raise ValueError("Inputs don't have the same length.")
            new_chunks = []
            offsets = _calculate_chunk_offsets(b)
            for chunk, offset in zip(b.iterchunks(), offsets):
                new_chunks.append(
                    dispatch_chunked_binary_map(
                        a[offset : offset + len(chunk)], chunk, ops
                    )
                )
            return pa.chunked_array(new_chunks)
    elif isinstance(b, pa.Array):
        if np.isscalar(a):
            return ops.get("scalar_array", _not_implemented_path)(a, b)
        else:
            return ops.get("nparray_array", _not_implemented_path)(a, b)
    else:
        # Should never be reached, add a safe-guard
        raise NotImplementedError(f"Cannot apply ufunc on {type(a)} and {type(b)}")
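A usage sketch for the dispatcher: the ops mapping below is a hypothetical stand-in for fletcher's real kernels (such as or_array_array in Example #12). With a NumPy array on the left and a ChunkedArray on the right, the NumPy input is split at the chunk boundaries and the 'nparray_array' kernel runs once per chunk.

import numpy as np
import pyarrow as pa

# Hypothetical kernel: add a NumPy slice to an Arrow chunk, returning a pa.Array.
ops = {
    "nparray_array": lambda np_arr, arr: pa.array(
        np_arr + arr.to_numpy(zero_copy_only=False)
    )
}

a = np.array([10, 20, 30, 40])
b = pa.chunked_array([[1, 2], [3, 4]])

result = dispatch_chunked_binary_map(a, b, ops)
# -> pa.chunked_array([[11, 22], [33, 44]]), chunked like b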
Example #9
Source File: chunking.py From fletcher with MIT License
def _combined_in_chunk_offsets(
    a: pa.ChunkedArray, b: pa.ChunkedArray
) -> Tuple[List[Tuple[int, int, int]], List[Tuple[int, int, int]]]:
    offsets_a = _calculate_chunk_offsets(a)
    offsets_b = _calculate_chunk_offsets(b)
    offsets = sorted(set(list(offsets_a) + list(offsets_b)))
    in_a_offsets = _in_chunk_offsets(a, offsets)
    in_b_offsets = _in_chunk_offsets(b, offsets)
    return in_a_offsets, in_b_offsets
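A worked example with two differently chunked arrays of the same length (the chunkings are arbitrary; this is a sketch of the expected output). Matching positions in the two result lists describe ranges of equal length, so callers can walk both arrays in lockstep without materialising a re-chunked copy:

import pyarrow as pa

a = pa.chunked_array([[1, 2, 3], [4, 5]])    # chunk lengths 3, 2
b = pa.chunked_array([[1], [2, 3], [4, 5]])  # chunk lengths 1, 2, 2

in_a, in_b = _combined_in_chunk_offsets(a, b)
# in_a == [(0, 0, 1), (0, 1, 2), (1, 0, 2)]
# in_b == [(0, 0, 1), (1, 0, 2), (2, 0, 2)]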
Example #10
Source File: chunking.py From fletcher with MIT License
def _in_chunk_offsets(
    arr: pa.ChunkedArray, offsets: List[int]
) -> List[Tuple[int, int, int]]:
    """Calculate the access ranges for a given list of offsets.

    All chunk start indices must be included as offsets and the offsets must be
    unique.

    Returns a list of tuples that contain:
     * The index of the given chunk
     * The position inside the chunk
     * The length of the current range
    """
    new_offsets = []
    pos = 0
    chunk = 0
    chunk_pos = 0
    for offset, offset_next in zip(offsets, offsets[1:] + [len(arr)]):
        diff = offset - pos
        chunk_remains = len(arr.chunk(chunk)) - chunk_pos
        step = offset_next - offset
        if diff == 0:  # The first offset
            new_offsets.append((chunk, chunk_pos, step))
        elif diff == chunk_remains:
            chunk += 1
            chunk_pos = 0
            pos += chunk_remains
            new_offsets.append((chunk, chunk_pos, step))
        else:  # diff < chunk_remains
            chunk_pos += diff
            pos += diff
            new_offsets.append((chunk, chunk_pos, step))
    return new_offsets
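A small worked example of the (chunk index, position in chunk, range length) triples, assuming a two-chunk array:

import pyarrow as pa

arr = pa.chunked_array([[1, 2], [3, 4, 5]])  # chunk lengths 2 and 3
_in_chunk_offsets(arr, [0, 2, 3])
# -> [(0, 0, 2), (1, 0, 1), (1, 1, 2)]
# i.e. chunk 0 from position 0 for 2 values, then chunk 1 from position 0
# for 1 value, then chunk 1 from position 1 for the remaining 2 values.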
Example #11
Source File: chunking.py From fletcher with MIT License
def _calculate_chunk_offsets(chunked_array: pa.ChunkedArray) -> np.ndarray:
    """Return an array holding the indices pointing to the first element of each chunk."""
    offset = 0
    offsets = []
    for chunk in chunked_array.iterchunks():
        offsets.append(offset)
        offset += len(chunk)
    return np.array(offsets)
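A quick check of this helper on a three-chunk array:

import pyarrow as pa

arr = pa.chunked_array([[1, 2, 3], [4, 5], [6]])
_calculate_chunk_offsets(arr)  # -> array([0, 3, 5])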
Example #12
Source File: bool.py From fletcher with MIT License
def or_vectorised(a: Union[pa.Array, pa.ChunkedArray], b: Any):
    """Perform OR on a boolean Arrow structure and a second operator."""
    # Scalar should be handled by or_na or all_true
    ops = {"array_array": or_array_array, "array_nparray": or_array_nparray}
    return dispatch_chunked_binary_map(a, b, ops)
Example #13
Source File: bool.py From fletcher with MIT License
def all_op(arr: Union[pa.ChunkedArray, pa.Array], skipna: bool) -> bool:
    """Perform all() on a boolean Arrow structure."""
    if isinstance(arr, pa.ChunkedArray):
        return all(all_op(chunk, skipna) for chunk in arr.chunks)

    if arr.null_count == 0:
        return _all_op_nonnull(len(arr), arr.buffers()[1])
    # skipna is not relevant in the Pandas behaviour
    return _all_op(len(arr), *arr.buffers())
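Usage is straightforward; for comparison, recent pyarrow releases also ship a built-in aggregation kernel, which is an alternative to the Numba kernels (_all_op, _all_op_nonnull) fletcher uses here. A hedged sketch, assuming a modern pyarrow version:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([[True, True], [True]])
all_op(arr, skipna=True)  # -> True

# Alternative using pyarrow's own compute kernel (not fletcher's code path).
pc.all(arr).as_py()       # -> True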
Example #14
Source File: string.py From fletcher with MIT License
def _text_cat_chunked_mixed(a: pa.ChunkedArray, b: pa.Array) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(a)
    for chunk, offset in zip(a.iterchunks(), offsets):
        new_chunks.append(_text_cat(chunk, b[offset : offset + len(chunk)]))
    return pa.chunked_array(new_chunks)
Example #15
Source File: string.py From fletcher with MIT License
def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    new_chunks = []
    offsets = _calculate_chunk_offsets(b)
    for chunk, offset in zip(b.iterchunks(), offsets):
        new_chunks.append(_text_cat(a[offset : offset + len(chunk)], chunk))
    return pa.chunked_array(new_chunks)
Example #16
Source File: string.py From fletcher with MIT License
def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

    new_chunks: List[pa.Array] = []
    for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
        a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
        b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
        new_chunks.append(_text_cat(a_slice, b_slice))
    return pa.chunked_array(new_chunks)
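To see the alignment in action, a hypothetical sketch: _text_cat itself is not reproduced in this listing, so assume it concatenates two equally long string arrays element-wise.

import pyarrow as pa

a = pa.chunked_array([["a", "b", "c"], ["d"]])  # chunk lengths 3, 1
b = pa.chunked_array([["x"], ["y", "z", "w"]])  # chunk lengths 1, 3

result = _text_cat_chunked_1(a, b)
# Both inputs are sliced on the union of their chunk boundaries, so the
# result is chunked as [1, 2, 1]: ["ax"], ["by", "cz"], ["dw"].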
Example #17
Source File: bool.py From recruit with Apache License 2.0
def __init__(self, values):
    if not isinstance(values, pa.ChunkedArray):
        raise ValueError

    assert values.type == pa.bool_()
    self._data = values
    self._dtype = ArrowBoolDtype()
Example #18
Source File: _algorithms.py From fletcher with MIT License
def pd_nanop(nanop: Callable, arr: Union[pa.ChunkedArray, pa.Array], skipna: bool):
    """Use pandas.core.nanops to provide a reduction."""
    if isinstance(arr, pa.ChunkedArray):
        data = pa.concat_arrays(arr.iterchunks())
    else:
        data = arr

    np_arr = _extract_data_buffer_as_np_array(data)
    mask = extract_isnull_bytemap(data)

    return nanop(np_arr, skipna=skipna, mask=mask)
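A usage sketch: pandas.core.nanops is a private pandas module, so this mirrors how fletcher itself calls it rather than a public API, and it relies on fletcher helpers (_extract_data_buffer_as_np_array) not reproduced in this listing.

import pyarrow as pa
from pandas.core import nanops

arr = pa.chunked_array([[1.0, 2.0], [4.0]])
pd_nanop(nanops.nansum, arr, skipna=True)  # -> 7.0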
Example #19
Source File: _algorithms.py From fletcher with MIT License
def extract_isnull_bytemap(array: Union[pa.ChunkedArray, pa.Array]) -> np.ndarray:
    """
    Extract the valid bitmaps of a (chunked) array into numpy isnull bytemaps.

    Parameters
    ----------
    array
        Array from which we extract the validity bits as bytes

    Returns
    -------
    valid_bytemap
    """
    if array.null_count == len(array):
        return np.ones(len(array), dtype=bool)

    if isinstance(array, pa.ChunkedArray):
        result = np.zeros(len(array), dtype=bool)
        if array.null_count == 0:
            return result

        offset = 0
        for chunk in array.chunks:
            if chunk.null_count > 0:
                _extract_isnull_bytemap(
                    chunk.buffers()[0], len(chunk), chunk.offset, offset, result
                )
            offset += len(chunk)
    else:
        valid_bitmap = array.buffers()[0]
        if valid_bitmap:
            # TODO: Can we use np.empty here to improve performance?
            result = np.zeros(len(array), dtype=bool)
            # TODO(ARROW-2664): We only need the following line to support
            # executing the code in disabled-JIT mode.
            buf = memoryview(valid_bitmap)
            _extract_isnull_bytemap(buf, len(array), array.offset, 0, result)
        else:
            result = np.full(len(array), False)

    return result
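For reference, a slower but dependency-free equivalent exists via a pyarrow compute kernel (a sketch, assuming a modern pyarrow version); fletcher's version instead reads the validity bitmaps directly with a Numba kernel to avoid materialising an intermediate boolean array:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([[1, None], [3]])
extract_isnull_bytemap(arr)  # -> array([False,  True, False])
np.asarray(pc.is_null(arr))  # same bytemap, via the compute kernel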
Example #20
Source File: string_array.py From fletcher with MIT License
def _series_like(self, array: Union[pa.Array, pa.ChunkedArray]) -> pd.Series:
    """Return an Arrow result as a series with the same base classes as the input."""
    return pd.Series(
        type(self.obj.values)(array),
        dtype=type(self.obj.dtype)(array.type),
        index=self.obj.index,
    )
Example #21
Source File: base.py From fletcher with MIT License
def pandas_from_arrow(
    arrow_object: Union[pa.RecordBatch, pa.Table, pa.Array, pa.ChunkedArray],
    continuous: bool = False,
):
    """
    Convert an Arrow object instance to its Pandas equivalent by using Fletcher.

    The conversion rules are:
     * {RecordBatch, Table} -> DataFrame
     * {Array, ChunkedArray} -> Series

    Parameters
    ----------
    arrow_object : RecordBatch, Table, Array or ChunkedArray
        object to be converted
    continuous : bool
        Use FletcherContinuousArray instead of FletcherChunkedArray
    """
    if continuous:
        array_type = FletcherContinuousArray
    else:
        array_type = FletcherChunkedArray
    if isinstance(arrow_object, pa.RecordBatch):
        data: OrderedDict = OrderedDict()
        for ix, arr in enumerate(arrow_object):
            col_name = arrow_object.schema.names[ix]
            data[col_name] = array_type(arr)
        return pd.DataFrame(data)
    elif isinstance(arrow_object, pa.Table):
        data = OrderedDict()
        for name, col in zip(arrow_object.column_names, arrow_object.itercolumns()):
            data[name] = array_type(col)
        return pd.DataFrame(data)
    elif isinstance(arrow_object, (pa.ChunkedArray, pa.Array)):
        return pd.Series(array_type(arrow_object))
    else:
        raise NotImplementedError(
            "Objects of type {} are not supported".format(type(arrow_object))
        )
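A minimal usage sketch, assuming fletcher is installed:

import pyarrow as pa

table = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
df = pandas_from_arrow(table)  # columns backed by FletcherChunkedArray
df_cont = pandas_from_arrow(table, continuous=True)  # FletcherContinuousArray instead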
Example #22
Source File: base.py From fletcher with MIT License
def unique(self):
    """
    Compute the ExtensionArray of unique values.

    This relies on pyarrow.ChunkedArray.unique and, if that fails, falls
    back to the naive implementation.

    Returns
    -------
    uniques : ExtensionArray
    """
    try:
        return type(self)(self.data.unique())
    except NotImplementedError:
        return super().unique()
Example #23
Source File: base.py From fletcher with MIT License
def base(self) -> Union[pa.Array, pa.ChunkedArray]:
    """Return base object of the underlying data."""
    return self.data
Example #24
Source File: base.py From fletcher with MIT License
def __arrow_array__(self, type=None):
    """Convert myself to a pyarrow Array or ChunkedArray."""
    return self.data
Example #25
Source File: test_algorithms.py From fletcher with MIT License
def assert_content_equals_array(result, expected):
    """Assert that the result is an Arrow structure and the content matches an array."""
    assert isinstance(result, (pa.Array, pa.ChunkedArray))
    if isinstance(result, pa.ChunkedArray):
        result = pa.concat_arrays(result.iterchunks())
    assert result.equals(expected)
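The helper deliberately ignores chunk boundaries and compares only the flattened content, so both of these calls pass:

import pyarrow as pa

assert_content_equals_array(pa.array([1, 2, 3]), pa.array([1, 2, 3]))
assert_content_equals_array(pa.chunked_array([[1, 2], [3]]), pa.array([1, 2, 3]))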
Example #26
Source File: test_algorithms.py From fletcher with MIT License
def check_valid_in_offsets(
    arr: pa.ChunkedArray, in_offsets: List[Tuple[int, int, int]]
) -> None:
    if arr.num_chunks == 0:
        assert in_offsets == []
        return

    # We always start at the beginning
    assert in_offsets[0][0] == 0
    assert in_offsets[0][1] == 0

    # Overall, the chunk offsets must have the same length as the array
    assert sum(x[2] for x in in_offsets) == len(arr)
Example #27
Source File: bool.py From predictive-maintenance-using-machine-learning with Apache License 2.0
def __init__(self, values):
    if not isinstance(values, pa.ChunkedArray):
        raise ValueError

    assert values.type == pa.bool_()
    self._data = values
    self._dtype = ArrowBoolDtype()