Python Examples of pyarrow.Array

Source File: string.py From fletcher with MIT License

6 votes

def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    """Combine two equally long arrays into a new ``pa.string()`` array.

    The string buffers of both inputs are extracted and handed, together
    with the merged validity bitmap, to ``_merge_string_data`` which fills
    the freshly allocated offsets and data buffers of the result.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same length.

    Returns
    -------
    pa.Array
        The array built from the merged buffers, or ``a`` itself when the
        inputs are empty.
    """
    length = len(a)
    if length != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)
    if length == 0:
        # Nothing to merge for empty inputs.
        return a

    valid = _merge_valid_bitmaps(a, b)

    # One offset per element plus the leading zero.
    result_offsets = np.empty(length + 1, dtype=np.int32)
    result_offsets[0] = 0

    # The result data buffer has room for the bytes spanned by both inputs.
    size_a = offsets_a[-1] - offsets_a[0]
    size_b = offsets_b[-1] - offsets_b[0]
    result_data = np.empty(size_a + size_b, dtype=np.uint8)

    _merge_string_data(
        length,
        valid,
        offsets_a,
        data_a,
        offsets_b,
        data_b,
        result_offsets,
        result_data,
    )
    buffers = [
        pa.py_buffer(valid),
        pa.py_buffer(result_offsets),
        pa.py_buffer(result_data),
    ]
    return pa.Array.from_buffers(pa.string(), length, buffers)

Source File: base.py From fletcher with MIT License

6 votes

def __init__(self, array, dtype=None, copy: Optional[bool] = None):
        """Store ``array`` as a single ``pa.Array`` in ``self.data``.

        Parameters
        ----------
        array
            Array-like object or list (converted with ``pa.array``), a
            ``pa.Array`` (stored as-is) or a ``pa.ChunkedArray`` (reduced to
            one contiguous array).
        dtype
            Arrow type passed to ``pa.array`` when ``array`` is array-like
            or a list; not used for Arrow inputs.
        copy
            Currently unused.

        Raises
        ------
        ValueError
            If ``array`` is of none of the supported types.
        """
        # Copy is not used at the moment. Its only effect will be when we
        # allow array to be a FletcherContinuousArray
        if is_array_like(array) or isinstance(array, list):
            self.data = pa.array(array, type=dtype)
        elif isinstance(array, pa.Array):
            # TODO: Assert dtype
            self.data = array
        elif isinstance(array, pa.ChunkedArray):
            # TODO: Assert dtype
            if array.num_chunks == 1:
                # A single chunk can be used directly.
                self.data = array.chunk(0)
            elif array.num_chunks == 0:
                # pa.concat_arrays needs at least one array, so build the
                # empty array of the matching type explicitly.
                self.data = pa.array([], type=array.type)
            else:
                self.data = pa.concat_arrays(array.iterchunks())
        else:
            raise ValueError(
                "Unsupported type passed for {}: {}".format(
                    self.__class__.__name__, type(array)
                )
            )
        self._dtype = FletcherContinuousDtype(self.data.type)

Source File: bool.py From fletcher with MIT License

6 votes

def or_na(arr: pa.Array) -> pa.Array:
    """Apply ``array | NA`` with a boolean pyarrow.Array."""
    length = len(arr)

    if arr.null_count == 0:
        # Without nulls the value bitmap is used for both the validity and
        # the data buffer of the result; the null count is left to Arrow (-1).
        bitmap = arr.buffers()[1]
        return pa.Array.from_buffers(
            pa.bool_(),
            length,
            [bitmap, bitmap],
            null_count=-1,
            offset=arr.offset,
        )

    # One bit per element, rounded up to whole bytes.
    n_bytes = (length + 7) // 8
    output = np.zeros(n_bytes, dtype=np.uint8)
    buffers = arr.buffers()
    null_count = _or_na(length, arr.offset, buffers[0], buffers[1], output)
    out_buffer = pa.py_buffer(output)
    # The same bitmap serves as validity and data of the result.
    return pa.Array.from_buffers(
        pa.bool_(), length, [out_buffer, out_buffer], null_count
    )

Source File: chunking.py From fletcher with MIT License

6 votes

def _2(a: pa.Array, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure.

    ``a`` is a plain ``pa.Array``; the kernel looked up in ``ops`` depends on
    the kind of ``b``. Missing kernels fall back to ``_not_implemented_path``.
    A ``ValueError`` is raised when a non-scalar ``b`` differs in length.
    """
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        chunk_starts = _calculate_chunk_offsets(b)
        # Pair every chunk of b with the matching slice of a.
        return pa.chunked_array(
            [
                dispatch_chunked_binary_map(a[start : start + len(chunk)], chunk, ops)
                for chunk, start in zip(b.iterchunks(), chunk_starts)
            ]
        )

    if isinstance(b, pa.Array):
        kernel_name = "array_array"
    elif np.isscalar(b):
        # Scalars need no length validation.
        return ops.get("array_scalar", _not_implemented_path)(a, b)
    else:
        kernel_name = "array_nparray"

    if len(a) != len(b):
        raise ValueError("Inputs don't have the same length.")
    return ops.get(kernel_name, _not_implemented_path)(a, b)

Source File: _algorithms.py From fletcher with MIT License

6 votes

def np_ufunc_array_array(a: pa.Array, b: pa.Array, op: Callable):
    """Apply ``op`` to the data of two Arrow arrays and re-apply their nulls.

    The result is masked wherever either input reports a null; if neither
    input has nulls no mask is passed.
    """
    np_arr_a = _extract_data_buffer_as_np_array(a)
    np_arr_b = _extract_data_buffer_as_np_array(b)

    # Only extract a null bytemap from the inputs that actually have nulls.
    # TODO: Combine them before extracting
    null_maps = [extract_isnull_bytemap(arr) for arr in (a, b) if arr.null_count > 0]
    if len(null_maps) == 2:
        mask = null_maps[0] | null_maps[1]
    elif null_maps:
        mask = null_maps[0]
    else:
        mask = None

    # Don't set type as we might have valid casts like int->float in truediv
    return pa.array(op(np_arr_a, np_arr_b), mask=mask)

Source File: basic_stats_generator.py From data-validation with Apache License 2.0

6 votes

def update(self, feature_array: pa.Array) -> None:
    """Add the byte/character length of the input values to the running total.

    Null-typed arrays are ignored. Binary-like values contribute their total
    byte size; any other non-empty values contribute the length of their
    ``str()`` representation.
    """
    if pa.types.is_null(feature_array.type):
      return

    flat_values, _ = arrow_util.flatten_nested(feature_array)
    if arrow_util.is_binary_like(flat_values.type):
      # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
      # with Python3). To make sure we do cheaper integer arithmetic in
      # Python2, we first convert it to int.
      total_size = array_util.GetBinaryArrayTotalByteSize(flat_values)
      self.total_bytes_length += int(total_size)
      return

    if not flat_values:
      # The conversion to NumPy is only done for non-empty arrays.
      return

    # This could be computed faster by taking log10 of the integer.
    length_as_str = np.vectorize(lambda s: len(str(s)), otypes=[np.int32])
    self.total_bytes_length += np.sum(length_as_str(np.asarray(flat_values)))

Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

6 votes

def _ListArrayToTensor(
      self, list_array: pa.Array,
      produce_eager_tensors: bool) -> Union[np.ndarray, tf.Tensor]:
    """Converts a ListArray to a dense tensor.

    Args:
      list_array: the ListArray whose flattened values are reshaped.
      produce_eager_tensors: if True the result is passed through
        `tf.convert_to_tensor`; otherwise the np.ndarray is returned.

    Returns:
      The dense values, with the first dimension set to the batch size.

    Raises:
      ValueError: if the number of flattened values is not
        `len(list_array) * self._unbatched_flat_len`.
    """
    flat_values = list_array.flatten()
    num_rows = len(list_array)
    num_expected = num_rows * self._unbatched_flat_len
    num_actual = len(flat_values)
    if num_actual != num_expected:
      raise ValueError(
          "Unable to convert ListArray {} to {}: size mismatch. expected {} "
          "elements but got {}".format(
              list_array, self.type_spec, num_expected, num_actual))

    # The leading dimension of the configured shape becomes the batch size.
    dense_shape = list(self._shape)
    dense_shape[0] = num_rows

    to_binary = self._convert_to_binary_fn
    if to_binary is not None:
      flat_values = to_binary(flat_values)

    dense = np.asarray(flat_values).reshape(dense_shape)
    return tf.convert_to_tensor(dense) if produce_eager_tensors else dense

Source File: basic_stats_generator.py From data-validation with Apache License 2.0

6 votes

def update(self, feature_array: pa.Array, presence_mask: np.ndarray,
             num_values: np.ndarray, num_values_not_none: np.ndarray,
             weights: Optional[np.ndarray]) -> None:
    """Updates the stats with a feature array.

    Args:
      feature_array: the array whose non-null element count is added to
        `num_non_missing`.
      presence_mask: index/mask applied to `weights` when accumulating
        `weighted_num_non_missing`.
      num_values: per-element value counts, multiplied with `weights`.
      num_values_not_none: value counts folded into the min / max / total
        counters.
      weights: optional weights; when None the weighted counters are left
        untouched.

    Raises:
      ValueError: if `weights` is given but its size differs from the size
        of `num_values`.
    """
    num_present = len(feature_array) - feature_array.null_count
    self.num_non_missing += num_present

    # The running extrema are seeded through `initial` so that empty inputs
    # leave them unchanged.
    self.max_num_values = np.maximum.reduce(
        num_values_not_none, initial=self.max_num_values)
    self.min_num_values = np.minimum.reduce(
        num_values_not_none, initial=self.min_num_values)
    self.total_num_values += np.sum(num_values_not_none)

    if weights is None:
      return
    if weights.size != num_values.size:
      raise ValueError('Weight feature must not be missing.')
    self.weighted_total_num_values += np.sum(num_values * weights)
    self.weighted_num_non_missing += np.sum(weights[presence_mask])

Source File: chunking.py From fletcher with MIT License

5 votes

def apply_per_chunk(func):
    """Decorate ``func`` so that chunked inputs are processed chunk by chunk.

    For a ``pa.ChunkedArray`` the wrapped function is called once per chunk
    and the results are wrapped in a new chunked array; any other input is
    passed to ``func`` unchanged.
    """

    @wraps(func)
    def wrapper(arr: Union[pa.Array, pa.ChunkedArray], *args, **kwargs):
        if not isinstance(arr, pa.ChunkedArray):
            return func(arr, *args, **kwargs)

        processed = []
        for chunk in arr.chunks:
            processed.append(func(chunk, *args, **kwargs))
        return pa.chunked_array(processed)

    return wrapper

Source File: string.py From fletcher with MIT License

5 votes

def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Check for each element in the data whether it contains the pattern ``pat``.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.

    The result is a boolean array of the same length that carries the null
    count of ``data``.
    """
    length = len(data)

    # The pattern is matched on its UTF-8 encoded bytes.
    pat_bytes: bytes = pat.encode()

    # Allocate the bit-packed boolean output, rounded up to whole bytes.
    trailing_bits = length % 8
    n_bytes = length // 8 + (1 if trailing_bits > 0 else 0)
    output = np.empty(n_bytes, dtype=np.uint8)
    if trailing_bits > 0:
        # The buffer is uninitialised; clear the partially used last byte.
        output[-1] = 0

    offsets, data_buffer = _extract_string_buffers(data)

    if data.null_count != 0:
        validity = data.buffers()[0]
        _text_contains_case_sensitive_nulls(
            length,
            _buffer_to_view(validity),
            data.offset,
            offsets,
            data_buffer,
            pat_bytes,
            output,
        )
        # Drop the whole bytes in front of the array's offset ...
        byte_offset, bit_offset = divmod(data.offset, 8)
        valid_buffer = validity.slice(byte_offset)
        if bit_offset != 0:
            # ... and shift the bitmap when the offset is not byte-aligned.
            valid_buffer = shift_unaligned_bitmap(valid_buffer, bit_offset, length)
    else:
        valid_buffer = None
        _text_contains_case_sensitive_nonnull(
            length, offsets, data_buffer, pat_bytes, output
        )

    result_buffers = [valid_buffer, pa.py_buffer(output)]
    return pa.Array.from_buffers(pa.bool_(), length, result_buffers, data.null_count)

Source File: bool.py From fletcher with MIT License

5 votes

def any_op(arr: Union[pa.ChunkedArray, pa.Array], skipna: bool) -> bool:
    """Perform any() on a boolean Arrow structure.

    Chunked input is evaluated chunk by chunk and stops at the first truthy
    chunk result. For a plain array the kernel is picked by whether the
    array has nulls and by ``skipna``.
    """
    if isinstance(arr, pa.ChunkedArray):
        for chunk in arr.chunks:
            if any_op(chunk, skipna):
                return True
        return False

    length = len(arr)
    if arr.null_count == 0:
        # Only the data bitmap is needed when nothing is null.
        return _any_op_nonnull(length, arr.buffers()[1])

    kernel = _any_op_skipna if skipna else _any_op
    return kernel(length, *arr.buffers())

Source File: bool.py From fletcher with MIT License

5 votes

def all_op(arr: Union[pa.ChunkedArray, pa.Array], skipna: bool) -> bool:
    """Perform all() on a boolean Arrow structure.

    Chunked input is evaluated chunk by chunk and stops at the first falsy
    chunk result. ``skipna`` is only forwarded to the per-chunk calls.
    """
    if isinstance(arr, pa.ChunkedArray):
        for chunk in arr.chunks:
            if not all_op(chunk, skipna):
                return False
        return True

    length = len(arr)
    if arr.null_count != 0:
        # skipna is not relevant in the Pandas behaviour
        return _all_op(length, *arr.buffers())
    # Only the data bitmap is needed when nothing is null.
    return _all_op_nonnull(length, arr.buffers()[1])

Source File: string.py From fletcher with MIT License

5 votes

def _text_cat_chunked_2(a: pa.Array, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Apply ``_text_cat`` to a plain array and a chunked array.

    ``a`` is sliced to line up with every chunk of ``b``; the result has one
    chunk per chunk of ``b``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same length.
    """
    # Validate up front: without this check a longer ``a`` would silently be
    # truncated to the length of ``b``.
    if len(a) != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets = _calculate_chunk_offsets(b)
    return pa.chunked_array(
        [
            _text_cat(a[offset : offset + len(chunk)], chunk)
            for chunk, offset in zip(b.iterchunks(), offsets)
        ]
    )

Source File: string.py From fletcher with MIT License

5 votes

def _text_cat_chunked_1(a: pa.ChunkedArray, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Apply ``_text_cat`` to two chunked arrays.

    The chunk layouts of ``a`` and ``b`` are aligned through
    ``_combined_in_chunk_offsets``; each returned entry is read as
    ``(chunk index, start within the chunk, length)`` and one result chunk is
    produced per aligned pair of slices.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same length.
    """
    # Validate before aligning the chunk layouts, as the generic chunked
    # dispatch does for two chunked inputs.
    if len(a) != len(b):
        raise ValueError("Lengths of arrays don't match")

    in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

    new_chunks: List[pa.Array] = []
    for a_offset, b_offset in zip(in_a_offsets, in_b_offsets):
        a_slice = a.chunk(a_offset[0])[a_offset[1] : a_offset[1] + a_offset[2]]
        b_slice = b.chunk(b_offset[0])[b_offset[1] : b_offset[1] + b_offset[2]]
        new_chunks.append(_text_cat(a_slice, b_slice))
    return pa.chunked_array(new_chunks)

Source File: bool.py From fletcher with MIT License

5 votes

def all_true(arr: pa.Array) -> pa.Array:
    """Return an all-True, all-valid boolean array with the same size as ``arr``."""
    length = len(arr)
    # One bit per element, rounded up to whole bytes, with every bit set.
    n_bytes = (length + 7) // 8
    all_ones = np.full(n_bytes, 255, dtype=np.uint8)
    bitmap = pa.py_buffer(all_ones)
    # The same bitmap serves as validity and data buffer.
    return pa.Array.from_buffers(pa.bool_(), length, [bitmap, bitmap], 0)

Source File: bool.py From fletcher with MIT License

5 votes

def or_array_nparray(a: pa.Array, b: np.ndarray) -> pa.Array:
    """Perform ``pa.Array | np.ndarray``.

    The result bitmap is filled by the unaligned OR kernels; when ``a`` has
    nulls a separate validity bitmap is filled as well and the null count
    reported by the kernel is attached to the result.
    """
    length = len(a)
    # One bit per element, rounded up to whole bytes.
    n_bytes = (length + 7) // 8
    result = np.zeros(n_bytes, dtype=np.uint8)

    if a.null_count == 0:
        bitmap_or_unaligned_with_numpy_nonnull(
            length, a.buffers()[1], a.offset, b, result
        )
        valid_buffer = None
        null_count = 0
    else:
        valid_bits = np.zeros(n_bytes, dtype=np.uint8)
        null_count = bitmap_or_unaligned_with_numpy(
            length, a.buffers()[0], a.buffers()[1], a.offset, b, result, valid_bits
        )
        valid_buffer = pa.py_buffer(valid_bits)

    return pa.Array.from_buffers(
        pa.bool_(), length, [valid_buffer, pa.py_buffer(result)], null_count
    )

Source File: string.py From fletcher with MIT License

5 votes

def _text_cat_chunked(a: Any, b: pa.ChunkedArray) -> pa.ChunkedArray:
    """Reject unsupported inputs: this definition always raises NotImplementedError."""
    message = "_text_cat_chunked is only implemented for pa.Array and pa.ChunkedArray"
    raise NotImplementedError(message)

Source File: bool.py From fletcher with MIT License

5 votes

def or_vectorised(a: Union[pa.Array, pa.ChunkedArray], b: Any):
    """Perform OR on a boolean Arrow structure and a second operator.

    Only the array/array and array/ndarray kernels are registered here.
    """
    # Scalar should be handled by or_na or all_true
    return dispatch_chunked_binary_map(
        a,
        b,
        dict(array_array=or_array_array, array_nparray=or_array_nparray),
    )

Source File: _algorithms.py From fletcher with MIT License

5 votes

def np_ufunc_scalar_array(a: Any, b: pa.Array, op: Callable):
    """Apply ``op(a, b)`` where ``b`` is an Arrow array and ``a`` is not.

    ``a`` is non-masked, either array-like or scalar; the null bytemap of
    ``b`` is used as the mask of the result.
    """
    values = _extract_data_buffer_as_np_array(b)
    null_mask = extract_isnull_bytemap(b)
    # Scalars are wrapped in an ndarray before calling the ufunc.
    lhs = np.array(a) if np.isscalar(a) else a
    # Don't set type as we might have valid casts like int->float in truediv
    return pa.array(op(lhs, values), mask=null_mask)

Source File: chunking.py From fletcher with MIT License

5 votes

def _1(a: pa.ChunkedArray, b: Any, ops: Dict[str, Callable]):
    """Apply a NumPy ufunc where at least one of the arguments is an Arrow structure.

    ``a`` is chunked; the work is split into per-chunk calls of
    ``dispatch_chunked_binary_map`` and the results are collected into a new
    chunked array. A ``ValueError`` is raised when a non-scalar ``b`` differs
    in length.
    """
    if isinstance(b, pa.ChunkedArray):
        if len(a) != len(b):
            raise ValueError("Inputs don't have the same length.")
        in_a_offsets, in_b_offsets = _combined_in_chunk_offsets(a, b)

        # Each entry is read as (chunk index, start within chunk, length).
        pieces: List[pa.Array] = []
        for loc_a, loc_b in zip(in_a_offsets, in_b_offsets):
            start_a, start_b = loc_a[1], loc_b[1]
            piece_a = a.chunk(loc_a[0])[start_a : start_a + loc_a[2]]
            piece_b = b.chunk(loc_b[0])[start_b : start_b + loc_b[2]]
            pieces.append(dispatch_chunked_binary_map(piece_a, piece_b, ops))
        return pa.chunked_array(pieces)

    if np.isscalar(b):
        # The scalar is combined with every chunk as-is.
        return pa.chunked_array(
            [dispatch_chunked_binary_map(chunk, b, ops) for chunk in a.iterchunks()]
        )

    if len(a) != len(b):
        raise ValueError("Inputs don't have the same length.")
    chunk_starts = _calculate_chunk_offsets(a)
    # Pair every chunk of a with the matching slice of b.
    return pa.chunked_array(
        [
            dispatch_chunked_binary_map(chunk, b[start : start + len(chunk)], ops)
            for chunk, start in zip(a.iterchunks(), chunk_starts)
        ]
    )

Source File: types.py From cjworkbench with GNU Affero General Public License v3.0

5 votes

def series_to_arrow_array(series: pd.Series) -> pyarrow.Array:
    """
    Convert a Pandas series to an in-memory Arrow array.

    A series exposing the ``cat`` accessor becomes a dictionary array built
    from its codes and (recursively converted) categories; anything else is
    converted directly with the Arrow type derived from its dtype.
    """
    if not hasattr(series, "cat"):
        return pyarrow.array(series, type=_dtype_to_arrow_type(series.dtype))

    codes = series.cat.codes
    # Pandas categorical value "-1" means None
    indices = pyarrow.Array.from_pandas(codes, mask=(codes == -1))
    dictionary = series_to_arrow_array(series.cat.categories)
    return pyarrow.DictionaryArray.from_arrays(indices, dictionary)

Source File: array_util.py From tfx-bsl with Apache License 2.0

5 votes

def ToSingletonListArray(array: pa.Array):
  """Converts an array of `type` to a `ListArray<type>`.

  Where result[i] is null if array[i] is null; [array[i]] otherwise.

  Args:
    array: an arrow Array.
  Returns:
    a ListArray.
  """
  array_size = len(array)
  # fast path: values are not copied.
  if array.null_count == 0:
    return pa.ListArray.from_arrays(
        pa.array(np.arange(0, array_size + 1, dtype=np.int32)), array)

  # null_mask[i] = 1 iff array[i] is null.
  null_mask = np.asarray(GetArrayNullBitmapAsByteArray(array))
  # presence_mask[i] = 0 iff array[i] is null
  presence_mask = np.subtract(1, null_mask, dtype=np.uint8)
  # offsets_np[i + 1] = number of non-null elements among array[0..i].
  offsets_np = np.zeros((array_size + 1,), np.int32)
  np.cumsum(presence_mask, out=offsets_np[1:])

  # This is the null mask over offsets (but ListArray.from_arrays() uses it as
  # the null mask for the ListArray), so its length is array_size +1, but the
  # last element is always False.
  # np.bool_ is used instead of the `np.bool` alias, which NumPy deprecated
  # in 1.20 and removed in 1.24.
  list_array_null_mask = np.zeros((array_size + 1,), np.bool_)
  list_array_null_mask[:array_size] = null_mask.view(np.bool_)
  values_non_null = array.take(pa.array(np.flatnonzero(presence_mask)))
  return pa.ListArray.from_arrays(
      pa.array(offsets_np, mask=list_array_null_mask), values_non_null)

Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0

5 votes

def convert(self, tensor: TensorAlike) -> List[pa.Array]:
    """Validates `tensor` against the type spec, then converts it.

    Raises:
      TypeError: if `self._type_spec` is not compatible with `tensor`.
    """
    if self._type_spec.is_compatible_with(tensor):
      return self._convert_internal(tensor)

    raise TypeError("Expected {} but got {}".format(
        self._type_spec, _get_type_spec(tensor)))

Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0

5 votes

def _convert_internal(self, tensor: TensorAlike) -> List[pa.Array]:
    """Converts the given TensorAlike to a list of pa.Arrays.

    Each element in the list should correspond to one in `arrow_fields()`.

    NOTE(review): this definition has no body besides the docstring, so as
    written it returns None; presumably it is meant to be overridden by
    subclasses -- confirm.

    Args:
      tensor: the TensorAlike to be converted.

    Returns:
      A list of pa.Arrays, according to the return annotation.
    """

Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0

5 votes

def _convert_internal(self, tensor: TensorAlike) -> List[pa.Array]:
    """Converts a sparse tensor to a one-element list holding a ListArray.

    The tensor is first turned into a RaggedTensor; its row splits become
    the int32 list offsets and its values become the list values.
    """
    ragged = tf.RaggedTensor.from_sparse(tensor)
    offsets = pa.array(ragged.row_splits.numpy(), type=pa.int32())
    values = pa.array(ragged.values.numpy(), type=self._values_arrow_type)
    return [pa.ListArray.from_arrays(offsets, values)]

Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

5 votes

def _GetConvertToBinaryFn(
    array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
  """Returns a function that converts a StringArray to BinaryArray.

  For string types the function views the array as `binary`, for large
  string types as `large_binary`. None is returned for any other type.
  """
  if pa.types.is_string(array_type):
    make_binary_type = pa.binary
  elif pa.types.is_large_string(array_type):
    make_binary_type = pa.large_binary
  else:
    return None

  return lambda array: array.view(make_binary_type())

Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0

5 votes

def CreateRawRecordColumn(
    raw_records: List[bytes], produce_large_types: bool) -> pa.Array:
  """Returns an Array that satisfies the requirement of a raw record column.

  Args:
    raw_records: the records; each one ends up in its own list of length 1.
    produce_large_types: if True, a LargeListArray of large_binary is
      produced; otherwise a ListArray of binary.
  """
  if produce_large_types:
    make_list_array = pa.LargeListArray.from_arrays
    binary_type = pa.large_binary()
  else:
    make_list_array = pa.ListArray.from_arrays
    binary_type = pa.binary()

  # Offsets 0, 1, ..., n: one record per list.
  offsets = np.arange(0, len(raw_records) + 1, dtype=np.int64)
  values = pa.array(raw_records, type=binary_type)
  return make_list_array(offsets, values)

Source File: bool.py From coffeegrindsize with MIT License

5 votes

def from_array(cls, arr):
        """Build an instance of ``cls`` from a ``pa.Array`` wrapped as a one-chunk chunked array."""
        assert isinstance(arr, pa.Array)
        single_chunk = pa.chunked_array([arr])
        return cls(single_chunk)

Source File: arrow_util.py From data-validation with Apache License 2.0

5 votes

def is_binary_like(data_type: pa.DataType) -> bool:
  """Returns true if an Arrow type is binary-like.

  Qualified types are {Large,}BinaryArray, {Large,}StringArray.

  Args:
    data_type: a pa.DataType.

  Returns:
    bool.
  """
  arrow_types = pa.types
  if (arrow_types.is_binary(data_type) or
      arrow_types.is_large_binary(data_type)):
    return True
  return (arrow_types.is_unicode(data_type) or
          arrow_types.is_large_unicode(data_type))

Source File: basic_stats_generator.py From data-validation with Apache License 2.0

5 votes

def update(self, feature_array: pa.Array) -> None:
    """Update the partial bytes statistics using the input value.

    Null-typed and empty inputs leave the statistics untouched.

    Raises:
      ValueError: if the flattened values are of a floating or integer type.
    """
    if pa.types.is_null(feature_array.type):
      return

    flat_values, _ = arrow_util.flatten_nested(feature_array)
    flat_type = flat_values.type
    if pa.types.is_floating(flat_type) or pa.types.is_integer(flat_type):
      raise ValueError('Bytes stats cannot be computed on INT/FLOAT features.')

    if not flat_values:
      # Nothing to aggregate for an empty array.
      return

    # Per-element lengths as a NumPy array.
    num_bytes = array_util.GetElementLengths(flat_values).to_numpy()
    self.min_num_bytes = min(self.min_num_bytes, np.min(num_bytes))
    self.max_num_bytes = max(self.max_num_bytes, np.max(num_bytes))
    self.total_num_bytes += np.sum(num_bytes)

Python pyarrow.Array() Examples