Python pyarrow.DataType() Examples

The following are 30 code examples of pyarrow.DataType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyarrow, or try the search function.
Example #1
Source File: dataset.py    From kartothek with MIT License 6 votes vote down vote up
def _get_type_from_meta(
    table_meta: Optional[Dict[str, SchemaWrapper]],
    column: str,
    default: Optional[pa.DataType],
) -> pa.DataType:
    """Look up the Arrow type of ``column`` from the given table schemas.

    Scans the schemas in order and returns the type from the first one that
    contains the column; falls back to ``default`` when no schema knows it.

    Raises:
        ValueError: if neither the schemas nor ``default`` provide a type.
    """
    # The write path normalizes types, so any schema that has the column is
    # an equally valid source of type information.
    for schema in (table_meta or {}).values():
        if column in schema.names:
            return schema[schema.get_field_index(column)].type

    if default is not None:
        return default

    raise ValueError(
        'Cannot find type information for partition column "{}"'.format(column)
    )
Example #2
Source File: arrow_util.py    From data-validation with Apache License 2.0 6 votes vote down vote up
def get_nest_level(array_type: pa.DataType) -> int:
  """Returns the nest level of an array type.

  The nest level of primitive types is 0. The nest level of null is 1,
  because a null array stands in for list<unknown_type>. The nest level of
  list<inner_type> is get_nest_level(inner_type) + 1.

  Args:
    array_type: pa.DataType

  Returns:
    the nest level.
  """
  depth = 0
  current = array_type
  while is_list_like(current):
    current = current.value_type
    depth += 1

  # A trailing null type counts as one extra level: null is treated as
  # list<unknown_primitive>.
  return depth + 1 if pa.types.is_null(current) else depth
Example #3
Source File: index.py    From kartothek with MIT License 6 votes vote down vote up
def __init__(
        self,
        column: str,
        index_dct: Optional[IndexDictType] = None,
        index_storage_key: Optional[str] = None,
        dtype: Optional[pa.DataType] = None,
        normalize_dtype: bool = True,
    ):
        """Create an explicit secondary index backed by a dict or a storage key."""
        # At least one data source (inline dict or storage key) is required.
        if index_dct is None and not index_storage_key:
            raise ValueError("No valid index source specified")
        # Without stored data, inline data, or an explicit dtype there is no
        # way to determine the index type safely.
        if not (index_storage_key or index_dct) and dtype is None:
            raise ValueError("Trying to create non-typesafe index")
        self.index_storage_key = index_storage_key
        super(ExplicitSecondaryIndex, self).__init__(
            column=column,
            index_dct=index_dct,
            dtype=dtype,
            normalize_dtype=normalize_dtype,
        )
Example #4
Source File: index.py    From kartothek with MIT License 6 votes vote down vote up
def __init__(
        self,
        column: str,
        index_dct: Optional[IndexDictType] = None,
        dtype: Optional[pa.DataType] = None,
        normalize_dtype: bool = True,
    ):
        """Create a partition index; a concrete ``dtype`` is mandatory.

        Raises:
            ValueError: if ``dtype`` is None.
        """
        # FIX: annotation was `pa.DataType = None`, which contradicts the None
        # default; `Optional[pa.DataType]` matches the sibling constructor of
        # ExplicitSecondaryIndex. A None value is still rejected at runtime.
        if dtype is None:
            raise ValueError(
                'PartitionIndex dtype of column "{}" cannot be None!'.format(column)
            )
        super(PartitionIndex, self).__init__(
            column=column,
            index_dct=index_dct,
            dtype=dtype,
            normalize_dtype=normalize_dtype,
        )
Example #5
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def _GetNestDepthAndValueType(
    arrow_schema: pa.Schema,
    column_path: path.ColumnPath) -> Tuple[int, pa.DataType]:
  """Returns the depth of a leaf field, and its innermost value type.

  The depth is the number of nested lists wrapping the leaf field.

  Args:
    arrow_schema: The arrow schema to traverse.
    column_path: A path of field names. The path must describe a leaf struct.

  Returns: A Tuple of depth and arrow type
  """
  depth = 0
  # Start with the type of the outermost field; if the enumeration below
  # yields nothing, this initial value is also the returned type.
  innermost_type = arrow_schema.field(column_path.steps()[0]).type
  for innermost_type in _EnumerateTypesAlongPath(arrow_schema, column_path):
    if _IsListLike(innermost_type):
      depth += 1

  # After the loop, innermost_type holds the last type along the path.
  return depth, innermost_type
Example #6
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def _GetAllowedDefaultValue(
    value_type: pa.DataType,
    default_value_proto: schema_pb2.TensorRepresentation.DefaultValue
) -> Union[int, float, bytes]:
  """Returns the default value set in DefaultValue proto or raises."""
  kind = default_value_proto.WhichOneof("kind")

  if kind in ("int_value", "uint_value"):
    if pa.types.is_integer(value_type):
      value = getattr(default_value_proto, kind)
      # Reject defaults that cannot be represented by the column's dtype.
      bounds = np.iinfo(value_type.to_pandas_dtype())
      if bounds.min <= value <= bounds.max:
        return value
      raise ValueError("Integer default value out of range: {} is set for a "
                       "{} column".format(value, value_type))
  elif kind == "float_value" and pa.types.is_floating(value_type):
    return default_value_proto.float_value
  elif kind == "bytes_value" and _IsBinaryLike(value_type):
    return default_value_proto.bytes_value

  raise ValueError(
      "Incompatible default value: {} is set for a {} column".format(
          kind, value_type))
Example #7
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0 6 votes vote down vote up
def _pyarrow_type_to_column_type(
    dtype: pyarrow.DataType, fallback_column_type: Optional[ColumnType]
) -> ColumnType:
    """Map a pyarrow dtype to the corresponding ColumnType.

    Number formats are taken from ``fallback_column_type`` when it is also a
    number type; dictionary-encoded string columns count as text.

    Raises:
        ValueError: if ``dtype`` has no ColumnType equivalent.
    """
    if pyarrow.types.is_floating(dtype) or pyarrow.types.is_integer(dtype):
        if fallback_column_type is not None and fallback_column_type.name == "number":
            # Preserve the caller-supplied number format.
            return ColumnTypeNumber(fallback_column_type.format)
        else:
            return ColumnTypeNumber()
    elif pyarrow.types.is_string(dtype) or (
        pyarrow.types.is_dictionary(dtype) and pyarrow.types.is_string(dtype.value_type)
    ):
        return ColumnTypeText()
    elif pyarrow.types.is_timestamp(dtype):
        return ColumnTypeDatetime()
    else:
        # BUG FIX: the error was previously *returned* instead of raised,
        # silently handing callers a ValueError instance as the "column type".
        raise ValueError("Unknown pyarrow type %r" % dtype)
Example #8
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _ArrowTypeToTfDtype(arrow_type: pa.DataType) -> tf.DType:
  """Maps an Arrow type to the corresponding TensorFlow dtype."""
  # TODO(zhuo): Remove the special handling for LargeString/Binary when
  # to_pandas_dtype() can handle them.
  if _IsBinaryLike(arrow_type):
    return tf.string
  pandas_dtype = arrow_type.to_pandas_dtype()
  return tf.dtypes.as_dtype(pandas_dtype)
Example #9
Source File: result_set.py    From incubator-superset with Apache License 2.0 5 votes vote down vote up
def data_type(self, col_name: str, pa_dtype: pa.DataType) -> Optional[str]:
        """Given a pyarrow data type, Returns a generic database type"""
        # Prefer an explicitly configured type, then fall back to a mapping
        # derived from the pyarrow dtype; None when neither is available.
        return (
            self._type_dict.get(col_name)
            or self.convert_pa_dtype(pa_dtype)
            or None
        )
Example #10
Source File: table_util.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def NumpyKindToArrowType(kind: Text) -> Optional[pa.DataType]:
  """Maps a numpy dtype kind character to an Arrow type, or None if unknown."""
  mapping = _NUMPY_KIND_TO_ARROW_TYPE
  return mapping.get(kind, None)
Example #11
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _IsListLike(arrow_type: pa.DataType) -> bool:
  """True if the Arrow type is a list or a large_list."""
  return any(check(arrow_type)
             for check in (pa.types.is_list, pa.types.is_large_list))
Example #12
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _IsBinaryLike(arrow_type: pa.DataType) -> bool:
  """True for binary, large_binary, string or large_string Arrow types."""
  checks = (pa.types.is_binary, pa.types.is_large_binary,
            pa.types.is_string, pa.types.is_large_string)
  return any(check(arrow_type) for check in checks)
Example #13
Source File: batched_input_extractor.py    From model-analysis with Apache License 2.0 5 votes vote down vote up
def _IsSupportedArrowValueType(arrow_type: pa.DataType) -> bool:
  """True when the value type is integral, floating-point or binary-like."""
  if pa.types.is_integer(arrow_type):
    return True
  if pa.types.is_floating(arrow_type):
    return True
  return _IsBinaryLike(arrow_type)
Example #14
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _GetDefaultFill(
    unbatched_shape: List[int], value_type: pa.DataType,
    default_value_proto: schema_pb2.TensorRepresentation.DefaultValue
) -> pa.Array:
  """Returns an Array full of the default value given in the proto."""
  # initial=1 makes an empty shape yield a single (scalar) element.
  size = int(np.prod(unbatched_shape, initial=1))
  fill_value = _GetAllowedDefaultValue(value_type, default_value_proto)
  return pa.array([fill_value] * size, type=value_type)
Example #15
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _GetConvertToBinaryFn(
    array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
  """Returns a function that converts a StringArray to BinaryArray."""
  # Each string flavor maps to the binary flavor with matching offsets.
  conversions = (
      (pa.types.is_string, pa.binary),
      (pa.types.is_large_string, pa.large_binary),
  )
  for predicate, target_factory in conversions:
    if predicate(array_type):
      target = target_factory()
      return lambda array: array.view(target)
  # Not a string type: no conversion applies.
  return None
Example #16
Source File: telemetry.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _IsListLike(data_type: pa.DataType) -> bool:
  """True if data_type is a list or a large_list."""
  if pa.types.is_list(data_type):
    return True
  return pa.types.is_large_list(data_type)
Example #17
Source File: csv_decoder.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _GetFeatureTypeToArrowTypeMapping(
    large_types: bool) -> Dict[int, pa.DataType]:
  """Maps CSV column types to Arrow types, optionally with 64-bit offsets."""
  # Select the list wrapper and binary payload matching the requested offsets.
  list_factory = pa.large_list if large_types else pa.list_
  binary_type = pa.large_binary() if large_types else pa.binary()
  return {
      ColumnType.UNKNOWN: pa.null(),
      ColumnType.INT: list_factory(pa.int64()),
      ColumnType.FLOAT: list_factory(pa.float32()),
      ColumnType.STRING: list_factory(binary_type),
  }
Example #18
Source File: batched_input_extractor.py    From model-analysis with Apache License 2.0 5 votes vote down vote up
def _IsListLike(arrow_type: pa.DataType) -> bool:
  """Checks whether arrow_type is a list or a large_list."""
  return any(is_list(arrow_type)
             for is_list in (pa.types.is_list, pa.types.is_large_list))
Example #19
Source File: batched_input_extractor.py    From model-analysis with Apache License 2.0 5 votes vote down vote up
def _IsBinaryLike(arrow_type: pa.DataType) -> bool:
  """Checks whether arrow_type is a (large) binary or (large) string."""
  binary_like_checks = (pa.types.is_binary, pa.types.is_large_binary,
                        pa.types.is_string, pa.types.is_large_string)
  return any(check(arrow_type) for check in binary_like_checks)
Example #20
Source File: stats_util.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def get_feature_type_from_arrow_type(
    feature_path: types.FeaturePath,
    arrow_type: pa.DataType) -> Optional[types.FeatureNameStatisticsType]:
  """Get feature type from Arrow type.

  Args:
    feature_path: path of the feature.
    arrow_type: Arrow DataType.

  Returns:
    A statistics_pb2.FeatureNameStatistics.Type value or None if arrow_type
    is null (which means it cannot be determined for now).

  Raises:
    TypeError: if the type is not supported.
  """
  # A null column carries no type information yet.
  if pa.types.is_null(arrow_type):
    return None
  if not arrow_util.is_list_like(arrow_type):
    raise TypeError('Expected feature column to be a '
                    '(Large)List<primitive|struct> or null, but feature {} '
                    'was {}.'.format(feature_path, arrow_type))

  value_type = arrow_util.get_innermost_nested_type(arrow_type)
  if pa.types.is_null(value_type):
    # Inner type still unknown; cannot be determined for now.
    return None
  if pa.types.is_integer(value_type):
    return statistics_pb2.FeatureNameStatistics.INT
  if pa.types.is_floating(value_type):
    return statistics_pb2.FeatureNameStatistics.FLOAT
  if arrow_util.is_binary_like(value_type):
    return statistics_pb2.FeatureNameStatistics.STRING
  if pa.types.is_struct(value_type):
    return statistics_pb2.FeatureNameStatistics.STRUCT

  raise TypeError('Feature {} has unsupported arrow type: {}'.format(
      feature_path, arrow_type))
Example #21
Source File: result_set.py    From incubator-superset with Apache License 2.0 5 votes vote down vote up
def convert_pa_dtype(pa_dtype: pa.DataType) -> Optional[str]:
        """Maps a pyarrow dtype to a generic database type name, or None."""
        # Predicate table checked in order; first match wins.
        checks = (
            (pa.types.is_boolean, "BOOL"),
            (pa.types.is_integer, "INT"),
            (pa.types.is_floating, "FLOAT"),
            (pa.types.is_string, "STRING"),
            (pa.types.is_temporal, "DATETIME"),
        )
        for predicate, generic_name in checks:
            if predicate(pa_dtype):
                return generic_name
        return None
Example #22
Source File: validate.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def __init__(self, name: str, expected: ColumnType, actual: pyarrow.DataType):
        """Build an error describing a column whose type differs from the schema."""
        message = (
            "Table column '%s' has wrong type: expected %r, got %r"
            % (name, expected, actual)
        )
        super().__init__(message)
Example #23
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    """Translate a numpy dtype into the equivalent pyarrow type.

    Raises:
        RuntimeError: for dtypes with no Arrow equivalent.
    """
    # Scalar mappings checked by numpy dtype equality, in the original order.
    scalar_mappings = (
        (np.int8, pyarrow.int8),
        (np.int16, pyarrow.int16),
        (np.int32, pyarrow.int32),
        (np.int64, pyarrow.int64),
        (np.uint8, pyarrow.uint8),
        (np.uint16, pyarrow.uint16),
        (np.uint32, pyarrow.uint32),
        (np.uint64, pyarrow.uint64),
        (np.float16, pyarrow.float16),
        (np.float32, pyarrow.float32),
        (np.float64, pyarrow.float64),
    )
    for np_type, arrow_factory in scalar_mappings:
        if dtype == np_type:
            return arrow_factory()
    if dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    if dtype == np.object_:
        return pyarrow.string()
    raise RuntimeError("Unhandled dtype %r" % dtype)
Example #24
Source File: base.py    From fletcher with MIT License 5 votes vote down vote up
def __init__(self, arrow_dtype: pa.DataType):
        # Store the underlying pyarrow type that this extension dtype wraps;
        # other methods presumably dispatch on it — confirm against the class.
        self.arrow_dtype = arrow_dtype
Example #25
Source File: base.py    From fletcher with MIT License 5 votes vote down vote up
def _is_numeric(arrow_dtype: pa.DataType) -> bool:
    """True for integer, floating-point and decimal Arrow types."""
    numeric_checks = (
        pa.types.is_integer,
        pa.types.is_floating,
        pa.types.is_decimal,
    )
    return any(check(arrow_dtype) for check in numeric_checks)
Example #26
Source File: base.py    From fletcher with MIT License 5 votes vote down vote up
def _get_example(arrow_dtype: pa.DataType) -> pa.Array:
    """Return a small example array of the given type, recursing into lists."""
    if not isinstance(arrow_dtype, pa.ListType):
        # Non-list types have a canonical example registered in _examples.
        return _examples[arrow_dtype]
    # Build a list example from the inner type's example, plus a null entry.
    inner = _get_example(arrow_dtype.value_type).to_pylist()
    return pa.array([None, inner], type=arrow_dtype)
Example #27
Source File: test_pandas_extension.py    From fletcher with MIT License 5 votes vote down vote up
def is_arithmetic_type(arrow_dtype: pa.DataType) -> bool:
    """Check whether this is a type that support arithmetics."""
    if pa.types.is_integer(arrow_dtype):
        return True
    if pa.types.is_floating(arrow_dtype):
        return True
    return pa.types.is_decimal(arrow_dtype)
Example #28
Source File: serializers.py    From LearningApacheSpark with MIT License 5 votes vote down vote up
def _create_batch(series, timezone):
    """
    Create an Arrow record batch from the given pandas.Series or list of Series, with optional type.

    :param series: A single pandas.Series, list of Series, or list of (series, arrow_type)
    :param timezone: A timezone to respect when handling timestamp values
    :return: Arrow RecordBatch
    """
    import decimal
    from distutils.version import LooseVersion
    import pyarrow as pa
    from pyspark.sql.types import _check_series_convert_timestamps_internal
    # Make input conform to [(series1, type1), (series2, type2), ...]
    # A bare series, or a single (series, arrow_type) pair, is wrapped in a
    # one-element list first.
    if not isinstance(series, (list, tuple)) or \
            (len(series) == 2 and isinstance(series[1], pa.DataType)):
        series = [series]
    # Pair each bare series with a None type so every entry is (series, type).
    series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series)

    def create_array(s, t):
        # Nulls are tracked via an explicit mask so Arrow preserves them.
        mask = s.isnull()
        # Ensure timestamp series are in expected form for Spark internal representation
        # TODO: maybe don't need None check anymore as of Arrow 0.9.1
        if t is not None and pa.types.is_timestamp(t):
            s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
            # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
            return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
        elif t is not None and pa.types.is_string(t) and sys.version < '3':
            # TODO: need decode before converting to Arrow in Python 2
            # TODO: don't need as of Arrow 0.9.1
            return pa.Array.from_pandas(s.apply(
                lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
        elif t is not None and pa.types.is_decimal(t) and \
                LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            # TODO: see ARROW-2432. Remove when the minimum PyArrow version becomes 0.10.0.
            return pa.Array.from_pandas(s.apply(
                lambda v: decimal.Decimal('NaN') if v is None else v), mask=mask, type=t)
        return pa.Array.from_pandas(s, mask=mask, type=t)

    # Columns are named positionally ("_0", "_1", ...); note xrange implies
    # this code path targets Python 2.
    arrs = [create_array(s, t) for s, t in series]
    return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))])
Example #29
Source File: arrow_util.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def get_innermost_nested_type(arrow_type: pa.DataType) -> pa.DataType:
  """Returns the innermost type of a nested list type."""
  current = arrow_type
  # Peel off (large_)list wrappers until a non-list type remains.
  while is_list_like(current):
    current = current.value_type
  return current
Example #30
Source File: arrow_util.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def is_list_like(data_type: pa.DataType) -> bool:
  """Returns true if an Arrow type is list-like."""
  if pa.types.is_list(data_type):
    return True
  return pa.types.is_large_list(data_type)