Python pyarrow.DataType() Examples
The following are 30 code examples of pyarrow.DataType(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out all available functions and classes of the pyarrow module.
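For orientation (a minimal sketch, not taken from any of the projects below): every concrete Arrow type, such as int64 or list&lt;int64&gt;, is an instance of pa.DataType, and the predicates in pa.types inspect such instances.

import pyarrow as pa

t = pa.list_(pa.int64())     # a list&lt;int64&gt; type
isinstance(t, pa.DataType)   # True
pa.types.is_list(t)          # True
t.value_type                 # DataType(int64)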
Example #1
Source File: dataset.py From kartothek with MIT License

def _get_type_from_meta(
    table_meta: Optional[Dict[str, SchemaWrapper]],
    column: str,
    default: Optional[pa.DataType],
) -> pa.DataType:
    # use first schema that provides type information, since write path
    # should ensure that types are normalized and equal
    if table_meta is not None:
        for schema in table_meta.values():
            if column not in schema.names:
                continue
            idx = schema.get_field_index(column)
            return schema[idx].type

    if default is not None:
        return default

    raise ValueError(
        'Cannot find type information for partition column "{}"'.format(column)
    )
Example #2
Source File: arrow_util.py From data-validation with Apache License 2.0

def get_nest_level(array_type: pa.DataType) -> int:
    """Returns the nest level of an array type.

    The nest level of primitive types is 0. The nest level of null is 1,
    because a null array represents list<unknown_type>. The nest level of
    list<inner_type> is get_nest_level(inner_type) + 1.

    Args:
      array_type: pa.DataType

    Returns:
      The nest level.
    """
    result = 0
    while is_list_like(array_type):
        result += 1
        array_type = array_type.value_type

    # null is like list<unknown_primitive>
    if pa.types.is_null(array_type):
        result += 1
    return result
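For illustration, a few expected results (a minimal sketch, assuming get_nest_level and the is_list_like helper from Example #30 are in scope):

import pyarrow as pa

get_nest_level(pa.int64())                           # 0 -- primitive
get_nest_level(pa.null())                            # 1 -- null stands in for list<unknown>
get_nest_level(pa.list_(pa.int64()))                 # 1
get_nest_level(pa.large_list(pa.list_(pa.int64())))  # 2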
Example #3
Source File: index.py From kartothek with MIT License

def __init__(
    self,
    column: str,
    index_dct: Optional[IndexDictType] = None,
    index_storage_key: Optional[str] = None,
    dtype: Optional[pa.DataType] = None,
    normalize_dtype: bool = True,
):
    if (index_dct is None) and not index_storage_key:
        raise ValueError("No valid index source specified")
    if not index_storage_key and not index_dct and dtype is None:
        raise ValueError("Trying to create non-typesafe index")
    self.index_storage_key = index_storage_key
    super(ExplicitSecondaryIndex, self).__init__(
        column=column,
        index_dct=index_dct,
        dtype=dtype,
        normalize_dtype=normalize_dtype,
    )
Example #4
Source File: index.py From kartothek with MIT License

def __init__(
    self,
    column: str,
    index_dct: Optional[IndexDictType] = None,
    dtype: Optional[pa.DataType] = None,
    normalize_dtype: bool = True,
):
    if dtype is None:
        raise ValueError(
            'PartitionIndex dtype of column "{}" cannot be None!'.format(column)
        )
    super(PartitionIndex, self).__init__(
        column=column,
        index_dct=index_dct,
        dtype=dtype,
        normalize_dtype=normalize_dtype,
    )
Example #5
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

def _GetNestDepthAndValueType(
    arrow_schema: pa.Schema,
    column_path: path.ColumnPath) -> Tuple[int, pa.DataType]:
    """Returns the depth of a leaf field, and its innermost value type.

    The depth is the number of nested lists wrapping the leaf field.

    Args:
      arrow_schema: The arrow schema to traverse.
      column_path: A path of field names. The path must describe a leaf struct.

    Returns:
      A tuple of (depth, arrow type).
    """
    arrow_type = arrow_schema.field(column_path.steps()[0]).type
    depth = 0
    for arrow_type in _EnumerateTypesAlongPath(arrow_schema, column_path):
        if _IsListLike(arrow_type):
            depth += 1

    return depth, arrow_type
Example #6
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

def _GetAllowedDefaultValue(
    value_type: pa.DataType,
    default_value_proto: schema_pb2.TensorRepresentation.DefaultValue
) -> Union[int, float, bytes]:
    """Returns the default value set in DefaultValue proto or raises."""
    kind = default_value_proto.WhichOneof("kind")
    if kind in ("int_value", "uint_value") and pa.types.is_integer(value_type):
        value = getattr(default_value_proto, kind)
        iinfo = np.iinfo(value_type.to_pandas_dtype())
        if value <= iinfo.max and value >= iinfo.min:
            return value
        else:
            raise ValueError(
                "Integer default value out of range: {} is set for a "
                "{} column".format(value, value_type))
    elif kind == "float_value" and pa.types.is_floating(value_type):
        return default_value_proto.float_value
    elif kind == "bytes_value" and _IsBinaryLike(value_type):
        return default_value_proto.bytes_value

    raise ValueError(
        "Incompatible default value: {} is set for a {} column".format(
            kind, value_type))
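A hypothetical call, assuming schema_pb2 is the tensorflow_metadata schema proto that tfx-bsl uses (whose DefaultValue oneof carries int_value, uint_value, float_value, or bytes_value, matching the kinds the code above checks):

import pyarrow as pa
from tensorflow_metadata.proto.v0 import schema_pb2

proto = schema_pb2.TensorRepresentation.DefaultValue(int_value=7)
_GetAllowedDefaultValue(pa.int8(), proto)   # 7 -- fits within int8's range
# int_value=1000 against pa.int8() would raise ValueError: out of range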
Example #7
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0

def _pyarrow_type_to_column_type(
    dtype: pyarrow.DataType, fallback_column_type: Optional[ColumnType]
) -> ColumnType:
    if pyarrow.types.is_floating(dtype) or pyarrow.types.is_integer(dtype):
        if fallback_column_type is not None and fallback_column_type.name == "number":
            return ColumnTypeNumber(fallback_column_type.format)
        else:
            return ColumnTypeNumber()
    elif pyarrow.types.is_string(dtype) or (
        pyarrow.types.is_dictionary(dtype)
        and pyarrow.types.is_string(dtype.value_type)
    ):
        return ColumnTypeText()
    elif pyarrow.types.is_timestamp(dtype):
        return ColumnTypeDatetime()
    else:
        raise ValueError("Unknown pyarrow type %r" % dtype)
Example #8
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

def _ArrowTypeToTfDtype(arrow_type: pa.DataType) -> tf.DType:
    # TODO(zhuo): Remove the special handling for LargeString/Binary when
    # to_pandas_dtype() can handle them.
    if _IsBinaryLike(arrow_type):
        return tf.string
    return tf.dtypes.as_dtype(arrow_type.to_pandas_dtype())
Example #9
Source File: result_set.py From incubator-superset with Apache License 2.0

def data_type(self, col_name: str, pa_dtype: pa.DataType) -> Optional[str]:
    """Given a pyarrow data type, returns a generic database type."""
    set_type = self._type_dict.get(col_name)
    if set_type:
        return set_type

    mapped_type = self.convert_pa_dtype(pa_dtype)
    if mapped_type:
        return mapped_type

    return None
Example #10
Source File: table_util.py From tfx-bsl with Apache License 2.0

def NumpyKindToArrowType(kind: Text) -> Optional[pa.DataType]:
    return _NUMPY_KIND_TO_ARROW_TYPE.get(kind)
Example #11
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

def _IsListLike(arrow_type: pa.DataType) -> bool:
    return pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type)
Example #12
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

def _IsBinaryLike(arrow_type: pa.DataType) -> bool:
    return (pa.types.is_binary(arrow_type) or
            pa.types.is_large_binary(arrow_type) or
            pa.types.is_string(arrow_type) or
            pa.types.is_large_string(arrow_type))
Example #13
Source File: batched_input_extractor.py From model-analysis with Apache License 2.0

def _IsSupportedArrowValueType(arrow_type: pa.DataType) -> bool:
    return (pa.types.is_integer(arrow_type) or
            pa.types.is_floating(arrow_type) or
            _IsBinaryLike(arrow_type))
Example #14
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

def _GetDefaultFill(
    unbatched_shape: List[int],
    value_type: pa.DataType,
    default_value_proto: schema_pb2.TensorRepresentation.DefaultValue
) -> pa.Array:
    """Returns an Array full of the default value given in the proto."""
    size = int(np.prod(unbatched_shape, initial=1))
    return pa.array(
        [_GetAllowedDefaultValue(value_type, default_value_proto)] * size,
        type=value_type)
Example #15
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0

def _GetConvertToBinaryFn(
    array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
    """Returns a function that converts a StringArray to BinaryArray."""
    if pa.types.is_string(array_type):
        return lambda array: array.view(pa.binary())
    if pa.types.is_large_string(array_type):
        return lambda array: array.view(pa.large_binary())
    return None
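Used roughly like this (a sketch; pa.Array.view reinterprets the existing buffers as the target type without copying):

import pyarrow as pa

convert = _GetConvertToBinaryFn(pa.string())
if convert is not None:
    arr = convert(pa.array(["a", "bc"]))
    arr.type   # DataType(binary)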
Example #16
Source File: telemetry.py From tfx-bsl with Apache License 2.0

def _IsListLike(data_type: pa.DataType) -> bool:
    return pa.types.is_list(data_type) or pa.types.is_large_list(data_type)
Example #17
Source File: csv_decoder.py From tfx-bsl with Apache License 2.0

def _GetFeatureTypeToArrowTypeMapping(
    large_types: bool) -> Dict[int, pa.DataType]:
    if large_types:
        return {
            ColumnType.UNKNOWN: pa.null(),
            ColumnType.INT: pa.large_list(pa.int64()),
            ColumnType.FLOAT: pa.large_list(pa.float32()),
            ColumnType.STRING: pa.large_list(pa.large_binary())
        }
    return {
        ColumnType.UNKNOWN: pa.null(),
        ColumnType.INT: pa.list_(pa.int64()),
        ColumnType.FLOAT: pa.list_(pa.float32()),
        ColumnType.STRING: pa.list_(pa.binary())
    }
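For example, with large_types=True the STRING column type maps to large_list<large_binary> rather than list<binary> (a minimal sketch, assuming csv_decoder's ColumnType enum is in scope):

import pyarrow as pa

mapping = _GetFeatureTypeToArrowTypeMapping(large_types=True)
mapping[ColumnType.STRING] == pa.large_list(pa.large_binary())   # True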
Example #18
Source File: batched_input_extractor.py From model-analysis with Apache License 2.0

def _IsListLike(arrow_type: pa.DataType) -> bool:
    return pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type)
Example #19
Source File: batched_input_extractor.py From model-analysis with Apache License 2.0

def _IsBinaryLike(arrow_type: pa.DataType) -> bool:
    return (pa.types.is_binary(arrow_type) or
            pa.types.is_large_binary(arrow_type) or
            pa.types.is_string(arrow_type) or
            pa.types.is_large_string(arrow_type))
Example #20
Source File: stats_util.py From data-validation with Apache License 2.0

def get_feature_type_from_arrow_type(
    feature_path: types.FeaturePath,
    arrow_type: pa.DataType) -> Optional[types.FeatureNameStatisticsType]:
    """Get feature type from Arrow type.

    Args:
      feature_path: path of the feature.
      arrow_type: Arrow DataType.

    Returns:
      A statistics_pb2.FeatureNameStatistics.Type value or None if arrow_type
      is null (which means it cannot be determined for now).

    Raises:
      TypeError: if the type is not supported.
    """
    if pa.types.is_null(arrow_type):
        return None
    if not arrow_util.is_list_like(arrow_type):
        raise TypeError('Expected feature column to be a '
                        '(Large)List<primitive|struct> or null, but feature {} '
                        'was {}.'.format(feature_path, arrow_type))

    value_type = arrow_util.get_innermost_nested_type(arrow_type)
    if pa.types.is_integer(value_type):
        return statistics_pb2.FeatureNameStatistics.INT
    elif pa.types.is_floating(value_type):
        return statistics_pb2.FeatureNameStatistics.FLOAT
    elif arrow_util.is_binary_like(value_type):
        return statistics_pb2.FeatureNameStatistics.STRING
    elif pa.types.is_struct(value_type):
        return statistics_pb2.FeatureNameStatistics.STRUCT
    elif pa.types.is_null(value_type):
        return None

    raise TypeError('Feature {} has unsupported arrow type: {}'.format(
        feature_path, arrow_type))
Example #21
Source File: result_set.py From incubator-superset with Apache License 2.0

def convert_pa_dtype(pa_dtype: pa.DataType) -> Optional[str]:
    if pa.types.is_boolean(pa_dtype):
        return "BOOL"
    if pa.types.is_integer(pa_dtype):
        return "INT"
    if pa.types.is_floating(pa_dtype):
        return "FLOAT"
    if pa.types.is_string(pa_dtype):
        return "STRING"
    if pa.types.is_temporal(pa_dtype):
        return "DATETIME"
    return None
Example #22
Source File: validate.py From cjworkbench with GNU Affero General Public License v3.0

def __init__(self, name: str, expected: ColumnType, actual: pyarrow.DataType):
    super().__init__(
        "Table column '%s' has wrong type: expected %r, got %r"
        % (name, expected, actual)
    )
Example #23
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0

def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype)
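A few spot checks (a minimal sketch, assuming _dtype_to_arrow_type is importable):

import numpy as np

_dtype_to_arrow_type(np.dtype(np.float32))        # float32
_dtype_to_arrow_type(np.dtype("datetime64[ns]"))  # timestamp[ns]
_dtype_to_arrow_type(np.dtype(np.object_))        # string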
Example #24
Source File: base.py From fletcher with MIT License

def __init__(self, arrow_dtype: pa.DataType):
    self.arrow_dtype = arrow_dtype
Example #25
Source File: base.py From fletcher with MIT License

def _is_numeric(arrow_dtype: pa.DataType) -> bool:
    return (
        pa.types.is_integer(arrow_dtype)
        or pa.types.is_floating(arrow_dtype)
        or pa.types.is_decimal(arrow_dtype)
    )
Example #26
Source File: base.py From fletcher with MIT License

def _get_example(arrow_dtype: pa.DataType) -> pa.Array:
    if isinstance(arrow_dtype, pa.ListType):
        return pa.array(
            [None, _get_example(arrow_dtype.value_type).to_pylist()],
            type=arrow_dtype,
        )
    return _examples[arrow_dtype]
Example #27
Source File: test_pandas_extension.py From fletcher with MIT License

def is_arithmetic_type(arrow_dtype: pa.DataType) -> bool:
    """Check whether this is a type that supports arithmetic."""
    return (
        pa.types.is_integer(arrow_dtype)
        or pa.types.is_floating(arrow_dtype)
        or pa.types.is_decimal(arrow_dtype)
    )
Example #28
Source File: serializers.py From LearningApacheSpark with MIT License

def _create_batch(series, timezone):
    """
    Create an Arrow record batch from the given pandas.Series or list of
    Series, with optional type.

    :param series: A single pandas.Series, list of Series, or list of
        (series, arrow_type)
    :param timezone: A timezone to respect when handling timestamp values
    :return: Arrow RecordBatch
    """
    import decimal
    from distutils.version import LooseVersion
    import pyarrow as pa
    from pyspark.sql.types import _check_series_convert_timestamps_internal

    # Make input conform to [(series1, type1), (series2, type2), ...]
    if not isinstance(series, (list, tuple)) or \
            (len(series) == 2 and isinstance(series[1], pa.DataType)):
        series = [series]
    series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series)

    def create_array(s, t):
        mask = s.isnull()
        # Ensure timestamp series are in expected form for Spark internal
        # representation
        # TODO: maybe don't need None check anymore as of Arrow 0.9.1
        if t is not None and pa.types.is_timestamp(t):
            s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
            # TODO: need cast after Arrow conversion, ns values cause error
            # with pandas 0.19.2
            return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
        elif t is not None and pa.types.is_string(t) and sys.version < '3':
            # TODO: need decode before converting to Arrow in Python 2
            # TODO: don't need as of Arrow 0.9.1
            return pa.Array.from_pandas(
                s.apply(lambda v: v.decode("utf-8") if isinstance(v, str) else v),
                mask=mask, type=t)
        elif t is not None and pa.types.is_decimal(t) and \
                LooseVersion("0.9.0") <= LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            # TODO: see ARROW-2432. Remove when the minimum PyArrow version
            # becomes 0.10.0.
            return pa.Array.from_pandas(
                s.apply(lambda v: decimal.Decimal('NaN') if v is None else v),
                mask=mask, type=t)
        return pa.Array.from_pandas(s, mask=mask, type=t)

    arrs = [create_array(s, t) for s, t in series]
    return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))])
Example #29
Source File: arrow_util.py From data-validation with Apache License 2.0

def get_innermost_nested_type(arrow_type: pa.DataType) -> pa.DataType:
    """Returns the innermost type of a nested list type."""
    while is_list_like(arrow_type):
        arrow_type = arrow_type.value_type
    return arrow_type
Example #30
Source File: arrow_util.py From data-validation with Apache License 2.0

def is_list_like(data_type: pa.DataType) -> bool:
    """Returns true if an Arrow type is list-like."""
    return pa.types.is_list(data_type) or pa.types.is_large_list(data_type)
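Together with get_innermost_nested_type from Example #29, this lets the library peel arbitrarily nested list types down to their element type (a minimal sketch):

import pyarrow as pa

nested = pa.large_list(pa.list_(pa.int32()))
is_list_like(nested)               # True
get_innermost_nested_type(nested)  # DataType(int32)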