Python pandas.CategoricalDtype() Examples

The following are 30 code examples of pandas.CategoricalDtype(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: dataserializer.py    From mars with Apache License 2.0 6 votes vote down vote up
def mars_serialize_context():
    """
    Return the shared pyarrow serialization context, building it on first use.

    The context is cached in the module-level ``_serialize_context`` global.
    On the first call it is created from pyarrow's default context, custom
    (de)serializers are registered for ``SparseNDArray``, ``GroupByWrapper``,
    ``pd.Interval``, ``pd.Categorical`` and ``pd.CategoricalDtype``, the
    pyarrow serialization patch is applied, and the context is registered
    with vineyard when that module is available.
    """
    global _serialize_context
    if _serialize_context is not None:
        return _serialize_context

    context = pyarrow.default_serialization_context()
    # (type, type id, serializer, deserializer), registered in this order.
    registrations = (
        (SparseNDArray, 'mars.SparseNDArray',
         _serialize_sparse_nd_array, _deserialize_sparse_nd_array),
        (GroupByWrapper, 'pandas.GroupByWrapper',
         _serialize_groupby_wrapper, _deserialize_groupby_wrapper),
        (pd.Interval, 'pandas.Interval',
         _serialize_pandas_interval, _deserialize_pandas_interval),
        (pd.Categorical, 'pandas.Categorical',
         _serialze_pandas_categorical, _deserialize_pandas_categorical),
        (pd.CategoricalDtype, 'pandas.CategoricalDtype',
         _serialize_pandas_categorical_dtype, _deserialize_pandas_categorical_dtype),
    )
    for registered_type, type_id, serializer, deserializer in registrations:
        context.register_type(registered_type, type_id,
                              custom_serializer=serializer,
                              custom_deserializer=deserializer)

    _apply_pyarrow_serialization_patch(context)
    if vineyard is not None:  # pragma: no cover
        vineyard.register_vineyard_serialize_context(context)

    _serialize_context = context
    return _serialize_context
Example #2
Source File: series.py    From modin with Apache License 2.0 6 votes vote down vote up
def ravel(self, order="C"):
        """
        Return the underlying data flattened into a one-dimensional array.

        Parameters
        ----------
        order : {'C', 'F', 'A', 'K'}, optional
            Forwarded to ``flatten`` on the array produced by the query compiler.

        Returns
        -------
        numpy.ndarray or ndarray-like
            Flattened data of the Series. When the Series dtype is categorical
            the flattened values are wrapped in a ``pandas.Categorical`` with
            that same dtype.
        """
        flattened = self._query_compiler.to_numpy().flatten(order=order)
        if not isinstance(self.dtype, pandas.CategoricalDtype):
            return flattened
        # Re-wrap so the categorical dtype survives the round trip through numpy.
        return pandas.Categorical(flattened, dtype=self.dtype)
Example #3
Source File: utils.py    From mars with Apache License 2.0 6 votes vote down vote up
def build_series(series_obj, fill_value=1, size=1):
    """
    Build a small pandas Series that mirrors the dtype and index type of *series_obj*.

    An empty series is created with ``build_empty_series``, one generated
    record is inserted under a generated index label, and the result is
    repeated *size* times and cast back to ``series_obj.dtype``.

    :param series_obj: object exposing ``dtype`` and ``index_value.to_pandas()``
    :param fill_value: value handed to ``_generate_value`` for data and index labels
    :param size: number of times the one-row series is repeated
    :return: the constructed pandas Series
    """
    target_dtype = series_obj.dtype
    sample = build_empty_series(target_dtype, index=series_obj.index_value.to_pandas()[:0])
    value = _generate_value(target_dtype, fill_value)

    sample_index = sample.index
    if isinstance(sample_index, pd.MultiIndex):
        label = tuple(_generate_value(level.dtype, fill_value) for level in sample_index.levels)
        # Same key as ``sample.loc[label, ]``: a one-element tuple wrapping the label tuple.
        sample.loc[(label,)] = value
    elif isinstance(sample_index.dtype, pd.CategoricalDtype):
        # Categorical indexes get a ``None`` label instead of a generated one.
        sample.loc[None] = value
    else:
        sample.loc[_generate_value(sample_index.dtype, fill_value)] = value

    repeated = pd.concat([sample] * size)
    # make sure dtype correct for MultiIndex
    return repeated.astype(target_dtype, copy=False)
Example #4
Source File: _pandas_loaders.py    From pymapd with Apache License 2.0 6 votes vote down vote up
def get_mapd_type_from_known(dtype):
    """
    Translate a dtype into the matching MapD type name, for cases where the
    pandas type system matches.

    Booleans map to ``'BOOL'``; integers map by item size to ``'TINYINT'``,
    ``'SMALLINT'``, ``'INT'`` or ``'BIGINT'``; floats map to ``'FLOAT'``
    (item size up to 4) or ``'DOUBLE'``; any datetime64 maps to
    ``'TIMESTAMP'``; categoricals map to ``'STR'``.

    Raises
    ------
    TypeError
        If the dtype falls in none of the categories above.
    """
    if is_bool_dtype(dtype):
        return 'BOOL'

    if is_integer_dtype(dtype):
        width = dtype.itemsize
        if width <= 1:
            return 'TINYINT'
        # Any width other than 2 or 4 bytes falls through to BIGINT.
        return {2: 'SMALLINT', 4: 'INT'}.get(width, 'BIGINT')

    if is_float_dtype(dtype):
        return 'FLOAT' if dtype.itemsize <= 4 else 'DOUBLE'

    if is_datetime64_any_dtype(dtype):
        return 'TIMESTAMP'

    if isinstance(dtype, pd.CategoricalDtype):
        return 'STR'

    raise TypeError("Unhandled type {}".format(dtype))
Example #5
Source File: filtering_fe_autotype.py    From dash-docs with MIT License 6 votes vote down vote up
def table_type(df_column):
    """
    Pick a table column type name from the dtype of a pandas column.

    :param df_column: object exposing a pandas ``dtype`` (e.g. a Series)
    :return: one of the strings ``'datetime'``, ``'text'``, ``'numeric'``
        or ``'any'``
    """
    # Note - this only works with Pandas >= 1.0.0

    if sys.version_info < (3, 0):  # Pandas 1.0.0 does not support Python 2
        return 'any'

    dtype = df_column.dtype

    if isinstance(dtype, pd.DatetimeTZDtype):
        # A trailing comma here previously returned the tuple ('datetime',)
        # instead of the string every other branch returns.
        return 'datetime'

    text_dtypes = (pd.StringDtype, pd.BooleanDtype,
                   pd.CategoricalDtype, pd.PeriodDtype)
    if isinstance(dtype, text_dtypes):
        return 'text'

    numeric_dtypes = (pd.SparseDtype, pd.IntervalDtype,
                      pd.Int8Dtype, pd.Int16Dtype,
                      pd.Int32Dtype, pd.Int64Dtype)
    if isinstance(dtype, numeric_dtypes):
        return 'numeric'

    return 'any'
Example #6
Source File: rewrites.py    From sdc with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def check_dtype_is_categorical(self, expr, func_ir, block, typemap, calltypes):
    """
    Tell whether the call expression passes a categorical ``dtype`` keyword.

    The ``dtype`` keyword counts as categorical when its definition is the
    constant ``'category'`` or a call whose callee is inferred to be
    ``pd.CategoricalDtype``. Returns ``False`` when no ``dtype`` keyword
    is present.
    """
    # The last 'dtype' keyword wins if it appears more than once.
    dtype_candidates = [kw_value for kw_name, kw_value in expr.kws if kw_name == 'dtype']
    dtype_var = dtype_candidates[-1] if dtype_candidates else None
    if not dtype_var:
        return False

    definition = guard(get_definition, func_ir, dtype_var)
    is_alias = isinstance(definition, ir.Const) and definition.value == 'category'
    is_categoricaldtype = (hasattr(definition, 'func') and
                           func_ir.infer_constant(definition.func) == pd.CategoricalDtype)

    if is_alias or is_categoricaldtype:
        return True
    return False
Example #7
Source File: pandas_support.py    From sdc with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def from_dtype(pdtype):
    """
    Convert a Pandas *dtype* into the corresponding Numba type instance.

    Only ``pd.CategoricalDtype`` is handled; its categories (as a list, or
    ``None`` when unset) and ``ordered`` flag are carried over into a
    ``CategoricalDtypeType``. Any other dtype raises NotImplementedError.
    """
    # TODO: use issubclass
    if not isinstance(pdtype, pd.CategoricalDtype):
        raise NotImplementedError("%r cannot be represented as a Numba type"
                                  % (pdtype,))

    categories = None if pdtype.categories is None else list(pdtype.categories)
    return CategoricalDtypeType(categories=categories,
                                ordered=pdtype.ordered)
Example #8
Source File: file_reader.py    From modin with Apache License 2.0 6 votes vote down vote up
def read(cls, *args, **kwargs):
        """
        Read data through ``cls._read`` and re-apply categorical dtypes.

        All arguments are forwarded to ``cls._read``. When the resulting query
        compiler exposes ``dtypes`` and at least one of them is categorical,
        the compiler is returned after an ``astype`` call restricted to the
        categorical columns; otherwise it is returned unchanged.

        Raises NotImplementedError when the partition format is not "pandas".
        """
        query_compiler = cls._read(*args, **kwargs)
        # TODO (devin-petersohn): Make this section more general for non-pandas kernel
        # implementations.
        if partition_format.get().lower() != "pandas":
            raise NotImplementedError("FIXME")
        import pandas

        def _is_categorical(dtype):
            return isinstance(dtype, pandas.CategoricalDtype)

        has_categorical = hasattr(query_compiler, "dtypes") and any(
            _is_categorical(dtype) for dtype in query_compiler.dtypes
        )
        if not has_categorical:
            return query_compiler

        dtypes = query_compiler.dtypes
        categorical_columns = {
            column: dtypes[column]
            for column in dtypes.index
            if _is_categorical(dtypes[column])
        }
        return query_compiler.astype(categorical_columns)
Example #9
Source File: csv_ext.py    From sdc with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def _get_dtype_str(t):
    """
    Return the string used to spell the type *t* in generated code.

    Categorical types use their own ``str()`` form with ``ordered=None``
    rewritten to ``ordered=False``; the string array type maps to
    ``'string_array_type'`` (and is attached to ``types`` as a side effect);
    everything else becomes ``'<dtype>[::1]'``.
    """
    element_type = t.dtype

    if isinstance(t, Categorical):
        # return categorical representation
        # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
        # ordered=False in case when dtype is with ordered=None
        type_repr = str(t)
        return type_repr.replace('ordered=None', 'ordered=False')

    if element_type == types.NPDatetime('ns'):
        element_type = 'NPDatetime("ns")'

    if t == string_array_type:
        # HACK: add string_array_type to numba.types
        # FIXME: fix after Numba #3372 is resolved
        types.string_array_type = string_array_type
        return 'string_array_type'

    return '{}[::1]'.format(element_type)
Example #10
Source File: test_categoricaldtype.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_constructor_no_order(self):
        """A jitted constructor call without ``ordered`` equals the unordered reference dtype."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype(categories=('b', 'a'))

        result = make_dtype()
        expected = self._pd_dtype(ordered=False)
        assert result == expected
Example #11
Source File: test_categoricaldtype.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _pd_dtype(self, ordered=True):
        """Build the reference ``pd.CategoricalDtype`` with categories ``['b', 'a']``."""
        categories = ['b', 'a']
        return pd.CategoricalDtype(categories=categories, ordered=ordered)
Example #12
Source File: test_categoricaldtype.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_constructor_no_categories(self):
        """A jitted argument-less constructor matches ``pd.CategoricalDtype(ordered=None)``."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype()

        result = make_dtype()
        reference = pd.CategoricalDtype(ordered=None)
        assert result == reference
        assert result.categories == reference.categories
        assert result.ordered == reference.ordered
Example #13
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_unused_categories(self):
        """Validation raises ValueError when a declared category never appears in the data."""
        with self.assertRaisesRegex(ValueError, "unused category 'b'"):
            dtype = pd.CategoricalDtype(["a", "b"])
            frame = pd.DataFrame({"foo": ["a", "a"]}, dtype=dtype)
            validate_dataframe(frame)
Example #14
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_null_is_not_a_category(self):
        """A null value does not count as a use of any category."""
        # pd.CategoricalDtype means storing nulls as -1. Don't consider -1 when
        # counting the used categories.
        with self.assertRaisesRegex(ValueError, "unused category 'b'"):
            dtype = pd.CategoricalDtype(["a", "b"])
            frame = pd.DataFrame({"foo": ["a", None]}, dtype=dtype)
            validate_dataframe(frame)
Example #15
Source File: test_parquet.py    From kartothek with MIT License 5 votes vote down vote up
def test_read_categorical(store):
    """A stored categorical column restores as object by default and as category on request."""
    original = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})

    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", original)

    restored_plain = serialiser.restore_dataframe(store, key)
    assert restored_plain.dtypes["col"] == "O"

    restored_categorical = serialiser.restore_dataframe(store, key, categories=["col"])
    expected_dtype = pd.CategoricalDtype(["a"], ordered=False)
    assert restored_categorical.dtypes["col"] == expected_dtype
Example #16
Source File: test_parquet.py    From kartothek with MIT License 5 votes vote down vote up
def test_read_categorical_empty(store):
    """An empty categorical column restores with an empty, unordered category set."""
    populated = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})
    empty = populated.iloc[:0]

    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", empty)

    restored_plain = serialiser.restore_dataframe(store, key)
    assert restored_plain.dtypes["col"] == "O"

    restored_categorical = serialiser.restore_dataframe(store, key, categories=["col"])
    expected_dtype = pd.CategoricalDtype([], ordered=False)
    assert restored_categorical.dtypes["col"] == expected_dtype
Example #17
Source File: label.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def fit(self, y: Union[ArrayLike, SeriesType]) -> "LabelEncoder":
        """
        Fit the encoder on *y*.

        Dask arrays have their classes encoded lazily and then computed;
        categorical inputs take their classes from ``_encode_categorical`` and
        remember the input dtype in ``dtype_``; every other input is handed to
        the parent class's ``fit``. ``dtype_`` is ``None`` unless the input
        was categorical.
        """
        y = self._check_array(y)

        if isinstance(y, da.Array):
            lazy_classes = _encode_dask_array(y)
            self.classes_ = lazy_classes.compute()
            self.dtype_: Optional[pd.CategoricalDtype] = None
            return self

        if _is_categorical(y):
            self.classes_ = _encode_categorical(y)
            self.dtype_ = y.dtype
            return self

        self.dtype_ = None
        return super(LabelEncoder, self).fit(y)
Example #18
Source File: parsers.py    From modin with Apache License 2.0 5 votes vote down vote up
def find_common_type_cat(types):
    """
    Find a common dtype for *types*, with special handling for categoricals.

    Parameters
    ----------
    types : sequence of dtypes
        Iterated more than once, so it must be re-iterable (e.g. a list).

    Returns
    -------
    dtype
        * If every entry is an ordered ``pandas.CategoricalDtype``: an ordered
          categorical dtype over the sorted union of all categories.
        * If every entry is categorical but not all are ordered: the dtype
          produced by ``union_categoricals`` over empty categoricals of each
          dtype, without sorting the categories.
        * Otherwise: whatever ``find_common_type`` returns.
    """
    if not all(isinstance(t, pandas.CategoricalDtype) for t in types):
        return find_common_type(types)

    if all(t.ordered for t in types):
        # np.unique returns the distinct values already sorted. The previous
        # code indexed its result with [0], which kept only the first value
        # and handed a scalar to np.sort instead of the full category union.
        merged_categories = np.unique([c for t in types for c in t.categories])
        return pandas.CategoricalDtype(merged_categories, ordered=True)

    return union_categoricals(
        [pandas.Categorical([], dtype=t) for t in types],
        # Not every dtype is ordered on this path, so the former
        # ``all(t.ordered ...)`` argument always evaluated to False.
        sort_categories=False,
    ).dtype
Example #19
Source File: geometric_data.py    From gempy with GNU Lesser General Public License v3.0 5 votes vote down vote up
def map_data_from_series(self, series, attribute: str, idx=None):
        """
        Map columns from the :class:`Series` data frame to a :class:`GeometricData` data frame.

        Each row's value in the ``'series'`` column of ``self.df`` is looked up
        in ``series.df[attribute]`` and the result is written to
        ``self.df[attribute]`` for the selected rows.

        Args:
            series (:class:`Series`): object whose ``df`` holds the column to map from.
            attribute (str): column to be mapped from the :class:`Series` to the :class:`GeometricData`.
                The columns ``'id'`` and ``'order_series'`` are cast to ``int``.
            idx (Optional[Union[int, list[int]]]): If passed, list of indices of the
                :class:`GeometricData` that will be mapped. Defaults to every row.

        Returns:
            :class:GeometricData
        """
        if idx is None:
            idx = self.df.index

        idx = np.atleast_1d(idx)
        mapped = self.df['series'].map(series.df[attribute])
        if attribute in ['id', 'order_series']:
            mapped = mapped.astype(int)
        self.df.loc[idx, attribute] = mapped

        if isinstance(self.df['order_series'].dtype, pn.CategoricalDtype):
            # Assign the result back instead of passing ``inplace=True``: that
            # keyword was deprecated and then removed from pandas, and the
            # returning form works on every version.
            self.df['order_series'] = self.df['order_series'].cat.remove_unused_categories()
        return self
Example #20
Source File: utils.py    From mixed-anomaly with Apache License 2.0 5 votes vote down vote up
def is_column_categorical(column: pd.Series) -> bool:
    """Return True when the column has a categorical dtype or the NumPy object dtype."""
    dtype = column.dtype
    if isinstance(dtype, pd.CategoricalDtype):
        return True
    return dtype == np.object_
Example #21
Source File: test_categoricaldtype.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_constructor_categories_set(self):
        """A jitted constructor given a set of categories equals the ordered reference dtype."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype(categories={'b', 'a'}, ordered=True)

        result = make_dtype()
        expected = self._pd_dtype()
        assert result == expected
Example #22
Source File: test_categoricaldtype.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_constructor_categories_list(self):
        """A jitted constructor given a list of categories equals the ordered reference dtype."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype(categories=['b', 'a'], ordered=True)

        result = make_dtype()
        expected = self._pd_dtype()
        assert result == expected
Example #23
Source File: test_series_category.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_constructor_CategoricalDtype_list(self):
        """A jitted Series built from lists with a CategoricalDtype equals the reference value."""
        @nb.njit
        def make_series():
            return pd.Series(data=[1, 2, 3, 2, 1], dtype=pd.CategoricalDtype(categories=[1, 2, 3]))

        result = make_series()
        assert result.equals(self._pd_value())
Example #24
Source File: test_series_category.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def test_constructor_CategoricalDtype(self):
        """A jitted Series built from tuples with a CategoricalDtype equals the reference value."""
        @nb.njit
        def make_series():
            return pd.Series(data=(1, 2, 3, 2, 1), dtype=pd.CategoricalDtype(categories=(1, 2, 3)))

        result = make_series()
        assert result.equals(self._pd_value())
Example #25
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def pd_csv_cat2(self, use_pyarrow=False):
        """
        Build a zero-argument function that reads ``csv_data_cat1.csv``.

        The returned function reads columns ``C1``/``C2``/``C3`` with the
        reader from ``self._read_csv(use_pyarrow)``: ``C1`` uses the integer
        type from ``self._int_type()``, ``C2`` is a categorical over
        ``A``-``D`` and ``C3`` is ``str``.
        """
        read_csv = self._read_csv(use_pyarrow)
        int_type = self._int_type()

        def test_impl():
            ct_dtype = CategoricalDtype(['A', 'B', 'C', 'D'])
            column_names = ['C1', 'C2', 'C3']
            column_dtypes = {'C1': int_type, 'C2': ct_dtype, 'C3': str}
            return read_csv("csv_data_cat1.csv", names=column_names, dtype=column_dtypes)

        return test_impl
Example #26
Source File: test_io.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def pd_csv_cat1(self, use_pyarrow=False):
        """
        Build a zero-argument function that reads ``csv_data_cat1.csv``.

        The returned function reads columns ``C1``/``C2``/``C3`` with the
        reader from ``self._read_csv(use_pyarrow)``: ``C1`` is ``int``,
        ``C2`` is a categorical over ``A``-``C`` and ``C3`` is ``str``.
        """
        read_csv = self._read_csv(use_pyarrow)

        def test_impl():
            names = ['C1', 'C2', 'C3']
            ct_dtype = CategoricalDtype(['A', 'B', 'C'])
            # ``np.int`` was only an alias for the builtin ``int`` and has been
            # removed from NumPy; the builtin is the same object on versions
            # that still had the alias.
            dtypes = {'C1': int, 'C2': ct_dtype, 'C3': str}
            df = read_csv("csv_data_cat1.csv", names=names, dtype=dtypes)
            return df

        return test_impl
Example #27
Source File: pdimpl.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _CategoricalDtype_intrinsic(typingctx, categories, ordered):
    """
    Create a CategoricalDtype object at the typing level.

    Assertions:
        categories - Tuple of literal values or None
        ordered - literal Bool

    Returns the call signature together with the code generator.
    """
    if isinstance(categories, types.NoneType):
        literal_categories = None
    if isinstance(categories, types.Tuple):
        literal_categories = [member.literal_value for member in categories]

    if isinstance(ordered, types.NoneType):
        literal_ordered = None
    if isinstance(ordered, types.Literal):
        literal_ordered = ordered.literal_value

    dtype_type = CategoricalDtypeType(literal_categories, literal_ordered)
    signature_for_call = dtype_type(categories, ordered)

    def codegen(context, builder, signature, args):
        # All CategoricalDtype objects are dummy values in LLVM.
        # They only exist in the type level.
        return context.get_dummy_value()

    return signature_for_call, codegen


# TODO: move to tools 
Example #28
Source File: pdimpl.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _CategoricalDtype(categories=None, ordered=None):
    """
    Implementation of constructor for pandas CategoricalDtype.

    When ``ordered`` is a literal type its literal value is captured;
    otherwise ``ordered`` is captured as given. The returned implementation
    forwards ``categories`` and that captured value to the intrinsic.
    """
    ordered_const = ordered.literal_value if isinstance(ordered, types.Literal) else ordered

    def impl(categories=None, ordered=None):
        return _CategoricalDtype_intrinsic(categories, ordered_const)

    return impl
Example #29
Source File: pandas_support.py    From sdc with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def as_dtype(nbtype):
    """
    Convert a Numba type into the corresponding Pandas *dtype* instance.

    The type is unliteral-ed first. Only ``CategoricalDtypeType`` is handled;
    NotImplementedError is raised if no correspondence is known.
    """
    plain_type = types.unliteral(nbtype)
    if not isinstance(plain_type, CategoricalDtypeType):
        raise NotImplementedError("%r cannot be represented as a Pandas dtype"
                                  % (plain_type,))

    return pd.CategoricalDtype(categories=plain_type.categories,
                               ordered=plain_type.ordered)
Example #30
Source File: dataserializer.py    From mars with Apache License 2.0 5 votes vote down vote up
def _deserialize_pandas_categorical_dtype(data):
    """Rebuild a ``pd.CategoricalDtype`` from ``data``: item 0 is the categories, item 1 the ordered flag."""
    categories = data[0]
    ordered = data[1]
    return pd.CategoricalDtype(categories, ordered)