Python Examples of pandas.CategoricalDtype

Source File: dataserializer.py From mars with Apache License 2.0

6 votes

def mars_serialize_context():
    """
    Return the shared pyarrow serialization context, creating it on first use.

    The first call builds a default pyarrow context, registers the custom
    serializer/deserializer pairs listed below, applies the pyarrow
    serialization patch, lets vineyard register itself when it is available,
    and stores the result in the module-level ``_serialize_context``.
    Subsequent calls return that cached object.
    """
    global _serialize_context
    if _serialize_context is not None:
        return _serialize_context

    context = pyarrow.default_serialization_context()
    # (python type, registered type id, serializer, deserializer),
    # listed in registration order.
    registrations = (
        (SparseNDArray, 'mars.SparseNDArray',
         _serialize_sparse_nd_array, _deserialize_sparse_nd_array),
        (GroupByWrapper, 'pandas.GroupByWrapper',
         _serialize_groupby_wrapper, _deserialize_groupby_wrapper),
        (pd.Interval, 'pandas.Interval',
         _serialize_pandas_interval, _deserialize_pandas_interval),
        (pd.Categorical, 'pandas.Categorical',
         _serialze_pandas_categorical, _deserialize_pandas_categorical),
        (pd.CategoricalDtype, 'pandas.CategoricalDtype',
         _serialize_pandas_categorical_dtype, _deserialize_pandas_categorical_dtype),
    )
    for type_, type_id, dump, load in registrations:
        context.register_type(type_, type_id,
                              custom_serializer=dump,
                              custom_deserializer=load)

    _apply_pyarrow_serialization_patch(context)
    if vineyard is not None:  # pragma: no cover
        vineyard.register_vineyard_serialize_context(context)

    _serialize_context = context
    return _serialize_context

Source File: series.py From modin with Apache License 2.0

6 votes

def ravel(self, order="C"):
        """
        Return the underlying data flattened into a one-dimensional array.

        Parameters
        ----------
        order : {'C', 'F', 'A', 'K'}, optional
            Forwarded to ``flatten`` on the array obtained from the query
            compiler.

        Returns
        -------
        numpy.ndarray or ndarray-like
            The flattened data. When this object's dtype is categorical the
            values are wrapped in a ``pandas.Categorical`` of that dtype.
        """
        flat = self._query_compiler.to_numpy().flatten(order=order)
        if not isinstance(self.dtype, pandas.CategoricalDtype):
            return flat
        return pandas.Categorical(flat, dtype=self.dtype)

Source File: utils.py From mars with Apache License 2.0

6 votes

def build_series(series_obj, fill_value=1, size=1):
    """
    Build a small pandas Series that mimics ``series_obj``.

    An empty series with the dtype and (empty) index of ``series_obj`` is
    created, one generated record is inserted, the result is repeated
    ``size`` times and finally cast back to the dtype of ``series_obj``.

    :param series_obj: object exposing ``dtype`` and ``index_value.to_pandas()``
    :param fill_value: value handed to ``_generate_value`` for data and index
    :param size: number of times the single-record series is repeated
    :return: the constructed pandas Series
    """
    target_dtype = series_obj.dtype
    result = build_empty_series(target_dtype, index=series_obj.index_value.to_pandas()[:0])
    value = _generate_value(target_dtype, fill_value)

    idx = result.index
    if isinstance(idx, pd.MultiIndex):
        # one generated label per index level
        key = tuple(_generate_value(level.dtype, fill_value) for level in idx.levels)
        result.loc[key, ] = value
    elif isinstance(idx.dtype, pd.CategoricalDtype):
        # categorical indexes get a ``None`` label instead of a generated one
        result.loc[None] = value
    else:
        result.loc[_generate_value(idx.dtype, fill_value)] = value

    result = pd.concat([result] * size)
    # make sure dtype correct for MultiIndex
    return result.astype(target_dtype, copy=False)

Source File: _pandas_loaders.py From pymapd with Apache License 2.0

6 votes

def get_mapd_type_from_known(dtype):
    """
    Return the MapD type name for a dtype the pandas type system can classify.

    Integer and float dtypes are split by ``dtype.itemsize``; boolean,
    datetime64 and categorical dtypes each map to a single name.

    Raises
    ------
    TypeError
        If ``dtype`` falls into none of the handled families.
    """
    if is_bool_dtype(dtype):
        return 'BOOL'

    if is_integer_dtype(dtype):
        width = dtype.itemsize
        if width <= 1:
            return 'TINYINT'
        # anything that is not 2 or 4 bytes wide is treated as BIGINT
        return {2: 'SMALLINT', 4: 'INT'}.get(width, 'BIGINT')

    if is_float_dtype(dtype):
        return 'FLOAT' if dtype.itemsize <= 4 else 'DOUBLE'

    if is_datetime64_any_dtype(dtype):
        return 'TIMESTAMP'

    if isinstance(dtype, pd.CategoricalDtype):
        return 'STR'

    raise TypeError("Unhandled type {}".format(dtype))

Source File: filtering_fe_autotype.py From dash-docs with MIT License

6 votes

def table_type(df_column):
    """
    Return a column-type name for a pandas column based on its dtype.

    Parameters
    ----------
    df_column : object with a ``dtype`` attribute (e.g. a pandas Series)

    Returns
    -------
    str
        ``'datetime'`` for timezone-aware datetimes; ``'text'`` for string,
        boolean, categorical and period dtypes; ``'numeric'`` for sparse,
        interval and nullable-integer dtypes; ``'any'`` otherwise (and always
        on Python 2).
    """
    # Note - this only works with Pandas >= 1.0.0

    if sys.version_info < (3, 0):  # Pandas 1.0.0 does not support Python 2
        return 'any'

    dtype = df_column.dtype

    if isinstance(dtype, pd.DatetimeTZDtype):
        # Must be a plain string: a trailing comma here would turn the
        # return value into the one-element tuple ('datetime',).
        return 'datetime'

    text_dtypes = (pd.StringDtype, pd.BooleanDtype,
                   pd.CategoricalDtype, pd.PeriodDtype)
    if isinstance(dtype, text_dtypes):
        return 'text'

    numeric_dtypes = (pd.SparseDtype, pd.IntervalDtype,
                      pd.Int8Dtype, pd.Int16Dtype,
                      pd.Int32Dtype, pd.Int64Dtype)
    if isinstance(dtype, numeric_dtypes):
        return 'numeric'

    return 'any'

Source File: rewrites.py From sdc with BSD 2-Clause "Simplified" License

6 votes

def check_dtype_is_categorical(self, expr, func_ir, block, typemap, calltypes):
    """
    Tell whether the call expression ``expr`` passes a categorical ``dtype``.

    The ``dtype`` keyword of ``expr`` counts as categorical when its
    definition is either the constant ``'category'`` or a call whose callee
    is inferred to be ``pd.CategoricalDtype``. Returns ``False`` when no
    (truthy) ``dtype`` keyword is present.
    """
    dtype_arg = None
    for kw_name, kw_value in expr.kws:
        if kw_name == 'dtype':
            # no break: if the keyword appears more than once the last wins
            dtype_arg = kw_value
    if not dtype_arg:
        return False

    definition = guard(get_definition, func_ir, dtype_arg)
    given_as_string = isinstance(definition, ir.Const) and definition.value == 'category'
    given_as_constructor = (hasattr(definition, 'func') and
                            func_ir.infer_constant(definition.func) == pd.CategoricalDtype)

    return True if (given_as_string or given_as_constructor) else False

Source File: pandas_support.py From sdc with BSD 2-Clause "Simplified" License

6 votes

def from_dtype(pdtype):
    """
    Convert a Pandas *dtype* into the corresponding Numba Type instance.

    Only ``pd.CategoricalDtype`` is handled; its categories (as a list, or
    ``None`` when unset) and its ``ordered`` flag are carried over.
    NotImplementedError is raised for every other Pandas dtype.
    """
    # TODO: use issubclass
    if not isinstance(pdtype, pd.CategoricalDtype):
        raise NotImplementedError("%r cannot be represented as a Numba type"
                                  % (pdtype,))

    categories = pdtype.categories
    if categories is not None:
        categories = list(categories)
    return CategoricalDtypeType(categories=categories,
                                ordered=pdtype.ordered)

Source File: file_reader.py From modin with Apache License 2.0

6 votes

def read(cls, *args, **kwargs):
        """
        Read data through ``cls._read`` and re-apply categorical dtypes.

        All arguments are forwarded to ``cls._read``. If the resulting query
        compiler exposes ``dtypes`` and at least one of them is a
        ``pandas.CategoricalDtype``, the categorical columns are cast to their
        own dtype via ``astype``; otherwise the query compiler is returned
        unchanged.

        Raises NotImplementedError when the partition format is not "pandas".
        """
        query_compiler = cls._read(*args, **kwargs)
        # TODO (devin-petersohn): Make this section more general for non-pandas kernel
        # implementations.
        if partition_format.get().lower() != "pandas":
            raise NotImplementedError("FIXME")
        import pandas

        def _is_categorical(dtype):
            return isinstance(dtype, pandas.CategoricalDtype)

        has_categorical = hasattr(query_compiler, "dtypes") and any(
            _is_categorical(t) for t in query_compiler.dtypes
        )
        if not has_categorical:
            return query_compiler

        dtypes = query_compiler.dtypes
        categorical_columns = {}
        for column in dtypes.index:
            column_dtype = dtypes[column]
            if _is_categorical(column_dtype):
                categorical_columns[column] = column_dtype
        return query_compiler.astype(categorical_columns)

Source File: csv_ext.py From sdc with BSD 2-Clause "Simplified" License

6 votes

def _get_dtype_str(t):
    """
    Return the textual representation used for the type ``t``.

    Categorical types yield ``str(t)`` with ``ordered=None`` rewritten to
    ``ordered=False``; ``string_array_type`` yields ``'string_array_type'``;
    everything else yields ``'<dtype>[::1]'``.
    """
    element_dtype = t.dtype

    if isinstance(t, Categorical):
        # return categorical representation
        # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
        # ordered=False in case when dtype is with ordered=None
        return str(t).replace('ordered=None', 'ordered=False')

    is_ns_datetime = element_dtype == types.NPDatetime('ns')
    dtype_repr = 'NPDatetime("ns")' if is_ns_datetime else element_dtype

    if t == string_array_type:
        # HACK: add string_array_type to numba.types
        # FIXME: fix after Numba #3372 is resolved
        types.string_array_type = string_array_type
        return 'string_array_type'

    return '{}[::1]'.format(dtype_repr)

Source File: test_categoricaldtype.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def test_constructor_no_order(self):
        """A dtype built in jitted code without ``ordered`` equals the ``ordered=False`` reference."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype(categories=('b', 'a'))

        result = make_dtype()
        reference = self._pd_dtype(ordered=False)
        assert result == reference

Source File: test_categoricaldtype.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def _pd_dtype(self, ordered=True):
        """Return the reference ``pd.CategoricalDtype`` with categories ``['b', 'a']``."""
        reference_categories = ['b', 'a']
        return pd.CategoricalDtype(categories=reference_categories, ordered=ordered)

Source File: test_categoricaldtype.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def test_constructor_no_categories(self):
        """A dtype built in jitted code with no arguments matches ``pd.CategoricalDtype(ordered=None)``."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype()

        result = make_dtype()
        reference = pd.CategoricalDtype(ordered=None)
        # compare the objects as a whole, then each attribute separately
        assert result == reference
        assert result.categories == reference.categories
        assert result.ordered == reference.ordered

Source File: test_validate.py From cjworkbench with GNU Affero General Public License v3.0

5 votes

def test_unused_categories(self):
        """``validate_dataframe`` must reject a category that no row uses."""
        with self.assertRaisesRegex(ValueError, "unused category 'b'"):
            dtype = pd.CategoricalDtype(["a", "b"])
            frame = pd.DataFrame({"foo": ["a", "a"]}, dtype=dtype)
            validate_dataframe(frame)

Source File: test_validate.py From cjworkbench with GNU Affero General Public License v3.0

5 votes

def test_null_is_not_a_category(self):
        """A null value must not make the otherwise unused category 'b' count as used."""
        # pd.CategoricalDtype means storing nulls as -1. Don't consider -1 when
        # counting the used categories.
        with self.assertRaisesRegex(ValueError, "unused category 'b'"):
            dtype = pd.CategoricalDtype(["a", "b"])
            frame = pd.DataFrame({"foo": ["a", None]}, dtype=dtype)
            validate_dataframe(frame)

Source File: test_parquet.py From kartothek with MIT License

5 votes

def test_read_categorical(store):
    """Restoring a stored categorical column yields object dtype unless ``categories`` asks for it."""
    original = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})

    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", original)

    # default restore: the column comes back with object dtype
    plain = serialiser.restore_dataframe(store, key)
    assert plain.dtypes["col"] == "O"

    # explicit request: the column comes back as an unordered categorical
    as_category = serialiser.restore_dataframe(store, key, categories=["col"])
    expected_dtype = pd.CategoricalDtype(["a"], ordered=False)
    assert as_category.dtypes["col"] == expected_dtype

Source File: test_parquet.py From kartothek with MIT License

5 votes

def test_read_categorical_empty(store):
    """Same round trip as the non-empty case, but for a zero-row categorical frame."""
    with_one_row = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})
    empty = with_one_row.iloc[:0]

    serialiser = ParquetSerializer()
    key = serialiser.store(store, "prefix", empty)

    # default restore: object dtype
    plain = serialiser.restore_dataframe(store, key)
    assert plain.dtypes["col"] == "O"

    # explicit request: categorical dtype with no categories
    as_category = serialiser.restore_dataframe(store, key, categories=["col"])
    expected_dtype = pd.CategoricalDtype([], ordered=False)
    assert as_category.dtypes["col"] == expected_dtype

Source File: label.py From dask-ml with BSD 3-Clause "New" or "Revised" License

5 votes

def fit(self, y: Union[ArrayLike, SeriesType]) -> "LabelEncoder":
        """
        Fit the encoder on ``y``.

        After ``self._check_array(y)``:

        * dask arrays: ``classes_`` is the computed result of
          ``_encode_dask_array`` and ``dtype_`` is ``None``;
        * categorical input: ``classes_`` comes from ``_encode_categorical``
          and ``dtype_`` is the dtype of ``y``;
        * anything else: ``dtype_`` is ``None`` and fitting is delegated to
          the parent class, whose return value is returned.
        """
        y = self._check_array(y)

        if isinstance(y, da.Array):
            self.classes_ = _encode_dask_array(y).compute()
            self.dtype_: Optional[pd.CategoricalDtype] = None
            return self

        if _is_categorical(y):
            self.classes_ = _encode_categorical(y)
            self.dtype_ = y.dtype
            return self

        self.dtype_ = None
        return super(LabelEncoder, self).fit(y)

Source File: parsers.py From modin with Apache License 2.0

5 votes

def find_common_type_cat(types):
    """
    Find a common dtype for ``types`` with dedicated handling of categoricals.

    Parameters
    ----------
    types : iterable of dtypes
        Materialized into a list first, so a one-shot iterator is accepted.

    Returns
    -------
    dtype
        * If every entry is an *ordered* ``pandas.CategoricalDtype``: an
          ordered categorical dtype whose categories are the sorted union of
          all input categories.
        * If every entry is categorical but not all are ordered: the dtype
          produced by ``union_categoricals`` over empty categoricals of the
          given dtypes (categories left unsorted).
        * Otherwise: the result of ``find_common_type(types)``.
    """
    types = list(types)

    if not all(isinstance(t, pandas.CategoricalDtype) for t in types):
        return find_common_type(types)

    if all(t.ordered for t in types):
        # np.unique already returns the distinct values in sorted order.
        # The whole array is needed here: indexing it with [0] would keep a
        # single category and make a following np.sort fail on a scalar.
        merged = np.unique([c for t in types for c in t.categories])
        return pandas.CategoricalDtype(merged, ordered=True)

    # Reaching this point means not every dtype is ordered, so the union's
    # categories are never sorted.
    return union_categoricals(
        [pandas.Categorical([], dtype=t) for t in types],
        sort_categories=False,
    ).dtype

Source File: geometric_data.py From gempy with GNU Lesser General Public License v3.0

5 votes

def map_data_from_series(self, series, attribute: str, idx=None):
        """
        Map columns from the :class:`Series` data frame to a :class:`GeometricData` data frame.

        The values of ``self.df['series']`` are looked up in
        ``series.df[attribute]`` and the result is written to
        ``self.df[attribute]`` for the rows selected by ``idx``. For the
        attributes ``'id'`` and ``'order_series'`` the mapped values are cast
        to ``int`` first. Afterwards, unused categories are dropped from
        ``self.df['order_series']`` when that column is categorical.

        Args:
            series (:class:`Series`): [s0]
            attribute (str): column to be mapped from the :class:`Series` to the :class:`GeometricData`.
            idx (Optional[int, list[int]]): If passed, list of indices of the :class:`GeometricData` that will be mapped.

        Returns:
            :class:`GeometricData`: this object, to allow chaining.
        """
        if idx is None:
            idx = self.df.index
        idx = np.atleast_1d(idx)

        mapped = self.df['series'].map(series.df[attribute])
        if attribute in ['id', 'order_series']:
            mapped = mapped.astype(int)
        self.df.loc[idx, attribute] = mapped

        if isinstance(self.df['order_series'].dtype, pn.CategoricalDtype):
            # Assign the result back instead of passing ``inplace=True``:
            # recent pandas releases no longer accept that argument, and the
            # reassignment works on every pandas version.
            self.df['order_series'] = self.df['order_series'].cat.remove_unused_categories()
        return self

Source File: utils.py From mixed-anomaly with Apache License 2.0

5 votes

def is_column_categorical(column: pd.Series) -> bool:
    """Return whether ``column`` has a categorical dtype or the NumPy object dtype."""
    dtype = column.dtype
    if isinstance(dtype, pd.CategoricalDtype):
        return True
    return dtype == np.object_

Source File: test_categoricaldtype.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def test_constructor_categories_set(self):
        """Categories given as a set literal in jitted code must equal the reference dtype."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype(categories={'b', 'a'}, ordered=True)

        result = make_dtype()
        reference = self._pd_dtype()
        assert result == reference

Source File: test_categoricaldtype.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def test_constructor_categories_list(self):
        """Categories given as a list literal in jitted code must equal the reference dtype."""
        @nb.njit
        def make_dtype():
            return pd.CategoricalDtype(categories=['b', 'a'], ordered=True)

        result = make_dtype()
        reference = self._pd_dtype()
        assert result == reference

Source File: test_series_category.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def test_constructor_CategoricalDtype_list(self):
        """A Series built in jitted code from lists with an explicit CategoricalDtype equals the reference value."""
        @nb.njit
        def make_series():
            return pd.Series(data=[1, 2, 3, 2, 1], dtype=pd.CategoricalDtype(categories=[1, 2, 3]))

        result = make_series()
        reference = self._pd_value()
        assert result.equals(reference)

Source File: test_series_category.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def test_constructor_CategoricalDtype(self):
        """A Series built in jitted code from tuples with an explicit CategoricalDtype equals the reference value."""
        @nb.njit
        def make_series():
            return pd.Series(data=(1, 2, 3, 2, 1), dtype=pd.CategoricalDtype(categories=(1, 2, 3)))

        result = make_series()
        reference = self._pd_value()
        assert result.equals(reference)

Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def pd_csv_cat2(self, use_pyarrow=False):
        """
        Build a zero-argument function that reads ``csv_data_cat1.csv``.

        The returned function calls the reader obtained from
        ``self._read_csv(use_pyarrow)`` with column names ``C1``..``C3`` and a
        dtype mapping of ``self._int_type()``, a four-category
        ``CategoricalDtype`` and ``str``, and returns the resulting frame.
        """
        reader = self._read_csv(use_pyarrow)
        c1_type = self._int_type()

        def test_impl():
            categories = CategoricalDtype(['A', 'B', 'C', 'D'])
            return reader(
                "csv_data_cat1.csv",
                names=['C1', 'C2', 'C3'],
                dtype={'C1': c1_type, 'C2': categories, 'C3': str},
            )

        return test_impl

Source File: test_io.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def pd_csv_cat1(self, use_pyarrow=False):
        """
        Build a zero-argument function that reads ``csv_data_cat1.csv``.

        The returned function calls the reader obtained from
        ``self._read_csv(use_pyarrow)`` with column names ``C1``..``C3`` and a
        dtype mapping of ``int``, a three-category ``CategoricalDtype`` and
        ``str``, and returns the resulting frame.
        """
        read_csv = self._read_csv(use_pyarrow)

        def test_impl():
            names = ['C1', 'C2', 'C3']
            ct_dtype = CategoricalDtype(['A', 'B', 'C'])
            # Builtin ``int`` rather than ``np.int``: the NumPy name was only
            # an alias of the builtin and no longer exists in current NumPy.
            dtypes = {'C1': int, 'C2': ct_dtype, 'C3': str}
            df = read_csv("csv_data_cat1.csv", names=names, dtype=dtypes)
            return df

        return test_impl

Source File: pdimpl.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def _CategoricalDtype_intrinsic(typingctx, categories, ordered):
    """
    Creates CategoricalDtype object.

    The category values and the ordering flag are read from the argument
    *types* and stored in the returned ``CategoricalDtypeType``; the generated
    code itself only produces a dummy value.

    Assertions:
        categories - Tuple of literal values or None
        ordered - literal Bool
    """
    # Ordering flag: None type -> None, literal -> its value.
    if isinstance(ordered, types.NoneType):
        ordered_value = None
    if isinstance(ordered, types.Literal):
        ordered_value = ordered.literal_value

    # Categories: None type -> None, tuple of literals -> list of their values.
    if isinstance(categories, types.NoneType):
        categories_list = None
    if isinstance(categories, types.Tuple):
        categories_list = []
        for item in categories:
            categories_list.append(item.literal_value)

    return_type = CategoricalDtypeType(categories_list, ordered_value)

    def codegen(context, builder, signature, args):
        # All CategoricalDtype objects are dummy values in LLVM.
        # They only exist in the type level.
        return context.get_dummy_value()

    return return_type(categories, ordered), codegen


# TODO: move to tools

Source File: pdimpl.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def _CategoricalDtype(categories=None, ordered=None):
    """
    Implementation of constructor for pandas CategoricalDtype.

    Returns an ``impl`` function that forwards ``categories`` together with
    the ordering flag to ``_CategoricalDtype_intrinsic``. When ``ordered`` is
    a literal type its literal value is forwarded, otherwise ``ordered``
    itself.
    """
    is_literal = isinstance(ordered, types.Literal)
    ordered_const = ordered.literal_value if is_literal else ordered

    def impl(categories=None, ordered=None):
        return _CategoricalDtype_intrinsic(categories, ordered_const)

    return impl

Source File: pandas_support.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def as_dtype(nbtype):
    """
    Convert a Numba type into the corresponding Pandas *dtype* instance.

    The type is passed through ``types.unliteral`` first. Only
    ``CategoricalDtypeType`` is handled; NotImplementedError is raised for
    any other type.
    """
    plain_type = types.unliteral(nbtype)
    if not isinstance(plain_type, CategoricalDtypeType):
        raise NotImplementedError("%r cannot be represented as a Pandas dtype"
                                  % (plain_type,))

    return pd.CategoricalDtype(categories=plain_type.categories,
                               ordered=plain_type.ordered)

Source File: dataserializer.py From mars with Apache License 2.0

5 votes

def _deserialize_pandas_categorical_dtype(data):
    """Rebuild a ``pd.CategoricalDtype`` from ``data``: item 0 holds the categories, item 1 the ordered flag."""
    categories = data[0]
    ordered = data[1]
    return pd.CategoricalDtype(categories=categories, ordered=ordered)

Python pandas.CategoricalDtype() Examples