Python Examples of pandas.SparseDtype

Source File: filtering_fe_autotype.py From dash-docs with MIT License

6 votes

def table_type(df_column):
    # Note - this only works with Pandas >= 1.0.0

    if sys.version_info < (3, 0):  # Pandas 1.0.0 does not support Python 2
        return 'any'

    if isinstance(df_column.dtype, pd.DatetimeTZDtype):
        return 'datetime',
    elif (isinstance(df_column.dtype, pd.StringDtype) or
            isinstance(df_column.dtype, pd.BooleanDtype) or
            isinstance(df_column.dtype, pd.CategoricalDtype) or
            isinstance(df_column.dtype, pd.PeriodDtype)):
        return 'text'
    elif (isinstance(df_column.dtype, pd.SparseDtype) or
            isinstance(df_column.dtype, pd.IntervalDtype) or
            isinstance(df_column.dtype, pd.Int8Dtype) or
            isinstance(df_column.dtype, pd.Int16Dtype) or
            isinstance(df_column.dtype, pd.Int32Dtype) or
            isinstance(df_column.dtype, pd.Int64Dtype)):
        return 'numeric'
    else:
        return 'any'

Source File: test_sparse.py From coffeegrindsize with MIT License

6 votes

def test_where_series(self, data, na_value):
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))

        cond = np.array([True, True, False, False])
        result = ser.where(cond)

        new_dtype = SparseDtype('float', 0.0)
        expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
                                                dtype=new_dtype))
        self.assert_series_equal(result, expected)

        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        cond = np.array([True, False, True, True])
        result = ser.where(cond, other)
        expected = pd.Series(cls._from_sequence([a, b, b, b],
                                                dtype=data.dtype))
        self.assert_series_equal(result, expected)

Source File: test_sparse.py From coffeegrindsize with MIT License

6 votes

def test_fillna_frame(self, data_missing):
        # Have to override to specify that fill_value will change.
        fill_value = data_missing[1]

        result = pd.DataFrame({
            "A": data_missing,
            "B": [1, 2]
        }).fillna(fill_value)

        if pd.isna(data_missing.fill_value):
            dtype = SparseDtype(data_missing.dtype, fill_value)
        else:
            dtype = data_missing.dtype

        expected = pd.DataFrame({
            "A": data_missing._from_sequence([fill_value, fill_value],
                                             dtype=dtype),
            "B": [1, 2],
        })

        self.assert_frame_equal(result, expected)

Source File: test_sparse.py From coffeegrindsize with MIT License

6 votes

def test_isna(self, data_missing):
        expected_dtype = SparseDtype(bool,
                                     pd.isna(data_missing.dtype.fill_value))
        expected = SparseArray([True, False], dtype=expected_dtype)

        result = pd.isna(data_missing)
        self.assert_equal(result, expected)

        result = pd.Series(data_missing).isna()
        expected = pd.Series(expected)
        self.assert_series_equal(result, expected)

        # GH 21189
        result = pd.Series(data_missing).drop([0, 1]).isna()
        expected = pd.Series([], dtype=expected_dtype)
        self.assert_series_equal(result, expected)

Source File: test_utils.py From scprep with GNU General Public License v3.0

6 votes

def test_SparseDataFrame():
    X = data.load_10X(sparse=False)
    Y = X.astype(pd.SparseDtype(float, fill_value=0.0))
    index = X.index
    columns = X.columns

    def test_fun(X):
        X = scprep.utils.SparseDataFrame(X, index=index, columns=columns)
        utils.assert_matrix_class_equivalent(X, Y)

    matrix.test_all_matrix_types(X, test_fun)
    matrix.test_pandas_matrix_types(
        X,
        utils.assert_transform_equivalent,
        Y=Y,
        transform=scprep.utils.SparseDataFrame,
    )

Source File: test_utils.py From scprep with GNU General Public License v3.0

6 votes

def test_is_sparse_dataframe():
    X = data.load_10X(sparse=False)
    Y = X.astype(pd.SparseDtype(float, fill_value=0.0))
    assert scprep.utils.is_sparse_dataframe(Y)

    def test_fun(X):
        assert not scprep.utils.is_sparse_dataframe(X)

    types = (
        matrix._scipy_matrix_types
        + matrix._numpy_matrix_types
        + matrix._pandas_dense_matrix_types
    )
    if matrix._pandas_0:
        types.append(matrix.SparseDataFrame_deprecated)
    matrix.test_matrix_types(
        X, test_fun, types,
    )

Source File: test_sparse.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_where_series(self, data, na_value):
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))

        cond = np.array([True, True, False, False])
        result = ser.where(cond)

        new_dtype = SparseDtype('float', 0.0)
        expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
                                                dtype=new_dtype))
        self.assert_series_equal(result, expected)

        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        cond = np.array([True, False, True, True])
        result = ser.where(cond, other)
        expected = pd.Series(cls._from_sequence([a, b, b, b],
                                                dtype=data.dtype))
        self.assert_series_equal(result, expected)

Source File: test_sparse.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_fillna_frame(self, data_missing):
        # Have to override to specify that fill_value will change.
        fill_value = data_missing[1]

        result = pd.DataFrame({
            "A": data_missing,
            "B": [1, 2]
        }).fillna(fill_value)

        if pd.isna(data_missing.fill_value):
            dtype = SparseDtype(data_missing.dtype, fill_value)
        else:
            dtype = data_missing.dtype

        expected = pd.DataFrame({
            "A": data_missing._from_sequence([fill_value, fill_value],
                                             dtype=dtype),
            "B": [1, 2],
        })

        self.assert_frame_equal(result, expected)

Source File: test_sparse.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_isna(self, data_missing):
        expected_dtype = SparseDtype(bool,
                                     pd.isna(data_missing.dtype.fill_value))
        expected = SparseArray([True, False], dtype=expected_dtype)

        result = pd.isna(data_missing)
        self.assert_equal(result, expected)

        result = pd.Series(data_missing).isna()
        expected = pd.Series(expected)
        self.assert_series_equal(result, expected)

        # GH 21189
        result = pd.Series(data_missing).drop([0, 1]).isna()
        expected = pd.Series([], dtype=expected_dtype)
        self.assert_series_equal(result, expected)

Source File: test_sparse.py From recruit with Apache License 2.0

6 votes

def test_isna(self, data_missing):
        expected_dtype = SparseDtype(bool,
                                     pd.isna(data_missing.dtype.fill_value))
        expected = SparseArray([True, False], dtype=expected_dtype)

        result = pd.isna(data_missing)
        self.assert_equal(result, expected)

        result = pd.Series(data_missing).isna()
        expected = pd.Series(expected)
        self.assert_series_equal(result, expected)

        # GH 21189
        result = pd.Series(data_missing).drop([0, 1]).isna()
        expected = pd.Series([], dtype=expected_dtype)
        self.assert_series_equal(result, expected)

Source File: test_sparse.py From recruit with Apache License 2.0

6 votes

def test_fillna_frame(self, data_missing):
        # Have to override to specify that fill_value will change.
        fill_value = data_missing[1]

        result = pd.DataFrame({
            "A": data_missing,
            "B": [1, 2]
        }).fillna(fill_value)

        if pd.isna(data_missing.fill_value):
            dtype = SparseDtype(data_missing.dtype, fill_value)
        else:
            dtype = data_missing.dtype

        expected = pd.DataFrame({
            "A": data_missing._from_sequence([fill_value, fill_value],
                                             dtype=dtype),
            "B": [1, 2],
        })

        self.assert_frame_equal(result, expected)

Source File: test_sparse.py From recruit with Apache License 2.0

6 votes

def test_where_series(self, data, na_value):
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))

        cond = np.array([True, True, False, False])
        result = ser.where(cond)

        new_dtype = SparseDtype('float', 0.0)
        expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
                                                dtype=new_dtype))
        self.assert_series_equal(result, expected)

        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        cond = np.array([True, False, True, True])
        result = ser.where(cond, other)
        expected = pd.Series(cls._from_sequence([a, b, b, b],
                                                dtype=data.dtype))
        self.assert_series_equal(result, expected)

Source File: test_subclass.py From recruit with Apache License 2.0

5 votes

def test_subclass_sparse_slice(self):
        # int64
        s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5])
        exp = tm.SubclassedSparseSeries([2, 3, 4], index=[1, 2, 3])
        tm.assert_sp_series_equal(s.loc[1:3], exp)
        assert s.loc[1:3].dtype == SparseDtype(np.int64)

        exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2])
        tm.assert_sp_series_equal(s.iloc[1:3], exp)
        assert s.iloc[1:3].dtype == SparseDtype(np.int64)

        exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2])
        tm.assert_sp_series_equal(s[1:3], exp)
        assert s[1:3].dtype == SparseDtype(np.int64)

        # float64
        s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.])
        exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3])
        tm.assert_sp_series_equal(s.loc[1:3], exp)
        assert s.loc[1:3].dtype == SparseDtype(np.float64)

        exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2])
        tm.assert_sp_series_equal(s.iloc[1:3], exp)
        assert s.iloc[1:3].dtype == SparseDtype(np.float64)

        exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2])
        tm.assert_sp_series_equal(s[1:3], exp)
        assert s[1:3].dtype == SparseDtype(np.float64)

Source File: test_sparse.py From recruit with Apache License 2.0

5 votes

def _check_unsupported(self, data):
        if data.dtype == SparseDtype(int, 0):
            pytest.skip("Can't store nan in int array.")

Source File: test_sparse.py From coffeegrindsize with MIT License

5 votes

def _check_unsupported(self, data):
        if data.dtype == SparseDtype(int, 0):
            pytest.skip("Can't store nan in int array.")

Source File: test_sparse.py From coffeegrindsize with MIT License

5 votes

def dtype():
    return SparseDtype()

Source File: test_encoders.py From dask-ml with BSD 3-Clause "New" or "Revised" License

5 votes

def test_basic_dataframe(sparse, method, dask_data, dtype):
    a = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
    b = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)

    if method == "fit":
        a.fit(df)
        b.fit(dask_data)
        expected = a.transform(df)
        result = b.transform(dask_data)
    else:
        expected = a.fit_transform(df)
        result = b.fit_transform(dask_data)

    assert_estimator_equal(
        a,
        b,
        exclude={
            "n_values_",
            "feature_indices_",
            "active_features_",
            "dtypes_",
            "drop_idx_",
        },
    )

    assert isinstance(result, type(dask_data))
    assert len(result.columns) == expected.shape[1]
    if sparse and PANDAS_VERSION >= packaging.version.parse("0.24.0"):
        # pandas sparse ExtensionDtype interface
        dtype = pd.SparseDtype(dtype, dtype(0))
    assert (result.dtypes == dtype).all()

    da.utils.assert_eq(result.values, expected)

Source File: test_patch.py From scprep with GNU General Public License v3.0

5 votes

def test_fill_value():
    values = pd.Series(np.arange(3), dtype=pd.UInt16Dtype())
    custom_block = CustomBlock(values, placement=slice(1, 2))
    assert pd.isna(custom_block.fill_value)
    values = pd.Series(np.arange(3), dtype=pd.SparseDtype(float, 0.0))
    custom_block = CustomBlock(values, placement=slice(1, 2))
    assert not pd.isna(custom_block.fill_value)

Source File: test_sparse.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def _check_unsupported(self, data):
        if data.dtype == SparseDtype(int, 0):
            pytest.skip("Can't store nan in int array.")

Source File: matrix.py From scprep with GNU General Public License v3.0

5 votes

def SparseDataFrame(X, default_fill_value=0.0):
    if sparse.issparse(X):
        X = pd.DataFrame.sparse.from_spmatrix(X)
        X.sparse.fill_value = default_fill_value
    elif is_SparseDataFrame(X) or not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    return X.astype(pd.SparseDtype(float, fill_value=default_fill_value))

Source File: matrix.py From scprep with GNU General Public License v3.0

5 votes

def SparseSeries(X, default_fill_value=0.0):
    return pd.Series(X).astype(pd.SparseDtype(float, fill_value=default_fill_value))

Source File: utils.py From scprep with GNU General Public License v3.0

5 votes

def dataframe_to_sparse(x, fill_value=0.0):
    return x.astype(pd.SparseDtype(float, fill_value=fill_value))

Source File: utils.py From anndata with BSD 3-Clause "New" or "Revised" License

5 votes

def ensure_df_homogeneous(
    df: pd.DataFrame, name: str
) -> Union[np.ndarray, sparse.csr_matrix]:
    # TODO: rename this function, I would not expect this to return a non-dataframe
    if all(isinstance(dt, pd.SparseDtype) for dt in df.dtypes):
        arr = df.sparse.to_coo().tocsr()
    else:
        arr = df.to_numpy()
    if df.dtypes.nunique() != 1:
        warnings.warn(f"{name} converted to numpy array with dtype {arr.dtype}")
    return arr

Source File: test_sparse.py From recruit with Apache License 2.0

5 votes

def dtype():
    return SparseDtype()

Source File: test_subclass.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_subclass_sparse_slice(self):
        # int64
        s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5])
        exp = tm.SubclassedSparseSeries([2, 3, 4], index=[1, 2, 3])
        tm.assert_sp_series_equal(s.loc[1:3], exp)
        assert s.loc[1:3].dtype == SparseDtype(np.int64)

        exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2])
        tm.assert_sp_series_equal(s.iloc[1:3], exp)
        assert s.iloc[1:3].dtype == SparseDtype(np.int64)

        exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2])
        tm.assert_sp_series_equal(s[1:3], exp)
        assert s[1:3].dtype == SparseDtype(np.int64)

        # float64
        s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.])
        exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3])
        tm.assert_sp_series_equal(s.loc[1:3], exp)
        assert s.loc[1:3].dtype == SparseDtype(np.float64)

        exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2])
        tm.assert_sp_series_equal(s.iloc[1:3], exp)
        assert s.iloc[1:3].dtype == SparseDtype(np.float64)

        exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2])
        tm.assert_sp_series_equal(s[1:3], exp)
        assert s[1:3].dtype == SparseDtype(np.float64)

Source File: test_sparse.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def dtype():
    return SparseDtype()

Source File: test_dtypes.py From pandera with MIT License

4 votes

def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            None
        ),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]"
            ),
            None
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"], dtype="string"), None),
        (
            pd.PeriodDtype(freq='D'),
            pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
            None
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(
                lambda s: s < 5, other=np.nan).astype("Sparse[float]"),
            {"nullable": True},
        ),
        (
            pd.BooleanDtype(),
            pd.Series([1, 0, 0, 1, 1], dtype="boolean"),
            None
        ),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        )
    ]
    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series)

Python pandas.SparseDtype() Examples