Python Examples of sklearn.preprocessing.OrdinalEncoder

Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License

6 votes

def test_ordinal_encoder_twocats(self):
        data = [["cat2"], ["cat1"]]
        model = OrdinalEncoder(categories="auto")
        model.fit(data)
        inputs = [("input1", StringTensorType([None, 1]))]
        model_onnx = convert_sklearn(model, "ordinal encoder two string cats",
                                     inputs)
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            data,
            model,
            model_onnx,
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.5.0')",
            basename="SklearnOrdinalEncoderTwoStringCat",
        )

Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License

6 votes

def test_ordinal_encoder_onecat(self):
        data = [["cat"], ["cat"]]
        model = OrdinalEncoder(categories="auto")
        model.fit(data)
        inputs = [("input1", StringTensorType([None, 1]))]
        model_onnx = convert_sklearn(model, "ordinal encoder one string cat",
                                     inputs)
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            data,
            model,
            model_onnx,
            basename="SklearnOrdinalEncoderOneStringCat",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.5.0')",
        )

Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License

6 votes

def test_model_ordinal_encoder(self):
        model = OrdinalEncoder(dtype=np.int64)
        data = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]],
                        dtype=np.int64)
        model.fit(data)
        model_onnx = convert_sklearn(
            model,
            "scikit-learn ordinal encoder",
            [("input", Int64TensorType([None, 3]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            data,
            model,
            model_onnx,
            basename="SklearnOrdinalEncoderInt64-SkipDim1",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.5.0')",
        )

Source File: xgboost.py From sklearn2pmml with GNU Affero General Public License v3.0

6 votes

def make_xgboost_column_transformer(dtypes, missing_value_aware = True):
	"""Construct a ColumnTransformer for feeding complex data into an XGBModel.

	Parameters
	----------

	dtypes: iterable of tuples (column, dtype)

	missing_value_aware: boolean
		If true, use missing value aware transformers.

	Returns
	-------
	ColumnTransformer

	"""
	transformers = list()
	for column, dtype in dtypes.items():
		if _is_categorical(dtype):
			transformers.append((str(column), PMMLLabelBinarizer(sparse_output = True) if missing_value_aware else Pipeline([("ordinal_encoder", OrdinalEncoder()), ("one_hot_encoder", OneHotEncoder())]), [column]))
		else:
			transformers.append((str(column), "passthrough", [column]))
	return ColumnTransformer(transformers, remainder = "drop")

Source File: test_kernels.py From scikit-optimize with BSD 3-Clause "New" or "Revised" License

6 votes

def test_gp_regressor():
    rng = np.random.RandomState(0)
    X = np.asarray([
        ["ham", "spam", "ted"],
        ["ham", "ted", "ted"],
        ["ham", "spam", "spam"]])
    y = rng.randn(3)
    hm = HammingKernel(length_scale=[1.0, 1.0, 1.0])
    if UseOrdinalEncoder:
        enc = OrdinalEncoder()
        enc.fit(X)

    gpr = GaussianProcessRegressor(hm)
    if UseOrdinalEncoder:
        gpr.fit(enc.transform(X), y)
        assert_array_almost_equal(gpr.predict(enc.transform(X)), y)
        assert_array_almost_equal(gpr.predict(enc.transform(X[:2])), y[:2])
    else:
        gpr.fit(X, y)
        assert_array_almost_equal(gpr.predict(X), y)
        assert_array_almost_equal(gpr.predict(X[:2]), y[:2])

Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0],
                    [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)

Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License

5 votes

def test_model_ordinal_encoder_cat_list(self):
        model = OrdinalEncoder(categories=[[0, 1, 4, 5],
                                           [1, 2, 3, 5],
                                           [0, 3, 4, 6]])
        data = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]],
                        dtype=np.int64)
        model.fit(data)
        model_onnx = convert_sklearn(
            model,
            "scikit-learn ordinal encoder",
            [("input", Int64TensorType([None, 3]))],
        )
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            data,
            model,
            model_onnx,
            basename="SklearnOrdinalEncoderCatList",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.5.0')",
        )

Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License

5 votes

def test_ordinal_encoder_mixed_string_int_drop(self):
        data = [
            ["c0.4", "c0.2", 3],
            ["c1.4", "c1.2", 0],
            ["c0.2", "c2.2", 1],
            ["c0.2", "c2.2", 1],
            ["c0.2", "c2.2", 1],
            ["c0.2", "c2.2", 1],
        ]
        test = [["c0.2", "c2.2", 1]]
        model = OrdinalEncoder(categories="auto")
        model.fit(data)
        inputs = [
            ("input1", StringTensorType([None, 2])),
            ("input2", Int64TensorType([None, 1])),
        ]
        model_onnx = convert_sklearn(
            model, "ordinal encoder", inputs)
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            test,
            model,
            model_onnx,
            basename="SklearnOrdinalEncoderMixedStringIntDrop",
            allow_failure="StrictVersion("
            "onnxruntime.__version__)"
            "<= StrictVersion('0.5.0')",
        )

Source File: lightgbm.py From sklearn2pmml with GNU Affero General Public License v3.0

5 votes

def make_lightgbm_column_transformer(dtypes, missing_value_aware = True):
	"""Construct a ColumnTransformer for feeding complex data into a LGBMModel.

	Parameters
	----------

	dtypes: iterable of tuples (column, dtype)

	missing_value_aware: boolean
		If true, use missing value aware transformers.

	Returns:
	Tuple (ColumnTransformer, list of categorical column indices)

	"""
	transformers = list()
	categorical_features = list()
	i = 0
	for column, dtype in dtypes.items():
		if _is_categorical(dtype):
			transformers.append((str(column), PMMLLabelEncoder(missing_values = -1) if missing_value_aware else OrdinalEncoder(), [column]))
			categorical_features.append(i)
		else:
			transformers.append((str(column), "passthrough", [column]))
		i += 1
	return (ColumnTransformer(transformers, remainder = "drop"), categorical_features)

Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_ordinal_encoder_raise_categories_shape():

    X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
    cats = ['Low', 'Medium', 'High']
    enc = OrdinalEncoder(categories=cats)
    msg = ("Shape mismatch: if categories is an array,")

    with pytest.raises(ValueError, match=msg):
        enc.fit(X)

Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_ordinal_encoder_raise_missing(X):
    ohe = OrdinalEncoder()

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    ohe.fit(X[:1, :])

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)

Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_ordinal_encoder_inverse():
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)

Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)

Source File: kdd99_model.py From tcav with Apache License 2.0

5 votes

def encode_variables(data):
  """ Encodes variables using simple ordinal encoding."""
  data2 = np.copy(data)
  encoder = OrdinalEncoder()
  categorical_indices = kBytesIndices
  data2[:,
        categorical_indices] = encoder.fit_transform(data2[:,
                                                           categorical_indices])
  return data2

Source File: encoders.py From sagemaker-scikit-learn-extension with Apache License 2.0

4 votes

def inverse_transform(self, X):
        """Convert the data back to the original representation.
        In slots where the encoding is that of an unrecognised category, the output of the inverse transform is np.nan
        for float or complex arrays, and None otherwise

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.

        Notes
        -----
        Most of the logic is copied from sklearn.preprocessing.OrdinalEncoder.inverse_transform. The difference is in
        handling unknown values.

        """
        check_is_fitted(self, "categories_")
        X = check_array(X, dtype="numeric")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = "Shape of the passed X data is not correct. Expected {0} " "columns, got {1}."
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        found_unknown = {}
        for i in range(n_features):
            labels = X[:, i].astype("int64", copy=False)
            known_mask = labels != self.categories_[i].shape[0]
            labels *= known_mask
            X_tr[:, i] = self.categories_[i][labels]
            if not np.all(known_mask):
                found_unknown[i] = ~known_mask

        # if unknown are found cast to an object array and transform the missing values to None
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, unknown_mask in found_unknown.items():
                X_tr[unknown_mask, idx] = None

        return X_tr

Source File: feature_selection.py From driverlessai-recipes with Apache License 2.0

4 votes

def create_data(X: dt.Frame = None):
        if X is None:
            return []

        data = X.to_pandas().copy()

        # identify categorical colmns and trasform them
        cats = [x for x in data.select_dtypes(exclude=np.number).columns if x not in [target] + cols2ignore]

        for c in cats:
            data[c] = OrdinalEncoder().fit_transform(data[c].astype(str).values.reshape(-1, 1))

        # Get the actual importance, i.e. without shuffling
        actual_imp_df = get_feature_importances(data=data, cats=cats, shuffle=False, seed=42)

        # Seed the unexpected randomness of this world
        np.random.seed(123)

        seeds = np.random.randint(0, 2 ** 30, size=number_of_iterations)
        null_imp_df = pd.DataFrame()

        for i, s in enumerate(seeds):
            # Get current run importances
            imp_df = get_feature_importances(data=data, cats=cats, shuffle=True, seed=s)
            imp_df['run'] = i + 1
            # Concat the latest importances with the old ones
            null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)

        feature_scores = []
        for _f in actual_imp_df['feature'].unique():
            f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance'].values
            f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance'].mean()
            _score = np.log(
                1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, max(75, min(99, threshold)))))

            feature_scores.append((_f, _score))

        scores_df = pd.DataFrame(feature_scores, columns=['feature', 'score'])
        # final feature selection
        selected_features = scores_df[scores_df['score'] > 0]['feature'].values.tolist()
        selected_features = np.unique(selected_features).tolist()

        data = X.to_pandas().copy()
        return data[cols2ignore + selected_features + [target]]

Source File: dataset_wrapper.py From interpret-community with MIT License

4 votes

def string_index(self, columns=None):
        """Indexes categorical string features on the dataset.

        :param columns: Optional parameter specifying the subset of columns that may need to be string indexed.
        :type columns: list
        :return: The transformation steps to index the given dataset.
        :rtype: ColumnTransformer
        """
        if self._string_indexed:
            return self._column_indexer
        # Optimization so we don't redo this operation multiple times on the same dataset
        self._string_indexed = True
        # If the data was previously successfully summarized, then there are no
        # categorical columns as it must be numeric.
        # Also, if the dataset is sparse, we can assume there are no categorical strings
        if isinstance(self._dataset, DenseData) or issparse(self._dataset):
            return None
        # If the user doesn't have a newer version of scikit-learn with OrdinalEncoder, don't do encoding
        try:
            from sklearn.compose import ColumnTransformer
            from sklearn.preprocessing import OrdinalEncoder
        except ImportError:
            return None
        tmp_dataset = self._dataset
        # Temporarily convert to pandas for easier and uniform string handling
        if isinstance(self._dataset, np.ndarray):
            tmp_dataset = pd.DataFrame(self._dataset, dtype=self._dataset.dtype)
        categorical_col_names = list(np.array(list(tmp_dataset))[(tmp_dataset.applymap(type) == str).all(0)])
        if categorical_col_names:
            all_columns = tmp_dataset.columns
            if columns is not None:
                categorical_col_indices = \
                    [all_columns.get_loc(col_name) for col_name in categorical_col_names if col_name in columns]
            else:
                categorical_col_indices = [all_columns.get_loc(col_name) for col_name in categorical_col_names]
            ordinal_enc = OrdinalEncoder()
            ct = ColumnTransformer([('ord', ordinal_enc, categorical_col_indices)], remainder='drop')
            string_indexes_dataset = ct.fit_transform(tmp_dataset)
            # Inplace replacement of columns
            # (danger: using remainder=passthrough with ColumnTransformer will change column order!)
            for idx, categorical_col_index in enumerate(categorical_col_indices):
                self._dataset[:, categorical_col_index] = string_indexes_dataset[:, idx]
            self._column_indexer = ct
        return self._column_indexer

Python sklearn.preprocessing.OrdinalEncoder() Examples