Python sklearn.preprocessing.OrdinalEncoder() Examples
The following are 17 code examples of sklearn.preprocessing.OrdinalEncoder().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.
Example #1
Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License | 6 votes |
def test_ordinal_encoder_twocats(self):
    """Convert an encoder fitted on one string column with two categories.

    Fits ``OrdinalEncoder(categories="auto")`` on two distinct string
    values, converts it with ``convert_sklearn`` and passes the data, the
    fitted model and the converted model to ``dump_data_and_model``.
    """
    data = [["cat2"], ["cat1"]]
    model = OrdinalEncoder(categories="auto")
    model.fit(data)

    initial_types = [("input1", StringTensorType([None, 1]))]
    model_onnx = convert_sklearn(
        model, "ordinal encoder two string cats", initial_types)
    self.assertIsNotNone(model_onnx)

    # Condition string forwarded verbatim as ``allow_failure``.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')"
    )
    dump_data_and_model(
        data,
        model,
        model_onnx,
        allow_failure=failure_condition,
        basename="SklearnOrdinalEncoderTwoStringCat",
    )
Example #2
Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License | 6 votes |
def test_ordinal_encoder_onecat(self):
    """Convert an encoder fitted on one string column with a single category.

    Both rows hold the same value, so the fitted encoder sees exactly one
    distinct string. The converted model is checked for existence and then
    passed to ``dump_data_and_model`` together with the data and the
    fitted model.
    """
    data = [["cat"], ["cat"]]
    model = OrdinalEncoder(categories="auto")
    model.fit(data)

    initial_types = [("input1", StringTensorType([None, 1]))]
    model_onnx = convert_sklearn(
        model, "ordinal encoder one string cat", initial_types)
    self.assertIsNotNone(model_onnx)

    # Condition string forwarded verbatim as ``allow_failure``.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')"
    )
    dump_data_and_model(
        data,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderOneStringCat",
        allow_failure=failure_condition,
    )
Example #3
Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License | 6 votes |
def test_model_ordinal_encoder(self):
    """Convert an int64 encoder fitted on a 4x3 integer matrix.

    The encoder is built with ``dtype=np.int64`` and fitted on int64 data;
    the conversion declares a single ``Int64TensorType`` input with three
    columns.
    """
    rows = [[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]]
    data = np.array(rows, dtype=np.int64)
    model = OrdinalEncoder(dtype=np.int64)
    model.fit(data)

    initial_types = [("input", Int64TensorType([None, 3]))]
    model_onnx = convert_sklearn(
        model, "scikit-learn ordinal encoder", initial_types)
    self.assertIsNotNone(model_onnx)

    # Condition string forwarded verbatim as ``allow_failure``.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')"
    )
    dump_data_and_model(
        data,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderInt64-SkipDim1",
        allow_failure=failure_condition,
    )
Example #4
Source File: xgboost.py From sklearn2pmml with GNU Affero General Public License v3.0 | 6 votes |
def make_xgboost_column_transformer(dtypes, missing_value_aware = True):
    """Construct a ColumnTransformer for feeding complex data into an XGBModel.

    Parameters
    ----------

    dtypes: mapping of column to dtype, or iterable of tuples (column, dtype)
        An object exposing ``items()`` (e.g. a dict) is iterated through
        that method; any other iterable must yield (column, dtype) pairs.

    missing_value_aware: boolean
        If true, categorical columns use PMMLLabelBinarizer; otherwise they
        use an OrdinalEncoder followed by a OneHotEncoder.

    Returns
    -------
    ColumnTransformer
        One transformer per column, named ``str(column)``. Non-categorical
        columns are passed through; unlisted columns are dropped.

    """
    # The documented "iterable of tuples" input used to fail because
    # ``.items()`` was called unconditionally; accept both forms.
    column_dtypes = dtypes.items() if hasattr(dtypes, "items") else dtypes

    transformers = list()
    for column, dtype in column_dtypes:
        if _is_categorical(dtype):
            if missing_value_aware:
                transformer = PMMLLabelBinarizer(sparse_output = True)
            else:
                transformer = Pipeline([("ordinal_encoder", OrdinalEncoder()), ("one_hot_encoder", OneHotEncoder())])
        else:
            transformer = "passthrough"
        transformers.append((str(column), transformer, [column]))
    return ColumnTransformer(transformers, remainder = "drop")
Example #5
Source File: test_kernels.py From scikit-optimize with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_gp_regressor():
    """Fit a GP regressor with a HammingKernel on string-valued rows.

    When ``UseOrdinalEncoder`` is truthy the rows are passed through a
    fitted ``OrdinalEncoder`` before every ``fit``/``predict`` call;
    otherwise the raw array is used. In both cases the predictions on the
    training rows must match the training targets.
    """
    rng = np.random.RandomState(0)
    X = np.asarray([
        ["ham", "spam", "ted"],
        ["ham", "ted", "ted"],
        ["ham", "spam", "spam"]])
    y = rng.randn(3)
    kernel = HammingKernel(length_scale=[1.0, 1.0, 1.0])

    if UseOrdinalEncoder:
        encoder = OrdinalEncoder()
        encoder.fit(X)
        prepare = encoder.transform
    else:
        def prepare(rows):
            return rows

    gpr = GaussianProcessRegressor(kernel)
    gpr.fit(prepare(X), y)
    assert_array_almost_equal(gpr.predict(prepare(X)), y)
    assert_array_almost_equal(gpr.predict(prepare(X[:2])), y[:2])
Example #6
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ordinal_encoder(X):
    """Check the codes produced with the default dtype and with int64."""
    expected = np.array([[0, 1, 0], [1, 0, 0]], dtype='int64')

    # Default dtype: compared against the float64 view of the expected codes.
    default_enc = OrdinalEncoder()
    assert_array_equal(default_enc.fit_transform(X), expected.astype('float64'))

    # Explicit integer dtype: compared against the int64 codes directly.
    int_enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(int_enc.fit_transform(X), expected)
Example #7
Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License | 5 votes |
def test_model_ordinal_encoder_cat_list(self):
    """Convert an encoder whose categories are given as explicit lists.

    One category list is supplied per column of the 4x3 int64 matrix the
    encoder is fitted on; the conversion declares a single
    ``Int64TensorType`` input with three columns.
    """
    categories = [[0, 1, 4, 5], [1, 2, 3, 5], [0, 3, 4, 6]]
    rows = [[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]]
    data = np.array(rows, dtype=np.int64)
    model = OrdinalEncoder(categories=categories)
    model.fit(data)

    initial_types = [("input", Int64TensorType([None, 3]))]
    model_onnx = convert_sklearn(
        model, "scikit-learn ordinal encoder", initial_types)
    self.assertIsNotNone(model_onnx)

    # Condition string forwarded verbatim as ``allow_failure``.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')"
    )
    dump_data_and_model(
        data,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderCatList",
        allow_failure=failure_condition,
    )
Example #8
Source File: test_sklearn_ordinal_encoder.py From sklearn-onnx with MIT License | 5 votes |
def test_ordinal_encoder_mixed_string_int_drop(self):
    """Convert an encoder fitted on two string columns and one int column.

    The training data holds two unique rows followed by four copies of the
    row that is later used as the single test sample. The conversion
    declares a two-column string input and a one-column int64 input.
    """
    repeated_row = ["c0.2", "c2.2", 1]
    data = [
        ["c0.4", "c0.2", 3],
        ["c1.4", "c1.2", 0],
    ] + [list(repeated_row) for _ in range(4)]
    test = [list(repeated_row)]

    model = OrdinalEncoder(categories="auto")
    model.fit(data)

    initial_types = [
        ("input1", StringTensorType([None, 2])),
        ("input2", Int64TensorType([None, 1])),
    ]
    model_onnx = convert_sklearn(model, "ordinal encoder", initial_types)
    self.assertIsNotNone(model_onnx)

    # Condition string forwarded verbatim as ``allow_failure``.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__)<= StrictVersion('0.5.0')"
    )
    dump_data_and_model(
        test,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderMixedStringIntDrop",
        allow_failure=failure_condition,
    )
Example #9
Source File: lightgbm.py From sklearn2pmml with GNU Affero General Public License v3.0 | 5 votes |
def make_lightgbm_column_transformer(dtypes, missing_value_aware = True):
    """Construct a ColumnTransformer for feeding complex data into a LGBMModel.

    Parameters
    ----------

    dtypes: mapping of column to dtype, or iterable of tuples (column, dtype)
        An object exposing ``items()`` (e.g. a dict) is iterated through
        that method; any other iterable must yield (column, dtype) pairs.

    missing_value_aware: boolean
        If true, categorical columns use PMMLLabelEncoder(missing_values = -1);
        otherwise they use an OrdinalEncoder.

    Returns
    -------
    Tuple (ColumnTransformer, list of categorical column indices)
        The indices are the positions of the categorical columns in the
        iteration order of ``dtypes``.

    """
    # The documented "iterable of tuples" input used to fail because
    # ``.items()`` was called unconditionally; accept both forms.
    column_dtypes = dtypes.items() if hasattr(dtypes, "items") else dtypes

    transformers = list()
    categorical_features = list()
    for i, (column, dtype) in enumerate(column_dtypes):
        if _is_categorical(dtype):
            transformer = PMMLLabelEncoder(missing_values = -1) if missing_value_aware else OrdinalEncoder()
            categorical_features.append(i)
        else:
            transformer = "passthrough"
        transformers.append((str(column), transformer, [column]))
    return (ColumnTransformer(transformers, remainder = "drop"), categorical_features)
Example #10
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ordinal_encoder_raise_categories_shape():
    """Fitting must fail when ``categories`` is a flat list of labels.

    ``cats`` is a plain list of strings rather than one list per feature;
    ``fit`` on the single-column input is expected to raise a ValueError
    whose message matches the shape-mismatch text.
    """
    values = ['Low', 'Medium', 'High', 'Medium', 'Low']
    # Build a single row and transpose it into a single column.
    X = np.array([values], dtype=object).T
    cats = ['Low', 'Medium', 'High']

    enc = OrdinalEncoder(categories=cats)
    expected_message = "Shape mismatch: if categories is an array,"
    with pytest.raises(ValueError, match=expected_message):
        enc.fit(X)
Example #11
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ordinal_encoder_raise_missing(X):
    """``fit``, ``fit_transform`` and ``transform`` must all reject NaN input.

    ``X`` is supplied by the caller (presumably a parametrized fixture
    containing a missing value -- confirm against the test module).
    """
    nan_message = "Input contains NaN"
    enc = OrdinalEncoder()

    with pytest.raises(ValueError, match=nan_message):
        enc.fit(X)

    with pytest.raises(ValueError, match=nan_message):
        enc.fit_transform(X)

    # Fit on the first row only so that ``transform`` is then exercised
    # on the full input.
    enc.fit(X[:1, :])
    with pytest.raises(ValueError, match=nan_message):
        enc.transform(X)
Example #12
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ordinal_encoder_inverse():
    """Round-trip data through the encoder and reject badly shaped input."""
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    encoded = enc.fit_transform(X)

    # inverse_transform of the encoded data must give back the input
    original = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(encoded), original)

    # incorrect shape raises: four columns although three were fitted
    wrong_shape = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, msg, enc.inverse_transform, wrong_shape)
Example #13
Source File: test_encoders.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Check encoding, stored categories and error handling with explicit categories.

    ``X``, ``X2``, ``cats`` and ``cat_dtype`` are supplied by the caller
    (presumably through parametrization -- confirm against the test module).
    """
    expected = np.array([[0.], [1.]])

    enc = OrdinalEncoder(categories=cats)
    assert_array_equal(enc.fit_transform(X), expected)

    first_cats = list(cats[0])
    assert list(enc.categories[0]) == first_cats
    assert enc.categories_[0].tolist() == first_cats
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    strict_enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict_enc.fit(X2)
Example #14
Source File: kdd99_model.py From tcav with Apache License 2.0 | 5 votes |
def encode_variables(data):
    """Return a copy of ``data`` with selected columns ordinal-encoded.

    The columns listed in ``kBytesIndices`` are replaced by the output of
    a freshly fitted ``OrdinalEncoder``; the input array itself is not
    modified.
    """
    encoded = np.copy(data)
    columns = kBytesIndices
    encoded[:, columns] = OrdinalEncoder().fit_transform(encoded[:, columns])
    return encoded
Example #15
Source File: encoders.py From sagemaker-scikit-learn-extension with Apache License 2.0 | 4 votes |
def inverse_transform(self, X):
    """Convert the data back to the original representation.

    A code equal to the number of categories of its feature is treated as
    the encoding of an unrecognised category. When at least one such code
    is present, the result is cast to an object array and those slots are
    set to None.

    Parameters
    ----------
    X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
        The transformed data. It is not modified.

    Returns
    -------
    X_tr : array-like, shape [n_samples, n_features]
        Inverse transformed array.

    Raises
    ------
    ValueError
        If the number of columns of ``X`` differs from the number of fitted
        features.

    Notes
    -----
    Most of the logic is copied from sklearn.preprocessing.OrdinalEncoder.inverse_transform. The difference is in
    handling unknown values.
    """
    check_is_fitted(self, "categories_")
    X = check_array(X, dtype="numeric")

    n_samples, _ = X.shape
    n_features = len(self.categories_)

    # validate shape of passed X
    msg = "Shape of the passed X data is not correct. Expected {0} " "columns, got {1}."
    if X.shape[1] != n_features:
        raise ValueError(msg.format(n_features, X.shape[1]))

    # create resulting array of appropriate dtype
    # np.find_common_type was removed in NumPy 2.0. Reproduce its result for
    # array dtypes: a single dtype is kept, purely numeric/bool dtypes are
    # promoted, and any other combination falls back to object.
    cat_dtypes = [cat.dtype for cat in self.categories_]
    if len(cat_dtypes) == 1:
        dt = cat_dtypes[0]
    elif all(cat_dtype.kind in "biufc" for cat_dtype in cat_dtypes):
        dt = np.result_type(*cat_dtypes)
    else:
        dt = np.dtype(object)
    X_tr = np.empty((n_samples, n_features), dtype=dt)

    found_unknown = {}
    for i, categories in enumerate(self.categories_):
        labels = X[:, i].astype("int64", copy=False)
        known_mask = labels != categories.shape[0]
        # Point unknown codes at index 0 so the lookup stays in bounds.
        # np.where builds a new array; the former in-place ``labels *= mask``
        # wrote through the view into the caller's X when X was int64.
        labels = np.where(known_mask, labels, 0)
        X_tr[:, i] = categories[labels]

        if not np.all(known_mask):
            found_unknown[i] = ~known_mask

    # if unknown are found cast to an object array and transform the missing values to None
    if found_unknown:
        if X_tr.dtype != object:
            X_tr = X_tr.astype(object)

        for idx, unknown_mask in found_unknown.items():
            X_tr[unknown_mask, idx] = None

    return X_tr
Example #16
Source File: feature_selection.py From driverlessai-recipes with Apache License 2.0 | 4 votes |
def create_data(X: dt.Frame = None):
    """Select features whose importance beats their shuffled ("null") importance.

    Steps, as performed by the code below:

    1. Convert ``X`` to pandas and ordinal-encode every non-numeric column
       other than ``target`` and ``cols2ignore``.
    2. Compute importances once with ``shuffle=False`` and
       ``number_of_iterations`` times with ``shuffle=True`` via
       ``get_feature_importances`` (presumably the shuffled runs permute
       the target -- confirm against that helper).
    3. Score each feature as
       ``log(1e-10 + actual / (1 + percentile(null, p)))`` where ``p`` is
       ``threshold`` clamped to the range [75, 99].
    4. Keep the features with a score above zero.

    ``target``, ``cols2ignore``, ``number_of_iterations``, ``threshold`` and
    ``get_feature_importances`` are taken from the enclosing scope.

    :param X: input frame; when None an empty list is returned
    :return: pandas copy of ``X`` restricted to ``cols2ignore``, the
        selected features (sorted, de-duplicated) and ``target``
    """
    if X is None:
        return []

    data = X.to_pandas().copy()

    # identify categorical columns and transform them
    excluded = [target] + cols2ignore
    cats = [x for x in data.select_dtypes(exclude=np.number).columns if x not in excluded]

    for c in cats:
        data[c] = OrdinalEncoder().fit_transform(data[c].astype(str).values.reshape(-1, 1))

    # Get the actual importance, i.e. without shuffling
    actual_imp_df = get_feature_importances(data=data, cats=cats, shuffle=False, seed=42)

    # Seed the unexpected randomness of this world
    np.random.seed(123)

    seeds = np.random.randint(0, 2 ** 30, size=number_of_iterations)

    # Collect every run and concatenate once: repeated pd.concat onto a
    # growing frame copies all previous rows on each iteration, and starting
    # from an empty DataFrame triggers pandas' empty-entry concat deprecation.
    null_imp_runs = []
    for i, s in enumerate(seeds):
        # Get current run importances
        imp_df = get_feature_importances(data=data, cats=cats, shuffle=True, seed=s)
        imp_df['run'] = i + 1
        null_imp_runs.append(imp_df)
    null_imp_df = pd.concat(null_imp_runs, axis=0) if null_imp_runs else pd.DataFrame()

    feature_scores = []
    # Percentile of the null distribution used as the bar; clamped to [75, 99]
    null_percentile = max(75, min(99, threshold))
    for _f in actual_imp_df['feature'].unique():
        f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance'].values
        f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance'].mean()
        _score = np.log(
            1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, null_percentile)))

        feature_scores.append((_f, _score))

    scores_df = pd.DataFrame(feature_scores, columns=['feature', 'score'])

    # final feature selection
    selected_features = scores_df[scores_df['score'] > 0]['feature'].values.tolist()
    selected_features = np.unique(selected_features).tolist()

    # Return the untouched (not ordinal-encoded) values of the kept columns
    data = X.to_pandas().copy()

    return data[cols2ignore + selected_features + [target]]
Example #17
Source File: dataset_wrapper.py From interpret-community with MIT License | 4 votes |
def string_index(self, columns=None):
    """Indexes categorical string features on the dataset.

    Columns whose values are all of type ``str`` are ordinal-encoded and
    written back into ``self._dataset`` in place. The work is attempted
    only once per instance; later calls return the stored indexer.

    :param columns: Optional parameter specifying the subset of columns that
        may need to be string indexed.
    :type columns: list
    :return: The transformation steps to index the given dataset, or the
        previously stored value when no encoding was performed.
    :rtype: ColumnTransformer
    """
    if self._string_indexed:
        return self._column_indexer
    # Optimization so we don't redo this operation multiple times on the same dataset
    self._string_indexed = True

    # If the data was previously successfully summarized, then there are no
    # categorical columns as it must be numeric.
    # Also, if the dataset is sparse, we can assume there are no categorical strings
    if isinstance(self._dataset, DenseData) or issparse(self._dataset):
        return None

    # If the user doesn't have a newer version of scikit-learn with OrdinalEncoder, don't do encoding
    try:
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OrdinalEncoder
    except ImportError:
        return None

    # Temporarily convert to pandas for easier and uniform string handling
    frame = self._dataset
    if isinstance(self._dataset, np.ndarray):
        frame = pd.DataFrame(self._dataset, dtype=self._dataset.dtype)

    # A column counts as categorical only when every one of its cells is a str
    all_str_mask = (frame.applymap(type) == str).all(0)
    categorical_col_names = list(np.array(list(frame))[all_str_mask])
    if not categorical_col_names:
        return self._column_indexer

    all_columns = frame.columns
    categorical_col_indices = [
        all_columns.get_loc(col_name)
        for col_name in categorical_col_names
        if columns is None or col_name in columns
    ]

    ct = ColumnTransformer([('ord', OrdinalEncoder(), categorical_col_indices)], remainder='drop')
    encoded = ct.fit_transform(frame)

    # Inplace replacement of columns
    # (danger: using remainder=passthrough with ColumnTransformer will change column order!)
    for position, col_index in enumerate(categorical_col_indices):
        self._dataset[:, col_index] = encoded[:, position]

    self._column_indexer = ct
    return self._column_indexer