Python sklearn.compose.ColumnTransformer() Examples
The following are 30 code examples of sklearn.compose.ColumnTransformer(), collected from open-source projects. The original project, source file, and license are noted above each example. You may also want to check out all available functions and classes of the module sklearn.compose.
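Before diving into the examples, here is a minimal, self-contained sketch of the typical ColumnTransformer pattern; the data and column names are invented for illustration:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy frame; the column names are hypothetical.
df = pd.DataFrame({
    'age': [25, 32, 47],
    'fare': [7.25, 71.28, 8.05],
    'embarked': ['S', 'C', 'S'],
    'ticket_id': ['a1', 'b2', 'c3'],  # not listed below, so dropped
})

ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'fare']),  # scale the numeric columns
        ('cat', OneHotEncoder(), ['embarked']),      # one-hot encode the categorical column
    ],
    remainder='drop')  # columns not named in any transformer are dropped

Xt = ct.fit_transform(df)  # 2 scaled columns + 2 one-hot columns, side by side

After fitting, ct.transformers_ holds the expanded (name, fitted transformer, columns) triples; several of the tests below assert against exactly that attribute.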
Example #1
Source File: common_tabular_tests.py From interpret-community with MIT License

def _get_transformations_one_to_many_greater(self, feature_names):
    # results in number of features greater than original features
    # copy all features except last one. For last one, replicate columns to create 3 more features
    transformations = []
    feature_names = list(feature_names)
    index = 0
    for f in feature_names[:-1]:
        transformations.append(("{}".format(index), "passthrough", [f]))
        index += 1

    def copy_func(x):
        return np.tile(x, (1, 3))

    copy_transformer = FunctionTransformer(copy_func)
    transformations.append(("copy_transformer", copy_transformer, [feature_names[-1]]))
    return ColumnTransformer(transformations)
Example #2
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License

def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
Example #3
Source File: normalization_strategy_selector.py From Auto-PyTorch with Apache License 2.0

def fit(self, hyperparameter_config, X, train_indices, dataset_info):
    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

    normalizer_name = hyperparameter_config['normalization_strategy']

    if normalizer_name == 'none':
        return {'normalizer': None}

    if isinstance(X, csr_matrix):
        normalizer = self.normalization_strategies[normalizer_name](with_mean=False)
    else:
        normalizer = self.normalization_strategies[normalizer_name]()

    transformer = ColumnTransformer(
        transformers=[("normalize", normalizer,
                       [i for i, c in enumerate(dataset_info.categorical_features) if not c])],
        remainder='passthrough')
    transformer.fit(X[train_indices])
    X = transformer.transform(X)

    dataset_info.categorical_features = sorted(dataset_info.categorical_features)
    return {'X': X, 'normalizer': transformer, 'dataset_info': dataset_info}
Example #4
Source File: one_hot_encoding.py From Auto-PyTorch with Apache License 2.0

def fit(self, pipeline_config, X, Y, dataset_info):
    categorical_features = dataset_info.categorical_features
    ohe = OneHotEncoder(categories="auto", sparse=False, handle_unknown="ignore")
    encoder = ColumnTransformer(
        transformers=[("ohe", ohe, [i for i, f in enumerate(categorical_features) if f])],
        remainder="passthrough")
    encoder.categories_ = np.array([])
    encoder.categorical_features = categorical_features

    if any(categorical_features) and not dataset_info.is_sparse:
        # encode X
        X = encoder.fit_transform(X)
        encoder.categories_ = encoder.transformers_[0][1].categories_

    # Y to matrix
    Y, y_encoder = self.complete_y_tranformation(Y)

    dataset_info.categorical_features = None
    return {'X': X, 'one_hot_encoder': encoder, 'Y': Y,
            'y_one_hot_encoder': y_encoder, 'dataset_info': dataset_info}
Example #5
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License

def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse), res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse), X_res_both)
Example #6
Source File: imputation.py From Auto-PyTorch with Apache License 2.0

def fit(self, hyperparameter_config, X, train_indices, dataset_info):
    hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

    if dataset_info.is_sparse:
        return {'imputation_preprocessor': None, 'all_nan_columns': None}

    # delete all nan columns
    all_nan = np.all(np.isnan(X), axis=0)
    X = X[:, ~all_nan]
    dataset_info.categorical_features = [dataset_info.categorical_features[i]
                                         for i, is_nan in enumerate(all_nan) if not is_nan]

    strategy = hyperparameter_config['strategy']
    fill_value = int(np.nanmax(X)) + 1 if not dataset_info.is_sparse else 0
    numerical_imputer = SimpleImputer(strategy=strategy, copy=False)
    categorical_imputer = SimpleImputer(strategy='constant', copy=False, fill_value=fill_value)
    transformer = ColumnTransformer(
        transformers=[('numerical_imputer', numerical_imputer,
                       [i for i, c in enumerate(dataset_info.categorical_features) if not c]),
                      ('categorical_imputer', categorical_imputer,
                       [i for i, c in enumerate(dataset_info.categorical_features) if c])])
    transformer.fit(X[train_indices])
    X = transformer.transform(X)

    dataset_info.categorical_features = sorted(dataset_info.categorical_features)
    return {'X': X, 'imputation_preprocessor': transformer,
            'dataset_info': dataset_info, 'all_nan_columns': all_nan}
Example #7
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License

def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0]))
Example #8
Source File: common_tabular_tests.py From interpret-community with MIT License

def _get_transformations_many_to_many(self, feature_names):
    # Instantiate data mapper with many to many transformer support and test
    # whether the feature map is generated

    # IdentityTransformer is our custom transformer, so not recognized as one to many
    transformations = [
        ("column_0_1_2_3", Pipeline([
            ("scaler", StandardScaler()),
            ("identity", IdentityTransformer())]), [f for f in feature_names[:-2]]),
        ("column_4_5", StandardScaler(), [f for f in feature_names[-2:]])
    ]

    # add transformations with pandas index types
    transformations.append(
        ("pandas_index_columns", "passthrough",
         pd.Index([feature_names[0], feature_names[1]])))

    column_transformer = ColumnTransformer(transformations)
    return column_transformer
Example #9
Source File: test_encoders.py From category_encoders with BSD 3-Clause "New" or "Revised" License

def test_column_transformer(self):
    # see issue #169
    # HashingEncoder does not accept handle_missing parameter
    for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):
        with self.subTest(encoder_name=encoder_name):

            # we can only test one data type at once. Here, we test string columns.
            tested_columns = ['unique_str', 'invariant', 'underscore', 'none', 'extra']

            # ColumnTransformer instantiates the encoder twice -> we have to make
            # sure the encoder settings are correctly passed
            ct = ColumnTransformer([
                ("dummy_encoder_name",
                 getattr(encoders, encoder_name)(handle_missing="return_nan"),
                 tested_columns)
            ])
            obtained = ct.fit_transform(X, y)

            # the old-school approach
            enc = getattr(encoders, encoder_name)(handle_missing="return_nan", return_df=False)
            expected = enc.fit_transform(X[tested_columns], y)

            np.testing.assert_array_equal(obtained, expected)
Example #10
Source File: 03_fit_predict_plot_midwest_survey.py From dirty_cat with BSD 3-Clause "New" or "Revised" License

def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])
    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over encoding methods, scoring the different pipeline predictions
# using a cross validation score:
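The scoring loop the comment refers to is not part of this excerpt; the following is a plausible sketch of it, assuming the standard cross_val_score API and treating the method names, X, y, and all_scores as placeholders:

from sklearn.model_selection import cross_val_score

# Hypothetical driver loop for the excerpt above; the encoding-method names,
# X, and y come from elsewhere in the original script.
all_scores = {}
for method in ['one-hot', 'similarity']:  # placeholder method names
    pipeline = make_pipeline(method)
    scores = cross_val_score(pipeline, X, y, cv=5)
    all_scores[method] = scores.mean()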
Example #11
Source File: 02_fit_predict_plot_employee_salaries.py From dirty_cat with BSD 3-Clause "New" or "Revised" License

def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])
    return pipeline


#########################################################################
# Fitting each encoding method with a RidgeCV
# --------------------------------------------
# Finally, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score:
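As in the previous example, the cross-validation loop lives outside this excerpt; a hedged sketch using cross_validate, where methods, X, y, and results are placeholder names:

from sklearn.model_selection import KFold, cross_validate

# Hypothetical driver loop; a fresh pipeline is instantiated per encoding method.
results = {}
cv = KFold(n_splits=5, shuffle=True, random_state=5)
for method in methods:  # placeholder list of encoding-method names
    pipeline = make_pipeline(method)
    scores = cross_validate(pipeline, X, y, cv=cv)
    results[method] = scores['test_score'].mean()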
Example #12
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License

def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, str) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])
Example #13
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License

def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
Example #14
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License

def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
Example #15
Source File: test_column_transformer.py From Mastering-Elasticsearch-7.0 with MIT License

def test_column_transformer_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder=SparseMatrixTrans(),
                           sparse_threshold=0.8)

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    exp_array = np.hstack(
        (X_array[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(X_trans.toarray(), exp_array)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
Example #16
Source File: repeatingbasis.py From scikit-lego with MIT License

def fit(self, X, y=None):
    self.pipeline_ = ColumnTransformer(
        [
            (
                "repeatingbasis",
                _RepeatingBasisFunction(
                    n_periods=self.n_periods, input_range=self.input_range
                ),
                [self.column],
            )
        ],
        remainder=self.remainder,
    )

    self.pipeline_.fit(X, y)

    return self
Example #17
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0

def _filter(obj):
    if isinstance(obj, DataFrameMapper):
        obj.features = _filter_steps(obj.features)
        if hasattr(obj, "built_features"):
            if obj.built_features is not None:
                obj.built_features = _filter_steps(obj.built_features)
    elif isinstance(obj, ColumnTransformer):
        obj.transformers = _filter_steps(obj.transformers)
        obj.remainder = _filter(obj.remainder)
        if hasattr(obj, "transformers_"):
            obj.transformers_ = _filter_steps(obj.transformers_)
    elif isinstance(obj, FeatureUnion):
        obj.transformer_list = _filter_steps(obj.transformer_list)
    elif isinstance(obj, Pipeline):
        obj.steps = _filter_steps(obj.steps)
    elif isinstance(obj, SelectorMixin):
        return SelectorProxy(obj)
    elif isinstance(obj, list):
        return [_filter(e) for e in obj]
    return obj
Example #18
Source File: xgboost.py From sklearn2pmml with GNU Affero General Public License v3.0

def make_xgboost_column_transformer(dtypes, missing_value_aware = True):
    """Construct a ColumnTransformer for feeding complex data into an XGBModel.

    Parameters
    ----------
    dtypes: dict-like mapping of column to dtype
        Anything whose ``items()`` yields (column, dtype) pairs,
        e.g. ``DataFrame.dtypes``.

    missing_value_aware: boolean
        If true, use missing value aware transformers.

    Returns
    -------
    ColumnTransformer

    """
    transformers = list()
    for column, dtype in dtypes.items():
        if _is_categorical(dtype):
            transformers.append((str(column),
                                 PMMLLabelBinarizer(sparse_output = True) if missing_value_aware
                                 else Pipeline([("ordinal_encoder", OrdinalEncoder()),
                                                ("one_hot_encoder", OneHotEncoder())]),
                                 [column]))
        else:
            transformers.append((str(column), "passthrough", [column]))
    return ColumnTransformer(transformers, remainder = "drop")
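A short usage sketch (not from the original file): since the loop calls dtypes.items(), passing DataFrame.dtypes — a pandas Series — fits naturally; this assumes _is_categorical recognizes pandas categorical dtypes.

import pandas as pd

# Hypothetical input frame: one numeric and one categorical column.
df = pd.DataFrame({'f1': [1.0, 2.0], 'f2': pd.Categorical(['a', 'b'])})

transformer = make_xgboost_column_transformer(df.dtypes, missing_value_aware=True)
Xt = transformer.fit_transform(df)  # binarized 'f2' plus passthrough 'f1'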
Example #19
Source File: test_sklearn_concat.py From sklearn-onnx with MIT License

def _column_tranformer_fitted_from_df(data):
    def transformer_for_column(column: pd.Series):
        if column.dtype in ['float64', 'float32', 'int64']:
            return MinMaxScaler()
        if column.dtype in ['bool']:
            return 'passthrough'
        if column.dtype in ['O']:
            try:
                return OneHotEncoder(drop='first')
            except TypeError:
                # older version of scikit-learn
                return OneHotEncoder()
        raise ValueError(
            'Unexpected column dtype for {column.name}:{column.dtype}'.format(
                column=column))

    return ColumnTransformer(
        [(col, transformer_for_column(data[col]), [col])
         for col in data.columns],
        remainder='drop'
    ).fit(data)
Example #20
Source File: test_sklearn_tfidf_vectorizer_converter.py From sklearn-onnx with MIT License

def test_model_tfidf_vectorizer11_compose(self):
    corpus = numpy.array([
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]).reshape((4, 1))
    corpus = numpy.hstack([corpus, corpus])
    y = numpy.array([0, 1, 0, 1])
    model = ColumnTransformer([
        ('a', TfidfVectorizer(), 0),
        ('b', TfidfVectorizer(), 1),
    ])
    model.fit(corpus, y)
    model_onnx = convert_sklearn(model, "TfIdfcomp",
                                 [("input", StringTensorType([4, 2]))],
                                 options=self.get_options())
    sess = InferenceSession(model_onnx.SerializeToString())
    res = sess.run(None, {'input': corpus})[0]
    exp = model.transform(corpus)
    assert_almost_equal(res, exp)
Example #21
Source File: test_sklearn_pipeline.py From sklearn-onnx with MIT License

def test_column_transformer_passthrough_no_weights(self):
    model, X = fit_classification_model(
        ColumnTransformer(
            [('pca', PCA(n_components=5), slice(0, 10)),
             ('svd', TruncatedSVD(n_components=5), slice(70, 80))],
            remainder='passthrough'),
        3, n_features=100)
    model_onnx = convert_sklearn(
        model,
        "column transformer passthrough",
        [("input", FloatTensorType([None, X.shape[1]]))],
        dtype=numpy.float32,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnColumnTransformerPassthroughNoWeights",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example #22
Source File: test_sklearn_pipeline.py From sklearn-onnx with MIT License

def test_column_transformer_weights(self):
    model, X = fit_classification_model(
        ColumnTransformer(
            [('pca', PCA(n_components=5), slice(0, 10)),
             ('svd', TruncatedSVD(n_components=5), slice(10, 100))],
            transformer_weights={'pca': 2, 'svd': 3}),
        3, n_features=100)
    model_onnx = convert_sklearn(
        model,
        "column transformer weights",
        [("input", FloatTensorType([None, X.shape[1]]))],
        dtype=numpy.float32,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnColumnTransformerWeights-Dec4",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example #23
Source File: test_sklearn_pipeline.py From sklearn-onnx with MIT License

def test_column_transformer_drop(self):
    model, X = fit_classification_model(
        ColumnTransformer(
            [('pca', PCA(n_components=5), slice(0, 10)),
             ('svd', TruncatedSVD(n_components=5), slice(80, 100))],
            remainder='drop'),
        3, n_features=100)
    model_onnx = convert_sklearn(
        model,
        "column transformer drop",
        [("input", FloatTensorType([None, X.shape[1]]))],
        dtype=numpy.float32,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnColumnTransformerDrop",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example #24
Source File: test_sklearn_array_feature_extractor.py From sklearn-onnx with MIT License

def test_array_feature_extractor(self):
    data_to_cluster = pd.DataFrame(
        [[1, 2, 3.5, 4.5], [1, 2, 1.7, 4.0],
         [2, 4, 2.4, 4.3], [2, 4, 2.5, 4.0]],
        columns=[1, 2, 3, 4])
    cat_attributes_clustering = [1, 2]
    num_attributes_clustering = [3, 4]  # this is of length 12 in reality
    gmm = GaussianMixture(n_components=2, random_state=1)
    ohe_cat = [OneHotEncoder(categories='auto', sparse=False, drop=None)
               for i in cat_attributes_clustering]
    ct_cat = ColumnTransformer([
        ("oneHotEncoder" + str(i), ohe_cat[i], [i])
        for i, item in enumerate(cat_attributes_clustering)
    ], remainder='passthrough')
    onehotencoding_pipeline = Pipeline([("columnTransformer", ct_cat), ])
    clustering_pipeline = Pipeline([
        ('onehotencoder_and_scaler', onehotencoding_pipeline),
        ('clustering', gmm)])
    clustering_pipeline.fit(X=data_to_cluster)
    initial_type = [
        ('float_input', FloatTensorType(
            [None, len([*cat_attributes_clustering,
                        *num_attributes_clustering])]))]
    data = data_to_cluster.values.astype(np.float32)

    # checks the first step
    model_onnx = to_onnx(
        clustering_pipeline.steps[0][1],
        initial_types=initial_type,
        target_opset=TARGET_OPSET,
        dtype=np.float32)
    dump_data_and_model(
        data, clustering_pipeline.steps[0][1], model_onnx,
        basename="SklearnArrayFeatureExtractorStep0")

    # checks the whole pipeline
    model_onnx = to_onnx(
        clustering_pipeline,
        initial_types=initial_type,
        target_opset=TARGET_OPSET,
        dtype=np.float32)
    dump_data_and_model(
        data, clustering_pipeline, model_onnx,
        basename="SklearnArrayFeatureExtractor")
Example #25
Source File: test_investigate.py From sklearn-onnx with MIT License

def test_simple_column_transformer(self):
    if ColumnTransformer is None:
        return
    data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                       dtype=numpy.float32)
    model = ColumnTransformer([("scaler1", StandardScaler(), [0]),
                               ("scaler2", RobustScaler(), [1])])
    model.fit(data)
    all_models = list(enumerate_pipeline_models(model))
    steps = collect_intermediate_steps(model, "column transformer",
                                       [("input", FloatTensorType([None, 2]))])
    assert len(steps) == 2
    assert len(all_models) == 3

    model.transform(data)
    for step in steps:
        onnx_step = step['onnx_step']
        sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
        onnx_outputs = sess.run(None, {'input': data})
        onnx_output = onnx_outputs[0]
        skl_outputs = step['model']._debug.outputs['transform']
        assert_almost_equal(onnx_output, skl_outputs)
        compare_objects(onnx_output.tolist(), skl_outputs.tolist())
Example #26
Source File: test_xgboost_pipeline.py From onnxmltools with MIT License

def _column_tranformer_fitted_from_df(self, data):
    def transformer_for_column(column):
        if column.dtype in ['float64', 'float32']:
            return MinMaxScaler()
        if column.dtype in ['bool']:
            return 'passthrough'
        if column.dtype in ['O']:
            return OneHotEncoder(sparse=False)
        raise ValueError()

    return ColumnTransformer(
        [(col, transformer_for_column(data[col]), [col])
         for col in data.columns],
        remainder='drop'
    ).fit(data)
Example #27
Source File: tabular_nn_model.py From autogluon with Apache License 2.0

def _create_preprocessor(self, impute_strategy, max_category_levels):
    """ Defines the data encoders used to preprocess different data types and creates
        the instance variable which is a sklearn ColumnTransformer object """
    if self.processor is not None:
        # Note: warnings.warn actually emits the message; the original code constructed
        # a bare Warning object, which has no effect.
        warnings.warn("Attempting to process training data for TabularNeuralNetModel, "
                      "but previously already did this.")
    continuous_features = self.types_of_features['continuous']
    skewed_features = self.types_of_features['skewed']
    onehot_features = self.types_of_features['onehot']
    embed_features = self.types_of_features['embed']
    language_features = self.types_of_features['language']
    transformers = []  # order of various column transformers in this list is important!
    if len(continuous_features) > 0:
        continuous_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=impute_strategy)),
            ('scaler', StandardScaler())])
        transformers.append(('continuous', continuous_transformer, continuous_features))
    if len(skewed_features) > 0:
        power_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=impute_strategy)),
            ('quantile', QuantileTransformer(output_distribution='normal'))])  # Or output_distribution = 'uniform'
        # TODO: remove old code: ('power', PowerTransformer(method=self.params['proc.power_transform_method']))
        transformers.append(('skewed', power_transformer, skewed_features))
    if len(onehot_features) > 0:
        onehot_transformer = Pipeline(steps=[
            # TODO: Consider avoiding converting to string for improved memory efficiency
            ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
            # test-time unknown values will be encoded as an all-zeros vector
            ('onehot', OneHotMergeRaresHandleUnknownEncoder(max_levels=max_category_levels, sparse=False))])
        transformers.append(('onehot', onehot_transformer, onehot_features))
    if len(embed_features) > 0:
        # Ordinal transformer applied to convert to-be-embedded categorical features to integer levels
        ordinal_transformer = Pipeline(steps=[
            ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
            # returns 0-n when max_category_levels = n-1; category n is reserved for unknown test-time categories
            ('ordinal', OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))])
        transformers.append(('ordinal', ordinal_transformer, embed_features))
    if len(language_features) > 0:
        raise NotImplementedError("language_features cannot be used at the moment")
    # numeric features are processed in the same order as in numeric_features vector, so feature names remain the same
    return ColumnTransformer(transformers=transformers)
Example #28
Source File: test_model_performance.py From DALEX with GNU General Public License v3.0

def setUp(self):
    data = dx.datasets.load_titanic()
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                       max_iter=400, random_state=0))])

    clf.fit(self.X, self.y)

    self.exp = dx.Explainer(clf, self.X, self.y, verbose=False)
    self.exp2 = dx.Explainer(clf, self.X, self.y, label="model2", verbose=False)
Example #29
Source File: test_predict.py From DALEX with GNU General Public License v3.0

def setUp(self):
    data = dx.datasets.load_titanic()
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPRegressor(hidden_layer_sizes=(150, 100, 50),
                                                      max_iter=500, random_state=0))])

    clf.fit(self.X, self.y)

    self.exp = dx.Explainer(clf, self.X, self.y, verbose=False)
Example #30
Source File: test_aggregated_profiles.py From DALEX with GNU General Public License v3.0

def setUp(self):
    data = dx.datasets.load_titanic()
    data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)
    self.X = data.drop(columns='survived')
    self.y = data.survived

    numeric_features = ['age', 'fare', 'sibsp', 'parch']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    categorical_features = ['gender', 'class', 'embarked']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MLPClassifier(hidden_layer_sizes=(20, 20),
                                                       max_iter=400, random_state=0))])
    clf2 = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                        max_iter=400, random_state=0))])

    clf.fit(self.X, self.y)
    clf2.fit(self.X, self.y)

    self.exp = dx.Explainer(clf, self.X, self.y, label="model1", verbose=False)
    self.exp2 = dx.Explainer(clf2, self.X, self.y, verbose=False)
    self.exp3 = dx.Explainer(clf, self.X, self.y, label="model3", verbose=False)