Python sklearn.compose.ColumnTransformer() Examples

The following are 30 code examples of sklearn.compose.ColumnTransformer(), each drawn from an open-source project; the Source File line above each example names the original project, file, and license. You may also want to check out all available functions/classes of the module sklearn.compose, or try the search function.
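Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of the basic pattern: each entry is a (name, transformer, columns) triple, and the transformed blocks are concatenated column-wise in the order the transformers are listed.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = np.array([[0.0, 1.0, 'a'],
              [1.0, 3.0, 'b']], dtype=object)
ct = ColumnTransformer([
    ('num', StandardScaler(), [0, 1]),  # scale the two numeric columns
    ('cat', OneHotEncoder(), [2]),      # one-hot encode the string column
])
X_out = ct.fit_transform(X)  # scaled columns first, then the one-hot block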
Example #1
Source File: common_tabular_tests.py    From interpret-community with MIT License
def _get_transformations_one_to_many_greater(self, feature_names):
        # results in number of features greater than original features
        # copy all features except last one. For last one, replicate columns to create 3 more features
        transformations = []
        feature_names = list(feature_names)
        index = 0
        for f in feature_names[:-1]:
            transformations.append(("{}".format(index), "passthrough", [f]))
            index += 1

        def copy_func(x):
            return np.tile(x, (1, 3))

        copy_transformer = FunctionTransformer(copy_func)

        transformations.append(("copy_transformer", copy_transformer, [feature_names[-1]]))

        return ColumnTransformer(transformations) 
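For illustration, a minimal sketch of what this factory produces on toy data, using integer column indices on a NumPy array instead of the feature-name strings the original passes: with four input features, three are passed through and the last is tiled into three identical copies, giving six output columns.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

X = np.arange(8.0).reshape(2, 4)
ct = ColumnTransformer([
    ('0', 'passthrough', [0]),
    ('1', 'passthrough', [1]),
    ('2', 'passthrough', [2]),
    ('copy_transformer', FunctionTransformer(lambda x: np.tile(x, (1, 3))), [3]),
])
print(ct.fit_transform(X).shape)  # (2, 6): three passthrough columns + three copies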
Example #2
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) 
Example #3
Source File: normalization_strategy_selector.py    From Auto-PyTorch with Apache License 2.0
def fit(self, hyperparameter_config, X, train_indices, dataset_info):
        hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

        normalizer_name = hyperparameter_config['normalization_strategy']

        if normalizer_name == 'none':
            return {'normalizer': None}

        if isinstance(X, csr_matrix):
            normalizer = self.normalization_strategies[normalizer_name](with_mean=False)
        else:
            normalizer = self.normalization_strategies[normalizer_name]()
        
        transformer = ColumnTransformer(transformers=[("normalize", normalizer, [i for i, c in enumerate(dataset_info.categorical_features) if not c])],
                                        remainder='passthrough')

        transformer.fit(X[train_indices])

        X = transformer.transform(X)
        
        # the transformed output places the normalized (non-categorical) columns first and the
        # passthrough categorical columns last, so re-sort the boolean mask to match (False < True)
        dataset_info.categorical_features = sorted(dataset_info.categorical_features)

        return {'X': X, 'normalizer': transformer, 'dataset_info': dataset_info} 
Example #4
Source File: one_hot_encoding.py    From Auto-PyTorch with Apache License 2.0
def fit(self, pipeline_config, X, Y, dataset_info):
        categorical_features = dataset_info.categorical_features
        ohe = OneHotEncoder(categories="auto", sparse=False, handle_unknown="ignore")
        encoder = ColumnTransformer(transformers=[("ohe", ohe, [i for i, f in enumerate(categorical_features) if f])], remainder="passthrough")
        encoder.categories_ = np.array([])
        encoder.categorical_features = categorical_features

        if any(categorical_features) and not dataset_info.is_sparse:
            # encode X
            X = encoder.fit_transform(X)
            encoder.categories_ = encoder.transformers_[0][1].categories_

        # Y to matrix
        Y, y_encoder = self.complete_y_tranformation(Y)

        dataset_info.categorical_features = None
        return {'X': X, 'one_hot_encoder': encoder, 'Y': Y, 'y_one_hot_encoder': y_encoder, 'dataset_info': dataset_info} 
Example #5
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both) 
Example #6
Source File: imputation.py    From Auto-PyTorch with Apache License 2.0
def fit(self, hyperparameter_config, X, train_indices, dataset_info):
        hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

        if dataset_info.is_sparse:
            return {'imputation_preprocessor': None, 'all_nan_columns': None}

        # delete all nan columns
        all_nan = np.all(np.isnan(X), axis=0)
        X = X[:, ~all_nan]
        dataset_info.categorical_features = [dataset_info.categorical_features[i] for i, is_nan in enumerate(all_nan) if not is_nan]

        strategy = hyperparameter_config['strategy']
        fill_value = int(np.nanmax(X)) + 1 if not dataset_info.is_sparse else 0
        numerical_imputer = SimpleImputer(strategy=strategy, copy=False)
        categorical_imputer = SimpleImputer(strategy='constant', copy=False, fill_value=fill_value)
        transformer = ColumnTransformer(
            transformers=[('numerical_imputer', numerical_imputer, [i for i, c in enumerate(dataset_info.categorical_features) if not c]),
                          ('categorical_imputer', categorical_imputer,  [i for i, c in enumerate(dataset_info.categorical_features) if c])])
        transformer.fit(X[train_indices])
        X = transformer.transform(X)
        
        dataset_info.categorical_features = sorted(dataset_info.categorical_features)
        return { 'X': X, 'imputation_preprocessor': transformer, 'dataset_info': dataset_info , 'all_nan_columns': all_nan} 
Example #7
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0])) 
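Note on the two thresholds: sparse_threshold is the densification cut-off. When at least one transformer output is sparse, ColumnTransformer keeps the stacked result sparse only if its overall density is below the threshold; here the density is 5/12 (about 0.42), so the result stays sparse at 0.8 but is converted to dense at 0.1.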
Example #8
Source File: common_tabular_tests.py    From interpret-community with MIT License
def _get_transformations_many_to_many(self, feature_names):
        # Instantiate data mapper with many to many transformer support and test whether the feature map is generated

        # IdentityTransformer is our custom transformer, so not recognized as one to many
        transformations = [
            ("column_0_1_2_3", Pipeline([
                ("scaler", StandardScaler()),
                ("identity", IdentityTransformer())]), [f for f in feature_names[:-2]]),
            ("column_4_5", StandardScaler(), [f for f in feature_names[-2:]])
        ]

        # add transformations with pandas index types
        transformations.append(("pandas_index_columns", "passthrough",
                                pd.Index([feature_names[0], feature_names[1]])))

        column_transformer = ColumnTransformer(transformations)

        return column_transformer 
Example #9
Source File: test_encoders.py    From category_encoders with BSD 3-Clause "New" or "Revised" License
def test_column_transformer(self):
        # see issue #169
        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder does not accept handle_missing parameter
            with self.subTest(encoder_name=encoder_name):

                # we can only test one data type at once. Here, we test string columns.
                tested_columns = ['unique_str', 'invariant', 'underscore', 'none', 'extra']

                # ColumnTransformer instantiates the encoder twice -> we have to make sure the encoder settings are correctly passed
                ct = ColumnTransformer([
                    ("dummy_encoder_name", getattr(encoders, encoder_name)(handle_missing="return_nan"), tested_columns)
                ])
                obtained = ct.fit_transform(X, y)

                # the old-school approach
                enc = getattr(encoders, encoder_name)(handle_missing="return_nan", return_df=False)
                expected = enc.fit_transform(X[tested_columns], y)

                np.testing.assert_array_equal(obtained, expected) 
Example #10
Source File: 03_fit_predict_plot_midwest_survey.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])

    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over the encoding methods, scoring each resulting
# pipeline with cross-validation: 
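The loop itself is truncated in this excerpt; a plausible sketch, assuming encoding_methods, X, and y are defined earlier in the script:

from sklearn.model_selection import cross_val_score

all_scores = {}
for method in encoding_methods:  # hypothetical name: e.g. the keys of encoder_dict
    pipeline = make_pipeline(method)
    scores = cross_val_score(pipeline, X, y, cv=5)
    all_scores[method] = scores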
Example #11
Source File: 02_fit_predict_plot_employee_salaries.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])
    return pipeline


#########################################################################
# Fitting each encoding method with a RidgeCV
# --------------------------------------------
# Finally, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score: 
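The truncated loop mirrors the cross-validation sketch after Example #10, here scoring the RidgeCV pipeline with a regression metric such as r2.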
Example #12
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, str) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1]) 
Example #13
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2]) 
Example #14
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2]) 
Example #15
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder=SparseMatrixTrans(),
                           sparse_threshold=0.8)

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    exp_array = np.hstack(
        (X_array[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(X_trans.toarray(), exp_array)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2]) 
Example #16
Source File: repeatingbasis.py    From scikit-lego with MIT License
def fit(self, X, y=None):
        self.pipeline_ = ColumnTransformer(
            [
                (
                    "repeatingbasis",
                    _RepeatingBasisFunction(
                        n_periods=self.n_periods, input_range=self.input_range
                    ),
                    [self.column],
                )
            ],
            remainder=self.remainder,
        )

        self.pipeline_.fit(X, y)

        return self 
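A hedged usage sketch of the estimator this fit method belongs to (scikit-lego's RepeatingBasisFunction from sklego.preprocessing, whose constructor takes column, n_periods, input_range, and remainder):

import numpy as np
import pandas as pd
from sklego.preprocessing import RepeatingBasisFunction

df = pd.DataFrame({'day_of_year': np.arange(1, 366)})
rbf = RepeatingBasisFunction(column='day_of_year', n_periods=12,
                             input_range=(1, 365), remainder='drop')
X_out = rbf.fit(df).transform(df)  # shape (365, 12): one repeating basis bump per period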
Example #17
Source File: __init__.py    From sklearn2pmml with GNU Affero General Public License v3.0
def _filter(obj):
	if isinstance(obj, DataFrameMapper):
		obj.features = _filter_steps(obj.features)
		if hasattr(obj, "built_features"):
			if obj.built_features is not None:
				obj.built_features = _filter_steps(obj.built_features)
	elif isinstance(obj, ColumnTransformer):
		obj.transformers = _filter_steps(obj.transformers)
		obj.remainder = _filter(obj.remainder)
		if hasattr(obj, "transformers_"):
			obj.transformers_ = _filter_steps(obj.transformers_)
	elif isinstance(obj, FeatureUnion):
		obj.transformer_list = _filter_steps(obj.transformer_list)
	elif isinstance(obj, Pipeline):
		obj.steps = _filter_steps(obj.steps)
	elif isinstance(obj, SelectorMixin):
		return SelectorProxy(obj)
	elif isinstance(obj, list):
		return [_filter(e) for e in obj]
	return obj 
Example #18
Source File: xgboost.py    From sklearn2pmml with GNU Affero General Public License v3.0
def make_xgboost_column_transformer(dtypes, missing_value_aware = True):
	"""Construct a ColumnTransformer for feeding complex data into an XGBModel.

	Parameters
	----------

	dtypes: iterable of tuples (column, dtype)

	missing_value_aware: boolean
		If true, use missing value aware transformers.

	Returns
	-------
	ColumnTransformer

	"""
	transformers = list()
	for column, dtype in dtypes.items():
		if _is_categorical(dtype):
			transformers.append((str(column), PMMLLabelBinarizer(sparse_output = True) if missing_value_aware else Pipeline([("ordinal_encoder", OrdinalEncoder()), ("one_hot_encoder", OneHotEncoder())]), [column]))
		else:
			transformers.append((str(column), "passthrough", [column]))
	return ColumnTransformer(transformers, remainder = "drop") 
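A hedged usage sketch, assuming a pandas DataFrame whose dtypes serve as the (column, dtype) iterable the docstring describes, and that the module's _is_categorical helper recognizes pandas categorical dtypes:

import pandas as pd

df = pd.DataFrame({'age': [23, 45, 31],
                   'job': pd.Categorical(['clerk', 'nurse', 'clerk'])})
ct = make_xgboost_column_transformer(df.dtypes, missing_value_aware=False)
X_out = ct.fit_transform(df)  # 'job' is ordinal+one-hot encoded, 'age' passes through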
Example #19
Source File: test_sklearn_concat.py    From sklearn-onnx with MIT License
def _column_tranformer_fitted_from_df(data):
    def transformer_for_column(column: pd.Series):
        if column.dtype in ['float64', 'float32', 'int64']:
            return MinMaxScaler()
        if column.dtype in ['bool']:
            return 'passthrough'
        if column.dtype in ['O']:
            try:
                return OneHotEncoder(drop='first')
            except TypeError:
                # older version of scikit-learn
                return OneHotEncoder()
        raise ValueError(
            'Unexpected column dtype for {column.name}:{column.dtype}'.format(
                column=column))

    return ColumnTransformer(
        [(col, transformer_for_column(
            data[col]), [col]) for col in data.columns],
        remainder='drop'
    ).fit(data) 
Example #20
Source File: test_sklearn_tfidf_vectorizer_converter.py    From sklearn-onnx with MIT License
def test_model_tfidf_vectorizer11_compose(self):
        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "And this is the third one.",
            "Is this the first document?",
        ]).reshape((4, 1))
        corpus = numpy.hstack([corpus, corpus])
        y = numpy.array([0, 1, 0, 1])
        model = ColumnTransformer([
            ('a', TfidfVectorizer(), 0),
            ('b', TfidfVectorizer(), 1),
        ])
        model.fit(corpus, y)
        model_onnx = convert_sklearn(model, "TfIdfcomp",
                                     [("input", StringTensorType([4, 2]))],
                                     options=self.get_options())
        sess = InferenceSession(model_onnx.SerializeToString())
        res = sess.run(None, {'input': corpus})[0]
        exp = model.transform(corpus)
        assert_almost_equal(res, exp) 
Example #21
Source File: test_sklearn_pipeline.py    From sklearn-onnx with MIT License
def test_column_transformer_passthrough_no_weights(self):
        model, X = fit_classification_model(
            ColumnTransformer(
                [('pca', PCA(n_components=5), slice(0, 10)),
                 ('svd', TruncatedSVD(n_components=5), slice(70, 80))],
                remainder='passthrough'), 3, n_features=100)
        model_onnx = convert_sklearn(
            model,
            "column transformer passthrough",
            [("input", FloatTensorType([None, X.shape[1]]))],
            dtype=numpy.float32,
        )
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X,
            model,
            model_onnx,
            basename="SklearnColumnTransformerPassthroughNoWeights",
            allow_failure="StrictVersion(onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #22
Source File: test_sklearn_pipeline.py    From sklearn-onnx with MIT License
def test_column_transformer_weights(self):
        model, X = fit_classification_model(
            ColumnTransformer(
                [('pca', PCA(n_components=5), slice(0, 10)),
                 ('svd', TruncatedSVD(n_components=5), slice(10, 100))],
                transformer_weights={'pca': 2, 'svd': 3}), 3, n_features=100)
        model_onnx = convert_sklearn(
            model,
            "column transformer weights",
            [("input", FloatTensorType([None, X.shape[1]]))],
            dtype=numpy.float32,
        )
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X,
            model,
            model_onnx,
            basename="SklearnColumnTransformerWeights-Dec4",
            allow_failure="StrictVersion(onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #23
Source File: test_sklearn_pipeline.py    From sklearn-onnx with MIT License
def test_column_transformer_drop(self):
        model, X = fit_classification_model(
            ColumnTransformer(
                [('pca', PCA(n_components=5), slice(0, 10)),
                 ('svd', TruncatedSVD(n_components=5), slice(80, 100))],
                remainder='drop'), 3, n_features=100)
        model_onnx = convert_sklearn(
            model,
            "column transformer drop",
            [("input", FloatTensorType([None, X.shape[1]]))],
            dtype=numpy.float32,
        )
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X,
            model,
            model_onnx,
            basename="SklearnColumnTransformerDrop",
            allow_failure="StrictVersion(onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #24
Source File: test_sklearn_array_feature_extractor.py    From sklearn-onnx with MIT License
def test_array_feature_extractor(self):
        data_to_cluster = pd.DataFrame(
            [[1, 2, 3.5, 4.5], [1, 2, 1.7, 4.0],
             [2, 4, 2.4, 4.3], [2, 4, 2.5, 4.0]],
            columns=[1, 2, 3, 4])
        cat_attributes_clustering = [1, 2]
        num_attributes_clustering = [3, 4]  # this is of length 12 in reality
        gmm = GaussianMixture(n_components=2, random_state=1)
        ohe_cat = [OneHotEncoder(categories='auto', sparse=False, drop=None)
                   for i in cat_attributes_clustering]
        ct_cat = ColumnTransformer([
            ("oneHotEncoder" + str(i), ohe_cat[i], [i])
            for i, item in enumerate(cat_attributes_clustering)
        ], remainder='passthrough')
        onehotencoding_pipeline = Pipeline([("columnTransformer", ct_cat), ])
        clustering_pipeline = Pipeline([
            ('onehotencoder_and_scaler', onehotencoding_pipeline),
            ('clustering', gmm)])
        clustering_pipeline.fit(X=data_to_cluster)
        initial_type = [
            ('float_input', FloatTensorType(
                [None, len([*cat_attributes_clustering,
                            *num_attributes_clustering])]))]
        data = data_to_cluster.values.astype(np.float32)

        # checks the first step
        model_onnx = to_onnx(
            clustering_pipeline.steps[0][1], initial_types=initial_type,
            target_opset=TARGET_OPSET, dtype=np.float32)
        dump_data_and_model(
            data, clustering_pipeline.steps[0][1], model_onnx,
            basename="SklearnArrayFeatureExtractorStep0")

        # checks the whole pipeline
        model_onnx = to_onnx(
            clustering_pipeline, initial_types=initial_type,
            target_opset=TARGET_OPSET, dtype=np.float32)
        dump_data_and_model(
            data, clustering_pipeline, model_onnx,
            basename="SklearnArrayFeatureExtractor") 
Example #25
Source File: test_investigate.py    From sklearn-onnx with MIT License
def test_simple_column_transformer(self):
        if ColumnTransformer is None:
            return
        data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                           dtype=numpy.float32)
        model = ColumnTransformer([("scaler1", StandardScaler(), [0]),
                                  ("scaler2", RobustScaler(), [1])])
        model.fit(data)
        all_models = list(enumerate_pipeline_models(model))

        steps = collect_intermediate_steps(model, "column transformer",
                                           [("input",
                                             FloatTensorType([None, 2]))])

        assert len(steps) == 2
        assert len(all_models) == 3

        model.transform(data)
        for step in steps:
            onnx_step = step['onnx_step']
            sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
            onnx_outputs = sess.run(None, {'input': data})
            onnx_output = onnx_outputs[0]
            skl_outputs = step['model']._debug.outputs['transform']
            assert_almost_equal(onnx_output, skl_outputs)
            compare_objects(onnx_output.tolist(), skl_outputs.tolist()) 
Example #26
Source File: test_xgboost_pipeline.py    From onnxmltools with MIT License
def _column_tranformer_fitted_from_df(self, data):
        def transformer_for_column(column):
            if column.dtype in ['float64', 'float32']:
                return MinMaxScaler()
            if column.dtype in ['bool']:
                return 'passthrough'
            if column.dtype in ['O']:
                return OneHotEncoder(sparse=False)
            raise ValueError()

        return ColumnTransformer(
            [(col, transformer_for_column(data[col]), [col]) for col in data.columns],
            remainder='drop'
        ).fit(data) 
Example #27
Source File: tabular_nn_model.py    From autogluon with Apache License 2.0
def _create_preprocessor(self, impute_strategy, max_category_levels):
        """ Defines data encoders used to preprocess different data types and creates instance variable which is sklearn ColumnTransformer object """
        if self.processor is not None:
            warnings.warn("Attempting to process training data for TabularNeuralNetModel, but previously already did this.")
        continuous_features = self.types_of_features['continuous']
        skewed_features = self.types_of_features['skewed']
        onehot_features = self.types_of_features['onehot']
        embed_features = self.types_of_features['embed']
        language_features = self.types_of_features['language']
        transformers = [] # order of various column transformers in this list is important!
        if len(continuous_features) > 0:
            continuous_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('scaler', StandardScaler())])
            transformers.append( ('continuous', continuous_transformer, continuous_features) )
        if len(skewed_features) > 0:
            power_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('quantile', QuantileTransformer(output_distribution='normal')) ]) # Or output_distribution = 'uniform'
                # TODO: remove old code: ('power', PowerTransformer(method=self.params['proc.power_transform_method'])) ])
            transformers.append( ('skewed', power_transformer, skewed_features) )
        if len(onehot_features) > 0:
            onehot_transformer = Pipeline(steps=[
                # TODO: Consider avoiding converting to string for improved memory efficiency
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('onehot', OneHotMergeRaresHandleUnknownEncoder(max_levels=max_category_levels, sparse=False))]) # test-time unknown values will be encoded as all zeros vector
            transformers.append( ('onehot', onehot_transformer, onehot_features) )
        if len(embed_features) > 0: # Ordinal transformer applied to convert to-be-embedded categorical features to integer levels
            ordinal_transformer = Pipeline(steps=[
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('ordinal', OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))]) # returns 0-n when max_category_levels = n-1. category n is reserved for unknown test-time categories.
            transformers.append( ('ordinal', ordinal_transformer, embed_features) )
        if len(language_features) > 0:
            raise NotImplementedError("language_features cannot be used at the moment")
        return ColumnTransformer(transformers=transformers) # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same. 
Example #28
Source File: test_model_performance.py    From DALEX with GNU General Public License v3.0
def setUp(self):
        data = dx.datasets.load_titanic()
        data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)

        self.X = data.drop(columns='survived')
        self.y = data.survived

        numeric_features = ['age', 'fare', 'sibsp', 'parch']
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_features = ['gender', 'class', 'embarked']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                           max_iter=400, random_state=0))])

        clf.fit(self.X, self.y)

        self.exp = dx.Explainer(clf, self.X, self.y, verbose=False)
        self.exp2 = dx.Explainer(clf, self.X, self.y, label="model2", verbose=False) 
Example #29
Source File: test_predict.py    From DALEX with GNU General Public License v3.0
def setUp(self):
        data = dx.datasets.load_titanic()
        data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)

        self.X = data.drop(columns='survived')
        self.y = data.survived

        numeric_features = ['age', 'fare', 'sibsp', 'parch']
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_features = ['gender', 'class', 'embarked']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', MLPRegressor(hidden_layer_sizes=(150, 100, 50),
                                                          max_iter=500, random_state=0))])

        clf.fit(self.X, self.y)

        self.exp = dx.Explainer(clf, self.X, self.y, verbose=False) 
Example #30
Source File: test_aggregated_profiles.py    From DALEX with GNU General Public License v3.0
def setUp(self):
        data = dx.datasets.load_titanic()
        data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)

        self.X = data.drop(columns='survived')
        self.y = data.survived

        numeric_features = ['age', 'fare', 'sibsp', 'parch']
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_features = ['gender', 'class', 'embarked']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', MLPClassifier(hidden_layer_sizes=(20, 20),
                                                           max_iter=400, random_state=0))])
        clf2 = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                            max_iter=400, random_state=0))])

        clf.fit(self.X, self.y)
        clf2.fit(self.X, self.y)

        self.exp = dx.Explainer(clf, self.X, self.y, label="model1", verbose=False)
        self.exp2 = dx.Explainer(clf2, self.X, self.y, verbose=False)
        self.exp3 = dx.Explainer(clf, self.X, self.y, label="model3", verbose=False)