Python sklearn.compose.ColumnTransformer() Examples

The following are 30 code examples of sklearn.compose.ColumnTransformer(), each drawn from an open-source project; the Source File line above each example names the original project, file, and license. You may also want to check out all available functions/classes of the module sklearn.compose, or try the search function.
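Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of the basic pattern: each entry is a (name, transformer, columns) triple, and the transformed blocks are concatenated column-wise in the order the transformers are listed.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = np.array([[0.0, 1.0, 'a'],
              [1.0, 3.0, 'b']], dtype=object)
ct = ColumnTransformer([
    ('num', StandardScaler(), [0, 1]),  # scale the two numeric columns
    ('cat', OneHotEncoder(), [2]),      # one-hot encode the string column
])
X_out = ct.fit_transform(X)  # scaled columns first, then the one-hot block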
Example #1
Source File: common_tabular_tests.py    From interpret-community with MIT License
def _get_transformations_one_to_many_greater(self, feature_names):
        # results in number of features greater than original features
        # copy all features except last one. For last one, replicate columns to create 3 more features
        transformations = []
        feature_names = list(feature_names)
        index = 0
        for f in feature_names[:-1]:
            transformations.append(("{}".format(index), "passthrough", [f]))
            index += 1

        def copy_func(x):
            return np.tile(x, (1, 3))

        copy_transformer = FunctionTransformer(copy_func)

        transformations.append(("copy_transformer", copy_transformer, [feature_names[-1]]))

        return ColumnTransformer(transformations) 
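For illustration, a minimal sketch of what this factory produces on toy data, using integer column indices on a NumPy array instead of the feature-name strings the original passes: with four input features, three are passed through and the last is tiled into three identical copies, giving six output columns.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

X = np.arange(8.0).reshape(2, 4)
ct = ColumnTransformer([
    ('0', 'passthrough', [0]),
    ('1', 'passthrough', [1]),
    ('2', 'passthrough', [2]),
    ('copy_transformer', FunctionTransformer(lambda x: np.tile(x, (1, 3))), [3]),
])
print(ct.fit_transform(X).shape)  # (2, 6): three passthrough columns + three copies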
Example #2
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) 
Example #3
Source File: normalization_strategy_selector.py    From Auto-PyTorch with Apache License 2.0
def fit(self, hyperparameter_config, X, train_indices, dataset_info):
        hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

        normalizer_name = hyperparameter_config['normalization_strategy']

        if normalizer_name == 'none':
            return {'normalizer': None}

        if isinstance(X, csr_matrix):
            normalizer = self.normalization_strategies[normalizer_name](with_mean=False)
        else:
            normalizer = self.normalization_strategies[normalizer_name]()
        
        transformer = ColumnTransformer(transformers=[("normalize", normalizer, [i for i, c in enumerate(dataset_info.categorical_features) if not c])],
                                        remainder='passthrough')

        transformer.fit(X[train_indices])

        X = transformer.transform(X)
        
        # the transformed output places the normalized (non-categorical) columns first and the
        # passthrough categorical columns last, so re-sort the boolean mask to match (False < True)
        dataset_info.categorical_features = sorted(dataset_info.categorical_features)

        return {'X': X, 'normalizer': transformer, 'dataset_info': dataset_info} 
Example #4
Source File: one_hot_encoding.py    From Auto-PyTorch with Apache License 2.0
def fit(self, pipeline_config, X, Y, dataset_info):
        categorical_features = dataset_info.categorical_features
        ohe = OneHotEncoder(categories="auto", sparse=False, handle_unknown="ignore")
        encoder = ColumnTransformer(transformers=[("ohe", ohe, [i for i, f in enumerate(categorical_features) if f])], remainder="passthrough")
        encoder.categories_ = np.array([])
        encoder.categorical_features = categorical_features

        if any(categorical_features) and not dataset_info.is_sparse:
            # encode X
            X = encoder.fit_transform(X)
            encoder.categories_ = encoder.transformers_[0][1].categories_

        # Y to matrix
        Y, y_encoder = self.complete_y_tranformation(Y)

        dataset_info.categorical_features = None
        return {'X': X, 'one_hot_encoder': encoder, 'Y': Y, 'y_one_hot_encoder': y_encoder, 'dataset_info': dataset_info} 
Example #5
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both) 
Example #6
Source File: imputation.py    From Auto-PyTorch with Apache License 2.0
def fit(self, hyperparameter_config, X, train_indices, dataset_info):
        hyperparameter_config = ConfigWrapper(self.get_name(), hyperparameter_config)

        if dataset_info.is_sparse:
            return {'imputation_preprocessor': None, 'all_nan_columns': None}

        # delete all nan columns
        all_nan = np.all(np.isnan(X), axis=0)
        X = X[:, ~all_nan]
        dataset_info.categorical_features = [dataset_info.categorical_features[i] for i, is_nan in enumerate(all_nan) if not is_nan]

        strategy = hyperparameter_config['strategy']
        fill_value = int(np.nanmax(X)) + 1 if not dataset_info.is_sparse else 0
        numerical_imputer = SimpleImputer(strategy=strategy, copy=False)
        categorical_imputer = SimpleImputer(strategy='constant', copy=False, fill_value=fill_value)
        transformer = ColumnTransformer(
            transformers=[('numerical_imputer', numerical_imputer, [i for i, c in enumerate(dataset_info.categorical_features) if not c]),
                          ('categorical_imputer', categorical_imputer,  [i for i, c in enumerate(dataset_info.categorical_features) if c])])
        transformer.fit(X[train_indices])
        X = transformer.transform(X)
        
        dataset_info.categorical_features = sorted(dataset_info.categorical_features)
        return { 'X': X, 'imputation_preprocessor': transformer, 'dataset_info': dataset_info , 'all_nan_columns': all_nan} 
Example #7
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0])) 
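Note on the two thresholds: sparse_threshold is the densification cut-off. When at least one transformer output is sparse, ColumnTransformer keeps the stacked result sparse only if its overall density is below the threshold; here the density is 5/12 (about 0.42), so the result stays sparse at 0.8 but is converted to dense at 0.1.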
Example #8
Source File: common_tabular_tests.py    From interpret-community with MIT License
def _get_transformations_many_to_many(self, feature_names):
        # Instantiate data mapper with many to many transformer support and test whether the feature map is generated

        # IdentityTransformer is our custom transformer, so not recognized as one to many
        transformations = [
            ("column_0_1_2_3", Pipeline([
                ("scaler", StandardScaler()),
                ("identity", IdentityTransformer())]), [f for f in feature_names[:-2]]),
            ("column_4_5", StandardScaler(), [f for f in feature_names[-2:]])
        ]

        # add transformations with pandas index types
        transformations.append(("pandas_index_columns", "passthrough",
                                pd.Index([feature_names[0], feature_names[1]])))

        column_transformer = ColumnTransformer(transformations)

        return column_transformer 
Example #9
Source File: test_encoders.py    From category_encoders with BSD 3-Clause "New" or "Revised" License
def test_column_transformer(self):
        # see issue #169
        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder does not accept handle_missing parameter
            with self.subTest(encoder_name=encoder_name):

                # we can only test one data type at once. Here, we test string columns.
                tested_columns = ['unique_str', 'invariant', 'underscore', 'none', 'extra']

                # ColumnTransformer instantiates the encoder twice -> we have to make sure the encoder settings are correctly passed
                ct = ColumnTransformer([
                    ("dummy_encoder_name", getattr(encoders, encoder_name)(handle_missing="return_nan"), tested_columns)
                ])
                obtained = ct.fit_transform(X, y)

                # the old-school approach
                enc = getattr(encoders, encoder_name)(handle_missing="return_nan", return_df=False)
                expected = enc.fit_transform(X[tested_columns], y)

                np.testing.assert_array_equal(obtained, expected) 
Example #10
Source File: 03_fit_predict_plot_midwest_survey.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])

    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over the encoding methods, scoring each resulting
# pipeline with cross-validation: 
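The loop itself is truncated in this excerpt; a plausible sketch, assuming encoding_methods, X, and y are defined earlier in the script:

from sklearn.model_selection import cross_val_score

all_scores = {}
for method in encoding_methods:  # hypothetical name: e.g. the keys of encoder_dict
    pipeline = make_pipeline(method)
    scores = cross_val_score(pipeline, X, y, cv=5)
    all_scores[method] = scores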
Example #11
Source File: 02_fit_predict_plot_employee_salaries.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])
    return pipeline


#########################################################################
# Fitting each encoding method with a RidgeCV
# --------------------------------------------
# Finally, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score: 
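The truncated loop mirrors the cross-validation sketch after Example #10, here scoring the RidgeCV pipeline with a regression metric such as r2.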
Example #12
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, str) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1]) 
Example #13
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2]) 
Example #14
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2]) 
Example #15
Source File: test_column_transformer.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_column_transformer_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder=SparseMatrixTrans(),
                           sparse_threshold=0.8)

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    exp_array = np.hstack(
        (X_array[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(X_trans.toarray(), exp_array)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2]) 
Example #16
Source File: repeatingbasis.py    From scikit-lego with MIT License
def fit(self, X, y=None):
        self.pipeline_ = ColumnTransformer(
            [
                (
                    "repeatingbasis",
                    _RepeatingBasisFunction(
                        n_periods=self.n_periods, input_range=self.input_range
                    ),
                    [self.column],
                )
            ],
            remainder=self.remainder,
        )

        self.pipeline_.fit(X, y)

        return self 
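A hedged usage sketch of the estimator this fit method belongs to (scikit-lego's RepeatingBasisFunction from sklego.preprocessing, whose constructor takes column, n_periods, input_range, and remainder):

import numpy as np
import pandas as pd
from sklego.preprocessing import RepeatingBasisFunction

df = pd.DataFrame({'day_of_year': np.arange(1, 366)})
rbf = RepeatingBasisFunction(column='day_of_year', n_periods=12,
                             input_range=(1, 365), remainder='drop')
X_out = rbf.fit(df).transform(df)  # shape (365, 12): one repeating basis bump per period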
Example #17
Source File: __init__.py    From sklearn2pmml with GNU Affero General Public License v3.0
def _filter(obj):
	if isinstance(obj, DataFrameMapper):
		obj.features = _filter_steps(obj.features)
		if hasattr(obj, "built_features"):
			if obj.built_features is not None:
				obj.built_features = _filter_steps(obj.built_features)
	elif isinstance(obj, ColumnTransformer):
		obj.transformers = _filter_steps(obj.transformers)
		obj.remainder = _filter(obj.remainder)
		if hasattr(obj, "transformers_"):
			obj.transformers_ = _filter_steps(obj.transformers_)
	elif isinstance(obj, FeatureUnion):
		obj.transformer_list = _filter_steps(obj.transformer_list)
	elif isinstance(obj, Pipeline):
		obj.steps = _filter_steps(obj.steps)
	elif isinstance(obj, SelectorMixin):
		return SelectorProxy(obj)
	elif isinstance(obj, list):
		return [_filter(e) for e in obj]
	return obj 
Example #18
Source File: xgboost.py    From sklearn2pmml with GNU Affero General Public License v3.0
def make_xgboost_column_transformer(dtypes, missing_value_aware = True):
	"""Construct a ColumnTransformer for feeding complex data into an XGBModel.

	Parameters
	----------

	dtypes: iterable of tuples (column, dtype)

	missing_value_aware: boolean
		If true, use missing value aware transformers.

	Returns
	-------
	ColumnTransformer

	"""
	transformers = list()
	for column, dtype in dtypes.items():
		if _is_categorical(dtype):
			transformers.append((str(column), PMMLLabelBinarizer(sparse_output = True) if missing_value_aware else Pipeline([("ordinal_encoder", OrdinalEncoder()), ("one_hot_encoder", OneHotEncoder())]), [column]))
		else:
			transformers.append((str(column), "passthrough", [column]))
	return ColumnTransformer(transformers, remainder = "drop") 
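A hedged usage sketch, assuming a pandas DataFrame whose dtypes serve as the (column, dtype) iterable the docstring describes, and that the module's _is_categorical helper recognizes pandas categorical dtypes:

import pandas as pd

df = pd.DataFrame({'age': [23, 45, 31],
                   'job': pd.Categorical(['clerk', 'nurse', 'clerk'])})
ct = make_xgboost_column_transformer(df.dtypes, missing_value_aware=False)
X_out = ct.fit_transform(df)  # 'job' is ordinal+one-hot encoded, 'age' passes through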
Example #19
Source File: test_sklearn_concat.py    From sklearn-onnx with MIT License
def _column_tranformer_fitted_from_df(data):
    def transformer_for_column(column: pd.Series):
        if column.dtype in ['float64', 'float32', 'int64']:
            return MinMaxScaler()
        if column.dtype in ['bool']:
            return 'passthrough'
        if column.dtype in ['O']:
            try:
                return OneHotEncoder(drop='first')
            except TypeError:
                # older version of scikit-learn
                return OneHotEncoder()
        raise ValueError(
            'Unexpected column dtype for {column.name}:{column.dtype}'.format(
                column=column))

    return ColumnTransformer(
        [(col, transformer_for_column(
            data[col]), [col]) for col in data.columns],
        remainder='drop'
    ).fit(data) 
Example #20
Source File: test_sklearn_tfidf_vectorizer_converter.py    From sklearn-onnx with MIT License
def test_model_tfidf_vectorizer11_compose(self):
        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "And this is the third one.",
            "Is this the first document?",
        ]).reshape((4, 1))
        corpus = numpy.hstack([corpus, corpus])
        y = numpy.array([0, 1, 0, 1])
        model = ColumnTransformer([
            ('a', TfidfVectorizer(), 0),
            ('b', TfidfVectorizer(), 1),
        ])
        model.fit(corpus, y)
        model_onnx = convert_sklearn(model, "TfIdfcomp",
                                     [("input", StringTensorType([4, 2]))],
                                     options=self.get_options())
        sess = InferenceSession(model_onnx.SerializeToString())
        res = sess.run(None, {'input': corpus})[0]
        exp = model.transform(corpus)
        assert_almost_equal(res, exp) 
Example #21
Source File: test_sklearn_pipeline.py    From sklearn-onnx with MIT License
def test_column_transformer_passthrough_no_weights(self):
        model, X = fit_classification_model(
            ColumnTransformer(
                [('pca', PCA(n_components=5), slice(0, 10)),
                 ('svd', TruncatedSVD(n_components=5), slice(70, 80))],
                remainder='passthrough'), 3, n_features=100)
        model_onnx = convert_sklearn(
            model,
            "column transformer passthrough",
            [("input", FloatTensorType([None, X.shape[1]]))],
            dtype=numpy.float32,
        )
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X,
            model,
            model_onnx,
            basename="SklearnColumnTransformerPassthroughNoWeights",
            allow_failure="StrictVersion(onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #22
Source File: test_sklearn_pipeline.py    From sklearn-onnx with MIT License
def test_column_transformer_weights(self):
        model, X = fit_classification_model(
            ColumnTransformer(
                [('pca', PCA(n_components=5), slice(0, 10)),
                 ('svd', TruncatedSVD(n_components=5), slice(10, 100))],
                transformer_weights={'pca': 2, 'svd': 3}), 3, n_features=100)
        model_onnx = convert_sklearn(
            model,
            "column transformer weights",
            [("input", FloatTensorType([None, X.shape[1]]))],
            dtype=numpy.float32,
        )
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X,
            model,
            model_onnx,
            basename="SklearnColumnTransformerWeights-Dec4",
            allow_failure="StrictVersion(onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #23
Source File: test_sklearn_pipeline.py    From sklearn-onnx with MIT License
def test_column_transformer_drop(self):
        model, X = fit_classification_model(
            ColumnTransformer(
                [('pca', PCA(n_components=5), slice(0, 10)),
                 ('svd', TruncatedSVD(n_components=5), slice(80, 100))],
                remainder='drop'), 3, n_features=100)
        model_onnx = convert_sklearn(
            model,
            "column transformer drop",
            [("input", FloatTensorType([None, X.shape[1]]))],
            dtype=numpy.float32,
        )
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X,
            model,
            model_onnx,
            basename="SklearnColumnTransformerDrop",
            allow_failure="StrictVersion(onnxruntime.__version__)"
            "<= StrictVersion('0.2.1')",
        ) 
Example #24
Source File: test_sklearn_array_feature_extractor.py    From sklearn-onnx with MIT License
def test_array_feature_extractor(self):
        data_to_cluster = pd.DataFrame(
            [[1, 2, 3.5, 4.5], [1, 2, 1.7, 4.0],
             [2, 4, 2.4, 4.3], [2, 4, 2.5, 4.0]],
            columns=[1, 2, 3, 4])
        cat_attributes_clustering = [1, 2]
        num_attributes_clustering = [3, 4]  # this is of length 12 in reality
        gmm = GaussianMixture(n_components=2, random_state=1)
        ohe_cat = [OneHotEncoder(categories='auto', sparse=False, drop=None)
                   for i in cat_attributes_clustering]
        ct_cat = ColumnTransformer([
            ("oneHotEncoder" + str(i), ohe_cat[i], [i])
            for i, item in enumerate(cat_attributes_clustering)
        ], remainder='passthrough')
        onehotencoding_pipeline = Pipeline([("columnTransformer", ct_cat), ])
        clustering_pipeline = Pipeline([
            ('onehotencoder_and_scaler', onehotencoding_pipeline),
            ('clustering', gmm)])
        clustering_pipeline.fit(X=data_to_cluster)
        initial_type = [
            ('float_input', FloatTensorType(
                [None, len([*cat_attributes_clustering,
                            *num_attributes_clustering])]))]
        data = data_to_cluster.values.astype(np.float32)

        # checks the first step
        model_onnx = to_onnx(
            clustering_pipeline.steps[0][1], initial_types=initial_type,
            target_opset=TARGET_OPSET, dtype=np.float32)
        dump_data_and_model(
            data, clustering_pipeline.steps[0][1], model_onnx,
            basename="SklearnArrayFeatureExtractorStep0")

        # checks the whole pipeline
        model_onnx = to_onnx(
            clustering_pipeline, initial_types=initial_type,
            target_opset=TARGET_OPSET, dtype=np.float32)
        dump_data_and_model(
            data, clustering_pipeline, model_onnx,
            basename="SklearnArrayFeatureExtractor") 
Example #25
Source File: test_investigate.py    From sklearn-onnx with MIT License
def test_simple_column_transformer(self):
        if ColumnTransformer is None:
            return
        data = numpy.array([[0, 0], [0, 0], [2, 1], [2, 1]],
                           dtype=numpy.float32)
        model = ColumnTransformer([("scaler1", StandardScaler(), [0]),
                                  ("scaler2", RobustScaler(), [1])])
        model.fit(data)
        all_models = list(enumerate_pipeline_models(model))

        steps = collect_intermediate_steps(model, "column transformer",
                                           [("input",
                                             FloatTensorType([None, 2]))])

        assert len(steps) == 2
        assert len(all_models) == 3

        model.transform(data)
        for step in steps:
            onnx_step = step['onnx_step']
            sess = onnxruntime.InferenceSession(onnx_step.SerializeToString())
            onnx_outputs = sess.run(None, {'input': data})
            onnx_output = onnx_outputs[0]
            skl_outputs = step['model']._debug.outputs['transform']
            assert_almost_equal(onnx_output, skl_outputs)
            compare_objects(onnx_output.tolist(), skl_outputs.tolist()) 
Example #26
Source File: test_xgboost_pipeline.py    From onnxmltools with MIT License
def _column_tranformer_fitted_from_df(self, data):
        def transformer_for_column(column):
            if column.dtype in ['float64', 'float32']:
                return MinMaxScaler()
            if column.dtype in ['bool']:
                return 'passthrough'
            if column.dtype in ['O']:
                return OneHotEncoder(sparse=False)
            raise ValueError()

        return ColumnTransformer(
            [(col, transformer_for_column(data[col]), [col]) for col in data.columns],
            remainder='drop'
        ).fit(data) 
Example #27
Source File: tabular_nn_model.py    From autogluon with Apache License 2.0
def _create_preprocessor(self, impute_strategy, max_category_levels):
        """ Defines data encoders used to preprocess different data types and creates instance variable which is sklearn ColumnTransformer object """
        if self.processor is not None:
            warnings.warn("Attempting to process training data for TabularNeuralNetModel, but previously already did this.")
        continuous_features = self.types_of_features['continuous']
        skewed_features = self.types_of_features['skewed']
        onehot_features = self.types_of_features['onehot']
        embed_features = self.types_of_features['embed']
        language_features = self.types_of_features['language']
        transformers = [] # order of various column transformers in this list is important!
        if len(continuous_features) > 0:
            continuous_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('scaler', StandardScaler())])
            transformers.append( ('continuous', continuous_transformer, continuous_features) )
        if len(skewed_features) > 0:
            power_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy=impute_strategy)),
                ('quantile', QuantileTransformer(output_distribution='normal')) ]) # Or output_distribution = 'uniform'
                # TODO: remove old code: ('power', PowerTransformer(method=self.params['proc.power_transform_method'])) ])
            transformers.append( ('skewed', power_transformer, skewed_features) )
        if len(onehot_features) > 0:
            onehot_transformer = Pipeline(steps=[
                # TODO: Consider avoiding converting to string for improved memory efficiency
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('onehot', OneHotMergeRaresHandleUnknownEncoder(max_levels=max_category_levels, sparse=False))]) # test-time unknown values will be encoded as all zeros vector
            transformers.append( ('onehot', onehot_transformer, onehot_features) )
        if len(embed_features) > 0: # Ordinal transformer applied to convert to-be-embedded categorical features to integer levels
            ordinal_transformer = Pipeline(steps=[
                ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
                ('ordinal', OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))]) # returns 0-n when max_category_levels = n-1. category n is reserved for unknown test-time categories.
            transformers.append( ('ordinal', ordinal_transformer, embed_features) )
        if len(language_features) > 0:
            raise NotImplementedError("language_features cannot be used at the moment")
        return ColumnTransformer(transformers=transformers) # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same. 
Example #28
Source File: test_model_performance.py    From DALEX with GNU General Public License v3.0
def setUp(self):
        data = dx.datasets.load_titanic()
        data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)

        self.X = data.drop(columns='survived')
        self.y = data.survived

        numeric_features = ['age', 'fare', 'sibsp', 'parch']
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_features = ['gender', 'class', 'embarked']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                           max_iter=400, random_state=0))])

        clf.fit(self.X, self.y)

        self.exp = dx.Explainer(clf, self.X, self.y, verbose=False)
        self.exp2 = dx.Explainer(clf, self.X, self.y, label="model2", verbose=False) 
Example #29
Source File: test_predict.py    From DALEX with GNU General Public License v3.0
def setUp(self):
        data = dx.datasets.load_titanic()
        data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)

        self.X = data.drop(columns='survived')
        self.y = data.survived

        numeric_features = ['age', 'fare', 'sibsp', 'parch']
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_features = ['gender', 'class', 'embarked']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', MLPRegressor(hidden_layer_sizes=(150, 100, 50),
                                                          max_iter=500, random_state=0))])

        clf.fit(self.X, self.y)

        self.exp = dx.Explainer(clf, self.X, self.y, verbose=False) 
Example #30
Source File: test_aggregated_profiles.py    From DALEX with GNU General Public License v3.0
def setUp(self):
        data = dx.datasets.load_titanic()
        data.loc[:, 'survived'] = LabelEncoder().fit_transform(data.survived)

        self.X = data.drop(columns='survived')
        self.y = data.survived

        numeric_features = ['age', 'fare', 'sibsp', 'parch']
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_features = ['gender', 'class', 'embarked']
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', MLPClassifier(hidden_layer_sizes=(20, 20),
                                                           max_iter=400, random_state=0))])
        clf2 = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', MLPClassifier(hidden_layer_sizes=(50, 100, 50),
                                                            max_iter=400, random_state=0))])

        clf.fit(self.X, self.y)
        clf2.fit(self.X, self.y)

        self.exp = dx.Explainer(clf, self.X, self.y, label="model1", verbose=False)
        self.exp2 = dx.Explainer(clf2, self.X, self.y, verbose=False)
        self.exp3 = dx.Explainer(clf, self.X, self.y, label="model3", verbose=False)