Python sklearn.preprocessing.FunctionTransformer() Examples
The following are 30 code examples of sklearn.preprocessing.FunctionTransformer(), drawn from open-source projects. Each example lists the project and source file it was taken from, so you can refer back to the original code. You may also want to look at the other functions and classes available in the sklearn.preprocessing module.
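Before working through the examples, a minimal sketch of the basic pattern may help: FunctionTransformer wraps an ordinary callable so that it behaves like any other scikit-learn transformer. The log1p/expm1 pair below is chosen only for illustration; this is a generic usage sketch, not code taken from the projects listed here.

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Wrap a plain numpy function as a stateless transformer; inverse_func
# enables inverse_transform, and check_inverse verifies the pair on fit.
log_transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1,
                                      check_inverse=True, validate=True)

X = np.arange(10, dtype=float).reshape(5, 2)
Xt = log_transformer.fit_transform(X)           # elementwise log1p
X_back = log_transformer.inverse_transform(Xt)  # recovers X up to float error
assert np.allclose(X, X_back)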
Example #1
Source File: test_compose.py From sktime with BSD 3-Clause "New" or "Revised" License | 7 votes |
def test_ColumnTransformer_pipeline():
    X_train, y_train = load_basic_motions(split="train", return_X_y=True)
    X_test, y_test = load_basic_motions(split="test", return_X_y=True)

    # using Identity function transformers (transform series to series)
    def id_func(X):
        return X

    column_transformer = ColumnTransformer([
        ('id0', FunctionTransformer(func=id_func, validate=False), ['dim_0']),
        ('id1', FunctionTransformer(func=id_func, validate=False), ['dim_1'])
    ])
    steps = [
        ('extract', column_transformer),
        ('tabularise', Tabularizer()),
        ('classify', RandomForestClassifier(n_estimators=2, random_state=1))]
    model = Pipeline(steps=steps)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    assert y_pred.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(y_pred), np.unique(y_test))
Example #2
Source File: test_RandomIntervalFeatureExtractor.py From sktime with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_different_implementations():
    random_state = 1233
    X_train, y_train = make_classification_problem()

    # Compare with chained transformations.
    tran1 = RandomIntervalSegmenter(n_intervals='sqrt', random_state=random_state)
    tran2 = RowTransformer(FunctionTransformer(func=np.mean, validate=False))
    A = tran2.fit_transform(tran1.fit_transform(X_train))

    tran = RandomIntervalFeatureExtractor(n_intervals='sqrt', features=[np.mean],
                                          random_state=random_state)
    B = tran.fit_transform(X_train)

    np.testing.assert_array_equal(A, B)

    # Compare with transformer pipeline using TSFeatureUnion.
Example #3
Source File: test_RandomIntervalFeatureExtractor.py From sktime with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_different_pipelines():
    random_state = 1233
    X_train, y_train = make_classification_problem()
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals='sqrt', random_state=random_state)),
        ('transform', FeatureUnion([
            ('mean', RowTransformer(FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowTransformer(FunctionTransformer(func=np.std, validate=False))),
            ('slope', RowTransformer(FunctionTransformer(func=time_series_slope, validate=False))),
        ])),
    ]
    pipe = Pipeline(steps)
    a = pipe.fit_transform(X_train)
    tran = RandomIntervalFeatureExtractor(n_intervals='sqrt',
                                          features=[np.mean, np.std, time_series_slope],
                                          random_state=random_state)
    b = tran.fit_transform(X_train)
    np.testing.assert_array_equal(a, b)
    np.testing.assert_array_equal(pipe.steps[0][1].intervals_, tran.intervals_)
Example #4
Source File: sindy.py From sparsereg with MIT License | 6 votes |
def __init__(
    self,
    alpha=1.0,
    threshold=0.1,
    degree=3,
    operators=None,
    dt=1.0,
    n_jobs=1,
    derivative=None,
    feature_names=None,
    kw={},
):
    self.alpha = alpha
    self.threshold = threshold
    self.degree = degree
    self.operators = operators
    self.n_jobs = n_jobs
    self.derivative = derivative or FunctionTransformer(func=_derivative, kw_args={"dt": dt})
    self.feature_names = feature_names
    self.kw = kw
Example #5
Source File: common_tabular_tests.py From interpret-community with MIT License | 6 votes |
def _get_transformations_one_to_many_greater(self, feature_names):
    # results in number of features greater than original features
    # copy all features except last one. For last one, replicate columns to create 3 more features
    transformations = []
    feature_names = list(feature_names)
    index = 0
    for f in feature_names[:-1]:
        transformations.append(("{}".format(index), "passthrough", [f]))
        index += 1

    def copy_func(x):
        return np.tile(x, (1, 3))

    copy_transformer = FunctionTransformer(copy_func)
    transformations.append(("copy_transformer", copy_transformer, [feature_names[-1]]))
    return ColumnTransformer(transformations)
Example #6
Source File: dataloader.py From models with MIT License | 6 votes |
def __init__(self, pos_features, pipeline_obj_path):
    """
    Args:
        pos_features: list of positional features to use
        pipeline_obj_path: path to the serialized pipeline obj_path
    """
    self.pos_features = pos_features
    self.pipeline_obj_path = pipeline_obj_path

    # deserialize the pickle file
    with open(self.pipeline_obj_path, "rb") as f:
        pipeline_obj = pickle.load(f)
    self.POS_FEATURES = pipeline_obj[0]
    self.minmax_scaler = pipeline_obj[1]
    self.imp = pipeline_obj[2]

    self.funct_transform = FunctionTransformer(func=sign_log_func,
                                               inverse_func=sign_log_func_inverse)
    # for simplicity, assume all current pos_features are the
    # same as from before
    assert self.POS_FEATURES == self.pos_features
Example #7
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer, pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example #8
Source File: test_pipeline.py From sktime with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_FeatureUnion_pipeline():
    # pipeline with segmentation plus multiple feature extraction
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=3)),
        ('transform', FeatureUnion([
            ('mean', RowTransformer(FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowTransformer(FunctionTransformer(func=np.std, validate=False)))
        ])),
        ('clf', DecisionTreeClassifier())
    ]
    clf = Pipeline(steps)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    assert y_pred.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(y_pred), np.unique(y_test))
Example #9
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0 | 5 votes |
def test_predict_proba_transform(self):
    predict_proba_transformer = FunctionTransformer(numpy.log)
    pipeline = PMMLPipeline([("estimator", DummyClassifier(strategy = "prior"))],
                            predict_proba_transformer = predict_proba_transformer)
    X = DataFrame([1.0, 1.0, 1.0, 1.0, 1.0, 1.0], columns = ["x"])
    y = Series(["green", "red", "yellow", "green", "red", "green"], name = "y")
    pipeline.fit(X, y)
    self.assertEqual(["green", "red", "yellow"], pipeline._final_estimator.classes_.tolist())
    y_proba = [3 / 6.0, 2 / 6.0, 1 / 6.0]
    y_probat = [numpy.log(x) for x in y_proba]
    self.assertEqual([y_proba for i in range(0, 6)], pipeline.predict_proba(X).tolist())
    self.assertEqual([y_proba + y_probat for i in range(0, 6)], pipeline.predict_proba_transform(X).tolist())
Example #10
Source File: test_compose.py From sktime with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_row_transformer_function_transformer_series_to_primitives():
    X, y = load_gunpoint(return_X_y=True)
    ft = FunctionTransformer(func=np.mean, validate=False)
    t = RowTransformer(ft)
    Xt = t.fit_transform(X, y)
    assert Xt.shape == X.shape
    assert isinstance(Xt.iloc[0, 0], float)  # check series-to-primitive transforms
Example #11
Source File: test_pipeline.py From sktime with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_FeatureUnion():
    X, y = load_gunpoint(return_X_y=True)
    ft = FunctionTransformer(func=np.mean, validate=False)
    t = RowTransformer(ft)
    fu = FeatureUnion([
        ('mean', t),
        ('std', RowTransformer(FunctionTransformer(func=np.std, validate=False)))
    ])
    Xt = fu.fit_transform(X, y)
    assert Xt.shape == (X.shape[0], X.shape[1] * len(fu.transformer_list))
Example #12
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0 | 5 votes |
def test_predict_transform(self):
    predict_transformer = FeatureUnion([
        ("identity", FunctionTransformer(None)),
        ("log10", FunctionTransformer(numpy.log10))
    ])
    pipeline = PMMLPipeline([("estimator", DummyRegressor())],
                            predict_transformer = predict_transformer)
    X = DataFrame([[1, 0], [2, 0], [3, 0]], columns = ["X1", "X2"])
    y = Series([0.5, 1.0, 1.5], name = "y")
    pipeline.fit(X, y)
    y_pred = [1.0, 1.0, 1.0]
    y_predt = [1.0, 1.0, numpy.log10(1.0)]
    self.assertEqual(y_pred, pipeline.predict(X).tolist())
    self.assertEqual([y_predt for i in range(0, 3)], pipeline.predict_transform(X).tolist())
Example #13
Source File: test_compose.py From sktime with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_row_transformer_function_transformer_series_to_series():
    X, y = load_gunpoint(return_X_y=True)

    # series-to-series transform function
    def powerspectrum(x):
        fft = np.fft.fft(x)
        ps = fft.real * fft.real + fft.imag * fft.imag
        return ps[:ps.shape[0] // 2]

    ft = FunctionTransformer(func=powerspectrum, validate=False)
    t = RowTransformer(ft)
    Xt = t.fit_transform(X, y)
    assert Xt.shape == X.shape
    assert isinstance(Xt.iloc[0, 0],
                      (pd.Series, np.ndarray))  # check series-to-series transforms
Example #14
Source File: test_sklearn_model_export.py From mlflow with Apache License 2.0 | 5 votes |
def sklearn_custom_transformer_model(sklearn_knn_model):
    def transform(vec):
        print("Invoking custom transformer!")
        return vec + 1

    transformer = SKFunctionTransformer(transform, validate=True)
    pipeline = SKPipeline([("custom_transformer", transformer), ("knn", sklearn_knn_model.model)])
    return ModelWithData(pipeline, inference_data=datasets.load_iris().data[:, :2])
Example #15
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_FunctionTransformer(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    mod1 = df.pp.FunctionTransformer(func=lambda x: x + 1)
    df.fit(mod1)
    result = df.transform(mod1)

    exp = df.copy()
    exp.data = exp.data + 1

    self.assertIsInstance(result, pdml.ModelFrame)
    tm.assert_frame_equal(result, exp)
Example #16
Source File: test_function_transformer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_np_log():
    X = np.arange(10).reshape((5, 2))

    # Test that the numpy.log example still works.
    assert_array_equal(
        FunctionTransformer(np.log1p).transform(X),
        np.log1p(X),
    )
Example #17
Source File: test_function_transformer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_kw_arg():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=3))
Example #18
Source File: test_function_transformer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_kw_arg_update():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
    F.kw_args['decimals'] = 1

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=1))
Example #19
Source File: test_function_transformer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_kw_arg_reset():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
    F.kw_args = dict(decimals=1)

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=1))
Example #20
Source File: mercari_golf.py From mercari-solution with MIT License | 5 votes |
def main():
    vectorizer = make_union(
        on_field('name', Tfidf(max_features=100000, token_pattern='\w+')),
        on_field('text', Tfidf(max_features=100000, token_pattern='\w+', ngram_range=(1, 2))),
        on_field(['shipping', 'item_condition_id'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()),
        n_jobs=4)
    y_scaler = StandardScaler()
    with timer('process train'):
        train = pd.read_table('../input/train.tsv')
        train = train[train['price'] > 0].reset_index(drop=True)
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = y_scaler.fit_transform(np.log1p(train['price'].values.reshape(-1, 1)))
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    y_pred = np.expm1(y_scaler.inverse_transform(y_pred.reshape(-1, 1))[:, 0])
    print('Valid RMSLE: {:.4f}'.format(np.sqrt(mean_squared_log_error(valid['price'], y_pred))))
Example #21
Source File: test_transformers.py From gordo with GNU Affero General Public License v3.0 | 5 votes |
def test_multiply_by_function_transformer(self):
    from gordo.machine.model.transformer_funcs.general import multiply_by

    # Provide a require argument
    tf = FunctionTransformer(func=multiply_by, kw_args={"factor": 2})
    self._validate_transformer(tf)

    # Ignore the required argument
    tf = FunctionTransformer(func=multiply_by)
    with self.assertRaises(TypeError):
        self._validate_transformer(tf)
Example #22
Source File: test_target.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_transform_target_regressor_1d_transformer(X, y):
    # All transformer in scikit-learn expect 2D data. FunctionTransformer with
    # validate=False lift this constraint without checking that the input is a
    # 2D vector. We check the consistency of the data shape using a 1D and 2D y
    # array.
    transformer = FunctionTransformer(func=lambda x: x + 1,
                                      inverse_func=lambda x: x - 1,
                                      validate=False)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_shifted_by_one(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
Example #23
Source File: test_function_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_np_log():
    X = np.arange(10).reshape((5, 2))

    # Test that the numpy.log example still works.
    assert_array_equal(
        FunctionTransformer(np.log1p).transform(X),
        np.log1p(X),
    )
Example #24
Source File: test_function_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_kw_arg():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=3))
Example #25
Source File: test_function_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_kw_arg_update():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
    F.kw_args['decimals'] = 1

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=1))
Example #26
Source File: test_function_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_inverse_transform():
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly
    F = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around, inv_kw_args=dict(decimals=3),
    )
    assert_array_equal(
        F.inverse_transform(F.transform(X)),
        np.around(np.sqrt(X), decimals=3),
    )
Example #27
Source File: test_function_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_check_inverse():
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))

    X_list = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    for X in X_list:
        if sparse.issparse(X):
            accept_sparse = True
        else:
            accept_sparse = False
        trans = FunctionTransformer(func=np.sqrt,
                                    inverse_func=np.around,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        assert_warns_message(UserWarning,
                             "The provided functions are not strictly"
                             " inverse of each other. If you are sure you"
                             " want to proceed regardless, set"
                             " 'check_inverse=False'.",
                             trans.fit, X)

        trans = FunctionTransformer(func=np.expm1,
                                    inverse_func=np.log1p,
                                    accept_sparse=accept_sparse,
                                    check_inverse=True,
                                    validate=True)
        Xt = assert_no_warnings(trans.fit_transform, X)
        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    trans = FunctionTransformer(func=np.expm1, inverse_func=None,
                                check_inverse=True, validate=True)
    assert_no_warnings(trans.fit, X_dense)
    trans = FunctionTransformer(func=None, inverse_func=np.expm1,
                                check_inverse=True, validate=True)
    assert_no_warnings(trans.fit, X_dense)
Example #28
Source File: test_function_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_function_transformer_future_warning(validate, expected_warning):
    # FIXME: to be removed in 0.22
    X = np.random.randn(100, 10)
    transformer = FunctionTransformer(validate=validate)
    with pytest.warns(expected_warning) as results:
        transformer.fit_transform(X)
    if expected_warning is None:
        assert len(results) == 0
Example #29
Source File: test_function_transformer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_function_transformer_frame():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer(validate=False)
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, 'loc')
Example #30
Source File: feature_extraction.py From mne-features with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_params(self, deep=True):
    """Get the parameters (if any) of the given feature function.

    Parameters
    ----------
    deep : bool (default: True)
        If True, the method will get the parameters of the transformer.
        (See :class:`~sklearn.preprocessing.FunctionTransformer`).
    """
    func_to_inspect = _get_python_func(self.func)
    # Get code object from the function
    if hasattr(func_to_inspect, 'func_code'):
        func_code = func_to_inspect.func_code
    else:
        func_code = func_to_inspect.__code__
    args, _, _ = getargs(func_code)
    # Get defaults from the function
    if hasattr(func_to_inspect, 'defaults'):
        defaults = func_to_inspect.func_defaults
    else:
        defaults = func_to_inspect.__defaults__
    if defaults is None:
        return dict()
    else:
        n_defaults = len(defaults)
        func_params = {key: value for key, value in
                       zip(args[-n_defaults:], defaults)}
    if self.params is not None:
        func_params.update(self.params)
    return func_params