Python sklearn.pipeline.make_pipeline() Examples
The following are 30 code examples of sklearn.pipeline.make_pipeline(), drawn from open-source projects.
You may also want to check out all available functions and classes of the sklearn.pipeline module.
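Before the examples, a minimal, self-contained sketch of what make_pipeline() does may help: it builds a Pipeline from the estimators you pass it, generating each step's name automatically from the lowercased class name. This snippet is illustrative only and is not taken from any of the projects below.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# make_pipeline names each step after its lowercased class name, so this
# pipeline has the steps "standardscaler" and "logisticregression".
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

X, y = load_iris(return_X_y=True)
pipe.fit(X, y)
print([name for name, _ in pipe.steps])  # ['standardscaler', 'logisticregression']
print(pipe.score(X, y))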
Example #1
Source File: 2_transform_solution.py From pandas-feature-union with MIT License
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    data.loc[:, "class"] = raw_data["target"]

    pipeline = FeatureUnion([
        ("1", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])

    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example #2
Source File: 1_problem.py From pandas-feature-union with MIT License
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])

    pipeline = FeatureUnion([
        ("1", make_pipeline(
            FunctionTransformer(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            FunctionTransformer(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])

    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example #3
Source File: test_pipeline.py From Mastering-Elasticsearch-7.0 with MIT License
def test_make_pipeline():
    t1 = Transf()
    t2 = Transf()

    pipe = make_pipeline(t1, t2)
    assert isinstance(pipe, Pipeline)
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")

    pipe = make_pipeline(t1, t2, FitParamT())
    assert isinstance(pipe, Pipeline)
    assert_equal(pipe.steps[0][0], "transf-1")
    assert_equal(pipe.steps[1][0], "transf-2")
    assert_equal(pipe.steps[2][0], "fitparamt")

    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "random_parameter"',
        make_pipeline, t1, t2, random_parameter='rnd'
    )
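This test also documents make_pipeline()'s naming scheme: each step is named after the lowercased estimator class, a class that appears more than once gets a numeric suffix ("transf-1", "transf-2"), and a class that appears once keeps the bare name ("fitparamt").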
Example #4
Source File: benchmark_test.py From nni with MIT License
def test_time(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())

    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())

    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)),
                                 LogisticRegression())

    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    starttime = datetime.datetime.now()
    test_benchmark.run_test(pipeline, name, path)
    endtime = datetime.datetime.now()
    print("Used time: ", (endtime - starttime).microseconds / 1000)
    print("")
Example #5
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_gradient_boosting_with_init_pipeline():
    # Check that the init estimator can be a pipeline (see issue #13466)
    X, y = make_regression(random_state=0)
    init = make_pipeline(LinearRegression())
    gb = GradientBoostingRegressor(init=init)
    gb.fit(X, y)  # pipeline without sample_weight works fine

    with pytest.raises(
            ValueError,
            match='The initial estimator Pipeline does not support sample '
                  'weights'):
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))

    # Passing sample_weight to a pipeline raises a ValueError. This test makes
    # sure we make the distinction between ValueError raised by a pipeline that
    # was passed sample_weight, and a ValueError raised by a regular estimator
    # whose input checking failed.
    with pytest.raises(
            ValueError,
            match='nu <= 0 or nu > 1'):
        # Note that NuSVR properly supports sample_weight
        init = NuSVR(gamma='auto', nu=1.5)
        gb = GradientBoostingRegressor(init=init)
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))
Example #6
Source File: test_pprint.py From Mastering-Elasticsearch-7.0 with MIT License
def test_pipeline():
    # Render a pipeline object
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
    expected = """
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=999, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)"""

    expected = expected[1:]  # remove first \n
    assert pipeline.__repr__() == expected
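Note that the expected repr is tied to the estimator defaults of the scikit-learn version this test was written for (solver='warn' and multi_class='warn' were transitional LogisticRegression defaults around scikit-learn 0.20/0.21); on other versions the rendered parameter list will differ.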
Example #7
Source File: test_core_pipeline.py From lale with Apache License 2.0
def test_import_from_sklearn_pipeline_feature_union(self):
    from sklearn.pipeline import FeatureUnion
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline
    union = FeatureUnion([("pca", PCA(n_components=1)),
                          ("nys", Nystroem(n_components=2, random_state=42))])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 3)
    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    self.assertEqual(lale_pipeline.edges()[0][0]._impl_class(), PCAImpl)
    self.assertEqual(lale_pipeline.edges()[0][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[1][0]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[1][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[2][0]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[2][1]._impl_class(), KNeighborsClassifierImpl)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Example #8
Source File: 3_feature_union_solution.py From pandas-feature-union with MIT License
def main():
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    data.loc[:, "class"] = raw_data["target"]

    pipeline = PandasFeatureUnion([
        ("1", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal length (cm)"]]),
            # other transformations
        )),
        ("2", make_pipeline(
            PandasTransform(lambda X: X.loc[:, ["sepal width (cm)"]]),
            # other transformations
        ))
    ])

    X = pipeline.fit_transform(data)
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
Example #9
Source File: test_core_pipeline.py From lale with Apache License 2.0
def test_compare_with_sklearn(self):
    from lale.operators import make_pipeline
    tfm = PCA()
    clf = LogisticRegression(LogisticRegression.solver.lbfgs,
                             LogisticRegression.multi_class.auto)
    trainable = make_pipeline(tfm, clf)
    digits = sklearn.datasets.load_digits()
    trained = trainable.fit(digits.data, digits.target)
    predicted = trained.predict(digits.data)

    from sklearn.pipeline import make_pipeline as scikit_make_pipeline
    from sklearn.decomposition import PCA as SklearnPCA
    from sklearn.linear_model import LogisticRegression as SklearnLR
    sklearn_pipeline = scikit_make_pipeline(
        SklearnPCA(), SklearnLR(solver="lbfgs", multi_class="auto"))
    sklearn_pipeline.fit(digits.data, digits.target)
    predicted_sklearn = sklearn_pipeline.predict(digits.data)

    from sklearn.metrics import accuracy_score
    lale_score = accuracy_score(digits.target, predicted)
    scikit_score = accuracy_score(digits.target, predicted_sklearn)
    self.assertEqual(lale_score, scikit_score)
Example #10
Source File: test_coordinate_descent.py From Mastering-Elasticsearch-7.0 with MIT License
def test_lasso_cv_with_some_model_selection():
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import StratifiedKFold
    from sklearn import datasets
    from sklearn.linear_model import LassoCV

    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target

    pipe = make_pipeline(
        StandardScaler(),
        LassoCV(cv=StratifiedKFold(n_splits=5))
    )
    pipe.fit(X, y)
Example #11
Source File: language_detector.py From text-mining-class with MIT License
def build_language_classifier(texts, labels, verbose=False, random_state=None):
    """Train a text classifier with scikit-learn.

    The text classifier is composed of two elements assembled in a pipeline:

    - A text feature extractor (`TfidfVectorizer`) that extracts the relative
      frequencies of unigrams, bigrams and trigrams of characters in the text.

    - An instance of `SGDClassifier` for the classification itself. To speed
      up training it is recommended to enable early stopping.

    `random_state` is passed to the underlying `SGDClassifier` instance.
    """
    language_classifier = make_pipeline(
        TfidfVectorizer(analyzer="char", ngram_range=(1, 3),
                        min_df=2, max_df=0.9, norm="l2",
                        dtype=np.float32),
        SGDClassifier(early_stopping=True, validation_fraction=0.2,
                      n_iter_no_change=3, max_iter=1000, tol=1e-3,
                      alpha=1e-5, penalty="l2", verbose=verbose,
                      random_state=random_state)
    )
    return language_classifier.fit(texts, labels)
Example #12
Source File: tests.py From scikit-mdr with MIT License
def test_mdr_sklearn_pipeline_parallel():
    """Ensure that MDR can be used as a transformer in a parallelized scikit-learn pipeline"""
    features = np.array([[2, 0], [0, 0], [0, 1], [0, 0], [0, 0], [0, 0],
                         [0, 1], [0, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                         [0, 0], [1, 1], [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes,
                                cv=StratifiedKFold(n_splits=5, shuffle=True),
                                n_jobs=-1)
    assert np.mean(cv_scores) > 0.
Example #13
Source File: tests.py From scikit-mdr with MIT License
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2, 0], [0, 0], [0, 1], [0, 0], [0, 0], [0, 0],
                         [0, 1], [0, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                         [0, 0], [1, 1], [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes,
                                cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.
Example #14
Source File: sklearn_test.py From nni with MIT License
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10),
                             LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)),
    #                          LogisticRegression())
    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train))
Example #15
Source File: test_core_operators.py From lale with Apache License 2.0
def test_comparison_with_scikit(self):
    import warnings
    warnings.filterwarnings("ignore")
    from lale.lib.sklearn import PCA
    import sklearn.datasets
    from lale.helpers import cross_val_score
    pca = PCA(n_components=3, random_state=42, svd_solver='arpack')
    nys = Nystroem(n_components=10, random_state=42)
    concat = ConcatFeatures()
    lr = LogisticRegression(random_state=42, C=0.1)
    trainable = (pca & nys) >> concat >> lr
    digits = sklearn.datasets.load_digits()
    X, y = sklearn.utils.shuffle(digits.data, digits.target, random_state=42)
    cv_results = cross_val_score(trainable, X, y)
    cv_results = ['{0:.1%}'.format(score) for score in cv_results]

    from sklearn.pipeline import make_pipeline, FeatureUnion
    from sklearn.decomposition import PCA as SklearnPCA
    from sklearn.kernel_approximation import Nystroem as SklearnNystroem
    from sklearn.linear_model import LogisticRegression as SklearnLR
    from sklearn.model_selection import cross_val_score
    union = FeatureUnion([("pca", SklearnPCA(n_components=3, random_state=42,
                                             svd_solver='arpack')),
                          ("nys", SklearnNystroem(n_components=10, random_state=42))])
    lr = SklearnLR(random_state=42, C=0.1)
    pipeline = make_pipeline(union, lr)
    scikit_cv_results = cross_val_score(pipeline, X, y, cv=5)
    scikit_cv_results = ['{0:.1%}'.format(score) for score in scikit_cv_results]
    self.assertEqual(cv_results, scikit_cv_results)
    warnings.resetwarnings()
Example #16
Source File: ols.py From bartpy with MIT License
def run(n: int = 10000, k_true: int = 3, k_null: int = 2):
    b_true = np.random.uniform(2, 0.1, size=k_true)
    b_true = np.array(list(b_true) + [0.0] * k_null)
    print(b_true)

    x = np.random.normal(0, 1, size=n * (k_true + k_null)).reshape(n, (k_true + k_null))
    X = pd.DataFrame(x)
    y = np.random.normal(0, 0.1, size=n) + np.array(X.multiply(b_true, axis=1).sum(axis=1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42, shuffle=True)

    model = SklearnModel(n_samples=50,
                         n_burn=50,
                         n_trees=20,
                         store_in_sample_predictions=False,
                         n_jobs=3,
                         n_chains=1)
    pipeline = make_pipeline(SelectNullDistributionThreshold(model, n_permutations=20), model)
    pipeline_model = pipeline.fit(X_train, y_train)
    print("Thresholds", pipeline_model.named_steps["selectnulldistributionthreshold"].thresholds)
    print("Feature Proportions", pipeline_model.named_steps["selectnulldistributionthreshold"].feature_proportions)
    print("Is Kept", pipeline_model.named_steps["selectnulldistributionthreshold"]._get_support_mask())
    pipeline_model.named_steps["selectnulldistributionthreshold"].plot()
Example #17
Source File: test_core_operators.py From lale with Apache License 2.0
def test_concat_with_hyperopt2(self):
    from lale.operators import make_pipeline, make_union
    from lale.lib.lale import Hyperopt
    pca = PCA(n_components=3)
    nys = Nystroem(n_components=10)
    concat = ConcatFeatures()
    lr = LogisticRegression(random_state=42, C=0.1)

    trainable = make_pipeline(make_union(pca, nys), lr)
    clf = Hyperopt(estimator=trainable, max_evals=2)
    from sklearn.datasets import load_iris
    iris_data = load_iris()
    clf.fit(iris_data.data, iris_data.target)
    clf.predict(iris_data.data)
Example #18
Source File: sklearn_patches.py From tslearn with BSD 2-Clause "Simplified" License
def check_pipeline_consistency(name, estimator_orig):
    if estimator_orig._get_tags()['non_deterministic']:
        msg = name + ' is non deterministic'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    funcs = ["score", "fit_transform"]

    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_allclose_dense_sparse(result, result_pipe)
Example #19
Source File: tests.py From scikit-rebate with MIT License
def test_multisurf_pipeline_missing_values():
    """Check: Data (Missing Values): MultiSURF works in a sklearn pipeline"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURF(n_features_to_select=2),
                        Imputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_missing_values, labels_missing_values, cv=3, n_jobs=-1)) > 0.7
Example #20
Source File: tests.py From scikit-rebate with MIT License
def test_multisurfstar_pipeline_missing_values():
    """Check: Data (Missing Values): MultiSURF* works in a sklearn pipeline"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
                        Imputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_missing_values, labels_missing_values, cv=3, n_jobs=-1)) > 0.7
Example #21
Source File: tests.py From scikit-rebate with MIT License
def test_surfstar_pipeline_missing_values():
    """Check: Data (Missing Values): SURF* works in a sklearn pipeline"""
    np.random.seed(9238745)

    clf = make_pipeline(SURFstar(n_features_to_select=2),
                        Imputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_missing_values, labels_missing_values, cv=3, n_jobs=-1)) > 0.7
Example #22
Source File: tests.py From scikit-rebate with MIT License
def test_surf_pipeline_missing_values():
    """Check: Data (Missing Values): SURF works in a sklearn pipeline"""
    np.random.seed(240932)

    clf = make_pipeline(SURF(n_features_to_select=2),
                        Imputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_missing_values, labels_missing_values, cv=3, n_jobs=-1)) > 0.7
Example #23
Source File: tests.py From scikit-rebate with MIT License
def test_relieff_pipeline_missing_values():
    """Check: Data (Missing Values): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        Imputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_missing_values, labels_missing_values, cv=3, n_jobs=-1)) > 0.7
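A caveat for the five missing-values examples above: Imputer here is the old sklearn.preprocessing.Imputer, which was deprecated in scikit-learn 0.20 and removed in 0.22, so these tests assume an older scikit-learn release. On current versions the equivalent step is sklearn.impute.SimpleImputer.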
Example #24
Source File: tests.py From scikit-rebate with MIT License
def test_multisurfstar_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): MultiSURF* works in a sklearn pipeline"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
Example #25
Source File: tests.py From scikit-rebate with MIT License
def test_surfstar_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): SURF* works in a sklearn pipeline"""
    np.random.seed(9238745)

    clf = make_pipeline(SURFstar(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
Example #26
Source File: tests.py From scikit-rebate with MIT License
def test_surf_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): SURF works in a sklearn pipeline"""
    np.random.seed(240932)

    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
Example #27
Source File: tests.py From scikit-rebate with MIT License
def test_relieff_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_mixed_attributes, labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
Example #28
Source File: tests.py From scikit-rebate with MIT License
def test_multisurf_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): MultiSURF works in a sklearn pipeline"""
    np.random.seed(320931)

    clf = make_pipeline(MultiSURF(n_features_to_select=2),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))

    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint, labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5

# Test Mixed Attribute Data ------------------------------------------------------------------------------------
Example #29
Source File: test_core_pipeline.py From lale with Apache License 2.0
def test_import_from_sklearn_pipeline1(self):
    from sklearn.decomposition import PCA
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline
    sklearn_pipeline = make_pipeline(PCA(n_components=3), KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    for i, pipeline_step in enumerate(sklearn_pipeline.named_steps):
        sklearn_step_params = sklearn_pipeline.named_steps[pipeline_step].get_params()
        lale_sklearn_params = lale_pipeline.steps()[i]._impl._wrapped_model.get_params()
        self.assertEqual(sklearn_step_params, lale_sklearn_params)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Example #30
Source File: tests.py From scikit-rebate with MIT License
def test_surfstar_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): SURF* works in a sklearn pipeline"""
    np.random.seed(9238745)

    clf = make_pipeline(SURFstar(n_features_to_select=2),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))

    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint, labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5