Python Examples of sklearn.datasets.make_hastie_10_2

Source File: test_bagging.py From twitter-stock-recommendation with MIT License

6 votes

def test_warm_start_equal_n_estimators():
    # A warm-started refit with an unchanged n_estimators is expected to emit
    # a UserWarning and to leave the ensemble's predictions as they were.
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43)

    bagger = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=83)
    bagger.fit(X_train, y_train)
    baseline = bagger.predict(X_test)

    # Shift the training data in place; the refit below should not pick the
    # new values up, so the predictions are expected to stay the same.
    X_train += 1.

    expected_warning = (
        "Warm-start fitting without increasing n_estimators does not")
    assert_warns_message(UserWarning, expected_warning,
                         bagger.fit, X_train, y_train)
    assert_array_equal(baseline, bagger.predict(X_test))

Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_warm_start_fortran(Cls):
    # Warm-started fitting should give the same predictions whether X is
    # stored in C order or in Fortran order.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    init_params = dict(n_estimators=1, random_state=1, warm_start=True)
    est_c = Cls(**init_params)
    est_fortran = Cls(**init_params)

    # C-ordered input: fit one estimator, then grow the ensemble to 11
    est_c.fit(X, y)
    est_c.set_params(n_estimators=11)
    est_c.fit(X, y)

    # Fortran-ordered copy of the same data, same two-step schedule
    X_fortran = np.asfortranarray(X)
    est_fortran.fit(X_fortran, y)
    est_fortran.set_params(n_estimators=11)
    est_fortran.fit(X_fortran, y)

    predictions_c = est_c.predict(X)
    predictions_fortran = est_fortran.predict(X)
    assert_array_almost_equal(predictions_c, predictions_fortran)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

6 votes

def check_classification_synthetic(presort, loss):
    # Exercise GradientBoostingClassifier on the synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    n_train = 2000
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    # Settings shared by both models below
    shared = dict(min_samples_split=2, max_depth=1, loss=loss,
                  learning_rate=1.0, random_state=0)

    # Model fitted without subsampling (presort is not passed here)
    plain = GradientBoostingClassifier(n_estimators=100, **shared)
    plain.fit(X_train, y_train)
    plain_error = 1.0 - plain.score(X_test, y_test)
    assert_less(plain_error, 0.09)

    # Model with subsample=0.5, twice the estimators and the given presort
    subsampled = GradientBoostingClassifier(n_estimators=200, subsample=0.5,
                                            presort=presort, **shared)
    subsampled.fit(X_train, y_train)
    subsampled_error = 1.0 - subsampled.score(X_test, y_test)
    assert_less(subsampled_error, 0.08)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

6 votes

def test_warm_start_oob():
    # The first 100 OOB improvements of a warm-started model (100 -> 200
    # estimators) should match those of a model fitted with 200 directly.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    common = dict(max_depth=1, subsample=0.5, random_state=1)

    for Cls in (GradientBoostingRegressor, GradientBoostingClassifier):
        one_shot = Cls(n_estimators=200, **common)
        one_shot.fit(X, y)

        staged = Cls(n_estimators=100, warm_start=True, **common)
        staged.fit(X, y)
        staged.set_params(n_estimators=200)
        staged.fit(X, y)

        staged_oob = staged.oob_improvement_[:100]
        one_shot_oob = one_shot.oob_improvement_[:100]
        assert_array_almost_equal(staged_oob, one_shot_oob)

Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_warm_start(Cls):
    # Growing a warm-started model from 100 to 200 estimators should give
    # the same predictions as fitting 200 estimators in a single call.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    one_shot = Cls(n_estimators=200, max_depth=1)
    one_shot.fit(X, y)

    staged = Cls(n_estimators=100, max_depth=1, warm_start=True)
    staged.fit(X, y)
    staged.set_params(n_estimators=200)
    staged.fit(X, y)

    if Cls is GradientBoostingRegressor:
        assert_array_almost_equal(staged.predict(X), one_shot.predict(X))
        return

    # Random state is preserved, so for any other class the predicted labels
    # must match exactly and predict_proba must agree as well.
    assert_array_equal(staged.predict(X), one_shot.predict(X))
    assert_array_almost_equal(staged.predict_proba(X),
                              one_shot.predict_proba(X))

Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_check_inputs_predict_stages():
    # predict_stages is expected to raise a ValueError when X comes in an
    # unsupported form (CSC sparse matrix or Fortran-ordered array).
    x, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(x, y)

    # Column vector of zeros with one row per sample
    score = np.zeros(y.shape).reshape(-1, 1)

    # Each unsupported input paired with the error message it should trigger
    unsupported_inputs = [
        (csc_matrix(x),
         "When X is a sparse matrix, a CSR format is expected"),
        (np.asfortranarray(x),
         "X should be C-ordered np.ndarray"),
    ]
    for bad_x, expected_message in unsupported_inputs:
        assert_raise_message(ValueError, expected_message,
                             predict_stages, clf.estimators_, bad_x,
                             clf.learning_rate, score)

Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def check_classification_synthetic(presort, loss):
    # Exercise GradientBoostingClassifier on the synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    n_train = 2000
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    # Settings shared by both models below
    shared = dict(min_samples_split=2, max_depth=1, loss=loss,
                  learning_rate=1.0, random_state=0)

    # Model fitted without subsampling (presort is not passed here)
    plain = GradientBoostingClassifier(n_estimators=100, **shared)
    plain.fit(X_train, y_train)
    plain_error = 1.0 - plain.score(X_test, y_test)
    assert_less(plain_error, 0.09)

    # Model with subsample=0.5, twice the estimators and the given presort
    subsampled = GradientBoostingClassifier(n_estimators=200, subsample=0.5,
                                            presort=presort, **shared)
    subsampled.fit(X_train, y_train)
    subsampled_error = 1.0 - subsampled.score(X_test, y_test)
    assert_less(subsampled_error, 0.08)

Source File: test_bagging.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_warm_start(random_state=42):
    # Incremental warm-start fitting should yield an ensemble of the requested
    # size whose members carry the same random states as a regular fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    # Stage 1: warm-start ensemble with 5 estimators
    clf_ws = BaggingClassifier(n_estimators=5,
                               random_state=random_state,
                               warm_start=True)
    clf_ws.fit(X, y)
    assert_equal(len(clf_ws), 5)

    # Stage 2: grow the same ensemble to 10 estimators
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X, y)
    assert_equal(len(clf_ws), 10)

    # Reference: 10 estimators fitted in one call without warm start
    clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    warm_states = {member.random_state for member in clf_ws}
    cold_states = {member.random_state for member in clf_no_ws}
    assert_equal(warm_states, cold_states)

Source File: test_bagging.py From twitter-stock-recommendation with MIT License

6 votes

def test_warm_start_equivalence():
    # A warm-started classifier built as 5 + 5 estimators should predict the
    # same as a single classifier fitted with 10 estimators.
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43)
    seed = 3141

    staged = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=seed)
    staged.fit(X_train, y_train)
    staged.set_params(n_estimators=10)
    staged.fit(X_train, y_train)
    staged_pred = staged.predict(X_test)

    one_shot = BaggingClassifier(n_estimators=10, warm_start=False,
                                 random_state=seed)
    one_shot.fit(X_train, y_train)
    one_shot_pred = one_shot.predict(X_test)

    assert_array_almost_equal(staged_pred, one_shot_pred)

Source File: test_bagging.py From twitter-stock-recommendation with MIT License

6 votes

def test_warm_start(random_state=42):
    # Incremental warm-start fitting should yield an ensemble of the requested
    # size whose members carry the same random states as a regular fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    # Stage 1: warm-start ensemble with 5 estimators
    clf_ws = BaggingClassifier(n_estimators=5,
                               random_state=random_state,
                               warm_start=True)
    clf_ws.fit(X, y)
    assert_equal(len(clf_ws), 5)

    # Stage 2: grow the same ensemble to 10 estimators
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X, y)
    assert_equal(len(clf_ws), 10)

    # Reference: 10 estimators fitted in one call without warm start
    clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    warm_states = {member.random_state for member in clf_ws}
    cold_states = {member.random_state for member in clf_no_ws}
    assert_equal(warm_states, cold_states)

Source File: test_bagging.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_warm_start_equivalence():
    # A warm-started classifier built as 5 + 5 estimators should predict the
    # same as a single classifier fitted with 10 estimators.
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43)
    seed = 3141

    staged = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=seed)
    staged.fit(X_train, y_train)
    staged.set_params(n_estimators=10)
    staged.fit(X_train, y_train)
    staged_pred = staged.predict(X_test)

    one_shot = BaggingClassifier(n_estimators=10, warm_start=False,
                                 random_state=seed)
    one_shot.fit(X_train, y_train)
    one_shot_pred = one_shot.predict(X_test)

    assert_array_almost_equal(staged_pred, one_shot_pred)

Source File: test_bagging.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_warm_start_equal_n_estimators():
    # A warm-started refit with an unchanged n_estimators is expected to emit
    # a UserWarning and to leave the ensemble's predictions as they were.
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43)

    bagger = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=83)
    bagger.fit(X_train, y_train)
    baseline = bagger.predict(X_test)

    # Shift the training data in place; the refit below should not pick the
    # new values up, so the predictions are expected to stay the same.
    X_train += 1.

    expected_warning = (
        "Warm-start fitting without increasing n_estimators does not")
    assert_warns_message(UserWarning, expected_warning,
                         bagger.fit, X_train, y_train)
    assert_array_equal(baseline, bagger.predict(X_test))

Source File: test_bagging.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start_with_oob_score_fails():
    # Requesting oob_score together with warm_start should make fit raise
    # a ValueError.
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    conflicting_options = dict(n_estimators=5, warm_start=True,
                               oob_score=True)
    bagger = BaggingClassifier(**conflicting_options)
    assert_raises(ValueError, bagger.fit, data, target)

Source File: test_bagging.py From twitter-stock-recommendation with MIT License

5 votes

def test_oob_score_consistency():
    # With random_state, base estimator and training data all fixed, fitting
    # the same ensemble twice should produce the same OOB score.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    first_score = bagging.fit(X, y).oob_score_
    second_score = bagging.fit(X, y).oob_score_
    assert_equal(first_score, second_score)

Source File: test_bagging.py From twitter-stock-recommendation with MIT License

5 votes

def test_estimators_samples():
    # Verify the layout of estimators_samples_ and that the sampling done at
    # fit time can be replayed afterwards from the stored attributes alone.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Pull the fitted bookkeeping off the ensemble
    sample_masks = bagging.estimators_samples_
    feature_sets = bagging.estimators_features_
    members = bagging.estimators_

    # One entry per estimator; each entry is a boolean array as long as X
    assert_equal(len(sample_masks), len(members))
    first_mask = sample_masks[0]
    assert_equal(len(first_mask), len(X))
    assert_equal(first_mask.dtype.kind, 'b')

    # Refit the first estimator on its recorded rows and columns
    first_member = members[0]
    selected_rows = X[first_mask]
    X_sub = selected_rows[:, feature_sets[0]]
    y_sub = y[first_mask]

    coefs_before = first_member.coef_
    first_member.fit(X_sub, y_sub)
    coefs_after = first_member.coef_

    assert_array_almost_equal(coefs_before, coefs_after)

Source File: test_bagging.py From twitter-stock-recommendation with MIT License

5 votes

def test_max_samples_consistency():
    # When the user passes a valid integer max_samples, the validated value
    # stored on the estimator should be that same integer.
    max_samples = 100
    n_samples = 2 * max_samples
    X, y = make_hastie_10_2(n_samples=n_samples, random_state=1)

    base_estimator = KNeighborsClassifier()
    bagging = BaggingClassifier(base_estimator, max_samples=max_samples,
                                max_features=0.5, random_state=1)
    bagging.fit(X, y)

    assert_equal(bagging._max_samples, max_samples)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_max_feature_regression():
    # Check that the random state is set properly when max_features is used:
    # the held-out deviance has to stay below 0.5.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    n_train = 2000
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    model = GradientBoostingClassifier(n_estimators=100, min_samples_split=5,
                                       max_depth=2, learning_rate=.1,
                                       max_features=2, random_state=1)
    model.fit(X_train, y_train)

    raw_scores = model.decision_function(X_test)
    deviance = model.loss_(y_test, raw_scores)
    failure_message = "GB failed with deviance %.4f" % deviance
    assert_true(deviance < 0.5, failure_message)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_staged_predict_proba():
    # Staged predictions should have the expected shapes, and their final
    # stage should agree with predict / predict_proba.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    n_train = 200
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # Consuming the staged generator before fitting must raise
    def consume_staged_proba(data):
        return np.fromiter(clf.staged_predict_proba(data), dtype=np.float64)

    assert_raises(NotFittedError, consume_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # Every stage has the shape of y_test; after the loop ``stage_pred``
    # holds the last stage, which is compared with ``predict``
    for stage_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, stage_pred.shape)

    assert_array_equal(clf.predict(X_test), stage_pred)

    # Same idea for probabilities: two columns per stage, and the last
    # stage is compared with ``predict_proba``
    for stage_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], stage_proba.shape[0])
        assert_equal(2, stage_proba.shape[1])

    assert_array_equal(clf.predict_proba(X_test), stage_proba)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start():
    # For both estimator classes, growing a warm-started model from 100 to
    # 200 estimators should predict the same as fitting 200 at once.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in (GradientBoostingRegressor, GradientBoostingClassifier):
        one_shot = Cls(n_estimators=200, max_depth=1)
        one_shot.fit(X, y)

        staged = Cls(n_estimators=100, max_depth=1, warm_start=True)
        staged.fit(X, y)
        staged.set_params(n_estimators=200)
        staged.fit(X, y)

        staged_pred = staged.predict(X)
        one_shot_pred = one_shot.predict(X)
        assert_array_almost_equal(staged_pred, one_shot_pred)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start_n_estimators():
    # Test if warm start equals fit - set n_estimators.
    # For each estimator class, a warm-started model grown from 100 to 300
    # estimators via set_params is compared with a model fitted with 300
    # estimators in a single call.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=300, max_depth=1)
        est.fit(X, y)

        est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True)
        est_ws.fit(X, y)
        est_ws.set_params(n_estimators=300)
        est_ws.fit(X, y)

        # Compare inside the loop so that every class is checked; with the
        # assertion after the loop only the last class was ever verified.
        assert_array_almost_equal(est_ws.predict(X), est.predict(X))

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start_smaller_n_estimators():
    # Lowering n_estimators on an already fitted warm-start model and
    # fitting again should raise a ValueError.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in (GradientBoostingRegressor, GradientBoostingClassifier):
        model = Cls(n_estimators=100, max_depth=1, warm_start=True)
        model.fit(X, y)
        # One estimator fewer than what has been fitted so far
        model.set_params(n_estimators=99)
        assert_raises(ValueError, model.fit, X, y)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start_max_depth():
    # A warm-started ensemble may contain trees of different depths when
    # max_depth is changed between fits.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in (GradientBoostingRegressor, GradientBoostingClassifier):
        model = Cls(n_estimators=100, max_depth=1, warm_start=True)
        model.fit(X, y)
        model.set_params(n_estimators=110, max_depth=2)
        model.fit(X, y)

        # The first tree keeps the original depth setting ...
        first_tree = model.estimators_[0, 0]
        assert_equal(first_tree.max_depth, 1)
        # ... while the 10 trees added last (walked from the end backwards)
        # carry the new one
        for position in range(-1, -11, -1):
            assert_equal(model.estimators_[position, 0].max_depth, 2)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start_clear():
    # Fitting with warm_start switched back off should discard the previous
    # state and match a model that never used warm start.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in (GradientBoostingRegressor, GradientBoostingClassifier):
        reference = Cls(n_estimators=100, max_depth=1)
        reference.fit(X, y)

        toggled = Cls(n_estimators=100, max_depth=1, warm_start=True)
        toggled.fit(X, y)  # first fit initialises the state
        toggled.set_params(warm_start=False)
        toggled.fit(X, y)  # old state is cleared; should equal reference

        toggled_pred = toggled.predict(X)
        reference_pred = reference.predict(X)
        assert_array_almost_equal(toggled_pred, reference_pred)

Source File: test_samples_generator.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_make_hastie_10_2():
    # The generator should return 100 samples with 10 features each and
    # labels drawn from exactly two distinct values.
    X, y = make_hastie_10_2(n_samples=100, random_state=0)
    expected_X_shape = (100, 10)
    expected_y_shape = (100,)
    assert_equal(X.shape, expected_X_shape, "X shape mismatch")
    assert_equal(y.shape, expected_y_shape, "y shape mismatch")
    distinct_labels = np.unique(y)
    assert_equal(distinct_labels.shape, (2,), "Unexpected number of classes")

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start_equal_n_estimators():
    # A warm-start clone configured with the same n_estimators as the
    # original should predict the same after being fitted.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in (GradientBoostingRegressor, GradientBoostingClassifier):
        original = Cls(n_estimators=100, max_depth=1)
        original.fit(X, y)

        warm_clone = clone(original)
        warm_clone.set_params(n_estimators=original.n_estimators,
                              warm_start=True)
        warm_clone.fit(X, y)

        clone_pred = warm_clone.predict(X)
        original_pred = original.predict(X)
        assert_array_almost_equal(clone_pred, original_pred)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_warm_start_oob_switch():
    # Test if oob can be turned on during warm start.
    # The model is first fitted with 100 estimators and no subsampling, then
    # grown to 110 estimators with subsample=0.5.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=100, max_depth=1, warm_start=True)
        est.fit(X, y)
        est.set_params(n_estimators=110, subsample=0.5)
        est.fit(X, y)

        # the first 100 entries are expected to be zeros
        assert_array_equal(est.oob_improvement_[:100], np.zeros(100))
        # the last 10 are not zeros
        # Use the builtin ``bool`` as dtype: the ``np.bool`` alias was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        assert_array_equal(est.oob_improvement_[-10:] == 0.0,
                           np.zeros(10, dtype=bool))

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_monitor_early_stopping():
    # Check that the value returned by the ``monitor`` callback takes effect.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    base_params = dict(n_estimators=20, max_depth=1, random_state=1,
                       subsample=0.5)

    for Cls in (GradientBoostingRegressor, GradientBoostingClassifier):
        # Estimator without warm start, fitted with the monitor
        model = Cls(**base_params)
        model.fit(X, y, monitor=early_stopping_monitor)
        assert_equal(model.n_estimators, 20)  # the parameter is untouched
        assert_equal(model.estimators_.shape[0], 10)
        assert_equal(model.train_score_.shape[0], 10)
        assert_equal(model.oob_improvement_.shape[0], 10)

        # Refit without a monitor and with more estimators
        model.set_params(n_estimators=30)
        model.fit(X, y)
        assert_equal(model.n_estimators, 30)
        assert_equal(model.estimators_.shape[0], 30)
        assert_equal(model.train_score_.shape[0], 30)

        # Same scenario, starting from a warm-start estimator
        model = Cls(warm_start=True, **base_params)
        model.fit(X, y, monitor=early_stopping_monitor)
        assert_equal(model.n_estimators, 20)
        assert_equal(model.estimators_.shape[0], 10)
        assert_equal(model.train_score_.shape[0], 10)
        assert_equal(model.oob_improvement_.shape[0], 10)

        # Refit with warm start switched off and more estimators
        model.set_params(n_estimators=30, warm_start=False)
        model.fit(X, y)
        assert_equal(model.n_estimators, 30)
        assert_equal(model.train_score_.shape[0], 30)
        assert_equal(model.estimators_.shape[0], 30)
        assert_equal(model.oob_improvement_.shape[0], 30)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_max_leaf_nodes_max_depth():
    # max_leaf_nodes should take precedence over max_depth when both are set.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    leaf_budget = 4

    for GBEstimator in (GradientBoostingRegressor,
                        GradientBoostingClassifier):
        # Both limits given: the first tree is expected to be deeper than
        # max_depth=1
        capped_by_leaves = GBEstimator(max_depth=1,
                                       max_leaf_nodes=leaf_budget).fit(X, y)
        first_tree = capped_by_leaves.estimators_[0, 0].tree_
        assert_greater(first_tree.max_depth, 1)

        # Only max_depth given: the first tree has depth 1
        capped_by_depth = GBEstimator(max_depth=1).fit(X, y)
        first_tree = capped_by_depth.estimators_[0, 0].tree_
        assert_equal(first_tree.max_depth, 1)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_min_impurity_split():
    # min_impurity_split should be set on each of the base estimators.
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    for GBEstimator in (GradientBoostingRegressor,
                        GradientBoostingClassifier):
        unfitted = GBEstimator(min_impurity_split=0.1)
        # fit is invoked through assert_warns_message; its return value is
        # what gets inspected below
        fitted = assert_warns_message(DeprecationWarning,
                                      "min_impurity_decrease",
                                      unfitted.fit, X, y)
        for base_tree in fitted.estimators_.flat:
            assert_equal(base_tree.min_impurity_split, 0.1)

Source File: test_gradient_boosting.py From twitter-stock-recommendation with MIT License

5 votes

def test_min_impurity_decrease():
    # min_impurity_decrease should be handed down to every base estimator.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    for GBEstimator in (GradientBoostingRegressor,
                        GradientBoostingClassifier):
        model = GBEstimator(min_impurity_decrease=0.1)
        model.fit(X, y)
        # Only verify that the parameter reaches the trees; how the
        # parameter actually behaves is left to the tree tests.
        for base_tree in model.estimators_.flat:
            assert_equal(base_tree.min_impurity_decrease, 0.1)

Python sklearn.datasets.make_hastie_10_2() Examples