Python sklearn.ensemble.GradientBoostingClassifier() Examples
The following are 30 code examples of sklearn.ensemble.GradientBoostingClassifier(), collected from open-source projects; each example lists its source file, project, and license. You may also want to check out all available functions/classes of the module sklearn.ensemble.
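
Before the project-specific examples, here is a minimal, self-contained sketch of the basic fit/predict workflow (toy data from make_classification; the hyperparameter values are illustrative assumptions, not tuned recommendations):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Toy binary classification problem
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Illustrative hyperparameters only
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=3, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))       # mean accuracy on the held-out split
print(clf.predict_proba(X_test[:5]))   # per-class probabilities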
Example #1
Source File: grid_search_cv.py From text-classifier with Apache License 2.0
def search_cv(x_train, y_train, x_test, y_test, model=GradientBoostingClassifier(n_estimators=30)):
    # Grid search to find the best parameters
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 2, 4],
                  'gamma': [0.125, 0.25, 0.5, 1, 2, 4]}
    clf = GridSearchCV(model, param_grid=parameters)
    grid_search = clf.fit(x_train, y_train)
    # Score the results
    print("Best score: %0.3f" % grid_search.best_score_)
    print(grid_search.best_estimator_)
    # best params
    print('best params:', clf.best_params_)
    print('-----grid search end------------')
    print('on all train set')
    scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='accuracy')
    print(scores.mean(), scores)
    print('on test set')
    scores = cross_val_score(grid_search.best_estimator_, x_test, y_test, cv=3, scoring='accuracy')
    print(scores.mean(), scores)
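
Note that the parameter grid above ('kernel', 'C', 'gamma') belongs to an SVC, so calling search_cv with its default GradientBoostingClassifier model would fail with an invalid-parameter error. A minimal sketch of a grid matched to the gradient boosting model (the grid values are illustrative assumptions):

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter names valid for GradientBoostingClassifier; values are guesses
parameters = {'n_estimators': [30, 60, 100],
              'learning_rate': [0.05, 0.1, 0.2],
              'max_depth': [2, 3, 4]}
clf = GridSearchCV(GradientBoostingClassifier(), param_grid=parameters)
# clf.fit(x_train, y_train) would then populate clf.best_params_ as above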
Example #2
Source File: test_partial_dependence.py From Mastering-Elasticsearch-7.0 with MIT License
def test_recursion_decision_function(target_feature):
    # Make sure the recursion method (implicitly uses decision_function) has
    # the same result as using brute method with
    # response_method=decision_function
    X, y = make_classification(n_classes=2, n_clusters_per_class=1,
                               random_state=1)
    assert np.mean(y) == .5  # make sure the init estimator predicts 0 anyway

    est = GradientBoostingClassifier(random_state=0, loss='deviance')
    est.fit(X, y)

    preds_1, _ = partial_dependence(est, X, [target_feature],
                                    response_method='decision_function',
                                    method='recursion')
    preds_2, _ = partial_dependence(est, X, [target_feature],
                                    response_method='decision_function',
                                    method='brute')

    assert_allclose(preds_1, preds_2, atol=1e-7)
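
For visual inspection rather than raw arrays, scikit-learn's inspection module (0.21+, assumed here) also provides a plotting helper built on the same partial_dependence machinery:

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import plot_partial_dependence

X, y = make_classification(n_classes=2, n_clusters_per_class=1, random_state=1)
est = GradientBoostingClassifier(random_state=0).fit(X, y)

# One partial-dependence curve per listed feature index
plot_partial_dependence(est, X, features=[0, 1])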
Example #3
Source File: GBDT_Classify_adult.py From Machine-Learning-for-Beginner-by-Python3 with MIT License
def recspre(estrs, predata, datadict, zhe):
    mo, ze = estrs.split('-')
    model = GradientBoostingClassifier(loss='deviance', n_estimators=int(mo),
                                       max_depth=int(ze), learning_rate=0.1)
    model.fit(datadict[zhe]['train'][:, :-1], datadict[zhe]['train'][:, -1])
    # Predict
    yucede = model.predict(predata[:, :-1])
    # Compute the confusion matrix
    print(ConfuseMatrix(predata[:, -1], yucede))
    return fmse(predata[:, -1], yucede)

# Main function
Example #4
Source File: predict.py From Loan_Default_Prediction with BSD 3-Clause "New" or "Revised" License
def gbc_gp_predict(train_x, train_y, test_x):
    feature_indexs = getTopFeatures(train_x, train_y)
    sub_x_Train = get_data(train_x, feature_indexs[:16], features.feature_pair_sub_list,
                           features.feature_pair_plus_list, features.feature_pair_mul_list,
                           features.feature_pair_divide_list[:20])
    sub_x_Test = get_data(test_x, feature_indexs[:16], features.feature_pair_sub_list,
                          features.feature_pair_plus_list, features.feature_pair_mul_list,
                          features.feature_pair_divide_list[:20])
    labels = toLabels(train_y)
    gbc = GradientBoostingClassifier(n_estimators=3000, max_depth=9)
    gbc.fit(sub_x_Train, labels)
    pred_probs = gbc.predict_proba(sub_x_Test)[:, 1]
    ind_test = np.where(pred_probs > 0.55)[0]
    gp_preds_part = gbc_gp_predict_part(sub_x_Train, train_y, sub_x_Test[ind_test])
    gp_preds = np.zeros(len(test_x))
    gp_preds[ind_test] = gp_preds_part
    return gp_preds

# invoke the function gbc_svr_predict_part
Example #5
Source File: test_general.py From hyperparameter_hunter with MIT License
def env_5(request):
    return Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
        experiment_recorders=request.param,
    )

##################################################
# Experiment Fixtures
##################################################
#################### GradientBoostingClassifier Experiments ####################
Example #6
Source File: GradientBoostingClassifier.py From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
    self.handle_options(options)
    params = options.get('params', {})
    out_params = convert_params(
        params,
        strs=['loss', 'max_features'],
        floats=['learning_rate', 'min_weight_fraction_leaf'],
        ints=['n_estimators', 'max_depth', 'min_samples_split',
              'min_samples_leaf', 'max_leaf_nodes', 'random_state'],
    )

    valid_loss = ['deviance', 'exponential']
    if 'loss' in out_params:
        if out_params['loss'] not in valid_loss:
            msg = "loss must be one of: {}".format(', '.join(valid_loss))
            raise RuntimeError(msg)

    if 'max_features' in out_params:
        out_params['max_features'] = handle_max_features(out_params['max_features'])

    if 'max_leaf_nodes' in out_params and 'max_depth' in out_params:
        messages.warn('max_depth ignored when max_leaf_nodes is set')

    self.estimator = _GradientBoostingClassifier(**out_params)
Example #7
Source File: test_full_pipelines.py From python-sasctl with Apache License 2.0
def test_register_model(self, iris_dataset):
    pytest.importorskip('sklearn')
    from sasctl import register_model
    from sklearn.ensemble import GradientBoostingClassifier

    TARGET = 'Species'
    X = iris_dataset.drop(TARGET, axis=1)
    y = iris_dataset[TARGET]

    model = GradientBoostingClassifier()
    model.fit(X, y)

    model = register_model(model, self.MODEL_NAME, self.PROJECT_NAME,
                           input=X, force=True)

    assert model.name == self.MODEL_NAME
    assert model.projectName == self.PROJECT_NAME
    assert model.function.lower() == 'classification'
    assert model.algorithm.lower() == 'gradient boosting'
    assert model.tool.lower().startswith('python')
Example #8
Source File: GBDT_Classify_adult.py From Machine-Learning-for-Beginner-by-Python3 with MIT License
def Train(data, modelcount, censhu, yanzhgdata):
    model = GradientBoostingClassifier(loss='deviance', n_estimators=modelcount,
                                       max_depth=censhu, learning_rate=0.1,
                                       max_features='sqrt')
    model.fit(data[:, :-1], data[:, -1])
    # Predictions on the training data
    train_out = model.predict(data[:, :-1])
    # Compute the MSE
    train_mse = fmse(data[:, -1], train_out)[0]

    # Predictions on the validation data
    add_yan = model.predict(yanzhgdata[:, :-1])
    # Compute the F1 measure
    add_mse = fmse(yanzhgdata[:, -1], add_yan)[0]
    print(train_mse, add_mse)
    return train_mse, add_mse

# Function that determines the final combination
Example #9
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def check_classification_synthetic(presort, loss):
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.09)

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0, presort=presort)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.08)
Example #10
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_probability_log():
    # Predict probabilities.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
Example #11
Source File: gradient_boosting_blending.py From DataMiningCompetitionFirstPrize with MIT License
def learn(x, y, test_x):
    # set sample weight
    weight_list = []
    for j in range(len(y)):
        if y[j] == "0":
            weight_list.append(variables.weight_0_gdbt_b)
        if y[j] == "1000":
            weight_list.append(variables.weight_1000_gdbt_b)
        if y[j] == "1500":
            weight_list.append(variables.weight_1500_gdbt_b)
        if y[j] == "2000":
            weight_list.append(variables.weight_2000_gdbt_b)

    clf = GradientBoostingClassifier(loss='deviance',
                                     n_estimators=variables.n_estimators_gdbt_b,
                                     learning_rate=variables.learning_rate_gdbt_b,
                                     max_depth=variables.max_depth_gdbt_b,
                                     random_state=0,
                                     min_samples_split=variables.min_samples_split_gdbt_b,
                                     min_samples_leaf=variables.min_samples_leaf_gdbt_b,
                                     subsample=variables.subsample_gdbt_b,
                                     ).fit(x, y, weight_list)
    prediction_list = clf.predict(test_x)

    return prediction_list
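
If the hand-tuned weights above are meant to offset class imbalance, scikit-learn can compute comparable per-sample weights automatically; a minimal sketch under that assumption (the 'balanced' heuristic weights each class inversely to its frequency):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight

# Imbalanced toy data: roughly 90% / 10% class split
X, y = make_classification(weights=[0.9, 0.1], random_state=0)
sample_weight = compute_sample_weight('balanced', y)  # one weight per sample
clf = GradientBoostingClassifier(random_state=0).fit(X, y, sample_weight=sample_weight)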
Example #12
Source File: GradientBoosting.py From Awesome-Scripts with MIT License
def main():
    # prepare data
    trainingSet = []
    testSet = []
    accuracy = 0.0
    split = 0.25
    loadDataset('../Dataset/phishing.data', split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))

    trainData = np.array(trainingSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    columns = trainData.shape[1]
    X = np.array(trainData)
    y = np.array(trainingSet)[:, columns]

    clf = GradientBoostingClassifier()
    clf.fit(X, y)

    testData = np.array(testSet)[:, 0:np.array(trainingSet).shape[1] - 1]
    X_test = np.array(testData)
    y_test = np.array(testSet)[:, columns]
    accuracy = clf.score(X_test, y_test)
    accuracy *= 100
    print("Accuracy %:", accuracy)
Example #13
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_check_inputs_predict_stages():
    # check that predict_stages throws an error if the type of X is not
    # supported
    x, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    x_sparse_csc = csc_matrix(x)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(x, y)
    score = np.zeros((y.shape)).reshape(-1, 1)
    assert_raise_message(ValueError,
                         "When X is a sparse matrix, a CSR format is expected",
                         predict_stages, clf.estimators_, x_sparse_csc,
                         clf.learning_rate, score)
    x_fortran = np.asfortranarray(x)
    assert_raise_message(ValueError,
                         "X should be C-ordered np.ndarray",
                         predict_stages, clf.estimators_, x_fortran,
                         clf.learning_rate, score)
Example #14
Source File: test_partial_dependence.py From Mastering-Elasticsearch-7.0 with MIT License
def test_warning_recursion_non_constant_init():
    # make sure that passing a non-constant init parameter to a GBDT and using
    # recursion method yields a warning.
    gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
    gbc.fit(X, y)

    with pytest.warns(
            UserWarning,
            match='Using recursion method with a non-constant init predictor'):
        partial_dependence(gbc, X, [0], method='recursion')

    with pytest.warns(
            UserWarning,
            match='Using recursion method with a non-constant init predictor'):
        partial_dependence(gbc, X, [0], method='recursion')
Example #15
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_early_stopping_n_classes():
    # when doing early stopping (_, _, y_train, _ = train_test_split(X, y))
    # there might be classes in y that are missing in y_train. As the init
    # estimator will be trained on y_train, we need to raise an error if this
    # happens.

    X = [[1]] * 10
    y = [0, 0] + [1] * 8  # only 2 negative-class samples out of 10

    gb = GradientBoostingClassifier(n_iter_no_change=5, random_state=0,
                                    validation_fraction=8)
    with pytest.raises(
            ValueError,
            match='The training data after the early stopping split'):
        gb.fit(X, y)

    # No error if we let training data be big enough
    gb = GradientBoostingClassifier(n_iter_no_change=5, random_state=0,
                                    validation_fraction=4)
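
The test above appears to rely on validation_fraction being forwarded to train_test_split as test_size, where an integer is read as an absolute sample count. In ordinary use it is a float in (0, 1); a minimal early-stopping sketch (scikit-learn 0.20+; parameter values are illustrative):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=1000, random_state=0)

# Stop adding trees once the 10% validation split stops improving
# for 10 consecutive iterations
gb = GradientBoostingClassifier(n_estimators=1000, n_iter_no_change=10,
                                validation_fraction=0.1, random_state=0)
gb.fit(X, y)
print(gb.n_estimators_)  # trees actually fitted, usually well under 1000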
Example #16
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_probability_exponential():
    # Predict probabilities.
    clf = GradientBoostingClassifier(loss='exponential',
                                     n_estimators=100, random_state=1)

    assert_raises(ValueError, clf.predict_proba, T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)
    score = clf.decision_function(T).ravel()
    assert_array_almost_equal(y_proba[:, 1], expit(2 * score))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)
Example #17
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_zero_estimator_clf():
    # Test if init='zero' works for classification.
    X = iris.data
    y = np.array(iris.target)

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='zero')
    est.fit(X, y)

    assert_greater(est.score(X, y), 0.96)

    # binary clf
    mask = y != 0
    y[mask] = 1
    y[~mask] = 0
    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='zero')
    est.fit(X, y)
    assert_greater(est.score(X, y), 0.96)

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='foobar')
    assert_raises(ValueError, est.fit, X, y)
Example #18
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_more_verbose_output():
    # Check verbose=2 does not cause error.
    from io import StringIO
    import sys
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1,
                                     verbose=2)
    clf.fit(X, y)
    verbose_output = sys.stdout
    sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # no OOB
    true_header = ' '.join(['%10s'] + ['%16s'] * 2) % (
        'Iter', 'Train Loss', 'Remaining Time')
    assert_equal(true_header, header)

    n_lines = sum(1 for l in verbose_output.readlines())
    # 100 lines for n_estimators==100
    assert_equal(100, n_lines)
Example #19
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_verbose_output():
    # Check verbose=1 does not cause error.
    from io import StringIO
    import sys
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1,
                                     verbose=1, subsample=0.8)
    clf.fit(X, y)
    verbose_output = sys.stdout
    sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # with OOB
    true_header = ' '.join(['%10s'] + ['%16s'] * 3) % (
        'Iter', 'Train Loss', 'OOB Improve', 'Remaining Time')
    assert_equal(true_header, header)

    n_lines = sum(1 for l in verbose_output.readlines())
    # one for 1-10 and then 9 for 20-100
    assert_equal(10 + 9, n_lines)
Example #20
Source File: test_boosted_trees_classifier.py From coremltools with BSD 3-Clause "New" or "Revised" License
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    import numpy as np

    scikit_data = load_boston()
    scikit_model = GradientBoostingClassifier(random_state=1)
    t = scikit_data.target
    target = np.digitize(t, np.histogram(t)[1]) - 1
    scikit_model.fit(scikit_data.data, target)
    self.target = target

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
Example #21
Source File: test_gradient_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_serialization():
    # Check model serialization.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))

    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL)
    clf = None
    clf = pickle.loads(serialized_clf)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(100, len(clf.estimators_))
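
joblib is commonly preferred to raw pickle for persisting scikit-learn estimators, since it handles the embedded numpy arrays efficiently; a minimal equivalent sketch (the file name is arbitrary):

import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, random_state=1).fit(X, y)

joblib.dump(clf, 'gbc.joblib')        # serialize the fitted model to disk
restored = joblib.load('gbc.joblib')  # round-trip it back
assert (restored.predict(X) == clf.predict(X)).all()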
Example #22
Source File: sentiment_analysis_ml.py From Sentiment_Analysis_cnn_lstm_cnnlstm_textcnn_bilstm with Apache License 2.0
def gbdt_classifier(train_vecs, y_train, test_vecs, y_test):
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=10, random_state=0)
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, storedpaths + 'model_gbdt.pkl')
    test_scores = clf.score(test_vecs, y_test)
    return test_scores

# Train the nearest-neighbors classifier
Example #23
Source File: classification.py From pyImSegm with BSD 3-Clause "New" or "Revised" License
def create_classifiers(nb_workers=-1):
    """ create all classifiers with default parameters

    :param int nb_workers: number of parallel jobs, if possible
    :return dict: {str: clf}

    >>> classifs = create_classifiers()
    >>> classifs  # doctest: +ELLIPSIS
    {...}
    >>> sum([isinstance(create_clf_param_search_grid(k), dict)
    ...      for k in classifs.keys()])
    7
    >>> sum([isinstance(create_clf_param_search_distrib(k), dict)
    ...      for k in classifs.keys()])
    7
    """
    clfs = {
        'RandForest': ensemble.RandomForestClassifier(n_estimators=20,
                                                      # oob_score=True,
                                                      min_samples_leaf=2,
                                                      min_samples_split=3,
                                                      n_jobs=nb_workers),
        'GradBoost': ensemble.GradientBoostingClassifier(subsample=0.25,
                                                         warm_start=False,
                                                         max_depth=6,
                                                         min_samples_leaf=6,
                                                         n_estimators=200,
                                                         min_samples_split=7),
        'LogistRegr': linear_model.LogisticRegression(solver='sag',
                                                      n_jobs=nb_workers),
        'KNN': neighbors.KNeighborsClassifier(n_jobs=nb_workers),
        'SVM': svm.SVC(kernel='rbf', probability=True,
                       tol=2e-3, max_iter=5000),
        'DecTree': tree.DecisionTreeClassifier(),
        # 'RBM': create_pipeline_neuron_net(),
        'AdaBoost': ensemble.AdaBoostClassifier(n_estimators=5),
        # 'NuSVM-rbf': svm.NuSVC(kernel='rbf', probability=True),
    }
    return clfs
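
A minimal sketch of how such a dictionary might be consumed, fitting each classifier on a toy split and printing its accuracy (assumes the create_classifiers() definition above is in scope):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

for name, clf in create_classifiers().items():
    clf.fit(X_train, y_train)
    print(name, clf.score(X_test, y_test))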
Example #24
Source File: predict.py From Loan_Default_Prediction with BSD 3-Clause "New" or "Revised" License
def gbc_classify(train_x, train_y):
    feature_indexs = getTopFeatures(train_x, train_y)
    sub_x_Train = get_data(train_x, feature_indexs[:16], features.feature_pair_sub_list,
                           features.feature_pair_plus_list, features.feature_pair_mul_list,
                           features.feature_pair_divide_list[:20],
                           features.feature_pair_sub_mul_list[:20])
    labels = toLabels(train_y)
    gbc = GradientBoostingClassifier(n_estimators=3000, max_depth=8)
    gbc.fit(sub_x_Train, labels)
    return gbc

# use svm to predict the loss, based on the result of gbm classifier
Example #25
Source File: test_boosted_trees_classifier.py From coremltools with BSD 3-Clause "New" or "Revised" License
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = GradientBoostingClassifier()
        spec = skl_converter.convert(model, "data", "out")

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, "data", "out")
Example #26
Source File: binary.py From stacking with MIT License
def build_model(self):
    return GradientBoostingClassifier(**self.params)
Example #27
Source File: utils.py From disentanglement_lib with Apache License 2.0
def gradient_boosting_classifier():
    """Default gradient boosting classifier."""
    return GradientBoostingClassifier()
Example #28
Source File: test_boosted_trees_classifier.py From coremltools with BSD 3-Clause "New" or "Revised" License
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = GradientBoostingClassifier(random_state=1)
    target = scikit_data["target"] > scikit_data["target"].mean()
    scikit_model.fit(scikit_data["data"], target)

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
Example #29
Source File: test_general.py From hyperparameter_hunter with MIT License
def exp_gbc_1():
    return CVExperiment(GradientBoostingClassifier, dict(subsample=0.5))

#################### KNeighborsClassifier Experiments ####################
Example #30
Source File: test_core_pipeline.py From lale with Apache License 2.0
def test_import_from_sklearn_pipeline_noop(self):
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import GradientBoostingClassifier
    from lale.helpers import import_from_sklearn_pipeline
    pipe = Pipeline([('noop', None), ('gbc', GradientBoostingClassifier())])
    with self.assertRaises(ValueError):
        imported_pipeline = import_from_sklearn_pipeline(pipe)