Python sklearn.model_selection.cross_val_predict() Examples

The following are 30 code examples of sklearn.model_selection.cross_val_predict(), collected from open-source projects. The project and license for each example are listed in the Source File line above its code.
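Before the project examples, a minimal sketch of the function's basic contract may help: cross_val_predict returns one out-of-fold prediction per sample, which can then be scored against the true labels. The dataset and estimator below are illustrative, not taken from any of the projects.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

X, y = load_iris(return_X_y=True)
# Each entry of y_pred was predicted by a model that never saw that sample.
y_pred = cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=5)
print(accuracy_score(y, y_pred))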
Example #1
Source File: multi_class_classification.py    From edge2vec with BSD 3-Clause "New" or "Revised" License
def multi_class_classification(data_X, data_Y):
    '''
    calculate multi-class classification and print related evaluation metrics
    '''
    # imports are module-level in the original source
    from sklearn import metrics, svm
    from sklearn.model_selection import cross_val_predict

    svc = svm.SVC(C=1, kernel='linear')
    # X_train, X_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.4, random_state=0)
    clf = svc.fit(data_X, data_Y)  # svm
    # array = svc.coef_
    # print(array)
    predicted = cross_val_predict(clf, data_X, data_Y, cv=2)
    print("accuracy", metrics.accuracy_score(data_Y, predicted))
    print("f1 score macro", metrics.f1_score(data_Y, predicted, average='macro'))
    print("f1 score micro", metrics.f1_score(data_Y, predicted, average='micro'))
    print("precision score", metrics.precision_score(data_Y, predicted, average='macro'))
    print("recall score", metrics.recall_score(data_Y, predicted, average='macro'))
    print("hamming_loss", metrics.hamming_loss(data_Y, predicted))
    print("classification_report", metrics.classification_report(data_Y, predicted))
    # jaccard_similarity_score was renamed jaccard_score in scikit-learn 0.21
    print("jaccard_similarity_score", metrics.jaccard_similarity_score(data_Y, predicted))
    # print("log_loss", metrics.log_loss(data_Y, predicted))
    print("zero_one_loss", metrics.zero_one_loss(data_Y, predicted))
    # print("AUC&ROC", metrics.roc_auc_score(data_Y, predicted))
    # print("matthews_corrcoef", metrics.matthews_corrcoef(data_Y, predicted))
Example #2
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cv_pipeline_precomputed():
    # Cross-validate a regression on four coplanar points with the same
    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
    # is treated as a _pairwise operation.
    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    y_true = np.ones((4,))
    K = X.dot(X.T)
    kcent = KernelCenterer()
    pipeline = Pipeline([("kernel_centerer", kcent), ("svr",
                        SVR(gamma='scale'))])

    # did the pipeline set the _pairwise attribute?
    assert pipeline._pairwise

    # test cross-validation, score should be almost perfect
    # NB: this test is pretty vacuous -- it's mainly to test integration
    #     of Pipeline and KernelCenterer
    y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
    assert_array_almost_equal(y_true, y_pred) 
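The _pairwise check above matters because a precomputed Gram matrix must be sliced on both axes during cross-validation: K[train][:, train] for fitting and K[test][:, train] for prediction. A minimal standalone sketch with illustrative data (not part of the test suite):

import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVR

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = X @ np.array([1.0, -2.0, 0.5])
K = X @ X.T  # precomputed linear kernel
# SVR(kernel='precomputed') is flagged as pairwise, so each fold slices
# rows and columns of K consistently.
y_pred = cross_val_predict(SVR(kernel='precomputed'), K, y, cv=5)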
Example #3
Source File: TermDocMatrix.py    From scattertext with Apache License 2.0
def get_logistic_regression_coefs_l2(self, category,
                                         clf=RidgeClassifierCV()):
        ''' Computes l2-penalized logistic regression score.

        Parameters
        ----------
        category : str
            category name to score

        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        try:
            from sklearn.cross_validation import cross_val_predict
        except ImportError:
            from sklearn.model_selection import cross_val_predict
        y = self._get_mask_from_category(category)
        X = TfidfTransformer().fit_transform(self._X)
        clf.fit(X, y)
        y_hat = cross_val_predict(clf, X, y)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        return clf.coef_[0], acc, baseline 
Example #4
Source File: plotfunctions.py    From DataScience-webapp-with-flask with MIT License
def plot_predVSreal(X, y, classifier, cv):
    from sklearn.model_selection import cross_val_predict
    # cross_val_predict returns an array of the same size as `y` where each entry
    # is a prediction obtained by cross validation:
    predicted = cross_val_predict(classifier, X, y, cv=cv)
    plt.gcf().clear()
    plt.scatter(y, predicted, edgecolors=(0, 0, 0))
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    from io import BytesIO
    figfile = BytesIO()
    plt.savefig(figfile, format='png')
    figfile.seek(0)  # rewind to beginning of file
    import base64
    figdata_png = base64.b64encode(figfile.getvalue())
    return figdata_png 
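Since the helper returns base64-encoded PNG bytes, the caller can inline the plot in an HTML response. A hedged usage sketch (the dataset, estimator, and template string are illustrative, not from the original webapp):

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

X, y = load_diabetes(return_X_y=True)
png_b64 = plot_predVSreal(X, y, LinearRegression(), cv=5).decode('ascii')
# Embed directly in a page, e.g. via a Flask/Jinja2 template variable:
img_tag = '<img src="data:image/png;base64,{}"/>'.format(png_b64)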
Example #5
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv) 
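For contrast, a minimal sketch of the passing case (synthetic groups, illustrative only): supplying groups gives group-aware splitters what they need, and omitting it raises exactly the ValueError the test asserts.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.svm import SVC

X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
groups = np.repeat(np.arange(4), 5)  # four groups of five samples each
y_pred = cross_val_predict(SVC(kernel='linear'), X, y,
                           groups=groups, cv=GroupKFold(n_splits=4))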
Example #6
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def check_cross_val_predict_multiclass(est, X, y, method):
    """Helper for tests of cross_val_predict with multiclass classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    float_min = np.finfo(np.float64).min
    default_values = {'decision_function': float_min,
                      'predict_log_proba': float_min,
                      'predict_proba': 0}
    expected_predictions = np.full((len(X), len(set(y))),
                                   default_values[method],
                                   dtype=np.float64)
    _, y_enc = np.unique(y, return_inverse=True)
    for train, test in cv.split(X, y_enc):
        est = clone(est).fit(X[train], y_enc[train])
        fold_preds = getattr(est, method)(X[test])
        i_cols_fit = np.unique(y_enc[train])
        expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds

    # Check actual outputs for several representations of y
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions) 
Example #7
Source File: test_mlp_classifier.py    From muffnn with BSD 3-Clause "New" or "Revised" License
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.

    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96) 
Example #8
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv) 
Example #9
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cross_val_predict_unbalanced():
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    cv = StratifiedKFold(n_splits=2, random_state=1)
    train, test = list(cv.split(X, y))
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test[0]][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test[0]][:, 2] == 0)
    assert np.all(yhat_proba[test[0]][:, 0:1] > 0)
    assert np.all(yhat_proba[test[1]] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12) 
Example #10
Source File: _test.py    From ibex with BSD 3-Clause "New" or "Revised" License
def _generate_cross_val_predict_test(X, y, est, pd_est, must_match):
    def test(self):
        self.assertEqual(
            hasattr(est, 'predict'),
            hasattr(pd_est, 'predict'))
        if not hasattr(est, 'predict'):
            return
        pd_y_hat = pd_cross_val_predict(pd_est, X, y)
        self.assertTrue(isinstance(pd_y_hat, pd.Series))
        self.assertTrue(pd_y_hat.index.equals(X.index))
        if must_match:
            y_hat = cross_val_predict(est, X.as_matrix(), y.values)
            np.testing.assert_allclose(pd_y_hat, y_hat)
    return test 
Example #11
Source File: stacked_classifiers_standard.py    From baikal with BSD 3-Clause "New" or "Revised" License
def fit_predict(self, X, y):
    self.fit(X, y)
    return cross_val_predict(self, X, y, method="predict_proba") 
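This pattern is the heart of stacking: the meta-learner must be trained on predictions that the base model produced for samples it never saw during fitting, otherwise the target leaks into the second level. A minimal sketch of the same idea outside the baikal API (estimator choices are illustrative):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=200, random_state=0)
base = RandomForestClassifier(n_estimators=50, random_state=0)
# Out-of-fold probabilities become the meta-learner's training features.
meta_features = cross_val_predict(base, X, y, cv=5, method='predict_proba')
meta = LogisticRegression().fit(meta_features, y)
base.fit(X, y)  # refit the base model on all data for use at predict time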
Example #12
Source File: cross_validation.py    From Pyspatialml with GNU General Public License v3.0
def fit(self, X, y=None, groups=None, **fit_params):
        """
        Run fit method with all sets of parameters

        Args
        ----
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning

        groups : array-like, shape = [n_samples], optional
            Training vector groups for cross-validation

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """

        # check estimator and cv methods are valid
        self.cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        # check for binary response
        if len(np.unique(y)) > 2:
            raise ValueError('Only a binary response vector is currently supported')

        # check that scoring metric has been specified
        if self.scoring is None:
            raise ValueError('No score function is defined')

        # perform cross validation prediction
        self.y_pred_ = cross_val_predict(
            estimator=self.estimator, X=X, y=y, groups=groups, cv=self.cv,
            method='predict_proba', n_jobs=self.n_jobs, **fit_params)
        self.y_true = y

        # add fold id to the predictions
        self.test_idx_ = [indexes[1] for indexes in self.cv.split(X, y, groups)] 
Example #13
Source File: classifier_selection.py    From causallib with Apache License 2.0
def _select_classifier_from_list(candidates, X, A, n_splits=5, seed=None, loss_type='01'):
    accuracies = np.zeros(len(candidates))

    class_weight = compute_class_weight('balanced', np.unique(A), A)[LabelEncoder().fit_transform(A)]

    for model_idx, m in enumerate(candidates):
        # obtain (cross-validated) predictions for this candidate
        if n_splits >= 2:
            cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
            if loss_type == '01':
                pred = cross_val_predict(m, X=X, y=A, cv=cv,
                                         fit_params={'sample_weight': class_weight}).reshape(-1)
            else:
                ps = cross_val_predict(m, X=X, y=A, cv=cv,
                                       fit_params={'sample_weight': class_weight},
                                       method='predict_proba')
                pred = ps[:, 1]
        else:
            m.fit(X, A, sample_weight=class_weight)
            if loss_type == '01':
                pred = m.predict(X=X)
            else:
                pred = m.predict_proba(X=X)[:, 1]

        # score this candidate; this block must sit inside the loop,
        # otherwise only the last candidate would ever be scored
        if loss_type == '01':
            accuracies[model_idx] = np.sum(class_weight[pred == A]) / np.sum(class_weight)
        else:
            # weighted log-likelihood; assumes A is coded as -1/1
            logl = np.zeros(A.shape)
            logl[A == -1] = np.log(1.0 - pred[A == -1])
            logl[A == 1] = np.log(pred[A == 1])
            accuracies[model_idx] = np.sum(class_weight * logl) / np.sum(class_weight)

    i_best = np.argmax(accuracies)
    # print('accuracies =', accuracies, "accuracies-sorted", sorted(accuracies))
    # print('Selected model {} {}'.format(i_best, candidates[i_best]))
    return candidates[i_best]
Example #14
Source File: classification_tests.py    From drifter_ml with MIT License
def cross_val_roc_auc(self, clf, cv=3, average="micro"):
        self.roc_auc_exception()
        roc_auc_score = partial(self.roc_auc_score, average=average)
        y_pred = cross_val_predict(clf, self.X, self.y, cv=cv)
        return roc_auc_score(self.y, y_pred) 
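Note that this computes AUC from hard cross-validated labels. For a threshold-free score, cross_val_predict can return probabilities instead; a hedged sketch outside the drifter_ml API (data and estimator illustrative):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=200, random_state=0)
proba = cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=3,
                          method='predict_proba')
print(roc_auc_score(y, proba[:, 1]))  # rank-based AUC on out-of-fold probabilities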
Example #15
Source File: classification_tests.py    From drifter_ml with MIT License
def cross_val_f1(self, clf, cv=3, average="binary"):
        average = self.reset_average(average)
        f1_score = partial(self.f1_score, average=average)
        y_pred = cross_val_predict(clf, self.X, self.y, cv=cv)
        return f1_score(self.y, y_pred) 
Example #16
Source File: classification_tests.py    From drifter_ml with MIT License
def cross_val_recall(self, clf, cv=3, average="binary"):
        average = self.reset_average(average)
        recall_score = partial(self.recall_score, average=average)
        y_pred = cross_val_predict(clf, self.X, self.y, cv=cv)
        return recall_score(self.y, y_pred) 
Example #17
Source File: classification_tests.py    From drifter_ml with MIT License
def cross_val_precision(self, clf, cv=3, average="binary"):
        average = self.reset_average(average)
        precision_score = partial(self.precision_score, average=average)
        y_pred = cross_val_predict(clf, self.X, self.y, cv=cv)
        return precision_score(self.y, y_pred) 
Example #18
Source File: sklearn_steps.py    From baikal with BSD 3-Clause "New" or "Revised" License
def _fit_predict_proba(self, X, y):
    self.fit(X, y)
    return cross_val_predict(self, X, y, method="predict_proba") 
Example #19
Source File: sklearn_steps.py    From baikal with BSD 3-Clause "New" or "Revised" License
def _fit_decision_function(self, X, y):
    self.fit(X, y)
    return cross_val_predict(self, X, y, method="decision_function") 
Example #20
Source File: classification_tests.py    From drifter_ml with MIT License
def cross_val_precision_per_class(self, clf, cv=3, average="binary"):
        average = self.reset_average(average)
        precision_score = partial(self.precision_score, average=average)
        y_pred = cross_val_predict(clf, self.X, self.y, cv=cv)
        precision = {}
        for klass in self.classes:
            y_pred_class = np.take(y_pred, self.y[self.y == klass].index, axis=0)
            y_class = self.y[self.y == klass]
            precision[klass] = precision_score(y_class, y_pred_class) 
        return precision 
Example #21
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_predict_predict_proba_shape():
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)

    preds = cross_val_predict(LogisticRegression(), X, y,
                              method='predict_proba')
    assert_equal(preds.shape, (50, 2))

    X, y = load_iris(return_X_y=True)

    preds = cross_val_predict(LogisticRegression(), X, y,
                              method='predict_proba')
    assert_equal(preds.shape, (150, 3)) 
Example #22
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_predict_predict_log_proba_shape():
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)

    preds = cross_val_predict(LogisticRegression(), X, y,
                              method='predict_log_proba')
    assert_equal(preds.shape, (50, 2))

    X, y = load_iris(return_X_y=True)

    preds = cross_val_predict(LogisticRegression(), X, y,
                              method='predict_log_proba')
    assert_equal(preds.shape, (150, 3)) 
Example #23
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_predict_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y2)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cross_val_predict(clf, X_df, y_ser) 
Example #24
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_predict_sparse_prediction():
    # check that cross_val_predict gives same result for sparse and dense input
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          return_indicator=True,
                                          random_state=1)
    X_sparse = csr_matrix(X)
    y_sparse = csr_matrix(y)
    classif = OneVsRestClassifier(SVC(kernel='linear'))
    preds = cross_val_predict(classif, X, y, cv=10)
    preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10)
    preds_sparse = preds_sparse.toarray()
    assert_array_almost_equal(preds_sparse, preds) 
Example #25
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def check_cross_val_predict_with_method(est):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))

    kfold = KFold()

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test alternative representations of y
        predictions_y1 = cross_val_predict(est, X, y + 1, method=method,
                                           cv=kfold)
        assert_array_equal(predictions, predictions_y1)

        predictions_y2 = cross_val_predict(est, X, y - 2, method=method,
                                           cv=kfold)
        assert_array_equal(predictions, predictions_y2)

        predictions_ystr = cross_val_predict(est, X, y.astype('str'),
                                             method=method, cv=kfold)
        assert_array_equal(predictions, predictions_ystr) 
Example #26
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_predict_method_checking():
    # Regression test for issue #9639. Tests that cross_val_predict does not
    # check estimator methods (e.g. predict_proba) before fitting
    est = SGDClassifier(loss='log', random_state=2)
    check_cross_val_predict_with_method(est) 
Example #27
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_predict_class_subset():

    X = np.arange(200).reshape(100, 2)
    y = np.array([x//10 for x in range(100)])
    classes = 10

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)

    le = LabelEncoder()

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        # Test with n_splits=3
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)

        # Runs a naive loop (should be same as cross_val_predict):
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test with n_splits=4
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold4)
        expected_predictions = get_expected_predictions(X, y, kfold4, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels
        y = shuffle(np.repeat(range(10), 10), random_state=0)
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)
        y = le.fit_transform(y)
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions) 
Example #28
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def check_cross_val_predict_binary(est, X, y, method):
    """Helper for tests of cross_val_predict with binary classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    if y.ndim == 1:
        exp_shape = (len(X),) if method == 'decision_function' else (len(X), 2)
    else:
        exp_shape = y.shape
    expected_predictions = np.zeros(exp_shape)
    for train, test in cv.split(X, y):
        est = clone(est).fit(X[train], y[train])
        expected_predictions[test] = getattr(est, method)(X[test])

    # Check actual outputs for several representations of y
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions) 
Example #29
Source File: estimator.py    From EDeN with MIT License
def cross_val_predict(self, graphs, targets, cv=5):
        """cross_val_score."""
        x = self.transform(graphs)
        scores = cross_val_predict(
            self.model, x, targets, cv=cv, method='decision_function')
        return scores 
Example #30
Source File: TermDocMatrix.py    From scattertext with Apache License 2.0
def get_logistic_regression_coefs_l1(self, category,
                                         clf=LassoCV(alphas=[0.1, 0.001],
                                                     max_iter=10000,
                                                     n_jobs=-1)):
        ''' Computes l1-penalized (lasso) regression score.
        Parameters
        ----------
        category : str
            category name to score

        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        try:
            from sklearn.cross_validation import cross_val_predict
        except ImportError:
            from sklearn.model_selection import cross_val_predict
        y = self._get_mask_from_category(category)
        y_continuous = self._get_continuous_version_boolean_y(y)
        # X = TfidfTransformer().fit_transform(self._X)
        X = self._X

        # cross_val_predict clones clf internally, so a single fit afterwards
        # is enough to populate coef_
        y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        clf.fit(X, y_continuous)
        return clf.coef_, acc, baseline