Python Examples of sklearn.cross_validation.StratifiedKFold

Source File: embutils.py From DynamicTriad with Apache License 2.0

6 votes

def _validate_link_reconstruction(self, samples, lbs):
        """Return the mean F1 of a 2-fold cross-validated link classifier.

        The feature of each sample is the element-wise absolute difference
        between columns 0 and 1 of ``self.make_features(samples)``. One
        ``LogisticRegression`` instance is refitted on every training fold
        and scored with ``f1_score`` on the matching held-out fold.
        """
        pair_feat = self.make_features(samples)
        feat = np.abs(pair_feat[:, 0] - pair_feat[:, 1])

        clf = LogisticRegression()
        try:
            # First try the constructor that takes the labels and ``n_folds``;
            # the resulting object is iterated directly.
            parts = StratifiedKFold(lbs, n_folds=2, shuffle=True)
        except TypeError:
            # Otherwise use the ``n_splits`` constructor and ask it to split.
            parts = StratifiedKFold(n_splits=2, shuffle=True).split(feat, lbs)

        val_score = []
        for tr, te in parts:
            predicted = clf.fit(feat[tr], lbs[tr]).predict(feat[te])
            val_score.append(f1_score(lbs[te], predicted))
        return np.mean(val_score)

Source File: test_grid_search.py From diogenes with MIT License

6 votes

def test_slice_on_dimension(self):
        """Slice an Experiment by classifier and by subset parameters.

        For each slice, the string form of every trial is compared with a
        stored reference pickle.
        """
        iris = datasets.load_iris()
        M, y = iris.data, iris.target
        forest_grid = {'clf': RandomForestClassifier,
                       'n_estimators': [10, 100],
                       'max_depth': [1, 10],
                       'random_state': [0]}
        svc_grid = {'clf': SVC, 'kernel': ['linear', 'rbf'],
                    'random_state': [0]}
        subsets = [{'subset': per.SubsetRandomRowsActualDistribution,
                    'subset_size': [20, 40, 60, 80, 100],
                    'random_state': [0]}]
        cvs = [{'cv': StratifiedKFold}]
        exp = per.Experiment(M, y, [forest_grid, svc_grid], subsets, cvs)

        forest_slice = exp.slice_on_dimension(per.CLF, RandomForestClassifier)
        result = [str(trial) for trial in forest_slice.trials]
        self.__compare_to_ref_pkl(result, 'slice_on_dimension_clf')

        size_slice = exp.slice_on_dimension(per.SUBSET_PARAMS,
                                            {'subset_size': 60})
        result = [str(trial) for trial in size_slice.trials]
        self.__compare_to_ref_pkl(result, 'slice_on_dimension_subset_params')

Source File: test_grid_search.py From diogenes with MIT License

6 votes

def test_slice_by_best_score(self):
        """Run an Experiment and check its best-score slice against a reference.

        The result maps the string form of each trial in the slice to that
        trial's average score.
        """
        iris = datasets.load_iris()
        M, y = iris.data, iris.target
        forest_grid = {'clf': RandomForestClassifier,
                       'n_estimators': [10, 100],
                       'max_depth': [1, 10],
                       'random_state': [0]}
        svc_grid = {'clf': SVC, 'kernel': ['linear', 'rbf'],
                    'random_state': [0]}
        subsets = [{'subset': per.SubsetRandomRowsActualDistribution,
                    'subset_size': [20, 40],
                    'random_state': [0]}]
        cvs = [{'cv': StratifiedKFold}]
        exp = per.Experiment(M, y, [forest_grid, svc_grid], subsets, cvs)
        exp.run()

        best = exp.slice_by_best_score(per.CLF_PARAMS)
        result = {str(trial): trial.average_score() for trial in best.trials}
        self.__compare_to_ref_pkl(result, 'slice_by_best_score')

Source File: test_grid_search.py From diogenes with MIT License

6 votes

def test_make_csv(self):
        """Build an Experiment over generated data and call ``make_csv``.

        There are no assertions here; the test only requires that the call
        completes without raising.
        """
        M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)
        forest_grid = {'clf': RandomForestClassifier,
                       'n_estimators': [10, 100],
                       'max_depth': [5, 25],
                       'random_state': [0]}
        svc_grid = {'clf': SVC,
                    'kernel': ['linear', 'rbf'],
                    'probability': [True],
                    'random_state': [0]}
        row_sweep = {'subset': per.SubsetSweepNumRows,
                     'num_rows': [[100, 200]],
                     'random_state': [0]}
        fold_grid = {'cv': StratifiedKFold,
                     'n_folds': [2, 3]}
        exp = per.Experiment(M, y,
                             clfs=[forest_grid, svc_grid],
                             subsets=[row_sweep],
                             cvs=[fold_grid])
        exp.make_csv()

Source File: test_grid_search.py From diogenes with MIT License

6 votes

def test_report_complex(self):
        """Generate a per-classifier report and attach it to ``self.report``."""
        M, y = uft.generate_test_matrix(100, 5, 2)
        forest_grid = {'clf': RandomForestClassifier,
                       'n_estimators': [10, 100],
                       'max_depth': [1, 10],
                       'random_state': [0]}
        svc_grid = {'clf': SVC,
                    'kernel': ['linear', 'rbf'],
                    'probability': [True],
                    'random_state': [0]}
        subsets = [{'subset': per.SubsetRandomRowsActualDistribution,
                    'subset_size': [20, 40, 60, 80, 100],
                    'random_state': [0]}]
        cvs = [{'cv': StratifiedKFold}]
        exp = per.Experiment(M, y, [forest_grid, svc_grid], subsets, cvs)

        # make_report returns a pair; only the report object is used here.
        report_pair = exp.make_report(dimension=per.CLF,
                                      return_report_object=True,
                                      verbose=False)
        _, rep = report_pair
        self.report.add_heading('test_report_complex', 1)
        self.report.add_subreport(rep)

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

6 votes

def test_stratified_kfold_no_shuffle():
    """Check the exact 2-fold splits produced without shuffling.

    StratifiedKFold should preserve the data ordering as much as possible on
    toy datasets, so that sample dependencies are not hidden.
    """
    # Each case: (labels, [(expected_train, expected_test) per fold]).
    cases = [
        ([1, 1, 0, 0],
         [([1, 3], [0, 2]),
          ([0, 2], [1, 3])]),
        ([1, 1, 1, 0, 0, 0, 0],
         [([2, 5, 6], [0, 1, 3, 4]),
          ([0, 1, 3, 4], [2, 5, 6])]),
    ]
    for labels, expected_folds in cases:
        splits = iter(cval.StratifiedKFold(labels, 2))
        for expected_train, expected_test in expected_folds:
            train, test = next(splits)
            assert_array_equal(test, expected_test)
            assert_array_equal(train, expected_train)

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

6 votes

def test_cross_val_generator_with_indices():
    """Check that every CV generator yields index arrays, not boolean masks.

    Both the train and the test part of each split must have a non-boolean
    dtype and must be usable to index ``X`` and ``y``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # The original asserted on `train` twice, leaving `test` unchecked.
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must succeed for both parts; the values are not used.
            X[train], X[test]
            y[train], y[test]

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

6 votes

def test_cross_val_generator_with_default_indices():
    """Check that CV generators built with default arguments yield indices.

    Both the train and the test part of each split must have a non-boolean
    dtype and must be usable to index ``X`` and ``y``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # The original asserted on `train` twice, leaving `test` unchecked.
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must succeed for both parts; the values are not used.
            X[train], X[test]
            y[train], y[test]

Source File: class_w2v.py From 2016CCF-sougou with Apache License 2.0

6 votes

def validation(self, X, Y, kind):
        """Validate with 2-fold stratified cross-validation.

        For each fold, ``self.fit(X_train, y_train, X_test)`` is called and
        its return value is compared element-wise with the held-out labels.

        :param X: samples, indexed with the fold index arrays
        :param Y: labels, indexed the same way and used for stratification
        :param kind: not used by this method
        :return: mean of the per-fold accuracies
        """
        # Every print below passes one pre-formatted string, so the text is
        # identical under the Python 2 print statement and the Python 3
        # print function (the original statements were Python 2 only).
        print('validating...')
        fold_n = 2
        # NOTE(review): shuffle is not enabled, so random_state presumably has
        # no effect on the split -- confirm against the sklearn docs.
        folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print('%d -fold' % (j + 1))
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            res = self.fit(X_train, y_train, X_test)
            # Fraction of held-out samples whose prediction equals the label.
            cur = sum(y_test == res) * 1.0 / len(res)
            score[j] = cur
        print('%s %s' % (score, score.mean()))
        return score.mean()

Source File: test_grid_search.py From twitter-stock-recommendation with MIT License

6 votes

def test_grid_search_score_consistency():
    """Check that GridSearchCV reports the scores of the requested metric.

    For 'f1' and 'roc_auc', every per-fold score stored in ``grid_scores_``
    is recomputed by hand on the same stratified 3-fold split and compared.
    """
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_folds=3, y=y)
        for C, grid_entry in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            # Element 2 of a grid-score entry holds the separate fold runs.
            fold_scores = grid_entry[2]
            for fold, (train, test) in enumerate(cv):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, fold_scores[fold])

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

6 votes

def test_stratified_kfold_ratios():
    """Check that stratified k-fold keeps the label ratios in every split.

    The check is repeated with shuffling turned off and on, for both the
    train and the test part of each of the 5 folds.
    """
    n_samples = 1000
    # (label value, expected share of the samples), in construction order.
    expected_ratios = [(4, 0.10), (0, 0.89), (1, 0.01)]
    labels = np.array([label
                       for label, ratio in expected_ratios
                       for _ in range(int(ratio * n_samples))])
    for shuffle in [False, True]:
        for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
            for part in (train, test):
                for label, ratio in expected_ratios:
                    observed = np.sum(labels[part] == label) / len(part)
                    assert_almost_equal(observed, ratio, 2)

Source File: stacker.py From brew with MIT License

6 votes

def fit_layer(self, layer_idx, X, y):
        """Fit layer ``layer_idx`` and, recursively, every layer after it.

        An index past the last layer is a no-op. The last layer is fitted
        directly on ``(X, y)``. Any earlier layer is first fitted per
        stratified fold to produce out-of-fold outputs, then refitted on
        all of ``(X, y)``; the out-of-fold outputs are the input of the
        next layer.
        """
        n_layers = len(self.layers)
        if layer_idx >= n_layers:
            return

        layer = self.layers[layer_idx]
        if layer_idx == n_layers - 1:
            layer.fit(X, y)
            return

        n_classes = len(set(y)) - 1
        n_classifiers = len(layer)
        output = np.zeros((X.shape[0], n_classes * n_classifiers))
        for tra, tst in cross_validation.StratifiedKFold(y, self.cv):
            layer.fit(X[tra], y[tra])
            out = layer.output(X[tst], mode=self.mode)
            # Drop index 0 along axis 1, then flatten the last two axes.
            flat_width = (out.shape[1] - 1) * out.shape[2]
            output[tst, :] = out[:, 1:, :].reshape(out.shape[0], flat_width)

        layer.fit(X, y)
        self.fit_layer(layer_idx + 1, output, y)

Source File: simple_benchmark.py From RotationForest with MIT License

6 votes

def test_toy_data(name, clf):
    """Benchmark ``clf`` with 5-fold stratified cross-validation.

    The data comes from ``classification_data()``. Per fold, the accuracy of
    ``clf.predict`` and the ROC AUC of column 1 of ``clf.predict_proba`` are
    recorded; their means and standard deviations are printed and returned.

    :param name: label printed above the results and stored in the result
    :param clf: classifier exposing ``fit``, ``predict`` and ``predict_proba``
    :return: dict with keys ``name``, ``acc_mean``, ``acc_std``,
        ``auc_mean`` and ``auc_std``
    """
    X, y = classification_data()
    k_folds = 5
    cv = StratifiedKFold(y, k_folds, random_state=1234)

    acc, auc = [], []
    for train, test in cv:
        xt, xv, yt, yv = X[train, :], X[test, :], y[train], y[test]
        clf.fit(xt, yt)
        yhat = clf.predict(xv)
        proba = clf.predict_proba(xv)[:, 1]
        acc.append(np.mean(yhat == yv))
        auc.append(roc_auc_score(yv, proba))

    acc_mean, acc_std = np.mean(acc), np.std(acc)
    auc_mean, auc_std = np.mean(auc), np.std(auc)
    # Single-argument print(...) emits the same text on Python 2 and 3; the
    # original print statements were a SyntaxError on Python 3.
    print(name)
    print('accuracy: {0:.3f} +/- {1:.3f}'.format(acc_mean, acc_std))
    print('auc: {0:.3f} +/- {1:.3f}'.format(auc_mean, auc_std))
    print('-' * 80)
    return {'name': name,
            'acc_mean': acc_mean,
            'acc_std': acc_std,
            'auc_mean': auc_mean,
            'auc_std': auc_std}

Source File: classif_and_ktst.py From jstsp2015 with MIT License

6 votes

def compute_svm_score_nestedCV(K, y, n_folds,
                               scoring=balanced_accuracy_scoring,
                               random_state=None,
                               param_grid=[{'C': np.logspace(-5, 5, 25)}]):
    """Return the mean nested-CV score of an SVM on a precomputed kernel.

    The outer stratified split provides the test folds. On each outer
    training part, a grid search over ``param_grid`` with its own stratified
    split is fitted; it is then scored on the kernel rows of the test
    samples against the training columns.
    """
    outer_cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                               random_state=random_state)
    scores = np.zeros(n_folds)
    for fold, (train, test) in enumerate(outer_cv):
        y_train = y[train]
        inner_cv = StratifiedKFold(y_train, n_folds=n_folds,
                                   shuffle=True,
                                   random_state=random_state)
        search = GridSearchCV(SVC(kernel='precomputed'),
                              param_grid=param_grid, scoring=scoring,
                              cv=inner_cv, n_jobs=1)
        # Train block: train rows x train columns of the kernel matrix.
        search.fit(K[train, :][:, train], y_train)
        # Test block: test rows x train columns.
        scores[fold] = search.score(K[test, :][:, train], y[test])

    return scores.mean()

Source File: embutils.py From DynamicTriad with Apache License 2.0

6 votes

def _validate_node_classify(self, samples, lbs):
        """Return the mean F1 of a 2-fold cross-validated node classifier.

        Column 0 of ``self.make_features(samples)`` is the feature matrix.
        One class-balanced ``LogisticRegression`` is refitted on every
        training fold and scored with ``f1_score`` on the held-out fold.
        """
        # note that the 1-st dimension of feat is for each node in each
        # sample (time, node1, node2, ...)
        feat = self.make_features(samples)[:, 0]
        assert len(feat) == len(lbs)

        clf = LogisticRegression(class_weight='balanced')
        try:
            # First try the constructor that takes the labels and ``n_folds``;
            # the resulting object is iterated directly.
            parts = StratifiedKFold(lbs, n_folds=2, shuffle=True)
        except TypeError:
            # Otherwise use the ``n_splits`` constructor and ask it to split.
            parts = StratifiedKFold(n_splits=2, shuffle=True).split(feat, lbs)

        val_score = []
        for tr, te in parts:
            predicted = clf.fit(feat[tr], lbs[tr]).predict(feat[te])
            val_score.append(f1_score(lbs[te], predicted))
        return np.mean(val_score)

Source File: class_w2v.py From 2016_CCFsougou2 with MIT License

6 votes

def validation(self, X, Y, kind):
        """Validate with 2-fold stratified cross-validation.

        For each fold, ``self.fit(X_train, y_train, X_test)`` is called and
        its return value is compared element-wise with the held-out labels.

        :param X: samples, indexed with the fold index arrays
        :param Y: labels, indexed the same way and used for stratification
        :param kind: not used by this method
        :return: mean of the per-fold accuracies
        """
        # Every print below passes one pre-formatted string, so the text is
        # identical under the Python 2 print statement and the Python 3
        # print function (the original statements were Python 2 only).
        print('validating...')
        fold_n = 2
        # NOTE(review): shuffle is not enabled, so random_state presumably has
        # no effect on the split -- confirm against the sklearn docs.
        folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print('%d -fold' % (j + 1))
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            res = self.fit(X_train, y_train, X_test)
            # Fraction of held-out samples whose prediction equals the label.
            cur = sum(y_test == res) * 1.0 / len(res)
            score[j] = cur
        print('%s %s' % (score, score.mean()))
        return score.mean()

Source File: naive_bayes.py From yenlp with GNU General Public License v3.0

6 votes

def naive_bayes(pos_samples, neg_samples, n_folds = 2):
    '''Train an NLTK naive Bayes classifier with stratified n-fold validation.

    The positive and negative samples are concatenated and split into
    shuffled stratified folds by label; one classifier is trained per fold.
    Returns a tuple of the accuracy averaged over ``n_folds``, followed by
    the classifier, training samples and test samples of the last fold.'''
    samples = np.array(pos_samples + neg_samples)
    labels = [label for (words, label) in samples]
    folds = cross_validation.StratifiedKFold(labels, n_folds= n_folds,
                                             shuffle=True)

    fold_accuracies = []
    for train_idx, test_idx in folds:
        train_samples = samples[train_idx]
        test_samples = samples[test_idx]
        classifier = nltk.NaiveBayesClassifier.train(train_samples)
        fold_accuracies.append(
            nltk.classify.util.accuracy(classifier, test_samples))
    accuracy = sum(fold_accuracies, 0.0) / n_folds
    return (accuracy, classifier, train_samples, test_samples)

Source File: class_w2v.py From 2016CCF_BDCI_Sougou with MIT License

6 votes

def validation(self, X, Y, kind):
        """Validate with 2-fold stratified cross-validation.

        For each fold, ``self.fit(X_train, y_train, X_test)`` is called and
        its return value is compared element-wise with the held-out labels.

        :param X: samples, indexed with the fold index arrays
        :param Y: labels, indexed the same way and used for stratification
        :param kind: not used by this method
        :return: mean of the per-fold accuracies
        """
        # Every print below passes one pre-formatted string, so the text is
        # identical under the Python 2 print statement and the Python 3
        # print function (the original statements were Python 2 only).
        print('validating...')
        fold_n = 2
        # NOTE(review): shuffle is not enabled, so random_state presumably has
        # no effect on the split -- confirm against the sklearn docs.
        folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print('%d -fold' % (j + 1))
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            res = self.fit(X_train, y_train, X_test)
            # Fraction of held-out samples whose prediction equals the label.
            cur = sum(y_test == res) * 1.0 / len(res)
            score[j] = cur
        print('%s %s' % (score, score.mean()))
        return score.mean()

Source File: base.py From stacking with MIT License

6 votes

def create_cv_id(target, n_folds_ = 5, cv_id_name=cv_id_name, seed=407):
    """Assign a fold id to every row of ``target`` and save the ids to disk.

    A shuffled stratified split on ``target['target']`` is tried first. If
    that fails, a plain shuffled ``KFold`` over ``len(target)`` rows is used
    instead. The id array is saved with ``np.save`` to
    ``INPUT_PATH + cv_id_name``.

    :param target: object providing ``target['target']`` and ``len()``
    :param n_folds_: number of folds
    :param cv_id_name: file name appended to ``INPUT_PATH``
    :param seed: random state passed to the splitter
    :return: None
    """
    try:
        a = StratifiedKFold(target['target'], n_folds=n_folds_, shuffle=True,
                            random_state=seed)
        cv_index = a.test_folds
    except Exception:
        # Best-effort fallback. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt and SystemExit propagate.
        cv_index = np.empty(len(target))
        a = KFold(len(target), n_folds=n_folds_, shuffle=True,
                  random_state=seed)
        for idx, (_, test_idx) in enumerate(a):
            cv_index[test_idx] = idx
        cv_index = cv_index.astype(int)
        print('Done Kfold')
    else:
        print('Done StratifiedKFold')

    np.save(INPUT_PATH + cv_id_name, cv_index)
    return

######### Utils #########

# Utility function that builds the data from a given list of features

Source File: utils.py From kaggle_otto with BSD 3-Clause "New" or "Revised" License

6 votes

def make_blender_cv(classifier, x, y, calibrate=False):
    """Collect out-of-fold probabilities and log-losses over 5 folds.

    Per fold, ``classifier`` is fitted on the training rows, optionally
    wrapped in an isotonic ``CalibratedClassifierCV``, and its
    ``predict_proba`` output on the test rows is scored with ``log_loss``.

    Returns ``(scores, predictions)``: the list of per-fold losses and the
    per-fold probability arrays stacked along axis 0 in fold order.
    """
    skf = StratifiedKFold(y, n_folds=5, random_state=23)
    scores = []
    predictions = None
    for train_index, test_index in skf:
        x_train, y_train = x[train_index, :], y[train_index]
        if calibrate:
            # Make training and calibration
            calibrated_classifier = CalibratedClassifierCV(
                classifier, method='isotonic', cv=get_cv(y_train))
            fitted_classifier = calibrated_classifier.fit(x_train, y_train)
        else:
            fitted_classifier = classifier.fit(x_train, y_train)
        preds = fitted_classifier.predict_proba(x[test_index, :])

        # Free memory
        calibrated_classifier = fitted_classifier = None
        gc.collect()

        scores.append(log_loss(y[test_index], preds))
        if predictions is None:
            predictions = preds
        else:
            predictions = np.append(predictions, preds, axis=0)
    return scores, predictions

Source File: blender.py From kaggle_otto with BSD 3-Clause "New" or "Revised" License

6 votes

def get_weights():
    """Fit blending weights for the stored validation predictions.

    The validation labels are reordered to follow the test folds of a 5-fold
    stratified split. One prediction array is read per prediction file, and
    ``fmin_cobyla`` minimizes ``error`` from an all-ones starting point under
    ``constraint``. The optimizer's result is returned.
    """
    # Read validation labels
    _, labels, _, _, _ = utils.load_data()
    skf = StratifiedKFold(labels, n_folds=5, random_state=23)
    test_index = None
    for _, test_idx in skf:
        if test_index is None:
            test_index = test_idx
        else:
            test_index = np.append(test_index, test_idx)
    val_labels = labels[test_index]

    # Read predictions on validation set
    prediction_files = utils.get_prediction_files()
    val_predictions = [
        np.genfromtxt(os.path.join(consts.BLEND_PATH, preds_file),
                      delimiter=',')
        for preds_file in prediction_files
    ]

    # Minimize blending function
    p0 = [1.] * len(prediction_files)
    return fmin_cobyla(error, p0, args=(val_predictions, val_labels),
                       cons=[constraint], rhoend=1e-5)

Source File: simulation.py From jstsp2015 with MIT License

6 votes

def compute_svm_score_nestedCV(K, y, n_folds, scoring='accuracy',
                               random_state=None,
                               param_grid=[{'C': np.logspace(-5, 5, 20)}]):
    """Return the mean nested-CV score of an SVM on a precomputed kernel.

    The outer stratified split provides the test folds. On each outer
    training part, a grid search over ``param_grid`` with its own stratified
    split is fitted; it is then scored on the kernel rows of the test
    samples against the training columns.
    """
    outer_cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                               random_state=random_state)
    scores = np.zeros(n_folds)
    for fold, (train, test) in enumerate(outer_cv):
        y_train = y[train]
        inner_cv = StratifiedKFold(y_train, n_folds=n_folds,
                                   shuffle=True,
                                   random_state=random_state)
        search = GridSearchCV(SVC(kernel='precomputed'),
                              param_grid=param_grid, scoring=scoring,
                              cv=inner_cv, n_jobs=1)
        # Train block: train columns, then train rows, of the kernel matrix.
        search.fit(K[:, train][train, :], y_train)
        # Test block: test rows x train columns.
        scores[fold] = search.score(K[test, :][:, train], y[test])

    return scores.mean()

Source File: stacker.py From brew with MIT License

6 votes

def fit_layer(self, layer_idx, X, y):
        """Fit layer ``layer_idx`` and, recursively, every layer after it.

        An index past the last layer is a no-op. The last layer is fitted
        directly on ``(X, y)``. Any earlier layer is first fitted per
        stratified fold to produce out-of-fold outputs, then refitted on
        all of ``(X, y)``; the out-of-fold outputs are the input of the
        next layer.
        """
        n_layers = len(self.layers)
        if layer_idx >= n_layers:
            return

        layer = self.layers[layer_idx]
        if layer_idx == n_layers - 1:
            layer.fit(X, y)
            return

        n_classes = len(set(y)) - 1
        n_classifiers = len(layer)
        output = np.zeros((X.shape[0], n_classes * n_classifiers))
        for tra, tst in cross_validation.StratifiedKFold(y, self.cv):
            layer.fit(X[tra], y[tra])
            out = layer.output(X[tst], mode=self.mode)
            # Drop index 0 along axis 1, then flatten the last two axes.
            flat_width = (out.shape[1] - 1) * out.shape[2]
            output[tst, :] = out[:, 1:, :].reshape(out.shape[0], flat_width)

        layer.fit(X, y)
        self.fit_layer(layer_idx + 1, output, y)

Source File: classify_nodes.py From PyTorch-Luna16 with Apache License 2.0

5 votes

def classifyData():
    """Compare 3-fold predictions of two classifiers with trivial baselines.

    Loads ``dataX.npy`` and ``dataY.npy``, then prints a classification
    report and a log-loss for, in order: out-of-fold random-forest
    predictions, an all-ones prediction, an all-zeros prediction, and
    out-of-fold XGBoost predictions.
    """
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")

    def report(pred):
        # Print the two metrics used for every prediction vector.
        print(classification_report(Y, pred,
                                    target_names=["No Cancer", "Cancer"]))
        print("logloss", logloss(Y, pred))

    def fold_predictions(make_clf):
        # Fit a fresh classifier per fold and collect its test predictions.
        pred = Y * 0
        for train, test in KFold(Y, n_folds=3):
            model = make_clf()
            model.fit(X[train, :], Y[train])
            pred[test] = model.predict(X[test, :])
        return pred

    report(fold_predictions(lambda: RF(n_estimators=100, n_jobs=3)))

    # All Cancer
    print("Predicting all positive")
    report(np.ones(Y.shape))

    # No Cancer
    print("Predicting all negative")
    report(Y * 0)

    # try XGBoost
    print("XGBoost")
    report(fold_predictions(
        lambda: xgb.XGBClassifier(objective="binary:logistic")))

Source File: classify.py From 2016_CCFsougou2 with MIT License

5 votes

def validation(self, X, Y, wv_X, kind):
        """
        2-fold validation

        For each fold, a binary TF-IDF vectorizer is fitted on the training
        texts only, both parts are transformed, and ``self.stacking`` produces
        the predictions that are compared with the held-out labels.

        :param X: train text
        :param Y: train label
        :param wv_X: train wv_vec
        :param kind: age/gender/education
        :return: mean score of 2-fold validation
        """
        # Every print below passes one pre-formatted string, so the text is
        # identical under the Python 2 print statement and the Python 3
        # print function (the original statements were Python 2 only).
        print('向量化中...')
        X = np.array(X)
        fold_n = 2
        folds = list(StratifiedKFold(Y, n_folds=fold_n, shuffle=False,
                                     random_state=0))
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print('%d -fold' % (j + 1))

            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            wv_X_train = wv_X[train_idx]
            wv_X_test = wv_X[test_idx]

            # The vectorizer is fitted on the training texts only and then
            # applied to both the training and the held-out texts.
            vec = TfidfVectorizer(use_idf=True, sublinear_tf=False,
                                  max_features=50000, binary=True)
            vec.fit(X_train, y_train)
            X_train = vec.transform(X_train)
            X_test = vec.transform(X_test)

            print('shape %s' % (X_train.shape,))

            ypre = self.stacking(X_train, y_train, X_test,
                                 wv_X_train, wv_X_test, kind)
            # Fraction of held-out samples whose prediction equals the label.
            cur = sum(y_test == ypre) * 1.0 / len(ypre)
            score[j] = cur

        print(score)
        print('%s %s' % (score.mean(), kind))
        return score.mean()

Source File: PipeTasks.py From ProFET with GNU General Public License v3.0

5 votes

def Get_yPred(X, y, clf_class, n_folds=10, pred_proba=False):
    '''
    Return "full" out-of-fold predictions from a given classifier (not just
    from one split). Adapted from ``run_cv`` in:
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html

    Could also be done with stratified shuffle split (+Append output) ?
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    :param X: samples, indexed with the fold index arrays
    :param y: labels; must provide ``copy()`` and fold indexing
    :param clf_class: classifier instance; the same object is refitted on
        every fold (despite the name it is not instantiated here)
    :param n_folds: number of shuffled stratified folds
    :param pred_proba: if true, collect ``predict_proba`` output instead of
        ``predict`` output
    :return: with ``pred_proba`` false, a copy of ``y`` overwritten with the
        predicted labels; with ``pred_proba`` true, a float array with one
        row per sample and one column per ``predict_proba`` column
    '''
    # Function-scope import so the block does not depend on a module alias.
    import numpy as np

    # Construct a kfolds object
    kf = StratifiedKFold(y, n_folds, shuffle=True)  # shuffle?

    # Label predictions reuse y's shape and dtype. Probabilities are 2-D
    # floats and cannot be written into a 1-D copy of y (the original raised
    # a broadcasting error), so that buffer is allocated on the first fold,
    # once the number of probability columns is known.
    y_pred = None if pred_proba else y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]

        clf = clf_class
        # sample_weight weighting not working here.. ?  TODO
        clf.fit(X_train, y_train)
        if pred_proba:
            proba = np.asarray(clf.predict_proba(X_test))
            if y_pred is None:
                y_pred = np.zeros((len(y), proba.shape[1]), dtype=float)
            y_pred[test_index] = proba
        else:
            y_pred[test_index] = clf.predict(X_test)
    return y_pred

Source File: PipeTasks.py From ProFET with GNU General Public License v3.0

5 votes

def plotRFECV (X,y,stepSize=0.05,scoring='f1'):
    '''
    Run recursive feature elimination with cross-validation and plot it.

    A linear SVC is wrapped in ``RFECV`` with a 2-fold stratified split and
    fitted on ``(X, y)``. The selected number of features is printed, the
    per-step grid scores are plotted, and the fitted ``RFECV`` is returned.
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import matplotlib.pyplot as plt

    # Create the RFE object and compute a cross-validated score.
    estimator = SVC(kernel="linear", class_weight='auto', cache_size=1400)
    folds = StratifiedKFold(y, 2)
    rfecv = RFECV(estimator=estimator, step=stepSize, cv=folds,
                  scoring=scoring)
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    step_scores = rfecv.grid_scores_
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(step_scores) + 1), step_scores)
    plt.show()

    return rfecv

Source File: classif_and_ktst.py From jstsp2015 with MIT License

5 votes

def compute_svm_cv(K, y, C=100.0, n_folds=5,
                   scoring=balanced_accuracy_scoring):
    """Return the mean cross-validated score of a precomputed-kernel SVM.

    The SVM uses the given ``C`` and ``class_weight='auto'``; the folds come
    from a stratified split on ``y``.
    """
    folds = StratifiedKFold(y, n_folds=n_folds)
    svm = SVC(C=C, kernel='precomputed', class_weight='auto')
    return cross_val_score(svm, K, y, scoring=scoring, cv=folds).mean()

Source File: simulation.py From jstsp2015 with MIT License

5 votes

def compute_svm_score(K, y, n_folds, scoring='accuracy', random_state=0):
    """Return the mean cross-validated score of a precomputed-kernel SVM.

    The SVM uses ``C=1.0``; the folds come from a shuffled stratified split
    on ``y`` seeded with ``random_state``.
    """
    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                            random_state=random_state)
    svm = SVC(C=1.0, kernel='precomputed')
    fold_scores = cross_val_score(svm, K, y, scoring=scoring, cv=folds,
                                  n_jobs=1)
    return fold_scores.mean()

Source File: test_split.py From twitter-stock-recommendation with MIT License

5 votes

def test_cv_iterable_wrapper():
    """Check ``_CVIterableWrapper`` and ``check_cv`` on iterable CV inputs.

    Covers a legacy ``StratifiedKFold`` object wrapped directly, and the
    generators returned by ``KFold.split`` (unshuffled and shuffled) wrapped
    through ``check_cv``.

    NOTE(review): ``X`` and ``y`` are not defined inside this function; they
    are presumably module-level fixtures -- confirm.
    """
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    # Import the legacy class with warnings recorded instead of emitted.
    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # The same must hold when the wrapped splits come from a shuffled KFold:
    # two calls to split on the same wrapper are asserted to be equal.
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # np.testing.assert_equal compares the nested lists of index arrays
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    # The unshuffled and the shuffled wrappers must NOT agree. The expected
    # AssertionError is turned into a flag so the mismatch can be asserted.
    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert_false(splits_are_equal, "If the splits are randomized, "
                 "successive calls to split should yield different results")

Python sklearn.cross_validation.StratifiedKFold() Examples