Python sklearn.cross_validation.StratifiedKFold() Examples

The following are 30 code examples of sklearn.cross_validation.StratifiedKFold(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cross_validation, or try the search function.
Example #1
Source File: embutils.py    From DynamicTriad with Apache License 2.0 6 votes vote down vote up
def _validate_link_reconstruction(self, samples, lbs):
        """Return the mean F1 of a 2-fold stratified CV on link features.

        The features from ``self.make_features(samples)`` are reduced to the
        element-wise absolute difference between index 0 and index 1 of their
        second axis. A logistic regression is fitted on each training split
        and scored with ``f1_score`` on the matching test split.
        """
        pair_feats = self.make_features(samples)
        edge_feats = np.abs(pair_feats[:, 0] - pair_feats[:, 1])

        classifier = LogisticRegression()
        try:
            # Labels-first constructor; the object itself is iterated.
            folds = StratifiedKFold(lbs, n_folds=2, shuffle=True)
        except TypeError:
            # The call above was rejected (presumably a newer scikit-learn
            # API): build the splitter and request the splits explicitly.
            splitter = StratifiedKFold(n_splits=2, shuffle=True)
            folds = splitter.split(edge_feats, lbs)

        fold_scores = []
        for train_idx, test_idx in folds:
            fitted = classifier.fit(edge_feats[train_idx], lbs[train_idx])
            predicted = fitted.predict(edge_feats[test_idx])
            fold_scores.append(f1_score(lbs[test_idx], predicted))
        return np.mean(fold_scores)
Example #2
Source File: test_grid_search.py    From diogenes with MIT License 6 votes vote down vote up
def test_slice_on_dimension(self):
        """Slice an experiment by classifier and by subset parameters.

        The experiment is built on the iris data but not run; the string form
        of every trial in each slice is handed to ``__compare_to_ref_pkl``.
        """
        iris = datasets.load_iris()
        target = iris.target
        features = iris.data
        forest_grid = {'clf': RandomForestClassifier,
                       'n_estimators': [10, 100],
                       'max_depth': [1, 10],
                       'random_state': [0]}
        svc_grid = {'clf': SVC,
                    'kernel': ['linear', 'rbf'],
                    'random_state': [0]}
        subset_grid = {'subset': per.SubsetRandomRowsActualDistribution,
                       'subset_size': [20, 40, 60, 80, 100],
                       'random_state': [0]}
        cv_grid = {'cv': StratifiedKFold}
        exp = per.Experiment(features, target,
                             [forest_grid, svc_grid],
                             [subset_grid],
                             [cv_grid])

        # Slice 1: keep only the random-forest trials.
        by_clf = exp.slice_on_dimension(per.CLF, RandomForestClassifier)
        clf_trials = [str(trial) for trial in by_clf.trials]
        self.__compare_to_ref_pkl(clf_trials, 'slice_on_dimension_clf')

        # Slice 2: keep only the trials whose subset size is 60.
        by_subset = exp.slice_on_dimension(per.SUBSET_PARAMS,
                                           {'subset_size': 60})
        subset_trials = [str(trial) for trial in by_subset.trials]
        self.__compare_to_ref_pkl(subset_trials,
                                  'slice_on_dimension_subset_params')
Example #3
Source File: test_grid_search.py    From diogenes with MIT License 6 votes vote down vote up
def test_slice_by_best_score(self):
        """Run a small experiment and check ``slice_by_best_score``.

        Maps the string form of every trial in the best-score slice (taken
        over ``per.CLF_PARAMS``) to its average score and hands that mapping
        to ``__compare_to_ref_pkl``.
        """
        iris = datasets.load_iris()
        target = iris.target
        features = iris.data
        classifier_grids = [
            {'clf': RandomForestClassifier,
             'n_estimators': [10, 100],
             'max_depth': [1, 10],
             'random_state': [0]},
            {'clf': SVC,
             'kernel': ['linear', 'rbf'],
             'random_state': [0]},
        ]
        subset_grids = [
            {'subset': per.SubsetRandomRowsActualDistribution,
             'subset_size': [20, 40],
             'random_state': [0]},
        ]
        cv_grids = [{'cv': StratifiedKFold}]

        exp = per.Experiment(features, target, classifier_grids,
                             subset_grids, cv_grids)
        exp.run()

        best_slice = exp.slice_by_best_score(per.CLF_PARAMS)
        result = {str(trial): trial.average_score()
                  for trial in best_slice.trials}
        self.__compare_to_ref_pkl(result, 'slice_by_best_score')
Example #4
Source File: test_grid_search.py    From diogenes with MIT License 6 votes vote down vote up
def test_make_csv(self):
        """Build an experiment on generated data and call ``make_csv``.

        Nothing is asserted on the value returned by ``make_csv``; the test
        only exercises the call.
        """
        M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)
        forest_grid = {'clf': RandomForestClassifier,
                       'n_estimators': [10, 100],
                       'max_depth': [5, 25],
                       'random_state': [0]}
        svc_grid = {'clf': SVC,
                    'kernel': ['linear', 'rbf'],
                    'probability': [True],
                    'random_state': [0]}
        sweep_grid = {'subset': per.SubsetSweepNumRows,
                      'num_rows': [[100, 200]],
                      'random_state': [0]}
        cv_grid = {'cv': StratifiedKFold,
                   'n_folds': [2, 3]}
        exp = per.Experiment(M, y,
                             clfs=[forest_grid, svc_grid],
                             subsets=[sweep_grid],
                             cvs=[cv_grid])
        csv_path = exp.make_csv()
Example #5
Source File: test_grid_search.py    From diogenes with MIT License 6 votes vote down vote up
def test_report_complex(self):
        """Generate a per-classifier report and attach it to ``self.report``.

        The report object returned by ``make_report`` is added as a
        sub-report under the heading ``test_report_complex``.
        """
        M, y = uft.generate_test_matrix(100, 5, 2)
        forest_grid = {'clf': RandomForestClassifier,
                       'n_estimators': [10, 100],
                       'max_depth': [1, 10],
                       'random_state': [0]}
        svc_grid = {'clf': SVC,
                    'kernel': ['linear', 'rbf'],
                    'probability': [True],
                    'random_state': [0]}
        subset_grid = {'subset': per.SubsetRandomRowsActualDistribution,
                       'subset_size': [20, 40, 60, 80, 100],
                       'random_state': [0]}
        cv_grid = {'cv': StratifiedKFold}
        exp = per.Experiment(M, y,
                             [forest_grid, svc_grid],
                             [subset_grid],
                             [cv_grid])

        # Only the second element of the returned pair is used.
        _ignored, sub_report = exp.make_report(dimension=per.CLF,
                                               return_report_object=True,
                                               verbose=False)
        self.report.add_heading('test_report_complex', 1)
        self.report.add_subreport(sub_report)
Example #6
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_stratified_kfold_no_shuffle():
    # Hand-checked toy cases: without shuffling, StratifiedKFold should keep
    # the original sample order as far as possible, so that dependencies
    # between neighbouring samples are not hidden.
    #
    # Each case is (labels, [(expected_train, expected_test), ...]) with the
    # folds listed in the order the iterator must produce them.
    cases = [
        ([1, 1, 0, 0],
         [([1, 3], [0, 2]),
          ([0, 2], [1, 3])]),
        ([1, 1, 1, 0, 0, 0, 0],
         [([2, 5, 6], [0, 1, 3, 4]),
          ([0, 1, 3, 4], [2, 5, 6])]),
    ]
    for labels, expected_folds in cases:
        splits = iter(cval.StratifiedKFold(labels, 2))
        for expected_train, expected_test in expected_folds:
            train, test = next(splits)
            assert_array_equal(test, expected_test)
            assert_array_equal(train, expected_train)
Example #7
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_cross_val_generator_with_indices():
    """Check that each CV iterator yields index arrays usable for indexing.

    For every train/test pair produced by the iterators built below, neither
    half may have a boolean dtype, and both halves must be accepted as
    indices into ``X`` and ``y``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # Check BOTH halves of the split: dtype kind 'b' would mean a
            # boolean mask was returned instead of integer indices.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must not raise for either half.
            X[train], X[test]
            y[train], y[test]
Example #8
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_cross_val_generator_with_default_indices():
    """Check that each CV iterator yields index arrays by default.

    For every train/test pair produced by the iterators built below, neither
    half may have a boolean dtype, and both halves must be accepted as
    indices into ``X`` and ``y``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # Check BOTH halves of the split: dtype kind 'b' would mean a
            # boolean mask was returned instead of integer indices.
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must not raise for either half.
            X[train], X[test]
            y[train], y[test]
Example #9
Source File: class_w2v.py    From 2016CCF-sougou with Apache License 2.0 6 votes vote down vote up
def validation(self,X,Y,kind):
        """

        使用2-fold进行验证
        """
        print 'validating...'
        fold_n=2
        folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
        score=np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print j + 1, '-fold'
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            res = self.fit(X_train, y_train, X_test)
            cur = sum(y_test == res) * 1.0 / len(res)
            score[j] = cur
        print score, score.mean()
        return score.mean() 
Example #10
Source File: test_grid_search.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_grid_search_score_consistency():
    # For each scorer, recompute every per-fold score by hand and check that
    # it matches what the grid search recorded.
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_folds=3, y=y)
        for C, grid_entry in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            # index 2 of a grid-score entry holds the separate fold scores
            fold_scores = grid_entry[2]
            for fold_no, (train, test) in enumerate(cv):
                clf.fit(X[train], y[train])
                if score == "f1":
                    predicted = clf.predict(X[test])
                    correct_score = f1_score(y[test], predicted)
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, fold_scores[fold_no])
Example #11
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_stratified_kfold_ratios():
    # Every train and test split must keep the overall label proportions,
    # both with and without shuffling.
    n_samples = 1000
    # (label, share of the data set), in the order the labels are laid out
    label_shares = ((4, 0.10), (0, 0.89), (1, 0.01))
    labels = np.array([label
                       for label, share in label_shares
                       for _ in range(int(share * n_samples))])
    for shuffle in [False, True]:
        for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
            for part in (train, test):
                for label, share in label_shares:
                    assert_almost_equal(
                        np.sum(labels[part] == label) / len(part), share, 2)
Example #12
Source File: stacker.py    From brew with MIT License 6 votes vote down vote up
def fit_layer(self, layer_idx, X, y):
        """Fit layer ``layer_idx`` and, recursively, every layer after it.

        The last layer is fitted directly on ``(X, y)``. Any earlier layer is
        first fitted fold by fold (stratified on ``y`` with ``self.cv``) to
        fill an output matrix for the held-out rows, then refitted on all of
        ``(X, y)``; the output matrix becomes the input of the next layer.
        Does nothing when ``layer_idx`` is past the end of ``self.layers``.
        """
        if layer_idx >= len(self.layers):
            return

        if layer_idx == len(self.layers) - 1:
            self.layers[layer_idx].fit(X, y)
            return

        # One column block per classifier; the slice ``1:`` below drops the
        # first entry of the class axis, hence one column fewer per block.
        cols_per_clf = len(set(y)) - 1
        n_classifiers = len(self.layers[layer_idx])
        stacked = np.zeros((X.shape[0], cols_per_clf * n_classifiers))

        for train_rows, test_rows in cross_validation.StratifiedKFold(y, self.cv):
            self.layers[layer_idx].fit(X[train_rows], y[train_rows])
            out = self.layers[layer_idx].output(X[test_rows], mode=self.mode)
            trimmed = out[:, 1:, :]
            stacked[test_rows, :] = trimmed.reshape(
                out.shape[0], (out.shape[1] - 1) * out.shape[2])

        self.layers[layer_idx].fit(X, y)
        self.fit_layer(layer_idx + 1, stacked, y)
Example #13
Source File: simple_benchmark.py    From RotationForest with MIT License 6 votes vote down vote up
def test_toy_data(name, clf):
    """Benchmark ``clf`` with 5-fold stratified CV on ``classification_data()``.

    Prints the mean and standard deviation of accuracy and AUC (AUC uses
    column 1 of ``predict_proba``) and returns them, with ``name``, in a dict.
    """
    X, y = classification_data()
    folds = StratifiedKFold(y, 5, random_state=1234)

    fold_acc = []
    fold_auc = []
    for train, test in folds:
        x_fit, x_eval = X[train, :], X[test, :]
        y_fit, y_eval = y[train], y[test]
        clf.fit(x_fit, y_fit)
        predicted = clf.predict(x_eval)
        positive_proba = clf.predict_proba(x_eval)[:, 1]
        fold_acc.append(np.mean(predicted == y_eval))
        fold_auc.append(roc_auc_score(y_eval, positive_proba))

    summary = {'name': name,
               'acc_mean': np.mean(fold_acc),
               'acc_std': np.std(fold_acc),
               'auc_mean': np.mean(fold_auc),
               'auc_std': np.std(fold_auc)}
    print(name)
    print('accuracy: {0:.3f} +/- {1:.3f}'.format(summary['acc_mean'],
                                                  summary['acc_std']))
    print('auc: {0:.3f} +/- {1:.3f}'.format(summary['auc_mean'],
                                             summary['auc_std']))
    print('-'*80)
    return summary
Example #14
Source File: classif_and_ktst.py    From jstsp2015 with MIT License 6 votes vote down vote up
def compute_svm_score_nestedCV(K, y, n_folds,
                               scoring=balanced_accuracy_scoring,
                               random_state=None,
                               param_grid=[{'C': np.logspace(-5, 5, 25)}]):
    """Return the mean nested-CV score of an SVM on a precomputed kernel.

    The outer loop is a shuffled stratified ``n_folds`` split of ``y``. For
    each outer training set a grid search over ``param_grid`` is run with its
    own shuffled stratified split, and the fitted search is scored on the
    test-rows-by-train-columns block of ``K``.
    """
    outer_cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                               random_state=random_state)
    outer_scores = np.zeros(n_folds)
    for fold_no, (train, test) in enumerate(outer_cv):
        svm = SVC(kernel='precomputed')
        y_train = y[train]
        inner_cv = StratifiedKFold(y_train, n_folds=n_folds,
                                   shuffle=True,
                                   random_state=random_state)
        search = GridSearchCV(svm, param_grid=param_grid, scoring=scoring,
                              cv=inner_cv, n_jobs=1)
        # kernel between training samples only
        k_train = K[train, :][:, train]
        search.fit(k_train, y_train)
        # kernel between test samples (rows) and training samples (columns)
        k_test = K[test, :][:, train]
        outer_scores[fold_no] = search.score(k_test, y[test])

    return outer_scores.mean()
Example #15
Source File: embutils.py    From DynamicTriad with Apache License 2.0 6 votes vote down vote up
def _validate_node_classify(self, samples, lbs):
        """Return the mean F1 of a 2-fold stratified CV on node features.

        Uses index 0 along the second axis of ``self.make_features(samples)``
        as the per-sample feature and a class-balanced logistic regression as
        the classifier.
        """
        # the second axis of the features covers the entries of each sample
        # (time, node1, node2, ...); only the first one is used here
        node_feats = self.make_features(samples)[:, 0]
        assert len(node_feats) == len(lbs)

        classifier = LogisticRegression(class_weight='balanced')
        try:
            # Labels-first constructor; the object itself is iterated.
            folds = StratifiedKFold(lbs, n_folds=2, shuffle=True)
        except TypeError:
            # The call above was rejected (presumably a newer scikit-learn
            # API): build the splitter and request the splits explicitly.
            splitter = StratifiedKFold(n_splits=2, shuffle=True)
            folds = splitter.split(node_feats, lbs)

        fold_scores = []
        for train_idx, test_idx in folds:
            fitted = classifier.fit(node_feats[train_idx], lbs[train_idx])
            predicted = fitted.predict(node_feats[test_idx])
            fold_scores.append(f1_score(lbs[test_idx], predicted))
        return np.mean(fold_scores)
Example #16
Source File: class_w2v.py    From 2016_CCFsougou2 with MIT License 6 votes vote down vote up
def validation(self,X,Y,kind):
        """

        使用2-fold进行验证
        """
        print 'validating...'
        fold_n=2
        folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
        score=np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print j + 1, '-fold'
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            res = self.fit(X_train, y_train, X_test)
            cur = sum(y_test == res) * 1.0 / len(res)
            score[j] = cur
        print score, score.mean()
        return score.mean() 
Example #17
Source File: naive_bayes.py    From yenlp with GNU General Public License v3.0 6 votes vote down vote up
def naive_bayes(pos_samples, neg_samples, n_folds = 2):
    '''Train an NLTK naive Bayes classifier with stratified n-fold validation.

    The positive and negative samples are concatenated, split into
    ``n_folds`` shuffled stratified folds on the second element of each
    sample, and one classifier is trained per fold.

    Returns a tuple ``(accuracy, classifier, train_samples, test_samples)``:
    the summed fold accuracy divided by ``n_folds``, followed by the
    classifier and the train/test sets of the last fold.'''
    all_samples = np.array(pos_samples + neg_samples)
    sample_labels = [lbl for (_words, lbl) in all_samples]
    folds = cross_validation.StratifiedKFold(sample_labels, n_folds= n_folds,
                                             shuffle=True)

    accuracy_sum = 0.0
    for train_idx, test_idx in folds:
        train_samples = all_samples[train_idx]
        test_samples = all_samples[test_idx]
        classifier = nltk.NaiveBayesClassifier.train(train_samples)
        fold_accuracy = nltk.classify.util.accuracy(classifier, test_samples)
        accuracy_sum += fold_accuracy
    accuracy_sum /= n_folds
    return (accuracy_sum, classifier, train_samples, test_samples)
Example #18
Source File: class_w2v.py    From 2016CCF_BDCI_Sougou with MIT License 6 votes vote down vote up
def validation(self,X,Y,kind):
        """

        使用2-fold进行验证
        """
        print 'validating...'
        fold_n=2
        folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
        score=np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print j + 1, '-fold'
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            res = self.fit(X_train, y_train, X_test)
            cur = sum(y_test == res) * 1.0 / len(res)
            score[j] = cur
        print score, score.mean()
        return score.mean() 
Example #19
Source File: base.py    From stacking with MIT License 6 votes vote down vote up
def create_cv_id(target, n_folds_ = 5, cv_id_name=cv_id_name, seed=407):
    """Assign every row of ``target`` to a CV fold and save the fold ids.

    Shuffled stratified folds on ``target['target']`` are tried first. If
    that raises an ordinary exception, shuffled plain K-fold over
    ``len(target)`` rows is used instead. The fold-id array is written with
    ``np.save`` to ``INPUT_PATH + cv_id_name``.

    Args:
        target: object indexable with ``'target'`` and supporting ``len``.
        n_folds_: number of folds.
        cv_id_name: file name appended to ``INPUT_PATH``.
        seed: ``random_state`` passed to the fold generators.

    Returns:
        None.
    """
    try:
        skf = StratifiedKFold(target['target'], n_folds=n_folds_,
                              shuffle=True, random_state=seed)
        cv_index = skf.test_folds
        print('Done StratifiedKFold')
    except Exception:
        # Keep the best-effort fallback, but catch ``Exception`` rather than
        # everything so KeyboardInterrupt and SystemExit still propagate.
        cv_index = np.empty(len(target))
        kfold = KFold(len(target), n_folds=n_folds_, shuffle=True,
                      random_state=seed)
        for fold_no, (_train_idx, test_idx) in enumerate(kfold):
            # rows in this fold's test split get the fold number as their id
            cv_index[test_idx] = fold_no
        cv_index = cv_index.astype(int)
        print('Done Kfold')

    np.save(INPUT_PATH + cv_id_name, cv_index)
    return

######### Utils #########

# Utility function that builds the data set from a given feature list
Example #20
Source File: utils.py    From kaggle_otto with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def make_blender_cv(classifier, x, y, calibrate=False):
    """Collect out-of-fold probabilities and log-losses over 5 folds.

    For each stratified fold the classifier is fitted on the training rows
    (wrapped in an isotonic ``CalibratedClassifierCV`` when ``calibrate`` is
    true) and ``predict_proba`` is called on the test rows.

    Returns ``(scores, predictions)``: the per-fold log-losses and the fold
    predictions concatenated along axis 0 in fold order.
    """
    fold_losses = []
    stacked_preds = None
    for train_index, test_index in StratifiedKFold(y, n_folds=5, random_state=23):
        if not calibrate:
            fitted_classifier = classifier.fit(x[train_index, :], y[train_index])
        else:
            # Make training and calibration
            calibrated_classifier = CalibratedClassifierCV(
                classifier, method='isotonic', cv=get_cv(y[train_index]))
            fitted_classifier = calibrated_classifier.fit(x[train_index, :],
                                                          y[train_index])
        fold_preds = fitted_classifier.predict_proba(x[test_index, :])

        # Drop the model references before collecting to free memory
        calibrated_classifier = None
        fitted_classifier = None
        gc.collect()

        fold_losses.append(log_loss(y[test_index], fold_preds))
        if stacked_preds is None:
            stacked_preds = fold_preds
        else:
            stacked_preds = np.append(stacked_preds, fold_preds, axis=0)
    return fold_losses, stacked_preds
Example #21
Source File: blender.py    From kaggle_otto with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_weights():
    """Fit blending weights for the saved validation predictions.

    The validation labels are the loaded labels reordered by the
    concatenated test indices of a 5-fold stratified split. One prediction
    matrix is read per file from ``consts.BLEND_PATH`` and ``fmin_cobyla``
    minimises ``error`` over the weights, starting from all ones, subject to
    ``constraint``.
    """
    # Read validation labels
    _, labels, _, _, _ = utils.load_data()
    folds = StratifiedKFold(labels, n_folds=5, random_state=23)
    held_out = None
    for _train_idx, fold_test in folds:
        if held_out is None:
            held_out = fold_test
        else:
            held_out = np.append(held_out, fold_test)
    val_labels = labels[held_out]

    # Read predictions on validation set
    prediction_files = utils.get_prediction_files()
    val_predictions = [
        np.genfromtxt(os.path.join(consts.BLEND_PATH, preds_file),
                      delimiter=',')
        for preds_file in prediction_files
    ]

    # Minimize blending function, one starting weight of 1.0 per file
    start = [1.] * len(prediction_files)
    return fmin_cobyla(error, start, args=(val_predictions, val_labels),
                       cons=[constraint], rhoend=1e-5)
Example #22
Source File: simulation.py    From jstsp2015 with MIT License 6 votes vote down vote up
def compute_svm_score_nestedCV(K, y, n_folds, scoring='accuracy',
                               random_state=None,
                               param_grid=[{'C': np.logspace(-5, 5, 20)}]):
    """Return the mean nested-CV score of an SVM on a precomputed kernel.

    The outer loop is a shuffled stratified ``n_folds`` split of ``y``. For
    each outer training set a grid search over ``param_grid`` is run with its
    own shuffled stratified split, and the fitted search is scored on the
    test-rows-by-train-columns block of ``K``.
    """
    outer_cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                               random_state=random_state)
    outer_scores = np.zeros(n_folds)
    for fold_no, (train, test) in enumerate(outer_cv):
        svm = SVC(kernel='precomputed')
        y_train = y[train]
        inner_cv = StratifiedKFold(y_train, n_folds=n_folds,
                                   shuffle=True,
                                   random_state=random_state)
        search = GridSearchCV(svm, param_grid=param_grid, scoring=scoring,
                              cv=inner_cv, n_jobs=1)
        # kernel between training samples only
        k_train = K[:, train][train, :]
        search.fit(k_train, y_train)
        # kernel between test samples (rows) and training samples (columns)
        k_test = K[test, :][:, train]
        outer_scores[fold_no] = search.score(k_test, y[test])

    return outer_scores.mean()
Example #23
Source File: stacker.py    From brew with MIT License 6 votes vote down vote up
def fit_layer(self, layer_idx, X, y):
        """Fit layer ``layer_idx`` and, recursively, every layer after it.

        The last layer is fitted directly on ``(X, y)``. Any earlier layer is
        first fitted fold by fold (stratified on ``y`` with ``self.cv``) to
        fill an output matrix for the held-out rows, then refitted on all of
        ``(X, y)``; the output matrix becomes the input of the next layer.
        Does nothing when ``layer_idx`` is past the end of ``self.layers``.
        """
        if layer_idx >= len(self.layers):
            return

        if layer_idx == len(self.layers) - 1:
            self.layers[layer_idx].fit(X, y)
            return

        # One column block per classifier; the slice ``1:`` below drops the
        # first entry of the class axis, hence one column fewer per block.
        cols_per_clf = len(set(y)) - 1
        n_classifiers = len(self.layers[layer_idx])
        stacked = np.zeros((X.shape[0], cols_per_clf * n_classifiers))

        for train_rows, test_rows in cross_validation.StratifiedKFold(y, self.cv):
            self.layers[layer_idx].fit(X[train_rows], y[train_rows])
            out = self.layers[layer_idx].output(X[test_rows], mode=self.mode)
            trimmed = out[:, 1:, :]
            stacked[test_rows, :] = trimmed.reshape(
                out.shape[0], (out.shape[1] - 1) * out.shape[2])

        self.layers[layer_idx].fit(X, y)
        self.fit_layer(layer_idx + 1, stacked, y)
Example #24
Source File: classify_nodes.py    From PyTorch-Luna16 with Apache License 2.0 5 votes vote down vote up
def classifyData():
    """Compare a random forest, two constant baselines and XGBoost.

    Loads ``dataX.npy`` / ``dataY.npy``, produces 3-fold out-of-fold
    predictions for each model, and prints a classification report and the
    log-loss for each set of predictions.
    """
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")

    def out_of_fold_predictions(make_clf):
        # Fit a fresh classifier per fold and fill in the held-out rows.
        folds = KFold(Y, n_folds=3)
        predicted = Y * 0
        for train, test in folds:
            fold_X_train, fold_X_test = X[train, :], X[test, :]
            fold_y_train = Y[train]
            model = make_clf()
            model.fit(fold_X_train, fold_y_train)
            predicted[test] = model.predict(fold_X_test)
        return predicted

    def report(predicted):
        # Print the per-class report followed by the log-loss.
        print(classification_report(Y, predicted,
                                    target_names=["No Cancer", "Cancer"]))
        print("logloss", logloss(Y, predicted))

    # Random forest
    report(out_of_fold_predictions(lambda: RF(n_estimators=100, n_jobs=3)))

    # All Cancer
    print("Predicting all positive")
    report(np.ones(Y.shape))

    # No Cancer
    print("Predicting all negative")
    report(Y * 0)

    # try XGBoost
    print("XGBoost")
    report(out_of_fold_predictions(
        lambda: xgb.XGBClassifier(objective="binary:logistic")))
Example #25
Source File: classify.py    From 2016_CCFsougou2 with MIT License 5 votes vote down vote up
def validation(self, X, Y, wv_X, kind):
        """
        2-fold validation
        :param X: train text
        :param Y: train label
        :param wv_X: train wv_vec
        :param kind: age/gender/education
        :return: mean score of 2-fold validation
        """
        print '向量化中...'
        X=np.array(X)
        fold_n=2
        folds = list(StratifiedKFold(Y, n_folds=fold_n, shuffle=False,random_state=0))
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print j+1,'-fold'

            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            wv_X_train =wv_X[train_idx]
            wv_X_test = wv_X[test_idx]

            vec = TfidfVectorizer(use_idf=True,sublinear_tf=False, max_features=50000, binary=True)
            vec.fit(X_train, y_train)
            X_train = vec.transform(X_train)
            X_test = vec.transform(X_test)

            print 'shape',X_train.shape

            ypre = self.stacking(X_train,y_train,X_test,wv_X_train,wv_X_test,kind)
            cur = sum(y_test == ypre) * 1.0 / len(ypre)
            score[j] = cur

        print score
        print score.mean(),kind
        return score.mean() 
Example #26
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0 5 votes vote down vote up
def Get_yPred(X, y, clf_class, n_folds=10, pred_proba=False):
    '''
    Return out-of-fold predictions for every sample, not just one split.

    ``y`` is split into ``n_folds`` shuffled stratified folds. ``clf_class``
    is fitted on each training split (the same object is reused for every
    fold) and its ``predict`` output, or ``predict_proba`` output when
    ``pred_proba`` equals True, is written into a copy of ``y`` at the test
    positions.

    Adapted from the ``run_cv`` helper described at
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html

    A stratified shuffle split with appended output may be an alternative:
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    '''
    folds = StratifiedKFold(y, n_folds, shuffle=True)
    predictions = y.copy()

    for train_index, test_index in folds:
        fold_X_train, fold_X_test = X[train_index], X[test_index]
        fold_y_train = y[train_index]

        # TODO: sample_weight weighting is not applied here
        clf_class.fit(fold_X_train, fold_y_train)
        if pred_proba == True:
            fold_output = clf_class.predict_proba(fold_X_test)
        else:
            fold_output = clf_class.predict(fold_X_test)
        predictions[test_index] = fold_output
    return predictions
Example #27
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0 5 votes vote down vote up
def plotRFECV(X, y, stepSize=0.05, scoring='f1'):
    '''
    Run cross-validated recursive feature elimination and plot its scores.

    A linear SVC is used as the estimator and a 2-fold stratified split of
    ``y`` as the cross-validation. The number of selected features is
    printed, the score curve is shown with matplotlib, and the fitted
    ``RFECV`` object is returned.

    Based on:
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Estimator and cross-validation scheme for the elimination.
    estimator = SVC(kernel="linear", class_weight='auto', cache_size=1400)
    folds = StratifiedKFold(y, 2)
    rfecv = RFECV(estimator=estimator, step=stepSize, cv=folds,
                  scoring=scoring)
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    x_positions = range(1, len(rfecv.grid_scores_) + 1)
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(x_positions, rfecv.grid_scores_)
    plt.show()

    return rfecv
Example #28
Source File: classif_and_ktst.py    From jstsp2015 with MIT License 5 votes vote down vote up
def compute_svm_cv(K, y, C=100.0, n_folds=5,
                   scoring=balanced_accuracy_scoring):
    """Return the mean CV score of an SVM on the precomputed kernel ``K``.

    Uses an unshuffled stratified ``n_folds`` split of ``y`` and an SVC with
    regularisation ``C`` and ``class_weight='auto'``.
    """
    folds = StratifiedKFold(y, n_folds=n_folds)
    svm = SVC(C=C, kernel='precomputed', class_weight='auto')
    fold_scores = cross_val_score(svm, K, y, scoring=scoring, cv=folds)
    return fold_scores.mean()
Example #29
Source File: simulation.py    From jstsp2015 with MIT License 5 votes vote down vote up
def compute_svm_score(K, y, n_folds, scoring='accuracy', random_state=0):
    """Return the mean CV score of a C=1.0 SVM on the precomputed kernel ``K``.

    Uses a shuffled stratified ``n_folds`` split of ``y`` seeded with
    ``random_state``.
    """
    folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                            random_state=random_state)
    svm = SVC(C=1.0, kernel='precomputed')
    fold_scores = cross_val_score(svm, K, y, scoring=scoring, cv=folds,
                                  n_jobs=1)
    return fold_scores.mean()
Example #30
Source File: test_split.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    # Import the legacy class with warnings recorded rather than shown.
    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    legacy_cv = OldSKF(y_multiclass, n_folds=3)
    legacy_wrapped = _CVIterableWrapper(legacy_cv)

    # The wrapper must reproduce the legacy splits ...
    np.testing.assert_equal(list(legacy_cv), list(legacy_wrapped.split()))

    # ... and report the same number of them.
    assert_equal(len(legacy_cv), legacy_wrapped.get_n_splits())

    plain_wrapped = check_cv(KFold(n_splits=5).split(X, y))
    # The wrapped iterable is enlisted and stored, so split can be called
    # any number of times and gives consistent results.
    first_plain = list(plain_wrapped.split(X, y))
    second_plain = list(plain_wrapped.split(X, y))
    np.testing.assert_equal(first_plain, second_plain)

    # The same repeatability is asserted for splits taken from a shuffled
    # KFold once they have been wrapped.
    shuffled_wrapped = check_cv(KFold(n_splits=5, shuffle=True).split(X, y))
    # np.testing.assert_equal compares the nested lists element by element
    first_shuffled = list(shuffled_wrapped.split(X, y))
    second_shuffled = list(shuffled_wrapped.split(X, y))
    np.testing.assert_equal(first_shuffled, second_shuffled)

    # The shuffled splits must differ from the unshuffled ones.
    try:
        np.testing.assert_equal(list(plain_wrapped.split(X, y)),
                                list(shuffled_wrapped.split(X, y)))
    except AssertionError:
        splits_are_equal = False
    else:
        splits_are_equal = True
    failure_message = ("If the splits are randomized, "
                       "successive calls to split should yield different results")
    assert_false(splits_are_equal, failure_message)