Python sklearn.cross_validation.StratifiedKFold() Examples
The following are 29 code examples of sklearn.cross_validation.StratifiedKFold(), collected from open-source projects; the source project and file are noted above each example. Note that the sklearn.cross_validation module was deprecated in scikit-learn 0.18 and removed in 0.20: its replacement is sklearn.model_selection, where StratifiedKFold takes n_splits in its constructor and receives the data through its split() method rather than taking the labels up front. Several of the examples below (e.g. Examples #1 and #15) use a try/except shim to run under either API.
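For orientation, here is a minimal sketch contrasting the two APIs; it assumes a modern scikit-learn installation where only sklearn.model_selection is available:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 5 + [1] * 5)

# Old API (sklearn.cross_validation, removed in 0.20): labels went to the
# constructor and the object was iterated directly:
#     for train, test in StratifiedKFold(y, n_folds=2): ...

# New API: the fold count goes to the constructor, the data to split().
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train, test in skf.split(X, y):
    print(train, test)  # index arrays; class ratios preserved per fold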
Example #1
Source File: embutils.py From DynamicTriad with Apache License 2.0
def _validate_link_reconstruction(self, samples, lbs):
    # cache = utils.KeyDefaultDict(lambda x: self.embeddings_at(x))
    # feat = []
    # for v in samples:
    #     emb = cache[v[0] - 1]
    #     # feat.append(np.concatenate((emb[v[1]], emb[v[2]]), axis=0))
    #     feat.append(np.abs(emb[v[1]] - emb[v[2]]))
    # feat = np.vstack(feat)
    feat = self.make_features(samples)
    feat = np.abs(feat[:, 0] - feat[:, 1])

    clf = LogisticRegression()
    try:
        cv = StratifiedKFold(lbs, n_folds=2, shuffle=True)
        parts = cv
    except TypeError:
        cv = StratifiedKFold(n_splits=2, shuffle=True)
        parts = cv.split(feat, lbs)

    val_score = []
    for tr, te in parts:
        model = clf.fit(feat[tr], lbs[tr])
        p = model.predict(feat[te])
        val_score.append(f1_score(lbs[te], p))
    return np.mean(val_score)
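The try/except above is a common compatibility shim: the old-style constructor call raises TypeError under the new API, and the code falls back to split(). A minimal standalone version of the same idea (stratified_splits is a hypothetical helper, not part of DynamicTriad) might look like:

try:
    from sklearn.cross_validation import StratifiedKFold  # scikit-learn < 0.18
except ImportError:
    from sklearn.model_selection import StratifiedKFold   # scikit-learn >= 0.18

def stratified_splits(feat, lbs, k=2):
    """Return an iterable of (train, test) index pairs under either API."""
    try:
        # Old API: labels go to the constructor; the object is iterable.
        return StratifiedKFold(lbs, n_folds=k, shuffle=True)
    except TypeError:
        # New API: fold count in the constructor, data passed to split().
        return StratifiedKFold(n_splits=k, shuffle=True).split(feat, lbs)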
Example #2
Source File: test_grid_search.py From diogenes with MIT License
def test_slice_on_dimension(self):
    iris = datasets.load_iris()
    y = iris.target
    M = iris.data
    clfs = [{'clf': RandomForestClassifier,
             'n_estimators': [10, 100],
             'max_depth': [1, 10],
             'random_state': [0]},
            {'clf': SVC,
             'kernel': ['linear', 'rbf'],
             'random_state': [0]}]
    subsets = [{'subset': per.SubsetRandomRowsActualDistribution,
                'subset_size': [20, 40, 60, 80, 100],
                'random_state': [0]}]
    cvs = [{'cv': StratifiedKFold}]
    exp = per.Experiment(M, y, clfs, subsets, cvs)
    result = [str(trial) for trial in exp.slice_on_dimension(
        per.CLF, RandomForestClassifier).trials]
    self.__compare_to_ref_pkl(result, 'slice_on_dimension_clf')
    result = [str(trial) for trial in exp.slice_on_dimension(
        per.SUBSET_PARAMS, {'subset_size': 60}).trials]
    self.__compare_to_ref_pkl(result, 'slice_on_dimension_subset_params')
Example #3
Source File: test_grid_search.py From diogenes with MIT License
def test_slice_by_best_score(self):
    iris = datasets.load_iris()
    y = iris.target
    M = iris.data
    clfs = [{'clf': RandomForestClassifier,
             'n_estimators': [10, 100],
             'max_depth': [1, 10],
             'random_state': [0]},
            {'clf': SVC,
             'kernel': ['linear', 'rbf'],
             'random_state': [0]}]
    subsets = [{'subset': per.SubsetRandomRowsActualDistribution,
                'subset_size': [20, 40],
                'random_state': [0]}]
    cvs = [{'cv': StratifiedKFold}]
    exp = per.Experiment(M, y, clfs, subsets, cvs)
    exp.run()
    result = {str(trial): trial.average_score()
              for trial in exp.slice_by_best_score(per.CLF_PARAMS).trials}
    self.__compare_to_ref_pkl(result, 'slice_by_best_score')
Example #4
Source File: test_grid_search.py From diogenes with MIT License
def test_make_csv(self):
    M, y = uft.generate_test_matrix(1000, 5, 2, random_state=0)
    clfs = [{'clf': RandomForestClassifier,
             'n_estimators': [10, 100],
             'max_depth': [5, 25],
             'random_state': [0]},
            {'clf': SVC,
             'kernel': ['linear', 'rbf'],
             'probability': [True],
             'random_state': [0]}]
    subsets = [{'subset': per.SubsetSweepNumRows,
                'num_rows': [[100, 200]],
                'random_state': [0]}]
    cvs = [{'cv': StratifiedKFold,
            'n_folds': [2, 3]}]
    exp = per.Experiment(M, y, clfs=clfs, subsets=subsets, cvs=cvs)
    result_path = exp.make_csv()
Example #5
Source File: test_grid_search.py From diogenes with MIT License
def test_report_complex(self):
    M, y = uft.generate_test_matrix(100, 5, 2)
    clfs = [{'clf': RandomForestClassifier,
             'n_estimators': [10, 100],
             'max_depth': [1, 10],
             'random_state': [0]},
            {'clf': SVC,
             'kernel': ['linear', 'rbf'],
             'probability': [True],
             'random_state': [0]}]
    subsets = [{'subset': per.SubsetRandomRowsActualDistribution,
                'subset_size': [20, 40, 60, 80, 100],
                'random_state': [0]}]
    cvs = [{'cv': StratifiedKFold}]
    exp = per.Experiment(M, y, clfs, subsets, cvs)
    _, rep = exp.make_report(dimension=per.CLF,
                             return_report_object=True,
                             verbose=False)
    self.report.add_heading('test_report_complex', 1)
    self.report.add_subreport(rep)
Example #6
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License
def test_stratified_kfold_no_shuffle():
    # Manually check that StratifiedKFold preserves the data ordering as much
    # as possible on toy datasets in order to avoid hiding sample dependencies
    # when possible
    splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 2])
    assert_array_equal(train, [1, 3])

    train, test = next(splits)
    assert_array_equal(test, [1, 3])
    assert_array_equal(train, [0, 2])

    splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 3, 4])
    assert_array_equal(train, [2, 5, 6])

    train, test = next(splits)
    assert_array_equal(test, [2, 5, 6])
    assert_array_equal(train, [0, 1, 3, 4])
Example #7
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
Example #8
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]
Example #9
Source File: class_w2v.py From 2016CCF-sougou with Apache License 2.0
def validation(self, X, Y, kind):
    """Validate using 2-fold cross-validation."""
    print 'validating...'
    fold_n = 2
    folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
    score = np.zeros(fold_n)
    for j, (train_idx, test_idx) in enumerate(folds):
        print j + 1, '-fold'
        X_train = X[train_idx]
        y_train = Y[train_idx]
        X_test = X[test_idx]
        y_test = Y[test_idx]
        res = self.fit(X_train, y_train, X_test)
        cur = sum(y_test == res) * 1.0 / len(res)
        score[j] = cur
    print score, score.mean()
    return score.mean()
Example #10
Source File: test_grid_search.py From twitter-stock-recommendation with MIT License
def test_grid_search_score_consistency():
    # test that correct scores are used
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_folds=3, y=y)
        for C, scores in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            scores = scores[2]  # get the separate runs from grid scores
            i = 0
            for train, test in cv:
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, scores[i])
                i += 1
Example #11
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves label ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    labels = np.array([4] * int(0.10 * n_samples) +
                      [0] * int(0.89 * n_samples) +
                      [1] * int(0.01 * n_samples))
    for shuffle in [False, True]:
        for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
            assert_almost_equal(np.sum(labels[train] == 4) / len(train),
                                0.10, 2)
            assert_almost_equal(np.sum(labels[train] == 0) / len(train),
                                0.89, 2)
            assert_almost_equal(np.sum(labels[train] == 1) / len(train),
                                0.01, 2)
            assert_almost_equal(np.sum(labels[test] == 4) / len(test),
                                0.10, 2)
            assert_almost_equal(np.sum(labels[test] == 0) / len(test),
                                0.89, 2)
            assert_almost_equal(np.sum(labels[test] == 1) / len(test),
                                0.01, 2)
Example #12
Source File: stacker.py From brew with MIT License
def fit_layer(self, layer_idx, X, y):
    if layer_idx >= len(self.layers):
        return
    elif layer_idx == len(self.layers) - 1:
        self.layers[layer_idx].fit(X, y)
    else:
        n_classes = len(set(y)) - 1
        n_classifiers = len(self.layers[layer_idx])
        output = np.zeros((X.shape[0], n_classes * n_classifiers))
        skf = cross_validation.StratifiedKFold(y, self.cv)
        for tra, tst in skf:
            self.layers[layer_idx].fit(X[tra], y[tra])
            out = self.layers[layer_idx].output(X[tst], mode=self.mode)
            output[tst, :] = out[:, 1:, :].reshape(
                out.shape[0], (out.shape[1] - 1) * out.shape[2])
        self.layers[layer_idx].fit(X, y)
        self.fit_layer(layer_idx + 1, output, y)
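Example #12 builds stacked features by filling each test fold with the lower layer's out-of-fold outputs, so no sample is ever predicted by a model that saw it during fitting. For a single estimator, the modern API offers this pattern directly via cross_val_predict; a rough self-contained sketch (not brew's implementation):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X, y = load_iris(return_X_y=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Out-of-fold class probabilities: safe inputs for the next stacking layer.
oof = cross_val_predict(LogisticRegression(max_iter=1000), X, y,
                        cv=skf, method='predict_proba')
print(oof.shape)  # (n_samples, n_classes)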
Example #13
Source File: simple_benchmark.py From RotationForest with MIT License
def test_toy_data(name, clf):
    X, y = classification_data()
    k_folds = 5
    cv = StratifiedKFold(y, k_folds, random_state=1234)
    acc, auc = [], []
    for train, test in cv:
        xt, xv, yt, yv = X[train, :], X[test, :], y[train], y[test]
        clf.fit(xt, yt)
        yhat = clf.predict(xv)
        proba = clf.predict_proba(xv)[:, 1]
        acc.append(np.mean(yhat == yv))
        auc.append(roc_auc_score(yv, proba))
    acc_mean, acc_std = np.mean(acc), np.std(acc)
    auc_mean, auc_std = np.mean(auc), np.std(auc)
    print name
    print 'accuracy: {0:.3f} +/- {1:.3f}'.format(acc_mean, acc_std)
    print 'auc: {0:.3f} +/- {1:.3f}'.format(auc_mean, auc_std)
    print '-' * 80
    return {'name': name,
            'acc_mean': acc_mean, 'acc_std': acc_std,
            'auc_mean': auc_mean, 'auc_std': auc_std}
Example #14
Source File: classif_and_ktst.py From jstsp2015 with MIT License
def compute_svm_score_nestedCV(K, y, n_folds,
                               scoring=balanced_accuracy_scoring,
                               random_state=None,
                               param_grid=[{'C': np.logspace(-5, 5, 25)}]):
    """Compute cross-validated score of SVM using precomputed kernel.
    """
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                         random_state=random_state)
    scores = np.zeros(n_folds)
    for i, (train, test) in enumerate(cv):
        cvclf = SVC(kernel='precomputed')
        y_train = y[train]
        cvcv = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True,
                               random_state=random_state)
        clf = GridSearchCV(cvclf, param_grid=param_grid, scoring=scoring,
                           cv=cvcv, n_jobs=1)
        clf.fit(K[train, :][:, train], y_train)
        # print clf.best_params_
        scores[i] = clf.score(K[test, :][:, train], y[test])
    return scores.mean()
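Example #14 is nested cross-validation done by hand: an outer StratifiedKFold scores models whose C is tuned by an inner StratifiedKFold inside GridSearchCV, which avoids the optimistic bias of tuning and scoring on the same folds. Under the modern API, and with a plain RBF SVC substituted for the precomputed kernel so the sketch is self-contained, the same structure looks roughly like:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import (StratifiedKFold, GridSearchCV,
                                     cross_val_score)
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# The inner loop picks C; the outer loop scores the tuned model.
clf = GridSearchCV(SVC(kernel='rbf'), {'C': np.logspace(-2, 2, 5)}, cv=inner)
print(cross_val_score(clf, X, y, cv=outer).mean())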
Example #15
Source File: embutils.py From DynamicTriad with Apache License 2.0
def _validate_node_classify(self, samples, lbs):
    # note that the 1st dimension of feat is for each node in each sample
    # (time, node1, node2, ...)
    feat = self.make_features(samples)[:, 0]
    assert len(feat) == len(lbs)

    clf = LogisticRegression(class_weight='balanced')
    try:
        cv = StratifiedKFold(lbs, n_folds=2, shuffle=True)
        parts = cv
    except TypeError as e:
        cv = StratifiedKFold(n_splits=2, shuffle=True)
        parts = cv.split(feat, lbs)

    val_score = []
    for tr, te in parts:
        model = clf.fit(feat[tr], lbs[tr])
        p = model.predict(feat[te])
        val_score.append(f1_score(lbs[te], p))
    return np.mean(val_score)
Example #16
Source File: class_w2v.py From 2016_CCFsougou2 with MIT License
def validation(self, X, Y, kind):
    """Validate using 2-fold cross-validation."""
    print 'validating...'
    fold_n = 2
    folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
    score = np.zeros(fold_n)
    for j, (train_idx, test_idx) in enumerate(folds):
        print j + 1, '-fold'
        X_train = X[train_idx]
        y_train = Y[train_idx]
        X_test = X[test_idx]
        y_test = Y[test_idx]
        res = self.fit(X_train, y_train, X_test)
        cur = sum(y_test == res) * 1.0 / len(res)
        score[j] = cur
    print score, score.mean()
    return score.mean()
Example #17
Source File: naive_bayes.py From yenlp with GNU General Public License v3.0
def naive_bayes(pos_samples, neg_samples, n_folds=2):
    '''Trains a naive bayes classifier with NLTK.
    It uses stratified n-fold validation.
    Inputs are the positive and negative samples and the number of folds.
    Returns the total accuracy and the classifier and the train/test sets
    of the last fold.'''
    samples = np.array(pos_samples + neg_samples)
    labels = [label for (words, label) in samples]
    cv = cross_validation.StratifiedKFold(labels, n_folds=n_folds,
                                          shuffle=True)
    accuracy = 0.0
    for traincv, testcv in cv:
        train_samples = samples[traincv]
        test_samples = samples[testcv]
        classifier = nltk.NaiveBayesClassifier.train(train_samples)
        accuracy += nltk.classify.util.accuracy(classifier, test_samples)
    accuracy /= n_folds
    return (accuracy, classifier, train_samples, test_samples)
Example #18
Source File: class_w2v.py From 2016CCF_BDCI_Sougou with MIT License
def validation(self, X, Y, kind):
    """Validate using 2-fold cross-validation."""
    print 'validating...'
    fold_n = 2
    folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
    score = np.zeros(fold_n)
    for j, (train_idx, test_idx) in enumerate(folds):
        print j + 1, '-fold'
        X_train = X[train_idx]
        y_train = Y[train_idx]
        X_test = X[test_idx]
        y_test = Y[test_idx]
        res = self.fit(X_train, y_train, X_test)
        cur = sum(y_test == res) * 1.0 / len(res)
        score[j] = cur
    print score, score.mean()
    return score.mean()
Example #19
Source File: base.py From stacking with MIT License
def create_cv_id(target, n_folds_=5, cv_id_name=cv_id_name, seed=407):
    try:
        a = StratifiedKFold(target['target'], n_folds=n_folds_,
                            shuffle=True, random_state=seed)
        cv_index = a.test_folds
        print 'Done StratifiedKFold'
    except:
        cv_index = np.empty(len(target))
        a = KFold(len(target), n_folds=n_folds_, shuffle=True,
                  random_state=seed)
        for idx, i in enumerate(a):
            cv_index[i[1]] = idx
        cv_index = cv_index.astype(int)
        print 'Done Kfold'

    np.save(INPUT_PATH + cv_id_name, cv_index)
    return


######### Utils #########

# Util function that builds a dataset from a given feature list
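Example #19 persists a per-sample fold id so that later stages can reuse exactly the same partition. With the modern API the same array can be built from split(); a sketch with hypothetical names (make_fold_ids is not part of the stacking project):

import numpy as np
from sklearn.model_selection import StratifiedKFold

def make_fold_ids(y, n_folds=5, seed=407):
    # fold_id[i] = index of the fold in which sample i lands in the test set
    fold_id = np.empty(len(y), dtype=int)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for k, (_, test_idx) in enumerate(skf.split(np.zeros(len(y)), y)):
        fold_id[test_idx] = k
    return fold_id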
Example #20
Source File: utils.py From kaggle_otto with BSD 3-Clause "New" or "Revised" License
def make_blender_cv(classifier, x, y, calibrate=False):
    skf = StratifiedKFold(y, n_folds=5, random_state=23)
    scores, predictions = [], None
    for train_index, test_index in skf:
        if calibrate:
            # Make training and calibration
            calibrated_classifier = CalibratedClassifierCV(
                classifier, method='isotonic', cv=get_cv(y[train_index]))
            fitted_classifier = calibrated_classifier.fit(
                x[train_index, :], y[train_index])
        else:
            fitted_classifier = classifier.fit(x[train_index, :],
                                               y[train_index])
        preds = fitted_classifier.predict_proba(x[test_index, :])

        # Free memory
        calibrated_classifier, fitted_classifier = None, None
        gc.collect()

        scores.append(log_loss(y[test_index], preds))
        predictions = (np.append(predictions, preds, axis=0)
                       if predictions is not None else preds)
    return scores, predictions
Example #21
Source File: blender.py From kaggle_otto with BSD 3-Clause "New" or "Revised" License
def get_weights():
    # Read validation labels
    _, labels, _, _, _ = utils.load_data()
    skf = StratifiedKFold(labels, n_folds=5, random_state=23)
    test_index = None
    for _, test_idx in skf:
        test_index = (np.append(test_index, test_idx)
                      if test_index is not None else test_idx)
    val_labels = labels[test_index]

    # Read predictions on validation set
    val_predictions = []
    prediction_files = utils.get_prediction_files()
    for preds_file in prediction_files:
        vp = np.genfromtxt(os.path.join(consts.BLEND_PATH, preds_file),
                           delimiter=',')
        val_predictions.append(vp)

    # Minimize blending function
    p0 = [1.] * len(prediction_files)
    p = fmin_cobyla(error, p0, args=(val_predictions, val_labels),
                    cons=[constraint], rhoend=1e-5)
    return p
Example #22
Source File: simulation.py From jstsp2015 with MIT License
def compute_svm_score_nestedCV(K, y, n_folds,
                               scoring='accuracy',
                               random_state=None,
                               param_grid=[{'C': np.logspace(-5, 5, 20)}]):
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                         random_state=random_state)
    scores = np.zeros(n_folds)
    for i, (train, test) in enumerate(cv):
        cvclf = SVC(kernel='precomputed')
        y_train = y[train]
        cvcv = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True,
                               random_state=random_state)
        clf = GridSearchCV(cvclf, param_grid=param_grid, scoring=scoring,
                           cv=cvcv, n_jobs=1)
        clf.fit(K[:, train][train, :], y_train)
        scores[i] = clf.score(K[test, :][:, train], y[test])
    return scores.mean()
Example #23
Source File: classify_nodes.py From PyTorch-Luna16 with Apache License 2.0
def classifyData():
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")

    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train, :], X[test, :], Y[train], Y[test]
        clf = RF(n_estimators=100, n_jobs=3)
        clf.fit(X_train, y_train)
        y_pred[test] = clf.predict(X_test)
    print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
    print("logloss", logloss(Y, y_pred))

    # All Cancer
    print("Predicting all positive")
    y_pred = np.ones(Y.shape)
    print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
    print("logloss", logloss(Y, y_pred))

    # No Cancer
    print("Predicting all negative")
    y_pred = Y * 0
    print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
    print("logloss", logloss(Y, y_pred))

    # try XGBoost
    print("XGBoost")
    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train, :], X[test, :], Y[train], Y[test]
        clf = xgb.XGBClassifier(objective="binary:logistic")
        clf.fit(X_train, y_train)
        y_pred[test] = clf.predict(X_test)
    print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
    print("logloss", logloss(Y, y_pred))
Example #24
Source File: classify.py From 2016_CCFsougou2 with MIT License
def validation(self, X, Y, wv_X, kind):
    """2-fold validation
    :param X: train text
    :param Y: train label
    :param wv_X: train wv_vec
    :param kind: age/gender/education
    :return: mean score of 2-fold validation
    """
    print 'vectorizing...'
    X = np.array(X)
    fold_n = 2
    folds = list(StratifiedKFold(Y, n_folds=fold_n, shuffle=False,
                                 random_state=0))
    score = np.zeros(fold_n)
    for j, (train_idx, test_idx) in enumerate(folds):
        print j + 1, '-fold'
        X_train = X[train_idx]
        y_train = Y[train_idx]
        X_test = X[test_idx]
        y_test = Y[test_idx]

        wv_X_train = wv_X[train_idx]
        wv_X_test = wv_X[test_idx]

        vec = TfidfVectorizer(use_idf=True, sublinear_tf=False,
                              max_features=50000, binary=True)
        vec.fit(X_train, y_train)
        X_train = vec.transform(X_train)
        X_test = vec.transform(X_test)
        print 'shape', X_train.shape

        ypre = self.stacking(X_train, y_train, X_test,
                             wv_X_train, wv_X_test, kind)
        cur = sum(y_test == ypre) * 1.0 / len(ypre)
        score[j] = cur
    print score
    print score.mean(), kind
    return score.mean()
Example #25
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0
def Get_yPred(X, y, clf_class, n_folds=10, pred_proba=False):  # ,**kwargs):
    '''
    Return "Full" Y_predictions from a given classifier (not just from one
    split): (From def run_cv)
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
    Could also be done with stratified shuffle split (+Append output)?
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    '''
    # Construct a kfolds object
    # kf = StratifiedKFold(len(y), n_folds, shuffle=True)  # shuffle?
    kf = StratifiedKFold(y, n_folds, shuffle=True)  # shuffle?
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # sample_weight = balance_weights(y_train)
        # Initialize a classifier with key word arguments
        clf = clf_class  # (**kwargs)
        # sample_weight weighting not working here..? TODO
        clf.fit(X_train, y_train)  # ,sample_weight)
        if pred_proba == True:
            y_pred[test_index] = clf.predict_proba(X_test)
        else:
            y_pred[test_index] = clf.predict(X_test)
    return y_pred
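With the modern API, the whole-dataset out-of-fold prediction that Get_yPred assembles by hand is what cross_val_predict provides; a sketch under that assumption (get_y_pred is a hypothetical name, not part of ProFET):

from sklearn.model_selection import StratifiedKFold, cross_val_predict

def get_y_pred(X, y, clf, n_folds=10, pred_proba=False):
    # Each y_pred[i] comes from the fold in which sample i was held out.
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True)
    method = 'predict_proba' if pred_proba else 'predict'
    return cross_val_predict(clf, X, y, cv=cv, method=method)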
Example #26
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0
def plotRFECV(X, y, stepSize=0.05, scoring='f1'):
    '''
    Plot recursive feature elimination example with automatic tuning of the
    number of features selected with cross-validation.
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear")
    svc = SVC(kernel="linear", class_weight='auto', cache_size=1400)
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2),
                  scoring=scoring)
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
Example #27
Source File: classif_and_ktst.py From jstsp2015 with MIT License
def compute_svm_cv(K, y, C=100.0, n_folds=5,
                   scoring=balanced_accuracy_scoring):
    """Compute cross-validated score of SVM with given precomputed kernel.
    """
    cv = StratifiedKFold(y, n_folds=n_folds)
    clf = SVC(C=C, kernel='precomputed', class_weight='auto')
    scores = cross_val_score(clf, K, y,
                             scoring=scoring, cv=cv)
    return scores.mean()
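For context, the precomputed kernel K expected here is an (n_samples, n_samples) Gram matrix; scikit-learn's cross-validation utilities slice such a matrix correctly when the estimator declares kernel='precomputed'. A minimal sketch of producing one and scoring it (class_weight='balanced' is the modern spelling of the 'auto' used above):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

X = np.random.RandomState(0).randn(40, 3)
y = np.array([0, 1] * 20)
K = rbf_kernel(X, X)  # (n_samples, n_samples) Gram matrix
cv = StratifiedKFold(n_splits=5)
clf = SVC(C=100.0, kernel='precomputed', class_weight='balanced')
print(cross_val_score(clf, K, y, cv=cv).mean())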
Example #28
Source File: simulation.py From jstsp2015 with MIT License
def compute_svm_score(K, y, n_folds,
                      scoring='accuracy',
                      random_state=0):
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                         random_state=random_state)
    clf = SVC(C=1.0, kernel='precomputed')
    scores = cross_val_score(clf, K, y, scoring=scoring, cv=cv, n_jobs=1)
    score = scores.mean()
    return score
Example #29
Source File: test_split.py From twitter-stock-recommendation with MIT License
def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yield
    # different results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # numpy's assert_array_equal properly compares nested lists
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert_false(splits_are_equal, "If the splits are randomized, "
                 "successive calls to split should yield different results")