Python sklearn.cross_validation.StratifiedShuffleSplit() Examples
The following are 16 code examples of sklearn.cross_validation.StratifiedShuffleSplit(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cross_validation, or try the search function.
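All of the examples below use the legacy sklearn.cross_validation API, which was deprecated in scikit-learn 0.18 and removed in 0.20 in favour of sklearn.model_selection. In that legacy API, StratifiedShuffleSplit takes the label array as its first argument and the resulting object is itself iterable, yielding (train_indices, test_indices) pairs. A minimal sketch of that pattern, with made-up labels (requires scikit-learn < 0.20):

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit  # legacy module, scikit-learn < 0.20 only

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])  # illustrative labels, two balanced classes
sss = StratifiedShuffleSplit(y, n_iter=3, test_size=0.25, random_state=0)

# The splitter itself is iterable; each iteration is one stratified shuffle
for train_index, test_index in sss:
    print("train:", train_index, "test:", test_index)

In sklearn.model_selection the equivalent object is instead constructed with n_splits, and the splits come from its .split(X, y) method.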
Example #1
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2)
Example #2
Source File: car_eval.py From mHTM with MIT License | 5 votes |
def base_learners(data_path='data.csv', seed=123456789):
    """
    Test some classifiers on the raw data.
    """
    # Params
    nsplits = 8
    pct_train = 0.8

    # Get data
    data = pd.read_csv(data_path)
    x = data.ix[:, :-1].as_matrix()
    y = data.ix[:, -1].as_matrix()
    x, y = convert_data_to_int(x, y)

    # Run random forest in parallel
    sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
                                 random_state=seed)
    results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
        RandomForestClassifier(random_state=i), x[tr], x[te], y[tr], y[te])
        for i, (tr, te) in enumerate(sss))
    print 'Random Forest: {0:.3f} %'.format(np.median(results))

    # Run SVM in parallel
    sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
                                 random_state=seed)
    results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
        LinearSVC(random_state=i), x[tr], x[te], y[tr], y[te])
        for i, (tr, te) in enumerate(sss))
    print 'Linear SVM: {0:.3f} %'.format(np.median(results))
Example #3
Source File: loader.py From mHTM with MIT License | 5 votes |
def _create_generator(self):
    """
    Create a generator for the data. Yield a tuple containing the current
    training and testing split.
    """
    # Create the CV iterators
    sss_tr = StratifiedShuffleSplit(self.tr_y, self.nsplits,
                                    train_size=self.train_size,
                                    random_state=self.seed)
    sss_te = StratifiedShuffleSplit(self.te_y, self.nsplits,
                                    train_size=self.test_size,
                                    random_state=self.seed)

    # Yield each item
    for tr, te in izip(sss_tr, sss_te):
        yield tr[0], te[0] + len(self.tr_y)  # Offset testing indexes
Example #4
Source File: utils.py From kaggle_otto with BSD 3-Clause "New" or "Revised" License | 5 votes |
def stratified_split(x, y, test_size=0.2):
    strat_shuffled_split = StratifiedShuffleSplit(y, n_iter=1,
                                                  test_size=test_size,
                                                  random_state=23)
    train_index, valid_index = [s for s in strat_shuffled_split][0]

    x_train, y_train = x[train_index, :], y[train_index]
    x_valid, y_valid = x[valid_index, :], y[valid_index]

    return x_train, y_train, x_valid, y_valid
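As a point of reference, here is how the helper above might be called. The data is purely illustrative, and the call assumes scikit-learn < 0.20 so that sklearn.cross_validation (and therefore stratified_split as defined in Example #4) is available:

import numpy as np

# Made-up data: 10 samples, 3 features, two balanced classes
x = np.random.rand(10, 3)
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

# 80/20 stratified split; each class keeps roughly the same proportion on both sides
x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size=0.2)
print(x_train.shape, x_valid.shape)  # (8, 3) (2, 3)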
Example #5
Source File: data_dirs_organizer.py From painters with MIT License | 5 votes |
def _train_val_split_indices(labels):
    split = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(split))

    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=False)
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=True)

    return indices_tr, indices_val, split.classes
Example #6
Source File: data.py From kaggle_diabetic with MIT License | 5 votes |
def split_indices(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, per_patient=True)
    spl = cross_validation.StratifiedShuffleSplit(labels[:, 0],
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    tr = np.hstack([tr * 2, tr * 2 + 1])
    te = np.hstack([te * 2, te * 2 + 1])
    return tr, te
Example #7
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 5 votes |
def Get_yPred(X, y, clf_class, n_folds=10, pred_proba=False):  # ,**kwargs):
    '''
    Return "Full" Y_predictions from a given classifier (not just from one split):
    (From def run_cv)
    http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
    Could also be done with stratified shuffle split (+Append output) ?
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    '''
    # Construct a kfolds object
    # kf = StratifiedKFold(len(y),n_folds,shuffle=True) #shuffle?
    kf = StratifiedKFold(y, n_folds, shuffle=True)  # shuffle?
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # sample_weight=balance_weights(y_train)

        # Initialize a classifier with key word arguments
        clf = clf_class  # (**kwargs)
        # sample_weight weighting not working here.. ? TODO
        clf.fit(X_train, y_train)  # ,sample_weight)
        if pred_proba == True:
            y_pred[test_index] = clf.predict_proba(X_test)
        else:
            y_pred[test_index] = clf.predict(X_test)
    return y_pred
Example #8
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 5 votes |
def PlotPerfPercentFeatures(X, y, est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova) varying the percentile
    of features selected (F-test).
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See also (similar, but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb
    '''
    transform = SelectPercentile(f_classif)
    clf = Pipeline([('anova', transform), ('est', est)])

    ###########################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1, 2, 3, 5, 7, 10, 13, 15, 20, 25, 33, 50, 65, 75, 90, 99)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,
                                      cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3),
                                      n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
Example #9
Source File: OutPutRes.py From ProFET with GNU General Public License v3.0 | 5 votes |
def PlotPerfPercentFeatures(X, y, est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova) varying the percentile
    of features selected (F-test).
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py
    '''
    transform = SelectPercentile(f_classif)
    clf = Pipeline([('anova', transform), ('est', est)])

    ###########################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1, 2, 3, 5, 7, 10, 13, 15, 20, 25, 33, 50, 65, 75, 90, 100)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,
                                      cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3),
                                      n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
Example #10
Source File: OutPutRes.py From ProFET with GNU General Public License v3.0 | 5 votes |
def CV_multi_stats(X, y, model, n=6):
    '''
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    This version uses multiclass (or multilabel) compatible metrics.

    May be expanded to use the cross_val_score helper function:
    http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''
    scores = cross_val_score(estimator=model, X=X, y=y,
                             cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16),
                             n_jobs=-1)  # Accuracy
    scores_f1 = cross_val_score(estimator=model, X=X, y=y,
                                cv=StratifiedShuffleSplit(y, n_iter=n, test_size=0.16),
                                n_jobs=-1, scoring='f1')
    print("Model Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("Model f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
    return (scores.mean(), scores.std(), scores_f1.mean(), scores_f1.std())  # Removed * 2 from returned STD .. ?
Example #11
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        test_size = np.ceil(0.33 * len(y))
        train_size = len(y) - test_size
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = (np.bincount(np.unique(y[train],
                                             return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test],
                                            return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), train_size)
            assert_equal(len(test), test_size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
Example #12
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                         test_size=0.5, random_state=0)
    train, test = next(iter(splits))

    assert_array_equal(np.intersect1d(train, test), [])
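For readers on scikit-learn 0.20 or later, where sklearn.cross_validation no longer exists, the same overlap check can be expressed against sklearn.model_selection. The sketch below mirrors the test above but is not part of the original test suite:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

labels = np.array([0, 1, 2, 3] * 3 + [4, 5] * 5)

# The modern API takes n_splits at construction time and yields splits
# from .split(X, y); X is only needed for its length here.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
train, test = next(sss.split(np.zeros(len(labels)), labels))

# Train and test indices must never overlap
assert np.intersect1d(train, test).size == 0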
Example #13
Source File: cross_validation.py From smappPy with GNU General Public License v2.0 | 4 votes |
def grouped_stratified_train_test_split(y, x, group_by=None, test_size=0.33,
                                        group_labeler=None, return_indices=False,
                                        **kwargs):
    """
    Split arrays or matrices into random training and test subsets. Subsets
    will contain equal proportions of each label in `y`.
    Based on StratifiedShuffleSplit from sklearn.cross_validation.

    If `group_by` is an iterable of length `len(y)`, indices with the same
    `group_by[i]` will be kept together in either the training or the test set.
    If `group_labeler` is a callable, it will be used to assign a label to a
    group of labels. The default is
    `lambda labels: int(np.round(np.average(labels)))`

    --------
    Example:
    X = np.array([[1, 2], [3, 4], [1, 4], [3, 1], [1, 4], [3, 1],
                  [1, 4], [3, 1], [1, 4], [3, 1], [1, 4], [3, 1]])
    y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1])
    id = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
    x_train, x_test, y_train, y_test = grouped_stratified_train_test_split(y, X, id)
    """
    if not group_labeler:
        group_labeler = lambda labels: int(np.round(np.average(labels)))

    group_indices = dict()
    group_labels = dict()
    for i, (label, group) in enumerate(zip(y, group_by)):
        if not group in group_labels:
            group_labels[group] = list()
            group_indices[group] = list()
        group_indices[group].append(i)
        group_labels[group].append(label)

    groups, labels = zip(*{
        group: group_labeler(labels) for group, labels in group_labels.items()
    }.items())

    sss = StratifiedShuffleSplit(labels, 1, test_size=test_size, **kwargs)
    group_train_indices, group_test_indices = list(sss)[0]

    test_groups = [groups[i] for i in group_test_indices]
    train_groups = [groups[j] for j in group_train_indices]

    test_indices = [idx for group in test_groups for idx in group_indices[group]]
    train_indices = [idx for group in train_groups for idx in group_indices[group]]

    if return_indices:
        return train_indices, test_indices
    else:
        return x[train_indices], x[test_indices], y[train_indices], y[test_indices]
Example #14
Source File: helpers.py From scipy_2015_sklearn_tutorial with Creative Commons Zero v1.0 Universal | 4 votes |
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    f = open(os.path.join('datasets', 'titanic', 'titanic3.csv'))
    # Remove . from home.dest, split on quotes because some fields have commas
    keys = f.readline().strip().replace('.', '').split('","')
    lines = f.readlines()
    f.close()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    train_vectorizer_list = []
    test_vectorizer_list = []

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)

    # Doing this twice is horribly inefficient but the file is small...
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        numeric_labels[n] = line_dict["survived"]

    sss = StratifiedShuffleSplit(numeric_labels, n_iter=1, test_size=test_size,
                                 random_state=12)
    # This is a weird way to get the indices but it works
    train_idx = None
    test_idx = None
    for train_idx, test_idx in sss:
        pass

    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        if n in train_idx:
            train_vectorizer_list.append(strings)
        else:
            test_vectorizer_list.append(strings)
        numeric_data[n] = np.asarray([line_dict[k] for k in numeric_keys])

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels
Example #15
Source File: OutPutRes.py From ProFET with GNU General Public License v3.0 | 4 votes |
def CV_Binary_stats(X, y, model, n=10):
    '''
    http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
    Note that some of the metrics here ONLY work for BINARY tasks.
    This will be VERY slow compared to the built-in, multicore CV implementation
    (unless used with a classifier that is parallelized anyway, such as RF).
    By default, balances weights when fitting.
    http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
    '''
    from sklearn.metrics import precision_score, accuracy_score, recall_score, precision_recall_fscore_support

    mean_auc = 0.0
    mean_precision = 0.0
    mean_recall = 0.0
    mean_accuracy = 0.0
    sss = StratifiedShuffleSplit(y, n_iter=n, test_size=0.2, random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # for i in range(n):
        #     # for each iteration, randomly hold out 30% of the data as CV set
        #     X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y,
        #                                                                      test_size=.15,
        #                                                                      random_state=i)
        # cv=StratifiedShuffleSplit(y=y_train, n_iter=11, test_size=0.11)

        # train model and make predictions
        model.fit(X_train, y_train, sample_weight=balance_weights(y_train))
        preds = model.predict(X_test)

        '''
        # ROC_AUC - Restricted to binary (not multiclass) case.
        fpr, tpr, thresholds = metrics.roc_curve(y_test, preds)
        roc_auc = metrics.auc(fpr, tpr)
        # print("( %d/%d)" % (i + 1, n))
        mean_auc += roc_auc
        '''
        accuracy = accuracy_score(y_test, preds)
        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)
        mean_accuracy += accuracy
        mean_precision += precision
        mean_recall += recall

    mean_accuracy = mean_accuracy / n
    mean_precision = mean_precision / n
    mean_recall = mean_recall / n
    # mean_auc = mean_auc / n
    print('mean_accuracy: %s ' % (round(mean_accuracy, 3)))
    print('mean_precision: %s ' % (round(mean_precision, 3)))
    print('mean_recall: %s ' % (round(mean_recall, 3)))
    # print('mean_auc: %s ' % (round(mean_auc, 3)))
    return (mean_accuracy, mean_precision, mean_recall)
Example #16
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with an
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)

        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)

        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples
        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)