Python sklearn.cross_validation.ShuffleSplit() Examples
The following are 16 code examples of sklearn.cross_validation.ShuffleSplit().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.cross_validation, or try the search function.
Example #1
Source File: ml.py From info-flow-experiments with GNU General Public License v3.0 | 6 votes |
def split_data(X, y, splittype='timed', splitfrac=0.1, verbose=False):
    """Split ``X`` and ``y`` into a single train/test partition.

    Parameters
    ----------
    X, y : sequences of equal length
        ``'timed'`` only needs slicing; ``'rand'`` indexes with the index
        arrays produced by ``ShuffleSplit`` (e.g. numpy arrays).
    splittype : {'timed', 'rand'}
        ``'rand'`` draws one split from ``cross_validation.ShuffleSplit``;
        ``'timed'`` keeps the first ``int((1 - splitfrac) * len(X))`` items
        for training and the remainder for testing, preserving order.
    splitfrac : float
        Passed to ``ShuffleSplit`` as ``test_size`` for ``'rand'``; the
        trailing share used for testing for ``'timed'``.
    verbose : bool
        When true, print the chosen indices or the split position.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``splittype`` is neither ``'rand'`` nor ``'timed'``.  (The previous
        implementation blocked on ``raw_input`` and then failed on unbound
        locals.)
    """
    if splittype == 'rand':
        rs1 = cross_validation.ShuffleSplit(len(X), n_iter=1, test_size=splitfrac)
        # n_iter=1, so the first (and only) split is the one to use.
        train, test = next(iter(rs1))
        if verbose:
            print("Training blocks: %s" % (train,))
            print("Test blocks: %s" % (test,))
        return X[train], y[train], X[test], y[test]

    if splittype == 'timed':
        split = int((1. - splitfrac) * len(X))
        if verbose:
            print("Split at block  %s" % (split,))
        return X[:split], y[:split], X[split:], y[split:]

    raise ValueError(
        "Split type ERROR in ml.py: expected 'rand' or 'timed', got %r"
        % (splittype,))
Example #2
Source File: Deopen_classification.py From Deopen with MIT License | 6 votes |
def data_split(inputfile):
    """Load ``inputfile`` with ``hkl`` and return one shuffled train/test split.

    The file is expected to provide the keys ``'mat'``, ``'kmer'`` and ``'y'``.
    The k-mer block is reshaped to ``(n, 1024, 4)``, concatenated to ``'mat'``
    along axis 1, given a new axis at position 1 and its last two axes are
    swapped.  A single ``ShuffleSplit`` draw (``random_state=1``) selects the
    train and test rows.

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test]`` with features cast to
        ``float32`` and targets cast to ``int32``.
    """
    loaded = hkl.load(inputfile)
    features = loaded['mat']
    kmer = loaded['kmer']
    y = loaded['y']
    splitter = ShuffleSplit(len(y), n_iter=1, random_state=1)

    kmer = kmer.reshape((kmer.shape[0], 1024, 4))
    stacked = np.concatenate((features, kmer), axis=1)
    X = stacked[:, np.newaxis].transpose((0, 1, 3, 2))

    # n_iter=1: exactly one (train, test) pair is produced.
    train_idx, test_idx = next(iter(splitter))
    X_train = X[train_idx, :].astype('float32')
    y_train = y[train_idx].astype('int32')
    X_test = X[test_idx, :].astype('float32')
    y_test = y[test_idx].astype('int32')
    return [X_train, y_train, X_test, y_test]


# define the network architecture
Example #3
Source File: 02_tuning.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 6 votes |
def grid_search_model(clf_factory, X, Y):
    """Grid-search the estimator built by ``clf_factory`` and return the best one.

    The search runs ``GridSearchCV`` over vectorizer (``vect__*``) and
    classifier (``clf__alpha``) settings, scored with ``f1_score`` on ten
    shuffled splits that hold out 30% of the samples (``random_state=0``).
    The winning estimator is printed and returned.
    """
    splitter = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    search_space = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__min_df': [1, 2],
        'vect__stop_words': [None, "english"],
        'vect__smooth_idf': [False, True],
        'vect__use_idf': [False, True],
        'vect__sublinear_tf': [False, True],
        'vect__binary': [False, True],
        'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1],
    }

    search = GridSearchCV(
        clf_factory(),
        param_grid=search_space,
        cv=splitter,
        score_func=f1_score,
        verbose=10,
    )
    search.fit(X, Y)

    best = search.best_estimator_
    print(best)
    return best
Example #4
Source File: 04_sent.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 6 votes |
def __grid_search_model(clf_factory, X, Y):
    """Grid-search the estimator built by ``clf_factory`` and return the best one.

    ``GridSearchCV`` explores vectorizer (``vect__*``) and classifier
    (``clf__alpha``) settings, scored with ``f1_score`` on ten shuffled splits
    that hold out 30% of the samples (``random_state=0``).  The winning
    estimator is printed and returned.
    """
    splitter = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    search_space = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__min_df': [1, 2],
        'vect__smooth_idf': [False, True],
        'vect__use_idf': [False, True],
        'vect__sublinear_tf': [False, True],
        'vect__binary': [False, True],
        'clf__alpha': [0, 0.01, 0.05, 0.1, 0.5, 1],
    }

    search = GridSearchCV(
        clf_factory(),
        param_grid=search_space,
        cv=splitter,
        score_func=f1_score,
        verbose=10,
    )
    search.fit(X, Y)

    best = search.best_estimator_
    print(best)
    return best
Example #5
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_generator_with_indices():
    """Check that every CV generator yields index arrays, not boolean masks.

    Both the train and the test part of each split are checked (the earlier
    version asserted on ``train`` twice and never looked at ``test``), and
    both are then used to index ``X`` and ``y`` as a smoke test.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # indexing must work with whatever the generator produced
            X[train], X[test]
            y[train], y[test]
Example #6
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_generator_with_default_indices():
    """Check that CV generators built with default arguments yield index arrays.

    Both the train and the test part of each split are checked (the earlier
    version asserted on ``train`` twice and never looked at ``test``), and
    both are then used to index ``X`` and ``y`` as a smoke test.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # dtype kind 'b' would mean a boolean mask
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # indexing must work with whatever the generator produced
            X[train], X[test]
            y[train], y[test]
Example #7
Source File: Deopen_regression.py From Deopen with MIT License | 5 votes |
def data_split(inputfile, reads_count):
    """Load features and read counts with ``hkl`` and return one shuffled split.

    ``inputfile`` is expected to provide the keys ``'mat'`` and ``'kmer'``;
    ``reads_count`` is the path of a second ``hkl`` file.  The regression
    target is ``log(mean(reads_count, axis=1) + 1e-3)``.  The k-mer block is
    reshaped to ``(n, 1024, 4)``, concatenated to ``'mat'`` along axis 1,
    given a new axis at position 1 and its last two axes are swapped.  A
    single ``ShuffleSplit`` draw (``random_state=1``) selects the rows.

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test]``, all cast to ``float32``.
    """
    loaded = hkl.load(inputfile)
    counts = hkl.load(reads_count)
    features = loaded['mat']
    kmer = loaded['kmer']

    # The small offset keeps the logarithm finite when the mean count is zero.
    y = np.log(np.mean(np.array(counts), axis=1) + 1e-3)
    splitter = ShuffleSplit(len(y), n_iter=1, random_state=1)

    kmer = kmer.reshape((kmer.shape[0], 1024, 4))
    stacked = np.concatenate((features, kmer), axis=1)
    X = stacked[:, np.newaxis].transpose((0, 1, 3, 2))

    # n_iter=1: exactly one (train, test) pair is produced.
    train_idx, test_idx = next(iter(splitter))
    X_train = X[train_idx, :].astype('float32')
    y_train = y[train_idx].astype('float32')
    X_test = X[test_idx, :].astype('float32')
    y_test = y[test_idx].astype('float32')
    print('Data prepration done!')
    return [X_train, y_train, X_test, y_test]


# define the network architecture
Example #8
Source File: __init__.py From kaggle-right-whale with MIT License | 5 votes |
def __call__(self, X, y, net):
    """Split ``X`` and ``y`` into training and validation parts.

    Returns ``(X_train, X_valid, y_train, y_valid)``.

    * ``self.eval_size is None``: everything is used for training and the
      validation parts are empty slices.
    * ``net.regression`` is true or ``self.stratify`` is false: one
      ``ShuffleSplit`` draw is used.
    * otherwise: the first fold of a ``StratifiedKFold`` with
      ``round(1 / eval_size)`` folds is used.
    """
    if self.eval_size is None:
        return X, X[len(X):], y, y[len(y):]

    if net.regression or not self.stratify:
        # ShuffleSplit is asked to hold out 1 - eval_size and the two index
        # arrays are then swapped: its "train" part (the eval_size share)
        # becomes the validation set and its "test" part the training set.
        holdout = ShuffleSplit(
            y.shape[0],
            test_size=1 - self.eval_size,
            random_state=self.random_state,
        )
        valid_indices, train_indices = next(iter(holdout))
    else:
        fold_count = int(round(1 / self.eval_size))
        folds = StratifiedKFold(
            y, n_folds=fold_count, random_state=self.random_state)
        train_indices, valid_indices = next(iter(folds))

    X_train, y_train = X[train_indices], y[train_indices]
    X_valid, y_valid = X[valid_indices], y[valid_indices]
    return X_train, X_valid, y_train, y_valid
Example #9
Source File: train_model.py From kaggle-right-whale with MIT License | 5 votes |
def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    """Split ``X`` and ``y`` once into train and test parts.

    Parameters
    ----------
    X, y : arrays indexable by index arrays
    test_size : float
        Passed to ``ShuffleSplit`` when ``stratify`` is false.  When
        ``stratify`` is true the first fold of a ``StratifiedKFold`` with
        ``round(1 / test_size)`` folds is used instead.
    random_state : int
        Seed forwarded to the splitter.
    stratify : bool
        Select the stratified splitter.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    if stratify:
        n_folds = int(round(1 / test_size))
        splitter = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        splitter = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)

    # The builtin next() works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists on Python 2 only.
    train_idx, test_idx = next(iter(splitter))
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
Example #10
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_shuffle_split():
    """``ShuffleSplit`` must give the same splits for equivalent ``test_size`` values.

    With 10 samples and the same seed, ``test_size`` given as the fraction
    ``0.2``, the int ``2``, ``np.int32(2)`` or any type in
    ``six.integer_types`` has to produce identical train and test indices.
    """
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
        # Compare inside the loop so that every integer type is verified,
        # not only the last one assigned to ss4.  Re-iterating ss1..ss3 is
        # safe because a fixed random_state yields the same sequence again.
        for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
            assert_array_equal(t1[0], t2[0])
            assert_array_equal(t2[0], t3[0])
            assert_array_equal(t3[0], t4[0])
            assert_array_equal(t1[1], t2[1])
            assert_array_equal(t2[1], t3[1])
            assert_array_equal(t3[1], t4[1])
Example #11
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_shufflesplit_errors():
    """Each invalid size combination must make ``ShuffleSplit(10, ...)`` raise ``ValueError``."""
    invalid_sizes = [
        {'test_size': 2.0},
        {'test_size': 1.0},
        {'test_size': 0.1, 'train_size': 0.95},
        {'test_size': 11},
        {'test_size': 10},
        {'test_size': 8, 'train_size': 3},
        {'train_size': 1j},
        {'test_size': None, 'train_size': None},
    ]
    for kwargs in invalid_sizes:
        assert_raises(ValueError, cval.ShuffleSplit, 10, **kwargs)
Example #12
Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_shufflesplit_reproducible():
    """Iterating a seeded ``ShuffleSplit`` twice must give the same train indices."""
    splitter = cval.ShuffleSplit(10, random_state=21)
    first_pass = [train for train, _ in splitter]
    second_pass = [train for train, _ in splitter]
    assert_array_equal(first_pass, second_pass)
Example #13
Source File: ml.py From info-flow-experiments with GNU General Public License v3.0 | 4 votes |
def _flatten_blocks(blocks):
    """Concatenate the items of every block in ``blocks`` into one numpy array."""
    return np.array([item for sublist in blocks for item in sublist])


def _make_classifier(algo, values):
    """Build the estimator named ``algo`` from the hyper-parameter mapping ``values``."""
    if algo == 'svc':
        # Larger C increases model complexity
        return LinearSVC(C=values['C'], penalty="l1", dual=False)
    if algo == 'kNN':
        return KNeighborsClassifier(n_neighbors=values['k'],
                                    warn_on_equidistant=False, p=values['p'])
    if algo == 'linearSVM':
        return svm.SVC(kernel='linear', C=values['C'])
    if algo == 'polySVM':
        return svm.SVC(kernel='poly', degree=values['degree'], C=values['C'])
    if algo == 'rbfSVM':
        # a smaller gamma gives a decision boundary with a smoother curvature
        return svm.SVC(kernel='rbf', gamma=values['gamma'], C=values['C'])
    if algo == 'logit':
        return LogisticRegression(penalty=values['penalty'], dual=False,
                                  C=values['C'])
    if algo == 'tree':
        return ExtraTreesClassifier(n_estimators=values['ne'],
                                    compute_importances=True, random_state=0)
    if algo == 'randlog':
        return RandomizedLogisticRegression(C=values['C'])
    raise ValueError("Unknown algo %r" % (algo,))


def crossVal_algo(k, algo, params, X, y, splittype, splitfrac, verbose=False):
    """Pick the best hyper-parameter combination for ``algo`` by cross-validation.

    Every combination in the Cartesian product of the value lists in
    ``params`` is scored as the mean test accuracy over ``k`` splits.

    Parameters
    ----------
    k : int
        Number of splits (``n_iter`` for ``'rand'``, ``n_folds`` for ``'timed'``).
    algo : str
        One of ``'svc'``, ``'kNN'``, ``'linearSVM'``, ``'polySVM'``,
        ``'rbfSVM'``, ``'logit'``, ``'tree'``, ``'randlog'``.
    params : dict
        Maps a hyper-parameter name (``'C'``, ``'k'``, ``'p'``, ``'degree'``,
        ``'gamma'``, ``'penalty'``, ``'ne'``) to the list of values to try.
    X, y : arrays of blocks
        Indexed with the split's index arrays; each selected entry is itself
        iterable and its items are concatenated before fitting.
    splittype : {'rand', 'timed'}
        ``'rand'`` uses ``ShuffleSplit`` with ``test_size=splitfrac``;
        ``'timed'`` uses ``KFold``.
    splitfrac : float
        Test share for the ``'rand'`` splitter.
    verbose : bool
        Print each combination and its score.

    Returns
    -------
    tuple
        ``(best_score, best_params, classifier)``.  ``best_params`` is the
        winning value tuple, ordered like the keys of ``params``, and
        ``classifier`` is the estimator fitted on the last split of that
        combination.  If no combination scores above 0 the result is
        ``(0, {}, None)``.

    Raises
    ------
    ValueError
        For an unknown ``splittype`` or ``algo`` (previously these surfaced
        as unbound-local errors).
    """
    if splittype == 'rand':
        rs2 = cross_validation.ShuffleSplit(len(X), n_iter=k, test_size=splitfrac)
    elif splittype == 'timed':
        rs2 = cross_validation.KFold(n=len(X), n_folds=k)
    else:
        raise ValueError("Unknown splittype %r; expected 'rand' or 'timed'"
                         % (splittype,))

    # Freeze the key order once so each value in a combination can be looked
    # up by name instead of through params.keys().index(...), which only
    # works where keys() returns a list.
    names = list(params)
    best_score, best_params, classifier = 0, {}, None

    for p in product(*[params[name] for name in names]):
        if verbose:
            print("val= %s" % (p,))
        values = dict(zip(names, p))
        score = 0.0
        for train, test in rs2:
            X_train = _flatten_blocks(X[train])
            y_train = _flatten_blocks(y[train])
            X_test = _flatten_blocks(X[test])
            y_test = _flatten_blocks(y[test])
            clf = _make_classifier(algo, values)
            clf.fit(X_train, y_train)
            score += clf.score(X_test, y_test)
        score /= k
        if verbose:
            print(score)
        if score > best_score:
            best_score, best_params, classifier = score, p, clf

    return best_score, best_params, classifier
Example #14
Source File: 02_tuning.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 4 votes |
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Evaluate ``clf`` on ten shuffled 70/30 splits and report the scores.

    For every split the classifier is refit, its train and test accuracy are
    recorded, and the area under the precision/recall curve is computed from
    ``predict_proba(...)[:, 1]``.  A tab-separated summary (mean and standard
    deviation of the test accuracy and of the P/R AUC) is printed.

    Parameters
    ----------
    clf : estimator with ``fit``, ``score`` and ``predict_proba``
    X, Y : arrays indexable by index arrays
    name : str
        Label forwarded to ``plot_pr``.
    plot : bool
        When true, plot the P/R curve of the split with the median AUC.

    Returns
    -------
    tuple
        ``(mean train error, mean test error)``.
    """
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls = [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)
        precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)

    if plot:
        # Floor division keeps the index an integer under true division too.
        median = np.argsort(pr_scores)[len(pr_scores) // 2]
        # NOTE(review): `phase` is not defined in this function; it is
        # presumably a module-level variable -- confirm.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
Example #15
Source File: 03_clean.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 4 votes |
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Evaluate ``clf`` on ten shuffled 70/30 splits and report the scores.

    For every split the classifier is refit, its train and test accuracy are
    recorded, and the area under the precision/recall curve is computed from
    ``predict_proba(...)[:, 1]``.  A tab-separated summary (mean and standard
    deviation of the test accuracy and of the P/R AUC) is printed.

    Parameters
    ----------
    clf : estimator with ``fit``, ``score`` and ``predict_proba``
    X, Y : arrays indexable by index arrays
    name : str
        Label forwarded to ``plot_pr`` and ``log_false_positives``.
    plot : bool
        When true, plot the P/R curve of the split with the median AUC and
        log the false positives of that split's classifier on that split's
        test data.

    Returns
    -------
    tuple
        ``(mean train error, mean test error)``.
    """
    import copy

    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls = [], []
    # (fitted classifier snapshot, test indices) per split; only needed to
    # look up the median split afterwards, so only filled when plotting.
    snapshots = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        if plot:
            # `clf` is refit in place on every split, so storing the object
            # itself would leave every entry pointing at the last fit.
            snapshots.append((copy.deepcopy(clf), test))

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)
        precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)

    if plot:
        # Floor division keeps the index an integer under true division too.
        median = np.argsort(pr_scores)[len(pr_scores) // 2]
        # NOTE(review): `phase` is not defined in this function; it is
        # presumably a module-level variable -- confirm.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        median_clf, median_test = snapshots[median]
        log_false_positives(median_clf, X[median_test], Y[median_test], name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
Example #16
Source File: 04_sent.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 4 votes |
def train_model(clf, X, Y, name="NB ngram", plot=False):
    """Evaluate ``clf`` on ten shuffled 70/30 splits and report the scores.

    For every split the classifier is refit, its train and test accuracy are
    recorded, and the area under the precision/recall curve is computed from
    ``predict_proba(...)[:, 1]``.  A tab-separated summary (mean and standard
    deviation of the test accuracy and of the P/R AUC) is printed.

    Parameters
    ----------
    clf : estimator with ``fit``, ``score`` and ``predict_proba``
    X, Y : arrays indexable by index arrays
    name : str
        Label forwarded to ``plot_pr`` and ``log_false_positives``.
    plot : bool
        When true, plot the P/R curve of the split with the median AUC and
        log the false positives of that split's classifier on that split's
        test data.

    Returns
    -------
    tuple
        ``(mean train error, mean test error)``.
    """
    import copy

    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls = [], []
    # (fitted classifier snapshot, test indices) per split; only needed to
    # look up the median split afterwards, so only filled when plotting.
    snapshots = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        if plot:
            # `clf` is refit in place on every split, so storing the object
            # itself would leave every entry pointing at the last fit.
            snapshots.append((copy.deepcopy(clf), test))

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)
        precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)

    if plot:
        # Floor division keeps the index an integer under true division too.
        median = np.argsort(pr_scores)[len(pr_scores) // 2]
        # NOTE(review): `phase` is not defined in this function; it is
        # presumably a module-level variable -- confirm.
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        median_clf, median_test = snapshots[median]
        log_false_positives(median_clf, X[median_test], Y[median_test], name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)