Python sklearn.model_selection.ShuffleSplit() Examples
The following are 30 code examples of sklearn.model_selection.ShuffleSplit().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
Example #1
Source File: TargetingSystem.py From poeai with MIT License | 7 votes |
def Train(self, C, A, Y, SF):
    """Train the classifier on sample matrix A / target matrix Y and print its score.

    Args:
        C: Estimator exposing ``fit(A, Y)`` and ``predict(A)``.
        A: Sample matrix; rows are selected with integer index arrays.
        Y: Target array; predictions are collected in an object array of
            the same shape.
        SF: Scoring callable invoked as ``SF(Y, YH)``; its result is
            printed with the ``{:8.6f}`` format.
    """
    C.fit(A, Y)
    # ``np.object`` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin ``object`` yields the same dtype.
    YH = np.zeros(Y.shape, dtype=object)
    # Split up verification into chunks to prevent running out of memory.
    for i in np.array_split(np.arange(A.shape[0]), 32):
        YH[i] = C.predict(A[i])
    s1 = SF(Y, YH)
    print('All:{:8.6f}'.format(s1))
    # Disabled hold-out experiment, kept for reference only:
    # ss = ShuffleSplit(random_state = 1151)  # Use fixed state so training can be repeated later
    # trn, tst = next(ss.split(A, Y))         # Make train/test split
    # mi = [8] * 1                            # Maximum number of iterations at each iter
    # YH = np.zeros((A.shape[0]), dtype = object)
    # for mic in mi:                          # Chunk size to split dataset for CV results
    #     C.SetMaxIter(mic)                   # Set the maximum number of iterations to run
    #     C.fit(A[trn], Y[trn])               # Perform training iterations
Example #2
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    """Smoke-test every splitter with 1-D, 2-D column and multilabel targets."""
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    # Message expected from splitters that reject multilabel targets.
    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    all_splitters = [
        LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
        RepeatedKFold(), RepeatedStratifiedKFold(), ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5), GroupShuffleSplit(),
        LeaveOneGroupOut(), LeavePGroupsOut(n_groups=2), GroupKFold(),
        TimeSeriesSplit(), PredefinedSplit(test_fold=groups),
    ]

    for cv in all_splitters:
        # 1-D and single-column 2-D targets must always be accepted.
        for target in (y, y_2d):
            list(cv.split(X, target, groups))
        # Multilabel targets may be rejected, but only with the known message.
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
Example #3
Source File: expedia_dataset_reader.py From cs-ranking with Apache License 2.0 | 6 votes |
def get_single_train_test_split(self):
    """Split every entry of ``X_dict``/``Y_dict`` once and return train and test data.

    Each array in ``self.X_dict`` is split with a single ``ShuffleSplit``
    draw (``test_size=0.80``); an array with exactly one row is used as both
    train and test. The per-key pieces are stored in ``self.X_train``,
    ``self.Y_train``, ``self.X_test`` and ``self.Y_test``.

    Returns:
        ``(self.X, self.Y, self.X_test, self.Y_test)`` after sub-sampling,
        the dataset validity check and feature standardisation.
    """
    shuffle_split = ShuffleSplit(
        n_splits=1, random_state=self.random_state, test_size=0.80
    )
    index_pairs = {
        key: ([0], [0]) if data.shape[0] == 1
        else list(shuffle_split.split(data))[0]
        for key, data in self.X_dict.items()
    }

    self.X_train, self.Y_train = {}, {}
    self.X_test, self.Y_test = {}, {}
    for key, (train_rows, test_rows) in index_pairs.items():
        self.X_train[key] = np.copy(self.X_dict[key][train_rows])
        self.X_test[key] = np.copy(self.X_dict[key][test_rows])
        self.Y_train[key] = np.copy(self.Y_dict[key][train_rows])
        self.Y_test[key] = np.copy(self.Y_dict[key][test_rows])

    self.X, self.Y = self.sub_sampling_from_dictionary()
    self.__check_dataset_validity__()
    self.X, self.X_test = standardize_features(self.X, self.X_test)
    return self.X, self.Y, self.X_test, self.Y_test
Example #4
Source File: image_dataset.py From self-ensemble-visual-domain-adapt-photo with MIT License | 6 votes |
def subset_indices(d_source, d_target, subsetsize, subsetseed):
    """Pick random subsets of the source and target datasets.

    Args:
        d_source: Dataset exposing ``images`` and ``y``.
        d_target: Dataset exposing ``images``, ``has_ground_truth`` and,
            when that flag is set, ``y``.
        subsetsize: Passed as ``test_size`` to the splitters; no subsetting
            is done unless it is greater than zero.
        subsetseed: Seed for a dedicated ``RandomState``; ``0`` means use
            the ``np.random`` module itself as the random state.

    Returns:
        ``(source_indices, target_indices, n_src, n_tgt)``. Without
        subsetting both index entries are ``None`` and the counts are the
        full dataset sizes.
    """
    if not subsetsize > 0:
        return None, None, len(d_source.images), len(d_target.images)

    rng = np.random.RandomState(subsetseed) if subsetseed != 0 else np.random
    stratified = StratifiedShuffleSplit(n_splits=1, test_size=subsetsize,
                                        random_state=rng)
    plain = ShuffleSplit(n_splits=1, test_size=subsetsize, random_state=rng)

    # The "test" half of each split is the subset that is kept.
    _, source_indices = next(stratified.split(d_source.y, d_source.y))
    if d_target.has_ground_truth:
        _, target_indices = next(stratified.split(d_target.y, d_target.y))
    else:
        # No labels to stratify on, so fall back to a plain shuffle.
        _, target_indices = next(plain.split(np.arange(len(d_target.images))))

    return (source_indices, target_indices,
            source_indices.shape[0], target_indices.shape[0])
Example #5
Source File: stockpredictor.py From SyBrain with GNU General Public License v3.0 | 6 votes |
def TestPerformance(self, df=None):
    """Print 3-fold R^2 scores on all data, a held-out part and the rest.

    Args:
        df: Optional data frame; when omitted the frame stored in
            ``self.D`` is used, otherwise a copy of ``df`` is passed
            through ``self.S.transform``.
    """
    # Fall back to the currently learned frame when none is supplied.
    D = self.D if df is None else self.S.transform(df.copy())
    # Features, then targets (the second item from _ExtractTarg is unused).
    A = self._ExtractFeat(D)
    y, _ = self._ExtractTarg(D)

    splitter = ShuffleSplit(n_splits=1)
    for train_rows, test_rows in splitter.split(A):
        s1 = cross_val_score(self.R, A, y, cv=3,
                             scoring=make_scorer(r2_score))
        s2 = cross_val_score(self.R, A[test_rows], y[test_rows], cv=3,
                             scoring=make_scorer(r2_score))
        s3 = cross_val_score(self.R, A[train_rows], y[train_rows], cv=3,
                             scoring=make_scorer(r2_score))
    print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))
Example #6
Source File: test_multiclass.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_safe_split_with_precomputed_kernel():
    """Check ``_safe_split`` slices a precomputed Gram matrix consistently with raw features."""
    svc = SVC()
    svc_precomputed = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    gram = np.dot(X, X.T)

    splitter = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(splitter.split(X))[0]

    # Training part: the kernel block must equal the Gram matrix of X_train.
    X_train, y_train = _safe_split(svc, X, y, train)
    gram_train, y_train_pre = _safe_split(svc_precomputed, gram, y, train)
    assert_array_almost_equal(gram_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train_pre)

    # Test part: the kernel block is test rows against training columns.
    X_test, y_test = _safe_split(svc, X, y, test, train)
    gram_test, y_test_pre = _safe_split(svc_precomputed, gram, y, test, train)
    assert_array_almost_equal(gram_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test_pre)
Example #7
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    """Smoke-test every splitter with 1-D, 2-D column and multilabel targets."""
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    # Message expected from splitters that reject multilabel targets.
    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    all_splitters = [
        LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
        RepeatedKFold(), RepeatedStratifiedKFold(), ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5), GroupShuffleSplit(),
        LeaveOneGroupOut(), LeavePGroupsOut(n_groups=2), GroupKFold(),
        TimeSeriesSplit(), PredefinedSplit(test_fold=groups),
    ]

    for cv in all_splitters:
        # 1-D and single-column 2-D targets must always be accepted.
        for target in (y, y_2d):
            list(cv.split(X, target, groups))
        # Multilabel targets may be rejected, but only with the known message.
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
Example #8
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_safe_split_with_precomputed_kernel():
    """Check ``_safe_split`` slices a precomputed Gram matrix consistently with raw features."""
    svc = SVC()
    svc_precomputed = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    gram = np.dot(X, X.T)

    splitter = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(splitter.split(X))[0]

    # Training part: the kernel block must equal the Gram matrix of X_train.
    X_train, y_train = _safe_split(svc, X, y, train)
    gram_train, y_train_pre = _safe_split(svc_precomputed, gram, y, train)
    assert_array_almost_equal(gram_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train_pre)

    # Test part: the kernel block is test rows against training columns.
    X_test, y_test = _safe_split(svc, X, y, test, train)
    gram_test, y_test_pre = _safe_split(svc_precomputed, gram, y, test, train)
    assert_array_almost_equal(gram_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test_pre)
Example #9
Source File: active.py From chemml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _update_train_test(self):
    """Handle the ``test_type`` parameter by re-drawing the train/test partition.

    Returns ``True`` without touching anything when ``test_type`` is
    ``'passive'`` or when ``self._queries`` is non-empty. Otherwise the
    current train and test indices (and their targets) are pooled and
    re-split once with a fixed ``random_state`` of 90; nothing is returned
    in that case.
    """
    if self.test_type == 'passive' or len(self._queries) > 0:
        return True

    # Active test split: pool everything that is currently labelled.
    pooled_indices = np.concatenate(
        [self.train_indices, self.test_indices], axis=0)
    pooled_y = np.concatenate([self._Y_train, self._Y_test], axis=0)

    # Select the new test portion at random.
    splitter = ShuffleSplit(n_splits=1, test_size=self.test_size,
                            train_size=None, random_state=90)
    for train_pos, test_pos in splitter.split(pooled_indices):
        # test
        self._Y_test = pooled_y[test_pos]
        self.test_indices = pooled_indices[test_pos]
        # train
        self._Y_train = pooled_y[train_pos]
        self.train_indices = pooled_indices[train_pos]
Example #10
Source File: split.py From gumpy with MIT License | 5 votes |
def stratified_shuffle_Split(features, labels, n_splits, test_size, random_state):
    """Split features and labels with a stratified ShuffleSplit cross-validator.

    Args:
        features: Indexable sample container (indexed with integer arrays).
        labels: Indexable targets, also used for stratification.
        n_splits: Number of re-shuffling iterations to run.
        test_size: Test-set size forwarded to ``StratifiedShuffleSplit``.
        random_state: Seed or random state forwarded to the splitter.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)`` taken from the last split
        produced; earlier splits are overwritten by later ones.
    """
    # ``test_size`` is keyword-only in current scikit-learn releases, so it
    # must not be passed positionally.
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                random_state=random_state)
    for train_index, test_index in cv.split(features, labels):
        X_train = features[train_index]
        X_test = features[test_index]
        Y_train = labels[train_index]
        Y_test = labels[test_index]
    return X_train, X_test, Y_train, Y_test
Example #11
Source File: classifier_basetest.py From scikit-multilearn with BSD 2-Clause "Simplified" License | 5 votes |
def assertClassifierWorksWithCV(self, classifier):
    """Assert that ``classifier`` can be cross-validated on the dense test data.

    Running ``cross_val_score`` exercises all the nice stuff at once -
    whether the classifier is clonable, etc. One score per split is
    expected for every dataset.
    """
    n_iterations = 3
    for features, targets in self.get_multilabel_data_for_tests('dense'):
        splitter = model_selection.ShuffleSplit(
            n_splits=n_iterations, test_size=0.5, random_state=0)
        scores = model_selection.cross_val_score(
            classifier, features, y=targets, cv=splitter, scoring='accuracy')
        self.assertEqual(len(scores), n_iterations)
Example #12
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper(self):
    """Check that ``ModelFrame.model_selection`` exposes the same objects as ``ms``."""
    df = pdml.ModelFrame([])

    # Splitter Classes
    splitter_names = (
        'KFold', 'GroupKFold', 'StratifiedKFold', 'LeaveOneGroupOut',
        'LeavePGroupsOut', 'LeaveOneOut', 'LeavePOut', 'ShuffleSplit',
        'GroupShuffleSplit',
        # 'StratifiedShuffleSplit' is deliberately not checked here.
        'PredefinedSplit', 'TimeSeriesSplit',
    )
    # Hyper-parameter optimizers
    optimizer_names = (
        'GridSearchCV', 'RandomizedSearchCV', 'ParameterGrid',
        'ParameterSampler',
    )

    for attr in splitter_names + optimizer_names:
        self.assertIs(getattr(df.model_selection, attr), getattr(ms, attr))
Example #13
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper_abbr(self):
    """Check that the abbreviated ``ModelFrame.ms`` accessor exposes the same objects as ``ms``."""
    df = pdml.ModelFrame([])

    # Splitter Classes
    splitter_names = (
        'KFold', 'GroupKFold', 'StratifiedKFold', 'LeaveOneGroupOut',
        'LeavePGroupsOut', 'LeaveOneOut', 'LeavePOut', 'ShuffleSplit',
        'GroupShuffleSplit',
        # 'StratifiedShuffleSplit' is deliberately not checked here.
        'PredefinedSplit', 'TimeSeriesSplit',
    )
    # Hyper-parameter optimizers
    optimizer_names = (
        'GridSearchCV', 'RandomizedSearchCV', 'ParameterGrid',
        'ParameterSampler',
    )

    for attr in splitter_names + optimizer_names:
        self.assertIs(getattr(df.ms, attr), getattr(ms, attr))
Example #14
Source File: split.py From gumpy with MIT License | 5 votes |
def shuffle_Split(features, labels, n_splits, test_size, random_state):
    """Split features and labels with ShuffleSplit, a random permutation cross-validator.

    Args:
        features: Indexable sample container (indexed with integer arrays).
        labels: Indexable targets aligned with ``features``.
        n_splits: Number of re-shuffling iterations to run.
        test_size: Test-set size forwarded to ``ShuffleSplit``.
        random_state: Seed or random state forwarded to the splitter.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)`` taken from the last split
        produced; earlier splits are overwritten by later ones.
    """
    # ``test_size`` is keyword-only in current scikit-learn releases, so it
    # must not be passed positionally.
    cv = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                      random_state=random_state)
    for train_index, test_index in cv.split(features):
        X_train = features[train_index]
        X_test = features[test_index]
        Y_train = labels[train_index]
        Y_test = labels[test_index]
    return X_train, X_test, Y_train, Y_test
Example #15
Source File: DeepOCR.py From pythonml with MIT License | 5 votes |
def FitModel(cnnc, A, Y, T, FN):
    """Fit ``cnnc`` on a random training split, report accuracy, then refit on everything.

    Args:
        cnnc: Model exposing ``fit``, ``predict`` and ``SetMaxIter``.
        A: Sample array, indexed with integer index arrays.
        Y: Training targets aligned with ``A``.
        T: Ground-truth strings aligned with ``A``.
        FN: Per-sample names printed next to the sampled predictions.

    Returns:
        The fitted ``cnnc``.
    """
    print('Fitting model...')
    train_rows, test_rows = next(ShuffleSplit(n_splits=1).split(A))
    # Fit the network
    cnnc.fit(A[train_rows], Y[train_rows])
    # The predictions as sequences of character indices, computed in 32 chunks
    chunks = np.array_split(np.arange(A.shape[0]), 32)
    YH = np.vstack([cnnc.predict(A[rows]) for rows in chunks])
    # Convert from sequence of char indices to strings
    PS = np.array([''.join(seq) for seq in YH])
    # Compute the accuracy
    train_score = SAcc(PS[train_rows], T[train_rows])
    test_score = SAcc(PS[test_rows], T[test_rows])
    print('Train: ' + str(train_score))
    print('Test: ' + str(test_score))
    for predicted, truth, name in zip(PS, T, FN):
        # Randomly select rows to print
        if np.random.rand() > 0.99:
            print(name + ': ' + truth + ' -> ' + predicted)
    print('Fitting with CV data...')
    # Fit remainder
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc
Example #16
Source File: predict.py From NetMF with MIT License | 5 votes |
def predict_cv(X, y, train_ratio=0.2, n_splits=10, random_state=0, C=1.):
    """Log micro/macro F1 of a one-vs-rest logistic regression over random splits.

    Args:
        X: Sample array indexed with integer index arrays; ``X.shape[0]``
            is the number of samples.
        y: Targets aligned with ``X``.
        train_ratio: Share of samples used for training; the test share is
            ``1 - train_ratio``.
        n_splits: Number of shuffle splits to evaluate.
        random_state: Random state of the splitter.
        C: ``C`` parameter of ``LogisticRegression``.
    """
    micro_scores, macro_scores = [], []
    splitter = ShuffleSplit(n_splits=n_splits, test_size=1-train_ratio,
                            random_state=random_state)
    for train_rows, test_rows in splitter.split(X):
        print(train_rows.shape, test_rows.shape)
        # Sanity checks: the two parts are disjoint and cover all samples.
        assert len(set(train_rows) & set(test_rows)) == 0
        assert len(train_rows) + len(test_rows) == X.shape[0]

        base = LogisticRegression(C=C, solver="liblinear", multi_class="ovr")
        model = OneVsRestClassifier(base, n_jobs=-1)
        model.fit(X[train_rows], y[train_rows])

        y_true = y[test_rows]
        y_pred = construct_indicator(model.predict_proba(X[test_rows]), y_true)
        mi = f1_score(y_true, y_pred, average="micro")
        ma = f1_score(y_true, y_pred, average="macro")
        logger.info("micro f1 %f macro f1 %f", mi, ma)
        micro_scores.append(mi)
        macro_scores.append(ma)

    logger.info("%d fold validation, training ratio %f",
                len(micro_scores), train_ratio)
    logger.info("Average micro %.2f, Average macro %.2f",
                np.mean(micro_scores) * 100, np.mean(macro_scores) * 100)
Example #17
Source File: acp.py From nonconformist with MIT License | 5 votes |
def gen_samples(self, y, n_samples, problem_type):
    """Yield ``(train, cal)`` index pairs for ``n_samples`` random splits.

    Classification problems are split with stratification on ``y``; any
    other ``problem_type`` uses a plain shuffle split. ``self.cal_portion``
    is the share assigned to the second (calibration) part.
    """
    # The splitters only need the number of samples, so a dummy X suffices.
    placeholder = np.zeros((y.size, 1))
    if problem_type == 'classification':
        splitter = StratifiedShuffleSplit(
            n_splits=n_samples, test_size=self.cal_portion)
        pairs = splitter.split(placeholder, y)
    else:
        splitter = ShuffleSplit(
            n_splits=n_samples, test_size=self.cal_portion)
        pairs = splitter.split(placeholder)

    for train, cal in pairs:
        yield train, cal
Example #18
Source File: Stacking.py From Kaggle-Competition-Sberbank with MIT License | 5 votes |
def fit_predict(self, trainDf, testDf):
    """Fit the base models out-of-fold, fit the stacker and predict the test set.

    Args:
        trainDf: Training frame containing the feature columns plus the
            ``price_doc`` target and the ``w`` sample-weight column.
        testDf: Test frame with the feature columns.

    Returns:
        Predictions of ``self.stacker`` for the test set.

    Side effects:
        Stores ``self.S_train``, ``self.S_test``, ``self.y`` (for diagnosis)
        and ``self.corr`` (correlation between base-model predictions and
        the target).
    """
    # ``axis`` must be passed by keyword: pandas 2.x no longer accepts it
    # positionally for ``drop`` / ``concat``.
    features = trainDf.drop(['price_doc', 'w'], axis=1)
    X_raw = features.values
    X_fillna = features.fillna(-999).values
    y = trainDf['price_doc'].values
    w = trainDf['w'].values
    T_raw = testDf.values
    T_fillna = testDf.fillna(-999).values

    # NOTE(review): this is the legacy ``sklearn.cross_validation.KFold``
    # call signature; confirm which KFold the module imports.
    folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True))

    S_train = np.zeros((X_raw.shape[0], len(self.base_models)))
    S_test = np.zeros((T_raw.shape[0], len(self.base_models)))

    for i, clf in enumerate(self.base_models):
        print('Training base model ' + str(i+1) + '...')
        # sklearn models cannot handle missing values, so they get the
        # filled matrices. The choice is made per model: previously the
        # filled data overwrote X/T for good, so any xgb/lgb model placed
        # after an sklearn model silently received the filled data too.
        if clf in [xgb1, lgb1]:
            X, T = X_raw, T_raw
        else:
            X, T = X_fillna, T_fillna

        S_test_i = np.zeros((T.shape[0], len(folds)))
        for j, (train_idx, test_idx) in enumerate(folds):
            print('Training round ' + str(j+1) + '...')
            clf.fit(X[train_idx], y[train_idx], w[train_idx])
            # Out-of-fold predictions become the stacker's training input.
            S_train[test_idx, i] = clf.predict(X[test_idx])
            S_test_i[:, j] = clf.predict(T)
        # Average the per-fold test predictions of this base model.
        S_test[:, i] = S_test_i.mean(1)

    self.S_train, self.S_test, self.y = S_train, S_test, y  # for diagnosis purpose
    # Correlation of predictions by different models.
    self.corr = pd.concat(
        [pd.DataFrame(S_train), trainDf['price_doc']], axis=1).corr()

    # Disabled CV estimate of the stacker, kept for reference:
    # cv_stack = ShuffleSplit(n_splits=6, test_size=0.2)
    # score_stacking = cross_val_score(self.stacker, S_train, y, cv=cv_stack,
    #                                  n_jobs=1, scoring='neg_mean_squared_error')
    # print(np.sqrt(-score_stacking.mean()))  # CV result of stacking

    self.stacker.fit(S_train, y)
    y_pred = self.stacker.predict(S_test)
    return y_pred
Example #19
Source File: sushi_discrete_choice_dataset_reader.py From cs-ranking with Apache License 2.0 | 5 votes |
def get_single_train_test_split(self):
    """Return the first item ``self.splitter`` produces for one random split.

    The split is a single ``ShuffleSplit`` draw over ``self.X`` with
    ``test_size=0.20`` and the reader's ``random_state``.
    """
    shuffle_split = ShuffleSplit(
        n_splits=1, random_state=self.random_state, test_size=0.20
    )
    index_pairs = list(shuffle_split.split(self.X))
    prepared = list(self.splitter(index_pairs))
    return prepared[0]
Example #20
Source File: survey_dataset_reader.py From cs-ranking with Apache License 2.0 | 5 votes |
def get_single_train_test_split(self):
    """Draw one random split of ``self.X``/``self.Y`` with ``test_size=0.3``.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)``.
    """
    shuffle_split = ShuffleSplit(
        n_splits=1, test_size=0.3, random_state=self.random_state
    )
    train_rows, test_rows = list(shuffle_split.split(self.X))[0]
    return (
        self.X[train_rows],
        self.Y[train_rows],
        self.X[test_rows],
        self.Y[test_rows],
    )
Example #21
Source File: intelligent_system_group_dataset_reader.py From cs-ranking with Apache License 2.0 | 5 votes |
def get_single_train_test_split(self, name="cold"):
    """Draw one random split of the ``name`` entry of ``self.X``/``self.Y``.

    Args:
        name: Key selecting which entry of ``self.X`` and ``self.Y`` to
            split (``test_size=0.3``).

    Returns:
        ``(X_train, Y_train, X_test, Y_test)`` for that entry.
    """
    shuffle_split = ShuffleSplit(
        n_splits=1, test_size=0.3, random_state=self.random_state
    )
    train_rows, test_rows = list(shuffle_split.split(self.X[name]))[0]
    x_train = self.X[name][train_rows]
    y_train = self.Y[name][train_rows]
    x_test = self.X[name][test_rows]
    y_test = self.Y[name][test_rows]
    return x_train, y_train, x_test, y_test
Example #22
Source File: sushi_dyad_ranking_dataset_reader.py From cs-ranking with Apache License 2.0 | 5 votes |
def get_single_train_test_split(self):
    """Return the first item ``self.splitter`` produces for one random split.

    The split is a single ``ShuffleSplit`` draw over ``self.X`` with
    ``test_size=0.30`` and the reader's ``random_state``.
    """
    shuffle_split = ShuffleSplit(
        n_splits=1, random_state=self.random_state, test_size=0.30
    )
    index_pairs = list(shuffle_split.split(self.X))
    prepared = list(self.splitter(index_pairs))
    return prepared[0]
Example #23
Source File: sushi_object_ranking_dataset_reader.py From cs-ranking with Apache License 2.0 | 5 votes |
def get_single_train_test_split(self):
    """Return the first item ``self.splitter`` produces for one random split.

    The split is a single ``ShuffleSplit`` draw over ``self.X`` with
    ``test_size=0.30`` and the reader's ``random_state``.
    """
    shuffle_split = ShuffleSplit(
        n_splits=1, random_state=self.random_state, test_size=0.30
    )
    index_pairs = list(shuffle_split.split(self.X))
    prepared = list(self.splitter(index_pairs))
    return prepared[0]
Example #24
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_shuffle_split():
    """Check that equivalent float and integer ``test_size`` values give identical splits."""
    def splits_for(test_size):
        # Same seed every time, so only the test_size spelling differs.
        return ShuffleSplit(test_size=test_size, random_state=0).split(X)

    ss1 = splits_for(0.2)
    ss2 = splits_for(2)
    ss3 = splits_for(np.int32(2))
    for typ in six.integer_types:
        ss4 = splits_for(typ(2))

    for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
        # part 0 is the train indices, part 1 the test indices
        for part in (0, 1):
            assert_array_equal(t1[part], t2[part])
            assert_array_equal(t2[part], t3[part])
            assert_array_equal(t3[part], t4[part])
Example #25
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_shufflesplit_errors():
    """Check that invalid ``test_size``/``train_size`` combinations raise ``ValueError``."""
    # When the {test|train}_size is a float/invalid, error is raised at init
    rejected_at_init = (
        dict(test_size=None, train_size=None),
        dict(test_size=2.0),
        dict(test_size=1.0),
        dict(test_size=0.1, train_size=0.95),
        dict(train_size=1j),
    )
    for kwargs in rejected_at_init:
        assert_raises(ValueError, ShuffleSplit, **kwargs)

    # When the {test|train}_size is an int, validation is based on the input X
    # and happens at split(...)
    rejected_at_split = (
        dict(test_size=11),
        dict(test_size=10),
        dict(test_size=8, train_size=3),
    )
    for kwargs in rejected_at_split:
        assert_raises(ValueError, next, ShuffleSplit(**kwargs).split(X))
Example #26
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_train_test_default_warning():
    """Check that giving only ``train_size`` triggers a ``FutureWarning``."""
    for splitter_cls in (ShuffleSplit, GroupShuffleSplit,
                         StratifiedShuffleSplit):
        assert_warns(FutureWarning, splitter_cls, train_size=0.75)
    assert_warns(FutureWarning, train_test_split, range(3), train_size=0.75)
Example #27
Source File: estimator_utils.py From EDeN with MIT License | 5 votes |
def estimate_predictive_performance(x_y, estimator=None, n_splits=10, random_state=1):
    """Return average-precision scores of ``estimator`` over random splits.

    Args:
        x_y: Pair ``(x, y)`` of samples and targets.
        estimator: Estimator handed to ``cross_val_score``.
        n_splits: Number of shuffle splits (each with ``test_size=0.3``).
        random_state: Random state of the splitter.

    Returns:
        The scores returned by ``cross_val_score``.
    """
    features, targets = x_y
    return cross_val_score(
        estimator,
        features,
        targets,
        cv=ShuffleSplit(n_splits=n_splits, test_size=0.3,
                        random_state=random_state),
        scoring=make_scorer(average_precision_score),
    )
Example #28
Source File: base.py From deep_pipe with MIT License | 5 votes |
def train_test_split_groups(X, *, val_size, groups=None, **kwargs):
    """Split ``X`` into train and validation parts, optionally keeping groups together.

    Args:
        X: Indexable data (indexed with integer arrays).
        val_size: Validation size, forwarded as ``test_size``.
        groups: Optional group labels; when given ``GroupShuffleSplit`` is
            used instead of ``ShuffleSplit``.
        **kwargs: Extra keyword arguments for the splitter constructor.

    Returns:
        ``(X[train], X[val])`` from the first split drawn.
    """
    if groups is None:
        splitter_cls = ShuffleSplit
    else:
        splitter_cls = GroupShuffleSplit
    splitter = splitter_cls(test_size=val_size, **kwargs)
    train_rows, val_rows = next(splitter.split(X=X, groups=groups))
    return X[train_rows], X[val_rows]
Example #29
Source File: core.py From HungaBunga with MIT License | 5 votes |
def cv_reg(x, test_size=0.2, n_splits=5, random_state=None):
    """Return the split generator used for cross-validating regressors.

    Args:
        x: Data handed to ``split``.
        test_size: Test-set size for each split.
        n_splits: Number of splits to generate.
        random_state: Seed or random state for the splitter.

    Returns:
        The iterator returned by ``ss(...).split(x)``.
    """
    # NOTE(review): ``ss`` is presumably an alias of
    # sklearn.model_selection.ShuffleSplit - confirm against the imports.
    # ``test_size`` is keyword-only there in current scikit-learn releases,
    # so every argument is passed by name.
    return ss(n_splits=n_splits, test_size=test_size,
              random_state=random_state).split(x)
Example #30
Source File: test_model_selection.py From verde with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_cross_val_score_client(trend):
    "Test the deprecated dask Client interface"
    coords, data = trend[:2]
    model = Trend(degree=1)
    nsplits = 5
    cross_validator = ShuffleSplit(n_splits=nsplits, random_state=0)
    client = Client(processes=False)
    # Close the client even when scoring or gathering a future fails, so a
    # failing test does not leave the client open.
    try:
        futures = cross_val_score(
            model, coords, data, cv=cross_validator, client=client
        )
        scores = [future.result() for future in futures]
    finally:
        client.close()
    # One score per split, all expected to be 1.
    assert len(scores) == nsplits
    npt.assert_allclose(scores, 1)