Python sklearn.model_selection.PredefinedSplit() Examples
The following are 10 code examples of sklearn.model_selection.PredefinedSplit().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
Example #1
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    """Smoke test for 1d, 2d-column and multi-label ``y`` on every splitter.

    Each splitter must iterate cleanly over a 1d ``y`` and its column-vector
    form. With a multi-label ``y`` a ``ValueError`` is tolerated, provided
    its message names the supported target types.
    """
    n_samples = 30
    rng = np.random.RandomState(1)
    # Keep the draw order (X, y, y_multilabel, groups): the seeded generator
    # yields different data if the calls are reordered.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    # The expected error text is the same for every splitter; build it once.
    allowed_target_types = ('binary', 'multiclass')
    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        allowed_target_types)

    splitters = [
        LeaveOneOut(),
        LeavePOut(p=2),
        KFold(),
        StratifiedKFold(),
        RepeatedKFold(),
        RepeatedStratifiedKFold(),
        ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(),
        LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2),
        GroupKFold(),
        TimeSeriesSplit(),
        PredefinedSplit(test_fold=groups),
    ]

    for cv in splitters:
        # Exhaust the generators so that any lazy error surfaces here.
        for target in (y, y_2d):
            list(cv.split(X, target, groups))
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
Example #2
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    """Smoke test for 1d, 2d-column and multi-label ``y`` on every splitter.

    Each splitter must iterate cleanly over a 1d ``y`` and its column-vector
    form. With a multi-label ``y`` a ``ValueError`` is tolerated, provided
    its message names the supported target types.
    """
    n_samples = 30
    rng = np.random.RandomState(1)
    # Keep the draw order (X, y, y_multilabel, groups): the seeded generator
    # yields different data if the calls are reordered.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    # The expected error text is the same for every splitter; build it once.
    allowed_target_types = ('binary', 'multiclass')
    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        allowed_target_types)

    splitters = [
        LeaveOneOut(),
        LeavePOut(p=2),
        KFold(),
        StratifiedKFold(),
        RepeatedKFold(),
        RepeatedStratifiedKFold(),
        ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(),
        LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2),
        GroupKFold(),
        TimeSeriesSplit(),
        PredefinedSplit(test_fold=groups),
    ]

    for cv in splitters:
        # Exhaust the generators so that any lazy error surfaces here.
        for target in (y, y_2d):
            list(cv.split(X, target, groups))
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
Example #3
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_predefinedsplit_with_kfold_split():
    """Check that PredefinedSplit can reproduce a split generated by KFold.

    A shuffled 5-fold KFold is run once over the module-level ``X``; the
    fold number of each held-out sample is recorded and handed to
    PredefinedSplit, which must then yield the same train/test indices.
    """
    # NOTE(review): assumes the module-level X has 10 samples, matching the
    # length of the fold vector below - confirm against the test module.
    fold_of_sample = -1 * np.ones(10)

    # Materialise the KFold splits once so that the same index arrays are
    # used both to label the folds and as the reference result.
    kfold_pairs = list(KFold(5, shuffle=True).split(X))
    for fold_id, (_, held_out) in enumerate(kfold_pairs):
        fold_of_sample[held_out] = fold_id
    kf_train = [train_ind for train_ind, _ in kfold_pairs]
    kf_test = [test_ind for _, test_ind in kfold_pairs]

    ps = PredefinedSplit(fold_of_sample)
    # n_splits is simply the number of unique folds
    assert_equal(len(np.unique(fold_of_sample)), ps.get_n_splits())

    ps_pairs = list(ps.split())
    ps_train = [train_ind for train_ind, _ in ps_pairs]
    ps_test = [test_ind for _, test_ind in ps_pairs]

    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
Example #4
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_predefinedsplit_with_kfold_split():
    """Check that PredefinedSplit can reproduce a split generated by KFold.

    Every sample's held-out fold number from a shuffled 5-fold KFold is
    written into a fold vector; PredefinedSplit built from that vector must
    return the same train and test index arrays, fold by fold.
    """
    # NOTE(review): assumes the module-level X has 10 samples, matching the
    # length of the fold vector below - confirm against the test module.
    test_fold = np.full(10, -1.)

    kf_train, kf_test = [], []
    kfold_splits = KFold(5, shuffle=True).split(X)
    for fold_no, (train_idx, test_idx) in enumerate(kfold_splits):
        kf_train.append(train_idx)
        kf_test.append(test_idx)
        test_fold[test_idx] = fold_no

    predefined = PredefinedSplit(test_fold)
    # n_splits is simply the number of unique folds
    assert_equal(len(np.unique(test_fold)), predefined.get_n_splits())

    # Collect the predefined splits into parallel tuples of train / test
    # index arrays.
    ps_pairs = tuple(predefined.split())
    ps_train = tuple(pair[0] for pair in ps_pairs)
    ps_test = tuple(pair[1] for pair in ps_pairs)

    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
Example #5
Source File: test_dataset.py From skorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_predefined_split(self, cv_split_cls, data):
    """The split class honours a ``PredefinedSplit`` passed as its cv.

    The fold vector is 1 where ``data.y > 0`` and 0 elsewhere. After
    splitting, every training target must be positive and every validation
    target must equal zero.
    """
    from sklearn.model_selection import PredefinedSplit

    test_fold = (data.y > 0).astype(int)
    splitter = cv_split_cls(PredefinedSplit(test_fold))
    dataset_train, dataset_valid = splitter(data)

    # data_from_dataset returns a sequence whose second item is the target.
    y_train = data_from_dataset(dataset_train)[1]
    y_valid = data_from_dataset(dataset_valid)[1]

    assert (y_train > 0).all()
    assert (y_valid == 0).all()
Example #6
Source File: baseline.py From gpt-2-output-dataset with MIT License | 5 votes |
def main(data_dir, log_dir, source='xl-1542M-k40', n_train=500000,
         n_valid=10000, n_jobs=None, verbose=False):
    """Fit a TF-IDF + logistic-regression baseline and log its accuracy.

    The regularisation strength ``C`` is picked by a grid search over one
    predefined split: training rows are never held out and validation rows
    form the only test fold. The model is then refit with the best ``C`` on
    the training features alone and scored on the validation and test sets.
    The resulting summary is printed and written to
    ``<log_dir>/<source>.json``.

    Parameters
    ----------
    data_dir :
        Forwarded to ``load_split`` to locate the data.
    log_dir :
        Directory in which the JSON summary is written.
    source :
        Dataset name forwarded to ``load_split``; also names the JSON file.
    n_train, n_valid :
        Sample counts requested from ``load_split`` for the train and
        validation splits. ``n_train`` is also recorded in the summary.
    n_jobs, verbose :
        Forwarded to ``GridSearchCV``.
    """
    train_texts, train_labels = load_split(data_dir, source, 'train',
                                           n=n_train)
    valid_texts, valid_labels = load_split(data_dir, source, 'valid',
                                           n=n_valid)
    test_texts, test_labels = load_split(data_dir, source, 'test')

    # The vocabulary is learned on the training texts only.
    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=2**21)
    train_features = vect.fit_transform(train_texts)
    valid_features = vect.transform(valid_texts)
    test_features = vect.transform(test_texts)

    model = LogisticRegression(solver='liblinear')
    params = {'C': [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4, 8, 16, 32, 64]}

    # test_fold: -1 keeps a row out of every test fold, 0 puts it in the
    # single validation fold. The vector is sized from the rows actually
    # stacked below, so it stays consistent with the feature matrix even if
    # load_split returned a different number of samples than requested.
    split = PredefinedSplit([-1] * train_features.shape[0]
                            + [0] * valid_features.shape[0])
    # refit=False: the final model is fit on the training rows only, below.
    search = GridSearchCV(model, params, cv=split, n_jobs=n_jobs,
                          verbose=verbose, refit=False)
    search.fit(sparse.vstack([train_features, valid_features]),
               train_labels + valid_labels)

    model = model.set_params(**search.best_params_)
    model.fit(train_features, train_labels)
    valid_accuracy = model.score(valid_features, valid_labels)*100.
    test_accuracy = model.score(test_features, test_labels)*100.

    data = {
        'source': source,
        'n_train': n_train,
        'valid_accuracy': valid_accuracy,
        'test_accuracy': test_accuracy,
    }
    print(data)
    # Use a context manager so the log file is flushed and closed
    # deterministically instead of leaking the handle.
    with open(os.path.join(log_dir, f'{source}.json'), 'w') as log_file:
        json.dump(data, log_file)
Example #7
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper(self):
    """``ModelFrame.model_selection`` returns the very objects found on ``ms``.

    Each listed attribute, looked up through the accessor of an empty
    ``ModelFrame``, must be identical (``is``) to the attribute of the same
    name on the module imported as ``ms``.
    """
    df = pdml.ModelFrame([])

    # Splitter Classes
    splitter_classes = (
        'KFold',
        'GroupKFold',
        'StratifiedKFold',
        'LeaveOneGroupOut',
        'LeavePGroupsOut',
        'LeaveOneOut',
        'LeavePOut',
        'ShuffleSplit',
        'GroupShuffleSplit',
        # 'StratifiedShuffleSplit' is not checked; its assertion was
        # already disabled in this test.
        'PredefinedSplit',
        'TimeSeriesSplit',
    )

    # Splitter Functions: nothing is checked here.

    # Hyper-parameter optimizers
    optimizers = (
        'GridSearchCV',
        'RandomizedSearchCV',
        'ParameterGrid',
        'ParameterSampler',
    )

    for name in splitter_classes + optimizers:
        self.assertIs(getattr(df.model_selection, name), getattr(ms, name))

    # Model validation: nothing is checked here.
Example #8
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper_abbr(self):
    """The abbreviated ``ModelFrame.ms`` accessor returns the objects on ``ms``.

    Each listed attribute, looked up through ``df.ms`` on an empty
    ``ModelFrame``, must be identical (``is``) to the attribute of the same
    name on the module imported as ``ms``.
    """
    df = pdml.ModelFrame([])

    # Splitter Classes
    splitter_classes = (
        'KFold',
        'GroupKFold',
        'StratifiedKFold',
        'LeaveOneGroupOut',
        'LeavePGroupsOut',
        'LeaveOneOut',
        'LeavePOut',
        'ShuffleSplit',
        'GroupShuffleSplit',
        # 'StratifiedShuffleSplit' is not checked; its assertion was
        # already disabled in this test.
        'PredefinedSplit',
        'TimeSeriesSplit',
    )

    # Splitter Functions: nothing is checked here.

    # Hyper-parameter optimizers
    optimizers = (
        'GridSearchCV',
        'RandomizedSearchCV',
        'ParameterGrid',
        'ParameterSampler',
    )

    for name in splitter_classes + optimizers:
        self.assertIs(getattr(df.ms, name), getattr(ms, name))

    # Model validation: nothing is checked here.
Example #9
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_cross_validator_with_default_params():
    """Exercise the cross-validators built with (mostly) default parameters.

    For each splitter the test checks that ``get_n_splits`` returns the
    expected count, that splitting 2d and 1d ``X`` gives identical indices,
    that both the train and the test indices are integer typed, and that
    ``repr`` matches the expected string. Finally, ``LeaveOneOut`` and
    ``LeavePOut`` must raise ``ValueError`` from ``get_n_splits`` when ``X``
    is ``None``.
    """
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])

    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = no. of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
               "test_size=None, train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    cross_validators = [loo, lpo, kf, skf, lolo, lopo, ss, ps]
    expected_reprs = [loo_repr, lpo_repr, kf_repr, skf_repr,
                      lolo_repr, lopo_repr, ss_repr, ps_repr]
    n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
                         n_unique_groups, comb(n_unique_groups, p),
                         n_shuffle_splits, 2]

    for cv, cv_repr, expected_n_splits in zip(cross_validators,
                                              expected_reprs,
                                              n_splits_expected):
        # Test if get_n_splits works correctly
        assert_equal(expected_n_splits, cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))

        # Test that train, test indices returned are integers.
        # Both halves of each split are checked; previously ``train`` was
        # asserted twice and ``test`` was never inspected.
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))

    # ValueError for get_n_splits methods
    msg = "The 'X' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         loo.get_n_splits, None, y, groups)
    assert_raise_message(ValueError, msg,
                         lpo.get_n_splits, None, y, groups)
Example #10
Source File: test_split.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_cross_validator_with_default_params():
    """Exercise the cross-validators built with (mostly) default parameters.

    For each splitter the test checks that ``get_n_splits`` returns the
    expected count, that splitting 2d and 1d ``X`` gives identical indices,
    that both the train and the test indices are integer typed, and that
    ``repr`` matches the expected string. Finally, ``LeaveOneOut`` and
    ``LeavePOut`` must raise ``ValueError`` from ``get_n_splits`` when ``X``
    is ``None``.
    """
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])

    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = no. of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    # NOTE(review): the whitespace after "\n" must match repr(ShuffleSplit)
    # exactly for the installed scikit-learn version - confirm.
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
               "test_size='default',\n train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    cross_validators = [loo, lpo, kf, skf, lolo, lopo, ss, ps]
    expected_reprs = [loo_repr, lpo_repr, kf_repr, skf_repr,
                      lolo_repr, lopo_repr, ss_repr, ps_repr]
    n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
                         n_unique_groups, comb(n_unique_groups, p),
                         n_shuffle_splits, 2]

    for cv, cv_repr, expected_n_splits in zip(cross_validators,
                                              expected_reprs,
                                              n_splits_expected):
        # Test if get_n_splits works correctly
        assert_equal(expected_n_splits, cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))

        # Test that train, test indices returned are integers.
        # Both halves of each split are checked; previously ``train`` was
        # asserted twice and ``test`` was never inspected.
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))

    # ValueError for get_n_splits methods
    msg = "The 'X' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         loo.get_n_splits, None, y, groups)
    assert_raise_message(ValueError, msg,
                         lpo.get_n_splits, None, y, groups)