Python sklearn.model_selection.GroupShuffleSplit() Examples
The following are 12 code examples of sklearn.model_selection.GroupShuffleSplit(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
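For orientation before the project examples below, here is a minimal, self-contained sketch of typical GroupShuffleSplit usage; the toy X, y, and groups arrays are invented purely for illustration:

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

# Toy data: 8 samples belonging to 4 groups (e.g. 4 patients, 2 samples each).
X = np.arange(16).reshape(8, 2)
y = np.array([0, 0, 1, 1, 0, 1, 0, 1])
groups = np.array([1, 1, 2, 2, 3, 3, 4, 4])

# Hold out roughly 25% of the *groups* in each split, so that no group
# appears in both the train and the test indices.
gss = GroupShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for train_idx, test_idx in gss.split(X, y, groups=groups):
    print("train groups:", np.unique(groups[train_idx]),
          "test groups:", np.unique(groups[test_idx]))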
Example #1
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #2
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #3
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_group_shuffle_split_default_test_size(train_size, exp_train, exp_test):
    # Check that the default value has the expected behavior, i.e. 0.2 if both
    # unspecified or complement train_size unless both are specified.
    X = np.ones(10)
    y = np.ones(10)
    groups = range(10)

    X_train, X_test = next(GroupShuffleSplit(train_size=train_size)
                           .split(X, y, groups))

    assert len(X_train) == exp_train
    assert len(X_test) == exp_test
Example #4
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_group_shuffle_split():
    for groups_i in test_groups:
        X = y = np.ones(len(groups_i))
        n_splits = 6
        test_size = 1. / 3
        slo = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0)

        # Make sure the repr works
        repr(slo)

        # Test that the length is correct
        assert_equal(slo.get_n_splits(X, y, groups=groups_i), n_splits)

        l_unique = np.unique(groups_i)
        l = np.asarray(groups_i)

        for train, test in slo.split(X, y, groups=groups_i):
            # First test: no train group is in the test set and vice versa
            l_train_unique = np.unique(l[train])
            l_test_unique = np.unique(l[test])
            assert not np.any(np.in1d(l[train], l_test_unique))
            assert not np.any(np.in1d(l[test], l_train_unique))

            # Second test: train and test add up to all the data
            assert_equal(l[train].size + l[test].size, l.size)

            # Third test: train and test are disjoint
            assert_array_equal(np.intersect1d(train, test), [])

            # Fourth test:
            # unique train and test groups are correct, +- 1 for rounding error
            assert abs(len(l_test_unique) -
                       round(test_size * len(l_unique))) <= 1
            assert abs(len(l_train_unique) -
                       round((1.0 - test_size) * len(l_unique))) <= 1
Example #5
Source File: base.py From deep_pipe with MIT License | 5 votes |
def train_test_split_groups(X, *, val_size, groups=None, **kwargs):
    # Use a group-aware splitter only when group labels are provided;
    # otherwise fall back to a plain ShuffleSplit.
    split_class = (ShuffleSplit if groups is None else GroupShuffleSplit)
    split = split_class(test_size=val_size, **kwargs)
    train, val = next(split.split(X=X, groups=groups))
    return X[train], X[val]
Example #6
Source File: misc.py From open-solution-ship-detection with MIT License | 5 votes |
def train_test_split_with_empty_fraction_with_groups(df, groups, empty_fraction,
                                                     test_size, shuffle=True,
                                                     random_state=1234):
    # Group-aware split, then resample the test set so that a fixed fraction
    # of it consists of "empty" rows (is_not_empty == 0).
    cv = GroupShuffleSplit(n_splits=2, test_size=test_size, random_state=random_state)
    for train_inds, test_inds in cv.split(df.values, groups=groups.values):
        train, test = df.iloc[train_inds], df.iloc[test_inds]
        break

    empty_train, empty_test = train[train['is_not_empty'] == 0], test[test['is_not_empty'] == 0]
    non_empty_train, non_empty_test = train[train['is_not_empty'] == 1], test[test['is_not_empty'] == 1]
    test_empty_size = int(test_size * empty_fraction)
    test_non_empty_size = int(test_size * (1.0 - empty_fraction))
    empty_test = empty_test.sample(test_empty_size, random_state=random_state)
    non_empty_test = non_empty_test.sample(test_non_empty_size, random_state=random_state)

    train = pd.concat([empty_train, non_empty_train], axis=0).sample(frac=1, random_state=random_state)
    test = pd.concat([empty_test, non_empty_test], axis=0)

    if shuffle:
        train = train.sample(frac=1, random_state=random_state)
        test = test.sample(frac=1, random_state=random_state)
    return train, test
Example #7
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])

    # Splitter Classes
    self.assertIs(df.model_selection.KFold, ms.KFold)
    self.assertIs(df.model_selection.GroupKFold, ms.GroupKFold)
    self.assertIs(df.model_selection.StratifiedKFold, ms.StratifiedKFold)
    self.assertIs(df.model_selection.LeaveOneGroupOut, ms.LeaveOneGroupOut)
    self.assertIs(df.model_selection.LeavePGroupsOut, ms.LeavePGroupsOut)
    self.assertIs(df.model_selection.LeaveOneOut, ms.LeaveOneOut)
    self.assertIs(df.model_selection.LeavePOut, ms.LeavePOut)
    self.assertIs(df.model_selection.ShuffleSplit, ms.ShuffleSplit)
    self.assertIs(df.model_selection.GroupShuffleSplit,
                  ms.GroupShuffleSplit)
    # self.assertIs(df.model_selection.StratifiedShuffleSplit,
    #               ms.StratifiedShuffleSplit)
    self.assertIs(df.model_selection.PredefinedSplit, ms.PredefinedSplit)
    self.assertIs(df.model_selection.TimeSeriesSplit, ms.TimeSeriesSplit)

    # Splitter Functions

    # Hyper-parameter optimizers
    self.assertIs(df.model_selection.GridSearchCV, ms.GridSearchCV)
    self.assertIs(df.model_selection.RandomizedSearchCV,
                  ms.RandomizedSearchCV)
    self.assertIs(df.model_selection.ParameterGrid, ms.ParameterGrid)
    self.assertIs(df.model_selection.ParameterSampler, ms.ParameterSampler)

    # Model validation
Example #8
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper_abbr(self):
    df = pdml.ModelFrame([])

    # Splitter Classes
    self.assertIs(df.ms.KFold, ms.KFold)
    self.assertIs(df.ms.GroupKFold, ms.GroupKFold)
    self.assertIs(df.ms.StratifiedKFold, ms.StratifiedKFold)
    self.assertIs(df.ms.LeaveOneGroupOut, ms.LeaveOneGroupOut)
    self.assertIs(df.ms.LeavePGroupsOut, ms.LeavePGroupsOut)
    self.assertIs(df.ms.LeaveOneOut, ms.LeaveOneOut)
    self.assertIs(df.ms.LeavePOut, ms.LeavePOut)
    self.assertIs(df.ms.ShuffleSplit, ms.ShuffleSplit)
    self.assertIs(df.ms.GroupShuffleSplit, ms.GroupShuffleSplit)
    # self.assertIs(df.ms.StratifiedShuffleSplit,
    #               ms.StratifiedShuffleSplit)
    self.assertIs(df.ms.PredefinedSplit, ms.PredefinedSplit)
    self.assertIs(df.ms.TimeSeriesSplit, ms.TimeSeriesSplit)

    # Splitter Functions

    # Hyper-parameter optimizers
    self.assertIs(df.ms.GridSearchCV, ms.GridSearchCV)
    self.assertIs(df.ms.RandomizedSearchCV, ms.RandomizedSearchCV)
    self.assertIs(df.ms.ParameterGrid, ms.ParameterGrid)
    self.assertIs(df.ms.ParameterSampler, ms.ParameterSampler)

    # Model validation
Example #9
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_group_shuffle_split():
    for groups_i in test_groups:
        X = y = np.ones(len(groups_i))
        n_splits = 6
        test_size = 1. / 3
        slo = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0)

        # Make sure the repr works
        repr(slo)

        # Test that the length is correct
        assert_equal(slo.get_n_splits(X, y, groups=groups_i), n_splits)

        l_unique = np.unique(groups_i)
        l = np.asarray(groups_i)

        for train, test in slo.split(X, y, groups=groups_i):
            # First test: no train group is in the test set and vice versa
            l_train_unique = np.unique(l[train])
            l_test_unique = np.unique(l[test])
            assert_false(np.any(np.in1d(l[train], l_test_unique)))
            assert_false(np.any(np.in1d(l[test], l_train_unique)))

            # Second test: train and test add up to all the data
            assert_equal(l[train].size + l[test].size, l.size)

            # Third test: train and test are disjoint
            assert_array_equal(np.intersect1d(train, test), [])

            # Fourth test:
            # unique train and test groups are correct, +- 1 for rounding error
            assert_true(abs(len(l_test_unique) -
                            round(test_size * len(l_unique))) <= 1)
            assert_true(abs(len(l_train_unique) -
                            round((1.0 - test_size) * len(l_unique))) <= 1)
Example #10
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_train_test_default_warning():
    assert_warns(FutureWarning, ShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, GroupShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, StratifiedShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, train_test_split, range(3), train_size=0.75)
Example #11
Source File: sklearn_utils.py From ibeis with Apache License 2.0 | 4 votes |
def temp(samples):
    from sklearn import model_selection
    from ibeis.algo.verif import sklearn_utils

    def check_balance(idxs):
        # from sklearn.utils.fixes import bincount
        print('-------')
        for count, (test, train) in enumerate(idxs):
            print('split %r' % (count))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            n_group_isect = len(groups_train.intersection(groups_test))
            y_train_freq = bincount(y.take(train))
            y_test_freq = bincount(y.take(test))
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            print('n_group_isect = %r' % (n_group_isect,))
            print('y_test_ratio = %r' % (y_test_ratio,))
            print('y_train_ratio = %r' % (y_train_ratio,))
            print('balance_error = %r' % (balance_error,))

    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    n_splits = 3
    splitter = model_selection.GroupShuffleSplit(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.GroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.StratifiedKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = sklearn_utils.StratifiedGroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)
Example #12
Source File: utils.py From medleydb with MIT License | 4 votes |
def artist_conditional_split(trackid_list=None, test_size=0.15, num_splits=5,
                             random_state=None, artist_index=None):
    """Create artist-conditional train-test splits.

    The same artist (as defined by the artist_index) cannot appear in both
    the training and testing set.

    Parameters
    ----------
    trackid_list : list or None, default=None
        List of trackids to use in train-test split. If None, uses all tracks.
    test_size : float, default=0.15
        Fraction of tracks to use in the test set. The test set will be as
        close as possible in size to this value, but it may not be exact due
        to the artist-conditional constraint.
    num_splits : int, default=5
        Number of random splits to create.
    random_state : int or None, default=None
        A random state to optionally reproduce the same random split.
    artist_index : dict or None, default=None
        Dictionary mapping each track id in trackid_list to a string that
        uniquely identifies each artist. If None, uses the predefined index
        ARTIST_INDEX.

    Returns
    -------
    splits : list of dicts
        List of length num_splits of train/test split dictionaries. Each
        dictionary has the keys 'train' and 'test', each of which maps to a
        list of trackids.

    """
    if trackid_list is None:
        trackid_list = TRACK_LIST_V1

    if artist_index is None:
        artist_index = ARTIST_INDEX

    # Use the (possibly user-supplied) artist_index as the group label
    # for each track, so that GroupShuffleSplit keeps artists disjoint.
    artists = np.asarray([artist_index[trackid] for trackid in trackid_list])
    splitter = GroupShuffleSplit(n_splits=num_splits,
                                 random_state=random_state,
                                 test_size=test_size)

    trackid_array = np.array(trackid_list)
    splits = []
    for train, test in splitter.split(trackid_array, groups=artists):
        splits.append({
            'train': list(trackid_array[train]),
            'test': list(trackid_array[test])
        })

    return splits