Python sklearn.model_selection.GroupKFold() Examples
The following are 24 code examples of sklearn.model_selection.GroupKFold(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
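Before diving into the examples, here is a minimal, self-contained sketch of typical GroupKFold usage (the toy data below is purely illustrative and not taken from any of the projects listed): each sample carries a group label, and split() guarantees that the same group never appears in both the training and the test fold.

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)        # 6 samples, 2 features (illustrative data)
y = np.array([0, 0, 1, 1, 0, 1])       # binary targets
groups = np.array([1, 1, 2, 2, 3, 3])  # 3 groups of 2 samples each

gkf = GroupKFold(n_splits=3)
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # every group ends up entirely in either the train or the test indices
    print("train:", train_idx, "test:", test_idx)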
Example #1
Source File: test_search.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example #2
Source File: test_rfe.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_rfe_cv_groups():
    generator = check_random_state(0)
    iris = load_iris()
    number_groups = 4
    groups = np.floor(np.linspace(0, number_groups, len(iris.target)))
    X = iris.data
    y = (iris.target > 0).astype(int)

    est_groups = RFECV(
        estimator=RandomForestClassifier(random_state=generator),
        step=1,
        scoring='accuracy',
        cv=GroupKFold(n_splits=2)
    )
    est_groups.fit(X, y, groups=groups)
    assert est_groups.n_features_ > 0
Example #3
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #4
Source File: test_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv)
Example #5
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv)
Example #6
Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example #7
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #8
Source File: tpot_tests.py From tpot with GNU Lesser General Public License v3.0 | 6 votes |
def test_fit_GroupKFold():
    """Assert that TPOT properly handles the group parameter when using GroupKFold."""
    # This check tests if the darker digits images would generalize to the lighter ones.
    means = np.mean(training_features, axis=1)
    groups = means >= np.median(means)

    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=2,
        offspring_size=4,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
        cv=model_selection.GroupKFold(n_splits=2),
    )
    tpot_obj.fit(training_features, training_target, groups=groups)

    assert_greater_equal(tpot_obj.score(testing_features, testing_target), 0.97)
Example #9
Source File: SupervisedClassifier.py From CDSS with GNU General Public License v3.0 | 6 votes |
def __init__(self, classes, hyperparams=None, groups=None):
    self._classes = classes

    # Initialize params.
    self._params = {}
    self._model = None
    ''' Used by GroupKFold for splitting train/validation. '''
    self._groups = groups

    # Initialize hyperparams.
    self._hyperparams = {} if hyperparams is None else hyperparams
    self._hyperparam_search_space = {}
    # Set algorithm.
    self._get_or_set_hyperparam('algorithm')
    # Set random state.
    self._get_or_set_hyperparam('random_state')
    # Set CV strategy.
    self._get_or_set_hyperparam('hyperparam_strategy')
Example #10
Source File: sklearn_test.py From keras-tuner with Apache License 2.0 | 6 votes |
def test_sklearn_cv_with_groups(tmp_dir):
    tuner = sklearn_tuner.Sklearn(
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective('score', 'max'),
            max_trials=10),
        hypermodel=build_model,
        cv=model_selection.GroupKFold(5),
        directory=tmp_dir)
    x = np.random.uniform(size=(50, 10))
    y = np.random.randint(0, 2, size=(50,))
    groups = np.random.randint(0, 5, size=(50,))
    tuner.search(x, y, groups=groups)

    assert len(tuner.oracle.trials) == 10

    best_trial = tuner.oracle.get_best_trials()[0]
    assert best_trial.status == 'COMPLETED'
    assert best_trial.score is not None
    assert best_trial.best_step == 0
    assert best_trial.metrics.exists('score')

    # Make sure best model can be reloaded.
    best_model = tuner.get_best_models()[0]
    best_model.score(x, y)
Example #11
Source File: test_run.py From nyaggle with MIT License | 5 votes |
def test_experiment_manual_cv_group(tmpdir_name):
    df1 = pd.DataFrame()
    df1['x'] = np.random.randint(0, 10, size=1000)
    df1['y'] = df1['x'] > 5
    df1['grp'] = 0

    df2 = pd.DataFrame()
    df2['x'] = np.random.randint(0, 10, size=100)
    df2['y'] = df2['x'] <= 5
    df2['grp'] = 1

    X = pd.concat([df1, df2]).reset_index(drop=True)
    y = X['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                        random_state=0)

    grp = X_train['grp']
    X_train = X_train.drop(['y', 'grp'], axis=1)
    X_test = X_test.drop(['y', 'grp'], axis=1)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name,
                            cv=GroupKFold(2), groups=grp)
    assert result.metrics[-1] < 0.7
Example #12
Source File: test_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_learning_curve_with_shuffle():
    # Following test case was designed this way to verify the code
    # changes made in pull request: #7506.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [11, 12], [13, 14],
                  [15, 16], [17, 18], [19, 20], [7, 8], [9, 10], [11, 12],
                  [13, 14], [15, 16], [17, 18]])
    y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4])
    groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4])
    # Splits on these groups fail without shuffle as the first iteration
    # of the learning curve doesn't contain label 4 in the training set.
    estimator = PassiveAggressiveClassifier(max_iter=5, tol=None,
                                            shuffle=False)

    cv = GroupKFold(n_splits=2)
    train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1,
        train_sizes=np.linspace(0.3, 1.0, 3), groups=groups,
        shuffle=True, random_state=2)
    assert_array_almost_equal(train_scores_batch.mean(axis=1),
                              np.array([0.75, 0.3, 0.36111111]))
    assert_array_almost_equal(test_scores_batch.mean(axis=1),
                              np.array([0.36111111, 0.25, 0.25]))
    assert_raises(ValueError, learning_curve, estimator, X, y, cv=cv,
                  n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
                  groups=groups)

    train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1,
        train_sizes=np.linspace(0.3, 1.0, 3), groups=groups,
        shuffle=True, random_state=2, exploit_incremental_learning=True)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))
Example #13
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups})
Example #14
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper_abbr(self):
    df = pdml.ModelFrame([])

    # Splitter Classes
    self.assertIs(df.ms.KFold, ms.KFold)
    self.assertIs(df.ms.GroupKFold, ms.GroupKFold)
    self.assertIs(df.ms.StratifiedKFold, ms.StratifiedKFold)
    self.assertIs(df.ms.LeaveOneGroupOut, ms.LeaveOneGroupOut)
    self.assertIs(df.ms.LeavePGroupsOut, ms.LeavePGroupsOut)
    self.assertIs(df.ms.LeaveOneOut, ms.LeaveOneOut)
    self.assertIs(df.ms.LeavePOut, ms.LeavePOut)
    self.assertIs(df.ms.ShuffleSplit, ms.ShuffleSplit)
    self.assertIs(df.ms.GroupShuffleSplit, ms.GroupShuffleSplit)
    # self.assertIs(df.ms.StratifiedShuffleSplit,
    #               ms.StratifiedShuffleSplit)
    self.assertIs(df.ms.PredefinedSplit, ms.PredefinedSplit)
    self.assertIs(df.ms.TimeSeriesSplit, ms.TimeSeriesSplit)

    # Splitter Functions

    # Hyper-parameter optimizers
    self.assertIs(df.ms.GridSearchCV, ms.GridSearchCV)
    self.assertIs(df.ms.RandomizedSearchCV, ms.RandomizedSearchCV)
    self.assertIs(df.ms.ParameterGrid, ms.ParameterGrid)
    self.assertIs(df.ms.ParameterSampler, ms.ParameterSampler)

    # Model validation
Example #15
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])

    # Splitter Classes
    self.assertIs(df.model_selection.KFold, ms.KFold)
    self.assertIs(df.model_selection.GroupKFold, ms.GroupKFold)
    self.assertIs(df.model_selection.StratifiedKFold, ms.StratifiedKFold)
    self.assertIs(df.model_selection.LeaveOneGroupOut, ms.LeaveOneGroupOut)
    self.assertIs(df.model_selection.LeavePGroupsOut, ms.LeavePGroupsOut)
    self.assertIs(df.model_selection.LeaveOneOut, ms.LeaveOneOut)
    self.assertIs(df.model_selection.LeavePOut, ms.LeavePOut)
    self.assertIs(df.model_selection.ShuffleSplit, ms.ShuffleSplit)
    self.assertIs(df.model_selection.GroupShuffleSplit, ms.GroupShuffleSplit)
    # self.assertIs(df.model_selection.StratifiedShuffleSplit,
    #               ms.StratifiedShuffleSplit)
    self.assertIs(df.model_selection.PredefinedSplit, ms.PredefinedSplit)
    self.assertIs(df.model_selection.TimeSeriesSplit, ms.TimeSeriesSplit)

    # Splitter Functions

    # Hyper-parameter optimizers
    self.assertIs(df.model_selection.GridSearchCV, ms.GridSearchCV)
    self.assertIs(df.model_selection.RandomizedSearchCV, ms.RandomizedSearchCV)
    self.assertIs(df.model_selection.ParameterGrid, ms.ParameterGrid)
    self.assertIs(df.model_selection.ParameterSampler, ms.ParameterSampler)

    # Model validation
Example #16
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_nsplit_default_warn():
    # Test that warnings are raised. Will be removed in 0.22
    assert_warns_message(FutureWarning, NSPLIT_WARNING, KFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, GroupKFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, StratifiedKFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, TimeSeriesSplit)
    assert_no_warnings(KFold, n_splits=5)
    assert_no_warnings(GroupKFold, n_splits=5)
    assert_no_warnings(StratifiedKFold, n_splits=5)
    assert_no_warnings(TimeSeriesSplit, n_splits=5)
Example #17
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv, error_score='raise', iid=False)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups})
Example #18
Source File: test_wrapper.py From category_encoders with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_custom_cv(self):
    x = np.array([
        ['a', 'b', 'c'],
        ['a', 'b', 'c'],
        ['a', 'b', 'c'],
        ['a', 'b', 'c'],
        ['b', 'b', 'c'],
        ['b', 'b', 'c'],
        ['b', 'b', 'b'],
        ['b', 'b', 'b'],
        ['b', 'b', 'b'],
        ['b', 'b', 'b'],
        ['a', 'b', 'a'],
        ['a', 'b', 'a'],
    ])
    groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    y = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]

    gkfold = GroupKFold(n_splits=3)
    wrapper = NestedCVWrapper(encoders.TargetEncoder(), cv=gkfold)
    result_train, result_valid = wrapper.fit_transform(x, y, X_test=x,
                                                       groups=groups)

    # We would expect result_train != result_valid since result_train has been
    # generated using nested folds and result_valid is generated by fitting the
    # encoder on all of the x & y data
    self.assertFalse(np.allclose(result_train, result_valid))
Example #19
Source File: splitters.py From fklearn with Apache License 2.0 | 4 votes |
def out_of_time_and_space_splitter(train_data: pd.DataFrame,
                                   n_splits: int,
                                   in_time_limit: DateType,
                                   time_column: str,
                                   space_column: str,
                                   holdout_gap: timedelta = timedelta(days=0)) -> SplitterReturnType:
    """
    Makes K grouped train/test split folds for cross validation.
    The folds are made so that every ID is used at least once for
    evaluating and K-1 times for training.

    Also, for each fold, evaluation will always be out-of-ID and out-of-time.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame that will be split into K out-of-time and ID folds
        for cross validation.

    n_splits : int
        The number of folds K for the K-Fold cross validation strategy.

    in_time_limit : str or datetime.datetime
        A String representing the end time of the training data.
        It should be in the same format as the Date column in `train_data`.

    time_column : str
        The name of the Date column of `train_data`.

    space_column : str
        The name of the ID column of `train_data`.

    holdout_gap: datetime.timedelta
        Timedelta of the gap between the end of the training period and the
        start of the validation period.
    """

    # first generate folds by space, using LabelKFold
    # GroupKFold is not supposed to be randomized, that's why there's no random_state here
    train_data = train_data.reset_index()
    space_folds = GroupKFold(n_splits).split(train_data, groups=train_data[space_column])

    if isinstance(in_time_limit, str):
        in_time_limit = datetime.strptime(in_time_limit, "%Y-%m-%d")

    # train_indexes have time_column <= in_time_limit
    # test_indexes have time_column > in_time_limit
    folds = pipe(space_folds,
                 partial(starmap, lambda f_train, f_test: [train_data.iloc[f_train][time_column],
                                                           train_data.iloc[f_test][time_column]]),
                 partial(starmap, lambda train, test: (train[train <= in_time_limit],  # filter train time
                                                       test[test > (in_time_limit + holdout_gap)])),  # filter test time
                 list)

    logs = list(map(_log_time_fold, folds))  # get fold logs
    folds_indexes = _lc_fold_to_indexes(folds)  # final formatting with idx

    return folds_indexes, logs
Example #20
Source File: split.py From mlcomp with Apache License 2.0 | 4 votes |
def file_group_kfold(
        n_splits: int,
        output: str = None,
        get_group=None,
        sort=False,
        must_equal=(),
        **files
):
    assert len(files) > 0, 'at lease 1 type of files is required'

    fold = GroupKFold(n_splits)
    keys = sorted(list(files))

    def get_name(file):
        return splitext(basename(file))[0]

    if sort:
        for k, v in files.items():
            files[k] = sorted(files[k], key=get_name)

    file_first = sorted(files[keys[0]])
    assert len(file_first) > n_splits, \
        f'at least {n_splits} files is required. Provided: {len(file_first)}'

    for k, v in files.items():
        assert len(files[k]) == len(file_first), \
            f'count of files in key = {k} is not the same as in {keys[0]}'

    for k, v in files.items():
        if k not in must_equal:
            continue

        for i in range(len(file_first)):
            names_equal = get_name(v[i]) == get_name(file_first[i])
            assert names_equal, \
                f'file name in {k} does not equal to {keys[0]}, ' \
                f'file name = {basename(v[i])}'

    df = pd.DataFrame(files)[keys]
    df['fold'] = 0

    groups = [
        i if not get_group else get_group(file)
        for i, file in enumerate(file_first)
    ]

    for i, (train_index, test_index) in enumerate(
            fold.split(groups, groups=groups)):
        df.loc[test_index, 'fold'] = i

    df = df.sample(frac=1)

    if output:
        df.to_csv(output, index=False)
    return df
Example #21
Source File: cross_validation.py From nltools with MIT License | 4 votes |
def set_cv(Y=None, cv_dict=None, return_generator=True):
    """ Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        Y: (pd.DataFrame) Pandas Dataframe of Y labels
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return a cv generator instead of an instance;
            default True

    Returns:
        cv: a scikit-learn model-selection generator
    """

    if isinstance(cv_dict, dict):
        if cv_dict['type'] == 'kfolds':
            if 'subject_id' in cv_dict:
                # Hold out subjects within each fold
                from sklearn.model_selection import GroupKFold
                cv_inst = GroupKFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y,
                                   groups=cv_dict['subject_id'])
            elif 'stratified' in cv_dict:
                # Stratified K-Folds Continuous
                from nltools.cross_validation import KFoldStratified
                cv_inst = KFoldStratified(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
            else:
                # Normal K-Folds
                from sklearn.model_selection import KFold
                cv_inst = KFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
        elif cv_dict['type'] == 'loso':
            # Leave One Subject Out
            from sklearn.model_selection import LeaveOneGroupOut
            cv_inst = LeaveOneGroupOut()
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y,
                               groups=cv_dict['subject_id'])
        else:
            raise ValueError("""Make sure you specify a dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout},
            where n = number of folds, and subject = vector of subject ids
            that corresponds to self.Y""")
    else:
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    if return_generator:
        return cv
    else:
        return cv_inst
Example #22
Source File: skwrapper.py From Benchmarks with MIT License | 4 votes |
def split_data(df, ycol='0', classify=False, cv=5, bins=0, cutoffs=None,
               groupcols=None, ignore_categoricals=False, verbose=True):
    if groupcols is not None:
        groups = make_group_from_columns(df, groupcols)

    cat_cols = df.select_dtypes(['object']).columns
    if ignore_categoricals:
        df[cat_cols] = 0
    else:
        df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('category').cat.codes)

    if ycol.isdigit():
        ycol = df.columns[int(ycol)]

    y = df.loc[:, ycol].values
    x = df.drop(ycol, axis=1).values
    features = df.drop(ycol, axis=1).columns.tolist()

    if verbose:
        print('Target column: {}'.format(ycol))
        print(' count = {}, uniq = {}, mean = {:.3g}, std = {:.3g}'.format(
            len(y), len(np.unique(y)), np.mean(y), np.std(y)))
        print(' min = {:.3g}, q1 = {:.3g}, median = {:.3g}, q3 = {:.3g}, max = {:.3g}'.format(
            np.min(y), np.percentile(y, 25), np.median(y),
            np.percentile(y, 75), np.max(y)))

    if not classify:
        y_even = discretize(y, bins=5, verbose=False)
    elif bins >= 2:
        y = discretize(y, bins=bins, min_count=cv, verbose=verbose)
    elif cutoffs:
        y = discretize(y, cutoffs=cutoffs, min_count=cv, verbose=verbose)
    elif df[ycol].dtype in [np.dtype('float64'), np.dtype('float32')]:
        warnings.warn('Warning: classification target is float; consider using --bins or --cutoffs')
        y = y.astype(int)

    if classify:
        mask = np.ones(len(y), dtype=bool)
        unique, counts = np.unique(y, return_counts=True)
        for v, c in zip(unique, counts):
            if c < cv:
                mask[y == v] = False
        x = x[mask]
        y = y[mask]
        removed = len(mask) - np.sum(mask)
        if removed and verbose:
            print('Removed {} rows in small classes: count < {}'.format(removed, cv))

    if groupcols is None:
        if classify:
            y_even = y
        skf = StratifiedKFold(n_splits=cv, shuffle=True)
        splits = skf.split(x, y_even)
    else:
        if classify:
            groups = groups[mask]
        gkf = GroupKFold(n_splits=cv)
        splits = gkf.split(x, y, groups)

    if verbose:
        print()

    return x, y, list(splits), features
Example #23
Source File: sklearn_utils.py From ibeis with Apache License 2.0 | 4 votes |
def temp(samples):
    from sklearn import model_selection
    from ibeis.algo.verif import sklearn_utils

    def check_balance(idxs):
        # from sklearn.utils.fixes import bincount
        print('-------')
        for count, (test, train) in enumerate(idxs):
            print('split %r' % (count))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            n_group_isect = len(groups_train.intersection(groups_test))
            y_train_freq = bincount(y.take(train))
            y_test_freq = bincount(y.take(test))
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            print('n_group_isect = %r' % (n_group_isect,))
            print('y_test_ratio = %r' % (y_test_ratio,))
            print('y_train_ratio = %r' % (y_train_ratio,))
            print('balance_error = %r' % (balance_error,))

    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    n_splits = 3
    splitter = model_selection.GroupShuffleSplit(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.GroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.StratifiedKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = sklearn_utils.StratifiedGroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)
Example #24
Source File: test_ridge.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0)
        for i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)