Python sklearn.model_selection.LeaveOneGroupOut() Examples
The following are 23 code examples of sklearn.model_selection.LeaveOneGroupOut().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
Example #1
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    # Smoke test: every splitter must accept a 1d target and a 2d
    # single-column target; a multi-label target may only be rejected with
    # the "Supported target types" ValueError.
    n_samples = 30
    rng = np.random.RandomState(1)
    # The draw order below is kept as-is because each call advances `rng`.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    expected_error = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    cv_objects = (
        LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
        RepeatedKFold(), RepeatedStratifiedKFold(), ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5), GroupShuffleSplit(),
        LeaveOneGroupOut(), LeavePGroupsOut(n_groups=2), GroupKFold(),
        TimeSeriesSplit(), PredefinedSplit(test_fold=groups),
    )

    for cv in cv_objects:
        for target in (y, y_2d):
            list(cv.split(X, target, groups))
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_error in str(err)
Example #2
Source File: test_search.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_grid_search_groups():
    # Group-aware splitters must make GridSearchCV.fit raise ValueError when
    # no groups are given and must fit once groups are supplied; splitters
    # that do not use groups must fit without them.
    rng = np.random.RandomState(0)
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}
    missing_groups_msg = "The 'groups' parameter should not be None."

    group_splitters = (LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                       GroupShuffleSplit())
    for group_cv in group_splitters:
        search = GridSearchCV(estimator, param_grid, cv=group_cv)
        assert_raise_message(ValueError, missing_groups_msg, search.fit, X, y)
        search.fit(X, y, groups=groups)

    plain_splitters = (StratifiedKFold(), StratifiedShuffleSplit())
    for plain_cv in plain_splitters:
        search = GridSearchCV(estimator, param_grid, cv=plain_cv)
        # Should not raise an error
        search.fit(X, y)
Example #3
Source File: test_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_predict_groups():
    # With a group-aware splitter and no groups argument, both
    # cross_val_score and cross_val_predict must raise the
    # "groups should not be None" ValueError.
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
    clf = SVC(kernel="linear")
    missing_groups_msg = "The 'groups' parameter should not be None."

    group_splitters = (LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                       GroupShuffleSplit())
    for group_cv in group_splitters:
        for validate in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError, missing_groups_msg, validate,
                                 estimator=clf, X=X, y=y, cv=group_cv)
Example #4
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
    # Splitting must fail with a descriptive ValueError when the data is
    # empty or holds too few unique groups for the splitter.
    empty = np.ones(0)
    assert_raise_message(ValueError, "Found array with 0 sample(s)", next,
                         LeaveOneGroupOut().split(empty, empty, empty))

    one_group = np.ones(1)
    logo_msg = ("The groups parameter contains fewer than 2 unique groups ({}). "
                "LeaveOneGroupOut expects at least 2.").format(one_group)
    assert_raise_message(ValueError, logo_msg, next,
                         LeaveOneGroupOut().split(one_group, one_group,
                                                  one_group))

    lpgo_template = (
        "The groups parameter contains fewer than (or equal to) n_groups "
        "(3) numbers of unique groups ({}). LeavePGroupsOut expects "
        "that at least n_groups + 1 (4) unique groups "
        "be present")
    # One unique group, then exactly n_groups unique groups: both too few.
    for data in (np.ones(1), np.arange(3)):
        assert_raise_message(ValueError, lpgo_template.format(data), next,
                             LeavePGroupsOut(n_groups=3).split(data, data,
                                                               data))
Example #5
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_leave_group_out_changing_groups():
    # Build two split generators per splitter, overwrite `groups_changing`,
    # then check that paired generators yield identical train/test indices.
    groups = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    X = np.ones(len(groups))
    groups_changing = np.array(groups, copy=True)

    # NOTE(review): every generator below receives `groups`; the array that
    # is mutated (`groups_changing`) is never handed to a splitter.
    logo_pair = (LeaveOneGroupOut().split(X, groups=groups),
                 LeaveOneGroupOut().split(X, groups=groups))
    lpgo_pair = (LeavePGroupsOut(n_groups=2).split(X, groups=groups),
                 LeavePGroupsOut(n_groups=2).split(X, groups=groups))
    groups_changing[:] = 0

    for reference, candidate in (logo_pair, lpgo_pair):
        for (train, test), (train_chan, test_chan) in zip(reference,
                                                          candidate):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)

    # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3
    lpgo_n_splits = LeavePGroupsOut(n_groups=2).get_n_splits(
        X, y=X, groups=groups)
    assert_equal(3, lpgo_n_splits)
    # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups)
    logo_n_splits = LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups)
    assert_equal(3, logo_n_splits)
Example #6
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    # Smoke test: every splitter must accept a 1d target and a 2d
    # single-column target; a multi-label target may only be rejected with
    # the "Supported target types" ValueError.
    n_samples = 30
    rng = np.random.RandomState(1)
    # The draw order below is kept as-is because each call advances `rng`.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    expected_error = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    cv_objects = (
        LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
        RepeatedKFold(), RepeatedStratifiedKFold(), ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5), GroupShuffleSplit(),
        LeaveOneGroupOut(), LeavePGroupsOut(n_groups=2), GroupKFold(),
        TimeSeriesSplit(), PredefinedSplit(test_fold=groups),
    )

    for cv in cv_objects:
        for target in (y, y_2d):
            list(cv.split(X, target, groups))
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_error in str(err)
Example #7
Source File: eval_train_test.py From fanci with GNU General Public License v3.0 | 6 votes |
def logo_cv(clf_type, data_sets: [GroupedDataSet], n_jobs=-1, parallel_verbose=1, persist=True):
    """
    Run leave-one-group-out cross validation over several data sets in parallel.

    Every (data set, fold) pair becomes one ``_fit_and_score`` job; the
    collected results are handed to ``_serialize_cv_results``.

    :param clf_type: forwarded to ``_grouped_data_sets_generator`` and used in the result file name
    :param data_sets: grouped data sets to evaluate
    :param n_jobs: passed to ``Parallel``
    :param parallel_verbose: verbosity passed to ``Parallel``
    :param persist: forwarded to ``_serialize_cv_results``
    :return: whatever ``_serialize_cv_results`` returns
    """
    n_sets = len(data_sets)
    log.info('Starting leave on group out cv for {!s} sets'.format(n_sets))

    runner = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)
    splitter = LeaveOneGroupOut()

    def _jobs():
        # One delayed job per fold of every data set.
        for domains, labels, groups, data_set_id, clf in _grouped_data_sets_generator(data_sets, clf_type):
            for train_index, test_index in splitter.split(domains, labels, groups=groups):
                yield delayed(_fit_and_score)(clf, domains, labels, train_index, test_index, -1, data_set_id, -1)

    stats_list = runner(_jobs())

    file_name = 'logo_cv_{!s}_{!s}sets_{!s}.pkl'.format(clf_type, n_sets, settings.NOW_STR)
    where = settings.EVAL_FOLDER + '/' + file_name
    return _serialize_cv_results(stats_list, persist, where)
Example #8
Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_grid_search_groups():
    # Group-aware splitters must make GridSearchCV.fit raise ValueError when
    # no groups are given and must fit once groups are supplied; splitters
    # that do not use groups must fit without them.
    rng = np.random.RandomState(0)
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}
    missing_groups_msg = "The 'groups' parameter should not be None."

    group_splitters = (LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                       GroupShuffleSplit())
    for group_cv in group_splitters:
        search = GridSearchCV(estimator, param_grid, cv=group_cv)
        assert_raise_message(ValueError, missing_groups_msg, search.fit, X, y)
        search.fit(X, y, groups=groups)

    plain_splitters = (StratifiedKFold(), StratifiedShuffleSplit())
    for plain_cv in plain_splitters:
        search = GridSearchCV(estimator, param_grid, cv=plain_cv)
        # Should not raise an error
        search.fit(X, y)
Example #9
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cross_val_score_predict_groups():
    # With a group-aware splitter and no groups argument, both
    # cross_val_score and cross_val_predict must raise the
    # "groups should not be None" ValueError.
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
    clf = SVC(kernel="linear")
    missing_groups_msg = "The 'groups' parameter should not be None."

    group_splitters = (LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                       GroupShuffleSplit())
    for group_cv in group_splitters:
        for validate in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError, missing_groups_msg, validate,
                                 estimator=clf, X=X, y=y, cv=group_cv)
Example #10
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
    # Splitting must fail with a descriptive ValueError when the data is
    # empty or holds too few unique groups for the splitter.
    empty = np.ones(0)
    assert_raise_message(ValueError, "Found array with 0 sample(s)", next,
                         LeaveOneGroupOut().split(empty, empty, empty))

    one_group = np.ones(1)
    logo_msg = ("The groups parameter contains fewer than 2 unique groups ({}). "
                "LeaveOneGroupOut expects at least 2.").format(one_group)
    assert_raise_message(ValueError, logo_msg, next,
                         LeaveOneGroupOut().split(one_group, one_group,
                                                  one_group))

    lpgo_template = (
        "The groups parameter contains fewer than (or equal to) n_groups "
        "(3) numbers of unique groups ({}). LeavePGroupsOut expects "
        "that at least n_groups + 1 (4) unique groups "
        "be present")
    # One unique group, then exactly n_groups unique groups: both too few.
    for data in (np.ones(1), np.arange(3)):
        assert_raise_message(ValueError, lpgo_template.format(data), next,
                             LeavePGroupsOut(n_groups=3).split(data, data,
                                                               data))
Example #11
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_leave_group_out_changing_groups():
    # Build two split generators per splitter, overwrite `groups_changing`,
    # then check that paired generators yield identical train/test indices.
    groups = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    X = np.ones(len(groups))
    groups_changing = np.array(groups, copy=True)

    # NOTE(review): every generator below receives `groups`; the array that
    # is mutated (`groups_changing`) is never handed to a splitter.
    logo_pair = (LeaveOneGroupOut().split(X, groups=groups),
                 LeaveOneGroupOut().split(X, groups=groups))
    lpgo_pair = (LeavePGroupsOut(n_groups=2).split(X, groups=groups),
                 LeavePGroupsOut(n_groups=2).split(X, groups=groups))
    groups_changing[:] = 0

    for reference, candidate in (logo_pair, lpgo_pair):
        for (train, test), (train_chan, test_chan) in zip(reference,
                                                          candidate):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)

    # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3
    lpgo_n_splits = LeavePGroupsOut(n_groups=2).get_n_splits(
        X, y=X, groups=groups)
    assert_equal(3, lpgo_n_splits)
    # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups)
    logo_n_splits = LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups)
    assert_equal(3, logo_n_splits)
Example #12
Source File: eval_train_test.py From fanci with GNU General Public License v3.0 | 5 votes |
def leave_one_group_out_deprecated(clf, data_set: GroupedDataSet, n_jobs=8):
    """
    Score ``clf`` on one grouped data set with leave-one-group-out cross validation.

    Features are extracted from the expanded data set; when ``clf`` is an
    ``SVC`` the feature matrix is standardized first.

    :param clf: estimator handed to ``cross_val_score``
    :param data_set: grouped data set; ``expand()`` yields domains, labels and groups
    :param n_jobs: number of parallel jobs for ``cross_val_score``
    :return: the scores returned by ``cross_val_score``
    """
    log.info('Starting leave on group out cv.')
    logo = LeaveOneGroupOut()
    domains, labels, groups = data_set.expand()
    log.info('Set dimensions: {!s} x {!s} x {!s}'.format(len(domains), len(labels), len(groups)))

    log.info('Starting feature extraction.')
    feature_matrix = extract_all_features(domains)
    if isinstance(clf, SVC):
        std_scale = preprocessing.StandardScaler()
        feature_matrix = std_scale.fit_transform(feature_matrix)
    log.info('Feature extraction finished.')

    # `groups` is passed by keyword: in current scikit-learn every
    # cross_val_score argument after `y` is keyword-only, so the former
    # positional form fails there. The keyword form works on old releases too.
    scores = cross_val_score(clf, feature_matrix, labels, groups=groups, cv=logo,
                             scoring=stats_metrics.multi_scorer_gridsearch,
                             n_jobs=n_jobs, verbose=2)
    return scores
Example #13
Source File: evaluation.py From cddd with MIT License | 5 votes |
def qsar_classification(emb, groups, labels):
    """Fit and score an RBF-kernel SVC with leave-one-group-out
    cross-validation on the given embedding.

    Args:
        emb: Embedding (molecular descriptor) used as input for the SVM;
            indexed with the fold index arrays.
        groups: Per-sample group labels defining the cross-validation folds.
        labels: Target values of the QSAR task; indexed with the fold index
            arrays.
    Returns:
        The mean over all folds of accuracy, F1-score, ROC-AUC and
        precision-recall-AUC, in that order.
    """
    fold_scores = {'acc': [], 'f1': [], 'roc_auc': [], 'pr_auc': []}
    splitter = LeaveOneGroupOut()
    clf = SVC(kernel='rbf', C=5.0, probability=True)

    for train_index, test_index in splitter.split(emb, groups=groups):
        clf.fit(emb[train_index], labels[train_index])
        held_out = emb[test_index]
        y_pred = clf.predict(held_out)
        # Column 1 of predict_proba is used as the score for the curves.
        y_pred_prob = clf.predict_proba(held_out)[:, 1]
        y_true = labels[test_index]
        precision, recall, _ = precision_recall_curve(y_true, y_pred_prob)

        fold_scores['acc'].append(accuracy_score(y_true, y_pred))
        fold_scores['f1'].append(f1_score(y_true, y_pred))
        fold_scores['roc_auc'].append(roc_auc_score(y_true, y_pred_prob))
        fold_scores['pr_auc'].append(auc(recall, precision))

    return (np.mean(fold_scores['acc']),
            np.mean(fold_scores['f1']),
            np.mean(fold_scores['roc_auc']),
            np.mean(fold_scores['pr_auc']))
Example #14
Source File: evaluation.py From cddd with MIT License | 5 votes |
def qsar_regression(emb, groups, labels):
    """Fit and score an RBF-kernel SVR with leave-one-group-out
    cross-validation on the given embedding.

    Args:
        emb: Embedding (molecular descriptor) used as input for the SVM;
            indexed with the fold index arrays.
        groups: Per-sample group labels defining the cross-validation folds.
        labels: Target values of the QSAR task; indexed with the fold index
            arrays.
    Returns:
        The mean over all folds of the R2 score, the Spearman correlation,
        the mean squared error and the mean absolute error, in that order.
    """
    fold_scores = {'r2': [], 'r': [], 'mse': [], 'mae': []}
    splitter = LeaveOneGroupOut()
    clf = SVR(kernel='rbf', C=5.0)

    for train_index, test_index in splitter.split(emb, groups=groups):
        clf.fit(emb[train_index], labels[train_index])
        y_pred = clf.predict(emb[test_index])
        y_true = labels[test_index]

        fold_scores['r2'].append(r2_score(y_true, y_pred))
        # spearmanr returns (correlation, p-value); only the first is kept.
        fold_scores['r'].append(spearmanr(y_true, y_pred)[0])
        fold_scores['mse'].append(mean_squared_error(y_true, y_pred))
        fold_scores['mae'].append(mean_absolute_error(y_true, y_pred))

    return (np.mean(fold_scores['r2']),
            np.mean(fold_scores['r']),
            np.mean(fold_scores['mse']),
            np.mean(fold_scores['mae']))
Example #15
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper(self):
    # Every name listed below must resolve, through the ModelFrame
    # `model_selection` accessor, to the very same object as on `ms`.
    df = pdml.ModelFrame([])

    # Splitter Classes
    # ('StratifiedShuffleSplit' is intentionally left out, as in the
    # original where its check was commented out.)
    splitter_names = (
        'KFold', 'GroupKFold', 'StratifiedKFold',
        'LeaveOneGroupOut', 'LeavePGroupsOut',
        'LeaveOneOut', 'LeavePOut',
        'ShuffleSplit', 'GroupShuffleSplit',
        'PredefinedSplit', 'TimeSeriesSplit',
    )
    for attr in splitter_names:
        self.assertIs(getattr(df.model_selection, attr), getattr(ms, attr))

    # Splitter Functions

    # Hyper-parameter optimizers
    optimizer_names = ('GridSearchCV', 'RandomizedSearchCV',
                       'ParameterGrid', 'ParameterSampler')
    for attr in optimizer_names:
        self.assertIs(getattr(df.model_selection, attr), getattr(ms, attr))

    # Model validation
Example #16
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper_abbr(self):
    # Every name listed below must resolve, through the abbreviated
    # ModelFrame `ms` accessor, to the very same object as on `ms`.
    df = pdml.ModelFrame([])

    # Splitter Classes
    # ('StratifiedShuffleSplit' is intentionally left out, as in the
    # original where its check was commented out.)
    splitter_names = (
        'KFold', 'GroupKFold', 'StratifiedKFold',
        'LeaveOneGroupOut', 'LeavePGroupsOut',
        'LeaveOneOut', 'LeavePOut',
        'ShuffleSplit', 'GroupShuffleSplit',
        'PredefinedSplit', 'TimeSeriesSplit',
    )
    for attr in splitter_names:
        self.assertIs(getattr(df.ms, attr), getattr(ms, attr))

    # Splitter Functions

    # Hyper-parameter optimizers
    optimizer_names = ('GridSearchCV', 'RandomizedSearchCV',
                       'ParameterGrid', 'ParameterSampler')
    for attr in optimizer_names:
        self.assertIs(getattr(df.ms, attr), getattr(ms, attr))

    # Model validation
Example #17
Source File: mastml_driver.py From MAST-ML with MIT License | 5 votes |
def _instantiate(kwargs_dict, name_to_constructor, category, X_grouped=None, X_indices=None):
    """
    Build one object per entry of kwargs_dict via name_to_constructor.

    Returns a list of [long_name, instance] pairs. Entries whose long name
    contains 'KerasRegressor' are skipped. For 'RFECV' with a
    LeaveOneGroupOut 'cv' and a non-None X_grouped, the 'cv' kwarg is replaced
    by precomputed (train, test) index pairs before construction.

    Raises utils.InvalidConfParameters when construction hits a TypeError and
    utils.InvalidConfSubSection when a KeyError is raised (e.g. unknown name).
    """
    # NOTE(review): the source this was rebuilt from had lost its indentation;
    # the layout assumes 'RFECV' entries are always instantiated, whether or
    # not the cv substitution applies -- confirm against the upstream file.
    instantiations = []
    for long_name, (name, kwargs) in kwargs_dict.items():
        log.debug(f'instantiation: {long_name}, {name}({kwargs})')
        try:
            # Keras models are built later from the raw dict, so no instance
            # is created here.
            if 'KerasRegressor' in long_name:
                continue

            # RFECV with LeaveOneGroupOut needs explicit folds.
            wants_group_folds = (
                name == 'RFECV'
                and 'cv' in kwargs
                and X_grouped is not None
                and kwargs['cv'].__class__.__name__ == 'LeaveOneGroupOut'
            )
            if wants_group_folds:
                folds = list(LeaveOneGroupOut().split(X=X_indices, y=None, groups=X_grouped))
                train_folds = [train_idx for train_idx, _ in folds]
                test_folds = [test_idx for _, test_idx in folds]
                kwargs['cv'] = zip(train_folds, test_folds)

            instance = name_to_constructor[name](**kwargs)
            instantiations.append([long_name, instance])
        except TypeError:
            log.info(f"ARGUMENTS FOR '{name}': {inspect.signature(name_to_constructor[name])}")
            raise utils.InvalidConfParameters(
                f"The {category} '{name}' has invalid parameters: {kwargs}\n"
                f"Signature for '{name}': {inspect.signature(name_to_constructor[name])}")
        except KeyError:
            raise utils.InvalidConfSubSection(
                f"There is no {category} called '{name}'."
                f"All valid {category}: {list(name_to_constructor.keys())}")
    return instantiations
Example #18
Source File: run_qsar_test.py From cddd with MIT License | 4 votes |
def main(unused_argv):
    """Main function to test the performance of the translation model to
    extract meaningful features for a QSAR modelling.

    Reads "ames.csv" and "lipo.csv" (columns: smiles, label, fold), embeds the
    SMILES with the inference model, standardizes each embedding with its
    overall mean and standard deviation, and prints leave-one-group-out
    cross-validation scores of an SVC (Ames) and an SVR (Lipophilicity),
    using the `fold` column as groups.
    """
    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.device)
        print("use gpu {}".format(str(FLAGS.device)))
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    model_dir = FLAGS.model_dir

    infer_model = InferenceModel(model_dir, use_gpu=FLAGS.gpu, cpu_threads=FLAGS.cpu_threads)

    ames_df = pd.read_csv("ames.csv")
    ames_smls = ames_df.smiles.tolist()
    ames_labels = ames_df.label.values
    ames_fold = ames_df.fold.values
    print("Extracting molecular desscriptors for Ames")
    ames_emb = infer_model.seq_to_emb(ames_smls)
    ames_emb = (ames_emb - ames_emb.mean()) / ames_emb.std()

    lipo_df = pd.read_csv("lipo.csv")
    lipo_smls = lipo_df.smiles.tolist()
    lipo_labels = lipo_df.label.values
    lipo_fold = lipo_df.fold.values
    print("Extracting molecular desscriptors for Lipophilicity")
    lipo_emb = infer_model.seq_to_emb(lipo_smls)
    lipo_emb = (lipo_emb - lipo_emb.mean()) / lipo_emb.std()

    # `groups` is passed by keyword below: in current scikit-learn every
    # cross_val_score argument after `y` is keyword-only, so the former
    # positional form fails there. The keyword form works on old releases too.
    print("Running SVM on Ames mutagenicity...")
    clf = SVC(C=5.0)
    result = cross_val_score(clf,
                             ames_emb,
                             ames_labels,
                             groups=ames_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Ames mutagenicity accuracy: %0.3f +/- %0.3f"
          %(np.mean(result), np.std(result)))

    print("Running SVM on Lipophilicity...")
    clf = SVR(C=5.0)
    result = cross_val_score(clf,
                             lipo_emb,
                             lipo_labels,
                             groups=lipo_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Lipophilicity r2: %0.3f +/- %0.3f"
          %(np.mean(result), np.std(result)))
Example #19
Source File: cross_validation.py From nltools with MIT License | 4 votes |
def set_cv(Y=None, cv_dict=None, return_generator=True):
    """ Build a scikit-learn compatible cross-validation object from a small
    configuration dictionary.

    Args:
        Y: (pd.DataFrame) Pandas Dataframe of Y labels; its length sets the
            number of samples that are split
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return the split generator instead of the
            splitter instance; default True

    Returns:
        cv: the split generator, or the splitter instance when
            return_generator is False

    Raises:
        ValueError: if cv_dict is not a dictionary or its 'type' is neither
            'kfolds' nor 'loso'

    """
    if not isinstance(cv_dict, dict):
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    cv_type = cv_dict['type']
    if cv_type == 'kfolds':
        # 'subject_id' takes precedence over 'stratified' when both are given.
        if 'subject_id' in cv_dict:
            # Hold out subjects within each fold
            from sklearn.model_selection import GroupKFold
            cv_inst = GroupKFold(n_splits=cv_dict['n_folds'])
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y,
                               groups=cv_dict['subject_id'])
        elif 'stratified' in cv_dict:
            # Stratified K-Folds Continuous
            from nltools.cross_validation import KFoldStratified
            cv_inst = KFoldStratified(n_splits=cv_dict['n_folds'])
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
        else:
            # Normal K-Folds
            from sklearn.model_selection import KFold
            cv_inst = KFold(n_splits=cv_dict['n_folds'])
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
    elif cv_type == 'loso':
        # Leave One Subject Out
        from sklearn.model_selection import LeaveOneGroupOut
        cv_inst = LeaveOneGroupOut()
        cv = cv_inst.split(X=np.zeros(len(Y)), y=Y,
                           groups=cv_dict['subject_id'])
    else:
        raise ValueError(
            "Make sure you specify a dictionary of "
            "{'type': 'kfolds', 'n_folds': n}, "
            "{'type': 'kfolds', 'n_folds': n, 'stratified': Y}, "
            "{'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or "
            "{'type': 'loso', 'subject_id': holdout}, "
            "where n = number of folds, and subject = vector of subject ids "
            "that corresponds to self.Y")

    return cv if return_generator else cv_inst
Example #20
Source File: test_split.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_cross_validator_with_default_params():
    # For a set of splitters built mostly with default parameters, check
    # get_n_splits, that 2d and 1d inputs give the same splits, that both the
    # train and the test indices are integers, and the repr.
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])
    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = np of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    # NOTE(review): the whitespace after "\n" may have been collapsed in this
    # copy of the file; verify the literal against repr(ss).
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
               "test_size='default',\n train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
                         n_unique_groups, comb(n_unique_groups, p),
                         n_shuffle_splits, 2]

    splitters = [loo, lpo, kf, skf, lolo, lopo, ss, ps]
    reprs = [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
             ss_repr, ps_repr]

    for cv, cv_repr, expected in zip(splitters, reprs, n_splits_expected):
        # Test if get_n_splits works correctly
        assert_equal(expected, cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))
        # Test that train, test indices returned are integers
        # (the second check used to repeat `train`, leaving `test` unchecked)
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))

    # ValueError for get_n_splits methods
    msg = "The 'X' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         loo.get_n_splits, None, y, groups)
    assert_raise_message(ValueError, msg,
                         lpo.get_n_splits, None, y, groups)
Example #21
Source File: test_split.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_leave_one_p_group_out():
    # Check repr, get_n_splits and the splits of LeaveOneGroupOut and
    # LeavePGroupsOut over every entry of the module-level `test_groups`,
    # then get_n_splits with dummy and with illegal `groups` values.
    logo = LeaveOneGroupOut()
    lpgo_1 = LeavePGroupsOut(n_groups=1)
    lpgo_2 = LeavePGroupsOut(n_groups=2)

    # Make sure the repr works
    assert_equal(repr(logo), 'LeaveOneGroupOut()')
    assert_equal(repr(lpgo_1), 'LeavePGroupsOut(n_groups=1)')
    assert_equal(repr(lpgo_2), 'LeavePGroupsOut(n_groups=2)')
    assert_equal(repr(LeavePGroupsOut(n_groups=3)),
                 'LeavePGroupsOut(n_groups=3)')

    for cv, p_groups_out in ((logo, 1), (lpgo_1, 1), (lpgo_2, 2)):
        for groups_i in test_groups:
            n_groups = len(np.unique(groups_i))
            n_splits = (n_groups if p_groups_out == 1
                        else n_groups * (n_groups - 1) / 2)
            X = y = np.ones(len(groups_i))

            # Test that the length is correct
            assert_equal(cv.get_n_splits(X, y, groups=groups_i), n_splits)

            groups_arr = np.asarray(groups_i)

            # Split using the original list / array / list of string groups_i
            for train, test in cv.split(X, y, groups=groups_i):
                # First test: no train group is in the test set and vice versa
                assert_array_equal(np.intersect1d(groups_arr[train],
                                                  groups_arr[test]).tolist(),
                                   [])

                # Second test: train and test add up to all the data
                assert_equal(len(train) + len(test), len(groups_i))

                # Third test:
                # The number of groups in test must be equal to p_groups_out.
                # This used assert_true(value, p_groups_out), which treats the
                # second argument as a message and so never compared anything.
                assert_equal(np.unique(groups_arr[test]).shape[0],
                             p_groups_out)

    # check get_n_splits() with dummy parameters
    assert_equal(logo.get_n_splits(None, None, ['a', 'b', 'c', 'b', 'c']), 3)
    assert_equal(logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]), 3)
    assert_equal(lpgo_2.get_n_splits(None, None, np.arange(4)), 6)
    assert_equal(lpgo_1.get_n_splits(groups=np.arange(4)), 4)

    # raise ValueError if a `groups` parameter is illegal
    with assert_raises(ValueError):
        logo.get_n_splits(None, None, [0.0, np.nan, 0.0])
    with assert_raises(ValueError):
        lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0])

    msg = "The 'groups' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         logo.get_n_splits, None, None, None)
    assert_raise_message(ValueError, msg,
                         lpgo_1.get_n_splits, None, None, None)
Example #22
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_leave_one_p_group_out():
    # Check repr, get_n_splits and the splits of LeaveOneGroupOut and
    # LeavePGroupsOut over every entry of the module-level `test_groups`,
    # then get_n_splits with dummy and with illegal `groups` values.
    logo = LeaveOneGroupOut()
    lpgo_1 = LeavePGroupsOut(n_groups=1)
    lpgo_2 = LeavePGroupsOut(n_groups=2)

    # Make sure the repr works
    assert_equal(repr(logo), 'LeaveOneGroupOut()')
    assert_equal(repr(lpgo_1), 'LeavePGroupsOut(n_groups=1)')
    assert_equal(repr(lpgo_2), 'LeavePGroupsOut(n_groups=2)')
    assert_equal(repr(LeavePGroupsOut(n_groups=3)),
                 'LeavePGroupsOut(n_groups=3)')

    for cv, p_groups_out in ((logo, 1), (lpgo_1, 1), (lpgo_2, 2)):
        for groups_i in test_groups:
            n_groups = len(np.unique(groups_i))
            n_splits = (n_groups if p_groups_out == 1
                        else n_groups * (n_groups - 1) / 2)
            X = y = np.ones(len(groups_i))

            # Test that the length is correct
            assert_equal(cv.get_n_splits(X, y, groups=groups_i), n_splits)

            groups_arr = np.asarray(groups_i)

            # Split using the original list / array / list of string groups_i
            for train, test in cv.split(X, y, groups=groups_i):
                # First test: no train group is in the test set and vice versa
                assert_array_equal(np.intersect1d(groups_arr[train],
                                                  groups_arr[test]).tolist(),
                                   [])

                # Second test: train and test add up to all the data
                assert_equal(len(train) + len(test), len(groups_i))

                # Third test:
                # The number of groups in test must be equal to p_groups_out.
                # This used `assert value, p_groups_out`, which treats
                # p_groups_out as the failure message and never compared it.
                assert np.unique(groups_arr[test]).shape[0] == p_groups_out

    # check get_n_splits() with dummy parameters
    assert_equal(logo.get_n_splits(None, None, ['a', 'b', 'c', 'b', 'c']), 3)
    assert_equal(logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]), 3)
    assert_equal(lpgo_2.get_n_splits(None, None, np.arange(4)), 6)
    assert_equal(lpgo_1.get_n_splits(groups=np.arange(4)), 4)

    # raise ValueError if a `groups` parameter is illegal
    with assert_raises(ValueError):
        logo.get_n_splits(None, None, [0.0, np.nan, 0.0])
    with assert_raises(ValueError):
        lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0])

    msg = "The 'groups' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         logo.get_n_splits, None, None, None)
    assert_raise_message(ValueError, msg,
                         lpgo_1.get_n_splits, None, None, None)
Example #23
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_cross_validator_with_default_params():
    # For a set of splitters built mostly with default parameters, check
    # get_n_splits, that 2d and 1d inputs give the same splits, that both the
    # train and the test indices are integers, and the repr.
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])
    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = np of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
               "test_size=None, train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
                         n_unique_groups, comb(n_unique_groups, p),
                         n_shuffle_splits, 2]

    splitters = [loo, lpo, kf, skf, lolo, lopo, ss, ps]
    reprs = [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
             ss_repr, ps_repr]

    for cv, cv_repr, expected in zip(splitters, reprs, n_splits_expected):
        # Test if get_n_splits works correctly
        assert_equal(expected, cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))
        # Test that train, test indices returned are integers
        # (the second check used to repeat `train`, leaving `test` unchecked)
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))

    # ValueError for get_n_splits methods
    msg = "The 'X' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         loo.get_n_splits, None, y, groups)
    assert_raise_message(ValueError, msg,
                         lpo.get_n_splits, None, y, groups)