Python sklearn.model_selection.LeaveOneGroupOut() Examples

The following are 18 code examples of sklearn.model_selection.LeaveOneGroupOut(), each drawn from an open-source project; the source file and project are noted above each example. You may also want to check out all available functions and classes of the sklearn.model_selection module, or try the search function.
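Before the project examples, a minimal standalone sketch may help orient: LeaveOneGroupOut yields one train/test split per unique value in the groups array, holding each group out in turn as the test set. The toy data below is invented for illustration and does not come from any of the listed projects.

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 1, 1, 0, 1])
groups = np.array(['a', 'a', 'b', 'b', 'c', 'c'])  # 3 unique groups -> 3 splits

logo = LeaveOneGroupOut()
print(logo.get_n_splits(X, y, groups))  # 3
for train_idx, test_idx in logo.split(X, y, groups):
    # Each test fold contains exactly one group; the training fold holds the rest.
    print("train groups:", np.unique(groups[train_idx]),
          "| test group:", np.unique(groups[test_idx]))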
Example #1
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example #2
Source File: test_search.py    From twitter-stock-recommendation with MIT License
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example #3
Source File: test_validation.py    From twitter-stock-recommendation with MIT License
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv) 
Example #4
Source File: test_split.py    From twitter-stock-recommendation with MIT License
def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
    X = y = groups = np.ones(0)
    assert_raise_message(ValueError, "Found array with 0 sample(s)", next,
                         LeaveOneGroupOut().split(X, y, groups))
    X = y = groups = np.ones(1)
    msg = ("The groups parameter contains fewer than 2 unique groups ({}). "
           "LeaveOneGroupOut expects at least 2.").format(groups)
    assert_raise_message(ValueError, msg, next,
                         LeaveOneGroupOut().split(X, y, groups))
    X = y = groups = np.ones(1)
    msg = ("The groups parameter contains fewer than (or equal to) n_groups "
           "(3) numbers of unique groups ({}). LeavePGroupsOut expects "
           "that at least n_groups + 1 (4) unique groups "
           "be present").format(groups)
    assert_raise_message(ValueError, msg, next,
                         LeavePGroupsOut(n_groups=3).split(X, y, groups))
    X = y = groups = np.arange(3)
    msg = ("The groups parameter contains fewer than (or equal to) n_groups "
           "(3) numbers of unique groups ({}). LeavePGroupsOut expects "
           "that at least n_groups + 1 (4) unique groups "
           "be present").format(groups)
    assert_raise_message(ValueError, msg, next,
                         LeavePGroupsOut(n_groups=3).split(X, y, groups)) 
Example #5
Source File: test_split.py    From twitter-stock-recommendation with MIT License
def test_leave_group_out_changing_groups():
    # Check that LeaveOneGroupOut and LeavePGroupsOut work normally if
    # the groups variable is changed before calling split
    groups = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    X = np.ones(len(groups))
    groups_changing = np.array(groups, copy=True)
    lolo = LeaveOneGroupOut().split(X, groups=groups)
    lolo_changing = LeaveOneGroupOut().split(X, groups=groups)
    lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    groups_changing[:] = 0
    for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
        for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)

    # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3
    assert_equal(
        3, LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X,
                                                    groups=groups))
    # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups)
    assert_equal(3, LeaveOneGroupOut().get_n_splits(X, y=X,
                                                    groups=groups)) 
Example #6
Source File: eval_train_test.py    From fanci with GNU General Public License v3.0
def logo_cv(clf_type, data_sets: [GroupedDataSet], n_jobs=-1, parallel_verbose=1, persist=True):
    """
    Parallel leave on group out cross validation.
    :param clf:
    :param data_sets:
    :param n_jobs:
    :param parallel_verbose:
    :param persist:
    :return:
    """
    log.info('Starting leave-one-group-out cv for {!s} sets'.format(len(data_sets)))

    parallel = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)

    logo = LeaveOneGroupOut()
    stats_list = parallel(delayed(_fit_and_score)(clf, domains, labels, train_index, test_index, -1, data_set_id, -1)
                      for domains, labels, groups, data_set_id, clf in _grouped_data_sets_generator(data_sets, clf_type)
                      for train_index, test_index in logo.split(domains, labels, groups=groups))
    where = settings.EVAL_FOLDER + '/' + 'logo_cv_{!s}_{!s}sets_{!s}.pkl'.format(clf_type, len(data_sets),
                                                                                                settings.NOW_STR)
    return _serialize_cv_results(stats_list, persist, where) 
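The helpers above (_fit_and_score, _grouped_data_sets_generator, settings) are internal to fanci, but the underlying pattern — fanning the folds of LeaveOneGroupOut out to joblib workers — can be sketched generically. score_fold and the random data below are hypothetical stand-ins, not part of the project.

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.svm import SVC

def score_fold(estimator, X, y, train_idx, test_idx):
    # Fit on the training groups, score on the single held-out group.
    est = clone(estimator).fit(X[train_idx], y[train_idx])
    return est.score(X[test_idx], y[test_idx])

rng = np.random.RandomState(0)
X = rng.randn(30, 4)
y = rng.randint(0, 2, 30)
groups = np.repeat([0, 1, 2], 10)  # 3 groups of 10 samples

scores = Parallel(n_jobs=-1)(
    delayed(score_fold)(SVC(), X, y, train_idx, test_idx)
    for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups))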
Example #7
Source File: eval_train_test.py    From fanci with GNU General Public License v3.0
def leave_one_group_out_deprecated(clf, data_set: GroupedDataSet, n_jobs=8):
    log.info('Starting leave-one-group-out cv.')
    logo = LeaveOneGroupOut()
    domains, labels, groups = data_set.expand()
    log.info('Set dimensions: {!s} x {!s} x {!s}'.format(len(domains), len(labels), len(groups)))
    log.info('Starting feature extraction.')
    feature_matrix = extract_all_features(domains)
    if isinstance(clf, SVC):
        std_scale = preprocessing.StandardScaler()
        feature_matrix = std_scale.fit_transform(feature_matrix)

    log.info('Feature extraction finished.')

    scores = cross_val_score(clf, feature_matrix, labels, groups, cv=logo, scoring=stats_metrics.multi_scorer_gridsearch, n_jobs=n_jobs, verbose=2)
    return scores 
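Note that groups is passed positionally here, matching the older cross_val_score(estimator, X, y, groups, ...) signature; recent scikit-learn releases require it as a keyword argument. A minimal equivalent call, reusing the names prepared inside the function above, would be:

scores = cross_val_score(clf, feature_matrix, labels, groups=groups,
                         cv=LeaveOneGroupOut(), n_jobs=n_jobs)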
Example #8
Source File: evaluation.py    From cddd with MIT License
def qsar_classification(emb, groups, labels):
    """Helper function that fits and scores a SVM classifier on the extracted molecular
    descriptor in a leave-one-group-out cross-validation manner.

    Args:
        emb: Embedding (molecular descriptor) that is used as input for the SVM
        groups: Array or list with n_samples entries defining the fold membership for the
        crossvalidtion.
        labels: Target values of the of the qsar task.
    Returns:
        The mean accuracy, F1-score, ROC-AUC and prescion-recall-AUC of the cross-validation.
    """
    acc = []
    f1 = []
    roc_auc = []
    pr_auc = []
    logo = LeaveOneGroupOut()
    clf = SVC(kernel='rbf', C=5.0, probability=True)
    for train_index, test_index in logo.split(emb, groups=groups):
        clf.fit(emb[train_index], labels[train_index])
        y_pred = clf.predict(emb[test_index])
        y_pred_prob = clf.predict_proba(emb[test_index])[:, 1]
        y_true = labels[test_index]
        precision, recall, t = precision_recall_curve(y_true, y_pred_prob)
        acc.append(accuracy_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))
        roc_auc.append(roc_auc_score(y_true, y_pred_prob))
        pr_auc.append(auc(recall, precision))
    return np.mean(acc), np.mean(f1), np.mean(roc_auc), np.mean(pr_auc) 
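A hedged usage sketch for the helper above; the embedding, labels, and fold assignments are random stand-ins chosen only to show the expected shapes, not real QSAR data.

import numpy as np

rng = np.random.RandomState(0)
emb = rng.randn(60, 16)               # 60 molecules, 16-dimensional descriptors
labels = rng.randint(0, 2, 60)        # binary endpoint
groups = np.repeat(np.arange(3), 20)  # 3 predefined folds of 20 molecules each

acc, f1, roc_auc, pr_auc = qsar_classification(emb, groups, labels)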
Example #9
Source File: evaluation.py    From cddd with MIT License
def qsar_regression(emb, groups, labels):
    """Helper function that fits and scores a SVM regressor on the extracted molecular
    descriptor in a leave-one-group-out cross-validation manner.

    Args:
        emb: Embedding (molecular descriptor) that is used as input for the SVM
        groups: Array or list with n_samples entries defining the fold membership for the
        crossvalidtion.
        labels: Target values of the of the qsar task.
    Returns:
        The mean accuracy, F1-score, ROC-AUC and prescion-recall-AUC of the cross-validation.
    """
    r2 = []
    r = []
    mse = []
    mae = []
    logo = LeaveOneGroupOut()
    clf = SVR(kernel='rbf', C=5.0)
    for train_index, test_index in logo.split(emb, groups=groups):
        clf.fit(emb[train_index], labels[train_index])
        y_pred = clf.predict(emb[test_index])
        y_true = labels[test_index]
        r2.append(r2_score(y_true, y_pred))
        r.append(spearmanr(y_true, y_pred)[0])
        mse.append(mean_squared_error(y_true, y_pred))
        mae.append(mean_absolute_error(y_true, y_pred))
    return np.mean(r2), np.mean(r), np.mean(mse), np.mean(mae) 
Example #10
Source File: test_model_selection.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])

        # Splitter Classes
        self.assertIs(df.model_selection.KFold, ms.KFold)
        self.assertIs(df.model_selection.GroupKFold, ms.GroupKFold)
        self.assertIs(df.model_selection.StratifiedKFold, ms.StratifiedKFold)

        self.assertIs(df.model_selection.LeaveOneGroupOut, ms.LeaveOneGroupOut)
        self.assertIs(df.model_selection.LeavePGroupsOut, ms.LeavePGroupsOut)
        self.assertIs(df.model_selection.LeaveOneOut, ms.LeaveOneOut)
        self.assertIs(df.model_selection.LeavePOut, ms.LeavePOut)

        self.assertIs(df.model_selection.ShuffleSplit, ms.ShuffleSplit)
        self.assertIs(df.model_selection.GroupShuffleSplit,
                      ms.GroupShuffleSplit)
        # self.assertIs(df.model_selection.StratifiedShuffleSplit,
        #               ms.StratifiedShuffleSplit)
        self.assertIs(df.model_selection.PredefinedSplit, ms.PredefinedSplit)
        self.assertIs(df.model_selection.TimeSeriesSplit, ms.TimeSeriesSplit)

        # Splitter Functions

        # Hyper-parameter optimizers
        self.assertIs(df.model_selection.GridSearchCV, ms.GridSearchCV)
        self.assertIs(df.model_selection.RandomizedSearchCV, ms.RandomizedSearchCV)
        self.assertIs(df.model_selection.ParameterGrid, ms.ParameterGrid)
        self.assertIs(df.model_selection.ParameterSampler, ms.ParameterSampler)

        # Model validation 
Example #11
Source File: test_model_selection.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper_abbr(self):
        df = pdml.ModelFrame([])

        # Splitter Classes
        self.assertIs(df.ms.KFold, ms.KFold)
        self.assertIs(df.ms.GroupKFold, ms.GroupKFold)
        self.assertIs(df.ms.StratifiedKFold, ms.StratifiedKFold)

        self.assertIs(df.ms.LeaveOneGroupOut, ms.LeaveOneGroupOut)
        self.assertIs(df.ms.LeavePGroupsOut, ms.LeavePGroupsOut)
        self.assertIs(df.ms.LeaveOneOut, ms.LeaveOneOut)
        self.assertIs(df.ms.LeavePOut, ms.LeavePOut)

        self.assertIs(df.ms.ShuffleSplit, ms.ShuffleSplit)
        self.assertIs(df.ms.GroupShuffleSplit,
                      ms.GroupShuffleSplit)
        # self.assertIs(df.ms.StratifiedShuffleSplit,
        #               ms.StratifiedShuffleSplit)
        self.assertIs(df.ms.PredefinedSplit, ms.PredefinedSplit)
        self.assertIs(df.ms.TimeSeriesSplit, ms.TimeSeriesSplit)

        # Splitter Functions

        # Hyper-parameter optimizers
        self.assertIs(df.ms.GridSearchCV, ms.GridSearchCV)
        self.assertIs(df.ms.RandomizedSearchCV, ms.RandomizedSearchCV)
        self.assertIs(df.ms.ParameterGrid, ms.ParameterGrid)
        self.assertIs(df.ms.ParameterSampler, ms.ParameterSampler)

        # Model validation 
Example #12
Source File: mastml_driver.py    From MAST-ML with MIT License
def _instantiate(kwargs_dict, name_to_constructor, category, X_grouped=None, X_indices=None):
    """
    Uses name_to_constructor to instantiate every item in kwargs_dict and return
    the list of instantiations
    """
    instantiations = []
    for long_name, (name, kwargs) in kwargs_dict.items():
        log.debug(f'instantiation: {long_name}, {name}({kwargs})')
        try:
            # Skip the instantiation step for Keras models because the build function needs the whole dict, not the unpacked values
            if 'KerasRegressor' in long_name:
                pass

            # Construct the cv object by hand for the special case of RFECV combined with LeaveOneGroupOut cross-validation
            elif name == 'RFECV':
                if 'cv' in kwargs.keys():
                    if X_grouped is not None:
                        if kwargs['cv'].__class__.__name__ == 'LeaveOneGroupOut':
                            trains = list()
                            tests = list()
                            for train_idx, test_idx in LeaveOneGroupOut().split(X=X_indices, y=None, groups=X_grouped):
                                trains.append(train_idx)
                                tests.append(test_idx)
                            custom_cv = zip(trains, tests)
                            kwargs['cv'] = custom_cv
                instantiations.append([long_name, name_to_constructor[name](**kwargs)])
            else:
                instantiations.append([long_name, name_to_constructor[name](**kwargs)])

        except TypeError:
            log.info(f"ARGUMENTS FOR '{name}': {inspect.signature(name_to_constructor[name])}")
            raise utils.InvalidConfParameters(
                f"The {category} '{name}' has invalid parameters: {kwargs}\n"
                f"Signature for '{name}': {inspect.signature(name_to_constructor[name])}")
        except KeyError:
            raise utils.InvalidConfSubSection(
                f"There is no {category} called '{name}'."
                f"All valid {category}: {list(name_to_constructor.keys())}")

    return instantiations 
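The trick in the RFECV branch — materializing the LeaveOneGroupOut splits into an explicit sequence of (train, test) index pairs — works because scikit-learn accepts any iterable of such pairs wherever a cv object is allowed. A standalone sketch with made-up data (the estimator and shapes are illustrative only):

import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneGroupOut

rng = np.random.RandomState(0)
X = rng.randn(30, 5)
y = X[:, 0] + 0.1 * rng.randn(30)
groups = np.repeat([0, 1, 2], 10)

# Precompute the group-aware splits, then hand them to RFECV as `cv`.
custom_cv = list(LeaveOneGroupOut().split(X, y, groups=groups))
selector = RFECV(LinearRegression(), cv=custom_cv).fit(X, y)

Using a list rather than a one-shot zip iterator keeps the splits reusable if the cv argument is iterated more than once.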
Example #13
Source File: run_qsar_test.py    From cddd with MIT License
def main(unused_argv):
    """Main function to test the performance of the translation model to extract
    meaningfull features for a QSAR modelling"""
    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.device)
        print("use gpu {}".format(str(FLAGS.device)))
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    model_dir = FLAGS.model_dir

    infer_model = InferenceModel(model_dir, use_gpu=FLAGS.gpu, cpu_threads=FLAGS.cpu_threads)
    ames_df = pd.read_csv("ames.csv")
    ames_smls = ames_df.smiles.tolist()
    ames_labels = ames_df.label.values
    ames_fold = ames_df.fold.values
    print("Extracting molecular desscriptors for Ames")
    ames_emb = infer_model.seq_to_emb(ames_smls)
    ames_emb = (ames_emb - ames_emb.mean()) / ames_emb.std()

    lipo_df = pd.read_csv("lipo.csv")
    lipo_smls = lipo_df.smiles.tolist()
    lipo_labels = lipo_df.label.values
    lipo_fold = lipo_df.fold.values
    print("Extracting molecular desscriptors for Lipophilicity")
    lipo_emb = infer_model.seq_to_emb(lipo_smls)
    lipo_emb = (lipo_emb - lipo_emb.mean()) / lipo_emb.std()

    print("Running SVM on Ames mutagenicity...")
    clf = SVC(C=5.0)
    result = cross_val_score(clf,
                             ames_emb,
                             ames_labels,
                             ames_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Ames mutagenicity accuracy: %0.3f +/- %0.3f"
          %(np.mean(result), np.std(result)))

    print("Running SVM on Lipophilicity...")
    clf = SVR(C=5.0)
    result = cross_val_score(clf,
                             lipo_emb,
                             lipo_labels,
                             lipo_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Lipophilicity r2: %0.3f +/- %0.3f"
          %(np.mean(result), np.std(result))) 
Example #14
Source File: cross_validation.py    From nltools with MIT License
def set_cv(Y=None, cv_dict=None, return_generator=True):
    """ Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        Y:  (pd.DataFrame) Pandas Dataframe of Y labels
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return a cv generator instead of an instance; default True
    Returns:
        cv: a scikit-learn model-selection generator

     """

    if isinstance(cv_dict, dict):
        if cv_dict['type'] == 'kfolds':
            if 'subject_id' in cv_dict:  # Hold out subjects within each fold
                from sklearn.model_selection import GroupKFold
                cv_inst = GroupKFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y, groups=cv_dict['subject_id'])
            elif 'stratified' in cv_dict:  # Stratified K-Folds Continuous
                from nltools.cross_validation import KFoldStratified
                cv_inst = KFoldStratified(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
            else:  # Normal K-Folds
                from sklearn.model_selection import KFold
                cv_inst = KFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
        elif cv_dict['type'] == 'loso':  # Leave One Subject Out
            from sklearn.model_selection import LeaveOneGroupOut
            cv_inst = LeaveOneGroupOut()
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y, groups=cv_dict['subject_id'])
        else:
            raise ValueError("""Make sure you specify a dictionary of
                            {'type': 'kfolds', 'n_folds': n},
                            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
                            {'type': 'kfolds', 'n_folds': n,
                            'subject_id': holdout}, or {'type': 'loso',
                            'subject_id': holdout}, where n = number of folds,
                            and subject = vector of subject ids that
                            corresponds to self.Y""")
    else:
        raise ValueError("Make sure 'cv_dict' is a dictionary.")
    if return_generator:
        return cv
    else:
        return cv_inst 
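A hedged usage sketch for set_cv; the label and subject-ID vectors are invented to show the 'loso' dictionary form, which routes to LeaveOneGroupOut under the hood.

import numpy as np
import pandas as pd

Y = pd.DataFrame(np.random.randn(12))    # continuous labels
subject_id = np.repeat([1, 2, 3, 4], 3)  # 4 subjects, 3 observations each

cv = set_cv(Y=Y, cv_dict={'type': 'loso', 'subject_id': subject_id})
for train, test in cv:
    print(len(train), len(test))         # 9 train / 3 test per fold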
Example #15
Source File: test_split.py    From twitter-stock-recommendation with MIT License
def test_cross_validator_with_default_params():
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])
    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = no. of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
               "test_size='default',\n       train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
                         n_unique_groups, comb(n_unique_groups, p),
                         n_shuffle_splits, 2]

    for i, (cv, cv_repr) in enumerate(zip(
            [loo, lpo, kf, skf, lolo, lopo, ss, ps],
            [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
             ss_repr, ps_repr])):
        # Test if get_n_splits works correctly
        assert_equal(n_splits_expected[i], cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))
        # Test that train, test indices returned are integers
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))

    # ValueError for get_n_splits methods
    msg = "The 'X' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         loo.get_n_splits, None, y, groups)
    assert_raise_message(ValueError, msg,
                         lpo.get_n_splits, None, y, groups) 
Example #16
Source File: test_split.py    From twitter-stock-recommendation with MIT License
def test_leave_one_p_group_out():
    logo = LeaveOneGroupOut()
    lpgo_1 = LeavePGroupsOut(n_groups=1)
    lpgo_2 = LeavePGroupsOut(n_groups=2)

    # Make sure the repr works
    assert_equal(repr(logo), 'LeaveOneGroupOut()')
    assert_equal(repr(lpgo_1), 'LeavePGroupsOut(n_groups=1)')
    assert_equal(repr(lpgo_2), 'LeavePGroupsOut(n_groups=2)')
    assert_equal(repr(LeavePGroupsOut(n_groups=3)),
                 'LeavePGroupsOut(n_groups=3)')

    for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1),
                                            (lpgo_2, 2))):
        for i, groups_i in enumerate(test_groups):
            n_groups = len(np.unique(groups_i))
            n_splits = (n_groups if p_groups_out == 1
                        else n_groups * (n_groups - 1) / 2)
            X = y = np.ones(len(groups_i))

            # Test that the length is correct
            assert_equal(cv.get_n_splits(X, y, groups=groups_i), n_splits)

            groups_arr = np.asarray(groups_i)

            # Split using the original list / array / list of string groups_i
            for train, test in cv.split(X, y, groups=groups_i):
                # First test: no train group is in the test set and vice versa
                assert_array_equal(np.intersect1d(groups_arr[train],
                                                  groups_arr[test]).tolist(),
                                   [])

                # Second test: train and test add up to all the data
                assert_equal(len(train) + len(test), len(groups_i))

                # Third test:
                # The number of groups in test must be equal to p_groups_out
                assert_equal(np.unique(groups_arr[test]).shape[0],
                             p_groups_out)

    # check get_n_splits() with dummy parameters
    assert_equal(logo.get_n_splits(None, None, ['a', 'b', 'c', 'b', 'c']), 3)
    assert_equal(logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]), 3)
    assert_equal(lpgo_2.get_n_splits(None, None, np.arange(4)), 6)
    assert_equal(lpgo_1.get_n_splits(groups=np.arange(4)), 4)

    # raise ValueError if a `groups` parameter is illegal
    with assert_raises(ValueError):
        logo.get_n_splits(None, None, [0.0, np.nan, 0.0])
    with assert_raises(ValueError):
        lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0])

    msg = "The 'groups' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         logo.get_n_splits, None, None, None)
    assert_raise_message(ValueError, msg,
                         lpgo_1.get_n_splits, None, None, None) 
Example #17
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_leave_one_p_group_out():
    logo = LeaveOneGroupOut()
    lpgo_1 = LeavePGroupsOut(n_groups=1)
    lpgo_2 = LeavePGroupsOut(n_groups=2)

    # Make sure the repr works
    assert_equal(repr(logo), 'LeaveOneGroupOut()')
    assert_equal(repr(lpgo_1), 'LeavePGroupsOut(n_groups=1)')
    assert_equal(repr(lpgo_2), 'LeavePGroupsOut(n_groups=2)')
    assert_equal(repr(LeavePGroupsOut(n_groups=3)),
                 'LeavePGroupsOut(n_groups=3)')

    for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1),
                                            (lpgo_2, 2))):
        for i, groups_i in enumerate(test_groups):
            n_groups = len(np.unique(groups_i))
            n_splits = (n_groups if p_groups_out == 1
                        else n_groups * (n_groups - 1) / 2)
            X = y = np.ones(len(groups_i))

            # Test that the length is correct
            assert_equal(cv.get_n_splits(X, y, groups=groups_i), n_splits)

            groups_arr = np.asarray(groups_i)

            # Split using the original list / array / list of string groups_i
            for train, test in cv.split(X, y, groups=groups_i):
                # First test: no train group is in the test set and vice versa
                assert_array_equal(np.intersect1d(groups_arr[train],
                                                  groups_arr[test]).tolist(),
                                   [])

                # Second test: train and test add up to all the data
                assert_equal(len(train) + len(test), len(groups_i))

                # Third test:
                # The number of groups in test must be equal to p_groups_out
                assert np.unique(groups_arr[test]).shape[0] == p_groups_out

    # check get_n_splits() with dummy parameters
    assert_equal(logo.get_n_splits(None, None, ['a', 'b', 'c', 'b', 'c']), 3)
    assert_equal(logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]), 3)
    assert_equal(lpgo_2.get_n_splits(None, None, np.arange(4)), 6)
    assert_equal(lpgo_1.get_n_splits(groups=np.arange(4)), 4)

    # raise ValueError if a `groups` parameter is illegal
    with assert_raises(ValueError):
        logo.get_n_splits(None, None, [0.0, np.nan, 0.0])
    with assert_raises(ValueError):
        lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0])

    msg = "The 'groups' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         logo.get_n_splits, None, None, None)
    assert_raise_message(ValueError, msg,
                         lpgo_1.get_n_splits, None, None, None) 
Example #18
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cross_validator_with_default_params():
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])
    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = no. of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, "
               "test_size=None, train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits,
                         n_unique_groups, comb(n_unique_groups, p),
                         n_shuffle_splits, 2]

    for i, (cv, cv_repr) in enumerate(zip(
            [loo, lpo, kf, skf, lolo, lopo, ss, ps],
            [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
             ss_repr, ps_repr])):
        # Test if get_n_splits works correctly
        assert_equal(n_splits_expected[i], cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))
        # Test that train, test indices returned are integers
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))

    # ValueError for get_n_splits methods
    msg = "The 'X' parameter should not be None."
    assert_raise_message(ValueError, msg,
                         loo.get_n_splits, None, y, groups)
    assert_raise_message(ValueError, msg,
                         lpo.get_n_splits, None, y, groups)