Python sklearn.model_selection.GroupKFold() Examples
The following are 24 code examples of sklearn.model_selection.GroupKFold(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
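Before diving into the examples, here is a minimal, self-contained sketch of typical GroupKFold usage (the toy data below is purely illustrative and not taken from any of the projects listed): each sample carries a group label, and split() guarantees that the same group never appears in both the training and the test fold.

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)        # 6 samples, 2 features (illustrative data)
y = np.array([0, 0, 1, 1, 0, 1])       # binary targets
groups = np.array([1, 1, 2, 2, 3, 3])  # 3 groups of 2 samples each

gkf = GroupKFold(n_splits=3)
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # every group ends up entirely in either the train or the test indices
    print("train:", train_idx, "test:", test_idx)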
Example #1
Source File: test_search.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example #2
Source File: test_rfe.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_rfe_cv_groups():
    generator = check_random_state(0)
    iris = load_iris()
    number_groups = 4
    groups = np.floor(np.linspace(0, number_groups, len(iris.target)))
    X = iris.data
    y = (iris.target > 0).astype(int)

    est_groups = RFECV(
        estimator=RandomForestClassifier(random_state=generator),
        step=1,
        scoring='accuracy',
        cv=GroupKFold(n_splits=2)
    )
    est_groups.fit(X, y, groups=groups)
    assert est_groups.n_features_ > 0
Example #3
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #4
Source File: test_validation.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv)
Example #5
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv)
Example #6
Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example #7
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #8
Source File: tpot_tests.py From tpot with GNU Lesser General Public License v3.0 | 6 votes |
def test_fit_GroupKFold():
    """Assert that TPOT properly handles the group parameter when using GroupKFold."""
    # This check tests if the darker digits images would generalize to the lighter ones.
    means = np.mean(training_features, axis=1)
    groups = means >= np.median(means)

    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=2,
        offspring_size=4,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
        cv=model_selection.GroupKFold(n_splits=2),
    )
    tpot_obj.fit(training_features, training_target, groups=groups)

    assert_greater_equal(tpot_obj.score(testing_features, testing_target), 0.97)
Example #9
Source File: SupervisedClassifier.py From CDSS with GNU General Public License v3.0 | 6 votes |
def __init__(self, classes, hyperparams=None, groups=None):
    self._classes = classes

    # Initialize params.
    self._params = {}
    self._model = None
    ''' Used by GroupKFold for splitting train/validation. '''
    self._groups = groups

    # Initialize hyperparams.
    self._hyperparams = {} if hyperparams is None else hyperparams
    self._hyperparam_search_space = {}
    # Set algorithm.
    self._get_or_set_hyperparam('algorithm')
    # Set random state.
    self._get_or_set_hyperparam('random_state')
    # Set CV strategy.
    self._get_or_set_hyperparam('hyperparam_strategy')
Example #10
Source File: sklearn_test.py From keras-tuner with Apache License 2.0 | 6 votes |
def test_sklearn_cv_with_groups(tmp_dir):
    tuner = sklearn_tuner.Sklearn(
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective('score', 'max'),
            max_trials=10),
        hypermodel=build_model,
        cv=model_selection.GroupKFold(5),
        directory=tmp_dir)
    x = np.random.uniform(size=(50, 10))
    y = np.random.randint(0, 2, size=(50,))
    groups = np.random.randint(0, 5, size=(50,))
    tuner.search(x, y, groups=groups)

    assert len(tuner.oracle.trials) == 10

    best_trial = tuner.oracle.get_best_trials()[0]
    assert best_trial.status == 'COMPLETED'
    assert best_trial.score is not None
    assert best_trial.best_step == 0
    assert best_trial.metrics.exists('score')

    # Make sure best model can be reloaded.
    best_model = tuner.get_best_models()[0]
    best_model.score(x, y)
Example #11
Source File: test_run.py From nyaggle with MIT License | 5 votes |
def test_experiment_manual_cv_group(tmpdir_name):
    df1 = pd.DataFrame()
    df1['x'] = np.random.randint(0, 10, size=1000)
    df1['y'] = df1['x'] > 5
    df1['grp'] = 0

    df2 = pd.DataFrame()
    df2['x'] = np.random.randint(0, 10, size=100)
    df2['y'] = df2['x'] <= 5
    df2['grp'] = 1

    X = pd.concat([df1, df2]).reset_index(drop=True)
    y = X['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                        random_state=0)

    grp = X_train['grp']
    X_train = X_train.drop(['y', 'grp'], axis=1)
    X_test = X_test.drop(['y', 'grp'], axis=1)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name,
                            cv=GroupKFold(2), groups=grp)
    assert result.metrics[-1] < 0.7
Example #12
Source File: test_validation.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_learning_curve_with_shuffle():
    # Following test case was designed this way to verify the code
    # changes made in pull request: #7506.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [11, 12], [13, 14],
                  [15, 16], [17, 18], [19, 20], [7, 8], [9, 10], [11, 12],
                  [13, 14], [15, 16], [17, 18]])
    y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4])
    groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4])
    # Splits on these groups fail without shuffle as the first iteration
    # of the learning curve doesn't contain label 4 in the training set.
    estimator = PassiveAggressiveClassifier(max_iter=5, tol=None,
                                            shuffle=False)

    cv = GroupKFold(n_splits=2)
    train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1,
        train_sizes=np.linspace(0.3, 1.0, 3), groups=groups,
        shuffle=True, random_state=2)
    assert_array_almost_equal(train_scores_batch.mean(axis=1),
                              np.array([0.75, 0.3, 0.36111111]))
    assert_array_almost_equal(test_scores_batch.mean(axis=1),
                              np.array([0.36111111, 0.25, 0.25]))
    assert_raises(ValueError, learning_curve, estimator, X, y, cv=cv,
                  n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
                  groups=groups)

    train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1,
        train_sizes=np.linspace(0.3, 1.0, 3), groups=groups,
        shuffle=True, random_state=2, exploit_incremental_learning=True)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))
Example #13
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups})
Example #14
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper_abbr(self):
    df = pdml.ModelFrame([])

    # Splitter Classes
    self.assertIs(df.ms.KFold, ms.KFold)
    self.assertIs(df.ms.GroupKFold, ms.GroupKFold)
    self.assertIs(df.ms.StratifiedKFold, ms.StratifiedKFold)
    self.assertIs(df.ms.LeaveOneGroupOut, ms.LeaveOneGroupOut)
    self.assertIs(df.ms.LeavePGroupsOut, ms.LeavePGroupsOut)
    self.assertIs(df.ms.LeaveOneOut, ms.LeaveOneOut)
    self.assertIs(df.ms.LeavePOut, ms.LeavePOut)
    self.assertIs(df.ms.ShuffleSplit, ms.ShuffleSplit)
    self.assertIs(df.ms.GroupShuffleSplit, ms.GroupShuffleSplit)
    # self.assertIs(df.ms.StratifiedShuffleSplit,
    #               ms.StratifiedShuffleSplit)
    self.assertIs(df.ms.PredefinedSplit, ms.PredefinedSplit)
    self.assertIs(df.ms.TimeSeriesSplit, ms.TimeSeriesSplit)

    # Splitter Functions

    # Hyper-parameter optimizers
    self.assertIs(df.ms.GridSearchCV, ms.GridSearchCV)
    self.assertIs(df.ms.RandomizedSearchCV, ms.RandomizedSearchCV)
    self.assertIs(df.ms.ParameterGrid, ms.ParameterGrid)
    self.assertIs(df.ms.ParameterSampler, ms.ParameterSampler)

    # Model validation
Example #15
Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])

    # Splitter Classes
    self.assertIs(df.model_selection.KFold, ms.KFold)
    self.assertIs(df.model_selection.GroupKFold, ms.GroupKFold)
    self.assertIs(df.model_selection.StratifiedKFold, ms.StratifiedKFold)
    self.assertIs(df.model_selection.LeaveOneGroupOut, ms.LeaveOneGroupOut)
    self.assertIs(df.model_selection.LeavePGroupsOut, ms.LeavePGroupsOut)
    self.assertIs(df.model_selection.LeaveOneOut, ms.LeaveOneOut)
    self.assertIs(df.model_selection.LeavePOut, ms.LeavePOut)
    self.assertIs(df.model_selection.ShuffleSplit, ms.ShuffleSplit)
    self.assertIs(df.model_selection.GroupShuffleSplit, ms.GroupShuffleSplit)
    # self.assertIs(df.model_selection.StratifiedShuffleSplit,
    #               ms.StratifiedShuffleSplit)
    self.assertIs(df.model_selection.PredefinedSplit, ms.PredefinedSplit)
    self.assertIs(df.model_selection.TimeSeriesSplit, ms.TimeSeriesSplit)

    # Splitter Functions

    # Hyper-parameter optimizers
    self.assertIs(df.model_selection.GridSearchCV, ms.GridSearchCV)
    self.assertIs(df.model_selection.RandomizedSearchCV, ms.RandomizedSearchCV)
    self.assertIs(df.model_selection.ParameterGrid, ms.ParameterGrid)
    self.assertIs(df.model_selection.ParameterSampler, ms.ParameterSampler)

    # Model validation
Example #16
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_nsplit_default_warn():
    # Test that warnings are raised. Will be removed in 0.22
    assert_warns_message(FutureWarning, NSPLIT_WARNING, KFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, GroupKFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, StratifiedKFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, TimeSeriesSplit)
    assert_no_warnings(KFold, n_splits=5)
    assert_no_warnings(GroupKFold, n_splits=5)
    assert_no_warnings(StratifiedKFold, n_splits=5)
    assert_no_warnings(TimeSeriesSplit, n_splits=5)
Example #17
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv, error_score='raise', iid=False)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups})
Example #18
Source File: test_wrapper.py From category_encoders with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_custom_cv(self):
    x = np.array([
        ['a', 'b', 'c'],
        ['a', 'b', 'c'],
        ['a', 'b', 'c'],
        ['a', 'b', 'c'],
        ['b', 'b', 'c'],
        ['b', 'b', 'c'],
        ['b', 'b', 'b'],
        ['b', 'b', 'b'],
        ['b', 'b', 'b'],
        ['b', 'b', 'b'],
        ['a', 'b', 'a'],
        ['a', 'b', 'a'],
    ])
    groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    y = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]

    gkfold = GroupKFold(n_splits=3)
    wrapper = NestedCVWrapper(encoders.TargetEncoder(), cv=gkfold)
    result_train, result_valid = wrapper.fit_transform(x, y, X_test=x,
                                                       groups=groups)

    # We would expect result_train != result_valid since result_train has been
    # generated using nested folds and result_valid is generated by fitting the
    # encoder on all of the x & y data
    self.assertFalse(np.allclose(result_train, result_valid))
Example #19
Source File: splitters.py From fklearn with Apache License 2.0 | 4 votes |
def out_of_time_and_space_splitter(train_data: pd.DataFrame,
                                   n_splits: int,
                                   in_time_limit: DateType,
                                   time_column: str,
                                   space_column: str,
                                   holdout_gap: timedelta = timedelta(days=0)) -> SplitterReturnType:
    """
    Makes K grouped train/test split folds for cross validation.
    The folds are made so that every ID is used at least once for
    evaluating and K-1 times for training.

    Also, for each fold, evaluation will always be out-of-ID and out-of-time.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame that will be split into K out-of-time and ID folds
        for cross validation.

    n_splits : int
        The number of folds K for the K-Fold cross validation strategy.

    in_time_limit : str or datetime.datetime
        A String representing the end time of the training data.
        It should be in the same format as the Date column in `train_data`.

    time_column : str
        The name of the Date column of `train_data`.

    space_column : str
        The name of the ID column of `train_data`.

    holdout_gap: datetime.timedelta
        Timedelta of the gap between the end of the training period and the
        start of the validation period.
    """

    # first generate folds by space, using LabelKFold
    # GroupKFold is not supposed to be randomized, that's why there's no random_state here
    train_data = train_data.reset_index()
    space_folds = GroupKFold(n_splits).split(train_data, groups=train_data[space_column])

    if isinstance(in_time_limit, str):
        in_time_limit = datetime.strptime(in_time_limit, "%Y-%m-%d")

    # train_indexes have time_column <= in_time_limit
    # test_indexes have time_column > in_time_limit
    folds = pipe(space_folds,
                 partial(starmap, lambda f_train, f_test: [train_data.iloc[f_train][time_column],
                                                           train_data.iloc[f_test][time_column]]),
                 partial(starmap, lambda train, test: (train[train <= in_time_limit],  # filter train time
                                                       test[test > (in_time_limit + holdout_gap)])),  # filter test time
                 list)

    logs = list(map(_log_time_fold, folds))  # get fold logs
    folds_indexes = _lc_fold_to_indexes(folds)  # final formatting with idx

    return folds_indexes, logs
Example #20
Source File: split.py From mlcomp with Apache License 2.0 | 4 votes |
def file_group_kfold(
        n_splits: int,
        output: str = None,
        get_group=None,
        sort=False,
        must_equal=(),
        **files
):
    assert len(files) > 0, 'at lease 1 type of files is required'

    fold = GroupKFold(n_splits)
    keys = sorted(list(files))

    def get_name(file):
        return splitext(basename(file))[0]

    if sort:
        for k, v in files.items():
            files[k] = sorted(files[k], key=get_name)

    file_first = sorted(files[keys[0]])
    assert len(file_first) > n_splits, \
        f'at least {n_splits} files is required. Provided: {len(file_first)}'

    for k, v in files.items():
        assert len(files[k]) == len(file_first), \
            f'count of files in key = {k} is not the same as in {keys[0]}'

    for k, v in files.items():
        if k not in must_equal:
            continue

        for i in range(len(file_first)):
            names_equal = get_name(v[i]) == get_name(file_first[i])
            assert names_equal, \
                f'file name in {k} does not equal to {keys[0]}, ' \
                f'file name = {basename(v[i])}'

    df = pd.DataFrame(files)[keys]
    df['fold'] = 0

    groups = [
        i if not get_group else get_group(file)
        for i, file in enumerate(file_first)
    ]

    for i, (train_index, test_index) in enumerate(
            fold.split(groups, groups=groups)):
        df.loc[test_index, 'fold'] = i

    df = df.sample(frac=1)

    if output:
        df.to_csv(output, index=False)
    return df
Example #21
Source File: cross_validation.py From nltools with MIT License | 4 votes |
def set_cv(Y=None, cv_dict=None, return_generator=True):
    """ Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        Y: (pd.DataFrame) Pandas Dataframe of Y labels
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return a cv generator instead of an instance;
            default True

    Returns:
        cv: a scikit-learn model-selection generator
    """

    if isinstance(cv_dict, dict):
        if cv_dict['type'] == 'kfolds':
            if 'subject_id' in cv_dict:
                # Hold out subjects within each fold
                from sklearn.model_selection import GroupKFold
                cv_inst = GroupKFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y,
                                   groups=cv_dict['subject_id'])
            elif 'stratified' in cv_dict:
                # Stratified K-Folds Continuous
                from nltools.cross_validation import KFoldStratified
                cv_inst = KFoldStratified(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
            else:
                # Normal K-Folds
                from sklearn.model_selection import KFold
                cv_inst = KFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
        elif cv_dict['type'] == 'loso':
            # Leave One Subject Out
            from sklearn.model_selection import LeaveOneGroupOut
            cv_inst = LeaveOneGroupOut()
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y,
                               groups=cv_dict['subject_id'])
        else:
            raise ValueError("""Make sure you specify a dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout},
            where n = number of folds, and subject = vector of subject ids
            that corresponds to self.Y""")
    else:
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    if return_generator:
        return cv
    else:
        return cv_inst
Example #22
Source File: skwrapper.py From Benchmarks with MIT License | 4 votes |
def split_data(df, ycol='0', classify=False, cv=5, bins=0, cutoffs=None,
               groupcols=None, ignore_categoricals=False, verbose=True):
    if groupcols is not None:
        groups = make_group_from_columns(df, groupcols)

    cat_cols = df.select_dtypes(['object']).columns
    if ignore_categoricals:
        df[cat_cols] = 0
    else:
        df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('category').cat.codes)

    if ycol.isdigit():
        ycol = df.columns[int(ycol)]

    y = df.loc[:, ycol].values
    x = df.drop(ycol, axis=1).values
    features = df.drop(ycol, axis=1).columns.tolist()

    if verbose:
        print('Target column: {}'.format(ycol))
        print(' count = {}, uniq = {}, mean = {:.3g}, std = {:.3g}'.format(
            len(y), len(np.unique(y)), np.mean(y), np.std(y)))
        print(' min = {:.3g}, q1 = {:.3g}, median = {:.3g}, q3 = {:.3g}, max = {:.3g}'.format(
            np.min(y), np.percentile(y, 25), np.median(y),
            np.percentile(y, 75), np.max(y)))

    if not classify:
        y_even = discretize(y, bins=5, verbose=False)
    elif bins >= 2:
        y = discretize(y, bins=bins, min_count=cv, verbose=verbose)
    elif cutoffs:
        y = discretize(y, cutoffs=cutoffs, min_count=cv, verbose=verbose)
    elif df[ycol].dtype in [np.dtype('float64'), np.dtype('float32')]:
        warnings.warn('Warning: classification target is float; consider using --bins or --cutoffs')
        y = y.astype(int)

    if classify:
        mask = np.ones(len(y), dtype=bool)
        unique, counts = np.unique(y, return_counts=True)
        for v, c in zip(unique, counts):
            if c < cv:
                mask[y == v] = False
        x = x[mask]
        y = y[mask]
        removed = len(mask) - np.sum(mask)
        if removed and verbose:
            print('Removed {} rows in small classes: count < {}'.format(removed, cv))

    if groupcols is None:
        if classify:
            y_even = y
        skf = StratifiedKFold(n_splits=cv, shuffle=True)
        splits = skf.split(x, y_even)
    else:
        if classify:
            groups = groups[mask]
        gkf = GroupKFold(n_splits=cv)
        splits = gkf.split(x, y, groups)

    if verbose:
        print()

    return x, y, list(splits), features
Example #23
Source File: sklearn_utils.py From ibeis with Apache License 2.0 | 4 votes |
def temp(samples):
    from sklearn import model_selection
    from ibeis.algo.verif import sklearn_utils

    def check_balance(idxs):
        # from sklearn.utils.fixes import bincount
        print('-------')
        for count, (test, train) in enumerate(idxs):
            print('split %r' % (count))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            n_group_isect = len(groups_train.intersection(groups_test))
            y_train_freq = bincount(y.take(train))
            y_test_freq = bincount(y.take(test))
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            print('n_group_isect = %r' % (n_group_isect,))
            print('y_test_ratio = %r' % (y_test_ratio,))
            print('y_train_ratio = %r' % (y_train_ratio,))
            print('balance_error = %r' % (balance_error,))

    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    n_splits = 3
    splitter = model_selection.GroupShuffleSplit(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.GroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.StratifiedKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = sklearn_utils.StratifiedGroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)
Example #24
Source File: test_ridge.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0)
        for i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)