Python sklearn.model_selection.RepeatedKFold() Examples
The following are 14 code examples of sklearn.model_selection.RepeatedKFold().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
sklearn.model_selection, or try the search function.
![](https://www.programcreek.com/common/static/images/search.png)
Example #1
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    """Smoke test: every splitter accepts 1d and 2d ``y``; multi-label ``y``
    either works or raises the documented ValueError."""
    n_samples = 30
    rs = np.random.RandomState(1)
    X = rs.randint(0, 3, size=(n_samples, 2))
    y = rs.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rs.randint(0, 2, size=(n_samples, 3))
    groups = rs.randint(0, 3, size=(n_samples,))
    all_splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                     RepeatedKFold(), RepeatedStratifiedKFold(),
                     ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                     GroupShuffleSplit(), LeaveOneGroupOut(),
                     LeavePGroupsOut(n_groups=2), GroupKFold(),
                     TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for cv in all_splitters:
        # 1d and 2d targets must always be accepted
        list(cv.split(X, y, groups))
        list(cv.split(X, y_2d, groups))
        try:
            list(cv.split(X, y_multilabel, groups))
        except ValueError as err:
            # Stratified splitters reject multi-label targets with this message
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(err)
Example #2
Source File: utils.py From autogluon with Apache License 2.0 | 6 votes |
def generate_kfold(X, y=None, n_splits=5, random_state=0, stratified=False, n_repeats=1):
    """Build (possibly repeated, possibly stratified) K-fold index pairs.

    Returns a list of ``[train_index, test_index]`` pairs for ``X``.
    Stratification is used only when both ``stratified`` is true and ``y``
    is given; repetition kicks in when ``n_repeats > 1``.
    """
    if stratified and y is not None:
        if n_repeats > 1:
            splitter = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                               random_state=random_state)
        else:
            splitter = StratifiedKFold(n_splits=n_splits, shuffle=True,
                                       random_state=random_state)
        splitter.get_n_splits(X, y)
        split_args = (X, y)
    else:
        if n_repeats > 1:
            splitter = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                     random_state=random_state)
        else:
            splitter = KFold(n_splits=n_splits, shuffle=True,
                             random_state=random_state)
        splitter.get_n_splits(X)
        split_args = (X,)
    return [[train_index, test_index]
            for train_index, test_index in splitter.split(*split_args)]
Example #3
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_2d_y():
    """Smoke test for 2d ``y`` and multi-label targets across all splitters."""
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            # only stratified splitters raise; verify the exact message prefix
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #4
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_repeated_cv_value_errors():
    """``n_repeats`` that is non-integer or <= 0 must raise ValueError."""
    for cv in (RepeatedKFold, RepeatedStratifiedKFold):
        for bad_n_repeats in (0, 1.5):
            assert_raises(ValueError, cv, n_repeats=bad_n_repeats)
Example #5
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_repeated_kfold_determinstic_split():
    """With a fixed random_state, split() yields the same folds on every call."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    random_state = 258173307
    rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
    expected_folds = [
        ([2, 4], [0, 1, 3]),
        ([0, 1, 3], [2, 4]),
        ([0, 1], [2, 3, 4]),
        ([2, 3, 4], [0, 1]),
    ]
    # the generator must be deterministic across repeated invocations
    for _ in range(3):
        splits = rkf.split(X)
        for exp_train, exp_test in expected_folds:
            train, test = next(splits)
            assert_array_equal(train, exp_train)
            assert_array_equal(test, exp_test)
        assert_raises(StopIteration, next, splits)
Example #6
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_get_n_splits_for_repeated_kfold():
    """get_n_splits() must report n_splits * n_repeats total iterations."""
    n_splits, n_repeats = 3, 4
    rkf = RepeatedKFold(n_splits, n_repeats)
    assert_equal(n_splits * n_repeats, rkf.get_n_splits())
Example #7
Source File: rbf.py From pwtools with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, points, values, rbf_kwds=None, cv_kwds=None):
    """
    Parameters
    ----------
    points, values : see :class:`Rbf`
    rbf_kwds : dict, optional
        for ``Rbf(points, values, **rbf_kwds)``; default is an empty dict
    cv_kwds : dict, optional
        cross-validation parameters: `ns` = `n_splits`, `nr` = `n_repeats`
        (see sklearn.model_selection.RepeatedKFold); default is
        ``dict(ns=5, nr=1)``
    """
    # Use None sentinels instead of mutable default arguments: a shared
    # default dict would be aliased across every instance, so mutating
    # one instance's kwds would silently affect all others.
    self.points = points
    self.values = values
    self.rbf_kwds = {} if rbf_kwds is None else rbf_kwds
    self.cv_kwds = dict(ns=5, nr=1) if cv_kwds is None else cv_kwds
Example #8
Source File: rbf.py From pwtools with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cv(self, params):
    """K-fold repeated cross-validation.

    Split ``(self.points, self.values)`` randomly into K folds
    (K = ``ns`` in ``self.cv_kwds``) along axis 0, using each fold once as
    the test set and the rest as the training set. With `ns=5` that is 5
    fits, each on 4/5 of the data. Optionally repeat ``nr`` times with
    fresh random splits, giving `nr` * `ns` fit errors in total. Each fit
    builds an Rbf interpolator with ``self.rbf_kwds`` and records the
    scalar fit error from :meth:`Rbf.fit_error`.

    Parameters
    ----------
    params : seq length 1 or 2
        | params[0] = p
        | params[1] = r (optional)

    Returns
    -------
    errs : 1d array (nr*ns,)
        direct fit error from each fold
    """
    n_folds = self.cv_kwds['ns']
    n_reps = self.cv_kwds['nr']
    errs = np.empty((n_folds * n_reps,), dtype=float)
    splitter = RepeatedKFold(n_splits=n_folds, n_repeats=n_reps)
    # one fit error per (repeat, fold) combination
    for k, (train_idx, test_idx) in enumerate(splitter.split(self.points)):
        rbfi = self._get_rbfi(params,
                              self.points[train_idx, ...],
                              self.values[train_idx, ...])
        errs[k] = rbfi.fit_error(self.points[test_idx, ...],
                                 self.values[test_idx, ...])
    return errs
Example #9
Source File: hyper_opt.py From MAST-ML with MIT License | 5 votes |
def fit(self, X, y, savepath=None, refit=True, iid=True):
    """Run a grid search over ``self.estimator``'s hyperparameters.

    Parameters
    ----------
    X, y : array-like
        training features and targets passed to ``GridSearchCV.fit``
    savepath : str, optional
        output directory for results; defaults to the current directory
    refit, iid : bool
        forwarded to ``GridSearchCV``

    Returns
    -------
    best_estimator : the refit best estimator found by the search
    """
    rst = dict()
    param_dict = self._get_grid_param_dict()
    if savepath is None:
        savepath = os.getcwd()
    estimator_name = self._estimator_name
    param_dict = self._search_space_generator(param_dict)
    if self.cv is None:
        # default CV strategy when the caller supplied none
        self.cv = ms.RepeatedKFold()
    model = GridSearchCV(self.estimator, param_dict, scoring=self.scoring,
                         cv=self.cv, refit=refit, iid=iid,
                         n_jobs=self.n_jobs, verbose=2)
    try:
        rst[estimator_name] = model.fit(X, y)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the process unkillable.
        log.error('Hyperparameter optimization failed, likely due to inappropriate domain of values to optimize'
                  ' one or more parameters over. Please check your input file and the sklearn docs for the mode'
                  ' you are optimizing for the domain of correct values')
        exit()
    best_estimator = rst[estimator_name].best_estimator_
    self._save_output(savepath, rst)
    return best_estimator
Example #10
Source File: hyper_opt.py From MAST-ML with MIT License | 5 votes |
def fit(self, X, y, savepath=None, refit=True):
    """Run a randomized search over ``self.estimator``'s hyperparameters.

    Parameters
    ----------
    X, y : array-like
        training features and targets passed to ``RandomizedSearchCV.fit``
    savepath : str, optional
        output directory for results; defaults to the current directory
    refit : bool
        forwarded to ``RandomizedSearchCV``

    Returns
    -------
    best_estimator : the refit best estimator found by the search
    """
    rst = dict()
    param_dict = self._get_randomized_param_dict()
    if savepath is None:
        savepath = os.getcwd()
    estimator_name = self._estimator_name
    if self.cv is None:
        # default CV strategy when the caller supplied none
        self.cv = ms.RepeatedKFold()
    model = RandomizedSearchCV(self.estimator, param_dict, n_iter=self.n_iter,
                               scoring=self.scoring, cv=self.cv, refit=refit,
                               n_jobs=self.n_jobs, verbose=2)
    try:
        rst[estimator_name] = model.fit(X, y)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the process unkillable.
        log.error('Hyperparameter optimization failed, likely due to inappropriate domain of values to optimize'
                  ' one or more parameters over. Please check your input file and the sklearn docs for the mode'
                  ' you are optimizing for the domain of correct values')
        exit()
    best_estimator = rst[estimator_name].best_estimator_
    self._save_output(savepath, rst)
    return best_estimator
Example #11
Source File: hyper_opt.py From MAST-ML with MIT License | 5 votes |
def fit(self, X, y, savepath=None, refit=True):
    """Run a Bayesian search over ``self.estimator``'s hyperparameters.

    Parameters
    ----------
    X, y : array-like
        training features and targets passed to ``BayesSearchCV.fit``
    savepath : str, optional
        output directory for results; defaults to the current directory
    refit : bool
        forwarded to ``BayesSearchCV``

    Returns
    -------
    best_estimator : the refit best estimator found by the search
    """
    rst = dict()
    param_dict = self._get_bayesian_param_dict()
    if savepath is None:
        savepath = os.getcwd()
    estimator_name = self._estimator_name
    if self.cv is None:
        # default CV strategy when the caller supplied none
        self.cv = ms.RepeatedKFold()
    model = BayesSearchCV(estimator=self.estimator, search_spaces=param_dict,
                          n_iter=self.n_iter, scoring=self.scoring,
                          cv=self.cv, refit=refit, n_jobs=self.n_jobs,
                          verbose=2)
    try:
        rst[estimator_name] = model.fit(X, y)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and make the process unkillable.
        log.error('Hyperparameter optimization failed, likely due to inappropriate domain of values to optimize'
                  ' one or more parameters over. Please check your input file and the sklearn docs for the mode'
                  ' you are optimizing for the domain of correct values')
        exit()
    best_estimator = rst[estimator_name].best_estimator_
    self._save_output(savepath, rst)
    return best_estimator
Example #12
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_repeated_cv_value_errors():
    """Reject ``n_repeats`` values that are zero or not integers."""
    for cv_class in (RepeatedKFold, RepeatedStratifiedKFold):
        assert_raises(ValueError, cv_class, n_repeats=0)
        assert_raises(ValueError, cv_class, n_repeats=1.5)
Example #13
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_repeated_kfold_determinstic_split():
    """A fixed random_state must produce identical folds on repeated split() calls."""
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    random_state = 258173307
    rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
    # (train, test) pairs expected from each of the 2 repeats x 2 folds
    expected = [
        ([2, 4], [0, 1, 3]),
        ([0, 1, 3], [2, 4]),
        ([0, 1], [2, 3, 4]),
        ([2, 3, 4], [0, 1]),
    ]
    for _ in range(3):
        splits = rkf.split(X)
        for want_train, want_test in expected:
            train, test = next(splits)
            assert_array_equal(train, want_train)
            assert_array_equal(test, want_test)
        # generator must be exhausted after n_splits * n_repeats folds
        assert_raises(StopIteration, next, splits)
Example #14
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_get_n_splits_for_repeated_kfold():
    """Total split count reported must equal n_splits * n_repeats."""
    n_splits = 3
    n_repeats = 4
    rkf = RepeatedKFold(n_splits, n_repeats)
    assert_equal(n_splits * n_repeats, rkf.get_n_splits())