Python sklearn.model_selection.check_cv() Examples
The following are 17 code examples of sklearn.model_selection.check_cv(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
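Before the project-specific examples, here is a minimal standalone sketch (not taken from any of the projects below) of the behaviour check_cv() provides; exact defaults and keyword handling vary slightly between scikit-learn versions:

import numpy as np
from sklearn.model_selection import check_cv

X = np.ones(6)
y = np.array([0, 1, 0, 1, 0, 1])

# An integer cv with a classification target resolves to StratifiedKFold;
# without classifier=True (or with a non-classification y) it resolves to KFold.
cv = check_cv(3, y=y, classifier=True)
print(type(cv).__name__)      # StratifiedKFold
print(cv.get_n_splits(X, y))  # 3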
Example #1
Source File: _search.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def check_cv(cv=3, y=None, classifier=False):
    """Dask aware version of ``sklearn.model_selection.check_cv``

    Same as the scikit-learn version, but works if ``y`` is a dask object.
    """
    if cv is None:
        cv = 3

    # If ``cv`` is not an integer, the scikit-learn implementation doesn't
    # touch the ``y`` object, so passing on a dask object is fine
    if not is_dask_collection(y) or not isinstance(cv, numbers.Integral):
        return model_selection.check_cv(cv, y, classifier=classifier)

    if classifier:
        # ``y`` is a dask object. We need to compute the target type
        target_type = delayed(type_of_target, pure=True)(y).compute()
        if target_type in ("binary", "multiclass"):
            return StratifiedKFold(cv)

    return KFold(cv)
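A hypothetical way to exercise this dask-aware variant, assuming dask is installed and the function above is in scope together with its module's imports (this is a sketch, not part of dask-ml's documented API):

import numpy as np
import dask.array as da

y_local = np.array([0, 1, 0, 1, 1, 0])
y_dask = da.from_array(y_local, chunks=3)

# Integer cv + dask target: the target type is computed via dask.delayed.
cv = check_cv(cv=3, y=y_dask, classifier=True)
print(type(cv).__name__)   # StratifiedKFold

# Non-integer cv: y is never touched, so the dask object passes straight
# through to sklearn.model_selection.check_cv.
cv = check_cv(cv=[(np.arange(4), np.arange(4, 6))], y=y_dask)
print(type(cv).__name__)   # _CVIterableWrapper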
Example #2
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cv_iterable_wrapper():
    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yields different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # numpy's assert_array_equal properly compares nested lists
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert not splits_are_equal, (
        "If the splits are randomized, "
        "successive calls to split should yield different results")
Example #3
Source File: stacking.py From civisml-extensions with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _check_cv(self, y):
    """Overrides base class _check_cv
    """
    # Squeezed target should be 1-dimensional
    if len(y.shape) != 1:
        raise NotImplementedError("StackedClassifier does not currently "
                                  "support multi-column classification "
                                  "problems. If your target is a one-hot "
                                  "encoded multi-class problem, please "
                                  "recast it to a single column.")
    return check_cv(self.cv, y=y, classifier=True)
Example #4
Source File: dataset.py From skorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __call__(self, dataset, y=None, groups=None):
    bad_y_error = ValueError(
        "Stratified CV requires explicitly passing a suitable y.")
    if (y is None) and self.stratified:
        raise bad_y_error

    cv = self.check_cv(y)
    if self.stratified and not self._is_stratified(cv):
        raise bad_y_error

    # pylint: disable=invalid-name
    len_dataset = get_len(dataset)
    if y is not None:
        len_y = get_len(y)
        if len_dataset != len_y:
            raise ValueError("Cannot perform a CV split if dataset and y "
                             "have different lengths.")

    args = (np.arange(len_dataset),)
    if self._is_stratified(cv):
        args = args + (to_numpy(y),)

    idx_train, idx_valid = next(iter(cv.split(*args, groups=groups)))
    dataset_train = torch.utils.data.Subset(dataset, idx_train)
    dataset_valid = torch.utils.data.Subset(dataset, idx_valid)
    return dataset_train, dataset_valid
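A possible usage sketch, assuming the __call__ above belongs to skorch's CVSplit class in skorch/dataset.py (renamed ValidSplit in later skorch releases); the class name and import path here are an assumption:

import numpy as np
import torch
from torch.utils.data import TensorDataset

# Assumption: the splitter class lives in skorch.dataset under one of these names.
try:
    from skorch.dataset import ValidSplit as Split
except ImportError:
    from skorch.dataset import CVSplit as Split

X = torch.arange(20, dtype=torch.float32).reshape(10, 2)
y = np.array([0, 1] * 5)
ds = TensorDataset(X, torch.as_tensor(y))

splitter = Split(cv=5, stratified=True)
ds_train, ds_valid = splitter(ds, y=y)
print(len(ds_train), len(ds_valid))   # 8 2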
Example #5
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yields different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # numpy's assert_array_equal properly compares nested lists
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert_false(splits_are_equal, "If the splits are randomized, "
                 "successive calls to split should yield different results")
Example #6
Source File: search.py From sigopt-sklearn with MIT License | 5 votes |
def our_check_cv(cv, X, y, classifier):
    ret = base_check_cv(cv, X, y, classifier)
    return len(ret), list(iter(ret))
Example #7
Source File: search.py From sigopt-sklearn with MIT License | 5 votes |
def our_check_cv(cv, X, y, classifier):
    ret = base_check_cv(cv, y, classifier)
    return ret.n_splits, list(ret.split(X, y=y))
Example #8
Source File: stacking.py From civisml-extensions with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _check_cv(self, y):
    """Overrides base class _check_cv
    """
    return check_cv(self.cv, y=y, classifier=False)
Example #9
Source File: base.py From carl with BSD 3-Clause "New" or "Revised" License | 5 votes |
def check_cv(cv=3, X=None, y=None, classifier=False):
    """Input checker utility for building a cross-validator.

    Parameters
    ----------
    * `cv` [integer, cross-validation generator or an iterable, default=`3`]:
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and `y` is either
        binary or multiclass, `StratifiedKFold` is used. In all other
        cases, `KFold` is used.

    * `y` [array-like, optional]:
        The target variable for supervised learning problems.

    * `classifier` [boolean, default=`False`]:
        Whether the task is a classification task, in which case
        stratified `KFold` will be used.

    Returns
    -------
    * `checked_cv` [a cross-validator instance]:
        The return value is a cross-validator which generates the
        train/test splits via the `split` method.

    Note
    ----
    This method is backported from scikit-learn 0.18.
    """
    return sklearn_check_cv(cv, y=y, classifier=classifier)
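To complement the docstring's third input kind, a small illustrative sketch (plain scikit-learn, independent of carl) of how an iterable yielding train/test splits is wrapped into a cross-validator whose split() method replays exactly those splits:

import numpy as np
from sklearn.model_selection import check_cv

X = np.ones(6)
custom_splits = [(np.array([0, 1, 2, 3]), np.array([4, 5])),
                 (np.array([2, 3, 4, 5]), np.array([0, 1]))]

# The list of (train, test) index pairs is wrapped, not interpreted.
cv = check_cv(custom_splits)
for train_idx, test_idx in cv.split(X):
    print(train_idx, test_idx)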
Example #10
Source File: fixes.py From skutil with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _set_cv(cv, X, y, classifier):
    """This method returns either a
    `sklearn.cross_validation._PartitionIterator` or
    `sklearn.model_selection.BaseCrossValidator` depending on whether
    sklearn-0.17 or sklearn-0.18 is being used.

    Parameters
    ----------
    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, will be converted
        into the appropriate class of crossvalidator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier

    Returns
    -------
    `_PartitionIterator` or `BaseCrossValidator`
    """
    return check_cv(cv, X, y, classifier) if not SK18 else check_cv(cv, y, classifier)
Example #11
Source File: dataset.py From skorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def check_cv(self, y):
    """Resolve which cross validation strategy is used."""
    y_arr = None
    if self.stratified:
        # Try to convert y to numpy for sklearn's check_cv; if conversion
        # doesn't work, still try.
        try:
            y_arr = to_numpy(y)
        except (AttributeError, TypeError):
            y_arr = y

    if self._is_float(self.cv):
        return self._check_cv_float()
    return self._check_cv_non_float(y_arr)
Example #12
Source File: dataset.py From skorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _check_cv_non_float(self, y):
    return check_cv(
        self.cv,
        y=y,
        classifier=self.stratified,
    )
Example #13
Source File: cross_validation.py From Pyspatialml with GNU General Public License v3.0 | 5 votes |
def fit(self, X, y=None, groups=None, **fit_params):
    """Run fit method with all sets of parameters

    Args
    ----
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning

    groups : array-like, shape = [n_samples], optional
        Training vector groups for cross-validation

    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of the estimator
    """
    # check estimator and cv methods are valid
    self.cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

    # check for binary response
    if len(np.unique(y)) > 2:
        raise ValueError('Only a binary response vector is currently supported')

    # check that scoring metric has been specified
    if self.scoring is None:
        raise ValueError('No score function is defined')

    # perform cross validation prediction
    self.y_pred_ = cross_val_predict(
        estimator=self.estimator, X=X, y=y, groups=groups, cv=self.cv,
        method='predict_proba', n_jobs=self.n_jobs, **fit_params)
    self.y_true = y

    # add fold id to the predictions
    self.test_idx_ = [indexes[1] for indexes in self.cv.split(X, y, groups)]
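The pattern used in fit() above, combining check_cv with cross_val_predict, can be reduced to a standalone sketch in plain scikit-learn (this is not Pyspatialml's class, just the same idea on a synthetic dataset):

import numpy as np
from sklearn.base import is_classifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import check_cv, cross_val_predict

X, y = make_classification(n_samples=60, n_features=5, random_state=0)
estimator = LogisticRegression()

# Resolve cv, collect out-of-fold probabilities, and keep each fold's test indices.
cv = check_cv(3, y=y, classifier=is_classifier(estimator))
y_pred = cross_val_predict(estimator, X, y, cv=cv, method='predict_proba')
test_idx = [test for _, test in cv.split(X, y)]

print(y_pred.shape)   # (60, 2)
print(len(test_idx))  # 3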
Example #14
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_check_cv_default_warn():
    # Test that warnings are raised. Will be removed in 0.22
    assert_warns_message(FutureWarning, CV_WARNING, check_cv)
    assert_warns_message(FutureWarning, CV_WARNING, check_cv, None)
    assert_no_warnings(check_cv, cv=5)
Example #15
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_check_cv():
    X = np.ones(9)
    cv = check_cv(3, classifier=False)
    # Use numpy.testing.assert_equal which recursively compares
    # lists of lists
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = check_cv(3, y_binary, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_binary)),
                            list(cv.split(X, y_binary)))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = check_cv(3, y_multiclass, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_multiclass)),
                            list(cv.split(X, y_multiclass)))
    # also works with 2d multiclass
    y_multiclass_2d = y_multiclass.reshape(-1, 1)
    cv = check_cv(3, y_multiclass_2d, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_multiclass_2d)),
                            list(cv.split(X, y_multiclass_2d)))

    assert not np.all(
        next(StratifiedKFold(3).split(X, y_multiclass_2d))[0] ==
        next(KFold(3).split(X, y_multiclass_2d))[0])

    X = np.ones(5)
    y_multilabel = np.array([[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1],
                             [1, 1, 0, 1], [0, 0, 1, 0]])
    cv = check_cv(3, y_multilabel, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = check_cv(3, y_multioutput, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    assert_raises(ValueError, check_cv, cv="lolo")
Example #16
Source File: split.py From nyaggle with MIT License | 5 votes |
def check_cv(cv: Union[int, Iterable, BaseCrossValidator] = 5,
             y: Optional[Union[pd.Series, np.ndarray]] = None,
             stratified: bool = False,
             random_state: int = 0):
    if cv is None:
        cv = 5

    if isinstance(cv, numbers.Integral):
        if stratified and (y is not None) and (type_of_target(y) in ('binary', 'multiclass')):
            return StratifiedKFold(cv, shuffle=True, random_state=random_state)
        else:
            return KFold(cv, shuffle=True, random_state=random_state)

    return model_selection.check_cv(cv, y, stratified)
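A possible call pattern for this wrapper, assuming the function above is in scope together with its module's imports; the shuffling and fixed random_state are what distinguish it from scikit-learn's own check_cv:

import numpy as np

y = np.array([0, 1, 0, 1, 1, 0, 1, 0])

cv = check_cv(4, y=y, stratified=True, random_state=42)
print(type(cv).__name__)   # StratifiedKFold (shuffled, seeded)

cv = check_cv(4, y=y, stratified=False)
print(type(cv).__name__)   # KFold (shuffled, seeded)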
Example #17
Source File: test_split.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_check_cv():
    X = np.ones(9)
    cv = check_cv(3, classifier=False)
    # Use numpy.testing.assert_equal which recursively compares
    # lists of lists
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = check_cv(3, y_binary, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_binary)),
                            list(cv.split(X, y_binary)))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = check_cv(3, y_multiclass, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_multiclass)),
                            list(cv.split(X, y_multiclass)))
    # also works with 2d multiclass
    y_multiclass_2d = y_multiclass.reshape(-1, 1)
    cv = check_cv(3, y_multiclass_2d, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_multiclass_2d)),
                            list(cv.split(X, y_multiclass_2d)))

    assert_false(np.all(
        next(StratifiedKFold(3).split(X, y_multiclass_2d))[0] ==
        next(KFold(3).split(X, y_multiclass_2d))[0]))

    X = np.ones(5)
    y_multilabel = np.array([[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1],
                             [1, 1, 0, 1], [0, 0, 1, 0]])
    cv = check_cv(3, y_multilabel, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = check_cv(3, y_multioutput, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    # Check if the old style classes are wrapped to have a split method
    X = np.ones(9)
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv1 = check_cv(3, y_multiclass, classifier=True)

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv2 = check_cv(OldSKF(y_multiclass, n_folds=3))
    np.testing.assert_equal(list(cv1.split(X, y_multiclass)),
                            list(cv2.split()))

    assert_raises(ValueError, check_cv, cv="lolo")