Source File:    From Mastering-Elasticsearch-7.0 with MIT License 7 votes vote down vote up
def test_empty_cv_iterator_error():
    """Fitting a search whose CV iterator is already exhausted must raise.

    Uses the module-level ``X`` and ``y``.
    """
    # create cv
    cv = KFold(n_splits=3).split(X)

    # Drain the generator so that the search receives an empty CV iterator;
    # this should cause the expected ValueError
    for _ in cv:
        pass
    # cv is empty now

    train_size = 100
    ridge = RandomizedSearchCV(Ridge(), {'alpha': [1e-3, 1e-2, 1e-1]},
                               cv=cv, n_jobs=-1)

    # assert that this raises an error
    with pytest.raises(ValueError,
                       match='No fits were performed. '
                             'Was the CV iterator empty\\? '
                             'Were there no candidates\\?'):
        ridge.fit(X[:train_size], y[:train_size])
Source File:    From nyaggle with MIT License 6 votes vote down vote up
def test_skip():
    """``split.Skip(2, KFold(5))`` must yield only the last three folds."""
    df = pd.DataFrame()
    df['id'] = np.arange(10)

    kf = split.Skip(2, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 3

    # The first two folds (test indices 0-3) are skipped.
    for expected_test in ([4, 5], [6, 7], [8, 9]):
        train_index, test_index = next(folds)
        assert np.array_equal(test_index, np.array(expected_test))

    # No folds may remain after the three expected ones.
    with pytest.raises(StopIteration):
        next(folds)
Source File:    From Kaggler with MIT License 6 votes vote down vote up
def test_FrequencyEncoder(generate_data):
    """FrequencyEncoder must keep one output column per categorical input.

    The check is repeated without CV, with CV through ``fit_transform()``,
    and with CV through separate ``fit()`` / ``transform()`` calls.
    """
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    # Columns with fewer than 100 distinct values are treated as categorical.
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    te = FrequencyEncoder()
    X_cat = te.fit_transform(df[cat_cols])
    print('Without CV:\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = FrequencyEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    te = FrequencyEncoder(cv=cv)
    te.fit(df[cat_cols])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)
Source File:    From Kaggler with MIT License 6 votes vote down vote up
def test_TargetEncoder(generate_data):
    """TargetEncoder must keep one output column per categorical input.

    The check is repeated without CV, with CV through ``fit_transform()``,
    and with CV through separate ``fit()`` / ``transform()`` calls.
    """
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    # Columns with fewer than 100 distinct values are treated as categorical.
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    te = TargetEncoder()
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = TargetEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    te = TargetEncoder(cv=cv)
    te.fit(df[cat_cols], df[TARGET_COL])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_learning_curve_with_boolean_indices():
    """Check learning_curve sizes and mean scores with a KFold splitter."""
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    cv = KFold(n_splits=3)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    # Scores are averaged over the CV folds (axis 1) before comparison.
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))

# 0.23. warning about tol not having its correct default value. 
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_deprecated_grid_search_iid():
    """The ``iid`` deprecation warning must fire only for unequal test folds."""
    depr_message = ("The default of the `iid` parameter will change from True "
                    "to False in version 0.22")
    X, y = make_blobs(n_samples=54, random_state=0, centers=2)
    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=3)
    # no warning with equally sized test sets
    assert_no_warnings(grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=5)
    # warning because 54 % 5 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=2)
    # warning because stratification into two classes and 27 % 2 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=KFold(2))
    # no warning because no stratification and 54 % 2 == 0
    assert_no_warnings(grid.fit, X, y)
Source File:    From ibllib with MIT License 6 votes vote down vote up
def split_trials(trial_ids, n_splits=5, rng_seed=0):
    """
    Assign each trial to testing or training fold

    :param trial_ids:
    :type trial_ids: array-like
    :param n_splits: one split used for testing; remaining splits used for training
    :type n_splits: int
    :param rng_seed: set random state for shuffling trials
    :type rng_seed: int
    :return: list of dicts of indices with keys `train` and `test`
    """
    from sklearn.model_selection import KFold
    # Trials are shuffled only when a seed is given; rng_seed=None keeps the
    # original trial order.
    shuffle = rng_seed is not None
    kf = KFold(n_splits=n_splits, random_state=rng_seed, shuffle=shuffle)
    return [{'train': train, 'test': test}
            for train, test in kf.split(trial_ids)]
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    kfold = KFold(5)
    scores_indices = cross_val_score(svm, X, y, cv=kfold)
    kfold = KFold(5)
    cv_masks = []
    for train, test in kfold.split(X, y):
        # Use the builtin ``bool``: the ``np.bool`` alias was deprecated and
        # later removed from NumPy.
        mask_train = np.zeros(len(y), dtype=bool)
        mask_test = np.zeros(len(y), dtype=bool)
        mask_train[train] = 1
        mask_test[test] = 1
        # NOTE(review): the index arrays, not the masks built above, are what
        # gets passed on; kept as-is to preserve behavior - confirm intent.
        cv_masks.append((train, test))
    scores_masks = cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_cv_iterable_wrapper():
    """check_cv must wrap a split generator so it can be re-split repeatably.

    Uses the module-level ``X`` and ``y``.
    """
    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yields different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # numpy's assert_array_equal properly compares nested lists
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    # The shuffled splits must differ from the unshuffled ones.
    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert not splits_are_equal, (
        "If the splits are randomized, "
        "successive calls to split should yield different results")
Source File:    From heamy with MIT License 6 votes vote down vote up
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross validation iterator.

    Parameters
    ----------
    k : int, default 5
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 33

    Yields
    -------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    # A seed only has meaning when shuffling; recent scikit-learn releases
    # reject a non-None random_state together with shuffle=False.
    random_state = seed if shuffle else None
    splitter_cls = StratifiedKFold if stratify else KFold
    kf = splitter_cls(n_splits=k, random_state=random_state, shuffle=shuffle)

    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_shuffle_kfold():
    """Check that shuffled KFold train sets never fully overlap and that the
    shuffled test folds still cover every sample."""
    plain_kf = KFold(3)
    shuffled_kf_0 = KFold(3, shuffle=True, random_state=0)
    shuffled_kf_1 = KFold(3, shuffle=True, random_state=1)

    X = np.ones(300)

    covered = np.zeros(300)
    fold_triples = zip(plain_kf.split(X),
                       shuffled_kf_0.split(X),
                       shuffled_kf_1.split(X))
    for (tr1, te1), (tr2, te2), (tr3, te3) in fold_triples:
        for left, right in combinations((tr1, tr2, tr3), 2):
            # No pair of train sets may overlap completely
            shared = np.intersect1d(left, right)
            assert_not_equal(len(shared), len(tr1))

        # Mark the test indices produced by the first shuffled splitter
        covered[te2] = 1

    # Every index must have appeared in one of those test folds
    assert_equal(sum(covered), 300)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_kfold_no_shuffle():
    """Manually verify on toy data that KFold keeps the sample ordering."""
    X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]

    # Four samples: two folds of two consecutive samples each.
    four_sample_splits = KFold(2).split(X2[:-1])
    for want_test, want_train in (([0, 1], [2, 3]),
                                  ([2, 3], [0, 1])):
        train, test = next(four_sample_splits)
        assert_array_equal(test, want_test)
        assert_array_equal(train, want_train)

    # Five samples: the first fold receives the extra sample.
    five_sample_splits = KFold(2).split(X2)
    for want_test, want_train in (([0, 1, 2], [3, 4]),
                                  ([3, 4], [0, 1, 2])):
        train, test = next(five_sample_splits)
        assert_array_equal(test, want_test)
        assert_array_equal(train, want_train)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        # Multilabel targets may be rejected; if so, the error message must
        # name the supported target types.
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Source File:    From nyaggle with MIT License 6 votes vote down vote up
def test_nth():
    """``split.Nth(n, KFold(5))`` must yield exactly the n-th fold (1-based)."""
    df = pd.DataFrame()
    df['id'] = np.arange(10)

    kf = split.Nth(3, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 1

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([4, 5]))

    # Only a single fold is produced.
    with pytest.raises(StopIteration):
        next(folds)

    kf = split.Nth(1, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 1

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([0, 1]))

    with pytest.raises(StopIteration):
        next(folds)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def _test_ridge_cv(filter_):
    """Fit RidgeCV with its default CV and with KFold(5), checking shapes.

    ``filter_`` is applied to the design matrix before fitting.
    """
    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)

    cv = KFold(5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_X_as_list():
    # Pass X as list in GridSearchCV
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    # The classifier itself verifies that it receives a plain list.
    clf = CheckingClassifier(check_X=lambda x: isinstance(x, list))
    cv = KFold(n_splits=3)
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
    grid_search.fit(X.tolist(), y).score(X, y)
    assert hasattr(grid_search, "cv_results_")
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_cross_val_predict_class_subset():
    """Compare cross_val_predict with a naive per-fold loop for 3 and 4 folds
    and for unordered labels."""

    X = np.arange(200).reshape(100, 2)
    y = np.array([x // 10 for x in range(100)])
    classes = 10

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)

    le = LabelEncoder()

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        # Test with n_splits=3
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)

        # Runs a naive loop (should be same as cross_val_predict):
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test with n_splits=4
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold4)
        expected_predictions = get_expected_predictions(X, y, kfold4, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels
        y = shuffle(np.repeat(range(10), 10), random_state=0)
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)
        y = le.fit_transform(y)
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by Kfold.
    # Uses the module-level X.
    folds = np.full(10, -1.)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        # Record the KFold split so it can be compared with PredefinedSplit.
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps = PredefinedSplit(folds)
    # n_splits is simply the no of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    ps_train, ps_test = zip(*ps.split())
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
Source File:    From deep_pipe with MIT License 5 votes vote down vote up
def kfold_split(subj_ids, n_splits, groups=None, **kwargs):
    """Perform kfold (or group kfold) shuffled split
    Returns indices: [[train_0, val_0], ..., [train_k, val_k]]

    ``KFold`` is used when ``groups`` is None, ``ShuffleGroupKFold``
    otherwise; extra ``kwargs`` are forwarded to the splitter constructor.
    """
    kfoldClass = KFold if groups is None else ShuffleGroupKFold
    kfold = kfoldClass(n_splits, shuffle=True, **kwargs)
    return list(kfold.split(X=subj_ids, groups=groups))
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_shuffle_kfold_stratifiedkfold_reproducibility():
    """Shuffled splits must repeat for an int seed and vary for a RandomState
    instance."""
    X = np.ones(15)  # Divisible by 3
    y = [0] * 7 + [1] * 8
    X2 = np.ones(16)  # Not divisible by 3
    y2 = [0] * 8 + [1] * 8

    # Check that when the shuffle is True, multiple split calls produce the
    # same split when random_state is int
    kf = KFold(3, shuffle=True, random_state=0)
    skf = StratifiedKFold(3, shuffle=True, random_state=0)

    for cv in (kf, skf):
        np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y)))
        np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2)))

    # Check that when the shuffle is True, multiple split calls often
    # (not always) produce different splits when random_state is
    # RandomState instance or None
    kf = KFold(3, shuffle=True, random_state=np.random.RandomState(0))
    skf = StratifiedKFold(3, shuffle=True,
                          random_state=np.random.RandomState(0))

    for cv in (kf, skf):
        for data in zip((X, X2), (y, y2)):
            # Test if the two splits are different cv
            for (_, test_a), (_, test_b) in zip(cv.split(*data),
                                                cv.split(*data)):
                # cv.split(...) returns an array of tuples, each tuple
                # consisting of an array with train indices and test indices
                # Ensure that the splits for data are not same
                # when random state is not set
                with pytest.raises(AssertionError):
                    np.testing.assert_array_equal(test_a, test_b)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def check_cross_val_predict_binary(est, X, y, method):
    """Helper for tests of cross_val_predict with binary classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    if y.ndim == 1:
        # decision_function gives one value per sample, the other methods two.
        exp_shape = (len(X),) if method == 'decision_function' else (len(X), 2)
    else:
        exp_shape = y.shape
    expected_predictions = np.zeros(exp_shape)
    for train, test in cv.split(X, y):
        est = clone(est).fit(X[train], y[train])
        expected_predictions[test] = getattr(est, method)(X[test])

    # Check actual outputs for several representations of y
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions)
Source File:    From xam with MIT License 5 votes vote down vote up
def __init__(self, models, meta_model, cv=model_selection.KFold(n_splits=3),
                 metric=metrics.mean_squared_error, use_base_features=False):
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_stratifiedkfold_balance():
    """StratifiedKFold test folds must differ in size by at most one sample.

    Checked with shuffling both on and off, for growing prefixes of the data.
    """
    X = np.ones(17)
    y = [0] * 3 + [1] * 14

    for shuffle in (True, False):
        cv = StratifiedKFold(3, shuffle=shuffle)
        for n_used in range(11, 17):
            fold_sizes = [len(test_idx)
                          for _, test_idx in cv.split(X[:n_used], y[:n_used])]

            spread = np.max(fold_sizes) - np.min(fold_sizes)
            assert spread <= 1
            # Together the test folds must cover each used sample once
            assert_equal(np.sum(fold_sizes), n_used)
Source File:    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_kfold_indices():
    """KFold test folds must cover every index; get_n_splits reports folds."""
    # 18 samples split evenly into 3 folds, 17 samples do not; coverage must
    # hold in both cases.
    X1, X2 = np.ones(18), np.ones(17)
    for data in (X1, X2):
        check_cv_coverage(KFold(3), data, y=None, groups=None,
                          expected_n_splits=3)

    # Check if get_n_splits returns the number of folds
    assert_equal(5, KFold(5).get_n_splits(X2))
Source File:    From xam with MIT License 5 votes vote down vote up
def __init__(self, models, meta_model, cv=model_selection.KFold(n_splits=3),
                 metric=metrics.mean_squared_error, use_base_features=False,
Example #26
Source File:    From nyaggle with MIT License 5 votes vote down vote up
def test_experiment_manual_cv_int(tmpdir_name):
    """run_experiment with ``cv=KFold(2)`` must produce 2 models and 3 metrics."""
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, None, tmpdir_name, cv=KFold(2))
    assert len(result.models) == 2
    # One metric per fold plus one additional entry.
    assert len(result.metrics) == 2 + 1
Source File:    From nyaggle with MIT License 5 votes vote down vote up
def test_experiment_manual_cv_kfold(tmpdir_name):
    """run_experiment with ``cv=KFold(4)`` must produce 4 models and 5 metrics."""
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, None, tmpdir_name, cv=KFold(4))
    assert len(result.models) == 4
    # One metric per fold plus one additional entry.
    assert len(result.metrics) == 4 + 1
Source File:    From nyaggle with MIT License 5 votes vote down vote up
def _test_target_encoder(X_train, y_train, X_test, **kw):
    """Compare the CV-aware TargetEncoder with a manual per-fold reference.

    The reference is built with ``ce.TargetEncoder``: fitted per training
    fold for the train output, and on all training data for the test output.
    ``kw`` is forwarded to both encoders.
    """
    cv = KFold(n_splits=2, random_state=42, shuffle=True)

    te = TargetEncoder(cv.split(X_train), **kw)

    ret_train = te.fit_transform(X_train, y_train)
    ret_test = te.transform(X_test)

    ret_train2 = copy.deepcopy(X_train)
    ret_test2 = copy.deepcopy(X_test)

    # Out-of-fold reference: fit on the training part, encode the held-out part.
    for train_idx, test_idx in cv.split(X_train):
        te2 = ce.TargetEncoder(**kw)

        if isinstance(X_train, pd.DataFrame):
            te2.fit(X_train.loc[train_idx, :], y_train.loc[train_idx])
            ret_train2.loc[test_idx] = te2.transform(ret_train2.loc[test_idx])
        else:
            te2.fit(X_train[train_idx, :], y_train[train_idx])
            ret_train2[test_idx] = te2.transform(ret_train2[test_idx])

    ret_train2 = ret_train2.astype(float)

    if isinstance(ret_train, pd.DataFrame):
        assert_frame_equal(ret_train, ret_train2)
    else:
        npt.assert_array_equal(ret_train, ret_train2)

    # Test-set reference: an encoder fitted on the full training data.
    te2 = ce.TargetEncoder(**kw)
    te2.fit(X_train, y_train)

    ret_test2 = te2.transform(ret_test2)

    if isinstance(ret_train, pd.DataFrame):
        assert_frame_equal(ret_test, ret_test2)
    else:
        npt.assert_array_equal(ret_test, ret_test2)
Source File:    From models with MIT License 5 votes vote down vote up
def split_dataset(nb_samples, nb_folds):
    """Return the shuffled K-fold splits for ``nb_samples`` samples as a list.

    NOTE(review): the ``n=`` / ``n_folds=`` keywords look like the legacy
    ``sklearn.cross_validation.KFold`` signature - confirm against the import.
    """
    folds = KFold(n=nb_samples, n_folds=nb_folds, shuffle=True)
    # list in the order of (train, test)
    return list(folds)

# functions to get statistics from a confusion matrix 
Source File:    From cherry with MIT License 5 votes vote down vote up
def get_score(self):
    """Score the classifier on each shuffled K-fold split of the data.

    Every fold's train/test partitions are handed to ``self._score``.
    """
    # TODO:  operator.itemgetter maybe is not the best solution
    print('Calculating score, depending on your datasets size, this may take several minutes to several hours.')
    splitter = KFold(n_splits=self.n_splits, shuffle=True)
    for train_index, test_index in splitter.split(self.y_data):
        # x_data is picked item by item with itemgetter; y_data is indexed
        # directly with the index arrays.
        select_train = operator.itemgetter(*train_index)
        x_train = select_train(self.x_data)
        select_test = operator.itemgetter(*test_index)
        x_test = select_test(self.x_data)
        y_train = self.y_data[train_index]
        y_test = self.y_data[test_index]
        self._score(self.vectorizer, self.clf, x_train, y_train, x_test, y_test, self.output)