Python sklearn.model_selection.KFold() Examples

The following are 30 code examples of sklearn.model_selection.KFold(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.
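For orientation before the project-specific examples, here is a minimal, self-contained sketch of the basic API (assuming scikit-learn 0.18 or later, where KFold lives in sklearn.model_selection):

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(10).reshape(5, 2)  # 5 samples, 2 features
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    # split() yields (train_index, test_index) pairs of integer arrays
    print("train:", train_index, "test:", test_index)
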
Example #1
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License (7 votes)
def test_empty_cv_iterator_error():
    # Use global X, y

    # create cv
    cv = KFold(n_splits=3).split(X)

    # exhaust the generator; fitting below should then raise the expected
    # ValueError
    list(cv)
    # cv is empty now

    train_size = 100
    ridge = RandomizedSearchCV(Ridge(), {'alpha': [1e-3, 1e-2, 1e-1]},
                               cv=cv, n_jobs=-1)

    # assert that this raises an error
    with pytest.raises(ValueError,
                       match='No fits were performed. '
                             'Was the CV iterator empty\\? '
                             'Were there no candidates\\?'):
        ridge.fit(X[:train_size], y[:train_size]) 
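The detail this test relies on: KFold.split() returns a one-shot generator, so iterating it once leaves nothing for the search to consume. A minimal sketch of that behavior (names here are illustrative):

import numpy as np
from sklearn.model_selection import KFold

X = np.ones((6, 1))
cv = KFold(n_splits=3).split(X)
assert len(list(cv)) == 3  # first pass yields all 3 splits
assert len(list(cv)) == 0  # the generator is now exhausted
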
Example #2
Source File: test_split.py    From nyaggle with MIT License (6 votes)
def test_skip():
    df = pd.DataFrame()
    df['id'] = np.arange(10)

    kf = split.Skip(2, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 3

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([4, 5]))

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([6, 7]))

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([8, 9]))

    with pytest.raises(StopIteration):
        next(folds) 
Example #3
Source File: test_encoders.py    From Kaggler with MIT License (6 votes)
def test_FrequencyEncoder(generate_data):
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    te = FrequencyEncoder()
    X_cat = te.fit_transform(df[cat_cols])
    print('Without CV:\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = FrequencyEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    te = FrequencyEncoder(cv=cv)
    te.fit(df[cat_cols])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols) 
Example #4
Source File: test_encoders.py    From Kaggler with MIT License (6 votes)
def test_TargetEncoder(generate_data):
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    te = TargetEncoder()
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = TargetEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    te = TargetEncoder(cv=cv)
    te.fit(df[cat_cols], df[TARGET_COL])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols) 
Example #5
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_learning_curve_with_boolean_indices():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    cv = KFold(n_splits=3)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))


Example #6
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_deprecated_grid_search_iid():
    depr_message = ("The default of the `iid` parameter will change from True "
                    "to False in version 0.22")
    X, y = make_blobs(n_samples=54, random_state=0, centers=2)
    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=3)
    # no warning with equally sized test sets
    assert_no_warnings(grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=5)
    # warning because 54 % 5 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=2)
    # warning because stratification into two classes and 27 % 2 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=KFold(2))
    # no warning because no stratification and 54 % 2 == 0
    assert_no_warnings(grid.fit, X, y) 
Example #7
Source File: cca.py    From ibllib with MIT License (6 votes)
def split_trials(trial_ids, n_splits=5, rng_seed=0):
    """
    Assign each trial to testing or training fold

    :param trial_ids:
    :type trial_ids: array-like
    :param n_splits: one split used for testing; remaining splits used for training
    :type n_splits: int
    :param rng_seed: set random state for shuffling trials
    :type rng_seed: int
    :return: list of dicts of indices with keys `train` and `test`
    """
    from sklearn.model_selection import KFold
    shuffle = rng_seed is not None
    kf = KFold(n_splits=n_splits, random_state=rng_seed, shuffle=shuffle)
    kf.get_n_splits(trial_ids)
    idxs = [None for _ in range(n_splits)]
    for i, t0 in enumerate(kf.split(trial_ids)):
        idxs[i] = {'train': t0[0], 'test': t0[1]}
    return idxs 
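A side note on the shuffle guard above: recent scikit-learn releases (roughly 0.24 onward) raise a ValueError when random_state is set while shuffle=False, which is why shuffle is derived from rng_seed. A quick illustration:

from sklearn.model_selection import KFold

KFold(n_splits=5, shuffle=True, random_state=0)  # fine: reproducible shuffled folds
# KFold(n_splits=5, shuffle=False, random_state=0)  # ValueError on recent scikit-learn
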
Example #8
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    kfold = KFold(5)
    scores_indices = cross_val_score(svm, X, y, cv=kfold)
    kfold = KFold(5)
    cv_masks = []
    for train, test in kfold.split(X, y):
        mask_train = np.zeros(len(y), dtype=bool)
        mask_test = np.zeros(len(y), dtype=bool)
        mask_train[train] = 1
        mask_test[test] = 1
        cv_masks.append((mask_train, mask_test))
    scores_masks = cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks) 
Example #9
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_cv_iterable_wrapper():
    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yield different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # numpy's assert_equal properly compares nested lists
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert not splits_are_equal, (
        "If the splits are randomized, "
        "successive calls to split should yield different results") 
Example #10
Source File: dataset.py    From heamy with MIT License (6 votes)
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
        """K-Folds cross validation iterator.

        Parameters
        ----------
        k : int, default 5
        stratify : bool, default False
        shuffle : bool, default True
        seed : int, default 33

        Yields
        -------
        X_train, y_train, X_test, y_test, train_index, test_index
        """
        if stratify:
            kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
        else:
            kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
            X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
            yield X_train, y_train, X_test, y_test, train_index, test_index 
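The branch above only decides which splitter is built; both expose the same split() contract, and plain KFold simply ignores the labels. A self-contained sketch of the equivalent scikit-learn calls:

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

X = np.ones((10, 2))
y = np.array([0] * 5 + [1] * 5)

for stratify in (False, True):
    if stratify:
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)
    else:
        kf = KFold(n_splits=5, shuffle=True, random_state=33)
    # Both splitters accept (X, y); plain KFold simply ignores y.
    assert sum(1 for _ in kf.split(X, y)) == 5
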
Example #11
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_shuffle_kfold():
    # Check the indices are shuffled properly
    kf = KFold(3)
    kf2 = KFold(3, shuffle=True, random_state=0)
    kf3 = KFold(3, shuffle=True, random_state=1)

    X = np.ones(300)

    all_folds = np.zeros(300)
    for (tr1, te1), (tr2, te2), (tr3, te3) in zip(
            kf.split(X), kf2.split(X), kf3.split(X)):
        for tr_a, tr_b in combinations((tr1, tr2, tr3), 2):
            # Assert that there is no complete overlap
            assert_not_equal(len(np.intersect1d(tr_a, tr_b)), len(tr1))

        # Set all test indices in successive iterations of kf2 to 1
        all_folds[te2] = 1

    # Check that all indices are returned in the different test folds
    assert_equal(sum(all_folds), 300) 
Example #12
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_kfold_no_shuffle():
    # Manually check that KFold preserves the data ordering on toy datasets
    X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]

    splits = KFold(2).split(X2[:-1])
    train, test = next(splits)
    assert_array_equal(test, [0, 1])
    assert_array_equal(train, [2, 3])

    train, test = next(splits)
    assert_array_equal(test, [2, 3])
    assert_array_equal(train, [0, 1])

    splits = KFold(2).split(X2)
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 2])
    assert_array_equal(train, [3, 4])

    train, test = next(splits)
    assert_array_equal(test, [3, 4])
    assert_array_equal(train, [0, 1, 2]) 
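The uneven folds in the second half of the test follow from KFold's documented sizing rule: the first n_samples % n_splits folds get one extra sample. A quick check:

import numpy as np
from sklearn.model_selection import KFold

sizes = [len(test) for _, test in KFold(n_splits=2).split(np.ones(5))]
assert sizes == [3, 2]  # first n_samples % n_splits folds get the extra sample
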
Example #13
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (6 votes)
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
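The ValueError branch comes from the stratified splitters: KFold never inspects y, while StratifiedKFold requires a binary or multiclass target. A minimal contrast:

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

X = np.ones((6, 2))
y_multilabel = np.array([[0, 1], [1, 0], [1, 1], [0, 0], [0, 1], [1, 0]])

list(KFold(n_splits=3).split(X, y_multilabel))  # fine: y is ignored
try:
    list(StratifiedKFold(n_splits=3).split(X, y_multilabel))
except ValueError as e:
    print(e)  # Supported target types are: ('binary', 'multiclass') ...
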
Example #14
Source File: test_split.py    From nyaggle with MIT License (6 votes)
def test_nth():
    df = pd.DataFrame()
    df['id'] = np.arange(10)

    kf = split.Nth(3, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 1

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([4, 5]))

    with pytest.raises(StopIteration):
        next(folds)

    kf = split.Nth(1, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 1

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([0, 1]))

    with pytest.raises(StopIteration):
        next(folds) 
Example #15
Source File: test_ridge.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def _test_ridge_cv(filter_):
    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)

    cv = KFold(5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64) 
Example #16
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_X_as_list():
    # Pass X as list in GridSearchCV
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = CheckingClassifier(check_X=lambda x: isinstance(x, list))
    cv = KFold(n_splits=3)
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
    grid_search.fit(X.tolist(), y).score(X, y)
    assert hasattr(grid_search, "cv_results_") 
Example #17
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_cross_val_predict_class_subset():

    X = np.arange(200).reshape(100, 2)
    y = np.array([x // 10 for x in range(100)])
    classes = 10

    kfold3 = KFold(n_splits=3)
    kfold4 = KFold(n_splits=4)

    le = LabelEncoder()

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        # Test with n_splits=3
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)

        # Runs a naive loop (should be same as cross_val_predict):
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Test with n_splits=4
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold4)
        expected_predictions = get_expected_predictions(X, y, kfold4, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions)

        # Testing unordered labels
        y = shuffle(np.repeat(range(10), 10), random_state=0)
        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold3)
        y = le.fit_transform(y)
        expected_predictions = get_expected_predictions(X, y, kfold3, classes,
                                                        est, method)
        assert_array_almost_equal(expected_predictions, predictions) 
Example #18
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = np.full(10, -1.)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps = PredefinedSplit(folds)
    # n_splits is simply the number of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    ps_train, ps_test = zip(*ps.split())
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test) 
Example #19
Source File: base.py    From deep_pipe with MIT License (5 votes)
def kfold_split(subj_ids, n_splits, groups=None, **kwargs):
    """Perform a shuffled k-fold (or group k-fold) split.
    Returns indices: [[train_0, val_0], ..., [train_k, val_k]]
    """
    kfold_class = KFold if groups is None else ShuffleGroupKFold
    kfold = kfold_class(n_splits, shuffle=True, **kwargs)
    return list(kfold.split(X=subj_ids, groups=groups)) 
Example #20
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_shuffle_kfold_stratifiedkfold_reproducibility():
    X = np.ones(15)  # Divisible by 3
    y = [0] * 7 + [1] * 8
    X2 = np.ones(16)  # Not divisible by 3
    y2 = [0] * 8 + [1] * 8

    # Check that when shuffle is True, multiple split calls produce the
    # same split when random_state is an int
    kf = KFold(3, shuffle=True, random_state=0)
    skf = StratifiedKFold(3, shuffle=True, random_state=0)

    for cv in (kf, skf):
        np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y)))
        np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2)))

    # Check that when shuffle is True, multiple split calls often
    # (though not always) produce different splits when random_state is a
    # RandomState instance or None
    kf = KFold(3, shuffle=True, random_state=np.random.RandomState(0))
    skf = StratifiedKFold(3, shuffle=True,
                          random_state=np.random.RandomState(0))

    for cv in (kf, skf):
        for data in zip((X, X2), (y, y2)):
            # Check that two successive split() calls yield different folds
            for (_, test_a), (_, test_b) in zip(cv.split(*data),
                                                cv.split(*data)):
                # cv.split(...) yields (train, test) tuples of index arrays.
                # Ensure that successive splits of the same data differ
                # when random_state is not a fixed int seed
                with pytest.raises(AssertionError):
                    np.testing.assert_array_equal(test_a, test_b) 
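In short, the behavior this test pins down: an int random_state freezes the shuffled folds across split() calls, while a RandomState instance advances its state so repeated calls usually differ. A compact sketch:

import numpy as np
from sklearn.model_selection import KFold

X = np.ones(15)

kf_int = KFold(3, shuffle=True, random_state=0)
a = [t.tolist() for _, t in kf_int.split(X)]
b = [t.tolist() for _, t in kf_int.split(X)]
assert a == b  # int seed: identical splits on every call

kf_rs = KFold(3, shuffle=True, random_state=np.random.RandomState(0))
c = [t.tolist() for _, t in kf_rs.split(X)]
d = [t.tolist() for _, t in kf_rs.split(X)]
print("identical across calls:", c == d)  # almost always False
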
Example #21
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def check_cross_val_predict_binary(est, X, y, method):
    """Helper for tests of cross_val_predict with binary classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    if y.ndim == 1:
        exp_shape = (len(X),) if method == 'decision_function' else (len(X), 2)
    else:
        exp_shape = y.shape
    expected_predictions = np.zeros(exp_shape)
    for train, test in cv.split(X, y):
        est = clone(est).fit(X[train], y[train])
        expected_predictions[test] = getattr(est, method)(X[test])

    # Check actual outputs for several representations of y
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions) 
Example #22
Source File: stacking.py    From xam with MIT License (5 votes)
def __init__(self, models, meta_model, cv=model_selection.KFold(n_splits=3),
                 metric=metrics.mean_squared_error, use_base_features=False):
        super().__init__(
            models=models,
            meta_model=meta_model,
            cv=cv,
            metric=metric,
            use_base_features=use_base_features,
            use_probas=False
        ) 
Example #23
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_stratifiedkfold_balance():
    # Check that StratifiedKFold returns folds with balanced sizes (only when
    # stratification is possible)
    # Repeat with shuffling turned off and on
    X = np.ones(17)
    y = [0] * 3 + [1] * 14

    for shuffle in (True, False):
        cv = StratifiedKFold(3, shuffle=shuffle)
        for i in range(11, 17):
            skf = cv.split(X[:i], y[:i])
            sizes = [len(test) for _, test in skf]

            assert (np.max(sizes) - np.min(sizes)) <= 1
            assert_equal(np.sum(sizes), i) 
Example #24
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License (5 votes)
def test_kfold_indices():
    # Check all indices are returned in the test folds
    X1 = np.ones(18)
    kf = KFold(3)
    check_cv_coverage(kf, X1, y=None, groups=None, expected_n_splits=3)

    # Check all indices are returned in the test folds even when equal-sized
    # folds are not possible
    X2 = np.ones(17)
    kf = KFold(3)
    check_cv_coverage(kf, X2, y=None, groups=None, expected_n_splits=3)

    # Check if get_n_splits returns the number of folds
    assert_equal(5, KFold(5).get_n_splits(X2)) 
Example #25
Source File: stacking.py    From xam with MIT License (5 votes)
def __init__(self, models, meta_model, cv=model_selection.KFold(n_splits=3),
                 metric=metrics.mean_squared_error, use_base_features=False,
                 fit_handlers={}):
        super().__init__(
            models=models,
            meta_model=meta_model,
            cv=cv,
            metric=metric,
            use_base_features=use_base_features,
            use_probas=False,
            fit_handlers=fit_handlers
        ) 
Example #26
Source File: test_run.py    From nyaggle with MIT License (5 votes)
def test_experiment_manual_cv_int(tmpdir_name):
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, None, tmpdir_name, cv=KFold(2))
    assert len(result.models) == 2
    assert len(result.metrics) == 2 + 1 
Example #27
Source File: test_run.py    From nyaggle with MIT License (5 votes)
def test_experiment_manual_cv_kfold(tmpdir_name):
    X, y = make_classification_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                                  class_sep=0.98, random_state=0, id_column='user_id')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, None, tmpdir_name, cv=KFold(4))
    assert len(result.models) == 4
    assert len(result.metrics) == 4 + 1 
Example #28
Source File: test_target_encoder.py    From nyaggle with MIT License (5 votes)
def _test_target_encoder(X_train, y_train, X_test, **kw):
    cv = KFold(n_splits=2, random_state=42, shuffle=True)

    te = TargetEncoder(cv.split(X_train), **kw)

    ret_train = te.fit_transform(X_train, y_train)
    ret_test = te.transform(X_test)

    ret_train2 = copy.deepcopy(X_train)
    ret_test2 = copy.deepcopy(X_test)

    for train_idx, test_idx in cv.split(X_train):
        te2 = ce.TargetEncoder(**kw)

        if isinstance(X_train, pd.DataFrame):
            te2.fit(X_train.loc[train_idx, :], y_train.loc[train_idx])
            ret_train2.loc[test_idx] = te2.transform(ret_train2.loc[test_idx])
        else:
            te2.fit(X_train[train_idx, :], y_train[train_idx])
            ret_train2[test_idx] = te2.transform(ret_train2[test_idx])

    ret_train2 = ret_train2.astype(float)

    if isinstance(ret_train, pd.DataFrame):
        assert_frame_equal(ret_train, ret_train2)
    else:
        npt.assert_array_equal(ret_train, ret_train2)

    te2 = ce.TargetEncoder(**kw)
    te2.fit(X_train, y_train)

    ret_test2 = te2.transform(ret_test2)

    if isinstance(ret_train, pd.DataFrame):
        assert_frame_equal(ret_test, ret_test2)
    else:
        npt.assert_array_equal(ret_test, ret_test2) 
Example #29
Source File: bio_utils.py    From models with MIT License (5 votes)
def split_dataset(nb_samples, nb_folds):
    # Build the splitter, then call split() on an index range; this yields a
    # (train, test) index pair for each fold.
    kf = KFold(n_splits=nb_folds, shuffle=True)
    indices = list(kf.split(range(nb_samples)))

    return indices

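Note the API difference with the legacy sklearn.cross_validation.KFold (removed in scikit-learn 0.20), which was constructed with the sample count and iterated directly; the sklearn.model_selection version builds the splitter first and calls split() on the data. Side by side:

# Legacy API (sklearn.cross_validation, removed in scikit-learn 0.20):
#   folds = KFold(n=100, n_folds=5, shuffle=True)   # iterable of (train, test)
# Current API (sklearn.model_selection):
from sklearn.model_selection import KFold
folds = list(KFold(n_splits=5, shuffle=True).split(range(100)))
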
Example #30
Source File: performancer.py    From cherry with MIT License (5 votes)
def get_score(self):
        # TODO: operator.itemgetter may not be the best solution
        print("Calculating score; depending on your dataset's size, this may take several minutes to several hours.")
        for train_index, test_index in KFold(
                n_splits=self.n_splits, shuffle=True).split(self.y_data):
            x_train = operator.itemgetter(*train_index)(self.x_data)
            x_test = operator.itemgetter(*test_index)(self.x_data)
            y_train, y_test = self.y_data[train_index], self.y_data[test_index]
            self._score(self.vectorizer, self.clf, x_train, y_train, x_test, y_test, self.output)
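Splitting on self.y_data works because KFold only uses the length of its argument and never inspects the values; splitting y or an index range of the same length yields identical folds. For example:

import numpy as np
from sklearn.model_selection import KFold

y = np.array([0, 1, 0, 1, 0, 1])
# KFold derives everything from len(y); the values are never inspected.
idx_from_y = list(KFold(n_splits=3).split(y))
idx_from_range = list(KFold(n_splits=3).split(range(len(y))))
for (tr_a, te_a), (tr_b, te_b) in zip(idx_from_y, idx_from_range):
    assert np.array_equal(tr_a, tr_b) and np.array_equal(te_a, te_b)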