Python Examples of sklearn.model_selection.ShuffleSplit

Source File: TargetingSystem.py From poeai with MIT License

7 votes

def Train(self, C, A, Y, SF):
        '''
        Train the classifier using the sample matrix A and target matrix Y
        '''
        C.fit(A, Y)
        YH = np.zeros(Y.shape, dtype = np.object)
        for i in np.array_split(np.arange(A.shape[0]), 32):   #Split up verification into chunks to prevent out of memory
            YH[i] = C.predict(A[i])
        s1 = SF(Y, YH)
        print('All:{:8.6f}'.format(s1))
        '''
        ss = ShuffleSplit(random_state = 1151)  #Use fixed state for so training can be repeated later
        trn, tst = next(ss.split(A, Y))         #Make train/test split
        mi = [8] * 1                            #Maximum number of iterations at each iter
        YH = np.zeros((A.shape[0]), dtype = np.object)
        for mic in mi:                                      #Chunk size to split dataset for CV results
            #C.SetMaxIter(mic)                               #Set the maximum number of iterations to run
            #C.fit(A[trn], Y[trn])                           #Perform training iterations
        '''

Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)

Source File: expedia_dataset_reader.py From cs-ranking with Apache License 2.0

6 votes

def get_single_train_test_split(self):
        splits = dict()
        cv_iter = ShuffleSplit(
            n_splits=1, random_state=self.random_state, test_size=0.80
        )
        for n_obj, arr in self.X_dict.items():
            if arr.shape[0] == 1:
                splits[n_obj] = ([0], [0])
            else:
                splits[n_obj] = list(cv_iter.split(arr))[0]
        self.X_train = dict()
        self.Y_train = dict()
        self.X_test = dict()
        self.Y_test = dict()
        for n_obj, itr in splits.items():
            train_idx, test_idx = itr
            self.X_train[n_obj] = np.copy(self.X_dict[n_obj][train_idx])
            self.X_test[n_obj] = np.copy(self.X_dict[n_obj][test_idx])
            self.Y_train[n_obj] = np.copy(self.Y_dict[n_obj][train_idx])
            self.Y_test[n_obj] = np.copy(self.Y_dict[n_obj][test_idx])
        self.X, self.Y = self.sub_sampling_from_dictionary()
        self.__check_dataset_validity__()
        self.X, self.X_test = standardize_features(self.X, self.X_test)
        return self.X, self.Y, self.X_test, self.Y_test

Source File: image_dataset.py From self-ensemble-visual-domain-adapt-photo with MIT License

6 votes

def subset_indices(d_source, d_target, subsetsize, subsetseed):
    if subsetsize > 0:
        if subsetseed != 0:
            subset_rng = np.random.RandomState(subsetseed)
        else:
            subset_rng = np.random
        strat = StratifiedShuffleSplit(n_splits=1, test_size=subsetsize, random_state=subset_rng)
        shuf = ShuffleSplit(n_splits=1, test_size=subsetsize, random_state=subset_rng)
        _, source_indices = next(strat.split(d_source.y, d_source.y))
        n_src = source_indices.shape[0]
        if d_target.has_ground_truth:
            _, target_indices = next(strat.split(d_target.y, d_target.y))
        else:
            _, target_indices = next(shuf.split(np.arange(len(d_target.images))))
        n_tgt = target_indices.shape[0]
    else:
        source_indices = None
        target_indices = None
        n_src = len(d_source.images)
        n_tgt = len(d_target.images)

    return source_indices, target_indices, n_src, n_tgt

Source File: stockpredictor.py From SyBrain with GNU General Public License v3.0

6 votes

def TestPerformance(self, df=None):
        # If no dataframe is provided, use the currently learned one
        if (df is None):
            D = self.D
        else:
            D = self.S.transform(df.copy())
        # Get features from the data frame
        A = self._ExtractFeat(D)
        # Get the target values and their corresponding column names
        y, _ = self._ExtractTarg(D)
        # Begin cross validation
        ss = ShuffleSplit(n_splits=1)
        for trn, tst in ss.split(A):
            s1 = cross_val_score(self.R, A, y, cv=3, scoring=make_scorer(r2_score))
            s2 = cross_val_score(self.R, A[tst], y[tst], cv=3, scoring=make_scorer(r2_score))
            s3 = cross_val_score(self.R, A[trn], y[trn], cv=3, scoring=make_scorer(r2_score))
            print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))

Source File: test_multiclass.py From twitter-stock-recommendation with MIT License

6 votes

def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)

Source File: test_split.py From twitter-stock-recommendation with MIT License

6 votes

def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)

Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)

Source File: active.py From chemml with BSD 3-Clause "New" or "Revised" License

6 votes

def _update_train_test(self):
        """
        This function take care of the test_type parameter.

        """
        if self.test_type == 'passive':
            return True
        if len(self._queries) > 0:
            return True
        else:
            # active test split
            all_indices = np.concatenate([self.train_indices, self.test_indices], axis=0)
            all_y = np.concatenate([self._Y_train, self._Y_test], axis=0)
            # select randomly
            ss = ShuffleSplit(n_splits=1, test_size=self.test_size, train_size=None, random_state=90)
            for train_indices, test_indices in ss.split(all_indices):
                # test
                self._Y_test = all_y[test_indices]
                self.test_indices = all_indices[test_indices]
                # train
                self._Y_train = all_y[train_indices]
                self.train_indices = all_indices[train_indices]

Source File: split.py From gumpy with MIT License

5 votes

def  stratified_shuffle_Split(features, labels, n_splits,test_size,random_state):

    """Stratified ShuffleSplit cross-validator
    """
    cv = StratifiedShuffleSplit(n_splits, test_size, random_state=random_state)
    for train_index, test_index in cv.split(features,labels):
        X_train = features[train_index]
        X_test = features[test_index]
        Y_train = labels[train_index]
        Y_test = labels[test_index]
    return X_train, X_test, Y_train, Y_test


#Random permutation cross-validator

Source File: classifier_basetest.py From scikit-multilearn with BSD 2-Clause "Simplified" License

5 votes

def assertClassifierWorksWithCV(self, classifier):
        # all the nice stuff is tested here - whether the classifier is
        # clonable, etc.
        for X, y in self.get_multilabel_data_for_tests('dense'):
            n_iterations = 3
            cv = model_selection.ShuffleSplit(n_splits=n_iterations, test_size=0.5, random_state=0)

            scores = model_selection.cross_val_score(
                classifier, X, y=y, cv=cv, scoring='accuracy')

            self.assertEqual(len(scores), n_iterations)

Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

5 votes

def test_objectmapper(self):
        df = pdml.ModelFrame([])

        # Splitter Classes
        self.assertIs(df.model_selection.KFold, ms.KFold)
        self.assertIs(df.model_selection.GroupKFold, ms.GroupKFold)
        self.assertIs(df.model_selection.StratifiedKFold, ms.StratifiedKFold)

        self.assertIs(df.model_selection.LeaveOneGroupOut, ms.LeaveOneGroupOut)
        self.assertIs(df.model_selection.LeavePGroupsOut, ms.LeavePGroupsOut)
        self.assertIs(df.model_selection.LeaveOneOut, ms.LeaveOneOut)
        self.assertIs(df.model_selection.LeavePOut, ms.LeavePOut)

        self.assertIs(df.model_selection.ShuffleSplit, ms.ShuffleSplit)
        self.assertIs(df.model_selection.GroupShuffleSplit,
                      ms.GroupShuffleSplit)
        # self.assertIs(df.model_selection.StratifiedShuffleSplit,
        #               ms.StratifiedShuffleSplit)
        self.assertIs(df.model_selection.PredefinedSplit, ms.PredefinedSplit)
        self.assertIs(df.model_selection.TimeSeriesSplit, ms.TimeSeriesSplit)

        # Splitter Functions

        # Hyper-parameter optimizers
        self.assertIs(df.model_selection.GridSearchCV, ms.GridSearchCV)
        self.assertIs(df.model_selection.RandomizedSearchCV, ms.RandomizedSearchCV)
        self.assertIs(df.model_selection.ParameterGrid, ms.ParameterGrid)
        self.assertIs(df.model_selection.ParameterSampler, ms.ParameterSampler)

        # Model validation

Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

5 votes

def test_objectmapper_abbr(self):
        df = pdml.ModelFrame([])

        # Splitter Classes
        self.assertIs(df.ms.KFold, ms.KFold)
        self.assertIs(df.ms.GroupKFold, ms.GroupKFold)
        self.assertIs(df.ms.StratifiedKFold, ms.StratifiedKFold)

        self.assertIs(df.ms.LeaveOneGroupOut, ms.LeaveOneGroupOut)
        self.assertIs(df.ms.LeavePGroupsOut, ms.LeavePGroupsOut)
        self.assertIs(df.ms.LeaveOneOut, ms.LeaveOneOut)
        self.assertIs(df.ms.LeavePOut, ms.LeavePOut)

        self.assertIs(df.ms.ShuffleSplit, ms.ShuffleSplit)
        self.assertIs(df.ms.GroupShuffleSplit,
                      ms.GroupShuffleSplit)
        # self.assertIs(df.ms.StratifiedShuffleSplit,
        #               ms.StratifiedShuffleSplit)
        self.assertIs(df.ms.PredefinedSplit, ms.PredefinedSplit)
        self.assertIs(df.ms.TimeSeriesSplit, ms.TimeSeriesSplit)

        # Splitter Functions

        # Hyper-parameter optimizers
        self.assertIs(df.ms.GridSearchCV, ms.GridSearchCV)
        self.assertIs(df.ms.RandomizedSearchCV, ms.RandomizedSearchCV)
        self.assertIs(df.ms.ParameterGrid, ms.ParameterGrid)
        self.assertIs(df.ms.ParameterSampler, ms.ParameterSampler)

        # Model validation

Source File: split.py From gumpy with MIT License

5 votes

def  shuffle_Split(features, labels, n_splits,test_size,random_state):

    """ShuffleSplit: Random permutation cross-validator
    """
    cv = ShuffleSplit(n_splits, test_size, random_state=random_state)
    for train_index, test_index in cv.split(features):
        X_train = features[train_index]
        X_test = features[test_index]
        Y_train = labels[train_index]
        Y_test = labels[test_index]
    return X_train, X_test, Y_train, Y_test

Source File: DeepOCR.py From pythonml with MIT License

5 votes

def FitModel(cnnc, A, Y, T, FN):
    print('Fitting model...')
    ss = ShuffleSplit(n_splits = 1)
    trn, tst = next(ss.split(A))
    #Fit the network
    cnnc.fit(A[trn], Y[trn])
    #The predictions as sequences of character indices
    YH = []
    for i in np.array_split(np.arange(A.shape[0]), 32): 
        YH.append(cnnc.predict(A[i]))
    YH = np.vstack(YH)
    #Convert from sequence of char indices to strings
    PS = np.array([''.join(YHi) for YHi in YH])
    #Compute the accuracy
    S1 = SAcc(PS[trn], T[trn])
    S2 = SAcc(PS[tst], T[tst])
    print('Train: ' + str(S1))
    print('Test: ' + str(S2))
    for PSi, Ti, FNi in zip(PS, T, FN):
        if np.random.rand() > 0.99: #Randomly select rows to print
            print(FNi + ': ' + Ti + ' -> ' + PSi)
    print('Fitting with CV data...')
    #Fit remainder
    cnnc.SetMaxIter(4)
    cnnc.fit(A, Y)
    return cnnc

Source File: predict.py From NetMF with MIT License

5 votes

def predict_cv(X, y, train_ratio=0.2, n_splits=10, random_state=0, C=1.):
    micro, macro = [], []
    shuffle = ShuffleSplit(n_splits=n_splits, test_size=1-train_ratio,
            random_state=random_state)
    for train_index, test_index in shuffle.split(X):
        print(train_index.shape, test_index.shape)
        assert len(set(train_index) & set(test_index)) == 0
        assert len(train_index) + len(test_index) == X.shape[0]
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = OneVsRestClassifier(
                LogisticRegression(
                    C=C,
                    solver="liblinear",
                    multi_class="ovr"),
                n_jobs=-1)
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        y_pred = construct_indicator(y_score, y_test)
        mi = f1_score(y_test, y_pred, average="micro")
        ma = f1_score(y_test, y_pred, average="macro")
        logger.info("micro f1 %f macro f1 %f", mi, ma)
        micro.append(mi)
        macro.append(ma)
    logger.info("%d fold validation, training ratio %f", len(micro), train_ratio)
    logger.info("Average micro %.2f, Average macro %.2f",
            np.mean(micro) * 100,
            np.mean(macro) * 100)

Source File: acp.py From nonconformist with MIT License

5 votes

def gen_samples(self, y, n_samples, problem_type):
		if problem_type == 'classification':
			splits = StratifiedShuffleSplit(
					n_splits=n_samples,
					test_size=self.cal_portion
				)

			split_ = splits.split(np.zeros((y.size, 1)), y)
		
		else:
			splits = ShuffleSplit(
				n_splits=n_samples,
				test_size=self.cal_portion
			)

			split_ = splits.split(np.zeros((y.size, 1)))

		for train, cal in split_:
			yield train, cal


# -----------------------------------------------------------------------------
# Conformal ensemble
# -----------------------------------------------------------------------------

Source File: Stacking.py From Kaggle-Competition-Sberbank with MIT License

5 votes

def fit_predict(self, trainDf, testDf):
        X = trainDf.drop(['price_doc', 'w'], 1).values
        y = trainDf['price_doc'].values
        w = trainDf['w'].values
        T = testDf.values

        X_fillna = trainDf.drop(['price_doc', 'w'], 1).fillna(-999).values
        T_fillna = testDf.fillna(-999).values

        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True))
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):
            print('Training base model ' + str(i+1) + '...')
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                print('Training round ' + str(j+1) + '...')
                if clf not in [xgb1,lgb1]: # sklearn models cannot handle missing values.
                    X = X_fillna
                    T = T_fillna
                X_train = X[train_idx]
                y_train = y[train_idx]
                w_train = w[train_idx]
                X_holdout = X[test_idx]
                # w_holdout = w[test_idx]
                # y_holdout = y[test_idx]
                clf.fit(X_train, y_train, w_train)
                y_pred = clf.predict(X_holdout)
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)
            S_test[:, i] = S_test_i.mean(1)
        self.S_train, self.S_test, self.y = S_train, S_test, y  # for diagnosis purpose
        self.corr = pd.concat([pd.DataFrame(S_train),trainDf['price_doc']],1).corr() # correlation of predictions by different models.
        # cv_stack = ShuffleSplit(n_splits=6, test_size=0.2)
        # score_stacking = cross_val_score(self.stacker, S_train, y, cv=cv_stack, n_jobs=1, scoring='neg_mean_squared_error')
        # print(np.sqrt(-score_stacking.mean())) # CV result of stacking
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)
        return y_pred

Source File: sushi_discrete_choice_dataset_reader.py From cs-ranking with Apache License 2.0

5 votes

def get_single_train_test_split(self):
        cv_iter = ShuffleSplit(
            n_splits=1, random_state=self.random_state, test_size=0.20
        )
        splits = list(cv_iter.split(self.X))
        return list(self.splitter(splits))[0]

Source File: survey_dataset_reader.py From cs-ranking with Apache License 2.0

5 votes

def get_single_train_test_split(self):
        cv_iter = ShuffleSplit(
            n_splits=1, test_size=0.3, random_state=self.random_state
        )
        (train_idx, test_idx) = list(cv_iter.split(self.X))[0]
        return self.X[train_idx], self.Y[train_idx], self.X[test_idx], self.Y[test_idx]

Source File: intelligent_system_group_dataset_reader.py From cs-ranking with Apache License 2.0

5 votes

def get_single_train_test_split(self, name="cold"):
        cv_iter = ShuffleSplit(
            n_splits=1, test_size=0.3, random_state=self.random_state
        )
        (train_idx, test_idx) = list(cv_iter.split(self.X[name]))[0]
        return (
            self.X[name][train_idx],
            self.Y[name][train_idx],
            self.X[name][test_idx],
            self.Y[name][test_idx],
        )

Source File: sushi_dyad_ranking_dataset_reader.py From cs-ranking with Apache License 2.0

5 votes

def get_single_train_test_split(self):
        cv_iter = ShuffleSplit(
            n_splits=1, random_state=self.random_state, test_size=0.30
        )
        splits = list(cv_iter.split(self.X))
        return list(self.splitter(splits))[0]

Source File: sushi_object_ranking_dataset_reader.py From cs-ranking with Apache License 2.0

5 votes

def get_single_train_test_split(self):
        cv_iter = ShuffleSplit(
            n_splits=1, random_state=self.random_state, test_size=0.30
        )
        splits = list(cv_iter.split(self.X))
        return list(self.splitter(splits))[0]

Source File: test_split.py From twitter-stock-recommendation with MIT License

5 votes

def test_shuffle_split():
    ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X)
    ss2 = ShuffleSplit(test_size=2, random_state=0).split(X)
    ss3 = ShuffleSplit(test_size=np.int32(2), random_state=0).split(X)
    for typ in six.integer_types:
        ss4 = ShuffleSplit(test_size=typ(2), random_state=0).split(X)
    for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
        assert_array_equal(t1[0], t2[0])
        assert_array_equal(t2[0], t3[0])
        assert_array_equal(t3[0], t4[0])
        assert_array_equal(t1[1], t2[1])
        assert_array_equal(t2[1], t3[1])
        assert_array_equal(t3[1], t4[1])

Source File: test_split.py From twitter-stock-recommendation with MIT License

5 votes

def test_shufflesplit_errors():
    # When the {test|train}_size is a float/invalid, error is raised at init
    assert_raises(ValueError, ShuffleSplit, test_size=None, train_size=None)
    assert_raises(ValueError, ShuffleSplit, test_size=2.0)
    assert_raises(ValueError, ShuffleSplit, test_size=1.0)
    assert_raises(ValueError, ShuffleSplit, test_size=0.1, train_size=0.95)
    assert_raises(ValueError, ShuffleSplit, train_size=1j)

    # When the {test|train}_size is an int, validation is based on the input X
    # and happens at split(...)
    assert_raises(ValueError, next, ShuffleSplit(test_size=11).split(X))
    assert_raises(ValueError, next, ShuffleSplit(test_size=10).split(X))
    assert_raises(ValueError, next, ShuffleSplit(test_size=8,
                                                 train_size=3).split(X))

Source File: test_split.py From twitter-stock-recommendation with MIT License

5 votes

def test_train_test_default_warning():
    assert_warns(FutureWarning, ShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, GroupShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, StratifiedShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, train_test_split, range(3),
                 train_size=0.75)

Source File: estimator_utils.py From EDeN with MIT License

5 votes

def estimate_predictive_performance(x_y,
                                    estimator=None,
                                    n_splits=10,
                                    random_state=1):
    """estimate_predictive_performance."""
    x, y = x_y
    cv = ShuffleSplit(n_splits=n_splits,
                      test_size=0.3,
                      random_state=random_state)
    scoring = make_scorer(average_precision_score)
    scores = cross_val_score(estimator, x, y, cv=cv, scoring=scoring)
    return scores

Source File: base.py From deep_pipe with MIT License

5 votes

def train_test_split_groups(X, *, val_size, groups=None, **kwargs):
    split_class = (ShuffleSplit if groups is None else GroupShuffleSplit)
    split = split_class(test_size=val_size, **kwargs)
    train, val = next(split.split(X=X, groups=groups))
    return X[train], X[val]

Source File: core.py From HungaBunga with MIT License

5 votes

def cv_reg(x, test_size = 0.2, n_splits = 5, random_state=None): return ss(n_splits, test_size, random_state=random_state).split(x)

Source File: test_model_selection.py From verde with BSD 3-Clause "New" or "Revised" License

5 votes

def test_cross_val_score_client(trend):
    "Test the deprecated dask Client interface"
    coords, data = trend[:2]
    model = Trend(degree=1)
    nsplits = 5
    cross_validator = ShuffleSplit(n_splits=nsplits, random_state=0)
    client = Client(processes=False)
    futures = cross_val_score(model, coords, data, cv=cross_validator, client=client)
    scores = [future.result() for future in futures]
    client.close()
    assert len(scores) == nsplits
    npt.assert_allclose(scores, 1)

Python sklearn.model_selection.ShuffleSplit() Examples