Python sklearn.model_selection.StratifiedKFold() Examples
The following are 30 code examples of sklearn.model_selection.StratifiedKFold(), collected from open-source projects; the source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the module sklearn.model_selection.
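Before the project examples, a minimal, self-contained sketch of the basic pattern may help: construct the splitter, then iterate over the (train, test) index pairs yielded by split(X, y). The toy X and y arrays here are invented purely for illustration.

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
y = np.array([0] * 6 + [1] * 4)   # imbalanced binary labels (60/40)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    # each test fold preserves the 60/40 class ratio of y
    print("fold %d: train=%s test=%s" % (fold, train_index, test_index))

Note that only y drives the stratification; X is passed through untouched, which is why several of the examples below hand split() a dummy array such as np.zeros(len(labels)).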
Example #1
Source File: train_eval.py From pytorch_geometric with MIT License
def k_fold(dataset, folds):
    skf = StratifiedKFold(folds, shuffle=True, random_state=12345)

    test_indices, train_indices = [], []
    for _, idx in skf.split(torch.zeros(len(dataset)), dataset.data.y):
        test_indices.append(torch.from_numpy(idx).to(torch.long))

    # reuse the previous test fold as the validation fold
    val_indices = [test_indices[i - 1] for i in range(folds)]

    for i in range(folds):
        train_mask = torch.ones(len(dataset), dtype=torch.bool)
        train_mask[test_indices[i]] = 0
        train_mask[val_indices[i]] = 0
        train_indices.append(train_mask.nonzero().view(-1))

    return train_indices, test_indices, val_indices
Example #2
Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2),
                 GroupKFold(), GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example #3
Source File: keras_models.py From gentun with Apache License 2.0
def cross_validate(self):
    """Train model using k-fold cross validation and
    return mean value of the validation accuracy.
    """
    acc = .0
    kfold = StratifiedKFold(n_splits=self.kfold, shuffle=True)
    for fold, (train, validation) in enumerate(kfold.split(self.x_train, np.where(self.y_train == 1)[1])):
        print("KFold {}/{}".format(fold + 1, self.kfold))
        self.reset_weights()
        for epochs, learning_rate in zip(self.epochs, self.learning_rate):
            print("Training {} epochs with learning rate {}".format(epochs, learning_rate))
            self.model.compile(optimizer=Adam(lr=learning_rate),
                               loss='binary_crossentropy', metrics=['accuracy'])
            self.model.fit(
                self.x_train[train], self.y_train[train],
                epochs=epochs, batch_size=self.batch_size, verbose=1
            )
        acc += self.model.evaluate(self.x_train[validation], self.y_train[validation], verbose=0)[1] / self.kfold
    return acc
Example #4
Source File: stratifiedKfold.py From RecommenderSystems with MIT License
def main():
    train_x, train_y = _load_data()
    print('loading data done!')

    folds = list(StratifiedKFold(n_splits=10, shuffle=True,
                                 random_state=config.RANDOM_SEED).split(train_x, train_y))

    fold_index = []
    for i, (train_id, valid_id) in enumerate(folds):
        fold_index.append(valid_id)

    print("fold num: %d" % (len(fold_index)))
    fold_index = np.array(fold_index)
    np.save(config.DATA_PATH + "fold_index.npy", fold_index)

    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH + "fold_index.npy")
    save_i(fold_index)
    print("save index done!")
Example #5
Source File: dataloader.py From dgl with Apache License 2.0
def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
    ''' 10-fold split '''
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."

    skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
    idx_list = []
    for idx in skf.split(np.zeros(len(labels)), labels):  # split(x, y)
        idx_list.append(idx)
    train_idx, valid_idx = idx_list[fold_idx]

    print("train_set : test_set = %d : %d" % (len(train_idx), len(valid_idx)))

    return train_idx, valid_idx
Example #6
Source File: test.py From rasa_nlu with Apache License 2.0
def generate_folds(n, td):
    """Generates n cross validation folds for training data td."""
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n, shuffle=True)
    x = td.intent_examples
    y = [example.get("intent") for example in x]
    for i_fold, (train_index, test_index) in enumerate(skf.split(x, y)):
        logger.debug("Fold: {}".format(i_fold))
        train = [x[i] for i in train_index]
        test = [x[i] for i in test_index]
        yield (TrainingData(training_examples=train,
                            entity_synonyms=td.entity_synonyms,
                            regex_features=td.regex_features),
               TrainingData(training_examples=test,
                            entity_synonyms=td.entity_synonyms,
                            regex_features=td.regex_features))
Example #7
Source File: dataset.py From heamy with MIT License
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross validation iterator.

    Parameters
    ----------
    k : int, default 5
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 33

    Yields
    -------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    if stratify:
        kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
    else:
        kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
Example #8
Source File: stratifiedKfold.py From AutoInt with MIT License
def main():
    train_x, train_y = _load_data()
    print('loading data done!')

    folds = list(StratifiedKFold(n_splits=10, shuffle=True,
                                 random_state=config.RANDOM_SEED).split(train_x, train_y))

    fold_index = []
    for i, (train_id, valid_id) in enumerate(folds):
        fold_index.append(valid_id)

    print("fold num: %d" % (len(fold_index)))
    fold_index = np.array(fold_index)
    np.save(config.DATA_PATH + "fold_index.npy", fold_index)

    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH + "fold_index.npy")
    save_i(fold_index)
    print("save index done!")
Example #9
Source File: dataloader.py From dgl with Apache License 2.0
def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
    ''' 10-fold split '''
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."

    skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
    idx_list = []
    for idx in skf.split(np.zeros(len(labels)), labels):  # split(x, y)
        idx_list.append(idx)
    train_idx, valid_idx = idx_list[fold_idx]

    print("train_set : test_set = %d : %d" % (len(train_idx), len(valid_idx)))

    return train_idx, valid_idx
Example #10
Source File: tests.py From scikit-mdr with MIT License
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
                         [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                         [0, 0], [0, 0], [0, 0], [1, 1], [1, 1]])
    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes,
                                cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0.
Example #11
Source File: 2_AssignCVFolds.py From kaggle-rsna18 with MIT License
def assign_folds(orig_df, num_folds, val_frac=0.10, seed=88):
    # Stratified splits
    np.random.seed(seed)
    df = orig_df.copy()
    df["fold"] = None
    skf = StratifiedKFold(n_splits=num_folds, random_state=0, shuffle=True)
    fold_counter = 0
    for train_index, test_index in skf.split(df.patientId, df.combined_cat):
        df["fold"].iloc[test_index] = fold_counter
        fold_counter += 1
    # for each_fold in np.unique(df.fold):
    #     train_df = df[df.fold != each_fold]
    #     val_counter = 0
    #     train_df["val{}".format(each_fold)] = None
    #     for train_index, test_index in skf.split(train_df.patientId, train_df.combined_cat):
    #         train_df["val{}".format(each_fold)].iloc[test_index] = val_counter
    #         val_counter += 1
    #     df = df.merge(train_df[["patientId", "val{}".format(each_fold)]], on="patientId", how="left")
    return df

##########
# SCRIPT #
##########
Example #12
Source File: filter_datasets.py From pysaliency with MIT License
def _get_stratified_crossval_split(stimuli, fixations, split_count, included_splits,
                                   random=True, stratified_attributes=None):
    from sklearn.model_selection import StratifiedKFold
    labels = []
    for attribute_name in stratified_attributes:
        attribute_data = np.array(stimuli.attributes[attribute_name])
        if attribute_data.ndim == 1:
            attribute_data = attribute_data[:, np.newaxis]
        labels.append(attribute_data)
    labels = np.vstack(labels)

    X = np.ones((len(stimuli), 1))
    rst = np.random.RandomState(42)
    inds = []
    k_fold = StratifiedKFold(n_splits=split_count, shuffle=random, random_state=rst)
    for i, (train_index, test_index) in enumerate(k_fold.split(X, labels)):
        if i in included_splits:
            inds.extend(test_index)

    stimuli, fixations = create_subset(stimuli, fixations, inds)
    return stimuli, fixations
Example #13
Source File: mvpa_voxelselector.py From brainiak with Apache License 2.0
def _sfn(data, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.

    The classifier is in `bcast_var[2]`. The labels are in `bcast_var[0]`.
    The number of cross-validation folds is in `bcast_var[1]`.
    """
    clf = bcast_var[2]
    masked_data = data[0][mask, :].T
    # print(l[0].shape, mask.shape, data.shape)
    skf = model_selection.StratifiedKFold(n_splits=bcast_var[1],
                                          shuffle=False)
    accuracy = np.mean(model_selection.cross_val_score(clf, masked_data,
                                                       y=bcast_var[0],
                                                       cv=skf,
                                                       n_jobs=1))
    return accuracy
Example #14
Source File: inbreast.py From deep-mil-for-whole-mammogram-classification with MIT License
def cvsplit(fold, totalfold, mydict):
    '''get the split of train and test
    fold is the index of the returned fold, from 0 to totalfold-1
    totalfold is the number of folds used for the cross validation
    mydict is the dict returned by readlabel'''
    skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is False, okay!
    #readdicom(mydict)
    y = list(mydict.values())  # materialize dict views so they can be indexed (Python 3)
    x = list(mydict.keys())
    count = 0
    for train, test in skf.split(x, y):
        print(len(train), len(test))
        if count == fold:
            #print test
            return train, test
        count += 1
Example #15
Source File: classification.py From brainiak with Apache License 2.0
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects,
                                                      num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels,
                                             cv=skf)
    print(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' %
        np.mean(scores)
    )
Example #16
Source File: eval_train_test.py From fanci with GNU General Public License v3.0
def kfold_cv(clf_type, data_sets: [DataSet], fold_count=5, repetitions=5,
             n_jobs=-1, parallel_verbose=1, persist=True):
    """
    Do a kfold cross validation with a SVM classifier.
    :param data_sets: list of data sets
    :param fold_count: count of folds to be made and hence also runs
    :return: a Statistics object
    """
    log.info('Starting {!s}-fold cv. Set count: {!s}'.format(fold_count, len(data_sets)))
    parallel = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)
    skf = StratifiedKFold(n_splits=fold_count, shuffle=True)

    stats_list = parallel(
        delayed(_fit_and_score)(clf, domains, labels, train_index, test_index,
                                i, data_set_id, fold_count)
        for domains, labels, data_set_id, clf in _data_sets_generator(data_sets, clf_type)
        for i in range(repetitions)
        for train_index, test_index in skf.split(domains, labels)
    )

    where = settings.EVAL_FOLDER + '/' + '{!s}fold_cv_{!s}_{!s}rep_{!s}sets_{!s}.pkl'.format(
        fold_count, clf_type, repetitions, len(data_sets), settings.NOW_STR)
    return _serialize_cv_results(stats_list, persist, where)
Example #17
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(),
                 TimeSeriesSplit(), PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e)
Example #18
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves class ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))

    for shuffle in (False, True):
        for train, test in StratifiedKFold(5, shuffle=shuffle).split(X, y):
            assert_almost_equal(np.sum(y[train] == 4) / len(train), 0.10, 2)
            assert_almost_equal(np.sum(y[train] == 0) / len(train), 0.89, 2)
            assert_almost_equal(np.sum(y[train] == 1) / len(train), 0.01, 2)
            assert_almost_equal(np.sum(y[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(y[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(y[test] == 1) / len(test), 0.01, 2)
Example #19
Source File: clf_helpers.py From ibeis with Apache License 2.0
def setup(pblm):
    import sklearn.datasets
    iris = sklearn.datasets.load_iris()

    pblm.primary_task_key = 'iris'
    pblm.default_data_key = 'learn(all)'
    pblm.default_clf_key = 'RF'

    X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    samples = MultiTaskSamples(X_df.index)
    samples.apply_indicators({'iris': {name: iris.target == idx
                                       for idx, name in enumerate(iris.target_names)}})
    samples.X_dict = {'learn(all)': X_df}
    pblm.samples = samples
    pblm.xval_kw['type'] = 'StratifiedKFold'
Example #20
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    for (_, test0), (_, test1) in zip(kf0.split(X_40, y),
                                      kf1.split(X_40, y)):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5)

    # Ensure that we shuffle each class's samples with different
    # random_state in StratifiedKFold
    # See https://github.com/scikit-learn/scikit-learn/pull/13124
    X = np.arange(10)
    y = [0] * 5 + [1] * 5
    kf1 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf2 = StratifiedKFold(5, shuffle=True, random_state=1)
    test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)])
    test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)])
    assert test_set1 != test_set2
Example #21
Source File: clf_helpers.py From ibeis with Apache License 2.0
def stratified_kfold_indices(samples, **xval_kw):
    """
    TODO: check xval label frequency
    """
    from sklearn import model_selection

    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
    if type_ == 'StratifiedGroupKFold':
        assert groups is not None
        # FIXME: The StratifiedGroupKFold could be implemented better.
        splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
        skf_list = list(splitter.split(X=X, y=y, groups=groups))
    elif type_ == 'StratifiedKFold':
        splitter = model_selection.StratifiedKFold(**xval_kw)
        skf_list = list(splitter.split(X=X, y=y))
    return skf_list
Example #22
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License
def test_cross_val_predict_unbalanced():
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    cv = StratifiedKFold(n_splits=2, random_state=1)
    train, test = list(cv.split(X, y))
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test[0]][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test[0]][:, 2] == 0)
    assert np.all(yhat_proba[test[0]][:, 0:1] > 0)
    assert np.all(yhat_proba[test[1]] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12)
Example #23
Source File: stacking.py From xam with MIT License
def __init__(self, models, meta_model, cv=model_selection.StratifiedKFold(n_splits=3),
             metric=metrics.roc_auc_score, use_base_features=False, use_probas=True):
    super().__init__(
        models=models,
        meta_model=meta_model,
        cv=cv,
        metric=metric,
        use_base_features=use_base_features,
        use_probas=use_probas,
    )
Example #24
Source File: test_stacker.py From xcessiv with Apache License 2.0
def setUp(self):
    bl1 = RandomForestClassifier(random_state=8)
    bl2 = LogisticRegression()
    bl3 = RandomForestClassifier(max_depth=10, random_state=10)
    meta_est = LogisticRegression()
    skf = StratifiedKFold(random_state=8).split
    self.stacked_ensemble = stacker.XcessivStackedEnsemble(
        [bl1, bl2, bl3],
        ['predict', 'predict_proba', 'predict_proba'],
        meta_est,
        skf
    )
Example #25
Source File: test_logistic.py From Mastering-Elasticsearch-7.0 with MIT License
def test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr():
    # make sure LogisticRegressionCV gives same best params (l1 and C) as
    # GridSearchCV when penalty is elasticnet and multiclass is ovr. We can't
    # compare best_params like in the previous test because
    # LogisticRegressionCV with multi_class='ovr' will have one C and one
    # l1_param for each class, while LogisticRegression will share the
    # parameters over the *n_classes* classifiers.

    X, y = make_classification(n_samples=200, n_classes=3, n_informative=3,
                               random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    cv = StratifiedKFold(5, random_state=0)

    l1_ratios = np.linspace(0, 1, 5)
    Cs = np.logspace(-4, 4, 5)

    lrcv = LogisticRegressionCV(penalty='elasticnet', Cs=Cs, solver='saga',
                                cv=cv, l1_ratios=l1_ratios, random_state=0,
                                multi_class='ovr')
    lrcv.fit(X_train, y_train)

    param_grid = {'C': Cs, 'l1_ratio': l1_ratios}
    lr = LogisticRegression(penalty='elasticnet', solver='saga',
                            random_state=0, multi_class='ovr')
    gs = GridSearchCV(lr, param_grid, cv=cv, iid=False)
    gs.fit(X_train, y_train)

    # Check that predictions are 80% the same
    assert (lrcv.predict(X_train) == gs.predict(X_train)).mean() >= .8
    assert (lrcv.predict(X_test) == gs.predict(X_test)).mean() >= .8
Example #26
Source File: test_logistic.py From Mastering-Elasticsearch-7.0 with MIT License
def test_LogisticRegressionCV_GridSearchCV_elastic_net(multi_class):
    # make sure LogisticRegressionCV gives same best params (l1 and C) as
    # GridSearchCV when penalty is elasticnet

    if multi_class == 'ovr':
        # This is actually binary classification, ovr multiclass is treated in
        # test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr
        X, y = make_classification(random_state=0)
    else:
        X, y = make_classification(n_samples=200, n_classes=3, n_informative=3,
                                   random_state=0)

    cv = StratifiedKFold(5, random_state=0)

    l1_ratios = np.linspace(0, 1, 5)
    Cs = np.logspace(-4, 4, 5)

    lrcv = LogisticRegressionCV(penalty='elasticnet', Cs=Cs, solver='saga',
                                cv=cv, l1_ratios=l1_ratios, random_state=0,
                                multi_class=multi_class)
    lrcv.fit(X, y)

    param_grid = {'C': Cs, 'l1_ratio': l1_ratios}
    lr = LogisticRegression(penalty='elasticnet', solver='saga',
                            random_state=0, multi_class=multi_class)
    gs = GridSearchCV(lr, param_grid, cv=cv)
    gs.fit(X, y)

    assert gs.best_params_['l1_ratio'] == lrcv.l1_ratio_[0]
    assert gs.best_params_['C'] == lrcv.C_[0]
Example #27
Source File: Stacking.py From MachineLearning with Apache License 2.0
def train(self, train_data, train_label):
    if self.norm_type == "Standardization":
        train_data = preProcess.Standardization(train_data)
    else:
        train_data = preProcess.Normalization(train_data)

    skf = StratifiedKFold(self.n_folds)
    prediction_feature = np.zeros((train_data.shape[0], len(self.classifier_set)))

    trained_model = []

    # the first layer in Stacking
    for j, clf in enumerate(self.classifier_set):
        # train each submodel
        subtrained_model = []
        # cross validation
        for (train_index, test_index) in skf.split(train_data, train_label):
            X_train, X_test = train_data[train_index], train_data[test_index]
            y_train, y_test = train_label[train_index], train_label[test_index]
            # train and save the model trained with S-si
            clf.train(X_train, y_train)
            subtrained_model.append(clf)
            # get the prediction feature for each sub model
            prediction_feature[test_index, j] = clf.predict(X_test)[:, 0]
        # save the models
        trained_model.append(subtrained_model)

    self.trained_classifier_set = trained_model
    return self
Example #28
Source File: clf_helpers.py From ibeis with Apache License 2.0
def subsplit_indices(samples, subset_idx, **xval_kw):
    """ split an existing set """
    from sklearn import model_selection

    X = np.empty((len(subset_idx), 0))
    y = samples.encoded_1d().values[subset_idx]
    groups = samples.group_ids[subset_idx]

    xval_kw_ = xval_kw.copy()
    if 'n_splits' not in xval_kw_:
        xval_kw_['n_splits'] = 3
    type_ = xval_kw_.pop('type', 'StratifiedGroupKFold')

    if type_ == 'StratifiedGroupKFold':
        assert groups is not None
        # FIXME: The StratifiedGroupKFold could be implemented better.
        splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw_)
        rel_skf_list = list(splitter.split(X=X, y=y, groups=groups))
    elif type_ == 'StratifiedKFold':
        splitter = model_selection.StratifiedKFold(**xval_kw_)
        rel_skf_list = list(splitter.split(X=X, y=y))

    # map back into original coords
    skf_list = [(subset_idx[rel_idx1], subset_idx[rel_idx2])
                for rel_idx1, rel_idx2 in rel_skf_list]

    for idx1, idx2 in skf_list:
        assert len(np.intersect1d(subset_idx, idx1)) == len(idx1)
        assert len(np.intersect1d(subset_idx, idx2)) == len(idx2)
        # assert
    return skf_list
Example #29
Source File: utils.py From ICIAR2018 with MIT License
def make_folds():
    """Creates stratified splits based on train directory listing

    # Dumps
        folds: list of splits
            dict{
                "train": {"x": train files list, "y": train labels},
                "test": {"x": test files list, "y": test labels}}
    """
    files = np.array([basename(f) for f in glob.glob("data/preprocessed/train/ResNet-0.5-400/*.npy")])
    labels = []
    classes = np.array([0, 1, 2, 3])
    for f in files:
        lb = np.array([f.startswith("n"), f.startswith("b"),
                       f.startswith("is"), f.startswith("iv")])
        labels.append(classes[np.argmax(lb)])
    labels = np.array(labels)

    folds = []
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(files, labels):
        f_train, f_test = files[train_index], files[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        folds.append({"train": {"x": f_train, "y": y_train},
                      "test": {"x": f_test, "y": y_test}})

    with open("data/folds-10.pkl", "wb") as f:
        pickle.dump(folds, f)
Example #30
Source File: linear_svm.py From driverlessai-recipes with Apache License 2.0
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    orig_cols = list(X.names)

    if self.num_classes >= 2:
        mod = linsvc(random_state=self.random_state, C=self.params["C"],
                     penalty=self.params["penalty"], loss=self.params["loss"],
                     dual=self.params["dual"])
        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
        model = CalibratedClassifierCV(base_estimator=mod, method='isotonic', cv=kf)
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
    else:
        model = LinearSVR(epsilon=self.params["epsilon"], C=self.params["C"],
                          loss=self.params["loss"], dual=self.params["dual"],
                          random_state=self.random_state)

    self.means = dict()
    self.standard_scaler = StandardScaler()
    for col in X.names:
        XX = X[:, col]
        self.means[col] = XX.mean1()
        if self.means[col] is None:
            self.means[col] = 0
        XX.replace(None, self.means[col])
        X[:, col] = XX
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()
    X = self.standard_scaler.fit_transform(X)

    model.fit(X, y, sample_weight=sample_weight)

    importances = np.array([0.0 for k in range(len(orig_cols))])
    if self.num_classes >= 2:
        for classifier in model.calibrated_classifiers_:
            importances += np.array(abs(classifier.base_estimator.get_coeff()))
    else:
        importances += np.array(abs(model.coef_[0]))

    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances.tolist(),  # abs(model.coef_[0])
                              iterations=0)