Python sklearn.model_selection.StratifiedKFold() Examples
The following are 30 code examples of sklearn.model_selection.StratifiedKFold().
Example #1
Source File: From pytorch_geometric with MIT License | 8 votes |
def k_fold(dataset, folds):
    """Build stratified train/test/validation index splits for ``dataset``.

    Args:
        dataset: Sized dataset; ``dataset.data.y`` is used as the class labels
            for stratification.
        folds: Number of folds.

    Returns:
        Tuple ``(train_indices, test_indices, val_indices)``. Each element is
        a list with one index tensor per fold.
    """
    skf = StratifiedKFold(folds, shuffle=True, random_state=12345)

    # The feature argument is only a placeholder of the right length; the
    # stratification is driven by the labels.
    # NOTE(review): the labels argument was missing from this copy of the
    # snippet; ``dataset.data.y`` follows the upstream benchmark - confirm.
    test_indices = [
        torch.from_numpy(idx).to(torch.long)
        for _, idx in skf.split(torch.zeros(len(dataset)), dataset.data.y)
    ]

    # The validation fold of split i is the test fold of split i - 1
    # (index -1 wraps around to the last fold for i == 0).
    val_indices = [test_indices[i - 1] for i in range(folds)]

    train_indices = []
    for test_idx, val_idx in zip(test_indices, val_indices):
        # Everything that is neither test nor validation is training data.
        train_mask = torch.ones(len(dataset), dtype=torch.bool)
        train_mask[test_idx] = 0
        train_mask[val_idx] = 0
        train_indices.append(train_mask.nonzero().view(-1))

    return train_indices, test_indices, val_indices
Example #2
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_grid_search_groups():
    """Check how GridSearchCV handles ``groups`` for group and non-group CVs."""
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    # n_splits is explicit so the test does not depend on the library default,
    # which must not exceed the number of distinct groups (at most 3 here).
    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2),
                 GroupKFold(n_splits=3), GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Without groups the group-aware splitters must refuse to split.
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example #3
Source File: From gentun with Apache License 2.0 | 6 votes |
def cross_validate(self):
    """Train model using k-fold cross validation and
    return mean value of the validation accuracy.

    For every fold the weights are reset, the model is compiled and fitted
    once per ``(epochs, learning_rate)`` pair, and the second value returned
    by ``self.model.evaluate`` on the validation part is averaged over folds.
    """
    acc = .0
    kfold = StratifiedKFold(n_splits=self.kfold, shuffle=True)
    # Stratify on the second-axis index of the entries equal to 1
    # (assumes y_train is one-hot encoded - TODO confirm).
    class_labels = np.where(self.y_train == 1)[1]
    for fold, (train, validation) in enumerate(kfold.split(self.x_train, class_labels)):
        print("KFold {}/{}".format(fold + 1, self.kfold))
        self.reset_weights()
        for epochs, learning_rate in zip(self.epochs, self.learning_rate):
            print("Training {} epochs with learning rate {}".format(epochs, learning_rate))
            self.model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
            # The call target was missing from this copy of the snippet,
            # leaving a dangling argument list.
            self.model.fit(
                self.x_train[train], self.y_train[train], epochs=epochs, batch_size=self.batch_size, verbose=1
            )
        acc += self.model.evaluate(self.x_train[validation], self.y_train[validation], verbose=0)[1] / self.kfold
    return acc
Example #4
Source File: From RecommenderSystems with MIT License | 6 votes |
def main():
    """Create 10 stratified folds and persist the fold indices and data.

    The validation indices of every fold are saved to
    ``config.DATA_PATH + "fold_index.npy"``, then handed to ``save_x_y`` and,
    after being reloaded from disk, to ``save_i``.
    """
    train_x, train_y = _load_data()
    print('loading data done!')

    splitter = StratifiedKFold(n_splits=10, shuffle=True,
                               random_state=config.RANDOM_SEED)
    # Only the validation ids of each fold are kept.
    fold_index = [valid_id for _, valid_id in splitter.split(train_x, train_y)]
    print("fold num: %d" % (len(fold_index)))

    fold_index = np.array(fold_index)
    # The save call was missing from this copy of the snippet, leaving an
    # invalid expression.
    np.save(config.DATA_PATH + "fold_index.npy", fold_index)
    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH + "fold_index.npy")
    save_i(fold_index)
    print("save index done!")
Example #5
Source File: From dgl with Apache License 2.0 | 6 votes |
def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
    """Return the train/validation indices of one fold of a 10-fold split.

    Args:
        labels: Class labels, one per sample, used for stratification.
        fold_idx: Which of the 10 folds to return (0 to 9).
        seed: Random state used when ``shuffle`` is true.
        shuffle: Whether samples are shuffled before splitting.

    Returns:
        Tuple ``(train_idx, valid_idx)`` of index arrays.

    Raises:
        AssertionError: If ``fold_idx`` is outside 0..9.
    """
    # The message is a real string now; the original passed print(...), whose
    # return value (None) became the assertion message.
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."

    # random_state only has an effect when shuffling, and recent scikit-learn
    # raises if it is set while shuffle is False.
    skf = StratifiedKFold(n_splits=10, shuffle=shuffle,
                          random_state=seed if shuffle else None)
    # split(x, y): x is only a placeholder of the right length.
    idx_list = list(skf.split(np.zeros(len(labels)), labels))
    train_idx, valid_idx = idx_list[fold_idx]

    # Apply the format arguments; the original printed the raw "%d" template.
    print("train_set : test_set = %d : %d" % (len(train_idx), len(valid_idx)))

    return train_idx, valid_idx
Example #6
Source File: From rasa_nlu with Apache License 2.0 | 6 votes |
def generate_folds(n, td):
    """Yield ``n`` stratified ``(train, test)`` TrainingData pairs built from ``td``.

    The intent of every example in ``td.intent_examples`` is the
    stratification label. Entity synonyms and regex features of ``td`` are
    passed unchanged to both halves of every pair.
    """
    from sklearn.model_selection import StratifiedKFold

    def _subset(indices):
        # Wrap the selected examples in a new TrainingData object.
        return TrainingData(training_examples=[examples[k] for k in indices],
                            entity_synonyms=td.entity_synonyms,
                            regex_features=td.regex_features)

    splitter = StratifiedKFold(n_splits=n, shuffle=True)
    examples = td.intent_examples
    intents = [ex.get("intent") for ex in examples]

    fold_no = 0
    for train_index, test_index in splitter.split(examples, intents):
        logger.debug("Fold: {}".format(fold_no))
        fold_no += 1
        train_part = _subset(train_index)
        test_part = _subset(test_index)
        yield (train_part, test_part)
Example #7
Source File: From heamy with MIT License | 6 votes |
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross validation iterator.

    Parameters
    ----------
    k : int, default 5
        Number of folds.
    stratify : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
    shuffle : bool, default True
        Whether to shuffle before splitting.
    seed : int, default 33
        Random state; only used when ``shuffle`` is true.

    Yields
    -------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    splitter_cls = StratifiedKFold if stratify else KFold
    # random_state has no effect without shuffling, and recent scikit-learn
    # raises if it is set while shuffle is False.
    kf = splitter_cls(n_splits=k, shuffle=shuffle,
                      random_state=seed if shuffle else None)

    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
Example #8
Source File: From AutoInt with MIT License | 6 votes |
def main():
    """Create 10 stratified folds and persist the fold indices and data.

    The validation indices of every fold are saved to
    ``config.DATA_PATH + "fold_index.npy"``, then handed to ``save_x_y`` and,
    after being reloaded from disk, to ``save_i``.
    """
    train_x, train_y = _load_data()
    print('loading data done!')

    splitter = StratifiedKFold(n_splits=10, shuffle=True,
                               random_state=config.RANDOM_SEED)
    # Only the validation ids of each fold are kept.
    fold_index = [valid_id for _, valid_id in splitter.split(train_x, train_y)]
    print("fold num: %d" % (len(fold_index)))

    fold_index = np.array(fold_index)
    # The save call was missing from this copy of the snippet, leaving an
    # invalid expression.
    np.save(config.DATA_PATH + "fold_index.npy", fold_index)
    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH + "fold_index.npy")
    save_i(fold_index)
    print("save index done!")
Example #9
Source File: From dgl with Apache License 2.0 | 6 votes |
def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
    """Return the train/validation indices of one fold of a 10-fold split.

    Args:
        labels: Class labels, one per sample, used for stratification.
        fold_idx: Which of the 10 folds to return (0 to 9).
        seed: Random state used when ``shuffle`` is true.
        shuffle: Whether samples are shuffled before splitting.

    Returns:
        Tuple ``(train_idx, valid_idx)`` of index arrays.

    Raises:
        AssertionError: If ``fold_idx`` is outside 0..9.
    """
    # The message is a real string now; the original passed print(...), whose
    # return value (None) became the assertion message.
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."

    # random_state only has an effect when shuffling, and recent scikit-learn
    # raises if it is set while shuffle is False.
    skf = StratifiedKFold(n_splits=10, shuffle=shuffle,
                          random_state=seed if shuffle else None)
    # split(x, y): x is only a placeholder of the right length.
    idx_list = list(skf.split(np.zeros(len(labels)), labels))
    train_idx, valid_idx = idx_list[fold_idx]

    # Apply the format arguments; the original printed the raw "%d" template.
    print("train_set : test_set = %d : %d" % (len(train_idx), len(valid_idx)))

    return train_idx, valid_idx
Example #10
Source File: From scikit-mdr with MIT License | 6 votes |
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([
        [2, 0], [0, 0], [0, 1], [0, 0], [0, 0],
        [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
        [0, 0], [0, 0], [0, 0], [1, 1], [1, 1],
    ])
    # Ten samples of class 1 followed by five samples of class 0.
    classes = np.array([1] * 10 + [0] * 5)

    pipeline = make_pipeline(MDR(), LogisticRegression())
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = cross_val_score(pipeline, features, classes, cv=splitter)

    mean_score = np.mean(cv_scores)
    assert mean_score > 0.
Example #11
Source File: From kaggle-rsna18 with MIT License | 6 votes |
def assign_folds(orig_df, num_folds, val_frac=0.10, seed=88):
    """Return a copy of ``orig_df`` with a stratified ``fold`` column added.

    Args:
        orig_df: DataFrame with ``patientId`` and ``combined_cat`` columns;
            ``combined_cat`` is the stratification label.
        num_folds: Number of folds.
        val_frac: Unused by the active code; kept for interface compatibility.
        seed: Seed applied to NumPy's global random state.

    Returns:
        A copy of ``orig_df`` whose ``fold`` column holds, for every row, the
        index of the fold in which that row is part of the test split.
    """
    # Stratified splits
    np.random.seed(seed)
    df = orig_df.copy()
    df["fold"] = None
    # Assign through a single positional indexer. The original chained
    # ``df["fold"].iloc[...] = ...`` may write to a temporary copy and is a
    # no-op under pandas copy-on-write.
    fold_col = df.columns.get_loc("fold")
    skf = StratifiedKFold(n_splits=num_folds, random_state=0, shuffle=True)
    for fold_counter, (_, test_index) in enumerate(skf.split(df.patientId, df.combined_cat)):
        df.iloc[test_index, fold_col] = fold_counter
    # Disabled in the original (per-fold validation assignment), kept as is:
    # for each_fold in np.unique(df.fold):
    #     train_df = df[df.fold != each_fold]
    #     val_counter = 0
    #     train_df["val{}".format(each_fold)] = None
    #     for train_index, test_index in skf.split(train_df.patientId, train_df.combined_cat):
    #         train_df["val{}".format(each_fold)].iloc[test_index] = val_counter
    #         val_counter += 1
    #     df = df.merge(train_df[["patientId", "val{}".format(each_fold)]], on="patientId", how="left")
    return df

##########
# SCRIPT #
##########
Example #12
Source File: From pysaliency with MIT License | 6 votes |
def _get_stratified_crossval_split(stimuli, fixations, split_count, included_splits, random=True, stratified_attributes=None):
    """Return the subset of ``stimuli``/``fixations`` in the selected folds.

    The stimuli are split into ``split_count`` stratified folds; the test
    indices of every fold whose number is in ``included_splits`` are collected
    and passed to ``create_subset``.

    Args:
        stimuli: Sized object with an ``attributes`` mapping.
        fixations: Fixations belonging to ``stimuli``.
        split_count: Number of folds.
        included_splits: Container of fold numbers to include.
        random: Whether to shuffle before splitting (fixed seed 42).
        stratified_attributes: Names of the stimulus attributes used as labels.

    Returns:
        Tuple ``(stimuli, fixations)`` as returned by ``create_subset``.
    """
    from sklearn.model_selection import StratifiedKFold

    labels = []
    for attribute_name in stratified_attributes:
        attribute_data = np.array(stimuli.attributes[attribute_name])
        if attribute_data.ndim == 1:
            attribute_data = attribute_data[:, np.newaxis]
        labels.append(attribute_data)
    # NOTE(review): vstack concatenates along the sample axis, so more than
    # one attribute yields more label rows than stimuli - confirm intent.
    labels = np.vstack(labels)
    X = np.ones((len(stimuli), 1))

    # A random state is only meaningful when shuffling, and recent
    # scikit-learn raises if it is set while shuffle is False.
    rst = np.random.RandomState(42) if random else None
    k_fold = StratifiedKFold(n_splits=split_count, shuffle=random, random_state=rst)

    inds = []
    for i, (_, test_index) in enumerate(k_fold.split(X, labels)):
        if i in included_splits:
            inds.extend(test_index)

    stimuli, fixations = create_subset(stimuli, fixations, inds)
    return stimuli, fixations
Example #13
Source File: From brainiak with Apache License 2.0 | 6 votes |
def _sfn(data, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.

    ``bcast_var`` carries, in order: the labels (index 0), the number of
    cross-validation folds (index 1) and the classifier (index 2). ``myrad``
    is accepted but not used here.

    Returns the mean cross-validation score of the classifier on the masked
    part of ``data[0]``.
    """
    classifier = bcast_var[2]
    # Select the masked entries and transpose before scoring.
    samples = data[0][mask, :].T
    fold_count = bcast_var[1]
    splitter = model_selection.StratifiedKFold(n_splits=fold_count,
                                               shuffle=False)
    fold_scores = model_selection.cross_val_score(classifier, samples,
                                                  y=bcast_var[0],
                                                  cv=splitter,
                                                  n_jobs=1)
    return np.mean(fold_scores)
Example #14
Source File: From deep-mil-for-whole-mammogram-classification with MIT License | 6 votes |
def cvsplit(fold, totalfold, mydict):
    """Get the train/test split for one cross-validation fold.

    Args:
        fold: Index of the fold to return, from 0 to ``totalfold - 1``.
        totalfold: Total number of cross-validation folds.
        mydict: Mapping whose keys are the samples and whose values are the
            labels used for stratification.

    Returns:
        Tuple ``(train, test)`` of index arrays for the requested fold, or
        ``None`` when ``fold`` does not match any fold.
    """
    skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is false, okay!
    # Materialise the dict views: on Python 3 they are not sequences and
    # cannot be converted to proper 1-D arrays by the splitter.
    x = list(mydict.keys())
    y = list(mydict.values())
    for count, (train, test) in enumerate(skf.split(x, y)):
        print(len(train), len(test))
        if count == fold:
            return train, test
    return None
Example #15
Source File: From brainiak with Apache License 2.0 | 6 votes |
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    """Run stratified cross validation with a precomputed-kernel SVM.

    The scores are printed and their mean is logged.

    Args:
        raw_data: Per-epoch data; every item is paired with itself as input.
        labels: Labels passed as ``y`` to ``cross_val_score``.
        num_subjects: Number of subjects, used as the number of folds.
        num_epochs_per_subj: Passed to ``Classifier`` as ``epochs_per_subj``.
    """
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels,
                                             cv=skf)
    print(scores)
    # NOTE(review): the call target was missing from this copy of the snippet;
    # a module-level ``logger`` is assumed - confirm.
    logger.info(
        'the overall cross validation accuracy is %.2f'
        % np.mean(scores)
    )
Example #16
Source File: From fanci with GNU General Public License v3.0 | 6 votes |
def kfold_cv(clf_type, data_sets: [DataSet], fold_count=5, repetitions=5, n_jobs=-1, parallel_verbose=1, persist=True):
    """
    Do a kfold cross validation for every data set, in parallel.

    :param clf_type: classifier type, passed to the data set generator
    :param data_sets: list of data sets
    :param fold_count: count of folds to be made and hence also runs
    :param repetitions: how often the cross validation is repeated per data set
    :param n_jobs: number of parallel jobs
    :param parallel_verbose: verbosity of the parallel runner
    :param persist: passed to the result serialization
    :return: the result of ``_serialize_cv_results``
    """
    import logging

    # NOTE(review): the logging call was missing from this copy of the snippet
    # (the message was glued to the docstring); the project may use its own
    # module-level logger - confirm.
    logging.getLogger(__name__).info(
        'Starting {!s}-fold cv. Set count: {!s}'.format(fold_count, len(data_sets)))

    parallel = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)
    skf = StratifiedKFold(n_splits=fold_count, shuffle=True)

    # One job per (data set, repetition, fold).
    stats_list = parallel(
        delayed(_fit_and_score)(clf, domains, labels, train_index, test_index, i, data_set_id, fold_count)
        for domains, labels, data_set_id, clf in _data_sets_generator(data_sets, clf_type)
        for i in range(repetitions)
        for train_index, test_index in skf.split(domains, labels)
    )

    where = settings.EVAL_FOLDER + '/' + '{!s}fold_cv_{!s}_{!s}rep_{!s}sets_{!s}.pkl'.format(
        fold_count, clf_type, repetitions, len(data_sets), settings.NOW_STR)
    return _serialize_cv_results(stats_list, persist, where)
Example #17
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_2d_y():
    """Smoke test: every splitter accepts 1d and 2d y and handles multi-label y."""
    n_samples = 30
    rng = np.random.RandomState(1)
    # The order of the draws below fixes the generated data; keep it.
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))

    splitters = [
        LeaveOneOut(),
        LeavePOut(p=2),
        KFold(),
        StratifiedKFold(),
        RepeatedKFold(),
        RepeatedStratifiedKFold(),
        ShuffleSplit(),
        StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(),
        LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2),
        GroupKFold(),
        TimeSeriesSplit(),
        PredefinedSplit(test_fold=groups),
    ]

    # A splitter may reject multi-label targets, but only with this message.
    allowed_target_types = ('binary', 'multiclass')
    expected_msg = "Supported target types are: {}. Got 'multilabel".format(
        allowed_target_types)

    for splitter in splitters:
        for target in (y, y_2d):
            list(splitter.split(X, target, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as err:
            assert expected_msg in str(err)
Example #18
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_stratified_kfold_ratios():
    """Check that stratified kfold preserves class ratios in individual splits.

    The check is repeated with shuffling turned off and on.
    """
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))
    # (class label, expected share of that class) in every split.
    expected_ratios = ((4, 0.10), (0, 0.89), (1, 0.01))

    for shuffle in (False, True):
        splitter = StratifiedKFold(5, shuffle=shuffle)
        for train, test in splitter.split(X, y):
            for part in (train, test):
                for label, ratio in expected_ratios:
                    observed = np.sum(y[part] == label) / len(part)
                    assert_almost_equal(observed, ratio, 2)
Example #19
Source File: From ibeis with Apache License 2.0 | 6 votes |
def setup(pblm):
    """Configure ``pblm`` with the iris dataset as a single task.

    Sets the task/data/classifier keys, builds the feature frame and the
    per-class indicator samples, and selects ``StratifiedKFold`` for cross
    validation.
    """
    import sklearn.datasets
    iris = sklearn.datasets.load_iris()

    pblm.primary_task_key = 'iris'
    pblm.default_data_key = 'learn(all)'
    pblm.default_clf_key = 'RF'

    # NOTE(review): ``iris.data`` and ``iris.target`` were missing from this
    # copy of the snippet and have been restored - confirm.
    X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    samples = MultiTaskSamples(X_df.index)
    # One boolean indicator per target name.
    samples.apply_indicators(
        {'iris': {name: iris.target == idx
                  for idx, name in enumerate(iris.target_names)}})
    samples.X_dict = {'learn(all)': X_df}

    pblm.samples = samples
    pblm.xval_kw['type'] = 'StratifiedKFold'
Example #20
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_shuffle_stratifiedkfold():
    """Check that shuffling happens when requested and coverage is proper."""
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    # Different seeds must give different test folds at every position.
    paired_splits = zip(kf0.split(X_40, y), kf1.split(X_40, y))
    for (_, test0), (_, test1) in paired_splits:
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5)

    # Ensure that we shuffle each class's samples with different
    # random_state in StratifiedKFold
    X = np.arange(10)
    y = [0] * 5 + [1] * 5
    kf1 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf2 = StratifiedKFold(5, shuffle=True, random_state=1)
    test_set1 = sorted(tuple(test) for _, test in kf1.split(X, y))
    test_set2 = sorted(tuple(test) for _, test in kf2.split(X, y))
    assert test_set1 != test_set2
Example #21
Source File: From ibeis with Apache License 2.0 | 6 votes |
def stratified_kfold_indices(samples, **xval_kw):
    """Return the list of ``(train, test)`` index pairs for ``samples``.

    The keyword ``type`` (default ``'StratifiedGroupKFold'``) selects the
    splitter; the remaining keywords are passed to its constructor.

    Raises:
        ValueError: If ``type`` names an unsupported splitter.

    TODO: check xval label frequency
    """
    from sklearn import model_selection

    # Only the number of rows matters to the splitters; X has no columns.
    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
    if type_ == 'StratifiedGroupKFold':
        assert groups is not None
        # FIXME: The StratifiedGroupKFold could be implemented better.
        splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
        skf_list = list(splitter.split(X=X, y=y, groups=groups))
    elif type_ == 'StratifiedKFold':
        splitter = model_selection.StratifiedKFold(**xval_kw)
        skf_list = list(splitter.split(X=X, y=y))
    else:
        # Fail clearly instead of hitting an unbound ``skf_list`` below.
        raise ValueError('Unknown cross validation type: {!r}'.format(type_))
    return skf_list
Example #22
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cross_val_predict_unbalanced():
    """Check ``cross_val_predict`` probabilities when a class has one sample."""
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    # No random_state: it has no effect without shuffling, and recent
    # scikit-learn raises if it is set while shuffle is False.
    cv = StratifiedKFold(n_splits=2)
    train, test = list(cv.split(X, y))
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test[0]][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test[0]][:, 2] == 0)
    assert np.all(yhat_proba[test[0]][:, 0:1] > 0)
    assert np.all(yhat_proba[test[1]] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12)
Example #23
Source File: From xam with MIT License | 5 votes |
def __init__(self, models, meta_model, cv=model_selection.StratifiedKFold(n_splits=3), metric=metrics.roc_auc_score,
             use_base_features=False, use_probas=True):
    """Initialise by forwarding every argument, unchanged, to the parent class.

    The defaults are a 3-fold ``StratifiedKFold`` for ``cv`` and
    ``roc_auc_score`` for ``metric``.
    """
    parent_kwargs = dict(
        models=models,
        meta_model=meta_model,
        cv=cv,
        metric=metric,
        use_base_features=use_base_features,
        use_probas=use_probas,
    )
    super().__init__(**parent_kwargs)
Example #24
Source File: From xcessiv with Apache License 2.0 | 5 votes |
def setUp(self):
    """Build the stacked ensemble fixture used by the tests.

    Three base learners feed a logistic-regression meta estimator; the bound
    ``split`` method of a ``StratifiedKFold`` is the cross-validation source.
    """
    bl1 = RandomForestClassifier(random_state=8)
    bl2 = LogisticRegression()
    bl3 = RandomForestClassifier(max_depth=10, random_state=10)

    meta_est = LogisticRegression()

    # No random_state: it has no effect without shuffling, and recent
    # scikit-learn raises if it is set while shuffle is False.
    skf = StratifiedKFold().split

    self.stacked_ensemble = stacker.XcessivStackedEnsemble(
        [bl1, bl2, bl3],
        ['predict', 'predict_proba', 'predict_proba'],
        meta_est,
        skf
    )
Example #25
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr():
    """Compare LogisticRegressionCV and GridSearchCV predictions (elasticnet, ovr)."""
    # make sure LogisticRegressionCV gives same best params (l1 and C) as
    # GridSearchCV when penalty is elasticnet and multiclass is ovr. We can't
    # compare best_params like in the previous test because
    # LogisticRegressionCV with multi_class='ovr' will have one C and one
    # l1_param for each class, while LogisticRegression will share the
    # parameters over the *n_classes* classifiers.

    X, y = make_classification(n_samples=200, n_classes=3, n_informative=3,
                               random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    # No random_state: it has no effect without shuffling, and recent
    # scikit-learn raises if it is set while shuffle is False.
    cv = StratifiedKFold(5)

    l1_ratios = np.linspace(0, 1, 5)
    Cs = np.logspace(-4, 4, 5)

    lrcv = LogisticRegressionCV(penalty='elasticnet', Cs=Cs, solver='saga',
                                cv=cv, l1_ratios=l1_ratios, random_state=0,
                                multi_class='ovr')
    # The fit calls below were missing from this copy of the snippet.
    lrcv.fit(X_train, y_train)

    param_grid = {'C': Cs, 'l1_ratio': l1_ratios}
    lr = LogisticRegression(penalty='elasticnet', solver='saga',
                            random_state=0, multi_class='ovr')
    # NOTE(review): ``iid`` only exists in older scikit-learn releases; kept
    # because it affects score aggregation there.
    gs = GridSearchCV(lr, param_grid, cv=cv, iid=False)
    gs.fit(X_train, y_train)

    # Check that predictions are 80% the same
    assert (lrcv.predict(X_train) == gs.predict(X_train)).mean() >= .8
    assert (lrcv.predict(X_test) == gs.predict(X_test)).mean() >= .8
Example #26
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_LogisticRegressionCV_GridSearchCV_elastic_net(multi_class):
    """Compare best params of LogisticRegressionCV and GridSearchCV (elasticnet)."""
    # make sure LogisticRegressionCV gives same best params (l1 and C) as
    # GridSearchCV when penalty is elasticnet

    if multi_class == 'ovr':
        # This is actually binary classification, ovr multiclass is treated in
        # test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr
        X, y = make_classification(random_state=0)
    else:
        X, y = make_classification(n_samples=200, n_classes=3, n_informative=3,
                                   random_state=0)

    # No random_state: it has no effect without shuffling, and recent
    # scikit-learn raises if it is set while shuffle is False.
    cv = StratifiedKFold(5)

    l1_ratios = np.linspace(0, 1, 5)
    Cs = np.logspace(-4, 4, 5)

    lrcv = LogisticRegressionCV(penalty='elasticnet', Cs=Cs, solver='saga',
                                cv=cv, l1_ratios=l1_ratios, random_state=0,
                                multi_class=multi_class)
    # The fit calls below were missing from this copy of the snippet.
    lrcv.fit(X, y)

    param_grid = {'C': Cs, 'l1_ratio': l1_ratios}
    lr = LogisticRegression(penalty='elasticnet', solver='saga',
                            random_state=0, multi_class=multi_class)
    gs = GridSearchCV(lr, param_grid, cv=cv)
    gs.fit(X, y)

    assert gs.best_params_['l1_ratio'] == lrcv.l1_ratio_[0]
    assert gs.best_params_['C'] == lrcv.C_[0]
Example #27
Source File: From MachineLearning with Apache License 2.0 | 5 votes |
def train(self, train_data, train_label):
    """Train the first layer of the stacking ensemble with cross validation.

    Every classifier in ``self.classifier_set`` is trained once per fold; a
    snapshot of the classifier after each fold is stored in
    ``self.trained_classifier_set`` (one list of per-fold models per
    classifier).

    Args:
        train_data: Training samples, indexable by integer index arrays.
        train_label: Training labels, indexable by integer index arrays.

    Returns:
        ``self``.
    """
    import copy

    if self.norm_type == "Standardization":
        train_data = preProcess.Standardization(train_data)
    else:
        train_data = preProcess.Normalization(train_data)

    skf = StratifiedKFold(self.n_folds)

    # Column j holds the out-of-fold predictions of classifier j.
    prediction_feature = np.zeros((train_data.shape[0], len(self.classifier_set)))
    trained_model = []

    # the first layer in Stacking
    for j, clf in enumerate(self.classifier_set):
        # train each submodel
        subtrained_model = []
        # cross validation
        for (train_index, test_index) in skf.split(train_data, train_label):
            X_train, X_test = train_data[train_index], train_data[test_index]
            y_train = train_label[train_index]
            # train and save the model trained with S-si
            clf.train(X_train, y_train)
            # Store a snapshot: appending ``clf`` itself would make every
            # entry alias the same object, i.e. the model of the last fold.
            subtrained_model.append(copy.deepcopy(clf))
            # get the prediction feature for each sub model
            prediction_feature[test_index, j] = clf.predict(X_test)[:, 0]
        # save the models
        trained_model.append(subtrained_model)

    self.trained_classifier_set = trained_model
    return self
Example #28
Source File: From ibeis with Apache License 2.0 | 5 votes |
def subsplit_indices(samples, subset_idx, **xval_kw):
    """Split an existing set.

    The samples selected by ``subset_idx`` are split with the splitter named
    by the ``type`` keyword (default ``'StratifiedGroupKFold'``); ``n_splits``
    defaults to 3. The returned indices refer to the original sample
    coordinates, not to positions within the subset.

    Raises:
        ValueError: If ``type`` names an unsupported splitter.
    """
    from sklearn import model_selection

    # Only the number of rows matters to the splitters; X has no columns.
    X = np.empty((len(subset_idx), 0))
    y = samples.encoded_1d().values[subset_idx]
    groups = samples.group_ids[subset_idx]

    xval_kw_ = xval_kw.copy()
    if 'n_splits' not in xval_kw_:
        xval_kw_['n_splits'] = 3
    type_ = xval_kw_.pop('type', 'StratifiedGroupKFold')
    if type_ == 'StratifiedGroupKFold':
        assert groups is not None
        # FIXME: The StratifiedGroupKFold could be implemented better.
        splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw_)
        rel_skf_list = list(splitter.split(X=X, y=y, groups=groups))
    elif type_ == 'StratifiedKFold':
        splitter = model_selection.StratifiedKFold(**xval_kw_)
        rel_skf_list = list(splitter.split(X=X, y=y))
    else:
        # Fail clearly instead of hitting an unbound ``rel_skf_list`` below.
        raise ValueError('Unknown cross validation type: {!r}'.format(type_))

    # map back into original coords
    skf_list = [(subset_idx[rel_idx1], subset_idx[rel_idx2])
                for rel_idx1, rel_idx2 in rel_skf_list]

    # Sanity check: every mapped index must come from the subset.
    for idx1, idx2 in skf_list:
        assert len(np.intersect1d(subset_idx, idx1)) == len(idx1)
        assert len(np.intersect1d(subset_idx, idx2)) == len(idx2)
    return skf_list
Example #29
Source File: From ICIAR2018 with MIT License | 5 votes |
def make_folds():
    """Create stratified splits from the train directory listing and dump them.

    The label of a file is derived from its name prefix ("n", "b", "is",
    "iv" map to classes 0 to 3). The result written to "data/folds-10.pkl"
    is a list with one dict per split:

        {"train": {"x": train files, "y": train labels},
         "test": {"x": test files, "y": test labels}}
    """
    file_names = np.array([basename(path) for path in glob.glob("data/preprocessed/train/ResNet-0.5-400/*.npy")])

    classes = np.array([0, 1, 2, 3])
    prefixes = ("n", "b", "is", "iv")
    labels = []
    for file_name in file_names:
        # argmax picks the first matching prefix (index 0 when none matches).
        prefix_hits = np.array([file_name.startswith(prefix) for prefix in prefixes])
        labels.append(classes[np.argmax(prefix_hits)])
    labels = np.array(labels)

    splitter = StratifiedKFold(n_splits=10, shuffle=True)
    folds = [
        {"train": {"x": file_names[train_index], "y": labels[train_index]},
         "test": {"x": file_names[test_index], "y": labels[test_index]}}
        for train_index, test_index in splitter.split(file_names, labels)
    ]

    with open("data/folds-10.pkl", "wb") as out_file:
        pickle.dump(folds, out_file)
Example #30
Source File: From driverlessai-recipes with Apache License 2.0 | 5 votes |
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Fit a linear SVM (calibrated classifier or regressor) on ``X``/``y``.

    With ``self.num_classes >= 2`` a calibrated linear SVC is trained on
    label-encoded targets, otherwise a ``LinearSVR``. Missing values are
    replaced by the column mean (0 if the mean is undefined) and the features
    are standardised before fitting. The per-column means and the scaler are
    stored on ``self``; the fitted model and the absolute coefficients as
    importances are registered through ``set_model_properties``.

    ``eval_set``, ``sample_weight_eval_set`` and ``kwargs`` are accepted but
    not used here.
    """
    X = dt.Frame(X)
    orig_cols = list(X.names)

    if self.num_classes >= 2:
        mod = linsvc(random_state=self.random_state, C=self.params["C"], penalty=self.params["penalty"],
                     loss=self.params["loss"], dual=self.params["dual"])
        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
        model = CalibratedClassifierCV(base_estimator=mod, method='isotonic', cv=kf)
        lb = LabelEncoder()
        # The encoder must be fitted before transform() can be used.
        # NOTE(review): fitting on ``self.labels`` is assumed from the
        # surrounding framework's convention - confirm the attribute exists.
        lb.fit(self.labels)
        y = lb.transform(y)
    else:
        model = LinearSVR(epsilon=self.params["epsilon"], C=self.params["C"], loss=self.params["loss"],
                          dual=self.params["dual"], random_state=self.random_state)

    self.means = dict()
    self.standard_scaler = StandardScaler()
    for col in X.names:
        XX = X[:, col]
        self.means[col] = XX.mean1()
        if self.means[col] is None:
            self.means[col] = 0
        # Impute missing values with the column mean.
        XX.replace(None, self.means[col])
        X[:, col] = XX
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()
    X = self.standard_scaler.fit_transform(X)
    # The fit call was missing from this copy of the snippet, leaving a
    # dangling argument list.
    model.fit(X, y, sample_weight=sample_weight)

    importances = np.array([0.0 for k in range(len(orig_cols))])
    if self.num_classes >= 2:
        # Sum the absolute coefficients over the calibrated classifiers.
        for classifier in model.calibrated_classifiers_:
            importances += np.array(abs(classifier.base_estimator.get_coeff()))
    else:
        importances += np.array(abs(model.coef_[0]))

    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances.tolist(),  # abs(model.coef_[0])
                              iterations=0)