Python sklearn.model_selection() Examples
The following are 16 code examples of sklearn.model_selection().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn, or try the search function.
Example #1
Source File: fixes.py From skutil with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _cv_len(cv, X, y):
    """Return the number of splits of a cross-validation object.

    The lookup is agnostic of whether sklearn-0.17 or sklearn-0.18 is in
    use: the module-level ``SK18`` flag selects which API is queried.

    Parameters
    ----------
    cv : `sklearn.cross_validation._PartitionIterator` or
        `sklearn.model_selection.BaseCrossValidator`
        The cv object from which to extract the length. With sklearn-0.17
        the length comes from ``len(cv)``; otherwise it comes from
        ``cv.get_n_splits(X, y)``.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    Returns
    -------
    int
    """
    if SK18:
        return cv.get_n_splits(X, y)
    return len(cv)
Example #2
Source File: clf_helpers.py From ibeis with Apache License 2.0 | 6 votes |
def stratified_kfold_indices(samples, **xval_kw):
    """
    Materialise the cross-validation splits for ``samples``.

    Args:
        samples: object supporting ``len()``, ``encoded_1d().values`` (used
            as the stratification target ``y``) and ``group_ids``.
        **xval_kw: keyword arguments forwarded to the splitter constructor.
            The optional ``type`` entry selects the splitter and is removed
            before forwarding; it must be ``'StratifiedGroupKFold'``
            (default) or ``'StratifiedKFold'``.

    Returns:
        list: everything yielded by ``splitter.split(...)``.

    Raises:
        ValueError: if ``type`` names an unsupported splitter.

    TODO: check xval label frequency
    """
    type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
    if type_ not in ('StratifiedGroupKFold', 'StratifiedKFold'):
        # Without this check an unknown type fell through both branches and
        # surfaced as an UnboundLocalError on the return statement.
        raise ValueError('Unknown cross-validation type: %r' % (type_,))

    from sklearn import model_selection

    # Only the number of rows matters to the splitters, hence zero columns.
    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    if type_ == 'StratifiedGroupKFold':
        assert groups is not None
        # FIXME: The StratifiedGroupKFold could be implemented better.
        splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
        skf_list = list(splitter.split(X=X, y=y, groups=groups))
    else:
        splitter = model_selection.StratifiedKFold(**xval_kw)
        skf_list = list(splitter.split(X=X, y=y))
    return skf_list
Example #3
Source File: classify_shark.py From ibeis with Apache License 2.0 | 6 votes |
def gen_crossval_idxs(problem, n_folds=2):
    """
    Yield ``(train_idx, test_idx)`` pairs for cross-validating ``problem``.

    Args:
        problem: object whose ``ds`` attribute exposes ``target``, ``name``
            and, optionally, ``nids``.
        n_folds (int): number of folds to generate (default 2).

    Yields:
        tuple: ``(train_idx, test_idx)`` as produced by the underlying
        splitter.

    When ``problem.ds`` has ``nids`` the split is delegated to
    ``ibeis_cnn.dataset.stratified_kfold_label_split``; otherwise a shuffled
    scikit-learn ``StratifiedKFold`` with a fixed seed is used.
    """
    y = problem.ds.target
    rng = 43432
    if hasattr(problem.ds, 'nids'):
        # Ensure that an individual does not appear in both the train
        # and the test dataset
        from ibeis_cnn.dataset import stratified_kfold_label_split
        labels = problem.ds.nids
        _iter = stratified_kfold_label_split(y, labels, n_folds=n_folds, rng=rng)
    else:
        try:
            # Legacy API is tried first so installs that still ship it keep
            # producing exactly the folds they produced before.
            import sklearn.cross_validation
        except ImportError:
            # sklearn.cross_validation was removed in scikit-learn 0.20;
            # fall back to the model_selection equivalent.
            import numpy as np
            import sklearn.model_selection
            skf = sklearn.model_selection.StratifiedKFold(
                n_splits=n_folds, shuffle=True, random_state=rng)
            _iter = skf.split(X=np.empty(len(y)), y=y)
        else:
            xvalkw = dict(n_folds=n_folds, shuffle=True, random_state=rng)
            skf = sklearn.cross_validation.StratifiedKFold(y, **xvalkw)
            _iter = skf
    msg = 'cross-val test on %s' % (problem.ds.name)
    progiter = ut.ProgIter(_iter, length=n_folds, lbl=msg)
    for train_idx, test_idx in progiter:
        yield train_idx, test_idx
# @ut.reloadable_class
Example #4
Source File: sklearn_intent_classifier.py From rasa_nlu with Apache License 2.0 | 6 votes |
def __init__(self,
             component_config: Dict[Text, Any] = None,
             clf: 'sklearn.model_selection.GridSearchCV' = None,
             le: Optional['sklearn.preprocessing.LabelEncoder'] = None
             ) -> None:
    """Construct a new intent classifier using the sklearn framework.

    ``component_config`` is handed to the parent constructor, ``clf`` is
    stored as given, and a fresh ``LabelEncoder`` is created when ``le`` is
    not supplied.
    """
    from sklearn.preprocessing import LabelEncoder

    super(SklearnIntentClassifier, self).__init__(component_config)

    # Reuse the caller's encoder when one is passed in.
    self.le = LabelEncoder() if le is None else le
    self.clf = clf

    _sklearn_numpy_warning_fix()
Example #5
Source File: sklearn_intent_classifier.py From rasa_nlu with Apache License 2.0 | 6 votes |
def _create_classifier(self, num_threads, y):
    """Build the grid-searched SVC used for intent classification.

    The search grid (``C``, ``gamma``, ``kernels``) and the scoring function
    are read from ``self.component_config``; ``num_threads`` is passed on as
    ``n_jobs`` and the fold count comes from ``self._num_cv_splits(y)``.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    config = self.component_config
    c_values = config["C"]
    raw_kernels = config["kernels"]
    gamma_values = config["gamma"]

    # dirty str fix because sklearn is expecting
    # str not instance of basestr...
    param_grid = [{
        "C": c_values,
        "gamma": gamma_values,
        "kernel": [str(kernel) for kernel in raw_kernels],
    }]

    # aim for 5 examples in each fold
    n_splits = self._num_cv_splits(y)

    base_estimator = SVC(C=1, probability=True, class_weight='balanced')
    return GridSearchCV(base_estimator,
                        param_grid=param_grid,
                        n_jobs=num_threads,
                        cv=n_splits,
                        scoring=self.component_config['scoring_function'],
                        verbose=1)
Example #6
Source File: sklearn_intent_classifier.py From rasa-for-botfront with Apache License 2.0 | 6 votes |
def __init__(
    self,
    component_config: Optional[Dict[Text, Any]] = None,
    clf: "sklearn.model_selection.GridSearchCV" = None,
    le: Optional["sklearn.preprocessing.LabelEncoder"] = None,
) -> None:
    """Construct a new intent classifier using the sklearn framework.

    ``component_config`` goes to the parent constructor, ``clf`` is stored
    unchanged, and ``le`` defaults to a newly created ``LabelEncoder``.
    """
    from sklearn.preprocessing import LabelEncoder

    super().__init__(component_config)

    # Reuse the caller's encoder when one is passed in.
    self.le = LabelEncoder() if le is None else le
    self.clf = clf
Example #7
Source File: sklearn_intent_classifier.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def __init__(self,
             component_config=None,  # type: Dict[Text, Any]
             clf=None,  # type: sklearn.model_selection.GridSearchCV
             le=None  # type: sklearn.preprocessing.LabelEncoder
             ):
    # type: (...) -> None
    """Construct a new intent classifier using the sklearn framework.

    ``component_config`` is forwarded to the parent constructor, ``clf`` is
    stored as given, and a new ``LabelEncoder`` is created when ``le`` is
    omitted.
    """
    from sklearn.preprocessing import LabelEncoder

    super(SklearnIntentClassifier, self).__init__(component_config)

    # Reuse the caller's encoder when one is passed in.
    self.le = LabelEncoder() if le is None else le
    self.clf = clf

    _sklearn_numpy_warning_fix()
Example #8
Source File: sklearn_intent_classifier.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def _create_classifier(self, num_threads, y):
    """Build the grid-searched SVC used for intent classification.

    ``C`` and ``kernels`` are read from ``self.component_config`` to form
    the search grid; ``num_threads`` becomes ``n_jobs``, the fold count is
    ``self._num_cv_splits(y)`` and scoring is fixed to ``'f1_weighted'``.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    config = self.component_config
    c_values = config["C"]
    raw_kernels = config["kernels"]

    # dirty str fix because sklearn is expecting
    # str not instance of basestr...
    param_grid = [{
        "C": c_values,
        "kernel": [str(kernel) for kernel in raw_kernels],
    }]

    # aim for 5 examples in each fold
    n_splits = self._num_cv_splits(y)

    base_estimator = SVC(C=1, probability=True, class_weight='balanced')
    return GridSearchCV(base_estimator,
                        param_grid=param_grid,
                        n_jobs=num_threads,
                        cv=n_splits,
                        scoring='f1_weighted',
                        verbose=1)
Example #9
Source File: fixes.py From skutil with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _set_cv(cv, X, y, classifier):
    """Validate ``cv`` through ``check_cv`` using the right call signature.

    Returns either a `sklearn.cross_validation._PartitionIterator` or a
    `sklearn.model_selection.BaseCrossValidator`, depending on whether
    sklearn-0.17 or sklearn-0.18 is in use (the module-level ``SK18`` flag
    decides which ``check_cv`` signature is called).

    Parameters
    ----------
    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, it will be converted
        into the appropriate class of cross-validator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search. Only
        passed along on the pre-0.18 path.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier.

    Returns
    -------
    `_PartitionIterator` or `BaseCrossValidator`
    """
    if SK18:
        return check_cv(cv, y, classifier)
    return check_cv(cv, X, y, classifier)
Example #10
Source File: build_model.py From gordo with GNU Affero General Public License v3.0 | 5 votes |
def build_split_dict(X: pd.DataFrame, split_obj: Type[BaseCrossValidator]) -> dict:
    """
    Collect per-fold metadata describing a cross-validation split of ``X``.

    Parameters
    ----------
    X: pd.DataFrame
        The training dataset that will be split during cross-validation.
    split_obj: Type[sklearn.model_selection.BaseCrossValidator]
        The cross-validation object whose ``split(X)`` yields
        ``(train, test)`` index sequences.

    Returns
    -------
    split_metadata: Dict[str,Any]
        For each fold (numbered from 1): the ``X.index`` labels at the first
        and last train/test positions, and the train/test sizes.
    """
    split_metadata: Dict[str, Any] = dict()
    for fold, (train_ind, test_ind) in enumerate(split_obj.split(X), start=1):
        # Keys are inserted in the same order for every fold: boundaries
        # first, then the sample counts.
        fold_entries = {
            f"fold-{fold}-train-start": X.index[train_ind[0]],
            f"fold-{fold}-train-end": X.index[train_ind[-1]],
            f"fold-{fold}-test-start": X.index[test_ind[0]],
            f"fold-{fold}-test-end": X.index[test_ind[-1]],
            f"fold-{fold}-n-train": len(train_ind),
            f"fold-{fold}-n-test": len(test_ind),
        }
        split_metadata.update(fold_entries)
    return split_metadata
Example #11
Source File: smk_pipeline.py From ibeis with Apache License 2.0 | 5 votes |
def testdata_smk(*args, **kwargs):
    """
    Build an SMK query request on the ``PZ_MTEST`` test database.

    The annotations are split with a 4-fold ``StratifiedKFold`` over their
    name ids; the first fold's train indices become the database
    annotations and its test indices become the query annotations.

    Args:
        *args: unused.
        **kwargs: merged into the request config on top of the default
            ``{'num_words': 1000}``.

    Returns:
        tuple: ``(ibs, smk, qreq_)``

    Example:
        >>> from ibeis.algo.smk.smk_pipeline import *  # NOQA
        >>> kwargs = {}
    """
    import ibeis
    ibs, aid_list = ibeis.testdata_aids(defaultdb='PZ_MTEST')
    nid_list = np.array(ibs.annots(aid_list).nids)
    rng = ut.ensure_rng(0)
    try:
        # Legacy API is tried first so installs that still ship it keep
        # producing exactly the split they produced before.
        import sklearn.cross_validation
    except ImportError:
        # sklearn.cross_validation was removed in scikit-learn 0.20.
        # Shuffling is off, so random_state has no effect and is omitted
        # (newer scikit-learn rejects it when shuffle is False).
        import sklearn.model_selection
        skf = sklearn.model_selection.StratifiedKFold(n_splits=4, shuffle=False)
        splits = skf.split(np.empty(len(nid_list)), nid_list)
    else:
        xvalkw = dict(n_folds=4, shuffle=False, random_state=rng)
        skf = sklearn.cross_validation.StratifiedKFold(nid_list, **xvalkw)
        splits = iter(skf)
    train_idx, test_idx = six.next(splits)
    daids = ut.take(aid_list, train_idx)
    qaids = ut.take(aid_list, test_idx)

    config = {
        'num_words': 1000,
    }
    config.update(**kwargs)
    qreq_ = SMKRequest(ibs, qaids, daids, config)
    smk = qreq_.smk
    #qreq_ = ibs.new_query_request(qaids, daids, cfgdict={'pipeline_root': 'smk', 'proot': 'smk'})
    #qreq_ = ibs.new_query_request(qaids, daids, cfgdict={})
    return ibs, smk, qreq_
Example #12
Source File: clf_helpers.py From ibeis with Apache License 2.0 | 5 votes |
def subsplit_indices(samples, subset_idx, **xval_kw):
    """
    Split an existing subset of ``samples`` into cross-validation folds.

    Args:
        samples: object exposing ``encoded_1d().values`` and ``group_ids``,
            both of which are indexed by ``subset_idx``.
        subset_idx: indices (into ``samples``) of the subset to split; it is
            also indexed by the relative fold indices to map them back.
        **xval_kw: keyword arguments forwarded to the splitter constructor.
            ``n_splits`` defaults to 3. The optional ``type`` entry selects
            the splitter and is removed before forwarding; it must be
            ``'StratifiedGroupKFold'`` (default) or ``'StratifiedKFold'``.

    Returns:
        list: ``(idx1, idx2)`` pairs expressed in the original coordinates
        of ``samples``.

    Raises:
        ValueError: if ``type`` names an unsupported splitter.
    """
    xval_kw_ = xval_kw.copy()
    xval_kw_.setdefault('n_splits', 3)
    type_ = xval_kw_.pop('type', 'StratifiedGroupKFold')
    if type_ not in ('StratifiedGroupKFold', 'StratifiedKFold'):
        # Without this check an unknown type fell through both branches and
        # surfaced as an UnboundLocalError further down.
        raise ValueError('Unknown cross-validation type: %r' % (type_,))

    from sklearn import model_selection

    # Only the number of rows matters to the splitters, hence zero columns.
    X = np.empty((len(subset_idx), 0))
    y = samples.encoded_1d().values[subset_idx]
    groups = samples.group_ids[subset_idx]

    if type_ == 'StratifiedGroupKFold':
        assert groups is not None
        # FIXME: The StratifiedGroupKFold could be implemented better.
        splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw_)
        rel_skf_list = list(splitter.split(X=X, y=y, groups=groups))
    else:
        splitter = model_selection.StratifiedKFold(**xval_kw_)
        rel_skf_list = list(splitter.split(X=X, y=y))

    # map back into original coords
    skf_list = [(subset_idx[rel_idx1], subset_idx[rel_idx2])
                for rel_idx1, rel_idx2 in rel_skf_list]

    # Sanity check: every mapped index must belong to the requested subset.
    for idx1, idx2 in skf_list:
        assert len(np.intersect1d(subset_idx, idx1)) == len(idx1)
        assert len(np.intersect1d(subset_idx, idx2)) == len(idx2)
    return skf_list
Example #13
Source File: sklearn_intent_classifier.py From rasa-for-botfront with Apache License 2.0 | 5 votes |
def _create_classifier(
    self, num_threads: int, y
) -> "sklearn.model_selection.GridSearchCV":
    """Build the grid-searched SVC used for intent classification.

    The search grid (``C``, ``gamma``, ``kernels``) and the scoring function
    are read from ``self.component_config``; ``num_threads`` is passed on as
    ``n_jobs``, the fold count comes from ``self._num_cv_splits(y)`` and
    ``iid=False`` is passed explicitly.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    config = self.component_config
    c_values = config["C"]
    raw_kernels = config["kernels"]
    gamma_values = config["gamma"]

    # dirty str fix because sklearn is expecting
    # str not instance of basestr...
    param_grid = [
        {
            "C": c_values,
            "gamma": gamma_values,
            "kernel": [str(kernel) for kernel in raw_kernels],
        }
    ]

    # aim for 5 examples in each fold
    n_splits = self._num_cv_splits(y)

    base_estimator = SVC(C=1, probability=True, class_weight="balanced")
    return GridSearchCV(
        base_estimator,
        param_grid=param_grid,
        n_jobs=num_threads,
        cv=n_splits,
        scoring=self.component_config["scoring_function"],
        verbose=1,
        iid=False,
    )
Example #14
Source File: test_shap.py From AIX360 with Apache License 2.0 | 4 votes |
def test_Shap(self):
    """Smoke-test ``KernelExplainer`` on the iris and adult datasets.

    Three models are fitted and explained in turn (KNN on iris, a linear
    SVC on iris, KNN on adult). The method contains no assertions; it only
    prints accuracies, SHAP values and expected values, so it passes as
    long as nothing raises.
    """
    # Seed NumPy's global RNG before the first scenario.
    np.random.seed(1)
    X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)
    # K-nearest neighbors
    knn = sklearn.neighbors.KNeighborsClassifier()
    knn.fit(X_train, Y_train)
    # Percentage of test rows predicted correctly.
    v = 100*np.sum(knn.predict(X_test) == Y_test)/len(Y_test)
    print("Accuracy = {0}%".format(v))
    # Explain a single prediction from the test set
    shapexplainer = KernelExplainer(knn.predict_proba, X_train)
    shap_values = shapexplainer.explain_instance(X_test.iloc[0,:])  # TODO test against original SHAP Lib
    print('knn X_test iloc_0')
    print(shap_values)
    print(shapexplainer.explainer.expected_value[0])
    print(shap_values[0])
    # Explain all the predictions in the test set
    shap_values = shapexplainer.explain_instance(X_test)
    print('knn X_test')
    print(shap_values)
    print(shapexplainer.explainer.expected_value[0])
    print(shap_values[0])
    # SV machine with a linear kernel
    svc_linear = sklearn.svm.SVC(kernel='linear', probability=True)
    svc_linear.fit(X_train, Y_train)
    # Percentage of test rows predicted correctly.
    v = 100*np.sum(svc_linear.predict(X_test) == Y_test)/len(Y_test)
    print("Accuracy = {0}%".format(v))
    # Explain all the predictions in the test set
    shapexplainer = KernelExplainer(svc_linear.predict_proba, X_train)
    shap_values = shapexplainer.explain_instance(X_test)
    print('svc X_test')
    print(shap_values)
    print(shapexplainer.explainer.expected_value[0])
    print(shap_values[0])
    # Tabular scenario on the adult dataset; the RNG is re-seeded first.
    np.random.seed(1)
    X,y = shap.datasets.adult()
    X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=7)
    knn = sklearn.neighbors.KNeighborsClassifier()
    knn.fit(X_train, y_train)
    # Explain only the second column of predict_proba's output.
    f = lambda x: knn.predict_proba(x)[:,1]
    # A single row of per-column medians is used as the background data.
    med = X_train.median().values.reshape((1,X_train.shape[1]))
    shapexplainer = KernelExplainer(f, med)
    shap_values_single = shapexplainer.explain_instance(X.iloc[0,:], nsamples=1000)
    print('Shap Tabular Example')
    print(shapexplainer.explainer.expected_value)
    print(shap_values_single)
    print("Invoked Shap KernelExplainer")
Example #15
Source File: old_vsone.py From ibeis with Apache License 2.0 | 4 votes |
def gridsearch_ratio_thresh(matches):
    """
    Grid-search the ``ratio_thresh`` parameter over a list of matches.

    Fifty thresholds between 0.6 and 0.7 are scored by ROC AUC, first on all
    matches (the curve is plotted) and then on the training part of each of
    10 stratified folds. The mean of the per-fold best thresholds is
    printed and an interactive ``utool`` shell is opened at the end.

    Args:
        matches (list): match objects exposing ``annot1``/``annot2`` (with a
            ``'nid'`` entry), ``fs`` and ``ratio_test_flags(cfgdict)``.
    """
    import sklearn
    import sklearn.metrics
    # The submodule must be imported explicitly; ``import sklearn`` alone
    # does not guarantee that ``sklearn.model_selection`` is loaded.
    import sklearn.model_selection
    import vtool_ibeis as vt
    # Param search for vsone
    import plottool_ibeis as pt
    pt.qt4ensure()

    # No shuffling is requested, so a random_state would have no effect on
    # the folds; newer scikit-learn raises when one is passed with
    # shuffle=False, so it is left out.
    skf = sklearn.model_selection.StratifiedKFold(n_splits=10)
    # Label is True when both annotations of a match share a name id.
    y = np.array([m.annot1['nid'] == m.annot2['nid'] for m in matches])

    basis = {'ratio_thresh': np.linspace(.6, .7, 50).tolist()}
    grid = ut.all_dict_combinations(basis)
    xdata = np.array(ut.take_column(grid, 'ratio_thresh'))

    def _ratio_thresh(y_true, match_list):
        # Try and find optional ratio threshold
        auc_list = []
        for cfgdict in ut.ProgIter(grid, lbl='gridsearch'):
            y_score = [
                match.fs.compress(match.ratio_test_flags(cfgdict)).sum()
                for match in match_list
            ]
            auc = sklearn.metrics.roc_auc_score(y_true, y_score)
            auc_list.append(auc)
        auc_list = np.array(auc_list)
        return auc_list

    auc_list = _ratio_thresh(y, matches)
    pt.plot(xdata, auc_list)
    subx, suby = vt.argsubmaxima(auc_list, xdata)
    best_ratio_thresh = subx[suby.argmax()]

    skf_results = []
    y_true = y
    for train_idx, test_idx in skf.split(matches, y):
        match_list_ = ut.take(matches, train_idx)
        y_true = y.take(train_idx)
        auc_list = _ratio_thresh(y_true, match_list_)
        subx, suby = vt.argsubmaxima(auc_list, xdata, maxima_thresh=.8)
        best_ratio_thresh = subx[suby.argmax()]
        skf_results.append(best_ratio_thresh)
    print('skf_results.append = %r' % (np.mean(skf_results),))
    # Drops into an interactive shell with the locals above available.
    import utool
    utool.embed()
Example #16
Source File: pyglmnet.py From pyglmnet with MIT License | 4 votes |
def _set_cv(cv, estimator=None, X=None, y=None):
    """Set the default CV depending on whether clf is classifier/regressor.

    Parameters
    ----------
    cv : int | str | object
        Number of folds (Python or NumPy integer), the name of a
        cross-validation class in scikit-learn, or a cross-validation
        object that is handed to ``check_cv`` as is.
    estimator : str | object | None
        ``'classifier'`` or ``'regressor'``, or an estimator that is
        inspected with ``is_classifier``.
    X : array-like | None
        Only forwarded to ``check_cv`` on the pre-0.18 scikit-learn path.
    y : array-like | None
        Targets; used to build and to iterate the splits.

    Returns
    -------
    cv : object
        The cross-validation object returned by ``check_cv``.
    cv_splits : list of tuple
        The ``(train, test)`` pairs, extracted once so they can be
        retrieved at predict time.

    Raises
    ------
    ValueError
        If ``cv`` is a string that names no known cross-validation class,
        or if some fold has no training samples.
    NotImplementedError
        On the pre-0.18 path, if ``cv`` is a string other than ``'KFold'``
        or ``'LeaveOneOut'``.
    """
    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)

    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; ``np.integer`` additionally accepts NumPy integer scalars.
    integer_types = (int, np.integer)

    # Setup CV
    if check_version('sklearn', '0.18'):
        from sklearn import model_selection as models
        from sklearn.model_selection import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, integer_types):
            XFold = StratifiedKFold if est_is_classifier else KFold
            cv = XFold(n_splits=int(cv))
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            cv = cv()
        cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
    else:
        from sklearn import cross_validation as models
        from sklearn.cross_validation import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, integer_types):
            if est_is_classifier:
                cv = StratifiedKFold(y=y, n_folds=int(cv))
            else:
                cv = KFold(n=len(y), n_folds=int(cv))
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                raise NotImplementedError('CV cannot be defined with str'
                                          ' for sklearn < .017.')
            cv = cv(len(y))
        cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]

    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')

    return cv, cv_splits