Python Examples of sklearn.model_selection

Source File: fixes.py From skutil with BSD 3-Clause "New" or "Revised" License

6 votes

def _cv_len(cv, X, y):
    """Return the number of splits of a cross validation object.

    The lookup is agnostic of whether sklearn-0.17 or sklearn-0.18 is in
    use; the module-level ``SK18`` flag selects the API that is called.

    Parameters
    ----------

    cv : `sklearn.cross_validation._PartitionIterator` or `sklearn.model_selection.BaseCrossValidator`
        The cv object whose length is wanted.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.
        Only forwarded to ``cv.get_n_splits`` when ``SK18`` is truthy.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.
        Only forwarded to ``cv.get_n_splits`` when ``SK18`` is truthy.

    Returns
    -------

    int
        ``cv.get_n_splits(X, y)`` when ``SK18`` is truthy, ``len(cv)``
        otherwise.
    """
    if SK18:
        return cv.get_n_splits(X, y)
    return len(cv)

Source File: clf_helpers.py From ibeis with Apache License 2.0

6 votes

def stratified_kfold_indices(samples, **xval_kw):
        """
        Build a list of stratified cross-validation splits for ``samples``.

        TODO: check xval label frequency

        Args:
            samples: object supporting ``len()`` and exposing
                ``encoded_1d().values`` (used as the labels ``y``) and
                ``group_ids`` (used as ``groups``).
            **xval_kw: forwarded to the splitter constructor. The optional
                ``type`` key selects the splitter: ``'StratifiedGroupKFold'``
                (default) or ``'StratifiedKFold'``.

        Returns:
            list: the ``(train, test)`` items yielded by ``splitter.split``.

        Raises:
            ValueError: if ``type`` is not one of the supported names.
        """
        type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
        if type_ not in ('StratifiedGroupKFold', 'StratifiedKFold'):
            # An unknown type used to skip both branches below and surface
            # as an UnboundLocalError on return; fail early and clearly.
            raise ValueError(
                'Unknown cross-validation type %r; expected '
                "'StratifiedGroupKFold' or 'StratifiedKFold'" % (type_,))

        # Zero-column placeholder with one row per sample.
        X = np.empty((len(samples), 0))
        y = samples.encoded_1d().values
        groups = samples.group_ids

        if type_ == 'StratifiedGroupKFold':
            assert groups is not None
            # FIXME: The StratifiedGroupKFold could be implemented better.
            splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
            return list(splitter.split(X=X, y=y, groups=groups))

        # Imported lazily: only this branch needs sklearn's own splitter.
        from sklearn import model_selection
        splitter = model_selection.StratifiedKFold(**xval_kw)
        return list(splitter.split(X=X, y=y))

Source File: classify_shark.py From ibeis with Apache License 2.0

6 votes

def gen_crossval_idxs(problem, n_folds=2):
        """
        Yield ``(train_idx, test_idx)`` pairs for stratified cross-validation.

        Args:
            problem: object whose ``ds`` attribute exposes ``target``,
                ``name`` and, optionally, ``nids``.
            n_folds (int): number of folds to generate (default 2).

        Yields:
            tuple: ``(train_idx, test_idx)`` for each fold.
        """
        y = problem.ds.target
        rng = 43432
        if hasattr(problem.ds, 'nids'):
            # Ensure that an individual does not appear in both the train
            # and the test dataset
            from ibeis_cnn.dataset import stratified_kfold_label_split
            labels = problem.ds.nids
            _iter = stratified_kfold_label_split(y, labels, n_folds=n_folds, rng=rng)
        else:
            try:
                from sklearn.model_selection import StratifiedKFold
            except ImportError:
                # Legacy scikit-learn (< 0.18): labels go to the constructor
                # and the splitter object itself is the iterable.
                from sklearn.cross_validation import StratifiedKFold
                _iter = StratifiedKFold(y, n_folds=n_folds, shuffle=True,
                                        random_state=rng)
            else:
                # sklearn.cross_validation was removed in scikit-learn 0.20;
                # the model_selection API takes the data in split() instead.
                import numpy as np
                skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                      random_state=rng)
                # X is only a placeholder with one entry per label.
                _iter = skf.split(X=np.empty(len(y)), y=y)
        msg = 'cross-val test on %s' % (problem.ds.name)
        progiter = ut.ProgIter(_iter, length=n_folds, lbl=msg)
        for train_idx, test_idx in progiter:
            yield train_idx, test_idx


# @ut.reloadable_class

Source File: sklearn_intent_classifier.py From rasa_nlu with Apache License 2.0

6 votes

def __init__(self,
                 component_config: Dict[Text, Any] = None,
                 clf: 'sklearn.model_selection.GridSearchCV' = None,
                 le: Optional['sklearn.preprocessing.LabelEncoder'] = None
                 ) -> None:
        """Set up an sklearn-backed intent classifier.

        ``component_config`` is handed to the parent constructor, ``clf`` is
        stored as-is, and ``le`` is replaced by a new ``LabelEncoder`` when it
        is ``None``.
        """
        from sklearn.preprocessing import LabelEncoder

        super(SklearnIntentClassifier, self).__init__(component_config)

        # Fall back to a fresh encoder when the caller did not supply one.
        self.le = LabelEncoder() if le is None else le
        self.clf = clf

        _sklearn_numpy_warning_fix()

Source File: sklearn_intent_classifier.py From rasa_nlu with Apache License 2.0

6 votes

def _create_classifier(self, num_threads, y):
        """Return an unfitted ``GridSearchCV`` wrapping a balanced SVC.

        The parameter grid (``C``, ``gamma``, ``kernels``) and the scoring
        function are read from ``self.component_config``; the number of CV
        splits comes from ``self._num_cv_splits(y)`` and ``num_threads`` is
        used as ``n_jobs``.
        """
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        config = self.component_config
        c_values = config["C"]
        kernel_names = config["kernels"]
        gamma_values = config["gamma"]

        # dirty str fix because sklearn is expecting
        # str not instance of basestr...
        param_grid = [{
            "C": c_values,
            "gamma": gamma_values,
            "kernel": [str(name) for name in kernel_names],
        }]

        # aim for 5 examples in each fold
        n_splits = self._num_cv_splits(y)

        estimator = SVC(C=1, probability=True, class_weight='balanced')
        scoring = config['scoring_function']
        return GridSearchCV(estimator,
                            param_grid=param_grid,
                            n_jobs=num_threads,
                            cv=n_splits,
                            scoring=scoring,
                            verbose=1)

Source File: sklearn_intent_classifier.py From rasa-for-botfront with Apache License 2.0

6 votes

def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        clf: "sklearn.model_selection.GridSearchCV" = None,
        le: Optional["sklearn.preprocessing.LabelEncoder"] = None,
    ) -> None:
        """Set up an sklearn-backed intent classifier.

        ``component_config`` is handed to the parent constructor, ``clf`` is
        stored as-is, and ``le`` is replaced by a new ``LabelEncoder`` when it
        is ``None``.
        """
        from sklearn.preprocessing import LabelEncoder

        super().__init__(component_config)

        # Fall back to a fresh encoder when the caller did not supply one.
        self.le = LabelEncoder() if le is None else le
        self.clf = clf

Source File: sklearn_intent_classifier.py From Rasa_NLU_Chi with Apache License 2.0

6 votes

def __init__(self,
                 component_config=None,  # type: Dict[Text, Any]
                 clf=None,  # type: sklearn.model_selection.GridSearchCV
                 le=None  # type: sklearn.preprocessing.LabelEncoder
                 ):
        # type: (...) -> None
        """Set up an sklearn-backed intent classifier.

        ``component_config`` is handed to the parent constructor, ``clf`` is
        stored as-is, and ``le`` is replaced by a new ``LabelEncoder`` when it
        is ``None``.
        """
        from sklearn.preprocessing import LabelEncoder

        super(SklearnIntentClassifier, self).__init__(component_config)

        # Fall back to a fresh encoder when the caller did not supply one.
        self.le = LabelEncoder() if le is None else le
        self.clf = clf

        _sklearn_numpy_warning_fix()

Source File: sklearn_intent_classifier.py From Rasa_NLU_Chi with Apache License 2.0

6 votes

def _create_classifier(self, num_threads, y):
        """Return an unfitted ``GridSearchCV`` wrapping a balanced SVC.

        The parameter grid (``C``, ``kernels``) is read from
        ``self.component_config``; the number of CV splits comes from
        ``self._num_cv_splits(y)``, ``num_threads`` is used as ``n_jobs`` and
        candidates are scored with ``'f1_weighted'``.
        """
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        config = self.component_config
        c_values = config["C"]
        kernel_names = config["kernels"]

        # dirty str fix because sklearn is expecting
        # str not instance of basestr...
        param_grid = [{
            "C": c_values,
            "kernel": [str(name) for name in kernel_names],
        }]

        # aim for 5 examples in each fold
        n_splits = self._num_cv_splits(y)

        estimator = SVC(C=1, probability=True, class_weight='balanced')
        return GridSearchCV(estimator,
                            param_grid=param_grid,
                            n_jobs=num_threads,
                            cv=n_splits,
                            scoring='f1_weighted',
                            verbose=1)

Source File: fixes.py From skutil with BSD 3-Clause "New" or "Revised" License

5 votes

def _set_cv(cv, X, y, classifier):
    """Normalise ``cv`` through ``check_cv`` for the installed sklearn.

    The result is either a `sklearn.cross_validation._PartitionIterator` or a
    `sklearn.model_selection.BaseCrossValidator`, depending on whether
    sklearn-0.17 or sklearn-0.18 is being used (module-level ``SK18`` flag).

    Parameters
    ----------

    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, will be converted
        into the appropriate class of crossvalidator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.
        Only forwarded to ``check_cv`` when ``SK18`` is falsy.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier

    Returns
    -------

    `_PartitionIterator` or `BaseCrossValidator`
    """
    if SK18:
        # The 0.18 call does not take X.
        return check_cv(cv, y, classifier)
    return check_cv(cv, X, y, classifier)

Source File: build_model.py From gordo with GNU Affero General Public License v3.0

5 votes

def build_split_dict(X: pd.DataFrame, split_obj: Type[BaseCrossValidator]) -> dict:
        """
        Collect per-fold metadata describing a cross-validation split of ``X``.

        Parameters
        ----------
        X: pd.DataFrame
            The training dataset that will be split during cross-validation.
        split_obj: Type[sklearn.model_selection.BaseCrossValidator]
            The cross-validation object; its ``split(X)`` must yield
            ``(train, test)`` index pairs.

        Returns
        -------
        split_metadata: Dict[str,Any]
            For each fold ``k`` (numbered from 1): the ``X.index`` labels at
            the first and last train/test positions, plus the train and test
            sizes.
        """
        split_metadata: Dict[str, Any] = {}
        for fold, (train_ind, test_ind) in enumerate(split_obj.split(X), start=1):
            index = X.index
            # Boundary labels first, then sizes, so key order is stable.
            boundaries = {
                f"fold-{fold}-train-start": index[train_ind[0]],
                f"fold-{fold}-train-end": index[train_ind[-1]],
                f"fold-{fold}-test-start": index[test_ind[0]],
                f"fold-{fold}-test-end": index[test_ind[-1]],
            }
            split_metadata.update(boundaries)
            split_metadata[f"fold-{fold}-n-train"] = len(train_ind)
            split_metadata[f"fold-{fold}-n-test"] = len(test_ind)
        return split_metadata

Source File: smk_pipeline.py From ibeis with Apache License 2.0

5 votes

def testdata_smk(*args, **kwargs):
    """
    Build an ``SMKRequest`` on the ``PZ_MTEST`` database for testing.

    The annotations are split with the first fold of a 4-fold
    ``StratifiedKFold`` over their name ids: the train indices become the
    database annotations and the test indices the query annotations.
    ``kwargs`` override the default config (``num_words=1000``); ``args``
    is unused.

    Returns ``(ibs, smk, qreq_)``.

    >>> from ibeis.algo.smk.smk_pipeline import *  # NOQA
    >>> kwargs = {}
    """
    import ibeis
    import sklearn
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn
    # 0.20 -- confirm the pinned sklearn version before relying on this.
    import sklearn.cross_validation
    # import sklearn.model_selection
    ibs, aid_list = ibeis.testdata_aids(defaultdb='PZ_MTEST')
    nid_list = np.array(ibs.annots(aid_list).nids)
    rng = ut.ensure_rng(0)

    skf = sklearn.cross_validation.StratifiedKFold(
        nid_list, n_folds=4, shuffle=False, random_state=rng)
    # Only the first fold is used.
    train_idx, test_idx = six.next(iter(skf))
    daids = ut.take(aid_list, train_idx)
    qaids = ut.take(aid_list, test_idx)

    config = dict(num_words=1000)
    config.update(**kwargs)
    qreq_ = SMKRequest(ibs, qaids, daids, config)
    return ibs, qreq_.smk, qreq_

Source File: clf_helpers.py From ibeis with Apache License 2.0

5 votes

def subsplit_indices(samples, subset_idx, **xval_kw):
        """
        Split an existing subset of ``samples`` into stratified folds.

        Args:
            samples: object exposing ``encoded_1d().values`` (labels) and
                ``group_ids``; both are indexed with ``subset_idx``.
            subset_idx: indices (into ``samples``) of the subset to split;
                must support ``len()`` and fancy indexing.
            **xval_kw: forwarded to the splitter constructor. ``n_splits``
                defaults to 3. The optional ``type`` key selects the
                splitter: ``'StratifiedGroupKFold'`` (default) or
                ``'StratifiedKFold'``.

        Returns:
            list: ``(idx1, idx2)`` pairs expressed in the original
            coordinates of ``samples`` (i.e. values taken from
            ``subset_idx``).

        Raises:
            ValueError: if ``type`` is not one of the supported names.
        """
        xval_kw_ = dict(xval_kw)
        xval_kw_.setdefault('n_splits', 3)
        type_ = xval_kw_.pop('type', 'StratifiedGroupKFold')
        if type_ not in ('StratifiedGroupKFold', 'StratifiedKFold'):
            # An unknown type used to skip both branches below and surface
            # as an UnboundLocalError; fail early and clearly instead.
            raise ValueError(
                'Unknown cross-validation type %r; expected '
                "'StratifiedGroupKFold' or 'StratifiedKFold'" % (type_,))

        # Zero-column placeholder with one row per subset element.
        X = np.empty((len(subset_idx), 0))
        y = samples.encoded_1d().values[subset_idx]
        groups = samples.group_ids[subset_idx]

        if type_ == 'StratifiedGroupKFold':
            assert groups is not None
            # FIXME: The StratifiedGroupKFold could be implemented better.
            splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw_)
            rel_skf_list = list(splitter.split(X=X, y=y, groups=groups))
        else:
            # Imported lazily: only this branch needs sklearn's own splitter.
            from sklearn import model_selection
            splitter = model_selection.StratifiedKFold(**xval_kw_)
            rel_skf_list = list(splitter.split(X=X, y=y))

        # map back into original coords
        skf_list = [(subset_idx[rel_idx1], subset_idx[rel_idx2])
                    for rel_idx1, rel_idx2 in rel_skf_list]

        # Sanity check: every mapped index must come from subset_idx.
        for idx1, idx2 in skf_list:
            assert len(np.intersect1d(subset_idx, idx1)) == len(idx1)
            assert len(np.intersect1d(subset_idx, idx2)) == len(idx2)
        return skf_list

Source File: sklearn_intent_classifier.py From rasa-for-botfront with Apache License 2.0

5 votes

def _create_classifier(
        self, num_threads: int, y
    ) -> "sklearn.model_selection.GridSearchCV":
        """Return an unfitted ``GridSearchCV`` wrapping a balanced SVC.

        The parameter grid (``C``, ``gamma``, ``kernels``) and the scoring
        function are read from ``self.component_config``; the number of CV
        splits comes from ``self._num_cv_splits(y)`` and ``num_threads`` is
        used as ``n_jobs``.

        ``iid=False`` is only passed when the installed ``GridSearchCV``
        still accepts it, so the call keeps working on scikit-learn releases
        that removed the parameter.
        """
        import inspect

        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        C = self.component_config["C"]
        kernels = self.component_config["kernels"]
        gamma = self.component_config["gamma"]
        # dirty str fix because sklearn is expecting
        # str not instance of basestr...
        tuned_parameters = [
            {"C": C, "gamma": gamma, "kernel": [str(k) for k in kernels]}
        ]

        # aim for 5 examples in each fold

        cv_splits = self._num_cv_splits(y)

        estimator = SVC(C=1, probability=True, class_weight="balanced")
        search_kwargs = dict(
            param_grid=tuned_parameters,
            n_jobs=num_threads,
            cv=cv_splits,
            scoring=self.component_config["scoring_function"],
            verbose=1,
        )
        # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24,
        # where passing it raises TypeError. Keep the explicit False only
        # on versions that still have the parameter.
        if "iid" in inspect.signature(GridSearchCV).parameters:
            search_kwargs["iid"] = False

        return GridSearchCV(estimator, **search_kwargs)

Source File: test_shap.py From AIX360 with Apache License 2.0

4 votes

def test_Shap(self):
        """Smoke-test ``KernelExplainer`` on the iris and adult datasets.

        Fits a KNN and a linear SVC on iris and a KNN on adult, then prints
        the explanations; nothing is asserted.
        """
        np.random.seed(1)
        X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)

        # --- iris: K-nearest neighbors ---
        iris_knn = sklearn.neighbors.KNeighborsClassifier()
        iris_knn.fit(X_train, Y_train)
        accuracy = 100*np.sum(iris_knn.predict(X_test) == Y_test)/len(Y_test)
        print("Accuracy = {0}%".format(accuracy))

        # Single prediction from the test set
        explainer = KernelExplainer(iris_knn.predict_proba, X_train)
        first_row = X_test.iloc[0,:]
        shap_values = explainer.explain_instance(first_row)  # TODO test against original SHAP Lib
        print('knn X_test iloc_0')
        print(shap_values)
        print(explainer.explainer.expected_value[0])
        print(shap_values[0])

        # Every prediction in the test set
        shap_values = explainer.explain_instance(X_test)
        print('knn X_test')
        print(shap_values)
        print(explainer.explainer.expected_value[0])
        print(shap_values[0])

        # --- iris: SV machine with a linear kernel ---
        linear_svc = sklearn.svm.SVC(kernel='linear', probability=True)
        linear_svc.fit(X_train, Y_train)
        accuracy = 100*np.sum(linear_svc.predict(X_test) == Y_test)/len(Y_test)
        print("Accuracy = {0}%".format(accuracy))

        # Every prediction in the test set
        explainer = KernelExplainer(linear_svc.predict_proba, X_train)
        shap_values = explainer.explain_instance(X_test)
        print('svc X_test')
        print(shap_values)
        print(explainer.explainer.expected_value[0])
        print(shap_values[0])

        # --- adult: KNN explained against the median training row ---
        np.random.seed(1)
        X,y = shap.datasets.adult()
        X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=7)

        adult_knn = sklearn.neighbors.KNeighborsClassifier()
        adult_knn.fit(X_train, y_train)

        def second_column_proba(x):
            # Second column of predict_proba
            return adult_knn.predict_proba(x)[:,1]

        n_features = X_train.shape[1]
        median_row = X_train.median().values.reshape((1,n_features))
        explainer = KernelExplainer(second_column_proba, median_row)
        shap_values_single = explainer.explain_instance(X.iloc[0,:], nsamples=1000)
        print('Shap Tabular Example')
        print(explainer.explainer.expected_value)
        print(shap_values_single)
        print("Invoked Shap KernelExplainer")

Source File: old_vsone.py From ibeis with Apache License 2.0

4 votes

def gridsearch_ratio_thresh(matches):
    """
    Grid-search the ``ratio_thresh`` parameter and plot AUC against it.

    For 50 evenly spaced thresholds in ``[0.6, 0.7]`` each match is scored by
    the sum of its feature scores that pass ``match.ratio_test_flags``; the
    ROC AUC against "both annotations share a nid" is computed with
    ``sklearn.metrics.roc_auc_score``. The search runs once on all matches
    (plotted) and once per training fold of a 10-fold ``StratifiedKFold``;
    the mean of the per-fold best thresholds is printed.

    Args:
        matches (list): match objects providing ``annot1``/``annot2`` (with a
            ``'nid'`` entry), ``fs`` and ``ratio_test_flags(cfgdict)``.

    Note:
        Interactive helper: it opens a plot and ends in ``utool.embed()``;
        it does not return a value.
    """
    import sklearn
    import sklearn.metrics
    # The submodule must be imported explicitly; `import sklearn` alone does
    # not guarantee that `sklearn.model_selection` is available.
    import sklearn.model_selection
    import vtool_ibeis as vt
    # Param search for vsone
    import plottool_ibeis as pt
    pt.qt4ensure()

    # No random_state: without shuffle=True it has no effect on the folds,
    # and scikit-learn >= 0.24 raises ValueError for that combination.
    skf = sklearn.model_selection.StratifiedKFold(n_splits=10)

    y = np.array([m.annot1['nid'] == m.annot2['nid'] for m in matches])

    basis = {'ratio_thresh': np.linspace(.6, .7, 50).tolist()}
    grid = ut.all_dict_combinations(basis)
    xdata = np.array(ut.take_column(grid, 'ratio_thresh'))

    def _ratio_thresh(y_true, match_list):
        """Return the AUC obtained for every config in ``grid``."""
        # Try and find optional ratio threshold
        auc_list = []
        for cfgdict in ut.ProgIter(grid, lbl='gridsearch'):
            y_score = [
                match.fs.compress(match.ratio_test_flags(cfgdict)).sum()
                for match in match_list
            ]
            auc = sklearn.metrics.roc_auc_score(y_true, y_score)
            auc_list.append(auc)
        auc_list = np.array(auc_list)
        return auc_list

    # Search over all matches (plotted).
    auc_list = _ratio_thresh(y, matches)
    pt.plot(xdata, auc_list)
    subx, suby = vt.argsubmaxima(auc_list, xdata)
    best_ratio_thresh = subx[suby.argmax()]

    # Repeat the search on the training part of each fold.
    skf_results = []
    for train_idx, test_idx in skf.split(matches, y):
        match_list_ = ut.take(matches, train_idx)
        y_true = y.take(train_idx)
        auc_list = _ratio_thresh(y_true, match_list_)
        subx, suby = vt.argsubmaxima(auc_list, xdata, maxima_thresh=.8)
        best_ratio_thresh = subx[suby.argmax()]
        skf_results.append(best_ratio_thresh)
    print('skf_results.append = %r' % (np.mean(skf_results),))
    # NOTE(review): drops into an interactive shell -- debugging aid left in
    # on purpose? Confirm before calling this from non-interactive code.
    import utool
    utool.embed()

Source File: pyglmnet.py From pyglmnet with MIT License

4 votes

def _set_cv(cv, estimator=None, X=None, y=None):
        """Set the default CV depending on whether clf
           is classifier/regressor.

        Parameters
        ----------
        cv : int | str | cross-validation object
            An integer is turned into a (Stratified)KFold with that many
            folds, a string is looked up by name in the sklearn
            cross-validation module, anything else is passed to ``check_cv``
            unchanged.
        estimator : 'classifier' | 'regressor' | estimator object | None
            Used to decide between stratified and plain K-fold; objects are
            inspected with ``is_classifier``.
        X : array-like | None
            Only forwarded to the legacy ``cross_validation.check_cv``.
        y : array-like | None
            The targets handed to ``check_cv`` and to ``cv.split``.

        Returns
        -------
        cv : cross-validation object
            The object returned by ``check_cv``.
        cv_splits : list of (train, test)
            The materialised splits, so they can be retrieved later.

        Raises
        ------
        ValueError
            If ``cv`` is an unknown name or a fold has an empty train set.
        NotImplementedError
            On the legacy API, for names other than KFold / LeaveOneOut.
        """
        # Detect whether classification or regression
        if estimator in ['classifier', 'regressor']:
            est_is_classifier = estimator == 'classifier'
        else:
            est_is_classifier = is_classifier(estimator)

        # `np.int` was only an alias of the builtin `int` and was removed in
        # NumPy 1.24; `np.integer` also accepts NumPy integer scalars.
        int_types = (int, np.integer)

        # Setup CV
        if check_version('sklearn', '0.18'):
            from sklearn import model_selection as models
            from sklearn.model_selection import (check_cv,
                                                 StratifiedKFold, KFold)
            if isinstance(cv, int_types):
                XFold = StratifiedKFold if est_is_classifier else KFold
                cv = XFold(n_splits=cv)
            elif isinstance(cv, str):
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                cv = cv()
            cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
        else:
            from sklearn import cross_validation as models
            from sklearn.cross_validation import (check_cv,
                                                  StratifiedKFold, KFold)
            if isinstance(cv, int_types):
                if est_is_classifier:
                    cv = StratifiedKFold(y=y, n_folds=cv)
                else:
                    cv = KFold(n=len(y), n_folds=cv)
            elif isinstance(cv, str):
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                    raise NotImplementedError('CV cannot be defined with str'
                                              ' for sklearn < .017.')
                cv = cv(len(y))
            cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

        # Extract train and test set to retrieve them at predict time
        if hasattr(cv, 'split'):
            # X is only a placeholder with the same shape as y.
            cv_splits = list(cv.split(X=np.zeros_like(y), y=y))
        else:
            # XXX support sklearn.cross_validation cv
            cv_splits = list(cv)
        cv_splits = [(train, test) for train, test in cv_splits]

        if not np.all([len(train) for train, _ in cv_splits]):
            raise ValueError('Some folds do not have any train epochs.')

        return cv, cv_splits

Python sklearn.model_selection() Examples