Python sklearn.utils.indexable() Examples
The following are 5 code examples of sklearn.utils.indexable(). You can go to the original project or source file by following the links above each example. You may also want to check out all of the available functions and classes of the module sklearn.utils.
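Before the examples, here is a minimal standalone sketch (not taken from any of the projects below) of what indexable() itself does: it length-checks all of its arguments against each other, passes None through untouched, and returns objects that can safely be indexed during cross-validation.

import numpy as np
import pandas as pd
from sklearn.utils import indexable

X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
y = [0, 1, 0]
groups = None

# All three inputs are checked for a consistent number of samples; None is allowed.
X_idx, y_idx, groups_idx = indexable(X, y, groups)
print(type(X_idx))   # <class 'pandas.core.frame.DataFrame'> -- passed through as-is
print(groups_idx)    # None -- passed through as None

# Inconsistent lengths raise a ValueError:
# indexable([1, 2, 3], [0, 1])  # ValueError: inconsistent numbers of samples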
Example #1
Source File: _split.py From mriqc with BSD 3-Clause "New" or "Revised" License | 5 votes |
def split(self, X, y, groups=None):
    # Fall back to the groups stored on the instance when none are given.
    if groups is None:
        groups = self._groups
    X, y, groups = indexable(X, y, groups)

    # groups acts as a boolean mask: True marks the partially held-out samples.
    msk = np.array(groups, dtype=bool)
    train_idx = np.arange(len(X))[~msk]
    test_idx = np.arange(len(X))[msk]

    # Older pandas exposes .as_matrix(); plain arrays are indexed directly.
    try:
        test_x = X.as_matrix()[test_idx, :]
    except AttributeError:
        test_x = X[test_idx, :]
    test_y = np.array(y)[test_idx]

    # K-fold only the held-out block, then prepend the always-train indices.
    split = super(PartiallyHeldOutKFold, self).split(test_x, test_y)
    offset = test_idx[0]
    for test_train, test_test in split:
        test_train = np.concatenate((train_idx, test_train + offset))
        yield test_train, test_test
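The idea above is that the group mask singles out a partially held-out subset: masked samples are K-folded while unmasked samples are always kept in training. A rough standalone sketch of that masking step (using plain KFold rather than the mriqc PartiallyHeldOutKFold class) looks like this:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils import indexable

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
groups = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]   # 1 marks the held-out subset

X, y, groups = indexable(X, y, groups)
msk = np.array(groups, dtype=bool)
always_train = np.arange(len(X))[~msk]     # never tested
held_out = np.arange(len(X))[msk]          # K-folded

for fold_train, fold_test in KFold(n_splits=2).split(held_out):
    train_idx = np.concatenate((always_train, held_out[fold_train]))
    test_idx = held_out[fold_test]
    print(train_idx, test_idx)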
Example #2
Source File: _validation.py From mriqc with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cross_val_score(
    estimator,
    X,
    y=None,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=1,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train, test, verbose, None, fit_params
        )
        for train, test in splits
    )

    group_order = []
    if hasattr(cv, "groups"):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
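For comparison, stock scikit-learn exposes similar multi-metric behaviour through model_selection.cross_validate. A short sketch, assuming a recent scikit-learn release:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000)

# Multiple scorers per fold, analogous to the list-of-scorers loop above.
results = cross_validate(clf, X, y, cv=5, scoring=["accuracy", "f1_macro"])
print(results["test_accuracy"])
print(results["test_f1_macro"])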
Example #3
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 4 votes |
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like, with shape (n_samples,)
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    gap_size = self.gap_size
    test_size = self.test_size if self.test_size else n_samples // n_folds

    # Make sure we have enough samples for the given split parameters
    if n_folds > n_samples:
        raise ValueError(
            ("Cannot have number of folds ={0} greater"
             " than the number of samples: {1}.").format(n_folds, n_samples))
    if n_samples - gap_size - (test_size * n_splits) <= 0:
        raise ValueError(
            ("Too many splits ={0} for number of samples"
             " ={1} with test_size ={2} and gap_size ={3}."
             "").format(n_splits, n_samples, test_size, gap_size))

    indices = np.arange(n_samples)
    test_starts = range(n_samples - n_splits * test_size,
                        n_samples, test_size)

    for test_start in test_starts:
        train_end = test_start - gap_size
        if self.max_train_size and self.max_train_size < train_end:
            yield (indices[train_end - self.max_train_size:train_end],
                   indices[test_start:test_start + test_size])
        else:
            yield (indices[:train_end],
                   indices[test_start:test_start + test_size])
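Recent scikit-learn versions (0.24 and later) expose comparable gap-aware time-series splitting through TimeSeriesSplit's gap, test_size and max_train_size parameters. A short sketch under that assumption:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(12).reshape(12, 1)

# 3 splits, 2-sample test windows, and a 1-sample gap between train and test.
tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=1)
for train_idx, test_idx in tscv.split(X):
    print("train:", train_idx, "test:", test_idx)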
Example #4
Source File: _validation.py From mriqc with BSD 3-Clause "New" or "Revised" License | 4 votes |
def permutation_test_score(
    estimator,
    X,
    y,
    groups=None,
    cv=None,
    n_permutations=100,
    n_jobs=1,
    random_state=0,
    verbose=0,
    scoring=None,
):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of original sklearn's permutation test score function
    to evaluate p-value outside this function, so that the score can be
    reused from outside.

    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance. The Journal of Machine Learning Research (2010)
                   vol. 11
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
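Since this variant returns only the raw permutation scores, the p-value has to be computed by the caller. Assuming score is the cross-validated score on the true labels (computed separately) and permutation_scores is the array returned above, the usual empirical estimate (the same formula stock scikit-learn uses) is shown in this sketch; the helper name empirical_pvalue is illustrative, not from the source.

import numpy as np

def empirical_pvalue(score, permutation_scores):
    # score: cross-validated score on the unpermuted labels
    # permutation_scores: array returned by permutation_test_score() above
    n_permutations = len(permutation_scores)
    # +1 in numerator and denominator so the p-value is never exactly zero
    return (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)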
Example #5
Source File: base.py From skoot with MIT License | 4 votes |
def _validate_X_y_ratio_classes(X, y, ratio):
    # validate the cheap stuff before copying arrays around...
    validate_float(ratio, 'balance_ratio')

    # validate arrays
    X, y = indexable(X, y)  # want to allow pd.DataFrame
    y = column_or_1d(y, warn=False)  # type: np.ndarray

    # get n classes in y, ensure they are <= MAX_N_CLASSES, but first
    # ensure these are actually class labels and not floats or anything...
    y_type = type_of_target(y)
    supported_types = {'multiclass', 'binary'}
    if y_type not in supported_types:
        raise ValueError('balancers only support %r, but got %r'
                         % ("(" + ', '.join(supported_types) + ")", y_type))

    present_classes, counts = np.unique(y, return_counts=True)
    n_classes = len(present_classes)

    # ensure <= MAX_N_CLASSES
    if n_classes > MAX_N_CLASSES:
        raise ValueError('balancers currently only support a maximum of %i '
                         'unique class labels, but %i were identified.'
                         % (MAX_N_CLASSES, n_classes))

    # get the majority class label, and its count:
    majority_count_idx = np.argmax(counts, axis=0)
    majority_label, majority_count = (present_classes[majority_count_idx],
                                      counts[majority_count_idx])
    target_count = max(int(ratio * majority_count), 1)

    # define a min_n_samples based on the sample ratio to max_class
    # required = {target_count - counts[i]
    #             for i, v in enumerate(present_classes)
    #             if v != majority_label}

    # THIS WAS OUR ORIGINAL LOGIC:
    # * If there were any instances where the number of synthetic examples
    #   required for a class outweighed the number that existed in the class
    #   to begin with, we would end up having to potentially sample from the
    #   synthetic examples. We didn't want to have to do that.
    #
    # But it seems like a totally valid use-case. If we're detecting breast
    # cancer, it might be a rare event that needs lots of bolstering. We
    # should allow that, even though we may discourage it.

    # if any counts < MIN_N_SAMPLES, raise:
    if any(i < MIN_N_SAMPLES for i in counts):
        raise ValueError('All label counts must be >= %i' % MIN_N_SAMPLES)

    return (X, y, n_classes, present_classes, counts,
            majority_label, target_count)
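To make the ratio arithmetic above concrete, here is a tiny standalone worked example of the majority-count logic (without skoot's MAX_N_CLASSES / MIN_N_SAMPLES constants or validators):

import numpy as np

y = np.array([0] * 90 + [1] * 10)   # imbalanced binary labels
ratio = 0.5

present_classes, counts = np.unique(y, return_counts=True)
majority_count_idx = np.argmax(counts)
majority_label = present_classes[majority_count_idx]
majority_count = counts[majority_count_idx]

# Each minority class should be brought up to ratio * majority_count samples.
target_count = max(int(ratio * majority_count), 1)
print(majority_label, majority_count, target_count)   # 0 90 45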