Python sklearn.utils.indexable() Examples
The following are 5 code examples of sklearn.utils.indexable(). You can go to the original project or source file by following the links above each example. You may also want to check out all of the available functions and classes of the module sklearn.utils.
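Before the examples, here is a minimal standalone sketch (not taken from any of the projects below) of what indexable() itself does: it length-checks all of its arguments against each other, passes None through untouched, and returns objects that can safely be indexed during cross-validation.

import numpy as np
import pandas as pd
from sklearn.utils import indexable

X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
y = [0, 1, 0]
groups = None

# All three inputs are checked for a consistent number of samples; None is allowed.
X_idx, y_idx, groups_idx = indexable(X, y, groups)
print(type(X_idx))   # <class 'pandas.core.frame.DataFrame'> -- passed through as-is
print(groups_idx)    # None -- passed through as None

# Inconsistent lengths raise a ValueError:
# indexable([1, 2, 3], [0, 1])  # ValueError: inconsistent numbers of samples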
Example #1
Source File: _split.py From mriqc with BSD 3-Clause "New" or "Revised" License | 5 votes |
def split(self, X, y, groups=None):
    # Fall back to the groups stored on the instance when none are given.
    if groups is None:
        groups = self._groups
    X, y, groups = indexable(X, y, groups)

    # groups acts as a boolean mask: True marks the partially held-out samples.
    msk = np.array(groups, dtype=bool)
    train_idx = np.arange(len(X))[~msk]
    test_idx = np.arange(len(X))[msk]

    # Older pandas exposes .as_matrix(); plain arrays are indexed directly.
    try:
        test_x = X.as_matrix()[test_idx, :]
    except AttributeError:
        test_x = X[test_idx, :]
    test_y = np.array(y)[test_idx]

    # K-fold only the held-out block, then prepend the always-train indices.
    split = super(PartiallyHeldOutKFold, self).split(test_x, test_y)
    offset = test_idx[0]
    for test_train, test_test in split:
        test_train = np.concatenate((train_idx, test_train + offset))
        yield test_train, test_test
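The idea above is that the group mask singles out a partially held-out subset: masked samples are K-folded while unmasked samples are always kept in training. A rough standalone sketch of that masking step (using plain KFold rather than the mriqc PartiallyHeldOutKFold class) looks like this:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils import indexable

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
groups = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]   # 1 marks the held-out subset

X, y, groups = indexable(X, y, groups)
msk = np.array(groups, dtype=bool)
always_train = np.arange(len(X))[~msk]     # never tested
held_out = np.arange(len(X))[msk]          # K-folded

for fold_train, fold_test in KFold(n_splits=2).split(held_out):
    train_idx = np.concatenate((always_train, held_out[fold_train]))
    test_idx = held_out[fold_test]
    print(train_idx, test_idx)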
Example #2
Source File: _validation.py From mriqc with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cross_val_score(
    estimator,
    X,
    y=None,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=1,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train, test, verbose, None, fit_params
        )
        for train, test in splits
    )

    group_order = []
    if hasattr(cv, "groups"):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
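For comparison, stock scikit-learn exposes similar multi-metric behaviour through model_selection.cross_validate. A short sketch, assuming a recent scikit-learn release:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000)

# Multiple scorers per fold, analogous to the list-of-scorers loop above.
results = cross_validate(clf, X, y, cv=5, scoring=["accuracy", "f1_macro"])
print(results["test_accuracy"])
print(results["test_f1_macro"])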
Example #3
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 4 votes |
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like, with shape (n_samples,)
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    gap_size = self.gap_size
    test_size = self.test_size if self.test_size else n_samples // n_folds

    # Make sure we have enough samples for the given split parameters
    if n_folds > n_samples:
        raise ValueError(
            ("Cannot have number of folds ={0} greater"
             " than the number of samples: {1}.").format(n_folds, n_samples))
    if n_samples - gap_size - (test_size * n_splits) <= 0:
        raise ValueError(
            ("Too many splits ={0} for number of samples"
             " ={1} with test_size ={2} and gap_size ={3}."
             "").format(n_splits, n_samples, test_size, gap_size))

    indices = np.arange(n_samples)
    test_starts = range(n_samples - n_splits * test_size,
                        n_samples, test_size)

    for test_start in test_starts:
        train_end = test_start - gap_size
        if self.max_train_size and self.max_train_size < train_end:
            yield (indices[train_end - self.max_train_size:train_end],
                   indices[test_start:test_start + test_size])
        else:
            yield (indices[:train_end],
                   indices[test_start:test_start + test_size])
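Recent scikit-learn versions (0.24 and later) expose comparable gap-aware time-series splitting through TimeSeriesSplit's gap, test_size and max_train_size parameters. A short sketch under that assumption:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(12).reshape(12, 1)

# 3 splits, 2-sample test windows, and a 1-sample gap between train and test.
tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=1)
for train_idx, test_idx in tscv.split(X):
    print("train:", train_idx, "test:", test_idx)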
Example #4
Source File: _validation.py From mriqc with BSD 3-Clause "New" or "Revised" License | 4 votes |
def permutation_test_score(
    estimator,
    X,
    y,
    groups=None,
    cv=None,
    n_permutations=100,
    n_jobs=1,
    random_state=0,
    verbose=0,
    scoring=None,
):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of original sklearn's permutation test score function
    to evaluate p-value outside this function, so that the score can be
    reused from outside.

    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance. The Journal of Machine Learning Research (2010)
                   vol. 11
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
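Since this variant returns only the raw permutation scores, the p-value has to be computed by the caller. Assuming score is the cross-validated score on the true labels (computed separately) and permutation_scores is the array returned above, the usual empirical estimate (the same formula stock scikit-learn uses) is shown in this sketch; the helper name empirical_pvalue is illustrative, not from the source.

import numpy as np

def empirical_pvalue(score, permutation_scores):
    # score: cross-validated score on the unpermuted labels
    # permutation_scores: array returned by permutation_test_score() above
    n_permutations = len(permutation_scores)
    # +1 in numerator and denominator so the p-value is never exactly zero
    return (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)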
Example #5
Source File: base.py From skoot with MIT License | 4 votes |
def _validate_X_y_ratio_classes(X, y, ratio):
    # validate the cheap stuff before copying arrays around...
    validate_float(ratio, 'balance_ratio')

    # validate arrays
    X, y = indexable(X, y)  # want to allow pd.DataFrame
    y = column_or_1d(y, warn=False)  # type: np.ndarray

    # get n classes in y, ensure they are <= MAX_N_CLASSES, but first
    # ensure these are actually class labels and not floats or anything...
    y_type = type_of_target(y)
    supported_types = {'multiclass', 'binary'}
    if y_type not in supported_types:
        raise ValueError('balancers only support %r, but got %r'
                         % ("(" + ', '.join(supported_types) + ")", y_type))

    present_classes, counts = np.unique(y, return_counts=True)
    n_classes = len(present_classes)

    # ensure <= MAX_N_CLASSES
    if n_classes > MAX_N_CLASSES:
        raise ValueError('balancers currently only support a maximum of %i '
                         'unique class labels, but %i were identified.'
                         % (MAX_N_CLASSES, n_classes))

    # get the majority class label, and its count:
    majority_count_idx = np.argmax(counts, axis=0)
    majority_label, majority_count = (present_classes[majority_count_idx],
                                      counts[majority_count_idx])
    target_count = max(int(ratio * majority_count), 1)

    # define a min_n_samples based on the sample ratio to max_class
    # required = {target_count - counts[i]
    #             for i, v in enumerate(present_classes)
    #             if v != majority_label}

    # THIS WAS OUR ORIGINAL LOGIC:
    # * If there were any instances where the number of synthetic examples
    #   required for a class outweighed the number that existed in the class
    #   to begin with, we would end up having to potentially sample from the
    #   synthetic examples. We didn't want to have to do that.
    #
    # But it seems like a totally valid use-case. If we're detecting breast
    # cancer, it might be a rare event that needs lots of bolstering. We
    # should allow that, even though we may discourage it.

    # if any counts < MIN_N_SAMPLES, raise:
    if any(i < MIN_N_SAMPLES for i in counts):
        raise ValueError('All label counts must be >= %i' % MIN_N_SAMPLES)

    return (X, y, n_classes, present_classes, counts,
            majority_label, target_count)
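To make the ratio arithmetic above concrete, here is a tiny standalone worked example of the majority-count logic (without skoot's MAX_N_CLASSES / MIN_N_SAMPLES constants or validators):

import numpy as np

y = np.array([0] * 90 + [1] * 10)   # imbalanced binary labels
ratio = 0.5

present_classes, counts = np.unique(y, return_counts=True)
majority_count_idx = np.argmax(counts)
majority_label = present_classes[majority_count_idx]
majority_count = counts[majority_count_idx]

# Each minority class should be brought up to ratio * majority_count samples.
target_count = max(int(ratio * majority_count), 1)
print(majority_label, majority_count, target_count)   # 0 90 45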