Python sklearn.utils.validation._num_samples() Examples
The following are 20 code examples of sklearn.utils.validation._num_samples().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.utils.validation, or try the search function.
Example #1
Source File: test_split.py From twitter-stock-recommendation with MIT License | 6 votes |
def check_cv_coverage(cv, X, y, groups, expected_n_splits=None):
    """Assert that the test folds of ``cv`` jointly cover every sample.

    Parameters
    ----------
    cv : cross-validator
        Object exposing ``get_n_splits(X, y, groups)`` and
        ``split(X, y, groups)``.
    X, y, groups
        Data forwarded unchanged to ``cv``.
    expected_n_splits : int or None
        When given, ``cv.get_n_splits`` must report exactly this value.
        When ``None``, the value reported by ``cv`` is used as the
        expected number of iterations.
    """
    total = _num_samples(X)

    # The reported split count is either verified against the caller's
    # expectation or adopted as the expectation.
    reported_splits = cv.get_n_splits(X, y, groups)
    if expected_n_splits is None:
        expected_n_splits = reported_splits
    else:
        assert_equal(reported_splits, expected_n_splits)

    # Every split must be valid; accumulate the test indices seen so far.
    seen_test = set()
    n_iterations = 0
    for n_iterations, (train_idx, test_idx) in enumerate(
            cv.split(X, y, groups), start=1):
        check_valid_split(train_idx, test_idx, n_samples=total)
        seen_test.update(test_idx)

    # The splitter must have yielded as many folds as announced ...
    assert_equal(n_iterations, expected_n_splits)
    if total is None:
        return
    # ... and the accumulated test samples must cover the whole dataset.
    assert_equal(seen_test, set(range(total)))
Example #2
Source File: labels.py From pumpp with ISC License | 6 votes |
def transform(self, y):
    """Map labels to their normalized encoding using the fitted classes.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Target values.

    Returns
    -------
    y : array-like of shape [n_samples]
    """
    check_is_fitted(self, 'classes_')
    labels = column_or_1d(y, warn=True)

    # An empty input maps to an empty output.
    if _num_samples(labels) == 0:
        return np.array([])

    _uniques, encoded = _encode(labels, uniques=self.classes_, encode=True)
    return encoded
Example #3
Source File: labels.py From pumpp with ISC License | 6 votes |
def inverse_transform(self, y):
    """Map encoded labels back to the original class values.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    y : numpy array of shape [n_samples]

    Raises
    ------
    ValueError
        If ``y`` contains a value outside ``range(len(self.classes_))``.
    """
    check_is_fitted(self, 'classes_')
    encoded = column_or_1d(y, warn=True)

    # An empty input maps to an empty output.
    if _num_samples(encoded) == 0:
        return np.array([])

    valid_codes = np.arange(len(self.classes_))
    unseen = np.setdiff1d(encoded, valid_codes)
    if len(unseen) > 0:
        raise ValueError(
            "y contains previously unseen labels: %s" % str(unseen))

    return self.classes_[np.asarray(encoded)]
Example #4
Source File: labels.py From pumpp with ISC License | 6 votes |
def fit(self, y):
    """Fit the label binarizer.

    Parameters
    ----------
    y : array of shape [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d matrix should only contain 0 and 1 and
        represents multilabel classification.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If the target type contains ``'multioutput'`` or ``y`` has no
        samples.
    """
    target_kind = type_of_target(y)
    self.y_type_ = target_kind

    if 'multioutput' in target_kind:
        raise ValueError("Multioutput target data is not supported with "
                         "label binarization")

    if _num_samples(y) == 0:
        raise ValueError('y has 0 samples: %r' % y)

    self.sparse_input_ = sp.issparse(y)
    self.classes_ = unique_labels(y)
    return self
Example #5
Source File: one_against_rest.py From qiskit-aqua with Apache License 2.0 | 6 votes |
def predict(self, x):
    """
    Apply every estimator and pick, per sample, the best-scoring one.

    Args:
        x (numpy.ndarray): NxD array

    Returns:
        numpy.ndarray: predicted labels, Nx1 array
    """
    n_samples = _num_samples(x)
    best_score = np.full(n_samples, -np.inf, dtype=float)
    best_index = np.zeros(n_samples, dtype=int)

    for position, estimator in enumerate(self.estimators):
        scores = np.ravel(estimator.decision_function(x))
        # Running element-wise maximum, updated in place.
        np.maximum(best_score, scores, out=best_score)
        # Wherever this estimator now holds the maximum (ties included),
        # it becomes the winner for that sample.
        best_index[best_score == scores] = position

    return self.classes[np.array(best_index.T)]
Example #6
Source File: _search.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _do_n_samples(dsk, token, Xs, n_splits):
    """Add ``_num_samples`` tasks to ``dsk`` and return one key per input.

    For every distinct ``x`` in ``Xs`` (in first-seen order, numbered
    ``m = 0, 1, ...``) and every split ``n`` in ``range(n_splits)``, the
    entry ``dsk["n_samples-<token>", m, n]`` is set to the task tuple
    ``(_num_samples, x + (n,))``.  Repeated inputs reuse the key of their
    first occurrence.

    Returns
    -------
    list
        ``(name, m)`` tuples, one per element of ``Xs``, in input order.
    """
    name = "n_samples-" + token
    key_by_input = {}
    keys = []
    for x in Xs:
        if x not in key_by_input:
            # Distinct inputs are numbered in order of first appearance.
            m = len(key_by_input)
            for n in range(n_splits):
                dsk[name, m, n] = (_num_samples, x + (n,))
            key_by_input[x] = (name, m)
        keys.append(key_by_input[x])
    return keys
Example #7
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def check_cv_coverage(cv, X, y, groups, expected_n_splits=None):
    """Verify that the union of all test folds of ``cv`` is the full dataset.

    Parameters
    ----------
    cv : cross-validator
        Object exposing ``get_n_splits(X, y, groups)`` and
        ``split(X, y, groups)``.
    X, y, groups
        Data forwarded unchanged to ``cv``.
    expected_n_splits : int or None
        When given, ``cv.get_n_splits`` must report exactly this value.
        When ``None``, the value reported by ``cv`` is used as the
        expected number of iterations.
    """
    sample_count = _num_samples(X)

    # Either check the announced split count or take it as the reference.
    announced = cv.get_n_splits(X, y, groups)
    if expected_n_splits is None:
        expected_n_splits = announced
    else:
        assert_equal(announced, expected_n_splits)

    # Validate each split and collect every index that appears in a test fold.
    covered = set()
    fold_count = 0
    for fold_count, (train_part, test_part) in enumerate(
            cv.split(X, y, groups), start=1):
        check_valid_split(train_part, test_part, n_samples=sample_count)
        covered.update(test_part)

    # The number of yielded folds must match the expectation.
    assert_equal(fold_count, expected_n_splits)
    if sample_count is None:
        return
    # All samples must appear at least once in a test fold.
    assert_equal(covered, set(range(sample_count)))
Example #8
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_n_splits(self, X, y=None, groups=None):
    """Return the number of splitting iterations in the cross-validator.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : object
        Always ignored, exists for compatibility.

    groups : object
        Always ignored, exists for compatibility.
    """
    self.__check_validity(X, y, groups)
    n_samples = _num_samples(X)
    gap_before = self.gap_before

    # Shared by the branch condition and the fallback count.
    leading = n_samples - self.gap_after - self.p
    if leading >= gap_before + 1:
        return n_samples - self.p + 1

    # Otherwise add up the two (possibly empty) ranges of start positions.
    trailing = n_samples - self.p - gap_before
    return max(leading, 0) + max(trailing, 0)
Example #9
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _iter_test_indices(self, X, y=None, groups=None):
    """Yield the test indices of each split as windows of ``self.p`` samples.

    Each yielded array is ``np.arange(start, start + self.p)``.  The start
    positions form one contiguous range when
    ``n_samples - gap_after - p >= gap_before + 1``; otherwise they come
    from two ranges, ``[0, n_samples - gap_after - p)`` followed by
    ``[gap_before + 1, n_samples - p + 1)``.
    """
    self.__check_validity(X, y, groups)
    n_samples = _num_samples(X)
    gap_before, gap_after = self.gap_before, self.gap_after
    width = self.p
    stop = n_samples - width + 1
    leading = n_samples - gap_after - width

    if leading >= gap_before + 1:
        starts = range(stop)
    else:
        starts = [*range(leading), *range(gap_before + 1, stop)]

    for start in starts:
        yield np.arange(start, start + width)
Example #10
Source File: utils_test.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def predict(self, T):
    """Return ``self.classes_[0]`` for every sample in ``T``.

    Parameters
    ----------
    T : array-like
        Input samples; only their count (via ``_num_samples``) is used.

    Returns
    -------
    Result of indexing ``self.classes_`` with an all-zero integer index
    array of length ``_num_samples(T)``.

    Raises
    ------
    AssertionError
        If ``self.check_X`` is set and returns a falsy value for ``T``.
    """
    if self.check_X is not None:
        assert self.check_X(T)
    # Builtin ``int`` replaces the ``np.int`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24 (it now raises AttributeError).
    return self.classes_[np.zeros(_num_samples(T), dtype=int)]
Example #11
Source File: utils.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _num_samples(X):
    """Return the sample count of ``X`` as a concrete value.

    Delegates to ``sk_validation._num_samples``; if the result is itself a
    dask collection (e.g. for a dask dataframe) it is computed first.
    """
    count = sk_validation._num_samples(X)
    return count.compute() if dask.is_dask_collection(count) else count
Example #12
Source File: ml_stratifiers.py From iterative-stratification with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _iter_indices(self, X, y, groups=None):
    """Yield ``(train, test)`` index arrays for ``self.n_splits`` splits.

    ``y`` must be a multilabel indicator target; any other target type
    raises ``ValueError``.  Train/test sizes are resolved by
    ``_validate_shuffle_split`` from the number of samples in ``X``.
    """
    n_from_X = _num_samples(X)

    labels = check_array(y, ensure_2d=False, dtype=None)
    labels = np.asarray(labels, dtype=bool)

    target_kind = type_of_target(labels)
    if target_kind != 'multilabel-indicator':
        raise ValueError(
            'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                target_kind))

    n_train, n_test = _validate_shuffle_split(
        n_from_X, self.test_size, self.train_size)

    # From here on the sample count is taken from y rather than X.
    n_samples = labels.shape[0]
    rng = check_random_state(self.random_state)
    # Private copy so later iterations do not depend on the caller's array.
    frozen_labels = labels.copy()
    ratios = np.array([n_train, n_test]) / (n_train + n_test)

    for _ in range(self.n_splits):
        order = np.arange(n_samples)
        rng.shuffle(order)
        shuffled_labels = frozen_labels[order]

        # NOTE(review): presumably returns a per-sample fold id in shuffled
        # order, with 1 marking test samples -- confirm against
        # IterativeStratification.
        folds = IterativeStratification(
            labels=shuffled_labels, r=ratios, random_state=rng)

        # Undo the shuffle so the mask lines up with the original order.
        is_test = folds[np.argsort(order)] == 1
        test = np.nonzero(is_test)[0]
        train = np.nonzero(~is_test)[0]
        yield train, test
Example #13
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __check_validity(self, X, y=None, groups=None):
    """Validate ``X`` against the configured ``p`` and gap sizes.

    Raises
    ------
    ValueError
        If ``X`` is ``None``, or if both
        ``n_samples - gap_after - p <= 0`` and
        ``gap_before >= n_samples - p`` hold.

    Warns
    -----
    Warning
        ``SINGLETON_WARNING`` when
        ``n_samples - gap_after - p <= gap_before + 1``.
    """
    if X is None:
        raise ValueError("The 'X' parameter should not be None.")

    n_samples = _num_samples(X)
    gap_before = self.gap_before
    leading = n_samples - self.gap_after - self.p

    leading_exhausted = leading <= 0
    trailing_exhausted = gap_before >= n_samples - self.p
    if leading_exhausted and trailing_exhausted:
        raise ValueError("Not enough training samples available.")

    if leading <= gap_before + 1:
        warnings.warn(SINGLETON_WARNING, Warning)
Example #14
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _iter_train_masks(self, X=None, y=None, groups=None):
    """Generate boolean masks corresponding to training sets.

    By default, the indices from ``_iter_train_indices(X, y, groups)`` are
    converted to masks sized by the number of samples in ``X``.
    """
    train_indices = self._iter_train_indices(X, y, groups)
    n_samples = _num_samples(X)
    return GapCrossValidator.__indices_to_masks(train_indices, n_samples)
Example #15
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _iter_train_indices(self, X=None, y=None, groups=None):
    """Generate integer indices corresponding to training sets.

    By default, the test indices from ``_iter_test_indices(X, y, groups)``
    are passed, together with the number of samples in ``X``, to the
    class's index-complement helper.
    """
    test_indices = self._iter_test_indices(X, y, groups)
    n_samples = _num_samples(X)
    return self.__complement_indices(test_indices, n_samples)
Example #16
Source File: encoders.py From sagemaker-scikit-learn-extension with Apache License 2.0 | 5 votes |
def inverse_transform(self, y):
    """Transform labels back to original encoding.

    If ``self.fill_unseen_labels`` is ``True``, use ``self.fill_label_value``
    for unseen values.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Encoded label values.

    Returns
    -------
    y_decoded : list of length n_samples
        Label values. An empty ``numpy`` array is returned for empty input.

    Raises
    ------
    ValueError
        If ``y`` cannot be converted to integers, or if it contains unseen
        labels while ``self.fill_unseen_labels`` is falsy.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)

    if y.dtype.kind not in ("i", "u"):
        try:
            # Builtin float/int replace the ``np.float``/``np.int`` aliases,
            # which were removed in NumPy 1.24.
            y = y.astype(float).astype(int)
        except ValueError as err:
            raise ValueError(
                "`y` contains values not convertible to integer.") from err

    # inverse transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])

    n_classes = len(self.classes_)
    labels = np.arange(n_classes)
    diff = np.setdiff1d(y, labels)
    # ``diff`` is an array: test its size explicitly. Its truth value is
    # ambiguous (raises) when more than one unseen label is present.
    if diff.size and not self.fill_unseen_labels:
        raise ValueError("y contains previously unseen labels: %s" % str(diff))

    # ``y`` is integer-typed here, so a bounds check is equivalent to
    # membership in ``labels`` without scanning the array per element.
    y_decoded = [
        self.classes_[idx] if 0 <= idx < n_classes else self.fill_label_value
        for idx in y
    ]
    return y_decoded
Example #17
Source File: encoders.py From sagemaker-scikit-learn-extension with Apache License 2.0 | 5 votes |
def transform(self, y):
    """Transform labels to normalized encoding.

    If ``self.fill_unseen_labels`` is ``True``, use
    ``self.fill_encoded_label_value`` for unseen values.
    Seen labels are encoded with value between 0 and n_classes-1. Unseen
    labels are encoded with ``self.fill_encoded_label_value`` with a default
    value of n_classes.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Label values.

    Returns
    -------
    y_encoded : array-like of shape [n_samples]
        Encoded label values.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)

    # transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])

    if not self.fill_unseen_labels:
        _uniques, strict_encoded = _encode(
            y, uniques=self.classes_, encode=True)
        return strict_encoded

    _unknown, known_mask = _encode_check_unknown(
        y, self.classes_, return_mask=True)
    y_encoded = np.searchsorted(self.classes_, y)
    # NOTE(review): any falsy fill value (None, but also 0) falls back to
    # n_classes because of the ``or`` -- confirm that 0 is meant to be unset.
    replacement = self.fill_encoded_label_value or len(self.classes_)
    y_encoded[~known_mask] = replacement
    return y_encoded
Example #18
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_check_sample_weight():
    """``_check_sample_weight`` with ``None`` weights matches ``X``.

    The returned weights must have as many samples as the module-level
    ``X``, sum (almost) to that sample count, and share ``X``'s dtype.
    """
    from sklearn.cluster.k_means_ import _check_sample_weight

    weights = _check_sample_weight(X, None)
    n_samples = _num_samples(X)

    assert_equal(n_samples, _num_samples(weights))
    assert_almost_equal(weights.sum(), n_samples)
    assert_equal(X.dtype, weights.dtype)
Example #19
Source File: split.py From TSCV with BSD 3-Clause "New" or "Revised" License | 4 votes |
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.

    groups : array-like, with shape (n_samples,)
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.

    test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If there are more folds than samples, or if the samples left after
        reserving the gap and all test sets are not strictly positive.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    gap_size = self.gap_size
    # A falsy ``test_size`` falls back to an even share per fold.
    test_size = self.test_size or n_samples // n_folds

    # Make sure we have enough samples for the given split parameters
    if n_folds > n_samples:
        raise ValueError(
            ("Cannot have number of folds ={0} greater"
             " than the number of samples: {1}.").format(n_folds,
                                                         n_samples))
    if n_samples - gap_size - (test_size * n_splits) <= 0:
        raise ValueError(
            ("Too many splits ={0} for number of samples"
             " ={1} with test_size ={2} and gap_size ={3}."
             "").format(n_splits, n_samples, test_size, gap_size))

    indices = np.arange(n_samples)
    first_test_start = n_samples - n_splits * test_size
    for test_start in range(first_test_start, n_samples, test_size):
        train_end = test_start - gap_size
        max_train = self.max_train_size
        # Cap the training window only when a limit is set and exceeded;
        # a ``None`` start means "from the beginning".
        if max_train and max_train < train_end:
            train_start = train_end - max_train
        else:
            train_start = None
        yield (indices[train_start:train_end],
               indices[test_start:test_start + test_size])
Example #20
Source File: cross_validation.py From sparkit-learn with Apache License 2.0 | 4 votes |
def _fit_and_score(estimator, Z, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on ``Z[train]`` and score it on ``Z[test]``.

    Parameters
    ----------
    estimator : object
        Must provide ``set_params`` and ``fit``.
    Z : indexable
        Data container; indexed with ``train`` and ``test``.
    scorer : object
        Forwarded to ``_score``.
    train, test : index objects
        Used to select the train and test portions of ``Z``.
    verbose : int
        Above 1, progress lines are printed; above 2, the test score is
        included in the final line.
    parameters : dict or None
        Parameters set on the estimator before fitting.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``.
    return_train_score : bool
        Whether to also compute and return the train score.
    return_parameters : bool
        Whether to append ``parameters`` to the result.
    error_score : 'raise' or number
        What to do when fitting fails: re-raise, or use this number as the
        score and emit a ``FitFailedWarning``.

    Returns
    -------
    list
        ``[train_score]`` (only if requested), then ``test_score``, the
        number of test samples, the elapsed time, and finally
        ``parameters`` (only if requested).

    Raises
    ------
    ValueError
        If fitting fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (key, value)
                            for key, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if fit_params is None:
        fit_params = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train = Z[train]
    Z_test = Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # A failed fit is scored with the caller-supplied placeholder.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    # Measured from just before slicing Z, so it covers fitting and scoring.
    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(Z_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret