Python sklearn.utils.multiclass.type_of_target() Examples
The following are 30 code examples of sklearn.utils.multiclass.type_of_target().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.utils.multiclass, or try the search function.
Example #1
Source File: run.py From nyaggle with MIT License | 10 votes |
def _dispatch_gbdt_class(algorithm_type: str, type_of_target: str):
    """Select the GBDT estimator class for a backend and a target type.

    ``algorithm_type`` picks the backend: 'lgbm', 'cat', or otherwise 'xgb'
    (asserted). A ``type_of_target`` equal to 'continuous' selects the
    regressor class; any other value selects the classifier class.
    """
    want_regressor = type_of_target == 'continuous'

    if algorithm_type == 'lgbm':
        # requires_* presumably verifies the optional package is installed
        # before the lazy import below -- confirm against its definition.
        requires_lightgbm()
        from lightgbm import LGBMClassifier, LGBMRegressor
        classifier_cls, regressor_cls = LGBMClassifier, LGBMRegressor
    elif algorithm_type == 'cat':
        requires_catboost()
        from catboost import CatBoostClassifier, CatBoostRegressor
        classifier_cls, regressor_cls = CatBoostClassifier, CatBoostRegressor
    else:
        requires_xgboost()
        assert algorithm_type == 'xgb'
        from xgboost import XGBClassifier, XGBRegressor
        classifier_cls, regressor_cls = XGBClassifier, XGBRegressor

    if want_regressor:
        return regressor_cls
    return classifier_cls
Example #2
Source File: nested_cv.py From Nested-Cross-Validation with MIT License | 6 votes |
def _predict_and_score(self, X_test, y_test): #XXX: Implement type_of_target(y) if(self.predict_proba): y_type = type_of_target(y_test) if(y_type in ('binary')): pred = self.model.predict_proba(X_test)[:,1] else: pred = self.model.predict_proba(X_test) else: pred = self.model.predict(X_test) if(self.multiclass_average == 'binary'): return self.metric(y_test, pred), pred else: return self.metric(y_test, pred, average=self.multiclass_average), pred
Example #3
Source File: WOE_IV.py From exploripy with MIT License | 6 votes |
def feature_discretion(self, X):
    """Discretize the continuous columns of ``X``; leave the others as is.

    Each column ``X[:, i]`` is classified with ``type_of_target``; columns
    reported as 'continuous' are passed through ``self.discrete``.

    :param X: numpy array
    :return: numpy array built from the processed columns, transposed so
             that columns line up with the input again
    """
    columns = []
    for col_idx in range(X.shape[-1]):
        column = X[:, col_idx]
        is_continuous = type_of_target(column) == 'continuous'
        columns.append(self.discrete(column) if is_continuous else column)
    return np.array(columns).T
Example #4
Source File: test_multiclass.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_type_of_target():
    """Check ``type_of_target`` on valid, invalid and legacy inputs.

    * every example in ``EXAMPLES`` must map to its group name;
    * non-array-like inputs and legacy sequence-of-sequences multilabel
      inputs must raise ``ValueError`` with the expected message;
    * a pandas ``SparseSeries`` must be rejected (the test is skipped when
      ``SparseSeries`` cannot be imported).
    """
    for group, group_examples in iteritems(EXAMPLES):
        for example in group_examples:
            assert_equal(type_of_target(example), group,
                         msg=('type_of_target(%r) should be %r, got %r'
                              % (example, group, type_of_target(example))))

    # Raw string: "\(" is a regex escape, not a valid Python string escape.
    msg_regex = r'Expected array-like \(array or non-string sequence\).*'
    for example in NON_ARRAY_LIKE_EXAMPLES:
        assert_raises_regex(ValueError, msg_regex, type_of_target, example)

    msg = ('You appear to be using a legacy multi-label data '
           'representation. Sequence of sequences are no longer supported;'
           ' use a binary array or sparse matrix instead.')
    for example in MULTILABEL_SEQUENCES:
        assert_raises_regex(ValueError, msg, type_of_target, example)

    try:
        from pandas import SparseSeries
    except ImportError:
        raise SkipTest("Pandas not found")

    y = SparseSeries([1, 0, 0, 1, 0])
    msg = "y cannot be class 'SparseSeries'."
    assert_raises_regex(ValueError, msg, type_of_target, y)
Example #5
Source File: information_value.py From information_value with MIT License | 6 votes |
def feature_discretion(self, X):
    """Return ``X`` with its continuous columns discretized.

    A column counts as continuous when ``type_of_target`` reports
    'continuous' for it; such columns are replaced by
    ``self.discrete(column)``. All other columns are kept unchanged.

    :param X: numpy array
    :return: numpy array of the processed columns (transposed back so each
             processed column is a column of the result)
    """
    def _process(column):
        # Only continuous columns go through the discretizer.
        if type_of_target(column) == 'continuous':
            return self.discrete(column)
        return column

    processed = [_process(X[:, j]) for j in range(X.shape[-1])]
    return np.array(processed).T
Example #6
Source File: base.py From polylearn with BSD 2-Clause "Simplified" License | 6 votes |
def _check_X_y(self, X, y):
    """Validate ``X``/``y`` for a binary problem and encode ``y`` as -1/+1.

    Raises ``TypeError`` when ``y`` has two or more columns or is not a
    binary target. Otherwise the inputs go through ``check_X_y``, a
    ``LabelBinarizer(pos_label=1, neg_label=-1)`` is fitted and stored on
    ``self.label_binarizer_``, and the flattened double-precision encoded
    labels are returned together with the validated ``X``.
    """
    # Raise a friendlier error up front (the original note mentions
    # "sklearn < 1.17", possibly meaning 0.17 -- unverified).
    has_many_columns = False
    if hasattr(y, 'shape'):
        dims = y.shape
        has_many_columns = len(dims) > 1 and dims[1] >= 2

    if has_many_columns or type_of_target(y) != 'binary':
        raise TypeError("Only binary targets supported. For training "
                        "multiclass or multilabel models, you may use the "
                        "OneVsRest or OneVsAll metaestimators in "
                        "scikit-learn.")

    X, labels = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                          multi_output=False)

    self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
    encoded = self.label_binarizer_.fit_transform(labels)
    y = encoded.ravel().astype(np.double)
    return X, y
Example #7
Source File: _search.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def check_cv(cv=3, y=None, classifier=False):
    """Dask-aware variant of ``sklearn.model_selection.check_cv``.

    Defers to the scikit-learn implementation unless ``y`` is a dask
    collection *and* ``cv`` is an integer. In that remaining case the target
    type is computed through dask; classifiers with a binary or multiclass
    target get ``StratifiedKFold(cv)``, everything else gets ``KFold(cv)``.
    """
    if cv is None:
        cv = 3

    # With a non-integer ``cv`` scikit-learn never inspects ``y``, so a dask
    # object can be handed over untouched.
    needs_dask_path = is_dask_collection(y) and isinstance(cv, numbers.Integral)
    if not needs_dask_path:
        return model_selection.check_cv(cv, y, classifier=classifier)

    if classifier:
        # ``y`` is lazy here: evaluate the target type as a delayed task.
        target_type = delayed(type_of_target, pure=True)(y).compute()
        if target_type in ("binary", "multiclass"):
            return StratifiedKFold(cv)

    return KFold(cv)
Example #8
Source File: nn.py From tpot with GNU Lesser General Public License v3.0 | 6 votes |
def validate_inputs(self, X, y):
    """Validate ``X`` and ``y`` and return them as ``(X, y)``.

    Deliberately rejected until they have been tested:
    sparse inputs, non-binary targets, non-finite values and complex data.

    Raises
    ------
    ValueError
        If the target is not binary, if either array holds complex values,
        or if object-dtype arrays cannot be converted to float (``X``) /
        int (``y``). ``check_X_y`` and ``assert_all_finite`` raise their
        own errors for the remaining cases.
    """
    X, y = check_X_y(X, y, accept_sparse=False, allow_nd=False)

    # assert_all_finite validates one array per call; its second argument
    # is ``allow_nan``, so X and y must be checked separately.
    assert_all_finite(X)
    assert_all_finite(y)

    if type_of_target(y) != 'binary':
        raise ValueError("Non-binary targets not supported")

    if np.any(np.iscomplex(X)) or np.any(np.iscomplex(y)):
        raise ValueError("Complex data not supported")

    if np.issubdtype(X.dtype, np.object_) or np.issubdtype(y.dtype, np.object_):
        try:
            X = X.astype(float)
            y = y.astype(int)
        except (TypeError, ValueError):
            raise ValueError("argument must be a string.* number")

    return (X, y)
Example #9
Source File: ml_stratifiers.py From iterative-stratification with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _make_test_folds(self, X, y):
    """Assign a test fold to every sample of a multilabel-indicator ``y``.

    ``y`` is cast to a boolean array and must be of type
    'multilabel-indicator', otherwise ``ValueError`` is raised. Samples are
    optionally shuffled (``self.shuffle``), handed to
    ``IterativeStratification`` with equal fold proportions, and the result
    is mapped back to the original sample order.
    """
    y = np.asarray(y, dtype=bool)
    y_kind = type_of_target(y)
    if y_kind != 'multilabel-indicator':
        raise ValueError(
            'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(y_kind))

    rng = check_random_state(self.random_state)
    order = np.arange(y.shape[0])
    if self.shuffle:
        rng.shuffle(order)
        y = y[order]

    # Every fold receives the same share of the samples.
    fold_share = np.asarray([1 / self.n_splits] * self.n_splits)
    folds = IterativeStratification(labels=y, r=fold_share, random_state=rng)

    # argsort of the permutation undoes the shuffle.
    return folds[np.argsort(order)]
Example #10
Source File: labels.py From pumpp with ISC License | 6 votes |
def fit(self, y):
    """Fit the label binarizer.

    Parameters
    ----------
    y : array of shape [n_samples,] or [n_samples, n_classes]
        Target values. A 2-d matrix should only contain 0 and 1 and
        represents multilabel classification.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If ``y`` is a multioutput target or has zero samples.
    """
    target_kind = type_of_target(y)
    self.y_type_ = target_kind

    if 'multioutput' in target_kind:
        raise ValueError("Multioutput target data is not supported with "
                         "label binarization")

    if _num_samples(y) == 0:
        raise ValueError('y has 0 samples: %r' % y)

    self.sparse_input_ = sp.issparse(y)
    self.classes_ = unique_labels(y)
    return self
Example #11
Source File: mlp_classifier.py From muffnn with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _is_multilabel(self, y):
    """Tell whether target array ``y`` describes a multilabel problem.

    Works on a copy of ``y`` in which every -1 entry is replaced by 1
    (presumably -1 marks a missing label -- confirm with callers) before
    the target type is determined.

    Returns ``True`` for 'multilabel-indicator', ``False`` for 'binary' or
    'multiclass', and raises ``ValueError`` for any other target type.
    """
    probe = y.copy()
    minus_one_mask = np.zeros_like(probe, dtype=bool) | (probe == -1)
    probe[minus_one_mask] = 1

    kind = type_of_target(probe)
    if kind == 'multilabel-indicator':
        return True
    if kind in ['binary', 'multiclass']:
        return False

    # Same failure mode as
    # sklearn.utils.multiclass.check_classification_targets.
    raise ValueError("Unknown label type: %s" % kind)
Example #12
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_type_of_target():
    """Exercise ``type_of_target`` on valid, invalid and legacy inputs.

    Valid examples must be classified as their ``EXAMPLES`` group; the
    non-array-like, legacy multilabel and ``SparseSeries`` inputs must each
    raise ``ValueError`` with the matching message. The ``SparseSeries``
    part is skipped when it cannot be imported from pandas.
    """
    for expected_kind, samples in EXAMPLES.items():
        for sample in samples:
            failure_text = ('type_of_target(%r) should be %r, got %r'
                            % (sample, expected_kind, type_of_target(sample)))
            assert_equal(type_of_target(sample), expected_kind,
                         msg=failure_text)

    not_array_like = r'Expected array-like \(array or non-string sequence\).*'
    for sample in NON_ARRAY_LIKE_EXAMPLES:
        assert_raises_regex(ValueError, not_array_like, type_of_target, sample)

    legacy_multilabel = ('You appear to be using a legacy multi-label data '
                         'representation. Sequence of sequences are no longer supported;'
                         ' use a binary array or sparse matrix instead.')
    for sample in MULTILABEL_SEQUENCES:
        assert_raises_regex(ValueError, legacy_multilabel, type_of_target,
                            sample)

    try:
        from pandas import SparseSeries
    except ImportError:
        raise SkipTest("Pandas not found")

    sparse_y = SparseSeries([1, 0, 0, 1, 0])
    assert_raises_regex(ValueError, "y cannot be class 'SparseSeries'.",
                        type_of_target, sparse_y)
Example #13
Source File: test_averaging.py From nyaggle with MIT License | 6 votes |
def _make_1st_stage_preds(X, y, X_test):
    """Cross-validate three first-stage models and collect their predictions.

    Regressors are used when ``y`` is a continuous target, classifiers
    otherwise. Returns two lists, one entry per model: the out-of-fold
    predictions and the test-set predictions.
    """
    if type_of_target(y) == 'continuous':
        models = [
            SVR(),
            Ridge(random_state=0),
            RandomForestRegressor(n_estimators=30, random_state=0),
        ]
    else:
        models = [
            SVC(random_state=0),
            LogisticRegression(random_state=0),
            RandomForestClassifier(n_estimators=30, random_state=0),
        ]

    oof_predictions = []
    test_predictions = []
    for model in models:
        outcome = cross_validate(model, X, y, X_test, cv=5)
        oof_predictions.append(outcome.oof_prediction)
        test_predictions.append(outcome.test_prediction)
    return oof_predictions, test_predictions
Example #14
Source File: test_stacking.py From nyaggle with MIT License | 6 votes |
def _make_1st_stage_preds(X, y, X_test):
    """Build first-stage predictions from three cross-validated models.

    A continuous ``y`` selects the regressor trio; any other target type
    selects the classifier trio. Each model is run through
    ``cross_validate(..., cv=5)``.

    Returns ``(oof_predictions, test_predictions)``, two lists aligned with
    the model order.
    """
    is_regression = type_of_target(y) == 'continuous'
    if is_regression:
        base_models = [
            SVR(),
            Ridge(random_state=0),
            RandomForestRegressor(n_estimators=30, random_state=0),
        ]
    else:
        base_models = [
            SVC(random_state=0),
            LogisticRegression(random_state=0),
            RandomForestClassifier(n_estimators=30, random_state=0),
        ]

    cv_results = [cross_validate(est, X, y, X_test, cv=5)
                  for est in base_models]
    oof = [res.oof_prediction for res in cv_results]
    test = [res.test_prediction for res in cv_results]
    return oof, test
Example #15
Source File: labels.py From pumpp with ISC License | 5 votes |
def transform(self, y):
    """Transform multi-class labels to binary labels.

    The output is sometimes called the 1-of-K coding scheme.

    Parameters
    ----------
    y : array or sparse matrix of shape [n_samples,] or \
        [n_samples, n_classes]
        Target values. A 2-d matrix should only contain 0 and 1 and
        represents multilabel classification. Sparse matrix can be
        CSR, CSC, COO, DOK, or LIL.

    Returns
    -------
    Y : numpy array or CSR matrix of shape [n_samples, n_classes]
        Shape will be [n_samples, 1] for binary problems.

    Raises
    ------
    ValueError
        If ``y`` is multilabel but the binarizer was fitted on
        non-multilabel data.
    """
    check_is_fitted(self, 'classes_')

    if type_of_target(y).startswith('multilabel'):
        # Multilabel input is only valid for a multilabel-fitted binarizer.
        if not self.y_type_.startswith('multilabel'):
            raise ValueError("The object was not fitted with multilabel"
                             " input.")

    return label_binarize(
        y,
        self.classes_,
        pos_label=self.pos_label,
        neg_label=self.neg_label,
        sparse_output=self.sparse_output,
    )
Example #16
Source File: WOE_IV.py From exploripy with MIT License | 5 votes |
def check_target_binary(self, y):
    """Ensure the target variable is binary.

    :param y: target values, classified with ``type_of_target``
    :return: None
    :raises ValueError: if the detected target type is not 'binary'
    """
    if type_of_target(y) in ['binary']:
        return
    raise ValueError('Label type must be binary')
Example #17
Source File: test_label.py From twitter-stock-recommendation with MIT License | 5 votes |
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    Runs once with sparse output and once with dense output. Sparse output
    combined with ``pos_label == 0`` or ``neg_label != 0`` must raise
    ``ValueError``; otherwise the binarized result, its sparsity, and the
    inverse transforms are all verified.
    """
    for sparse_output in [True, False]:
        label_opts = dict(neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)

        if sparse_output and (pos_label == 0 or neg_label != 0):
            assert_raises(ValueError, label_binarize, y, classes,
                          **label_opts)
            continue

        # Functional API.
        binarized = label_binarize(y, classes, **label_opts)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # Inverse mapping, chosen by target type.
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized,
                                                    classes=classes)
        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))
        assert_array_equal(toarray(inversed), toarray(y))

        # Estimator API.
        lb = LabelBinarizer(**label_opts)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert_equal(issparse(inverse_output), issparse(y))
Example #18
Source File: split.py From nyaggle with MIT License | 5 votes |
def check_cv(cv: Union[int, Iterable, BaseCrossValidator] = 5,
             y: Optional[Union[pd.Series, np.ndarray]] = None,
             stratified: bool = False,
             random_state: int = 0):
    """Build a cross-validation splitter from ``cv``.

    * ``cv`` is ``None``: treated as 5.
    * ``cv`` is an integer: returns a shuffled ``StratifiedKFold`` when
      ``stratified`` is set and ``y`` is a binary or multiclass target,
      otherwise a shuffled ``KFold``; both are seeded with
      ``random_state``.
    * anything else: delegated to ``sklearn.model_selection.check_cv``,
      with ``stratified`` forwarded as its ``classifier`` flag.
    """
    if cv is None:
        cv = 5

    if isinstance(cv, numbers.Integral):
        if stratified and (y is not None) and (type_of_target(y) in ('binary', 'multiclass')):
            return StratifiedKFold(cv, shuffle=True, random_state=random_state)
        return KFold(cv, shuffle=True, random_state=random_state)

    # ``classifier`` is keyword-only in current scikit-learn releases, so it
    # must be passed by name rather than as a third positional argument.
    return model_selection.check_cv(cv, y, classifier=stratified)
Example #19
Source File: test_common.py From twitter-stock-recommendation with MIT License | 5 votes |
def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score):
    """Run the averaging checks for the metric registered under ``name``.

    Metrics listed in ``METRICS_WITH_AVERAGING`` are checked on the
    predictions; metrics listed in ``THRESHOLDED_METRICS_WITH_AVERAGING``
    are checked on the scores. Any other name raises ``ValueError``.
    """
    is_multilabel = type_of_target(y_true).startswith("multilabel")
    metric = ALL_METRICS[name]

    if name in METRICS_WITH_AVERAGING:
        estimate, estimate_binarized = y_pred, y_pred_binarize
    elif name in THRESHOLDED_METRICS_WITH_AVERAGING:
        # Thresholded metrics take the raw scores in both positions.
        estimate, estimate_binarized = y_score, y_score
    else:
        raise ValueError("Metric is not recorded as having an average option")

    _check_averaging(metric, y_true, estimate, y_true_binarize,
                     estimate_binarized, is_multilabel)
Example #20
Source File: __init__.py From autogluon with Apache License 2.0 | 5 votes |
def __call__(self, y_true, y_pred, sample_weight=None):
    """Evaluate decision function output for X relative to y_true.

    Parameters
    ----------
    y_true : array-like
        Gold standard target values for X. These must be class labels,
        not probabilities.

    y_pred : array-like, [n_samples x n_classes]
        Model predictions

    sample_weight : array-like, optional (default=None)
        Sample weights.

    Returns
    -------
    score : float
        Score function applied to prediction of estimator on X,
        multiplied by ``self._sign``.

    Raises
    ------
    ValueError
        If the type of ``y_true`` is neither 'binary' nor
        'multilabel-indicator'.
    """
    if isinstance(y_true, list):
        y_true = np.array(y_true)
    if isinstance(y_pred, list):
        y_pred = np.array(y_pred)

    y_type = type_of_target(y_true)
    if y_type not in ("binary", "multilabel-indicator"):
        raise ValueError("{0} format is not supported".format(y_type))

    # ``y_pred`` is scored as given for both supported target types. List
    # input has already been converted to an ndarray above, so a separate
    # list-handling branch could never run.
    if sample_weight is not None:
        return self._sign * self._score_func(y_true, y_pred,
                                             sample_weight=sample_weight,
                                             **self._kwargs)
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
Example #21
Source File: information_value.py From information_value with MIT License | 5 votes |
def check_target_binary(self, y):
    """Raise unless ``y`` is a binary target.

    :param y: target values; their type is detected with ``type_of_target``
    :return: None
    :raises ValueError: when the detected type differs from 'binary'
    """
    detected = type_of_target(y)
    if detected != 'binary':
        raise ValueError('Label type must be binary')
Example #22
Source File: test_corner_cases.py From smrt with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_label_corner_cases():
    """``smote_balance`` must reject three kinds of unusable label vectors.

    1. more distinct classes than ``base.MAX_N_CLASSES``;
    2. a continuous target;
    3. a binary target whose minority class has a single observation.
    """
    # One class more than the allowed maximum; each label appears twice so
    # that sklearn sees a multiclass rather than a continuous target.
    n_classes = base.MAX_N_CLASSES + 1
    too_many_labels = np.tile(np.arange(n_classes), 2)

    detected = type_of_target(too_many_labels)
    assert detected == 'multiclass', detected

    # The feature values are irrelevant; only the row count must match.
    features = np.random.rand(too_many_labels.shape[0], 4)
    assert_raises(ValueError, smote_balance, features, too_many_labels)

    # Continuous labels are an unsupported target type.
    continuous_labels = np.linspace(0, 1000, features.shape[0])
    assert_raises(ValueError, smote_balance, features, continuous_labels)

    # Binary target with exactly one positive observation.
    lonely_labels = np.zeros(features.shape[0])
    lonely_labels[0] = 1
    detected = type_of_target(lonely_labels)
    assert detected == 'binary', detected

    assert_raises(ValueError, smote_balance, features, lonely_labels)
Example #23
Source File: test_common.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score):
    """Dispatch the averaging check appropriate for metric ``name``.

    The metric is looked up in ``ALL_METRICS``. Plain averaging metrics are
    checked with the predictions, thresholded averaging metrics with the
    scores; a metric in neither group raises ``ValueError``.
    """
    multilabel = type_of_target(y_true).startswith("multilabel")
    metric = ALL_METRICS[name]

    if name in METRICS_WITH_AVERAGING:
        _check_averaging(metric, y_true, y_pred, y_true_binarize,
                         y_pred_binarize, multilabel)
        return

    if name in THRESHOLDED_METRICS_WITH_AVERAGING:
        _check_averaging(metric, y_true, y_score, y_true_binarize,
                         y_score, multilabel)
        return

    raise ValueError("Metric is not recorded as having an average option")
Example #24
Source File: preprocessing.py From reportgen with MIT License | 5 votes |
def _posibility(self, x, tag, event=1):
    """Compute event / non-event rates for each value of a discrete feature.

    Parameters:
    ----------
    x (Sequence):
        - discrete feature sequence
    tag (Sequence):
        - label sequence used for training; must be binary
    event (any):
        - the label value that counts as the event (the "True" outcome)

    Returns:
    ----------
    Dict[str, Tuple[rate_T, rate_F]]:
        - for every non-null value of ``x``: the share of all events and
          the share of all non-events that fall on that value

    Raises:
    ----------
    AttributeError:
        - if ``tag`` is not a binary target
    """
    if type_of_target(tag) not in ['binary']:
        raise AttributeError("tag must be a binary array")
    # Note: ``x`` itself is not checked for being non-continuous.

    tag = np.array(tag)
    x = np.array(x)

    n_event = (tag == event).sum()
    n_non_event = tag.shape[-1] - n_event

    rates = {}
    for value in pd.unique(x[pd.notnull(x)]):
        # For a NaN value the selection below would be empty.
        selected = tag[np.where(x == value)[0]]
        hits = (selected == event).sum()
        misses = selected.shape[-1] - hits
        event_rate = 1.0 * hits / n_event
        non_event_rate = 1.0 * misses / n_non_event
        rates[value] = (event_rate, non_event_rate)
    return rates
Example #25
Source File: metrics.py From hyperparameter_hunter with MIT License | 5 votes |
def get_clean_prediction(target: ArrayLike, prediction: ArrayLike):
    """Return ``prediction`` in a form that can be compared with ``target``.

    Parameters
    ----------
    target: Array-like
        True labels for the data. Should be same shape as `prediction`
    prediction: Array-like
        Predicted labels for the data. Should be same shape as `target`

    Returns
    -------
    prediction: Array-like
        If `target` holds ints and `prediction` does not: the predictions
        clipped to the min/max of `target` and rounded to the nearest
        integer. If `target` is a classification target type and
        `prediction` is continuous: the output of ``classify_output``.
        Otherwise the predictions unchanged."""
    target_kind = type_of_target(target)
    prediction_kind = type_of_target(prediction)
    # A ValueError around here is probably: "Classification metrics can't
    # handle a mix of binary and continuous targets"

    if _is_int(target) and not _is_int(prediction):
        # ---- bounds of the target ----
        lower, upper = target.min(), target.max()
        # Scalars have no len(); the TypeError is ignored and they are
        # used as they are. One-element containers are unwrapped.
        with suppress(TypeError):
            if (len(lower) == 1) and (len(upper) == 1):
                lower, upper = lower[0], upper[0]

        # ---- clip, then round ----
        try:
            prediction = np.clip(prediction, lower, upper)
        except ValueError:
            prediction = prediction.clip(lower, upper, axis=1)
        finally:
            prediction = np.rint(prediction.astype(np.float64))
    elif target_kind in classification_target_types and prediction_kind.startswith("continuous"):
        prediction = classify_output(target, prediction)

    # TODO: One-hot-encoded outputs will be of type "multiclass-multioutput" - Handle it
    return prediction
Example #26
Source File: ml_stratifiers.py From iterative-stratification with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _iter_indices(self, X, y, groups=None):
    """Yield ``(train, test)`` index arrays for ``self.n_splits`` splits.

    ``y`` is validated, cast to a boolean array and must be of type
    'multilabel-indicator' (``ValueError`` otherwise). For every split the
    samples are shuffled, stratified with ``IterativeStratification`` using
    the train/test proportions, and the fold assignment is mapped back to
    the original sample order. ``groups`` is not used.
    """
    n_samples = _num_samples(X)
    y = check_array(y, ensure_2d=False, dtype=None)
    y = np.asarray(y, dtype=bool)

    y_kind = type_of_target(y)
    if y_kind != 'multilabel-indicator':
        raise ValueError(
            'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                y_kind))

    n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                              self.train_size)

    n_samples = y.shape[0]
    rng = check_random_state(self.random_state)
    labels_in_order = y.copy()

    # Proportions of the train and test partitions.
    proportions = np.array([n_train, n_test]) / (n_train + n_test)

    for _ in range(self.n_splits):
        permutation = np.arange(n_samples)
        rng.shuffle(permutation)
        y = labels_in_order[permutation]

        assignment = IterativeStratification(labels=y, r=proportions,
                                             random_state=rng)
        # Undo the shuffle; samples assigned to fold 1 form the test set.
        in_test = assignment[np.argsort(permutation)] == 1

        test = np.where(in_test)[0]
        train = np.where(~in_test)[0]
        yield train, test
Example #27
Source File: test_label.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Verify binarization of ``y`` and its inverse, sparse and dense.

    For sparse output with ``pos_label == 0`` or ``neg_label != 0`` only the
    ``ValueError`` from ``label_binarize`` is checked. In every other case
    both ``label_binarize`` and ``LabelBinarizer`` must reproduce
    ``expected`` and invert back to ``y``.
    """
    # Sparse output is only possible with a zero neg_label and a non-zero
    # pos_label.
    sparse_incompatible = pos_label == 0 or neg_label != 0
    midpoint = (neg_label + pos_label) / 2.

    for sparse_output in [True, False]:
        if sparse_incompatible and sparse_output:
            assert_raises(ValueError, label_binarize, y, classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # label_binarize
        encoded = label_binarize(y, classes, neg_label=neg_label,
                                 pos_label=pos_label,
                                 sparse_output=sparse_output)
        assert_array_equal(toarray(encoded), expected)
        assert_equal(issparse(encoded), sparse_output)

        # inverse of label_binarize
        y_type = type_of_target(y)
        if y_type == "multiclass":
            decoded = _inverse_binarize_multiclass(encoded, classes=classes)
        else:
            decoded = _inverse_binarize_thresholding(encoded,
                                                     output_type=y_type,
                                                     classes=classes,
                                                     threshold=midpoint)
        assert_array_equal(toarray(decoded), toarray(y))

        # LabelBinarizer round trip
        binarizer = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                                   sparse_output=sparse_output)
        encoded = binarizer.fit_transform(y)
        assert_array_equal(toarray(encoded), expected)
        assert_equal(issparse(encoded), sparse_output)

        round_trip = binarizer.inverse_transform(encoded)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert_equal(issparse(round_trip), issparse(y))
Example #28
Source File: models.py From scikit-uplift with MIT License | 5 votes |
def fit(self, X, y, treatment, estimator_fit_params=None):
    """Fit the model according to the given training data.

    The estimator is trained on a transformed target that is 1 where
    ``y`` equals ``treatment`` and 0 elsewhere.

    Args:
        X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of
            samples and n_features is the number of features.
        y (array-like, shape (n_samples,)): Target vector relative to X.
        treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
        estimator_fit_params (dict, optional): Parameters to pass to the fit method of the estimator.

    Returns:
        object: self

    Raises:
        ValueError: If ``y`` is not a binary target, or if ``treatment``
            holds fewer than two distinct values.
    """
    # TODO: check the treatment is binary (more than two groups are
    # currently not rejected).
    check_consistent_length(X, y, treatment)

    self._type_of_target = type_of_target(y)
    if self._type_of_target != 'binary':
        raise ValueError("This approach is only suitable for binary classification problem")

    _, treatment_counts = np.unique(treatment, return_counts=True)
    # Fail with a clear message instead of an IndexError on the comparison
    # below when only one treatment group is present.
    if len(treatment_counts) < 2:
        raise ValueError(
            "Expected two unique values in treatment vector, got %s" % len(treatment_counts)
        )
    if treatment_counts[0] != treatment_counts[1]:
        warnings.warn(
            "It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.",
            category=UserWarning,
            stacklevel=2
        )

    y_mod = (np.array(y) == np.array(treatment)).astype(int)

    if estimator_fit_params is None:
        estimator_fit_params = {}
    self.estimator.fit(X, y_mod, **estimator_fit_params)
    return self
Example #29
Source File: base.py From smrt with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _validate_X_y_ratio_classes(X, y, ratio):
    """Validate balancer inputs and derive class statistics.

    Returns the tuple ``(X, y, n_classes, present_classes, counts,
    majority_label, target_count)`` where ``target_count`` is
    ``max(int(ratio * majority_count), 1)``.

    Raises ``ValueError`` when ``y`` is neither binary nor multiclass, when
    there are more than ``MAX_N_CLASSES`` distinct labels, or when any
    label occurs fewer than ``MIN_N_SAMPLES`` times.
    """
    # Cheap check first, before any array is copied.
    validate_float(ratio, 'balance_ratio')

    X = check_array(X, accept_sparse=False, dtype=NPDTYPE,
                    ensure_2d=True, copy=True)
    y = check_array(y, accept_sparse=False, ensure_2d=False, dtype=None)
    y = column_or_1d(y, warn=False)

    # Make sure y really holds class labels (not floats or the like)
    # before counting classes.
    label_kind = type_of_target(y)
    supported_types = {'multiclass', 'binary'}
    if label_kind not in supported_types:
        raise ValueError('balancers only support %r, but got %r'
                         % ("(" + ', '.join(supported_types) + ")", label_kind))

    present_classes, counts = np.unique(y, return_counts=True)
    n_classes = len(present_classes)
    if n_classes > MAX_N_CLASSES:
        raise ValueError('balancers currently only support a maximum of %i '
                         'unique class labels, but %i were identified.'
                         % (MAX_N_CLASSES, n_classes))

    # Majority class and the per-class target size derived from it.
    majority_idx = np.argmax(counts, axis=0)
    majority_label = present_classes[majority_idx]
    majority_count = counts[majority_idx]
    target_count = max(int(ratio * majority_count), 1)

    # An earlier version also rejected classes that would need more
    # synthetic examples than they have real ones. That restriction was
    # dropped: heavily bolstering a rare class (e.g. a rare disease) is a
    # valid use-case, even if it is discouraged.

    if (counts < MIN_N_SAMPLES).any():
        raise ValueError('All label counts must be >= %i' % MIN_N_SAMPLES)

    return X, y, n_classes, present_classes, counts, majority_label, target_count
Example #30
Source File: base.py From skoot with MIT License | 4 votes |
def _validate_X_y_ratio_classes(X, y, ratio):
    """Check the balancer inputs and compute the class bookkeeping values.

    Returns ``(X, y, n_classes, present_classes, counts, majority_label,
    target_count)``; ``target_count`` is ``ratio`` times the majority class
    count, truncated to int and never below 1.

    Raises ``ValueError`` for an unsupported target type, for more than
    ``MAX_N_CLASSES`` distinct labels, or for any label with fewer than
    ``MIN_N_SAMPLES`` occurrences.
    """
    # Validate the cheap scalar before touching the arrays.
    validate_float(ratio, 'balance_ratio')

    X, y = indexable(X, y)  # keeps pd.DataFrame input usable
    y = column_or_1d(y, warn=False)  # type: np.ndarray

    # The labels must be genuine class labels before classes are counted.
    detected_type = type_of_target(y)
    supported_types = {'multiclass', 'binary'}
    if detected_type not in supported_types:
        raise ValueError('balancers only support %r, but got %r'
                         % ("(" + ', '.join(supported_types) + ")",
                            detected_type))

    present_classes, counts = np.unique(y, return_counts=True)
    n_classes = len(present_classes)

    if n_classes > MAX_N_CLASSES:
        raise ValueError('balancers currently only support a maximum of %i '
                         'unique class labels, but %i were identified.'
                         % (MAX_N_CLASSES, n_classes))

    # Majority label, its count, and the resulting per-class target size.
    top = np.argmax(counts, axis=0)
    majority_label, majority_count = present_classes[top], counts[top]
    target_count = max(int(ratio * majority_count), 1)

    # Historical note: classes needing more synthetic than real examples
    # used to be rejected. That is allowed now, since strongly bolstering a
    # rare class is a legitimate (if discouraged) use-case.

    for class_count in counts:
        if class_count < MIN_N_SAMPLES:
            raise ValueError('All label counts must be >= %i' % MIN_N_SAMPLES)

    return (X, y, n_classes, present_classes,
            counts, majority_label, target_count)