Python sklearn.utils.validation.column_or_1d() Examples
The following are 21 code examples of sklearn.utils.validation.column_or_1d().
You may also want to check out all available functions/classes of the module sklearn.utils.validation.
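Before the project examples, a minimal sketch of what column_or_1d itself does: it ravels a shape-(n, 1) column to shape (n,), optionally warning about the conversion, and rejects anything wider:

import numpy as np
from sklearn.utils.validation import column_or_1d

y = np.array([[1], [2], [3]])            # shape (3, 1)
print(column_or_1d(y).shape)             # (3,) -- silently ravelled
print(column_or_1d(y, warn=True).shape)  # (3,), emits a DataConversionWarning first
# column_or_1d(np.ones((3, 2)))          # would raise ValueError: bad input shape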
Example #1
Source File: utils.py From pmdarima with MIT License
def is_constant(x):
    """Test ``x`` for constancy.

    Determine whether a vector is composed of all of the same elements
    and nothing else.

    Parameters
    ----------
    x : array-like, shape=(n_samples,)
        The time series vector.

    Examples
    --------
    >>> import numpy as np
    >>> x = np.array([1, 2, 3])
    >>> y = np.ones(3)
    >>> [is_constant(x), is_constant(y)]
    [False, True]
    """
    x = column_or_1d(x)  # type: np.ndarray
    return (x == x[0]).all()
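Because is_constant routes its input through column_or_1d, a (n, 1) column vector works just as well as a flat array. A quick standalone check (re-declaring the function so the snippet runs on its own):

import numpy as np
from sklearn.utils.validation import column_or_1d

def is_constant(x):
    x = column_or_1d(x)
    return (x == x[0]).all()

print(is_constant(np.array([[3.0], [3.0], [3.0]])))  # True -- the column is ravelled first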
Example #2
Source File: Estimator.py From tbats with MIT License
def _validate(self, y):
    """Validates input time series. Also adjusts box_cox if necessary."""
    try:
        y = c1d(check_array(y, ensure_2d=False, force_all_finite=True,
                            ensure_min_samples=1, copy=True,
                            dtype=np.float64))  # type: np.ndarray
    except Exception as validation_exception:
        self.context.get_exception_handler().exception(
            "y series is invalid", error.InputArgsException,
            previous_exception=validation_exception
        )
        return False
    if np.any(y <= 0):
        if self.use_box_cox is True:
            self.context.get_exception_handler().warn(
                "Box-Cox transformation (use_box_cox) was forced to True "
                "but there are negative values in input series. "
                "Setting use_box_cox to False.",
                error.InputArgsWarning
            )
        self.use_box_cox = False
    return y
Example #3
Source File: transformations.py From keras-pandas with MIT License
def fit(self, y):
    """Fit label encoder

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    self : returns an instance of self.
    """
    y = column_or_1d(y, warn=True)
    y = numpy.append(y, ['UNK'])
    self.classes_ = numpy.unique(y)
    return self
Example #4
Source File: transformations.py From keras-pandas with MIT License
def fit_transform(self, y, **kwargs):
    """Fit label encoder and return encoded labels

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Target values.
    **kwargs

    Returns
    -------
    y : array-like of shape [n_samples]
    """
    y = column_or_1d(y, warn=True)
    y = numpy.append(y, ['UNK'])
    self.classes_, y = numpy.unique(y, return_inverse=True)
    return y
Example #5
Source File: transformations.py From keras-pandas with MIT License
def transform(self, y):
    """Transform labels to normalized encoding.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Target values.

    Returns
    -------
    y : array-like of shape [n_samples]
    """
    check_is_fitted(self, 'classes_')
    y = column_or_1d(y, warn=True)
    y = numpy.array(list(map(lambda x: x if x in self.classes_ else 'UNK', y)))
    classes = numpy.unique(y)
    if len(numpy.intersect1d(classes, self.classes_)) < len(classes):
        diff = numpy.setdiff1d(classes, self.classes_)
        raise ValueError("y contains new labels: %s" % str(diff))
    return numpy.searchsorted(self.classes_, y)
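The three keras-pandas methods above implement a single trick: reserve an explicit 'UNK' class at fit time so that labels never seen before can still be encoded at transform time. A minimal standalone sketch of the same idea (class name hypothetical):

import numpy as np
from sklearn.utils.validation import column_or_1d

class UNKLabelEncoder:
    def fit(self, y):
        y = column_or_1d(y, warn=True)
        self.classes_ = np.unique(np.append(y, ['UNK']))
        return self

    def transform(self, y):
        y = column_or_1d(y, warn=True)
        # Map anything not seen during fit to the 'UNK' sentinel.
        y = np.array([v if v in self.classes_ else 'UNK' for v in y])
        return np.searchsorted(self.classes_, y)

enc = UNKLabelEncoder().fit(['cat', 'dog'])
print(enc.transform(['dog', 'ferret']))  # 'ferret' maps to the index of 'UNK'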
Example #6
Source File: ConditionMortalityPredictor.py From CDSS with GNU General Public License v3.0
def _select_features(self):
    # Use FeatureSelector to prune all but 100 variables.
    fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                         problem=FeatureSelector.CLASSIFICATION)
    fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
    num_features_to_select = int(0.01 * len(self._X_train.columns.values))
    fs.select(k=num_features_to_select)

    # Enumerate eliminated features pre-transformation.
    self._feature_ranks = fs.compute_ranks()
    for i in range(len(self._feature_ranks)):
        if self._feature_ranks[i] > num_features_to_select:
            self._eliminated_features.append(self._X_train.columns[i])

    self._X_train = fs.transform_matrix(self._X_train)
    self._X_test = fs.transform_matrix(self._X_test)
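FeatureSelector is a CDSS-specific wrapper; the recursive-elimination step it performs corresponds to scikit-learn's RFE. A rough standalone equivalent (the estimator choice here is an assumption, not what CDSS uses internally):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import column_or_1d

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=5)
rfe.fit(X, column_or_1d(y.reshape(-1, 1)))      # mirror the pipeline's 1d coercion
eliminated = np.where(rfe.ranking_ > 1)[0]      # kept features have ranking_ == 1
print(rfe.transform(X).shape, len(eliminated))  # (200, 5) 15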
Example #7
Source File: SupervisedLearningPipeline.py From CDSS with GNU General Public License v3.0
def _train_predictor(self, problem, classes=None, hyperparams=None):
    if problem == SupervisedLearningPipeline.CLASSIFICATION:
        if 'bifurcated' in hyperparams['algorithm']:
            learning_class = BifurcatedSupervisedClassifier
            # Strip 'bifurcated-' from algorithm for SupervisedClassifier.
            hyperparams['algorithm'] = '-'.join(hyperparams['algorithm'].split('-')[1:])
        else:
            learning_class = SupervisedClassifier
        self._predictor = learning_class(classes, hyperparams)
    elif problem == SupervisedLearningPipeline.REGRESSION:
        learning_class = Regressor
        # Note: bare `algorithm` was undefined here; read it from hyperparams.
        self._predictor = learning_class(algorithm=hyperparams['algorithm'])
    status = self._predictor.train(self._X_train, column_or_1d(self._y_train),
                                   groups=self._patIds_train)
    return status
Example #8
Source File: TestClassifierAnalyzer.py From CDSS with GNU General Public License v3.0
def setUp(self):
    log.level = logging.ERROR

    # Use simple classifier and test case for testing non-ROC analyses.
    X = RANDOM_10_TEST_CASE['X']
    y = RANDOM_10_TEST_CASE['y']
    self._list_classifier = ListPredictor([0, 1])
    self._lc_analyzer = ClassifierAnalyzer(self._list_classifier, X, y)

    # Use ml classifier and complex test case.
    X = RANDOM_100_TEST_CASE['X']
    y = RANDOM_100_TEST_CASE['y']
    # Generate train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123456789)
    # Train logistic regression model.
    hyperparams = {
        'algorithm': SupervisedClassifier.REGRESS_AND_ROUND,
        'random_state': 123456789
    }
    self._ml_classifier = SupervisedClassifier([0, 1], hyperparams)
    self._ml_classifier.train(X_train, column_or_1d(y_train))
    self._ml_analyzer = ClassifierAnalyzer(self._ml_classifier, X_test, y_test)
Example #9
Source File: array.py From pmdarima with MIT License
def check_endog(y, dtype=DTYPE, copy=True, force_all_finite=False):
    """Wrapper for ``check_array`` and ``column_or_1d`` from sklearn

    Parameters
    ----------
    y : array-like, shape=(n_samples,)
        The 1d endogenous array.

    dtype : string, type or None (default=np.float64)
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.

    copy : bool, optional (default=True)
        Whether a forced copy will be triggered. If copy=False, a copy might
        still be triggered by a conversion.

    force_all_finite : bool, optional (default=False)
        Whether to raise an error on np.inf and np.nan in an array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.

    Returns
    -------
    y : np.ndarray, shape=(n_samples,)
        A 1d numpy ndarray
    """
    return column_or_1d(
        check_array(y, ensure_2d=False,
                    force_all_finite=force_all_finite,
                    copy=copy, dtype=dtype))  # type: np.ndarray
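What the wrapper buys you, in two calls: check_array validates and converts the values, and column_or_1d then guarantees shape (n,). A dependency-free sketch of the same composition (function name hypothetical):

import numpy as np
import pandas as pd
from sklearn.utils import check_array
from sklearn.utils.validation import column_or_1d

def check_endog_sketch(y, dtype=np.float64, copy=True):
    # check_array validates/converts; column_or_1d guarantees shape (n,).
    return column_or_1d(check_array(y, ensure_2d=False, dtype=dtype, copy=copy))

print(check_endog_sketch(pd.Series([1, 2, 3])).dtype)         # float64
print(check_endog_sketch(np.arange(4).reshape(-1, 1)).shape)  # (4,) -- the column is ravelled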
Example #10
Source File: BoxCox.py From tbats with MIT License
def boxcox(y, lam=None, seasonal_periods=None, bounds=(-1, 2)):
    y = c1d(check_array(y, ensure_2d=False, force_all_finite=True,
                        ensure_min_samples=1, copy=False,
                        dtype=np.float64))  # type: np.ndarray
    if lam is None:
        lam = find_box_cox_lambda(y, seasonal_periods=seasonal_periods, bounds=bounds)
    if lam <= 0 and np.any(y <= 0):
        raise error.InputArgsException('y must have only positive values for box-cox transformation.')
    if np.isclose(0.0, lam):
        return np.log(y)
    return (np.sign(y) * (np.abs(y) ** lam) - 1) / lam
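For strictly positive input and lam > 0 the formula above reduces to the textbook Box-Cox transform, so it can be sanity-checked against scipy (a quick verification sketch, not part of tbats):

import numpy as np
from scipy import stats

y = np.array([1.0, 2.0, 3.0])
lam = 0.5
ours = (np.sign(y) * np.abs(y) ** lam - 1) / lam
print(np.allclose(ours, stats.boxcox(y, lmbda=lam)))     # True
print(np.allclose(np.log(y), stats.boxcox(y, lmbda=0)))  # lam == 0 falls back to log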
Example #11
Source File: BoxCox.py From tbats with MIT License
def find_box_cox_lambda(y, seasonal_periods=None, bounds=(-1, 2)):
    y = c1d(check_array(y, ensure_2d=False, force_all_finite=True,
                        ensure_min_samples=1, copy=False,
                        dtype=np.float64))  # type: np.ndarray
    guerrero = Guerrero()
    return guerrero.find_lambda(y, seasonal_periods=seasonal_periods, bounds=bounds)
Example #12
Source File: Estimator.py From tbats with MIT License
def _normalize_seasonal_periods_to_type(self, seasonal_periods, dtype):
    """Validates seasonal periods and normalizes them

    Normalization ensures periods are of proper type, unique and sorted.
    """
    if seasonal_periods is not None:
        try:
            seasonal_periods = c1d(check_array(seasonal_periods, ensure_2d=False,
                                               force_all_finite=True,
                                               ensure_min_samples=0,
                                               copy=True, dtype=dtype))
        except Exception as validation_exception:
            self.context.get_exception_handler().exception(
                "seasonal_periods definition is invalid",
                error.InputArgsException,
                previous_exception=validation_exception)
        seasonal_periods = np.unique(seasonal_periods)
        if len(seasonal_periods[np.where(seasonal_periods <= 1)]) > 0:
            self.context.get_exception_handler().warn(
                "All seasonal periods should be values greater than 1. "
                "Ignoring all seasonal period values that do not meet this condition.",
                error.InputArgsWarning
            )
            seasonal_periods = seasonal_periods[np.where(seasonal_periods > 1)]
        seasonal_periods.sort()
        if len(seasonal_periods) == 0:
            seasonal_periods = None
    return seasonal_periods
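Stripped of the error handling, the normalization boils down to three numpy steps, shown here in isolation:

import numpy as np

periods = np.array([7.0, 1.0, 365.25, 7.0, 0.5])
periods = np.unique(periods)     # dedupe (np.unique also sorts)
periods = periods[periods > 1]   # drop degenerate periods <= 1
print(periods)                   # [  7.   365.25]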
Example #13
Source File: utils.py From AIF360 with Apache License 2.0
def check_inputs(X, y, sample_weight=None, ensure_2d=True):
    """Input validation for debiasing algorithms.

    Checks all inputs for consistent length, validates shapes (optional for X),
    and returns an array of all ones if sample_weight is ``None``.

    Args:
        X (array-like): Input data.
        y (array-like, shape = (n_samples,)): Target values.
        sample_weight (array-like, optional): Sample weights.
        ensure_2d (bool, optional): Whether to raise a ValueError if X is
            not 2D.

    Returns:
        tuple:

            * **X** (`array-like`) -- Validated X. Unchanged.
            * **y** (`array-like`) -- Validated y. Possibly converted to 1D
              if not a :class:`pandas.Series`.
            * **sample_weight** (`array-like`) -- Validated sample_weight.
              If no sample_weight is provided, returns a consistent-length
              array of ones.
    """
    if ensure_2d and X.ndim != 2:
        raise ValueError("Expected X to be 2D, got ndim == {} instead.".format(X.ndim))
    if not isinstance(y, pd.Series):  # don't cast Series -> ndarray
        y = column_or_1d(y)
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
    else:
        sample_weight = np.ones(X.shape[0])
    check_consistent_length(X, y, sample_weight)
    return X, y, sample_weight
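A sketch of the default-weight behavior, re-created inline with the same public sklearn helpers so it runs without AIF360:

import numpy as np
import pandas as pd
from sklearn.utils import check_consistent_length
from sklearn.utils.validation import column_or_1d

X = pd.DataFrame({'a': [1, 2, 3]})
y = column_or_1d(np.array([[0], [1], [0]]))  # column vector coerced to shape (3,)
sample_weight = np.ones(X.shape[0])          # default: every sample weighted 1.0
check_consistent_length(X, y, sample_weight)
print(y.shape, sample_weight)                # (3,) [1. 1. 1.]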
Example #14
Source File: SupervisedClassifier.py From CDSS with GNU General Public License v3.0
def _maybe_reshape_y(self, y):
    # If necessary, reshape y from (n_samples, 1) to (n_samples, )
    try:
        num_cols = y.shape[1]
        y = column_or_1d(y)
        log.debug('Reshaped y to 1d.')
    except IndexError:
        log.debug('Did not need to reshape y to 1d.')
    return y
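The try/except works because indexing shape[1] on a 1d ndarray raises IndexError, so only genuinely 2d input reaches the column_or_1d call. A two-case demonstration:

import numpy as np

for y in (np.zeros((5, 1)), np.zeros(5)):
    try:
        y.shape[1]  # succeeds only for the 2d column
        print('would reshape', y.shape, '->', np.ravel(y).shape)
    except IndexError:
        print('already 1d:', y.shape)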
Example #15
Source File: encoders.py From sagemaker-scikit-learn-extension with Apache License 2.0
def inverse_transform(self, y):
    """Transform labels back to original encoding.

    If ``self.fill_unseen_labels`` is ``True``, use ``self.fill_label_value`` for unseen values.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Encoded label values.

    Returns
    -------
    y_decoded : numpy array of shape [n_samples]
        Label values.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)
    if y.dtype.kind not in ("i", "u"):
        try:
            # np.float/np.int aliases were removed in NumPy >= 1.24; use builtins.
            y = y.astype(float).astype(int)
        except ValueError:
            raise ValueError("`y` contains values not convertible to integer.")
    # inverse transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])
    labels = np.arange(len(self.classes_))
    diff = np.setdiff1d(y, labels)
    # `if diff` would raise for arrays with more than one element; test size instead.
    if diff.size and not self.fill_unseen_labels:
        raise ValueError("y contains previously unseen labels: %s" % str(diff))
    y_decoded = [self.classes_[idx] if idx in labels else self.fill_label_value for idx in y]
    return y_decoded
Example #16
Source File: encoders.py From sagemaker-scikit-learn-extension with Apache License 2.0
def transform(self, y):
    """Transform labels to normalized encoding.

    If ``self.fill_unseen_labels`` is ``True``, use ``self.fill_encoded_label_value`` for unseen
    values. Seen labels are encoded with value between 0 and n_classes-1. Unseen labels are
    encoded with ``self.fill_encoded_label_value`` with a default value of n_classes.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Label values.

    Returns
    -------
    y_encoded : array-like of shape [n_samples]
        Encoded label values.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)

    # transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])

    if self.fill_unseen_labels:
        _, mask = _encode_check_unknown(y, self.classes_, return_mask=True)
        y_encoded = np.searchsorted(self.classes_, y)
        fill_encoded_label_value = self.fill_encoded_label_value or len(self.classes_)
        y_encoded[~mask] = fill_encoded_label_value
    else:
        _, y_encoded = _encode(y, uniques=self.classes_, encode=True)
    return y_encoded
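_encode and _encode_check_unknown are private scikit-learn helpers (relocated in newer releases), but the fill-unseen branch can be reproduced with public numpy calls alone (a sketch):

import numpy as np

classes = np.array(['a', 'b', 'c'])  # sorted classes_ learned at fit time
y = np.array(['b', 'zzz', 'a'])
mask = np.isin(y, classes)           # which labels were seen during fit?
y_encoded = np.searchsorted(classes, y)
y_encoded[~mask] = len(classes)      # unseen -> n_classes, the default fill value
print(y_encoded)                     # [1 3 0]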
Example #17
Source File: encoders.py From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, y):
    """Fit label encoder.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Label values.

    Returns
    -------
    self : RobustLabelEncoder.
    """
    y = column_or_1d(y, warn=True)
    self.classes_ = self._check_labels_and_sort() or _encode(y)
    return self
Example #18
Source File: array.py From pmdarima with MIT License
def as_series(x):
    """Cast as pandas Series.

    Cast an iterable to a Pandas Series object. Note that the index will
    simply be a positional ``arange`` and cannot be set in this function.

    Parameters
    ----------
    x : array-like, shape=(n_samples,)
        The 1d array to cast as a Pandas Series.

    Examples
    --------
    >>> as_series([1, 2, 3])
    0    1
    1    2
    2    3
    dtype: int64

    >>> as_series(as_series((1, 2, 3)))
    0    1
    1    2
    2    3
    dtype: int64

    >>> import pandas as pd
    >>> as_series(pd.Series([4, 5, 6], index=['a', 'b', 'c']))
    a    4
    b    5
    c    6
    dtype: int64

    Returns
    -------
    s : pd.Series
        A pandas Series object.
    """
    if isinstance(x, pd.Series):
        return x
    return pd.Series(column_or_1d(x))
Example #19
Source File: SupervisedLearningPipeline.py From CDSS with GNU General Public License v3.0
def _select_features(self, problem, percent_features_to_select, algorithm, features_to_keep=None):
    # Initialize FeatureSelector.
    fs = FeatureSelector(problem=problem, algorithm=algorithm, random_state=self._random_state)
    fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
    num_features_to_select = int(percent_features_to_select * len(self._X_train.columns.values))

    # Parse features_to_keep.
    if features_to_keep is None:
        features_to_keep = []

    # Select features.
    fs.select(k=num_features_to_select)

    # Enumerate eliminated features pre-transformation.
    feature_ranks = fs.compute_ranks()
    for i in range(len(feature_ranks)):
        if feature_ranks[i] > num_features_to_select:
            # If in features_to_keep, pretend it wasn't eliminated.
            if self._X_train.columns[i] not in features_to_keep:
                self._eliminated_features.append(self._X_train.columns[i])

    # Hack: rather than making FeatureSelector handle the concept of
    # kept features, just copy the data here and add it back to the
    # transformed matrices.
    # Rather than looping, do this individually so that we can skip if
    # transformed X already has the feature.
    if features_to_keep:
        kept_X_train_feature = self._X_train[features_to_keep].copy()
        log.debug('kept_X_train_feature.shape: %s' % str(kept_X_train_feature.shape))
        self._X_train = fs.transform_matrix(self._X_train)
        for feature in features_to_keep:
            if feature not in self._X_train:
                self._X_train = self._X_train.merge(kept_X_train_feature[[feature]],
                                                    left_index=True, right_index=True)

        kept_X_test_feature = self._X_test[features_to_keep].copy()
        log.debug('kept_X_test_feature.shape: %s' % str(kept_X_test_feature.shape))
        self._X_test = fs.transform_matrix(self._X_test)
        for feature in features_to_keep:
            if feature not in self._X_test:
                self._X_test = self._X_test.merge(kept_X_test_feature[[feature]],
                                                  left_index=True, right_index=True)
    if not features_to_keep:
        # Even if there is no feature to keep, still need to
        # perform transform_matrix to drop most low-rank features
        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)
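The stash-and-merge-back trick for features_to_keep is independent of FeatureSelector; with any pandas feature matrix it reduces to the pattern below (column names hypothetical):

import pandas as pd

X = pd.DataFrame({'f1': [1, 2], 'f2': [3, 4], 'keep_me': [5, 6]})
features_to_keep = ['keep_me']
kept = X[features_to_keep].copy()  # stash before the selector drops columns
X_selected = X[['f1']]             # stand-in for fs.transform_matrix(X)
for feature in features_to_keep:
    if feature not in X_selected:  # `in` on a DataFrame checks column names
        X_selected = X_selected.merge(kept[[feature]], left_index=True, right_index=True)
print(list(X_selected.columns))    # ['f1', 'keep_me']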
Example #20
Source File: SupervisedLearner.py From CDSS with GNU General Public License v3.0
def run(self):
    file_organizer = Syst.FileOrganizerLocal(working_folderpath=self.working_folderpath)
    raw_matrix_train, raw_matrix_test = Utils.split_rows(self.input_matrix)

    X_train_raw, y_train = Utils.split_Xy(raw_matrix_train, ylabel=self.ylabel)
    feature_processing_pipeline = Pipeline(
        memory=None,  # file_organizer.cached_pipeline_filepath,
        steps=[
            ('impute_features', Clas.FeatureImputer()),
            ('remove_features', Clas.FeatureRemover()),
            ('select_features', Clas.Select_Features())
        ]
    )
    X_train_processed = feature_processing_pipeline.fit_transform(X_train_raw, y_train)

    predictor = SupervisedClassifier(
        classes=[0, 1],
        hyperparams={
            'algorithm': 'random-forest',
            'hyperparam_strategy': SupervisedClassifier.EXHAUSTIVE_SEARCH,
            'max_iter': 1024
        }
    )
    status = predictor.train(X_train_processed, column_or_1d(y_train))

    X_test_raw, y_test = Utils.split_Xy(raw_matrix_test, ylabel=self.ylabel)
    X_test_processed = feature_processing_pipeline.transform(X_test_raw)
    y_test_pred_proba = predictor.predict_probability(X_test_processed)[:, 1]

    res_df = pd.DataFrame({'actual': y_test, 'predict': y_test_pred_proba})
    res_df.to_csv(file_organizer.get_output_filepath())

    '''TODO'''
    from scripts.LabTestAnalysis.lab_statistics.stats_utils import get_confusion_metrics
    from sklearn.metrics import roc_auc_score
    AUC = roc_auc_score(y_test, y_test_pred_proba)
    sensitivity, specificity, LR_p, LR_n, PPV, NPV = get_confusion_metrics(
        actual_labels=y_test.values, predict_probas=y_test_pred_proba, threshold=0.5)
    print("AUC: %s, sensitivity: %s, specificity: %s, LR_p: %s, LR_n: %s, PPV: %s, NPV: %s."
          % (AUC, sensitivity, specificity, LR_p, LR_n, PPV, NPV))
Example #21
Source File: Model.py From tbats with MIT License
def _fit_to_observations(self, y, starting_x):
    """Fits model with starting x to time series"""
    self.warnings = []
    self.is_fitted = False
    if self.validate_input:
        try:
            y = c1d(check_array(y, ensure_2d=False, force_all_finite=True,
                                ensure_min_samples=1, copy=True,
                                dtype=np.float64))  # type: np.ndarray
        except Exception as validation_exception:
            self.context.get_exception_handler().exception(
                "y series is invalid", error.InputArgsException,
                previous_exception=validation_exception)
    self.y = y
    yw = self._boxcox(y)

    matrix_builder = self.matrix
    w = matrix_builder.make_w_vector()
    g = matrix_builder.make_g_vector()
    F = matrix_builder.make_F_matrix()

    # initialize matrices
    yw_hat = np.asarray([0.0] * len(y))
    # x = np.matrix(np.zeros((len(params.x0), len(yw) + 1)))
    x = starting_x

    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        try:
            for t in range(0, len(y)):
                yw_hat[t] = w @ x
                e = yw[t] - yw_hat[t]
                x = F @ x + g * e
        except RuntimeWarning:
            # calculation issues, values close to max float value
            self.add_warning('Numeric calculation issues detected. Model is not usable.')
            self.is_fitted = False
            return self

    # store fit results
    self.x_last = x
    self.resid_boxcox = yw - yw_hat
    try:
        self.y_hat = self._inv_boxcox(yw_hat)
    except RuntimeWarning:
        self.add_warning('Box-Cox related numeric calculation issues detected. Model is not usable.')
        self.is_fitted = False
        return self
    self.resid = self.y - self.y_hat
    self.is_fitted = True
    self.aic = self.calculate_aic()
    return self
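The loop at the heart of _fit_to_observations is a linear innovations state-space filter: predict yw_hat[t] = w @ x, measure the innovation e = yw[t] - yw_hat[t], then update the state x = F @ x + g * e. A stripped-down sketch of just that recursion, using a toy one-dimensional local-level model rather than tbats matrices:

import numpy as np

# Toy local-level model: scalar state, w = F = 1, smoothing gain g.
w = np.array([1.0])
F = np.array([[1.0]])
g = np.array([0.3])
x = np.array([0.0])       # starting state (the method's starting_x)

y = np.array([1.0, 1.2, 0.9, 1.1])
y_hat = np.zeros_like(y)
for t in range(len(y)):
    y_hat[t] = w @ x      # one-step-ahead prediction
    e = y[t] - y_hat[t]   # innovation (prediction error)
    x = F @ x + g * e     # state update
print(y_hat)              # each prediction moves a fraction g toward the last observation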