Python sklearn.utils.validation.check_array() Examples

The following are 30 code examples of sklearn.utils.validation.check_array(), drawn from open-source projects; the project and source file for each example are noted above it. You may also want to check out all available functions/classes of the module sklearn.utils.validation, or try the search function.
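Before the examples, a minimal sketch (not from any of the projects below) of what check_array() does: it validates an array-like and returns a 2-D numpy array, with keyword arguments controlling copying, dtype, and whether non-finite values are allowed.

import numpy as np
from sklearn.utils.validation import check_array

X = check_array([[1, 2], [3, 4]])       # list of lists -> 2-D float ndarray
X_copy = check_array(X, copy=True)      # force a copy before mutating in place
X_nan = check_array([[1.0, np.nan]], force_all_finite=False)  # permit NaN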
Example #1
Source File: nfpc.py    From fylearn with MIT License
def fit(self, X, y):

        X = check_array(X)

        self.classes_, y = np.unique(y, return_inverse=True)

        if "?" in tuple(self.classes_):
            raise ValueError("nan not supported for class values")

        # build membership functions for each feature for each class
        learned = [
            learn_class(X, y, y == idx, self.membership_factory, self.aggregation_factory)
            for idx, class_value in enumerate(self.classes_)
        ]

        logger.info("learned %s" % (str(learned),))

        self.protos_ = [x[0] for x in learned]
        self.aggregations_ = [x[1] for x in learned]
        self.selection_method_ = self.selection_factory(X, y)

        return self 
Example #2
Source File: fpcga.py    From fylearn with MIT License
def predict(self, X):
        """

        Predict outputs given examples.

        Parameters:
        -----------

        X : the examples to predict (array or matrix)

        Returns:
        --------

        y_pred : Predicted values for each row in the matrix.

        """
        if self.protos_ is None:
            raise Exception("Prototypes not initialized. Perform a fit first.")

        X = check_array(X)

        # predict
        return _predict(self.protos_, self.aggregation, self.classes_, X) 
Example #3
Source File: fpcga.py    From fylearn with MIT License
def fit(self, X, y_orig):

        def as_factory(r):
            return r if isinstance(r, AggregationRuleFactory) else DummyAggregationRuleFactory(r)

        self.aggregation_rules__ = [as_factory(r) for r in self.aggregation_rules]
        
        X = check_array(X)

        self.classes_, _ = np.unique(y_orig, return_inverse=True)
        self.m = X.shape[1]

        # note: "np.nan in self.classes_" would never match (NaN != NaN), so test explicitly
        if any(isinstance(c, float) and np.isnan(c) for c in self.classes_):
            raise ValueError("nan not supported for class values")

        self.build_with_ga(X, y_orig)

        return self 
Example #4
Source File: rafpc.py    From fylearn with MIT License
def predict(self, X):
        """

        Predict outputs given examples.

        Parameters:
        -----------

        X : the examples to predict (array or matrix)

        Returns:
        --------

        y_pred : Predicted values for each row in the matrix.

        """
        if self.protos_ is None:
            raise Exception("Prototypes not initialized. Perform a fit first.")

        X = check_array(X)

        # predict
        return _predict_multi(self.protos_, self.aggregation, self.classes_, X, self.n_features) 
Example #5
Source File: garules.py    From fylearn with MIT License
def fit(self, X, y):
        X = check_array(X)

        self.classes_, _ = np.unique(y, return_inverse=True)

        # construct distance measure
        self.distance_ = self.df(X)

        # build models
        models = np.zeros((len(self.classes_), X.shape[1]))
        for c_idx, c_value in enumerate(self.classes_):
            models[c_idx, :] = self.build_for_class(X[y == c_value])

        self.models_ = models

        return self 
Example #6
Source File: predictive_imputer.py    From predictive_imputer with MIT License
def transform(self, X):
        check_is_fitted(self, ['statistics_', 'estimators_', 'gamma_'])
        X = check_array(X, copy=True, dtype=np.float64, force_all_finite=False)
        if X.shape[1] != self.statistics_.shape[1]:
            raise ValueError("X has %d features per sample, expected %d"
                             % (X.shape[1], self.statistics_.shape[1]))

        X_nan = np.isnan(X)
        imputed = self.initial_imputer.transform(X)

        if len(self.estimators_) > 1:
            for i, estimator_ in enumerate(self.estimators_):
                X_s = np.delete(imputed, i, 1)
                y_nan = X_nan[:, i]

                X_unk = X_s[y_nan]
                if len(X_unk) > 0:
                    X[y_nan, i] = estimator_.predict(X_unk)

        else:
            estimator_ = self.estimators_[0]
            X[X_nan] = estimator_.inverse_transform(estimator_.transform(imputed))[X_nan]

        return X 
Example #7
Source File: event.py    From brainiak with Apache License 2.0
def predict(self, X):
        """Applies learned event segmentation to new testing dataset

        Alternative function for segmenting a new dataset after using
        fit() to learn a sequence of events, to comply with the sklearn
        Classifier interface

        Parameters
        ----------
        X: timepoint by voxel ndarray
            fMRI data to segment based on previously-learned event patterns

        Returns
        -------
        Event label for each timepoint
        """
        check_is_fitted(self, ["event_pat_", "event_var_"])
        X = check_array(X)
        segments, test_ll = self.find_events(X)
        return np.argmax(segments, axis=1) 
Example #8
Source File: mcd.py    From pyod with BSD 2-Clause "Simplified" License
def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        # Compute the Mahalanobis distance of the samples
        return self.detector_.mahalanobis(X) 
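For reference, a minimal usage sketch of this method (assuming only pyod and numpy; data is synthetic):

import numpy as np
from pyod.models.mcd import MCD

rng = np.random.RandomState(0)
clf = MCD().fit(rng.randn(100, 3))                # fit on training data
scores = clf.decision_function(rng.randn(10, 3))  # higher = more anomalous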
Example #9
Source File: nfpc.py    From fylearn with MIT License
def fit(self, X, y):

        X = check_array(X)

        self.classes_, y = np.unique(y, return_inverse=True)

        if "?" in tuple(self.classes_):
            raise ValueError("nan not supported for class values")

        # build membership functions for each feature for each class
        self.protos_ = [
            build_memberships(X, y == idx, self.membership_factory)
            for idx, class_value in enumerate(self.classes_)
        ]

        # build aggregation
        self.aggregation_ = self.aggregation_factory(self.protos_, X, y, self.classes_)

        return self 
Example #10
Source File: arima.py    From pmdarima with MIT License
def _seasonal_prediction_with_confidence(arima_res, start, end, exog, alpha,
                                         **kwargs):
    """Compute the prediction for a SARIMAX and get a conf interval

    Unfortunately, SARIMAX does not provide a nice way to get the
    confidence intervals out of the box, so we call ``get_prediction``
    here and unpack the confidence intervals manually.

    Notes
    -----
    For internal use only.
    """
    results = arima_res.get_prediction(
        start=start,
        end=end,
        exog=exog,
        **kwargs)

    f = results.predicted_mean
    conf_int = results.conf_int(alpha=alpha)
    return check_endog(f, dtype=None, copy=False), \
        check_array(conf_int, copy=False, dtype=None) 
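For context, a minimal sketch of the get_prediction/conf_int calls this helper wraps (assuming statsmodels; data is synthetic):

import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

y = np.random.RandomState(0).standard_normal(100).cumsum()
res = SARIMAX(y, order=(1, 1, 0)).fit(disp=0)
pred = res.get_prediction(start=100, end=109)  # 10-step out-of-sample forecast
mean = pred.predicted_mean                     # point forecasts
ci = pred.conf_int(alpha=0.05)                 # (10, 2) lower/upper bounds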
Example #11
Source File: array.py    From pmdarima with MIT License
def _diff_inv_matrix(x, lag, differences, xi):
    n, m = x.shape
    y = np.zeros((n + lag * differences, m), dtype=DTYPE)

    if m >= 1:  # todo: R checks this. do we need to?
        # R: if(missing(xi)) xi <- matrix(0.0, lag*differences, m)
        if xi is None:
            xi = np.zeros((lag * differences, m), dtype=DTYPE)
        else:
            xi = check_array(
                xi, dtype=DTYPE, copy=False, force_all_finite=False,
                ensure_2d=True)
            if xi.shape != (lag * differences, m):
                raise IndexError('"xi" does not have the right shape')

        # TODO: can we vectorize?
        for i in range(m):
            y[:, i] = _diff_inv_vector(x[:, i], lag, differences, xi[:, i])

    return y 
Example #12
Source File: fpt.py    From fylearn with MIT License
def fit(self, X, y):

        X = check_array(X)

        self.classes_, y = np.unique(y, return_inverse=True)

        # note: "np.nan in self.classes_" would never match (NaN != NaN), so test explicitly
        if any(isinstance(c, float) and np.isnan(c) for c in self.classes_):
            raise ValueError("nan not supported for class values")

        self.trees_ = {}

        # build membership functions
        P = []
        for feature_idx, feature in enumerate(X.T):
            P.extend(self.fuzzifier(feature_idx, feature))

        # build the pattern tree for each class
        for class_idx, class_value in enumerate(self.classes_):
            class_vector = np.zeros(len(y))
            class_vector[y == class_idx] = 1.0
            root = self.build_for_class(X, y, class_vector, list(P))
            self.trees_[class_idx] = root

        return self 
Example #13
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Compute the lower and upper quantile cutoffs, columns to transform, and nonnegative columns.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data array to transform. Must be numeric, non-sparse, and two-dimensional.

        Returns
        -------
        self : LogExtremeValueTransformer
        """
        super().fit(X)
        X = check_array(X)
        self.nonnegative_cols_ = [j for j in range(self.n_input_features_) if np.all(X[:, j] >= 0)]
        return self 
Example #14
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Compute the lower and upper quantile cutoffs, columns to transform, and each column's quantiles.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data array to transform. Must be numeric, non-sparse, and two-dimensional.

        Returns
        -------
        self : QuantileExtremeValueTransformer
        """
        super().fit(X)
        X = check_array(X)
        self.quantile_transformer_ = QuantileTransformer(random_state=0, copy=True)
        self.quantile_transformer_.fit(X)
        return self 
Example #15
Source File: util.py    From skutil with BSD 3-Clause "New" or "Revised" License
def from_array(X, column_names=None):
    """A simple wrapper for H2OFrame.from_python. This takes a
    numpy array (or 2d array) and returns an H2OFrame with all 
    the default args.

    Parameters
    ----------

    X : ndarray
        The array to convert.

    column_names : list, tuple (default=None)
        The names to use for the columns.

    Returns
    -------

    H2OFrame
    """
    X = check_array(X, force_all_finite=False)
    return from_pandas(pd.DataFrame.from_records(data=X, columns=column_names)) 
Example #16
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Learn empirical variances from X.
        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            Input samples from which to check uniqueness.

        Returns
        -------
        self
        """
        X = check_array(X, force_all_finite=False)
        _, self.n_input_features_ = X.shape
        all_nan_cols = np.all(np.isnan(X), axis=0)
        self.cols_to_transform_ = np.logical_or(
            np.array([np.unique(X[:, j]).size == 1 for j in range(self.n_input_features_)]), all_nan_cols
        )
        return self 
Example #17
Source File: fpt.py    From fylearn with MIT License
def predict(self, X):
        """Predict class for X.

        Parameters
        ----------
        X : Array-like of shape [n_samples, n_features]
            The input to classify.

        Returns
        -------
        y : array of shape = [n_samples]
            The predicted classes.
        """

        X = check_array(X)

        if self.trees_ is None:
            raise Exception("Pattern trees not initialized. Perform a fit first.")

        y_classes = np.zeros((X.shape[0], len(self.classes_)))
        for i, c in enumerate(self.classes_):
            y_classes[:, i] = self.trees_[i](X)

        # predict the maximum value
        return self.classes_.take(np.argmax(y_classes, -1)) 
Example #18
Source File: nb_sklearn.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License
def _joint_log_likelihood(self, X):
        """Calculate the posterior log probability of the samples X"""
        check_is_fitted(self, "classes_")

        X = check_array(X, accept_sparse='csr')
        X_bin = self._transform_data(X)

        n_classes, n_features = self.feature_log_prob_.shape
        n_samples, n_features_X = X_bin.shape

        if n_features_X != n_features:
            raise ValueError(
                "Expected input with %d features, got %d instead" %
                (n_features, n_features_X))

        # see chapter 4.1 of http://www.cs.columbia.edu/~mcollins/em.pdf
        # implementation as in Formula 4.
        jll = safe_sparse_dot(X_bin, self.feature_log_prob_.T)
        jll += self.class_log_prior_

        return jll 
Example #19
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def _validate_input(self, X):
        if self._is_constant_multicolumn_imputation():
            if len(self.fill_values) != X.shape[1]:
                raise ValueError(
                    "'fill_values' should have length equal to number of features in X {num_features}, "
                    "got {fill_values_length}".format(num_features=X.shape[1], fill_values_length=len(self.fill_values))
                )

        dtype = self.dtype or np.dtype("O")

        if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))

        return check_array(X, dtype=dtype, copy=True, force_all_finite=False, ensure_2d=True) 
Example #20
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Compute the lower and upper quantile cutoffs and which columns to transform.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data array to transform. Must be numeric, non-sparse, and two-dimensional.

        Returns
        -------
        self : BaseExtremeValueTransformer
        """
        if not 0 <= self.quantile <= 100:
            raise ValueError(
                "Parameter `quantile` {} is invalid. `quantile` must be an integer between 0 and 100".format(
                    self.quantile
                )
            )

        X = check_array(X)
        _, self.n_input_features_ = X.shape

        self.quantiles_ = np.percentile(X, [100 - self.quantile, self.quantile], axis=0)

        nonstandard_threshold_stds = self.threshold_std * np.std(X, axis=0)
        col_means = np.mean(X, axis=0)
        threshold_upper_bound = nonstandard_threshold_stds + col_means
        threshold_lower_bound = -nonstandard_threshold_stds + col_means

        self.cols_to_transform_ = [
            j
            for j in range(self.n_input_features_)
            if self.quantiles_[0, j] < threshold_lower_bound[j] or self.quantiles_[1, j] > threshold_upper_bound[j]
        ]

        return self 
Example #21
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def _validate_input(self, X):
        if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))

        return check_array(X, dtype=np.dtype("O"), copy=True, force_all_finite=False, ensure_2d=True) 
Example #22
Source File: date_time.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def transform(self, X, y=None):
        X = check_array(X, dtype=None, force_all_finite="allow-nan")
        check_is_fitted(self, "extract_")

        X = np.array(X)
        X = self._to_datetime_array(X)

        return self._convert(X, self.mode) 
Example #23
Source File: test_estimator_checks.py    From Mastering-Elasticsearch-7.0 with MIT License
def predict(self, X):
        X = check_array(X)
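        # Intentionally mutates state in predict: test stubs like this exist so
        # the estimator checks can verify that such mutation is flagged.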
        self.key = 1000
        return np.ones(X.shape[0]) 
Example #24
Source File: stat_models.py    From pyod with BSD 2-Clause "Simplified" License
def pearsonr_mat(mat, w=None):
    """Utility function to calculate pearson matrix (row-wise).

    Parameters
    ----------
    mat : numpy array of shape (n_samples, n_features)
        Input matrix.

    w : numpy array of shape (n_features,)
        Weights.

    Returns
    -------
    pear_mat : numpy array of shape (n_samples, n_samples)
        Row-wise Pearson score matrix.

    """
    mat = check_array(mat)
    n_row = mat.shape[0]
    n_col = mat.shape[1]
    pear_mat = np.full([n_row, n_row], 1).astype(float)

    if w is not None:
        for cx in range(n_row):
            for cy in range(cx + 1, n_row):
                curr_pear = wpearsonr(mat[cx, :], mat[cy, :], w)
                pear_mat[cx, cy] = curr_pear
                pear_mat[cy, cx] = curr_pear
    else:
        for cx in range(n_row):  # iterate rows; pear_mat is (n_row, n_row)
            for cy in range(cx + 1, n_row):
                curr_pear = pearsonr(mat[cx, :], mat[cy, :])[0]
                pear_mat[cx, cy] = curr_pear
                pear_mat[cy, cx] = curr_pear

    return pear_mat 
Example #25
Source File: test_ranking.py    From Mastering-Elasticsearch-7.0 with MIT License
def _my_lrap(y_true, y_score):
    """Simple implementation of label ranking average precision"""
    check_consistent_length(y_true, y_score)
    y_true = check_array(y_true)
    y_score = check_array(y_score)
    n_samples, n_labels = y_true.shape
    score = np.empty((n_samples, ))
    for i in range(n_samples):
        # The best rank corresponds to 1. Ranks higher than 1 are worse.
        # The best inverse ranking corresponds to n_labels.
        unique_rank, inv_rank = np.unique(y_score[i], return_inverse=True)
        n_ranks = unique_rank.size
        rank = n_ranks - inv_rank

        # Ranks need to be corrected to take ties into account:
        # e.g., two labels tied at rank 1 both become rank 2.
        corr_rank = np.bincount(rank, minlength=n_ranks + 1).cumsum()
        rank = corr_rank[rank]

        relevant = y_true[i].nonzero()[0]
        if relevant.size == 0 or relevant.size == n_labels:
            score[i] = 1
            continue

        score[i] = 0.
        for label in relevant:
            # Count the number of relevant labels with a better
            # (i.e., smaller) rank.
            n_ranked_above = sum(rank[r] <= rank[label] for r in relevant)

            # Weight by the rank of the actual label
            score[i] += n_ranked_above / rank[label]

        score[i] /= relevant.size

    return score.mean() 
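Since this is a test helper, an illustrative sanity check against sklearn's built-in metric (values taken from the sklearn docs):

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0], [0, 0, 1]])
y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
assert np.isclose(_my_lrap(y_true, y_score),
                  label_ranking_average_precision_score(y_true, y_score))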
Example #26
Source File: split.py    From nyaggle with MIT License
def split(self, X, y, groups=None):
        """
        Generate indices to split data into training and test set.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(StratifiedGroupKFold, self).split(X, y, groups) 
Example #27
Source File: lof.py    From pyod with BSD 2-Clause "Simplified" License
def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                            algorithm=self.algorithm,
                                            leaf_size=self.leaf_size,
                                            metric=self.metric,
                                            p=self.p,
                                            metric_params=self.metric_params,
                                            contamination=self.contamination,
                                            n_jobs=self.n_jobs)
        self.detector_.fit(X=X, y=y)

        # Invert decision_scores_. Outliers come with higher outlier scores
        self.decision_scores_ = invert_order(
            self.detector_.negative_outlier_factor_)
        self._process_decision_scores()
        return self 
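A short usage sketch (synthetic data; assuming only pyod and numpy):

import numpy as np
from pyod.models.lof import LOF

X_train = np.random.RandomState(42).randn(200, 2)
clf = LOF(n_neighbors=20)
clf.fit(X_train)               # unsupervised; y is ignored
scores = clf.decision_scores_  # outlier scores on the training set
labels = clf.labels_           # binary labels (0 = inlier, 1 = outlier)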
Example #28
Source File: pca.py    From pyod with BSD 2-Clause "Simplified" License
def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['components_', 'w_components_'])

        X = check_array(X)
        if self.standardization:
            X = self.scaler_.transform(X)

        return np.sum(
            cdist(X, self.selected_components_) / self.selected_w_components_,
            axis=1).ravel() 
Example #29
Source File: mcd.py    From pyod with BSD 2-Clause "Simplified" License
def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use the Mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self 
Example #30
Source File: genetic.py    From gplearn with BSD 3-Clause "New" or "Revised" License
def transform(self, X):
        """Transform X according to the fitted transformer.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Input vectors, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape = [n_samples, n_components]
            Transformed array.

        """
        if not hasattr(self, '_best_programs'):
            raise NotFittedError('SymbolicTransformer not fitted.')

        X = check_array(X)
        _, n_features = X.shape
        if self.n_features_ != n_features:
            raise ValueError('Number of features of the model must match the '
                             'input. Model n_features is %s and input '
                             'n_features is %s.'
                             % (self.n_features_, n_features))

        X_new = np.array([gp.execute(X) for gp in self._best_programs]).T

        return X_new