Python sklearn.utils.validation.check_array() Examples

The following are 30 code examples of sklearn.utils.validation.check_array(), drawn from open-source projects; the project and source file for each example are noted above it. You may also want to check out all available functions/classes of the module sklearn.utils.validation, or try the search function.
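Before the examples, a minimal sketch (not from any of the projects below) of what check_array() does: it validates an array-like and returns a 2-D numpy array, with keyword arguments controlling copying, dtype, and whether non-finite values are allowed.

import numpy as np
from sklearn.utils.validation import check_array

X = check_array([[1, 2], [3, 4]])       # list of lists -> 2-D float ndarray
X_copy = check_array(X, copy=True)      # force a copy before mutating in place
X_nan = check_array([[1.0, np.nan]], force_all_finite=False)  # permit NaN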
Example #1
Source File: nfpc.py    From fylearn with MIT License
def fit(self, X, y):

        X = check_array(X)

        self.classes_, y = np.unique(y, return_inverse=True)

        if "?" in tuple(self.classes_):
            raise ValueError("nan not supported for class values")

        # build membership functions for each feature for each class
        learned = [
            learn_class(X, y, y == idx, self.membership_factory, self.aggregation_factory)
            for idx, class_value in enumerate(self.classes_)
        ]

        logger.info("learned %s" % (str(learned),))

        self.protos_ = [x[0] for x in learned]
        self.aggregations_ = [x[1] for x in learned]
        self.selection_method_ = self.selection_factory(X, y)

        return self 
Example #2
Source File: fpcga.py    From fylearn with MIT License
def predict(self, X):
        """

        Predict outputs given examples.

        Parameters:
        -----------

        X : the examples to predict (array or matrix)

        Returns:
        --------

        y_pred : Predicted values for each row in the matrix.

        """
        if self.protos_ is None:
            raise Exception("Prototypes not initialized. Perform a fit first.")

        X = check_array(X)

        # predict
        return _predict(self.protos_, self.aggregation, self.classes_, X) 
Example #3
Source File: fpcga.py    From fylearn with MIT License
def fit(self, X, y_orig):

        def as_factory(r):
            return r if isinstance(r, AggregationRuleFactory) else DummyAggregationRuleFactory(r)

        self.aggregation_rules__ = [as_factory(r) for r in self.aggregation_rules]
        
        X = check_array(X)

        self.classes_, _ = np.unique(y_orig, return_inverse=True)
        self.m = X.shape[1]

        # note: "np.nan in self.classes_" would never match (NaN != NaN), so test explicitly
        if any(isinstance(c, float) and np.isnan(c) for c in self.classes_):
            raise ValueError("nan not supported for class values")

        self.build_with_ga(X, y_orig)

        return self 
Example #4
Source File: rafpc.py    From fylearn with MIT License
def predict(self, X):
        """

        Predict outputs given examples.

        Parameters:
        -----------

        X : the examples to predict (array or matrix)

        Returns:
        --------

        y_pred : Predicted values for each row in the matrix.

        """
        if self.protos_ is None:
            raise Exception("Prototypes not initialized. Perform a fit first.")

        X = check_array(X)

        # predict
        return _predict_multi(self.protos_, self.aggregation, self.classes_, X, self.n_features) 
Example #5
Source File: garules.py    From fylearn with MIT License
def fit(self, X, y):
        X = check_array(X)

        self.classes_, _ = np.unique(y, return_inverse=True)

        # construct distance measure
        self.distance_ = self.df(X)

        # build models
        models = np.zeros((len(self.classes_), X.shape[1]))
        for c_idx, c_value in enumerate(self.classes_):
            models[c_idx, :] = self.build_for_class(X[y == c_value])

        self.models_ = models

        return self 
Example #6
Source File: predictive_imputer.py    From predictive_imputer with MIT License
def transform(self, X):
        check_is_fitted(self, ['statistics_', 'estimators_', 'gamma_'])
        X = check_array(X, copy=True, dtype=np.float64, force_all_finite=False)
        if X.shape[1] != self.statistics_.shape[1]:
            raise ValueError("X has %d features per sample, expected %d"
                             % (X.shape[1], self.statistics_.shape[1]))

        X_nan = np.isnan(X)
        imputed = self.initial_imputer.transform(X)

        if len(self.estimators_) > 1:
            for i, estimator_ in enumerate(self.estimators_):
                X_s = np.delete(imputed, i, 1)
                y_nan = X_nan[:, i]

                X_unk = X_s[y_nan]
                if len(X_unk) > 0:
                    X[y_nan, i] = estimator_.predict(X_unk)

        else:
            estimator_ = self.estimators_[0]
            X[X_nan] = estimator_.inverse_transform(estimator_.transform(imputed))[X_nan]

        return X 
Example #7
Source File: event.py    From brainiak with Apache License 2.0
def predict(self, X):
        """Applies learned event segmentation to new testing dataset

        Alternative function for segmenting a new dataset after using
        fit() to learn a sequence of events, to comply with the sklearn
        Classifier interface

        Parameters
        ----------
        X: timepoint by voxel ndarray
            fMRI data to segment based on previously-learned event patterns

        Returns
        -------
        Event label for each timepoint
        """
        check_is_fitted(self, ["event_pat_", "event_var_"])
        X = check_array(X)
        segments, test_ll = self.find_events(X)
        return np.argmax(segments, axis=1) 
Example #8
Source File: mcd.py    From pyod with BSD 2-Clause "Simplified" License
def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)

        # Compute the Mahalanobis distance of the samples
        return self.detector_.mahalanobis(X) 
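For reference, a minimal usage sketch of this method (assuming only pyod and numpy; data is synthetic):

import numpy as np
from pyod.models.mcd import MCD

rng = np.random.RandomState(0)
clf = MCD().fit(rng.randn(100, 3))                # fit on training data
scores = clf.decision_function(rng.randn(10, 3))  # higher = more anomalous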
Example #9
Source File: nfpc.py    From fylearn with MIT License
def fit(self, X, y):

        X = check_array(X)

        self.classes_, y = np.unique(y, return_inverse=True)

        if "?" in tuple(self.classes_):
            raise ValueError("nan not supported for class values")

        # build membership functions for each feature for each class
        self.protos_ = [
            build_memberships(X, y == idx, self.membership_factory)
            for idx, class_value in enumerate(self.classes_)
        ]

        # build aggregation
        self.aggregation_ = self.aggregation_factory(self.protos_, X, y, self.classes_)

        return self 
Example #10
Source File: arima.py    From pmdarima with MIT License
def _seasonal_prediction_with_confidence(arima_res, start, end, exog, alpha,
                                         **kwargs):
    """Compute the prediction for a SARIMAX and get a conf interval

    Unfortunately, SARIMAX does not provide a nice way to get the
    confidence intervals out of the box, so we call ``get_prediction``
    here and unpack the confidence intervals manually.

    Notes
    -----
    For internal use only.
    """
    results = arima_res.get_prediction(
        start=start,
        end=end,
        exog=exog,
        **kwargs)

    f = results.predicted_mean
    conf_int = results.conf_int(alpha=alpha)
    return check_endog(f, dtype=None, copy=False), \
        check_array(conf_int, copy=False, dtype=None) 
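For context, a minimal sketch of the get_prediction/conf_int calls this helper wraps (assuming statsmodels; data is synthetic):

import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

y = np.random.RandomState(0).standard_normal(100).cumsum()
res = SARIMAX(y, order=(1, 1, 0)).fit(disp=0)
pred = res.get_prediction(start=100, end=109)  # 10-step out-of-sample forecast
mean = pred.predicted_mean                     # point forecasts
ci = pred.conf_int(alpha=0.05)                 # (10, 2) lower/upper bounds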
Example #11
Source File: array.py    From pmdarima with MIT License
def _diff_inv_matrix(x, lag, differences, xi):
    n, m = x.shape
    y = np.zeros((n + lag * differences, m), dtype=DTYPE)

    if m >= 1:  # todo: R checks this. do we need to?
        # R: if(missing(xi)) xi <- matrix(0.0, lag*differences, m)
        if xi is None:
            xi = np.zeros((lag * differences, m), dtype=DTYPE)
        else:
            xi = check_array(
                xi, dtype=DTYPE, copy=False, force_all_finite=False,
                ensure_2d=True)
            if xi.shape != (lag * differences, m):
                raise IndexError('"xi" does not have the right shape')

        # TODO: can we vectorize?
        for i in range(m):
            y[:, i] = _diff_inv_vector(x[:, i], lag, differences, xi[:, i])

    return y 
Example #12
Source File: fpt.py    From fylearn with MIT License
def fit(self, X, y):

        X = check_array(X)

        self.classes_, y = np.unique(y, return_inverse=True)

        # note: "np.nan in self.classes_" would never match (NaN != NaN), so test explicitly
        if any(isinstance(c, float) and np.isnan(c) for c in self.classes_):
            raise ValueError("nan not supported for class values")

        self.trees_ = {}

        # build membership functions
        P = []
        for feature_idx, feature in enumerate(X.T):
            P.extend(self.fuzzifier(feature_idx, feature))

        # build the pattern tree for each class
        for class_idx, class_value in enumerate(self.classes_):
            class_vector = np.zeros(len(y))
            class_vector[y == class_idx] = 1.0
            root = self.build_for_class(X, y, class_vector, list(P))
            self.trees_[class_idx] = root

        return self 
Example #13
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Compute the lower and upper quantile cutoffs, columns to transform, and nonnegative columns.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data array to transform. Must be numeric, non-sparse, and two-dimensional.

        Returns
        -------
        self : LogExtremeValueTransformer
        """
        super().fit(X)
        X = check_array(X)
        self.nonnegative_cols_ = [j for j in range(self.n_input_features_) if np.all(X[:, j] >= 0)]
        return self 
Example #14
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Compute the lower and upper quantile cutoffs, columns to transform, and each column's quantiles.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data array to transform. Must be numeric, non-sparse, and two-dimensional.

        Returns
        -------
        self : QuantileExtremeValueTransformer
        """
        super().fit(X)
        X = check_array(X)
        self.quantile_transformer_ = QuantileTransformer(random_state=0, copy=True)
        self.quantile_transformer_.fit(X)
        return self 
Example #15
Source File: util.py    From skutil with BSD 3-Clause "New" or "Revised" License
def from_array(X, column_names=None):
    """A simple wrapper for H2OFrame.from_python. This takes a
    numpy array (or 2d array) and returns an H2OFrame with all 
    the default args.

    Parameters
    ----------

    X : ndarray
        The array to convert.

    column_names : list, tuple (default=None)
        The names to use for the columns.

    Returns
    -------

    H2OFrame
    """
    X = check_array(X, force_all_finite=False)
    return from_pandas(pd.DataFrame.from_records(data=X, columns=column_names)) 
Example #16
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Learn empirical variances from X.
        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            Input samples from which to check uniqueness.

        Returns
        -------
        self
        """
        X = check_array(X, force_all_finite=False)
        _, self.n_input_features_ = X.shape
        all_nan_cols = np.all(np.isnan(X), axis=0)
        self.cols_to_transform_ = np.logical_or(
            np.array([np.unique(X[:, j]).size == 1 for j in range(self.n_input_features_)]), all_nan_cols
        )
        return self 
Example #17
Source File: fpt.py    From fylearn with MIT License
def predict(self, X):
        """Predict class for X.

        Parameters
        ----------
        X : Array-like of shape [n_samples, n_features]
            The input to classify.

        Returns
        -------
        y : array of shape = [n_samples]
            The predicted classes.
        """

        X = check_array(X)

        if self.trees_ is None:
            raise Exception("Pattern trees not initialized. Perform a fit first.")

        y_classes = np.zeros((X.shape[0], len(self.classes_)))
        for i, c in enumerate(self.classes_):
            y_classes[:, i] = self.trees_[i](X)

        # predict the maximum value
        return self.classes_.take(np.argmax(y_classes, -1)) 
Example #18
Source File: nb_sklearn.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License
def _joint_log_likelihood(self, X):
        """Calculate the posterior log probability of the samples X"""
        check_is_fitted(self, "classes_")

        X = check_array(X, accept_sparse='csr')
        X_bin = self._transform_data(X)

        n_classes, n_features = self.feature_log_prob_.shape
        n_samples, n_features_X = X_bin.shape

        if n_features_X != n_features:
            raise ValueError(
                "Expected input with %d features, got %d instead" %
                (n_features, n_features_X))

        # see chapter 4.1 of http://www.cs.columbia.edu/~mcollins/em.pdf
        # implementation as in Formula 4.
        jll = safe_sparse_dot(X_bin, self.feature_log_prob_.T)
        jll += self.class_log_prior_

        return jll 
Example #19
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def _validate_input(self, X):
        if self._is_constant_multicolumn_imputation():
            if len(self.fill_values) != X.shape[1]:
                raise ValueError(
                    "'fill_values' should have length equal to number of features in X {num_features}, "
                    "got {fill_values_length}".format(num_features=X.shape[1], fill_values_length=len(self.fill_values))
                )

        dtype = self.dtype or np.dtype("O")

        if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))

        return check_array(X, dtype=dtype, copy=True, force_all_finite=False, ensure_2d=True) 
Example #20
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def fit(self, X, y=None):
        """Compute the lower and upper quantile cutoffs and which columns to transform.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data array to transform. Must be numeric, non-sparse, and two-dimensional.

        Returns
        -------
        self : BaseExtremeValueTransformer
        """
        if not 0 <= self.quantile <= 100:
            raise ValueError(
                "Parameter `quantile` {} is invalid. `quantile` must be an integer between 0 and 100".format(
                    self.quantile
                )
            )

        X = check_array(X)
        _, self.n_input_features_ = X.shape

        self.quantiles_ = np.percentile(X, [100 - self.quantile, self.quantile], axis=0)

        nonstandard_threshold_stds = self.threshold_std * np.std(X, axis=0)
        col_means = np.mean(X, axis=0)
        threshold_upper_bound = nonstandard_threshold_stds + col_means
        threshold_lower_bound = -nonstandard_threshold_stds + col_means

        self.cols_to_transform_ = [
            j
            for j in range(self.n_input_features_)
            if self.quantiles_[0, j] < threshold_lower_bound[j] or self.quantiles_[1, j] > threshold_upper_bound[j]
        ]

        return self 
Example #21
Source File: base.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def _validate_input(self, X):
        if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))

        return check_array(X, dtype=np.dtype("O"), copy=True, force_all_finite=False, ensure_2d=True) 
Example #22
Source File: date_time.py    From sagemaker-scikit-learn-extension with Apache License 2.0
def transform(self, X, y=None):
        X = check_array(X, dtype=None, force_all_finite="allow-nan")
        check_is_fitted(self, "extract_")

        X = np.array(X)
        X = self._to_datetime_array(X)

        return self._convert(X, self.mode) 
Example #23
Source File: test_estimator_checks.py    From Mastering-Elasticsearch-7.0 with MIT License
def predict(self, X):
        X = check_array(X)
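        # Intentionally mutates state in predict: test stubs like this exist so
        # the estimator checks can verify that such mutation is flagged.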
        self.key = 1000
        return np.ones(X.shape[0]) 
Example #24
Source File: stat_models.py    From pyod with BSD 2-Clause "Simplified" License
def pearsonr_mat(mat, w=None):
    """Utility function to calculate pearson matrix (row-wise).

    Parameters
    ----------
    mat : numpy array of shape (n_samples, n_features)
        Input matrix.

    w : numpy array of shape (n_features,)
        Weights.

    Returns
    -------
    pear_mat : numpy array of shape (n_samples, n_samples)
        Row-wise Pearson score matrix.

    """
    mat = check_array(mat)
    n_row = mat.shape[0]
    n_col = mat.shape[1]
    pear_mat = np.full([n_row, n_row], 1).astype(float)

    if w is not None:
        for cx in range(n_row):
            for cy in range(cx + 1, n_row):
                curr_pear = wpearsonr(mat[cx, :], mat[cy, :], w)
                pear_mat[cx, cy] = curr_pear
                pear_mat[cy, cx] = curr_pear
    else:
        for cx in range(n_row):  # iterate rows; pear_mat is (n_row, n_row)
            for cy in range(cx + 1, n_row):
                curr_pear = pearsonr(mat[cx, :], mat[cy, :])[0]
                pear_mat[cx, cy] = curr_pear
                pear_mat[cy, cx] = curr_pear

    return pear_mat 
Example #25
Source File: test_ranking.py    From Mastering-Elasticsearch-7.0 with MIT License
def _my_lrap(y_true, y_score):
    """Simple implementation of label ranking average precision"""
    check_consistent_length(y_true, y_score)
    y_true = check_array(y_true)
    y_score = check_array(y_score)
    n_samples, n_labels = y_true.shape
    score = np.empty((n_samples, ))
    for i in range(n_samples):
        # The best rank corresponds to 1. Ranks higher than 1 are worse.
        # The best inverse ranking corresponds to n_labels.
        unique_rank, inv_rank = np.unique(y_score[i], return_inverse=True)
        n_ranks = unique_rank.size
        rank = n_ranks - inv_rank

        # Ranks need to be corrected to take ties into account:
        # e.g., two labels tied at rank 1 both become rank 2.
        corr_rank = np.bincount(rank, minlength=n_ranks + 1).cumsum()
        rank = corr_rank[rank]

        relevant = y_true[i].nonzero()[0]
        if relevant.size == 0 or relevant.size == n_labels:
            score[i] = 1
            continue

        score[i] = 0.
        for label in relevant:
            # Count the number of relevant labels with a better
            # (i.e., smaller) rank.
            n_ranked_above = sum(rank[r] <= rank[label] for r in relevant)

            # Weight by the rank of the actual label
            score[i] += n_ranked_above / rank[label]

        score[i] /= relevant.size

    return score.mean() 
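Since this is a test helper, an illustrative sanity check against sklearn's built-in metric (values taken from the sklearn docs):

import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

y_true = np.array([[1, 0, 0], [0, 0, 1]])
y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
assert np.isclose(_my_lrap(y_true, y_score),
                  label_ranking_average_precision_score(y_true, y_score))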
Example #26
Source File: split.py    From nyaggle with MIT License
def split(self, X, y, groups=None):
        """
        Generate indices to split data into training and test set.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(StratifiedGroupKFold, self).split(X, y, groups) 
Example #27
Source File: lof.py    From pyod with BSD 2-Clause "Simplified" License
def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = LocalOutlierFactor(n_neighbors=self.n_neighbors,
                                            algorithm=self.algorithm,
                                            leaf_size=self.leaf_size,
                                            metric=self.metric,
                                            p=self.p,
                                            metric_params=self.metric_params,
                                            contamination=self.contamination,
                                            n_jobs=self.n_jobs)
        self.detector_.fit(X=X, y=y)

        # Invert decision_scores_. Outliers come with higher outlier scores
        self.decision_scores_ = invert_order(
            self.detector_.negative_outlier_factor_)
        self._process_decision_scores()
        return self 
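A short usage sketch (synthetic data; assuming only pyod and numpy):

import numpy as np
from pyod.models.lof import LOF

X_train = np.random.RandomState(42).randn(200, 2)
clf = LOF(n_neighbors=20)
clf.fit(X_train)               # unsupervised; y is ignored
scores = clf.decision_scores_  # outlier scores on the training set
labels = clf.labels_           # binary labels (0 = inlier, 1 = outlier)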
Example #28
Source File: pca.py    From pyod with BSD 2-Clause "Simplified" License
def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['components_', 'w_components_'])

        X = check_array(X)
        if self.standardization:
            X = self.scaler_.transform(X)

        return np.sum(
            cdist(X, self.selected_components_) / self.selected_w_components_,
            axis=1).ravel() 
Example #29
Source File: mcd.py    From pyod with BSD 2-Clause "Simplified" License
def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = MinCovDet(store_precision=self.store_precision,
                                   assume_centered=self.assume_centered,
                                   support_fraction=self.support_fraction,
                                   random_state=self.random_state)
        self.detector_.fit(X=X, y=y)

        # Use the Mahalanobis distance as the outlier score
        self.decision_scores_ = self.detector_.dist_
        self._process_decision_scores()
        return self 
Example #30
Source File: genetic.py    From gplearn with BSD 3-Clause "New" or "Revised" License
def transform(self, X):
        """Transform X according to the fitted transformer.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Input vectors, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape = [n_samples, n_components]
            Transformed array.

        """
        if not hasattr(self, '_best_programs'):
            raise NotFittedError('SymbolicTransformer not fitted.')

        X = check_array(X)
        _, n_features = X.shape
        if self.n_features_ != n_features:
            raise ValueError('Number of features of the model must match the '
                             'input. Model n_features is %s and input '
                             'n_features is %s.'
                             % (self.n_features_, n_features))

        X_new = np.array([gp.execute(X) for gp in self._best_programs]).T

        return X_new