Python sklearn.utils.validation.FLOAT_DTYPES Examples
The following are 29 code examples of sklearn.utils.validation.FLOAT_DTYPES (a module-level constant, not a callable). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.utils.validation, or try the search function.
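Before the examples, it may help to see what FLOAT_DTYPES actually is and how it interacts with check_array. A minimal sketch, assuming a standard scikit-learn install (the exact tuple contents could in principle vary by version):

import numpy as np
from sklearn.utils.validation import check_array, FLOAT_DTYPES

# FLOAT_DTYPES is a tuple of NumPy float types, not a function:
# (np.float64, np.float32, np.float16).
print(FLOAT_DTYPES)

# Passing dtype=FLOAT_DTYPES to check_array keeps float input in its
# existing float precision, while non-float input is converted to float64.
X32 = np.asarray([[1, 2], [3, 4]], dtype=np.float32)
print(check_array(X32, dtype=FLOAT_DTYPES).dtype)  # float32

X_int = np.asarray([[1, 2], [3, 4]])
print(check_array(X_int, dtype=FLOAT_DTYPES).dtype)  # float64

This is why nearly every example below passes dtype=FLOAT_DTYPES: it validates the input as floating point without needlessly copying data that is already float32.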
Example #1
Source File: naive_bayes.py From scikit-lego with MIT License | 6 votes |
def predict_proba(self, X: np.array):
    check_is_fitted(self, ["gmms_", "classes_", "num_fit_cols_"])
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    if self.num_fit_cols_ != X.shape[1]:
        raise ValueError(
            f"number of columns {X.shape[1]} does not match fit size {self.num_fit_cols_}"
        )
    probs = np.zeros((X.shape[0], len(self.classes_)))
    for k, v in self.gmms_.items():
        class_idx = int(np.argwhere(self.classes_ == k))
        # Sum the per-feature log-likelihoods for class k (naive Bayes
        # independence assumption: one univariate mixture per feature).
        probs[:, class_idx] = np.array(
            [
                m.score_samples(np.expand_dims(X[:, idx], 1))
                for idx, m in enumerate(v)
            ]
        ).sum(axis=0)
    likelihood = np.exp(probs)
    return likelihood / likelihood.sum(axis=1).reshape(-1, 1)
Example #2
Source File: umap_reconstruction.py From scikit-lego with MIT License | 6 votes |
def fit(self, X, y=None):
    """
    Fit the model using X as training data.

    :param X: array-like, shape=(n_samples, n_features) training data.
    :param y: ignored but kept in for pipeline support
    :return: Returns an instance of self.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    if self.n_components < 2:
        raise ValueError("Number of components must be at least two.")
    if not self.threshold:
        raise ValueError("The `threshold` value cannot be `None`.")
    self.umap_ = umap.UMAP(
        n_components=self.n_components,
        n_neighbors=self.n_neighbors,
        min_dist=self.min_dist,
        metric=self.metric,
        random_state=self.random_state,
    )
    self.umap_.fit(X, y)
    self.offset_ = -self.threshold
    return self
Example #3
Source File: bayesian_gmm_classifier.py From scikit-lego with MIT License | 5 votes |
def predict_proba(self, X):
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmms_", "classes_"])
    res = np.zeros((X.shape[0], self.classes_.shape[0]))
    for idx, c in enumerate(self.classes_):
        res[:, idx] = self.gmms_[c].score_samples(X)
    return np.exp(res) / np.exp(res).sum(axis=1)[:, np.newaxis]
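One caveat with the pattern above: calling np.exp directly on raw log-likelihoods can overflow or underflow for extreme scores. A hedged sketch of a numerically safer normalization using a log-sum-exp shift (an alternative, not scikit-lego's code):

import numpy as np

def normalize_log_scores(log_scores):
    # Subtract the row-wise max before exponentiating; the shift cancels
    # when normalizing, so the result is identical but overflow-safe.
    shifted = log_scores - log_scores.max(axis=1, keepdims=True)
    probs = np.exp(shifted)
    return probs / probs.sum(axis=1, keepdims=True)

print(normalize_log_scores(np.array([[1000.0, 1001.0]])))  # no overflow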
Example #4
Source File: equal_groups.py From Same-Size-K-Means with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _check_test_data(self, X):
    # Note: warn_on_dtype was deprecated in scikit-learn 0.21 and removed
    # in 0.23; drop the argument on newer versions.
    X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
                    warn_on_dtype=True)
    n_samples, n_features = X.shape
    expected_n_features = self.cluster_centers_.shape[1]
    if not n_features == expected_n_features:
        raise ValueError("Incorrect number of features. "
                         "Got %d features, expected %d" % (
                             n_features, expected_n_features))
    return X
Example #5
Source File: naive_bayes.py From scikit-lego with MIT License | 5 votes |
def predict(self, X):
    check_is_fitted(self, ["gmms_", "classes_", "num_fit_cols_"])
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    return self.classes_[self.predict_proba(X).argmax(axis=1)]
Example #6
Source File: naive_bayes.py From scikit-lego with MIT License | 5 votes |
def fit(self, X: np.array, y: np.array) -> "BayesianGaussianMixtureNB":
    """
    Fit the model using X, y as training data.

    :param X: array-like, shape=(n_samples, n_features) training data.
    :param y: array-like, shape=(n_samples,) training data.
    :return: Returns an instance of self.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = np.expand_dims(X, 1)
    self.gmms_ = {}
    self.classes_ = unique_labels(y)
    self.num_fit_cols_ = X.shape[1]
    for c in self.classes_:
        subset_x, subset_y = X[y == c], y[y == c]
        # One univariate Bayesian mixture per feature, per class.
        self.gmms_[c] = [
            BayesianGaussianMixture(
                n_components=self.n_components,
                covariance_type=self.covariance_type,
                tol=self.tol,
                reg_covar=self.reg_covar,
                max_iter=self.max_iter,
                n_init=self.n_init,
                init_params=self.init_params,
                weight_concentration_prior_type=self.weight_concentration_prior_type,
                weight_concentration_prior=self.weight_concentration_prior,
                mean_precision_prior=self.mean_precision_prior,
                mean_prior=self.mean_prior,
                degrees_of_freedom_prior=self.degrees_of_freedom_prior,
                covariance_prior=self.covariance_prior,
                random_state=self.random_state,
                warm_start=self.warm_start,
                verbose=self.verbose,
                verbose_interval=self.verbose_interval,
            ).fit(subset_x[:, i].reshape(-1, 1), subset_y)
            for i in range(X.shape[1])
        ]
    return self
Example #7
Source File: naive_bayes.py From scikit-lego with MIT License | 5 votes |
def predict(self, X):
    check_is_fitted(self, ["gmms_", "classes_"])
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    return self.classes_[self.predict_proba(X).argmax(axis=1)]
Example #8
Source File: naive_bayes.py From scikit-lego with MIT License | 5 votes |
def fit(self, X: np.array, y: np.array) -> "GaussianMixtureNB":
    """
    Fit the model using X, y as training data.

    :param X: array-like, shape=(n_samples, n_features) training data.
    :param y: array-like, shape=(n_samples,) training data.
    :return: Returns an instance of self.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = np.expand_dims(X, 1)
    self.gmms_ = {}
    self.classes_ = unique_labels(y)
    self.num_fit_cols_ = X.shape[1]
    for c in self.classes_:
        subset_x, subset_y = X[y == c], y[y == c]
        # One univariate Gaussian mixture per feature, per class.
        self.gmms_[c] = [
            GaussianMixture(
                n_components=self.n_components,
                covariance_type=self.covariance_type,
                tol=self.tol,
                reg_covar=self.reg_covar,
                max_iter=self.max_iter,
                n_init=self.n_init,
                init_params=self.init_params,
                weights_init=self.weights_init,
                means_init=self.means_init,
                precisions_init=self.precisions_init,
                random_state=self.random_state,
                warm_start=self.warm_start,
            ).fit(subset_x[:, i].reshape(-1, 1), subset_y)
            for i in range(X.shape[1])
        ]
    return self
Example #9
Source File: randomadder.py From scikit-lego with MIT License | 5 votes |
def transform_train(self, X):
    rs = check_random_state(self.random_state)
    check_is_fitted(self, ["dim_"])
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    return X + rs.normal(0, self.noise, size=X.shape)
Example #10
Source File: randomadder.py From scikit-lego with MIT License | 5 votes |
def fit(self, X, y):
    super().fit(X, y)
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    self.dim_ = X.shape[1]
    return self
Example #11
Source File: columncapper.py From scikit-lego with MIT License | 5 votes |
def transform(self, X):
    """
    Performs the capping on the column(s) of ``X``.

    :type X: pandas.DataFrame or numpy.ndarray
    :param X: The column(s) for which the capping limit(s) will be applied.
    :rtype: numpy.ndarray
    :returns: ``X`` values with capped limits.
    :raises: ``ValueError`` if the number of columns of ``X`` differs from the
        number of columns seen during fitting.
    """
    check_is_fitted(self, "quantiles_")
    X = check_array(
        X,
        copy=self.copy,
        force_all_finite=False,
        dtype=FLOAT_DTYPES,
        estimator=self,
    )
    if X.shape[1] != self.n_columns_:
        raise ValueError(
            "X must have the same number of columns in fit and transform"
        )
    if self.discard_infs:
        np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan)

    # Actually capping
    X = np.minimum(X, self.quantiles_[1, :])
    X = np.maximum(X, self.quantiles_[0, :])
    return X
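The capping step itself is plain NumPy. A self-contained sketch of the same clamp-to-quantile idea on a toy array (the 5th/95th percentiles here are illustrative, not necessarily ColumnCapper's defaults):

import numpy as np

X = np.array([[-10.0, 0.5],
              [0.2, 99.0],
              [0.4, 0.6]])

# Per-column caps; here the 5th and 95th percentiles, for illustration.
lower = np.quantile(X, 0.05, axis=0)
upper = np.quantile(X, 0.95, axis=0)

# Same pattern as the transform above: element-wise minimum against the
# upper cap, then element-wise maximum against the lower cap.
X_capped = np.maximum(np.minimum(X, upper), lower)
print(X_capped)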
Example #12
Source File: umap_reconstruction.py From scikit-lego with MIT License | 5 votes |
def predict(self, X):
    """
    Predict if a point is an outlier.

    :param X: array-like, shape=(n_samples, n_features) the data to predict.
    :return: array, shape=(n_samples,) the predicted data. 1 for inliers, -1 for outliers.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["umap_", "offset_"])
    result = np.ones(X.shape[0])
    result[self.difference(X) > self.threshold] = -1
    return result.astype(int)  # np.int was removed in NumPy >= 1.24
Example #13
Source File: umap_reconstruction.py From scikit-lego with MIT License | 5 votes |
def transform(self, X):
    """
    Uses the underlying UMAP method to transform the data.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["umap_", "offset_"])
    return self.umap_.transform(X)
Example #14
Source File: pca_reconstruction.py From scikit-lego with MIT License | 5 votes |
def predict(self, X):
    """
    Predict if a point is an outlier.

    :param X: array-like, shape=(n_samples, n_features) the data to predict.
    :return: array, shape=(n_samples,) the predicted data. 1 for inliers, -1 for outliers.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["pca_", "offset_"])
    result = np.ones(X.shape[0])
    result[self.difference(X) > self.threshold] = -1
    return result.astype(int)  # np.int was removed in NumPy >= 1.24
Example #15
Source File: pca_reconstruction.py From scikit-lego with MIT License | 5 votes |
def transform(self, X):
    """
    Uses the underlying PCA method to transform the data.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["pca_", "offset_"])
    return self.pca_.transform(X)
Example #16
Source File: linear_regression.py From differential-privacy-library with MIT License | 5 votes |
def _preprocess_data(X, y, fit_intercept, epsilon=1.0, bounds_X=None, bounds_y=None,
                     copy=True, check_input=True, **unused_args):
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        bounds_X = check_bounds(bounds_X, X.shape[1])
        bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

        X = clip_to_bounds(X, bounds_X)
        y = clip_to_bounds(y, bounds_y)

        X_offset = mean(X, axis=0, bounds=bounds_X, epsilon=epsilon,
                        accountant=BudgetAccountant())
        X -= X_offset
        y_offset = mean(y, axis=0, bounds=bounds_y, epsilon=epsilon,
                        accountant=BudgetAccountant())
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale


# noinspection PyPep8Naming,PyAttributeOutsideInit
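The clip_to_bounds call exists so the differentially private mean has bounded sensitivity. A minimal standalone sketch of the same idea in plain NumPy (clip_columns is an illustrative helper, not diffprivlib's API):

import numpy as np

def clip_columns(X, lower, upper):
    # Clamp each feature into its stated bounds; with bounded data, the
    # sensitivity of a subsequent mean query is also bounded.
    return np.clip(X, lower, upper)

X = np.array([[0.0, 12.0],
              [5.0, -3.0]])
print(clip_columns(X, lower=np.array([0.0, 0.0]), upper=np.array([4.0, 10.0])))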
Example #17
Source File: bayesian_gmm_classifier.py From scikit-lego with MIT License | 5 votes |
def predict(self, X):
    check_is_fitted(self, ["gmms_", "classes_"])
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    return self.classes_[self.predict_proba(X).argmax(axis=1)]
Example #18
Source File: bayesian_gmm_classifier.py From scikit-lego with MIT License | 5 votes |
def fit(self, X: np.array, y: np.array) -> "BayesianGMMClassifier":
    """
    Fit the model using X, y as training data.

    :param X: array-like, shape=(n_samples, n_features) training data.
    :param y: array-like, shape=(n_samples,) training data.
    :return: Returns an instance of self.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = np.expand_dims(X, 1)
    self.gmms_ = {}
    self.classes_ = unique_labels(y)
    for c in self.classes_:
        subset_x, subset_y = X[y == c], y[y == c]
        # One multivariate Bayesian mixture per class.
        mixture = BayesianGaussianMixture(
            n_components=self.n_components,
            covariance_type=self.covariance_type,
            tol=self.tol,
            reg_covar=self.reg_covar,
            max_iter=self.max_iter,
            n_init=self.n_init,
            init_params=self.init_params,
            weight_concentration_prior_type=self.weight_concentration_prior_type,
            weight_concentration_prior=self.weight_concentration_prior,
            mean_precision_prior=self.mean_precision_prior,
            mean_prior=self.mean_prior,
            degrees_of_freedom_prior=self.degrees_of_freedom_prior,
            covariance_prior=self.covariance_prior,
            random_state=self.random_state,
            warm_start=self.warm_start,
            verbose=self.verbose,
            verbose_interval=self.verbose_interval,
        )
        self.gmms_[c] = mixture.fit(subset_x, subset_y)
    return self
Example #19
Source File: bayesian_gmm_detector.py From scikit-lego with MIT License | 5 votes |
def score_samples(self, X):
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
    if len(X.shape) == 1:
        X = np.expand_dims(X, 1)
    return self.gmm_.score_samples(X) * -1
Example #20
Source File: gmm_classifier.py From scikit-lego with MIT License | 5 votes |
def predict_proba(self, X):
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmms_", "classes_"])
    res = np.zeros((X.shape[0], self.classes_.shape[0]))
    for idx, c in enumerate(self.classes_):
        res[:, idx] = self.gmms_[c].score_samples(X)
    return np.exp(res) / np.exp(res).sum(axis=1)[:, np.newaxis]
Example #21
Source File: gmm_classifier.py From scikit-lego with MIT License | 5 votes |
def fit(self, X: np.array, y: np.array) -> "GMMClassifier":
    """
    Fit the model using X, y as training data.

    :param X: array-like, shape=(n_samples, n_features) training data.
    :param y: array-like, shape=(n_samples,) training data.
    :return: Returns an instance of self.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = np.expand_dims(X, 1)
    self.gmms_ = {}
    self.classes_ = unique_labels(y)
    for c in self.classes_:
        subset_x, subset_y = X[y == c], y[y == c]
        # One multivariate Gaussian mixture per class.
        mixture = GaussianMixture(
            n_components=self.n_components,
            covariance_type=self.covariance_type,
            tol=self.tol,
            reg_covar=self.reg_covar,
            max_iter=self.max_iter,
            n_init=self.n_init,
            init_params=self.init_params,
            weights_init=self.weights_init,
            means_init=self.means_init,
            precisions_init=self.precisions_init,
            random_state=self.random_state,
            warm_start=self.warm_start,
            verbose=self.verbose,
            verbose_interval=self.verbose_interval,
        )
        self.gmms_[c] = mixture.fit(subset_x, subset_y)
    return self
Example #22
Source File: gmm_outlier_detector.py From scikit-lego with MIT License | 5 votes |
def score_samples(self, X):
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
    if len(X.shape) == 1:
        X = np.expand_dims(X, 1)
    return -self.gmm_.score_samples(X)
Example #23
Source File: neighbors.py From scikit-lego with MIT License | 5 votes |
def predict(self, X):
    """
    Predict class labels for samples in X.

    :param X: array_like, shape (n_samples, n_features)
    :return: array, shape (n_samples)
    """
    check_is_fitted(self)
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    return self.classes_[np.argmax(self.predict_proba(X), 1)]
Example #24
Source File: neighbors.py From scikit-lego with MIT License | 5 votes |
def predict_proba(self, X):
    """
    Probability estimates.

    The returned estimates for all classes are in the same order found in the
    `.classes_` attribute.

    :param X: array-like of shape (n_samples, n_features)
    :return: array-like of shape (n_samples, n_classes)
        Returns the probability of the sample for each class in the model,
        where classes are ordered as they are in self.classes_.
    """
    check_is_fitted(self)
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    log_prior = np.array(
        [self.priors_logp_[target_label] for target_label in self.classes_]
    )
    log_likelihood = np.array(
        [
            self.models_[target_label].score_samples(X)
            for target_label in self.classes_
        ]
    ).T
    # Despite the name, this holds the exponentiated (unnormalized)
    # posterior: exp(log p(x|c) + log p(c)).
    log_likelihood_and_prior = np.exp(log_likelihood + log_prior)
    evidence = log_likelihood_and_prior.sum(axis=1, keepdims=True)
    posterior = log_likelihood_and_prior / evidence
    return posterior
Example #25
Source File: neighbors.py From scikit-lego with MIT License | 5 votes |
def fit(self, X: np.ndarray, y: np.ndarray):
    """
    Fit the model using X, y as training data.

    :param X: array-like, shape=(n_samples, n_features)
    :param y: array-like, shape=(n_samples,)
    :return: Returns an instance of self
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    self.classes_ = unique_labels(y)
    self.models_, self.priors_logp_ = {}, {}
    for target_label in self.classes_:
        x_subset = X[y == target_label]

        # Computing joint distribution
        self.models_[target_label] = KernelDensity(
            bandwidth=self.bandwidth,
            kernel=self.kernel,
            algorithm=self.algorithm,
            metric=self.metric,
            atol=self.atol,
            rtol=self.rtol,
            breadth_first=self.breath_first,
            leaf_size=self.leaf_size,
            metric_params=self.metric_params,
        ).fit(x_subset)

        # Computing target class prior
        self.priors_logp_[target_label] = np.log(len(x_subset) / len(X))
    return self
Example #26
Source File: data.py From sagemaker-scikit-learn-extension with Apache License 2.0 | 5 votes |
def fit(self, X, y=None):
    """Fit RobustStandardScaler to X.

    If the input is sparse, `fit` overrides `self.with_mean` to standardize
    without subtracting the mean (which would break sparsity). If the data
    is dense, the mean is adjusted for sparse features and the data is
    scaled with the mean.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to standardize.

    Returns
    -------
    self : RobustStandardScaler
    """
    X = check_array(
        X, accept_sparse=("csr", "csc"), estimator=self, dtype=FLOAT_DTYPES,
        force_all_finite="allow-nan"
    )

    with_mean = True
    if issparse(X):
        with_mean = False

    self.scaler_ = StandardScaler(with_mean=with_mean, with_std=True, copy=self.copy)
    self.scaler_.fit(X)

    if self.scaler_.with_mean:
        # Zero out the learned mean for mostly-sparse columns (<= 30%
        # non-zero) so those features are scaled but not centered.
        nnz_mean_mask = np.where(np.count_nonzero(X, axis=0) / X.shape[0] > 0.3, 1, 0)
        self.scaler_.mean_ = self.scaler_.mean_ * nnz_mean_mask
    return self
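The with_mean override matters because centering a sparse matrix densifies it. A small sketch showing the same guard in isolation:

import numpy as np
from scipy.sparse import csr_matrix, issparse
from sklearn.preprocessing import StandardScaler

X = csr_matrix(np.eye(3))

# Subtracting a per-column mean would turn almost every stored zero into
# a non-zero entry, so sparse input is scaled without centering.
scaler = StandardScaler(with_mean=not issparse(X), with_std=True)
print(scaler.fit_transform(X).toarray())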
Example #27
Source File: iterative_imputer.py From ME-Net with MIT License | 5 votes |
def _validate_input(self, X):
    allowed_strategies = ["mean", "median", "most_frequent", "constant"]
    if self.strategy not in allowed_strategies:
        raise ValueError("Can only use these strategies: {0} "
                         " got strategy={1}".format(allowed_strategies,
                                                    self.strategy))

    if self.strategy in ("most_frequent", "constant"):
        dtype = None
    else:
        dtype = FLOAT_DTYPES

    if not is_scalar_nan(self.missing_values):
        force_all_finite = True
    else:
        force_all_finite = False  # "allow-nan"

    try:
        X = check_array(X, accept_sparse='csc', dtype=dtype,
                        force_all_finite=force_all_finite, copy=self.copy)
    except ValueError as ve:
        if "could not convert" in str(ve):
            raise ValueError("Cannot use {0} strategy with non-numeric "
                             "data. Received datatype :{1}."
                             "".format(self.strategy, X.dtype.kind))
        else:
            raise ve

    _check_inputs_dtype(X, self.missing_values)
    if X.dtype.kind not in ("i", "u", "f", "O"):
        raise ValueError("_SimpleImputer does not support data with dtype "
                         "{0}. Please provide either a numeric array (with"
                         " a floating point or integer dtype) or "
                         "categorical data represented either as an array "
                         "with integer dtype or an array of string values "
                         "with an object dtype.".format(X.dtype))

    return X
Example #28
Source File: text_transformers.py From cdQA with Apache License 2.0 | 5 votes |
def transform(self, X=None, copy=True, is_query=False):
    """
    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        document-term query matrix

    copy : boolean, optional (default=True)

    is_query : boolean (default=False)
        whether to transform a query or the documents database

    Returns
    -------
    vectors : sparse matrix, [n_samples, n_features]
    """
    if is_query:
        X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES, copy=copy)
        if not sp.issparse(X):
            X = sp.csr_matrix(X, dtype=np.float64)

        n_samples, n_features = X.shape

        expected_n_features = self._doc_matrix.shape[1]
        if n_features != expected_n_features:
            raise ValueError(
                "Input has n_features=%d while the model"
                " has been trained with n_features=%d"
                % (n_features, expected_n_features)
            )

        if self.use_idf:
            check_is_fitted(self, "_idf_diag", "idf vector is not fitted")
            X = sp.csr_matrix(X.toarray() * self._idf_diag.diagonal())

        return X
    else:
        return self._doc_matrix
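One observation on the idf step above: converting the query matrix to dense with X.toarray() just to scale columns is costly for large vocabularies. A hedged sketch of an equivalent sparse-friendly column scaling (an alternative, not cdQA's code):

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
idf = np.array([1.5, 0.5])

# Right-multiplying by a diagonal matrix scales each column in place,
# without ever densifying the document-term matrix.
X_scaled = X @ sp.diags(idf)
print(X_scaled.toarray())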
Example #29
Source File: iterative_imputer.py From ME-Net with MIT License | 4 votes |
def _initial_imputation(self, X):
    """Perform initial imputation for input X.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Input data, where "n_samples" is the number of samples and
        "n_features" is the number of features.

    Returns
    -------
    Xt : ndarray, shape (n_samples, n_features)
        Input data, where "n_samples" is the number of samples and
        "n_features" is the number of features.

    X_filled : ndarray, shape (n_samples, n_features)
        Input data with the most recent imputations.

    mask_missing_values : ndarray, shape (n_samples, n_features)
        Input data's missing indicator matrix, where "n_samples" is the
        number of samples and "n_features" is the number of features.
    """
    # TODO: change False to "allow-nan"
    if is_scalar_nan(self.missing_values):
        force_all_finite = False  # "allow-nan"
    else:
        force_all_finite = True

    X = check_array(X, dtype=FLOAT_DTYPES, order="F",
                    force_all_finite=force_all_finite)
    _check_inputs_dtype(X, self.missing_values)

    mask_missing_values = _get_mask(X, self.missing_values)
    if self.initial_imputer_ is None:
        self.initial_imputer_ = _SimpleImputer(
            missing_values=self.missing_values,
            strategy=self.initial_strategy)
        X_filled = self.initial_imputer_.fit_transform(X)
    else:
        X_filled = self.initial_imputer_.transform(X)

    valid_mask = np.flatnonzero(np.logical_not(
        np.isnan(self.initial_imputer_.statistics_)))
    Xt = X[:, valid_mask]
    mask_missing_values = mask_missing_values[:, valid_mask]

    return Xt, X_filled, mask_missing_values
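The force_all_finite toggle is what lets NaN placeholders survive validation so the imputer can fill them. A minimal sketch (note that recent scikit-learn releases rename this parameter, so treat the exact keyword as version-dependent):

import numpy as np
from sklearn.utils.validation import check_array, FLOAT_DTYPES

X = np.array([[1.0, np.nan],
              [3.0, 4.0]])

# With the default force_all_finite=True this call raises on the NaN;
# relaxing it lets missing-value markers through for later imputation.
X = check_array(X, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")
print(np.isnan(X))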