Python sklearn.utils.extmath.stable_cumsum() Examples

The following are 11 code examples of sklearn.utils.extmath.stable_cumsum(), collected from open-source projects; the source file and project for each example are listed above it. You may also want to check out all available functions and classes of the module sklearn.utils.extmath.
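Before the examples, a minimal sketch of what stable_cumsum does (the input values below are illustrative): it computes the cumulative sum in float64 and warns with a RuntimeWarning if the last entry drifts from a full-precision np.sum of the input beyond the given rtol/atol tolerances.

import numpy as np
from sklearn.utils.extmath import stable_cumsum

x = np.array([0.1, 0.2, 0.3, 0.4])
print(stable_cumsum(x))        # same values as np.cumsum(x): [0.1 0.3 0.6 1. ]
print(np.isclose(stable_cumsum(x)[-1], np.sum(x)))  # True: the last entry matches the exact sum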
Example #1
Source File: test_extmath.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_stable_cumsum():
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2)) 
Example #2
Source File: stats.py    From Splunking-Crime with GNU Affero General Public License v3.0
def _weighted_percentile(array, sample_weight, percentile=50):
    """
    Compute the weighted ``percentile`` of ``array`` with ``sample_weight``.
    """
    sorted_idx = np.argsort(array)

    # Find the index at which the weighted CDF reaches the requested percentile
    weight_cdf = stable_cumsum(sample_weight[sorted_idx])
    percentile_idx = np.searchsorted(
        weight_cdf, (percentile / 100.) * weight_cdf[-1])
    return array[sorted_idx[percentile_idx]] 
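A quick illustrative call with toy data (not from the source project), assuming numpy and stable_cumsum are imported as in the excerpt above; with uniform weights the result reduces to the ordinary median of the values.

import numpy as np

values = np.array([3., 1., 2., 5., 4.])
weights = np.ones_like(values)
print(_weighted_percentile(values, weights, percentile=50))  # 3.0, the median under equal weights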
Example #3
Source File: _pca_0_22.py    From daal4py with Apache License 2.0
def _n_components_from_fraction(explained_variance_ratio, frac):
    # number of components for which the cumulative explained variance
    # exceeds the desired fraction; side='right' ensures that the selected
    # components always explain strictly more variance than the
    # n_components float passed. More discussion in issue: #15669
    ratio_cumsum = stable_cumsum(explained_variance_ratio)
    n_components = np.searchsorted(ratio_cumsum, frac,
                                   side='right') + 1
    return n_components 
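A small illustrative call with made-up ratios: for per-component explained-variance ratios [0.6, 0.3, 0.1] the cumulative ratios are [0.6, 0.9, 1.0], so requesting 85% of the variance selects 2 components.

import numpy as np

ratios = np.array([0.6, 0.3, 0.1])
print(_n_components_from_fraction(ratios, 0.85))  # 2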
Example #4
Source File: _pca_0_21.py    From daal4py with Apache License 2.0
def _fit_full_daal4py(self, X, n_components):
        n_samples, n_features = X.shape

        # due to need to flip components, need to do full decomposition
        self._fit_daal4py(X, min(n_samples, n_features))
        U = self._transform_daal4py(X, whiten=True, check_X=False, scale_eigenvalues=True)
        V = self.components_
        U, V = svd_flip(U, V)
        U = U.copy()
        V = V.copy()
        S = self.singular_values_.copy()

        if n_components == 'mle':
            n_components = \
                _infer_dimension_(self.explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulative explained
            # variance exceeds the desired fraction
            ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = self.explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = self.components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = self.explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            self.explained_variance_ratio_[:n_components]
        self.singular_values_ = self.singular_values_[:n_components]

        return U, S, V 
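The noise term computed above is the maximum-likelihood estimate of sigma^2 in the probabilistic PCA model (Bishop, PRML, eq. 12.46): the mean of the discarded eigenvalues, sigma^2 = (1 / (d - q)) * sum_{j=q+1..d} lambda_j, with d = min(n_samples, n_features) and q = n_components, which is why it is set to zero when every component is kept.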
Example #5
Source File: _pca_0_23.py    From daal4py with Apache License 2.0
def _n_components_from_fraction(explained_variance_ratio, frac):
    # number of components for which the cumulative explained variance
    # exceeds the desired fraction; side='right' ensures that the selected
    # components always explain strictly more variance than the
    # n_components float passed. More discussion in issue: #15669
    ratio_cumsum = stable_cumsum(explained_variance_ratio)
    n_components = np.searchsorted(ratio_cumsum, frac,
                                   side='right') + 1
    return n_components 
Example #6
Source File: stats.py    From twitter-stock-recommendation with MIT License
def _weighted_percentile(array, sample_weight, percentile=50):
    """
    Compute the weighted ``percentile`` of ``array`` with ``sample_weight``.
    """
    sorted_idx = np.argsort(array)

    # Find the index at which the weighted CDF reaches the requested percentile
    weight_cdf = stable_cumsum(sample_weight[sorted_idx])
    percentile_idx = np.searchsorted(
        weight_cdf, (percentile / 100.) * weight_cdf[-1])
    return array[sorted_idx[percentile_idx]] 
Example #7
Source File: test_extmath.py    From twitter-stock-recommendation with MIT License
def test_stable_cumsum():
    if np_version < (1, 9):
        raise SkipTest("Sum is as unstable as cumsum for numpy < 1.9")
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    assert_warns(RuntimeWarning, stable_cumsum, r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2)) 
Example #8
Source File: metrics.py    From scikit-uplift with MIT License
def uplift_curve(y_true, uplift, treatment):
    """Compute Uplift curve.

    For computing the area under the Uplift Curve, see :func:`.uplift_auc_score`.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.

    Returns:
        array (shape = [>2]), array (shape = [>2]): Points on a curve.

    See also:
        :func:`.uplift_auc_score`: Compute normalized Area Under the Uplift curve from prediction scores.

        :func:`.perfect_uplift_curve`: Compute the perfect Uplift curve.

        :func:`.plot_uplift_curve`: Plot Uplift curves from predictions.

        :func:`.qini_curve`: Compute Qini curve.

    References:
        Devriendt, F., Guns, T., & Verbeke, W. (2020). Learning to rank for uplift modeling. ArXiv, abs/2002.05897.
    """

    # TODO: check the treatment is binary
    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment)
    desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1]
    y_true, uplift, treatment = y_true[desc_score_indices], uplift[desc_score_indices], treatment[desc_score_indices]

    y_true_ctrl, y_true_trmnt = y_true.copy(), y_true.copy()

    y_true_ctrl[treatment == 1] = 0
    y_true_trmnt[treatment == 0] = 0

    distinct_value_indices = np.where(np.diff(uplift))[0]
    threshold_indices = np.r_[distinct_value_indices, uplift.size - 1]

    num_trmnt = stable_cumsum(treatment)[threshold_indices]
    y_trmnt = stable_cumsum(y_true_trmnt)[threshold_indices]

    num_all = threshold_indices + 1

    num_ctrl = num_all - num_trmnt
    y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices]

    curve_values = (np.divide(y_trmnt, num_trmnt, out=np.zeros_like(y_trmnt), where=num_trmnt != 0) -
                    np.divide(y_ctrl, num_ctrl, out=np.zeros_like(y_ctrl), where=num_ctrl != 0)) * num_all

    if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0:
        # Add an extra threshold position if necessary
        # to make sure that the curve starts at (0, 0)
        num_all = np.r_[0, num_all]
        curve_values = np.r_[0, curve_values]

    return num_all, curve_values 
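A toy call with made-up data to show the shape of the returned arrays, assuming numpy and stable_cumsum are imported as in the excerpt: the first array holds the number of top-ranked observations at each threshold and the second the corresponding uplift-curve values, with a (0, 0) point prepended.

import numpy as np

y_true    = np.array([1, 1, 0, 0, 1, 0])
uplift    = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.1])
treatment = np.array([1, 0, 1, 1, 0, 0])

num_all, curve = uplift_curve(y_true, uplift, treatment)
print(num_all)  # [0 1 2 3 4 5 6]: observations considered at each threshold
print(curve)    # uplift-curve values, starting at 0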
Example #9
Source File: metrics.py    From scikit-uplift with MIT License
def qini_curve(y_true, uplift, treatment):
    """Compute Qini curve.

    For computing the area under the Qini Curve, see :func:`.qini_auc_score`.

    Args:
        y_true (1d array-like): Correct (true) target values.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.

    Returns:
        array (shape = [>2]), array (shape = [>2]): Points on a curve.

    See also:
        :func:`.qini_auc_score`: Compute normalized Area Under the Qini curve from prediction scores.

        :func:`.perfect_qini_curve`: Compute the perfect Qini curve.

        :func:`.plot_qini_curves`: Plot Qini curves from predictions.

        :func:`.uplift_curve`: Compute Uplift curve.

    References:
        Nicholas J Radcliffe. (2007). Using control groups to target on predicted lift:
        Building and assessing uplift models. Direct Marketing Analytics Journal, (3):14–21.

        Devriendt, F., Guns, T., & Verbeke, W. (2020). Learning to rank for uplift modeling. ArXiv, abs/2002.05897.
    """
    # TODO: check the treatment is binary
    y_true, uplift, treatment = np.array(y_true), np.array(uplift), np.array(treatment)

    desc_score_indices = np.argsort(uplift, kind="mergesort")[::-1]

    y_true = y_true[desc_score_indices]
    treatment = treatment[desc_score_indices]
    uplift = uplift[desc_score_indices]

    y_true_ctrl, y_true_trmnt = y_true.copy(), y_true.copy()

    y_true_ctrl[treatment == 1] = 0
    y_true_trmnt[treatment == 0] = 0

    distinct_value_indices = np.where(np.diff(uplift))[0]
    threshold_indices = np.r_[distinct_value_indices, uplift.size - 1]

    num_trmnt = stable_cumsum(treatment)[threshold_indices]
    y_trmnt = stable_cumsum(y_true_trmnt)[threshold_indices]

    num_all = threshold_indices + 1

    num_ctrl = num_all - num_trmnt
    y_ctrl = stable_cumsum(y_true_ctrl)[threshold_indices]

    curve_values = y_trmnt - y_ctrl * np.divide(num_trmnt, num_ctrl, out=np.zeros_like(num_trmnt), where=num_ctrl != 0)
    if num_all.size == 0 or curve_values[0] != 0 or num_all[0] != 0:
        # Add an extra threshold position if necessary
        # to make sure that the curve starts at (0, 0)
        num_all = np.r_[0, num_all]
        curve_values = np.r_[0, curve_values]

    return num_all, curve_values 
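The same toy data as in the uplift_curve example above also illustrates the difference in scaling: the Qini curve accumulates treated responders minus control responders rescaled by the treated-to-control ratio, rather than a response-rate difference scaled by the number of observations.

import numpy as np

y_true    = np.array([1, 1, 0, 0, 1, 0])
uplift    = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.1])
treatment = np.array([1, 0, 1, 1, 0, 0])

num_all, qini = qini_curve(y_true, uplift, treatment)
print(num_all, qini)  # thresholds and Qini values, both starting at 0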
Example #10
Source File: _pca_0_21.py    From daal4py with Apache License 2.0
def _fit_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        _validate_n_components(n_components, n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)
        centering_algo = daal4py.normalization_zscore(
            fptype=fpType, doScale=False)
        pca_alg = daal4py.pca(
            fptype=fpType,
            method='svdDense',
            normalization=centering_algo,
            resultsToCompute='mean|variance|eigenvalue',
            isDeterministic=True,
            nComponents=daal_n_components
        )
        pca_res = pca_alg.compute(X)

        self.mean_ = pca_res.means.ravel()
        variances_ = pca_res.variances.ravel()
        components_ = pca_res.eigenvectors
        explained_variance_ = pca_res.eigenvalues.ravel()
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulative explained
            # variance exceeds the desired fraction
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt((n_samples - 1) * self.explained_variance_) 
Example #11
Source File: _pca_0_21.py    From daal4py with Apache License 2.0
def _fit_full_vanilla(self, X, n_components):
        """Fit the model by computing full SVD on X"""
        n_samples, n_features = X.shape

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        U, S, V = np.linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)

        components_ = V

        # Get variance explained by singular values
        explained_variance_ = (S ** 2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var

        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
               _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulative explained
            # variance exceeds the desired fraction
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
                    explained_variance_ratio_[:n_components]
        self.singular_values_ = S[:n_components]

        return U, S, V