Python sklearn.utils.sparsefuncs.mean_variance_axis() Examples
The following are 11 code examples of sklearn.utils.sparsefuncs.mean_variance_axis(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.utils.sparsefuncs, or try the search function.
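For orientation before the project examples, here is a minimal, self-contained sketch of the function itself. mean_variance_axis takes a scipy sparse matrix and an axis and returns the per-column (axis=0) or per-row (axis=1) means and population variances without densifying the input; the matrix below is illustrative.

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

# A small CSR matrix with an explicit all-zero column.
X = sp.csr_matrix(np.array([[0., 1., 2.],
                            [0., 3., 4.],
                            [0., 5., 6.]]))

# One mean and one variance per feature (column).
means, variances = mean_variance_axis(X, axis=0)
print(means)      # [0. 3. 4.]
print(variances)  # [0.     2.6667 2.6667], population variances (ddof=0)

# Sanity check against the dense equivalent.
assert np.allclose(means, X.toarray().mean(axis=0))
assert np.allclose(variances, X.toarray().var(axis=0))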
Example #1
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes

def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert not np.any(np.isnan(X_scaled))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert not np.any(np.isnan(X_csr_scaled.data))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert X_scaled is not X

    # mean_variance_axis returns variances; after scaling each column's
    # variance is 0 or 1, so it matches std(axis=0) entrywise.
    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # null scale
    X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
    assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
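A detail this test leans on: mean_variance_axis returns variances, not standard deviations. Comparing its second output against X_scaled.std(axis=0) works only because scaling leaves every column with variance 0 or 1, where variance and standard deviation coincide. A tiny sketch of the general distinction, with illustrative values:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.csr_matrix(np.array([[0., 2.],
                            [0., 6.]]))
_, var = mean_variance_axis(X, axis=0)
print(var)           # [0. 4.], variances
print(np.sqrt(var))  # [0. 2.], standard deviations differ in general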
Example #2
Source File: test_preprocessing.py From scanpy with BSD 3-Clause "New" or "Revised" License | 5 votes

def test_mean_var_sparse():
    from sklearn.utils.sparsefuncs import mean_variance_axis

    csr64 = sp.random(10000, 1000, format="csr", dtype=np.float64)
    csc64 = csr64.tocsc()

    # Test that we're equivalent for 64 bit
    for mtx, ax in product((csr64, csc64), (0, 1)):
        scm, scv = sc.pp._utils._get_mean_var(mtx, axis=ax)
        skm, skv = mean_variance_axis(mtx, ax)
        # convert sklearn's population variance into the sample
        # variance computed by scanpy
        skv *= mtx.shape[ax] / (mtx.shape[ax] - 1)

        assert np.allclose(scm, skm)
        assert np.allclose(scv, skv)

    csr32 = csr64.astype(np.float32)
    csc32 = csc64.astype(np.float32)

    # Test whether ours is more accurate for 32 bit
    for mtx32, mtx64 in [(csc32, csc64), (csr32, csr64)]:
        scm32, scv32 = sc.pp._utils._get_mean_var(mtx32)
        scm64, scv64 = sc.pp._utils._get_mean_var(mtx64)

        skm32, skv32 = mean_variance_axis(mtx32, 0)
        skm64, skv64 = mean_variance_axis(mtx64, 0)
        skv32 *= mtx32.shape[0] / (mtx32.shape[0] - 1)
        skv64 *= mtx64.shape[0] / (mtx64.shape[0] - 1)

        m_resid_sc = np.mean(np.abs(scm64 - scm32))
        m_resid_sk = np.mean(np.abs(skm64 - skm32))
        v_resid_sc = np.mean(np.abs(scv64 - scv32))
        v_resid_sk = np.mean(np.abs(scv64 - scv32))
        v_resid_sk = np.mean(np.abs(skv64 - skv32))

        assert m_resid_sc < m_resid_sk
        assert v_resid_sc < v_resid_sk
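The skv *= n / (n - 1) lines convert the population variance (ddof=0) returned by mean_variance_axis into the sample variance (ddof=1) that scanpy's _get_mean_var computes. A short, self-contained check of that correction, with an illustrative matrix:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.random(100, 5, density=0.3, format="csr", random_state=0)
n = X.shape[0]

_, var_pop = mean_variance_axis(X, axis=0)  # population variance (ddof=0)
var_sample = var_pop * n / (n - 1)          # Bessel's correction

assert np.allclose(var_sample, X.toarray().var(axis=0, ddof=1))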
Example #3
Source File: equal_groups.py From Same-Size-K-Means with BSD 3-Clause "New" or "Revised" License | 5 votes

def _tolerance(X, tol):
    """Return a tolerance which is independent of the dataset"""
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
    else:
        variances = np.var(X, axis=0)
    return np.mean(variances) * tol
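As a usage sketch, here is the sparse branch of this helper in isolation, checked against the dense path; the matrix and tol values are illustrative:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.random(50, 4, density=0.5, format="csr", random_state=1)
tol = 1e-4

# Mean per-feature variance, scaled by the relative tolerance.
abs_tol = np.mean(mean_variance_axis(X, axis=0)[1]) * tol

# The dense path of _tolerance computes the same value.
assert np.isclose(abs_tol, np.mean(X.toarray().var(axis=0)) * tol)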
Example #4
Source File: _k_means_0_23.py From daal4py with Apache License 2.0 | 5 votes

def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = _daal_mean_var(X)
    return mean_var * rtol
Example #5
Source File: _k_means_0_22.py From daal4py with Apache License 2.0 | 5 votes

def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = _daal_mean_var(X)
    return mean_var * rtol
Example #6
Source File: _k_means_0_21.py From daal4py with Apache License 2.0 | 5 votes

def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = _daal_mean_var(X)
    return mean_var * rtol
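Examples #4 to #6 ship the same helper for three scikit-learn version branches of daal4py. A hedged, runnable sketch of its logic, with the daal4py-internal _daal_mean_var swapped for plain NumPy on the dense path:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance."""
    if rtol == 0.0:
        return rtol  # short-circuit: no variance computation needed
    if sp.issparse(X):
        mean_var = np.mean(mean_variance_axis(X, axis=0)[1])
    else:
        # stand-in for daal4py's _daal_mean_var
        mean_var = np.mean(np.var(X, axis=0))
    return mean_var * rtol

X = sp.random(20, 3, density=0.5, format="csr", random_state=0)
print(_tolerance(X, 0.0))   # 0.0, returned immediately
print(_tolerance(X, 1e-4))  # mean feature variance scaled by rtol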
Example #7
Source File: scores.py From SecuML with GNU General Public License v2.0 | 5 votes

def compute_scoring_func(self, func):
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        if isinstance(features, spmatrix):
            variance = mean_variance_axis(features, axis=0)[1]
        else:
            variance = features.var(axis=0)
        return variance, None

    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
        self.multiclass)
    if func == 'f_classif':
        return f_classif(features, annotations)
    elif func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            discrete_indexes = True
        else:
            features_types = self.instances.features.info.types
            discrete_indexes = [i for i, t in enumerate(features_types)
                                if t == FeatureType.binary]
            if not discrete_indexes:
                discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    elif func == 'chi2':
        return chi2(features, annotations)
    else:
        assert False
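The 'variance' branch is the part that touches mean_variance_axis: it scores sparse features by variance without densifying them. A minimal sketch of that branch on its own, with illustrative data:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.random(200, 10, density=0.2, format="csr", random_state=0)

# Per-feature variances, computed directly on the sparse matrix.
variances = mean_variance_axis(X, axis=0)[1]
top3 = np.argsort(variances)[::-1][:3]
print(top3, variances[top3])  # indices and scores of the most variable features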
Example #8
Source File: density.py From SecuML with GNU General Public License v2.0 | 5 votes

def _display_dataset(self, dataset):
    eps = 0.00001
    linewidth = dataset.linewidth
    delta = self.max_value - self.min_value
    density_delta = 1.2 * delta
    if delta > 0:
        x = np.arange(self.min_value - 0.1 * delta,
                      self.max_value + 0.1 * delta,
                      density_delta / self.num_points)
    else:
        x = np.array([self.min_value - 2 * eps, self.max_value + 2 * eps])
    if isinstance(dataset.values, spmatrix):
        variance = mean_variance_axis(dataset.values, axis=0)[1]
    else:
        variance = np.var(dataset.values)
    if variance < eps:
        linewidth += 2
        mean = np.mean(dataset.values)
        x = np.sort(np.append(x, [mean, mean - eps, mean + eps]))
        density = [1 if v == mean else 0 for v in x]
    else:
        self.kde.fit(dataset.values)
        x_density = [[y] for y in x]
        # kde.score_samples returns the 'log' of the density
        log_density = self.kde.score_samples(x_density).tolist()
        density = list(map(math.exp, log_density))
    self.ax.plot(x, density, label=dataset.label, color=dataset.color,
                 linewidth=linewidth, linestyle=dataset.linestyle)
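The non-obvious step in the else branch is that KernelDensity.score_samples returns log-densities, which the code exponentiates before plotting. A small standalone sketch of that conversion; the bandwidth and data are illustrative:

import numpy as np
from sklearn.neighbors import KernelDensity

values = np.array([[0.1], [0.2], [0.2], [0.9]])
kde = KernelDensity(bandwidth=0.1).fit(values)

x = np.linspace(0.0, 1.0, 5).reshape(-1, 1)
log_density = kde.score_samples(x)  # score_samples returns the log of the density
density = np.exp(log_density)       # same role as map(math.exp, ...) above
print(density)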
Example #9
Source File: test_data.py From twitter-stock-recommendation with MIT License | 5 votes

This is the same test as Example #1, written with sklearn's older assert_true/assert_false testing helpers.

def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert_false(np.any(np.isnan(X_scaled)))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # null scale
    X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
    assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
Example #10
Source File: variance_threshold.py From sparkit-learn with Apache License 2.0 | 4 votes

def fit(self, Z):
    """Learn empirical variances from the blocks in Z.

    Parameters
    ----------
    Z : DictRDD of (X, y) pairs, or an RDD of X blocks
        X : {array-like, sparse matrix} blocks, shape (n_samples, n_features)
            Sample vectors from which to compute variances.
        y : any
            Ignored. Exists only for compatibility with
            sklearn.pipeline.Pipeline.

    Returns
    -------
    self
    """
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))

    def mapper(X):
        """Calculate statistics for every numpy or scipy block."""
        X = check_array(X, ('csr', 'csc'), dtype=np.float64)
        if hasattr(X, "toarray"):  # sparse matrix
            mean, var = mean_variance_axis(X, axis=0)
        else:
            mean, var = np.mean(X, axis=0), np.var(X, axis=0)
        return X.shape[0], mean, var

    def reducer(a, b):
        """Calculate the combined statistics."""
        n_a, mean_a, var_a = a
        n_b, mean_b, var_b = b
        n_ab = n_a + n_b
        mean_ab = ((mean_a * n_a) + (mean_b * n_b)) / n_ab
        var_ab = (((n_a * var_a) + (n_b * var_b)) / n_ab) + \
                 ((n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
        return (n_ab, mean_ab, var_ab)

    _, _, self.variances_ = X.map(mapper).treeReduce(reducer)
    if np.all(self.variances_ <= self.threshold):
        msg = "No feature in X meets the variance threshold {0:.5f}"
        if X.shape[0] == 1:
            msg += " (X contains only one sample)"
        raise ValueError(msg.format(self.threshold))
    return self
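The reducer merges per-block (count, mean, variance) triples with the standard pairwise-combination formula used in parallel variance algorithms. A quick numerical check that merging two blocks reproduces the full-data statistics:

import numpy as np

def combine(a, b):
    """Merge (count, mean, variance) statistics of two blocks."""
    n_a, mean_a, var_a = a
    n_b, mean_b, var_b = b
    n_ab = n_a + n_b
    mean_ab = (mean_a * n_a + mean_b * n_b) / n_ab
    var_ab = ((n_a * var_a + n_b * var_b) / n_ab
              + (n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
    return n_ab, mean_ab, var_ab

rng = np.random.RandomState(0)
X = rng.randn(1000, 4)
left, right = X[:600], X[600:]

n, mean, var = combine(
    (len(left), left.mean(axis=0), left.var(axis=0)),
    (len(right), right.mean(axis=0), right.var(axis=0)),
)

assert n == len(X)
assert np.allclose(mean, X.mean(axis=0))
assert np.allclose(var, X.var(axis=0))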
Example #11
Source File: data.py From sparkit-learn with Apache License 2.0 | 4 votes

def fit(self, Z):
    """Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    Z : DictRDD containing (X, y) pairs
        X : Training vectors. {array-like, sparse matrix}, shape
            [n_samples, n_features]. The data used to compute the mean and
            standard deviation used for later scaling along the features
            axis.
        y : Target labels. Passthrough for ``Pipeline`` compatibility.
    """
    # Reset internal state before fitting
    self._reset()
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))

    def mapper(X):
        """Calculate statistics for every numpy or scipy block."""
        X = check_array(X, ('csr', 'csc'), dtype=np.float64)
        if hasattr(X, "toarray"):  # sparse matrix
            mean, var = mean_variance_axis(X, axis=0)
        else:
            mean, var = np.mean(X, axis=0), np.var(X, axis=0)
        return X.shape[0], mean, var

    def reducer(a, b):
        """Calculate the combined statistics."""
        n_a, mean_a, var_a = a
        n_b, mean_b, var_b = b
        n_ab = n_a + n_b
        mean_ab = ((mean_a * n_a) + (mean_b * n_b)) / n_ab
        var_ab = (((n_a * var_a) + (n_b * var_b)) / n_ab) + \
                 ((n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
        return (n_ab, mean_ab, var_ab)

    if check_rdd_dtype(X, (sp.spmatrix)):
        if self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")

    self.n_samples_seen_, self.mean_, self.var_ = \
        X.map(mapper).treeReduce(reducer)
    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None
    return self
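The final step, _handle_zeros_in_scale(np.sqrt(self.var_)), exists so that constant (zero-variance) features do not cause a division by zero at transform time: sklearn's helper replaces zero scales with 1. A hedged inline sketch of that behavior; treat it as illustrative, not as the library's exact code:

import numpy as np

var_ = np.array([0.0, 2.25, 4.0])  # one constant feature plus two others
scale_ = np.sqrt(var_)

# A zero scale would make X / scale_ blow up; replacing it with 1 lets the
# constant feature pass through unchanged (mirrors _handle_zeros_in_scale).
scale_[scale_ == 0.0] = 1.0
print(scale_)  # [1.  1.5 2. ]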