Python sklearn.utils.gen_batches() Examples
The following are 13 code examples of sklearn.utils.gen_batches().
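gen_batches(n, batch_size) yields slice objects that partition range(n) into contiguous chunks, with a shorter final slice when batch_size does not divide n; every example below uses these slices to index rows, as in X[batch]. A minimal demonstration, matching the function's documented behavior:

    >>> from sklearn.utils import gen_batches
    >>> list(gen_batches(7, 3))
    [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]

The recurring pattern in the examples is to drive an estimator's partial_fit one chunk at a time. A minimal sketch with StandardScaler; the data and variable names are illustrative only:

    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.utils import gen_batches

    X = np.random.RandomState(0).rand(1000, 20)  # toy data
    scaler = StandardScaler()
    for batch in gen_batches(X.shape[0], 100):   # batch is a slice object
        scaler.partial_fit(X[batch])             # incremental, chunk-by-chunk fit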
Example #1
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_standard_scaler_trasform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.finfo(float).eps
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # as less or equal
        assert_array_less(zero, scaler_incr.scale_ + epsilon)
        # (i+1) because the Scaler has been already fitted
        assert_equal((i + 1), scaler_incr.n_samples_seen_)
Example #2
Source File: dict_fact.py From modl with BSD 2-Clause "Simplified" License
def partial_fit(self, X, sample_indices=None):
    """Update the factorization using rows from X.

    Parameters
    ----------
    X: ndarray, shape (n_samples, n_features)
        Input data

    sample_indices:
        Indices for each row of X. If None, consider that row i index is i
        (useful when providing the whole data to the function)

    Returns
    -------
    self
    """
    X = check_array(X, dtype=[np.float32, np.float64], order='C')
    n_samples, n_features = X.shape
    batches = gen_batches(n_samples, self.batch_size)
    for batch in batches:
        this_X = X[batch]
        these_sample_indices = get_sub_slice(sample_indices, batch)
        self._single_batch_fit(this_X, these_sample_indices)
    return self
Example #3
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_standard_scaler_trasform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.nextafter(0, 1)
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # as less or equal
        assert_array_less(zero, scaler_incr.scale_ + epsilon)
        # (i+1) because the Scaler has been already fitted
        assert_equal((i + 1), scaler_incr.n_samples_seen_)
Example #4
Source File: rotation_forest.py From RotationForest with MIT License
def random_feature_subsets(array, batch_size, random_state=1234):
    """Generate K subsets of the features in X."""
    random_state = check_random_state(random_state)
    # list() is required: a bare range object cannot be shuffled
    # in place on Python 3
    features = list(range(array.shape[1]))
    random_state.shuffle(features)
    for batch in gen_batches(len(features), batch_size):
        yield features[batch]
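A hypothetical call of the generator above, assuming a 2-D NumPy array and that check_random_state is imported from sklearn.utils as in the source module; the array and sizes are illustrative only:

    import numpy as np

    X = np.zeros((5, 6))  # 5 samples, 6 features (toy data)
    for subset in random_feature_subsets(X, batch_size=2, random_state=1234):
        # each subset is a list of 2 shuffled feature indices, e.g. [3, 0]
        print(subset)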
Example #5
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_minmax_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        # n_samples is a module-level constant in the original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std until the end of partial fits
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
Example #6
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_standard_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        # n_samples and n_features are module-level constants in the
        # original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # both None here
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(n_features, dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(n_features, dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constants

        # Test std until the end of partial fits
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
Example #7
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_maxabs_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std until the end of partial fits
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
Example #8
Source File: recsys.py From modl with BSD 2-Clause "Simplified" License
def fit(self, X, y=None):
    """Learn a dictionary from sparse matrix X.

    Parameters
    ----------
    X: csr-matrix (n_samples, n_features)
        Dataset to learn the dictionary from
    """
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    X = check_array(X, accept_sparse='csr',
                    dtype=[np.float32, np.float64], copy=True)
    dtype = X.dtype
    n_samples, n_features = X.shape

    self.random_state = check_random_state(self.random_state)

    if self.detrend:
        self.row_mean_, self.col_mean_ = compute_biases(X,
                                                        beta=self.beta,
                                                        inplace=False)
        for i in range(X.shape[0]):
            X.data[X.indptr[i]:X.indptr[i + 1]] -= self.row_mean_[i]
        X.data -= self.col_mean_.take(X.indices, mode='clip')

    # Random init of the dictionary, with unit-norm rows
    self.components_ = self.random_state.randn(self.n_components,
                                               n_features).astype(dtype)
    S = np.sqrt(np.sum(self.components_ ** 2, axis=1))
    self.components_ /= S[:, np.newaxis]
    self.code_ = np.zeros((n_samples, self.n_components), dtype=dtype)

    self._refit(X)

    self.feature_freq_ = np.bincount(X.indices) / n_samples
    self.feature_n_iter_ = np.zeros(n_features, dtype=int)

    # Default batch size scales with the inverse of the data sparsity
    sparsity = X.nnz / n_samples / n_features
    if self.batch_size is None:
        batch_size = int(ceil(1. / sparsity))
    else:
        batch_size = self.batch_size

    self.comp_norm_ = np.zeros(self.n_components, dtype=dtype)
    self.C_ = np.zeros((self.n_components, self.n_components), dtype=dtype)
    self.B_ = np.zeros((self.n_components, n_features), dtype=dtype)

    self.n_iter_ = 0

    if self.verbose:
        log_lim = log(n_samples * self.n_epochs / batch_size, 10)
        self.verbose_iter_ = (np.logspace(0, log_lim, self.verbose,
                                          base=10) - 1) * batch_size
        self.verbose_iter_ = self.verbose_iter_.tolist()

    for i in range(self.n_epochs):
        permutation = self.random_state.permutation(n_samples)
        batches = gen_batches(n_samples, batch_size)
        for batch in batches:
            self._single_batch_fit(X, permutation[batch])
        self._refit(X)
    return self
Example #9
Source File: dict_fact.py From modl with BSD 2-Clause "Simplified" License
def transform(self, X):
    """Compute the codes associated to input matrix X, decomposing it
    onto the dictionary.

    Parameters
    ----------
    X: ndarray, shape = (n_samples, n_features)

    Returns
    -------
    code: ndarray, shape = (n_samples, n_components)
    """
    check_is_fitted(self, 'components_')
    dtype = self.components_.dtype
    X = check_array(X, order='C', dtype=dtype.type)
    if X.flags['WRITEABLE'] is False:
        X = X.copy()
    n_samples, n_features = X.shape
    if not hasattr(self, 'G_agg') or self.G_agg != 'full':
        G = self.components_.dot(self.components_.T)
    else:
        G = self.G_
    Dx = X.dot(self.components_.T)
    code = np.ones((n_samples, self.n_components), dtype=dtype)
    sample_indices = np.arange(n_samples)

    # Split the samples evenly across threads
    size_job = ceil(n_samples / self.n_threads)
    batches = list(gen_batches(n_samples, size_job))

    par_func = lambda batch: _enet_regression_single_gram(
        G, Dx[batch], X[batch], code,
        get_sub_slice(sample_indices, batch),
        self.code_l1_ratio, self.code_alpha, self.code_pos,
        self.tol, self.max_iter)

    if self.n_threads > 1:
        res = self._pool.map(par_func, batches)
        _ = list(res)
    else:
        _enet_regression_single_gram(
            G, Dx, X, code, sample_indices,
            self.code_l1_ratio, self.code_alpha, self.code_pos,
            self.tol, self.max_iter)
    return code
Example #10
Source File: incremental_pca.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def _fit(self, X, y=None):
    """Fit the model with X, using minibatches of size batch_size.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    y : Ignored

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    # Reset all fitted state before the incremental passes
    self.components_ = None
    self.n_samples_seen_ = 0
    self.mean_ = 0.0
    self.var_ = 0.0
    self.squared_sum_ = 0.0
    self.sum_ = 0.0
    self.singular_values_ = None
    self.explained_variance_ = None
    self.explained_variance_ratio_ = None
    self.noise_variance_ = None

    X = check_array(
        X,
        accept_sparse=["csr", "csc", "lil"],
        copy=self.copy,
        dtype=[np.float64, np.float32],
        accept_multiple_blocks=True,
    )
    n_samples, n_features = X.shape

    if self.batch_size is None:
        self.batch_size_ = 5 * n_features
    else:
        self.batch_size_ = self.batch_size

    for batch in gen_batches(
        n_samples, self.batch_size_, min_batch_size=self.n_components or 0
    ):
        X_batch = X[batch]
        if sparse.issparse(X_batch):
            X_batch = X_batch.toarray()
        self.partial_fit(X_batch, check_input=False)

    return self
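This is the only example that passes min_batch_size to gen_batches: when the trailing remainder would be smaller than the minimum, it is folded into the preceding batch instead of being yielded on its own, which here guarantees every chunk fed to partial_fit has at least n_components rows. The keyword's documented behavior, for comparison with the plain call shown in the introduction:

    >>> from sklearn.utils import gen_batches
    >>> list(gen_batches(7, 3, min_batch_size=2))
    [slice(0, 3, None), slice(3, 7, None)]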
Example #11
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_minmax_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        # n_samples is a module-level constant in the original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std until the end of partial fits
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
Example #12
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_standard_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        # n_samples and n_features are module-level constants in the
        # original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # both None here
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(n_features, dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(n_features, dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constants

        # Test std until the end of partial fits
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
Example #13
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_maxabs_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std until the end of partial fits
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)