Python sklearn.utils.gen_batches() Examples

The following are 13 code examples of sklearn.utils.gen_batches(), collected from open-source projects. The source file and project license are listed above each example. You may also want to check out the other available functions and classes of the sklearn.utils module.
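gen_batches(n, batch_size) yields slice objects that partition range(n) into contiguous chunks, which is why the examples below all index with X[batch]. A quick demonstration:

from sklearn.utils import gen_batches

# Each batch is a slice over the sample axis; the last one may be shorter.
print(list(gen_batches(7, 3)))
# [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]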
Example #1
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_standard_scaler_transform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):

        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.finfo(float).eps
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # effectively var_ >= 0
        assert_array_less(zero, scaler_incr.scale_ + epsilon)  # effectively scale_ >= 0
        # n_samples_seen_ is (i + 1) because every batch holds a single sample
        assert_equal((i + 1), scaler_incr.n_samples_seen_) 
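The incremental pattern this test verifies is useful outside a test suite whenever data arrives or is processed in chunks. A minimal standalone sketch, using made-up data:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.utils import gen_batches

X = np.random.RandomState(0).rand(100, 5)  # hypothetical data

scaler = StandardScaler()
for batch in gen_batches(X.shape[0], 10):
    scaler.partial_fit(X[batch])           # update running mean/variance

X_scaled = scaler.transform(X)  # agrees with StandardScaler().fit_transform(X) to rounding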
Example #2
Source File: dict_fact.py    From modl with BSD 2-Clause "Simplified" License
def partial_fit(self, X, sample_indices=None):
        """
        Update the factorization using rows from X

        Parameters
        ----------
        X: ndarray, shape (n_samples, n_features)
            Input data
        sample_indices:
            Indices for each row of X. If None, row i is assumed to have
            index i (useful when passing the whole dataset to the function).

        Returns
        -------
        self
        """
        X = check_array(X, dtype=[np.float32, np.float64], order='C')

        n_samples, n_features = X.shape
        batches = gen_batches(n_samples, self.batch_size)

        for batch in batches:
            this_X = X[batch]
            these_sample_indices = get_sub_slice(sample_indices, batch)
            self._single_batch_fit(this_X, these_sample_indices)
        return self 
Example #3
Source File: test_data.py    From twitter-stock-recommendation with MIT License
def test_standard_scaler_transform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):

        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.nextafter(0, 1)
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # effectively var_ >= 0
        assert_array_less(zero, scaler_incr.scale_ + epsilon)  # effectively scale_ >= 0
        # n_samples_seen_ is (i + 1) because every batch holds a single sample
        assert_equal((i + 1), scaler_incr.n_samples_seen_) 
Example #4
Source File: rotation_forest.py    From RotationForest with MIT License
def random_feature_subsets(array, batch_size, random_state=1234):
    """ Generate K subsets of the features in X """
    random_state = check_random_state(random_state)
    features = range(array.shape[1])
    random_state.shuffle(features)
    for batch in gen_batches(len(features), batch_size):
        yield features[batch] 
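For illustration, a hypothetical call on an 8-column array with batch_size=3 yields three disjoint feature subsets, two of size 3 and one of size 2, in shuffled order:

import numpy as np

X = np.zeros((10, 8))                      # only the shape matters here
for subset in random_feature_subsets(X, batch_size=3):
    print(subset)                          # e.g. [5, 0, 3], [7, 2, 1], [6, 4]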
Example #5
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_minmax_scaler_partial_fit():
    # Check that partial_fit run over many batches of various sizes
    # gives the same results as a single fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test attributes at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test attributes after one step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test attributes until the end of the partial fits
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_) 
Example #6
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_standard_scaler_partial_fit():
    # Check that partial_fit run over many batches of various sizes
    # gives the same results as a single fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # both None (with_std=False)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(X.shape[1], dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(X.shape[1], dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constant features

        # Test std until the end of the partial fits
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_) 
Example #7
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_maxabs_scaler_partial_fit():
    # Check that partial_fit run over many batches of various sizes
    # gives the same results as a single fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test attributes at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test attributes after one step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test attributes until the end of the partial fits
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_) 
Example #8
Source File: recsys.py    From modl with BSD 2-Clause "Simplified" License
def fit(self, X, y=None):
        """Learns a dictionary from sparse matrix X

        Parameters
        ----------
        X: csr-matrix (n_samples, n_features)
            Dataset to learn the dictionary from

        """
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        X = check_array(X, accept_sparse='csr',
                        dtype=[np.float32, np.float64], copy=True)
        dtype = X.dtype
        n_samples, n_features = X.shape

        self.random_state = check_random_state(self.random_state)

        if self.detrend:
            self.row_mean_, self.col_mean_ = compute_biases(X,
                                                            beta=self.beta,
                                                            inplace=False)
            for i in range(X.shape[0]):
                X.data[X.indptr[i]:X.indptr[i + 1]] -= self.row_mean_[i]
            X.data -= self.col_mean_.take(X.indices, mode='clip')

        self.components_ = self.random_state.randn(self.n_components,
                                                      n_features).astype(dtype)
        S = np.sqrt(np.sum(self.components_ ** 2, axis=1))
        self.components_ /= S[:, np.newaxis]
        self.code_ = np.zeros((n_samples, self.n_components), dtype=dtype)
        self._refit(X)

        self.feature_freq_ = np.bincount(X.indices) / n_samples
        self.feature_n_iter_ = np.zeros(n_features, dtype=int)

        sparsity = X.nnz / n_samples / n_features
        if self.batch_size is None:
            batch_size = int(ceil(1. / sparsity))
        else:
            batch_size = self.batch_size

        self.comp_norm_ = np.zeros(self.n_components, dtype=dtype)
        self.C_ = np.zeros((self.n_components, self.n_components), dtype=dtype)
        self.B_ = np.zeros((self.n_components, n_features), dtype=dtype)

        self.n_iter_ = 0

        if self.verbose:
            log_lim = log(n_samples * self.n_epochs / batch_size, 10)
            self.verbose_iter_ = (np.logspace(0, log_lim, self.verbose,
                                              base=10) - 1) * batch_size
            self.verbose_iter_ = self.verbose_iter_.tolist()

        for i in range(self.n_epochs):
            permutation = self.random_state.permutation(n_samples)
            batches = gen_batches(n_samples, batch_size)
            for batch in batches:
                self._single_batch_fit(X, permutation[batch])
        self._refit(X)
        return self 
Example #9
Source File: dict_fact.py    From modl with BSD 2-Clause "Simplified" License
def transform(self, X):
        """
        Compute the codes associated with the input matrix X, decomposing it
        onto the dictionary

        Parameters
        ----------
        X: ndarray, shape = (n_samples, n_features)

        Returns
        -------
        code: ndarray, shape = (n_samples, n_components)
        """
        check_is_fitted(self, 'components_')

        dtype = self.components_.dtype
        X = check_array(X, order='C', dtype=dtype.type)
        if not X.flags['WRITEABLE']:
            X = X.copy()
        n_samples, n_features = X.shape
        if not hasattr(self, 'G_agg') or self.G_agg != 'full':
            G = self.components_.dot(self.components_.T)
        else:
            G = self.G_
        Dx = X.dot(self.components_.T)
        code = np.ones((n_samples, self.n_components), dtype=dtype)
        sample_indices = np.arange(n_samples)
        size_job = ceil(n_samples / self.n_threads)
        batches = list(gen_batches(n_samples, size_job))

        par_func = lambda batch: _enet_regression_single_gram(
            G, Dx[batch], X[batch], code,
            get_sub_slice(sample_indices, batch),
            self.code_l1_ratio, self.code_alpha, self.code_pos,
            self.tol, self.max_iter)
        if self.n_threads > 1:
            res = self._pool.map(par_func, batches)
            _ = list(res)
        else:
            _enet_regression_single_gram(
                G, Dx, X, code,
                sample_indices,
                self.code_l1_ratio, self.code_alpha, self.code_pos,
                self.tol, self.max_iter)

        return code 
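gen_batches also works well for splitting a single array across worker threads, as transform does above with self._pool. A minimal sketch of the same idea using only the standard library; the per-batch job here is a stand-in, not modl's _enet_regression_single_gram:

from concurrent.futures import ThreadPoolExecutor
from math import ceil

import numpy as np
from sklearn.utils import gen_batches

def job(chunk):                        # hypothetical per-batch work
    return chunk.sum()

X = np.arange(10.0)
n_threads = 3
size_job = ceil(len(X) / n_threads)    # rows handled by each thread
batches = list(gen_batches(len(X), size_job))

with ThreadPoolExecutor(n_threads) as pool:
    results = list(pool.map(lambda b: job(X[b]), batches))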
Example #10
Source File: incremental_pca.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def _fit(self, X, y=None):
        """Fit the model with X, using minibatches of size batch_size.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        y : Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        self.components_ = None
        self.n_samples_seen_ = 0
        self.mean_ = 0.0
        self.var_ = 0.0
        self.squared_sum_ = 0.0
        self.sum_ = 0.0
        self.singular_values_ = None
        self.explained_variance_ = None
        self.explained_variance_ratio_ = None
        self.noise_variance_ = None

        X = check_array(
            X,
            accept_sparse=["csr", "csc", "lil"],
            copy=self.copy,
            dtype=[np.float64, np.float32],
            accept_multiple_blocks=True,
        )
        n_samples, n_features = X.shape

        if self.batch_size is None:
            self.batch_size_ = 5 * n_features
        else:
            self.batch_size_ = self.batch_size

        for batch in gen_batches(
            n_samples, self.batch_size_, min_batch_size=self.n_components or 0
        ):
            X_batch = X[batch]
            if sparse.issparse(X_batch):
                X_batch = X_batch.toarray()
            self.partial_fit(X_batch, check_input=False)

        return self 
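Note the min_batch_size argument: when the trailing batch would be smaller than min_batch_size, gen_batches merges it into the preceding batch, so that no batch here ends up with fewer rows than n_components. For example:

from sklearn.utils import gen_batches

print(list(gen_batches(10, 4)))
# [slice(0, 4, None), slice(4, 8, None), slice(8, 10, None)]
print(list(gen_batches(10, 4, min_batch_size=3)))
# [slice(0, 4, None), slice(4, 10, None)]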
Example #11
Source File: test_data.py    From twitter-stock-recommendation with MIT License
def test_minmax_scaler_partial_fit():
    # Check that partial_fit run over many batches of various sizes
    # gives the same results as a single fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test attributes at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test attributes after one step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test attributes until the end of the partial fits
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_) 
Example #12
Source File: test_data.py    From twitter-stock-recommendation with MIT License
def test_standard_scaler_partial_fit():
    # Check that partial_fit run over many batches of various sizes
    # gives the same results as a single fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # both None (with_std=False)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(X.shape[1], dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(X.shape[1], dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constant features

        # Test std until the end of the partial fits
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_) 
Example #13
Source File: test_data.py    From twitter-stock-recommendation with MIT License
def test_maxabs_scaler_partial_fit():
    # Check that partial_fit run over many batches of various sizes
    # gives the same results as a single fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test attributes at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test attributes after one step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test attributes until the end of the partial fits
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)