Python sklearn.utils.gen_batches() Examples
The following are 13 code examples of sklearn.utils.gen_batches().
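gen_batches(n, batch_size) yields slice objects that partition range(n) into contiguous chunks, with a shorter final slice when batch_size does not divide n; every example below uses these slices to index rows, as in X[batch]. A minimal demonstration, matching the function's documented behavior:

    >>> from sklearn.utils import gen_batches
    >>> list(gen_batches(7, 3))
    [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]

The recurring pattern in the examples is to drive an estimator's partial_fit one chunk at a time. A minimal sketch with StandardScaler; the data and variable names are illustrative only:

    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.utils import gen_batches

    X = np.random.RandomState(0).rand(1000, 20)  # toy data
    scaler = StandardScaler()
    for batch in gen_batches(X.shape[0], 100):   # batch is a slice object
        scaler.partial_fit(X[batch])             # incremental, chunk-by-chunk fit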
Example #1
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_standard_scaler_trasform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.finfo(float).eps
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # as less or equal
        assert_array_less(zero, scaler_incr.scale_ + epsilon)
        # (i+1) because the Scaler has been already fitted
        assert_equal((i + 1), scaler_incr.n_samples_seen_)
Example #2
Source File: dict_fact.py From modl with BSD 2-Clause "Simplified" License
def partial_fit(self, X, sample_indices=None):
    """Update the factorization using rows from X.

    Parameters
    ----------
    X: ndarray, shape (n_samples, n_features)
        Input data

    sample_indices:
        Indices for each row of X. If None, consider that row i index is i
        (useful when providing the whole data to the function)

    Returns
    -------
    self
    """
    X = check_array(X, dtype=[np.float32, np.float64], order='C')
    n_samples, n_features = X.shape
    batches = gen_batches(n_samples, self.batch_size)
    for batch in batches:
        this_X = X[batch]
        these_sample_indices = get_sub_slice(sample_indices, batch)
        self._single_batch_fit(this_X, these_sample_indices)
    return self
Example #3
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_standard_scaler_trasform_with_partial_fit():
    # Check some postconditions after applying partial_fit and transform
    X = X_2d[:100, :]

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
        X_sofar = X[:(i + 1), :]
        chunks_copy = X_sofar.copy()
        scaled_batch = StandardScaler().fit_transform(X_sofar)

        scaler_incr = scaler_incr.partial_fit(X[batch])
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        zero = np.zeros(X.shape[1])
        epsilon = np.nextafter(0, 1)
        assert_array_less(zero, scaler_incr.var_ + epsilon)  # as less or equal
        assert_array_less(zero, scaler_incr.scale_ + epsilon)
        # (i+1) because the Scaler has been already fitted
        assert_equal((i + 1), scaler_incr.n_samples_seen_)
Example #4
Source File: rotation_forest.py From RotationForest with MIT License
def random_feature_subsets(array, batch_size, random_state=1234):
    """Generate K subsets of the features in X."""
    random_state = check_random_state(random_state)
    # list() is required: a bare range object cannot be shuffled
    # in place on Python 3
    features = list(range(array.shape[1]))
    random_state.shuffle(features)
    for batch in gen_batches(len(features), batch_size):
        yield features[batch]
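A hypothetical call of the generator above, assuming a 2-D NumPy array and that check_random_state is imported from sklearn.utils as in the source module; the array and sizes are illustrative only:

    import numpy as np

    X = np.zeros((5, 6))  # 5 samples, 6 features (toy data)
    for subset in random_feature_subsets(X, batch_size=2, random_state=1234):
        # each subset is a list of 2 shuffled feature indices, e.g. [3, 0]
        print(subset)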
Example #5
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_minmax_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        # n_samples is a module-level constant in the original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std until the end of partial fits
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
Example #6
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_standard_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        # n_samples and n_features are module-level constants in the
        # original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # both None here
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(n_features, dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(n_features, dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constants

        # Test std until the end of partial fits
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
Example #7
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_maxabs_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std until the end of partial fits
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
Example #8
Source File: recsys.py From modl with BSD 2-Clause "Simplified" License
def fit(self, X, y=None):
    """Learn a dictionary from sparse matrix X.

    Parameters
    ----------
    X: csr-matrix (n_samples, n_features)
        Dataset to learn the dictionary from
    """
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    X = check_array(X, accept_sparse='csr',
                    dtype=[np.float32, np.float64], copy=True)
    dtype = X.dtype
    n_samples, n_features = X.shape

    self.random_state = check_random_state(self.random_state)

    if self.detrend:
        self.row_mean_, self.col_mean_ = compute_biases(X,
                                                        beta=self.beta,
                                                        inplace=False)
        for i in range(X.shape[0]):
            X.data[X.indptr[i]:X.indptr[i + 1]] -= self.row_mean_[i]
        X.data -= self.col_mean_.take(X.indices, mode='clip')

    # Random init of the dictionary, with unit-norm rows
    self.components_ = self.random_state.randn(self.n_components,
                                               n_features).astype(dtype)
    S = np.sqrt(np.sum(self.components_ ** 2, axis=1))
    self.components_ /= S[:, np.newaxis]
    self.code_ = np.zeros((n_samples, self.n_components), dtype=dtype)

    self._refit(X)

    self.feature_freq_ = np.bincount(X.indices) / n_samples
    self.feature_n_iter_ = np.zeros(n_features, dtype=int)

    # Default batch size scales with the inverse of the data sparsity
    sparsity = X.nnz / n_samples / n_features
    if self.batch_size is None:
        batch_size = int(ceil(1. / sparsity))
    else:
        batch_size = self.batch_size

    self.comp_norm_ = np.zeros(self.n_components, dtype=dtype)
    self.C_ = np.zeros((self.n_components, self.n_components), dtype=dtype)
    self.B_ = np.zeros((self.n_components, n_features), dtype=dtype)

    self.n_iter_ = 0

    if self.verbose:
        log_lim = log(n_samples * self.n_epochs / batch_size, 10)
        self.verbose_iter_ = (np.logspace(0, log_lim, self.verbose,
                                          base=10) - 1) * batch_size
        self.verbose_iter_ = self.verbose_iter_.tolist()

    for i in range(self.n_epochs):
        permutation = self.random_state.permutation(n_samples)
        batches = gen_batches(n_samples, batch_size)
        for batch in batches:
            self._single_batch_fit(X, permutation[batch])
        self._refit(X)
    return self
Example #9
Source File: dict_fact.py From modl with BSD 2-Clause "Simplified" License
def transform(self, X):
    """Compute the codes associated to input matrix X, decomposing it
    onto the dictionary.

    Parameters
    ----------
    X: ndarray, shape = (n_samples, n_features)

    Returns
    -------
    code: ndarray, shape = (n_samples, n_components)
    """
    check_is_fitted(self, 'components_')
    dtype = self.components_.dtype
    X = check_array(X, order='C', dtype=dtype.type)
    if X.flags['WRITEABLE'] is False:
        X = X.copy()
    n_samples, n_features = X.shape
    if not hasattr(self, 'G_agg') or self.G_agg != 'full':
        G = self.components_.dot(self.components_.T)
    else:
        G = self.G_
    Dx = X.dot(self.components_.T)
    code = np.ones((n_samples, self.n_components), dtype=dtype)
    sample_indices = np.arange(n_samples)

    # Split the samples evenly across threads
    size_job = ceil(n_samples / self.n_threads)
    batches = list(gen_batches(n_samples, size_job))

    par_func = lambda batch: _enet_regression_single_gram(
        G, Dx[batch], X[batch], code,
        get_sub_slice(sample_indices, batch),
        self.code_l1_ratio, self.code_alpha, self.code_pos,
        self.tol, self.max_iter)

    if self.n_threads > 1:
        res = self._pool.map(par_func, batches)
        _ = list(res)
    else:
        _enet_regression_single_gram(
            G, Dx, X, code, sample_indices,
            self.code_l1_ratio, self.code_alpha, self.code_pos,
            self.tol, self.max_iter)
    return code
Example #10
Source File: incremental_pca.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def _fit(self, X, y=None):
    """Fit the model with X, using minibatches of size batch_size.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    y : Ignored

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    # Reset all fitted state before the incremental passes
    self.components_ = None
    self.n_samples_seen_ = 0
    self.mean_ = 0.0
    self.var_ = 0.0
    self.squared_sum_ = 0.0
    self.sum_ = 0.0
    self.singular_values_ = None
    self.explained_variance_ = None
    self.explained_variance_ratio_ = None
    self.noise_variance_ = None

    X = check_array(
        X,
        accept_sparse=["csr", "csc", "lil"],
        copy=self.copy,
        dtype=[np.float64, np.float32],
        accept_multiple_blocks=True,
    )
    n_samples, n_features = X.shape

    if self.batch_size is None:
        self.batch_size_ = 5 * n_features
    else:
        self.batch_size_ = self.batch_size

    for batch in gen_batches(
        n_samples, self.batch_size_, min_batch_size=self.n_components or 0
    ):
        X_batch = X[batch]
        if sparse.issparse(X_batch):
            X_batch = X_batch.toarray()
        self.partial_fit(X_batch, check_input=False)

    return self
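This is the only example that passes min_batch_size to gen_batches: when the trailing remainder would be smaller than the minimum, it is folded into the preceding batch instead of being yielded on its own, which here guarantees every chunk fed to partial_fit has at least n_components rows. The keyword's documented behavior, for comparison with the plain call shown in the introduction:

    >>> from sklearn.utils import gen_batches
    >>> list(gen_batches(7, 3, min_batch_size=2))
    [slice(0, 3, None), slice(3, 7, None)]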
Example #11
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_minmax_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        # n_samples is a module-level constant in the original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.data_min_,
                                  scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_,
                                  scaler_incr.data_max_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.data_range_,
                                  scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

        # Test std until the end of partial fits
        scaler_batch = MinMaxScaler().fit(X)
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)
Example #12
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_standard_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        # n_samples and n_features are module-level constants in the
        # original test file
        for batch in gen_batches(n_samples, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert_equal(scaler_batch.var_, scaler_incr.var_)  # both None here
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            assert_array_almost_equal(np.zeros(n_features, dtype=np.float64),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.ones(n_features, dtype=np.float64),
                                      scaler_incr.scale_)
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0),
                                      scaler_incr.var_)
            assert_array_almost_equal(np.std(X[batch0], axis=0),
                                      scaler_incr.scale_)  # no constants

        # Test std until the end of partial fits
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
Example #13
Source File: test_data.py From twitter-stock-recommendation with MIT License
def test_maxabs_scaler_partial_fit():
    # Test if partial_fit run over many batches of size 1 and 50
    # gives the same results as fit
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = sparse.csr_matrix(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            X_csc = sparse.csc_matrix(X[batch])
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_,
                                  scaler_incr_csc.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csr.n_samples_seen_)
        assert_equal(scaler_batch.n_samples_seen_,
                     scaler_incr_csc.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X),
                                  scaler_incr.transform(X))

        # Test std until the end of partial fits
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(i, batch_start=batch.start,
                                batch_stop=batch.stop, n=n,
                                chunk_size=chunk_size,
                                n_samples_seen=scaler_incr.n_samples_seen_)