Python sklearn.utils.extmath.row_norms() Examples
The following are 17 code examples of sklearn.utils.extmath.row_norms(), collected from open-source projects; the source file, project, and license are noted above each example.
You may also want to check out the other available functions and classes of the sklearn.utils.extmath module.
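Before the project examples, here is a minimal usage sketch (not taken from any of the projects below) of what row_norms computes: the Euclidean norm of each row, optionally squared, for dense arrays and sparse CSR matrices alike.

import numpy as np
from scipy import sparse
from sklearn.utils.extmath import row_norms

X = np.array([[3.0, 4.0],
              [0.0, 2.0]])

row_norms(X)                  # array([5., 2.])  per-row Euclidean norms
row_norms(X, squared=True)    # array([25., 4.]) squared norms, sqrt skipped

# The same call accepts sparse input without densifying it.
row_norms(sparse.csr_matrix(X), squared=True)   # array([25., 4.])

The squared form is the one most of the examples below rely on: k-means label assignment and SAG step-size estimation both work with squared row norms.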
Example #1
Source File: test_extmath.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)
    for dtype in (np.float32, np.float64):
        if dtype is np.float32:
            precision = 4
        else:
            precision = 5

        X = X.astype(dtype)
        sq_norm = (X ** 2).sum(axis=1)

        assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
                                  precision)
Example #2
Source File: equal_groups.py From Same-Size-K-Means with BSD 3-Clause "New" or "Revised" License | 6 votes |
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
Example #3
Source File: equal_groups.py From Same-Size-K-Means with BSD 3-Clause "New" or "Revised" License | 6 votes |
def score(self, X, y=None):
    """Opposite of the value of X on the K-means objective.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data.

    Returns
    -------
    score : float
        Opposite of the value of X on the K-means objective.
    """
    check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)
    x_squared_norms = row_norms(X, squared=True)
    return -_labels_inertia(X, x_squared_norms, self.cluster_centers_)[1]
Example #4
Source File: _k_means_0_22.py From daal4py with Apache License 2.0 | 5 votes |
def predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = self._check_test_data(X)

    daal_ready = sample_weight is None and hasattr(X, '__array__')  # or sp.isspmatrix_csr(X)

    if daal_ready:
        logging.info("sklearn.cluster.KMeans.predict: " + method_uses_daal)
        return _daal4py_k_means_predict(
            X, self.n_clusters, self.cluster_centers_)[0]
    else:
        logging.info("sklearn.cluster.KMeans.predict: " + method_uses_sklearn)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, sample_weight, x_squared_norms,
                               self.cluster_centers_)[0]
Example #5
Source File: test_k_means.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = - np.ones(n_samples, dtype=np.int)
    mindist = np.empty(n_samples)
    mindist.fill(np.infty)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
Example #6
Source File: utils.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def row_norms(X, squared=False):
    if isinstance(X, np.ndarray):
        return skm.row_norms(X, squared=squared)
    return X.map_blocks(
        skm.row_norms, chunks=(X.chunks[0],), drop_axis=1, squared=squared
    )
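As a usage note (not part of the dask-ml source), the wrapper above sends plain NumPy arrays straight to scikit-learn and maps skm.row_norms over the row-blocks of a dask array, returning a lazy one-dimensional result. A hypothetical sketch, assuming dask is installed and the wrapper (with its skm alias for sklearn.utils.extmath) is in scope:

import dask.array as da
import numpy as np

X_np = np.random.rand(8, 3)
X_da = da.from_array(X_np, chunks=(4, 3))   # two row-blocks of four rows each

norms = row_norms(X_da, squared=True)       # lazy dask array of shape (8,)
norms.compute()                             # matches (X_np ** 2).sum(axis=1)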
Example #7
Source File: _k_means_0_21.py From daal4py with Apache License 2.0 | 5 votes |
def predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)

    daal_ready = sample_weight is None and hasattr(X, '__array__')  # or sp.isspmatrix_csr(X)

    if daal_ready:
        logging.info("sklearn.cluster.KMeans.predict: " + method_uses_daal)
        return _daal4py_k_means_predict(
            X, self.n_clusters, self.cluster_centers_)[0]
    else:
        logging.info("sklearn.cluster.KMeans.predict: " + method_uses_sklearn)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, sample_weight, x_squared_norms,
                               self.cluster_centers_)[0]
Example #8
Source File: test_extmath.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_row_norms(dtype):
    X = np.random.RandomState(42).randn(100, 100)
    if dtype is np.float32:
        precision = 4
    else:
        precision = 5

    X = X.astype(dtype, copy=False)
    sq_norm = (X ** 2).sum(axis=1)

    assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                              precision)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

    for csr_index_dtype in [np.int32, np.int64]:
        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        # csr_matrix will use int32 indices by default,
        # up-casting those to int64 when necessary
        if csr_index_dtype is np.int64:
            Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype, copy=False)
            Xcsr.indices = Xcsr.indices.astype(csr_index_dtype, copy=False)
        assert Xcsr.indices.dtype == csr_index_dtype
        assert Xcsr.indptr.dtype == csr_index_dtype
        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
                                  precision)
Example #9
Source File: _k_means_0_23.py From daal4py with Apache License 2.0 | 5 votes |
def predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = self._check_test_data(X)

    daal_ready = sample_weight is None and hasattr(X, '__array__')  # or sp.isspmatrix_csr(X)

    if daal_ready:
        logging.info("sklearn.cluster.KMeans.predict: " + method_uses_daal)
        return _daal4py_k_means_predict(
            X, self.n_clusters, self.cluster_centers_)[0]
    else:
        logging.info("sklearn.cluster.KMeans.predict: " + method_uses_sklearn)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, sample_weight, x_squared_norms,
                               self.cluster_centers_)[0]
Example #10
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = np.full(n_samples, -1, dtype=np.int)
    mindist = np.empty(n_samples)
    mindist.fill(np.infty)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert (mindist >= 0.0).all()
    assert (labels_gold != -1).all()

    sample_weight = None

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, sample_weight, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, sample_weight, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
Example #11
Source File: _coordinate_descent_0_21.py From daal4py with Apache License 2.0 | 4 votes |
def _daal4py_check(self, X, y, check_input):
    _fptype = getFPType(X)

    # check alpha
    if self.alpha == 0:
        warnings.warn("With alpha=0, this algorithm does not converge "
                      "well. You are advised to use the LinearRegression "
                      "estimator", stacklevel=2)

    # check precompute
    if isinstance(self.precompute, np.ndarray):
        if check_input:
            check_array(self.precompute, dtype=_fptype)
        self.precompute = make2d(self.precompute)

        # only for compliance with Sklearn
        if self.fit_intercept:
            X_offset = np.average(X, axis=0, weights=None)
            if self.normalize:
                X_scale = row_norms(X)
                if np.isscalar(X_scale):
                    if X_scale == .0:
                        X_scale = 1.
                elif isinstance(X_scale, np.ndarray):
                    X_scale[X_scale == 0.0] = 1.0
            else:
                X_scale = np.ones(X.shape[1], dtype=_fptype)
        else:
            X_offset = np.zeros(X.shape[1], dtype=_fptype)
            X_scale = np.ones(X.shape[1], dtype=_fptype)

        if (self.fit_intercept and
                not np.allclose(X_offset, np.zeros(X.shape[1])) or
                self.normalize and
                not np.allclose(X_scale, np.ones(X.shape[1]))):
            warnings.warn("Gram matrix was provided but X was centered"
                          " to fit intercept, "
                          "or X was normalized : recomputing Gram matrix.",
                          UserWarning)
    else:
        if self.precompute not in [False, True, 'auto']:
            raise ValueError("precompute should be one of True, False, "
                             "'auto' or array-like. Got %r" % self.precompute)

    # check selection
    if self.selection not in ['random', 'cyclic']:
        raise ValueError("selection should be either random or cyclic.")
Example #12
Source File: dis_sim.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License | 4 votes |
def fit(self, neigh_dist: np.ndarray, neigh_ind: np.ndarray, X: np.ndarray,
        assume_sorted: bool = True, *args, **kwargs) -> DisSimLocal:
    """ Fit the model using X, neigh_dist, and neigh_ind as training data.

    Parameters
    ----------
    neigh_dist: np.ndarray, shape (n_samples, n_neighbors)
        Distance matrix of training objects (rows) against their
        individual k nearest neighbors (columns).

    neigh_ind: np.ndarray, shape (n_samples, n_neighbors)
        Neighbor indices corresponding to the values in neigh_dist.

    X: np.ndarray, shape (n_samples, n_features)
        Training data, where n_samples is the number of vectors,
        and n_features their dimensionality (number of features).

    assume_sorted: bool, default = True
        Assume input matrices are sorted according to neigh_dist.
        If False, these are sorted here.
    """
    # Check equal number of rows and columns
    check_consistent_length(neigh_ind, neigh_dist)
    check_consistent_length(neigh_ind.T, neigh_dist.T)
    X = check_array(X)
    try:
        if self.k <= 0:
            raise ValueError(f"Expected k > 0. Got {self.k}")
    except TypeError:
        raise TypeError(f'Expected k: int > 0. Got {self.k}')

    k = self.k
    if k > neigh_ind.shape[1]:
        warnings.warn(f'Neighborhood parameter k larger than provided neighbors in neigh_dist, neigh_ind. '
                      f'Will reduce to k={neigh_ind.shape[1]}.')
        k = neigh_ind.shape[1]

    # Calculate local neighborhood centroids among the training points
    if assume_sorted:
        knn = neigh_ind[:, :k]
    else:
        mask = np.argpartition(neigh_dist, kth=k-1)[:, :k]
        knn = np.take_along_axis(neigh_ind, mask, axis=1)
    centroids = X[knn].mean(axis=1)
    dist_to_cent = row_norms(X - centroids, squared=True)

    self.X_train_ = X
    self.X_train_centroids_ = centroids
    self.X_train_dist_to_centroids_ = dist_to_cent

    return self
Example #13
Source File: _coordinate_descent_0_23.py From daal4py with Apache License 2.0 | 4 votes |
def _daal4py_check(self, X, y, check_input):
    _fptype = getFPType(X)

    # check alpha
    if self.alpha == 0:
        warnings.warn("With alpha=0, this algorithm does not converge "
                      "well. You are advised to use the LinearRegression "
                      "estimator", stacklevel=2)

    # check precompute
    if isinstance(self.precompute, np.ndarray):
        if check_input:
            check_array(self.precompute, dtype=_fptype)
        self.precompute = make2d(self.precompute)

        # only for compliance with Sklearn
        if self.fit_intercept:
            X_offset = np.average(X, axis=0, weights=None)
            if self.normalize:
                X_scale = row_norms(X)
                if np.isscalar(X_scale):
                    if X_scale == .0:
                        X_scale = 1.
                elif isinstance(X_scale, np.ndarray):
                    X_scale[X_scale == 0.0] = 1.0
            else:
                X_scale = np.ones(X.shape[1], dtype=_fptype)
        else:
            X_offset = np.zeros(X.shape[1], dtype=_fptype)
            X_scale = np.ones(X.shape[1], dtype=_fptype)

        if (self.fit_intercept and
                not np.allclose(X_offset, np.zeros(X.shape[1])) or
                self.normalize and
                not np.allclose(X_scale, np.ones(X.shape[1]))):
            warnings.warn("Gram matrix was provided but X was centered"
                          " to fit intercept, "
                          "or X was normalized : recomputing Gram matrix.",
                          UserWarning)
    else:
        if self.precompute not in [False, True, 'auto']:
            raise ValueError("precompute should be one of True, False, "
                             "'auto' or array-like. Got %r" % self.precompute)

    # check selection
    if self.selection not in ['random', 'cyclic']:
        raise ValueError("selection should be either random or cyclic.")
Example #14
Source File: factorization_machine.py From polylearn with BSD 2-Clause "Simplified" License | 4 votes |
def fit(self, X, y):
    """Fit factorization machine to training data.

    Parameters
    ----------
    X : array-like or sparse, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : Estimator
        Returns self.
    """
    if self.degree > 3:
        raise ValueError("FMs with degree >3 not yet supported.")

    X, y = self._check_X_y(X, y)
    X = self._augment(X)
    n_features = X.shape[1]  # augmented
    X_col_norms = row_norms(X.T, squared=True)
    dataset = get_dataset(X, order="fortran")
    rng = check_random_state(self.random_state)
    loss_obj = self._get_loss(self.loss)

    if not (self.warm_start and hasattr(self, 'w_')):
        self.w_ = np.zeros(n_features, dtype=np.double)

    if self.fit_lower == 'explicit':
        n_orders = self.degree - 1
    else:
        n_orders = 1

    if not (self.warm_start and hasattr(self, 'P_')):
        self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features)

    if not (self.warm_start and hasattr(self, 'lams_')):
        if self.init_lambdas == 'ones':
            self.lams_ = np.ones(self.n_components)
        elif self.init_lambdas == 'random_signs':
            self.lams_ = np.sign(rng.randn(self.n_components))
        else:
            raise ValueError("Lambdas must be initialized as ones "
                             "(init_lambdas='ones') or as random "
                             "+/- 1 (init_lambdas='random_signs').")

    y_pred = self._get_output(X)

    converged, self.n_iter_ = _cd_direct_ho(
        self.P_, self.w_, dataset, X_col_norms, y, y_pred,
        self.lams_, self.degree, self.alpha, self.beta, self.fit_linear,
        self.fit_lower == 'explicit', loss_obj, self.max_iter,
        self.tol, self.verbose)

    if not converged:
        warnings.warn("Objective did not converge. Increase max_iter.")

    return self
Example #15
Source File: test_algebra_onnx_operators.py From sklearn-onnx with MIT License | 4 votes |
def test_sub_kmeans(self):

    def conv(scope, operator, container):
        X = operator.inputs[0]
        out = operator.outputs
        op = operator.raw_operator

        C = op.cluster_centers_
        C2 = row_norms(C, squared=True).astype(container.dtype)
        C = C.astype(container.dtype)

        rs = OnnxReduceSumSquare(
            X, axes=[1], keepdims=1,
            op_version=container.target_opset)

        N = X.type.shape[0]
        if isinstance(N, int):
            zeros = np.zeros((N, ))
        else:
            zeros = OnnxMul(
                rs, np.array([0], dtype=np.float32),
                op_version=container.target_opset)

        z = OnnxAdd(
            rs,
            OnnxGemm(
                X, C, zeros, alpha=-2., transB=1,
                op_version=container.target_opset),
            op_version=container.target_opset)

        y2 = OnnxAdd(C2, z, op_version=container.target_opset)
        lo = OnnxArgMin(
            y2, axis=1, keepdims=0, output_names=out[:1],
            op_version=container.target_opset)
        y2s = OnnxSqrt(
            y2, output_names=out[1:],
            op_version=container.target_opset)

        lo.add_to(scope, container)
        y2s.add_to(scope, container)

    data = load_iris()
    X = data.data
    model = KMeans(n_clusters=3)
    model.fit(X)
    model_onnx = convert_sklearn(
        model, 'a-kmeans',
        [('input', FloatTensorType([None, X.shape[1]]))],
        custom_conversion_functions={KMeans: conv},
        target_opset=TARGET_OPSET)

    dump_data_and_model(X.astype(np.float32)[40:60], model, model_onnx,
                        basename="SklearnKMeansCustom-Dec4")
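As an editorial note (not part of the original test), the custom converter above is simply the expanded squared-distance formula written with ONNX operators:

    ||x - c||^2 = ||x||^2 - 2 * (x . c) + ||c||^2

OnnxReduceSumSquare contributes ||x||^2 per sample, OnnxGemm with alpha=-2. and transB=1 contributes the -2 * (x . c) cross term against every center, and C2 = row_norms(C, squared=True) contributes ||c||^2 for each center. OnnxArgMin over the resulting matrix picks the nearest center, mirroring KMeans.predict, while OnnxSqrt exposes the distances themselves.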
Example #16
Source File: test_sag.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    n_samples = X.shape[0]
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for saga in [True, False]:
        for fit_intercept in (True, False):
            if saga:
                L_sqr = (max_squared_sum + alpha + int(fit_intercept))
                L_log = (max_squared_sum + 4.0 * alpha +
                         int(fit_intercept)) / 4.0
                mun_sqr = min(2 * n_samples * alpha, L_sqr)
                mun_log = min(2 * n_samples * alpha, L_log)
                step_size_sqr = 1 / (2 * L_sqr + mun_sqr)
                step_size_log = 1 / (2 * L_log + mun_log)
            else:
                step_size_sqr = 1.0 / (max_squared_sum +
                                       alpha + int(fit_intercept))
                step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                                       int(fit_intercept))

            step_size_sqr_ = get_auto_step_size(max_squared_sum_, alpha,
                                                "squared", fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)
            step_size_log_ = get_auto_step_size(max_squared_sum_, alpha,
                                                "log", fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)

            assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
            assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)
Example #17
Source File: test_sag.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    n_samples = X.shape[0]
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for saga in [True, False]:
        for fit_intercept in (True, False):
            if saga:
                L_sqr = (max_squared_sum + alpha + int(fit_intercept))
                L_log = (max_squared_sum + 4.0 * alpha +
                         int(fit_intercept)) / 4.0
                mun_sqr = min(2 * n_samples * alpha, L_sqr)
                mun_log = min(2 * n_samples * alpha, L_log)
                step_size_sqr = 1 / (2 * L_sqr + mun_sqr)
                step_size_log = 1 / (2 * L_log + mun_log)
            else:
                step_size_sqr = 1.0 / (max_squared_sum +
                                       alpha + int(fit_intercept))
                step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                                       int(fit_intercept))

            step_size_sqr_ = get_auto_step_size(max_squared_sum_, alpha,
                                                "squared", fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)
            step_size_log_ = get_auto_step_size(max_squared_sum_, alpha,
                                                "log", fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)

            assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
            assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)