Python Examples of sklearn.metrics.euclidean_distances

Source File: _template.py From project-template with BSD 3-Clause "New" or "Revised" License

6 votes

def predict(self, X):
        """Predict a label for every sample from its nearest stored sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            For each sample, the label of the closest sample seen during
            fit (closest in Euclidean distance).
        """
        # Check that fit has been called (``X_`` and ``y_`` must exist).
        check_is_fitted(self, ['X_', 'y_'])

        # Validate the query samples.
        X = check_array(X)

        # One row per query sample, one column per stored sample; the
        # column index of the row minimum selects the label to return.
        distances = euclidean_distances(X, self.X_)
        nearest = np.argmin(distances, axis=1)
        return self.y_[nearest]

Source File: test_affinity_propagation.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_equal_similarities_and_preferences():
    # Three points whose pairwise distances differ
    points = np.array([[0, 0], [1, 1], [-2, -2]])
    S = -euclidean_distances(points, squared=True)

    # With unequal similarities the check is False for every preference
    for preference in (np.array(0), np.array([0, 0]), np.array([0, 1])):
        assert not _equal_similarities_and_preferences(S, preference)

    # Two points only, hence equal distances
    points = np.array([[0, 0], [1, 1]])
    S = -euclidean_distances(points, squared=True)

    # Preferences that differ from each other
    assert not _equal_similarities_and_preferences(S, np.array([0, 1]))

    # Identical preferences, given as a vector and as a scalar
    for preference in (np.array([0, 0]), np.array(0)):
        assert _equal_similarities_and_preferences(S, preference)

Source File: test_estimation.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License

5 votes

def test_shuffle_equal(verbose):
    # This data set is not expected to contain equal distances, so the
    # ``shuffle_equal`` switch should leave the score unchanged.
    X, _ = make_classification(random_state=12354)
    dist = euclidean_distances(X)

    def skewness(shuffle_equal):
        # Fit on the precomputed distances and return the resulting score.
        hub = Hubness(metric='precomputed', shuffle_equal=shuffle_equal,
                      verbose=verbose)
        return hub.fit(dist).score()

    skew_shuffle = skewness(True)
    skew_no_shuffle = skewness(False)
    assert skew_no_shuffle == skew_shuffle

Source File: test_affinity_propagation.py From twitter-stock-recommendation with MIT License

5 votes

def test_affinity_propagation():
    # Similarities are the negated squared Euclidean distances of the
    # module-level data ``X``.
    similarities = -euclidean_distances(X, squared=True)
    preference = 10 * np.median(similarities)

    # Function interface
    centers, labels = affinity_propagation(similarities,
                                           preference=preference)
    n_found = len(centers)
    assert_equal(n_clusters, n_found)

    # Estimator fitted on the precomputed similarity matrix ...
    precomputed_model = AffinityPropagation(preference=preference,
                                            affinity="precomputed")
    labels_precomputed = precomputed_model.fit(similarities).labels_

    # ... and on the data itself; both must give the same labelling
    af = AffinityPropagation(preference=preference, verbose=True)
    labels = af.fit(X).labels_
    assert_array_equal(labels, labels_precomputed)

    n_found = len(af.cluster_centers_indices_)
    assert_equal(np.unique(labels).size, n_found)
    assert_equal(n_clusters, n_found)

    # Same result expected when the input is not copied
    _, labels_no_copy = affinity_propagation(similarities,
                                             preference=preference,
                                             copy=False)
    assert_array_equal(labels, labels_no_copy)

    # Input validation: non-square matrix, bad damping, unknown affinity
    assert_raises(ValueError, affinity_propagation, similarities[:, :-1])
    assert_raises(ValueError, affinity_propagation, similarities, damping=0)
    assert_raises(ValueError, AffinityPropagation(affinity="unknown").fit, X)

Source File: test_random_projection.py From twitter-stock-recommendation with MIT License

5 votes

def test_random_projection_embedding_quality():
    data, _ = make_sparse_random_data(8, 5000, 15000)
    eps = 0.2

    # Flattened squared pairwise distances in the original space
    original = euclidean_distances(data, squared=True).ravel()

    # Drop zero distances so the ratio below never divides by 0
    keep = original != 0.0
    original = original[keep]

    for RandomProjection in all_RandomProjection:
        projector = RandomProjection(n_components='auto', eps=eps,
                                     random_state=0)
        embedded = projector.fit_transform(data)

        # Same flattening and the same mask as for the original distances
        embedded_sq = euclidean_distances(embedded, squared=True)
        embedded_sq = embedded_sq.ravel()[keep]

        ratio = embedded_sq / original

        # The automatically tuned density must honour the eps contract:
        # pairwise distances are preserved according to the
        # Johnson-Lindenstrauss lemma
        assert_less(ratio.max(), 1 + eps)
        assert_less(1 - eps, ratio.min())

Source File: test_mvmds.py From mvlearn with Apache License 2.0

5 votes

def test_dissimilarity_precomputed_euclidean(data):
    # Fitting on raw views with the Euclidean option must match fitting on
    # the corresponding precomputed Euclidean distance matrices (up to sign).
    views = data['samp_views']
    distance_views = [euclidean_distances(view) for view in views]

    from_raw = MVMDS(dissimilarity='euclidean').fit_transform(views)
    from_precomputed = MVMDS(
        dissimilarity='precomputed').fit_transform(distance_views)

    np.testing.assert_almost_equal(np.abs(from_precomputed),
                                   np.abs(from_raw))

Source File: RnaseqqcReport.py From CGATPipelines with MIT License

5 votes

def __call__(self, track, slice=None):
        """Embed the samples in 2-D with MDS and attach their factors.

        Reads (transcript_id, TPM, sample_id) rows from
        ``sailfish_transcripts``, pivots them into a transcript x sample
        table of TPM values, computes Euclidean distances between samples
        and runs MDS on that precomputed matrix. The two embedding
        coordinates are then merged with the ``factors`` table.

        Parameters
        ----------
        track, slice
            Not used by this body.

        Returns
        -------
        pandas.DataFrame
            The merge of the MDS positions (columns ``MD1``, ``MD2``,
            ``sample``) with the ``factors`` rows, indexed by ``factor``.
        """
        # NOTE(review): an earlier comment here mentioned removing a WHERE
        # clause once header rows are cleaned up; this statement has none.
        statement = (
            "SELECT transcript_id, TPM, sample_id FROM sailfish_transcripts")

        # fetch data
        df = pd.DataFrame.from_dict(self.getAll(statement))

        # Keyword arguments are required here: pandas >= 2.0 no longer
        # accepts index/columns positionally in DataFrame.pivot.
        df = df.pivot(index='transcript_id', columns='sample_id',
                      values='TPM')

        # Pairwise Euclidean distances between samples (rows after the
        # transpose); these are dissimilarities, which is what MDS expects.
        dissimilarities = euclidean_distances(df.transpose())

        # run MDS
        mds = manifold.MDS(n_components=2, max_iter=3000,
                           eps=1e-9, dissimilarity="precomputed", n_jobs=1)
        mds = mds.fit(dissimilarities)
        pos = pd.DataFrame(mds.embedding_)

        pos.columns = ["MD1", "MD2"]
        pos['sample'] = df.columns

        factors_df = self.getDataFrame(
            "SELECT * FROM factors WHERE factor != 'genome'")

        merged_df = pd.merge(pos, factors_df,
                             left_on="sample", right_on="sample_id")
        return merged_df.reset_index().set_index("factor")

Source File: _mdsw.py From scikit-multilearn with BSD 2-Clause "Simplified" License

5 votes

def fit_transform(self, X, y=None, init=None):
        """
        Fit the model on X and return the embedded coordinates.

        Parameters
        ----------
        X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \
                if dissimilarity='precomputed'
            Input data.

        y : ignored
            Not used by this method.

        init : {None or ndarray, shape (n_samples,)}, optional
            If None, randomly chooses the initial configuration
            if ndarray, initialize the SMACOF algorithm with this array.

        Returns
        -------
        embedding : the ``embedding_`` attribute set by this call.

        Raises
        ------
        ValueError
            If ``dissimilarity`` is neither 'precomputed' nor 'euclidean'.
        """
        X = check_array(X)
        kind = self.dissimilarity

        # A square input that is not declared as precomputed is suspicious:
        # the caller may have passed a dissimilarity matrix by mistake.
        is_square = X.shape[0] == X.shape[1]
        if is_square and kind != "precomputed":
            warnings.warn("The MDS API has changed. ``fit`` now constructs an"
                          " dissimilarity matrix from data. To use a custom "
                          "dissimilarity matrix, set "
                          "``dissimilarity=precomputed``.")

        if kind == "precomputed":
            dissimilarities = X
        elif kind == "euclidean":
            dissimilarities = euclidean_distances(X)
        else:
            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
                             " Got %s instead" % str(self.dissimilarity))
        self.dissimilarity_matrix_ = dissimilarities

        # _smacof_w returns (embedding, stress, n_iter) because
        # return_n_iter=True is requested.
        outcome = _smacof_w(
            self.dissimilarity_matrix_, self.n_uq, self.uq_weight,
            metric=self.metric, n_components=self.n_components, init=init,
            n_init=self.n_init, n_jobs=self.n_jobs, max_iter=self.max_iter,
            verbose=self.verbose, eps=self.eps,
            random_state=self.random_state, return_n_iter=True)
        self.embedding_, self.stress_, self.n_iter_ = outcome

        return self.embedding_

Source File: test_estimation.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License

5 votes

def test_hubness_against_distance(has_self_distances):
    """Check that vector input and precomputed distances agree."""

    np.random.seed(123)
    X = np.random.rand(100, 50)
    D = euclidean_distances(X)
    verbose = 1

    # Estimator working on the precomputed distance matrix
    precomputed_hub = Hubness(k=10, metric='precomputed',
                              store_k_occurrence=True,
                              store_k_neighbors=True,
                              )
    precomputed_hub.fit(D)
    skew_d = precomputed_hub.score(has_self_distances=has_self_distances)
    neigh_d = precomputed_hub.k_neighbors
    occ_d = precomputed_hub.k_occurrence

    # Estimator computing Euclidean distances from the vectors itself
    vector_hub = Hubness(k=10, metric='euclidean',
                         store_k_neighbors=True,
                         store_k_occurrence=True,
                         verbose=verbose)
    vector_hub.fit(X)
    query = None if has_self_distances else X
    skew_v = vector_hub.score(query)
    neigh_v = vector_hub.k_neighbors
    occ_v = vector_hub.k_occurrence

    np.testing.assert_allclose(skew_d, skew_v, atol=1e-7)
    np.testing.assert_array_equal(neigh_d, neigh_v)
    np.testing.assert_array_equal(occ_d, occ_v)

Source File: test_estimation.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License

5 votes

def test_sparse_equal_dense_if_variable_hits_per_row(shuffle_equal):
    X, _ = make_classification(random_state=123)
    dist = euclidean_distances(X)

    # Entries set to a large distance in the dense matrix and to zero
    # (i.e. left unstored) in the sparse one, so rows hold different
    # numbers of stored entries.
    masked = (
        (0, slice(1, 3)),
        (slice(1, 3), 0),
        (1, slice(1, 5)),
        (slice(1, 5), 1),
    )
    for entry in masked:
        dist[entry] = 999

    sparse = dist.copy()
    for entry in masked:
        sparse[entry] = 0
    sparse = csr_matrix(sparse)

    # Score a fresh, identically configured estimator on each matrix
    skews = []
    for distances in (dist, sparse):
        hub = Hubness(metric='precomputed',
                      shuffle_equal=shuffle_equal,
                      random_state=123)
        hub.fit(distances)
        skews.append(hub.score(has_self_distances=True))
    skew_dense, skew_sparse = skews

    np.testing.assert_almost_equal(skew_dense, skew_sparse, decimal=2)

Source File: test_estimation.py From scikit-hubness with BSD 3-Clause "New" or "Revised" License

5 votes

def test_sparse_equal_dense(verbose, shuffle_equal):
    # The same distances, stored densely and as CSR, must score the same.
    X, _ = make_classification()
    dist_dense = euclidean_distances(X)
    dist_sparse = csr_matrix(dist_dense)

    # One estimator, refitted on each representation in turn
    hub = Hubness(metric='precomputed',
                  shuffle_equal=shuffle_equal,
                  verbose=verbose)
    skew_dense = hub.fit(dist_dense).score(has_self_distances=True)
    skew_sparse = hub.fit(dist_sparse).score(has_self_distances=True)

    np.testing.assert_almost_equal(skew_dense, skew_sparse)

Source File: mdsp.py From libact with BSD 2-Clause "Simplified" License

5 votes

def fit_transform(self, X, y=None, init=None):
        """
        Fit the model on X and return the embedded coordinates.

        Parameters
        ----------
        X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \
                if dissimilarity='precomputed'
            Input data.

        y : ignored
            Not used by this method.

        init : {None or ndarray, shape (n_samples,)}, optional
            If None, randomly chooses the initial configuration
            if ndarray, initialize the SMACOF algorithm with this array.

        Returns
        -------
        embedding : the ``embedding_`` attribute set by this call.

        Raises
        ------
        ValueError
            If ``dissimilarity`` is neither 'precomputed' nor 'euclidean'.
        """
        X = check_array(X)
        mode = self.dissimilarity

        # Warn when a square matrix is passed without declaring it as
        # precomputed; it may be a dissimilarity matrix passed by mistake.
        if mode != "precomputed" and X.shape[0] == X.shape[1]:
            warnings.warn("The MDS API has changed. ``fit`` now constructs an"
                          " dissimilarity matrix from data. To use a custom "
                          "dissimilarity matrix, set "
                          "``dissimilarity=precomputed``.")

        if mode not in ("precomputed", "euclidean"):
            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
                             " Got %s instead" % str(self.dissimilarity))

        # Use X as given when precomputed, otherwise derive the distances.
        self.dissimilarity_matrix_ = (
            X if mode == "precomputed" else euclidean_distances(X))

        # smacof_p returns (embedding, stress, n_iter) because
        # return_n_iter=True is requested.
        fitted = smacof_p(
            self.dissimilarity_matrix_, self.n_uq, metric=self.metric,
            n_components=self.n_components, init=init, n_init=self.n_init,
            n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
            eps=self.eps, random_state=self.random_state,
            return_n_iter=True)
        self.embedding_, self.stress_, self.n_iter_ = fitted

        return self.embedding_

Source File: word_mover_distance.py From coling2018_fake-news-challenge with Apache License 2.0

5 votes

def weighted_wmdistance(sent1_embs, sent2_embs, idfs, mean):
    """Return the negated, IDF-weighted nearest-token distance score.

    For every token of the first sentence, the smallest Euclidean distance
    to any token embedding of the second sentence is taken and multiplied
    by the token's weight. The weighted minima are summed, negated and
    divided by the total number of tokens in both sentences.

    Parameters
    ----------
    sent1_embs, sent2_embs : sequence of (token, embedding) pairs
        Embeddings must support ``reshape`` (e.g. numpy arrays).
    idfs : mapping
        Token -> weight.
    mean : float
        Weight used for tokens missing from ``idfs``.

    Returns
    -------
    float
        ``-sum / (len(sent1_embs) + len(sent2_embs))``. As in
        ``wmdistance``, an empty ``sent2_embs`` contributes
        ``sys.float_info.max`` per token of the first sentence.
    """
    wmd = 0.0
    for token1, x in sent1_embs:
        weight = idfs.get(token1, mean)
        # euclidean_distances needs 2-D input: one row per vector.
        x = x.reshape(1, -1)
        min_dist = sys.float_info.max
        for _, y in sent2_embs:
            # The result is a (1, 1) matrix; take the scalar so the
            # running minimum and the sum stay plain floats.
            score = weight * euclidean_distances(x, y.reshape(1, -1))[0, 0]
            if score < min_dist:
                min_dist = score
        wmd += min_dist
    return - float(wmd) / (len(sent1_embs) + len(sent2_embs))

Source File: word_mover_distance.py From coling2018_fake-news-challenge with Apache License 2.0

5 votes

def wmdistance(sent1_embs, sent2_embs):
    """Return the negated nearest-token distance score of two sentences.

    For every token of the first sentence, the smallest Euclidean distance
    to any token embedding of the second sentence is taken. The minima are
    summed, negated and divided by the total number of tokens in both
    sentences.

    Parameters
    ----------
    sent1_embs, sent2_embs : sequence of (token, embedding) pairs
        Embeddings must support ``reshape`` (e.g. numpy arrays).

    Returns
    -------
    float
        ``-sum / (len(sent1_embs) + len(sent2_embs))``. An empty
        ``sent2_embs`` contributes ``sys.float_info.max`` per token of the
        first sentence.
    """
    wmd = 0.0
    for _, x in sent1_embs:
        # euclidean_distances needs 2-D input; reshape once per outer token
        # instead of on every inner iteration.
        x = x.reshape(1, -1)
        min_dist = sys.float_info.max
        for _, y in sent2_embs:
            # Take the scalar out of the (1, 1) result so that float() is
            # never applied to a 2-D array (deprecated since NumPy 1.25).
            distance = euclidean_distances(x, y.reshape(1, -1))[0, 0]
            if distance < min_dist:
                min_dist = distance
        wmd += min_dist
    return - float(wmd) / (len(sent1_embs) + len(sent2_embs))
    
# Note that this breaks the symmetry, so the result is no longer a distance.
# To overcome this, we compute the average of the scores in both directions: (weightedWMD(a, b) + weightedWMD(b, a)) / 2

Source File: LFSBSS.py From fsfc with MIT License

5 votes

def predict(self, x):
        """
        Return the index of the cluster with the lowest score for x.

        Parameters
        ----------
        x: ndarray
            Samples to predict; indexed as ``x[:, features]``, so it must
            be two-dimensional.

        Returns
        -------
        label: int
            Index of the best-scoring cluster, or None when the model has
            no clusters.
        """

        # For each cluster: project x onto that cluster's features, measure
        # the distance to the cluster mean and divide it by the cluster's
        # ``vars_`` entry. The first cluster with the smallest score wins.
        best_cluster = None
        best_score = None
        for cluster in range(self.clusters):
            projected = x[:, self.features_[cluster]]
            distance = euclidean_distances(projected, self.means_[cluster])
            candidate = distance / self.vars_[cluster]
            if best_score is not None and not (candidate < best_score):
                continue
            best_score = candidate
            best_cluster = cluster
        return best_cluster

Source File: test_affinity_propagation.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_affinity_propagation_equal_mutual_similarities():
    # Two points only, so all mutual similarities are equal
    points = np.array([[-1, 1], [1, -1]])
    S = -euclidean_distances(points, squared=True)

    # Preference above the similarity: every sample becomes an exemplar
    centers, assigned = assert_warns_message(
        UserWarning, "mutually equal", affinity_propagation, S, preference=0)
    assert_array_equal([0, 1], centers)
    assert_array_equal([0, 1], assigned)

    # Preference below the similarity: a single cluster whose exemplar is
    # the (arbitrary) first sample
    centers, assigned = assert_warns_message(
        UserWarning, "mutually equal", affinity_propagation, S, preference=-10)
    assert_array_equal([0], centers)
    assert_array_equal([0, 0], assigned)

    # Different preferences: no warning, one cluster whose exemplar is the
    # sample with the highest preference
    centers, assigned = assert_no_warnings(
        affinity_propagation, S, preference=[-20, -10])
    assert_array_equal([1], centers)
    assert_array_equal([0, 0], assigned)

Source File: test_affinity_propagation.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_affinity_propagation():
    # Negated squared Euclidean distances of the module-level data ``X``
    # act as similarities.
    sim = -euclidean_distances(X, squared=True)
    preference = 10 * np.median(sim)

    # Run the function interface and check the number of exemplars
    exemplar_idx, labels = affinity_propagation(sim, preference=preference)
    found = len(exemplar_idx)
    assert_equal(n_clusters, found)

    # Estimator on precomputed similarities versus estimator on the data
    on_similarities = AffinityPropagation(preference=preference,
                                          affinity="precomputed")
    labels_precomputed = on_similarities.fit(sim).labels_

    af = AffinityPropagation(preference=preference, verbose=True)
    labels = af.fit(X).labels_
    assert_array_equal(labels, labels_precomputed)

    found = len(af.cluster_centers_indices_)
    assert_equal(np.unique(labels).size, found)
    assert_equal(n_clusters, found)

    # The no-copy path must produce the same labels
    _, labels_no_copy = affinity_propagation(sim, preference=preference,
                                             copy=False)
    assert_array_equal(labels, labels_no_copy)

    # Invalid inputs: non-square matrix, damping=0, unknown affinity
    assert_raises(ValueError, affinity_propagation, sim[:, :-1])
    assert_raises(ValueError, affinity_propagation, sim, damping=0)
    assert_raises(ValueError, AffinityPropagation(affinity="unknown").fit, X)

Source File: test_random_projection.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_random_projection_embedding_quality():
    data, _ = make_sparse_random_data(8, 5000, 15000)
    eps = 0.2

    # Squared pairwise distances before projection, flattened
    reference = euclidean_distances(data, squared=True).ravel()

    # Mask out zero distances to avoid division by 0 in the ratio
    nonzero = reference != 0.0
    reference = reference[nonzero]

    for RandomProjection in all_RandomProjection:
        model = RandomProjection(n_components='auto', eps=eps,
                                 random_state=0)
        low_dim = model.fit_transform(data)

        # Squared distances after projection, restricted by the same mask
        after = euclidean_distances(low_dim, squared=True).ravel()
        after = after[nonzero]

        distortion = after / reference

        # The automatically tuned density has to respect the eps contract:
        # pairwise distances are preserved according to the
        # Johnson-Lindenstrauss lemma
        assert_less(distortion.max(), 1 + eps)
        assert_less(1 - eps, distortion.min())

Source File: robust_soft_learning_vector_quantization.py From scikit-multiflow with BSD 3-Clause "New" or "Revised" License

5 votes

def _optimize(self, X, y):
        """Update prototypes sample by sample.

        For each row of X, the nearest prototype with the sample's label and
        the nearest prototype with another label are located (Euclidean
        distance). When the wrong-label prototype is strictly closer, both
        are passed to ``_update_prototype``.
        """
        n_prototypes = self.c_w_.size
        n_data, n_dim = X.shape
        prototypes = self.w_.reshape(n_prototypes, n_dim)

        # NOTE(review): nearest_correct / nearest_wrong are not reset per
        # sample. If a sample's label had no prototype, the index from an
        # earlier sample would be reused (or the name would be unbound on
        # the first sample). Confirm every label always has a prototype.
        for i in range(n_data):
            xi = X[i]
            c_xi = int(y[i])
            xi_row = xi.reshape(1, xi.size)
            best_euclid_corr = np.inf
            best_euclid_incorr = np.inf

            # Find the nearest correct and the nearest wrong prototype
            for j in range(prototypes.shape[0]):
                candidate = prototypes[j]
                eucl_dis = euclidean_distances(
                    xi_row, candidate.reshape(1, candidate.size))
                if self.c_w_[j] == c_xi:
                    if eucl_dis < best_euclid_corr:
                        best_euclid_corr = eucl_dis
                        nearest_correct = j
                elif eucl_dis < best_euclid_incorr:
                    best_euclid_incorr = eucl_dis
                    nearest_wrong = j

            # Nothing to do unless a wrong prototype is strictly closer
            # than the nearest correct one.
            if not (best_euclid_incorr < best_euclid_corr):
                continue
            self._update_prototype(j=nearest_correct, c_xi=c_xi, xi=xi,
                                   prototypes=prototypes)
            self._update_prototype(j=nearest_wrong, c_xi=c_xi, xi=xi,
                                   prototypes=prototypes)

Source File: mvmds.py From mvlearn with Apache License 2.0

4 votes

def fit(self, Xs):
        """
        Calculates dimensionally reduced components by inputting the Euclidean
        distances of each view, double centering them, and using the _commonpcs
        function to find common components between views. Works similarly to
        traditional, single-view Multidimensional Scaling.

        Parameters
        ----------
        Xs: list of array-likes or numpy.ndarray
                - Xs length: n_views
                - Xs[i] shape: (n_samples, n_features_i), or
                  (n_samples, n_samples) when `dissimilarity` is
                  'precomputed'

        Returns
        -------
        self : the fitted instance, with ``components_`` set.

        Raises
        ------
        ValueError
            If `dissimilarity` is 'precomputed' and a view is not square,
            or if `dissimilarity` is neither 'euclidean' nor 'precomputed'.
        """

        # Clamp n_components to the number of samples. The warning reports
        # the clamped value, i.e. the number actually computed.
        if (self.n_components) > len(Xs[0]):
            self.n_components = len(Xs[0])
            warnings.warn('The number of components you have requested is '
                          + 'greater than the number of samples in the '
                          + 'dataset. ' + str(self.n_components)
                          + ' components were computed instead.')

        Xs = check_Xs(Xs, multiview=True)

        def double_center(dist):
            # B = -1/2 * J D^2 J with J = I - (1/n) * ones, as in
            # single-view MDS.
            n = len(dist)
            dist_squared = np.power(np.array(dist), 2)
            J = np.eye(n) - (1/n)*np.ones(dist.shape)
            return -(1/2) * J @ dist_squared @ J

        if self.dissimilarity == 'euclidean':
            # Lazy: each view's distance matrix is built only when consumed.
            distances = (euclidean_distances(view) for view in Xs)
        elif self.dissimilarity == 'precomputed':
            # The user supplies the distance matrices; they must be square.
            for view in Xs:
                if view.shape[0] != view.shape[1]:
                    raise ValueError('The input distance matrix must be '
                                     + 'a square matrix')
            distances = Xs
        else:
            # Adjacent literals instead of a backslash continuation, which
            # embedded a run of indentation spaces in the message.
            raise ValueError('The parameter `dissimilarity` must be one of '
                             '{`euclidean`, `precomputed`}')

        # One double-centered (n_samples, n_samples) matrix per view
        mat = np.ones(shape=(len(Xs), len(Xs[0]), len(Xs[0])))
        for i, dist in enumerate(distances):
            mat[i] = double_center(dist)

        self.components_ = self._commonpcs(mat)

        return self

Source File: test_euclidean_distances.py From mars with Apache License 2.0

4 votes

def testEuclideanDistancesExecution(self):
        """Compare tensor euclidean_distances with the sklearn reference."""
        dense_raw_x = np.random.rand(30, 10)
        dense_raw_y = np.random.rand(40, 10)
        sparse_raw_x = SparseNDArray(sps.random(30, 10, density=0.5, format='csr'))
        sparse_raw_y = SparseNDArray(sps.random(40, 10, density=0.5, format='csr'))

        def execute(tensor):
            # Run the tensor through the executor, concatenating its chunks.
            return self.executor.execute_tensor(tensor, concat=True)[0]

        cases = [(dense_raw_x, dense_raw_y),
                 (sparse_raw_x, sparse_raw_y)]
        for raw_x, raw_y in cases:
            x = mt.tensor(raw_x, chunk_size=9)
            y = mt.tensor(raw_y, chunk_size=7)

            # two inputs, default options
            result = execute(euclidean_distances(x, y))
            expected = sk_euclidean_distances(raw_x, Y=raw_y)
            np.testing.assert_almost_equal(result, expected)

            # caller-supplied X_norm_squared / Y_norm_squared arguments
            x_norm = x.sum(axis=1)[..., np.newaxis]
            y_norm = y.sum(axis=1)[np.newaxis, ...]
            x_raw_norm = raw_x.sum(axis=1)[..., np.newaxis]
            y_raw_norm = raw_y.sum(axis=1)[np.newaxis, ...]
            result = execute(euclidean_distances(x, y,
                                                 X_norm_squared=x_norm,
                                                 Y_norm_squared=y_norm))
            expected = sk_euclidean_distances(raw_x, raw_y,
                                              X_norm_squared=x_raw_norm,
                                              Y_norm_squared=y_raw_norm)
            np.testing.assert_almost_equal(result, expected)

            # float32 inputs with squared=True
            x_sq = (x ** 2).astype(np.float32)
            y_sq = (y ** 2).astype(np.float32)
            x_raw_sq = (raw_x ** 2).astype(np.float32)
            y_raw_sq = (raw_y ** 2).astype(np.float32)
            result = execute(euclidean_distances(x_sq, y_sq, squared=True))
            expected = sk_euclidean_distances(x_raw_sq, y_raw_sq, squared=True)
            np.testing.assert_almost_equal(result, expected, decimal=6)

            # test x is y
            result = execute(euclidean_distances(x))
            expected = sk_euclidean_distances(raw_x)
            np.testing.assert_almost_equal(result, expected)

Python sklearn.metrics.euclidean_distances() Examples