Python sklearn.cluster.AgglomerativeClustering() Examples
The following are 30 code examples of sklearn.cluster.AgglomerativeClustering(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cluster, or try the search function.
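Before the examples, here is a minimal, self-contained sketch of the basic fit/labels_ workflow; the synthetic data and parameter values below are illustrative and not taken from any of the projects that follow.

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Two well-separated synthetic blobs
rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 5])

# Fit the hierarchical clustering and read off the flat cluster assignment
model = AgglomerativeClustering(n_clusters=2, linkage='ward')
model.fit(X)
print(model.labels_)  # one integer label per sample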
Example #1
Source File: graph_eval.py From nodevectors with MIT License | 6 votes |
def print_labeled_tests(w, y, test_size=0.2, seed=42):
    """
    Clustering and label prediction tests
    """
    X_train, X_test, y_train, y_test = train_test_split(
        w, y, test_size=test_size, random_state=seed)
    # Print label prediction tests
    res = LabelPrediction(w, y, test_size=test_size, seed=seed)
    # Can only cluster on single-label (not multioutput) targets
    if len(y.shape) < 2:
        n_clusters = np.unique(y).size
        umpagglo = cluster.AgglomerativeClustering(
            n_clusters=n_clusters,
            affinity='cosine',
            linkage='average'
        ).fit(w).labels_
        x = evalClusteringOnLabels(umpagglo, y, verbose=True)
        res = {**res, **x}
    return res
Example #2
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example #3
Source File: test_hierarchical.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_compute_full_tree():
    # Test that the full tree is computed if n_clusters is small
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    connectivity = kneighbors_graph(X, 5, include_self=False)

    # When n_clusters is small, the full tree should be built,
    # i.e. the number of merges should be n_samples - 1
    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - 1)

    # When n_clusters is large (greater than the max of 100 and
    # 0.02 * n_samples), merging should stop once n_clusters remain.
    n_clusters = 101
    X = rng.randn(200, 2)
    connectivity = kneighbors_graph(X, 10, include_self=False)
    agc = AgglomerativeClustering(n_clusters=n_clusters,
                                  connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - n_clusters)
Example #4
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1)
Example #5
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X, connectivity,
                                                   'euclidean')

    # 'average' appeared twice in the original list; 'complete' restores
    # coverage of all four linkage options
    for linkage in ('single', 'average', 'complete', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1)
Example #6
Source File: agglomerative.py From Python-Machine-Learning-Cookbook-Second-Edition with MIT License | 6 votes |
def perform_clustering(X, connectivity, title, num_clusters=3, linkage='ward'):
    plt.figure()
    model = AgglomerativeClustering(linkage=linkage,
                                    connectivity=connectivity,
                                    n_clusters=num_clusters)
    model.fit(X)

    # Extract labels
    labels = model.labels_

    # Specify marker shapes for different clusters
    markers = '.vx'

    for i, marker in zip(range(num_clusters), markers):
        # Plot the points belonging to the current cluster
        plt.scatter(X[labels==i, 0], X[labels==i, 1], s=50,
                    marker=marker, color='k', facecolors='none')

    plt.title(title)
Example #7
Source File: cluster_manager.py From texta with GNU General Public License v3.0 | 6 votes |
def _cluster_documents(self):
    method = self.params['cluster_method']
    n_clusters = int(self.params['cluster_n_clusters'])
    n_samples = len(self.document_vectors)

    if n_clusters > n_samples:
        n_clusters = n_samples

    if method == 'kmeans':
        clusterer = KMeans(n_clusters=n_clusters, init='k-means++',
                           max_iter=100, n_init=1)
    else:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='complete',
                                            affinity='cosine')

    clustering = clusterer.fit(self.document_vectors)
    cluster_labels = clustering.labels_
    clustering_dict = clustering.__dict__

    clusters = {}
    for document_id, cluster_label in enumerate(cluster_labels):
        if cluster_label not in clusters:
            clusters[cluster_label] = []
        clusters[cluster_label].append(document_id)

    return clusters
Example #8
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_compute_full_tree():
    # Test that the full tree is computed if n_clusters is small
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    connectivity = kneighbors_graph(X, 5, include_self=False)

    # When n_clusters is small, the full tree should be built,
    # i.e. the number of merges should be n_samples - 1
    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - 1)

    # When n_clusters is large (greater than the max of 100 and
    # 0.02 * n_samples), merging should stop once n_clusters remain.
    n_clusters = 101
    X = rng.randn(200, 2)
    connectivity = kneighbors_graph(X, 10, include_self=False)
    agc = AgglomerativeClustering(n_clusters=n_clusters,
                                  connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - n_clusters)
Example #9
Source File: test_hierarchical.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example #10
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cluster_distances_with_distance_threshold():
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))
    # check the distances within the clusters and with other clusters
    distance_threshold = 4
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="single").fit(X)
    labels = clustering.labels_
    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)
    for label in np.unique(labels):
        in_cluster_mask = labels == label
        max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask]
                                   .min(axis=0).max())
        min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask]
                                    .min(axis=0).min())
        # single data point clusters only have that inf diagonal here
        if in_cluster_mask.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
            assert min_out_cluster_distance >= distance_threshold
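As a reference for the distance_threshold mode exercised by this test: passing n_clusters=None together with a threshold lets the estimator determine the number of clusters itself. A minimal sketch on synthetic data (values illustrative, not taken from the test):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(15, 2), rng.randn(15, 2) + 8])

# Merging stops once clusters are farther apart than the threshold,
# so the number of clusters is inferred from the data
model = AgglomerativeClustering(n_clusters=None, distance_threshold=5.0,
                                linkage='single')
labels = model.fit_predict(X)
print(model.n_clusters_)  # number of clusters found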
Example #11
Source File: predicting.py From ImageSetCleaner with GNU General Public License v3.0 | 6 votes |
def detection_with_agglomaritve_clustering(image_set):
    """
    Really good if the classes you are analyzing are close to what the
    network learned.

    :param image_set: The bottleneck values of the relevant images.
    :return: Predictions vector

    N.B.: The detector breaks with a full black image.
    """
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_clustering.html#sphx-glr-auto-examples-cluster-plot-agglomerative-clustering-py
    clf = cluster.AgglomerativeClustering(n_clusters=2, affinity="l2",
                                          linkage="complete")
    clf.fit(image_set)

    predictions = clf.labels_
    predictions = normalize_predictions(predictions)

    return predictions
Example #12
Source File: clusterings.py From parcellation_fragmenter with BSD 3-Clause "New" or "Revised" License | 5 votes |
def ward(n_clusters, samples):
    """
    Run Ward clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """

    # Generate KNN graph
    knn_graph = neighbors.kneighbors_graph(
        samples, n_neighbors=20, mode='connectivity', metric='minkowski',
        p=2, include_self=False, n_jobs=-1)

    # Apply Ward-agglomerative clustering
    ward = cluster.AgglomerativeClustering(
        n_clusters=n_clusters, affinity='euclidean',
        connectivity=knn_graph, linkage='ward')
    ward.fit(samples)
    labels = ward.labels_.copy()
    labels = labels.astype(np.int32) + 1

    return labels
Example #13
Source File: __init__.py From dials with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _agglomerative_clustering(self):
    X = self.coords.as_numpy_array()

    # Perform cluster analysis
    from sklearn.cluster import AgglomerativeClustering
    import numpy as np

    model = AgglomerativeClustering(
        n_clusters=self.params.cluster.n_clusters,
        linkage="average",
        affinity="cosine",
    )
    model.fit(X)
    return flex.int(model.labels_.astype(np.int32))
Example #14
Source File: agglomerative.py From trajminer with MIT License | 5 votes |
def __init__(self, n_clusters, linkage='ward', measure='precomputed',
             n_jobs=1):
    # Forward the requested linkage (the original dropped this argument);
    # note that 'ward' only accepts euclidean distances, so a precomputed
    # matrix should be paired with e.g. 'complete' or 'average'
    self.agglomerative = skAgglomerative(n_clusters=n_clusters,
                                         linkage=linkage,
                                         affinity='precomputed')
    self.n_clusters = n_clusters
    self.measure = measure
    self.n_jobs = n_jobs
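Because affinity='precomputed' means fit() receives a square distance matrix rather than raw features, the underlying scikit-learn pattern looks roughly like the sketch below (data illustrative; 'ward' linkage requires euclidean feature vectors, so a precomputed matrix must be paired with another linkage, and newer scikit-learn releases rename affinity to metric):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(10, 4), rng.randn(10, 4) + 3])

# Precompute an (n_samples, n_samples) distance matrix
D = pairwise_distances(X, metric='euclidean')

# fit_predict() takes the distance matrix itself; 'ward' is not allowed
# with a precomputed affinity, so use 'average' or 'complete'
model = AgglomerativeClustering(n_clusters=2, affinity='precomputed',
                                linkage='average')
labels = model.fit_predict(D)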
Example #15
Source File: cluster_images.py From NucleiDetectron with Apache License 2.0 | 5 votes |
def create_color_cluster_agglomerative_clustering(in_df, num_clusters):
    cluster_maker = AgglomerativeClustering(linkage='average',
                                            n_clusters=num_clusters)
    cluster_maker.fit(in_df[color_features_names])
    in_df['cluster-id'] = cluster_maker.labels_
    in_df['cluster-id'] = in_df['cluster-id'].map(lambda x: str(x))
    return in_df
Example #16
Source File: create_endpoints_mask_with_clustering.py From TractSeg with Apache License 2.0 | 5 votes |
def cluster(points, algorithm="DBSCAN"):
    # default must be the string "DBSCAN", not the class, to match the
    # string comparisons below
    print("Running {}...".format(algorithm))
    if algorithm == "KMeans":
        # not good at finding clusters if close together
        labels = KMeans(n_clusters=2, random_state=0, n_jobs=-1).fit_predict(points)
    elif algorithm == "DBSCAN":
        # no fixed number of labels; slow with high eps
        labels = DBSCAN(eps=3.0, n_jobs=-1).fit_predict(points)

    # labels = SpectralClustering(n_clusters=2, n_jobs=-1).fit_predict(points)  # slow (> 1min)
    # labels = AgglomerativeClustering(n_clusters=2).fit_predict(points)  # fast

    points_start, points_end = select_two_biggest_clusters(labels, points)
    return points_start, points_end
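The AgglomerativeClustering alternative left commented out above could look like the following minimal sketch (a hypothetical variant: points is assumed to be an (n, 3) coordinate array, and the project-specific select_two_biggest_clusters step is omitted):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

def cluster_endpoints_agglomerative(points, n_clusters=2):
    # fit_predict returns one cluster label per point
    return AgglomerativeClustering(n_clusters=n_clusters).fit_predict(points)

# Example usage on two synthetic endpoint blobs
rng = np.random.RandomState(0)
points = np.vstack([rng.randn(50, 3), rng.randn(50, 3) + 10])
labels = cluster_endpoints_agglomerative(points)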
Example #17
Source File: compare_clustering_algs.py From mmvt with GNU General Public License v3.0 | 5 votes |
def compare(data, n_groups, output_fol):
    # plot_clusters(data.astype(np.float), scipy.cluster.vq.kmeans,
    #               'scipy.cluster.vq.kmeans', output_fol, (n_groups,), {})
    plot_clusters(data, cluster.KMeans, 'KMeans', output_fol, (),
                  {'n_clusters': n_groups})
    for ct in ['spherical', 'tied', 'diag', 'full']:
        plot_clusters(data, mixture.GaussianMixture, 'GMM_{}'.format(ct),
                      output_fol, (),
                      {'n_components': n_groups, 'covariance_type': ct})
    plot_clusters(data, cluster.AffinityPropagation, 'AffinityPropagation',
                  output_fol, (), {'preference': -5.0, 'damping': 0.95})
    plot_clusters(data, cluster.MeanShift, 'MeanShift', output_fol, (0.175,),
                  {'cluster_all': False})
    plot_clusters(data, cluster.SpectralClustering, 'SpectralClustering',
                  output_fol, (), {'n_clusters': n_groups})
    plot_clusters(data, cluster.AgglomerativeClustering,
                  'AgglomerativeClustering', output_fol, (),
                  {'n_clusters': n_groups, 'linkage': 'ward'})
    plot_clusters(data, cluster.DBSCAN, 'DBSCAN', output_fol, (),
                  {'eps': 0.025})
    # plot_clusters(data, hdbscan.HDBSCAN, 'HDBSCAN', output_fol, (),
    #               {'min_cluster_size': 15})
Example #18
Source File: sklearn_cluster.py From learn-to-cluster with MIT License | 5 votes |
def hierarchy(feat, n_clusters, knn, **kwargs):
    from sklearn.neighbors import kneighbors_graph
    knn_graph = kneighbors_graph(feat, knn, include_self=False)
    hierarchy = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                connectivity=knn_graph,
                                                linkage='ward').fit(feat)
    return hierarchy.labels_
Example #19
Source File: test_hierarchical.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_agglomerative_clustering_wrong_arg_memory():
    # Test that an error is raised when memory is neither
    # a str nor a joblib.Memory instance
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    memory = 5
    clustering = AgglomerativeClustering(memory=memory)
    assert_raises(ValueError, clustering.fit, X)
Example #20
Source File: test_hierarchical.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly
    # during merging.
    X = np.array([(.014, .120), (.014, .099), (.014, .097),
                  (.017, .153), (.017, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .152), (.018, .149), (.018, .144)])
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(
        n_clusters=4, connectivity=connectivity, linkage='ward')
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Example #21
Source File: test_hierarchical.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_connectivity_fixing_non_lil():
    # Check non-regression of a bug if a non item-assignable connectivity
    # is provided with more than one component.
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = AgglomerativeClustering(connectivity=c, linkage='ward')
    assert_warns(UserWarning, w.fit, x)
Example #22
Source File: test_hierarchical.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_connectivity_callable():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(
        connectivity=partial(kneighbors_graph, n_neighbors=3,
                             include_self=False))
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_)
Example #23
Source File: test_hierarchical.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_agg_n_clusters():
    # Test that an error is raised when n_clusters <= 0
    rng = np.random.RandomState(0)
    X = rng.rand(20, 10)
    for n_clus in [-1, 0]:
        agc = AgglomerativeClustering(n_clusters=n_clus)
        msg = ("n_clusters should be an integer greater than 0."
               " %s was provided." % str(agc.n_clusters))
        assert_raise_message(ValueError, msg, agc.fit, X)
Example #24
Source File: stitch_patches_page.py From ScanSSD with MIT License | 5 votes |
def clustering(math_regions, char_data, image, algorithm, thresh_votes):
    centers = []
    for math_region in math_regions:
        center = [(math_region[0] + math_region[2]) / 2,
                  (math_region[1] + math_region[3]) / 2]
        centers.append(center)

    clustering = AgglomerativeClustering().fit(centers)
    labels = np.unique(clustering.labels_)

    for label in labels:
        # select the regions assigned to the current cluster; the mask
        # must index the per-sample labels_, not the unique labels
        regions = np.array(math_regions)[clustering.labels_ == label]
        pass
Example #25
Source File: baseline_clustering.py From cdp with MIT License | 5 votes |
def hierarchy(feat, n_clusters=2, knn=30):
    from sklearn.neighbors import kneighbors_graph
    knn_graph = kneighbors_graph(feat, knn, include_self=False)
    hierarchy = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                connectivity=knn_graph,
                                                linkage='ward').fit(feat)
    return hierarchy.labels_
Example #26
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_agglomerative_clustering_wrong_arg_memory():
    # Test that an error is raised when memory is neither
    # a str nor a joblib.Memory instance
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    memory = 5
    clustering = AgglomerativeClustering(memory=memory)
    assert_raises(ValueError, clustering.fit, X)
Example #27
Source File: diarization.py From VBDiarization with Apache License 2.0 | 5 votes |
def run_ahc(self, n_clusters, embeddings, scores_matrix):
    """ Run agglomerative hierarchical clustering.

        Returns:
            np.array: means of clusters
    """
    # Rescale scores to [0, 1] and negate them, so that higher similarity
    # maps to lower 'distance' in the precomputed affinity matrix
    scores_matrix = -((scores_matrix - np.min(scores_matrix)) /
                      (np.max(scores_matrix) - np.min(scores_matrix)))
    ahc = AgglomerativeClustering(affinity='precomputed', linkage='complete',
                                  n_clusters=n_clusters)
    labels = ahc.fit_predict(scores_matrix)
    return np.array([np.mean(embeddings[np.where(labels == i)], axis=0)
                     for i in range(n_clusters)])
Example #28
Source File: infer.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def n_cluster_embeddings(self, features=None, n_clusters=3, method='ac'):
    '''
    Clusters the nodes based on embedding features.
    features = None (use DGI-generated embeddings)
    '''
    if method == 'ac':
        clustering = AgglomerativeClustering(n_clusters=n_clusters,
                                             affinity='euclidean',
                                             linkage='ward')
        clustering.fit(self.embeddings if features is None else features)
        self.labels = clustering.labels_
        self.score = silhouette_score(self.embeddings if features is None
                                      else features, self.labels)
    return {'labels': self.labels, 'score': self.score}
Example #29
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly
    # during merging.
    X = np.array([(.014, .120), (.014, .099), (.014, .097),
                  (.017, .153), (.017, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .152), (.018, .149), (.018, .144)])
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(
        n_clusters=4, connectivity=connectivity, linkage='ward')
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
Example #30
Source File: test_hierarchical.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_n_components_deprecation():
    # Test that a DeprecationWarning is raised when the n_components_
    # attribute is accessed
    X = np.array([[1, 2], [1, 4], [1, 0], [4, 2]])
    agc = AgglomerativeClustering().fit(X)

    match = ("``n_components_`` attribute was deprecated "
             "in favor of ``n_connected_components_``")
    with pytest.warns(DeprecationWarning, match=match):
        n = agc.n_components_
    assert n == agc.n_connected_components_