Python sklearn.cluster.KMeans() Examples
The following are 30 code examples of sklearn.cluster.KMeans(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cluster, or try the search function.
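Before the project-specific examples, here is a minimal, self-contained sketch of the basic KMeans workflow — fit, inspect labels_ and cluster_centers_, predict on new points. The data and parameter values are invented purely for illustration and are not taken from any of the projects below.

import numpy as np
from sklearn.cluster import KMeans

# Toy data: two well-separated blobs in 2-D (values chosen only for illustration).
X = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0],
              [8.0, 8.0], [1.0, 0.6], [9.0, 11.0]])

# Fit two clusters; fixing n_init and random_state makes the run reproducible.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)

print(kmeans.labels_)            # cluster index assigned to each training point
print(kmeans.cluster_centers_)   # coordinates of the two centroids
print(kmeans.predict([[0.0, 0.0], [10.0, 10.0]]))  # assign new points to clusters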
Example #1
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 8 votes |
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels,
                                return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)
Example #2
Source File: guess.py From Keras-BiGAN with MIT License | 6 votes |
def cluster(points, means = 8):
    kk = KMeans(n_clusters = means)
    kk.fit(points)
    labels = kk.predict(points)

    r = []

    for i in range(means):
        row = []
        while(len(row) < 8):
            image = random.randint(0, data.files.shape[0] - 1)
            if labels[image] == i:
                row.append(data.files[image])

        r.append(np.concatenate(row, axis=1))

    c = np.concatenate(r, axis=0)
    x = Image.fromarray(c)
    x.save('Results/clusters.png')
Example #3
Source File: posterior.py From scVI with MIT License | 6 votes |
def clustering_scores(self, prediction_algorithm: str = "knn") -> Tuple:
    if self.gene_dataset.n_labels > 1:
        latent, _, labels = self.get_latent()
        if prediction_algorithm == "knn":
            labels_pred = KMeans(
                self.gene_dataset.n_labels, n_init=200
            ).fit_predict(latent)  # n_jobs>1 ?
        elif prediction_algorithm == "gmm":
            gmm = GMM(self.gene_dataset.n_labels)
            gmm.fit(latent)
            labels_pred = gmm.predict(latent)

        asw_score = silhouette_score(latent, labels)
        nmi_score = NMI(labels, labels_pred)
        ari_score = ARI(labels, labels_pred)
        uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
        logger.debug(
            "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
            % (asw_score, nmi_score, ari_score, uca_score)
        )
        return asw_score, nmi_score, ari_score, uca_score
Example #4
Source File: discretize.py From msppy with BSD 3-Clause "New" or "Revised" License | 6 votes |
def SAA(self):
    """Use K-means method to discretize the Markovian process."""
    from sklearn.cluster import KMeans
    if self.int_flag == 0:
        labels = numpy.zeros(self.n_samples, dtype=int)
    self._initialize_matrix()
    for t in range(1, self.T):
        kmeans = KMeans(
            n_clusters=self.n_Markov_states[t],
            random_state=0,
        ).fit(self.samples[:, t, :])
        self.Markov_states[t] = kmeans.cluster_centers_
        if self.int_flag == 0:
            labels_new = kmeans.labels_
            counts = numpy.zeros([self.n_Markov_states[t-1], 1])
            for i in range(self.n_samples):
                counts[labels[i]] += 1
                self.transition_matrix[t][labels[i]][labels_new[i]] += 1
            self.transition_matrix[t] /= counts
            labels = labels_new
    if self.int_flag == 1:
        self.train_transition_matrix()
    return (self.Markov_states, self.transition_matrix)
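The core idea of this method — fit a per-stage KMeans and estimate a transition matrix by counting how samples move from the previous stage's clusters to the new ones — can be sketched standalone. The sample paths, shapes, and state counts below are invented for illustration and do not use msppy's API.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
n_samples, T, dim, n_states = 200, 3, 1, 2
samples = rng.randn(n_samples, T, dim).cumsum(axis=1)   # toy Markovian sample paths

labels = np.zeros(n_samples, dtype=int)   # stage 0: every path starts in one initial state
markov_states, transition = [], []
for t in range(1, T):
    km = KMeans(n_clusters=n_states, random_state=0).fit(samples[:, t, :])
    markov_states.append(km.cluster_centers_)
    # Count transitions from the previous stage's labels to the new ones.
    counts = np.zeros((labels.max() + 1, n_states))
    for i in range(n_samples):
        counts[labels[i], km.labels_[i]] += 1
    transition.append(counts / counts.sum(axis=1, keepdims=True))  # row-normalize
    labels = km.labels_

print(markov_states[0])   # discretized states at stage 1
print(transition[0])      # estimated transition probabilities from stage 0 to stage 1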
Example #5
Source File: argva_node_clustering.py From pytorch_geometric with MIT License | 6 votes |
def test():
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)

    # Cluster embedded values using k-means.
    kmeans_input = z.cpu().numpy()
    kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
    pred = kmeans.predict(kmeans_input)

    labels = data.y.cpu().numpy()
    completeness = completeness_score(labels, pred)
    hm = homogeneity_score(labels, pred)
    nmi = v_measure_score(labels, pred)

    auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)

    return auc, ap, completeness, hm, nmi
Example #6
Source File: density_weighted_meta.py From libact with BSD 2-Clause "Simplified" License | 6 votes |
def __init__(self, dataset, base_query_strategy, similarity_metric=None,
             clustering_method=None, beta=1.0, random_state=None):
    super(DensityWeightedMeta, self).__init__(dataset=dataset)
    if not isinstance(base_query_strategy, QueryStrategy):
        raise TypeError(
            "'base_query_strategy' has to be an instance of 'QueryStrategy'"
        )
    if base_query_strategy.dataset != self.dataset:
        raise ValueError("base_query_strategy should share the same "
                         "dataset instance with DensityWeightedMeta")

    self.base_query_strategy = base_query_strategy
    self.beta = beta
    self.random_state_ = seed_random_state(random_state)

    if clustering_method is not None:
        self.clustering_method = clustering_method
    else:
        self.clustering_method = KMeans(
            n_clusters=5, random_state=self.random_state_)

    if similarity_metric is not None:
        self.similarity_metric = similarity_metric
    else:
        self.similarity_metric = cosine_similarity
Example #7
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 6 votes |
def calc_mean_dist_from_center(data, km):
    """
    Calculates mean distance from cluster centers. Note that it will be
    calculated only for KMeans and GMM, because DBSCAN may have ambiguous
    form of clusters.

    Parameters
    --------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    km:
        Already fitted clusterer.

    Returns
    -------
    Mapping of clusters names to mean distance from cluster centers.

    Return type
    -------
    Dict
    """
    res = {}
    cl = km.labels_
    cs = km.cluster_centers_
    for i in set(cl):
        res[i] = _cosine_dist(data[cl == i], cs[i]).mean()
    return res
Example #8
Source File: test_clusters.py From mabwiser with Apache License 2.0 | 6 votes |
def test_copy(self):
    arms, mab = self.predict(arms=[1, 2, 3, 4],
                             decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                             rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                             learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                             neighborhood_policy=NeighborhoodPolicy.Clusters(2),
                             context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                              [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                              [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                              [0, 2, 1, 0, 0]],
                             contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                             seed=123456,
                             num_run=1,
                             is_predict=True)

    clusters = deepcopy(mab._imp)
    self.assertIsNot(clusters, mab._imp)
    self.assertIsInstance(clusters.lp_list[0], _EpsilonGreedy)
    self.assertIsInstance(clusters.lp_list[1], _EpsilonGreedy)
    self.assertIsInstance(clusters.kmeans, KMeans)
    self.assertIsNot(clusters.kmeans, mab._imp.kmeans)
    self.assertIsNot(clusters.lp_list[0], mab._imp.lp_list[0])
    self.assertIsNot(clusters.lp_list[1], mab._imp.lp_list[1])
    self.assertEqual(clusters.lp_list[0].epsilon, mab._imp.lp_list[0].epsilon)
    self.assertEqual(clusters.lp_list[1].epsilon, mab._imp.lp_list[1].epsilon)
Example #9
Source File: test_clusters.py From mabwiser with Apache License 2.0 | 6 votes |
def test_greedy0_n2(self):
    arms, mab = self.predict(arms=[1, 2, 3, 4],
                             decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                             rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                             learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                             neighborhood_policy=NeighborhoodPolicy.Clusters(2),
                             context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                              [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                              [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                              [0, 2, 1, 0, 0]],
                             contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                             seed=123456,
                             num_run=1,
                             is_predict=True)

    self.assertListEqual(arms, [3, 1])
    self.assertTrue(isinstance(mab._imp.kmeans, KMeans))
Example #10
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_)
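Note that the 'full' option used above was renamed to 'lloyd' in scikit-learn 1.1 and later. A version-dependent sketch of the same comparison on a current release (assuming scikit-learn >= 1.1) might look like this:

import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal
from sklearn.cluster import KMeans

X = np.random.RandomState(0).normal(size=(50, 10))

# 'lloyd' is the renamed 'full' algorithm on newer scikit-learn releases.
km_lloyd = KMeans(algorithm='lloyd', n_clusters=5, random_state=0, n_init=1).fit(X)
km_elkan = KMeans(algorithm='elkan', n_clusters=5, random_state=0, n_init=1).fit(X)

assert_array_almost_equal(km_elkan.cluster_centers_, km_lloyd.cluster_centers_)
assert_array_equal(km_elkan.labels_, km_lloyd.labels_)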
Example #11
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_kmeans_results(representation, algo, dtype):
    # checks that kmeans works as intended
    array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)

    expected_labels = [0, 0, 1, 1]
    expected_inertia = 0.1875
    expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
    expected_n_iter = 2

    kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo)
    kmeans.fit(X, sample_weight=sample_weight)

    assert_array_equal(kmeans.labels_, expected_labels)
    assert_almost_equal(kmeans.inertia_, expected_inertia)
    assert_array_almost_equal(kmeans.cluster_centers_, expected_centers)
    assert kmeans.n_iter_ == expected_n_iter
Example #12
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_k_means_non_collapsed():
    # Check k_means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not
    # result in a repositioning of the centers to the center of mass that
    # would lead to collapsed centers, which in turn makes the clustering
    # dependent on numerical instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not have collapsed
    assert_equal(len(np.unique(km.labels_)), 3)

    centers = km.cluster_centers_
    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
Example #13
Source File: spectral_graph_partition.py From LanczosNetwork with MIT License | 6 votes |
def spectral_clustering(L, K, seed=1234):
    """
    Implement paper "Shi, J. and Malik, J., 2000. Normalized cuts and image
    segmentation. IEEE Transactions on pattern analysis and machine
    intelligence, 22(8), pp.888-905."

    Args:
      L: graph Laplacian, numpy or scipy matrix
      K: int, number of clusters

    Returns:
      node_label: list

    N.B.: for simplicity, we only consider simple and undirected graph
    """
    num_nodes = L.shape[0]
    assert (K < num_nodes - 1)

    eig, eig_vec = scipy.sparse.linalg.eigsh(
        L, k=K, which='LM', maxiter=num_nodes * 10000, tol=0, mode='normal')
    kmeans = KMeans(n_clusters=K, random_state=seed).fit(eig_vec.real)

    return kmeans.labels_
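A hypothetical usage sketch (not part of the LanczosNetwork source): build the Laplacian of a small graph with two obvious communities and hand it to the function above. The adjacency matrix is made up, and the imports only mirror what the original module already assumes (numpy, scipy, scikit-learn's KMeans).

import numpy as np
import scipy.sparse
import scipy.sparse.linalg
from sklearn.cluster import KMeans

# Two triangles joined by a single edge -> two natural clusters.
A = np.array([[0, 1, 1, 0, 0, 0],
              [1, 0, 1, 0, 0, 0],
              [1, 1, 0, 1, 0, 0],
              [0, 0, 1, 0, 1, 1],
              [0, 0, 0, 1, 0, 1],
              [0, 0, 0, 1, 1, 0]], dtype=float)
D = np.diag(A.sum(axis=1))
L = scipy.sparse.csr_matrix(D - A)   # unnormalized graph Laplacian

labels = spectral_clustering(L, K=2)
print(labels)   # one cluster id per node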
Example #14
Source File: test_pipeline.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([
        ('scaler', scaler_for_pipeline),
        ('Kmeans', km_for_pipeline)
    ])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
Example #15
Source File: clustering_kmeans.py From practicalDataAnalysisCookbook with GNU General Public License v2.0 | 6 votes |
def findClusters_kmeans(data):
    '''
        Cluster data using k-means
    '''
    # create the classifier object
    kmeans = cl.KMeans(
        n_clusters=4,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data)

# the file name of the dataset
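A hypothetical call to the helper above on synthetic blobs is sketched below. Note that this and several later examples pass n_jobs to KMeans; that parameter was deprecated in scikit-learn 0.23 and removed in 1.0, so the sketch assumes an older release (on current versions, simply drop the argument).

import numpy as np
# assumes the module-level `import sklearn.cluster as cl` used by the function above

np.random.seed(0)
data = np.vstack([np.random.randn(50, 2) + offset
                  for offset in ([0, 0], [5, 5], [0, 5], [5, 0])])

model = findClusters_kmeans(data)   # returns the fitted KMeans estimator
print(model.cluster_centers_)       # four centers, roughly one per blob
print(model.labels_[:10])           # cluster assignments of the first ten points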
Example #16
Source File: pancreas_tests.py From scanorama with MIT License | 6 votes |
def entropy_test(datasets_dimred, ds_labels):
    ds_labels = np.array(ds_labels)
    X_dimred = np.concatenate(datasets_dimred)
    embedding = None

    for k in range(10, 21):
        km = KMeans(n_clusters=k, n_jobs=-1, verbose=0)
        km.fit(X_dimred)

        if False and k % 5 == 0:
            embedding = visualize(
                datasets_dimred,
                km.labels_, NAMESPACE + '_km{}'.format(k),
                [ str(x) for x in range(k) ],
                embedding=embedding
            )

        print('k = {}, average normalized entropy = {}'
              .format(k, avg_norm_entropy(ds_labels, km.labels_)))
Example #17
Source File: feature_preprocess.py From MassImageRetrieval with Apache License 2.0 | 6 votes |
def analysis_KMeans():
    mean_distortions = []
    K = len(labels_idx)
    K_range = range(320, 1000)
    for k in K_range:
        print("Cluster k is {}".format(k))
        kmeans_model = KMeans(n_clusters=k, init="k-means++", n_jobs=-1)
        kmeans_model.fit(np_features)
        t_distortions = sum(
            np.min(cdist(np_features, kmeans_model.cluster_centers_, 'euclidean'), axis=1)) / np_features.shape[0]
        mean_distortions.append(t_distortions)

    with open("./kmeans_cluster.csv", "a+") as wh:
        for idx in range(len(K_range)):
            wh.write("{},{}\n".format(K_range[idx], mean_distortions[idx]))

    # plt.plot(K_range, mean_distortions, 'bx-')
    # plt.xlabel('k')
    # plt.ylabel(u'Avgerage distortion degree')
    # plt.title(u'Elbows rule to select the best K value')
    # plt.savefig("kmeans_cluster.png")
Example #18
Source File: cluster.py From PHATE with GNU General Public License v2.0 | 6 votes |
def silhouette_score(phate_op, n_clusters, random_state=None, **kwargs):
    """Compute the Silhouette score on KMeans on the PHATE potential

    Parameters
    ----------
    phate_op : phate.PHATE
        Fitted PHATE operator
    n_clusters : int
        Number of clusters.
    random_state : int or None, optional (default: None)
        Random seed for k-means

    Returns
    -------
    score : float
    """
    cluster_labels = kmeans(phate_op, n_clusters=n_clusters,
                            random_state=random_state, **kwargs)
    return metrics.silhouette_score(phate_op.diff_potential, cluster_labels)
Example #19
Source File: SpectralClustering.py From sparse-subspace-clustering-python with MIT License | 6 votes |
def SpectralClustering(CKSym, n):
    # This is a direct port of JHU vision lab code. Could probably use sklearn SpectralClustering.
    CKSym = CKSym.astype(float)
    N, _ = CKSym.shape
    MAXiter = 1000  # Maximum number of iterations for KMeans
    REPlic = 20  # Number of replications for KMeans

    DN = np.diag(np.divide(1, np.sqrt(np.sum(CKSym, axis=0) + np.finfo(float).eps)))
    LapN = identity(N).toarray().astype(float) - np.matmul(np.matmul(DN, CKSym), DN)
    _, _, vN = np.linalg.svd(LapN)
    vN = vN.T
    kerN = vN[:, N - n:N]
    normN = np.sqrt(np.sum(np.square(kerN), axis=1))
    kerNS = np.divide(kerN, normN.reshape(len(normN), 1) + np.finfo(float).eps)
    km = KMeans(n_clusters=n, n_init=REPlic, max_iter=MAXiter, n_jobs=-1).fit(kerNS)
    return km.labels_
Example #20
Source File: test_spectral_embedding.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(
                km.labels_, true_labels), 1.0, 2)
Example #21
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 5 votes |
def simple_cluster(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds cluster of users in data.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    max_n_clusters: int, optional
        Maximal number of clusters for automatic selection for number of clusters.
        If ``None``, then uses n_clusters from arguments. Default: ``None``
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated.
        IMPORTANT: it may take a lot of time. Default: ``True``
    random_state: int, optional
        Random state for KMeans clusterer. Default: ``0``
    kwargs: optional
        Parameters for ``sklearn.cluster.KMeans``

    Returns
    -------
    Array of clusters

    Return type
    -------
    np.array
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, KMeans, max_n_clusters, random_state, **kwargs)
    else:
        kmargs = {i: j for i, j in kwargs.items() if i in __KMEANS_FILTER__}
    kmargs.update({'random_state': random_state})

    km = KMeans(**kmargs)
    cl = km.fit_predict(data.values)

    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
Example #22
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 5 votes |
def find_best_n_clusters(data, clusterer, max_n_clusters, random_state, **kwargs):
    """
    Finds best number of clusters for KMeans and Gaussian Mixture.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering with index as in ``retention_config.index_col``
    clusterer: sklearn clusterer class
        For instance, ``sklearn.cluster.KMeans`` or ``sklearn.mixture.GaussianMixture``.
    max_n_clusters: int
        Maximal number of clusters for searching.
    random_state: int
        Random state for clusterer.

    Returns
    -------
    Optimal keyword arguments for clustering method.

    Return type
    ------
    Dict
    """
    args = {i: j for i, j in kwargs.items() if i in clusterer.get_params(clusterer)}
    if 'n_clusters' in clusterer.get_params(clusterer):
        kms = True
    else:
        kms = False
    args.pop('n_clusters' if kms else 'n_components', None)
    args.update({'random_state': random_state})

    score = {}
    for i in range(2, max_n_clusters + 1):
        args.update({'n_clusters' if kms else 'n_components': i})
        km = clusterer(**args)
        score[i] = silhouette_score(data, km.fit_predict(data), metric='cosine')

    best = pd.Series(score).idxmax()
    args.update({'n_clusters' if kms else 'n_components': best})
    print(f'Best number of clusters is {best}')
    return args
Example #23
Source File: K-Means_scikit-learn.py From MachineLearning_Python with MIT License | 5 votes |
def kMenas():
    data = spio.loadmat("data.mat")
    X = data['X']
    model = KMeans(n_clusters=3).fit(X)  # n_clusters specifies 3 clusters; fit the data
    centroids = model.cluster_centers_  # cluster centers

    plt.scatter(X[:, 0], X[:, 1])  # scatter plot of the original data
    plt.plot(centroids[:, 0], centroids[:, 1], 'r^', markersize=10)  # cluster centers
    plt.show()
Example #24
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 5 votes |
def dbscan(data, use_csi=True, epsq=None, max_cl_number=None, **kwargs):
    """
    Finds cluster of users in data using DBSCAN

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering indexed by users (sessions)
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated.
        IMPORTANT: it may take a lot of time. Default: ``True``
    epsq: float, optional
        Quantile of nearest neighbor positive distance between dots, its value
        will be an eps. If ``None``, then eps from keywords will be used.
        Default: ``None``
    max_cl_number: int, optional
        Maximal number of clusters for aggregation of small clusters. Default: ``None``
    kwargs: optional
        Parameters for ``sklearn.cluster.DBSCAN``

    Returns
    --------
    Array of clusters

    Return type
    -------
    np.array
    """
    kmargs = {i: j for i, j in kwargs.items() if i in DBSCAN.get_params(DBSCAN)}
    if epsq is not None:
        kmargs.update({'eps': find_best_eps(data, epsq)})
    km = DBSCAN(**kmargs)
    cl = km.fit_predict(data.values)

    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    if max_cl_number is not None:
        cl = aggregate_cl(cl, max_cl_number)
    return cl, metrics
Example #25
Source File: test_weight_boosting.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_sample_weight_missing():
    from sklearn.cluster import KMeans

    clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
    assert_raises(ValueError, clf.fit, X, y_regr)

    clf = AdaBoostRegressor(KMeans())
    assert_raises(ValueError, clf.fit, X, y_regr)
Example #26
Source File: clustering_kmeans_search.py From practicalDataAnalysisCookbook with GNU General Public License v2.0 | 5 votes |
def findClusters_kmeans(data, no_of_clusters):
    '''
        Cluster data using k-means
    '''
    # create the classifier object
    kmeans = cl.KMeans(
        n_clusters=no_of_clusters,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data)
Example #27
Source File: cnn_lcd.py From CNN_LCD with GNU General Public License v3.0 | 5 votes |
def cluster_kmeans(sim):
    """Run k-means on similarity matrix and segment"""
    sim_dim = sim.shape[0]
    sim = sim.reshape(-1, 1)

    # Augment with spatial coordinates
    sim_aug = np.concatenate(
        [sim, np.mgrid[:sim_dim, :sim_dim].reshape(-1, sim_dim ** 2).T],
        axis=1
    )

    # Empirical metric for number of loop-closures given number of images
    # in sequence (assumption: equally-spaced samples):
    n_clusters = int(np.sqrt(sim_dim))
    print('Performing clustering via KMeans(n={}).'.format(n_clusters))

    km = KMeans(n_clusters=n_clusters, n_jobs=2, max_iter=300)
    labels = km.fit_predict(sim_aug)
    print('Got cluster labels')

    for i in range(n_clusters):
        lab_idx = (labels == i)
        if lab_idx.size:
            cc = sim[lab_idx].mean()
            # cc = sim[lab_idx].max()
            sim[lab_idx] = cc

    # Re-normalize and reshape
    sim = sim.reshape(sim_dim, sim_dim) / sim.max()
    return sim
Example #28
Source File: sam_knn.py From scikit-multiflow with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cluster_down(self, samples, labels):
    """Performs classwise kMeans++ clustering for given samples with corresponding labels.
    The number of samples is halved per class."""
    logging.debug('cluster Down %d' % self.trainStepCount)
    uniqueLabels = np.unique(labels)
    newSamples = np.empty(shape=(0, samples.shape[1]))
    newLabels = np.empty(shape=(0), dtype=np.int32)
    for label in uniqueLabels:
        tmpSamples = samples[labels == label]
        newLength = int(max(tmpSamples.shape[0]/2, 1))
        clustering = KMeans(n_clusters=newLength, n_init=1, random_state=0)
        clustering.fit(tmpSamples)
        newSamples = np.vstack([newSamples, clustering.cluster_centers_])
        newLabels = np.append(newLabels, label*np.ones(shape=newLength, dtype=np.int32))
    return newSamples, newLabels
Example #29
Source File: clustering_kmeans_search_alternative.py From practicalDataAnalysisCookbook with GNU General Public License v2.0 | 5 votes |
def findClusters_kmeans(data, no_of_clusters):
    '''
        Cluster data using k-means
    '''
    # create the classifier object
    kmeans = cl.KMeans(
        n_clusters=no_of_clusters,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data)
Example #30
Source File: cluster_features.py From bert-extractive-summarizer with MIT License | 5 votes |
def __get_model(self, k: int):
    """
    Retrieve clustering model

    :param k: amount of clusters
    :return: Clustering model
    """
    if self.algorithm == 'gmm':
        return GaussianMixture(n_components=k, random_state=self.random_state)
    return KMeans(n_clusters=k, random_state=self.random_state)
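As a closing, standalone illustration of the same design choice — GaussianMixture when soft, probabilistic clusters are wanted, KMeans otherwise — here is a hedged sketch with made-up data and a hypothetical helper name; it is not part of the bert-extractive-summarizer source.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

def get_model(algorithm: str, k: int, random_state: int = 12345):
    # Mirrors the pattern above: GMM for soft clusters, k-means otherwise.
    if algorithm == 'gmm':
        return GaussianMixture(n_components=k, random_state=random_state)
    return KMeans(n_clusters=k, random_state=random_state)

X = np.random.RandomState(0).randn(100, 8)   # stand-in for sentence embeddings
labels_km = get_model('kmeans', k=3).fit_predict(X)
labels_gmm = get_model('gmm', k=3).fit_predict(X)
print(labels_km[:10], labels_gmm[:10])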