Python sklearn.cluster.KMeans() Examples
The following are 30 code examples of sklearn.cluster.KMeans(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cluster, or try the search function.
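Before the project-specific examples, here is a minimal, self-contained sketch of the basic KMeans workflow — fit, inspect labels_ and cluster_centers_, predict on new points. The data and parameter values are invented purely for illustration and are not taken from any of the projects below.

import numpy as np
from sklearn.cluster import KMeans

# Toy data: two well-separated blobs in 2-D (values chosen only for illustration).
X = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0],
              [8.0, 8.0], [1.0, 0.6], [9.0, 11.0]])

# Fit two clusters; fixing n_init and random_state makes the run reproducible.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)

print(kmeans.labels_)            # cluster index assigned to each training point
print(kmeans.cluster_centers_)   # coordinates of the two centroids
print(kmeans.predict([[0.0, 0.0], [10.0, 10.0]]))  # assign new points to clusters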
Example #1
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 8 votes |
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels,
                                return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)
Example #2
Source File: guess.py From Keras-BiGAN with MIT License | 6 votes |
def cluster(points, means = 8):
    kk = KMeans(n_clusters = means)
    kk.fit(points)
    labels = kk.predict(points)

    r = []

    for i in range(means):
        row = []
        while(len(row) < 8):
            image = random.randint(0, data.files.shape[0] - 1)
            if labels[image] == i:
                row.append(data.files[image])

        r.append(np.concatenate(row, axis=1))

    c = np.concatenate(r, axis=0)
    x = Image.fromarray(c)
    x.save('Results/clusters.png')
Example #3
Source File: posterior.py From scVI with MIT License | 6 votes |
def clustering_scores(self, prediction_algorithm: str = "knn") -> Tuple:
    if self.gene_dataset.n_labels > 1:
        latent, _, labels = self.get_latent()
        if prediction_algorithm == "knn":
            labels_pred = KMeans(
                self.gene_dataset.n_labels, n_init=200
            ).fit_predict(latent)  # n_jobs>1 ?
        elif prediction_algorithm == "gmm":
            gmm = GMM(self.gene_dataset.n_labels)
            gmm.fit(latent)
            labels_pred = gmm.predict(latent)

        asw_score = silhouette_score(latent, labels)
        nmi_score = NMI(labels, labels_pred)
        ari_score = ARI(labels, labels_pred)
        uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
        logger.debug(
            "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
            % (asw_score, nmi_score, ari_score, uca_score)
        )
        return asw_score, nmi_score, ari_score, uca_score
Example #4
Source File: discretize.py From msppy with BSD 3-Clause "New" or "Revised" License | 6 votes |
def SAA(self):
    """Use K-means method to discretize the Markovian process."""
    from sklearn.cluster import KMeans
    if self.int_flag == 0:
        labels = numpy.zeros(self.n_samples, dtype=int)
    self._initialize_matrix()
    for t in range(1, self.T):
        kmeans = KMeans(
            n_clusters=self.n_Markov_states[t],
            random_state=0,
        ).fit(self.samples[:, t, :])
        self.Markov_states[t] = kmeans.cluster_centers_
        if self.int_flag == 0:
            labels_new = kmeans.labels_
            counts = numpy.zeros([self.n_Markov_states[t-1], 1])
            for i in range(self.n_samples):
                counts[labels[i]] += 1
                self.transition_matrix[t][labels[i]][labels_new[i]] += 1
            self.transition_matrix[t] /= counts
            labels = labels_new
    if self.int_flag == 1:
        self.train_transition_matrix()
    return (self.Markov_states, self.transition_matrix)
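The core idea of this method — fit a per-stage KMeans and estimate a transition matrix by counting how samples move from the previous stage's clusters to the new ones — can be sketched standalone. The sample paths, shapes, and state counts below are invented for illustration and do not use msppy's API.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
n_samples, T, dim, n_states = 200, 3, 1, 2
samples = rng.randn(n_samples, T, dim).cumsum(axis=1)   # toy Markovian sample paths

labels = np.zeros(n_samples, dtype=int)   # stage 0: every path starts in one initial state
markov_states, transition = [], []
for t in range(1, T):
    km = KMeans(n_clusters=n_states, random_state=0).fit(samples[:, t, :])
    markov_states.append(km.cluster_centers_)
    # Count transitions from the previous stage's labels to the new ones.
    counts = np.zeros((labels.max() + 1, n_states))
    for i in range(n_samples):
        counts[labels[i], km.labels_[i]] += 1
    transition.append(counts / counts.sum(axis=1, keepdims=True))  # row-normalize
    labels = km.labels_

print(markov_states[0])   # discretized states at stage 1
print(transition[0])      # estimated transition probabilities from stage 0 to stage 1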
Example #5
Source File: argva_node_clustering.py From pytorch_geometric with MIT License | 6 votes |
def test():
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)

    # Cluster embedded values using k-means.
    kmeans_input = z.cpu().numpy()
    kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
    pred = kmeans.predict(kmeans_input)

    labels = data.y.cpu().numpy()
    completeness = completeness_score(labels, pred)
    hm = homogeneity_score(labels, pred)
    nmi = v_measure_score(labels, pred)

    auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)

    return auc, ap, completeness, hm, nmi
Example #6
Source File: density_weighted_meta.py From libact with BSD 2-Clause "Simplified" License | 6 votes |
def __init__(self, dataset, base_query_strategy, similarity_metric=None,
             clustering_method=None, beta=1.0, random_state=None):
    super(DensityWeightedMeta, self).__init__(dataset=dataset)
    if not isinstance(base_query_strategy, QueryStrategy):
        raise TypeError(
            "'base_query_strategy' has to be an instance of 'QueryStrategy'"
        )
    if base_query_strategy.dataset != self.dataset:
        raise ValueError("base_query_strategy should share the same "
                         "dataset instance with DensityWeightedMeta")

    self.base_query_strategy = base_query_strategy
    self.beta = beta
    self.random_state_ = seed_random_state(random_state)

    if clustering_method is not None:
        self.clustering_method = clustering_method
    else:
        self.clustering_method = KMeans(
            n_clusters=5, random_state=self.random_state_)

    if similarity_metric is not None:
        self.similarity_metric = similarity_metric
    else:
        self.similarity_metric = cosine_similarity
Example #7
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 6 votes |
def calc_mean_dist_from_center(data, km):
    """
    Calculates mean distance from cluster centers. Note that it will be
    calculated only for KMeans and GMM, because DBSCAN may have ambiguous
    form of clusters.

    Parameters
    --------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    km:
        Already fitted clusterer.

    Returns
    -------
    Mapping of clusters names to mean distance from cluster centers.

    Return type
    -------
    Dict
    """
    res = {}
    cl = km.labels_
    cs = km.cluster_centers_
    for i in set(cl):
        res[i] = _cosine_dist(data[cl == i], cs[i]).mean()
    return res
Example #8
Source File: test_clusters.py From mabwiser with Apache License 2.0 | 6 votes |
def test_copy(self):
    arms, mab = self.predict(arms=[1, 2, 3, 4],
                             decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                             rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                             learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                             neighborhood_policy=NeighborhoodPolicy.Clusters(2),
                             context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                              [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                              [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                              [0, 2, 1, 0, 0]],
                             contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                             seed=123456,
                             num_run=1,
                             is_predict=True)

    clusters = deepcopy(mab._imp)
    self.assertIsNot(clusters, mab._imp)
    self.assertIsInstance(clusters.lp_list[0], _EpsilonGreedy)
    self.assertIsInstance(clusters.lp_list[1], _EpsilonGreedy)
    self.assertIsInstance(clusters.kmeans, KMeans)
    self.assertIsNot(clusters.kmeans, mab._imp.kmeans)
    self.assertIsNot(clusters.lp_list[0], mab._imp.lp_list[0])
    self.assertIsNot(clusters.lp_list[1], mab._imp.lp_list[1])
    self.assertEqual(clusters.lp_list[0].epsilon, mab._imp.lp_list[0].epsilon)
    self.assertEqual(clusters.lp_list[1].epsilon, mab._imp.lp_list[1].epsilon)
Example #9
Source File: test_clusters.py From mabwiser with Apache License 2.0 | 6 votes |
def test_greedy0_n2(self):
    arms, mab = self.predict(arms=[1, 2, 3, 4],
                             decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                             rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                             learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                             neighborhood_policy=NeighborhoodPolicy.Clusters(2),
                             context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                              [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                              [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                              [0, 2, 1, 0, 0]],
                             contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                             seed=123456,
                             num_run=1,
                             is_predict=True)

    self.assertListEqual(arms, [3, 1])
    self.assertTrue(isinstance(mab._imp.kmeans, KMeans))
Example #10
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_)
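Note that the 'full' option used above was renamed to 'lloyd' in scikit-learn 1.1 and later. A version-dependent sketch of the same comparison on a current release (assuming scikit-learn >= 1.1) might look like this:

import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal
from sklearn.cluster import KMeans

X = np.random.RandomState(0).normal(size=(50, 10))

# 'lloyd' is the renamed 'full' algorithm on newer scikit-learn releases.
km_lloyd = KMeans(algorithm='lloyd', n_clusters=5, random_state=0, n_init=1).fit(X)
km_elkan = KMeans(algorithm='elkan', n_clusters=5, random_state=0, n_init=1).fit(X)

assert_array_almost_equal(km_elkan.cluster_centers_, km_lloyd.cluster_centers_)
assert_array_equal(km_elkan.labels_, km_lloyd.labels_)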
Example #11
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_kmeans_results(representation, algo, dtype):
    # checks that kmeans works as intended
    array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)

    expected_labels = [0, 0, 1, 1]
    expected_inertia = 0.1875
    expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
    expected_n_iter = 2

    kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo)
    kmeans.fit(X, sample_weight=sample_weight)

    assert_array_equal(kmeans.labels_, expected_labels)
    assert_almost_equal(kmeans.inertia_, expected_inertia)
    assert_array_almost_equal(kmeans.cluster_centers_, expected_centers)
    assert kmeans.n_iter_ == expected_n_iter
Example #12
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_k_means_non_collapsed():
    # Check k_means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not
    # result in a repositioning of the centers to the center of mass that
    # would lead to collapsed centers, which in turn makes the clustering
    # dependent on numerical instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not have collapsed
    assert_equal(len(np.unique(km.labels_)), 3)

    centers = km.cluster_centers_
    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
Example #13
Source File: spectral_graph_partition.py From LanczosNetwork with MIT License | 6 votes |
def spectral_clustering(L, K, seed=1234):
    """
    Implement paper "Shi, J. and Malik, J., 2000. Normalized cuts and image
    segmentation. IEEE Transactions on pattern analysis and machine
    intelligence, 22(8), pp.888-905."

    Args:
      L: graph Laplacian, numpy or scipy matrix
      K: int, number of clusters

    Returns:
      node_label: list

    N.B.: for simplicity, we only consider simple and undirected graph
    """
    num_nodes = L.shape[0]
    assert (K < num_nodes - 1)

    eig, eig_vec = scipy.sparse.linalg.eigsh(
        L, k=K, which='LM', maxiter=num_nodes * 10000, tol=0, mode='normal')
    kmeans = KMeans(n_clusters=K, random_state=seed).fit(eig_vec.real)

    return kmeans.labels_
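A hypothetical usage sketch (not part of the LanczosNetwork source): build the Laplacian of a small graph with two obvious communities and hand it to the function above. The adjacency matrix is made up, and the imports only mirror what the original module already assumes (numpy, scipy, scikit-learn's KMeans).

import numpy as np
import scipy.sparse
import scipy.sparse.linalg
from sklearn.cluster import KMeans

# Two triangles joined by a single edge -> two natural clusters.
A = np.array([[0, 1, 1, 0, 0, 0],
              [1, 0, 1, 0, 0, 0],
              [1, 1, 0, 1, 0, 0],
              [0, 0, 1, 0, 1, 1],
              [0, 0, 0, 1, 0, 1],
              [0, 0, 0, 1, 1, 0]], dtype=float)
D = np.diag(A.sum(axis=1))
L = scipy.sparse.csr_matrix(D - A)   # unnormalized graph Laplacian

labels = spectral_clustering(L, K=2)
print(labels)   # one cluster id per node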
Example #14
Source File: test_pipeline.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([
        ('scaler', scaler_for_pipeline),
        ('Kmeans', km_for_pipeline)
    ])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
Example #15
Source File: clustering_kmeans.py From practicalDataAnalysisCookbook with GNU General Public License v2.0 | 6 votes |
def findClusters_kmeans(data):
    '''
        Cluster data using k-means
    '''
    # create the classifier object
    kmeans = cl.KMeans(
        n_clusters=4,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data)

# the file name of the dataset
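A hypothetical call to the helper above on synthetic blobs is sketched below. Note that this and several later examples pass n_jobs to KMeans; that parameter was deprecated in scikit-learn 0.23 and removed in 1.0, so the sketch assumes an older release (on current versions, simply drop the argument).

import numpy as np
# assumes the module-level `import sklearn.cluster as cl` used by the function above

np.random.seed(0)
data = np.vstack([np.random.randn(50, 2) + offset
                  for offset in ([0, 0], [5, 5], [0, 5], [5, 0])])

model = findClusters_kmeans(data)   # returns the fitted KMeans estimator
print(model.cluster_centers_)       # four centers, roughly one per blob
print(model.labels_[:10])           # cluster assignments of the first ten points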
Example #16
Source File: pancreas_tests.py From scanorama with MIT License | 6 votes |
def entropy_test(datasets_dimred, ds_labels):
    ds_labels = np.array(ds_labels)
    X_dimred = np.concatenate(datasets_dimred)
    embedding = None

    for k in range(10, 21):
        km = KMeans(n_clusters=k, n_jobs=-1, verbose=0)
        km.fit(X_dimred)

        if False and k % 5 == 0:
            embedding = visualize(
                datasets_dimred,
                km.labels_, NAMESPACE + '_km{}'.format(k),
                [ str(x) for x in range(k) ],
                embedding=embedding
            )

        print('k = {}, average normalized entropy = {}'
              .format(k, avg_norm_entropy(ds_labels, km.labels_)))
Example #17
Source File: feature_preprocess.py From MassImageRetrieval with Apache License 2.0 | 6 votes |
def analysis_KMeans():
    mean_distortions = []
    K = len(labels_idx)
    K_range = range(320, 1000)
    for k in K_range:
        print("Cluster k is {}".format(k))
        kmeans_model = KMeans(n_clusters=k, init="k-means++", n_jobs=-1)
        kmeans_model.fit(np_features)
        t_distortions = sum(
            np.min(cdist(np_features, kmeans_model.cluster_centers_, 'euclidean'), axis=1)) / np_features.shape[0]
        mean_distortions.append(t_distortions)

    with open("./kmeans_cluster.csv", "a+") as wh:
        for idx in range(len(K_range)):
            wh.write("{},{}\n".format(K_range[idx], mean_distortions[idx]))

    # plt.plot(K_range, mean_distortions, 'bx-')
    # plt.xlabel('k')
    # plt.ylabel(u'Avgerage distortion degree')
    # plt.title(u'Elbows rule to select the best K value')
    # plt.savefig("kmeans_cluster.png")
Example #18
Source File: cluster.py From PHATE with GNU General Public License v2.0 | 6 votes |
def silhouette_score(phate_op, n_clusters, random_state=None, **kwargs):
    """Compute the Silhouette score on KMeans on the PHATE potential

    Parameters
    ----------
    phate_op : phate.PHATE
        Fitted PHATE operator
    n_clusters : int
        Number of clusters.
    random_state : int or None, optional (default: None)
        Random seed for k-means

    Returns
    -------
    score : float
    """
    cluster_labels = kmeans(phate_op, n_clusters=n_clusters,
                            random_state=random_state, **kwargs)
    return metrics.silhouette_score(phate_op.diff_potential, cluster_labels)
Example #19
Source File: SpectralClustering.py From sparse-subspace-clustering-python with MIT License | 6 votes |
def SpectralClustering(CKSym, n):
    # This is a direct port of JHU vision lab code. Could probably use sklearn SpectralClustering.
    CKSym = CKSym.astype(float)
    N, _ = CKSym.shape
    MAXiter = 1000  # Maximum number of iterations for KMeans
    REPlic = 20  # Number of replications for KMeans

    DN = np.diag(np.divide(1, np.sqrt(np.sum(CKSym, axis=0) + np.finfo(float).eps)))
    LapN = identity(N).toarray().astype(float) - np.matmul(np.matmul(DN, CKSym), DN)
    _, _, vN = np.linalg.svd(LapN)
    vN = vN.T
    kerN = vN[:, N - n:N]
    normN = np.sqrt(np.sum(np.square(kerN), axis=1))
    kerNS = np.divide(kerN, normN.reshape(len(normN), 1) + np.finfo(float).eps)
    km = KMeans(n_clusters=n, n_init=REPlic, max_iter=MAXiter, n_jobs=-1).fit(kerNS)
    return km.labels_
Example #20
Source File: test_spectral_embedding.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(
                km.labels_, true_labels), 1.0, 2)
Example #21
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 5 votes |
def simple_cluster(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds cluster of users in data.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    max_n_clusters: int, optional
        Maximal number of clusters for automatic selection for number of clusters.
        If ``None``, then uses n_clusters from arguments. Default: ``None``
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated.
        IMPORTANT: it may take a lot of time. Default: ``True``
    random_state: int, optional
        Random state for KMeans clusterer. Default: ``0``
    kwargs: optional
        Parameters for ``sklearn.cluster.KMeans``

    Returns
    -------
    Array of clusters

    Return type
    -------
    np.array
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, KMeans, max_n_clusters, random_state, **kwargs)
    else:
        kmargs = {i: j for i, j in kwargs.items() if i in __KMEANS_FILTER__}
    kmargs.update({'random_state': random_state})

    km = KMeans(**kmargs)
    cl = km.fit_predict(data.values)

    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
Example #22
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 5 votes |
def find_best_n_clusters(data, clusterer, max_n_clusters, random_state, **kwargs):
    """
    Finds best number of clusters for KMeans and Gaussian Mixture.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering with index as in ``retention_config.index_col``
    clusterer: sklearn clusterer class
        For instance, ``sklearn.cluster.KMeans`` or ``sklearn.mixture.GaussianMixture``.
    max_n_clusters: int
        Maximal number of clusters for searching.
    random_state: int
        Random state for clusterer.

    Returns
    -------
    Optimal keyword arguments for clustering method.

    Return type
    ------
    Dict
    """
    args = {i: j for i, j in kwargs.items() if i in clusterer.get_params(clusterer)}
    if 'n_clusters' in clusterer.get_params(clusterer):
        kms = True
    else:
        kms = False
    args.pop('n_clusters' if kms else 'n_components', None)
    args.update({'random_state': random_state})

    score = {}
    for i in range(2, max_n_clusters + 1):
        args.update({'n_clusters' if kms else 'n_components': i})
        km = clusterer(**args)
        score[i] = silhouette_score(data, km.fit_predict(data), metric='cosine')

    best = pd.Series(score).idxmax()
    args.update({'n_clusters' if kms else 'n_components': best})
    print(f'Best number of clusters is {best}')
    return args
Example #23
Source File: K-Means_scikit-learn.py From MachineLearning_Python with MIT License | 5 votes |
def kMenas():
    data = spio.loadmat("data.mat")
    X = data['X']
    model = KMeans(n_clusters=3).fit(X)  # n_clusters specifies 3 clusters; fit the data
    centroids = model.cluster_centers_  # cluster centers

    plt.scatter(X[:, 0], X[:, 1])  # scatter plot of the original data
    plt.plot(centroids[:, 0], centroids[:, 1], 'r^', markersize=10)  # cluster centers
    plt.show()
Example #24
Source File: clustering.py From retentioneering-tools with Mozilla Public License 2.0 | 5 votes |
def dbscan(data, use_csi=True, epsq=None, max_cl_number=None, **kwargs):
    """
    Finds cluster of users in data using DBSCAN

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering indexed by users (sessions)
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated.
        IMPORTANT: it may take a lot of time. Default: ``True``
    epsq: float, optional
        Quantile of nearest neighbor positive distance between dots, its value
        will be an eps. If ``None``, then eps from keywords will be used.
        Default: ``None``
    max_cl_number: int, optional
        Maximal number of clusters for aggregation of small clusters. Default: ``None``
    kwargs: optional
        Parameters for ``sklearn.cluster.DBSCAN``

    Returns
    --------
    Array of clusters

    Return type
    -------
    np.array
    """
    kmargs = {i: j for i, j in kwargs.items() if i in DBSCAN.get_params(DBSCAN)}
    if epsq is not None:
        kmargs.update({'eps': find_best_eps(data, epsq)})
    km = DBSCAN(**kmargs)
    cl = km.fit_predict(data.values)

    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    if max_cl_number is not None:
        cl = aggregate_cl(cl, max_cl_number)
    return cl, metrics
Example #25
Source File: test_weight_boosting.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_sample_weight_missing():
    from sklearn.cluster import KMeans

    clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
    assert_raises(ValueError, clf.fit, X, y_regr)

    clf = AdaBoostRegressor(KMeans())
    assert_raises(ValueError, clf.fit, X, y_regr)
Example #26
Source File: clustering_kmeans_search.py From practicalDataAnalysisCookbook with GNU General Public License v2.0 | 5 votes |
def findClusters_kmeans(data, no_of_clusters):
    '''
        Cluster data using k-means
    '''
    # create the classifier object
    kmeans = cl.KMeans(
        n_clusters=no_of_clusters,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data)
Example #27
Source File: cnn_lcd.py From CNN_LCD with GNU General Public License v3.0 | 5 votes |
def cluster_kmeans(sim):
    """Run k-means on similarity matrix and segment"""
    sim_dim = sim.shape[0]
    sim = sim.reshape(-1, 1)

    # Augment with spatial coordinates
    sim_aug = np.concatenate(
        [sim, np.mgrid[:sim_dim, :sim_dim].reshape(-1, sim_dim ** 2).T],
        axis=1
    )

    # Empirical metric for number of loop-closures given number of images
    # in sequence (assumption: equally-spaced samples):
    n_clusters = int(np.sqrt(sim_dim))
    print('Performing clustering via KMeans(n={}).'.format(n_clusters))

    km = KMeans(n_clusters=n_clusters, n_jobs=2, max_iter=300)
    labels = km.fit_predict(sim_aug)
    print('Got cluster labels')

    for i in range(n_clusters):
        lab_idx = (labels == i)
        if lab_idx.size:
            cc = sim[lab_idx].mean()
            # cc = sim[lab_idx].max()
            sim[lab_idx] = cc

    # Re-normalize and reshape
    sim = sim.reshape(sim_dim, sim_dim) / sim.max()
    return sim
Example #28
Source File: sam_knn.py From scikit-multiflow with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cluster_down(self, samples, labels):
    """Performs classwise kMeans++ clustering for given samples with corresponding labels.
    The number of samples is halved per class."""
    logging.debug('cluster Down %d' % self.trainStepCount)
    uniqueLabels = np.unique(labels)
    newSamples = np.empty(shape=(0, samples.shape[1]))
    newLabels = np.empty(shape=(0), dtype=np.int32)
    for label in uniqueLabels:
        tmpSamples = samples[labels == label]
        newLength = int(max(tmpSamples.shape[0]/2, 1))
        clustering = KMeans(n_clusters=newLength, n_init=1, random_state=0)
        clustering.fit(tmpSamples)
        newSamples = np.vstack([newSamples, clustering.cluster_centers_])
        newLabels = np.append(newLabels, label*np.ones(shape=newLength, dtype=np.int32))
    return newSamples, newLabels
Example #29
Source File: clustering_kmeans_search_alternative.py From practicalDataAnalysisCookbook with GNU General Public License v2.0 | 5 votes |
def findClusters_kmeans(data, no_of_clusters):
    '''
        Cluster data using k-means
    '''
    # create the classifier object
    kmeans = cl.KMeans(
        n_clusters=no_of_clusters,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data)
Example #30
Source File: cluster_features.py From bert-extractive-summarizer with MIT License | 5 votes |
def __get_model(self, k: int):
    """
    Retrieve clustering model

    :param k: amount of clusters
    :return: Clustering model
    """
    if self.algorithm == 'gmm':
        return GaussianMixture(n_components=k, random_state=self.random_state)
    return KMeans(n_clusters=k, random_state=self.random_state)
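As a closing, standalone illustration of the same design choice — GaussianMixture when soft, probabilistic clusters are wanted, KMeans otherwise — here is a hedged sketch with made-up data and a hypothetical helper name; it is not part of the bert-extractive-summarizer source.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

def get_model(algorithm: str, k: int, random_state: int = 12345):
    # Mirrors the pattern above: GMM for soft clusters, k-means otherwise.
    if algorithm == 'gmm':
        return GaussianMixture(n_components=k, random_state=random_state)
    return KMeans(n_clusters=k, random_state=random_state)

X = np.random.RandomState(0).randn(100, 8)   # stand-in for sentence embeddings
labels_km = get_model('kmeans', k=3).fit_predict(X)
labels_gmm = get_model('gmm', k=3).fit_predict(X)
print(labels_km[:10], labels_gmm[:10])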