Python sklearn.metrics.normalized_mutual_info_score() Examples
The following are 27 code examples of sklearn.metrics.normalized_mutual_info_score(), drawn from the open source projects named above each example.
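Before the project examples, here is a minimal, self-contained sketch of the metric itself (the label arrays are made up for illustration). NMI compares two flat cluster assignments and is invariant to permutations of the cluster ids:

from sklearn.metrics import normalized_mutual_info_score

# Identical partitions score 1.0, even under relabeling.
print(normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]))  # 1.0
print(normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0

# A constant ground truth shares no information with any split.
print(normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]))  # 0.0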
Example #1
Source File: DCCComputation.py From DCC with MIT License

def benchmarking(gtlabels, labels):
    # TODO: Please note that the AMI definition used in the paper differs from
    # that in the sklearn python package.
    # TODO: Please modify it accordingly.
    numeval = len(gtlabels)
    ari = metrics.adjusted_rand_score(gtlabels[:numeval], labels[:numeval])
    ami = metrics.adjusted_mutual_info_score(gtlabels[:numeval], labels[:numeval])
    nmi = metrics.normalized_mutual_info_score(gtlabels[:numeval], labels[:numeval])
    acc = clustering_accuracy(gtlabels[:numeval], labels[:numeval])

    return ari, ami, nmi, acc
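The TODO above warns that the paper's AMI definition differs from sklearn's. Independent of that caveat, it is worth knowing that sklearn's AMI is chance-corrected while NMI is not, so the two scores diverge on imperfect clusterings. A quick illustration with made-up labels:

from sklearn import metrics

gt = [0, 0, 1, 1, 2, 2]
pred = [0, 0, 1, 2, 2, 2]
print(metrics.adjusted_mutual_info_score(gt, pred))    # chance-corrected
print(metrics.normalized_mutual_info_score(gt, pred))  # not chance-corrected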
Example #2
Source File: test_spectral_embedding.py From twitter-stock-recommendation with MIT License

def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2)
Example #3
Source File: datasets.py From MLPrimitives with MIT License

def load_amazon():
    """Amazon dataset.

    Amazon product co-purchasing network and ground-truth communities.

    Network was collected by crawling Amazon website. It is based on
    Customers Who Bought This Item Also Bought feature of the Amazon
    website. If a product i is frequently co-purchased with product j,
    the graph contains an undirected edge from i to j. Each product
    category provided by Amazon defines each ground-truth community.
    """
    dataset_path = _load('amazon')

    X = _load_csv(dataset_path, 'data')
    y = X.pop('label').values

    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))

    return Dataset(load_amazon.__doc__, X, y, normalized_mutual_info_score,
                   'graph', 'community_detection', graph=graph)
Example #4
Source File: test_clustering_loss.py From deep_metric_learning with MIT License

def check_forward(self, x_data, c_data, gamma, T, y_star, y_pam):
    num_examples = len(x_data)
    x = chainer.Variable(x_data)
    c = chainer.Variable(c_data)
    loss = clustering_loss(x, c, gamma, T)

    sq_distances_ij = []
    for i, j in zip(range(num_examples), y_pam):
        sqd_ij = np.sum((x_data[i] - x_data[j]) ** 2)
        sq_distances_ij.append(sqd_ij)
    f = -sum(sq_distances_ij)

    sq_distances_ij = []
    for i, j in zip(range(num_examples), y_star):
        sqd_ij = np.sum((x_data[i] - x_data[j]) ** 2)
        sq_distances_ij.append(sqd_ij)
    f_tilde = -sum(sq_distances_ij)

    delta = 1.0 - normalized_mutual_info_score(cuda.to_cpu(c_data), y_pam)
    loss_expected = f + gamma * delta - f_tilde

    testing.assert_allclose(loss.data, loss_expected)
Example #5
Source File: datasets.py From MLBlocks with MIT License

def load_amazon():
    """Amazon product co-purchasing network and ground-truth communities.

    Network was collected by crawling Amazon website. It is based on
    Customers Who Bought This Item Also Bought feature of the Amazon
    website. If a product i is frequently co-purchased with product j,
    the graph contains an undirected edge from i to j. Each product
    category provided by Amazon defines each ground-truth community.
    """
    dataset_path = _load('amazon')

    X = _load_csv(dataset_path, 'data')
    y = X.pop('label').values

    graph = nx.Graph(nx.read_gml(os.path.join(dataset_path, 'graph.gml')))

    return Dataset(load_amazon.__doc__, X, y, normalized_mutual_info_score,
                   graph=graph)
Example #6
Source File: posterior.py From scVI with MIT License

def clustering_scores(self, prediction_algorithm: str = "knn") -> Tuple:
    if self.gene_dataset.n_labels > 1:
        latent, _, labels = self.get_latent()
        if prediction_algorithm == "knn":
            labels_pred = KMeans(
                self.gene_dataset.n_labels, n_init=200
            ).fit_predict(latent)  # n_jobs>1 ?
        elif prediction_algorithm == "gmm":
            gmm = GMM(self.gene_dataset.n_labels)
            gmm.fit(latent)
            labels_pred = gmm.predict(latent)

        asw_score = silhouette_score(latent, labels)
        nmi_score = NMI(labels, labels_pred)
        ari_score = ARI(labels, labels_pred)
        uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
        logger.debug(
            "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
            % (asw_score, nmi_score, ari_score, uca_score)
        )
        return asw_score, nmi_score, ari_score, uca_score
Example #7
Source File: test_spectral_embedding.py From Mastering-Elasticsearch-7.0 with MIT License

def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2)
Example #8
Source File: test_spectral_embedding.py From megaman with BSD 2-Clause "Simplified" License

def test_spectral_embedding_two_components(seed=36):
    """Test spectral embedding with two components"""
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::, n_sample::] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(n_components=1,
                                   random_state=np.random.RandomState(seed),
                                   eigen_solver='arpack')
    embedded_coordinate = se_precomp.fit_transform(affinity,
                                                   input_type='affinity')
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
Example #9
Source File: test_spectral_embedding.py From twitter-stock-recommendation with MIT License

def test_spectral_embedding_two_components(seed=36):
    # Test spectral embedding with two components
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::, n_sample::] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2

    # Test of internal _graph_connected_component before connection
    component = _graph_connected_component(affinity, 0)
    assert_true(component[:n_sample].all())
    assert_true(not component[n_sample:].any())
    component = _graph_connected_component(affinity, -1)
    assert_true(not component[:n_sample].any())
    assert_true(component[n_sample:].all())

    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(n_components=1, affinity="precomputed",
                                   random_state=np.random.RandomState(seed))
    embedded_coordinate = se_precomp.fit_transform(affinity)
    # Some numpy versions are touchy with types
    embedded_coordinate = \
        se_precomp.fit_transform(affinity.astype(np.float32))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
Example #10
Source File: test_spectral_embedding.py From Mastering-Elasticsearch-7.0 with MIT License

def test_spectral_embedding_two_components(seed=36):
    # Test spectral embedding with two components
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::, n_sample::] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2

    # Test of internal _graph_connected_component before connection
    component = _graph_connected_component(affinity, 0)
    assert component[:n_sample].all()
    assert not component[n_sample:].any()
    component = _graph_connected_component(affinity, -1)
    assert not component[:n_sample].any()
    assert component[n_sample:].all()

    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(n_components=1, affinity="precomputed",
                                   random_state=np.random.RandomState(seed))
    embedded_coordinate = se_precomp.fit_transform(affinity)
    # Some numpy versions are touchy with types
    embedded_coordinate = \
        se_precomp.fit_transform(affinity.astype(np.float32))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
Example #11
Source File: accuracy_calculator.py From pytorch-metric-learning with MIT License

def calculate_NMI(self, query_labels, cluster_labels, **kwargs):
    return normalized_mutual_info_score(query_labels, cluster_labels)
Example #12
Source File: eval_metrics.py From IIC with MIT License

def _nmi(preds, targets):
    return metrics.normalized_mutual_info_score(targets, preds)
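Note that Examples #11 and #12 pass the prediction and target arrays in opposite orders; this is harmless because mutual information, and hence NMI, is symmetric in its two arguments. A quick check with made-up labels:

import numpy as np
from sklearn.metrics import normalized_mutual_info_score

a = [0, 0, 1, 1, 2]
b = [1, 1, 0, 2, 2]
# Argument order does not change the score (up to floating-point noise).
assert np.isclose(normalized_mutual_info_score(a, b),
                  normalized_mutual_info_score(b, a))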
Example #13
Source File: comparison.py From cdlib with BSD 2-Clause "Simplified" License

def normalized_mutual_information(first_partition, second_partition):
    """Normalized Mutual Information between two clusterings.

    Normalized Mutual Information (NMI) is a normalization of the Mutual
    Information (MI) score to scale the results between 0 (no mutual
    information) and 1 (perfect correlation). In this function, mutual
    information is normalized by ``sqrt(H(labels_true) * H(labels_pred))``

    :param first_partition: NodeClustering object
    :param second_partition: NodeClustering object
    :return: MatchingResult object

    :Example:

    >>> from cdlib import evaluation, algorithms
    >>> g = nx.karate_club_graph()
    >>> louvain_communities = algorithms.louvain(g)
    >>> leiden_communities = algorithms.leiden(g)
    >>> evaluation.normalized_mutual_information(louvain_communities,leiden_communities)
    """
    __check_partition_coverage(first_partition, second_partition)
    __check_partition_overlap(first_partition, second_partition)

    first_partition_c = [x[1] for x in sorted(
        [(node, nid) for nid, cluster in enumerate(first_partition.communities)
         for node in cluster], key=lambda x: x[0])]
    second_partition_c = [x[1] for x in sorted(
        [(node, nid) for nid, cluster in enumerate(second_partition.communities)
         for node in cluster], key=lambda x: x[0])]

    from sklearn.metrics import normalized_mutual_info_score
    return MatchingResult(
        score=normalized_mutual_info_score(first_partition_c, second_partition_c))
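The docstring above describes normalization by sqrt(H(labels_true) * H(labels_pred)). In sklearn this corresponds to average_method='geometric'; since scikit-learn 0.22 the default is 'arithmetic', so pass the argument explicitly to reproduce the formula. A sketch that recomputes the geometric-mean NMI by hand (labels are made up for illustration):

import numpy as np
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score

labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [0, 0, 1, 2, 2, 2]

def entropy(labels):
    # Shannon entropy in nats, matching sklearn's internal convention.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log(p))

mi = mutual_info_score(labels_true, labels_pred)
nmi_manual = mi / np.sqrt(entropy(labels_true) * entropy(labels_pred))
nmi_sklearn = normalized_mutual_info_score(labels_true, labels_pred,
                                           average_method='geometric')
assert np.isclose(nmi_manual, nmi_sklearn)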
Example #14
Source File: test_metrics.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

def test_normalized_mutual_info_score(self):
    result = self.df.metrics.normalized_mutual_info_score()
    expected = metrics.normalized_mutual_info_score(self.target, self.pred)
    self.assertEqual(result, expected)
Example #15
Source File: test_spectral_embedding.py From megaman with BSD 2-Clause "Simplified" License

def test_diffusion_embedding_two_components_diffusion_time_one(seed=36):
    """Test spectral embedding with two components"""
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::, n_sample::] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    geom_params = {'laplacian_method': 'geometric'}
    se_precomp = SpectralEmbedding(n_components=1,
                                   random_state=np.random.RandomState(seed),
                                   eigen_solver='arpack',
                                   diffusion_maps=True,
                                   diffusion_time=1.0,
                                   geom=geom_params)
    embedded_coordinate = se_precomp.fit_transform(affinity,
                                                   input_type='affinity')
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
Example #16
Source File: test_spectral_embedding.py From megaman with BSD 2-Clause "Simplified" License

def test_diffusion_embedding_two_components_no_diffusion_time(seed=36):
    """Test spectral embedding with two components"""
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::, n_sample::] = np.abs(
        random_state.randn(n_sample, n_sample)) + 2
    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    geom_params = {'laplacian_method': 'geometric'}
    se_precomp = SpectralEmbedding(n_components=1,
                                   random_state=np.random.RandomState(seed),
                                   eigen_solver='arpack',
                                   diffusion_maps=True,
                                   geom=geom_params)
    embedded_coordinate = se_precomp.fit_transform(affinity,
                                                   input_type='affinity')
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
Example #17
Source File: metric_learning_test.py From tf-slim with Apache License 2.0

def _augmented_update_medoid_ics_in_place(self, pdists, y_gt, cluster_ics,
                                          medoid_ics, loss_mult):
    for cluster_idx in range(self.n_clusters):
        # y_pred = self._get_cluster_ics(D, medoid_ics)
        # Don't prematurely do the assignment step.
        # Do this after we've updated all cluster medoids.
        y_pred = cluster_ics

        if sum(y_pred == cluster_idx) == 0:
            # Cluster is empty.
            continue

        curr_score = (
            -1.0 * np.sum(
                pdists[medoid_ics[cluster_idx], y_pred == cluster_idx]) +
            loss_mult * (1.0 - metrics.normalized_mutual_info_score(
                y_gt, y_pred)))

        pdist_in = pdists[y_pred == cluster_idx, :]
        pdist_in = pdist_in[:, y_pred == cluster_idx]

        all_scores_fac = np.sum(-1.0 * pdist_in, axis=1)
        all_scores_loss = []
        for i in range(y_pred.size):
            if y_pred[i] != cluster_idx:
                continue
            # remove this cluster's current centroid
            medoid_ics_i = medoid_ics[:cluster_idx] + medoid_ics[cluster_idx + 1:]
            # add this new candidate to the centroid list
            medoid_ics_i += [i]
            y_pred_i = self._get_cluster_ics(pdists, medoid_ics_i)
            all_scores_loss.append(loss_mult * (
                1.0 - metrics.normalized_mutual_info_score(y_gt, y_pred_i)))

        all_scores = all_scores_fac + all_scores_loss
        max_score_idx = np.argmax(all_scores)
        max_score = all_scores[max_score_idx]

        if max_score > curr_score:
            medoid_ics[cluster_idx] = np.where(
                y_pred == cluster_idx)[0][max_score_idx]
Example #18
Source File: clustering_util.py From active_learning_coreset with MIT License

def evaluate_clustering(y_gt, y_assignment):
    return normalized_mutual_info_score(y_gt, y_assignment)
Example #19
Source File: HAN.py From OpenHINE with MIT License

def my_Kmeans(x, y, k=4, time=10, return_NMI=False):
    x = np.array(x)
    x = np.squeeze(x)
    y = np.array(y)

    if len(y.shape) > 1:
        y = np.argmax(y, axis=1)

    estimator = KMeans(n_clusters=k)
    ARI_list = []  # adjusted_rand_score(
    NMI_list = []
    if time:
        # print('KMeans runs {} times and takes the average'.format(time))
        for i in range(time):
            estimator.fit(x, y)
            y_pred = estimator.predict(x)
            score = normalized_mutual_info_score(y, y_pred)
            NMI_list.append(score)
            s2 = adjusted_rand_score(y, y_pred)
            ARI_list.append(s2)
        # print('NMI_list: {}'.format(NMI_list))
        score = sum(NMI_list) / len(NMI_list)
        s2 = sum(ARI_list) / len(ARI_list)
        print('NMI (10 avg): {:.4f} , ARI (10avg): {:.4f}'.format(score, s2))
    else:
        estimator.fit(x, y)
        y_pred = estimator.predict(x)
        score = normalized_mutual_info_score(y, y_pred)
        print("NMI on all label data: {:.5f}".format(score))
    if return_NMI:
        return score, s2
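The loop above averages NMI/ARI over repeated KMeans fits to smooth out random initialization. An alternative (a sketch with synthetic data, not the project's code) is to let KMeans handle restarts itself via n_init and score a single run:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

rng = np.random.RandomState(0)
x = np.vstack([rng.randn(50, 8) + c for c in range(4)])  # 4 synthetic clusters
y = np.repeat(np.arange(4), 50)

# n_init=10 runs 10 initializations internally and keeps the best fit.
y_pred = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(x)
print(normalized_mutual_info_score(y, y_pred))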
Example #20
Source File: evaluation.py From OpenHINE with MIT License

def evaluate_paper_cluster(self, embedding_matrix):
    embedding_list = embedding_matrix.tolist()
    X = []
    Y = []
    for paper in self.paper_label:
        X.append(embedding_list[paper])
        Y.append(self.paper_label[paper])

    pred_Y = KMeans(3).fit(np.array(X)).predict(X)
    score = normalized_mutual_info_score(np.array(Y), pred_Y)
    return score
Example #21
Source File: evaluation.py From OpenHINE with MIT License

def evaluate_author_cluster(self, embedding_matrix):
    embedding_list = embedding_matrix.tolist()
    X = []
    Y = []
    for author in self.author_label:
        X.append(embedding_list[author])
        Y.append(self.author_label[author])

    pred_Y = KMeans(4).fit(np.array(X)).predict(X)
    score = normalized_mutual_info_score(np.array(Y), pred_Y)
    return score
Example #22
Source File: test.py From OpenHINE with MIT License

def evaluate_cluster(self, embedding_list):
    X = []
    Y = []
    for p in self.label:
        X.append(embedding_list[p])
        Y.append(self.label[p])

    Y_pred = KMeans(self.n_label, random_state=self.seed).fit(np.array(X)).predict(X)
    nmi = normalized_mutual_info_score(np.array(Y), Y_pred)
    ari = adjusted_rand_score(np.array(Y), Y_pred)
    return nmi, ari
Example #23
Source File: metric_loss_ops.py From cluster-loss-tensorflow with BSD 2-Clause "Simplified" License

def _compute_nmi_score(labels, predictions):
    return math_ops.to_float(
        script_ops.py_func(
            metrics.normalized_mutual_info_score, [labels, predictions],
            [dtypes.float64],
            name='nmi'))
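Example #23 wraps the sklearn metric as a graph op using TensorFlow's internal math_ops/script_ops modules. A rough TF 2.x equivalent using only the public API (a sketch under that assumption, not the project's code):

import tensorflow as tf
from sklearn import metrics

def nmi_op(labels, predictions):
    # tf.py_function runs the sklearn metric eagerly inside the graph.
    score = tf.py_function(
        func=lambda y, y_pred: metrics.normalized_mutual_info_score(
            y.numpy(), y_pred.numpy()),
        inp=[labels, predictions],
        Tout=tf.float64,
        name='nmi')
    return tf.cast(score, tf.float32)  # mirror the to_float cast above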
Example #24
Source File: metric_learning_test.py From tf-slim with Apache License 2.0

def pam_augmented_fit(self, feat, y, loss_mult):
    pam_max_iter = 5
    self._check_init_args()
    feat = self._check_array(feat)
    pdists = pairwise_distance_np(feat)
    self.loss_augmented_fit(feat, y, loss_mult)
    print('PAM -1 (before PAM): score: %f, score_aug: %f' % (
        self.score_, self.score_aug_))

    # Initialize from loss augmented facility location
    subset = self.center_ics_
    for iter_ in range(pam_max_iter):
        # update the cluster assignment
        cluster_ics = self._get_cluster_ics(pdists, subset)

        # update the medoid for each clusters
        self._augmented_update_medoid_ics_in_place(pdists, y, cluster_ics,
                                                   subset, loss_mult)

        self.score_ = np.float32(-1.0) * self._get_facility_distance(
            pdists, subset)
        self.score_aug_ = self.score_ + loss_mult * (
            1.0 - metrics.normalized_mutual_info_score(
                y, self._get_cluster_ics(pdists, subset)))
        self.score_aug_ = self.score_aug_.astype(np.float32)

        print('PAM iter: %d, score: %f, score_aug: %f' % (
            iter_, self.score_, self.score_aug_))

    self.center_ics_ = subset
    self.labels_ = cluster_ics

    return self
Example #25
Source File: forward_greedy_facility.py From active_learning_coreset with MIT License

def loss_augmented_fit(self, X, y, loss_mult):
    """Fit K-Medoids to the provided data.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)

    Returns
    -------
    self
    """
    self._check_init_args()

    # Check that the array is good and attempt to convert it to
    # Numpy array if possible
    X = self._check_array(X)

    # Apply distance metric to get the distance matrix
    D = self.distance_func(X)

    num_data = X.shape[0]
    # list() is needed on Python 3 so elements can be deleted below;
    # the original project targets Python 2, where range() returns a list.
    candidate_ids = list(range(num_data))
    candidate_scores = np.zeros(num_data,)
    subset = []

    k = 0
    while k < self.n_clusters:
        candidate_scores = []
        for i in candidate_ids:
            # push i to subset
            subset.append(i)
            marginal_cost = np.sum(np.min(D[:, subset], axis=1))
            loss = normalized_mutual_info_score(y, self._get_cluster_ics(D, subset))
            candidate_scores.append(marginal_cost - loss_mult * loss)
            # remove i from subset
            subset.pop()

        # push i_star to subset
        i_star = candidate_ids[np.argmin(candidate_scores)]
        bisect.insort(subset, i_star)
        # remove i_star from candidate indices
        del candidate_ids[bisect.bisect_left(candidate_ids, i_star)]

        k = k + 1
        # print('|S|: %d, F(S): %f' % (k, np.min(candidate_scores)))

    # Expose labels_ which are the assignments of
    # the training data to clusters
    self.labels_ = self._get_cluster_ics(D, subset)
    # Expose cluster centers, i.e. medoids
    self.cluster_centers_ = X.take(subset, axis=0)
    # Expose indices of chosen cluster centers
    self.center_ics_ = subset

    return self
Example #26
Source File: plot.py From SCALE with MIT License

def plot_metrics(path, dataset, ref, fraction):
    ARI = []
    NMI = []
    F1 = []
    methods = ['scABC', 'SC3', 'scVI', 'SCALE']
    for frac in fraction:
        outdir = os.path.join(path, dataset, frac)  # print(outdir)
        scABC_pred, _ = read_labels(os.path.join(outdir, 'scABC_predict.txt'))
        if os.path.isfile(os.path.join(outdir, 'SC3_predict.txt')):
            SC3_pred, _ = read_labels(os.path.join(outdir, 'SC3_predict.txt'))
        else:
            SC3_pred = None
        scVI_pred, _ = read_labels(os.path.join(outdir, 'scVI_predict.txt'))
        scale_pred, pred_classes = read_labels(os.path.join(outdir, 'cluster_assignments.txt'))

        ari = []
        nmi = []
        f1 = []
        for pred, method in zip([scABC_pred, SC3_pred, scVI_pred, scale_pred], methods):
            if pred is None:
                ari.append(0)
                nmi.append(0)
                f1.append(0)
            else:
                pred = reassign_cluster_with_ref(pred, ref)
                ari.append(adjusted_rand_score(ref, pred))
                nmi.append(normalized_mutual_info_score(ref, pred))
                f1.append(f1_score(ref, pred, average='micro'))
        ARI.append(ari)
        NMI.append(nmi)
        F1.append(f1)

    fraction = [frac.replace('corrupt_', '') for frac in fraction]
    ARI = pd.Series(np.concatenate(ARI, axis=0))
    NMI = pd.Series(np.concatenate(NMI, axis=0))
    F1 = pd.Series(np.concatenate(F1, axis=0))
    M = pd.Series(methods * len(fraction))
    F = pd.Series(np.concatenate([[i] * len(methods) for i in fraction]))

    metrics = pd.concat([ARI, NMI, F1, M, F], axis=1)
    metrics.columns = ['ARI', 'NMI', 'F1', 'method', 'fraction']

    lineplot(metrics, 'ARI', dataset, False)
    lineplot(metrics, 'NMI', dataset, False)
    lineplot(metrics, 'F1', dataset, True)
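Example #26 calls reassign_cluster_with_ref before computing F1: NMI and ARI are invariant to relabeling of clusters, but F1 is not, so predicted cluster ids must first be matched to reference labels. One common way to do this (a sketch of the general idea, not SCALE's implementation; it assumes both clusterings use the same number of clusters) is Hungarian matching on the contingency table:

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import contingency_matrix

def match_to_reference(pred, ref):
    """Map each predicted cluster id to the best-matching reference label."""
    cm = contingency_matrix(ref, pred)             # rows: ref labels, cols: pred ids
    row_ind, col_ind = linear_sum_assignment(-cm)  # maximize total overlap
    ref_values = np.unique(ref)
    pred_values = np.unique(pred)
    mapping = {pred_values[c]: ref_values[r] for r, c in zip(row_ind, col_ind)}
    return np.array([mapping[p] for p in pred])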
Example #27
Source File: metric_learning_test.py From tf-slim with Apache License 2.0

def loss_augmented_fit(self, feat, y, loss_mult):
    """Fit K-Medoids to the provided data."""
    self._check_init_args()
    # Check that the array is good and attempt to convert it to
    # Numpy array if possible.
    feat = self._check_array(feat)
    # Apply distance metric to get the distance matrix.
    pdists = pairwise_distance_np(feat)

    num_data = feat.shape[0]
    candidate_ids = list(range(num_data))
    candidate_scores = np.zeros(num_data,)
    subset = []

    k = 0
    while k < self.n_clusters:
        candidate_scores = []
        for i in candidate_ids:
            # push i to subset.
            subset.append(i)
            marginal_cost = -1.0 * np.sum(np.min(pdists[:, subset], axis=1))
            loss = 1.0 - metrics.normalized_mutual_info_score(
                y, self._get_cluster_ics(pdists, subset))
            candidate_scores.append(marginal_cost + loss_mult * loss)
            # remove i from subset.
            subset.pop()

        # push i_star to subset.
        i_star = candidate_ids[np.argmax(candidate_scores)]
        subset.append(i_star)
        # remove i_star from candidate indices.
        candidate_ids.remove(i_star)
        k += 1

    # Expose labels_ which are the assignments of
    # the training data to clusters.
    self.labels_ = self._get_cluster_ics(pdists, subset)
    # Expose cluster centers, i.e. medoids.
    self.cluster_centers_ = feat.take(subset, axis=0)
    # Expose indices of chosen cluster centers.
    self.center_ics_ = subset
    # Expose the score = -\sum_{i \in V} min_{j \in S} || x_i - x_j ||
    self.score_ = np.float32(-1.0) * self._get_facility_distance(pdists, subset)
    self.score_aug_ = self.score_ + loss_mult * (
        1.0 - metrics.normalized_mutual_info_score(
            y, self._get_cluster_ics(pdists, subset)))
    self.score_aug_ = self.score_aug_.astype(np.float32)
    # Expose the chosen cluster indices.
    self.subset_ = subset

    return self