Python sklearn.metrics.silhouette_samples() Examples
The following are 12 code examples of sklearn.metrics.silhouette_samples(), drawn from open-source projects; the project, source file, and license are named above each example.
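As a quick orientation before the project examples, here is a minimal, self-contained sketch of the typical call pattern. The toy blobs and the KMeans clustering are invented for illustration:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Toy data: three well-separated blobs (illustrative only)
X, _ = make_blobs(n_samples=300, centers=3, random_state=0)

# Cluster, then score every sample against its assigned cluster
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
per_sample = silhouette_samples(X, labels)  # shape (n_samples,), values in [-1, 1]

print(per_sample[:5])
print(per_sample.mean())            # the mean equals silhouette_score(X, labels)
print(silhouette_score(X, labels))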
Example #1
Source File: param_sensitivity.py From scanorama with MIT License
def test_knn(datasets_dimred, genes, labels, idx, distr, xlabels):
    # Sweep the k-nearest-neighbors parameter; 'sil' is presumably an alias
    # for sklearn.metrics.silhouette_samples, and 'assemble' is a
    # scanorama-internal helper.
    knns = [5, 10, 50, 100]
    len_distr = len(distr)
    for knn in knns:
        integrated = assemble(datasets_dimred[:], knn=knn, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        # Compare against the baseline distributions already in distr
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(knn))
    print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('knn'))
Example #2
Source File: param_sensitivity.py From scanorama with MIT License
def test_sigma(datasets_dimred, genes, labels, idx, distr, xlabels):
    sigmas = [10, 50, 100, 200]
    len_distr = len(distr)
    for sigma in sigmas:
        integrated = assemble(datasets_dimred[:], sigma=sigma)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(sigma))
    print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('sigma'))
Example #3
Source File: param_sensitivity.py From scanorama with MIT License
def test_alpha(datasets_dimred, genes, labels, idx, distr, xlabels):
    alphas = [0, 0.05, 0.20, 0.50]
    len_distr = len(distr)
    for alpha in alphas:
        integrated = assemble(datasets_dimred[:], alpha=alpha, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(alpha))
    print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('alpha'))
Example #4
Source File: param_sensitivity.py From scanorama with MIT License
def test_approx(datasets_dimred, genes, labels, idx, distr, xlabels):
    integrated = assemble(datasets_dimred[:], approx=False, sigma=150)
    X = np.concatenate(integrated)
    distr.append(sil(X[idx, :], labels[idx]))
    len_distr = len(distr)
    for d in distr[:len_distr]:
        print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
    xlabels.append('Exact NN')
    print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('approx'))
Example #5
Source File: param_sensitivity.py From scanorama with MIT License
def test_perplexity(datasets_dimred, genes, labels, idx, distr, xlabels):
    X = np.concatenate(datasets_dimred)
    perplexities = [10, 100, 500, 2000]
    len_distr = len(distr)
    for perplexity in perplexities:
        embedding = fit_tsne(X, perplexity=perplexity)
        distr.append(sil(embedding[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(perplexity))
    print('')

    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('perplexity'))
Example #6
Source File: silhouette.py From SecuML with GNU General Public License v2.0
def gen_eval(self, output_dir, assigned_clusters, quick=False):
    if quick:
        self.silhouette_avg = 0
        return
    if self.distances is not None:
        # A precomputed square distance matrix can be scored directly
        self.silhouette_values = silhouette_samples(self.distances,
                                                    assigned_clusters,
                                                    metric='precomputed')
    else:
        features = self.instances.features.get_values()
        self.silhouette_values = silhouette_samples(features,
                                                    assigned_clusters)
    self.silhouette_avg = np.mean(self.silhouette_values)
    self.display_silhouette(output_dir, assigned_clusters)

# Code from a scikit-learn example:
# "Selecting the number of clusters with silhouette analysis on KMeans clustering"
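When a precomputed distance matrix is available, as in the SecuML code above, silhouette_samples() accepts it directly with metric='precomputed'. A minimal sketch, with the data and the cluster assignment invented for illustration:

import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_samples

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 4))        # invented feature matrix
labels = np.repeat([0, 1, 2], 20)   # invented cluster assignment

# Square (n_samples, n_samples) distance matrix, standing in for self.distances
D = pairwise_distances(X, metric='euclidean')
values = silhouette_samples(D, labels, metric='precomputed')
print(values.mean())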
Example #7
Source File: AgglomerativeClustering.py From mltk-algo-contrib with Apache License 2.0
def fit(self, df, options):
    """Do the clustering & merge labels with original data."""
    # Make a copy of the input data
    X = df.copy()

    # Use the df_util prepare_features method to
    # - drop null columns & rows
    # - convert categorical columns into dummy indicator columns
    # X is our cleaned data, nans is a mask of the null value locations
    X, nans, columns = df_util.prepare_features(X, self.feature_variables)

    # Do the actual clustering
    y_hat = self.estimator.fit_predict(X.values)

    # Attach a silhouette coefficient score for each row
    silhouettes = silhouette_samples(X, y_hat)

    # Combine the two arrays, and transpose them.
    y_hat = np.vstack([y_hat, silhouettes]).T

    # Assign default output names
    default_name = 'cluster'

    # Get the value from the as-clause if present
    output_name = options.get('output_name', default_name)

    # There are two columns - one for the labels, one for the silhouette scores
    output_names = [output_name, 'silhouette_score']

    # Use the predictions & nans-mask to create a new dataframe
    output_df = df_util.create_output_dataframe(y_hat, nans, output_names)

    # Merge the dataframe with the original input data
    df = df_util.merge_predictions(df, output_df)
    return df
Example #8
Source File: test_silhouette.py From pyclust with GNU General Public License v2.0
def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)
    print(sil_score[0])
    print(sil.sample_scores[:10])

    print(silhouette_score(X, ypred, sample_size=None))
    print(silhouette_samples(X, ypred)[:10])
Example #9
Source File: test_metrics.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_silhouette_samples(self):
    result = self.df.metrics.silhouette_samples()
    expected = metrics.silhouette_samples(self.data, self.pred)

    self.assertTrue(isinstance(result, pdml.ModelSeries))
    tm.assert_index_equal(result.index, self.df.index)
    self.assert_numpy_array_almost_equal(result.values, expected)
Example #10
Source File: confidence.py From nussl with MIT License
def silhouette_confidence(audio_signal, features, num_sources, threshold=95,
                          max_points=1000, **kwargs):
    """
    Uses the silhouette score to compute the clusterability of the feature
    space.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (a) and the mean nearest-cluster distance (b) for each sample.
    The Silhouette Coefficient for a sample is (b - a) / max(a, b). To
    clarify, b is the distance between a sample and the nearest cluster
    that the sample is not a part of. Note that the Silhouette Coefficient
    is only defined if the number of labels satisfies
    2 <= n_labels <= n_samples - 1.

    References:

        Seetharaman, Prem. Bootstrapping the Learning Process for Computer
        Audition. Diss. Northwestern University, 2019.

        Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        Interpretation and Validation of Cluster Analysis". Computational
        and Applied Mathematics 20: 53-65.

    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to
            compute the mask over which to compute the confidence measure.
            This can be None, if and only if ``representation`` is passed as
            a keyword argument to this function.
        features (np.ndarray): Numpy array containing the features to be
            clustered. Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the
            threshold are excluded from being used in the confidence measure.
            Defaults to 95.
        max_points (int, optional): Maximum number of points to compute the
            Silhouette score for. Silhouette score is a costly operation.
            Defaults to 1000.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely,
            ``representation`` can go here as a keyword argument.

    Returns:
        float: Confidence given by Silhouette score.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    if features.shape[0] > max_points:
        idx = np.random.choice(
            np.arange(features.shape[0]), max_points, replace=False)
        features = features[idx]

    kmeans = KMeans(num_sources)
    labels = kmeans.fit_predict(features)
    confidence = silhouette_samples(features, labels)
    return confidence.mean()
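The formula in the docstring above can be checked by hand: for a single sample, (b - a) / max(a, b) computed from raw distances should match sklearn's output. The tiny one-dimensional dataset below is invented for illustration:

import numpy as np
from sklearn.metrics import silhouette_samples

# Two invented clusters in one dimension
X = np.array([[0.0], [0.2], [3.0], [3.3]])
labels = np.array([0, 0, 1, 1])

i = 0  # check sample 0
same = [j for j in range(len(X)) if labels[j] == labels[i] and j != i]
other = [j for j in range(len(X)) if labels[j] != labels[i]]

a = np.mean([abs(X[j, 0] - X[i, 0]) for j in same])   # mean intra-cluster distance
b = np.mean([abs(X[j, 0] - X[i, 0]) for j in other])  # mean nearest-cluster distance
print((b - a) / max(a, b))               # 0.9365...
print(silhouette_samples(X, labels)[i])  # should agree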
Example #11
Source File: _silhouette.py From epiScanpy with BSD 3-Clause "New" or "Revised" License
def silhouette(adata_name, cluster_annot, value='X_pca', metric='euclidean',
               key_added=None, copy=False):
    """
    Compute silhouette scores.

    It computes the general silhouette score as well as a silhouette score
    for every cell, according to the cluster each cell is assigned to.

    Parameters
    ----------
    adata_name: AnnData object
    cluster_annot: observational variable corresponding to a cell clustering
    value: embedding used to compute the silhouette scores
        ('X_pca', 'X_tsne' or 'X_umap')
    metric: distance metric to use (default: 'euclidean')
    key_added: key under which to save the computed silhouette scores
    copy: if True, return a modified copy of the AnnData object

    Returns
    -------
    general silhouette score in 'uns' of the AnnData object
    individual silhouette scores in 'obs' of the AnnData object

    Credit to the scikit-learn example:
    https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

    Still some work to do to finish the function.
    """
    if copy:
        adata_name = adata_name.copy()

    X = adata_name.obsm[value]
    cluster_labels = adata_name.obs[cluster_annot]
    n_clusters = len(set(adata_name.obs[cluster_annot]))

    # Also store the per-cell values as adata.obs['silhouette_samples']
    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric=metric)

    if key_added:
        adata_name.obs[key_added] = sample_silhouette_values
        adata_name.uns[key_added] = silhouette_avg
    else:
        adata_name.obs['silhouette_samples'] = sample_silhouette_values
        adata_name.uns['silhouette_samples_avg'] = silhouette_avg

    if copy:
        return adata_name
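A minimal usage sketch for the function above, assuming an AnnData object with an embedding in .obsm['X_pca'] and a clustering in .obs['louvain'] (the data and both key names are invented for illustration):

import numpy as np
import anndata as ad

rng = np.random.default_rng(0)
adata = ad.AnnData(rng.normal(size=(100, 50)).astype(np.float32))
adata.obsm['X_pca'] = rng.normal(size=(100, 10))
adata.obs['louvain'] = np.repeat(['0', '1'], 50)

silhouette(adata, 'louvain', value='X_pca', key_added='sil')
print(adata.uns['sil'])      # overall average
print(adata.obs['sil'][:5])  # per-cell scores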
Example #12
Source File: evaluate.py From BERMUDA with MIT License
def evaluate_scores(div_ent_code, sil_code, cell_labels, dataset_labels,
                    num_datasets, div_ent_dim, sil_dim, sil_dist):
    """ Calculate the three proposed evaluation metrics
    Args:
        div_ent_code: num_cells * num_features, embedding for divergence and
            entropy calculation, usually with dim of 2
        sil_code: num_cells * num_features, embedding for silhouette score
            calculation
        cell_labels: cell-type label of each cell
        dataset_labels: dataset label of each cell
        num_datasets: number of datasets
        div_ent_dim: if dimension of div_ent_code > div_ent_dim, apply PCA first
        sil_dim: if dimension of sil_code > sil_dim, apply PCA first
        sil_dist: distance metric for silhouette score calculation
    Returns:
        div_score: divergence score
        ent_score: entropy score
        sil_score: silhouette score
    """
    # calculate divergence and entropy
    if div_ent_code.shape[1] > div_ent_dim:
        div_ent_code = PCA(n_components=div_ent_dim).fit_transform(div_ent_code)
    div_pq = []  # divergence between datasets p and q
    div_qp = []  # divergence between datasets q and p
    ent = []     # entropy
    # pairs of datasets
    for d1 in range(1, num_datasets + 1):
        for d2 in range(d1 + 1, num_datasets + 1):
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2
            labels = np.intersect1d(np.unique(cell_labels[idx1]),
                                    np.unique(cell_labels[idx2]))
            idx1_mutual = np.logical_and(idx1, np.isin(cell_labels, labels))
            idx2_mutual = np.logical_and(idx2, np.isin(cell_labels, labels))
            idx_specific = np.logical_and(np.logical_or(idx1, idx2),
                                          np.logical_not(np.isin(cell_labels, labels)))
            # divergence
            if np.sum(idx1_mutual) >= cal_min and np.sum(idx2_mutual) >= cal_min:
                div_pq.append(max(estimate(div_ent_code[idx1_mutual, :],
                                           div_ent_code[idx2_mutual, :], cal_min), 0))
                div_qp.append(max(estimate(div_ent_code[idx2_mutual, :],
                                           div_ent_code[idx1_mutual, :], cal_min), 0))
            # entropy
            if sum(idx_specific) > 0:
                ent_tmp = cal_entropy(div_ent_code, idx_specific, dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))
    if len(ent) == 0:  # if there are no dataset-specific cell types, store entropy as -1
        ent.append(-1)

    # calculate silhouette score
    if sil_code.shape[1] > sil_dim:
        sil_code = PCA(n_components=sil_dim).fit_transform(sil_code)
    sil_scores = silhouette_samples(sil_code, cell_labels, metric=sil_dist)

    # average the scores
    div_score = (sum(div_pq) / len(div_pq) + sum(div_qp) / len(div_qp)) / 2
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)

    return div_score, ent_score, sil_score