Python sklearn.metrics.silhouette_samples() Examples

The following are 12 code examples of sklearn.metrics.silhouette_samples(), drawn from open-source projects. The original project and source file for each example are listed above it. You may also want to check out all other available functions and classes of the sklearn.metrics module.
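Before the project examples, here is a minimal, self-contained sketch of the basic call: the function takes a feature matrix and an array of cluster labels, and returns one silhouette coefficient per sample, whose mean equals silhouette_score.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Three well-separated blobs, clustered with k-means
X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

per_sample = silhouette_samples(X, labels)  # shape (300,), values in [-1, 1]
print(per_sample.mean())                    # equals silhouette_score(X, labels)
print(silhouette_score(X, labels))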
Example #1
Source File: param_sensitivity.py    From scanorama with MIT License
# Context inferred from usage in scanorama's param_sensitivity.py:
#   import numpy as np
#   import matplotlib.pyplot as plt
#   from scipy.stats import ttest_ind
#   from scanorama import assemble
#   from sklearn.metrics import silhouette_samples as sil
def test_knn(datasets_dimred, genes, labels, idx, distr, xlabels):
    knns = [ 5, 10, 50, 100 ]
    len_distr = len(distr)
    for knn in knns:
        integrated = assemble(datasets_dimred[:], knn=knn, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(knn))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('knn')) 
Example #2
Source File: param_sensitivity.py    From scanorama with MIT License
def test_sigma(datasets_dimred, genes, labels, idx, distr, xlabels):
    sigmas = [ 10, 50, 100, 200 ]
    len_distr = len(distr)
    for sigma in sigmas:
        integrated = assemble(datasets_dimred[:], sigma=sigma)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(sigma))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('sigma')) 
Example #3
Source File: param_sensitivity.py    From scanorama with MIT License
def test_alpha(datasets_dimred, genes, labels, idx, distr, xlabels):
    alphas = [ 0, 0.05, 0.20, 0.50 ]
    len_distr = len(distr)
    for alpha in alphas:
        integrated = assemble(datasets_dimred[:], alpha=alpha, sigma=150)
        X = np.concatenate(integrated)
        distr.append(sil(X[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(alpha))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('alpha')) 
Example #4
Source File: param_sensitivity.py    From scanorama with MIT License
def test_approx(datasets_dimred, genes, labels, idx, distr, xlabels):
    integrated = assemble(datasets_dimred[:], approx=False, sigma=150)
    X = np.concatenate(integrated)
    distr.append(sil(X[idx, :], labels[idx]))
    len_distr = len(distr)
    for d in distr[:len_distr]:
        print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
    xlabels.append('Exact NN')
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('approx')) 
Example #5
Source File: param_sensitivity.py    From scanorama with MIT License
def test_perplexity(datasets_dimred, genes, labels, idx,
                    distr, xlabels):
    X = np.concatenate(datasets_dimred)

    perplexities = [ 10, 100, 500, 2000 ]
    len_distr = len(distr)
    for perplexity in perplexities:
        embedding = fit_tsne(X, perplexity=perplexity)
        distr.append(sil(embedding[idx, :], labels[idx]))
        for d in distr[:len_distr]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(d)))
        xlabels.append(str(perplexity))
    print('')
    
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('perplexity')) 
Example #6
Source File: silhouette.py    From SecuML with GNU General Public License v2.0
def gen_eval(self, output_dir, assigned_clusters, quick=False):
    if quick:
        self.silhouette_avg = 0
        return
    if self.distances is not None:
        self.silhouette_values = silhouette_samples(self.distances,
                                                    assigned_clusters,
                                                    metric='precomputed')
    else:
        features = self.instances.features.get_values()
        self.silhouette_values = silhouette_samples(features,
                                                    assigned_clusters)
    self.silhouette_avg = np.mean(self.silhouette_values)
    self.display_silhouette(output_dir, assigned_clusters)

    # Code from a scikit-learn example:
    # Selecting the number of clusters with silhouette analysis on KMeans
    # clustering 
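The comment above refers to scikit-learn's "Selecting the number of clusters with silhouette analysis on KMeans clustering" example. A condensed sketch of that idea, minus the plotting and on synthetic data, might look like this:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples

X, _ = make_blobs(n_samples=500, centers=4, random_state=1)

for n_clusters in (2, 3, 4, 5, 6):
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=1).fit_predict(X)
    values = silhouette_samples(X, labels)
    # The overall mean is the usual model-selection criterion; per-cluster
    # means expose individual clusters that are poorly separated.
    per_cluster = [values[labels == k].mean() for k in range(n_clusters)]
    print(n_clusters, round(values.mean(), 3), np.round(per_cluster, 3))

Note that, as Example #6 above shows, silhouette_samples can also work from a precomputed distance matrix via metric='precomputed'.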
Example #7
Source File: AgglomerativeClustering.py    From mltk-algo-contrib with Apache License 2.0
def fit(self, df, options):
    """Do the clustering & merge labels with original data."""
    # Make a copy of the input data
    X = df.copy()

    # Use the df_util prepare_features method to
    # - drop null columns & rows
    # - convert categorical columns into dummy indicator columns
    # X is our cleaned data, nans is a mask of the null value locations
    X, nans, columns = df_util.prepare_features(X, self.feature_variables)

    # Do the actual clustering
    y_hat = self.estimator.fit_predict(X.values)

    # Attach the silhouette coefficient score for each row
    silhouettes = silhouette_samples(X, y_hat)

    # Combine the two arrays, and transpose them.
    y_hat = np.vstack([y_hat, silhouettes]).T

    # Assign default output names
    default_name = 'cluster'

    # Get the value from the as-clause if present
    output_name = options.get('output_name', default_name)

    # There are two columns - one for the labels, one for the silhouette scores
    output_names = [output_name, 'silhouette_score']

    # Use the predictions & nans-mask to create a new dataframe
    output_df = df_util.create_output_dataframe(y_hat, nans, output_names)

    # Merge the dataframe with the original input data
    df = df_util.merge_predictions(df, output_df)
    return df
Example #8
Source File: test_silhouette.py    From pyclust with GNU General Public License v2.0
# X and ypred are module-level fixtures defined earlier in pyclust's
# test_silhouette.py; from the usage, the script evidently imports pyclust
# and, from sklearn.metrics, silhouette_score and silhouette_samples.
def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)

    print(sil_score[0])

    print(sil.sample_scores[:10])

    print(silhouette_score(X, ypred, sample_size=None))
    
    print(silhouette_samples(X, ypred)[:10]) 
Example #9
Source File: test_metrics.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_silhouette_samples(self):
    result = self.df.metrics.silhouette_samples()
    expected = metrics.silhouette_samples(self.data, self.pred)

    self.assertTrue(isinstance(result, pdml.ModelSeries))
    tm.assert_index_equal(result.index, self.df.index)
    self.assert_numpy_array_almost_equal(result.values, expected)
Example #10
Source File: confidence.py    From nussl with MIT License
def silhouette_confidence(audio_signal, features, num_sources, threshold=95, 
                          max_points=1000, **kwargs):
    """
    Uses the silhouette score to compute the clusterability of the feature space.

    The Silhouette Coefficient is calculated using the 
    mean intra-cluster distance (a) and the mean nearest-cluster distance (b) 
    for each sample. The Silhouette Coefficient for a sample is (b - a) / max(a, b). 
    To clarify, b is the distance between a sample and the nearest cluster 
    that the sample is not a part of. Note that the Silhouette Coefficient is
    only defined when the number of labels satisfies 2 <= n_labels <= n_samples - 1.

    References:

    Seetharaman, Prem. Bootstrapping the Learning Process for Computer Audition. 
    Diss. Northwestern University, 2019.

    Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the 
    Interpretation and Validation of Cluster Analysis”. Computational and 
    Applied Mathematics 20: 53-65.
    
    Args:
        audio_signal (AudioSignal): AudioSignal object which will be used to compute
          the mask over which to compute the confidence measure. This can be None, if
          and only if ``representation`` is passed as a keyword argument to this 
          function.
        features (np.ndarray): Numpy array containing the features to be clustered. 
          Should have the same dimensions as the representation.
        num_sources (int): Number of sources to cluster the features into.
        threshold (int, optional): Threshold by loudness. Points below the threshold are
          excluded from being used in the confidence measure. Defaults to 95.
        max_points (int, optional): Maximum number of points to compute the Silhouette
          score for. Silhouette score is a costly operation. Defaults to 1000.
        kwargs: Keyword arguments to `_get_loud_bins_mask`. Namely, representation can
          go here as a keyword argument.
    
    Returns:
        float: Confidence given by Silhouette score.
    """
    mask, _ = _get_loud_bins_mask(threshold, audio_signal, **kwargs)
    embedding_size = features.shape[-1]
    features = features[mask].reshape(-1, embedding_size)

    if features.shape[0] > max_points:
        idx = np.random.choice(
            np.arange(features.shape[0]), max_points,
            replace=False)
        features = features[idx]
    
    kmeans = KMeans(num_sources)

    labels = kmeans.fit_predict(features)
    confidence = silhouette_samples(features, labels)

    return confidence.mean() 
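The (b - a) / max(a, b) definition quoted in the docstring above can be checked directly against the library's output; a small sketch verifying it for one sample of a toy dataset:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances, silhouette_samples

X, labels = make_blobs(n_samples=60, centers=2, random_state=0)
D = pairwise_distances(X)  # euclidean, the default metric of silhouette_samples

i = 0                                # verify the first sample
same = labels == labels[i]
same[i] = False                      # a excludes the sample itself
a = D[i, same].mean()                # mean intra-cluster distance
b = min(D[i, labels == k].mean()     # mean distance to the nearest other cluster
        for k in np.unique(labels) if k != labels[i])

assert np.isclose((b - a) / max(a, b), silhouette_samples(X, labels)[i])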
Example #11
Source File: _silhouette.py    From epiScanpy with BSD 3-Clause "New" or "Revised" License
def silhouette(adata_name, cluster_annot, value='X_pca', metric='euclidean',
               key_added=None, copy=False):
    """

    Compute silhouette scores.

    It computes the general silhouette score as well as a silhouette score for every cell according 
    to the cell cluster assigned to it. 

    Parameters
    ----------
    adata_name: AnnData object

    cluster_annot: observational variable corresponding to a cell clustering

    value: measure used to build the silhouette plot (X_pca, X_tsne, X_umap)

    metric: 'euclidean'

    key_added: key to save the computed silhouette scores

    Return
    ------

    general silhouette score in 'uns' of the AnnData object
    individual silhouette scores in 'obs' of the AnnData object



    Credit to sklearn script : 
    https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
    return score and silhouette plot. Still some work to do to finish the function.
    size=None but you can put 'large' if you want a bigger default figure size
    """
    
    if copy:
        adata_name = adata_name.copy()

    X = adata_name.obsm[value]
    cluster_labels = adata_name.obs[cluster_annot]
    n_clusters = len(set(adata_name.obs[cluster_annot]))

    # per-cell values are stored in adata.obs, the average in adata.uns below
    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric=metric)
    
    if key_added:
        adata_name.obs[key_added] = sample_silhouette_values
        adata_name.uns[key_added] = silhouette_avg
    else:
        adata_name.obs['silhouette_samples'] = sample_silhouette_values
        adata_name.uns['silhouette_samples_avg'] = silhouette_avg

    if copy:
        return adata_name
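A hypothetical call to the function above, assuming an AnnData object adata that already carries a PCA embedding in obsm['X_pca'] and a clustering in obs['louvain'] (both names are placeholders):

# hypothetical usage; adata, 'louvain' and 'X_pca' are placeholders
silhouette(adata, cluster_annot='louvain', value='X_pca')
print(adata.uns['silhouette_samples_avg'])     # overall average score
print(adata.obs['silhouette_samples'].head())  # per-cell scores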
Example #12
Source File: evaluate.py    From BERMUDA with MIT License
def evaluate_scores(div_ent_code, sil_code, cell_labels, dataset_labels, num_datasets, div_ent_dim, sil_dim, sil_dist):
    """ Calculate three proposed evaluation metrics
    Args:
        div_ent_code: num_cells * num_features, embedding for divergence and entropy calculation, usually with dim of 2
        sil_code: num_cells * num_features, embedding for silhouette score calculation
        cell_labels: num_cells, cell-type label of each cell
        dataset_labels: num_cells, dataset index (1-based) of each cell
        num_datasets:
        div_ent_dim: if dimension of div_ent_code > div_ent_dim, apply PCA first
        sil_dim: if dimension of sil_code > sil_dim, apply PCA first
        sil_dist: distance metric for silhouette score calculation
    Returns:
        div_score: divergence score
        ent_score: entropy score
        sil_score: silhouette score
    """
    # calculate divergence and entropy
    if div_ent_code.shape[1] > div_ent_dim:
        div_ent_code = PCA(n_components=div_ent_dim).fit_transform(div_ent_code)
    div_pq = []  # divergence dataset p, q
    div_qp = []  # divergence dataset q, p
    ent = []  # entropy
    # pairs of datasets
    for d1 in range(1, num_datasets+1):
        for d2 in range(d1+1, num_datasets+1):
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2
            labels = np.intersect1d(np.unique(cell_labels[idx1]), np.unique(cell_labels[idx2]))
            idx1_mutual = np.logical_and(idx1, np.isin(cell_labels, labels))
            idx2_mutual = np.logical_and(idx2, np.isin(cell_labels, labels))
            idx_specific = np.logical_and(np.logical_or(idx1, idx2), np.logical_not(np.isin(cell_labels, labels)))
            # divergence; cal_min is a module-level constant (minimum number
            # of cells required per group, defined elsewhere in the file)
            if np.sum(idx1_mutual) >= cal_min and np.sum(idx2_mutual) >= cal_min:
                div_pq.append(max(estimate(div_ent_code[idx1_mutual, :], div_ent_code[idx2_mutual, :], cal_min), 0))
                div_qp.append(max(estimate(div_ent_code[idx2_mutual, :], div_ent_code[idx1_mutual, :], cal_min), 0))
            # entropy
            if sum(idx_specific) > 0:
                ent_tmp = cal_entropy(div_ent_code, idx_specific, dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))
    if len(ent) == 0:  # if no dataset specific cell types, store entropy as -1
        ent.append(-1)

    # calculate silhouette_score
    if sil_code.shape[1] > sil_dim:
        sil_code = PCA(n_components=sil_dim).fit_transform(sil_code)
    sil_scores = silhouette_samples(sil_code, cell_labels, metric=sil_dist)

    # average for scores
    div_score = (sum(div_pq) / len(div_pq) + sum(div_qp) / len(div_qp)) / 2
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)

    return div_score, ent_score, sil_score
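Finally, a hypothetical invocation with placeholder inputs, using the argument names from the docstring (estimate, cal_entropy and cal_min are module-level helpers in BERMUDA's evaluate.py, not shown here):

# hypothetical call; `code` is a (num_cells, d) embedding, and both label
# arrays are 1-based integer vectors, as the loops above expect
div_score, ent_score, sil_score = evaluate_scores(
    div_ent_code=code,  # PCA-reduced inside if wider than div_ent_dim
    sil_code=code,      # PCA-reduced inside if wider than sil_dim
    cell_labels=cell_labels,
    dataset_labels=dataset_labels,
    num_datasets=2,
    div_ent_dim=2,
    sil_dim=20,
    sil_dist='cosine')  # metric handed to silhouette_samples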