Python scipy.cluster Examples
The following are code examples of the scipy.cluster module. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the scipy module.
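Before the per-project examples, here is a minimal, self-contained sketch of the two submodules used most often below: scipy.cluster.vq for k-means and scipy.cluster.hierarchy for agglomerative clustering. It is not taken from any of the listed projects; the random data array and all parameter values (4 centroids, average linkage, a distance cutoff of 1.0) are illustrative placeholders.

import numpy as np
import scipy.cluster.vq as vq
import scipy.cluster.hierarchy as hierarchy

# Illustrative data: 100 observations with 3 features each.
data = np.random.rand(100, 3)

# k-means (scipy.cluster.vq): scale features to unit variance, find 4 centroids,
# then assign every observation to its nearest centroid.
whitened = vq.whiten(data)
codebook, distortion = vq.kmeans(whitened, 4)
labels, dists = vq.vq(whitened, codebook)

# Agglomerative clustering (scipy.cluster.hierarchy): build a linkage matrix and
# cut it into flat clusters at a distance threshold.
Z = hierarchy.linkage(whitened, method="average", metric="euclidean")
flat_clusters = hierarchy.fcluster(Z, t=1.0, criterion="distance")

Most of the examples below follow one of these two patterns, combined with project-specific pre- and post-processing.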
Example #1
Source File: signal_recompose.py From NeuroKit with MIT License
def _signal_recompose_sum(components, clusters):
    # Reorient components
    components = components.T

    # Reconstruct Time Series from correlated components
    clusters = [np.where(clusters == cluster)[0] for cluster in np.unique(clusters)]
    if len(clusters) == 0:
        raise ValueError(
            "Not enough clusters of components detected. Please decrease the `threshold`."
        )

    # Initialize components matrix
    recomposed = np.zeros((len(components), len(clusters)))
    for i, indices in enumerate(clusters):
        recomposed[:, i] = components[:, indices].sum(axis=1)
    return recomposed.T


# =============================================================================
# Clustering Methods
# =============================================================================

# Weighted Correlation
# -----------------------------------------------------------------------------
Example #2
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory+'/dendrogram.pdf', figsize=(10, 8), dpi=300)

    # which datapoints goes to which cluster
    # The indices of the datapoints will be displayed as the ids
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters": Y, "informations": dn}
    return dataframe

######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature ####################################################
Example #3
Source File: diarizationFunctions.py From pyBK with MIT License
def performClusteringLinkage(segmentBKTable, segmentCVTable, N_init, linkageCriterion, linkageMetric):
    from scipy.cluster.hierarchy import linkage
    from scipy import cluster
    if linkageMetric == 'jaccard':
        observations = segmentBKTable
    elif linkageMetric == 'cosine':
        observations = segmentCVTable
    else:
        observations = segmentCVTable
    clusteringTable = np.zeros([np.size(segmentCVTable, 0), N_init])
    Z = linkage(observations, method=linkageCriterion, metric=linkageMetric)
    for i in np.arange(N_init):
        clusteringTable[:, i] = cluster.hierarchy.cut_tree(Z, N_init - i).T + 1
    k = N_init
    print('done')
    return clusteringTable, k
Example #4
Source File: utils.py From epiScanpy with BSD 3-Clause "New" or "Revised" License
def hierarch_cluster(M):
    """Cluster matrix using hierarchical clustering.

    Parameters
    ----------
    M : np.ndarray
        Matrix, for example, distance matrix.

    Returns
    -------
    Mclus : np.ndarray
        Clustered matrix.
    indices : np.ndarray
        Indices used to cluster the matrix.
    """
    import scipy as sp
    import scipy.cluster
    link = sp.cluster.hierarchy.linkage(M)
    indices = sp.cluster.hierarchy.leaves_list(link)
    Mclus = np.array(M[:, indices])
    Mclus = Mclus[indices, :]
    if False:
        pl.matshow(Mclus)
        pl.colorbar()
    return Mclus, indices
Example #5
Source File: utils.py From epiScanpy with BSD 3-Clause "New" or "Revised" License
def compute_group_overlap_score(ref_labels, pred_labels,
                                threshold_overlap_pred=0.5,
                                threshold_overlap_ref=0.5):
    """How well do the pred_labels explain the ref_labels?

    A predicted cluster explains a reference cluster if it is contained within
    the reference cluster with at least 50% (threshold_overlap_pred) of its
    points and these correspond to at least 50% (threshold_overlap_ref) of the
    reference cluster.
    """
    ref_unique, ref_counts = np.unique(ref_labels, return_counts=True)
    ref_dict = dict(zip(ref_unique, ref_counts))
    pred_unique, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_dict = dict(zip(pred_unique, pred_counts))
    summary = []
    for true in ref_unique:
        sub_pred_unique, sub_pred_counts = np.unique(pred_labels[true == ref_labels], return_counts=True)
        relative_overlaps_pred = [sub_pred_counts[i] / pred_dict[n] for i, n in enumerate(sub_pred_unique)]
        relative_overlaps_ref = [sub_pred_counts[i] / ref_dict[true] for i, n in enumerate(sub_pred_unique)]
        pred_best_index = np.argmax(relative_overlaps_pred)
        summary.append(1 if (relative_overlaps_pred[pred_best_index] >= threshold_overlap_pred
                             and relative_overlaps_ref[pred_best_index] >= threshold_overlap_ref)
                       else 0)
        # print(true, sub_pred_unique[pred_best_index], relative_overlaps_pred[pred_best_index],
        #       relative_overlaps_ref[pred_best_index], summary[-1])
    return sum(summary)/len(summary)
Example #6
Source File: dbscan.py From link-prediction_with_deep-learning with MIT License
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #7
Source File: cmag.py From neuropythy with GNU Affero General Public License v3.0
def sigma_bin_walls(sigma, bins):
    import scipy, scipy.cluster, scipy.cluster.vq as vq
    std = np.std(sigma)
    if np.isclose(std, 0):
        return pimms.imm_array([0, np.max(sigma)])
    cl = sorted(std * vq.kmeans(sigma/std, bins)[0])
    cl = np.mean([cl[:-1], cl[1:]], axis=0)
    return pimms.imm_array(np.concatenate(([0], cl, [np.max(sigma)])))
Example #8
Source File: leveler.py From Maybe-Useful-Cogs with MIT License
async def _auto_color(self, url:str, ranks):
    phrases = ["Calculating colors..."]  # in case I want more
    #try:
    await self.bot.say("**{}**".format(random.choice(phrases)))

    clusters = 10

    async with aiohttp.get(url) as r:
        image = await r.content.read()
    with open('data/leveler/temp_auto.png', 'wb') as f:
        f.write(image)

    im = Image.open('data/leveler/temp_auto.png').convert('RGBA')
    im = im.resize((290, 290))  # resized to reduce time
    ar = scipy.misc.fromimage(im)
    shape = ar.shape
    ar = ar.reshape(scipy.product(shape[:2]), shape[2])

    codes, dist = scipy.cluster.vq.kmeans(ar.astype(float), clusters)
    vecs, dist = scipy.cluster.vq.vq(ar, codes)        # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))   # count occurrences

    # sort counts
    freq_index = []
    index = 0
    for count in counts:
        freq_index.append((index, count))
        index += 1
    sorted_list = sorted(freq_index, key=operator.itemgetter(1), reverse=True)

    colors = []
    for rank in ranks:
        color_index = min(rank, len(codes))
        peak = codes[sorted_list[color_index][0]]  # gets the original index
        peak = peak.astype(int)
        colors.append(''.join(format(c, '02x') for c in peak))
    return colors  # returns array
    #except:
        #await self.bot.say("```Error or no scipy. Install scipy doing 'pip3 install numpy' and 'pip3 install scipy' or read here: https://github.com/AznStevy/Maybe-Useful-Cogs/blob/master/README.md```")

# converts hex to rgb
Example #9
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""

    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'

    if out is None:
        out = sys.stdout
    for word, cid in izip(words, cluster_ids):
        print >> out, '%s\t%d' % (word, cid)
Example #10
Source File: data_viewing.py From lumin with Apache License 2.0
def plot_rank_order_dendrogram(df:pd.DataFrame, threshold:float=0.8, savename:Optional[str]=None,
                               settings:PlotSettings=PlotSettings()) -> Dict[str,Union[List[str],float]]:
    r'''
    Plots a dendrogram of features in df clustered via Spearman's rank correlation coefficient.
    Also returns sets of features with correlation coefficients greater than the threshold.

    Arguments:
        df: Pandas DataFrame containing data
        threshold: Threshold on correlation coefficient
        savename: Optional name of file to which to save the plot of feature importances
        settings: :class:`~lumin.plotting.plot_settings.PlotSettings` class to control figure appearance

    Returns:
        Dict of sets of features with correlation coefficients greater than the threshold and cluster distance
    '''

    corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
    corr_condensed = hc.distance.squareform(1-np.abs(corr))  # Abs because negative of a feature is a trivial transformation: information unaffected
    z = hc.linkage(corr_condensed, method='average', optimal_ordering=True)

    with sns.axes_style('white'), sns.color_palette(settings.cat_palette):
        plt.figure(figsize=(settings.w_large, (0.5*len(df.columns))))
        hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=settings.lbl_sz, color_threshold=1-threshold)
        plt.xlabel("Distance (1 - |Spearman's Rank Correlation Coefficient|)", fontsize=settings.lbl_sz, color=settings.lbl_col)
        plt.xticks(fontsize=settings.tk_sz, color=settings.tk_col)
        if savename is not None: plt.savefig(settings.savepath/f'{savename}{settings.format}', bbox_inches='tight')
        plt.show()

    feats = df.columns
    sets = {}
    for i, merge in enumerate(z):
        if merge[2] > 1-threshold: continue
        if merge[0] <= len(z):
            a = [feats[int(merge[0])]]
        else:
            a = sets.pop(int(merge[0]))['children']
        if merge[1] <= len(z):
            b = [feats[int(merge[1])]]
        else:
            b = sets.pop(int(merge[1]))['children']
        sets[1 + i + len(z)] = {'children': [*a, *b], 'distance': merge[2]}
    return sets
Example #11
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def kmeans(vectors, k, jobs=1):
    vectors = numpy.array(vectors)
    if with_sklearn:
        if jobs == 1:
            kmeans = sklearn.cluster.KMeans(k)
        else:
            kmeans = sklearn.cluster.KMeans(k, n_jobs=jobs)  # sklearn > 0.10
        kmeans.fit(vectors)
        return kmeans.labels_
    else:
        codebook, distortion = scipy.cluster.vq.kmeans(vectors, k)
        cluster_ids, dist = scipy.cluster.vq.vq(vectors, codebook)
        return cluster_ids
Example #12
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_
Example #13
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')
    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS
    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #14
Source File: signal_recompose.py From NeuroKit with MIT License
def _signal_recompose_wcorr(components, threshold=0.5, metric="chebyshev"):
    """"""
    # Calculate the w-correlation matrix.
    wcorr = _signal_recompose_get_wcorr(components, show=False)

    # Find clusters in correlation matrix
    pairwise_distances = scipy.cluster.hierarchy.distance.pdist(wcorr, metric=metric)
    linkage = scipy.cluster.hierarchy.linkage(pairwise_distances, method="complete")
    threshold = threshold * pairwise_distances.max()
    clusters = scipy.cluster.hierarchy.fcluster(linkage, threshold, "distance")

    return clusters
Example #15
Source File: bounding.py From dynesty with MIT License
def _get_covariance_from_clusters(self, points):
    """Compute covariance from re-centered clusters."""

    # Compute pairwise distances.
    distances = spatial.distance.pdist(points, metric='mahalanobis',
                                       VI=self.am)

    # Identify conglomerates of points by constructing a linkage matrix.
    linkages = cluster.hierarchy.single(distances)

    # Cut when linkage between clusters exceed the radius.
    clusteridxs = cluster.hierarchy.fcluster(linkages, 1.0,
                                             criterion='distance')
    nclusters = np.max(clusteridxs)
    if nclusters == 1:
        return self._get_covariance_from_all_points(points)
    else:
        i = 0
        overlapped_points = np.empty_like(points)
        for idx in np.unique(clusteridxs):
            group_points = points[clusteridxs == idx, :]
            group_mean = group_points.mean(axis=0).reshape((1, -1))
            j = i + len(group_points)
            overlapped_points[i:j, :] = group_points - group_mean
            i = j
        return self._get_covariance_from_all_points(overlapped_points)


##################
# HELPER FUNCTIONS
##################
Example #16
Source File: clust_color.py From pyPESTO with BSD 3-Clause "New" or "Revised" License
def assign_colors_for_result_list(num_results, colors=None):
    """
    Creates a list of colors for a list of pypesto.Result objects or checks
    a user-provided list of colors and uses this if everything is ok

    Parameters
    ----------

    num_results: int
        number of results in list

    colors: list, or RGBA, optional
        list of colors, or single color

    Returns
    -------

    colors: list of RGBA
        One for each element in 'vals'.
    """

    # if the user did not specify any colors:
    if colors is None:
        # default colors will be used, one for each entry in the result list.
        # Colors are created from assign_colors, which needs a dummy list
        dummy_clusters = np.array(list(range(num_results)) * 2)

        # we don't want alpha levels for all plotting routines in this case...
        colors = assign_colors(dummy_clusters, balance_alpha=False,
                               highlight_global=False)

        # dummy cluster had twice as many entries as really there. Reduce.
        real_indices = list(range(int(colors.shape[0] / 2)))
        return colors[real_indices]

    # if the user-specified color list does not match the number of results
    if len(colors) != num_results:
        raise ValueError(
            'Incorrect color input. Colors must be specified either as '
            'list of [r, g, b, alpha] with length equal to function '
            'values Number of function (here: ' + str(num_results) + '), '
            'or as one single [r, g, b, alpha] color.')

    return colors
Example #17
Source File: clust_color.py From pyPESTO with BSD 3-Clause "New" or "Revised" License
def assign_clusters(vals):
    """
    Find clustering.

    Parameters
    ----------

    vals: numeric list or array
        List to be clustered.

    Returns
    -------

    clust: numeric list
        Indicating the corresponding cluster of each element from 'vals'.

    clustsize: numeric list
        Size of clusters, length equals number of clusters.
    """

    # sanity checks
    if vals is None or len(vals) == 0:
        return [], []
    elif len(vals) == 1:
        return np.array([0]), np.array([1.])

    # linkage requires (n, 1) data array
    vals = np.reshape(vals, (-1, 1))

    # however: clusters are sorted by size, not by value... Resort.
    # Create preallocated object first
    cluster_indices = np.zeros(vals.size, dtype=int)

    # get clustering based on distance
    clust = cluster.hierarchy.fcluster(
        cluster.hierarchy.linkage(vals),
        t=0.1, criterion='distance')

    # get unique clusters
    _, ind_clust = np.unique(clust, return_index=True)
    unique_clust = clust[np.sort(ind_clust)]
    cluster_size = np.zeros(unique_clust.size, dtype=int)

    # loop over clusters: resort and count number of entries
    for index, i_clust in enumerate(unique_clust):
        cluster_indices[np.where(clust == i_clust)] = index
        cluster_size[index] = sum(clust == i_clust)

    return cluster_indices, cluster_size
Example #18
Source File: bounding.py From dynesty with MIT License
def bounding_ellipsoids(points, pointvol=0., vol_dec=0.5, vol_check=2.):
    """
    Calculate a set of ellipsoids that bound the collection of points.

    Parameters
    ----------
    points : `~numpy.ndarray` with shape (npoints, ndim)
        A set of coordinates.

    pointvol : float, optional
        Volume represented by a single point. When provided, used to set a
        minimum bound on the ellipsoid volume as `npoints * pointvol`.
        Default is `0.`.

    vol_dec : float, optional
        The required fractional reduction in volume after splitting an
        ellipsoid in order to accept the split. Default is `0.5`.

    vol_check : float, optional
        The factor used when checking whether the volume of the original
        bounding ellipsoid is large enough to warrant more trial splits via
        `ell.vol > vol_check * npoints * pointvol`. Default is `2.0`.

    Returns
    -------
    mell : :class:`MultiEllipsoid` object
        The :class:`MultiEllipsoid` object used to bound the collection of
        points.
    """

    if not HAVE_KMEANS:
        raise ValueError("scipy.cluster.vq.kmeans2 is required to compute "
                         "ellipsoid decompositions.")  # pragma: no cover

    # Calculate the bounding ellipsoid for the points possibly
    # enlarged to a minimum volume.
    ell = bounding_ellipsoid(points, pointvol=pointvol)

    # Recursively split the bounding ellipsoid until the volume of each
    # split no longer decreases by a factor of `vol_dec`.
    ells = _bounding_ellipsoids(points, ell, pointvol=pointvol,
                                vol_dec=vol_dec, vol_check=vol_check)

    return MultiEllipsoid(ells=ells)
Example #19
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def get_normalization_cutoff(data, manual_cutoff=9600):
    col_sums = np.array(np.sum(data, axis=0))

    # continue the loop if the difference between the means is larger than the 2*2*STD of the larger cluster
    while True:
        try:
            # doing Kmean clustering
            col_sums_for_cluster = col_sums.reshape(-1, 1)

            # separate distributions using kmeans
            #kmeans = KMeans(n_clusters=2, random_state=0).fit(col_sums_for_cluster)
            #labels = kmeans.labels_

            # separate distributions using mixture model
            clf = mixture.GaussianMixture(n_components=2, covariance_type='full')
            clf.fit(col_sums_for_cluster)
            labels = clf.predict(col_sums_for_cluster)

            unique, counts = np.unique(labels, return_counts=True)
            if len(unique) == 1:
                break

            bigger_cluster = unique[np.argmax(counts)]
            smaller_cluster = unique[np.argmin(counts)]

            # estimating the magnitude of discrepancy between the clusters
            bigger_cluster__dist = col_sums[labels == bigger_cluster]
            smaller_cluster__dist = col_sums[labels == smaller_cluster]

            bigger_cluster__dist_mean = np.mean(bigger_cluster__dist)
            bigger_cluster__dist_std = np.std(bigger_cluster__dist)
            smaller_cluster__dist_mean = np.mean(smaller_cluster__dist)
            #smaller_cluster__dist_std = np.std(smaller_cluster__dist)

            #print("bigger_cluster__dist_mean ", bigger_cluster__dist_mean)
            #print("bigger_cluster__dist_std ", bigger_cluster__dist_std)
            #print("smaller_cluster__dist_mean ", smaller_cluster__dist_mean)
            #print("smaller_cluster__dist_std ", smaller_cluster__dist_std)

            # continue the loop if the difference between the means is larger than the 2*STD of the larger cluster
            if abs(bigger_cluster__dist_mean - smaller_cluster__dist_mean) < 2*2*bigger_cluster__dist_std:
                break

            # col_sums will be equal to bigger_cluster__dist for the next iteration
            col_sums = bigger_cluster__dist
        except:
            break

    mean = np.mean(col_sums)
    std = np.std(col_sums)
    cutoff = (mean + 2*(std)).astype(int)

    if cutoff < manual_cutoff:
        cutoff = manual_cutoff

    return cutoff
Example #20
Source File: diarizationFunctions.py From pyBK with MIT License
def performClustering(speechMapping, segmentTable, segmentBKTable, segmentCVTable, Vg,
                      bitsPerSegmentFactor, kbmSize, N_init, initialClustering, clusteringMetric):
    numberOfSegments = np.size(segmentTable, 0)
    clusteringTable = np.zeros([numberOfSegments, N_init])
    finalClusteringTable = np.zeros([numberOfSegments, N_init])
    activeClusters = np.ones([N_init, 1])
    clustersBKTable = np.zeros([N_init, kbmSize])
    clustersCVTable = np.zeros([N_init, kbmSize])
    clustersBKTable, clustersCVTable = calcClusters(clustersCVTable, clustersBKTable, activeClusters,
                                                    initialClustering, N_init, segmentTable, kbmSize,
                                                    speechMapping, Vg, bitsPerSegmentFactor)
    ####### Here the clustering algorithm begins. Steps are:
    ####### 1. Reassign all data among all existing signatures and retrain them
    #######    using the new clustering
    ####### 2. Save the resulting clustering solution
    ####### 3. Compare all signatures with each other and merge those two with
    #######    highest similarity, creating a new signature for the resulting
    #######    cluster
    ####### 4. Back to 1 if #clusters > 1
    for k in range(N_init):
        ####### 1. Data reassignment. Calculate the similarity between the current segment with all clusters
        #######    and assign it to the one which maximizes the similarity. Finally re-calculate binaryKeys
        #######    for all clusters
        # before doing anything, check if there are remaining clusters
        # if there is only one active cluster, break
        if np.sum(activeClusters) == 1:
            break
        clustersStillActive = np.zeros([1, N_init])
        segmentToClustersSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric, segmentBKTable,
                                                                      segmentCVTable, clustersBKTable, clustersCVTable)
        # clusteringTable[:,k] = finalClusteringTable[:,k] = np.argmax(segmentToClustersSimilarityMatrix,axis=1)+1
        clusteringTable[:, k] = finalClusteringTable[:, k] = np.nanargmax(segmentToClustersSimilarityMatrix, axis=1) + 1
        # clustersStillActive[:,np.unique(clusteringTable[:,k]).astype(int)-1] = 1
        clustersStillActive[:, np.unique(clusteringTable[:, k]).astype(int) - 1] = 1
        ####### update all binaryKeys for all new clusters
        activeClusters = clustersStillActive
        clustersBKTable, clustersCVTable = calcClusters(clustersCVTable, clustersBKTable, activeClusters.T,
                                                        clusteringTable[:, k].astype(int), N_init, segmentTable,
                                                        kbmSize, speechMapping, Vg, bitsPerSegmentFactor)
        ####### 2. Compare all signatures with each other and merge those two with highest similarity,
        #######    creating a new signature for the resulting cluster
        clusterSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric, clustersBKTable, clustersCVTable,
                                                            clustersBKTable, clustersCVTable)
        np.fill_diagonal(clusterSimilarityMatrix, np.nan)
        value = np.nanmax(clusterSimilarityMatrix)
        location = np.nanargmax(clusterSimilarityMatrix)
        R, C = np.unravel_index(location, (N_init, N_init))
        ### Then we merge clusters R and C
        #print('Merging clusters',R+1,'and',C+1,'with a similarity score of',np.around(value,decimals=4))
        print('Merging clusters', '%3s' % str(R+1), 'and', '%3s' % str(C+1),
              'with a similarity score of', np.around(value, decimals=4))
        activeClusters[0, C] = 0
        ### 3. Save the resulting clustering and go back to 1 if the number of clusters > 1
        mergingClusteringIndices = np.where(clusteringTable[:, k] == C+1)
        # update clustering table
        clusteringTable[mergingClusteringIndices[0], k] = R+1
        # remove binarykey for removed cluster
        clustersBKTable[C, :] = np.zeros([1, kbmSize])
        clustersCVTable[C, :] = np.nan
        # prepare the vector with the indices of the features of the new cluster and then binarize
        segmentsToBinarize = np.where(clusteringTable[:, k] == R+1)[0]
        M = []
        for l in np.arange(np.size(segmentsToBinarize, 0)):
            M = np.append(M, np.arange(int(segmentTable[segmentsToBinarize][:][l, 1]),
                                       int(segmentTable[segmentsToBinarize][:][l, 2]) + 1))
        clustersBKTable[R, :], clustersCVTable[R, :] = binarizeFeatures(
            kbmSize,
            Vg[np.array(speechMapping[np.array(M, dtype='int')], dtype='int') - 1].T,
            bitsPerSegmentFactor)
    print('done')
    return clusteringTable, k
Example #21
Source File: utils.py From opensurfaces with MIT License
def get_dominant_image_colors(image, num_clusters=4):
    """
    Returns the dominant image color that isn't pure white or black.  Uses
    kmeans on the colors.  Returns the result as RGB hex strings in the
    format ['#rrggbb', '#rrggbb', ...].

    :param image: PIL image or path
    """

    if isinstance(image, basestring):
        image = Image.open(image)

    # downsample for speed
    im = image.resize((512, 512), Image.ANTIALIAS)

    # reshape
    ar0 = scipy.misc.fromimage(im)
    shape = ar0.shape
    npixels = scipy.product(shape[:2])
    ar0 = ar0.reshape(npixels, shape[2])

    # keep only nontransparent elements
    ar = ar0[ar0[:, 3] == 255][:, 0:3]

    try:
        # kmeans clustering
        codes, dist = scipy.cluster.vq.kmeans(ar, num_clusters)
    except:
        # kmeans sometimes fails -- if that is the case, use the mean color and
        # nothing else.
        arf = ar.astype(float)
        clamp = lambda p: max(0, min(255, int(p)))
        return ['#' + ''.join(['%0.2x' % clamp(arf[:, i].sum() / float(arf.shape[1])) for i in (0, 1, 2)])]

    vecs, dist = scipy.cluster.vq.vq(ar, codes)        # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))   # count occurrences

    # sort by count frequency
    indices = [i[0] for i in
               sorted(enumerate(counts), key=lambda x: x[1], reverse=True)]

    # convert to hex strings
    colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]

    results = []
    for idx in indices:
        color = colors[idx]
        if color != 'ffffff' and color != '000000':
            results.append('#' + color)

    return results