Python scipy.cluster Examples
The following are code examples of the scipy.cluster module. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the scipy module.
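Before the per-project examples, here is a minimal, self-contained sketch of the two submodules used most often below: scipy.cluster.vq for k-means and scipy.cluster.hierarchy for agglomerative clustering. It is not taken from any of the listed projects; the random data array and all parameter values (4 centroids, average linkage, a distance cutoff of 1.0) are illustrative placeholders.

import numpy as np
import scipy.cluster.vq as vq
import scipy.cluster.hierarchy as hierarchy

# Illustrative data: 100 observations with 3 features each.
data = np.random.rand(100, 3)

# k-means (scipy.cluster.vq): scale features to unit variance, find 4 centroids,
# then assign every observation to its nearest centroid.
whitened = vq.whiten(data)
codebook, distortion = vq.kmeans(whitened, 4)
labels, dists = vq.vq(whitened, codebook)

# Agglomerative clustering (scipy.cluster.hierarchy): build a linkage matrix and
# cut it into flat clusters at a distance threshold.
Z = hierarchy.linkage(whitened, method="average", metric="euclidean")
flat_clusters = hierarchy.fcluster(Z, t=1.0, criterion="distance")

Most of the examples below follow one of these two patterns, combined with project-specific pre- and post-processing.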
Example #1
Source File: signal_recompose.py From NeuroKit with MIT License
def _signal_recompose_sum(components, clusters):
    # Reorient components
    components = components.T

    # Reconstruct Time Series from correlated components
    clusters = [np.where(clusters == cluster)[0] for cluster in np.unique(clusters)]
    if len(clusters) == 0:
        raise ValueError(
            "Not enough clusters of components detected. Please decrease the `threshold`."
        )

    # Initialize components matrix
    recomposed = np.zeros((len(components), len(clusters)))
    for i, indices in enumerate(clusters):
        recomposed[:, i] = components[:, indices].sum(axis=1)
    return recomposed.T


# =============================================================================
# Clustering Methods
# =============================================================================

# Weighted Correlation
# -----------------------------------------------------------------------------
Example #2
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory+'/dendrogram.pdf', figsize=(10, 8), dpi=300)

    # which datapoints goes to which cluster
    # The indices of the datapoints will be displayed as the ids
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters": Y, "informations": dn}
    return dataframe

######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature ####################################################
Example #3
Source File: diarizationFunctions.py From pyBK with MIT License
def performClusteringLinkage(segmentBKTable, segmentCVTable, N_init, linkageCriterion, linkageMetric):
    from scipy.cluster.hierarchy import linkage
    from scipy import cluster
    if linkageMetric == 'jaccard':
        observations = segmentBKTable
    elif linkageMetric == 'cosine':
        observations = segmentCVTable
    else:
        observations = segmentCVTable
    clusteringTable = np.zeros([np.size(segmentCVTable, 0), N_init])
    Z = linkage(observations, method=linkageCriterion, metric=linkageMetric)
    for i in np.arange(N_init):
        clusteringTable[:, i] = cluster.hierarchy.cut_tree(Z, N_init - i).T + 1
    k = N_init
    print('done')
    return clusteringTable, k
Example #4
Source File: utils.py From epiScanpy with BSD 3-Clause "New" or "Revised" License
def hierarch_cluster(M):
    """Cluster matrix using hierarchical clustering.

    Parameters
    ----------
    M : np.ndarray
        Matrix, for example, distance matrix.

    Returns
    -------
    Mclus : np.ndarray
        Clustered matrix.
    indices : np.ndarray
        Indices used to cluster the matrix.
    """
    import scipy as sp
    import scipy.cluster
    link = sp.cluster.hierarchy.linkage(M)
    indices = sp.cluster.hierarchy.leaves_list(link)
    Mclus = np.array(M[:, indices])
    Mclus = Mclus[indices, :]
    if False:
        pl.matshow(Mclus)
        pl.colorbar()
    return Mclus, indices
Example #5
Source File: utils.py From epiScanpy with BSD 3-Clause "New" or "Revised" License
def compute_group_overlap_score(ref_labels, pred_labels,
                                threshold_overlap_pred=0.5,
                                threshold_overlap_ref=0.5):
    """How well do the pred_labels explain the ref_labels?

    A predicted cluster explains a reference cluster if it is contained within
    the reference cluster with at least 50% (threshold_overlap_pred) of its
    points and these correspond to at least 50% (threshold_overlap_ref) of the
    reference cluster.
    """
    ref_unique, ref_counts = np.unique(ref_labels, return_counts=True)
    ref_dict = dict(zip(ref_unique, ref_counts))
    pred_unique, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_dict = dict(zip(pred_unique, pred_counts))
    summary = []
    for true in ref_unique:
        sub_pred_unique, sub_pred_counts = np.unique(pred_labels[true == ref_labels], return_counts=True)
        relative_overlaps_pred = [sub_pred_counts[i] / pred_dict[n] for i, n in enumerate(sub_pred_unique)]
        relative_overlaps_ref = [sub_pred_counts[i] / ref_dict[true] for i, n in enumerate(sub_pred_unique)]
        pred_best_index = np.argmax(relative_overlaps_pred)
        summary.append(1 if (relative_overlaps_pred[pred_best_index] >= threshold_overlap_pred
                             and relative_overlaps_ref[pred_best_index] >= threshold_overlap_ref)
                       else 0)
        # print(true, sub_pred_unique[pred_best_index], relative_overlaps_pred[pred_best_index],
        #       relative_overlaps_ref[pred_best_index], summary[-1])
    return sum(summary)/len(summary)
Example #6
Source File: dbscan.py From link-prediction_with_deep-learning with MIT License
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #7
Source File: cmag.py From neuropythy with GNU Affero General Public License v3.0
def sigma_bin_walls(sigma, bins):
    import scipy, scipy.cluster, scipy.cluster.vq as vq
    std = np.std(sigma)
    if np.isclose(std, 0):
        return pimms.imm_array([0, np.max(sigma)])
    cl = sorted(std * vq.kmeans(sigma/std, bins)[0])
    cl = np.mean([cl[:-1], cl[1:]], axis=0)
    return pimms.imm_array(np.concatenate(([0], cl, [np.max(sigma)])))
Example #8
Source File: leveler.py From Maybe-Useful-Cogs with MIT License
async def _auto_color(self, url:str, ranks):
    phrases = ["Calculating colors..."]  # in case I want more
    #try:
    await self.bot.say("**{}**".format(random.choice(phrases)))

    clusters = 10

    async with aiohttp.get(url) as r:
        image = await r.content.read()
    with open('data/leveler/temp_auto.png', 'wb') as f:
        f.write(image)

    im = Image.open('data/leveler/temp_auto.png').convert('RGBA')
    im = im.resize((290, 290))  # resized to reduce time
    ar = scipy.misc.fromimage(im)
    shape = ar.shape
    ar = ar.reshape(scipy.product(shape[:2]), shape[2])

    codes, dist = scipy.cluster.vq.kmeans(ar.astype(float), clusters)
    vecs, dist = scipy.cluster.vq.vq(ar, codes)        # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))   # count occurrences

    # sort counts
    freq_index = []
    index = 0
    for count in counts:
        freq_index.append((index, count))
        index += 1
    sorted_list = sorted(freq_index, key=operator.itemgetter(1), reverse=True)

    colors = []
    for rank in ranks:
        color_index = min(rank, len(codes))
        peak = codes[sorted_list[color_index][0]]  # gets the original index
        peak = peak.astype(int)
        colors.append(''.join(format(c, '02x') for c in peak))
    return colors  # returns array
    #except:
        #await self.bot.say("```Error or no scipy. Install scipy doing 'pip3 install numpy' and 'pip3 install scipy' or read here: https://github.com/AznStevy/Maybe-Useful-Cogs/blob/master/README.md```")

# converts hex to rgb
Example #9
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""

    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'

    if out is None:
        out = sys.stdout
    for word, cid in izip(words, cluster_ids):
        print >> out, '%s\t%d' % (word, cid)
Example #10
Source File: data_viewing.py From lumin with Apache License 2.0
def plot_rank_order_dendrogram(df:pd.DataFrame, threshold:float=0.8, savename:Optional[str]=None,
                               settings:PlotSettings=PlotSettings()) -> Dict[str,Union[List[str],float]]:
    r'''
    Plots a dendrogram of features in df clustered via Spearman's rank correlation coefficient.
    Also returns sets of features with correlation coefficients greater than the threshold.

    Arguments:
        df: Pandas DataFrame containing data
        threshold: Threshold on correlation coefficient
        savename: Optional name of file to which to save the plot of feature importances
        settings: :class:`~lumin.plotting.plot_settings.PlotSettings` class to control figure appearance

    Returns:
        Dict of sets of features with correlation coefficients greater than the threshold and cluster distance
    '''

    corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
    corr_condensed = hc.distance.squareform(1-np.abs(corr))  # Abs because negative of a feature is a trivial transformation: information unaffected
    z = hc.linkage(corr_condensed, method='average', optimal_ordering=True)

    with sns.axes_style('white'), sns.color_palette(settings.cat_palette):
        plt.figure(figsize=(settings.w_large, (0.5*len(df.columns))))
        hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=settings.lbl_sz, color_threshold=1-threshold)
        plt.xlabel("Distance (1 - |Spearman's Rank Correlation Coefficient|)", fontsize=settings.lbl_sz, color=settings.lbl_col)
        plt.xticks(fontsize=settings.tk_sz, color=settings.tk_col)
        if savename is not None: plt.savefig(settings.savepath/f'{savename}{settings.format}', bbox_inches='tight')
        plt.show()

    feats = df.columns
    sets = {}
    for i, merge in enumerate(z):
        if merge[2] > 1-threshold: continue
        if merge[0] <= len(z):
            a = [feats[int(merge[0])]]
        else:
            a = sets.pop(int(merge[0]))['children']
        if merge[1] <= len(z):
            b = [feats[int(merge[1])]]
        else:
            b = sets.pop(int(merge[1]))['children']
        sets[1 + i + len(z)] = {'children': [*a, *b], 'distance': merge[2]}
    return sets
Example #11
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def kmeans(vectors, k, jobs=1):
    vectors = numpy.array(vectors)
    if with_sklearn:
        if jobs == 1:
            kmeans = sklearn.cluster.KMeans(k)
        else:
            kmeans = sklearn.cluster.KMeans(k, n_jobs=jobs)  # sklearn > 0.10
        kmeans.fit(vectors)
        return kmeans.labels_
    else:
        codebook, distortion = scipy.cluster.vq.kmeans(vectors, k)
        cluster_ids, dist = scipy.cluster.vq.vq(vectors, codebook)
        return cluster_ids
Example #12
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_
Example #13
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')
    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS
    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #14
Source File: signal_recompose.py From NeuroKit with MIT License
def _signal_recompose_wcorr(components, threshold=0.5, metric="chebyshev"):
    """"""
    # Calculate the w-correlation matrix.
    wcorr = _signal_recompose_get_wcorr(components, show=False)

    # Find clusters in correlation matrix
    pairwise_distances = scipy.cluster.hierarchy.distance.pdist(wcorr, metric=metric)
    linkage = scipy.cluster.hierarchy.linkage(pairwise_distances, method="complete")
    threshold = threshold * pairwise_distances.max()
    clusters = scipy.cluster.hierarchy.fcluster(linkage, threshold, "distance")

    return clusters
Example #15
Source File: bounding.py From dynesty with MIT License
def _get_covariance_from_clusters(self, points):
    """Compute covariance from re-centered clusters."""

    # Compute pairwise distances.
    distances = spatial.distance.pdist(points, metric='mahalanobis',
                                       VI=self.am)

    # Identify conglomerates of points by constructing a linkage matrix.
    linkages = cluster.hierarchy.single(distances)

    # Cut when linkage between clusters exceed the radius.
    clusteridxs = cluster.hierarchy.fcluster(linkages, 1.0,
                                             criterion='distance')
    nclusters = np.max(clusteridxs)
    if nclusters == 1:
        return self._get_covariance_from_all_points(points)
    else:
        i = 0
        overlapped_points = np.empty_like(points)
        for idx in np.unique(clusteridxs):
            group_points = points[clusteridxs == idx, :]
            group_mean = group_points.mean(axis=0).reshape((1, -1))
            j = i + len(group_points)
            overlapped_points[i:j, :] = group_points - group_mean
            i = j
        return self._get_covariance_from_all_points(overlapped_points)


##################
# HELPER FUNCTIONS
##################
Example #16
Source File: clust_color.py From pyPESTO with BSD 3-Clause "New" or "Revised" License
def assign_colors_for_result_list(num_results, colors=None):
    """
    Creates a list of colors for a list of pypesto.Result objects or checks
    a user-provided list of colors and uses this if everything is ok

    Parameters
    ----------

    num_results: int
        number of results in list

    colors: list, or RGBA, optional
        list of colors, or single color

    Returns
    -------

    colors: list of RGBA
        One for each element in 'vals'.
    """

    # if the user did not specify any colors:
    if colors is None:
        # default colors will be used, one for each entry in the result list.
        # Colors are created from assign_colors, which needs a dummy list
        dummy_clusters = np.array(list(range(num_results)) * 2)

        # we don't want alpha levels for all plotting routines in this case...
        colors = assign_colors(dummy_clusters, balance_alpha=False,
                               highlight_global=False)

        # dummy cluster had twice as many entries as really there. Reduce.
        real_indices = list(range(int(colors.shape[0] / 2)))
        return colors[real_indices]

    # if the user-specified color list does not match the number of results
    if len(colors) != num_results:
        raise ValueError(
            'Incorrect color input. Colors must be specified either as '
            'list of [r, g, b, alpha] with length equal to function '
            'values Number of function (here: ' + str(num_results) + '), '
            'or as one single [r, g, b, alpha] color.')

    return colors
Example #17
Source File: clust_color.py From pyPESTO with BSD 3-Clause "New" or "Revised" License
def assign_clusters(vals):
    """
    Find clustering.

    Parameters
    ----------

    vals: numeric list or array
        List to be clustered.

    Returns
    -------

    clust: numeric list
        Indicating the corresponding cluster of each element from 'vals'.

    clustsize: numeric list
        Size of clusters, length equals number of clusters.
    """

    # sanity checks
    if vals is None or len(vals) == 0:
        return [], []
    elif len(vals) == 1:
        return np.array([0]), np.array([1.])

    # linkage requires (n, 1) data array
    vals = np.reshape(vals, (-1, 1))

    # however: clusters are sorted by size, not by value... Resort.
    # Create preallocated object first
    cluster_indices = np.zeros(vals.size, dtype=int)

    # get clustering based on distance
    clust = cluster.hierarchy.fcluster(
        cluster.hierarchy.linkage(vals),
        t=0.1, criterion='distance')

    # get unique clusters
    _, ind_clust = np.unique(clust, return_index=True)
    unique_clust = clust[np.sort(ind_clust)]
    cluster_size = np.zeros(unique_clust.size, dtype=int)

    # loop over clusters: resort and count number of entries
    for index, i_clust in enumerate(unique_clust):
        cluster_indices[np.where(clust == i_clust)] = index
        cluster_size[index] = sum(clust == i_clust)

    return cluster_indices, cluster_size
Example #18
Source File: bounding.py From dynesty with MIT License
def bounding_ellipsoids(points, pointvol=0., vol_dec=0.5, vol_check=2.):
    """
    Calculate a set of ellipsoids that bound the collection of points.

    Parameters
    ----------
    points : `~numpy.ndarray` with shape (npoints, ndim)
        A set of coordinates.

    pointvol : float, optional
        Volume represented by a single point. When provided, used to set a
        minimum bound on the ellipsoid volume as `npoints * pointvol`.
        Default is `0.`.

    vol_dec : float, optional
        The required fractional reduction in volume after splitting an
        ellipsoid in order to accept the split. Default is `0.5`.

    vol_check : float, optional
        The factor used when checking whether the volume of the original
        bounding ellipsoid is large enough to warrant more trial splits via
        `ell.vol > vol_check * npoints * pointvol`. Default is `2.0`.

    Returns
    -------
    mell : :class:`MultiEllipsoid` object
        The :class:`MultiEllipsoid` object used to bound the collection of
        points.
    """

    if not HAVE_KMEANS:
        raise ValueError("scipy.cluster.vq.kmeans2 is required to compute "
                         "ellipsoid decompositions.")  # pragma: no cover

    # Calculate the bounding ellipsoid for the points possibly
    # enlarged to a minimum volume.
    ell = bounding_ellipsoid(points, pointvol=pointvol)

    # Recursively split the bounding ellipsoid until the volume of each
    # split no longer decreases by a factor of `vol_dec`.
    ells = _bounding_ellipsoids(points, ell, pointvol=pointvol,
                                vol_dec=vol_dec, vol_check=vol_check)

    return MultiEllipsoid(ells=ells)
Example #19
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def get_normalization_cutoff(data, manual_cutoff=9600):
    col_sums = np.array(np.sum(data, axis=0))

    # continue the loop if the difference between the means is larger than the 2*2*STD of the larger cluster
    while True:
        try:
            # doing Kmean clustering
            col_sums_for_cluster = col_sums.reshape(-1, 1)

            # separate distributions using kmeans
            #kmeans = KMeans(n_clusters=2, random_state=0).fit(col_sums_for_cluster)
            #labels = kmeans.labels_

            # separate distributions using mixture model
            clf = mixture.GaussianMixture(n_components=2, covariance_type='full')
            clf.fit(col_sums_for_cluster)
            labels = clf.predict(col_sums_for_cluster)

            unique, counts = np.unique(labels, return_counts=True)
            if len(unique) == 1:
                break

            bigger_cluster = unique[np.argmax(counts)]
            smaller_cluster = unique[np.argmin(counts)]

            # estimating the magnitude of discrepancy between the clusters
            bigger_cluster__dist = col_sums[labels == bigger_cluster]
            smaller_cluster__dist = col_sums[labels == smaller_cluster]

            bigger_cluster__dist_mean = np.mean(bigger_cluster__dist)
            bigger_cluster__dist_std = np.std(bigger_cluster__dist)
            smaller_cluster__dist_mean = np.mean(smaller_cluster__dist)
            #smaller_cluster__dist_std = np.std(smaller_cluster__dist)

            #print("bigger_cluster__dist_mean ", bigger_cluster__dist_mean)
            #print("bigger_cluster__dist_std ", bigger_cluster__dist_std)
            #print("smaller_cluster__dist_mean ", smaller_cluster__dist_mean)
            #print("smaller_cluster__dist_std ", smaller_cluster__dist_std)

            # continue the loop if the difference between the means is larger than the 2*STD of the larger cluster
            if abs(bigger_cluster__dist_mean - smaller_cluster__dist_mean) < 2*2*bigger_cluster__dist_std:
                break

            # col_sums will be equal to bigger_cluster__dist for the next iteration
            col_sums = bigger_cluster__dist
        except:
            break

    mean = np.mean(col_sums)
    std = np.std(col_sums)
    cutoff = (mean + 2*(std)).astype(int)

    if cutoff < manual_cutoff:
        cutoff = manual_cutoff

    return cutoff
Example #20
Source File: diarizationFunctions.py From pyBK with MIT License
def performClustering(speechMapping, segmentTable, segmentBKTable, segmentCVTable, Vg,
                      bitsPerSegmentFactor, kbmSize, N_init, initialClustering, clusteringMetric):
    numberOfSegments = np.size(segmentTable, 0)
    clusteringTable = np.zeros([numberOfSegments, N_init])
    finalClusteringTable = np.zeros([numberOfSegments, N_init])
    activeClusters = np.ones([N_init, 1])
    clustersBKTable = np.zeros([N_init, kbmSize])
    clustersCVTable = np.zeros([N_init, kbmSize])
    clustersBKTable, clustersCVTable = calcClusters(clustersCVTable, clustersBKTable, activeClusters,
                                                    initialClustering, N_init, segmentTable, kbmSize,
                                                    speechMapping, Vg, bitsPerSegmentFactor)
    ####### Here the clustering algorithm begins. Steps are:
    ####### 1. Reassign all data among all existing signatures and retrain them
    #######    using the new clustering
    ####### 2. Save the resulting clustering solution
    ####### 3. Compare all signatures with each other and merge those two with
    #######    highest similarity, creating a new signature for the resulting
    #######    cluster
    ####### 4. Back to 1 if #clusters > 1
    for k in range(N_init):
        ####### 1. Data reassignment. Calculate the similarity between the current segment with all clusters
        #######    and assign it to the one which maximizes the similarity. Finally re-calculate binaryKeys
        #######    for all clusters
        # before doing anything, check if there are remaining clusters
        # if there is only one active cluster, break
        if np.sum(activeClusters) == 1:
            break
        clustersStillActive = np.zeros([1, N_init])
        segmentToClustersSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric, segmentBKTable,
                                                                      segmentCVTable, clustersBKTable, clustersCVTable)
        # clusteringTable[:,k] = finalClusteringTable[:,k] = np.argmax(segmentToClustersSimilarityMatrix,axis=1)+1
        clusteringTable[:, k] = finalClusteringTable[:, k] = np.nanargmax(segmentToClustersSimilarityMatrix, axis=1) + 1
        # clustersStillActive[:,np.unique(clusteringTable[:,k]).astype(int)-1] = 1
        clustersStillActive[:, np.unique(clusteringTable[:, k]).astype(int) - 1] = 1
        ####### update all binaryKeys for all new clusters
        activeClusters = clustersStillActive
        clustersBKTable, clustersCVTable = calcClusters(clustersCVTable, clustersBKTable, activeClusters.T,
                                                        clusteringTable[:, k].astype(int), N_init, segmentTable,
                                                        kbmSize, speechMapping, Vg, bitsPerSegmentFactor)
        ####### 2. Compare all signatures with each other and merge those two with highest similarity,
        #######    creating a new signature for the resulting cluster
        clusterSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric, clustersBKTable, clustersCVTable,
                                                            clustersBKTable, clustersCVTable)
        np.fill_diagonal(clusterSimilarityMatrix, np.nan)
        value = np.nanmax(clusterSimilarityMatrix)
        location = np.nanargmax(clusterSimilarityMatrix)
        R, C = np.unravel_index(location, (N_init, N_init))
        ### Then we merge clusters R and C
        #print('Merging clusters',R+1,'and',C+1,'with a similarity score of',np.around(value,decimals=4))
        print('Merging clusters', '%3s' % str(R+1), 'and', '%3s' % str(C+1),
              'with a similarity score of', np.around(value, decimals=4))
        activeClusters[0, C] = 0
        ### 3. Save the resulting clustering and go back to 1 if the number of clusters > 1
        mergingClusteringIndices = np.where(clusteringTable[:, k] == C+1)
        # update clustering table
        clusteringTable[mergingClusteringIndices[0], k] = R+1
        # remove binarykey for removed cluster
        clustersBKTable[C, :] = np.zeros([1, kbmSize])
        clustersCVTable[C, :] = np.nan
        # prepare the vector with the indices of the features of the new cluster and then binarize
        segmentsToBinarize = np.where(clusteringTable[:, k] == R+1)[0]
        M = []
        for l in np.arange(np.size(segmentsToBinarize, 0)):
            M = np.append(M, np.arange(int(segmentTable[segmentsToBinarize][:][l, 1]),
                                       int(segmentTable[segmentsToBinarize][:][l, 2]) + 1))
        clustersBKTable[R, :], clustersCVTable[R, :] = binarizeFeatures(
            kbmSize,
            Vg[np.array(speechMapping[np.array(M, dtype='int')], dtype='int') - 1].T,
            bitsPerSegmentFactor)
    print('done')
    return clusteringTable, k
Example #21
Source File: utils.py From opensurfaces with MIT License
def get_dominant_image_colors(image, num_clusters=4):
    """
    Returns the dominant image color that isn't pure white or black.  Uses
    kmeans on the colors.  Returns the result as RGB hex strings in the
    format ['#rrggbb', '#rrggbb', ...].

    :param image: PIL image or path
    """

    if isinstance(image, basestring):
        image = Image.open(image)

    # downsample for speed
    im = image.resize((512, 512), Image.ANTIALIAS)

    # reshape
    ar0 = scipy.misc.fromimage(im)
    shape = ar0.shape
    npixels = scipy.product(shape[:2])
    ar0 = ar0.reshape(npixels, shape[2])

    # keep only nontransparent elements
    ar = ar0[ar0[:, 3] == 255][:, 0:3]

    try:
        # kmeans clustering
        codes, dist = scipy.cluster.vq.kmeans(ar, num_clusters)
    except:
        # kmeans sometimes fails -- if that is the case, use the mean color and
        # nothing else.
        arf = ar.astype(float)
        clamp = lambda p: max(0, min(255, int(p)))
        return ['#' + ''.join(['%0.2x' % clamp(arf[:, i].sum() / float(arf.shape[1])) for i in (0, 1, 2)])]

    vecs, dist = scipy.cluster.vq.vq(ar, codes)        # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))   # count occurrences

    # sort by count frequency
    indices = [i[0] for i in
               sorted(enumerate(counts), key=lambda x: x[1], reverse=True)]

    # convert to hex strings
    colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]

    results = []
    for idx in indices:
        color = colors[idx]
        if color != 'ffffff' and color != '000000':
            results.append('#' + color)

    return results