Python scipy.cluster.hierarchy.fcluster() Examples
The following are 29 code examples of scipy.cluster.hierarchy.fcluster(), collected from open-source projects.
Each example notes its source file, project, and license, so you can look up the original code in context.
You may also want to check out the other functions and classes available in the scipy.cluster.hierarchy module.
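Before the project examples, here is a minimal, self-contained sketch of the typical workflow: build a linkage matrix from a condensed distance matrix, then call fcluster() to flatten the tree either at a distance threshold or into a fixed number of clusters. The data points and thresholds below are invented purely for illustration.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

# Toy data: two well-separated groups of 2-D points
X = np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
              [5.0, 5.0], [5.1, 5.2], [5.2, 4.9]])

# Condensed pairwise distances -> linkage matrix
Z = linkage(pdist(X, metric='euclidean'), method='average')

# Flatten the hierarchy at a distance threshold ...
labels_by_distance = fcluster(Z, t=1.0, criterion='distance')

# ... or ask for a fixed number of flat clusters
labels_by_count = fcluster(Z, t=2, criterion='maxclust')

print(labels_by_distance)  # e.g. [1 1 1 2 2 2]
print(labels_by_count)     # e.g. [1 1 1 2 2 2]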
Example #1
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License | 6 votes |
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory + '/dendrogram.pdf', figsize=(10, 8), dpi=300)

    # which datapoints go to which cluster
    # The indices of the datapoints will be displayed as the ids
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters": Y, "informations": dn}
    return dataframe

######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature ####################################################
Example #2
Source File: make_bed.py From mCaller with MIT License | 6 votes |
def cluster(currents, context, original_labels, chrom, pos1, plot, plotdir, cluster=False):
    colours = {'m6A': '#B4656F', 'A': '#55B196'}  #TODO update for other labels
    if len(currents) > 1 and cluster:
        pdistance = ssd.pdist(currents, metric='correlation')
        dm = ssd.squareform(pdistance)
        link = linkage(dm, method='complete', metric='correlation')
        klabels = fcluster(link, 2, 'maxclust')  #1,'inconsistent') #2,'maxclust')
        #klabels = [1 if x == 1 else 0 for x in klabels]
        #labels = ['m6A']*len(klabels)
        strategy = 'correlation'
    else:
        klabels = [1 if x == 1 else 0 for x in original_labels]
        strategy = 'classifierProb'
    if plot:
        plot_w_labels(klabels, original_labels, currents, strategy, context,
                      'chrom.' + chrom + '.pos.' + pos1, plotdir, colours)
    #for cluster in clusters:
Example #3
Source File: cluster.py From cesi with Apache License 2.0 | 6 votes |
def getClusters(self, embed):
    n, m = len(embed), self.p.embed_dims
    X = np.empty((n, m), np.float32)

    for i in range(len(embed)):
        X[i, :] = embed[i]

    dist = pdist(X, metric=self.p.metric)
    clust_res = linkage(dist, method=self.p.linkage)
    labels = fcluster(clust_res, t=self.p.thresh_val, criterion='distance') - 1
    clusters = [[] for i in range(max(labels) + 1)]
    for i in range(len(labels)):
        clusters[labels[i]].append(i)

    return clusters
Example #4
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License | 6 votes |
This entry is a verbatim duplicate of Example #1 above (the same dendrogram() function from SigProfilerExtractor's subroutines.py); see Example #1 for the code.
Example #5
Source File: common.py From plastering with MIT License | 6 votes |
def hier_clustering(d, threshold=3):
    srcids = d.keys()
    tokenizer = lambda x: x.split()
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    assert isinstance(d, dict)
    assert isinstance(list(d.values())[0], list)
    assert isinstance(list(d.values())[0][0], str)
    doc = [' '.join(d[srcid]) for srcid in srcids]
    vect = vectorizer.fit_transform(doc)
    #TODO: Make vect aligned to the required format
    z = linkage(vect.toarray(), metric='cityblock', method='complete')
    dists = list(set(z[:, 2]))
    # threshold = 3
    #threshold = (dists[2] + dists[3]) / 2
    b = hier.fcluster(z, threshold, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[str(cluster_id)].append(srcid)
    value_lengther = lambda x: len(x[1])
    return OrderedDict(
        sorted(cluster_dict.items(), key=value_lengther, reverse=True))
Example #6
Source File: zodiac.py From plastering with MIT License | 6 votes |
def create_cluster_map(self, bow, srcids):
    cluster_map = {}
    z = linkage(bow, metric='cityblock', method='complete')
    dists = list(set(z[:, 2]))
    thresh = (dists[1] + dists[2]) / 2
    self.logger.info('Threshold: {0}'.format(thresh))
    b = hier.fcluster(z, thresh, criterion='distance')
    assert bow.shape[0] == len(b)
    assert len(b) == len(srcids)
    for cid, srcid in zip(b, srcids):
        cluster_map[cid] = cluster_map.get(cid, []) + [srcid]
    self.logger.info('# of clusters: {0}'.format(len(b)))
    self.logger.info('sizes of clusters: {0}'.format(sorted(map(len, cluster_map.values()))))
    return cluster_map
Example #7
Source File: acquisition_scheme.py From dmipy with MIT License | 5 votes |
def calculate_shell_bvalues_and_indices(bvalues, max_distance=20e6):
    """ Calculates which measurements belong to different acquisition shells.
    It uses scipy's linkage clustering algorithm, which uses the max_distance
    input as a limit of including measurements in the same cluster.

    For example, if bvalues were [1, 2, 3, 4, 5] and max_distance was 1, then
    all bvalues would belong to the same cluster.
    However, if bvalues were [1, 2, 4, 5] and max_distance was 1, then this
    would result in 2 clusters.

    Parameters
    ----------
    bvalues: 1D numpy array of shape (Ndata)
        bvalues of the acquisition in s/m^2.
    max_distance: float
        maximum b-value distance for a measurement to be included in the
        same shell.

    Returns
    -------
    shell_indices: 1D numpy array of shape (Ndata)
        array of integers, starting from 0, representing to which shell a
        measurement belongs. The number itself has no meaning other than just
        being different for different shells.
    shell_bvalues: 1D numpy array of shape (Nshells)
        array of the mean bvalues for every acquisition shell.
    """
    linkage_matrix = linkage(np.c_[bvalues])
    clusters = fcluster(linkage_matrix, max_distance, criterion='distance')
    shell_indices = np.empty_like(bvalues, dtype=int)
    cluster_bvalues = np.zeros((np.max(clusters), 2))
    for ind in np.unique(clusters):
        cluster_bvalues[ind - 1] = np.mean(bvalues[clusters == ind]), ind
    shell_bvalues, ordered_cluster_indices = (
        cluster_bvalues[cluster_bvalues[:, 0].argsort()].T)
    for i, ind in enumerate(ordered_cluster_indices):
        shell_indices[clusters == ind] = i
    return shell_indices, shell_bvalues
Example #8
Source File: helper.py From Ensemble-Bayesian-Optimization with MIT License | 5 votes |
def mean_z(z_all, dim_limit):
    # use correlation clustering to average group assignments
    lz = hi.linkage(z_all.T, 'single', 'hamming')
    # not sure why cluster id starts from 1
    z = hi.fcluster(lz, 0) - 1
    all_cat = np.unique(z)
    for a in all_cat:
        a_size = np.sum(a == z)
        if a_size > dim_limit:
            z[a == z] = sample_multinomial([1.] * a_size, a_size, dim_limit)
    return z
Example #9
Source File: heatmap.py From SqueezeMeta with GNU General Public License v3.0 | 5 votes |
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):
    d = dist.pdist(matrix)
    linkage = cluster.linkage(dist.squareform(d), method='average', metric='cityblock')
    dendrogram = cluster.dendrogram(linkage, orientation=orientation, link_color_func=lambda k: 'k')
    index = cluster.fcluster(linkage, clusteringThreshold * max(linkage[:, 2]), 'distance')
    axis.set_xticks([])
    axis.set_yticks([])

    return index, dendrogram['leaves']
Example #10
Source File: regions.py From TOBIAS with MIT License | 5 votes |
def cluster(self, threshold=0.5, method="average"):
    """ Main function to cluster the overlap dictionary into clusters """

    self.overlap_to_distance()

    if len(self.names) > 1:
        self.linkage_mat = linkage(squareform(self.distance_mat), method)
        self.labels = fcluster(self.linkage_mat, threshold, criterion="distance")  #ordering of the dendrogram

        #Find clusters below threshold
        self.linkage_clusters = dict(zip(range(self.n), [[num] for num in range(self.n)]))
        for i, row in enumerate(self.linkage_mat):
            ID1 = int(row[0])
            ID2 = int(row[1])
            new = self.n + i
            dist = row[2]

            if dist <= threshold:
                self.linkage_clusters[new] = self.linkage_clusters[ID1] + self.linkage_clusters[ID2] + [new]
                del self.linkage_clusters[ID1]
                del self.linkage_clusters[ID2]

        #Add member-names to clusters
        for cluster in self.linkage_clusters:
            self.clusters[cluster] = {"member_idx": [idx for idx in self.linkage_clusters[cluster] if idx < self.n]}
            self.clusters[cluster]["member_names"] = [self.names[idx] for idx in self.clusters[cluster]["member_idx"]]

    else:  #only one TF
        self.linkage_clusters = {0: [0]}
        self.linkage_mat = np.array([[0]])
        self.clusters[0] = {"member_idx": [0]}
        self.clusters[0]["member_names"] = [self.names[idx] for idx in self.clusters[0]["member_idx"]]

    self.get_cluster_names()  #Set names of clusters
    self.assign_colors()
Example #11
Source File: motifs.py From TOBIAS with MIT License | 5 votes |
def cluster(self, threshold=0.5, metric="pcc", clust_method="average"):
    """
    Returns:
    ----------
    dict
        A dictionary with keys=cluster names and values=MotifList objects
    """

    #Needs gimmemotifs
    from gimmemotifs.motif import Motif
    from gimmemotifs.comparison import MotifComparer
    sns.set_style("ticks")  #set style back to ticks, as this is set globally during gimmemotifs import

    #Fill in self.gimme_obj variable
    motif_list = [motif.get_gimmemotif().gimme_obj for motif in self]  #list of gimmemotif objects

    #Similarities between all motifs
    mc = MotifComparer()
    score_dict = mc.get_all_scores(motif_list, motif_list, match="total", metric=metric, combine="mean")  #metric can be: seqcor, pcc, ed, distance, wic, chisq, akl or ssd
    self.similarity_matrix = generate_similarity_matrix(score_dict)

    # Clustering
    vector = ssd.squareform(self.similarity_matrix.to_numpy())
    self.linkage_mat = linkage(vector, method=clust_method)

    # Flatten clusters
    fclust_labels = fcluster(self.linkage_mat, threshold, criterion="distance")  #cluster membership per motif
    formatted_labels = ["Cluster_{0}".format(label) for label in fclust_labels]

    # Extract motifs belonging to each cluster
    cluster_dict = {label: MotifList() for label in formatted_labels}  #initialize dictionary
    for i, cluster_label in enumerate(formatted_labels):
        cluster_dict[cluster_label].append(self[i])

    return cluster_dict
Example #12
Source File: clustering.py From clust with GNU Lesser General Public License v3.0 | 5 votes |
def chc(X, K, params=()):
    pnames = ['linkage_method', 'distance']
    dflts = ['ward', 'euclidean']
    if isinstance(params, np.ndarray):
        paramsloc = params.tolist()
    else:
        paramsloc = params
    (linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)

    Z = sphc.linkage(X, method=linkage_method, metric=distance)
    C = sphc.fcluster(Z, K, criterion='maxclust')
    return clustVec2partMat(C, K)

# Other related functions
Example #13
Source File: construction.py From FinanceHub with MIT License | 5 votes |
def __init__(self, data, method='single', metric='euclidean'):
    """
    Combines the assets in `data` using HRP
    returns an object with the following attributes:

        - 'cov': covariance matrix of the returns
        - 'corr': correlation matrix of the returns
        - 'sort_ix': list of sorted column names according to cluster
        - 'link': linkage matrix of size (N-1)x4 with structure Y=[{y_m,1  y_m,2  y_m,3  y_m,4}_m=1,N-1].
          At the i-th iteration, clusters with indices link[i, 0] and link[i, 1] are combined to form
          cluster n + i. A cluster with an index less than n corresponds to one of the original observations.
          The distance between clusters link[i, 0] and link[i, 1] is given by link[i, 2]. The fourth value
          link[i, 3] represents the number of original observations in the newly formed cluster.
        - 'weights': final weights for each asset

    :param data: pandas DataFrame where each column is a series of returns
    :param method: any method available in scipy.cluster.hierarchy.linkage
    :param metric: any metric available in scipy.cluster.hierarchy.linkage
    """

    assert isinstance(data, pd.DataFrame), "input 'data' must be a pandas DataFrame"

    self.cov = data.cov()
    self.corr = data.corr()
    self.method = method
    self.metric = metric

    self.link = self._tree_clustering(self.corr, self.method, self.metric)
    self.sort_ix = self._get_quasi_diag(self.link)
    self.sort_ix = self.corr.index[self.sort_ix].tolist()  # recover labels
    self.sorted_corr = self.corr.loc[self.sort_ix, self.sort_ix]  # reorder correlation matrix
    self.weights = self._get_recursive_bisection(self.cov, self.sort_ix)
    # TODO self.cluster_nember = sch.fcluster(self.link, t=5, criterion='maxclust')
Example #14
Source File: heatmap.py From CompareM with GNU General Public License v3.0 | 5 votes |
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):
    d = dist.pdist(matrix)
    linkage = cluster.linkage(dist.squareform(d), method='average', metric='cityblock')
    dendrogram = cluster.dendrogram(linkage, orientation=orientation, link_color_func=lambda k: 'k')
    index = cluster.fcluster(linkage, clusteringThreshold * max(linkage[:, 2]), 'distance')
    axis.set_xticks([])
    axis.set_yticks([])

    return index, dendrogram['leaves']
Example #15
Source File: normalizer.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def _normalize_col(self, df, ci):
    """
    Normalizing column in given dataframe

    :param df: input dataframe
    :param ci: column name

    :return: normalized dataframe with respect to given column
    """
    col_name = ci.col_name
    col = df.select(col_name).collect()
    col = [row[col_name].encode('utf-8', 'replace')
           if row[col_name] is not None else '' for row in col]
    distinct = list(set(col))

    if len(distinct) > self.max_distinct or len(distinct) <= 1:
        return df

    similarity = self._compute_distances(distinct, ci.distance_fcn)
    z = linkage(similarity)
    labels = fcluster(z, ci.threshold, 'distance')

    # sets up map from value to most common value in that cluster
    clusters = self._get_exemplars(col, labels, distinct)
    new_col = [clusters[val][0] for val in col]
    df = df.na.replace(col, new_col, col_name)
    return df
Example #16
Source File: listing_6_4_find_metric_groups.py From fight-churn with MIT License | 5 votes |
def find_correlation_clusters(corr, corr_thresh):
    dissimilarity = 1.0 - corr
    hierarchy = linkage(squareform(dissimilarity), method='single')
    diss_thresh = 1.0 - corr_thresh
    labels = fcluster(hierarchy, diss_thresh, criterion='distance')
    return labels
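A hedged usage sketch for the helper above (not from the original project): the column names, data, and threshold are invented, and the correlation matrix is assumed to be exactly symmetric with a unit diagonal so that squareform() accepts 1 - corr as a distance matrix.

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

def find_correlation_clusters(corr, corr_thresh):
    # same logic as Example #16 above
    dissimilarity = 1.0 - corr
    hierarchy = linkage(squareform(dissimilarity), method='single')
    diss_thresh = 1.0 - corr_thresh
    return fcluster(hierarchy, diss_thresh, criterion='distance')

# Invented metrics: two strongly correlated columns plus an independent one
rng = np.random.default_rng(0)
a = rng.normal(size=200)
df = pd.DataFrame({'logins': a,
                   'sessions': a + rng.normal(scale=0.1, size=200),
                   'tickets': rng.normal(size=200)})

labels = find_correlation_clusters(df.corr().values, corr_thresh=0.5)
print(dict(zip(df.columns, labels)))  # e.g. {'logins': 1, 'sessions': 1, 'tickets': 2}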
Example #17
Source File: handcrafting.py From pysystemtrade with GNU General Public License v3.0 | 5 votes |
def _cluster_breakdown(self):
    """
    Creates clusters from the portfolio (doesn't create sub portfolios, but tells you which ones to make)

    Credit to this notebook:
    https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb

    :return: list of int same length as instruments
    """
    X = self.corr_matrix.values
    d = sch.distance.pdist(X)
    L = sch.linkage(d, method='complete')
    ind = sch.fcluster(L, MAX_CLUSTER_SIZE, criterion='maxclust')

    return list(ind)
Example #18
Source File: SVIM_clustering.py From svim with GNU General Public License v3.0 | 5 votes |
def clusters_from_partitions(partitions, options):
    """Finds clusters in partitions using span-position distance and hierarchical clustering.
    Assumes that all signatures in the given partition are of the same type and on the same contig"""
    clusters_final = []
    large_partitions = 0
    # Find clusters in each partition individually.
    for partition in partitions:
        if len(partition) == 1:
            clusters_final.append([partition[0]])
            continue
        elif len(partition) > 100:
            partition_sample = sample(partition, 100)
            large_partitions += 1
        else:
            partition_sample = partition
        element_type = partition_sample[0].type
        if element_type == "DEL" or element_type == "INV" or element_type == "DUP_TAN":
            data = np.array([[signature.get_source()[1],
                              signature.get_source()[2],
                              options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method="average", metric=span_position_distance)
        elif element_type == "INS":
            data = np.array([[signature.get_source()[1],
                              signature.get_source()[2],
                              options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method="average", metric=span_position_distance_insertions)
        elif element_type == "DUP_INT":
            data = np.array([[signature.get_source()[1],
                              signature.get_source()[2],
                              signature.get_destination()[1],
                              options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method="average", metric=span_position_distance_intdups)
        cluster_indices = list(fcluster(Z, options.cluster_max_distance, criterion='distance'))
        new_clusters = [[] for i in range(max(cluster_indices))]
        for signature_index, cluster_index in enumerate(cluster_indices):
            new_clusters[cluster_index - 1].append(partition_sample[signature_index])
        clusters_final.extend(new_clusters)
    if len(partitions) > 0:
        if len(partitions[0]) > 0:
            logging.debug("%d out of %d partitions for %s exceeded 100 elements." %
                          (large_partitions, len(partitions), partitions[0][0].type))
    return clusters_final
Example #19
Source File: SemanticAnalysis.py From CAN_Reverse_Engineering with GNU General Public License v3.0 | 5 votes |
def signal_clustering(corr_matrix: DataFrame, threshold: float, cluster_pickle: str = "",
                      linkage_pickle: str = "", force: bool = False):
    if force:
        if path.isfile(cluster_pickle):
            remove(cluster_pickle)
        if path.isfile(linkage_pickle):
            remove(linkage_pickle)

    if path.isfile(cluster_pickle) and path.isfile(linkage_pickle):
        print("\nSignal clustering already completed and forcing is turned off. Using pickled data...")
        return [load(open(cluster_pickle, "rb")), load(open(linkage_pickle, "rb"))]

    # Remove negative values from the correlation matrix and invert the values
    corr_matrix.where(corr_matrix > 0, 0, inplace=True)
    corr_matrix = 1 - corr_matrix
    X = corr_matrix.values  # type: ndarray
    Y = clip(ssd.squareform(X), 0, None)
    # Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
    Z = linkage(Y, method='single', optimal_ordering=True)
    fclus = fcluster(Z, t=threshold, criterion='distance')
    cluster_dict = {}
    for i, cluster_label in enumerate(fclus):
        if cluster_label in cluster_dict:
            cluster_dict[cluster_label].append(corr_matrix.index[i])
        else:
            cluster_dict[cluster_label] = [corr_matrix.index[i]]
    return cluster_dict, Z
Example #20
Source File: atlas3.py From ssbio with MIT License | 5 votes |
def remove_correlated_feats(df):
    tmp = df.T
    # Remove columns with no variation
    nunique = tmp.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    tmp.drop(cols_to_drop, axis=1, inplace=True)

    perc_spearman = scipy.stats.spearmanr(tmp)
    abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                           np.absolute(perc_spearman.correlation))
    np.fill_diagonal(abs_corr, 0)
    abs_corr_clean = np.maximum(abs_corr, abs_corr.transpose())  # some floating point mismatches, just make symmetric

    clustering = linkage(squareform(abs_corr_clean), method='average')
    clusters = fcluster(clustering, .1, criterion='distance')
    names = tmp.columns.tolist()
    names_to_cluster = list(zip(names, clusters))
    indices_to_keep = []

    ### Extract models closest to cluster centroids
    for x in range(1, len(set(clusters)) + 1):
        # Create mask from the list of assignments for extracting submatrix of the cluster
        mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)
        # Take the index of the column with the smallest sum of distances from the submatrix
        idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))
        # Extract names of cluster elements from names_to_cluster
        sublist = [name for (name, cluster) in names_to_cluster if cluster == x]
        # Element closest to centroid
        centroid = sublist[idx]
        indices_to_keep.append(centroid)

    return df.loc[df.index.isin(indices_to_keep)]
Example #21
Source File: hierarchy.py From malss with MIT License | 5 votes |
def fit_predict(self, X, y=None):
    self.model = linkage(X, method=self.method, metric=self.metric)
    return fcluster(self.model, t=self.n_clusters, criterion='maxclust') - 1
Example #22
Source File: LogClustering.py From loglizer with MIT License | 5 votes |
def _offline_clustering(self, X):
    print('Starting offline clustering...')
    p_dist = pdist(X, metric=self._distance_metric)
    Z = linkage(p_dist, 'complete')
    cluster_index = fcluster(Z, self.max_dist, criterion='distance')
    self._extract_representatives(X, cluster_index)
    print('Processed {} instances.'.format(X.shape[0]))
    print('Found {} clusters offline.\n'.format(len(self.representatives)))
    # print('The representative vectors are:')
    # pprint.pprint(self.representatives.tolist())
Example #23
Source File: common.py From plastering with MIT License | 5 votes |
def get_word_clusters(sentence_dict):
    srcids = list(sentence_dict.keys())
    sentences = []
    for srcid in srcids:
        sentence = []
        for metadata_type, sent in sentence_dict[srcid].items():
            sentence.append(''.join(sent))
        sentence = '\n'.join(sentence)
        sentence = ' '.join(re.findall('[a-z]+', sentence))
        sentences.append(sentence)
    vect = TfidfVectorizer()
    #vect = CountVectorizer()
    bow = vect.fit_transform(sentences).toarray()
    try:
        z = linkage(bow, metric='cityblock', method='complete')
    except:
        pdb.set_trace()
    dists = list(set(z[:, 2]))
    thresh = (dists[2] + dists[3]) / 2
    #thresh = (dists[1] + dists[2]) / 2
    print("Threshold: ", thresh)
    b = hier.fcluster(z, thresh, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[cluster_id].append(srcid)
    return dict(cluster_dict)
Example #24
Source File: cluster.py From catch with MIT License | 5 votes |
def cluster_from_dist_matrix(dist_matrix, threshold):
    """Use scipy to cluster a distance matrix.

    Args:
        dist_matrix: distance matrix, represented in scipy's 1d condensed form
        threshold: maximum inter-cluster distance to merge clusters (higher
            results in fewer clusters)

    Returns:
        list c such that c[i] is a collection of all the observations
        (whose pairwise distances are indexed in dist) in the i'th cluster,
        in sorted order by descending cluster size
    """
    linkage = hierarchy.linkage(dist_matrix, method='average')
    clusters = hierarchy.fcluster(linkage, threshold, criterion='distance')

    # clusters are numbered starting at 1, but base the count on
    # first_clust_num just in case this changes
    first_clust_num = min(clusters)
    num_clusters = max(clusters) + 1 - first_clust_num
    elements_in_cluster = defaultdict(list)
    for i, clust_num in enumerate(clusters):
        elements_in_cluster[clust_num].append(i)
    cluster_sizes = {c: len(elements_in_cluster[c])
                     for c in range(first_clust_num, num_clusters + first_clust_num)}
    elements_in_cluster_sorted = []
    for clust_num, _ in sorted(cluster_sizes.items(),
                               key=operator.itemgetter(1), reverse=True):
        elements_in_cluster_sorted += [elements_in_cluster[clust_num]]
    return elements_in_cluster_sorted
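A brief usage sketch (not part of the catch project): it assumes cluster_from_dist_matrix() from the example above is already defined or imported, and the coordinates are invented. The main point is that the function expects the condensed 1-D distance form returned by pdist, not a square matrix.

import numpy as np
from scipy.spatial.distance import pdist

# assumes cluster_from_dist_matrix() from Example #24 is in scope
points = np.array([[0.0, 0.0], [0.1, 0.0], [0.2, 0.1],   # a group of three observations
                   [4.0, 4.0], [4.1, 4.1]])              # a group of two observations
condensed = pdist(points, metric='euclidean')             # scipy's 1-D condensed form

clusters = cluster_from_dist_matrix(condensed, threshold=1.0)
print(clusters)  # e.g. [[0, 1, 2], [3, 4]] -- largest cluster first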
Example #25
Source File: nominal.py From dython with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cluster_correlations(corr_mat, indices=None):
    """
    Apply agglomerative clustering in order to sort a correlation matrix.
    Based on https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb

    Parameters:
    -----------
    - corr_mat : a square correlation matrix (pandas DataFrame)
    - indices : cluster labels [None]; if not provided we'll do an
      agglomerative clustering to get cluster labels.

    Returns:
    --------
    - corr : a sorted correlation matrix
    - indices : cluster indexes based on the original dataset

    Example:
    --------
    >> assoc = associations(customers, plot=False)
    >> correlations = assoc['corr']
    >> correlations, _ = cluster_correlations(correlations)
    """
    if indices is None:
        X = corr_mat.values
        d = sch.distance.pdist(X)
        L = sch.linkage(d, method='complete')
        indices = sch.fcluster(L, 0.5 * d.max(), 'distance')
    columns = [corr_mat.columns.tolist()[i] for i in list(np.argsort(indices))]
    corr_mat = corr_mat.reindex(columns=columns).reindex(index=columns)
    return corr_mat, indices
Example #26
Source File: env_corr.py From glosim with MIT License | 4 votes |
def clusterdistmat(distmatrixfile, sim, dcut, mode='average', plot=False):
    # Compute the clustering on dist^2 so that the average
    # distance of a cluster with an other is the RMS distance
    sim2 = sim * sim
    Z = sc.linkage(sim2, mode)
    cdist = Z[:, 2]
    # get the full tree
    # dendo = sc.dendrogram(Z)
    # clist = dendo['leaves']
    nclust = cluster.estimate_ncluster(cdist, dcut)
    clist = sc.fcluster(Z, nclust, criterion='maxclust')
    c_count = Counter(clist)
    nbclst = len(c_count)
    print("Number of clusters", nbclst)
    rep_ind = getrep_ind(sim2, clist, c_count)

    # Write the group indices and representatives
    filename = basename(distmatrixfile) + '-cluster.index'
    f = open(filename, "w")
    f.write(" # groupid  representative \n ")
    for i in range(len(sim)):
        iselect = 0
        if i in rep_ind:
            iselect = 2
        f.write("%d  %d \n " % (clist[i] - 1, iselect))
    f.close()

    if plot:
        filename = basename(distmatrixfile) + '-dendogram.eps'
        plotdendro(Z, nclust, filename, rep_ind)

    c_list = np.zeros(len(sim))
    # Change cluster groups numbering to (0:n-1)
    for i in range(len(sim)):
        c_list[i] = int(clist[i] - 1)
    return c_list, Z

# Determine the representative element of each cluster group
Example #27
Source File: cluster_eac.py From combo with BSD 2-Clause "Simplified" License | 4 votes |
def fit(self, X):
    """Fit estimators.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.
    """

    # Validate inputs X
    X = check_array(X)
    n_samples = X.shape[0]

    # initialize similarity matrix
    sim_mat_all = np.zeros([n_samples, n_samples])

    if self.pre_fitted:
        print("Training Skipped")
    else:
        for clf in self.base_estimators:
            clf.fit(X)
            clf.fitted_ = True

    for i, estimator in enumerate(self.base_estimators):
        check_is_fitted(estimator, ['labels_'])

        # get the labels from each base estimator
        labels = estimator.labels_.reshape(n_samples, 1)

        # generate the similarity matrix for the current estimator
        sim_mat = _generate_similarity_mat(labels)

        # add to the main similarity mat
        sim_mat_all = sim_mat_all + sim_mat

    # get the average of the similarity mat
    sim_mat_avg = np.divide(sim_mat_all, self.n_base_estimators_)

    # flip the similarity. smaller value implies more similarity
    sim_mat_avg = np.abs(np.max(sim_mat_avg) - sim_mat_avg)

    # build clusters
    self.Z_ = linkage(sim_mat_avg, method=self.linkage_method)
    self.labels_ = fcluster(self.Z_, self.n_clusters, criterion='maxclust')

    # it may lead to a different number of clusters than specified by the user
    if len(np.unique(self.labels_)) != self.n_clusters:
        warnings.warn(
            'EAC generates {n} clusters instead of {n_clusters}'.format(
                n=len(np.unique(self.labels_)),
                n_clusters=self.n_clusters))

    return self
Example #28
Source File: atlas3.py From ssbio with MIT License | 4 votes |
def clean_data(self, keep_features=None, remove_correlated_feats=True):
    self.features_df = self.features_df.astype(float).fillna(0)
    self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

    if keep_features:
        self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

    if remove_correlated_feats:
        tmp = self.features_df.T

        # Remove columns with no variation
        nunique = tmp.apply(pd.Series.nunique)
        cols_to_drop = nunique[nunique == 1].index
        tmp.drop(cols_to_drop, axis=1, inplace=True)

        perc_spearman = scipy.stats.spearmanr(tmp)
        abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                               np.absolute(perc_spearman.correlation))
        np.fill_diagonal(abs_corr, 0)
        abs_corr_clean = np.maximum(abs_corr, abs_corr.transpose())  # some floating point mismatches, just make symmetric

        clustering = linkage(squareform(abs_corr_clean), method='average')
        clusters = fcluster(clustering, .1, criterion='distance')
        names = tmp.columns.tolist()
        names_to_cluster = list(zip(names, clusters))
        indices_to_keep = []

        ### Extract models closest to cluster centroids
        for x in range(1, len(set(clusters)) + 1):
            # Create mask from the list of assignments for extracting submatrix of the cluster
            mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)
            # Take the index of the column with the smallest sum of distances from the submatrix
            idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))
            # Extract names of cluster elements from names_to_cluster
            sublist = [name for (name, cluster) in names_to_cluster if cluster == x]
            # Element closest to centroid
            centroid = sublist[idx]
            indices_to_keep.append(centroid)

        self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)]
Example #29
Source File: safe.py From safepy with GNU General Public License v3.0 | 4 votes |
def define_domains(self, **kwargs):

    # Overwriting global settings, if necessary
    if 'attribute_distance_threshold' in kwargs:
        self.attribute_distance_threshold = kwargs['attribute_distance_threshold']

    # Make sure that the settings are still valid
    self.validate_config()

    m = self.nes_binary[:, self.attributes['top']].T
    Z = linkage(m, method='average', metric=self.attribute_distance_metric)
    max_d = np.max(Z[:, 2] * self.attribute_distance_threshold)
    domains = fcluster(Z, max_d, criterion='distance')

    self.attributes['domain'] = 0
    self.attributes.loc[self.attributes['top'], 'domain'] = domains

    # Assign nodes to domains
    node2nes = pd.DataFrame(data=self.nes,
                            columns=[self.attributes.index.values, self.attributes['domain']])
    node2nes_binary = pd.DataFrame(data=self.nes_binary,
                                   columns=[self.attributes.index.values, self.attributes['domain']])

    # # A node belongs to the domain that contains the attribute
    # # for which the node has the highest enrichment
    # self.node2domain = node2es.groupby(level='domain', axis=1).max()
    # t_max = self.node2domain.loc[:, 1:].max(axis=1)
    # t_idxmax = self.node2domain.loc[:, 1:].idxmax(axis=1)
    # t_idxmax[t_max < -np.log10(self.enrichment_threshold)] = 0

    # A node belongs to the domain that contains the highest number of attributes
    # for which the node is significantly enriched
    self.node2domain = node2nes_binary.groupby(level='domain', axis=1).sum()
    t_max = self.node2domain.loc[:, 1:].max(axis=1)
    t_idxmax = self.node2domain.loc[:, 1:].idxmax(axis=1)
    t_idxmax[t_max == 0] = 0
    self.node2domain['primary_domain'] = t_idxmax

    # Get the max NES for the primary domain
    o = node2nes.groupby(level='domain', axis=1).max()
    i = pd.Series(t_idxmax)
    self.node2domain['primary_nes'] = o.lookup(i.index, i.values)

    if self.verbose:
        num_domains = len(np.unique(domains))
        num_attributes_per_domain = self.attributes.loc[self.attributes['domain'] > 0].groupby('domain')['id'].count()
        min_num_attributes = num_attributes_per_domain.min()
        max_num_attributes = num_attributes_per_domain.max()
        print('Number of domains: %d (containing %d-%d attributes)'
              % (num_domains, min_num_attributes, max_num_attributes))