Python scipy.cluster.hierarchy.linkage() Examples
The following are 30 code examples of scipy.cluster.hierarchy.linkage().
Example #1
Source File: From anvio with GNU General Public License v3.0 | 6 votes |
def create_newick_file_from_matrix_file(observation_matrix_path, output_file_path,
                                        linkage=constants.linkage_method_default,
                                        distance=constants.distance_metric_default,
                                        norm='l1', progress=progress, transpose=False,
                                        items_order_file_path=None):
    """Cluster a TAB-delimited observation matrix and write the tree as newick.

    The matrix is read with ``utils.get_vectors_from_TAB_delim_matrix``,
    converted to a newick string by ``get_newick_from_matrix`` and written
    (stripped, with one trailing newline) to `output_file_path`. When
    `items_order_file_path` is given, the names returned by
    ``utils.get_names_order_from_newick_tree`` are written there, one per
    line.

    `progress` is accepted for interface compatibility; this function does
    not use it.
    """
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)
    filesnpaths.is_output_file_writable(output_file_path)

    if items_order_file_path:
        filesnpaths.is_output_file_writable(items_order_file_path)

    id_to_sample_dict, sample_to_id_dict, header, vectors = \
        utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm, id_to_sample_dict)

    # Context managers guarantee the handles are flushed and closed even if
    # the write fails; the original left that to garbage collection.
    if output_file_path:
        with open(output_file_path, 'w') as newick_file:
            newick_file.write(newick.strip() + '\n')

    if items_order_file_path:
        with open(items_order_file_path, 'w') as order_file:
            order_file.write('\n'.join(utils.get_names_order_from_newick_tree(newick)) + '\n')
Example #2
Source File: From SigProfilerExtractor with BSD 2-Clause "Simplified" License | 6 votes |
def dendrogram(data, threshold, layer_directory):
    """Plot a single-linkage, cosine-distance dendrogram of the columns of `data`.

    The figure is saved as ``<layer_directory>/dendrogram.pdf`` and the
    columns are cut into flat clusters at `threshold`.

    :param data: table whose columns are the items to cluster; must expose
        ``.columns`` and be convertible with ``np.array``
    :param threshold: distance used both as the dendrogram colour threshold
        and as the ``fcluster`` cut-off (``criterion='distance'``)
    :param layer_directory: directory that receives ``dendrogram.pdf``
    :return: DataFrame indexed by "Sample Names" with a "Cluster" column
    """
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')

    plt.figure(figsize=(15, 9))
    # Called for its drawing side effect only; the returned dict is not used.
    hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    # savefig() has no `figsize` argument (the size is fixed by plt.figure
    # above); newer matplotlib rejects the unknown keyword.
    plt.savefig(layer_directory + '/dendrogram.pdf', dpi=300)

    # Which datapoint goes to which cluster; ids follow the column order.
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    return dataframe

######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature ####################################################
Example #3
Source File: From aggregation with Apache License 2.0 | 6 votes |
def __agglomerative__(self,markings):
    """
    runs an initial agglomerative clustering over the given markings

    :param markings: sequence of equal-length numeric vectors, one per marking
    :return: the scipy linkage matrix (one row per merge) built with Ward's
        method on euclidean distances
    """
    # linkage() interprets any 2-D input as raw observation vectors, so the
    # square distance matrix that used to be passed here was clustered as if
    # each row of distances were a data point. Passing the condensed pdist
    # output makes Ward operate on the actual euclidean distances.
    condensed_dist = pdist(np.array(markings), metric='euclidean')

    # use ward metric to do the actual clustering
    row_clusters = linkage(condensed_dist, method='ward')
    return row_clusters
Example #4
Source File: From scprep with GNU General Public License v3.0 | 6 votes |
def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels, s, c):
    """Order tissues by hierarchical clustering of their mean size and color.

    For every (tissue, cluster) pair the mean of `s` and the mean of `c` over
    the matching points are computed; the per-cluster pairs are concatenated
    into one feature vector per tissue. The feature matrix is scaled by its
    overall L2 norm, clustered with ``hierarchy.linkage`` defaults, and the
    leaf order of that tree is returned.
    """
    def _mean_size_and_color(tissue, cluster):
        # Positions of the points that belong to both this tissue and cluster.
        selected = np.where(
            (np.array(tissue_labels) == tissue)
            & (np.array(cluster_labels) == cluster)
        )
        return np.vstack([s[selected], c[selected]]).mean(axis=1)

    feature_rows = [
        np.concatenate(
            [_mean_size_and_color(tissue, cluster) for cluster in cluster_names]
        )
        for tissue in tissue_names
    ]
    features = np.array(feature_rows)

    # normalize by the norm of the whole matrix
    features = features / np.sqrt(np.sum(features ** 2))

    tree = hierarchy.linkage(features)
    return hierarchy.leaves_list(tree)
Example #5
Source File: From SigProfilerExtractor with BSD 2-Clause "Simplified" License | 6 votes |
def dendrogram(data, threshold, layer_directory):
    """Plot a single-linkage, cosine-distance dendrogram of the columns of `data`.

    The figure is saved as ``<layer_directory>/dendrogram.pdf`` and the
    columns are cut into flat clusters at `threshold`.

    :param data: table whose columns are the items to cluster; must expose
        ``.columns`` and be convertible with ``np.array``
    :param threshold: distance used both as the dendrogram colour threshold
        and as the ``fcluster`` cut-off (``criterion='distance'``)
    :param layer_directory: directory that receives ``dendrogram.pdf``
    :return: DataFrame indexed by "Sample Names" with a "Cluster" column
    """
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')

    plt.figure(figsize=(15, 9))
    # Called for its drawing side effect only; the returned dict is not used.
    hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    # savefig() has no `figsize` argument (the size is fixed by plt.figure
    # above); newer matplotlib rejects the unknown keyword.
    plt.savefig(layer_directory + '/dendrogram.pdf', dpi=300)

    # Which datapoint goes to which cluster; ids follow the column order.
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    return dataframe

######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature ####################################################
Example #6
Source File: From pypath with GNU General Public License v3.0 | 6 votes |
# NOTE(review): this snippet lost tokens during extraction (for example the first argument of hc.linkage and several assignment targets/call receivers are missing), so it is not runnable as shown. Restore it from the upstream source before relying on it.
def make_plot(self): self.z = hc.linkage(, method='average') = self.fig.add_subplot(1, 1, 1) self.dendro = \ hc.dendrogram(self.z,, color_threshold=0, orientation='left',, link_color_func=lambda x: self.color) _ = [ tl.set_fontproperties(self.fp_ticklabel) for tl in ] _ = [ tl.set_fontproperties(self.fp_ticklabel) for tl in ], color='#FFFFFF', lw=1, ls='solid')'#EAEAF2') list(map(lambda s: s.set_lw(0),'both', length=0)
Example #7
Source File: From cesi with Apache License 2.0 | 6 votes |
def getClusters(self, embed):
    """Cluster embedding vectors and return the member indices of each cluster.

    `embed[i]` is copied into row ``i`` of a float32 matrix with
    ``self.p.embed_dims`` columns. Pairwise distances use ``self.p.metric``,
    the tree is built with ``self.p.linkage`` and cut at distance
    ``self.p.thresh_val``.

    Returns a list in which entry ``k`` holds the row indices assigned to
    cluster ``k`` (0-based).
    """
    n_rows = len(embed)
    matrix = np.empty((n_rows, self.p.embed_dims), np.float32)
    for row in range(n_rows):
        matrix[row, :] = embed[row]

    pairwise = pdist(matrix, metric=self.p.metric)
    tree = linkage(pairwise, method=self.p.linkage)

    # fcluster labels start at 1; shift so they can index the result list.
    labels = fcluster(tree, t=self.p.thresh_val, criterion='distance') - 1

    clusters = [[] for _ in range(max(labels) + 1)]
    for member, label in enumerate(labels):
        clusters[label].append(member)
    return clusters
Example #8
Source File: From pyBK with MIT License | 6 votes |
def performClusteringLinkage(segmentBKTable, segmentCVTable, N_init, linkageCriterion,linkageMetric ):
    """Build one flat clustering per cluster count from N_init down to 1.

    The observations are `segmentBKTable` when `linkageMetric` is 'jaccard'
    and `segmentCVTable` for every other metric. Column ``i`` of the returned
    table holds 1-based cluster labels for a cut into ``N_init - i`` clusters.

    Returns ``(clusteringTable, k)`` with ``k == N_init``.
    """
    from scipy.cluster.hierarchy import linkage
    from scipy import cluster

    observations = segmentBKTable if linkageMetric == 'jaccard' else segmentCVTable

    n_segments = np.size(segmentCVTable, 0)
    clusteringTable = np.zeros([n_segments, N_init])

    Z = linkage(observations, method=linkageCriterion, metric=linkageMetric)
    for column, n_clusters in enumerate(range(N_init, 0, -1)):
        # cut_tree labels are 0-based; store them 1-based.
        clusteringTable[:, column] = cluster.hierarchy.cut_tree(Z, n_clusters).T + 1

    k = N_init
    print('done')
    return clusteringTable, k
Example #9
Source File: From finance_ml with MIT License | 6 votes |
def get_hrp(cov, corr):
    """Construct a hierarchical portfolio

    Params
    ------
    cov: pd.DataFrame
    corr: pd.DataFrame

    Returns
    -------
    pd.Series
    """
    # Single-linkage tree over the correlation-derived distances.
    tree = sch.linkage(get_corr_dist(corr), 'single')

    # Map the positional ordering back to the labels of `corr`.
    ordered_labels = corr.index[get_quasi_diag(tree)].tolist()

    allocation = get_rec_bipart(cov, ordered_labels)
    return allocation.sort_index()
Example #10
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cluster_distances_with_distance_threshold():
    """Clusters cut at a distance threshold must respect that threshold."""
    distance_threshold = 4
    n_samples = 100
    rng = np.random.RandomState(0)
    X = rng.randint(-10, 10, size=(n_samples, 3))

    model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="single")
    labels = model.fit(X).labels_

    D = pairwise_distances(X, metric="minkowski", p=2)
    # An infinite diagonal keeps self-distances out of every min() below.
    np.fill_diagonal(D, np.inf)

    for label in np.unique(labels):
        members = labels == label
        member_rows = D[members]
        largest_nearest_inside = member_rows[:, members].min(axis=0).max()
        smallest_nearest_outside = member_rows[:, ~members].min(axis=0).min()
        # A singleton cluster only sees its own infinite diagonal entry.
        if members.sum() > 1:
            assert largest_nearest_inside < distance_threshold
        assert smallest_nearest_outside >= distance_threshold
Example #11
Source File: From TTClust with GNU General Public License v3.0 | 6 votes |
def generate_graphs(clusters_list, output, size, linkage, cutoff, distances, traj):
    """
    DESCRIPTION
    Produce the cluster graphs: the linear barplot in which every frame is a
    colored bar, the dendrogram, the histogram and, when there are fewer than
    10000 frames, the distance matrix plot.

    Args:
        clusters_list (list): list of cluster
        output (string): output name for graph
        size (int): number of frames
        linkage (numpy array): matrix linkage
        cutoff (float): cutoff distance value for clustering (in the dendogram)
        distances(numpy array): distance matrix
        traj (Trajectory): trajectory for time usage in axis barplot
    Return:
        colors_list (list) to be used with 2D distance projection graph
    """
    colors_list = plot_barplot(clusters_list, output, size, traj)
    plot_dendro(linkage, output, cutoff, colors_list, clusters_list)
    plot_hist(clusters_list, output, colors_list)

    n_frames = distances.shape[0]
    if n_frames >= 10000:
        printScreenLogfile("Too many frames! The RMSD distance matrix will not be generated")
    else:
        plot_distmat(distances, output)

    return colors_list
Example #12
Source File: From plastering with MIT License | 6 votes |
def hier_clustering(d, threshold=3):
    """Cluster source ids by the TF-IDF vectors of their token lists.

    :param d: dict mapping a source id to a list of string tokens
    :param threshold: distance cut-off passed to ``fcluster``
        (``criterion='distance'``)
    :return: OrderedDict mapping the cluster id (as ``str``) to its source
        ids, largest cluster first
    """
    assert isinstance(d, dict)
    assert isinstance(list(d.values())[0], list)
    assert isinstance(list(d.values())[0][0], str)

    srcids = list(d)
    # Tokens are already separated, so whitespace splitting is the tokenizer.
    vectorizer = TfidfVectorizer(tokenizer=str.split)
    doc = [' '.join(d[srcid]) for srcid in srcids]
    vect = vectorizer.fit_transform(doc)
    #TODO: Make vect aligned to the required format
    z = linkage(vect.toarray(), metric='cityblock', method='complete')
    b = hier.fcluster(z, threshold, criterion='distance')

    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[str(cluster_id)].append(srcid)

    return OrderedDict(
        sorted(cluster_dict.items(), key=lambda item: len(item[1]), reverse=True))
Example #13
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X,
                                                   connectivity,
                                                   'euclidean')

    # 'average' was listed twice; 'complete' takes the place of the duplicate
    # so each linkage is exercised once.
    for linkage in ('single', 'complete', 'average', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        # labels_ only exists after fitting.
        clustering.fit(X)

        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1)
Example #14
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    # labels_ only exists after fitting.
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1)
Example #15
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_unstructured_linkage_tree():
    """Unstructured tree builders return the expected number of nodes."""
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    expected_nodes = 2 * X.shape[1] - 1

    # ward_tree is checked first, then every registered builder. A number of
    # clusters is passed only to trigger, and thereby test, the warning path.
    for tree_builder in [ward_tree] + list(_TREE_BUILDERS.values()):
        for this_X in (X, X[0]):
            with ignore_warnings():
                children, _, n_leaves, _ = assert_warns(
                    UserWarning, tree_builder, this_X.T, n_clusters=10)
            assert_equal(len(children) + n_leaves, expected_nodes)
Example #16
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_structured_linkage_tree():
    # Check that we obtain the correct solution for structured linkage trees.
    rng = np.random.RandomState(0)
    # The `np.bool` alias was removed from NumPy; the builtin `bool` yields
    # the same dtype.
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = \
            tree_builder(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes
        # Check that the builder raises a ValueError with a connectivity
        # matrix of the wrong shape
        assert_raises(ValueError,
                      tree_builder, X.T, np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        assert_raises(ValueError,
                      tree_builder, X.T[:0], connectivity)
Example #17
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_linkage_misc():
    """Miscellaneous checks on linkage handling."""
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    bad_connectivity = np.ones((4, 4))

    # Unknown linkage names and a mis-shaped connectivity are rejected.
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=bad_connectivity)

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # A precomputed cosine distance matrix gives the same children as
    # affinity="cosine".
    children_precomputed = linkage_tree(cosine_distances(X),
                                        affinity="precomputed")[0]
    children_cosine = linkage_tree(X, affinity="cosine")[0]
    assert_array_equal(children_precomputed, children_cosine)

    # A callable affinity gives the same children as its string name.
    children_callable = linkage_tree(X, affinity=manhattan_distances)[0]
    children_manhattan = linkage_tree(X, affinity="manhattan")[0]
    assert_array_equal(children_callable, children_manhattan)
Example #18
Source File: From dash-bio with MIT License | 6 votes |
def _get_clusters(self):
    """Cluster the data according to the specified dimensions.

    Columns are clustered when ``self._cluster`` is "col" or "all", rows
    when it is "row" or "all".

    Returns:
    - tuple: ``(Zcol, Zrow)`` linkage results; an entry is None when that
        dimension was not clustered.
    """
    mode = self._cluster
    Zcol = Zrow = None

    if mode in ("col", "all"):
        # Columns are clustered by treating them as rows of the transpose.
        col_distances = self._dist_fun(np.transpose(self._data), metric=self._col_dist)
        Zcol = self._link_fun(col_distances, optimal_ordering=self._optimal_leaf_order)

    if mode in ("row", "all"):
        row_distances = self._dist_fun(self._data, metric=self._row_dist)
        Zrow = self._link_fun(row_distances, optimal_ordering=self._optimal_leaf_order)

    return (Zcol, Zrow)
Example #19
Source File: From pancanatlas_code_public with MIT License | 6 votes |
# NOTE(review): extraction dropped text from the `else` branch of this snippet: the calls that received `col_linkage`, `combined_df.index` and `combined_df.columns` as trailing arguments are missing (presumably the cache writes matching the np.load calls above — TODO confirm against the upstream source). The `print` statements use Python 2 syntax. Not runnable as shown.
def get_col_linkage(combined_df, method='ward', metric='cosine'): CACHE_DIR = os.path.expanduser('~/cache/alt_splice_heatmap/sqtl') if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR) col_linkage_cache_path = os.path.join(CACHE_DIR, 'col_linkage_%s_%s.npy' %(method, metric)) idx_linkage_cache_path = os.path.join(CACHE_DIR, 'idx.npy') col_name_cache_path = os.path.join(CACHE_DIR, 'col_names.npy') if os.path.exists(col_linkage_cache_path): print "Loading linkage from %s" %col_linkage_cache_path col_linkage = np.load(col_linkage_cache_path) assert np.array_equal(np.load(idx_linkage_cache_path), combined_df.index) assert np.array_equal(np.load(col_name_cache_path), combined_df.columns) else: print "Calculating linkage" col_linkage = hc.linkage(sp.distance.pdist(combined_df.values.T), method=method, metric=metric), col_linkage), combined_df.index), combined_df.columns) return col_linkage
Example #20
Source File: From plastering with MIT License | 6 votes |
# NOTE(review): extraction dropped the callee in front of three format-string calls ('Threshold: ...', '# of clusters: ...', 'sizes of clustsers:...'), so this snippet is not runnable as shown; restore it from the upstream source. Also note that `dists` comes from list(set(...)) and is therefore not sorted before it is indexed.
def create_cluster_map(self, bow, srcids): cluster_map = {} z = linkage(bow, metric='cityblock', method='complete') dists = list(set(z[:, 2])) thresh = (dists[1] + dists[2]) / 2'Threshold: {0}'.format(thresh)) b = hier.fcluster(z, thresh, criterion='distance') assert bow.shape[0] == len(b) assert len(b) == len(srcids) for cid, srcid in zip(b, srcids): cluster_map[cid] = cluster_map.get(cid, []) + [srcid]'# of clusters: {0}'.format(len(b)))'sizes of clustsers:{0}'.format(sorted(map(len, cluster_map.values())))) return cluster_map
Example #21
Source File: From scedar with MIT License | 5 votes |
def sort_x_by_d(x, dmat=None, metric="cosine", linkage="auto", n_eval_rounds=None, optimal_ordering=False, nprocs=None, verbose=False):
    """Return the leaf ids of the hierarchical clustering tree built on `x`.

    A distance matrix is obtained from ``SampleDistanceMatrix`` (using `dmat`
    when given), a tree is built with ``HClustTree.hclust_tree`` and its
    ``leaf_ids()`` are returned.

    `linkage` is now forwarded to the tree builder; previously the call
    hard-coded "auto" and silently ignored the argument. The default is
    still "auto", so callers relying on the default are unaffected.

    `n_eval_rounds` and `verbose` are accepted but not forwarded, as before.
    """
    dmat = SampleDistanceMatrix(x, d=dmat, metric=metric, nprocs=nprocs)._d
    xhct = HClustTree.hclust_tree(dmat, linkage=linkage,
                                  is_euc_dist=(metric == "euclidean"),
                                  optimal_ordering=optimal_ordering)
    return xhct.leaf_ids()
Example #22
Source File: From texta with GNU General Public License v3.0 | 5 votes |
def __init__(self,words, vectors, number_of_steps = 21,metric="cosine",linkage="complete"):
    """Store the items to cluster and the clustering settings on the instance."""
    # Data to be clustered.
    self.words, self.vectors = words, vectors
    # Settings, kept verbatim for later use.
    self.number_of_steps = number_of_steps
    self.metric, self.linkage = metric, linkage
Example #23
Source File: From lens with Apache License 2.0 | 5 votes |
def hierarchical_ordering_indices(columns, correlation_matrix):
    """Return array with hierarchical cluster ordering of columns

    With two or fewer columns the natural order is returned. Otherwise NaN
    correlations are treated as 0, the rows of the matrix are compared by
    euclidean distance, clustered with average linkage, and the dendrogram
    leaf order is returned.

    Parameters
    ----------
    columns: iterable of str
        Names of columns.
    correlation_matrix: np.ndarray
        Matrix of correlation coefficients between columns.

    Returns
    -------
    indices: iterable of int
        Indices with order of columns
    """
    n_columns = len(columns)
    if n_columns <= 2:
        return list(range(n_columns))

    filled = np.where(np.isnan(correlation_matrix), 0, correlation_matrix)
    pairwise_dists = distance.pdist(filled, metric="euclidean")
    tree = hierarchy.linkage(pairwise_dists, method="average")
    # Only the leaf order is needed, so nothing is drawn.
    layout = hierarchy.dendrogram(tree, no_plot=True, color_threshold=-np.inf)
    return layout["leaves"]
Example #24
Source File: From texta with GNU General Public License v3.0 | 5 votes |
def __call__(self):
    """Cluster the stored words and return the selected clustering.

    Returns an empty list when there are no words or no vectors. Otherwise
    returns a list of clusters, each a list of ``(label, vector)`` tuples
    taken from the clustering picked by ``self._find_optimal_clustering``.
    """
    n_words = len(self.words)
    if n_words == 0 or len(self.vectors) == 0:
        return []

    if n_words == 1:
        # Duplicate the only item so that a pairwise distance exists.
        self.words.append(self.words[0])
        self.vectors.append(self.vectors[0])

    condensed = scidist.pdist(np.array(self.vectors), self.metric)
    merges = hier.linkage(condensed, self.linkage)
    tree = self._linkage_matrix_to_dendrogram(merges, self.words, self.vectors)
    candidates = self._create_clusterings(tree)

    result = []
    for cluster in self._find_optimal_clustering(candidates):
        members = [(node.label, node.vector) for node in _get_cluster_nodes(cluster)]
        result.append(members)
    return result
Example #25
Source File: From Kitsune-py with MIT License | 5 votes |
def cluster(self,maxClust):
    """Break the feature dendrogram into clusters bounded by `maxClust`.

    The upper triangle of ``self.corrDist()`` is used as a condensed
    distance vector for ``linkage``. `maxClust` is clamped to at least 1 and
    then to at most ``self.n`` before the tree is handed to
    ``self.__breakClust__``, whose result is returned.
    """
    distances = self.corrDist()
    upper = np.triu_indices(self.n, 1)
    merges = linkage(distances[upper])  # linkage matrix from the condensed distances

    size_limit = min(max(maxClust, 1), self.n)
    return self.__breakClust__(to_tree(merges), size_limit)

# a recursive helper function which breaks down the dendrogram branches until all clusters have no more than maxClust elements
Example #26
Source File: From scedar with MIT License | 5 votes |
def hclust_linkage(dmat, linkage="complete", n_eval_rounds=None, is_euc_dist=False, optimal_ordering=False, verbose=False):
    """Compute the scipy linkage matrix for a square distance matrix.

    With ``linkage="auto"`` several linkage methods are tried (the
    centroid, median and ward methods only when `is_euc_dist` is true).
    Each candidate tree is scored by summing, over the evaluation rounds,
    the MultinomialMdl of that round's bipartition counts divided by the
    1-based round number; the method with the highest score is used. The
    number of rounds is ``ceil(log2(n))``, or ``ceil(max(log2(n),
    n_eval_rounds))`` when `n_eval_rounds` is given.
    """
    dmat = np.array(dmat, dtype="float")
    dmat = SampleDistanceMatrix.num_correct_dist_mat(dmat)
    n = dmat.shape[0]

    if linkage == "auto":
        candidates = ("single", "complete", "average", "weighted")
        if is_euc_dist:
            candidates += ("centroid", "median", "ward")

        if n_eval_rounds is None:
            n_eval_rounds = int(np.ceil(np.log2(n)))
        else:
            n_eval_rounds = int(np.ceil(max(np.log2(n), n_eval_rounds)))

        round_numbers = np.arange(1, n_eval_rounds + 1)
        scores = []
        for candidate in candidates:
            tree = HClustTree.hclust_tree(dmat, linkage=candidate)
            counts_per_round = tree.n_round_bipar_cnt(n_eval_rounds)
            mdl_per_round = np.array(
                [MultinomialMdl(np.array(counts)).mdl for counts in counts_per_round])
            scores.append(np.sum(mdl_per_round / round_numbers))

        linkage = candidates[scores.index(max(scores))]
        if verbose:
            print(linkage, tuple(zip(candidates, scores)), sep="\n")

    # scipy expects the condensed form of the square matrix.
    condensed = spspatial.distance.squareform(dmat)
    return sch.linkage(condensed, method=linkage,
                       optimal_ordering=optimal_ordering)
Example #27
Source File: From ontobio with BSD 3-Clause "New" or "Revised" License | 5 votes |
def plot_subject_term_matrix(ont, aset, args):
    """Print the subject dataframe and the dendrogram data of its rows.

    The dataframe comes from ``aset.as_dataframe(subjects=args.subjects)``;
    its rows are clustered with complete linkage on ``pdist`` defaults.
    `ont` is not used here.
    """
    import numpy as np
    import pandas as pd
    import scipy.cluster.hierarchy as sch
    import scipy.spatial as scs

    frame = aset.as_dataframe(subjects=args.subjects)
    print('DF={}'.format(frame))

    condensed = scs.distance.pdist(frame)
    tree = sch.linkage(condensed, method='complete')
    layout = sch.dendrogram(tree)
    print(layout)
Example #28
Source File: From loglizer with MIT License | 5 votes |
def _offline_clustering(self, X):
    """Cluster the rows of `X` offline and extract cluster representatives.

    Pairwise distances use ``self._distance_metric``; the complete-linkage
    tree is cut at distance ``self.max_dist`` and the resulting labels are
    passed, together with `X`, to ``self._extract_representatives``.
    Progress is reported on stdout.
    """
    print('Starting offline clustering...')

    condensed = pdist(X, metric=self._distance_metric)
    tree = linkage(condensed, 'complete')
    labels = fcluster(tree, self.max_dist, criterion='distance')
    self._extract_representatives(X, labels)

    n_instances = X.shape[0]
    n_clusters = len(self.representatives)
    print('Processed {} instances.'.format(n_instances))
    print('Found {} clusters offline.\n'.format(n_clusters))
    # print('The representive vectors are:')
    # pprint.pprint(self.representatives.tolist())
Example #29
Source File: From plastering with MIT License | 5 votes |
def get_word_clusters(sentence_dict):
    """Group source ids by hierarchical clustering of their TF-IDF vectors.

    :param sentence_dict: dict mapping a source id to a dict of
        ``metadata_type -> sequence of strings``; the strings of each
        metadata type are concatenated and only runs of lowercase letters
        are kept as words
    :return: dict mapping a cluster id to the list of source ids in it
    :raises IndexError: when the tree has fewer than four distinct merge
        distances, since the threshold needs the 3rd and 4th smallest
    """
    srcids = list(sentence_dict.keys())
    sentences = []
    for srcid in srcids:
        parts = [''.join(sent) for sent in sentence_dict[srcid].values()]
        words = re.findall(r'[a-z]+', '\n'.join(parts))
        sentences.append(' '.join(words))

    bow = TfidfVectorizer().fit_transform(sentences).toarray()

    # Errors now propagate to the caller. The previous bare `except` dropped
    # into pdb and then failed with a NameError on the undefined `z`.
    z = linkage(bow, metric='cityblock', method='complete')

    # A set has no defined order, so the distinct merge distances are sorted
    # before being indexed; the cut sits midway between the 3rd and 4th
    # smallest.
    dists = sorted(set(z[:, 2]))
    thresh = (dists[2] + dists[3]) / 2
    print("Threshold: ", thresh)

    b = hier.fcluster(z, thresh, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[cluster_id].append(srcid)
    return dict(cluster_dict)
Example #30
Source File: From pancanatlas_code_public with MIT License | 5 votes |
