Python scipy.cluster.hierarchy.dendrogram() Examples

The following are 30 code examples of scipy.cluster.hierarchy.dendrogram(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scipy.cluster.hierarchy, or try the search function.
Example #1
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def dendrogram(data, threshold, layer_directory):
    """Cluster the columns of ``data`` and save the dendrogram as a PDF.

    The columns are clustered with single linkage on cosine distance, the
    tree is written to ``<layer_directory>/dendrogram.pdf`` and flat clusters
    are obtained by cutting the tree at ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are the items to cluster.
    threshold : float
        Distance used both to colour the dendrogram and to cut the tree.
    layer_directory : str
        Directory that receives ``dendrogram.pdf``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "Sample Names" with one "Cluster" column holding the
        flat cluster id of every column of ``data``.
    """
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    fig = plt.figure(figsize=(15, 9))
    # A plain list avoids truth-testing a pandas Index inside scipy.
    hierarchy.dendrogram(Z, labels=list(colnames), color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    # figsize belongs to the figure, not to savefig, so it is not passed here.
    plt.savefig(layer_directory+'/dendrogram.pdf', dpi=300)
    # Release the figure so repeated calls do not pile up open figures.
    plt.close(fig)

    # Which datapoint goes to which cluster.
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")

    return dataframe


######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature #################################################### 
Example #2
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def dendrogram(data, threshold, layer_directory):
    """Cluster the columns of ``data`` and save the dendrogram as a PDF.

    The columns are clustered with single linkage on cosine distance, the
    tree is written to ``<layer_directory>/dendrogram.pdf`` and flat clusters
    are obtained by cutting the tree at ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are the items to cluster.
    threshold : float
        Distance used both to colour the dendrogram and to cut the tree.
    layer_directory : str
        Directory that receives ``dendrogram.pdf``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "Sample Names" with one "Cluster" column holding the
        flat cluster id of every column of ``data``.
    """
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    fig = plt.figure(figsize=(15, 9))
    # A plain list avoids truth-testing a pandas Index inside scipy.
    hierarchy.dendrogram(Z, labels=list(colnames), color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures")
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    # figsize belongs to the figure, not to savefig, so it is not passed here.
    plt.savefig(layer_directory+'/dendrogram.pdf', dpi=300)
    # Release the figure so repeated calls do not pile up open figures.
    plt.close(fig)

    # Which datapoint goes to which cluster.
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster": Y, "Sample Names": list(colnames)})
    dataframe = dataframe.set_index("Sample Names")

    return dataframe


######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature #################################################### 
Example #3
Source File: regions.py    From TOBIAS with MIT License 6 votes vote down vote up
def assign_colors(self):
    """ Give every tree node a plotting color based on its cluster """

    palette = ["blue", "green", "red", "orange"]
    groups = self.linkage_clusters

    # One entry per possible node of the tree; black unless overridden below.
    colors = ["black"] * (2 * self.n - 1)

    used = 0
    for key in sorted(groups):
        members = groups[key]
        if len(members) <= 1:
            continue    # single-member clusters keep the default color

        # Walk through the palette and start over once it is exhausted.
        shade = palette[used % len(palette)]
        for node in members:
            colors[node] = shade
        used += 1

    self.node_color = colors
Example #4
Source File: plot.py    From pypath with GNU General Public License v3.0 6 votes vote down vote up
def make_plot(self):
    """Cluster ``self.data`` and draw the dendrogram on a fresh subplot.

    Stores the linkage in ``self.z``, the axes in ``self.ax`` and the
    dendrogram description in ``self.dendro``, then styles the axes.
    """
    self.z = hc.linkage(self.data, method='average')
    self.ax = self.fig.add_subplot(1, 1, 1)

    def uniform_color(_link):
        # Every link is drawn in the same configured color.
        return self.color

    self.dendro = hc.dendrogram(
        self.z,
        labels=self.data.columns,
        color_threshold=0,
        orientation='left',
        ax=self.ax,
        link_color_func=uniform_color,
    )

    # Apply the tick label font to both axes, y first.
    for tick_label in self.ax.get_yticklabels():
        tick_label.set_fontproperties(self.fp_ticklabel)
    for tick_label in self.ax.get_xticklabels():
        tick_label.set_fontproperties(self.fp_ticklabel)

    axes = self.ax
    axes.xaxis.grid(True, color='#FFFFFF', lw=1, ls='solid')
    axes.yaxis.grid(False)
    axes.set_axisbelow(True)
    axes.set_facecolor('#EAEAF2')
    for spine in axes.spines.values():
        spine.set_lw(0)
    axes.tick_params(which='both', length=0)
Example #5
Source File: env_corr.py    From glosim with MIT License 6 votes vote down vote up
def plotdendro(Z,ncluster,filename,rep_ind):
    """Draw a truncated dendrogram of ``Z`` and save it to ``filename``.

    Parameters
    ----------
    Z : linkage matrix passed to ``sc.dendrogram``.
    ncluster : int
        Number of leaves kept by the ``'lastp'`` truncation.
    filename : str
        Output path handed to ``plt.savefig``.
    rep_ind : unused; kept so existing callers keep working.
    """
    plt.figure(figsize=(10, 15))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')

    sc.dendrogram(Z, truncate_mode='lastp', p=ncluster, orientation='right',
                  leaf_rotation=90., leaf_font_size=20., show_contracted=False)

    # 'papertype' and 'frameon' are no longer accepted by current Matplotlib
    # savefig, so they are not passed.
    plt.savefig(filename, dpi=100, facecolor='w', edgecolor='w',
                orientation='portrait', format=None,
                transparent=True, bbox_inches=None, pad_inches=0.1)
Example #6
Source File: ontobio-assoc.py    From ontobio with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def run_query_associations(ont, aset, args):
    """Print the queried associations and plot them as a plotly heatmap.

    When ``args.dendrogram`` is set the work is delegated to
    ``plot_subject_term_matrix`` and nothing else happens.
    """
    if args.dendrogram:
        plot_subject_term_matrix(ont, aset, args)
        return

    import plotly.plotly as py
    import plotly.graph_objs as go

    pairs = aset.query_associations(subjects=args.subjects)
    for first, second in pairs:
        print("{} {}".format(first, second))

    matrix, x_labels, y_labels = tuple_to_matrix(pairs)
    x_labels = mk_axis(x_labels, aset, args)
    y_labels = mk_axis(y_labels, aset, args)
    logging.info("PLOTTING: {} x {} = {}".format(x_labels, y_labels, matrix))

    heatmap = go.Heatmap(z=matrix, x=x_labels, y=y_labels)
    py.plot([heatmap], filename='labelled-heatmap')
    #plot_dendrogram(z, xaxis, yaxis)
    
# TODO: fix this really dumb implementation 
Example #7
Source File: agglomerative.py    From atap with Apache License 2.0 6 votes vote down vote up
def plot_dendrogram(self, **kwargs):
    """Plot a dendrogram built from ``self.children``.

    The linkage matrix is assembled from ``self.children`` plus two
    placeholder columns that both hold the merge index, so the drawn heights
    reflect merge order only.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``dendrogram``.
    """
    # Merge order stands in for both the distance and the fourth column.
    distance = np.arange(self.children.shape[0])
    position = np.arange(self.children.shape[0])

    # Create linkage matrix
    linkage_matrix = np.column_stack([
        self.children, distance, position]
    ).astype(float)

    # Plot the corresponding dendrogram
    plt.subplots(figsize=(15, 7))  # set size
    dendrogram(linkage_matrix, **kwargs)
    # tick_params expects booleans; the string 'off' is truthy and would
    # leave the ticks and labels switched on.
    plt.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
    plt.tight_layout()
    plt.show()
Example #8
Source File: construction.py    From FinanceHub with MIT License 6 votes vote down vote up
def plot_dendrogram(self, show_chart=True, save_path=None, figsize=(8, 8),
                    threshold=None):
    """
    Draws the dendrogram of ``self.link`` with scipy's plotting routine.
    :param show_chart: If True, the chart is displayed.
    :param save_path: file path for the saved figure; nothing is saved if None.
    :param figsize: tuple with the figure dimensions.
    :param threshold: height at which the dendrogram coloring is cut. If None, scipy's default applies,
                      which cuts the dendrogram at 70% of its height (0.7*max(self.link[:,2])).
    """
    plt.figure(figsize=figsize)
    sch.dendrogram(
        self.link,
        orientation='left',
        labels=self.sort_ix,
        color_threshold=threshold,
    )
    plt.tight_layout()

    if save_path is not None:
        plt.savefig(save_path, pad_inches=1, dpi=400)

    if show_chart:
        plt.show()

    plt.close()
Example #9
Source File: KEGG_clustering.py    From BioData with MIT License 5 votes vote down vote up
def hClust_euclidean(genome_df):
    """Return ``genome_df`` with its rows reordered by hierarchical clustering.

    Rows are clustered with average linkage on Euclidean distance and then
    reindexed in the leaf order of the resulting dendrogram.
    """
    row_names = genome_df.index.tolist()
    tree = linkage(genome_df, method='average', metric='euclidean')
    # no_plot: only the leaf order is needed, nothing is drawn
    layout = dendrogram(tree, no_plot=True, labels=row_names, get_leaves=True)
    # 'ivl' holds the leaf labels in dendrogram order
    ordered_names = list(layout['ivl'])
    return genome_df.reindex(ordered_names)
Example #10
Source File: __init__.py    From pyani with MIT License 5 votes vote down vote up
def clean_axis(axis):
    """Strip ticks, tick labels, and the frame from an axis.

    :param axis: axes object that is modified in place
    """
    # Empty tick lists on x first, then y.
    for getter in (axis.get_xaxis, axis.get_yaxis):
        getter().set_ticks([])
    # Hiding every spine removes the frame.
    for spine in axis.spines.values():
        spine.set_visible(False)


# Add dendrogram and axes to passed figure 
Example #11
Source File: heatmap.py    From CompareM with GNU General Public License v3.0 5 votes vote down vote up
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):
    """Draw a black dendrogram for ``matrix`` and return its flat clustering.

    Returns a tuple of the flat cluster ids (tree cut at
    ``clusteringThreshold`` times the largest merge distance) and the leaf
    order of the dendrogram. Ticks on ``axis`` are removed.
    """
    pairwise = dist.pdist(matrix)
    square = dist.squareform(pairwise)
    # NOTE(review): linkage receives the square distance matrix together with
    # a metric, so it is treated as an observation matrix - confirm intended.
    tree = cluster.linkage(square, method='average', metric='cityblock')
    layout = cluster.dendrogram(tree, orientation=orientation, link_color_func=lambda k: 'k')

    cut_height = clusteringThreshold * max(tree[:,2])
    flat_ids = cluster.fcluster(tree, cut_height, 'distance')

    for clear_ticks in (axis.set_xticks, axis.set_yticks):
        clear_ticks([])

    return flat_ids, layout['leaves']
Example #12
Source File: utils.py    From lens with Apache License 2.0 5 votes vote down vote up
def hierarchical_ordering_indices(columns, correlation_matrix):
    """Order columns by hierarchical clustering of their correlations

    Parameters
    ----------
    columns: iterable of str
        Column names; only their count is used.
    correlation_matrix: np.ndarray
        Correlation coefficients between columns. NaN entries are treated
        as 0 before clustering.

    Returns
    -------
    indices: iterable of int
        Column indices in dendrogram leaf order. With two or fewer columns
        the original order is returned.
    """
    n_columns = len(columns)
    if n_columns <= 2:
        # Nothing to cluster: keep the given order.
        return list(range(n_columns))

    cleaned = np.where(np.isnan(correlation_matrix), 0, correlation_matrix)
    condensed = distance.pdist(cleaned, metric="euclidean")
    tree = hierarchy.linkage(condensed, method="average")
    layout = hierarchy.dendrogram(tree, no_plot=True, color_threshold=-np.inf)
    return layout["leaves"]
Example #13
Source File: cluster_line_markings.py    From Data-digging with MIT License 5 votes vote down vote up
def plot_dendrogram(the_linkage, theid, d_max):
    """Save a two-panel figure describing a hierarchical clustering.

    The left panel shows the dendrogram of ``the_linkage``; the right panel
    plots the merge distance (third linkage column) per iteration. Both
    panels get a horizontal grey line at ``d_max``. The figure is written to
    ``dendrograms/dendrogram_<theid>_<cluster_method>.png``, where
    ``cluster_method`` is read from module scope.

    Parameters
    ----------
    the_linkage : linkage matrix (indexed as ``the_linkage[:, 2]``).
    theid : value formatted into the output file name.
    d_max : distance at which the reference line is drawn.
    """
    fig = plt.figure(figsize=(12, 5))

    axL = fig.add_subplot(1,2,1)

    axL.set_title('Hierarchical %s Clustering Dendrogram' % cluster_method)
    axL.set_xlabel('sample index')
    axL.set_ylabel('distance')
    dendrogram(
        the_linkage,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
        ax=axL,             # draw on the left panel explicitly
    )
    # freeze the current xlimits so the reference line does not rescale them
    xlimits = axL.get_xlim()
    axL.plot(xlimits, np.array([d_max, d_max]), linestyle='-', color="#777777")
    axL.set_xlim(xlimits)

    axR = fig.add_subplot(1,2,2)
    axR.plot(the_linkage[:,2])
    # freeze the current xlimits here as well
    xlimits = axR.get_xlim()
    axR.plot(xlimits, np.array([d_max, d_max]), linestyle='-', color="#777777")
    axR.set_xlim(xlimits)

    axR.set_xlabel('iteration')
    axR.set_ylabel('distance')

    plt.tight_layout()

    plt.savefig('dendrograms/dendrogram_%s_%s.png' % (theid, cluster_method), facecolor='None', edgecolor='None')
    #plt.show()
    # Close this figure explicitly: plt.close('') and plt.close('All') look
    # up figures by those labels and therefore released nothing.
    plt.close(fig)
Example #14
Source File: document_clustering.py    From text-analytics-with-python with Apache License 2.0 5 votes vote down vote up
def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,12)):
    """Plot a left-oriented dendrogram labelled with movie titles and save it.

    Parameters
    ----------
    linkage_matrix : linkage matrix passed to ``dendrogram``.
    movie_data : object whose ``['Title'].values.tolist()`` yields the labels.
    figure_size : tuple
        Figure size handed to ``plt.subplots``.

    The figure is saved as ``ward_hierachical_clusters.png`` in the current
    working directory.
    """
    # set size
    plt.subplots(figsize=figure_size)
    movie_titles = movie_data['Title'].values.tolist()
    # plot dendrogram
    dendrogram(linkage_matrix, orientation="left", labels=movie_titles)
    # tick_params expects booleans; the string 'off' is truthy and would
    # leave the x ticks and labels switched on.
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)
    plt.tight_layout()
    plt.savefig('ward_hierachical_clusters.png', dpi=200)

# build ward's linkage matrix 
Example #15
Source File: heatmap.py    From SqueezeMeta with GNU General Public License v3.0 5 votes vote down vote up
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):
    """Plot an all-black dendrogram of ``matrix`` and cut it into clusters.

    The cut height is ``clusteringThreshold`` times the largest merge
    distance. Returns ``(flat cluster ids, dendrogram leaf order)`` and
    clears the ticks of ``axis``.
    """
    condensed = dist.pdist(matrix)
    # NOTE(review): the squareform result is passed to linkage with a metric,
    # so it is clustered as an observation matrix - confirm this is intended.
    merges = cluster.linkage(dist.squareform(condensed), method='average', metric='cityblock')
    drawn = cluster.dendrogram(merges, orientation=orientation, link_color_func=lambda k: 'k')

    tallest_merge = max(merges[:,2])
    membership = cluster.fcluster(merges, clusteringThreshold * tallest_merge, 'distance')

    axis.set_xticks([])
    axis.set_yticks([])

    leaf_order = drawn['leaves']
    return membership, leaf_order
Example #16
Source File: KEGG_clustering.py    From BioData with MIT License 5 votes vote down vote up
def hClust_correlation(genome_df):
    """Return ``genome_df`` with rows sorted into dendrogram leaf order.

    Clustering uses single linkage on correlation distance between rows.
    """
    merges = linkage(genome_df, method='single', metric='correlation')
    info = dendrogram(
        merges,
        no_plot=True,                       # compute the layout only
        labels=genome_df.index.tolist(),
        get_leaves=True,
    )
    # 'ivl' lists the row labels as they appear along the leaves
    return genome_df.reindex(list(info['ivl']))
Example #17
Source File: __init__.py    From xai with MIT License 5 votes vote down vote up
def _plot_correlation_dendogram(
        corr: pd.DataFrame, 
        cols: List[str],
        plt_kwargs=None):
    """
    Plot a dendogram of a correlation matrix, using the columns provided.
    This consists of a chart that shows hierarchically the variables
    that are most correlated by the connecting trees. The closer to the right
    that the connection is, the more correlated the features are.

    :Example:

    columns_to_include=["age", "loan", "gender"]
    xai._plot_correlation_dendogram(df, cols=columns_to_include)

    :param corr: square correlation matrix; ``1 - corr`` is used as distance
    :param cols: labels for the leaves, one per row of ``corr``
    :param plt_kwargs: optional keyword arguments forwarded to ``plt.figure``
    :returns: Null
    :rtype: None

    """
    # A fresh dict per call instead of a shared mutable default.
    if plt_kwargs is None:
        plt_kwargs = {}

    corr = np.round(corr, 4)
    corr_condensed = hc.distance.squareform(1-corr)
    z = hc.linkage(corr_condensed, method="average")
    plt.figure(**plt_kwargs)
    hc.dendrogram(
        z, labels=cols, orientation="left", leaf_font_size=16)
    plt.show()
Example #18
Source File: heatmap.py    From traitar with GNU General Public License v3.0 5 votes vote down vote up
def exportFlatClusterData(filename, new_row_header,new_column_header,xt,ind1,ind2):
    """ Export the clustered results as text files, only indicating the flat-clusters rather than the tree

    Two tab-separated files are written next to each other: ``filename`` with
    '.pdf' replaced by '.txt' (flat clusters) and the same name with '.cdt'.

    filename          -- output name; '.pdf' is replaced to derive both outputs
    new_row_header    -- row names (strings), reversed before writing
    new_column_header -- column names (strings)
    xt                -- clustered data matrix, rows reversed before writing
    ind1              -- flat cluster id per written row
    ind2              -- flat cluster id per column

    Uses str methods and explicit lists so it runs on Python 2 and Python 3
    (the ``string.join``/``string.replace`` helpers do not exist on Python 3).
    """

    def _tab_line(fields):
        # One tab-separated, newline-terminated record.
        return '\t'.join(fields) + '\n'

    column_names = list(new_column_header)

    ### The clusters, dendrogram and flat clusters are drawn bottom-up, so we need to reverse the order to match
    new_row_header = new_row_header[::-1]
    xt = xt[::-1]

    filename = filename.replace('.pdf', '.txt')
    with open(filename, 'w') as export_text:
        ### format column-names for export
        export_text.write(_tab_line(['UID', 'row_clusters-flat'] + column_names))
        ### format column-flat-clusters for export
        export_text.write(_tab_line(['column_clusters-flat', ''] + [str(c) for c in ind2]))
        ### Export each row in the clustered data matrix xt
        for i, row in enumerate(xt):
            export_text.write(_tab_line([new_row_header[i], str(ind1[i])] + [str(v) for v in row]))

    ### Export as CDT file
    filename = filename.replace('.txt', '.cdt')
    with open(filename, 'w') as export_cdt:
        export_cdt.write(_tab_line(['UNIQID', 'NAME', 'GWEIGHT'] + column_names))
        export_cdt.write(_tab_line(['EWEIGHT', '', ''] + ['1'] * len(column_names)))
        ### Export each row in the clustered data matrix xt
        for i, row in enumerate(xt):
            export_cdt.write(_tab_line([new_row_header[i]] * 2 + ['1'] + [str(v) for v in row]))

################# Create Custom Color Gradients #################
#http://matplotlib.sourceforge.net/examples/pylab_examples/custom_cmap.html 
Example #19
Source File: regions.py    From TOBIAS with MIT License 5 votes vote down vote up
def cluster(self, threshold=0.5, method="average"):
    """ Cluster the overlap dictionary: build the linkage, collect the merges at or below threshold, and record the members of each cluster """

    self.overlap_to_distance()

    if len(self.names) <= 1:    #only one TF
        self.linkage_clusters = {0: [0]}
        self.linkage_mat = np.array([[0]])
        lone_idx = [0]
        self.clusters[0] = {
            "member_idx": lone_idx,
            "member_names": [self.names[idx] for idx in lone_idx],
        }
    else:
        self.linkage_mat = linkage(squareform(self.distance_mat), method)
        self.labels = fcluster(self.linkage_mat, threshold, criterion="distance")

        #Start with one group per leaf; merged groups are keyed by their new node id
        groups = {leaf: [leaf] for leaf in range(self.n)}
        self.linkage_clusters = groups
        for step, merge in enumerate(self.linkage_mat):
            height = merge[2]
            if not height <= threshold:
                continue    #merge lies above the threshold; leave both groups untouched

            left = int(merge[0])
            right = int(merge[1])
            merged_id = self.n + step
            groups[merged_id] = groups[left] + groups[right] + [merged_id]
            del groups[left]
            del groups[right]

        #Add member-names to clusters (ids below self.n are leaves)
        for cluster_id, nodes in self.linkage_clusters.items():
            leaf_idx = [idx for idx in nodes if idx < self.n]
            self.clusters[cluster_id] = {
                "member_idx": leaf_idx,
                "member_names": [self.names[idx] for idx in leaf_idx],
            }

    self.get_cluster_names()    #Set names of clusters
    self.assign_colors()
Example #20
Source File: motif_clust.py    From TOBIAS with MIT License 5 votes vote down vote up
def plot_dendrogram(label, linkage, font_size, out, title, threshold, dpi):
    """Save a right-oriented dendrogram with the threshold drawn as a red line
    Parameter:
    ----------
    label : list
        Leaf labels; their count also sets the figure height
    linkage : ndarray
        Linkage matrix encoding the hierarchical clustering
    font_size : int
        Font size of the leaf labels
    out : String
        Path of the saved figure
    title : String
        Title of the plot
    threshold : float
        Coloring threshold, also the position of the red line
    dpi : int
        Resolution of the saved figure
    """

    width = 10.0
    height = width * len(label)/(width*3)    #height grows with the number of labels

    plt.figure(figsize=(width, height))
    plt.title(title, fontsize=20)
    plt.axvline(x=threshold, color="red")
    dendrogram(
        linkage,
        color_threshold=threshold,
        labels=label,
        leaf_font_size=font_size,
        orientation="right",
    )

    try:
        plt.tight_layout()
        plt.savefig(out, dpi=dpi)
    except ValueError as err:
        #the figure could not be laid out or saved; report and continue
        for message in ("Skipped plotting of dendrogram.", "Error: " + str(err)):
            print(message)

#--------------------------------------------------------------------------------------------------------# 
Example #21
Source File: __init__.py    From EDeN with MIT License 5 votes vote down vote up
def dendrogram(data,
               vectorizer,
               method="ward",
               color_threshold=1,
               size=10,
               filename=None):
    """Plot a dendrogram of the graphs in ``data``.

    Each graph is labelled with its ``graph.graph['id']``; graphs without a
    usable id are labelled with their position, so the number of labels
    always matches the number of graphs.

    Parameters
    ----------
    data : iterable of graphs exposing a ``graph`` dict.
    vectorizer : object whose ``transform`` turns the graphs into a matrix.
    method : str
        Linkage method, one of "median", "centroid", "weighted", "single",
        "ward", "complete", "average".
    color_threshold : number passed to scipy's ``dendrogram``.
    size : number
        Width and height of the square figure.
    filename : str or None
        Where to save the plot; if None the plot is shown instead.
    """
    data = list(data)
    # get labels, falling back to the position for graphs without an id
    ids = [graph.graph.get('id', None) for graph in data]
    labels = [label if label else str(i) for i, label in enumerate(ids)]

    # transform input into sparse vectors
    data_matrix = vectorizer.transform(data)

    # cluster on the pairwise distances between the vectors
    from sklearn import metrics
    from scipy.cluster.hierarchy import linkage, dendrogram
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)
    # NOTE(review): linkage is given the square distance matrix, which scipy
    # reads as an observation matrix - confirm this is intended.
    linkage_matrix = linkage(distance_matrix, method=method)
    plt.figure(figsize=(size, size))
    dendrogram(linkage_matrix,
               color_threshold=color_threshold,
               labels=labels,
               orientation='right')
    if filename is not None:
        plt.savefig(filename)
    else:
        plt.show()
Example #22
Source File: hierarchy.py    From malss with MIT License 5 votes vote down vote up
def dendrogram(self):
    """Return the dendrogram of ``self.model``, truncated to at most 12 leaves."""
    # 'lastp' keeps only the last p merged clusters; never ask for more
    # than the model has entries.
    shown = min(12, len(self.model))
    return dendrogram(self.model, truncate_mode='lastp', p=shown)
Example #23
Source File: cluster.py    From glosim with MIT License 5 votes vote down vote up
def plotdendro(Z,ncluster,filename,rep_ind):
    """Draw a truncated dendrogram of ``Z`` and save it to ``filename``.

    Parameters
    ----------
    Z : linkage matrix passed to ``sc.dendrogram``.
    ncluster : int
        Number of leaves kept by the ``'lastp'`` truncation.
    filename : str
        Output path handed to ``plt.savefig``.
    rep_ind : unused; kept so existing callers keep working.
    """
    plt.figure(figsize=(10, 15))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    sc.dendrogram(Z, truncate_mode='lastp', p=ncluster, orientation='right',
                  leaf_rotation=90., leaf_font_size=20., show_contracted=False)

    # 'papertype' and 'frameon' are no longer accepted by current Matplotlib
    # savefig, so they are not passed.
    plt.savefig(filename, dpi=100, facecolor='w', edgecolor='w',
                orientation='portrait', format=None,
                transparent=True, bbox_inches=None, pad_inches=0.1)
Example #24
Source File: ontobio-assoc.py    From ontobio with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def plot_subject_term_matrix(ont, aset, args):
    """Cluster the association dataframe for ``args.subjects`` and print the dendrogram data.

    The dataframe and the dict returned by scipy's ``dendrogram`` are both
    printed to stdout; ``ont`` is not used here.
    """
    import numpy as np
    import pandas as pd
    import scipy.cluster.hierarchy as sch
    import scipy.spatial as scs

    frame = aset.as_dataframe(subjects=args.subjects)
    print('DF={}'.format(frame))

    # complete-linkage tree over the pairwise row distances
    pairwise = scs.distance.pdist(frame)
    tree = sch.linkage(pairwise, method='complete')
    layout = sch.dendrogram(tree)
    print(layout)
Example #25
Source File: env_corr.py    From glosim with MIT License 5 votes vote down vote up
def clusterdistmatfull(distmatrixfile,sim,mode='average',plot=False):
    """Cluster on the element-wise square of ``sim`` and draw the full tree.

    Parameters
    ----------
    distmatrixfile : unused; kept so existing callers keep working.
    sim : array-like
        Squared element-wise and passed to ``sc.linkage``.
    mode : str
        Linkage method.
    plot : unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(c_list, Z)``: the dendrogram leaf order as an array, and the
        linkage matrix.
    """
    # Compute the clustering on dist^2 so that the average
    # distance of a cluster with another is the RMS distance
    sim2 = sim*sim
    Z = sc.linkage(sim2,mode)

    # get the full tree
    plt.figure(figsize=(10, 15))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendo = sc.dendrogram(Z,orientation='right',leaf_rotation=90.,leaf_font_size=20.,show_contracted=False)
    c_list = np.array(dendo['leaves'])

    # Number of distinct entries in the leaf list
    c_count = Counter(c_list)
    nbclst = len(c_count)

    # Function-call form with %-formatting prints the same text on
    # Python 2 and Python 3 (the print statement is a SyntaxError on 3).
    print("Number of clusters %d" % nbclst)

    return c_list,Z
Example #26
Source File: rep_dists.py    From pancanatlas_code_public with MIT License 5 votes vote down vote up
def heatmap_dists(data, norm=False, labels=None, metric='euclidean', method='ward'):
    """Plot a heatmap of pairwise distances with rows/columns in cluster order.

    Parameters
    ----------
    data : 2-D array-like or DataFrame
        One observation per row.
    norm : unused in this function.
    labels : sequence or None
        One tick label per row. Defaults to ``data.index`` when ``data`` has
        one. Lists, arrays and pandas indexes are all accepted.
    metric : str
        Distance metric for ``pdist``.
    method : str
        Linkage method.

    Returns
    -------
    tuple
        ``(fig, (ax, cax))``: the figure, the heatmap axes and the colorbar axes.
    """
    fig, (ax, cax) = plt.subplots(ncols=2,figsize=(7 * 1.05 ,7),
                                  gridspec_kw={"width_ratios":[1, 0.05]})

    if labels is None:
        # Fall back to the row index when the input has one.
        labels = getattr(data, 'index', None)

    n = data.shape[0]
    assert labels is None or len(labels) == n

    dists = ssd.pdist(data, metric=metric)
    linkage = sch.linkage(dists, metric=metric, method=method)
    dendro = sch.dendrogram(linkage, no_plot=True)
    order = dendro['leaves']
    # Reorder rows and columns of the square distance matrix by leaf order.
    sq_form_dists = ssd.squareform(dists)[order][:, order]
    assert sq_form_dists.shape == (n,n)

    hmap = ax.imshow(sq_form_dists, aspect='auto')
    ax.set_xticks(np.arange(n))
    ax.set_yticks(np.arange(n))
    if labels is not None:
        # Element-wise lookup works for plain lists as well as for indexes;
        # labels[order] raises TypeError on a list.
        ordered_labels = [labels[i] for i in order]
        ax.set_xticklabels(ordered_labels, rotation=90)
        ax.set_yticklabels(ordered_labels)
    plt.colorbar(hmap, cax=cax)
    return fig, (ax, cax)


# Tasks 
Example #27
Source File: Plotter.py    From CAN_Reverse_Engineering with GNU General Public License v3.0 5 votes vote down vote up
def plot_dendrogram(a_timer: PipelineTimer,
                    linkage_matrix,
                    threshold: float,
                    vehicle_number: str,
                    force: bool = False):
    """Plot the clustering dendrogram for one vehicle and save it to disk.

    The output file is ``dendrogram_<vehicle_number>.<figure_format>``;
    ``figure_format`` and ``figure_transp`` are read from module scope.

    Parameters
    ----------
    a_timer : not used in this function.
    linkage_matrix : linkage matrix passed to ``dendrogram``.
    threshold : float
        Height at which the dashed cluster-threshold line is drawn.
    vehicle_number : str
        Used in the file name and the plot title.
    force : bool
        If True an existing output file is removed and re-plotted; otherwise
        an existing file makes the function return early.
    """
    dendrogram_filename = "dendrogram_" + vehicle_number + "." + figure_format
    if path.isfile(dendrogram_filename):
        if force:
            remove(dendrogram_filename)
        else:
            print("Dendrogram already plotted. Skipping...")
            return
    plt.figure(figsize=(7, 7), dpi=600)
    dendrogram(Z=linkage_matrix, orientation='top', distance_sort='ascending', no_labels=True)
    plt.title("Dendrogram of Agglomerative Clustering for Vehicle " + vehicle_number)
    plt.xlabel("Signals Observed")
    plt.ylabel("Single Linkage Cluster Merge Distance")
    xmin, xmax = plt.xlim()
    # Add a 25% opacity dashed black line across the plot at the cluster threshold
    plt.hlines(y=threshold, xmin=xmin, xmax=xmax, alpha=0.25, colors='black', linestyle='dashed',
               label='cluster threshold')
    plt.legend(loc='upper right')

    print("\tPlotting dendrogram and saving to " + dendrogram_filename)

    # The keyword is bbox_inches; the misspelt 'bbox_iches' never applied
    # the tight bounding box.
    savefig(dendrogram_filename,
            bbox_inches='tight',
            pad_inches=0.0,
            dpi=600,
            format=figure_format,
            transparent=figure_transp)
    plt.close()
    print("\t\tComplete...")
Example #28
Source File: _clustergram.py    From dash-bio with MIT License 5 votes vote down vote up
def _sort_traces(self, rdt, cdt):
        """Sort row dendrogram clusters and column dendrogram clusters
        so that the background trace (above threshold) is trace 0
        and all other traces are ordered top-to-bottom (row dendrogram)
        or left-to-right (column dendrogram).

        Parameters:
        - rdt (list[dict]): The row dendrogram cluster traces.
        - cdt (list[dict]): The column dendrogram cluster traces.

        Returns:
        - tuple: The sorted row dendrogram clusters and column
            dendrogram clusters.
        """

        tmp_rdt = []
        tmp_cdt = []

        if len(rdt) > 0:
            # first, find background trace: (max 'x')
            rdt.sort(key=lambda t: -1 * max(list(t["x"])))
            tmp_rdt.append(rdt[0])
            # then, sort top-to-bottom
            r = rdt[1:]
            r.sort(key=lambda t: -1 * min(list(t["y"])))
            tmp_rdt += r
        if len(cdt) > 0:
            # background trace has max 'y'
            cdt.sort(key=lambda t: -1 * max(list(t["y"])))
            tmp_cdt.append(cdt[0])
            # sort left to right
            c = cdt[1:]
            c.sort(key=lambda t: min(list(t["x"])))
            tmp_cdt += c

        return (tmp_rdt, tmp_cdt) 
Example #29
Source File: data_viewing.py    From lumin with Apache License 2.0 5 votes vote down vote up
def plot_rank_order_dendrogram(df:pd.DataFrame, threshold:float=0.8, savename:Optional[str]=None, settings:PlotSettings=PlotSettings()) \
        -> Dict[str,Union[List[str],float]]:
    r'''
    Plots a dendrogram of the features in df, clustered on 1 - |Spearman's rank correlation coefficient|.
    Also returns the sets of features whose merge distance does not exceed 1 - threshold.

    Arguments:
        df: Pandas DataFrame containing data
        threshold: Threshold on correlation coefficient
        savename: Optional name of file to which to save the plot
        settings: :class:`~lumin.plotting.plot_settings.PlotSettings` class to control figure appearance

    Returns:
        Dict keyed by merged-cluster id; each value holds the clustered features ('children') and the cluster 'distance'
    '''

    rank_corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
    # Abs because the negative of a feature is a trivial transformation: information unaffected
    condensed = hc.distance.squareform(1-np.abs(rank_corr))
    z = hc.linkage(condensed, method='average', optimal_ordering=True)

    with sns.axes_style('white'), sns.color_palette(settings.cat_palette):
        plt.figure(figsize=(settings.w_large, (0.5*len(df.columns))))
        hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=settings.lbl_sz, color_threshold=1-threshold)
        plt.xlabel("Distance (1 - |Spearman's Rank Correlation Coefficient|)", fontsize=settings.lbl_sz, color=settings.lbl_col)
        plt.xticks(fontsize=settings.tk_sz, color=settings.tk_col)
        if savename is not None:
            plt.savefig(settings.savepath/f'{savename}{settings.format}', bbox_inches='tight')
        plt.show()

    feats = df.columns
    n_merges = len(z)
    sets = {}

    def _members(node):
        # Ids up to n_merges are original features; larger ids refer to a
        # previously recorded merge, which is consumed when absorbed.
        if node <= n_merges:
            return [feats[int(node)]]
        return sets.pop(int(node))['children']

    for i, merge in enumerate(z):
        if merge[2] > 1-threshold:
            continue
        left = _members(merge[0])
        right = _members(merge[1])
        sets[1 + i + n_merges] = {'children': [*left, *right], 'distance': merge[2]}
    return sets
Example #30
Source File: rep_dists.py    From pancanatlas_code_public with MIT License 4 votes vote down vote up
def heatmap_dists_with_dendro(data, norm=False, labels=None, metric='euclidean', method='ward'):
    """Plot a clustered distance heatmap with the dendrogram drawn on top.

    Parameters
    ----------
    data : 2-D array-like or DataFrame
        One observation per row.
    norm : bool
        If True the reordered distances are z-scored over the whole matrix,
        negated, and shown with the 'cubehelix' colormap clipped to [-4, 4].
    labels : sequence or None
        One tick label per row. Defaults to ``data.index`` when ``data`` has
        one. Lists, arrays and pandas indexes are all accepted.
    metric : str
        Distance metric for ``pdist``.
    method : str
        Linkage method.

    Returns
    -------
    None
    """

    fig = plt.figure(figsize=(7 * 1.30, 7 * 1.25))
    gs = gridspec.GridSpec(ncols=3, nrows=2, height_ratios=[.25, 1], width_ratios=[.25, 1, .05], hspace=0)
    dend_top_ax = fig.add_subplot(gs[0,1])
    hmap_ax     = fig.add_subplot(gs[1,1])
    cbar_ax     = fig.add_subplot(gs[1,2])
    dend_top_ax.set_axis_off()

    if labels is None:
        # Fall back to the row index when the input has one.
        labels = getattr(data, 'index', None)

    n = data.shape[0]
    assert labels is None or len(labels) == n

    dists = ssd.pdist(data, metric=metric)
    linkage = sch.linkage(dists, metric=metric, method=method)
    dendro = sch.dendrogram(linkage, ax=dend_top_ax, color_threshold=0, above_threshold_color='black')
    order = dendro['leaves']
    # Reorder rows and columns of the square distance matrix by leaf order.
    sq_form_dists = ssd.squareform(dists)[order][:, order]
    assert sq_form_dists.shape == (n,n)

    if norm:
        sq_form_dists = spst.zscore(sq_form_dists, axis=None)
        sq_form_dists *= -1
        cmap = plt.get_cmap('cubehelix')
        vmin = -4
        vmax = 4
    else:
        cmap = plt.get_cmap()
        vmin = None
        vmax = None
    hmap = hmap_ax.imshow(sq_form_dists, aspect='auto', cmap=cmap, vmin=vmin, vmax=vmax)
    hmap_ax.set_xticks(np.arange(n))
    hmap_ax.set_yticks(np.arange(n))
    if labels is not None:
        # Element-wise lookup works for plain lists as well as for indexes;
        # labels[order] raises TypeError on a list.
        ordered_labels = [labels[i] for i in order]
        hmap_ax.set_xticklabels(ordered_labels, rotation=90)
        hmap_ax.set_yticklabels(ordered_labels)
    plt.colorbar(hmap, cax=cbar_ax)
    return