Python scipy.cluster.hierarchy.linkage() Examples

The following are code examples of scipy.cluster.hierarchy.linkage(), collected from open-source projects. The project, source file, and license for each example are listed above its code. You may also want to check out all available functions and classes of the scipy.cluster.hierarchy module.
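Before the project examples, here is a minimal, self-contained sketch of the typical linkage() call pattern; the random data, metric, method, and distance threshold below are arbitrary choices for illustration only.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import pdist

# toy data: 10 observations with 3 features each
rng = np.random.default_rng(0)
X = rng.normal(size=(10, 3))

# linkage() accepts a condensed distance matrix produced by pdist() ...
d = pdist(X, metric='euclidean')
Z = linkage(d, method='average')

# ... or the raw observation matrix, in which case it computes the
# pairwise distances itself using the given metric
Z_alt = linkage(X, method='average', metric='euclidean')

# cut the tree at a distance threshold to obtain flat cluster labels
labels = fcluster(Z, t=1.5, criterion='distance')

# dendrogram() converts the linkage matrix into plotting coordinates
tree = dendrogram(Z, no_plot=True)

Most of the project examples below follow this pattern, differing mainly in how the pairwise distances are computed and in the linkage method chosen.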
Example #1
Source File: clustering.py    From anvio with GNU General Public License v3.0
def create_newick_file_from_matrix_file(observation_matrix_path, output_file_path, linkage=constants.linkage_method_default,
                         distance=constants.distance_metric_default, norm='l1', progress=progress, transpose=False,
                         items_order_file_path=None):
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    filesnpaths.is_output_file_writable(output_file_path)
    if items_order_file_path:
        filesnpaths.is_output_file_writable(items_order_file_path)

    id_to_sample_dict, sample_to_id_dict, header, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm, id_to_sample_dict)

    if output_file_path:
        open(output_file_path, 'w').write(newick.strip() + '\n')

    if items_order_file_path:
        open(items_order_file_path, 'w').write('\n'.join(utils.get_names_order_from_newick_tree(newick)) + '\n') 
Example #2
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single',  'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels = colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures" )
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory+'/dendrogram.pdf',figsize=(10, 8), dpi=300)
    # which data points go to which cluster
    # the indices of the data points are displayed as the ids
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters":Y, "informations":dn}
    
    return dataframe 


######################################## Plot the reconstruction error vs stabilities and select the optimum number of signatures ####################################################
Example #3
Source File: agglomerative.py    From aggregation with Apache License 2.0
def __agglomerative__(self,markings):
        """
        runs an initial agglomerative clustering over the given markings
        :param markings:
        :return:
        """
        # convert the markings into pandas format - there is probably a better way to do this,
        # but the labels do seem necessary
        labels = [str(i) for i in markings]
        param_labels = [str(i) for i in range(len(markings[0]))]

        df = pd.DataFrame(np.array(markings), columns=param_labels, index=labels)
        row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')), columns=labels, index=labels)
        # use Ward's method to do the actual clustering
        row_clusters = linkage(row_dist, method='ward')

        return row_clusters 
Example #4
Source File: marker.py    From scprep with GNU General Public License v3.0
def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels, s, c):
    # cluster tissues hierarchically using mean size and color
    tissue_features = []
    for tissue in tissue_names:
        tissue_data = []
        for cluster in cluster_names:
            tissue_cluster_idx = np.where(
                (np.array(tissue_labels) == tissue)
                & (np.array(cluster_labels) == cluster)
            )
            tissue_data.append(
                np.vstack([s[tissue_cluster_idx], c[tissue_cluster_idx]]).mean(axis=1)
            )
        tissue_features.append(np.concatenate(tissue_data))
    tissue_features = np.array(tissue_features)
    # normalize
    tissue_features = tissue_features / np.sqrt(np.sum(tissue_features ** 2))
    tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features))
    return tissues_order 
Example #5
Source File: plot.py    From pypath with GNU General Public License v3.0
def make_plot(self):

        self.z = hc.linkage(self.data, method='average')

        self.ax = self.fig.add_subplot(1, 1, 1)

        self.dendro = \
            hc.dendrogram(self.z,
                          labels=self.data.columns,
                          color_threshold=0,
                          orientation='left',
                          ax=self.ax,
                          link_color_func=lambda x: self.color)

        _ = [
            tl.set_fontproperties(self.fp_ticklabel)
            for tl in self.ax.get_yticklabels()
        ]
        _ = [
            tl.set_fontproperties(self.fp_ticklabel)
            for tl in self.ax.get_xticklabels()
        ]

        self.ax.xaxis.grid(True, color='#FFFFFF', lw=1, ls='solid')
        self.ax.yaxis.grid(False)
        self.ax.set_axisbelow(True)
        self.ax.set_facecolor('#EAEAF2')
        list(map(lambda s: s.set_lw(0), self.ax.spines.values()))
        self.ax.tick_params(which='both', length=0) 
Example #6
Source File: cluster.py    From cesi with Apache License 2.0
def getClusters(self, embed):

		n, m 	= len(embed), self.p.embed_dims
		X 	= np.empty((n, m), np.float32)

		for i in range(len(embed)): 
			X[i, :] = embed[i]

		dist 	  = pdist(X, 	  metric=self.p.metric)
		clust_res = linkage(dist, method=self.p.linkage)
		labels    = fcluster(clust_res, t=self.p.thresh_val, criterion='distance') - 1
		clusters  = [[] for i in range(max(labels) + 1)]

		for i in range(len(labels)): 
			clusters[labels[i]].append(i)

		return clusters 
Example #7
Source File: diarizationFunctions.py    From pyBK with MIT License
def performClusteringLinkage(segmentBKTable, segmentCVTable, N_init, linkageCriterion,linkageMetric ):
    from scipy.cluster.hierarchy import linkage
    from scipy import cluster
    if linkageMetric == 'jaccard':
      observations = segmentBKTable
    elif linkageMetric == 'cosine':
      observations = segmentCVTable
    else:
      observations = segmentCVTable      
    clusteringTable = np.zeros([np.size(segmentCVTable,0),N_init]) 
    Z = linkage(observations,method=linkageCriterion,metric=linkageMetric)
    for i in np.arange(N_init):
      clusteringTable[:,i] = cluster.hierarchy.cut_tree(Z,N_init-i).T+1  
    k=N_init
    print('done')
    return clusteringTable, k 
Example #8
Source File: allocation.py    From finance_ml with MIT License
def get_hrp(cov, corr):
    """Construct a hierarchical portfolio
    
    Params
    ------
    cov: pd.DataFrame
    corr: pd.DataFrame
    
    Returns
    -------
    pd.Series
    """
    dist = get_corr_dist(corr)
    link = sch.linkage(dist, 'single')
    sort_idx = get_quasi_diag(link)
    # Recover label
    sort_idx = corr.index[sort_idx].tolist()
    hrp = get_rec_bipart(cov, sort_idx)
    return hrp.sort_index() 
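A hedged usage sketch for get_hrp(): the covariance and correlation matrices would typically be computed from an asset-returns DataFrame, and get_corr_dist(), get_quasi_diag(), and get_rec_bipart() are assumed to be the neighbouring helpers in finance_ml's allocation.py (not shown here). The random returns are purely illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
returns = pd.DataFrame(rng.normal(size=(500, 5)),
                       columns=['A', 'B', 'C', 'D', 'E'])
hrp_weights = get_hrp(returns.cov(), returns.corr())  # pd.Series of portfolio weights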
Example #9
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cluster_distances_with_distance_threshold():
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))
    # check the distances within the clusters and with other clusters
    distance_threshold = 4
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="single").fit(X)
    labels = clustering.labels_
    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)
    for label in np.unique(labels):
        in_cluster_mask = labels == label
        max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask]
                                   .min(axis=0).max())
        min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask]
                                    .min(axis=0).min())
        # single data point clusters only have that inf diagonal here
        if in_cluster_mask.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
        assert min_out_cluster_distance >= distance_threshold 
Example #10
Source File: ttclust.py    From TTClust with GNU General Public License v3.0
def generate_graphs(clusters_list, output, size, linkage, cutoff, distances, traj):
    """
    DESCRIPTION
    Create a linear cluster mapping graph where every frame is printed as a
    colored barplot
    Args:
        clusters_list (list): list of cluster
        output (string): output name for graph
        size (int): number of frames
        linkage (numpy array): matrix linkage
        cutoff (float): cutoff distance value for clustering (in the dendrogram)
        distances(numpy array): distance matrix
        traj (Trajectory): trajectory for time usage in axis barplot
    Return:
        colors_list (list) to be used with 2D distance projection graph
    """
    colors_list = plot_barplot(clusters_list, output, size, traj)
    plot_dendro(linkage, output, cutoff, colors_list, clusters_list)
    plot_hist(clusters_list, output, colors_list)
    if (distances.shape[0] < 10000):
        plot_distmat(distances, output)
    else:
        printScreenLogfile("Too many frames! The RMSD distance matrix will not be generated")
    return colors_list 
Example #11
Source File: common.py    From plastering with MIT License
def hier_clustering(d, threshold=3):
    srcids = d.keys()
    tokenizer = lambda x: x.split()
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    assert isinstance(d, dict)
    assert isinstance(list(d.values())[0], list)
    assert isinstance(list(d.values())[0][0], str)
    doc = [' '.join(d[srcid]) for srcid in srcids]
    vect = vectorizer.fit_transform(doc)
    #TODO: Make vect aligned to the required format
    z = linkage(vect.toarray(), metric='cityblock', method='complete')
    dists = list(set(z[:,2]))
#    threshold = 3
    #threshold = (dists[2] + dists[3]) / 2
    b = hier.fcluster(z, threshold, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[str(cluster_id)].append(srcid)
    value_lengther = lambda x: len(x[1])
    return OrderedDict(\
               sorted(cluster_dict.items(), key=value_lengther, reverse=True)) 
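A hedged usage sketch for hier_clustering(), assuming the module-level imports in plastering's common.py (TfidfVectorizer, linkage, hier, defaultdict, OrderedDict); the source ids and token lists are invented for illustration.

tokens = {
    'srcid1': ['zone', 'temp', 'sensor'],
    'srcid2': ['zone', 'temp', 'setpoint'],
    'srcid3': ['supply', 'air', 'flow'],
}
clusters = hier_clustering(tokens, threshold=3)
# OrderedDict mapping cluster ids (as strings) to lists of source ids,
# largest cluster first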
Example #12
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X,
                                                   connectivity,
                                                   'euclidean')

    for linkage in ('single', 'average', 'average', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1) 
Example #13
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1) 
Example #14
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_unstructured_linkage_tree():
    # Check that we obtain the correct solution for unstructured linkage trees.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    for this_X in (X, X[0]):
        # With specified a number of clusters just for the sake of
        # raising a warning and testing the warning code
        with ignore_warnings():
            children, n_nodes, n_leaves, parent = assert_warns(
                UserWarning, ward_tree, this_X.T, n_clusters=10)
        n_nodes = 2 * X.shape[1] - 1
        assert_equal(len(children) + n_leaves, n_nodes)

    for tree_builder in _TREE_BUILDERS.values():
        for this_X in (X, X[0]):
            with ignore_warnings():
                children, n_nodes, n_leaves, parent = assert_warns(
                    UserWarning, tree_builder, this_X.T, n_clusters=10)

            n_nodes = 2 * X.shape[1] - 1
            assert_equal(len(children) + n_leaves, n_nodes) 
Example #15
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_structured_linkage_tree():
    # Check that we obtain the correct solution for structured linkage trees.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = \
            tree_builder(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes
        # Check that ward_tree raises a ValueError with a connectivity matrix
        # of the wrong shape
        assert_raises(ValueError,
                      tree_builder, X.T, np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        assert_raises(ValueError,
                      tree_builder, X.T[:0], connectivity) 
Example #16
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering on a precomputed distances matrix
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0]) 
Example #17
Source File: _clustergram.py    From dash-bio with MIT License
def _get_clusters(self):
        """Cluster the data according to the specified dimensions.

        Returns:
        - tuple: The linkage matrices for the columns and/or rows.
        """

        Zcol = None
        Zrow = None

        # cluster along columns
        if self._cluster in ["col", "all"]:
            tmp = np.transpose(self._data)
            dcol = self._dist_fun(tmp, metric=self._col_dist)
            Zcol = self._link_fun(dcol, optimal_ordering=self._optimal_leaf_order)
        # cluster along rows only if 'all' is selected
        if self._cluster in ["row", "all"]:
            drow = self._dist_fun(self._data, metric=self._row_dist)
            Zrow = self._link_fun(drow, optimal_ordering=self._optimal_leaf_order)

        return (Zcol, Zrow) 
Example #18
Source File: sqtl.py    From pancanatlas_code_public with MIT License
def get_col_linkage(combined_df, method='ward', metric='cosine'):
    CACHE_DIR = os.path.expanduser('~/cache/alt_splice_heatmap/sqtl')
    if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR)
    col_linkage_cache_path = os.path.join(CACHE_DIR, 'col_linkage_%s_%s.npy' %(method, metric))
    idx_linkage_cache_path = os.path.join(CACHE_DIR, 'idx.npy')
    col_name_cache_path = os.path.join(CACHE_DIR, 'col_names.npy')
    if os.path.exists(col_linkage_cache_path):
        print "Loading linkage from %s" %col_linkage_cache_path
        col_linkage = np.load(col_linkage_cache_path)
        assert np.array_equal(np.load(idx_linkage_cache_path), combined_df.index)
        assert np.array_equal(np.load(col_name_cache_path), combined_df.columns)
    else:
        print "Calculating linkage"
        col_linkage = hc.linkage(sp.distance.pdist(combined_df.values.T), method=method, metric=metric)
        np.save(col_linkage_cache_path, col_linkage)
        np.save(idx_linkage_cache_path, combined_df.index)
        np.save(col_name_cache_path, combined_df.columns)
    return col_linkage 
Example #19
Source File: zodiac.py    From plastering with MIT License
def create_cluster_map(self, bow, srcids):
        cluster_map = {}
        z = linkage(bow, metric='cityblock', method='complete')
        dists = list(set(z[:, 2]))
        thresh = (dists[1] + dists[2]) / 2
        self.logger.info('Threshold: {0}'.format(thresh))
        b = hier.fcluster(z, thresh, criterion='distance')
        assert bow.shape[0] == len(b)
        assert len(b) == len(srcids)
        for cid, srcid in zip(b, srcids):
            cluster_map[cid] = cluster_map.get(cid, []) + [srcid]

        self.logger.info('# of clusters: {0}'.format(len(b)))
        self.logger.info('sizes of clusters: {0}'.format(sorted(map(len, cluster_map.values()))))

        return cluster_map 
Example #20
Source File: sdm.py    From scedar with MIT License
def sort_x_by_d(x, dmat=None, metric="cosine", linkage="auto",
                    n_eval_rounds=None, optimal_ordering=False,
                    nprocs=None, verbose=False):
        dmat = SampleDistanceMatrix(x, d=dmat, metric=metric,
                                    nprocs=nprocs)._d
        xhct = HClustTree.hclust_tree(dmat, linkage="auto",
                                      is_euc_dist=(metric == "euclidean"),
                                      optimal_ordering=optimal_ordering)
        return xhct.leaf_ids() 
Example #21
Source File: precluster.py    From texta with GNU General Public License v3.0
def __init__(self,words, vectors, number_of_steps = 21,metric="cosine",linkage="complete"):
        self.words = words
        self.vectors = vectors
        self.number_of_steps = number_of_steps
        self.metric = metric
        self.linkage = linkage 
Example #22
Source File: utils.py    From lens with Apache License 2.0
def hierarchical_ordering_indices(columns, correlation_matrix):
    """Return array with hierarchical cluster ordering of columns

    Parameters
    ----------
    columns: iterable of str
        Names of columns.
    correlation_matrix: np.ndarray
        Matrix of correlation coefficients between columns.

    Returns
    -------
    indices: iterable of int
        Indices with order of columns
    """
    if len(columns) > 2:
        pairwise_dists = distance.pdist(
            np.where(np.isnan(correlation_matrix), 0, correlation_matrix),
            metric="euclidean",
        )
        linkage = hierarchy.linkage(pairwise_dists, method="average")
        dendogram = hierarchy.dendrogram(
            linkage, no_plot=True, color_threshold=-np.inf
        )
        idx = dendogram["leaves"]
    else:
        idx = list(range(len(columns)))

    return idx 
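A hedged usage sketch, assuming the module imports used by the function above (numpy as np, scipy.spatial.distance as distance, scipy.cluster.hierarchy as hierarchy); the correlation matrix is random and only for illustration.

import numpy as np

rng = np.random.default_rng(0)
columns = ['a', 'b', 'c', 'd']
correlation_matrix = np.corrcoef(rng.normal(size=(4, 100)))
order = hierarchical_ordering_indices(columns, correlation_matrix)
# order is a permutation of [0, 1, 2, 3] that places correlated columns next to each other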
Example #23
Source File: precluster.py    From texta with GNU General Public License v3.0
def __call__(self):
        if len(self.words) == 0 or len(self.vectors) == 0:
            return []
        if len(self.words) == 1:
            self.words.append(self.words[0])
            self.vectors.append(self.vectors[0])

        distance_matrix = scidist.pdist(np.array(self.vectors),self.metric)
        linkage_matrix = hier.linkage(distance_matrix,self.linkage)

        dendrogram = self._linkage_matrix_to_dendrogram(linkage_matrix,self.words,self.vectors)
        clusterings = self._create_clusterings(dendrogram)
        return [[(node.label,node.vector) for node in _get_cluster_nodes(cluster)] for cluster in self._find_optimal_clustering(clusterings)] 
Example #24
Source File: corClust.py    From Kitsune-py with MIT License
def cluster(self,maxClust):
        D = self.corrDist()
        Z = linkage(D[np.triu_indices(self.n, 1)])  # create a linkage matrix based on the distance matrix
        if maxClust < 1:
            maxClust = 1
        if maxClust > self.n:
            maxClust = self.n
        map = self.__breakClust__(to_tree(Z),maxClust)
        return map

    # a recursive helper function which breaks down the dendrogram branches until all clusters have no more than maxClust elements 
Example #25
Source File: sdm.py    From scedar with MIT License
def hclust_linkage(dmat, linkage="complete", n_eval_rounds=None,
                       is_euc_dist=False, optimal_ordering=False,
                       verbose=False):
        dmat = np.array(dmat, dtype="float")
        dmat = SampleDistanceMatrix.num_correct_dist_mat(dmat)

        n = dmat.shape[0]

        if linkage == "auto":
            try_linkages = ("single", "complete", "average", "weighted")

            if is_euc_dist:
                try_linkages += ("centroid", "median", "ward")

            if n_eval_rounds is None:
                n_eval_rounds = int(np.ceil(np.log2(n)))
            else:
                n_eval_rounds = int(np.ceil(max(np.log2(n), n_eval_rounds)))

            ltype_mdl_list = []
            for iter_ltype in try_linkages:
                iter_lhct = HClustTree.hclust_tree(dmat, linkage=iter_ltype)
                iter_nbp_cnt_list = iter_lhct.n_round_bipar_cnt(n_eval_rounds)
                iter_nbp_mdl_arr = np.array(list(map(
                    lambda x: MultinomialMdl(np.array(x)).mdl,
                    iter_nbp_cnt_list)))
                iter_nbp_mdl = np.sum(
                    iter_nbp_mdl_arr / np.arange(1, n_eval_rounds + 1))
                ltype_mdl_list.append(iter_nbp_mdl)

            linkage = try_linkages[ltype_mdl_list.index(max(ltype_mdl_list))]

            if verbose:
                print(linkage, tuple(zip(try_linkages, ltype_mdl_list)),
                      sep="\n")

        dmat_sf = spspatial.distance.squareform(dmat)
        hac_z = sch.linkage(dmat_sf, method=linkage,
                            optimal_ordering=optimal_ordering)
        return hac_z 
Example #26
Source File: ontobio-assoc.py    From ontobio with BSD 3-Clause "New" or "Revised" License
def plot_subject_term_matrix(ont, aset, args):
    import numpy as np
    import pandas as pd
    import scipy.cluster.hierarchy as sch
    import scipy.spatial as scs
    df = aset.as_dataframe(subjects=args.subjects)
    print('DF={}'.format(df))
    d = scs.distance.pdist(df)
    Z = sch.linkage(d, method='complete')
    P = sch.dendrogram(Z)
    print(P) 
Example #27
Source File: LogClustering.py    From loglizer with MIT License
def _offline_clustering(self, X):
        print('Starting offline clustering...')
        p_dist = pdist(X, metric=self._distance_metric)
        Z = linkage(p_dist, 'complete')
        cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))
        # print('The representive vectors are:')
        # pprint.pprint(self.representatives.tolist()) 
Example #28
Source File: common.py    From plastering with MIT License
def get_word_clusters(sentence_dict):
    srcids = list(sentence_dict.keys())
    sentences = []
    for srcid in srcids:
        sentence = []
        for metadata_type, sent in sentence_dict[srcid].items():
            sentence.append(''.join(sent))
        sentence = '\n'.join(sentence)
        sentence = ' '.join(re.findall('[a-z]+', sentence))
        sentences.append(sentence)
    vect = TfidfVectorizer()
    #vect = CountVectorizer()
    bow = vect.fit_transform(sentences).toarray()
    try:
        z = linkage(bow, metric='cityblock', method='complete')
    except:
        pdb.set_trace()
    dists = list(set(z[:,2]))
    thresh = (dists[2] + dists[3]) /2
    #thresh = (dists[1] + dists[2]) /2
    print("Threshold: ", thresh)
    b = hier.fcluster(z,thresh, criterion='distance')
    cluster_dict = defaultdict(list)

    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[cluster_id].append(srcid)
    return dict(cluster_dict) 
Example #29
Source File: rep_dists.py    From pancanatlas_code_public with MIT License
def heatmap_dists(data, norm=False, labels=None, metric='euclidean', method='ward'):
    fig, (ax, cax) = plt.subplots(ncols=2,figsize=(7 * 1.05 ,7),
                                  gridspec_kw={"width_ratios":[1, 0.05]})

    if labels is None:
        try:
            labels = data.index
        except AttributeError:
            pass

    n = data.shape[0]
    assert labels is None or len(labels) == n

    dists = ssd.pdist(data, metric=metric)
    linkage = sch.linkage(dists, metric=metric, method=method)
    dendro = sch.dendrogram(linkage, no_plot=True)
    order = dendro['leaves']
    sq_form_dists = ssd.squareform(dists)[order][:, order]
    assert sq_form_dists.shape == (n,n)

    hmap = ax.imshow(sq_form_dists, aspect='auto')
    ax.set_xticks(np.arange(n))
    ax.set_yticks(np.arange(n))
    if labels is not None:
        ax.set_xticklabels(labels[order], rotation=90)
        ax.set_yticklabels(labels[order])
    cb = plt.colorbar(hmap, cax=cax)
    return fig, (ax, cax)
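A hedged usage sketch for heatmap_dists(), assuming the module imports in rep_dists.py (matplotlib.pyplot as plt, numpy as np, scipy.spatial.distance as ssd, scipy.cluster.hierarchy as sch); the data, labels, and output path are invented for illustration. Note that labels should support fancy indexing (a numpy array or pandas Index), because the function reorders them with labels[order].

import numpy as np

data = np.random.rand(12, 5)
labels = np.array(['sample_%d' % i for i in range(12)])
fig, (ax, cax) = heatmap_dists(data, labels=labels, metric='euclidean', method='ward')
fig.savefig('rep_dists_heatmap.png')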

