Python scipy.cluster.hierarchy.linkage() Examples

The following are code examples of scipy.cluster.hierarchy.linkage(), collected from open-source projects. The project, source file, and license for each example are listed above its code. You may also want to check out all available functions and classes of the scipy.cluster.hierarchy module.
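Before the project examples, here is a minimal, self-contained sketch of the typical linkage() call pattern; the random data, metric, method, and distance threshold below are arbitrary choices for illustration only.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import pdist

# toy data: 10 observations with 3 features each
rng = np.random.default_rng(0)
X = rng.normal(size=(10, 3))

# linkage() accepts a condensed distance matrix produced by pdist() ...
d = pdist(X, metric='euclidean')
Z = linkage(d, method='average')

# ... or the raw observation matrix, in which case it computes the
# pairwise distances itself using the given metric
Z_alt = linkage(X, method='average', metric='euclidean')

# cut the tree at a distance threshold to obtain flat cluster labels
labels = fcluster(Z, t=1.5, criterion='distance')

# dendrogram() converts the linkage matrix into plotting coordinates
tree = dendrogram(Z, no_plot=True)

Most of the project examples below follow this pattern, differing mainly in how the pairwise distances are computed and in the linkage method chosen.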
Example #1
Source File: clustering.py    From anvio with GNU General Public License v3.0
def create_newick_file_from_matrix_file(observation_matrix_path, output_file_path, linkage=constants.linkage_method_default,
                         distance=constants.distance_metric_default, norm='l1', progress=progress, transpose=False,
                         items_order_file_path=None):
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    filesnpaths.is_output_file_writable(output_file_path)
    if items_order_file_path:
        filesnpaths.is_output_file_writable(items_order_file_path)

    id_to_sample_dict, sample_to_id_dict, header, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm, id_to_sample_dict)

    if output_file_path:
        open(output_file_path, 'w').write(newick.strip() + '\n')

    if items_order_file_path:
        open(items_order_file_path, 'w').write('\n'.join(utils.get_names_order_from_newick_tree(newick)) + '\n') 
Example #2
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single',  'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels = colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures" )
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory+'/dendrogram.pdf',figsize=(10, 8), dpi=300)
    # which data points go to which cluster
    # the indices of the data points are displayed as the ids
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters":Y, "informations":dn}
    
    return dataframe 


######################################## Plot the reconstruction error vs stabilities and select the optimum number of signatures ####################################################
Example #3
Source File: agglomerative.py    From aggregation with Apache License 2.0
def __agglomerative__(self,markings):
        """
        runs an initial agglomerative clustering over the given markings
        :param markings:
        :return:
        """
        # convert the markings into pandas format - there is probably a better way to do this,
        # but the labels do seem necessary
        labels = [str(i) for i in markings]
        param_labels = [str(i) for i in range(len(markings[0]))]

        df = pd.DataFrame(np.array(markings), columns=param_labels, index=labels)
        row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')), columns=labels, index=labels)
        # use Ward's method to do the actual clustering
        row_clusters = linkage(row_dist, method='ward')

        return row_clusters 
Example #4
Source File: marker.py    From scprep with GNU General Public License v3.0
def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels, s, c):
    # cluster tissues hierarchically using mean size and color
    tissue_features = []
    for tissue in tissue_names:
        tissue_data = []
        for cluster in cluster_names:
            tissue_cluster_idx = np.where(
                (np.array(tissue_labels) == tissue)
                & (np.array(cluster_labels) == cluster)
            )
            tissue_data.append(
                np.vstack([s[tissue_cluster_idx], c[tissue_cluster_idx]]).mean(axis=1)
            )
        tissue_features.append(np.concatenate(tissue_data))
    tissue_features = np.array(tissue_features)
    # normalize
    tissue_features = tissue_features / np.sqrt(np.sum(tissue_features ** 2))
    tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features))
    return tissues_order 
Example #5
Source File: plot.py    From pypath with GNU General Public License v3.0
def make_plot(self):

        self.z = hc.linkage(self.data, method='average')

        self.ax = self.fig.add_subplot(1, 1, 1)

        self.dendro = \
            hc.dendrogram(self.z,
                          labels=self.data.columns,
                          color_threshold=0,
                          orientation='left',
                          ax=self.ax,
                          link_color_func=lambda x: self.color)

        _ = [
            tl.set_fontproperties(self.fp_ticklabel)
            for tl in self.ax.get_yticklabels()
        ]
        _ = [
            tl.set_fontproperties(self.fp_ticklabel)
            for tl in self.ax.get_xticklabels()
        ]

        self.ax.xaxis.grid(True, color='#FFFFFF', lw=1, ls='solid')
        self.ax.yaxis.grid(False)
        self.ax.set_axisbelow(True)
        self.ax.set_facecolor('#EAEAF2')
        list(map(lambda s: s.set_lw(0), self.ax.spines.values()))
        self.ax.tick_params(which='both', length=0) 
Example #6
Source File: cluster.py    From cesi with Apache License 2.0
def getClusters(self, embed):

		n, m 	= len(embed), self.p.embed_dims
		X 	= np.empty((n, m), np.float32)

		for i in range(len(embed)): 
			X[i, :] = embed[i]

		dist 	  = pdist(X, 	  metric=self.p.metric)
		clust_res = linkage(dist, method=self.p.linkage)
		labels    = fcluster(clust_res, t=self.p.thresh_val, criterion='distance') - 1
		clusters  = [[] for i in range(max(labels) + 1)]

		for i in range(len(labels)): 
			clusters[labels[i]].append(i)

		return clusters 
Example #7
Source File: diarizationFunctions.py    From pyBK with MIT License
def performClusteringLinkage(segmentBKTable, segmentCVTable, N_init, linkageCriterion,linkageMetric ):
    from scipy.cluster.hierarchy import linkage
    from scipy import cluster
    if linkageMetric == 'jaccard':
      observations = segmentBKTable
    elif linkageMetric == 'cosine':
      observations = segmentCVTable
    else:
      observations = segmentCVTable      
    clusteringTable = np.zeros([np.size(segmentCVTable,0),N_init]) 
    Z = linkage(observations,method=linkageCriterion,metric=linkageMetric)
    for i in np.arange(N_init):
      clusteringTable[:,i] = cluster.hierarchy.cut_tree(Z,N_init-i).T+1  
    k=N_init
    print('done')
    return clusteringTable, k 
Example #8
Source File: allocation.py    From finance_ml with MIT License
def get_hrp(cov, corr):
    """Construct a hierarchical portfolio
    
    Params
    ------
    cov: pd.DataFrame
    corr: pd.DataFrame
    
    Returns
    -------
    pd.Series
    """
    dist = get_corr_dist(corr)
    link = sch.linkage(dist, 'single')
    sort_idx = get_quasi_diag(link)
    # Recover label
    sort_idx = corr.index[sort_idx].tolist()
    hrp = get_rec_bipart(cov, sort_idx)
    return hrp.sort_index() 
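A hedged usage sketch for get_hrp(): the covariance and correlation matrices would typically be computed from an asset-returns DataFrame, and get_corr_dist(), get_quasi_diag(), and get_rec_bipart() are assumed to be the neighbouring helpers in finance_ml's allocation.py (not shown here). The random returns are purely illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
returns = pd.DataFrame(rng.normal(size=(500, 5)),
                       columns=['A', 'B', 'C', 'D', 'E'])
hrp_weights = get_hrp(returns.cov(), returns.corr())  # pd.Series of portfolio weights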
Example #9
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cluster_distances_with_distance_threshold():
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))
    # check the distances within the clusters and with other clusters
    distance_threshold = 4
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="single").fit(X)
    labels = clustering.labels_
    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)
    for label in np.unique(labels):
        in_cluster_mask = labels == label
        max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask]
                                   .min(axis=0).max())
        min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask]
                                    .min(axis=0).min())
        # single data point clusters only have that inf diagonal here
        if in_cluster_mask.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
        assert min_out_cluster_distance >= distance_threshold 
Example #10
Source File: ttclust.py    From TTClust with GNU General Public License v3.0
def generate_graphs(clusters_list, output, size, linkage, cutoff, distances, traj):
    """
    DESCRIPTION
    Create a linear cluster mapping graph where every frame is printed as a
    colored barplot
    Args:
        clusters_list (list): list of cluster
        output (string): output name for graph
        size (int): number of frames
        linkage (numpy array): matrix linkage
        cutoff (float): cutoff distance value for clustering (in the dendrogram)
        distances(numpy array): distance matrix
        traj (Trajectory): trajectory for time usage in axis barplot
    Return:
        colors_list (list) to be used with 2D distance projection graph
    """
    colors_list = plot_barplot(clusters_list, output, size, traj)
    plot_dendro(linkage, output, cutoff, colors_list, clusters_list)
    plot_hist(clusters_list, output, colors_list)
    if (distances.shape[0] < 10000):
        plot_distmat(distances, output)
    else:
        printScreenLogfile("Too many frames! The RMSD distance matrix will not be generated")
    return colors_list 
Example #11
Source File: common.py    From plastering with MIT License
def hier_clustering(d, threshold=3):
    srcids = d.keys()
    tokenizer = lambda x: x.split()
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    assert isinstance(d, dict)
    assert isinstance(list(d.values())[0], list)
    assert isinstance(list(d.values())[0][0], str)
    doc = [' '.join(d[srcid]) for srcid in srcids]
    vect = vectorizer.fit_transform(doc)
    #TODO: Make vect aligned to the required format
    z = linkage(vect.toarray(), metric='cityblock', method='complete')
    dists = list(set(z[:,2]))
#    threshold = 3
    #threshold = (dists[2] + dists[3]) / 2
    b = hier.fcluster(z, threshold, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[str(cluster_id)].append(srcid)
    value_lengther = lambda x: len(x[1])
    return OrderedDict(\
               sorted(cluster_dict.items(), key=value_lengther, reverse=True)) 
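A hedged usage sketch for hier_clustering(), assuming the module-level imports in plastering's common.py (TfidfVectorizer, linkage, hier, defaultdict, OrderedDict); the source ids and token lists are invented for illustration.

tokens = {
    'srcid1': ['zone', 'temp', 'sensor'],
    'srcid2': ['zone', 'temp', 'setpoint'],
    'srcid3': ['supply', 'air', 'flow'],
}
clusters = hier_clustering(tokens, threshold=3)
# OrderedDict mapping cluster ids (as strings) to lists of source ids,
# largest cluster first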
Example #12
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X,
                                                   connectivity,
                                                   'euclidean')

    for linkage in ('single', 'average', 'average', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1) 
Example #13
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1) 
Example #14
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_unstructured_linkage_tree():
    # Check that we obtain the correct solution for unstructured linkage trees.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 100)
    for this_X in (X, X[0]):
        # With specified a number of clusters just for the sake of
        # raising a warning and testing the warning code
        with ignore_warnings():
            children, n_nodes, n_leaves, parent = assert_warns(
                UserWarning, ward_tree, this_X.T, n_clusters=10)
        n_nodes = 2 * X.shape[1] - 1
        assert_equal(len(children) + n_leaves, n_nodes)

    for tree_builder in _TREE_BUILDERS.values():
        for this_X in (X, X[0]):
            with ignore_warnings():
                children, n_nodes, n_leaves, parent = assert_warns(
                    UserWarning, tree_builder, this_X.T, n_clusters=10)

            n_nodes = 2 * X.shape[1] - 1
            assert_equal(len(children) + n_leaves, n_nodes) 
Example #15
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_structured_linkage_tree():
    # Check that we obtain the correct solution for structured linkage trees.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = \
            tree_builder(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes
        # Check that ward_tree raises a ValueError with a connectivity matrix
        # of the wrong shape
        assert_raises(ValueError,
                      tree_builder, X.T, np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        assert_raises(ValueError,
                      tree_builder, X.T[:0], connectivity) 
Example #16
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering on a precomputed distances matrix
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0]) 
Example #17
Source File: _clustergram.py    From dash-bio with MIT License
def _get_clusters(self):
        """Cluster the data according to the specified dimensions.

        Returns:
        - tuple: The linkage matrices for the columns and/or rows.
        """

        Zcol = None
        Zrow = None

        # cluster along columns
        if self._cluster in ["col", "all"]:
            tmp = np.transpose(self._data)
            dcol = self._dist_fun(tmp, metric=self._col_dist)
            Zcol = self._link_fun(dcol, optimal_ordering=self._optimal_leaf_order)
        # cluster along rows only if 'all' is selected
        if self._cluster in ["row", "all"]:
            drow = self._dist_fun(self._data, metric=self._row_dist)
            Zrow = self._link_fun(drow, optimal_ordering=self._optimal_leaf_order)

        return (Zcol, Zrow) 
Example #18
Source File: sqtl.py    From pancanatlas_code_public with MIT License
def get_col_linkage(combined_df, method='ward', metric='cosine'):
    CACHE_DIR = os.path.expanduser('~/cache/alt_splice_heatmap/sqtl')
    if not os.path.exists(CACHE_DIR): os.makedirs(CACHE_DIR)
    col_linkage_cache_path = os.path.join(CACHE_DIR, 'col_linkage_%s_%s.npy' %(method, metric))
    idx_linkage_cache_path = os.path.join(CACHE_DIR, 'idx.npy')
    col_name_cache_path = os.path.join(CACHE_DIR, 'col_names.npy')
    if os.path.exists(col_linkage_cache_path):
        print "Loading linkage from %s" %col_linkage_cache_path
        col_linkage = np.load(col_linkage_cache_path)
        assert np.array_equal(np.load(idx_linkage_cache_path), combined_df.index)
        assert np.array_equal(np.load(col_name_cache_path), combined_df.columns)
    else:
        print "Calculating linkage"
        col_linkage = hc.linkage(sp.distance.pdist(combined_df.values.T), method=method, metric=metric)
        np.save(col_linkage_cache_path, col_linkage)
        np.save(idx_linkage_cache_path, combined_df.index)
        np.save(col_name_cache_path, combined_df.columns)
    return col_linkage 
Example #19
Source File: zodiac.py    From plastering with MIT License
def create_cluster_map(self, bow, srcids):
        cluster_map = {}
        z = linkage(bow, metric='cityblock', method='complete')
        dists = list(set(z[:, 2]))
        thresh = (dists[1] + dists[2]) / 2
        self.logger.info('Threshold: {0}'.format(thresh))
        b = hier.fcluster(z, thresh, criterion='distance')
        assert bow.shape[0] == len(b)
        assert len(b) == len(srcids)
        for cid, srcid in zip(b, srcids):
            cluster_map[cid] = cluster_map.get(cid, []) + [srcid]

        self.logger.info('# of clusters: {0}'.format(len(b)))
        self.logger.info('sizes of clusters: {0}'.format(sorted(map(len, cluster_map.values()))))

        return cluster_map 
Example #20
Source File: sdm.py    From scedar with MIT License
def sort_x_by_d(x, dmat=None, metric="cosine", linkage="auto",
                    n_eval_rounds=None, optimal_ordering=False,
                    nprocs=None, verbose=False):
        dmat = SampleDistanceMatrix(x, d=dmat, metric=metric,
                                    nprocs=nprocs)._d
        xhct = HClustTree.hclust_tree(dmat, linkage="auto",
                                      is_euc_dist=(metric == "euclidean"),
                                      optimal_ordering=optimal_ordering)
        return xhct.leaf_ids() 
Example #21
Source File: precluster.py    From texta with GNU General Public License v3.0
def __init__(self,words, vectors, number_of_steps = 21,metric="cosine",linkage="complete"):
        self.words = words
        self.vectors = vectors
        self.number_of_steps = number_of_steps
        self.metric = metric
        self.linkage = linkage 
Example #22
Source File: utils.py    From lens with Apache License 2.0
def hierarchical_ordering_indices(columns, correlation_matrix):
    """Return array with hierarchical cluster ordering of columns

    Parameters
    ----------
    columns: iterable of str
        Names of columns.
    correlation_matrix: np.ndarray
        Matrix of correlation coefficients between columns.

    Returns
    -------
    indices: iterable of int
        Indices with order of columns
    """
    if len(columns) > 2:
        pairwise_dists = distance.pdist(
            np.where(np.isnan(correlation_matrix), 0, correlation_matrix),
            metric="euclidean",
        )
        linkage = hierarchy.linkage(pairwise_dists, method="average")
        dendogram = hierarchy.dendrogram(
            linkage, no_plot=True, color_threshold=-np.inf
        )
        idx = dendogram["leaves"]
    else:
        idx = list(range(len(columns)))

    return idx 
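A hedged usage sketch, assuming the module imports used by the function above (numpy as np, scipy.spatial.distance as distance, scipy.cluster.hierarchy as hierarchy); the correlation matrix is random and only for illustration.

import numpy as np

rng = np.random.default_rng(0)
columns = ['a', 'b', 'c', 'd']
correlation_matrix = np.corrcoef(rng.normal(size=(4, 100)))
order = hierarchical_ordering_indices(columns, correlation_matrix)
# order is a permutation of [0, 1, 2, 3] that places correlated columns next to each other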
Example #23
Source File: precluster.py    From texta with GNU General Public License v3.0
def __call__(self):
        if len(self.words) == 0 or len(self.vectors) == 0:
            return []
        if len(self.words) == 1:
            self.words.append(self.words[0])
            self.vectors.append(self.vectors[0])

        distance_matrix = scidist.pdist(np.array(self.vectors),self.metric)
        linkage_matrix = hier.linkage(distance_matrix,self.linkage)

        dendrogram = self._linkage_matrix_to_dendrogram(linkage_matrix,self.words,self.vectors)
        clusterings = self._create_clusterings(dendrogram)
        return [[(node.label,node.vector) for node in _get_cluster_nodes(cluster)] for cluster in self._find_optimal_clustering(clusterings)] 
Example #24
Source File: corClust.py    From Kitsune-py with MIT License
def cluster(self,maxClust):
        D = self.corrDist()
        Z = linkage(D[np.triu_indices(self.n, 1)])  # create a linkage matrix based on the distance matrix
        if maxClust < 1:
            maxClust = 1
        if maxClust > self.n:
            maxClust = self.n
        map = self.__breakClust__(to_tree(Z),maxClust)
        return map

    # a recursive helper function which breaks down the dendrogram branches until all clusters have no more than maxClust elements 
Example #25
Source File: sdm.py    From scedar with MIT License
def hclust_linkage(dmat, linkage="complete", n_eval_rounds=None,
                       is_euc_dist=False, optimal_ordering=False,
                       verbose=False):
        dmat = np.array(dmat, dtype="float")
        dmat = SampleDistanceMatrix.num_correct_dist_mat(dmat)

        n = dmat.shape[0]

        if linkage == "auto":
            try_linkages = ("single", "complete", "average", "weighted")

            if is_euc_dist:
                try_linkages += ("centroid", "median", "ward")

            if n_eval_rounds is None:
                n_eval_rounds = int(np.ceil(np.log2(n)))
            else:
                n_eval_rounds = int(np.ceil(max(np.log2(n), n_eval_rounds)))

            ltype_mdl_list = []
            for iter_ltype in try_linkages:
                iter_lhct = HClustTree.hclust_tree(dmat, linkage=iter_ltype)
                iter_nbp_cnt_list = iter_lhct.n_round_bipar_cnt(n_eval_rounds)
                iter_nbp_mdl_arr = np.array(list(map(
                    lambda x: MultinomialMdl(np.array(x)).mdl,
                    iter_nbp_cnt_list)))
                iter_nbp_mdl = np.sum(
                    iter_nbp_mdl_arr / np.arange(1, n_eval_rounds + 1))
                ltype_mdl_list.append(iter_nbp_mdl)

            linkage = try_linkages[ltype_mdl_list.index(max(ltype_mdl_list))]

            if verbose:
                print(linkage, tuple(zip(try_linkages, ltype_mdl_list)),
                      sep="\n")

        dmat_sf = spspatial.distance.squareform(dmat)
        hac_z = sch.linkage(dmat_sf, method=linkage,
                            optimal_ordering=optimal_ordering)
        return hac_z 
Example #26
Source File: ontobio-assoc.py    From ontobio with BSD 3-Clause "New" or "Revised" License
def plot_subject_term_matrix(ont, aset, args):
    import numpy as np
    import pandas as pd
    import scipy.cluster.hierarchy as sch
    import scipy.spatial as scs
    df = aset.as_dataframe(subjects=args.subjects)
    print('DF={}'.format(df))
    d = scs.distance.pdist(df)
    Z = sch.linkage(d, method='complete')
    P = sch.dendrogram(Z)
    print(P) 
Example #27
Source File: LogClustering.py    From loglizer with MIT License
def _offline_clustering(self, X):
        print('Starting offline clustering...')
        p_dist = pdist(X, metric=self._distance_metric)
        Z = linkage(p_dist, 'complete')
        cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))
        # print('The representive vectors are:')
        # pprint.pprint(self.representatives.tolist()) 
Example #28
Source File: common.py    From plastering with MIT License
def get_word_clusters(sentence_dict):
    srcids = list(sentence_dict.keys())
    sentences = []
    for srcid in srcids:
        sentence = []
        for metadata_type, sent in sentence_dict[srcid].items():
            sentence.append(''.join(sent))
        sentence = '\n'.join(sentence)
        sentence = ' '.join(re.findall('[a-z]+', sentence))
        sentences.append(sentence)
    vect = TfidfVectorizer()
    #vect = CountVectorizer()
    bow = vect.fit_transform(sentences).toarray()
    try:
        z = linkage(bow, metric='cityblock', method='complete')
    except:
        pdb.set_trace()
    dists = list(set(z[:,2]))
    thresh = (dists[2] + dists[3]) /2
    #thresh = (dists[1] + dists[2]) /2
    print("Threshold: ", thresh)
    b = hier.fcluster(z,thresh, criterion='distance')
    cluster_dict = defaultdict(list)

    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[cluster_id].append(srcid)
    return dict(cluster_dict) 
Example #29
Source File: rep_dists.py    From pancanatlas_code_public with MIT License
def heatmap_dists(data, norm=False, labels=None, metric='euclidean', method='ward'):
    fig, (ax, cax) = plt.subplots(ncols=2,figsize=(7 * 1.05 ,7),
                                  gridspec_kw={"width_ratios":[1, 0.05]})

    if labels is None:
        try:
            labels = data.index
        except AttributeError:
            pass

    n = data.shape[0]
    assert labels is None or len(labels) == n

    dists = ssd.pdist(data, metric=metric)
    linkage = sch.linkage(dists, metric=metric, method=method)
    dendro = sch.dendrogram(linkage, no_plot=True)
    order = dendro['leaves']
    sq_form_dists = ssd.squareform(dists)[order][:, order]
    assert sq_form_dists.shape == (n,n)

    hmap = ax.imshow(sq_form_dists, aspect='auto')
    ax.set_xticks(np.arange(n))
    ax.set_yticks(np.arange(n))
    if labels is not None:
        ax.set_xticklabels(labels[order], rotation=90)
        ax.set_yticklabels(labels[order])
    cb = plt.colorbar(hmap, cax=cax)
    return fig, (ax, cax)
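A hedged usage sketch for heatmap_dists(), assuming the module imports in rep_dists.py (matplotlib.pyplot as plt, numpy as np, scipy.spatial.distance as ssd, scipy.cluster.hierarchy as sch); the data, labels, and output path are invented for illustration. Note that labels should support fancy indexing (a numpy array or pandas Index), because the function reorders them with labels[order].

import numpy as np

data = np.random.rand(12, 5)
labels = np.array(['sample_%d' % i for i in range(12)])
fig, (ax, cax) = heatmap_dists(data, labels=labels, metric='euclidean', method='ward')
fig.savefig('rep_dists_heatmap.png')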

