Python sklearn.cluster.AffinityPropagation() Examples
The following are 13 code examples of sklearn.cluster.AffinityPropagation(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cluster, or try the search function.
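Before the project examples, here is a minimal, self-contained sketch of the typical fit/predict pattern. It is not taken from any of the projects below; the synthetic data and the preference value are purely illustrative.

# Minimal usage sketch (illustrative, not from any project below):
# cluster synthetic blobs and inspect the exemplars chosen by affinity propagation.
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)

ap = AffinityPropagation(preference=-50)   # lower preference -> fewer exemplars/clusters
labels = ap.fit_predict(X)

print('estimated clusters:', len(ap.cluster_centers_indices_))
print('first ten labels:', labels[:10])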
Example #1
Source File: document_clustering.py From text-analytics-with-python with Apache License 2.0 | 6 votes |
def affinity_propagation(feature_matrix):
    sim = feature_matrix * feature_matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters

# get clusters using affinity propagation
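A hedged sketch of how this helper might be driven, assuming affinity_propagation and its AffinityPropagation import are in scope, and assuming the older scikit-learn stack the book targets (the helper passes the np.matrix returned by .todense() to fit(), which newer scikit-learn releases reject). The corpus is illustrative, not the book's data.

# Illustrative only: build a sparse TF-IDF matrix and feed it to the helper above.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the cat sat on the mat',
          'dogs and cats living together',
          'the stock market fell sharply',
          'investors sold shares in a panic']

feature_matrix = TfidfVectorizer().fit_transform(corpus)   # sparse matrix, one row per document
ap_model, clusters = affinity_propagation(feature_matrix)
print(clusters)   # one cluster label per document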
Example #2
Source File: test_cluster.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_affinity_propagation_class(self):
    from sklearn.datasets.samples_generator import make_blobs

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=300, centers=centers,
                                cluster_std=0.5, random_state=0)

    df = pdml.ModelFrame(data=X, target=labels_true)
    af = df.cluster.AffinityPropagation(preference=-50)
    df.fit(af)

    af2 = cluster.AffinityPropagation(preference=-50).fit(X)

    tm.assert_numpy_array_equal(af.cluster_centers_indices_,
                                af2.cluster_centers_indices_)
    tm.assert_numpy_array_equal(af.labels_, af2.labels_)
Example #3
Source File: test_cluster.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_Classifications(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    models = ['AffinityPropagation', 'MeanShift']
    for model in models:
        mod1 = getattr(df.cluster, model)()
        mod2 = getattr(cluster, model)()

        df.fit(mod1)
        mod2.fit(iris.data)

        result = df.predict(mod1)
        expected = mod2.predict(iris.data)

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)
Example #4
Source File: cluster.py From poem with MIT License | 5 votes |
def cluster(X):
    X = preprocessing.normalize(X, norm='l2')
    distance = X.dot(X.transpose())
    c = AffinityPropagation(affinity="precomputed")
    y = c.fit_predict(distance)
    return y
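A note on the design choice: with affinity="precomputed", fit_predict expects a square matrix in which larger entries mean more similar, which is why the rows are L2-normalized first so that X.dot(X.T) yields cosine similarities. Below is a hedged usage sketch, assuming the cluster() helper above and its imports (preprocessing, AffinityPropagation) are in scope; the synthetic data is purely illustrative.

# Illustrative call to the cluster() helper above on synthetic blob data.
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.3, random_state=0)
labels = cluster(X)   # builds the cosine-similarity matrix internally
print(labels)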
Example #5
Source File: sense_clusterer.py From yelp with GNU Lesser General Public License v2.1 | 5 votes |
def cluster_affinity_propagation(similarity_matrix, desired_keys=None):
    numpy_matrix = similarity_matrix_to_numpy(similarity_matrix, desired_keys)
    clusterer = AffinityPropagation()
    return clusterer.fit_predict(numpy_matrix)
Example #6
Source File: compare_clustering_algs.py From mmvt with GNU General Public License v3.0 | 5 votes |
def compare(data, n_groups, output_fol):
    # plot_clusters(data.astype(np.float), scipy.cluster.vq.kmeans, 'scipy.cluster.vq.kmeans', output_fol, (n_groups,), {})
    plot_clusters(data, cluster.KMeans, 'KMeans', output_fol, (), {'n_clusters': n_groups})
    for ct in ['spherical', 'tied', 'diag', 'full']:
        plot_clusters(data, mixture.GaussianMixture, 'GMM_{}'.format(ct), output_fol, (),
                      {'n_components': n_groups, 'covariance_type': ct})
    plot_clusters(data, cluster.AffinityPropagation, 'AffinityPropagation', output_fol, (),
                  {'preference': -5.0, 'damping': 0.95})
    plot_clusters(data, cluster.MeanShift, 'MeanShift', output_fol, (0.175,), {'cluster_all': False})
    plot_clusters(data, cluster.SpectralClustering, 'SpectralClustering', output_fol, (),
                  {'n_clusters': n_groups})
    plot_clusters(data, cluster.AgglomerativeClustering, 'AgglomerativeClustering', output_fol, (),
                  {'n_clusters': n_groups, 'linkage': 'ward'})
    plot_clusters(data, cluster.DBSCAN, 'DBSCAN', output_fol, (), {'eps': 0.025})
    # plot_clusters(data, hdbscan.HDBSCAN, 'HDBSCAN', output_fol, (), {'min_cluster_size': 15})
Example #7
Source File: lexrankr.py From lexrankr with MIT License | 5 votes |
def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan', tagger='twitter',
             useful_tags=['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number'],
             delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko,
             no_below_word_count=2, no_above_word_portion=0.85, max_dictionary_size=None,
             min_cluster_size=2, similarity_threshold=0.85, matrix_smoothing=False,
             n_clusters=None, compactify=True, **kwargs):
    self.decay_window = decay_window
    self.decay_alpha = decay_alpha
    if similarity == 'cosine':  # very, very slow :(
        self.vectorizer = DictVectorizer()
        self.uniform_sim = self._sim_cosine
    elif similarity == 'jaccard':
        self.uniform_sim = self._sim_jaccard
    elif similarity == 'normalized_cooccurrence':
        self.uniform_sim = self._sim_normalized_cooccurrence
    else:
        raise LexRankError("available similarity functions are: cosine, jaccard, normalized_cooccurrence")
    self.sim = lambda sentence1, sentence2: self.decay(sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
    self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters,
                                   min_token_length=min_token_length, stopwords=stopwords, **kwargs)
    if clustering == 'birch':
        self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
        self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
    elif clustering == 'dbscan':
        self._dbscan = DBSCAN()
        self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
    elif clustering == 'affinity':
        self._affinity = AffinityPropagation()
        self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
    elif clustering is None:
        self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
    else:
        raise LexRankError("available clustering algorithms are: birch, markov, no-clustering(use `None`)")
    self.no_below_word_count = no_below_word_count
    self.no_above_word_portion = no_above_word_portion
    self.max_dictionary_size = max_dictionary_size
    self.similarity_threshold = similarity_threshold
    self.min_cluster_size = min_cluster_size
    self.matrix_smoothing = matrix_smoothing
    self.compactify = compactify
Example #8
Source File: song2vec_operator.py From MusicTaster with MIT License | 4 votes |
def cluster_song_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False):
    """
    Get song-clustering information for a single playlist
    Args:
        playlist_id: playlist id
        cluster_n: number of clusters
        is_detailed: whether the returned result includes details
    Returns:
        the clustered list
    """
    playlist_obj = playlist_detail(playlist_id)
    song_list = []
    vec_list = []
    song_info_dict = {}
    ap_cluster = AffinityPropagation()
    data_process_logger.info('clustering playlist: %s' % playlist_obj['name'])
    for item in playlist_obj['tracks']:
        song = item['name'].lower()
        song_info_dict[song] = {
            'name': song,
            'artist': item['artists'][0]['name'],
            'id': item['id'],
            'album_img_url': item['album']['picUrl'],
            'site_url': 'http://music.163.com/#/song?id=%s' % item['id']
        }
        # print song
        if song not in song_list:
            song_list.append(song)
            # print self.song2vec_model.vocab.get(song)
            # print self.song2vec_model.syn0norm == None
            if self.song2vec_model.vocab.get(song) and len(self.song2vec_model.syn0norm):
                song_vec = self.song2vec_model.syn0norm[self.song2vec_model.vocab[song].index]
            else:
                data_process_logger.warn(
                    'The song %s of playlist-%s is not in dataset' % (song, playlist_obj['name']))
                song_vec = [0 for i in range(self.song2vec_model.vector_size)]
            vec_list.append(song_vec)
    # song_list = list(song_list)
    if len(vec_list) > 1:
        cluster_result = ap_cluster.fit(vec_list, song_list)
        cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))]
        for i in range(len(cluster_result.labels_)):
            label = cluster_result.labels_[i]
            index = i
            cluster_array[label].append(song_list[i])
        return cluster_array, playlist_obj['name'], song_info_dict
    else:
        return [song_list], playlist_obj['name'], song_info_dict
Example #9
Source File: song2vec_operator.py From MusicTaster with MIT License | 4 votes |
def cluster_artist_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False):
    """
    Get artist-clustering information for a single playlist
    Args:
        playlist_id: playlist id
        cluster_n: number of clusters
        is_detailed: whether to include detail information
    Returns:
        the clustered list
    """
    playlist_obj = playlist_detail(playlist_id)
    artist_list = []
    vec_list = []
    ap_cluster = AffinityPropagation()
    data_process_logger.info('clustering playlist: %s' % playlist_obj['name'])
    for item in playlist_obj['tracks']:
        artist = item['artists'][0]['name'].lower()
        # print artist
        if artist not in artist_list:
            artist_list.append(artist)
            # print self.song2vec_model.vocab.get(artist)
            # print self.song2vec_model.syn0norm == None
            if self.artist2vec_model.vocab.get(artist) and len(self.artist2vec_model.syn0norm):
                artist_vec = self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index]
            else:
                data_process_logger.warn(
                    'The artist %s of playlist-%s is not in dataset' % (artist, playlist_obj['name']))
                artist_vec = [0 for i in range(self.artist2vec_model.vector_size)]
            vec_list.append(artist_vec)
    # artist_list = list(artist_list)
    # vec_list = list(vec_list)
    if len(vec_list) > 1:
        cluster_result = ap_cluster.fit(vec_list, artist_list)
        cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))]
        for i in range(len(cluster_result.labels_)):
            label = cluster_result.labels_[i]
            index = i
            cluster_array[label].append(artist_list[i])
        return cluster_array, playlist_obj['name'], {}
    else:
        return [artist_list], playlist_obj['name'], {}
Example #10
Source File: context_utils.py From yelp with GNU Lesser General Public License v2.1 | 4 votes |
def build_groups2(nouns):
    print('building groups', time.strftime("%H:%M:%S"))
    all_senses = set()
    sense_word_map = {}
    for noun in nouns:
        senses = wordnet.synsets(noun, pos='n')
        all_senses.update(senses)
        for sense in senses:
            if sense.name() not in sense_word_map:
                sense_word_map[sense.name()] = []
            sense_word_map[sense.name()].append(noun)
    all_senses = list(all_senses)
    all_senses_names = [sense.name() for sense in all_senses]
    print('number of senses:', len(all_senses))

    sense_similarity_matrix, sense_similarity_matrix_columns =\
        get_sense_similarity_submatrix(all_senses_names)
    print('submatrix ready', time.strftime("%H:%M:%S"))

    # affinity_propagation = AffinityPropagation()
    # labels1 = affinity_propagation.fit_predict(sense_similarity_matrix)
    # print('affinity propagation ready', time.strftime("%H:%M:%S"))

    grouper = BaumanSensesGrouper(sense_similarity_matrix, 0.7)
    groups = grouper.group_senses()
    print('groups')
    # print(groups)
    new_groups = []
    for group in groups:
        new_group = set()
        for element in group:
            sense_name = sense_similarity_matrix_columns[element]
            new_group.add(sense_name)
        new_groups.append(new_group)
    print('finished groups', time.strftime("%H:%M:%S"))
    # print(groups)
    # print(new_groups)
    print('num groups: %d' % len(groups))

    sense_groups = []
    for group in new_groups:
        sense_group = SenseGroup(group)
        for sense in sense_group.senses:
            sense_group.nouns |= set(sense_word_map[sense])
        sense_groups.append(sense_group)
    return sense_groups
Example #11
Source File: context_utils.py From yelp with GNU Lesser General Public License v2.1 | 4 votes |
def evaluate_clustering():
    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')

    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('print dbscan')

    distance_matrix = np.ndarray((matrix_size, matrix_size))
    for i in range(matrix_size):
        for j in range(matrix_size):
            distance_matrix[i, j] = 1 - similarity_matrix[i, j]
    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])
    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)
    print(cluster_map1)
    print(cluster_map2)

    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    print('Number of clusters Affinity Propagation: %f' % len(cluster_map1))
    print('Number of clusters DBSCAN: %f' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silouhette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silouhette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)

# start = time.time()
# main()
# evaluate_clustering()
# get_similarity_submatrix()
# end = time.time()
# total_time = end - start
# print("Total time = %f seconds" % total_time)
Example #12
Source File: exposons.py From enspara with GNU General Public License v3.0 | 4 votes |
def exposons_from_sasas(sasas, damping, weights, threshold):
    """Compute exposons for an MDTraj trajectory.

    This function is a convenience wrapper to compute exposons using
    other functions already existing in MDTraj, sklearn, and elsewhere
    in enspara.

    Parameters
    ----------
    sasas: np.ndarray, shape=(n_conformations, n_sidechains)
        SASAs to use in the calculations.
    damping: float
        Damping parameter to use for affinity propagation. Goes from
        0.5 to <1.0. Empirically, values between 0.85 and 0.95 tend to
        work best.
    weights: ndarray, shape=(len(trj),), default=None
        Weight of each frame in the simulation for the mutual
        information calculation. Useful if `trj` represents cluster
        centers of an MSM rather than a full trajectory. If None,
        frames will be weighted equally.
    threshold: float, default=0.02
        Sidechains with greater than this amount of total SASA will
        count as exposed for the purposes of the exposed/buried
        dichotomy used in mutual information calculations.

    Returns
    -------
    sasa_mi: np.ndarray, shape=(n_res, n_res)
        Mutual information of each sidechain with each other sidechain
        computed for the purposes of clustering exposons.
    exposons: np.ndarray, shape=(n_res,)
        Assignment of residues to exposons. Residues in the same
        exposon share the same number in this array.
    """

    sasa_mi = weighted_mi(sasas > threshold, weights)

    c = AffinityPropagation(
        damping=damping,
        affinity='precomputed',
        preference=0,
        max_iter=10000)
    c.fit(sasa_mi)

    return sasa_mi, c.labels_
Example #13
Source File: test_cluster.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 3 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.cluster.AffinityPropagation, cluster.AffinityPropagation)
    self.assertIs(df.cluster.AgglomerativeClustering, cluster.AgglomerativeClustering)
    self.assertIs(df.cluster.Birch, cluster.Birch)
    self.assertIs(df.cluster.DBSCAN, cluster.DBSCAN)
    self.assertIs(df.cluster.FeatureAgglomeration, cluster.FeatureAgglomeration)
    self.assertIs(df.cluster.KMeans, cluster.KMeans)
    self.assertIs(df.cluster.MiniBatchKMeans, cluster.MiniBatchKMeans)
    self.assertIs(df.cluster.MeanShift, cluster.MeanShift)
    self.assertIs(df.cluster.SpectralClustering, cluster.SpectralClustering)
    self.assertIs(df.cluster.bicluster.SpectralBiclustering,
                  cluster.bicluster.SpectralBiclustering)
    self.assertIs(df.cluster.bicluster.SpectralCoclustering,
                  cluster.bicluster.SpectralCoclustering)