Python sklearn.cluster.AffinityPropagation() Examples

The following are 13 code examples of sklearn.cluster.AffinityPropagation(), collected from open-source projects. Each example names its original project and source file, so you can trace it back to its full context. You may also want to check out all of the other available functions and classes of the sklearn.cluster module.
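Before the project examples, here is a minimal, self-contained sketch of the basic API. It is not taken from any of the projects below, and the random_state parameter assumes scikit-learn >= 0.23:

import numpy as np
from sklearn.cluster import AffinityPropagation

# six 2-D points forming two visually obvious groups
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])
clustering = AffinityPropagation(random_state=5).fit(X)
print(clustering.labels_)                   # cluster label for each sample
print(clustering.cluster_centers_indices_)  # indices of the chosen exemplars

Unlike KMeans, AffinityPropagation picks the number of clusters itself by selecting exemplar points, which is why no n_clusters argument appears anywhere in the examples below.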
Example #1
Source File: document_clustering.py    From text-analytics-with-python with Apache License 2.0
from sklearn.cluster import AffinityPropagation


def affinity_propagation(feature_matrix):
    # get clusters using affinity propagation
    sim = feature_matrix * feature_matrix.T  # pairwise similarities via sparse dot product
    sim = sim.toarray()  # .toarray() keeps a plain ndarray; newer scikit-learn rejects the np.matrix that .todense() returns
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters
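A hypothetical call to the function above (the TF-IDF pipeline is an illustrative assumption, not part of the original file). Because TfidfVectorizer L2-normalizes rows by default, feature_matrix * feature_matrix.T yields a cosine-similarity matrix:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the cat sat on the mat',
          'dogs chase cats',
          'stocks fell sharply today']
feature_matrix = TfidfVectorizer().fit_transform(corpus)  # sparse matrix
ap_model, cluster_labels = affinity_propagation(feature_matrix)
print(cluster_labels)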
Example #2
Source File: test_cluster.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_affinity_propagation_class(self):
        # samples_generator was removed in scikit-learn 0.24; on newer
        # versions import make_blobs directly from sklearn.datasets
        from sklearn.datasets.samples_generator import make_blobs

        centers = [[1, 1], [-1, -1], [1, -1]]
        X, labels_true = make_blobs(n_samples=300, centers=centers,
                                    cluster_std=0.5, random_state=0)

        df = pdml.ModelFrame(data=X, target=labels_true)
        af = df.cluster.AffinityPropagation(preference=-50)
        df.fit(af)

        af2 = cluster.AffinityPropagation(preference=-50).fit(X)

        tm.assert_numpy_array_equal(af.cluster_centers_indices_,
                                    af2.cluster_centers_indices_)
        tm.assert_numpy_array_equal(af.labels_, af2.labels_) 
Example #3
Source File: test_cluster.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_Classifications(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        models = ['AffinityPropagation', 'MeanShift']
        for model in models:
            mod1 = getattr(df.cluster, model)()
            mod2 = getattr(cluster, model)()

            df.fit(mod1)
            mod2.fit(iris.data)

            result = df.predict(mod1)
            expected = mod2.predict(iris.data)

            self.assertIsInstance(result, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(result.values, expected) 
Example #4
Source File: cluster.py    From poem with MIT License
from sklearn import preprocessing
from sklearn.cluster import AffinityPropagation


def cluster(X):
    X = preprocessing.normalize(X, norm='l2')
    # rows are L2-normalized, so X.dot(X.T) is a cosine-similarity matrix,
    # which is what affinity="precomputed" expects (despite the name "distance")
    distance = X.dot(X.transpose())
    c = AffinityPropagation(affinity="precomputed")
    y = c.fit_predict(distance)
    return y
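An illustrative call with random data (purely an assumption for demonstration; not from the original project):

import numpy as np

X = np.random.RandomState(0).rand(20, 8)  # 20 samples, 8 features
labels = cluster(X)
print(labels)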
Example #5
Source File: sense_clusterer.py    From yelp with GNU Lesser General Public License v2.1
def cluster_affinity_propagation(similarity_matrix, desired_keys=None):

    numpy_matrix = similarity_matrix_to_numpy(similarity_matrix, desired_keys)

    clusterer = AffinityPropagation()
    return clusterer.fit_predict(numpy_matrix) 
Example #6
Source File: compare_clustering_algs.py    From mmvt with GNU General Public License v3.0
def compare(data, n_groups, output_fol):
    # plot_clusters(data.astype(np.float), scipy.cluster.vq.kmeans, 'scipy.cluster.vq.kmeans', output_fol, (n_groups,), {})
    plot_clusters(data, cluster.KMeans, 'KMeans', output_fol, (), {'n_clusters': n_groups})
    for ct in ['spherical', 'tied', 'diag', 'full']:
        plot_clusters(data, mixture.GaussianMixture, 'GMM_{}'.format(ct), output_fol, (),
                      {'n_components': n_groups, 'covariance_type': ct})
    plot_clusters(data, cluster.AffinityPropagation, 'AffinityPropagation', output_fol, (), {'preference': -5.0, 'damping': 0.95})
    plot_clusters(data, cluster.MeanShift, 'MeanShift', output_fol, (0.175,), {'cluster_all': False})
    plot_clusters(data, cluster.SpectralClustering, 'SpectralClustering', output_fol, (), {'n_clusters': n_groups})
    plot_clusters(data, cluster.AgglomerativeClustering, 'AgglomerativeClustering', output_fol, (), {'n_clusters': n_groups, 'linkage': 'ward'})
    plot_clusters(data, cluster.DBSCAN, 'DBSCAN', output_fol, (), {'eps': 0.025})
    # plot_clusters(data, hdbscan.HDBSCAN, 'HDBSCAN', output_fol, (), {'min_cluster_size': 15}) 
Example #7
Source File: lexrankr.py    From lexrankr with MIT License
def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25,
                 clustering='dbscan', tagger='twitter',
                 useful_tags=['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb',
                              'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix',
                              'Alpha', 'Number'],
                 delimiters=['. ', '\n', '.\n'], min_token_length=2,
                 stopwords=stopwords_ko, no_below_word_count=2,
                 no_above_word_portion=0.85, max_dictionary_size=None,
                 min_cluster_size=2, similarity_threshold=0.85,
                 matrix_smoothing=False, n_clusters=None, compactify=True,
                 **kwargs):
        self.decay_window = decay_window
        self.decay_alpha = decay_alpha
        if similarity == 'cosine':  # very, very slow :(
            self.vectorizer = DictVectorizer()
            self.uniform_sim = self._sim_cosine
        elif similarity == 'jaccard':
            self.uniform_sim = self._sim_jaccard
        elif similarity == 'normalized_cooccurrence':
            self.uniform_sim = self._sim_normalized_cooccurrence
        else:
            raise LexRankError("available similarity functions are: cosine, jaccard, normalized_cooccurrence")
        self.sim = lambda sentence1, sentence2: self.decay(sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
        self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_token_length=min_token_length, stopwords=stopwords, **kwargs)
        if clustering == 'birch':
            self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
            self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
        elif clustering == 'dbscan':
            self._dbscan = DBSCAN()
            self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
        elif clustering == 'affinity':
            # similarities are flipped to distances (1 - matrix) before
            # being handed to the clusterer
            self._affinity = AffinityPropagation()
            self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
        elif clustering is None:
            self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
        else:
            raise LexRankError("available clustering algorithms are: birch, markov, no-clustering(use `None`)")
        self.no_below_word_count = no_below_word_count
        self.no_above_word_portion = no_above_word_portion
        self.max_dictionary_size = max_dictionary_size
        self.similarity_threshold = similarity_threshold
        self.min_cluster_size = min_cluster_size
        self.matrix_smoothing = matrix_smoothing
        self.compactify = compactify 
Example #8
Source File: song2vec_operator.py    From MusicTaster with MIT License
def cluster_song_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False):
        """
        Cluster the songs inside a single playlist.
        Args:
            playlist_id: playlist id
            cluster_n: number of clusters
            is_detailed: whether the returned result includes song details

        Returns:
            the clustered list
        """
        playlist_obj = playlist_detail(playlist_id)
        song_list = []
        vec_list = []
        song_info_dict = {}
        ap_cluster = AffinityPropagation()
        data_process_logger.info('clustering playlist: %s' % playlist_obj['name'])
        for item in playlist_obj['tracks']:
            song = item['name'].lower()
            song_info_dict[song] = {
                'name': song,
                'artist': item['artists'][0]['name'],
                'id': item['id'],
                'album_img_url': item['album']['picUrl'],
                'site_url': 'http://music.163.com/#/song?id=%s' % item['id']
            }
            # print song
            if song not in song_list:
                song_list.append(song)
                # print self.song2vec_model.vocab.get(song)
                # print self.song2vec_model.syn0norm == None
                # guard: syn0norm is None until the vectors have been
                # normalized, and len(None) would raise a TypeError
                if self.song2vec_model.vocab.get(song) and self.song2vec_model.syn0norm is not None:
                    song_vec = self.song2vec_model.syn0norm[self.song2vec_model.vocab[song].index]
                else:
                    data_process_logger.warn(
                        'The song %s of playlist-%s is not in dataset' % (song, playlist_obj['name']))
                    song_vec = [0 for i in range(self.song2vec_model.vector_size)]
                vec_list.append(song_vec)
        # song_list = list(song_list)
        if len(vec_list) > 1:
            # the second argument is accepted but ignored by fit()
            cluster_result = ap_cluster.fit(vec_list, song_list)
            cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))]
            for i in range(len(cluster_result.labels_)):
                label = cluster_result.labels_[i]
                cluster_array[label].append(song_list[i])
            return cluster_array, playlist_obj['name'], song_info_dict
        else:
            return [song_list], playlist_obj['name'], song_info_dict 
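Note that vocab and syn0norm belong to the old (pre-4.0) gensim word2vec API. A rough sketch of the equivalent lookup in gensim 4.x, assuming model is a KeyedVectors instance:

# gensim 4.x equivalents of the vocab/syn0norm lookups above (a sketch)
if song in model.key_to_index:
    song_vec = model.get_vector(song, norm=True)  # unit-length vector
else:
    song_vec = [0 for i in range(model.vector_size)]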
Example #9
Source File: song2vec_operator.py    From MusicTaster with MIT License
def cluster_artist_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False):
        """
        Cluster the artists inside a single playlist.
        Args:
            playlist_id: playlist id
            cluster_n: number of clusters
            is_detailed: whether the returned result includes details

        Returns:
            the clustered list
        """
        playlist_obj = playlist_detail(playlist_id)
        artist_list = []
        vec_list = []
        ap_cluster = AffinityPropagation()
        data_process_logger.info('clustering playlist: %s' % playlist_obj['name'])
        for item in playlist_obj['tracks']:
            artist = item['artists'][0]['name'].lower()
            # print artist
            if artist not in artist_list:
                artist_list.append(artist)
                # print self.song2vec_model.vocab.get(artist)
                # print self.song2vec_model.syn0norm == None
                # guard: syn0norm is None until the vectors have been normalized
                if self.artist2vec_model.vocab.get(artist) and self.artist2vec_model.syn0norm is not None:
                    artist_vec = self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index]
                else:
                    data_process_logger.warn(
                        'The artist %s of playlist-%s is not in dataset' % (artist, playlist_obj['name']))
                    artist_vec = [0 for i in range(self.artist2vec_model.vector_size)]
                vec_list.append(artist_vec)
        # artist_list = list(artist_list)
        # vec_list = list(vec_list)
        if len(vec_list) > 1:
            # the second argument is accepted but ignored by fit()
            cluster_result = ap_cluster.fit(vec_list, artist_list)
            cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))]
            for i in range(len(cluster_result.labels_)):
                label = cluster_result.labels_[i]
                cluster_array[label].append(artist_list[i])
            return cluster_array, playlist_obj['name'], {}
        else:
            return [artist_list], playlist_obj['name'], {} 
Example #10
Source File: context_utils.py    From yelp with GNU Lesser General Public License v2.1
def build_groups2(nouns):

    print('building groups', time.strftime("%H:%M:%S"))
    all_senses = set()

    sense_word_map = {}
    for noun in nouns:
        senses = wordnet.synsets(noun, pos='n')
        all_senses.update(senses)
        for sense in senses:
            if sense.name() not in sense_word_map:
                sense_word_map[sense.name()] = []
            sense_word_map[sense.name()].append(noun)

    all_senses = list(all_senses)
    all_senses_names = [sense.name() for sense in all_senses]

    print('number of senses:', len(all_senses))
    sense_similarity_matrix, sense_similarity_matrix_columns =\
        get_sense_similarity_submatrix(all_senses_names)
    print('submatrix ready', time.strftime("%H:%M:%S"))

    # affinity_propagation = AffinityPropagation()
    # labels1 = affinity_propagation.fit_predict(sense_similarity_matrix)
    # print('affinity propagation ready', time.strftime("%H:%M:%S"))

    grouper = BaumanSensesGrouper(sense_similarity_matrix, 0.7)
    groups = grouper.group_senses()
    print('groups')
    # print(groups)
    new_groups = []
    for group in groups:
        new_group = set()
        for element in group:
            sense_name = sense_similarity_matrix_columns[element]
            new_group.add(sense_name)
        new_groups.append(new_group)

    print('finished groups', time.strftime("%H:%M:%S"))
    # print(groups)
    # print(new_groups)
    print('num groups: %d' % len(groups))

    sense_groups = []
    for group in new_groups:
        sense_group = SenseGroup(group)
        for sense in sense_group.senses:
            sense_group.nouns |= set(sense_word_map[sense])
        sense_groups.append(sense_group)

    return sense_groups 
Example #11
Source File: context_utils.py    From yelp with GNU Lesser General Public License v2.1
def evaluate_clustering():

    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')

    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('dbscan done')

    distance_matrix = np.ndarray((matrix_size, matrix_size))
    for i in range(matrix_size):
        for j in range(matrix_size):
            distance_matrix[i, j] = 1 - similarity_matrix[i, j]

    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])

    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)

    print(cluster_map1)
    print(cluster_map2)

    # distance_matrix holds precomputed distances, so silhouette_score
    # should treat it as such rather than as a feature matrix
    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='precomputed')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='precomputed')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    print('Number of clusters Affinity Propagation: %f' % len(cluster_map1))
    print('Number of clusters DBSCAN: %f' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silhouette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silhouette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)


# start = time.time()
# main()
# evaluate_clustering()
# get_similarity_submatrix()
# end = time.time()
# total_time = end - start
# print("Total time = %f seconds" % total_time) 
Example #12
Source File: exposons.py    From enspara with GNU General Public License v3.0
def exposons_from_sasas(sasas, damping, weights, threshold):
    """Compute exposons for an MDTraj trajectory.

    This function is a convenience wrapper to compute exposons using other
    functions already existing in MDTraj, sklearn, and elsewhere in enspara.

    Parameters
    ----------
    sasas: np.ndarray, shape=(n_conformations, n_sidechains)
        SASAs to use in the calculations.
    damping: float
        Damping parameter to use for affinity propagation. Goes from 0.5
        to <1.0. Empirically, values between 0.85 and 0.95 tend to work best.
    weights: np.ndarray, shape=(n_conformations,)
        Weight of each frame in the mutual information calculation.
        Useful if the SASAs come from cluster centers of an MSM rather
        than from a full trajectory. If None, frames will be weighted
        equally.
    threshold: float
        Sidechains with greater than this amount of total SASA will count
        as exposed for the purposes of the exposed/buried dichotomy used
        in mutual information calculations.

    Returns
    -------
    sasa_mi: np.ndarray, shape=(n_res, n_res)
        Mutual information of each sidechain with each other sidechain
        computed for the purposes of clustering exposons.
    exposons: np.ndarray, shape=(n_res,)
        Assignment of residues to exposons. Residues in the same exposon
        share the same number in this array.
    """

    sasa_mi = weighted_mi(sasas > threshold, weights)

    c = AffinityPropagation(
        damping=damping,
        affinity='precomputed',
        preference=0,
        max_iter=10000)
    c.fit(sasa_mi)

    return sasa_mi, c.labels_ 
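A hedged usage sketch with synthetic data (real SASAs would come from an MDTraj Shrake-Rupley calculation, and weighted_mi is imported from elsewhere in enspara, as the docstring notes):

import numpy as np

sasas = np.random.RandomState(0).rand(500, 30)  # 500 frames, 30 sidechains
weights = np.full(500, 1.0 / 500)               # uniform frame weights
sasa_mi, exposons = exposons_from_sasas(sasas, damping=0.9,
                                        weights=weights, threshold=0.02)
print(exposons)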
Example #13
Source File: test_cluster.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.cluster.AffinityPropagation, cluster.AffinityPropagation)
        self.assertIs(df.cluster.AgglomerativeClustering, cluster.AgglomerativeClustering)
        self.assertIs(df.cluster.Birch, cluster.Birch)
        self.assertIs(df.cluster.DBSCAN, cluster.DBSCAN)
        self.assertIs(df.cluster.FeatureAgglomeration, cluster.FeatureAgglomeration)
        self.assertIs(df.cluster.KMeans, cluster.KMeans)
        self.assertIs(df.cluster.MiniBatchKMeans, cluster.MiniBatchKMeans)
        self.assertIs(df.cluster.MeanShift, cluster.MeanShift)
        self.assertIs(df.cluster.SpectralClustering, cluster.SpectralClustering)

        # note: the sklearn.cluster.bicluster module was removed in
        # scikit-learn 0.24; the estimators now live in sklearn.cluster
        self.assertIs(df.cluster.bicluster.SpectralBiclustering,
                      cluster.bicluster.SpectralBiclustering)
        self.assertIs(df.cluster.bicluster.SpectralCoclustering,
                      cluster.bicluster.SpectralCoclustering)