Python sklearn.cluster.MiniBatchKMeans() Examples

The following are 28 code examples of sklearn.cluster.MiniBatchKMeans(). The original project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the sklearn.cluster module.
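
For orientation before the project-specific examples, here is a minimal self-contained sketch of the basic MiniBatchKMeans workflow (synthetic data; parameter values are illustrative):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(1000, 8)        # synthetic data matrix
mbkm = MiniBatchKMeans(n_clusters=5, batch_size=256, n_init=10, random_state=0)
mbkm.fit(X)                                       # fit incrementally on mini-batches of X
labels = mbkm.labels_                             # cluster id of each training sample
centers = mbkm.cluster_centers_                   # (5, 8) array of centroids
new = mbkm.predict(np.random.rand(10, 8))         # assign new points to the nearest centroid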
Example #1
Source File: word_cluster.py    From texta with GNU General Public License v3.0
def cluster(self, embedding, n_clusters=None):
        vocab = list(embedding.wv.vocab.keys())
        vocab_vectors = np.array([embedding[word] for word in vocab])
        
        if not n_clusters:
            # number of clusters = 10% of embedding vocabulary
            # if larger than 1000, limit to 1000
            n_clusters = int(len(vocab) * 0.1)
            if n_clusters > 1000:
                n_clusters = 1000

        clustering = MiniBatchKMeans(n_clusters=n_clusters).fit(vocab_vectors)
        cluster_labels = clustering.labels_
        
        for i, cluster_label in enumerate(cluster_labels):
            word = vocab[i]
            etalon = embedding.wv.most_similar(positive=[clustering.cluster_centers_[cluster_label]])[0][0]
            
            if etalon not in self.cluster_dict:
                self.cluster_dict[etalon] = []
                
            self.cluster_dict[etalon].append(word)
            self.word_to_cluster_dict[word] = etalon
        
        return True 
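
The cluster-count heuristic above (10% of the vocabulary, capped at 1000) can be factored out on its own; a small sketch, with default_n_clusters being a name chosen here for illustration:

def default_n_clusters(vocab_size, fraction=0.1, cap=1000):
    # 10% of the vocabulary, but never more than the cap
    return min(int(vocab_size * fraction), cap)

assert default_n_clusters(500) == 50
assert default_n_clusters(50000) == 1000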
Example #2
Source File: graph.py    From EDeN with MIT License
def auto_label(graphs, n_clusters=16, **opts):
    """Label nodes with cluster id.

    Cluster nodes using as features the output of vertex_vectorize.
    """
    data_list = Vectorizer(**opts).vertex_transform(graphs)
    data_matrix = vstack(data_list)
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    sizes = [m.shape[0] for m in data_list]
    label_list = []
    vecs_list = []
    pointer = 0
    for size in sizes:
        label_list.append(preds[pointer: pointer + size])
        vecs_list.append(vecs[pointer: pointer + size])
        pointer += size
    return label_list, vecs_list 
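
The stack-then-split pattern used here (cluster all node vectors at once, then hand each graph back its own slice of the predictions) can also be expressed with numpy alone; a minimal sketch with made-up shapes:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

data_list = [np.random.rand(5, 3), np.random.rand(8, 3)]   # per-graph node features
data_matrix = np.vstack(data_list)                         # dense analogue of the vstack above
preds = MiniBatchKMeans(n_clusters=4, n_init=10).fit_predict(data_matrix)
sizes = [m.shape[0] for m in data_list]
label_list = np.split(preds, np.cumsum(sizes)[:-1])        # same slices as the pointer loop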
Example #3
Source File: mab.py    From mabwiser with Apache License 2.0
def neighborhood_policy(self):
        """
        Creates named tuple of the neighborhood policy based on the implementor.

        Returns
        -------
        The neighborhood policy
        """
        if isinstance(self._imp, _KNearest):
            return NeighborhoodPolicy.KNearest(self._imp.k, self._imp.metric)
        elif isinstance(self._imp, _Radius):
            return NeighborhoodPolicy.Radius(self._imp.radius, self._imp.metric, self._imp.no_nhood_prob_of_arm)
        elif isinstance(self._imp, _Clusters):
            return NeighborhoodPolicy.Clusters(self._imp.n_clusters, isinstance(self._imp.kmeans, MiniBatchKMeans))
        else:
            return None 
Example #4
Source File: test_clusters.py    From mabwiser with Apache License 2.0
def test_greedy0_n2_mini(self):

        arms, mab = self.predict(arms=[1, 2, 3, 4],
                                 decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                                 rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                                 learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                                 neighborhood_policy=NeighborhoodPolicy.Clusters(2, True),
                                 context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                                  [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                                  [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                                  [0, 2, 1, 0, 0]],
                                 contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                                 seed=123456,
                                 num_run=1,
                                 is_predict=True)

        self.assertListEqual(arms, [3, 1])
        self.assertTrue(isinstance(mab._imp.kmeans, MiniBatchKMeans)) 
Example #5
Source File: test_src.py    From reviewer_experience_prediction with MIT License
def test_find_default_param_grid(self):
        """
        Test the `find_default_param_grid` function.
        """

        custom_param_grids = \
            {MiniBatchKMeans: {'n_clusters': [4, 5, 6, 7, 9],
                               'init': ['k-means++', 'random']},
             BernoulliNB: {'alpha': [0.1, 0.5, 1.0]},
             MultinomialNB: {'alpha': [0.5, 0.75, 1.0]},
             Perceptron: {'penalty': ['l2', 'l1', 'elasticnet'],
                          'alpha': [0.0001, 0.001, 0.01],
                          'n_iter': [5]},
             PassiveAggressiveRegressor: {'C': [0.01, 0.1, 1.0],
                                          'n_iter': [10],
                                          'loss': ['epsilon_insensitive']}}

        learners = [MiniBatchKMeans, BernoulliNB, MultinomialNB, Perceptron,
                    PassiveAggressiveRegressor]
        learner_abbrevs = ['mbkm', 'bnb', 'mnb', 'perc', 'pagr']
        for param_grids in [DEFAULT_PARAM_GRIDS, custom_param_grids]:
            yield (self.check_find_default_param_grid_defaults,
                   list(zip(learner_abbrevs, learners)),
                   param_grids) 
Example #6
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_sparse_mb_k_means_callable_init():

    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg, MiniBatchKMeans(init=test_init,
                                                         random_state=42).fit,
                        X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means) 
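
For reference, the callable-init contract being exercised here: scikit-learn calls init(X, n_clusters, random_state) and expects an array of shape (n_clusters, n_features) back. A minimal working sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(100, 2)

def my_init(X, k, random_state):
    return X[:k]                     # shape (k, n_features), as required

mbkm = MiniBatchKMeans(n_clusters=3, init=my_init, n_init=1, random_state=42).fit(X)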
Example #7
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments than
    # samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10) 
Example #8
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_k_means_explicit_init_shape():
    # test for sensible errors when giving explicit init
    # with wrong number of features or clusters
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))
    for Class in [KMeans, MiniBatchKMeans]:
        # mismatch of number of features
        km = Class(n_init=1, init=X[:, :2], n_clusters=len(X))
        msg = "does not match the number of features of the data"
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:, :2],
                   n_clusters=len(X))
        assert_raises_regex(ValueError, msg, km.fit, X)
        # mismatch of number of clusters
        msg = "does not match the number of clusters"
        km = Class(n_init=1, init=X[:2, :], n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:2, :],
                   n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X) 
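
Stated positively, an explicit init array must have shape (n_clusters, n_features), matching both the requested cluster count and the data; a sketch of the passing case:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).normal(size=(40, 3))
init_centers = X[:3, :]              # shape (3, 3): 3 clusters, 3 features
MiniBatchKMeans(n_clusters=3, init=init_centers, n_init=1).fit(X)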
Example #9
Source File: estimator_utils.py    From EDeN with MIT License
def process_vec_info(g, n_clusters=8):
    """process_vec_info."""
    # extract node vec information and make np data matrix
    data_matrix = np.array([g.node[u]['vec'] for u in g.nodes()])
    # cluster with kmeans
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    vecs = 1 / (1 + vecs)
    # replace node information
    graph = g.copy()
    for u in graph.nodes():
        graph.node[u]['label'] = str(preds[u])
        graph.node[u]['vec'] = list(vecs[u])
    return graph 
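
The transform-then-1/(1+d) step above converts centroid distances into bounded similarity scores; the same idea in isolation, on synthetic data:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(50, 4)
clu = MiniBatchKMeans(n_clusters=3, n_init=10).fit(X)
dist = clu.transform(X)              # (50, 3) distance of each sample to each centroid
sim = 1.0 / (1.0 + dist)             # monotone map into (0, 1]; larger means closer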
Example #10
Source File: model.py    From ColumbiaImageSearch with Apache License 2.0
def train_subquantizers(data, num_buckets, subquantizer_clusters=256, kmeans_local_iters=20, n_init=10, random_state=None):
    """
    Fit a set of num_buckets subquantizers for corresponding subvectors.
    """

    subquantizers = list()
    for i, d in enumerate(np.split(data, num_buckets, axis=1)):
        #model = KMeans(n_clusters=subquantizer_clusters, init="k-means++", max_iter=kmeans_local_iters,
        #               n_init=n_init, n_jobs=1, verbose=False, random_state=random_state)
        model = MiniBatchKMeans(n_clusters=subquantizer_clusters, init='k-means++', max_iter=kmeans_local_iters,
                                n_init=n_init, batch_size=10000, verbose=False, random_state=random_state)
        model.fit(d)
        subquantizers.append(model.cluster_centers_)
        logger.info('Fit subquantizer %d of %d.' % (i + 1, num_buckets))

    return subquantizers 
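
The returned centroid lists act as the codebooks of a product quantizer. A hypothetical sketch of how a vector might be encoded against them (encode and its assumption of evenly divisible dimensions are illustrative, not part of the project above):

import numpy as np

def encode(vec, subquantizers):
    # nearest-centroid code per subvector; assumes len(vec) splits evenly across buckets
    codes = []
    for centroids, sub in zip(subquantizers, np.split(vec, len(subquantizers))):
        codes.append(int(np.argmin(np.linalg.norm(centroids - sub, axis=1))))
    return codes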
Example #11
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", FloatTensorType([None, 4]))],
                                     target_opset=TARGET_OPSET)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.float32)[40:60],
            model,
            model_onnx,
            basename="SklearnKMeans-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2')",
        ) 
Example #12
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_opset9(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", FloatTensorType([None, 4]))],
                                     target_opset=9)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.float32)[40:60],
            model,
            model_onnx,
            basename="SklearnKMeansOp9-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2')",
        ) 
Example #13
Source File: clusterings.py    From parcellation_fragmenter with BSD 3-Clause "New" or "Revised" License
def k_means(n_clusters, samples):

    """
    Run k-means clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """

    # Run Mini-Batch K-Means
    k_means = cluster.MiniBatchKMeans(
        n_clusters=n_clusters, init='k-means++', max_iter=1000,
        batch_size=10000, verbose=False, compute_labels=True,
        max_no_improvement=100, n_init=5, reassignment_ratio=0.1)
    k_means.fit(samples)

    labels = k_means.labels_.copy()
    labels = labels.astype(np.int32)+1

    return labels 
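
Assuming the module's own imports (numpy as np and sklearn's cluster), a possible invocation of the function above on made-up coordinates; note the returned labels are 1-based because of the +1 shift:

coords = np.random.rand(1000, 3)     # stand-in for vertex coordinates
labels = k_means(64, coords)         # values in 1..64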
Example #14
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_opset11(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", FloatTensorType([None, 4]))],
                                     target_opset=11)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.float32)[40:60],
            model,
            model_onnx,
            basename="SklearnKMeansOp9-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2')") 
Example #15
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_int(self):
        data = load_digits()
        X = data.data
        model = MiniBatchKMeans(n_clusters=4)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", Int64TensorType([None,
                                      X.shape[1]]))],
                                     target_opset=TARGET_OPSET)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.int64)[40:60],
            model,
            model_onnx,
            basename="SklearnBatchKMeansInt-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2') or "
                          "StrictVersion(onnxruntime.__version__) "
                          "<= StrictVersion('0.2.1')",
        ) 
Example #16
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [KMeans(init="k-means++", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init="random", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init=centers.copy(), n_clusters=n_clusters,
                         random_state=42),
                  MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                  random_state=42)]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(v_measure_score(est_repeated.labels_,
                                            repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_)) 
Example #17
Source File: kmeans_smote.py    From kmeans_smote with MIT License
def _cluster(self, X):
        """Run k-means to cluster the dataset

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        Returns
        -------
        cluster_assignment : ndarray, shape (n_samples)
            The corresponding cluster labels of ``X``.
        """

        if self.use_minibatch_kmeans:
            from sklearn.cluster import MiniBatchKMeans as KMeans
        else:
            from sklearn.cluster import KMeans

        kmeans = KMeans(**self.kmeans_args)
        if self.use_minibatch_kmeans and 'init_size' not in self.kmeans_args:
            self.kmeans_args['init_size'] = min(2 * kmeans.n_clusters, X.shape[0])
            kmeans = KMeans(**self.kmeans_args)

        kmeans.fit_transform(X)
        cluster_assignment = kmeans.labels_
        # kmeans.labels_ does not use continuous labels,
        # i.e. some labels in 0..n_clusters may not exist. Tidy up this mess.
        return cluster_assignment 
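
The init_size guard above, in isolation: MiniBatchKMeans speeds up initialization by seeding k-means++ from a random subsample of init_size points, which must cover at least n_clusters samples. A minimal sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(30, 2)
n_clusters = 8
init_size = min(2 * n_clusters, X.shape[0])   # 16 here, and never more than n_samples
MiniBatchKMeans(n_clusters=n_clusters, init_size=init_size, n_init=3).fit(X)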
Example #18
Source File: estimator.py    From EDeN with MIT License
def cluster(self, graphs, n_clusters=16):
        """cluster."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids 
Example #19
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_opset1(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        try:
            convert_sklearn(model, "kmeans",
                            [("input", FloatTensorType([None, 4]))],
                            target_opset=1)
        except RuntimeError as e:
            assert "Node 'OnnxAdd' has been changed since version" in str(e) 
Example #20
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_predict_minibatch_dense_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, random_state=40).fit(X)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = mb_k_means.predict(X)
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) 
Example #21
Source File: represent_cluster_centers.py    From active-learning with Apache License 2.0
def select_batch_(self, model, N, already_selected, **kwargs):
    # Probably okay to always use MiniBatchKMeans
    # Should standardize data before clustering
    # Can cluster on standardized data but train on raw features if desired
    try:
      distances = model.decision_function(self.X)
    except:
      distances = model.predict_proba(self.X)
    if len(distances.shape) < 2:
      min_margin = abs(distances)
    else:
      sort_distances = np.sort(distances, 1)[:, -2:]
      min_margin = sort_distances[:, 1] - sort_distances[:, 0]
    rank_ind = np.argsort(min_margin)
    rank_ind = [i for i in rank_ind if i not in already_selected]

    distances = abs(model.decision_function(self.X))
    min_margin_by_class = np.min(abs(distances[already_selected]), axis=0)
    unlabeled_in_margin = np.array([i for i in range(len(self.y))
                                    if i not in already_selected and
                                    any(distances[i] < min_margin_by_class)])
    if len(unlabeled_in_margin) < N:
      print("Not enough points within margin of classifier, using simple uncertainty sampling")
      return rank_ind[0:N]
    clustering_model = MiniBatchKMeans(n_clusters=N)
    dist_to_centroid = clustering_model.fit_transform(self.flat_X[unlabeled_in_margin])
    medoids = np.argmin(dist_to_centroid, axis=0)
    medoids = list(set(medoids))
    selected_indices = unlabeled_in_margin[medoids]
    selected_indices = sorted(selected_indices, key=lambda x: min_margin[x])
    remaining = [i for i in rank_ind if i not in selected_indices]
    selected_indices.extend(remaining[0:N-len(selected_indices)])
    return selected_indices 
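
The medoid trick used above also works standalone: fit_transform gives per-sample distances to every centroid, and the argmin over samples in each centroid's column picks the sample closest to that centroid. A small sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(200, 5)
N = 10
dist = MiniBatchKMeans(n_clusters=N).fit_transform(X)   # shape (200, N)
medoids = np.unique(np.argmin(dist, axis=0))            # nearest sample per centroid, deduplicated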
Example #22
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_predict_minibatch_random_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='random',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also works for dense input at
    # predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) 
Example #23
Source File: informative_diverse.py    From active-learning with Apache License 2.0
def __init__(self, X, y, seed):
    self.name = 'informative_and_diverse'
    self.X = X
    self.flat_X = self.flatten_X()
    # y only used for determining how many clusters there should be
    # probably not practical to assume we know # of classes before hand
    # should also probably scale with dimensionality of data
    self.y = y
    self.n_clusters = len(list(set(y)))
    self.cluster_model = MiniBatchKMeans(n_clusters=self.n_clusters)
    self.cluster_data() 
Example #24
Source File: informative_diverse.py    From active-learning with Apache License 2.0
def cluster_data(self):
    # Probably okay to always use MiniBatchKMeans
    # Should standardize data before clustering
    # Can cluster on standardized data but train on raw features if desired
    self.cluster_model.fit(self.flat_X)
    unique, counts = np.unique(self.cluster_model.labels_, return_counts=True)
    self.cluster_prob = counts/sum(counts)
    self.cluster_labels = self.cluster_model.labels_ 
Example #25
Source File: unsupervised_cluster.py    From chemprop with MIT License
def get_cluster_labels(encodings, n_clusters: int = 10000, seed: int = 0, logger: Logger = None):
    n_clusters = int(min(n_clusters, len(encodings)/10)) # so we don't crash if we only picked a small number of encodings
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed)
    cluster_labels = kmeans.fit_predict(encodings)
    return cluster_labels 
Example #26
Source File: sklearn_cluster.py    From learn-to-cluster with MIT License
def mini_batch_kmeans(feat, n_clusters, batch_size, **kwargs):
    kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                     batch_size=batch_size,
                                     random_state=0).fit(feat)
    return kmeans.labels_ 
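
A possible call to this wrapper, assuming the module's own from sklearn import cluster and a made-up feature matrix:

import numpy as np
feat = np.random.rand(5000, 64).astype(np.float32)
labels = mini_batch_kmeans(feat, n_clusters=50, batch_size=512)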
Example #27
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_predict_minibatch_kmeanspp_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also works for dense input at
    # predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) 
Example #28
Source File: scaffold.py    From chemprop with MIT License
def cluster_split(data: MoleculeDataset,
                  n_clusters: int,
                  ratio_tolerance: int,
                  seed: int = 0,
                  logger: logging.Logger = None) -> List[MoleculeDataset]:
    """
    Split a dataset by K-means clustering on Morgan fingerprints. 

    :param data: A list of data points (smiles string, target values).
    :param n_clusters: Number of clusters for K-means. 
    :param ratio_tolerance: Max ratio of sizes between clusters.
    :param seed: Random seed for K-means. 
    :param logger: A logger for logging cluster split stats.
    :return: A list containing the K-means splits.
    """
    worst_ratio = ratio_tolerance + 1
    fp = [morgan_fingerprint(s) for s in data.mols()]
    while worst_ratio > ratio_tolerance:
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed)
        cluster_labels = kmeans.fit_predict(fp)

        clusters = [[] for _ in range(n_clusters)]
        for i in range(len(data)):
            clusters[cluster_labels[i]].append(data[i])
        
        max_cluster_len = max([len(c) for c in clusters])
        min_cluster_len = min([len(c) for c in clusters])
        worst_ratio = max_cluster_len / min_cluster_len
        seed += 1
    
    if logger is not None:
        logger.debug(f'Split into {n_clusters} clusters')
        logger.debug(f'Cluster sizes: {[len(c) for c in clusters]}')

    return [MoleculeDataset(cluster) for cluster in clusters]
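
The balance check that drives the retry loop above can be seen in isolation; a minimal sketch:

import numpy as np
cluster_labels = np.array([0, 0, 0, 1, 1, 2])
counts = np.bincount(cluster_labels)
worst_ratio = counts.max() / counts.min()   # 3.0 here; reseed and re-cluster if above tolerance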