Python sklearn.cluster.MiniBatchKMeans() Examples

The following are 28 code examples of sklearn.cluster.MiniBatchKMeans(). The original project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the sklearn.cluster module.
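
For orientation before the project-specific examples, here is a minimal self-contained sketch of the basic MiniBatchKMeans workflow (synthetic data; parameter values are illustrative):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(1000, 8)        # synthetic data matrix
mbkm = MiniBatchKMeans(n_clusters=5, batch_size=256, n_init=10, random_state=0)
mbkm.fit(X)                                       # fit incrementally on mini-batches of X
labels = mbkm.labels_                             # cluster id of each training sample
centers = mbkm.cluster_centers_                   # (5, 8) array of centroids
new = mbkm.predict(np.random.rand(10, 8))         # assign new points to the nearest centroid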
Example #1
Source File: word_cluster.py    From texta with GNU General Public License v3.0
def cluster(self, embedding, n_clusters=None):
        vocab = list(embedding.wv.vocab.keys())
        vocab_vectors = np.array([embedding[word] for word in vocab])
        
        if not n_clusters:
            # number of clusters = 10% of embedding vocabulary
            # if larger than 1000, limit to 1000
            n_clusters = int(len(vocab) * 0.1)
            if n_clusters > 1000:
                n_clusters = 1000

        clustering = MiniBatchKMeans(n_clusters=n_clusters).fit(vocab_vectors)
        cluster_labels = clustering.labels_
        
        for i, cluster_label in enumerate(cluster_labels):
            word = vocab[i]
            etalon = embedding.wv.most_similar(positive=[clustering.cluster_centers_[cluster_label]])[0][0]
            
            if etalon not in self.cluster_dict:
                self.cluster_dict[etalon] = []
                
            self.cluster_dict[etalon].append(word)
            self.word_to_cluster_dict[word] = etalon
        
        return True 
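
The cluster-count heuristic above (10% of the vocabulary, capped at 1000) can be factored out on its own; a small sketch, with default_n_clusters being a name chosen here for illustration:

def default_n_clusters(vocab_size, fraction=0.1, cap=1000):
    # 10% of the vocabulary, but never more than the cap
    return min(int(vocab_size * fraction), cap)

assert default_n_clusters(500) == 50
assert default_n_clusters(50000) == 1000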
Example #2
Source File: graph.py    From EDeN with MIT License
def auto_label(graphs, n_clusters=16, **opts):
    """Label nodes with cluster id.

    Cluster nodes using as features the output of vertex_vectorize.
    """
    data_list = Vectorizer(**opts).vertex_transform(graphs)
    data_matrix = vstack(data_list)
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    sizes = [m.shape[0] for m in data_list]
    label_list = []
    vecs_list = []
    pointer = 0
    for size in sizes:
        label_list.append(preds[pointer: pointer + size])
        vecs_list.append(vecs[pointer: pointer + size])
        pointer += size
    return label_list, vecs_list 
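
The stack-then-split pattern used here (cluster all node vectors at once, then hand each graph back its own slice of the predictions) can also be expressed with numpy alone; a minimal sketch with made-up shapes:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

data_list = [np.random.rand(5, 3), np.random.rand(8, 3)]   # per-graph node features
data_matrix = np.vstack(data_list)                         # dense analogue of the vstack above
preds = MiniBatchKMeans(n_clusters=4, n_init=10).fit_predict(data_matrix)
sizes = [m.shape[0] for m in data_list]
label_list = np.split(preds, np.cumsum(sizes)[:-1])        # same slices as the pointer loop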
Example #3
Source File: mab.py    From mabwiser with Apache License 2.0
def neighborhood_policy(self):
        """
        Creates named tuple of the neighborhood policy based on the implementor.

        Returns
        -------
        The neighborhood policy
        """
        if isinstance(self._imp, _KNearest):
            return NeighborhoodPolicy.KNearest(self._imp.k, self._imp.metric)
        elif isinstance(self._imp, _Radius):
            return NeighborhoodPolicy.Radius(self._imp.radius, self._imp.metric, self._imp.no_nhood_prob_of_arm)
        elif isinstance(self._imp, _Clusters):
            return NeighborhoodPolicy.Clusters(self._imp.n_clusters, isinstance(self._imp.kmeans, MiniBatchKMeans))
        else:
            return None 
Example #4
Source File: test_clusters.py    From mabwiser with Apache License 2.0
def test_greedy0_n2_mini(self):

        arms, mab = self.predict(arms=[1, 2, 3, 4],
                                 decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                                 rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                                 learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                                 neighborhood_policy=NeighborhoodPolicy.Clusters(2, True),
                                 context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                                  [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                                  [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                                  [0, 2, 1, 0, 0]],
                                 contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                                 seed=123456,
                                 num_run=1,
                                 is_predict=True)

        self.assertListEqual(arms, [3, 1])
        self.assertTrue(isinstance(mab._imp.kmeans, MiniBatchKMeans)) 
Example #5
Source File: test_src.py    From reviewer_experience_prediction with MIT License
def test_find_default_param_grid(self):
        """
        Test the `find_default_param_grid` function.
        """

        custom_param_grids = \
            {MiniBatchKMeans: {'n_clusters': [4, 5, 6, 7, 9],
                               'init': ['k-means++', 'random']},
             BernoulliNB: {'alpha': [0.1, 0.5, 1.0]},
             MultinomialNB: {'alpha': [0.5, 0.75, 1.0]},
             Perceptron: {'penalty': ['l2', 'l1', 'elasticnet'],
                          'alpha': [0.0001, 0.001, 0.01],
                          'n_iter': [5]},
             PassiveAggressiveRegressor: {'C': [0.01, 0.1, 1.0],
                                          'n_iter': [10],
                                          'loss': ['epsilon_insensitive']}}

        learners = [MiniBatchKMeans, BernoulliNB, MultinomialNB, Perceptron,
                    PassiveAggressiveRegressor]
        learner_abbrevs = ['mbkm', 'bnb', 'mnb', 'perc', 'pagr']
        for param_grids in [DEFAULT_PARAM_GRIDS, custom_param_grids]:
            yield (self.check_find_default_param_grid_defaults,
                   list(zip(learner_abbrevs, learners)),
                   param_grids) 
Example #6
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_sparse_mb_k_means_callable_init():

    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg, MiniBatchKMeans(init=test_init,
                                                         random_state=42).fit,
                        X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means) 
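
For reference, the callable-init contract being exercised here: scikit-learn calls init(X, n_clusters, random_state) and expects an array of shape (n_clusters, n_features) back. A minimal working sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(100, 2)

def my_init(X, k, random_state):
    return X[:k]                     # shape (k, n_features), as required

mbkm = MiniBatchKMeans(n_clusters=3, init=my_init, n_init=1, random_state=42).fit(X)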
Example #7
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments than
    # samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10) 
Example #8
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_k_means_explicit_init_shape():
    # test for sensible errors when giving explicit init
    # with wrong number of features or clusters
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))
    for Class in [KMeans, MiniBatchKMeans]:
        # mismatch of number of features
        km = Class(n_init=1, init=X[:, :2], n_clusters=len(X))
        msg = "does not match the number of features of the data"
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:, :2],
                   n_clusters=len(X))
        assert_raises_regex(ValueError, msg, km.fit, X)
        # mismatch of number of clusters
        msg = "does not match the number of clusters"
        km = Class(n_init=1, init=X[:2, :], n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:2, :],
                   n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X) 
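
Stated positively, an explicit init array must have shape (n_clusters, n_features), matching both the requested cluster count and the data; a sketch of the passing case:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).normal(size=(40, 3))
init_centers = X[:3, :]              # shape (3, 3): 3 clusters, 3 features
MiniBatchKMeans(n_clusters=3, init=init_centers, n_init=1).fit(X)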
Example #9
Source File: estimator_utils.py    From EDeN with MIT License
def process_vec_info(g, n_clusters=8):
    """process_vec_info."""
    # extract node vec information and make np data matrix
    data_matrix = np.array([g.node[u]['vec'] for u in g.nodes()])
    # cluster with kmeans
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    vecs = 1 / (1 + vecs)
    # replace node information
    graph = g.copy()
    for u in graph.nodes():
        graph.node[u]['label'] = str(preds[u])
        graph.node[u]['vec'] = list(vecs[u])
    return graph 
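
The transform-then-1/(1+d) step above converts centroid distances into bounded similarity scores; the same idea in isolation, on synthetic data:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(50, 4)
clu = MiniBatchKMeans(n_clusters=3, n_init=10).fit(X)
dist = clu.transform(X)              # (50, 3) distance of each sample to each centroid
sim = 1.0 / (1.0 + dist)             # monotone map into (0, 1]; larger means closer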
Example #10
Source File: model.py    From ColumbiaImageSearch with Apache License 2.0
def train_subquantizers(data, num_buckets, subquantizer_clusters=256, kmeans_local_iters=20, n_init=10, random_state=None):
    """
    Fit a set of num_buckets subquantizers for corresponding subvectors.
    """

    subquantizers = list()
    for i, d in enumerate(np.split(data, num_buckets, axis=1)):
        #model = KMeans(n_clusters=subquantizer_clusters, init="k-means++", max_iter=kmeans_local_iters,
        #               n_init=n_init, n_jobs=1, verbose=False, random_state=random_state)
        model = MiniBatchKMeans(n_clusters=subquantizer_clusters, init='k-means++', max_iter=kmeans_local_iters,
                                n_init=n_init, batch_size=10000, verbose=False, random_state=random_state)
        model.fit(d)
        subquantizers.append(model.cluster_centers_)
        logger.info('Fit subquantizer %d of %d.' % (i + 1, num_buckets))

    return subquantizers 
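
The returned centroid lists act as the codebooks of a product quantizer. A hypothetical sketch of how a vector might be encoded against them (encode and its assumption of evenly divisible dimensions are illustrative, not part of the project above):

import numpy as np

def encode(vec, subquantizers):
    # nearest-centroid code per subvector; assumes len(vec) splits evenly across buckets
    codes = []
    for centroids, sub in zip(subquantizers, np.split(vec, len(subquantizers))):
        codes.append(int(np.argmin(np.linalg.norm(centroids - sub, axis=1))))
    return codes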
Example #11
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", FloatTensorType([None, 4]))],
                                     target_opset=TARGET_OPSET)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.float32)[40:60],
            model,
            model_onnx,
            basename="SklearnKMeans-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2')",
        ) 
Example #12
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_opset9(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", FloatTensorType([None, 4]))],
                                     target_opset=9)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.float32)[40:60],
            model,
            model_onnx,
            basename="SklearnKMeansOp9-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2')",
        ) 
Example #13
Source File: clusterings.py    From parcellation_fragmenter with BSD 3-Clause "New" or "Revised" License
def k_means(n_clusters, samples):

    """
    Run k-means clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """

    # Run Mini-Batch K-Means
    k_means = cluster.MiniBatchKMeans(
        n_clusters=n_clusters, init='k-means++', max_iter=1000,
        batch_size=10000, verbose=False, compute_labels=True,
        max_no_improvement=100, n_init=5, reassignment_ratio=0.1)
    k_means.fit(samples)

    labels = k_means.labels_.copy()
    labels = labels.astype(np.int32)+1

    return labels 
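
Assuming the module's own imports (numpy as np and sklearn's cluster), a possible invocation of the function above on made-up coordinates; note the returned labels are 1-based because of the +1 shift:

coords = np.random.rand(1000, 3)     # stand-in for vertex coordinates
labels = k_means(64, coords)         # values in 1..64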
Example #14
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_opset11(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", FloatTensorType([None, 4]))],
                                     target_opset=11)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.float32)[40:60],
            model,
            model_onnx,
            basename="SklearnKMeansOp9-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2')") 
Example #15
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_int(self):
        data = load_digits()
        X = data.data
        model = MiniBatchKMeans(n_clusters=4)
        model.fit(X)
        model_onnx = convert_sklearn(model, "kmeans",
                                     [("input", Int64TensorType([None,
                                      X.shape[1]]))],
                                     target_opset=TARGET_OPSET)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            X.astype(numpy.int64)[40:60],
            model,
            model_onnx,
            basename="SklearnBatchKMeansInt-Dec4",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2') or "
                          "StrictVersion(onnxruntime.__version__) "
                          "<= StrictVersion('0.2.1')",
        ) 
Example #16
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [KMeans(init="k-means++", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init="random", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init=centers.copy(), n_clusters=n_clusters,
                         random_state=42),
                  MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                  random_state=42)]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(v_measure_score(est_repeated.labels_,
                                            repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_)) 
Example #17
Source File: kmeans_smote.py    From kmeans_smote with MIT License
def _cluster(self, X):
        """Run k-means to cluster the dataset

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        Returns
        -------
        cluster_assignment : ndarray, shape (n_samples)
            The corresponding cluster labels of ``X``.
        """

        if self.use_minibatch_kmeans:
            from sklearn.cluster import MiniBatchKMeans as KMeans
        else:
            from sklearn.cluster import KMeans

        kmeans = KMeans(**self.kmeans_args)
        if self.use_minibatch_kmeans and 'init_size' not in self.kmeans_args:
            self.kmeans_args['init_size'] = min(2 * kmeans.n_clusters, X.shape[0])
            kmeans = KMeans(**self.kmeans_args)

        kmeans.fit_transform(X)
        cluster_assignment = kmeans.labels_
        # kmeans.labels_ does not use continuous labels,
        # i.e. some labels in 0..n_clusters may not exist. Tidy up this mess.
        return cluster_assignment 
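
The init_size guard above, in isolation: MiniBatchKMeans speeds up initialization by seeding k-means++ from a random subsample of init_size points, which must cover at least n_clusters samples. A minimal sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(30, 2)
n_clusters = 8
init_size = min(2 * n_clusters, X.shape[0])   # 16 here, and never more than n_samples
MiniBatchKMeans(n_clusters=n_clusters, init_size=init_size, n_init=3).fit(X)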
Example #18
Source File: estimator.py    From EDeN with MIT License
def cluster(self, graphs, n_clusters=16):
        """cluster."""
        x = self.transform(graphs)
        clust_est = MiniBatchKMeans(n_clusters=n_clusters)
        cluster_ids = clust_est.fit_predict(x)
        return cluster_ids 
Example #19
Source File: test_sklearn_k_means_converter.py    From sklearn-onnx with MIT License
def test_batchkmeans_clustering_opset1(self):
        data = load_iris()
        X = data.data
        model = MiniBatchKMeans(n_clusters=3)
        model.fit(X)
        try:
            convert_sklearn(model, "kmeans",
                            [("input", FloatTensorType([None, 4]))],
                            target_opset=1)
        except RuntimeError as e:
            assert "Node 'OnnxAdd' has been changed since version" in str(e) 
Example #20
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_predict_minibatch_dense_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, random_state=40).fit(X)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = mb_k_means.predict(X)
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) 
Example #21
Source File: represent_cluster_centers.py    From active-learning with Apache License 2.0
def select_batch_(self, model, N, already_selected, **kwargs):
    # Probably okay to always use MiniBatchKMeans
    # Should standardize data before clustering
    # Can cluster on standardized data but train on raw features if desired
    try:
      distances = model.decision_function(self.X)
    except:
      distances = model.predict_proba(self.X)
    if len(distances.shape) < 2:
      min_margin = abs(distances)
    else:
      sort_distances = np.sort(distances, 1)[:, -2:]
      min_margin = sort_distances[:, 1] - sort_distances[:, 0]
    rank_ind = np.argsort(min_margin)
    rank_ind = [i for i in rank_ind if i not in already_selected]

    distances = abs(model.decision_function(self.X))
    min_margin_by_class = np.min(abs(distances[already_selected]), axis=0)
    unlabeled_in_margin = np.array([i for i in range(len(self.y))
                                    if i not in already_selected and
                                    any(distances[i] < min_margin_by_class)])
    if len(unlabeled_in_margin) < N:
      print("Not enough points within margin of classifier, using simple uncertainty sampling")
      return rank_ind[0:N]
    clustering_model = MiniBatchKMeans(n_clusters=N)
    dist_to_centroid = clustering_model.fit_transform(self.flat_X[unlabeled_in_margin])
    medoids = np.argmin(dist_to_centroid, axis=0)
    medoids = list(set(medoids))
    selected_indices = unlabeled_in_margin[medoids]
    selected_indices = sorted(selected_indices, key=lambda x: min_margin[x])
    remaining = [i for i in rank_ind if i not in selected_indices]
    selected_indices.extend(remaining[0:N-len(selected_indices)])
    return selected_indices 
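
The medoid trick used above also works standalone: fit_transform gives per-sample distances to every centroid, and the argmin over samples in each centroid's column picks the sample closest to that centroid. A small sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(200, 5)
N = 10
dist = MiniBatchKMeans(n_clusters=N).fit_transform(X)   # shape (200, N)
medoids = np.unique(np.argmin(dist, axis=0))            # nearest sample per centroid, deduplicated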
Example #22
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_predict_minibatch_random_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='random',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also works for dense input at
    # predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) 
Example #23
Source File: informative_diverse.py    From active-learning with Apache License 2.0
def __init__(self, X, y, seed):
    self.name = 'informative_and_diverse'
    self.X = X
    self.flat_X = self.flatten_X()
    # y only used for determining how many clusters there should be
    # probably not practical to assume we know # of classes before hand
    # should also probably scale with dimensionality of data
    self.y = y
    self.n_clusters = len(list(set(y)))
    self.cluster_model = MiniBatchKMeans(n_clusters=self.n_clusters)
    self.cluster_data() 
Example #24
Source File: informative_diverse.py    From active-learning with Apache License 2.0
def cluster_data(self):
    # Probably okay to always use MiniBatchKMeans
    # Should standardize data before clustering
    # Can cluster on standardized data but train on raw features if desired
    self.cluster_model.fit(self.flat_X)
    unique, counts = np.unique(self.cluster_model.labels_, return_counts=True)
    self.cluster_prob = counts/sum(counts)
    self.cluster_labels = self.cluster_model.labels_ 
Example #25
Source File: unsupervised_cluster.py    From chemprop with MIT License
def get_cluster_labels(encodings, n_clusters: int = 10000, seed: int = 0, logger: Logger = None):
    n_clusters = int(min(n_clusters, len(encodings)/10)) # so we don't crash if we only picked a small number of encodings
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed)
    cluster_labels = kmeans.fit_predict(encodings)
    return cluster_labels 
Example #26
Source File: sklearn_cluster.py    From learn-to-cluster with MIT License
def mini_batch_kmeans(feat, n_clusters, batch_size, **kwargs):
    kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                     batch_size=batch_size,
                                     random_state=0).fit(feat)
    return kmeans.labels_ 
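
A possible call to this wrapper, assuming the module's own from sklearn import cluster and a made-up feature matrix:

import numpy as np
feat = np.random.rand(5000, 64).astype(np.float32)
labels = mini_batch_kmeans(feat, n_clusters=50, batch_size=512)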
Example #27
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_predict_minibatch_kmeanspp_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also works for dense input at
    # predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_) 
Example #28
Source File: scaffold.py    From chemprop with MIT License
def cluster_split(data: MoleculeDataset,
                  n_clusters: int,
                  ratio_tolerance: int,
                  seed: int = 0,
                  logger: logging.Logger = None) -> List[MoleculeDataset]:
    """
    Split a dataset by K-means clustering on Morgan fingerprints. 

    :param data: A list of data points (smiles string, target values).
    :param n_clusters: Number of clusters for K-means. 
    :param ratio_tolerance: Max ratio of sizes between clusters.
    :param seed: Random seed for K-means. 
    :param logger: A logger for logging cluster split stats.
    :return: A list containing the K-means splits.
    """
    worst_ratio = ratio_tolerance + 1
    fp = [morgan_fingerprint(s) for s in data.mols()]
    while worst_ratio > ratio_tolerance:
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed)
        cluster_labels = kmeans.fit_predict(fp)

        clusters = [[] for _ in range(n_clusters)]
        for i in range(len(data)):
            clusters[cluster_labels[i]].append(data[i])
        
        max_cluster_len = max([len(c) for c in clusters])
        min_cluster_len = min([len(c) for c in clusters])
        worst_ratio = max_cluster_len / min_cluster_len
        seed += 1
    
    if logger is not None:
        logger.debug(f'Split into {n_clusters} clusters')
        logger.debug(f'Cluster sizes: {[len(c) for c in clusters]}')

    return [MoleculeDataset(cluster) for cluster in clusters]
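
The balance check that drives the retry loop above can be seen in isolation; a minimal sketch:

import numpy as np
cluster_labels = np.array([0, 0, 0, 1, 1, 2])
counts = np.bincount(cluster_labels)
worst_ratio = counts.max() / counts.min()   # 3.0 here; reseed and re-cluster if above tolerance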