Python sklearn.cluster.MiniBatchKMeans() Examples
The following are 30 code examples of sklearn.cluster.MiniBatchKMeans(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cluster, or try the search function.
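Before the project examples, here is a minimal, self-contained sketch of the basic MiniBatchKMeans workflow (fit on a data matrix, then read labels and centroids). The synthetic data and parameter values are illustrative only and are not taken from any of the projects below.

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs

# synthetic data: 1000 points drawn around 5 centers
X, _ = make_blobs(n_samples=1000, centers=5, random_state=42)

# fit in mini-batches; n_clusters and batch_size are illustrative choices
mbk = MiniBatchKMeans(n_clusters=5, batch_size=100, random_state=42)
mbk.fit(X)

labels = mbk.labels_              # cluster id per training sample
centers = mbk.cluster_centers_    # (n_clusters, n_features) centroids
new_labels = mbk.predict(X[:10])  # assign new points to the nearest centroid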
Example #1
Source File: word_cluster.py From texta with GNU General Public License v3.0 | 6 votes |
def cluster(self, embedding, n_clusters=None):
    vocab = list(embedding.wv.vocab.keys())
    vocab_vectors = np.array([embedding[word] for word in vocab])

    if not n_clusters:
        # number of clusters = 10% of embedding vocabulary
        # if larger than 1000, limit to 1000
        n_clusters = int(len(vocab) * 0.1)
        if n_clusters > 1000:
            n_clusters = 1000

    clustering = MiniBatchKMeans(n_clusters=n_clusters).fit(vocab_vectors)
    cluster_labels = clustering.labels_

    for i, cluster_label in enumerate(cluster_labels):
        word = vocab[i]
        etalon = embedding.wv.most_similar(
            positive=[clustering.cluster_centers_[cluster_label]])[0][0]
        if etalon not in self.cluster_dict:
            self.cluster_dict[etalon] = []
        self.cluster_dict[etalon].append(word)
        self.word_to_cluster_dict[word] = etalon

    return True
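The same grouping idea can be reproduced without a gensim model. The sketch below (hypothetical words and random stand-in vectors, not taken from texta) clusters word vectors with MiniBatchKMeans and buckets the words by cluster label.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

words = ['cat', 'dog', 'car', 'bus']             # hypothetical vocabulary
vectors = np.random.RandomState(0).rand(4, 50)   # stand-in for embedding vectors

clustering = MiniBatchKMeans(n_clusters=2, random_state=0).fit(vectors)

cluster_dict = {}
for word, label in zip(words, clustering.labels_):
    cluster_dict.setdefault(label, []).append(word)
# cluster_dict now maps each cluster id to the words assigned to it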
Example #2
Source File: graph.py From EDeN with MIT License | 6 votes |
def auto_label(graphs, n_clusters=16, **opts):
    """Label nodes with cluster id.

    Cluster nodes using as features the output of vertex_vectorize.
    """
    data_list = Vectorizer(**opts).vertex_transform(graphs)
    data_matrix = vstack(data_list)
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    sizes = [m.shape[0] for m in data_list]
    label_list = []
    vecs_list = []
    pointer = 0
    for size in sizes:
        label_list.append(preds[pointer: pointer + size])
        vecs_list.append(vecs[pointer: pointer + size])
        pointer += size
    return label_list, vecs_list
Example #3
Source File: mab.py From mabwiser with Apache License 2.0 | 6 votes |
def neighborhood_policy(self):
    """
    Creates named tuple of the neighborhood policy based on the implementor.

    Returns
    -------
    The neighborhood policy
    """
    if isinstance(self._imp, _KNearest):
        return NeighborhoodPolicy.KNearest(self._imp.k, self._imp.metric)
    elif isinstance(self._imp, _Radius):
        return NeighborhoodPolicy.Radius(self._imp.radius, self._imp.metric,
                                         self._imp.no_nhood_prob_of_arm)
    elif isinstance(self._imp, _Clusters):
        return NeighborhoodPolicy.Clusters(self._imp.n_clusters,
                                           isinstance(self._imp.kmeans, MiniBatchKMeans))
    else:
        return None
Example #4
Source File: test_clusters.py From mabwiser with Apache License 2.0 | 6 votes |
def test_greedy0_n2_mini(self):
    arms, mab = self.predict(arms=[1, 2, 3, 4],
                             decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                             rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                             learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                             neighborhood_policy=NeighborhoodPolicy.Clusters(2, True),
                             context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                              [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                              [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                              [0, 2, 1, 0, 0]],
                             contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                             seed=123456,
                             num_run=1,
                             is_predict=True)

    self.assertListEqual(arms, [3, 1])
    self.assertTrue(isinstance(mab._imp.kmeans, MiniBatchKMeans))
Example #5
Source File: test_src.py From reviewer_experience_prediction with MIT License | 6 votes |
def test_find_default_param_grid(self):
    """
    Test the `find_default_param_grid` function.
    """

    custom_param_grids = \
        {MiniBatchKMeans: {'n_clusters': [4, 5, 6, 7, 9],
                           'init': ['k-means++', 'random']},
         BernoulliNB: {'alpha': [0.1, 0.5, 1.0]},
         MultinomialNB: {'alpha': [0.5, 0.75, 1.0]},
         Perceptron: {'penalty': ['l2', 'l1', 'elasticnet'],
                      'alpha': [0.0001, 0.001, 0.01],
                      'n_iter': [5]},
         PassiveAggressiveRegressor: {'C': [0.01, 0.1, 1.0],
                                      'n_iter': [10],
                                      'loss': ['epsilon_insensitive']}}

    learners = [MiniBatchKMeans, BernoulliNB, MultinomialNB, Perceptron,
                PassiveAggressiveRegressor]
    learner_abbrevs = ['mbkm', 'bnb', 'mnb', 'perc', 'pagr']
    for param_grids in [DEFAULT_PARAM_GRIDS, custom_param_grids]:
        yield (self.check_find_default_param_grid_defaults,
               list(zip(learner_abbrevs, learners)),
               param_grids)
Example #6
Source File: test_k_means.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_sparse_mb_k_means_callable_init():

    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg,
                        MiniBatchKMeans(init=test_init, random_state=42).fit,
                        X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
Example #7
Source File: test_k_means.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments than
    # samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
Example #8
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments than
    # samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
Example #9
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_sparse_mb_k_means_callable_init():

    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg,
                        MiniBatchKMeans(init=test_init, random_state=42).fit,
                        X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
Example #10
Source File: test_k_means.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_k_means_explicit_init_shape():
    # test for sensible errors when giving explicit init
    # with wrong number of features or clusters
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))
    for Class in [KMeans, MiniBatchKMeans]:
        # mismatch of number of features
        km = Class(n_init=1, init=X[:, :2], n_clusters=len(X))
        msg = "does not match the number of features of the data"
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:, :2],
                   n_clusters=len(X))
        assert_raises_regex(ValueError, msg, km.fit, X)
        # mismatch of number of clusters
        msg = "does not match the number of clusters"
        km = Class(n_init=1, init=X[:2, :], n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:2, :],
                   n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
Example #11
Source File: estimator_utils.py From EDeN with MIT License | 6 votes |
def process_vec_info(g, n_clusters=8):
    """process_vec_info."""
    # extract node vec information and make np data matrix
    data_matrix = np.array([g.node[u]['vec'] for u in g.nodes()])
    # cluster with kmeans
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    vecs = 1 / (1 + vecs)
    # replace node information
    graph = g.copy()
    for u in graph.nodes():
        graph.node[u]['label'] = str(preds[u])
        graph.node[u]['vec'] = list(vecs[u])
    return graph
Example #12
Source File: model.py From ColumbiaImageSearch with Apache License 2.0 | 6 votes |
def train_subquantizers(data, num_buckets, subquantizer_clusters=256,
                        kmeans_local_iters=20, n_init=10, random_state=None):
    """
    Fit a set of num_buckets subquantizers for corresponding subvectors.
    """
    subquantizers = list()
    for i, d in enumerate(np.split(data, num_buckets, axis=1)):
        # model = KMeans(n_clusters=subquantizer_clusters, init="k-means++",
        #                max_iter=kmeans_local_iters, n_init=n_init, n_jobs=1,
        #                verbose=False, random_state=random_state)
        model = MiniBatchKMeans(n_clusters=subquantizer_clusters, init='k-means++',
                                max_iter=kmeans_local_iters, n_init=n_init,
                                batch_size=10000, verbose=False,
                                random_state=random_state)
        model.fit(d)
        subquantizers.append(model.cluster_centers_)
        logger.info('Fit subquantizer %d of %d.' % (i + 1, num_buckets))

    return subquantizers
Example #13
Source File: test_sklearn_k_means_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_batchkmeans_clustering(self):
    data = load_iris()
    X = data.data
    model = MiniBatchKMeans(n_clusters=3)
    model.fit(X)
    model_onnx = convert_sklearn(model, "kmeans",
                                 [("input", FloatTensorType([None, 4]))],
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X.astype(numpy.float32)[40:60],
        model,
        model_onnx,
        basename="SklearnKMeans-Dec4",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2')",
    )
Example #14
Source File: test_sklearn_k_means_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_batchkmeans_clustering_opset9(self):
    data = load_iris()
    X = data.data
    model = MiniBatchKMeans(n_clusters=3)
    model.fit(X)
    model_onnx = convert_sklearn(model, "kmeans",
                                 [("input", FloatTensorType([None, 4]))],
                                 target_opset=9)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X.astype(numpy.float32)[40:60],
        model,
        model_onnx,
        basename="SklearnKMeansOp9-Dec4",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2')",
    )
Example #15
Source File: clusterings.py From parcellation_fragmenter with BSD 3-Clause "New" or "Revised" License | 6 votes |
def k_means(n_clusters, samples):
    """
    Run k-means clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """
    # Run Mini-Batch K-Means
    k_means = cluster.MiniBatchKMeans(
        n_clusters=n_clusters, init='k-means++', max_iter=1000,
        batch_size=10000, verbose=False, compute_labels=True,
        max_no_improvement=100, n_init=5, reassignment_ratio=0.1)
    k_means.fit(samples)
    labels = k_means.labels_.copy()
    labels = labels.astype(np.int32) + 1

    return labels
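A possible way to call the helper above, assuming `cluster` refers to `sklearn.cluster` and the vertex coordinates form an (n_vertices, 3) array; the coordinates here are random placeholders rather than real mesh data.

import numpy as np

coords = np.random.RandomState(0).rand(5000, 3)  # placeholder vertex coordinates
labels = k_means(n_clusters=50, samples=coords)  # k_means helper defined above
# labels are 1-based int32 cluster ids, one per vertex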
Example #16
Source File: test_sklearn_k_means_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_batchkmeans_clustering_opset11(self):
    data = load_iris()
    X = data.data
    model = MiniBatchKMeans(n_clusters=3)
    model.fit(X)
    model_onnx = convert_sklearn(model, "kmeans",
                                 [("input", FloatTensorType([None, 4]))],
                                 target_opset=11)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X.astype(numpy.float32)[40:60],
        model,
        model_onnx,
        basename="SklearnKMeansOp9-Dec4",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2')")
Example #17
Source File: test_sklearn_k_means_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_batchkmeans_clustering_int(self):
    data = load_digits()
    X = data.data
    model = MiniBatchKMeans(n_clusters=4)
    model.fit(X)
    model_onnx = convert_sklearn(model, "kmeans",
                                 [("input", Int64TensorType([None, X.shape[1]]))],
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X.astype(numpy.int64)[40:60],
        model,
        model_onnx,
        basename="SklearnBatchKMeansInt-Dec4",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__) "
                      "<= StrictVersion('0.2.1')",
    )
Example #18
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [KMeans(init="k-means++", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init="random", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init=centers.copy(), n_clusters=n_clusters,
                         random_state=42),
                  MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                  random_state=42)]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(v_measure_score(est_repeated.labels_,
                                            repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
Example #19
Source File: kmeans_smote.py From kmeans_smote with MIT License | 5 votes |
def _cluster(self, X):
    """Run k-means to cluster the dataset

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    Returns
    -------
    cluster_assignment : ndarray, shape (n_samples)
        The corresponding cluster labels of ``X``.
    """
    if self.use_minibatch_kmeans:
        from sklearn.cluster import MiniBatchKMeans as KMeans
    else:
        from sklearn.cluster import KMeans as KMeans

    kmeans = KMeans(**self.kmeans_args)
    if self.use_minibatch_kmeans and 'init_size' not in self.kmeans_args:
        self.kmeans_args['init_size'] = min(2 * kmeans.n_clusters, X.shape[0])
        kmeans = KMeans(**self.kmeans_args)

    kmeans.fit_transform(X)
    cluster_assignment = kmeans.labels_
    # kmeans.labels_ does not use continuous labels,
    # i.e. some labels in 0..n_clusters may not exist. Tidy up this mess.
    return cluster_assignment
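The init_size guard used above can be illustrated outside the class as well. The sketch below (with a hypothetical kmeans_args dict and random data) applies the same default of twice the number of clusters, capped at the number of samples, before constructing the estimator.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(200, 8)            # synthetic data
kmeans_args = {'n_clusters': 30, 'random_state': 0}   # hypothetical arguments

# default init_size to 2 * n_clusters, but never more than the sample count
if 'init_size' not in kmeans_args:
    kmeans_args['init_size'] = min(2 * kmeans_args['n_clusters'], X.shape[0])

labels = MiniBatchKMeans(**kmeans_args).fit_predict(X)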
Example #20
Source File: estimator.py From EDeN with MIT License | 5 votes |
def cluster(self, graphs, n_clusters=16):
    """cluster."""
    x = self.transform(graphs)
    clust_est = MiniBatchKMeans(n_clusters=n_clusters)
    cluster_ids = clust_est.fit_predict(x)
    return cluster_ids
Example #21
Source File: test_sklearn_k_means_converter.py From sklearn-onnx with MIT License | 5 votes |
def test_batchkmeans_clustering_opset1(self):
    data = load_iris()
    X = data.data
    model = MiniBatchKMeans(n_clusters=3)
    model.fit(X)
    try:
        convert_sklearn(model, "kmeans",
                        [("input", FloatTensorType([None, 4]))],
                        target_opset=1)
    except RuntimeError as e:
        assert "Node 'OnnxAdd' has been changed since version" in str(e)
Example #22
Source File: test_k_means.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_predict_minibatch_dense_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 random_state=40).fit(X)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = mb_k_means.predict(X)
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
Example #23
Source File: represent_cluster_centers.py From active-learning with Apache License 2.0 | 5 votes |
def select_batch_(self, model, N, already_selected, **kwargs):
    # Probably okay to always use MiniBatchKMeans
    # Should standardize data before clustering
    # Can cluster on standardized data but train on raw features if desired
    try:
        distances = model.decision_function(self.X)
    except:
        distances = model.predict_proba(self.X)
    if len(distances.shape) < 2:
        min_margin = abs(distances)
    else:
        sort_distances = np.sort(distances, 1)[:, -2:]
        min_margin = sort_distances[:, 1] - sort_distances[:, 0]
    rank_ind = np.argsort(min_margin)
    rank_ind = [i for i in rank_ind if i not in already_selected]

    distances = abs(model.decision_function(self.X))
    min_margin_by_class = np.min(abs(distances[already_selected]), axis=0)
    unlabeled_in_margin = np.array([i for i in range(len(self.y))
                                    if i not in already_selected and
                                    any(distances[i] < min_margin_by_class)])
    if len(unlabeled_in_margin) < N:
        print("Not enough points within margin of classifier, using simple uncertainty sampling")
        return rank_ind[0:N]

    clustering_model = MiniBatchKMeans(n_clusters=N)
    dist_to_centroid = clustering_model.fit_transform(self.flat_X[unlabeled_in_margin])
    medoids = np.argmin(dist_to_centroid, axis=0)
    medoids = list(set(medoids))
    selected_indices = unlabeled_in_margin[medoids]
    selected_indices = sorted(selected_indices, key=lambda x: min_margin[x])
    remaining = [i for i in rank_ind if i not in selected_indices]
    selected_indices.extend(remaining[0:N - len(selected_indices)])
    return selected_indices
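The diversity step at the core of this sampler — clustering the candidate points and keeping the point nearest each centroid — can be isolated into a short, self-contained sketch. The data and batch size N below are synthetic and illustrative.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.RandomState(0).rand(500, 16)  # candidate pool (synthetic)
N = 10                                      # number of points to select

# fit_transform returns each sample's distance to every cluster center
dist_to_centroid = MiniBatchKMeans(n_clusters=N, random_state=0).fit_transform(X)

# for each cluster, pick the sample closest to its centroid
medoids = np.argmin(dist_to_centroid, axis=0)
selected_indices = sorted(set(medoids))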
Example #24
Source File: test_k_means.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_predict_minibatch_random_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 init='random',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also works for dense input at
    # predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
Example #25
Source File: informative_diverse.py From active-learning with Apache License 2.0 | 5 votes |
def __init__(self, X, y, seed):
    self.name = 'informative_and_diverse'
    self.X = X
    self.flat_X = self.flatten_X()
    # y only used for determining how many clusters there should be
    # probably not practical to assume we know # of classes before hand
    # should also probably scale with dimensionality of data
    self.y = y
    self.n_clusters = len(list(set(y)))
    self.cluster_model = MiniBatchKMeans(n_clusters=self.n_clusters)
    self.cluster_data()
Example #26
Source File: informative_diverse.py From active-learning with Apache License 2.0 | 5 votes |
def cluster_data(self):
    # Probably okay to always use MiniBatchKMeans
    # Should standardize data before clustering
    # Can cluster on standardized data but train on raw features if desired
    self.cluster_model.fit(self.flat_X)
    unique, counts = np.unique(self.cluster_model.labels_, return_counts=True)
    self.cluster_prob = counts / sum(counts)
    self.cluster_labels = self.cluster_model.labels_
Example #27
Source File: unsupervised_cluster.py From chemprop with MIT License | 5 votes |
def get_cluster_labels(encodings, n_clusters: int = 10000, seed: int = 0, logger: Logger = None):
    # so we don't crash if we only picked a small number of encodings
    n_clusters = int(min(n_clusters, len(encodings) / 10))
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed)
    cluster_labels = kmeans.fit_predict(encodings)
    return cluster_labels
Example #28
Source File: sklearn_cluster.py From learn-to-cluster with MIT License | 5 votes |
def mini_batch_kmeans(feat, n_clusters, batch_size, **kwargs):
    kmeans = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                     batch_size=batch_size,
                                     random_state=0).fit(feat)
    return kmeans.labels_
Example #29
Source File: test_k_means.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_predict_minibatch_kmeanspp_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 init='k-means++',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also works for dense input at
    # predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
Example #30
Source File: scaffold.py From chemprop with MIT License | 5 votes |
def cluster_split(data: MoleculeDataset,
                  n_clusters: int,
                  ratio_tolerance: int,
                  seed: int = 0,
                  logger: logging.Logger = None) -> List[MoleculeDataset]:
    """
    Split a dataset by K-means clustering on Morgan fingerprints.

    :param data: A list of data points (smiles string, target values).
    :param n_clusters: Number of clusters for K-means.
    :param ratio_tolerance: Max ratio of sizes between clusters.
    :param seed: Random seed for K-means.
    :param logger: A logger for logging cluster split stats.
    :return: A list containing the K-means splits.
    """
    worst_ratio = ratio_tolerance + 1
    fp = [morgan_fingerprint(s) for s in data.mols()]
    while worst_ratio > ratio_tolerance:
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed)
        cluster_labels = kmeans.fit_predict(fp)

        clusters = [[] for _ in range(n_clusters)]
        for i in range(len(data)):
            clusters[cluster_labels[i]].append(data[i])

        max_cluster_len = max([len(c) for c in clusters])
        min_cluster_len = min([len(c) for c in clusters])
        worst_ratio = max_cluster_len / min_cluster_len
        seed += 1

    if logger is not None:
        logger.debug(f'Split into {n_clusters} clusters')
        logger.debug(f'Cluster sizes: {[len(c) for c in clusters]}')

    return [MoleculeDataset(cluster) for cluster in clusters]
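The re-seeding loop in the example above — repeat the clustering with a new random seed until the ratio between the largest and smallest cluster is acceptable — can be sketched on plain arrays as follows. The synthetic fingerprints and the tolerance value are illustrative, and the loop may run several iterations before it terminates.

import numpy as np
from sklearn.cluster import MiniBatchKMeans

fp = np.random.RandomState(0).randint(0, 2, size=(1000, 128))  # synthetic binary fingerprints
n_clusters, ratio_tolerance, seed = 5, 4, 0

worst_ratio = ratio_tolerance + 1
while worst_ratio > ratio_tolerance:
    labels = MiniBatchKMeans(n_clusters=n_clusters, random_state=seed).fit_predict(fp)
    sizes = np.bincount(labels, minlength=n_clusters)
    worst_ratio = sizes.max() / max(sizes.min(), 1)  # guard against empty clusters
    seed += 1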