Python sklearn.datasets.samples_generator.make_blobs() Examples
The following are 30 code examples of sklearn.datasets.samples_generator.make_blobs(), drawn from open-source projects; the line above each example names the original project and source file. You may also want to check out all the other available functions/classes of the sklearn.datasets.samples_generator module.
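Note that samples_generator was a private module: it was deprecated in scikit-learn 0.22 and removed in 0.24, and make_blobs is now imported directly from sklearn.datasets. As a quick orientation before the examples, here is a minimal sketch of typical make_blobs usage; the parameter values are illustrative only and are not taken from any of the projects below.

from sklearn.datasets import make_blobs  # samples_generator was removed in scikit-learn 0.24

# Draw 100 two-dimensional samples around 3 isotropic Gaussian centers.
X, y = make_blobs(n_samples=100, n_features=2, centers=3,
                  cluster_std=1.0, random_state=0)

print(X.shape)  # (100, 2) -- the feature matrix
print(y.shape)  # (100,)   -- integer cluster labels in {0, 1, 2}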
Example #1
Source File: test_k_means.py From twitter-stock-recommendation with MIT License
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments
    # than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
Example #2
Source File: make_data.py From DCC with MIT License
def make_easy_visual_data(path, N=600):
    """Make 3 clusters of 2D data where the cluster centers lie along a line.

    The latent variable would be just their x or y value since that uniquely
    defines their projection onto the line.
    """
    line = (1.5, 1)
    centers = [(m, m * line[0] + line[1]) for m in (-4, 0, 6)]
    cluster_std = [1, 1, 1.5]
    X, labels = make_blobs(n_samples=N, cluster_std=cluster_std,
                           centers=centers, n_features=len(centers[0]))

    # scale data
    minmaxscale = MinMaxScaler().fit(X)
    X = minmaxscale.transform(X)

    save_misc_data(path, X, labels, N)
    return X, labels
Example #3
Source File: test_cluster.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_affinity_propagation_class(self):
    from sklearn.datasets.samples_generator import make_blobs

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=300, centers=centers,
                                cluster_std=0.5, random_state=0)
    df = pdml.ModelFrame(data=X, target=labels_true)

    af = df.cluster.AffinityPropagation(preference=-50)
    df.fit(af)

    af2 = cluster.AffinityPropagation(preference=-50).fit(X)
    tm.assert_numpy_array_equal(af.cluster_centers_indices_,
                                af2.cluster_centers_indices_)
    tm.assert_numpy_array_equal(af.labels_, af2.labels_)
Example #4
Source File: test_optics.py From Mastering-Elasticsearch-7.0 with MIT License
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels are <= 5% difference of DBSCAN
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # calculate optics with dbscan extract at 0.3 epsilon
    op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
                eps=eps).fit(X)

    # calculate dbscan labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    contingency = contingency_matrix(db.labels_, op.labels_)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree

    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
Example #5
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5,
                     random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_)
Example #6
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments
    # than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
Example #7
Source File: gmm.py From intro_ds with Apache License 2.0
def generateCaseTwo(n):
    """Randomly generate data whose clusters have different internal variances."""
    centers = [[-2, 0], [0, 2], [2, 4]]
    std = [0.1, 1, 0.2]
    data, _ = make_blobs(n_samples=n, centers=centers, cluster_std=std)
    return data
Example #8
Source File: kmeans.py From MachineLearning with BSD 3-Clause "New" or "Revised" License
def plot_kmeans():
    X, y = make_blobs(n_samples=300, centers=4,
                      random_state=0, cluster_std=0.60)
    y_pred = KMeans(4).fit(X).predict(X)
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    ax[0].scatter(X[:, 0], X[:, 1])
    ax[0].set_title('Input')
    # color the right panel by the KMeans cluster assignments
    # (the original passed c=y, the true labels, leaving y_pred unused)
    ax[1].scatter(X[:, 0], X[:, 1], c=y_pred)
    ax[1].set_title('Labels determined by K Means')
Example #9
Source File: kmeans_limitations.py From intro_ds with Apache License 2.0
def generateCaseTwo(n):
    """Randomly generate data whose clusters have different internal variances."""
    centers = [[-2, 0], [0, 2], [2, 4]]
    std = [0.1, 1, 0.2]
    data, _ = make_blobs(n_samples=n, centers=centers, cluster_std=std)
    return data
Example #10
Source File: kmeans.py From intro_ds with Apache License 2.0
def generateData(n):
    """Generate random clustering data."""
    centers = [[1, 1], [-1, -1]]
    X, _ = make_blobs(n_samples=n, centers=centers, cluster_std=0.5)
    return X
Example #11
Source File: gmm_choose_k.py From intro_ds with Apache License 2.0
def generateData(n):
    """Randomly generate data whose clusters have different internal variances."""
    centers = [[-2, 0], [0, 2], [2, 4]]
    std = [0.1, 1, 0.2]
    data, _ = make_blobs(n_samples=n, centers=centers, cluster_std=std)
    return data
Example #12
Source File: kmeans_choose_k.py From intro_ds with Apache License 2.0
def generateData(n):
    """Generate random clustering data with 3 cluster centers."""
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(n_samples=n, centers=centers, cluster_std=0.5)
    return X
Example #13
Source File: object_ranking_data_generator.py From cs-ranking with Apache License 2.0
def make_gp_non_transitive(
    self,
    n_instances=1000,
    n_objects=5,
    n_features=100,
    center_box=(-10.0, 10.0),
    cluster_std=2.0,
    seed=42,
    **kwd,
):
    n_samples = n_instances * n_objects
    random_state = check_random_state(seed=seed)
    x, y = make_blobs(
        n_samples=n_samples,
        centers=n_objects,
        n_features=n_features,
        cluster_std=cluster_std,
        center_box=center_box,
        random_state=random_state,
        shuffle=True,
    )
    y = np.array([y])
    samples = np.append(x, y.T, axis=1)
    samples = samples[samples[:, n_features].argsort()]
    pairwise_prob = create_pairwise_prob_matrix(n_objects)
    X = []
    Y = []
    for inst in range(n_instances):
        feature = np.array(
            [samples[inst + i * n_instances, 0:-1] for i in range(n_objects)]
        )
        matrix = np.random.binomial(1, pairwise_prob)
        objects = list(np.arange(n_objects))
        ordering = np.array(quicksort(objects, matrix))
        ranking = np.argsort(ordering)
        X.append(feature)
        Y.append(ranking)
    X = np.array(X)
    Y = np.array(Y)
    return X, Y
Example #14
Source File: test_spectral.py From twitter-stock-recommendation with MIT License
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver  # noqa

        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, eigen_solver="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers),
                      random_state=0, eigen_solver="amg")
Example #15
Source File: test_spectral.py From twitter-stock-recommendation with MIT License
def test_spectral_unknown_mode():
    # Test that SpectralClustering fails with an unknown mode set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, eigen_solver="<unknown>")
Example #16
Source File: test_spectral.py From twitter-stock-recommendation with MIT License
def test_spectral_unknown_assign_labels():
    # Test that SpectralClustering fails with an unknown assign_labels set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, assign_labels="<unknown>")
Example #17
Source File: test_spectral.py From twitter-stock-recommendation with MIT License
def test_spectral_clustering_sparse():
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    S = rbf_kernel(X, gamma=1)
    S = np.maximum(S - 1e-4, 0)
    S = sparse.coo_matrix(S)

    labels = SpectralClustering(random_state=0, n_clusters=2,
                                affinity='precomputed').fit(S).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
Example #18
Source File: test_k_means.py From twitter-stock-recommendation with MIT License
def test_elkan_results():
    rnd = np.random.RandomState(0)
    X_normal = rnd.normal(size=(50, 10))
    X_blobs, _ = make_blobs(random_state=0)
    km_full = KMeans(algorithm='full', n_clusters=5,
                     random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)
    for X in [X_normal, X_blobs]:
        km_full.fit(X)
        km_elkan.fit(X)
        assert_array_almost_equal(km_elkan.cluster_centers_,
                                  km_full.cluster_centers_)
        assert_array_equal(km_elkan.labels_, km_full.labels_)
Example #19
Source File: test_k_means.py From twitter-stock-recommendation with MIT License
def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42,
                                 init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
Example #20
Source File: sgd_separator.py From ESAC-stats-2014 with BSD 2-Clause "Simplified" License
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    # max_iter replaces the n_iter parameter used in the original source
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        max_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        # decision_function expects a 2D array of samples
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels,
               colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
Example #21
Source File: sgd_separator.py From MachineLearning with BSD 3-Clause "New" or "Revised" License
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    # max_iter replaces the n_iter parameter used in the original source
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        max_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        # decision_function expects a 2D array of samples
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels,
               colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, s=60)

    ax.axis('tight')
Example #22
Source File: sgd_separator.py From sklearn_pydata2015 with BSD 3-Clause "New" or "Revised" License
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    # max_iter replaces the n_iter parameter used in the original source
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        max_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        # decision_function expects a 2D array of samples
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels,
               colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
Example #23
Source File: gmm_plots.py From numpy-ml with GNU General Public License v3.0
def plot():
    fig, axes = plt.subplots(4, 4)
    fig.set_size_inches(10, 10)
    for i, ax in enumerate(axes.flatten()):
        n_ex = 150
        n_in = 2
        n_classes = np.random.randint(2, 4)
        X, y = make_blobs(
            n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=i
        )
        X -= X.mean(axis=0)

        # take best fit over 10 runs
        best_elbo = -np.inf
        for k in range(10):
            _G = GMM(C=n_classes, seed=i * 3)
            ret = _G.fit(X, max_iter=100, verbose=False)
            while ret != 0:
                print("Components collapsed; Refitting")
                ret = _G.fit(X, max_iter=100, verbose=False)

            if _G.best_elbo > best_elbo:
                best_elbo = _G.best_elbo
                G = _G

        ax = plot_clusters(G, X, ax)
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
        ax.set_title("# Classes: {}; Final VLB: {:.2f}".format(n_classes, G.best_elbo))

    plt.tight_layout()
    plt.savefig("img/plot.png", dpi=300)
    plt.close("all")
Example #24
Source File: lm_plots.py From numpy-ml with GNU General Public License v3.0
def random_classification_problem(n_ex, n_classes, n_in, seed=0):
    X, y = make_blobs(
        n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=seed
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    return X_train, y_train, X_test, y_test


#######################################################################
#                                Plots                                #
#######################################################################
Example #25
Source File: test_k_means.py From Mastering-Elasticsearch-7.0 with MIT License
def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42,
                                 init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)
Example #26
Source File: test_optics.py From Mastering-Elasticsearch-7.0 with MIT License
def test_close_extract():
    # Test extract where extraction eps is close to scaled max_eps
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # Compute OPTICS
    clust = OPTICS(max_eps=1.0, cluster_method='dbscan',
                   eps=0.3, min_samples=10).fit(X)
    # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
    assert_equal(max(clust.labels_), 2)
Example #27
Source File: test_optics.py From Mastering-Elasticsearch-7.0 with MIT License
def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    with pytest.warns(UserWarning, match=msg):
        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        clust.fit(X)
Example #28
Source File: test_optics.py From Mastering-Elasticsearch-7.0 with MIT License
def test_bad_extract():
    # Test an extraction of eps too close to original eps
    msg = "Specify an epsilon smaller than 0.15. Got 0.3."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # Compute OPTICS
    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method='dbscan',
                   eps=0.3, min_samples=10)
    assert_raise_message(ValueError, msg, clust.fit, X)
Example #29
Source File: test_mean_shift.py From Mastering-Elasticsearch-7.0 with MIT License
def test_parallel():
    centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
    X, _ = make_blobs(n_samples=50, n_features=2, centers=centers,
                      cluster_std=0.4, shuffle=True, random_state=11)

    ms1 = MeanShift(n_jobs=2)
    ms1.fit(X)

    ms2 = MeanShift()
    ms2.fit(X)

    assert_array_almost_equal(ms1.cluster_centers_, ms2.cluster_centers_)
    assert_array_equal(ms1.labels_, ms2.labels_)
Example #30
Source File: test_spectral.py From Mastering-Elasticsearch-7.0 with MIT License
def test_spectral_clustering_sparse():
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    S = rbf_kernel(X, gamma=1)
    S = np.maximum(S - 1e-4, 0)
    S = sparse.coo_matrix(S)

    labels = SpectralClustering(random_state=0, n_clusters=2,
                                affinity='precomputed').fit(S).labels_
    assert adjusted_rand_score(y, labels) == 1