Python sklearn.datasets.make_blobs() Examples

The following are 30 code examples of sklearn.datasets.make_blobs(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.datasets , or try the search function .
Example #1
Source File: test_confidence.py    From nussl with MIT License 7 votes vote down vote up
def test_js_divergence():
    n_samples = 1000
    blobs, _ = datasets.make_blobs(n_samples=n_samples, random_state=8)

    one_component_a = ml.cluster.GaussianMixture(1)
    one_component_b = ml.cluster.GaussianMixture(1)
    two_component = ml.cluster.GaussianMixture(2)

    one_component_a.fit(blobs)
    one_component_b.fit(blobs)
    two_component.fit(blobs)

    confidence_2v1 = ml.confidence.jensen_shannon_divergence(
        one_component_a, two_component)

    confidence_1v1 = ml.confidence.jensen_shannon_divergence(
        one_component_a, one_component_b)

    assert confidence_2v1 > confidence_1v1 
Example #2
Source File: test_samples_generator.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_make_blobs_error():
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])
    wrong_centers_msg = ("Length of `n_samples` not consistent "
                         "with number of centers. Got n_samples = {} "
                         "and centers = {}".format(n_samples, centers[:-1]))
    assert_raise_message(ValueError, wrong_centers_msg,
                         make_blobs, n_samples, centers=centers[:-1])
    wrong_std_msg = ("Length of `clusters_std` not consistent with "
                     "number of centers. Got centers = {} "
                     "and cluster_std = {}".format(centers, cluster_stds[:-1]))
    assert_raise_message(ValueError, wrong_std_msg,
                         make_blobs, n_samples,
                         centers=centers, cluster_std=cluster_stds[:-1])
    wrong_type_msg = ("Parameter `centers` must be array-like. "
                      "Got {!r} instead".format(3))
    assert_raise_message(ValueError, wrong_type_msg,
                         make_blobs, n_samples, centers=3) 
Example #3
Source File: helper.py    From practicalDataAnalysisCookbook with GNU General Public License v2.0 6 votes vote down vote up
def produce_XOR(sampleSize):
    import sklearn.datasets as dt

    # centers of the blobs
    centers = [(0,0),(3,0),(3,3),(0,3)]

    # create the sample
    x, y = dt.make_blobs(n_samples=sampleSize, n_features=2,
        cluster_std=0.8, centers=centers, shuffle=False
    )

    # and make it XOR like
    y[y == 2] = 0 
    y[y == 3] = 1

    return x, y 
Example #4
Source File: test_sparse.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_svc():
    """Check that sparse SVC gives the same result as SVC"""
    # many class dataset:
    X_blobs, y_blobs = make_blobs(n_samples=100, centers=10, random_state=0)
    X_blobs = sparse.csr_matrix(X_blobs)

    datasets = [[X_sp, Y, T], [X2_sp, Y2, T2],
                [X_blobs[:80], y_blobs[:80], X_blobs[80:]],
                [iris.data, iris.target, iris.data]]
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    for dataset in datasets:
        for kernel in kernels:
            clf = svm.SVC(gamma=1, kernel=kernel, probability=True,
                          random_state=0, decision_function_shape='ovo')
            sp_clf = svm.SVC(gamma=1, kernel=kernel, probability=True,
                             random_state=0, decision_function_shape='ovo')
            check_svm_model_equal(clf, sp_clf, *dataset) 
Example #5
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_grid_search_no_score():
    # Test grid-search on classifier that has no score function.
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    clf_no_score = LinearSVCNoScore(random_state=0)
    grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy')
    grid_search.fit(X, y)

    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs},
                                        scoring='accuracy')
    # smoketest grid search
    grid_search_no_score.fit(X, y)

    # check that best params are equal
    assert_equal(grid_search_no_score.best_params_, grid_search.best_params_)
    # check that we can call score and that it gives the correct result
    assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y))

    # giving no scoring function raises an error
    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs})
    assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
                         [[1]]) 
Example #6
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_unsupervised_grid_search():
    # test grid-search with unsupervised estimator
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)

    # Multi-metric evaluation unsupervised
    scoring = ['adjusted_rand_score', 'fowlkes_mallows_score']
    for refit in ['adjusted_rand_score', 'fowlkes_mallows_score']:
        grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                                   scoring=scoring, refit=refit)
        grid_search.fit(X, y)
        # Both ARI and FMS can find the right number :)
        assert_equal(grid_search.best_params_["n_clusters"], 3)

    # Single metric evaluation unsupervised
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                               scoring='fowlkes_mallows_score')
    grid_search.fit(X, y)
    assert_equal(grid_search.best_params_["n_clusters"], 3)

    # Now without a score, and without y
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]))
    grid_search.fit(X)
    assert_equal(grid_search.best_params_["n_clusters"], 4) 
Example #7
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_deprecated_grid_search_iid():
    depr_message = ("The default of the `iid` parameter will change from True "
                    "to False in version 0.22")
    X, y = make_blobs(n_samples=54, random_state=0, centers=2)
    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=3)
    # no warning with equally sized test sets
    assert_no_warnings(grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=5)
    # warning because 54 % 5 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=2)
    # warning because stratification into two classes and 27 % 2 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=KFold(2))
    # no warning because no stratification and 54 % 2 == 0
    assert_no_warnings(grid.fit, X, y) 
Example #8
Source File: test_discriminant_analysis.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_lda_coefs():
    # Test if the coefficients of the solvers are approximately the same.
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
    clf_lda_lsqr = LinearDiscriminantAnalysis(solver="lsqr")
    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")

    clf_lda_svd.fit(X, y)
    clf_lda_lsqr.fit(X, y)
    clf_lda_eigen.fit(X, y)

    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_lsqr.coef_, 1)
    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_eigen.coef_, 1)
    assert_array_almost_equal(clf_lda_eigen.coef_, clf_lda_lsqr.coef_, 1) 
Example #9
Source File: test_birch.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_partial_fit():
    # Test that fit is equivalent to calling partial_fit multiple times
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_almost_equal(brc_partial.subcluster_centers_,
                              brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_) 
Example #10
Source File: test_birch.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_n_clusters():
    # Test that n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert_greater(len(brc1.subcluster_centers_), 10)
    assert_equal(len(np.unique(brc1.labels_)), 10)

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    assert_raises(ValueError, brc3.fit, X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(ConvergenceWarning, brc4.fit, X) 
Example #11
Source File: test_birch.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_branching_factor():
    # Test that nodes have at max branching_factor number of subclusters
    X, y = make_blobs()
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    brc = Birch(n_clusters=None, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)
    brc = Birch(n_clusters=3, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)

    # Raises error when branching_factor is set to one.
    brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
    assert_raises(ValueError, brc.fit, X) 
Example #12
Source File: test_kmeans.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_fit_predict(self):
        """ Tests fit_predict."""
        x, y = make_blobs(n_samples=1500, random_state=170)
        x_filtered = np.vstack(
            (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))

        x_train = ds.array(x_filtered, block_size=(300, 2))

        kmeans = KMeans(n_clusters=3, random_state=170)
        labels = kmeans.fit_predict(x_train).collect()

        skmeans = SKMeans(n_clusters=3, random_state=170)
        sklabels = skmeans.fit_predict(x_filtered)

        centers = np.array([[-8.941375656533449, -5.481371322614891],
                            [-4.524023204953875, 0.06235042593214654],
                            [2.332994701667008, 0.37681003933082696]])

        self.assertTrue(np.allclose(centers, kmeans.centers))
        self.assertTrue(np.allclose(labels, sklabels)) 
Example #13
Source File: sequential_minimum_optimization.py    From Python with MIT License 6 votes vote down vote up
def test_linear_kernel(ax, cost):
    train_x, train_y = make_blobs(
        n_samples=500, centers=2, n_features=2, random_state=1
    )
    train_y[train_y == 0] = -1
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x, train_y)
    train_data = np.hstack((train_y.reshape(500, 1), train_x_scaled))
    mykernel = Kernel(kernel="linear", degree=5, coef0=1, gamma=0.5)
    mysvm = SmoSVM(
        train=train_data,
        kernel_func=mykernel,
        cost=cost,
        tolerance=0.001,
        auto_norm=False,
    )
    mysvm.fit()
    plot_partition_boundary(mysvm, train_data, ax=ax) 
Example #14
Source File: test_umap_trustworthiness.py    From umap with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_string_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels = np.array(["this", "that", "other"])[labels]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="string",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for" "blobs dataset: {}".format(trust),
    ) 
Example #15
Source File: test_dbscan.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_sparse(self):
        """ Tests that DBSCAN produces the same results with sparse and
        dense data.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        dbscan = DBSCAN(n_regions=1, eps=.15)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        x = StandardScaler().fit_transform(x)

        dense = ds.array(x, block_size=(300, 2))
        sparse = ds.array(csr_matrix(x), block_size=(300, 2))

        y_dense = dbscan.fit_predict(dense).collect()
        y_sparse = dbscan.fit_predict(sparse).collect()

        self.assertTrue(np.array_equal(y_dense, y_sparse)) 
Example #16
Source File: test_dbscan.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_n_clusters_aniso_dimensions(self):
        """ Tests that DBSCAN finds the correct number of clusters when
        dimensions is not None.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        dbscan = DBSCAN(n_regions=5, dimensions=[1], eps=.15)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        y_pred = dbscan.fit_predict(ds_x).collect()
        true_sizes = {19, 496, 491, 488, 6}
        cluster_sizes = {y_pred[y_pred == -1].size,
                         y_pred[y_pred == 0].size,
                         y_pred[y_pred == 1].size,
                         y_pred[y_pred == 2].size,
                         y_pred[y_pred == 3].size}

        self.assertEqual(dbscan.n_clusters, 4)
        self.assertEqual(true_sizes, cluster_sizes) 
Example #17
Source File: test_dbscan.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_n_clusters_aniso_grid(self):
        """ Tests that DBSCAN finds the correct number of clusters when
        setting n_regions > 1 with anisotropicly distributed data.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        dbscan = DBSCAN(n_regions=4, eps=.15, max_samples=500)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        y_pred = dbscan.fit_predict(ds_x).collect()
        true_sizes = {19, 496, 491, 488, 6}
        cluster_sizes = {y_pred[y_pred == -1].size,
                         y_pred[y_pred == 0].size,
                         y_pred[y_pred == 1].size,
                         y_pred[y_pred == 2].size,
                         y_pred[y_pred == 3].size}

        self.assertEqual(dbscan.n_clusters, 4)
        self.assertEqual(true_sizes, cluster_sizes) 
Example #18
Source File: test_dbscan.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_n_clusters_aniso_max_samples(self):
        """ Tests that DBSCAN finds the correct number of clusters when
        defining max_samples with anisotropicly distributed data.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        dbscan = DBSCAN(n_regions=1, eps=.15, max_samples=500)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        y_pred = dbscan.fit_predict(ds_x).collect()

        true_sizes = {19, 496, 491, 488, 6}
        cluster_sizes = {y_pred[y_pred == -1].size,
                         y_pred[y_pred == 0].size,
                         y_pred[y_pred == 1].size,
                         y_pred[y_pred == 2].size,
                         y_pred[y_pred == 3].size}

        self.assertEqual(dbscan.n_clusters, 4)
        self.assertEqual(true_sizes, cluster_sizes) 
Example #19
Source File: test_dbscan.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_n_clusters_aniso(self):
        """ Tests that DBSCAN finds the correct number of clusters with
        anisotropicly distributed data.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        dbscan = DBSCAN(n_regions=1, eps=.15)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        y_pred = dbscan.fit_predict(ds_x).collect()
        true_sizes = {19, 496, 491, 488, 6}
        cluster_sizes = {y_pred[y_pred == -1].size,
                         y_pred[y_pred == 0].size,
                         y_pred[y_pred == 1].size,
                         y_pred[y_pred == 2].size,
                         y_pred[y_pred == 3].size}

        self.assertEqual(dbscan.n_clusters, 4)
        self.assertEqual(true_sizes, cluster_sizes) 
Example #20
Source File: test_preproc.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_irregular(self):
        """ Test with an irregular array """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        ds_arr = ds.array(x, block_size=(300, 2))
        ds_arr = ds_arr[297:602]
        x = x[297:602]

        sc1 = SKScaler()
        scaled_x = sc1.fit_transform(x)
        sc2 = StandardScaler()
        ds_scaled = sc2.fit_transform(ds_arr)

        self.assertTrue(np.allclose(scaled_x, ds_scaled.collect()))
        self.assertTrue(np.allclose(sc1.mean_, sc2.mean_.collect()))
        self.assertTrue(np.allclose(sc1.var_, sc2.var_.collect()))
        self.assertEqual(ds_scaled._top_left_shape,
                         compss_wait_on(ds_scaled._blocks[0][0]).shape)
        self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape)
        self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape)
        self.assertEqual(ds_arr.shape, ds_scaled.shape)
        self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks) 
Example #21
Source File: test_preproc.py    From dislib with Apache License 2.0 6 votes vote down vote up
def test_fit_transform(self):
        """ Tests fit_transform against scikit-learn.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        ds_arr = ds.array(x, block_size=(300, 2))

        sc1 = SKScaler()
        scaled_x = sc1.fit_transform(x)
        sc2 = StandardScaler()
        ds_scaled = sc2.fit_transform(ds_arr)

        self.assertTrue(np.allclose(scaled_x, ds_scaled.collect()))
        self.assertTrue(np.allclose(sc1.mean_, sc2.mean_.collect()))
        self.assertTrue(np.allclose(sc1.var_, sc2.var_.collect()))
        self.assertEqual(ds_scaled._top_left_shape,
                         ds_scaled._blocks[0][0].shape)
        self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape)
        self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape)
        self.assertEqual(ds_arr.shape, ds_scaled.shape)
        self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks) 
Example #22
Source File: estimator_checks.py    From Splunking-Crime with GNU Affero General Public License v3.0 6 votes vote down vote up
def check_decision_proba_consistency(name, estimator_orig):
    # Check whether an estimator having both decision_function and
    # predict_proba methods has outputs with perfect rank correlation.

    centers = [(2, 2), (4, 4)]
    X, y = make_blobs(n_samples=100, random_state=0, n_features=4,
                      centers=centers, cluster_std=1.0, shuffle=True)
    X_test = np.random.randn(20, 2) + 4
    estimator = clone(estimator_orig)

    if (hasattr(estimator, "decision_function") and
            hasattr(estimator, "predict_proba")):

        estimator.fit(X, y)
        a = estimator.predict_proba(X_test)[:, 1]
        b = estimator.decision_function(X_test)
        assert_array_equal(rankdata(a), rankdata(b)) 
Example #23
Source File: estimator_checks.py    From Splunking-Crime with GNU Affero General Public License v3.0 6 votes vote down vote up
def check_estimators_partial_fit_n_features(name, estimator_orig):
    # check if number of features changes between calls to partial_fit.
    if not hasattr(estimator_orig, 'partial_fit'):
        return
    estimator = clone(estimator_orig)
    X, y = make_blobs(n_samples=50, random_state=1)
    X -= X.min()

    try:
        if is_classifier(estimator):
            classes = np.unique(y)
            estimator.partial_fit(X, y, classes=classes)
        else:
            estimator.partial_fit(X, y)
    except NotImplementedError:
        return

    assert_raises(ValueError, estimator.partial_fit, X[:, :-1], y) 
Example #24
Source File: test_umap_trustworthiness.py    From umap with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_discrete_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for" "blobs dataset: {}".format(trust),
    ) 
Example #25
Source File: test_umap_trustworthiness.py    From umap with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="l1",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for" "blobs dataset: {}".format(trust),
    ) 
Example #26
Source File: estimator_checks.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def check_estimators_fit_returns_self(name, estimator_orig):
    """Check if self is returned when calling fit"""
    X, y = make_blobs(random_state=0, n_samples=9, n_features=4)
    # some want non-negative input
    X -= X.min()

    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    set_random_state(estimator)

    assert_true(estimator.fit(X, y) is estimator) 
Example #27
Source File: test_t_sne.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_optimization_minimizes_kl_divergence():
    """t-SNE should give a lower KL divergence with more iterations."""
    random_state = check_random_state(0)
    X, _ = make_blobs(n_features=3, random_state=random_state)
    kl_divergences = []
    for n_iter in [250, 300, 350]:
        tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
                    n_iter=n_iter, random_state=0)
        tsne.fit_transform(X)
        kl_divergences.append(tsne.kl_divergence_)
    assert_less_equal(kl_divergences[1], kl_divergences[0])
    assert_less_equal(kl_divergences[2], kl_divergences[1]) 
Example #28
Source File: estimator_checks.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def check_clusterer_compute_labels_predict(name, clusterer_orig):
    """Check that predict is invariant of compute_labels"""
    X, y = make_blobs(n_samples=20, random_state=0)
    clusterer = clone(clusterer_orig)

    if hasattr(clusterer, "compute_labels"):
        # MiniBatchKMeans
        if hasattr(clusterer, "random_state"):
            clusterer.set_params(random_state=0)

        X_pred1 = clusterer.fit(X).predict(X)
        clusterer.set_params(compute_labels=False)
        X_pred2 = clusterer.fit(X).predict(X)
        assert_array_equal(X_pred1, X_pred2) 
Example #29
Source File: benchmarks.py    From kepler-mapper with MIT License 5 votes vote down vote up
def profile():
    num_sets = 100
    blob_size = 1000
    nr_cubes = 10
    overlap = 0.2

    blob_list = []
    for i in range(num_sets):
        data, _ = datasets.make_blobs(blob_size)
        blob_list.append(data)

    mapper = KeplerMapper(verbose=0)

    pr = cProfile.Profile()
    pr.enable()

    for data in blob_list:
        lens = mapper.fit_transform(data)
        graph = mapper.map(lens, data, nr_cubes=nr_cubes, overlap_perc=overlap)

    pr.disable()
    s = io.StringIO()
    sortby = "cumulative"
    ps = pstats.Stats(pr, stream=s).strip_dirs().sort_stats(sortby)
    ps.print_stats("kmapper")
    print(
        "Ran {} blobs of size {} with params (nr_cubes:{}\toverlap:{})".format(
            num_sets, blob_size, nr_cubes, overlap
        )
    )
    print(s.getvalue()) 
Example #30
Source File: estimator_checks.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def check_classifiers_classes(name, classifier_orig):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        classifier = clone(classifier_orig)
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_random_state(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))