Python Examples of sklearn.datasets.make

Source File: test_confidence.py From nussl with MIT License

7 votes

def test_js_divergence():
    n_samples = 1000
    blobs, _ = datasets.make_blobs(n_samples=n_samples, random_state=8)

    one_component_a = ml.cluster.GaussianMixture(1)
    one_component_b = ml.cluster.GaussianMixture(1)
    two_component = ml.cluster.GaussianMixture(2)

    one_component_a.fit(blobs)
    one_component_b.fit(blobs)
    two_component.fit(blobs)

    confidence_2v1 = ml.confidence.jensen_shannon_divergence(
        one_component_a, two_component)

    confidence_1v1 = ml.confidence.jensen_shannon_divergence(
        one_component_a, one_component_b)

    assert confidence_2v1 > confidence_1v1

Source File: test_samples_generator.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_make_blobs_error():
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])
    wrong_centers_msg = ("Length of `n_samples` not consistent "
                         "with number of centers. Got n_samples = {} "
                         "and centers = {}".format(n_samples, centers[:-1]))
    assert_raise_message(ValueError, wrong_centers_msg,
                         make_blobs, n_samples, centers=centers[:-1])
    wrong_std_msg = ("Length of `clusters_std` not consistent with "
                     "number of centers. Got centers = {} "
                     "and cluster_std = {}".format(centers, cluster_stds[:-1]))
    assert_raise_message(ValueError, wrong_std_msg,
                         make_blobs, n_samples,
                         centers=centers, cluster_std=cluster_stds[:-1])
    wrong_type_msg = ("Parameter `centers` must be array-like. "
                      "Got {!r} instead".format(3))
    assert_raise_message(ValueError, wrong_type_msg,
                         make_blobs, n_samples, centers=3)

Source File: helper.py From practicalDataAnalysisCookbook with GNU General Public License v2.0

6 votes

def produce_XOR(sampleSize):
    import sklearn.datasets as dt

    # centers of the blobs
    centers = [(0,0),(3,0),(3,3),(0,3)]

    # create the sample
    x, y = dt.make_blobs(n_samples=sampleSize, n_features=2,
        cluster_std=0.8, centers=centers, shuffle=False
    )

    # and make it XOR like
    y[y == 2] = 0 
    y[y == 3] = 1

    return x, y

Source File: test_sparse.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_svc():
    """Check that sparse SVC gives the same result as SVC"""
    # many class dataset:
    X_blobs, y_blobs = make_blobs(n_samples=100, centers=10, random_state=0)
    X_blobs = sparse.csr_matrix(X_blobs)

    datasets = [[X_sp, Y, T], [X2_sp, Y2, T2],
                [X_blobs[:80], y_blobs[:80], X_blobs[80:]],
                [iris.data, iris.target, iris.data]]
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    for dataset in datasets:
        for kernel in kernels:
            clf = svm.SVC(gamma=1, kernel=kernel, probability=True,
                          random_state=0, decision_function_shape='ovo')
            sp_clf = svm.SVC(gamma=1, kernel=kernel, probability=True,
                             random_state=0, decision_function_shape='ovo')
            check_svm_model_equal(clf, sp_clf, *dataset)

Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_grid_search_no_score():
    # Test grid-search on classifier that has no score function.
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    clf_no_score = LinearSVCNoScore(random_state=0)
    grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy')
    grid_search.fit(X, y)

    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs},
                                        scoring='accuracy')
    # smoketest grid search
    grid_search_no_score.fit(X, y)

    # check that best params are equal
    assert_equal(grid_search_no_score.best_params_, grid_search.best_params_)
    # check that we can call score and that it gives the correct result
    assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y))

    # giving no scoring function raises an error
    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs})
    assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
                         [[1]])

Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_unsupervised_grid_search():
    # test grid-search with unsupervised estimator
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)

    # Multi-metric evaluation unsupervised
    scoring = ['adjusted_rand_score', 'fowlkes_mallows_score']
    for refit in ['adjusted_rand_score', 'fowlkes_mallows_score']:
        grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                                   scoring=scoring, refit=refit)
        grid_search.fit(X, y)
        # Both ARI and FMS can find the right number :)
        assert_equal(grid_search.best_params_["n_clusters"], 3)

    # Single metric evaluation unsupervised
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                               scoring='fowlkes_mallows_score')
    grid_search.fit(X, y)
    assert_equal(grid_search.best_params_["n_clusters"], 3)

    # Now without a score, and without y
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]))
    grid_search.fit(X)
    assert_equal(grid_search.best_params_["n_clusters"], 4)

Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_deprecated_grid_search_iid():
    depr_message = ("The default of the `iid` parameter will change from True "
                    "to False in version 0.22")
    X, y = make_blobs(n_samples=54, random_state=0, centers=2)
    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=3)
    # no warning with equally sized test sets
    assert_no_warnings(grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=5)
    # warning because 54 % 5 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=2)
    # warning because stratification into two classes and 27 % 2 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=KFold(2))
    # no warning because no stratification and 54 % 2 == 0
    assert_no_warnings(grid.fit, X, y)

Source File: test_discriminant_analysis.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_lda_coefs():
    # Test if the coefficients of the solvers are approximately the same.
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
    clf_lda_lsqr = LinearDiscriminantAnalysis(solver="lsqr")
    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")

    clf_lda_svd.fit(X, y)
    clf_lda_lsqr.fit(X, y)
    clf_lda_eigen.fit(X, y)

    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_lsqr.coef_, 1)
    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_eigen.coef_, 1)
    assert_array_almost_equal(clf_lda_eigen.coef_, clf_lda_lsqr.coef_, 1)

Source File: test_birch.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_partial_fit():
    # Test that fit is equivalent to calling partial_fit multiple times
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_almost_equal(brc_partial.subcluster_centers_,
                              brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)

Source File: test_birch.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_n_clusters():
    # Test that n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert_greater(len(brc1.subcluster_centers_), 10)
    assert_equal(len(np.unique(brc1.labels_)), 10)

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    assert_raises(ValueError, brc3.fit, X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(ConvergenceWarning, brc4.fit, X)

Source File: test_birch.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_branching_factor():
    # Test that nodes have at max branching_factor number of subclusters
    X, y = make_blobs()
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    brc = Birch(n_clusters=None, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)
    brc = Birch(n_clusters=3, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)

    # Raises error when branching_factor is set to one.
    brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
    assert_raises(ValueError, brc.fit, X)

Source File: test_kmeans.py From dislib with Apache License 2.0

6 votes

def test_fit_predict(self):
        """ Tests fit_predict."""
        x, y = make_blobs(n_samples=1500, random_state=170)
        x_filtered = np.vstack(
            (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))

        x_train = ds.array(x_filtered, block_size=(300, 2))

        kmeans = KMeans(n_clusters=3, random_state=170)
        labels = kmeans.fit_predict(x_train).collect()

        skmeans = SKMeans(n_clusters=3, random_state=170)
        sklabels = skmeans.fit_predict(x_filtered)

        centers = np.array([[-8.941375656533449, -5.481371322614891],
                            [-4.524023204953875, 0.06235042593214654],
                            [2.332994701667008, 0.37681003933082696]])

        self.assertTrue(np.allclose(centers, kmeans.centers))
        self.assertTrue(np.allclose(labels, sklabels))

Source File: sequential_minimum_optimization.py From Python with MIT License

6 votes

def test_linear_kernel(ax, cost):
    train_x, train_y = make_blobs(
        n_samples=500, centers=2, n_features=2, random_state=1
    )
    train_y[train_y == 0] = -1
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x, train_y)
    train_data = np.hstack((train_y.reshape(500, 1), train_x_scaled))
    mykernel = Kernel(kernel="linear", degree=5, coef0=1, gamma=0.5)
    mysvm = SmoSVM(
        train=train_data,
        kernel_func=mykernel,
        cost=cost,
        tolerance=0.001,
        auto_norm=False,
    )
    mysvm.fit()
    plot_partition_boundary(mysvm, train_data, ax=ax)

Source File: test_umap_trustworthiness.py From umap with BSD 3-Clause "New" or "Revised" License

6 votes

def test_string_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels = np.array(["this", "that", "other"])[labels]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="string",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for" "blobs dataset: {}".format(trust),
    )