Python Examples of sklearn.datasets.load_digits

Source File: label_digits.py From libact with BSD 2-Clause "Simplified" License

8 votes

def split_train_test(n_classes):
    """Load the digits data and build a partially labeled train set plus a test set.

    The data is split 67/33 at random, and the split is redrawn until the
    first ``n_labeled`` training samples cover every one of the
    ``n_classes`` classes.

    Parameters
    ----------
    n_classes : int
        Number of digit classes to load (forwarded to ``load_digits`` as
        ``n_class``).

    Returns
    -------
    trn_ds : Dataset
        Training set in which only the first ``n_labeled`` labels are kept;
        the remaining labels are replaced by ``None``.
    tst_ds : Dataset
        Fully labeled test set.
    digits
        The object returned by ``load_digits``.

    Raises
    ------
    ValueError
        If ``n_classes`` exceeds the number of initially labeled samples, in
        which case no split could ever satisfy the coverage condition.
    """
    from sklearn.datasets import load_digits

    n_labeled = 5
    if n_classes > n_labeled:
        # n_labeled samples can hold at most n_labeled distinct classes, so
        # the resampling loop below would otherwise never terminate.
        raise ValueError(
            "n_classes (%d) must not exceed the number of initially "
            "labeled samples (%d)" % (n_classes, n_labeled))

    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    # Redraw the split until the labeled prefix contains every class.
    while True:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)
        if len(np.unique(y_train[:n_labeled])) >= n_classes:
            break

    # Keep the first n_labeled labels and pad the rest with None
    # (presumably how Dataset marks unlabeled entries -- confirm).
    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits

Source File: train_model.py From production-tools with BSD 3-Clause "New" or "Revised" License

6 votes

def get_mnist_data():
    """Load the digits data set returned by ``load_digits`` into memory.

    Returns
    -------
    X : array-like
        The ``data`` attribute of the loaded data set.

    y : array-like
        The ``target`` attribute after being passed through
        ``LabelBinarizer().fit_transform`` (presumably a one-hot
        encoding with one column per class -- confirm).
    """
    bunch = load_digits()
    encoded_targets = LabelBinarizer().fit_transform(bunch.target)
    return bunch.data, encoded_targets

Source File: test_pca.py From mars with Apache License 2.0

6 votes

def test_pca_score_with_different_solvers(self):
        """Check that PCA scores agree between the 'full' and 'randomized' solvers."""
        X_digits = mt.tensor(datasets.load_digits().data)

        # One estimator per solver listed on the test case.
        fitted = {}
        for solver in self.solver_list:
            fitted[solver] = PCA(n_components=30, svd_solver=solver,
                                 random_state=0)

        for model in fitted.values():
            model.fit(X_digits)
            # Sanity check for the noise_variance_. For more details see
            # https://github.com/scikit-learn/scikit-learn/issues/7568
            # https://github.com/scikit-learn/scikit-learn/issues/8541
            # https://github.com/scikit-learn/scikit-learn/issues/8544
            variance_gap = model.explained_variance_ - model.noise_variance_
            assert mt.all(variance_gap >= 0).to_numpy()

        # Compare scores with different svd_solvers
        scores = {}
        for solver, model in fitted.items():
            scores[solver] = model.score(X_digits).to_numpy()
        assert_almost_equal(scores['full'], scores['randomized'], decimal=3)

Source File: random_forest.py From ML-From-Scratch with MIT License

6 votes

def main():
    """Train a RandomForest on the digits data, print its accuracy and plot the predictions."""
    digits = datasets.load_digits()

    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, test_size=0.4, seed=2)

    forest = RandomForest(n_estimators=100)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    Plot().plot_in_2d(
        X_test,
        predictions,
        title="Random Forest",
        accuracy=accuracy,
        legend_labels=digits.target_names,
    )

Source File: multilayer_perceptron.py From ML-From-Scratch with MIT License

6 votes

def main():
    """Train a MultilayerPerceptron on the digits data, print its accuracy and plot the predictions."""
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    # Convert the nominal y values to binary
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, seed=1)

    # MLP
    clf = MultilayerPerceptron(n_hidden=16,
        n_iterations=1000,
        learning_rate=0.01)

    clf.fit(X_train, y_train)
    # Collapse the per-class outputs / encoded targets back to class indices.
    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results.
    # The legend uses the data set's class names: at this point `y` holds the
    # encoded (binary) targets, so np.unique(y) would not list the classes.
    Plot().plot_in_2d(X_test, y_pred, title="Multilayer Perceptron", accuracy=accuracy, legend_labels=data.target_names)

Source File: naive_bayes.py From ML-From-Scratch with MIT License

6 votes

def main():
    """Train a NaiveBayes classifier on the normalized digits data, print its accuracy and plot the predictions."""
    digits = datasets.load_digits()
    features = normalize(digits.data)

    X_train, X_test, y_train, y_test = train_test_split(
        features, digits.target, test_size=0.4)

    model = NaiveBayes()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(
        X_test,
        predictions,
        title="Naive Bayes",
        accuracy=accuracy,
        legend_labels=digits.target_names,
    )

Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def test_train_test_split(self):
        """Check that ModelFrame.model_selection.train_test_split keeps columns, sizes and target_name."""

        def assert_same_columns(frame, *parts):
            # Every part must expose exactly the columns of the source frame.
            for part in parts:
                tm.assert_index_equal(frame.columns, part.columns)

        df = pdml.ModelFrame(datasets.load_digits())
        self.assertIsInstance(df, pdml.ModelFrame)

        train_df, test_df = df.model_selection.train_test_split()
        assert_same_columns(df, train_df, test_df)

        # No rows are lost and the column count is unchanged.
        self.assertEqual(len(df), len(train_df) + len(test_df))
        for part in (train_df, test_df):
            self.assertEqual(df.shape[1], part.shape[1])

        assert_same_columns(df, train_df, test_df)

        # A renamed target column must be carried over to both parts.
        df = pdml.ModelFrame(datasets.load_digits())
        df.target_name = 'xxx'

        train_df, test_df = df.model_selection.train_test_split()
        assert_same_columns(df, train_df, test_df)
        for part in (train_df, test_df):
            self.assertEqual(part.target_name, 'xxx')

Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def test_validation_curve(self):
        """Check that ModelFrame.model_selection.validation_curve matches ms.validation_curve."""
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        param_range = np.logspace(-2, -1, 2)

        svc = df.svm.SVC(random_state=self.random_state)
        result = df.model_selection.validation_curve(svc, 'gamma',
                                                     param_range)
        # param_name / param_range are passed by keyword: recent scikit-learn
        # releases accept them only as keywords, and older ones accept both
        # forms (assumes `ms` is sklearn.model_selection -- confirm).
        expected = ms.validation_curve(svm.SVC(random_state=self.random_state),
                                       digits.data, digits.target,
                                       param_name='gamma',
                                       param_range=param_range)

        # Two score arrays are compared element-wise against the reference.
        self.assertEqual(len(result), 2)
        self.assert_numpy_array_almost_equal(result[0], expected[0])
        self.assert_numpy_array_almost_equal(result[1], expected[1])

Source File: test_metrics.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def setup_method(self):
        """Fit a LinearSVC through the ModelFrame and directly, storing reference outputs on self."""
        import sklearn.svm as svm

        digits = datasets.load_digits()
        self.data, self.target = digits.data, digits.target
        self.df = pdml.ModelFrame(digits)

        # Estimator fitted through the ModelFrame accessor.
        frame_estimator = self.df.svm.LinearSVC(C=1.0,
                                                random_state=self.random_state)
        self.df.fit(frame_estimator)

        # Same estimator fitted directly on the raw arrays, used as reference.
        reference = svm.LinearSVC(C=1.0, random_state=self.random_state)
        reference.fit(self.data, self.target)
        self.pred = reference.predict(self.data)
        self.decision = reference.decision_function(self.data)

        # argument for classification reports (labels 9 down to 0)
        self.labels = np.arange(9, -1, -1)

Source File: test_sklearn_feature_union.py From sklearn-onnx with MIT License

6 votes

def test_feature_union_transformer_weights_2(self):
        """Convert a FeatureUnion fitted on float32 digits data and dump it for comparison."""
        digits = load_digits()
        features = digits.data.astype(np.float32)
        X_train, X_test, *_ = train_test_split(
            features, digits.target, test_size=0.5, random_state=42)

        # NOTE(review): the weight keys ('pca1', 'svd2') do not match the
        # transformer names ('pca', 'svd'); presumably intentional -- confirm.
        union = FeatureUnion(
            [('pca', PCA()), ('svd', TruncatedSVD())],
            transformer_weights={'pca1': 10, 'svd2': 3})
        model = union.fit(X_train)

        input_types = [('input', FloatTensorType([None, X_test.shape[1]]))]
        model_onnx = convert_sklearn(model, 'feature union', input_types)
        self.assertTrue(model_onnx is not None)

        dump_data_and_model(
            X_test,
            model,
            model_onnx,
            basename="SklearnFeatureUnionTransformerWeights2-Dec4",
            allow_failure=(
                "StrictVersion(onnxruntime.__version__)"
                "<= StrictVersion('0.2.1')"
            ),
        )

Source File: test_sklearn_feature_union.py From sklearn-onnx with MIT License

6 votes

def test_feature_union_transformer_weights_1(self):
        """Convert a weighted FeatureUnion fitted on int64 digits data and dump it for comparison."""
        digits = load_digits()
        features = digits.data.astype(np.int64)
        X_train, X_test, *_ = train_test_split(
            features, digits.target, test_size=0.5, random_state=42)

        union = FeatureUnion(
            [('pca', PCA()), ('svd', TruncatedSVD())],
            transformer_weights={'pca': 10, 'svd': 3})
        model = union.fit(X_train)

        input_types = [('input', Int64TensorType([None, X_test.shape[1]]))]
        model_onnx = convert_sklearn(model, 'feature union', input_types)
        self.assertTrue(model_onnx is not None)

        dump_data_and_model(
            X_test,
            model,
            model_onnx,
            basename="SklearnFeatureUnionTransformerWeights1-Dec4",
            allow_failure=(
                "StrictVersion(onnxruntime.__version__)"
                "<= StrictVersion('0.2.1')"
            ),
        )

Source File: test_sklearn_calibrated_classifier_cv_converter.py From sklearn-onnx with MIT License

6 votes

def test_model_calibrated_classifier_cv_int(self):
        """Convert a sigmoid-calibrated MultinomialNB declared with an int64 input and dump it."""
        digits = load_digits()
        X, y = digits.data, digits.target

        base_clf = MultinomialNB().fit(X, y)
        calibrated = CalibratedClassifierCV(base_clf, cv=2, method="sigmoid")
        model = calibrated.fit(X, y)

        input_types = [("input", Int64TensorType([None, X.shape[1]]))]
        model_onnx = convert_sklearn(
            model,
            "scikit-learn CalibratedClassifierCVMNB",
            input_types,
            target_opset=TARGET_OPSET,
        )
        self.assertTrue(model_onnx is not None)

        dump_data_and_model(
            X.astype(np.int64),
            model,
            model_onnx,
            basename="SklearnCalibratedClassifierCVInt-Dec4",
            allow_failure=(
                "StrictVersion(onnxruntime.__version__)"
                "<= StrictVersion('0.2.1')"
            ),
        )

Source File: test_sklearn_k_means_converter.py From sklearn-onnx with MIT License

6 votes

def test_batchkmeans_clustering_int(self):
        """Convert a 4-cluster MiniBatchKMeans declared with an int64 input and dump it."""
        X = load_digits().data

        model = MiniBatchKMeans(n_clusters=4)
        model.fit(X)

        input_types = [("input", Int64TensorType([None, X.shape[1]]))]
        model_onnx = convert_sklearn(model, "kmeans", input_types,
                                     target_opset=TARGET_OPSET)
        self.assertIsNotNone(model_onnx)

        # Only rows 40..59 are used as the comparison sample.
        sample = X.astype(numpy.int64)[40:60]
        dump_data_and_model(
            sample,
            model,
            model_onnx,
            basename="SklearnBatchKMeansInt-Dec4",
            allow_failure=(
                "StrictVersion(onnx.__version__) < StrictVersion('1.2') or "
                "StrictVersion(onnxruntime.__version__) "
                "<= StrictVersion('0.2.1')"
            ),
        )

Source File: test_sklearn_k_means_converter.py From sklearn-onnx with MIT License

6 votes

def test_kmeans_clustering_int(self):
        """Convert a 4-cluster KMeans declared with an int64 input and dump it."""
        X = load_digits().data

        model = KMeans(n_clusters=4)
        model.fit(X)

        input_types = [("input", Int64TensorType([None, X.shape[1]]))]
        model_onnx = convert_sklearn(model, "kmeans", input_types,
                                     target_opset=TARGET_OPSET)
        self.assertIsNotNone(model_onnx)

        # Only rows 40..59 are used as the comparison sample.
        sample = X.astype(numpy.int64)[40:60]
        dump_data_and_model(
            sample,
            model,
            model_onnx,
            basename="SklearnKMeansInt-Dec4",
            # Operator gemm is not implemented in onnxruntime
            allow_failure=(
                "StrictVersion(onnx.__version__) < StrictVersion('1.2') or "
                "StrictVersion(onnxruntime.__version__) "
                "<= StrictVersion('0.2.1')"
            ),
        )

Source File: test_topology_prune.py From sklearn-onnx with MIT License

6 votes

def test_dummy_identity(self):
        """Check that converting a pipeline of two identity steps yields exactly two Identity nodes."""

        digits = datasets.load_digits(n_class=6)
        # A 20-sample subset is used for the fit.
        Xd, yd = digits.data[:20], digits.target[:20]
        _, n_features = Xd.shape

        idtr = make_pipeline(IdentityTransformer(), identity())
        idtr.fit(Xd, yd)

        # Register the dummy shape calculator / converter for both custom steps.
        for estimator_type, alias in ((IdentityTransformer, "IdentityTransformer"),
                                      (identity, "identity")):
            update_registered_converter(estimator_type, alias,
                                        dummy_shape_calculator, dummy_converter)

        input_types = [("input", FloatTensorType([None, n_features]))]
        model_onnx = convert_sklearn(
            idtr,
            "idtr",
            input_types,
            target_opset=TARGET_OPSET)

        identity_nodes = []
        for node in model_onnx.graph.node:
            if node.op_type == "Identity":
                identity_nodes.append(node)
        assert len(identity_nodes) == 2

Source File: test_sklearn_pca_converter.py From sklearn-onnx with MIT License

6 votes

def test_pca_default_int_randomised(self):
        """Convert a randomized-solver PCA declared with an int64 input and dump it."""
        digits = load_digits()
        X_train, X_test, *_ = train_test_split(
            digits.data, digits.target, test_size=0.2, random_state=42)

        pca = PCA(random_state=42, svd_solver='randomized', iterated_power=3)
        model = pca.fit(X_train)

        input_types = [("input", Int64TensorType([None, X_test.shape[1]]))]
        model_onnx = convert_sklearn(model, initial_types=input_types)
        self.assertTrue(model_onnx is not None)

        dump_data_and_model(
            X_test.astype(np.int64),
            model,
            model_onnx,
            basename="SklearnPCADefaultIntRandomised",
            allow_failure=(
                "StrictVersion(onnxruntime.__version__)"
                "<= StrictVersion('0.2.1')"
            ),
        )

Source File: test_rpforest.py From rpforest with Apache License 2.0

6 votes

def _get_mnist_data(seed=None):
    """Return shuffled, flattened digit images split into train and test parts.

    The images from ``load_digits`` are flattened to one row per image and
    shuffled with a ``RandomState`` built from ``seed``; the first 100 rows
    form the test part and the remaining rows the train part.

    Returns a ``(X_train, X_test)`` tuple.
    """
    images = load_digits()["images"]

    # RandomState(seed=None) is the same as RandomState() with no argument.
    rng = np.random.RandomState(seed=seed)

    n_images, n_rows, n_cols = images.shape
    flat = np.ascontiguousarray(images.reshape((n_images, n_rows * n_cols)))
    rng.shuffle(flat)

    n_test = 100
    return flat[n_test:], flat[:n_test]

Source File: datasets.py From pyDML with GNU General Public License v3.0

6 votes

def digits_reduced():
    """Return a 3-feature summary of the digits data together with its targets.

    Each sample is reshaped to an 8x8 image and reduced to three values:
    ``simetria_hor(image)``, ``simetria_ver(image)`` (presumably horizontal
    and vertical symmetry scores -- confirm against their definitions) and
    the mean of the image.

    Returns
    -------
    X : ndarray of shape (n_samples, 3)
    y : the ``'target'`` entry of the loaded data set
    """
    data = load_digits()
    XX = data['data']
    y = data['target']
    nn, _ = XX.shape
    XX = XX.reshape([nn, 8, 8])

    X = np.empty([nn, 3])
    # `range` instead of the Python-2-only `xrange`, so this runs on
    # Python 3 as well.
    for i in range(nn):
        image = XX[i, :, :]
        X[i, 0] = simetria_hor(image)
        X[i, 1] = simetria_ver(image)
        X[i, 2] = np.mean(image)

    return X, y

### ARFF dataframes ###

Source File: datasets.py From pyDML with GNU General Public License v3.0

6 votes

def digits_reduced():
    """Return a 3-feature summary of the digits data together with its targets.

    Each sample is reshaped to an 8x8 image and reduced to three values:
    ``simetria_hor(image)``, ``simetria_ver(image)`` (presumably horizontal
    and vertical symmetry scores -- confirm against their definitions) and
    the mean of the image.

    Returns
    -------
    X : ndarray of shape (n_samples, 3)
    y : the ``'target'`` entry of the loaded data set
    """
    data = load_digits()
    XX = data['data']
    y = data['target']
    nn, _ = XX.shape
    XX = XX.reshape([nn, 8, 8])

    X = np.empty([nn, 3])
    # `range` instead of the Python-2-only `xrange`, so this runs on
    # Python 3 as well.
    for i in range(nn):
        image = XX[i, :, :]
        X[i, 0] = simetria_hor(image)
        X[i, 1] = simetria_ver(image)
        X[i, 2] = np.mean(image)

    return X, y

### ARFF dataframes ###

Source File: test_model_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

5 votes

def test_cross_val_score(self):
        """Check that ModelFrame.model_selection.cross_val_score matches ms.cross_val_score."""
        import sklearn.svm as svm

        digits = datasets.load_digits()
        frame = pdml.ModelFrame(digits)
        estimator = svm.SVC(kernel=str('linear'), C=1)

        # Same estimator and fold count through the accessor and directly.
        result = frame.model_selection.cross_val_score(estimator, cv=5)
        expected = ms.cross_val_score(
            estimator, X=digits.data, y=digits.target, cv=5)

        self.assert_numpy_array_almost_equal(result, expected)

Source File: datasets.py From pyDML with GNU General Public License v3.0

5 votes

def digits(numbers=None):
    """Return the digits samples whose target is one of ``numbers``.

    Parameters
    ----------
    numbers : iterable of int, optional
        Digit classes to keep. ``None`` (the default) keeps all ten
        digits 0-9.

    Returns
    -------
    X : rows of the ``'data'`` entry whose target is in ``numbers``
    y : the matching entries of ``'target'``
    """
    data = load_digits()     # DIGITS
    X = data['data']
    y = data['target']

    if numbers is None:
        numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    # Row indices of the samples whose label is among the requested digits.
    selected = np.where(np.isin(y, numbers))[0]
    return X[selected, :], y[selected]

Source File: nn.py From L2L with GNU General Public License v3.0

5 votes

def main():
    """Score a NeuralNetworkClassifier with random weights on digit data and print the score."""
    # NOTE(review): fetch_mldata has been removed from recent scikit-learn
    # releases, where this import fails -- confirm the targeted version.
    from sklearn.datasets import load_digits, fetch_mldata

    SMALL_MNIST = False

    if SMALL_MNIST:
        dataset = load_digits()
        images = dataset.images
        n_input = np.prod(images.shape[1:])
        n_images = len(images)  # 1797
        data_images = images.reshape(n_images, -1) / 16.  # -> 1797 x 64
        # im_size_x, im_size_y = 8, 8
    else:
        dataset = fetch_mldata('MNIST original')
        n_input = np.prod(dataset.data.shape[1:])
        data_images = dataset.data / 255.  # -> 70000 x 784
        # im_size_x, im_size_y = 28, 28
    data_targets = dataset.target

    n_hidden, n_output = 5, 10
    nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)

    # Draw one standard-normal weight array per shape reported by the network.
    weights = [np.random.randn(*shape) for shape in nn.get_weights_shapes()]
    nn.set_weights(*weights)

    score = nn.score(data_images, data_targets)
    print("Score is: ", score)

Source File: optimizee.py From L2L with GNU General Public License v3.0

5 votes

def __init__(self, traj, parameters):
        """Load the digit data, build the classifier and register the individual on ``traj``.

        :param traj: Trajectory object; it is passed to the parent initializer
            and receives the individual's parameters via
            ``traj.individual.f_add_parameter``.
        :param parameters: Object providing ``use_small_mnist``, ``seed`` and
            ``n_hidden``.
        """
        super().__init__(traj)

        if parameters.use_small_mnist:
            # 8 x 8 images
            mnist_digits = load_digits()
            n_input = np.prod(mnist_digits.images.shape[1:])
            n_images = len(mnist_digits.images)  # 1797
            data_images = mnist_digits.images.reshape(n_images, -1) / 16.  # -> 1797 x 64
            data_targets = mnist_digits.target
        else:
            # 28 x 28 images
            mnist_digits = fetch_mldata('MNIST original')
            n_input = np.prod(mnist_digits.data.shape[1:])
            data_images = mnist_digits.data / 255.  # -> 70000 x 784
            n_images = len(data_images)
            data_targets = mnist_digits.target

        self.n_images = n_images
        self.data_images, self.data_targets = data_images, data_targets

        n_hidden = parameters.n_hidden

        # The seed is stored as uint32 and used to build the random state once
        # (the original built an identical RandomState a second time).
        seed = np.uint32(parameters.seed)
        self.random_state = np.random.RandomState(seed=seed)

        n_output = 10  # This is always true for mnist
        self.nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)

        # create_individual can be called because __init__ is complete except for traj initialization
        indiv_dict = self.create_individual()
        for key, val in indiv_dict.items():
            traj.individual.f_add_parameter(key, val)
        traj.individual.f_add_parameter('seed', seed)

Source File: adaboost.py From ML-From-Scratch with MIT License

5 votes

def main():
    """Train Adaboost to separate digits 1 and 8, print its accuracy and plot the predictions."""
    digits = datasets.load_digits()
    targets = digits.target

    digit1, digit2 = 1, 8

    # Indices of all samples of the first digit followed by those of the second.
    idx = np.append(np.where(targets == digit1)[0],
                    np.where(targets == digit2)[0])

    y = targets[idx]
    # Change labels to {-1, 1}
    y[y == digit1] = -1
    y[y == digit2] = 1
    X = digits.data[idx]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # Adaboost classification with 5 weak classifiers
    booster = Adaboost(n_clf=5)
    booster.fit(X_train, y_train)
    predictions = booster.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    # Reduce dimensions to 2d using pca and plot the results
    Plot().plot_in_2d(X_test, predictions, title="Adaboost", accuracy=accuracy)

Source File: principal_component_analysis.py From ML-From-Scratch with MIT License

5 votes

def main():
    """Project the digits data onto two principal components and show a per-class scatter plot."""

    # Demo of how to reduce the dimensionality of the data to two dimension
    # and plot the results.

    # Load the dataset
    data = datasets.load_digits()
    X = data.data
    y = data.target

    # Project the data onto the 2 primary principal components
    X_trans = PCA().transform(X, 2)

    x1 = X_trans[:, 0]
    x2 = X_trans[:, 1]

    # One color per distinct class label.
    labels = np.unique(y)
    cmap = plt.get_cmap('viridis')
    colors = [cmap(i) for i in np.linspace(0, 1, len(labels))]

    class_distr = []
    # Plot the different class distributions
    for label, color in zip(labels, colors):
        mask = y == label
        class_distr.append(plt.scatter(x1[mask], x2[mask], color=color))

    # Add a legend: one entry per class, labelled with the class value
    # (the original passed the full per-sample target array as labels).
    plt.legend(class_distr, labels, loc=1)

    # Axis labels
    plt.suptitle("PCA Dimensionality Reduction")
    plt.title("Digit Dataset")
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

Source File: perceptron.py From ML-From-Scratch with MIT License

5 votes

def main():
    """Train a Perceptron on the digits data, print its accuracy and plot the predictions."""
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    # One-hot encoding of nominal y-values
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

    # Perceptron
    clf = Perceptron(n_iterations=5000,
        learning_rate=0.001,
        loss=CrossEntropy,
        activation_function=Sigmoid)
    clf.fit(X_train, y_train)

    # Collapse the per-class outputs / one-hot targets back to class indices.
    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results.
    # The legend uses the data set's class names: at this point `y` is the
    # one-hot matrix, so np.unique(y) would not list the classes.
    Plot().plot_in_2d(X_test, y_pred, title="Perceptron", accuracy=accuracy, legend_labels=data.target_names)

Source File: test_utils.py From pyDML with GNU General Public License v3.0

5 votes

def digits_data():
    """Return the ``'data'`` and ``'target'`` entries of the digits data set as a tuple."""
    bunch = load_digits()     # DIGITS
    return bunch['data'], bunch['target']

Source File: datasets.py From pyDML with GNU General Public License v3.0

5 votes

def digits(numbers=None):
    """Return the digits samples whose target is one of ``numbers``.

    Parameters
    ----------
    numbers : iterable of int, optional
        Digit classes to keep. ``None`` (the default) keeps all ten
        digits 0-9.

    Returns
    -------
    X : rows of the ``'data'`` entry whose target is in ``numbers``
    y : the matching entries of ``'target'``
    """
    data = load_digits()     # DIGITS
    X = data['data']
    y = data['target']

    if numbers is None:
        numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    # Row indices of the samples whose label is among the requested digits.
    selected = np.where(np.isin(y, numbers))[0]
    return X[selected, :], y[selected]

Source File: test_sparse.py From twitter-stock-recommendation with MIT License

5 votes

def test_unsorted_indices():
    """Check that a linear SVC gives the same result for sorted and unsorted CSR indices."""
    # test that the result with sorted and unsorted indices in csr is the same
    # we use a subset of digits as iris, blobs or make_classification didn't
    # show the problem
    digits = load_digits()
    X, y = digits.data[:50], digits.target[:50]
    X_test = sparse.csr_matrix(digits.data[50:100])
    X_sparse = sparse.csr_matrix(X)

    def fit_linear_svc(features):
        # Identically configured classifier for every variant of the input.
        classifier = svm.SVC(kernel='linear', probability=True,
                             random_state=0)
        return classifier.fit(features, y)

    coef_dense = fit_linear_svc(X).coef_
    sparse_svc = fit_linear_svc(X_sparse)
    coef_sorted = sparse_svc.coef_
    # make sure dense and sparse SVM give the same result
    assert_array_almost_equal(coef_dense, coef_sorted.toarray())

    # Row-indexing with an index array yields matrices with unsorted indices.
    X_sparse_unsorted = X_sparse[np.arange(X.shape[0])]
    X_test_unsorted = X_test[np.arange(X_test.shape[0])]

    # make sure we scramble the indices
    for matrix in (X_sparse_unsorted, X_test_unsorted):
        assert_false(matrix.has_sorted_indices)

    unsorted_svc = fit_linear_svc(X_sparse_unsorted)
    coef_unsorted = unsorted_svc.coef_
    # make sure unsorted indices give same result
    assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray())
    assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted),
                              sparse_svc.predict_proba(X_test))

Source File: test_pnn.py From neupy with MIT License

5 votes

def test_digit_prediction(self):
        """Train a PNN on a 70/30 split of the digits data and check its test accuracy."""
        digits = datasets.load_digits()
        # NOTE(review): no random_state is passed here, so the exact accuracy
        # asserted below presumably relies on a seed fixed elsewhere -- confirm.
        x_train, x_test, y_train, y_test = train_test_split(
            digits.data, digits.target, test_size=0.3
        )

        network = algorithms.PNN(verbose=False, std=10)
        network.train(x_train, y_train)
        predicted = network.predict(x_test)

        accuracy = metrics.accuracy_score(y_test, predicted)
        self.assertAlmostEqual(accuracy, 0.9889, places=4)

Python sklearn.datasets.load_digits() Examples