Python Examples of sklearn.datasets.load_breast_cancer

Source File: test_classifier_comb.py From combo with BSD 2-Clause "Simplified" License

7 votes

def setUp(self):
        """Create a 60/40 breast-cancer split and an (unfitted) averaging ensemble."""
        seed = 42
        self.roc_floor = 0.9
        self.accuracy_floor = 0.9

        features, labels = load_breast_cancer(return_X_y=True)
        split = train_test_split(features, labels, test_size=0.4,
                                 random_state=seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        # Five heterogeneous base estimators; the seeded ones share one seed.
        base_estimators = [
            DecisionTreeClassifier(random_state=seed),
            LogisticRegression(random_state=seed),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=seed),
            GradientBoostingClassifier(random_state=seed),
        ]
        self.clf = SimpleClassifierAggregator(base_estimators,
                                              method='average')

Source File: test_Feature_Binarizer_From_Trees.py From AIX360 with Apache License 2.0

7 votes

def setUp(self) -> None:
        """Build a breast-cancer frame with synthetic categorical/binary columns and split it."""
        self.random_state = 0
        bunch: dict = load_breast_cancer()
        frame: DataFrame = DataFrame(bunch['data'],
                                     columns=bunch['feature_names'])
        # Capture the original column names before synthetic ones are appended.
        self.col_ordinal = frame.columns.to_list()

        np.random.seed(self.random_state)
        n_rows = len(frame)

        # The four random draws below must stay in this order so the seeded
        # global generator yields the same synthetic columns.
        three_letters = np.array(['a', 'b', 'c'])
        frame['cat alpha'] = three_letters[np.random.randint(0, 3, n_rows)]
        frame['cat num'] = np.random.randint(0, 3, n_rows)
        two_letters = np.array(['a', 'b'])
        frame['bin alpha'] = two_letters[np.random.randint(0, 2, n_rows)]
        frame['bin num'] = np.random.randint(0, 2, n_rows)

        self.col_categorical = ['cat alpha', 'cat num']
        self.col_binary = ['bin alpha', 'bin num']

        self.X = frame
        self.y: ndarray = bunch['target']
        split = train_test_split(self.X, self.y, test_size=0.4,
                                 random_state=self.random_state)
        self.X_train, self.X_test, self.y_train, self.y_test = split

Source File: test_classifier_stacking.py From combo with BSD 2-Clause "Simplified" License

7 votes

def setUp(self):
        """Create a 60/40 breast-cancer split and fit a 4-fold Stacking ensemble."""
        seed = 42
        self.roc_floor = 0.9
        self.accuracy_floor = 0.9

        features, labels = load_breast_cancer(return_X_y=True)
        split = train_test_split(features, labels, test_size=0.4,
                                 random_state=seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        base_estimators = [
            DecisionTreeClassifier(random_state=seed),
            LogisticRegression(random_state=seed),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=seed),
            GradientBoostingClassifier(random_state=seed),
        ]
        self.clf = Stacking(base_estimators, n_folds=4)
        self.clf.fit(self.X_train, self.y_train)

Source File: test_sklearn_feature_selection_converters.py From sklearn-onnx with MIT License

6 votes

def test_select_fwe_int(self):
        """Convert a fitted SelectFwe with an int64 input spec and dump the result."""
        features, target = load_breast_cancer(return_X_y=True)
        selector = SelectFwe()
        selector.fit(features, target)

        input_spec = [("input", Int64TensorType([None, features.shape[1]]))]
        onnx_model = convert_sklearn(selector, "select fwe", input_spec)
        self.assertTrue(onnx_model is not None)

        # Version condition is passed as a string expression.
        failure_condition = (
            "StrictVersion(onnx.__version__)"
            " < StrictVersion('1.2') or "
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.2.1')"
        )
        dump_data_and_model(
            features.astype(np.int64),
            selector,
            onnx_model,
            basename="SklearnSelectFwe",
            allow_failure=failure_condition,
        )

Source File: main_nearest_neighbor.py From wisconsin-breast-cancer with Apache License 2.0

6 votes

def main():
    """Standardize the breast cancer features, split 70/30 and run NearestNeighbor.predict."""
    bunch = datasets.load_breast_cancer()
    labels = bunch.target
    num_features = bunch.data.shape[1]

    features = StandardScaler().fit_transform(bunch.data)

    # Stratify on the labels so both partitions keep the class balance.
    split = train_test_split(features, labels, test_size=0.3, stratify=labels)
    train_features, test_features, train_labels, test_labels = split

    model = NearestNeighbor(train_features, train_labels, num_features)
    model.predict(
        test_features,
        test_labels,
        result_path="./results/nearest_neighbor/",
    )

Source File: test_classifier_des.py From combo with BSD 2-Clause "Simplified" License

6 votes

def setUp(self):
        """Create a 60/40 breast-cancer split and fit a DES_LA ensemble (region size 30)."""
        seed = 42
        self.roc_floor = 0.9
        self.accuracy_floor = 0.9

        features, labels = load_breast_cancer(return_X_y=True)
        split = train_test_split(features, labels, test_size=0.4,
                                 random_state=seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        base_estimators = [
            DecisionTreeClassifier(random_state=seed),
            LogisticRegression(random_state=seed),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=seed),
            GradientBoostingClassifier(random_state=seed),
        ]
        self.clf = DES_LA(base_estimators, local_region_size=30)
        self.clf.fit(self.X_train, self.y_train)

Source File: test_gridsearch.py From dislib with Apache License 2.0

6 votes

def test_fit_2(self):
        """Tests GridSearchCV fit() with different data."""
        raw_x, raw_y = datasets.load_breast_cancer(return_X_y=True)
        x = ds.array(raw_x, block_size=(100, 10))
        x = StandardScaler().fit_transform(x)
        # Labels are reshaped into a single column before being wrapped.
        y = ds.array(raw_y.reshape(-1, 1), block_size=(100, 1))

        param_grid = {'c': [0.1], 'gamma': [0.1]}
        searcher = GridSearchCV(CascadeSVM(), param_grid, cv=5)
        searcher.fit(x, y)

        fitted_attributes = ('best_estimator_', 'best_score_', 'best_params_',
                             'best_index_', 'scorer_')
        for attribute in fitted_attributes:
            self.assertTrue(hasattr(searcher, attribute))
        self.assertEqual(searcher.n_splits_, 5)

Source File: test_classifier_comb.py From combo with BSD 2-Clause "Simplified" License

6 votes

def setUp(self):
        """Create a 60/40 breast-cancer split and fit an averaging ensemble."""
        seed = 42
        self.roc_floor = 0.9
        self.accuracy_floor = 0.9

        features, labels = load_breast_cancer(return_X_y=True)
        split = train_test_split(features, labels, test_size=0.4,
                                 random_state=seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        base_estimators = [
            DecisionTreeClassifier(random_state=seed),
            LogisticRegression(random_state=seed),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=seed),
            GradientBoostingClassifier(random_state=seed),
        ]
        self.clf = SimpleClassifierAggregator(base_estimators,
                                              method='average')
        self.clf.fit(self.X_train, self.y_train)

Source File: test_classifier_comb.py From combo with BSD 2-Clause "Simplified" License

6 votes

def setUp(self):
        """Create a 60/40 breast-cancer split and fit a weighted averaging ensemble."""
        seed = 42
        self.roc_floor = 0.9
        self.accuracy_floor = 0.9

        features, labels = load_breast_cancer(return_X_y=True)
        split = train_test_split(features, labels, test_size=0.4,
                                 random_state=seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        # One weight per base estimator, in the same order as the list below.
        estimator_weights = np.array([0.1, 0.4, 0.1, 0.2, 0.2])
        base_estimators = [
            DecisionTreeClassifier(random_state=seed),
            LogisticRegression(random_state=seed),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=seed),
            GradientBoostingClassifier(random_state=seed),
        ]

        self.clf = SimpleClassifierAggregator(
            base_estimators, method='average', weights=estimator_weights)
        self.clf.fit(self.X_train, self.y_train)

Source File: test_classifier_comb.py From combo with BSD 2-Clause "Simplified" License

6 votes

def setUp(self):
        """Create a 60/40 breast-cancer split and fit a maximization ensemble."""
        seed = 42
        self.roc_floor = 0.9
        self.accuracy_floor = 0.9

        features, labels = load_breast_cancer(return_X_y=True)
        split = train_test_split(features, labels, test_size=0.4,
                                 random_state=seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        base_estimators = [
            DecisionTreeClassifier(random_state=seed),
            LogisticRegression(random_state=seed),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=seed),
            GradientBoostingClassifier(random_state=seed),
        ]
        self.clf = SimpleClassifierAggregator(base_estimators,
                                              method='maximization')
        self.clf.fit(self.X_train, self.y_train)

Source File: test_pyfms.py From pyfms with MIT License

6 votes

def test_save_load_classifier(self):
        """Weights and accuracy must be identical after a save/load round trip."""
        X, y = datasets.load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        n_features = X.shape[1]

        # Train the reference model and record its state.
        original = pyfms.Classifier(n_features, k=4)
        original.fit(X_train, y_train, nb_epoch=1000)
        weights_before = original.get_weights()
        accuracy_before = accuracy_score(y_test, original.predict(X_test))

        weights_path = os.path.join(self.workspace, 'classifier.fm')
        original.save_weights(weights_path)

        # A fresh model only receives the persisted weights.
        restored = pyfms.Classifier(n_features)
        restored.load_weights(weights_path)
        weights_after = restored.get_weights()
        accuracy_after = accuracy_score(y_test, restored.predict(X_test))

        for before, after in zip(weights_before, weights_after):
            np.testing.assert_array_equal(before, after)
        self.assertEqual(accuracy_before, accuracy_after)

Source File: test_classifier_comb.py From combo with BSD 2-Clause "Simplified" License

6 votes

def setUp(self):
        """Create a 60/40 breast-cancer split and fit a median ensemble."""
        seed = 42
        self.roc_floor = 0.9
        self.accuracy_floor = 0.9

        features, labels = load_breast_cancer(return_X_y=True)
        split = train_test_split(features, labels, test_size=0.4,
                                 random_state=seed)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        base_estimators = [
            DecisionTreeClassifier(random_state=seed),
            LogisticRegression(random_state=seed),
            KNeighborsClassifier(),
            RandomForestClassifier(random_state=seed),
            GradientBoostingClassifier(random_state=seed),
        ]
        self.clf = SimpleClassifierAggregator(base_estimators,
                                              method='median')
        self.clf.fit(self.X_train, self.y_train)

Source File: test_cluster_comb.py From combo with BSD 2-Clause "Simplified" License

6 votes

def setUp(self):
        """Fit three clusterers on the breast cancer data and store their labels column-wise."""
        self.X, self.y = load_breast_cancer(return_X_y=True)

        self.n_clusters = 5
        self.n_estimators = 3

        k = self.n_clusters
        base_clusterers = [
            KMeans(n_clusters=k),
            MiniBatchKMeans(n_clusters=k),
            AgglomerativeClustering(n_clusters=k),
        ]

        # One row per sample, one column per clusterer.
        label_matrix_shape = [self.X.shape[0], self.n_estimators]
        self.original_labels = np.zeros(label_matrix_shape)

        for column, clusterer in enumerate(base_clusterers):
            clusterer.fit(self.X)
            self.original_labels[:, column] = clusterer.labels_

Source File: test_sklearn_feature_selection_converters.py From sklearn-onnx with MIT License

6 votes

def test_select_fdr_int(self):
        """Convert a fitted SelectFdr with an int64 input spec and dump the result."""
        features, target = load_breast_cancer(return_X_y=True)
        selector = SelectFdr()
        selector.fit(features, target)

        input_spec = [("input", Int64TensorType([None, features.shape[1]]))]
        onnx_model = convert_sklearn(selector, "select fdr", input_spec)
        self.assertTrue(onnx_model is not None)

        # Version condition is passed as a string expression.
        failure_condition = (
            "StrictVersion(onnx.__version__)"
            " < StrictVersion('1.2') or "
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.2.1')"
        )
        dump_data_and_model(
            features.astype(np.int64),
            selector,
            onnx_model,
            basename="SklearnSelectFdr",
            allow_failure=failure_condition,
        )

Source File: test_sklearn_feature_selection_converters.py From sklearn-onnx with MIT License

6 votes

def test_select_fdr_float(self):
        """Convert a fitted SelectFdr with a float input spec and dump the result."""
        features, target = load_breast_cancer(return_X_y=True)
        selector = SelectFdr()
        selector.fit(features, target)

        input_spec = [("input", FloatTensorType([None, features.shape[1]]))]
        onnx_model = convert_sklearn(selector, "select fdr", input_spec)
        self.assertTrue(onnx_model is not None)

        # Version condition is passed as a string expression.
        failure_condition = (
            "StrictVersion(onnx.__version__)"
            " < StrictVersion('1.2') or "
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.2.1')"
        )
        dump_data_and_model(
            features.astype(np.float32),
            selector,
            onnx_model,
            basename="SklearnSelectFdr",
            allow_failure=failure_condition,
        )

Source File: test_sklearn_feature_selection_converters.py From sklearn-onnx with MIT License

6 votes

def test_select_fwe_float(self):
        """Convert a fitted SelectFwe with a float input spec and dump the result."""
        features, target = load_breast_cancer(return_X_y=True)
        selector = SelectFwe()
        selector.fit(features, target)

        input_spec = [("input", FloatTensorType([None, features.shape[1]]))]
        onnx_model = convert_sklearn(selector, "select fwe", input_spec)
        self.assertTrue(onnx_model is not None)

        # Version condition is passed as a string expression.
        failure_condition = (
            "StrictVersion(onnx.__version__)"
            " < StrictVersion('1.2') or "
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.2.1')"
        )
        dump_data_and_model(
            features.astype(np.float32),
            selector,
            onnx_model,
            basename="SklearnSelectFwe",
            allow_failure=failure_condition,
        )

Source File: test_logistic.py From h2o4gpu with Apache License 2.0

6 votes

def test_not_labels():
    """Compare test accuracy of sklearn and h2o4gpu logistic regression on one split."""
    bunch = load_breast_cancer()

    # 75/25 split with a fixed seed so both models see identical data.
    X_train, X_test, y_train, y_test = train_test_split(
        bunch.data, bunch.target, test_size=0.25, random_state=42)

    # Reference implementation.
    reference_model = linear_model.LogisticRegression()
    reference_model.fit(X_train, y_train)
    reference_pred = reference_model.predict(X_test)

    # Implementation under test.
    h2o_model = h2o4gpu.LogisticRegression()
    h2o_model.fit(X_train, y_train)
    h2o_pred = h2o_model.predict(X_test)

    assert np.allclose(
        accuracy_score(y_test, reference_pred),
        accuracy_score(y_test, h2o_pred.squeeze()),
    )

Source File: test_des_integration.py From DESlib with BSD 3-Clause "New" or "Revised" License

6 votes

def load_dataset(encode_labels, rng):
    """Return scaled train/DSEL/test partitions of the breast cancer data.

    When ``encode_labels`` is not None, the integer targets are replaced by
    ``np.take(encode_labels, y)``. ``rng`` is passed as ``random_state`` to
    both splits.

    Returns ``(X_dsel, X_test, X_train, y_dsel, y_test, y_train)``.
    """
    bunch = load_breast_cancer()
    features = bunch.data
    if encode_labels is None:
        labels = bunch.target
    else:
        labels = np.take(encode_labels, bunch.target)

    # First split: hold out 33% of the samples for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, random_state=rng)

    # The scaler is fitted on the training part only, then applied to test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Second split: halve the training part into train and DSEL.
    X_train, X_dsel, y_train, y_dsel = train_test_split(
        X_train, y_train, test_size=0.5, random_state=rng)

    return X_dsel, X_test, X_train, y_dsel, y_test, y_train

Source File: test_des_integration.py From DESlib with BSD 3-Clause "New" or "Revised" License

6 votes

def test_meta_no_pool_of_classifiers(knn_methods):
    """METADES built without an explicit pool must reach the recorded test score."""
    expected_score = 0.9095744680851063
    rng = np.random.RandomState(123456)

    bunch = load_breast_cancer()

    # The same RandomState object is consumed by the split and then by
    # METADES, so the order of these calls must not change.
    X_train, X_test, y_train, y_test = train_test_split(
        bunch.data, bunch.target, test_size=0.33, random_state=rng)

    # The scaler is fitted on the training part only, then applied to test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    meta_des = METADES(knn_classifier=knn_methods, random_state=rng,
                       DSEL_perc=0.5)
    meta_des.fit(X_train, y_train)
    assert np.isclose(meta_des.score(X_test, y_test), expected_score)

Source File: test_utils.py From pyDML with GNU General Public License v3.0

5 votes

def breast_cancer():
    """Return the result of ``Xy_dataset`` applied to the breast cancer loader."""
    loader = load_breast_cancer
    return Xy_dataset(loader)

Source File: custom_objective.py From autogbt-alt with MIT License

5 votes

def main():
    """Fit an AutoGBTClassifier with a custom objective and print validation and CV AUC."""
    features, target = load_breast_cancer(return_X_y=True)
    train_X, valid_X, train_y, valid_y = train_test_split(
        features, target, test_size=0.1)

    model = AutoGBTClassifier(n_trials=5, objective=CustomObjective())
    model.fit(train_X, train_y)

    valid_auc = roc_auc_score(valid_y, model.predict(valid_X))
    print('valid AUC: %.3f' % (valid_auc))
    print('CV AUC: %.3f' % (model.best_score))

Source File: sofm_heatmap_visualization.py From neupy with MIT License

5 votes

def load_data():
    """Return the min-max scaled breast cancer features together with the targets."""
    raw_features, target = datasets.load_breast_cancer(return_X_y=True)
    scaled_features = preprocessing.MinMaxScaler().fit_transform(raw_features)
    return scaled_features, target

Source File: common_utils.py From interpret-community with MIT License

5 votes

def create_scikit_cancer_data():
    """Load the breast cancer data and return a fixed 80/20 train/test split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, feature_names, classes)`` where
        ``feature_names`` is the dataset's ``feature_names`` attribute and
        ``classes`` is ``target_names`` converted to a list.
    """
    breast_cancer_data = load_breast_cancer()

    # Split data into train and test (seeded, so the split is reproducible).
    x_train, x_test, y_train, y_test = train_test_split(breast_cancer_data.data,
                                                        breast_cancer_data.target,
                                                        test_size=0.2,
                                                        random_state=0)
    feature_names = breast_cancer_data.feature_names
    # Computed once; the original assigned the same value twice.
    classes = breast_cancer_data.target_names.tolist()
    return x_train, x_test, y_train, y_test, feature_names, classes

Source File: test_distns.py From ngboost with Apache License 2.0

5 votes

def cls_data(self):
        """Return an 80/20 ``train_test_split`` of the breast cancer ``(X, Y)`` arrays."""
        # return_X_y is keyword-only in current scikit-learn; passing it
        # positionally raises TypeError.
        X, Y = load_breast_cancer(return_X_y=True)
        return train_test_split(X, Y, test_size=0.2)

Source File: __init__.py From skoot with MIT License

5 votes

def load_breast_cancer_df(include_tgt=True, tgt_name="target", names=None):
    """Load the breast cancer dataset as a dataframe.

    The sklearn bunch is handed to ``_load_from_bunch`` together with the
    arguments below. The target is stored under ``tgt_name`` ("target"
    unless specified otherwise).

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether the target should be included.

    tgt_name : str, optional (default="target")
        Name to use for the target feature.

    names : iterable or None, optional (default=None)
        Column names for the dataframe. When not given, the
        ``feature_names`` attribute of the sklearn bunch is used.

    Returns
    -------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded breast cancer dataset.
    """
    # Imported inside the function, as in the original implementation.
    from sklearn.datasets import load_breast_cancer
    bunch = load_breast_cancer()
    return _load_from_bunch(bunch, include_tgt, tgt_name, names)

Source File: dominance.py From dominance-analysis with MIT License

5 votes

def get_breast_cancer(cls):
        """Return the sklearn breast cancer data as a DataFrame.

        The frame holds one column per entry in the dataset's
        ``feature_names`` plus a ``target`` column. Two informational
        messages are printed before loading.
        """
        print("""The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is downloaded from: https://goo.gl/U2Uwz2""")
        print("""Internally using load_breast_cancer function from sklearn.datasets """)
        # Load once; the original reloaded the dataset for every field it read.
        dataset = load_breast_cancer()
        breast_cancer_data = pd.DataFrame(data=dataset['data'],
                                          columns=dataset['feature_names'])
        breast_cancer_data['target'] = dataset['target']
        # The original also added a mapped 'target_names' column and then
        # sliced it off again with iloc[:, :-1]; the returned columns are the
        # same without that round trip.
        return breast_cancer_data

Source File: conftest.py From python-sasctl with Apache License 2.0

5 votes

def cancer_dataset():
    """Binary classification dataset.

    Returns a DataFrame with one column per breast cancer feature and a
    categorical ``Type`` column whose categories are the dataset's
    ``target_names``. Skips the calling test when sklearn or pandas is not
    installed.
    """
    pytest.importorskip('sklearn')
    pd = pytest.importorskip('pandas')
    from sklearn import datasets

    raw = datasets.load_breast_cancer()
    df = pd.DataFrame(raw.data, columns=raw.feature_names)
    df['Type'] = raw.target
    df['Type'] = df['Type'].astype('category')
    # Assigning to ``.cat.categories`` was removed in pandas 2.0;
    # rename_categories with a list-like renames them positionally.
    df['Type'] = df['Type'].cat.rename_categories(raw.target_names)
    return df

Source File: test_environment.py From hyperparameter_hunter with MIT License

5 votes

def get_breast_cancer_data():
    """Return the breast cancer features as a DataFrame with a ``diagnosis`` target column."""
    bunch = load_breast_cancer()
    frame = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    return frame.assign(diagnosis=bunch.target)

Source File: test_cc.py From pycobra with MIT License

5 votes

def setUp(self):
        """Fit a ClassifierCobra on all but the last 20 breast cancer samples.

        The last 20 feature rows are kept in ``self.test_data``.
        """
        holdout = 20
        bc = datasets.load_breast_cancer()
        self.X = bc.data[:-holdout]
        self.y = bc.target[:-holdout]
        self.test_data = bc.data[-holdout:]
        # The original also created an unused RandomState(42); only the
        # estimator's own random_state is needed here.
        self.cc = ClassifierCobra(random_state=0).fit(self.X, self.y)

Source File: test_plots.py From scikit-optimize with BSD 3-Clause "New" or "Revised" License

5 votes

def test_plots_work_without_cat():
    """Basic smoke tests to make sure plotting doesn't crash."""
    SPACE = [
        Integer(1, 20, name='max_depth'),
        Integer(2, 100, name='min_samples_split'),
        Integer(5, 30, name='min_samples_leaf'),
        Integer(1, 30, name='max_features'),
    ]

    # return_X_y is keyword-only in current scikit-learn. The data is loaded
    # once instead of on every objective evaluation.
    X, y = load_breast_cancer(return_X_y=True)

    def objective(params):
        """Return the negated mean cross-validation score for ``params``."""
        tree_kwargs = {dim.name: val
                       for dim, val in zip(SPACE, params)
                       if dim.name != 'dummy'}
        clf = DecisionTreeClassifier(random_state=3, **tree_kwargs)
        return -np.mean(cross_val_score(clf, X, y))

    res = gp_minimize(objective, SPACE, n_calls=10, random_state=3)
    plots.plot_convergence(res)
    plots.plot_evaluations(res)
    plots.plot_objective(res)
    plots.plot_objective(res,
                         minimum='expected_minimum')
    plots.plot_objective(res,
                         sample_source='expected_minimum',
                         n_minimum_search=10)
    plots.plot_objective(res, sample_source='result')
    plots.plot_regret(res)

    # TODO: Compare plots to known good results?
    # Look into how matplotlib does this.

Python sklearn.datasets.load_breast_cancer() Examples