Python sklearn.datasets Examples
The following are 30 code examples of the sklearn.datasets module, drawn from open-source projects. The source file, project, and license are listed above each example.
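For orientation, sklearn.datasets bundles three kinds of helpers: built-in toy loaders (load_iris, load_digits), remote fetchers (fetch_openml, fetch_california_housing), and synthetic generators (make_classification, make_blobs). A minimal, illustrative sketch of each flavor, assuming only that scikit-learn is installed; it is not taken from any of the projects below:

# Illustrative sketch of the three kinds of helpers in sklearn.datasets.
import sklearn.datasets

# 1) Built-in toy dataset shipped with scikit-learn.
iris = sklearn.datasets.load_iris()
print(iris.data.shape, iris.target.shape)        # (150, 4) (150,)

# 2) Synthetic data generator.
X, y = sklearn.datasets.make_classification(n_samples=100, n_features=5,
                                             random_state=0)
print(X.shape, y.shape)                          # (100, 5) (100,)

# 3) Remote fetcher (downloads on first use, then caches under ~/scikit_learn_data).
# housing = sklearn.datasets.fetch_california_housing()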
Example #1
Source File: test_shap.py From AIX360 with Apache License 2.0 | 8 votes |
def test_ShapLinearExplainer(self):
    corpus, y = shap.datasets.imdb()
    corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=7)

    vectorizer = TfidfVectorizer(min_df=10)
    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
    model.fit(X_train, y_train)

    shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
    shap_values = shapexplainer.explain_instance(X_test)
    print("Invoked Shap LinearExplainer")

# comment this test as travis runs out of resources
Example #2
Source File: test_mldata.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_fetch_one_column(tmpdata):
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdata)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #3
Source File: datasets.py From treeano with Apache License 2.0 | 6 votes |
def mnist(random_state=42):
    """
    x is in [0, 1] with shape (b, 1, 28, 28) and dtype floatX
    y is an int32 vector in range(10)
    """
    raw = sklearn.datasets.fetch_mldata('MNIST original')
    # rescaling to [0, 1] instead of [0, 255]
    x = raw['data'].reshape(-1, 1, 28, 28).astype(fX) / 255.0
    y = raw['target'].astype("int32")
    # NOTE: train data is initially in order of 0 through 9
    x1, x2, y1, y2 = sklearn.cross_validation.train_test_split(
        x[:60000], y[:60000], random_state=random_state, test_size=10000)
    train = {"x": x1, "y": y1}
    valid = {"x": x2, "y": y2}
    # NOTE: test data is in order of 0 through 9
    test = {"x": x[60000:], "y": y[60000:]}
    return train, valid, test
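Note: fetch_mldata and the mldata.org service it relied on have been removed from recent scikit-learn releases (deprecated in 0.20, removed in 0.22). A minimal sketch of an equivalent MNIST download via fetch_openml, assuming scikit-learn 0.22+ and network access; this is illustrative and not part of the treeano project:

# Sketch: fetching MNIST with fetch_openml instead of the removed fetch_mldata.
# Assumes scikit-learn >= 0.22 and an internet connection on the first call.
import sklearn.datasets

def mnist_openml():
    raw = sklearn.datasets.fetch_openml('mnist_784', version=1, as_frame=False)
    # rescale pixel values from [0, 255] to [0, 1]
    x = raw.data.reshape(-1, 1, 28, 28).astype("float32") / 255.0
    y = raw.target.astype("int32")
    # OpenML's mnist_784 keeps the usual 60000/10000 train/test ordering
    return (x[:60000], y[:60000]), (x[60000:], y[60000:])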
Example #4
Source File: _datasets.py From scanpy with BSD 3-Clause "New" or "Revised" License | 6 votes |
def pbmc68k_reduced() -> AnnData:
    """\
    Subsampled and processed 68k PBMCs.

    10x PBMC 68k dataset from
    https://support.10xgenomics.com/single-cell-gene-expression/datasets

    The original PBMC 68k dataset was preprocessed using scanpy and was saved
    keeping only 724 cells and 221 highly variable genes.

    The saved file contains the annotation of cell types (key: `'bulk_labels'`),
    UMAP coordinates, louvain clustering and gene rankings based on the
    `bulk_labels`.

    Returns
    -------
    Annotated data matrix.
    """
    filename = HERE / '10x_pbmc68k_reduced.h5ad'
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")
        return read(filename)
Example #5
Source File: _datasets.py From scanpy with BSD 3-Clause "New" or "Revised" License | 6 votes |
def burczynski06() -> AnnData:
    """\
    Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).

    The study assesses transcriptional profiles in peripheral blood mononuclear
    cells from 42 healthy individuals, 59 CD patients, and 26 UC patients by
    hybridization to microarrays interrogating more than 22,000 sequences.

    Reference
    ---------
    Burczynski et al., "Molecular classification of Crohn's disease and
    ulcerative colitis patients using transcriptional profiles in peripheral
    blood mononuclear cells"
    J Mol Diagn 8, 51 (2006). PMID:16436634.
    """
    filename = settings.datasetdir / 'burczynski06/GDS1615_full.soft.gz'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz'
    adata = read(filename, backup_url=url)
    return adata
Example #6
Source File: datasets.py From treeano with Apache License 2.0 | 6 votes |
def cluttered_mnist(base_dir="~/cluttered_mnist"):
    base_dir = os.path.expanduser(base_dir)
    # use the one from lasagne:
    # https://github.com/Lasagne/Recipes/blob/master/examples/spatial_transformer_network.ipynb
    CLUTTERED_MNIST_PATH = ("https://s3.amazonaws.com/lasagne/recipes/"
                            "datasets/mnist_cluttered_60x60_6distortions.npz")
    subprocess.call(["wget", "-N", CLUTTERED_MNIST_PATH, "-P", base_dir])
    data = np.load(os.path.join(base_dir, "mnist_cluttered_60x60_6distortions.npz"))
    X_train, X_valid, X_test = [data[n].reshape((-1, 1, 60, 60))
                                for n in ["x_train", "x_valid", "x_test"]]
    y_train, y_valid, y_test = [np.argmax(data[n], axis=-1).astype('int32')
                                for n in ["y_train", "y_valid", "y_test"]]
    train = {"x": X_train, "y": y_train}
    valid = {"x": X_valid, "y": y_valid}
    test = {"x": X_test, "y": y_test}
    return train, valid, test
Example #7
Source File: clf_helpers.py From ibeis with Apache License 2.0 | 6 votes |
def setup(pblm):
    import sklearn.datasets
    iris = sklearn.datasets.load_iris()

    pblm.primary_task_key = 'iris'
    pblm.default_data_key = 'learn(all)'
    pblm.default_clf_key = 'RF'

    X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    samples = MultiTaskSamples(X_df.index)
    samples.apply_indicators(
        {'iris': {name: iris.target == idx
                  for idx, name in enumerate(iris.target_names)}})
    samples.X_dict = {'learn(all)': X_df}

    pblm.samples = samples
    pblm.xval_kw['type'] = 'StratifiedKFold'
Example #8
Source File: datasets.py From ann-benchmarks with MIT License | 6 votes |
def get_dataset(which):
    hdf5_fn = get_dataset_fn(which)
    try:
        url = 'http://ann-benchmarks.com/%s.hdf5' % which
        download(url, hdf5_fn)
    except:
        print("Cannot download %s" % url)
        if which in DATASETS:
            print("Creating dataset locally")
            DATASETS[which](hdf5_fn)
    hdf5_f = h5py.File(hdf5_fn, 'r')
    return hdf5_f


# Everything below this line is related to creating datasets
# You probably never need to do this at home,
# just rely on the prepared datasets at http://ann-benchmarks.com
Example #9
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache'
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())
    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
Example #10
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_retry_with_clean_cache(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)
    os.makedirs(os.path.dirname(location))

    with open(location, 'w') as f:
        f.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # The first call will raise an error since location exists
        if os.path.exists(location):
            raise Exception("File exist!")
        return 1

    warn_msg = "Invalid cache, redownloading file"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        result = _load_data()
    assert result == 1
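The two tests above exercise the on-disk cache behind fetch_openml. A minimal sketch of the public API they rely on, assuming scikit-learn 0.22+ and network access on the first call; the data_home path is only an example:

# Sketch of the fetch_openml caching behaviour exercised above.
import sklearn.datasets

# The first call downloads dataset 61 (iris on OpenML) and stores it under
# data_home; later calls with cache=True are served from disk.
X, y = sklearn.datasets.fetch_openml(data_id=61, cache=True,
                                     data_home='/tmp/scikit_learn_data',
                                     return_X_y=True, as_frame=False)
print(X.shape, y.shape)   # (150, 4) (150,)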
Example #11
Source File: ridgeregression.py From mpyc with MIT License | 6 votes |
async def synthesize_data(n_samples, n_features, n_targets):
    rnd = await mpc.transfer(random.randrange(2**31), senders=0)
    X, Y = sklearn.datasets.make_regression(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=max(1, n_features - 5),
                                            n_targets=n_targets, bias=42,
                                            effective_rank=max(1, n_features - 3),
                                            tail_strength=0.5, noise=1.2,
                                            random_state=rnd)  # all parties use same rnd
    if n_targets == 1:
        Y = np.transpose([Y])
    X = np.concatenate((X, Y), axis=1)
    b_m = np.min(X, axis=0)
    b_M = np.max(X, axis=0)
    coef_add = [-(m + M) / 2 for m, M in zip(b_m, b_M)]
    coef_mul = [2 / (M - m) for m, M in zip(b_m, b_M)]
    for xi in X:
        for j in range(len(xi)):
            # map to [-1,1] range
            xi[j] = (xi[j] + coef_add[j]) * coef_mul[j]
    return X
Example #12
Source File: test_mldata.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = assert_warns(DeprecationWarning, fetch_mldata,
                            'mock', data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      assert_warns, DeprecationWarning,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #13
Source File: sklearn_to_pandas.py From lale with Apache License 2.0 | 6 votes |
def digits_df(test_size=0.2, random_state=42):
    digits = sklearn.datasets.load_digits()
    ncols = digits.data.shape[1]
    schema_X = {
        'description': 'Features of digits dataset (classification).',
        'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#optical-recognition-of-handwritten-digits-dataset',
        'type': 'array',
        'items': {
            'type': 'array',
            'minItems': ncols, 'maxItems': ncols,
            'items': {
                'type': 'number', 'minimum': 0, 'maximum': 16}}}
    schema_y = {
        '$schema': 'http://json-schema.org/draft-04/schema#',
        'type': 'array',
        'items': {
            'type': 'integer', 'minimum': 0, 'maximum': 9}}
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        digits, schema_X, schema_y, test_size, random_state)
    return (train_X, train_y), (test_X, test_y)
Example #14
Source File: sklearn_to_pandas.py From lale with Apache License 2.0 | 6 votes |
def load_iris_df(test_size=0.2):
    iris = sklearn.datasets.load_iris()
    X = iris.data
    y = iris.target
    target_name = 'target'
    X, y = shuffle(iris.data, iris.target, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)

    X_train_df = pd.DataFrame(X_train, columns=iris.feature_names)
    y_train_df = pd.Series(y_train, name=target_name)
    X_test_df = pd.DataFrame(X_test, columns=iris.feature_names)
    y_test_df = pd.Series(y_test, name=target_name)

    return (X_train_df, y_train_df), (X_test_df, y_test_df)
Example #15
Source File: test_shap.py From AIX360 with Apache License 2.0 | 6 votes |
def test_ShapGradientExplainer(self):
    # model = VGG16(weights='imagenet', include_top=True)
    # X, y = shap.datasets.imagenet50()
    # to_explain = X[[39, 41]]
    #
    # url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
    # fname = shap.datasets.cache(url)
    # with open(fname) as f:
    #     class_names = json.load(f)
    #
    # def map2layer(x, layer):
    #     feed_dict = dict(zip([model.layers[0].input], [preprocess_input(x.copy())]))
    #     return K.get_session().run(model.layers[layer].input, feed_dict)
    #
    # e = GradientExplainer((model.layers[7].input, model.layers[-1].output),
    #                       map2layer(preprocess_input(X.copy()), 7))
    # shap_values, indexes = e.explain_instance(map2layer(to_explain, 7), ranked_outputs=2)
    #
    print("Skipped Shap GradientExplainer")
Example #16
Source File: data.py From TextCategorization with MIT License | 6 votes |
def __init__(self, subset, shuffle=True, random_state=42):
    if subset == "all":
        shuffle = False  # chronological split violated if shuffled
    else:
        shuffle = shuffle

    dataset = sklearn.datasets.fetch_rcv1(subset=subset, shuffle=shuffle,
                                          random_state=random_state)

    self.data = dataset.data
    self.labels = dataset.target
    self.class_names = dataset.target_names
    assert len(self.class_names) == 103  # 103 categories according to LYRL2004

    N, C = self.labels.shape
    assert C == len(self.class_names)

    N, V = self.data.shape
    self.vocab = np.zeros(V)  # hacky workaround to create placeholder value
    self.orig_vocab_size = V
Example #17
Source File: sklearn_to_pandas.py From lale with Apache License 2.0 | 6 votes |
def _bunch_to_df(bunch, schema_X, schema_y, test_size=0.2, random_state=42):
    train_X_arr, test_X_arr, train_y_arr, test_y_arr = train_test_split(
        bunch.data, bunch.target, test_size=test_size, random_state=random_state)
    feature_schemas = schema_X['items']['items']
    if isinstance(feature_schemas, list):
        feature_names = [f['description'] for f in feature_schemas]
    else:
        feature_names = [f'x{i}' for i in range(schema_X['items']['maxItems'])]
    train_X_df = pd.DataFrame(train_X_arr, columns=feature_names)
    test_X_df = pd.DataFrame(test_X_arr, columns=feature_names)
    train_y_df = pd.Series(train_y_arr, name='target')
    test_y_df = pd.Series(test_y_arr, name='target')
    train_nrows, test_nrows = train_X_df.shape[0], test_X_df.shape[0]
    train_X = lale.datasets.data_schemas.add_schema(train_X_df, {
        **schema_X, 'minItems': train_nrows, 'maxItems': train_nrows})
    test_X = lale.datasets.data_schemas.add_schema(test_X_df, {
        **schema_X, 'minItems': test_nrows, 'maxItems': test_nrows})
    train_y = lale.datasets.data_schemas.add_schema(train_y_df, {
        **schema_y, 'minItems': train_nrows, 'maxItems': train_nrows})
    test_y = lale.datasets.data_schemas.add_schema(test_y_df, {
        **schema_y, 'minItems': test_nrows, 'maxItems': test_nrows})
    return (train_X, train_y), (test_X, test_y)
Example #18
Source File: test_mldata.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #19
Source File: test_mldata.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_fetch_one_column():
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #20
Source File: planar_utils.py From DeeplearningAI_AndrewNg with MIT License | 5 votes |
def load_extra_datasets():
    N = 200
    noisy_circles = sklearn.datasets.make_circles(n_samples=N, factor=.5, noise=.3)
    noisy_moons = sklearn.datasets.make_moons(n_samples=N, noise=.2)
    blobs = sklearn.datasets.make_blobs(n_samples=N, random_state=5,
                                        n_features=2, centers=6)
    gaussian_quantiles = sklearn.datasets.make_gaussian_quantiles(
        mean=None, cov=0.5, n_samples=N, n_features=2,
        n_classes=2, shuffle=True, random_state=None)
    no_structure = np.random.rand(N, 2), np.random.rand(N, 2)

    return noisy_circles, noisy_moons, blobs, gaussian_quantiles, no_structure
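Each generator above returns an (X, y) pair except no_structure, which pairs two random arrays. A minimal sketch of consuming one of these toy datasets, assuming matplotlib is installed; it is illustrative and not part of planar_utils:

# Sketch: visualising one of the generated 2D toy datasets.
import matplotlib.pyplot as plt
import sklearn.datasets

X, y = sklearn.datasets.make_moons(n_samples=200, noise=.2)
plt.scatter(X[:, 0], X[:, 1], c=y, s=20, cmap=plt.cm.Spectral)
plt.title("make_moons toy dataset")
plt.show()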
Example #21
Source File: sklearn_to_pandas.py From lale with Apache License 2.0 | 5 votes |
def california_housing_df(test_size=0.2, random_state=42):
    housing = sklearn.datasets.fetch_california_housing()
    schema_X = {
        'description': 'Features of California housing dataset (regression).',
        'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#california-housing-dataset',
        'type': 'array',
        'items': {
            'type': 'array',
            'minItems': 8, 'maxItems': 8,
            'items': [
                {'description': 'MedInc', 'type': 'number', 'minimum': 0.0},
                {'description': 'HouseAge', 'type': 'number', 'minimum': 0.0},
                {'description': 'AveRooms', 'type': 'number', 'minimum': 0.0},
                {'description': 'AveBedrms', 'type': 'number', 'minimum': 0.0},
                {'description': 'Population', 'type': 'number', 'minimum': 0.0},
                {'description': 'AveOccup', 'type': 'number', 'minimum': 0.0},
                {'description': 'Latitude', 'type': 'number', 'minimum': 0.0},
                {'description': 'Longitude', 'type': 'number'}]}}
    schema_y = {
        'description': 'Target of California housing dataset (regression).',
        'documentation_url': 'https://scikit-learn.org/0.20/datasets/index.html#california-housing-dataset',
        'type': 'array',
        'items': {
            'description': 'Median house value for California districts.',
            'type': 'number', 'minimum': 0.0}}
    (train_X, train_y), (test_X, test_y) = _bunch_to_df(
        housing, schema_X, schema_y, test_size, random_state)
    return (train_X, train_y), (test_X, test_y)
Example #22
Source File: dominance.py From dominance-analysis with MIT License | 5 votes |
def get_breast_cancer(cls):
    print("""The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is downloaded from: https://goo.gl/U2Uwz2""")
    print("""Internally using load_breast_cancer function from sklearn.datasets """)
    breast_cancer_data = pd.DataFrame(data=load_breast_cancer()['data'],
                                      columns=load_breast_cancer()['feature_names'])
    breast_cancer_data['target'] = load_breast_cancer()['target']
    # map each integer class label to its name ('malignant' / 'benign')
    target_dict = dict({j for i, j in zip(load_breast_cancer()['target_names'],
                                          enumerate(load_breast_cancer()['target_names']))})
    breast_cancer_data['target_names'] = breast_cancer_data['target'].map(target_dict)
    return breast_cancer_data.iloc[:, :-1]
Example #23
Source File: unit_tests.py From pynisher with MIT License | 5 votes |
def svc_example(n_samples=10000, n_features=4):
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.datasets import make_classification

    X, Y = make_classification(n_samples, n_features)
    # pp = PolynomialFeatures(degree=3)
    # X = pp.fit_transform(X)

    m = LinearSVC()
    m.fit(X, Y)
Example #24
Source File: dominance.py From dominance-analysis with MIT License | 5 votes |
def get_boston(cls):
    print("""The copy of Boston Housing Dataset is downloaded from: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html""")
    print("""Internally using load_boston function from sklearn.datasets """)
    boston_data = pd.DataFrame(data=load_boston()['data'],
                               columns=load_boston()['feature_names'])
    boston_data['House_Price'] = load_boston()['target']
    return boston_data
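Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. A minimal sketch of a similar helper built on the California housing dataset, assuming scikit-learn 0.23+; the function name and column renaming are illustrative, not part of dominance-analysis:

# Sketch: a replacement dataset helper now that load_boston is gone.
import sklearn.datasets

def get_california_housing():
    # as_frame=True returns a Bunch whose .frame already includes the target
    housing = sklearn.datasets.fetch_california_housing(as_frame=True)
    return housing.frame.rename(columns={'MedHouseVal': 'House_Price'})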
Example #25
Source File: datasets.py From ann-benchmarks with MIT License | 5 votes |
def random_float(out_fn, n_dims, n_samples, centers, distance):
    import sklearn.datasets
    X, _ = sklearn.datasets.make_blobs(
        n_samples=n_samples, n_features=n_dims,
        centers=centers, random_state=1)
    X_train, X_test = train_test_split(X, test_size=0.1)
    write_output(X_train, X_test, out_fn, distance)
Example #26
Source File: datasets.py From ann-benchmarks with MIT License | 5 votes |
def random_bitstring(out_fn, n_dims, n_samples, n_queries):
    import sklearn.datasets
    Y, _ = sklearn.datasets.make_blobs(
        n_samples=n_samples, n_features=n_dims,
        centers=n_queries, random_state=1)
    X = numpy.zeros((n_samples, n_dims), dtype=numpy.bool)
    for i, vec in enumerate(Y):
        X[i] = numpy.array([v > 0 for v in vec], dtype=numpy.bool)

    X_train, X_test = train_test_split(X, test_size=n_queries)
    write_output(X_train, X_test, out_fn, 'hamming', 'bit')
Example #27
Source File: datasets.py From ann-benchmarks with MIT License | 5 votes |
def sift_hamming(out_fn, fn):
    import tarfile
    local_fn = fn + '.tar.gz'
    url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
    download(url, local_fn)
    print('parsing vectors in %s...' % local_fn)
    with tarfile.open(local_fn, 'r:gz') as t:
        f = t.extractfile(fn)
        lines = f.readlines()
        X = numpy.zeros((len(lines), 256), dtype=numpy.bool)
        for i, line in enumerate(lines):
            X[i] = numpy.array(
                [int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool)
        X_train, X_test = train_test_split(X, test_size=1000)
        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
Example #28
Source File: test_cli.py From mlflow with Apache License 2.0 | 5 votes |
def iris_data():
    iris = sklearn.datasets.load_iris()
    x = iris.data[:, :2]
    y = iris.target
    return x, y
Example #29
Source File: svm.py From ibench with MIT License | 5 votes |
def _gen_datasets(self, features, vectors, classes, dest='data'):
    """Generate classification datasets in binary .npy files

    features: a list of feature lengths to test
    vectors: a list of sample lengths to test
    classes: number of classes (2 for binary classification dataset)
    """
    self._X, self._y = make_classification(n_samples=vectors,
                                           n_features=features,
                                           n_informative=features,
                                           n_redundant=0,
                                           n_classes=classes,
                                           random_state=0)
    return self._X, self._y
Example #30
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 5 votes |
def _dataset_sample():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    return X, y


################## this is to find the best feature selection###############