Python sklearn.datasets.fetch_openml() Examples
The following are 29 code examples of sklearn.datasets.fetch_openml(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.datasets, or try the search function.
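Before the examples, here is a minimal sketch of the basic call pattern (a sketch only, assuming network access to openml.org; the dataset name, version, and expected shapes are illustrative):

from sklearn.datasets import fetch_openml

# Fetch a dataset by name; pinning version avoids surprises when OpenML
# hosts several versions of the same dataset.
# return_X_y=True returns (data, target) instead of a Bunch object.
X, y = fetch_openml('iris', version=1, return_X_y=True)
print(X.shape, y.shape)  # expected: (150, 4) (150,)

Datasets can also be fetched by their numeric OpenML id via the data_id parameter, as several of the examples below do.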
Example #1
Source File: ml_elm.py From Python-ELM with MIT License | 8 votes |
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)

    tmp = data_set.target
    tmpL = [1 if i == "tested_positive" else -1 for i in tmp]
    data_set.target = tmpL

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test))
Example #2
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())

    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)
Example #3
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_warn_ignore_attribute(monkeypatch, gzip_response):
    data_id = 40966
    expected_row_id_msg = "target_column={} has flag is_row_identifier."
    expected_ignore_msg = "target_column={} has flag is_ignore."
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_warns_message(UserWarning,
                         expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column='MouseID', cache=False)
    assert_warns_message(UserWarning,
                         expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column='Genotype', cache=False)
    # multi column test
    assert_warns_message(UserWarning,
                         expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column=['MouseID', 'class'], cache=False)
    assert_warns_message(UserWarning,
                         expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column=['Genotype', 'class'], cache=False)
Example #4
Source File: StructuredInferencePlaygroundOscillatoryPlusDrift.py From Brancher with MIT License | 6 votes |
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
Example #5
Source File: StructuredInferencePlaygroundOscillatoryC02data.py From Brancher with MIT License | 6 votes |
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
Example #6
Source File: StructuredInferencePlaygroundOscillatoryExperimentC02.py From Brancher with MIT License | 6 votes |
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs
Example #7
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_illegal_column(monkeypatch, gzip_response):
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column='undefined', cache=False)

    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column=['undefined', 'class'],
                         cache=False)
Example #8
Source File: mnist_cnn.py From neupy with MIT License | 5 votes |
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)

    X = X.reshape(-1, 28, 28, 1)
    X /= 255.

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.)
    )
Example #9
Source File: variational_autoencoder.py From neupy with MIT License | 5 votes |
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)

    X = X / 255.
    X -= X.mean(axis=0)

    x_train, x_test = model_selection.train_test_split(
        X.astype(np.float32),
        test_size=(1 / 7.)
    )
    return x_train, x_test
Example #10
Source File: conv_autoencoder.py From neupy with MIT License | 5 votes |
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)

    np.random.shuffle(X)
    x_train_2d, x_test_2d = X[:60000], X[60000:]
    x_train_4d = x_train_2d.reshape((60000, 28, 28, 1))
    x_test_4d = x_test_2d.reshape((10000, 28, 28, 1))

    return x_train_4d, x_test_4d
Example #11
Source File: denoising_autoencoder.py From neupy with MIT License | 5 votes |
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)

    np.random.shuffle(X)
    x_train, x_test = X[:60000], X[60000:]
    return x_train, x_test
Example #12
Source File: mnist_mlp.py From neupy with MIT License | 5 votes |
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)

    X /= 255.
    X -= X.mean(axis=0)

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return model_selection.train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.))
Example #13
Source File: plot_study.py From optuna with MIT License | 5 votes |
def objective(trial):
    fmnist = fetch_openml(name="Fashion-MNIST", version=1)
    classes = list(set(fmnist.target))

    # For demonstration purposes, only use a subset of the dataset.
    n_samples = 4000
    data = fmnist.data[:n_samples]
    target = fmnist.target[:n_samples]

    x_train, x_valid, y_train, y_valid = train_test_split(data, target)

    clf = MLPClassifier(
        hidden_layer_sizes=tuple(
            [trial.suggest_int("n_units_l{}".format(i), 32, 64) for i in range(3)]
        ),
        learning_rate_init=trial.suggest_loguniform("lr_init", 1e-5, 1e-1),
    )

    for step in range(100):
        clf.partial_fit(x_train, y_train, classes=classes)
        value = clf.score(x_valid, y_valid)

        # Report intermediate objective value.
        trial.report(value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.TrialPruned()

    return value
Example #14
Source File: dataset.py From palladium with Apache License 2.0 | 5 votes |
def __call__(self):
    dataset = fetch_openml(self.name)
    return dataset.data, dataset.target
Example #15
Source File: datasets.py From pywsl with MIT License | 5 votes |
def get_mnist():
    x, y = fetch_openml('mnist_784', data_home='~', version=1,
                        return_X_y=True)
    # np.int is deprecated in recent NumPy releases; the builtin int
    # behaves the same here.
    y = y.astype(int)
    return x, y
Example #16
Source File: mnist.py From skorch with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_data(num_samples):
    mnist = fetch_openml('mnist_784')
    torch.manual_seed(0)
    X = mnist.data.astype('float32').reshape(-1, 1, 28, 28)
    y = mnist.target.astype('int64')
    X, y = shuffle(X, y)
    X, y = X[:num_samples], y[:num_samples]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)
    X_train /= 255
    X_test /= 255
    return X_train, X_test, y_train, y_test
Example #17
Source File: KNN.py From AI_Sudoku with Creative Commons Zero v1.0 Universal | 5 votes |
def __init__(self, k):
    self.mnist = datasets.fetch_openml('mnist_784',
                                       data_home='mnist_dataset/')
    self.data, self.target = self.mnist.data, self.mnist.target
    # Make an array of indices the size of MNIST to use for making the
    # data sets. This array is in random order, so we can use it to
    # scramble up the MNIST data
    self.indx = np.random.choice(len(self.target), 70000, replace=False)
    # Initialising the classifier
    self.classifier = KNeighborsClassifier(n_neighbors=k)

# method for building the datasets to test with
Example #18
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_fetch_openml_raises_illegal_argument():
    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name=None,
                         version="version")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name",
                         version="version")

    assert_raise_message(ValueError, "Neither name nor data_id are provided. "
                         "Please provide name or data_id.", fetch_openml)
Example #19
Source File: elm.py From Python-ELM with MIT License | 5 votes |
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import ShuffleSplit, KFold, cross_val_score

    db_name = 'australian'
    hid_nums = [100, 200, 300]

    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)
    data_set.target = [1 if i == 1 else -1
                       for i in data_set.target.astype(int)]

    for hid_num in hid_nums:
        print(hid_num, end=' ')
        e = ELM(hid_num)

        ave = 0
        for i in range(10):
            cv = KFold(n_splits=5, shuffle=True)
            scores = cross_val_score(
                e, data_set.data, data_set.target,
                cv=cv, scoring='accuracy', n_jobs=-1)
            ave += scores.mean()
        ave /= 10

        print("Accuracy: %0.3f " % (ave))
Example #20
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_dataset_with_openml_warning(monkeypatch, gzip_response):
    data_id = 3
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML raised a warning on the dataset. It might be unusable. "
        "Warning:",
        fetch_openml, data_id=data_id, cache=False
    )
Example #21
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_dataset_with_openml_error(monkeypatch, gzip_response):
    data_id = 1
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML registered a problem with the dataset. It might be unusable. "
        "Error:",
        fetch_openml, data_id=data_id, cache=False
    )
Example #22
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_string_attribute(monkeypatch, gzip_response):
    data_id = 40945
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_raise_message(ValueError,
                         'STRING attributes are not yet supported',
                         fetch_openml, data_id=data_id, cache=False)
Example #23
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_fetch_nonexiting(monkeypatch, gzip_response):
    # there is no active version of glass2
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # Note that we only want to search by name (not data id)
    assert_raise_message(ValueError, "No active dataset glass2 found",
                         fetch_openml, name='glass2', cache=False)
Example #24
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_fetch_openml_inactive(monkeypatch, gzip_response):
    # fetch inactive dataset by id
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    glas2 = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=data_id, cache=False)
    assert glas2.data.shape == (163, 9)
    # fetch inactive dataset by name and version
    glas2_by_version = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=None, name="glass2", version=1, cache=False)
    assert int(glas2_by_version.details['id']) == data_id
Example #25
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id, target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None
Example #26
Source File: test_openml.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [None if is_scalar_nan(idx) else cat[int(idx)]
                      for idx in data_bunch.data[:, col_idx]]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False,
                              target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems
        # with missing values
        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i))
Example #27
Source File: fetching.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fetch_employee_salaries():
    """Fetches the employee_salaries dataset

    The employee_salaries dataset contains information about annual
    salaries (year 2016) for more than 9,000 employees of Montgomery
    County (Maryland, US).

    Returns
    -------
    dict
        a dictionary containing:

        - a short description of the dataset (under the ``DESCR`` key)
        - the tabular data (under the ``data`` key)
        - the target (under the ``target`` key)

    References
    ----------
    https://catalog.data.gov/dataset/employee-salaries-2016
    """
    data = fetch_openml(data_id=42125, as_frame=True)
    data.data['Current Annual Salary'] = data['target']
    return data
    # link dead.
    # return fetch_dataset(EMPLOYEE_SALARIES_CONFIG, show_progress=False)
Example #28
Source File: uci_loader.py From highdimensional-decision-boundary-plot with MIT License | 5 votes |
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_openml(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. "
              "Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if (
        len(target.shape) > 1 and target.shape[1] > X.shape[1]
    ):  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != "data" and k != "target" and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [
            i
            for i in range(X.shape[1])
            if "str" in str(type(unpack(X[0, i])))
            or "unicode" in str(type(unpack(X[0, i])))
        ]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
Example #29
Source File: ecob_elm.py From Python-ELM with MIT License | 5 votes |
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(
        data_set.target)

    print(db_name)
    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))

    print('ELM', hid_num)
    e = ELM(hid_num)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))