Python sklearn.datasets.fetch_openml() Examples

The following are 29 code examples of sklearn.datasets.fetch_openml(), collected from open-source projects; the originating project and source file are noted above each example. You may also want to check out all available functions and classes of the sklearn.datasets module, or try the search function.
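Before the project examples, here is a minimal self-contained sketch of a typical call (the dataset name 'iris' and the version number are illustrative; the first call downloads the data from openml.org and caches it under data_home, which defaults to ~/scikit_learn_data):

from sklearn.datasets import fetch_openml

# Minimal sketch: fetch the 'iris' dataset (version 1) as NumPy arrays.
# return_X_y=True returns (data, target) instead of a Bunch object, and
# as_frame=False requests NumPy output (recent scikit-learn versions may
# otherwise return a pandas DataFrame).
X, y = fetch_openml('iris', version=1, return_X_y=True, as_frame=False)
print(X.shape, y.shape)  # expected: (150, 4) (150,)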
Example #1
Source File: ml_elm.py    From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)

    tmp = data_set.target
    tmpL = [1 if i == "tested_positive" else -1 for i in tmp]
    data_set.target = tmpL

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test)) 
Example #2
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())
    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached) 
Example #3
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_warn_ignore_attribute(monkeypatch, gzip_response):
    data_id = 40966
    expected_row_id_msg = "target_column={} has flag is_row_identifier."
    expected_ignore_msg = "target_column={} has flag is_ignore."
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column='MouseID',
                         cache=False)
    assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column='Genotype',
                         cache=False)
    # multi column test
    assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column=['MouseID', 'class'],
                         cache=False)
    assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column=['Genotype', 'class'],
                         cache=False) 
Example #4
Source File: StructuredInferencePlaygroundOscillatoryPlusDrift.py    From Brancher with MIT License
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs 
Example #5
Source File: StructuredInferencePlaygroundOscillatoryC02data.py    From Brancher with MIT License
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs 
Example #6
Source File: StructuredInferencePlaygroundOscillatoryExperimentC02.py    From Brancher with MIT License
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs 
Example #7
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_illegal_column(monkeypatch, gzip_response):
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column='undefined', cache=False)

    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column=['undefined', 'class'],
                         cache=False) 
Example #8
Source File: mnist_cnn.py    From neupy with MIT License
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = X.reshape(-1, 28, 28, 1)
    X /= 255.

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.)
    ) 
Example #9
Source File: variational_autoencoder.py    From neupy with MIT License
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)

    X = X / 255.
    X -= X.mean(axis=0)

    x_train, x_test = model_selection.train_test_split(
        X.astype(np.float32),
        test_size=(1 / 7.)
    )
    return x_train, x_test 
Example #10
Source File: conv_autoencoder.py    From neupy with MIT License
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)

    np.random.shuffle(X)
    x_train_2d, x_test_2d = X[:60000], X[60000:]
    x_train_4d = x_train_2d.reshape((60000, 28, 28, 1))
    x_test_4d = x_test_2d.reshape((10000, 28, 28, 1))

    return x_train_4d, x_test_4d 
Example #11
Source File: denoising_autoencoder.py    From neupy with MIT License
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)

    np.random.shuffle(X)
    x_train, x_test = X[:60000], X[60000:]

    return x_train, x_test 
Example #12
Source File: mnist_mlp.py    From neupy with MIT License
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X /= 255.
    X -= X.mean(axis=0)

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return model_selection.train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.)) 
Example #13
Source File: plot_study.py    From optuna with MIT License
def objective(trial):

    fmnist = fetch_openml(name="Fashion-MNIST", version=1)
    classes = list(set(fmnist.target))

    # For demonstration purposes, only use a subset of the dataset.
    n_samples = 4000
    data = fmnist.data[:n_samples]
    target = fmnist.target[:n_samples]

    x_train, x_valid, y_train, y_valid = train_test_split(data, target)

    clf = MLPClassifier(
        hidden_layer_sizes=tuple(
            [trial.suggest_int("n_units_l{}".format(i), 32, 64) for i in range(3)]
        ),
        learning_rate_init=trial.suggest_loguniform("lr_init", 1e-5, 1e-1),
    )

    for step in range(100):
        clf.partial_fit(x_train, y_train, classes=classes)
        value = clf.score(x_valid, y_valid)

        # Report intermediate objective value.
        trial.report(value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.TrialPruned()

    return value 
Example #14
Source File: dataset.py    From palladium with Apache License 2.0
def __call__(self):
        dataset = fetch_openml(self.name)
        return dataset.data, dataset.target 
Example #15
Source File: datasets.py    From pywsl with MIT License
def get_mnist():
    x, y = fetch_openml('mnist_784', data_home='~', version=1, return_X_y=True)
    y = y.astype(int)  # np.int was removed in recent NumPy; the built-in int is equivalent here
    return x, y 
Example #16
Source File: mnist.py    From skorch with BSD 3-Clause "New" or "Revised" License
def get_data(num_samples):
    mnist = fetch_openml('mnist_784')
    torch.manual_seed(0)
    X = mnist.data.astype('float32').reshape(-1, 1, 28, 28)
    y = mnist.target.astype('int64')
    X, y = shuffle(X, y)
    X, y = X[:num_samples], y[:num_samples]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    X_train /= 255
    X_test /= 255
    return X_train, X_test, y_train, y_test 
Example #17
Source File: KNN.py    From AI_Sudoku with Creative Commons Zero v1.0 Universal
def __init__(self, k):
        self.mnist = datasets.fetch_openml('mnist_784', data_home='mnist_dataset/')
        self.data, self.target = self.mnist.data, self.mnist.target
        # Make an array of indices the size of MNIST to use for making the data sets.
        # This array is in random order, so we can use it to scramble up the MNIST data
        self.indx = np.random.choice(len(self.target), 70000, replace=False)
        # Initialising the classifier
        self.classifier = KNeighborsClassifier(n_neighbors=k)

    # method for building the datasets to test with 
Example #18
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_raises_illegal_argument():
    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name=None,
                         version="version")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name",
                         version="version")

    assert_raise_message(ValueError, "Neither name nor data_id are provided. "
                         "Please provide name or data_id.", fetch_openml) 
Example #19
Source File: elm.py    From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import ShuffleSplit, KFold, cross_val_score

    db_name = 'australian'
    hid_nums = [100, 200, 300]

    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)
    data_set.target = [1 if i == 1 else -1
                       for i in data_set.target.astype(int)]

    for hid_num in hid_nums:
        print(hid_num, end=' ')
        e = ELM(hid_num)

        ave = 0
        for i in range(10):
            cv = KFold(n_splits=5, shuffle=True)
            scores = cross_val_score(
                e, data_set.data, data_set.target,
                cv=cv, scoring='accuracy', n_jobs=-1)
            ave += scores.mean()

        ave /= 10

        print("Accuracy: %0.3f " % (ave)) 
Example #20
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_dataset_with_openml_warning(monkeypatch, gzip_response):
    data_id = 3
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML raised a warning on the dataset. It might be unusable. "
        "Warning:",
        fetch_openml, data_id=data_id, cache=False
    ) 
Example #21
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_dataset_with_openml_error(monkeypatch, gzip_response):
    data_id = 1
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML registered a problem with the dataset. It might be unusable. "
        "Error:",
        fetch_openml, data_id=data_id, cache=False
    ) 
Example #22
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_string_attribute(monkeypatch, gzip_response):
    data_id = 40945
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_raise_message(ValueError,
                         'STRING attributes are not yet supported',
                         fetch_openml, data_id=data_id, cache=False) 
Example #23
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_nonexiting(monkeypatch, gzip_response):
    # there is no active version of glass2
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # Note that we only want to search by name (not data id)
    assert_raise_message(ValueError, "No active dataset glass2 found",
                         fetch_openml, name='glass2', cache=False) 
Example #24
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_inactive(monkeypatch, gzip_response):
    # fetch inactive dataset by id
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    glas2 = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=data_id, cache=False)
    assert glas2.data.shape == (163, 9)
    # fetch inactive dataset by name and version
    glas2_by_version = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=None, name="glass2", version=1, cache=False)
    assert int(glas2_by_version.details['id']) == data_id 
Example #25
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id, target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None 
Example #26
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [None if is_scalar_nan(idx) else cat[int(idx)]
                      for idx in data_bunch.data[:, col_idx]]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
        # missing values

        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i)) 
Example #27
Source File: fetching.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def fetch_employee_salaries():
    """fetches the employee_salaries dataset

    The employee_salaries dataset contains information about annual salaries
    (year 2016) for more than 9,000 employees of Montgomery County
    (Maryland, US).


    Returns
    -------
    dict
        a dictionary containing:

            - a short description of the dataset (under the ``DESCR``
              key)
            - the tabular data (under the ``data`` key)
            - the target (under the ``target`` key)

    References
    ----------
    https://catalog.data.gov/dataset/employee-salaries-2016

    """

    data = fetch_openml(data_id=42125, as_frame=True)
    data.data['Current Annual Salary'] = data['target']
    return data

    # link dead.
    # return fetch_dataset(EMPLOYEE_SALARIES_CONFIG, show_progress=False) 
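A brief usage sketch for the helper above (illustrative only, assuming the function is importable as defined and the OpenML download succeeds):

salaries = fetch_employee_salaries()
print(salaries.data.shape)     # tabular data as a pandas DataFrame (as_frame=True)
print(salaries.target.head())  # annual salaries, also copied into the data frame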
Example #28
Source File: uci_loader.py    From highdimensional-decision-boundary-plot with MIT License
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_openml(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if (
        len(target.shape) > 1 and target.shape[1] > X.shape[1]
    ):  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != "data" and k != "target" and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [
            i
            for i in range(X.shape[1])
            if "str" in str(type(unpack(X[0, i])))
            or "unicode" in str(type(unpack(X[0, i])))
        ]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y 
Example #29
Source File: ecob_elm.py    From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(data_set.target)

    print(db_name)
    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))

    print('ELM', hid_num)
    e = ELM(hid_num)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))