Python sklearn.datasets.fetch_openml() Examples

The following are 29 code examples of sklearn.datasets.fetch_openml(), collected from open-source projects; the originating project and source file are noted above each example. You may also want to check out all available functions and classes of the sklearn.datasets module, or try the search function.
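Before the project examples, here is a minimal self-contained sketch of a typical call (the dataset name 'iris' and the version number are illustrative; the first call downloads the data from openml.org and caches it under data_home, which defaults to ~/scikit_learn_data):

from sklearn.datasets import fetch_openml

# Minimal sketch: fetch the 'iris' dataset (version 1) as NumPy arrays.
# return_X_y=True returns (data, target) instead of a Bunch object, and
# as_frame=False requests NumPy output (recent scikit-learn versions may
# otherwise return a pandas DataFrame).
X, y = fetch_openml('iris', version=1, return_X_y=True, as_frame=False)
print(X.shape, y.shape)  # expected: (150, 4) (150,)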
Example #1
Source File: ml_elm.py    From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)

    tmp = data_set.target
    tmpL = [1 if i == "tested_positive" else -1 for i in tmp]
    data_set.target = tmpL

    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test)) 
Example #2
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request):
        raise ValueError('This mechanism intends to test correct cache '
                         'handling. As such, urlopen should never be '
                         'accessed. URL: %s' % request.get_full_url())
    data_id = 2
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    _monkey_patch_webbased_functions(
        monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
                                        data_home=cache_directory,
                                        return_X_y=True)

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
                        _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
                                      data_home=cache_directory,
                                      return_X_y=True)
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached) 
Example #3
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_warn_ignore_attribute(monkeypatch, gzip_response):
    data_id = 40966
    expected_row_id_msg = "target_column={} has flag is_row_identifier."
    expected_ignore_msg = "target_column={} has flag is_ignore."
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column='MouseID',
                         cache=False)
    assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column='Genotype',
                         cache=False)
    # multi column test
    assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'),
                         fetch_openml, data_id=data_id,
                         target_column=['MouseID', 'class'],
                         cache=False)
    assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'),
                         fetch_openml, data_id=data_id,
                         target_column=['Genotype', 'class'],
                         cache=False) 
Example #4
Source File: StructuredInferencePlaygroundOscillatoryPlusDrift.py    From Brancher with MIT License
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs 
Example #5
Source File: StructuredInferencePlaygroundOscillatoryC02data.py    From Brancher with MIT License
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs 
Example #6
Source File: StructuredInferencePlaygroundOscillatoryExperimentC02.py    From Brancher with MIT License
def load_mauna_loa_atmospheric_co2():
    ml_data = fetch_openml(data_id=41187)
    months = []
    ppmv_sums = []
    counts = []

    y = ml_data.data[:, 0]
    m = ml_data.data[:, 1]
    month_float = y + (m - 1) / 12
    ppmvs = ml_data.target

    for month, ppmv in zip(month_float, ppmvs):
        if not months or month != months[-1]:
            months.append(month)
            ppmv_sums.append(ppmv)
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += ppmv
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs 
Example #7
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_illegal_column(monkeypatch, gzip_response):
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column='undefined', cache=False)

    assert_raise_message(KeyError, "Could not find target_column=",
                         fetch_openml, data_id=data_id,
                         target_column=['undefined', 'class'],
                         cache=False) 
Example #8
Source File: mnist_cnn.py    From neupy with MIT License
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = X.reshape(-1, 28, 28, 1)
    X /= 255.

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.)
    ) 
Example #9
Source File: variational_autoencoder.py    From neupy with MIT License
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)

    X = X / 255.
    X -= X.mean(axis=0)

    x_train, x_test = model_selection.train_test_split(
        X.astype(np.float32),
        test_size=(1 / 7.)
    )
    return x_train, x_test 
Example #10
Source File: conv_autoencoder.py    From neupy with MIT License
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)

    np.random.shuffle(X)
    x_train_2d, x_test_2d = X[:60000], X[60000:]
    x_train_4d = x_train_2d.reshape((60000, 28, 28, 1))
    x_test_4d = x_test_2d.reshape((10000, 28, 28, 1))

    return x_train_4d, x_test_4d 
Example #11
Source File: denoising_autoencoder.py    From neupy with MIT License
def load_data():
    X, _ = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X = (X / 255.).astype(np.float32)

    np.random.shuffle(X)
    x_train, x_test = X[:60000], X[60000:]

    return x_train, x_test 
Example #12
Source File: mnist_mlp.py    From neupy with MIT License
def load_data():
    X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True)
    X /= 255.
    X -= X.mean(axis=0)

    target_scaler = OneHotEncoder(sparse=False, categories='auto')
    y = target_scaler.fit_transform(y.reshape(-1, 1))

    return model_selection.train_test_split(
        X.astype(np.float32),
        y.astype(np.float32),
        test_size=(1 / 7.)) 
Example #13
Source File: plot_study.py    From optuna with MIT License
def objective(trial):

    fmnist = fetch_openml(name="Fashion-MNIST", version=1)
    classes = list(set(fmnist.target))

    # For demonstration purposes, only use a subset of the dataset.
    n_samples = 4000
    data = fmnist.data[:n_samples]
    target = fmnist.target[:n_samples]

    x_train, x_valid, y_train, y_valid = train_test_split(data, target)

    clf = MLPClassifier(
        hidden_layer_sizes=tuple(
            [trial.suggest_int("n_units_l{}".format(i), 32, 64) for i in range(3)]
        ),
        learning_rate_init=trial.suggest_loguniform("lr_init", 1e-5, 1e-1),
    )

    for step in range(100):
        clf.partial_fit(x_train, y_train, classes=classes)
        value = clf.score(x_valid, y_valid)

        # Report intermediate objective value.
        trial.report(value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.TrialPruned()

    return value 
Example #14
Source File: dataset.py    From palladium with Apache License 2.0
def __call__(self):
        dataset = fetch_openml(self.name)
        return dataset.data, dataset.target 
Example #15
Source File: datasets.py    From pywsl with MIT License
def get_mnist():
    x, y = fetch_openml('mnist_784', data_home='~', version=1, return_X_y=True)
    y = y.astype(int)  # np.int was removed in recent NumPy; the built-in int is equivalent here
    return x, y 
Example #16
Source File: mnist.py    From skorch with BSD 3-Clause "New" or "Revised" License
def get_data(num_samples):
    mnist = fetch_openml('mnist_784')
    torch.manual_seed(0)
    X = mnist.data.astype('float32').reshape(-1, 1, 28, 28)
    y = mnist.target.astype('int64')
    X, y = shuffle(X, y)
    X, y = X[:num_samples], y[:num_samples]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    X_train /= 255
    X_test /= 255
    return X_train, X_test, y_train, y_test 
Example #17
Source File: KNN.py    From AI_Sudoku with Creative Commons Zero v1.0 Universal
def __init__(self, k):
        self.mnist = datasets.fetch_openml('mnist_784', data_home='mnist_dataset/')
        self.data, self.target = self.mnist.data, self.mnist.target
        # Make an array of indices the size of MNIST to use for making the data sets.
        # This array is in random order, so we can use it to scramble up the MNIST data
        self.indx = np.random.choice(len(self.target), 70000, replace=False)
        # Initialising the classifier
        self.classifier = KNeighborsClassifier(n_neighbors=k)

    # method for building the datasets to test with 
Example #18
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_raises_illegal_argument():
    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name=None,
                         version="version")

    assert_raise_message(ValueError, "Dataset data_id=",
                         fetch_openml, data_id=-1, name="name",
                         version="version")

    assert_raise_message(ValueError, "Neither name nor data_id are provided. "
                         "Please provide name or data_id.", fetch_openml) 
Example #19
Source File: elm.py    From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import ShuffleSplit, KFold, cross_val_score

    db_name = 'australian'
    hid_nums = [100, 200, 300]

    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)
    data_set.target = [1 if i == 1 else -1
                       for i in data_set.target.astype(int)]

    for hid_num in hid_nums:
        print(hid_num, end=' ')
        e = ELM(hid_num)

        ave = 0
        for i in range(10):
            cv = KFold(n_splits=5, shuffle=True)
            scores = cross_val_score(
                e, data_set.data, data_set.target,
                cv=cv, scoring='accuracy', n_jobs=-1)
            ave += scores.mean()

        ave /= 10

        print("Accuracy: %0.3f " % (ave)) 
Example #20
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_dataset_with_openml_warning(monkeypatch, gzip_response):
    data_id = 3
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML raised a warning on the dataset. It might be unusable. "
        "Warning:",
        fetch_openml, data_id=data_id, cache=False
    ) 
Example #21
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_dataset_with_openml_error(monkeypatch, gzip_response):
    data_id = 1
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    assert_warns_message(
        UserWarning,
        "OpenML registered a problem with the dataset. It might be unusable. "
        "Error:",
        fetch_openml, data_id=data_id, cache=False
    ) 
Example #22
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_string_attribute(monkeypatch, gzip_response):
    data_id = 40945
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    assert_raise_message(ValueError,
                         'STRING attributes are not yet supported',
                         fetch_openml, data_id=data_id, cache=False) 
Example #23
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_nonexiting(monkeypatch, gzip_response):
    # there is no active version of glass2
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # Note that we only want to search by name (not data id)
    assert_raise_message(ValueError, "No active dataset glass2 found",
                         fetch_openml, name='glass2', cache=False) 
Example #24
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_inactive(monkeypatch, gzip_response):
    # fetch inactive dataset by id
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    glas2 = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=data_id, cache=False)
    assert glas2.data.shape == (163, 9)
    # fetch inactive dataset by name and version
    glas2_by_version = assert_warns_message(
        UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml,
        data_id=None, name="glass2", version=1, cache=False)
    assert int(glas2_by_version.details['id']) == data_id 
Example #25
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_openml_notarget(monkeypatch, gzip_response):
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(data_id=data_id, target_column=target_column,
                        cache=False)
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None 
Example #26
Source File: test_openml.py    From Mastering-Elasticsearch-7.0 with MIT License
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        col_name = data_bunch.feature_names[col_idx]
        if col_name in data_bunch.categories:
            # XXX: This would be faster with np.take, although it does not
            # handle missing values fast (also not with mode='wrap')
            cat = data_bunch.categories[col_name]
            result = [None if is_scalar_nan(idx) else cat[int(idx)]
                      for idx in data_bunch.data[:, col_idx]]
            return np.array(result, dtype='O')
        else:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse is True:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
        # missing values

        np.testing.assert_array_equal(data_downloaded[:, i],
                                      decode_column(data_bunch, i)) 
Example #27
Source File: fetching.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def fetch_employee_salaries():
    """fetches the employee_salaries dataset

    The employee_salaries dataset contains information about annual salaries
    (year 2016) for more than 9,000 employees of Montgomery County
    (Maryland, US).


    Returns
    -------
    dict
        a dictionary containing:

            - a short description of the dataset (under the ``DESCR``
              key)
            - the tabular data (under the ``data`` key)
            - the target (under the ``target`` key)

    References
    ----------
    https://catalog.data.gov/dataset/employee-salaries-2016

    """

    data = fetch_openml(data_id=42125, as_frame=True)
    data.data['Current Annual Salary'] = data['target']
    return data

    # link dead.
    # return fetch_dataset(EMPLOYEE_SALARIES_CONFIG, show_progress=False) 
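A brief usage sketch for the helper above (illustrative only, assuming the function is importable as defined and the OpenML download succeeds):

salaries = fetch_employee_salaries()
print(salaries.data.shape)     # tabular data as a pandas DataFrame (as_frame=True)
print(salaries.target.head())  # annual salaries, also copied into the data frame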
Example #28
Source File: uci_loader.py    From highdimensional-decision-boundary-plot with MIT License
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_openml(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if (
        len(target.shape) > 1 and target.shape[1] > X.shape[1]
    ):  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != "data" and k != "target" and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [
            i
            for i in range(X.shape[1])
            if "str" in str(type(unpack(X[0, i])))
            or "unicode" in str(type(unpack(X[0, i])))
        ]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y 
Example #29
Source File: ecob_elm.py    From Python-ELM with MIT License
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import cross_val_score

    db_name = 'iris'
    hid_num = 1000
    data_set = fetch_mldata(db_name, version=1)
    data_set.data = preprocessing.scale(data_set.data)
    data_set.target = preprocessing.LabelEncoder().fit_transform(data_set.target)

    print(db_name)
    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))

    print('ELM', hid_num)
    e = ELM(hid_num)
    ave = 0
    for i in range(10):
        scores = cross_val_score(
            e, data_set.data, data_set.target, cv=5, scoring='accuracy')
        ave += scores.mean()
    ave /= 10
    print("Accuracy: %0.2f " % (ave))