Python sklearn.datasets.fetch_mldata() Examples
The following are 21 code examples of sklearn.datasets.fetch_mldata(). The originating project and source file are noted above each example. You may also want to check out the other available functions and classes of the sklearn.datasets module.

Note that fetch_mldata() was deprecated in scikit-learn 0.20 and removed in 0.22 after mldata.org went offline, so these examples only run against older scikit-learn versions; fetch_openml() is the usual replacement.
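As a minimal sketch of the modern equivalent (assuming scikit-learn >= 0.22 for fetch_openml's as_frame argument), the MNIST data that most of the examples below fetch as 'MNIST original' can be loaded from OpenML under the name 'mnist_784':

from sklearn.datasets import fetch_openml
import numpy as np

# Sketch of the fetch_openml replacement (assumes scikit-learn >= 0.22).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data                      # (70000, 784) pixel matrix
y = mnist.target.astype(np.int64)   # OpenML serves the labels as strings

# Caveat: the row ordering is not guaranteed to match what mldata.org served,
# so slicing-based train/test splits in the examples below should be re-checked.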
Example #1
Source File: test_mldata.py From twitter-stock-recommendation with MIT License
def test_fetch_one_column():
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #2
Source File: test_mldata.py From twitter-stock-recommendation with MIT License
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #3
Source File: mnist.py From mlens with MIT License
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    safe_print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    safe_print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test
Example #4
Source File: test_mldata.py From Mastering-Elasticsearch-7.0 with MIT License
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = assert_warns(DeprecationWarning, fetch_mldata,
                            'mock', data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      assert_warns, DeprecationWarning,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #5
Source File: test_mldata.py From Mastering-Elasticsearch-7.0 with MIT License
def test_fetch_one_column(tmpdata):
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdata)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #6
Source File: autoencoder.py From ML-From-Scratch with MIT License
def train(self, n_epochs, batch_size=128, save_interval=50):
    mnist = fetch_mldata('MNIST original')

    X = mnist.data
    y = mnist.target

    # Rescale [-1, 1]
    X = (X.astype(np.float32) - 127.5) / 127.5

    for epoch in range(n_epochs):
        # Select a random half batch of images
        idx = np.random.randint(0, X.shape[0], batch_size)
        imgs = X[idx]

        # Train the Autoencoder
        loss, _ = self.autoencoder.train_on_batch(imgs, imgs)

        # Display the progress
        print("%d [D loss: %f]" % (epoch, loss))

        # If at save interval => save generated image samples
        if epoch % save_interval == 0:
            self.save_imgs(epoch, X)
Example #7
Source File: base.py From impyute with MIT License
def mnist(missingness="mcar", thr=0.2):
    """ Loads corrupted MNIST

    Parameters
    ----------
    missingness: ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    dict
        "X": corrupted MNIST data (numpy.ndarray)
        "Y": target labels
    """
    from sklearn.datasets import fetch_mldata
    dataset = fetch_mldata('MNIST original')
    corruptor = Corruptor(dataset.data, thr=thr)
    data = getattr(corruptor, missingness)()
    return {"X": data, "Y": dataset.target}
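A hypothetical call to the loader above, just to show the shape of the returned dict:

# Hypothetical usage: MNIST with 20% of values knocked out completely at random.
corrupted = mnist(missingness="mcar", thr=0.2)
X_missing = corrupted["X"]  # pixel matrix with missing entries injected
y = corrupted["Y"]          # untouched labels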
Example #8
Source File: data.py From barrista with MIT License
def training_data():
    """Get the `MNIST original` training data."""
    _np.random.seed(1)
    permutation = _np.random.permutation(range(60000))
    mnist = _fetch_mldata('MNIST original',
                          data_home=_os.path.join(_DATA_FOLDER,
                                                  'MNIST_original'))
    return (mnist.data[:60000, :][permutation, :].reshape((60000, 1, 28, 28)).astype('float32'),
            mnist.target[:60000][permutation].reshape((60000, 1)).astype('float32'))
Example #9
Source File: data.py From SNIPER-mxnet with Apache License 2.0
def get_mnist():
    np.random.seed(1234)  # set seed for deterministic ordering
    data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    data_path = os.path.join(data_path, '../../data')
    mnist = fetch_mldata('MNIST original', data_home=data_path)
    p = np.random.permutation(mnist.data.shape[0])
    X = mnist.data[p].astype(np.float32) * 0.02
    Y = mnist.target[p]
    return X, Y
Example #10
Source File: data.py From SNIPER-mxnet with Apache License 2.0
def get_mnist():
    """ Gets MNIST dataset """
    np.random.seed(1234)  # set seed for deterministic ordering
    data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    data_path = os.path.join(data_path, '../../data')
    mnist = fetch_mldata('MNIST original', data_home=data_path)
    p = np.random.permutation(mnist.data.shape[0])
    X = mnist.data[p].astype(np.float32) * 0.02
    Y = mnist.target[p]
    return X, Y
Example #11
Source File: optimizee.py From L2L with GNU General Public License v3.0
def __init__(self, traj, parameters):
    super().__init__(traj)

    if parameters.use_small_mnist:
        # 8 x 8 images
        mnist_digits = load_digits()
        n_input = np.prod(mnist_digits.images.shape[1:])
        n_images = len(mnist_digits.images)  # 1797
        data_images = mnist_digits.images.reshape(n_images, -1) / 16.  # -> 1797 x 64
        data_targets = mnist_digits.target
    else:
        # 28 x 28 images
        mnist_digits = fetch_mldata('MNIST original')
        n_input = np.prod(mnist_digits.data.shape[1:])
        data_images = mnist_digits.data / 255.  # -> 70000 x 784
        n_images = len(data_images)
        data_targets = mnist_digits.target

    self.n_images = n_images
    self.data_images, self.data_targets = data_images, data_targets

    seed = parameters.seed
    n_hidden = parameters.n_hidden

    seed = np.uint32(seed)
    self.random_state = np.random.RandomState(seed=seed)

    n_output = 10  # This is always true for MNIST
    self.nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)

    self.random_state = np.random.RandomState(seed=seed)

    # create_individual can be called because __init__ is complete
    # except for traj initialization
    indiv_dict = self.create_individual()
    for key, val in indiv_dict.items():
        traj.individual.f_add_parameter(key, val)
    traj.individual.f_add_parameter('seed', seed)
Example #12
Source File: nn.py From L2L with GNU General Public License v3.0
def main():
    from sklearn.datasets import load_digits, fetch_mldata

    SMALL_MNIST = False

    if SMALL_MNIST:
        mnist_digits = load_digits()
        n_input = np.prod(mnist_digits.images.shape[1:])
        n_images = len(mnist_digits.images)  # 1797
        data_images = mnist_digits.images.reshape(n_images, -1) / 16.  # -> 1797 x 64
        data_targets = mnist_digits.target
        # im_size_x, im_size_y = 8, 8
    else:
        mnist_digits = fetch_mldata('MNIST original')
        n_input = np.prod(mnist_digits.data.shape[1:])
        data_images = mnist_digits.data / 255.  # -> 70000 x 784
        data_targets = mnist_digits.target
        # im_size_x, im_size_y = 28, 28

    n_hidden, n_output = 5, 10
    nn = NeuralNetworkClassifier(n_input, n_hidden, n_output)
    weight_shapes = nn.get_weights_shapes()
    weights = []
    for weight_shape in weight_shapes:
        weights.append(np.random.randn(*weight_shape))
    nn.set_weights(*weights)
    score = nn.score(data_images, data_targets)
    print("Score is: ", score)
Example #13
Source File: data.py From barrista with MIT License
def test_data():
    """Get the `MNIST original` test data."""
    mnist = _fetch_mldata('MNIST original',
                          data_home=_os.path.join(_DATA_FOLDER,
                                                  'MNIST_original'))
    return (mnist.data[60000:, :].reshape((10000, 1, 28, 28)).astype('float32'),
            mnist.target[60000:].reshape((10000, 1)).astype('float32'))
Example #14
Source File: data.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def get_mnist():
    """ Gets MNIST dataset """
    np.random.seed(1234)  # set seed for deterministic ordering
    data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    data_path = os.path.join(data_path, '../../data')
    mnist = fetch_mldata('MNIST original', data_home=data_path)
    p = np.random.permutation(mnist.data.shape[0])
    X = mnist.data[p].astype(np.float32) * 0.02
    Y = mnist.target[p]
    return X, Y
Example #15
Source File: bench_ml.py From scikit-optimize with BSD 3-Clause "New" or "Revised" License
def load_data_target(name):
    """
    Loads data and target given the name of the dataset.
    """
    if name == "Boston":
        data = load_boston()
    elif name == "Housing":
        data = fetch_california_housing()
        dataset_size = 1000  # this is necessary so that SVR does not slow down too much
        data["data"] = data["data"][:dataset_size]
        data["target"] = data["target"][:dataset_size]
    elif name == "digits":
        data = load_digits()
    elif name == "Climate Model Crashes":
        try:
            data = fetch_mldata("climate-model-simulation-crashes")
        except HTTPError as e:
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat"
            data = urlopen(url).read().split('\n')[1:]
            data = [[float(v) for v in d.split()] for d in data]
            samples = np.array(data)
            data = dict()
            data["data"] = samples[:, :-1]
            data["target"] = np.array(samples[:, -1], dtype=np.int)
    else:
        raise ValueError("dataset not supported.")
    return data["data"], data["target"]
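Since mldata.org is offline, the HTTPError fallback above is effectively the only branch that still works for the climate data; a hedged alternative would be to pull the same UCI dataset from OpenML (the dataset name 'climate-model-simulation-crashes' is an assumption here, not verified):

from sklearn.datasets import fetch_openml

# Hedged sketch: the OpenML dataset name below is assumed, not verified.
data = fetch_openml('climate-model-simulation-crashes', as_frame=False)
X, y = data.data, data.target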
Example #16
Source File: data.py From training_results_v0.6 with Apache License 2.0
def get_mnist():
    np.random.seed(1234)  # set seed for deterministic ordering
    data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    data_path = os.path.join(data_path, '../../data')
    mnist = fetch_mldata('MNIST original', data_home=data_path)
    p = np.random.permutation(mnist.data.shape[0])
    X = mnist.data[p].astype(np.float32) * 0.02
    Y = mnist.target[p]
    return X, Y
Example #17
Source File: DatasetLoad.py From deepJDOT with MIT License
def MNIST_dataload():
    from sklearn.datasets import fetch_mldata
    import numpy as np
    mnist = fetch_mldata('MNIST original')
    Data = mnist.data
    label = mnist.target
    return Data, label
Example #18
Source File: data.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def get_mnist():
    np.random.seed(1234)  # set seed for deterministic ordering
    data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    data_path = os.path.join(data_path, '../../data')
    mnist = fetch_mldata('MNIST original', data_home=data_path)
    p = np.random.permutation(mnist.data.shape[0])
    X = mnist.data[p].astype(np.float32) * 0.02
    Y = mnist.target[p]
    return X, Y
Example #19
Source File: create_data.py From active-learning with Apache License 2.0
def get_mldata(dataset):
    # Use scikit to grab datasets and save them to save_dir.
    save_dir = FLAGS.save_dir
    filename = os.path.join(save_dir, dataset[1] + '.pkl')

    if not gfile.Exists(save_dir):
        gfile.MkDir(save_dir)
    if not gfile.Exists(filename):
        if dataset[0][-3:] == 'csv':
            data = get_csv_data(dataset[0])
        elif dataset[0] == 'breast_cancer':
            data = load_breast_cancer()
        elif dataset[0] == 'iris':
            data = load_iris()
        elif dataset[0] == 'newsgroup':
            # Removing header information to make sure that no newsgroup
            # identifying information is included in data
            data = fetch_20newsgroups_vectorized(subset='all', remove=('headers'))
            tfidf = TfidfTransformer(norm='l2')
            X = tfidf.fit_transform(data.data)
            data.data = X
        elif dataset[0] == 'rcv1':
            sklearn.datasets.rcv1.URL = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
            sklearn.datasets.rcv1.URL_topics = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
            data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
        elif dataset[0] == 'wikipedia_attack':
            data = get_wikipedia_talk_data()
        elif dataset[0] == 'cifar10':
            data = get_cifar10()
        elif 'keras' in dataset[0]:
            data = get_keras_data(dataset[0])
        else:
            try:
                data = fetch_mldata(dataset[0])
            except:
                raise Exception('ERROR: failed to fetch data from mldata.org')
        X = data.data
        y = data.target
        # mldata served some matrices transposed; realign so rows are samples
        if X.shape[0] != y.shape[0]:
            X = np.transpose(X)
        assert X.shape[0] == y.shape[0]

        data = {'data': X, 'target': y}
        pickle.dump(data, gfile.GFile(filename, 'w'))
Example #20
Source File: restricted_boltzmann_machine.py From ML-From-Scratch with MIT License
def main():
    mnist = fetch_mldata('MNIST original')

    X = mnist.data / 255.0
    y = mnist.target

    # Select the samples of the digit 2
    X = X[y == 2]

    # Limit dataset to 500 samples
    idx = np.random.choice(range(X.shape[0]), size=500, replace=False)
    X = X[idx]

    rbm = RBM(n_hidden=50, n_iterations=200, batch_size=25, learning_rate=0.001)
    rbm.fit(X)

    # Training error plot
    training, = plt.plot(range(len(rbm.training_errors)), rbm.training_errors,
                         label="Training Error")
    plt.legend(handles=[training])
    plt.title("Error Plot")
    plt.ylabel('Error')
    plt.xlabel('Iterations')
    plt.show()

    # Get the images that were reconstructed during training
    gen_imgs = rbm.training_reconstructions

    # Plot the reconstructed images during the first iteration
    fig, axs = plt.subplots(5, 5)
    plt.suptitle("Restricted Boltzmann Machine - First Iteration")
    cnt = 0
    for i in range(5):
        for j in range(5):
            axs[i, j].imshow(gen_imgs[0][cnt].reshape((28, 28)), cmap='gray')
            axs[i, j].axis('off')
            cnt += 1
    fig.savefig("rbm_first.png")
    plt.close()

    # Plot the images during the last iteration
    fig, axs = plt.subplots(5, 5)
    plt.suptitle("Restricted Boltzmann Machine - Last Iteration")
    cnt = 0
    for i in range(5):
        for j in range(5):
            axs[i, j].imshow(gen_imgs[-1][cnt].reshape((28, 28)), cmap='gray')
            axs[i, j].axis('off')
            cnt += 1
    fig.savefig("rbm_last.png")
    plt.close()
Example #21
Source File: load.py From FRU with MIT License
def load_mnist(params):
    mnist = fetch_mldata('MNIST original')
    mnist_X, mnist_y = shuffle(mnist.data, mnist.target,
                               random_state=params.random_seed)
    mnist_X = mnist_X / 255.0
    print("MNIST data prepared")

    mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64')

    def flatten_img(images):
        '''
        images: shape => (n, rows, columns)
        output: shape => (n, rows*columns)
        '''
        n_rows = images.shape[1]
        n_columns = images.shape[2]
        for num in range(n_rows):
            if num % 2 != 0:
                images[:, num, :] = images[:, num, :][:, ::-1]
        output = images.reshape(-1, n_rows * n_columns)
        return output

    time_steps = 28 * 28
    if params.dataset.startswith("mnist.permute"):
        print("permute MNIST")
        mnist_X = mnist_X.reshape((-1, time_steps))
        perm = np.random.permutation(time_steps)
        for i in range(len(mnist_X)):
            mnist_X[i] = mnist_X[i][perm]
        if len(params.dataset) > len("mnist.permute."):
            time_steps = int(params.dataset[len("mnist.permute."):])
    else:
        if len(params.dataset) > len("mnist."):  # mnist.xx
            time_steps = int(params.dataset[len("mnist."):])
    print("time_steps =", time_steps)

    # integer division keeps the reshaped feature dimension an int
    mnist_X = mnist_X.reshape((-1, time_steps, 28 * 28 // time_steps))
    # mnist_X = flatten_img(mnist_X)  # X.shape => (n_samples, seq_len)
    print("mnist_X.shape =", mnist_X.shape)
    # mnist_X = mnist_X[:, :, np.newaxis]  # X.shape => (n_samples, seq_len, n_features)

    # one-hot encode the digit labels
    mnist_y_one_hot = np.zeros((mnist_y.shape[0], 10))
    for i in range(len(mnist_y)):
        mnist_y_one_hot[i][mnist_y[i]] = 1
    print("mnist_y.shape =", mnist_y_one_hot.shape)

    # split into training and test sets
    train_X, test_X, train_y, test_y = train_test_split(
        mnist_X, mnist_y_one_hot, test_size=0.2,
        random_state=params.random_seed)

    # set parameters according to the dataset
    params.time_steps = train_X.shape[1]
    params.input_size = train_X.shape[2]
    params.output_size = 10
    params.regression_flag = False
    return train_X, test_X, train_y, test_y