Python sklearn.datasets.load_svmlight_file() Examples
The following are 30 code examples of sklearn.datasets.load_svmlight_file(), collected from open-source projects. The project, source file, and license for each example are listed above its code. You may also want to check out the other functions and classes available in the sklearn.datasets module.
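As a quick orientation before the project examples, here is a minimal sketch of the basic call pattern. The file names data.svmlight and ranking.svmlight are placeholders, not files from the projects below.

# Minimal sketch (illustrative only; file paths are hypothetical).
from sklearn.datasets import load_svmlight_file

# X is a scipy.sparse CSR matrix of features, y is a numpy array of labels.
X, y = load_svmlight_file("data.svmlight")
print(X.shape, y.shape)

# Ranking datasets that carry qid fields can also return the query ids.
X, y, qid = load_svmlight_file("ranking.svmlight", query_id=True)

The examples below show the same call used for tasks such as streaming from temporary files, loading multilabel data, and reading ranking datasets with query ids.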
Example #1
Source File: data_io.py From Kaggler with MIT License | 6 votes |
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense. By default, it is false.

    Returns:
        Data matrix X and target vector y
    """

    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}
    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)

    if dense and sparse.issparse(X):
        X = X.todense()

    return X, y
Example #2
Source File: sklearn_test.py From nni with MIT License | 6 votes |
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train))
Example #3
Source File: datasets.py From RFHO with MIT License | 6 votes |
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    if as_tensor:
        [dat.convert_to_tensor() for dat in res]
    return res


# noinspection PyPep8Naming
Example #4
Source File: io.py From dislib with Apache License 2.0 | 6 votes |
def _read_svmlight(lines, out_blocks, col_size, n_features, store_sparse):
    from tempfile import SpooledTemporaryFile
    from sklearn.datasets import load_svmlight_file

    # Creating a tmp file to use load_svmlight_file method should be more
    # efficient than parsing the lines manually
    tmp_file = SpooledTemporaryFile(mode="wb+", max_size=2e8)
    tmp_file.writelines(lines)
    tmp_file.seek(0)

    x, y = load_svmlight_file(tmp_file, n_features)

    if not store_sparse:
        x = x.toarray()

    # tried also converting to csc/ndarray first for faster splitting but it's
    # not worth. Position 0 contains the X
    for i in range(ceil(n_features / col_size)):
        out_blocks[0][i] = x[:, i * col_size:(i + 1) * col_size]

    # Position 1 contains the y block
    out_blocks[1][0] = y.reshape(-1, 1)
Example #5
Source File: test_array.py From dislib with Apache License 2.0 | 6 votes |
def test_load_svmlight_file(self):
    """ Tests loading a LibSVM file """
    file_ = "tests/files/libsvm/1"

    x_np, y_np = load_svmlight_file(file_, n_features=780)

    # Load SVM and store in sparse
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=True)

    self.assertTrue(_equal_arrays(x.collect(), x_np))
    self.assertTrue(_equal_arrays(y.collect(), y_np))

    # Load SVM and store in dense
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=False)

    self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
    self.assertTrue(_equal_arrays(y.collect(), y_np))
Example #6
Source File: test_svmlight_format.py From rankeval with Mozilla Public License 2.0 | 6 votes |
def test_dump_qid(self):
    tmpfile = "/tmp/tmp_dump.txt"
    try:
        # loads from file
        Xs, y, q = load_svmlight_file(qid_datafile, query_id=True)
        # dumps to file
        dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False)
        # loads them as CSR MATRIX with scikit-learn
        X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)
        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
        assert_array_equal(q, q2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example #7
Source File: test_svmlight_format.py From rankeval with Mozilla Public License 2.0 | 6 votes |
def test_dump(self):
    tmpfile = "tmp_dump.txt"
    try:
        # loads from file
        Xs, y = load_svmlight_file(datafile)
        # dumps to file
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)
        # loads them as CSR MATRIX
        X2, y2 = sk_load_svmlight_file(tmpfile)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)
        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example #8
Source File: data_reader.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def read_year_prediction_data(fileName):
    feature_dim = 90
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))
    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels
Example #9
Source File: python2libsvm.py From AiLearning with GNU General Public License v3.0 | 5 votes |
def get_data(file_input, separator='\t'):
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)

    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1]
Example #10
Source File: pg2.py From aca with MIT License | 5 votes |
def homepage_xgb_model(model_path, training_set='True'):
    training_set = './data/%s_features.svm.txt' % (training_set)
    model = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=5,
        min_child_weight=1,
        gamma=0.3,
        subsample=0.7,
        colsample_bytree=0.7,
        objective='binary:logistic',
        scale_pos_weight=1)
    X, y = load_svmlight_file(training_set)
    model.fit(X, y)
    pickle.dump(model, open(model_path, 'wb'))
    return model
Example #11
Source File: datasets.py From interpret-community with MIT License | 5 votes |
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = 'datasets.12.18.2019'
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + '.zip'
        urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)
        with zipfile.ZipFile(zipfilename, 'r') as unzip:
            unzip.extractall('.')
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == '.npz':
        # sparse format file
        from scipy.sparse import load_npz
        return load_npz(filepath)
    elif extension == '.svmlight':
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == '.json':
        import json
        with open(filepath, encoding='utf-8') as f:
            dataset = json.load(f)
        return dataset
    elif extension == '.csv':
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception('Unrecognized file extension: ' + extension)
Example #12
Source File: transitionparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False
        )

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=verbose,
            probability=True,
        )

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example #13
Source File: io.py From transfer with MIT License | 5 votes |
def __init__(self, file_path, domain=0):
    self.file_path = file_path
    X, Y = load_svmlight_file(file_path)  # Y is synthetic label, not used
    X = X.todense().astype(np.float32)
    self.X = torch.from_numpy(X)
    self.Y = torch.LongTensor([domain] * self.X.shape[0])
Example #14
Source File: io.py From transfer with MIT License | 5 votes |
def __init__(self, file_path):
    self.file_path = file_path
    X, Y = load_svmlight_file(file_path)
    # X is a sparse matrix
    # L = [X[i].nonzero()[0].shape[0] for i in range(X.shape[0])]
    X = X.todense().astype(np.float32)
    Y = np.array((Y + 1) / 2, dtype=int)
    self.X = torch.from_numpy(X)
    self.Y = torch.from_numpy(Y)
Example #15
Source File: dataset_loading.py From allRank with Apache License 2.0 | 5 votes |
def from_svm_file(cls, svm_file_path, transform=None):
    """
    Instantiate a LibSVMDataset from a LibSVM file path.

    :param svm_file_path: LibSVM file path
    :param transform: a callable defining an optional transformation called on the dataset
    :return: LibSVMDataset instantiated from a given file and with an optional transformation defined
    """
    x, y, query_ids = load_svmlight_file(svm_file_path, query_id=True)
    logger.info("loaded dataset from {} and got x shape {}, y shape {} and query_ids shape {}".format(
        svm_file_path, x.shape, y.shape, query_ids.shape))
    return cls(x, y, query_ids, transform)
Example #16
Source File: benchmark_test.py From nni with MIT License | 5 votes |
def run_test(self, pipeline, name, path):
    print("download " + name)
    update_name = self.download(name, path)

    X, y = load_svmlight_file(update_name)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)

    pipeline.fit(X_train, y_train)
    print("[Benchmark " + name + " Score]: ", pipeline.score(X_test, y_test))
Example #17
Source File: classifier.py From Semantic-Texual-Similarity-Toolkits with MIT License | 5 votes |
def load_file(self, file_path):
    data = load_svmlight_file(file_path)
    return data[0], data[1]
Example #18
Source File: datasets.py From interpret-text with MIT License | 5 votes |
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = "datasets.1.17.2019"
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile

        zipfilename = outdirname + ".zip"
        urlretrieve(
            "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
            zipfilename,
        )
        with zipfile.ZipFile(zipfilename, "r") as unzip:
            unzip.extractall(".")
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == ".npz":
        # sparse format file
        import scipy.sparse as sparse

        return sparse.load_npz(filepath)
    elif extension == ".svmlight":
        from sklearn import datasets

        return datasets.load_svmlight_file(filepath)
    elif extension == ".json":
        import json

        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
        return dataset
    elif extension == ".csv":
        import pandas as pd

        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception("Unrecognized file extension: " + extension)
Example #19
Source File: transitionparser.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def train(self, depgraphs, modelfile):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=True,
            probability=True)

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example #20
Source File: fetch_url_dataset.py From tick with BSD 3-Clause "New" or "Revised" License | 5 votes |
def load_url_dataset_day(cache_path, days):
    """Loads url dataset from a tar file

    Parameters
    ----------
    cache_path : `str`
        Path to the tar file

    days : `list` or `range`
        Days to be loaded

    Returns
    -------
    X : `np.ndarray`
        A sparse matrix containing the features

    y : `np.ndarray`
        An array containing the labels
    """
    tar_file = tarfile.open(cache_path, "r:gz")

    X, y = None, None
    for day in days:
        data_filename = 'url_svmlight/Day{}.svm'.format(day)
        with tar_file.extractfile(data_filename) as data_file:
            X_day, y_day = load_svmlight_file(data_file, n_features=_N_FEATURES)

        if X is None:
            X, y = X_day, y_day
        else:
            X = scipy.sparse.vstack((X, X_day))
            y = np.hstack((y, y_day))

    return X, y
Example #21
Source File: download_helper.py From tick with BSD 3-Clause "New" or "Revised" License | 5 votes |
def load_dataset(dataset_path, data_home=None, n_features=None):
    """Load dataset from given path

    Parameters
    ----------
    dataset_path : `str`
        Dataset relative path

    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with TICK_DATASETS environement variable
        all tick datasets are stored in '~/tick_datasets' subfolders.

    n_features : `int`, optional, default=None
        The number of features to use. If None, it will be inferred. This
        argument is useful to load several files that are subsets of a
        bigger sliced dataset: each subset might not have examples of
        every feature, hence the inferred shape might vary from one
        slice to another.

    Returns
    -------
    output : `np.ndarray` or `dict` or `tuple`
        Dataset. Its format will depend on queried dataset.
    """
    data_home = get_data_home(data_home)
    cache_path = os.path.join(data_home, dataset_path)

    if cache_path.endswith(".npz"):
        dataset = np.load(cache_path, allow_pickle=True)
        # If we have only one numpy array we return it directly otherwise
        # we return the row dictionary
        if len(dataset) == 1:
            key_0 = list(dataset.keys())[0]
            dataset = dataset[key_0]
        else:
            dataset = dataset.items()
    else:
        dataset = load_svmlight_file(cache_path, n_features=n_features)

    return dataset
Example #22
Source File: data_reader.py From training_results_v0.6 with Apache License 2.0 | 5 votes |
def get_year_prediction_data(dirname=None):
    feature_dim = 90
    if dirname is None:
        dirname = os.path.join(os.path.dirname(__file__), 'data')
    filename = 'YearPredictionMSD'
    download_filename = os.path.join(dirname, "%s.bz2" % filename)
    extracted_filename = os.path.join(dirname, filename)

    if not os.path.isfile(download_filename):
        print("Downloading data...")
        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)

    if not os.path.isfile(extracted_filename):
        print("Extracting data...")
        with bz2.BZ2File(download_filename) as fr, open(extracted_filename, "wb") as fw:
            shutil.copyfileobj(fr, fw)

    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))
    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels
Example #23
Source File: dataset.py From libact with BSD 2-Clause "Simplified" License | 5 votes |
def import_libsvm_sparse(filename):
    """Imports dataset file in libsvm sparse format"""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(filename)
    return Dataset(X.toarray(), y)
Example #24
Source File: run_sklearn.py From recipy with Apache License 2.0 | 5 votes |
def load_svmlight_file(self):
    """
    Use sklearn.datasets.load_svmlight_file to load data.svmlight.
    """
    file_name = os.path.join(self.data_dir, "data.svmlight")
    datasets.load_svmlight_file(file_name)
Example #25
Source File: data_utils.py From pyxclib with MIT License | 5 votes |
def read_data(filename, header=True, dtype='float32', zero_based=True):
    """Read data in sparse format

    Arguments
    ---------
    filename: str
        output file name
    header: bool, default=True
        If header is present or not
    dtype: str, default='float32'
        data type of values
    zero_based: boolean, default=True
        zwero based indices?

    Returns
    --------
    features: csr_matrix
        features matrix
    labels: csr_matix
        labels matrix
    num_samples: int
        #instances
    num_feat: int
        #features
    num_labels: int
        #labels
    """
    with open(filename, 'rb') as f:
        _l_shape = None
        if header:
            line = f.readline().decode('utf-8').rstrip("\n")
            line = line.split(" ")
            num_samples, num_feat, num_labels = int(
                line[0]), int(line[1]), int(line[2])
            _l_shape = (num_samples, num_labels)
        else:
            num_samples, num_feat, num_labels = None, None, None
        features, labels = load_svmlight_file(f, multilabel=True)
        labels = ll_to_sparse(
            labels, dtype=dtype, zero_based=zero_based, shape=_l_shape)
    return features, labels, num_samples, num_feat, num_labels
Example #26
Source File: test_multilabel_realdata.py From libact with BSD 2-Clause "Simplified" License | 5 votes |
def setUp(self):
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'datasets/yeast_train.svm')
    X, y = load_svmlight_file(dataset_filepath, multilabel=True)
    self.X = X.todense().tolist()
    self.y = MultiLabelBinarizer().fit_transform(y).tolist()
    self.quota = 10
Example #27
Source File: io.py From dislib with Apache License 2.0 | 4 votes |
def load_svmlight_file(path, block_size, n_features, store_sparse):
    """ Loads a SVMLight file into a distributed array.

    Parameters
    ----------
    path : string
        File path.
    block_size : tuple (int, int)
        Size of the blocks for the output ds-array.
    n_features : int
        Number of features.
    store_sparse : boolean
        Whether to use scipy.sparse data structures to store data. If False,
        numpy.array is used instead.

    Returns
    -------
    x, y : (ds-array, ds-array)
        A distributed representation (ds-array) of the X and y.
    """
    n, m = block_size
    lines = []
    x_blocks, y_blocks = [], []

    n_rows = 0
    with open(path, "r") as f:
        for line in f:
            n_rows += 1
            lines.append(line.encode())

            if len(lines) == n:
                # line 0 -> X, line 1 -> y
                out_blocks = Array._get_out_blocks((1, ceil(n_features / m)))
                out_blocks.append([object()])
                # out_blocks.append([])
                _read_svmlight(lines, out_blocks, col_size=m,
                               n_features=n_features,
                               store_sparse=store_sparse)
                # we append only the list forming the row (out_blocks depth=2)
                x_blocks.append(out_blocks[0])
                y_blocks.append(out_blocks[1])
                lines = []

    if lines:
        out_blocks = Array._get_out_blocks((1, ceil(n_features / m)))
        out_blocks.append([object()])
        _read_svmlight(lines, out_blocks, col_size=m,
                       n_features=n_features,
                       store_sparse=store_sparse)
        # we append only the list forming the row (out_blocks depth=2)
        x_blocks.append(out_blocks[0])
        y_blocks.append(out_blocks[1])

    x = Array(x_blocks, top_left_shape=block_size, reg_shape=block_size,
              shape=(n_rows, n_features), sparse=store_sparse)

    # y has only a single line but it's treated as a 'column'
    y = Array(y_blocks, top_left_shape=(n, 1), reg_shape=(n, 1),
              shape=(n_rows, 1), sparse=False)

    return x, y
Example #28
Source File: bidnn.py From BiDNN with GNU Affero General Public License v3.0 | 4 votes |
def load_dataset(self, X=None):
    if self.conf.verbosity > 1:
        print "Loading dataset..."

    if X is None:
        self.X_train, self.tl = load_svmlight_file(self.conf.fname_in, dtype=np.float32, multilabel=False)
        # we're saving tl (target labels) just in case they exist and the user needs them - since
        # this is unsupervised learning, we completely ignore the labels and don't expect them to exist
    else:
        self.X_train = X

    self.X_train = self.X_train.todense()

    if (self.conf.mod1size + self.conf.mod2size) != self.X_train.shape[1]:
        raise ValueError("Provided dimensionality of 1st modality ("+str(self.conf.mod1size)+") and 2nd modality ("+str(self.conf.mod2size)+") " \
                         "does not sum to the dimensionality provided in the input file ("+str(self.X_train.shape[1])+")")

    # indices of missing modalities (stored for later)
    self.idxMissingFirst = []
    self.idxMissingSecond = []

    # generate training data for modality translation
    self.X_first = []
    self.X_second = []

    bothMissing = both = 0

    if self.conf.ignore_zeroes:
        # zeroes are not treated as missing modalities
        # I have no idea why this might be useful, but ok :D
        # since idxMissing* are left empty, this is the only
        # place where we should take care of this
        for i in range(self.X_train.shape[0]):
            both += 1
            self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
            self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))
    else:
        # zero vectors are treated as missing modalities (default)
        for i in range(self.X_train.shape[0]):
            if not np.any(self.X_train[i, :self.conf.mod1size]):  # first missing
                if np.any(self.X_train[i, self.conf.mod1size:]):  # second not missing
                    # second ok, need to reconstruct first
                    self.idxMissingFirst.append(i)
                else:
                    bothMissing += 1  # missing both
            else:  # first ok
                if not np.any(self.X_train[i, self.conf.mod1size:]):  # second missing
                    self.idxMissingSecond.append(i)
                else:
                    # both ok -> use them to train translator
                    both += 1
                    self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
                    self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))

    if self.conf.verbosity > 1:
        print "Both modalities present:", both, "\nMissing 1st:", len(self.idxMissingFirst), "\nMissing 2nd:", len(self.idxMissingSecond)
        print "Missing both modalities:", bothMissing, "\n"

    self.X_first = np.array(self.X_first)
    self.X_second = np.array(self.X_second)
Example #29
Source File: libsvm.py From celer with BSD 3-Clause "New" or "Revised" License | 4 votes |
def get_X_y(dataset, compressed_path, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.
    If X and y already exists as npz and npy, they are not redownloaded
    unless replace=True."""
    ext = '.npz' if multilabel else '.npy'
    y_path = pjoin(CELER_PATH, "%s_target%s" % (NAMES[dataset], ext))
    X_path = pjoin(CELER_PATH, "%s_data.npz" % NAMES[dataset])
    if replace or not os.path.isfile(y_path) or not os.path.isfile(X_path):
        tmp_path = pjoin(CELER_PATH, "%s" % NAMES[dataset])

        decompressor = BZ2Decompressor()
        print("Decompressing...")
        with open(tmp_path, "wb") as f, open(compressed_path, "rb") as g:
            for data in iter(lambda: g.read(100 * 1024), b''):
                f.write(decompressor.decompress(data))

        n_features_total = N_FEATURES[dataset]
        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(
                f, n_features_total, multilabel=multilabel)

        os.remove(tmp_path)
        X = sparse.csc_matrix(X)
        X.sort_indices()
        sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y
        else:
            np.save(y_path, y)
    else:
        X = sparse.load_npz(X_path)
        y = np.load(y_path)

    return X, y
Example #30
Source File: multiclass_soft_confidence_weighted_1_diag.py From python-online-machine-learning-library with BSD 3-Clause "New" or "Revised" License | 4 votes |
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname

    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]
    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)

    #X = X.toarray()
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)

    # learn
    model = MSCWIDiag(C=1, eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in xrange(0, n_samples):
        if i % 1000 == 0:
            print "#samples = %d" % i
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)

    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accurary: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm))