Python sklearn.datasets.load_svmlight_file() Examples
The following are 30 code examples of sklearn.datasets.load_svmlight_file(), collected from open-source projects. The project, source file, and license for each example are listed above its code. You may also want to check out the other functions and classes available in the sklearn.datasets module.
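As a quick orientation before the project examples, here is a minimal sketch of the basic call pattern. The file names data.svmlight and ranking.svmlight are placeholders, not files from the projects below.

# Minimal sketch (illustrative only; file paths are hypothetical).
from sklearn.datasets import load_svmlight_file

# X is a scipy.sparse CSR matrix of features, y is a numpy array of labels.
X, y = load_svmlight_file("data.svmlight")
print(X.shape, y.shape)

# Ranking datasets that carry qid fields can also return the query ids.
X, y, qid = load_svmlight_file("ranking.svmlight", query_id=True)

The examples below show the same call used for tasks such as streaming from temporary files, loading multilabel data, and reading ranking datasets with query ids.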
Example #1
Source File: data_io.py From Kaggler with MIT License | 6 votes |
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional variable indicating if the return matrix
            should be dense. By default, it is false.

    Returns:
        Data matrix X and target vector y
    """

    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}
    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)

    if dense and sparse.issparse(X):
        X = X.todense()

    return X, y
Example #2
Source File: sklearn_test.py From nni with MIT License | 6 votes |
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train))
Example #3
Source File: datasets.py From RFHO with MIT License | 6 votes |
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    if as_tensor:
        [dat.convert_to_tensor() for dat in res]
    return res


# noinspection PyPep8Naming
Example #4
Source File: io.py From dislib with Apache License 2.0 | 6 votes |
def _read_svmlight(lines, out_blocks, col_size, n_features, store_sparse):
    from tempfile import SpooledTemporaryFile
    from sklearn.datasets import load_svmlight_file

    # Creating a tmp file to use load_svmlight_file method should be more
    # efficient than parsing the lines manually
    tmp_file = SpooledTemporaryFile(mode="wb+", max_size=2e8)
    tmp_file.writelines(lines)
    tmp_file.seek(0)

    x, y = load_svmlight_file(tmp_file, n_features)

    if not store_sparse:
        x = x.toarray()

    # tried also converting to csc/ndarray first for faster splitting but it's
    # not worth. Position 0 contains the X
    for i in range(ceil(n_features / col_size)):
        out_blocks[0][i] = x[:, i * col_size:(i + 1) * col_size]

    # Position 1 contains the y block
    out_blocks[1][0] = y.reshape(-1, 1)
Example #5
Source File: test_array.py From dislib with Apache License 2.0 | 6 votes |
def test_load_svmlight_file(self):
    """ Tests loading a LibSVM file """
    file_ = "tests/files/libsvm/1"

    x_np, y_np = load_svmlight_file(file_, n_features=780)

    # Load SVM and store in sparse
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=True)

    self.assertTrue(_equal_arrays(x.collect(), x_np))
    self.assertTrue(_equal_arrays(y.collect(), y_np))

    # Load SVM and store in dense
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=False)

    self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
    self.assertTrue(_equal_arrays(y.collect(), y_np))
Example #6
Source File: test_svmlight_format.py From rankeval with Mozilla Public License 2.0 | 6 votes |
def test_dump_qid(self):
    tmpfile = "/tmp/tmp_dump.txt"
    try:
        # loads from file
        Xs, y, q = load_svmlight_file(qid_datafile, query_id=True)
        # dumps to file
        dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False)
        # loads them as CSR MATRIX with scikit-learn
        X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)
        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
        assert_array_equal(q, q2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example #7
Source File: test_svmlight_format.py From rankeval with Mozilla Public License 2.0 | 6 votes |
def test_dump(self):
    tmpfile = "tmp_dump.txt"
    try:
        # loads from file
        Xs, y = load_svmlight_file(datafile)
        # dumps to file
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)
        # loads them as CSR MATRIX
        X2, y2 = sk_load_svmlight_file(tmpfile)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)
        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example #8
Source File: data_reader.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def read_year_prediction_data(fileName):
    feature_dim = 90
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))
    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels
Example #9
Source File: python2libsvm.py From AiLearning with GNU General Public License v3.0 | 5 votes |
def get_data(file_input, separator='\t'):
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)

    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1]
Example #10
Source File: pg2.py From aca with MIT License | 5 votes |
def homepage_xgb_model(model_path, training_set='True'):
    training_set = './data/%s_features.svm.txt' % (training_set)
    model = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=5,
        min_child_weight=1,
        gamma=0.3,
        subsample=0.7,
        colsample_bytree=0.7,
        objective='binary:logistic',
        scale_pos_weight=1)
    X, y = load_svmlight_file(training_set)
    model.fit(X, y)
    pickle.dump(model, open(model_path, 'wb'))
    return model
Example #11
Source File: datasets.py From interpret-community with MIT License | 5 votes |
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = 'datasets.12.18.2019'
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + '.zip'
        urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)
        with zipfile.ZipFile(zipfilename, 'r') as unzip:
            unzip.extractall('.')
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == '.npz':
        # sparse format file
        from scipy.sparse import load_npz
        return load_npz(filepath)
    elif extension == '.svmlight':
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == '.json':
        import json
        with open(filepath, encoding='utf-8') as f:
            dataset = json.load(f)
        return dataset
    elif extension == '.csv':
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception('Unrecognized file extension: ' + extension)
Example #12
Source File: transitionparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False
        )

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=verbose,
            probability=True,
        )

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example #13
Source File: io.py From transfer with MIT License | 5 votes |
def __init__(self, file_path, domain=0):
    self.file_path = file_path
    X, Y = load_svmlight_file(file_path)  # Y is synthetic label, not used
    X = X.todense().astype(np.float32)
    self.X = torch.from_numpy(X)
    self.Y = torch.LongTensor([domain] * self.X.shape[0])
Example #14
Source File: io.py From transfer with MIT License | 5 votes |
def __init__(self, file_path):
    self.file_path = file_path
    X, Y = load_svmlight_file(file_path)
    # X is a sparse matrix
    # L = [X[i].nonzero()[0].shape[0] for i in range(X.shape[0])]
    X = X.todense().astype(np.float32)
    Y = np.array((Y + 1) / 2, dtype=int)
    self.X = torch.from_numpy(X)
    self.Y = torch.from_numpy(Y)
Example #15
Source File: dataset_loading.py From allRank with Apache License 2.0 | 5 votes |
def from_svm_file(cls, svm_file_path, transform=None):
    """
    Instantiate a LibSVMDataset from a LibSVM file path.

    :param svm_file_path: LibSVM file path
    :param transform: a callable defining an optional transformation called on the dataset
    :return: LibSVMDataset instantiated from a given file and with an optional transformation defined
    """
    x, y, query_ids = load_svmlight_file(svm_file_path, query_id=True)
    logger.info("loaded dataset from {} and got x shape {}, y shape {} and query_ids shape {}".format(
        svm_file_path, x.shape, y.shape, query_ids.shape))
    return cls(x, y, query_ids, transform)
Example #16
Source File: benchmark_test.py From nni with MIT License | 5 votes |
def run_test(self, pipeline, name, path):
    print("download " + name)
    update_name = self.download(name, path)

    X, y = load_svmlight_file(update_name)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)

    pipeline.fit(X_train, y_train)
    print("[Benchmark " + name + " Score]: ", pipeline.score(X_test, y_test))
Example #17
Source File: classifier.py From Semantic-Texual-Similarity-Toolkits with MIT License | 5 votes |
def load_file(self, file_path):
    data = load_svmlight_file(file_path)
    return data[0], data[1]
Example #18
Source File: datasets.py From interpret-text with MIT License | 5 votes |
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = "datasets.1.17.2019"
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile

        zipfilename = outdirname + ".zip"
        urlretrieve(
            "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
            zipfilename,
        )
        with zipfile.ZipFile(zipfilename, "r") as unzip:
            unzip.extractall(".")
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == ".npz":
        # sparse format file
        import scipy.sparse as sparse

        return sparse.load_npz(filepath)
    elif extension == ".svmlight":
        from sklearn import datasets

        return datasets.load_svmlight_file(filepath)
    elif extension == ".json":
        import json

        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
        return dataset
    elif extension == ".csv":
        import pandas as pd

        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception("Unrecognized file extension: " + extension)
Example #19
Source File: transitionparser.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def train(self, depgraphs, modelfile):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        # The parameter is set according to the paper:
        # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
        # Todo : because of probability = True => very slow due to
        # cross-validation. Need to improve the speed here
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=True,
            probability=True)

        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example #20
Source File: fetch_url_dataset.py From tick with BSD 3-Clause "New" or "Revised" License | 5 votes |
def load_url_dataset_day(cache_path, days):
    """Loads url dataset from a tar file

    Parameters
    ----------
    cache_path : `str`
        Path to the tar file

    days : `list` or `range`
        Days to be loaded

    Returns
    -------
    X : `np.ndarray`
        A sparse matrix containing the features

    y : `np.ndarray`
        An array containing the labels
    """
    tar_file = tarfile.open(cache_path, "r:gz")

    X, y = None, None
    for day in days:
        data_filename = 'url_svmlight/Day{}.svm'.format(day)
        with tar_file.extractfile(data_filename) as data_file:
            X_day, y_day = load_svmlight_file(data_file, n_features=_N_FEATURES)

        if X is None:
            X, y = X_day, y_day
        else:
            X = scipy.sparse.vstack((X, X_day))
            y = np.hstack((y, y_day))

    return X, y
Example #21
Source File: download_helper.py From tick with BSD 3-Clause "New" or "Revised" License | 5 votes |
def load_dataset(dataset_path, data_home=None, n_features=None):
    """Load dataset from given path

    Parameters
    ----------
    dataset_path : `str`
        Dataset relative path

    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with TICK_DATASETS environement variable
        all tick datasets are stored in '~/tick_datasets' subfolders.

    n_features : `int`, optional, default=None
        The number of features to use. If None, it will be inferred. This
        argument is useful to load several files that are subsets of a
        bigger sliced dataset: each subset might not have examples of
        every feature, hence the inferred shape might vary from one
        slice to another.

    Returns
    -------
    output : `np.ndarray` or `dict` or `tuple`
        Dataset. Its format will depend on queried dataset.
    """
    data_home = get_data_home(data_home)
    cache_path = os.path.join(data_home, dataset_path)

    if cache_path.endswith(".npz"):
        dataset = np.load(cache_path, allow_pickle=True)
        # If we have only one numpy array we return it directly otherwise
        # we return the row dictionary
        if len(dataset) == 1:
            key_0 = list(dataset.keys())[0]
            dataset = dataset[key_0]
        else:
            dataset = dataset.items()
    else:
        dataset = load_svmlight_file(cache_path, n_features=n_features)

    return dataset
Example #22
Source File: data_reader.py From training_results_v0.6 with Apache License 2.0 | 5 votes |
def get_year_prediction_data(dirname=None):
    feature_dim = 90
    if dirname is None:
        dirname = os.path.join(os.path.dirname(__file__), 'data')
    filename = 'YearPredictionMSD'
    download_filename = os.path.join(dirname, "%s.bz2" % filename)
    extracted_filename = os.path.join(dirname, filename)

    if not os.path.isfile(download_filename):
        print("Downloading data...")
        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)

    if not os.path.isfile(extracted_filename):
        print("Extracting data...")
        with bz2.BZ2File(download_filename) as fr, open(extracted_filename, "wb") as fw:
            shutil.copyfileobj(fr, fw)

    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))
    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels
Example #23
Source File: dataset.py From libact with BSD 2-Clause "Simplified" License | 5 votes |
def import_libsvm_sparse(filename):
    """Imports dataset file in libsvm sparse format"""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(filename)
    return Dataset(X.toarray(), y)
Example #24
Source File: run_sklearn.py From recipy with Apache License 2.0 | 5 votes |
def load_svmlight_file(self):
    """
    Use sklearn.datasets.load_svmlight_file to load data.svmlight.
    """
    file_name = os.path.join(self.data_dir, "data.svmlight")
    datasets.load_svmlight_file(file_name)
Example #25
Source File: data_utils.py From pyxclib with MIT License | 5 votes |
def read_data(filename, header=True, dtype='float32', zero_based=True):
    """Read data in sparse format

    Arguments
    ---------
    filename: str
        output file name
    header: bool, default=True
        If header is present or not
    dtype: str, default='float32'
        data type of values
    zero_based: boolean, default=True
        zwero based indices?

    Returns
    --------
    features: csr_matrix
        features matrix
    labels: csr_matix
        labels matrix
    num_samples: int
        #instances
    num_feat: int
        #features
    num_labels: int
        #labels
    """
    with open(filename, 'rb') as f:
        _l_shape = None
        if header:
            line = f.readline().decode('utf-8').rstrip("\n")
            line = line.split(" ")
            num_samples, num_feat, num_labels = int(
                line[0]), int(line[1]), int(line[2])
            _l_shape = (num_samples, num_labels)
        else:
            num_samples, num_feat, num_labels = None, None, None
        features, labels = load_svmlight_file(f, multilabel=True)
        labels = ll_to_sparse(
            labels, dtype=dtype, zero_based=zero_based, shape=_l_shape)
    return features, labels, num_samples, num_feat, num_labels
Example #26
Source File: test_multilabel_realdata.py From libact with BSD 2-Clause "Simplified" License | 5 votes |
def setUp(self):
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'datasets/yeast_train.svm')
    X, y = load_svmlight_file(dataset_filepath, multilabel=True)
    self.X = X.todense().tolist()
    self.y = MultiLabelBinarizer().fit_transform(y).tolist()
    self.quota = 10
Example #27
Source File: io.py From dislib with Apache License 2.0 | 4 votes |
def load_svmlight_file(path, block_size, n_features, store_sparse):
    """ Loads a SVMLight file into a distributed array.

    Parameters
    ----------
    path : string
        File path.
    block_size : tuple (int, int)
        Size of the blocks for the output ds-array.
    n_features : int
        Number of features.
    store_sparse : boolean
        Whether to use scipy.sparse data structures to store data. If False,
        numpy.array is used instead.

    Returns
    -------
    x, y : (ds-array, ds-array)
        A distributed representation (ds-array) of the X and y.
    """
    n, m = block_size
    lines = []
    x_blocks, y_blocks = [], []

    n_rows = 0
    with open(path, "r") as f:
        for line in f:
            n_rows += 1
            lines.append(line.encode())

            if len(lines) == n:
                # line 0 -> X, line 1 -> y
                out_blocks = Array._get_out_blocks((1, ceil(n_features / m)))
                out_blocks.append([object()])
                # out_blocks.append([])
                _read_svmlight(lines, out_blocks, col_size=m,
                               n_features=n_features,
                               store_sparse=store_sparse)
                # we append only the list forming the row (out_blocks depth=2)
                x_blocks.append(out_blocks[0])
                y_blocks.append(out_blocks[1])
                lines = []

    if lines:
        out_blocks = Array._get_out_blocks((1, ceil(n_features / m)))
        out_blocks.append([object()])
        _read_svmlight(lines, out_blocks, col_size=m,
                       n_features=n_features,
                       store_sparse=store_sparse)
        # we append only the list forming the row (out_blocks depth=2)
        x_blocks.append(out_blocks[0])
        y_blocks.append(out_blocks[1])

    x = Array(x_blocks, top_left_shape=block_size, reg_shape=block_size,
              shape=(n_rows, n_features), sparse=store_sparse)

    # y has only a single line but it's treated as a 'column'
    y = Array(y_blocks, top_left_shape=(n, 1), reg_shape=(n, 1),
              shape=(n_rows, 1), sparse=False)

    return x, y
Example #28
Source File: bidnn.py From BiDNN with GNU Affero General Public License v3.0 | 4 votes |
def load_dataset(self, X=None):
    if self.conf.verbosity > 1:
        print "Loading dataset..."

    if X is None:
        self.X_train, self.tl = load_svmlight_file(self.conf.fname_in, dtype=np.float32, multilabel=False)
        # we're saving tl (target labels) just in case they exist and the user needs them - since
        # this is unsupervised learning, we completely ignore the labels and don't expect them to exist
    else:
        self.X_train = X

    self.X_train = self.X_train.todense()

    if (self.conf.mod1size + self.conf.mod2size) != self.X_train.shape[1]:
        raise ValueError("Provided dimensionality of 1st modality ("+str(self.conf.mod1size)+") and 2nd modality ("+str(self.conf.mod2size)+") " \
                         "does not sum to the dimensionality provided in the input file ("+str(self.X_train.shape[1])+")")

    # indices of missing modalities (stored for later)
    self.idxMissingFirst = []
    self.idxMissingSecond = []

    # generate training data for modality translation
    self.X_first = []
    self.X_second = []

    bothMissing = both = 0

    if self.conf.ignore_zeroes:
        # zeroes are not treated as missing modalities
        # I have no idea why this might be useful, but ok :D
        # since idxMissing* are left empty, this is the only
        # place where we should take care of this
        for i in range(self.X_train.shape[0]):
            both += 1
            self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
            self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))
    else:
        # zero vectors are treated as missing modalities (default)
        for i in range(self.X_train.shape[0]):
            if not np.any(self.X_train[i, :self.conf.mod1size]):  # first missing
                if np.any(self.X_train[i, self.conf.mod1size:]):  # second not missing
                    # second ok, need to reconstruct first
                    self.idxMissingFirst.append(i)
                else:
                    bothMissing += 1  # missing both
            else:  # first ok
                if not np.any(self.X_train[i, self.conf.mod1size:]):  # second missing
                    self.idxMissingSecond.append(i)
                else:
                    # both ok -> use them to train translator
                    both += 1
                    self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
                    self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))

    if self.conf.verbosity > 1:
        print "Both modalities present:", both, "\nMissing 1st:", len(self.idxMissingFirst), "\nMissing 2nd:", len(self.idxMissingSecond)
        print "Missing both modalities:", bothMissing, "\n"

    self.X_first = np.array(self.X_first)
    self.X_second = np.array(self.X_second)
Example #29
Source File: libsvm.py From celer with BSD 3-Clause "New" or "Revised" License | 4 votes |
def get_X_y(dataset, compressed_path, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.
    If X and y already exists as npz and npy, they are not redownloaded
    unless replace=True."""
    ext = '.npz' if multilabel else '.npy'
    y_path = pjoin(CELER_PATH, "%s_target%s" % (NAMES[dataset], ext))
    X_path = pjoin(CELER_PATH, "%s_data.npz" % NAMES[dataset])
    if replace or not os.path.isfile(y_path) or not os.path.isfile(X_path):
        tmp_path = pjoin(CELER_PATH, "%s" % NAMES[dataset])

        decompressor = BZ2Decompressor()
        print("Decompressing...")
        with open(tmp_path, "wb") as f, open(compressed_path, "rb") as g:
            for data in iter(lambda: g.read(100 * 1024), b''):
                f.write(decompressor.decompress(data))

        n_features_total = N_FEATURES[dataset]
        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(
                f, n_features_total, multilabel=multilabel)

        os.remove(tmp_path)
        X = sparse.csc_matrix(X)
        X.sort_indices()
        sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y
        else:
            np.save(y_path, y)
    else:
        X = sparse.load_npz(X_path)
        y = np.load(y_path)

    return X, y
Example #30
Source File: multiclass_soft_confidence_weighted_1_diag.py From python-online-machine-learning-library with BSD 3-Clause "New" or "Revised" License | 4 votes |
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname

    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]
    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)

    #X = X.toarray()
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)

    # learn
    model = MSCWIDiag(C=1, eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in xrange(0, n_samples):
        if i % 1000 == 0:
            print "#samples = %d" % i
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)

    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accurary: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm))