Python scipy.io.mmread() Examples
The following are 12 code examples of scipy.io.mmread(), drawn from open-source projects. Each example notes its source file, originating project, and license. You may also want to check out the other available functions and classes of the scipy.io module.
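Before the project examples, here is a minimal sketch of mmread() itself (the path "data.mtx" is only a placeholder). mmread() returns a dense ndarray for array-format files and a sparse coo_matrix for coordinate-format files, which is usually converted to CSR or CSC before indexing:

    from scipy.io import mmread
    from scipy.sparse import coo_matrix

    # "data.mtx" is a placeholder for any Matrix Market file
    m = mmread("data.mtx")
    if isinstance(m, coo_matrix):
        m = m.tocsr()   # COO matrices do not support slicing; convert to CSR first
    print(m.shape, m.dtype)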
Example #1
Source File: _load_matrix.py From epiScanpy with BSD 3-Clause "New" or "Revised" License
def read_ATAC_10x(matrix, cell_names='', var_names='', path_file=''):
    """
    Load sparse matrix (including matrices corresponding to 10x data) as AnnData objects.
    Read the mtx file, the tsv file corresponding to cell_names and the bed file
    containing the variable names.

    Parameters
    ----------
    matrix: sparse count matrix
    cell_names: optional, tsv file containing cell names
    var_names: optional, bed file containing the feature names

    Return
    ------
    AnnData object
    """
    mat = mmread(''.join([path_file, matrix]))
    mat = mat.toarray()
    mat = np.matrix(mat.transpose())

    with open(path_file + cell_names) as f:
        barcodes = f.readlines()
        barcodes = [x[:-1] for x in barcodes]

    with open(path_file + var_names) as f:
        var_names = f.readlines()
        var_names = ["_".join(x[:-1].split('\t')) for x in var_names]

    adata = ad.AnnData(mat, obs=pd.DataFrame(index=barcodes), var=pd.DataFrame(index=var_names))
    adata.uns['omic'] = 'ATAC'
    return(adata)
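A hypothetical call, assuming a 10x-style output folder; the file names below are placeholders, not files shipped with epiScanpy. Note that path_file is simply string-concatenated with each file name, so a trailing slash is required:

    # hypothetical file names; path_file is concatenated, so keep the trailing "/"
    adata = read_ATAC_10x(
        "matrix.mtx",
        cell_names="barcodes.tsv",
        var_names="peaks.bed",
        path_file="filtered_peak_bc_matrix/",
    )
    print(adata)   # AnnData with cells as obs and peaks as var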
Example #2
Source File: main.py From yelp with GNU Lesser General Public License v2.1
def factorize_nmf():
    print('factorizing matrix')

    newsgroups_mmf_file = '/Users/fpena/tmp/nmf_graphlab/newsgroups/newsgroups_matrix.mmf'
    document_term_matrix = mmread(newsgroups_mmf_file)

    factorizer = decomposition.NMF(
        init="nndsvd",
        n_components=Constants.TOPIC_MODEL_NUM_TOPICS,
        max_iter=Constants.TOPIC_MODEL_ITERATIONS,
        alpha=Constants.NMF_REGULARIZATION,
        l1_ratio=Constants.NMF_REGULARIZATION_RATIO
    )
    document_topic_matrix = factorizer.fit_transform(document_term_matrix)
    topic_term_matrix = factorizer.components_
    # mmwrite(mmf_file, small_matrix)
    # mmwrite(newsgroups_mmf_file, X)
Example #3
Source File: dataset.py From SCALE with MIT License
def read_mtx(path):
    for filename in glob(path + '/*'):
        basename = os.path.basename(filename)
        if (('count' in basename) or ('matrix' in basename)) and ('mtx' in basename):
            count = mmread(filename).T.tocsr().astype('float32')
        elif 'barcode' in basename:
            barcode = pd.read_csv(filename, sep='\t', header=None)[0].values
        elif 'gene' in basename or 'peak' in basename:
            feature = pd.read_csv(filename, sep='\t', header=None).iloc[:, -1].values
    return count, feature, barcode
Example #4
Source File: read.py From anndata with BSD 3-Clause "New" or "Revised" License
def read_mtx(filename: PathLike, dtype: str = "float32") -> AnnData:
    """\
    Read `.mtx` file.

    Parameters
    ----------
    filename
        The filename.
    dtype
        Numpy data type.
    """
    from scipy.io import mmread

    # could be rewritten accounting for dtype to be more performant
    X = mmread(fspath(filename)).astype(dtype)
    from scipy.sparse import csr_matrix

    X = csr_matrix(X)
    return AnnData(X, dtype=dtype)
Example #5
Source File: train_funcs.py From BootEA with MIT License
def generate_related_mat(folder, triples1, triples2, ref_ent1, ref_ent2):
    t = time.time()
    if "15" in folder:
        out_related_file = folder + "out_related_mat.npy"
        in_related_file = folder + "in_related_mat.npy"
        if os.path.exists(out_related_file):
            out_related_mat = np.load(out_related_file)
        else:
            out_related_mat = generate_out_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            np.save(out_related_file, out_related_mat)
        if os.path.exists(in_related_file):
            in_related_mat = np.load(in_related_file)
        else:
            in_related_mat = generate_in_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            np.save(in_related_file, in_related_mat)
        related_mat1 = out_related_mat
        # related_mat2 = out_related_mat + in_related_mat
        print("load related mat", round(time.time() - t, 2))
        return related_mat1
    else:
        out_related_file = folder + "out_related_mat.mtx"
        in_related_file = folder + "in_related_mat.mtx"
        if os.path.exists(out_related_file):
            out_related_mat = io.mmread(out_related_file)
        else:
            out_related_mat = generate_out_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            io.mmwrite(out_related_file, sp.sparse.lil_matrix(out_related_mat))
        if os.path.exists(in_related_file):
            in_related_mat = io.mmread(in_related_file)
        else:
            in_related_mat = generate_in_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            io.mmwrite(in_related_file, in_related_mat)
        related_mat1 = out_related_mat
        # related_mat2 = out_related_mat + in_related_mat
        print("load related mat", round(time.time() - t, 2))
        return related_mat1
Example #6
Source File: mtx.py From scprep with GNU General Public License v3.0
def load_mtx(mtx_file, cell_axis="row", gene_names=None, cell_names=None, sparse=None):
    """Load a mtx file

    Parameters
    ----------
    mtx_file : str
        The name of the mtx file to be loaded
    cell_axis : {'row', 'column'}, optional (default: 'row')
        If your data has genes on the rows and cells on the columns,
        use cell_axis='column'
    gene_names : `str`, array-like, or `None` (default: None)
        Expects a filename or an array containing a list of gene symbols or ids
    cell_names : `str`, array-like, or `None` (default: None)
        Expects a filename or an array containing a list of cell barcodes.
    sparse : bool, optional (default: None)
        If True, loads the data as a pd.DataFrame[pd.SparseArray].
        This uses less memory but more CPU.

    Returns
    -------
    data : array-like, shape=[n_samples, n_features]
        If either gene or cell names are given, data will be a pd.DataFrame or
        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a
        np.ndarray or scipy.sparse.spmatrix
    """
    if cell_axis not in ["row", "column", "col"]:
        raise ValueError(
            "cell_axis {} not recognized. Expected 'row' or 'column'".format(cell_axis)
        )
    # Read in mtx file
    data = sio.mmread(mtx_file)
    if cell_axis in ["column", "col"]:
        data = data.T
    data = _matrix_to_data_frame(
        data, gene_names=gene_names, cell_names=cell_names, sparse=sparse
    )
    return data
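A usage sketch, assuming a typical 10x-style layout with genes on the rows of the matrix and cells on the columns; the file names are placeholders:

    import scprep

    # placeholder paths; cell_axis='column' because the file stores genes x cells
    data = scprep.io.load_mtx(
        "matrix.mtx",
        cell_axis="column",
        gene_names="genes.tsv",
        cell_names="barcodes.tsv",
        sparse=True,
    )
    print(data.shape)   # cells x genes DataFrame with sparse columns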
Example #7
Source File: embed_func.py From JAPE with MIT License
def get_all_sim_mat_sparse(folder):
    cross_sim_mat = preprocessing.normalize(io.mmread(folder + 'ents_sim.mtx'), norm='l1')
    kb1_sim_mat = preprocessing.normalize(io.mmread(folder + 'kb1_ents_sim.mtx'), norm='l1')
    kb2_sim_mat = preprocessing.normalize(io.mmread(folder + 'kb2_ents_sim.mtx'), norm='l1')
    return cross_sim_mat, kb1_sim_mat, kb2_sim_mat
Example #8
Source File: visAnnos.py From scMatch with MIT License
def main(testFormat, testDS, annoFile, visMethod):
    # load test data
    print('##########loading test data')
    if testFormat == '10x':
        fileItem = glob.glob(os.path.join(testDS, "matrix.mtx"))[0]
        em = io.mmread(fileItem)
        em = em.tocsr().toarray()
        if os.path.exists(os.path.join(testDS, 'genes.tsv')):
            row = pd.read_table(fileItem[:-10] + "genes.tsv", header=None, index_col=None)
        else:
            row = pd.read_table(fileItem[:-10] + "features.tsv", header=None, index_col=None)
        col = pd.read_table(fileItem[:-10] + "barcodes.tsv", header=None, index_col=None)
        em = pd.DataFrame(em, index=row.T.values[1], columns=col.T.values[0])
        savefolder = testDS
    else:
        em = pd.read_csv(testDS, index_col=0, header=0)
        savefolder = testDS[:-4]

    print('##########reducing dimensions')
    cords = CalCords(savefolder, em, visMethod)

    annos = pd.read_csv(annoFile, index_col=0, header=0)
    commonIdx = set(cords.index).intersection(set(annos.index))
    cords = cords.ix[commonIdx,]
    annos = annos.ix[commonIdx,]

    print('##########drawing the scatter plots in the folder: %s' % savefolder)
    DrawScatters(savefolder, annoFile, visMethod, cords, annos)
    print('##########DONE!')
Example #9
Source File: msm.py From enspara with GNU General Public License v3.0
def load(cls, path, manifest='manifest.json'):
    '''Load an MSM object from disk into memory.

    Parameters
    ----------
    path : str
        The location of the root directory of the MSM serialization
    manifest : str
        The name of the file to save as a json manifest of the MSM
        directory (contains the paths to each other file).
    '''
    if not os.path.isdir(path):
        raise NotImplementedError("MSMs don't handle zip archives yet.")

    with open(os.path.join(path, manifest)) as f:
        fname_dict = json.load(f)

    # decorate fname_dict values with path
    fname_dict = {k: os.path.join(path, v) for k, v in fname_dict.items()}

    with open(fname_dict['config'], 'rb') as f:
        config = pickle.load(f)

    msm = MSM(**config)

    msm.tcounts_ = mmread(fname_dict['tcounts_'])
    msm.tprobs_ = mmread(fname_dict['tprobs_'])
    msm.mapping_ = TrimMapping.load(fname_dict['mapping_'])
    msm.eq_probs_ = np.loadtxt(fname_dict['eq_probs_'])

    return msm
Example #10
Source File: loompy.py From loompy with BSD 2-Clause "Simplified" License
def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -> str:
    """
    Create a .loom file from 10X Genomics cellranger output

    Args:
        indir (str):   path to the cellranger output folder (the one that contains 'outs')
        outdir (str):  output folder where the new loom file should be saved (defaults to indir)
        genome (str):  genome build to load (e.g. 'mm10'; if None, determine species from outs folder)

    Returns:
        path (str):    Full path to the created loom file.

    Remarks:
        The resulting file will be named ``{sampleID}.loom``, where the sampleID is the one given by cellranger.
    """
    if outdir is None:
        outdir = indir
    sampleid = os.path.split(os.path.abspath(indir))[-1]
    matrix_folder = os.path.join(indir, 'outs', 'filtered_gene_bc_matrices')
    if os.path.exists(matrix_folder):
        if genome is None:
            genome = [f for f in os.listdir(matrix_folder) if not f.startswith(".")][0]
        matrix_folder = os.path.join(matrix_folder, genome)
        matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).todense()
        genelines = open(os.path.join(matrix_folder, "genes.tsv"), "r").readlines()
        bclines = open(os.path.join(matrix_folder, "barcodes.tsv"), "r").readlines()
    else:  # cellranger V3 file locations
        if genome is None:
            genome = ""  # Genome is not visible from V3 folder
        matrix_folder = os.path.join(indir, 'outs', 'filtered_feature_bc_matrix')
        matrix = mmread(os.path.join(matrix_folder, "matrix.mtx.gz")).todense()
        genelines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "features.tsv.gz"), "r").readlines()]
        bclines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "barcodes.tsv.gz"), "r").readlines()]

    accession = np.array([x.split("\t")[0] for x in genelines]).astype("str")
    gene = np.array([x.split("\t")[1].strip() for x in genelines]).astype("str")
    cellids = np.array([sampleid + ":" + x.strip() for x in bclines]).astype("str")

    col_attrs = {"CellID": cellids}
    row_attrs = {"Accession": accession, "Gene": gene}

    tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "projection.csv")
    # In cellranger V2 the file moved one level deeper
    if not os.path.exists(tsne_file):
        tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "2_components", "projection.csv")
    if os.path.exists(tsne_file):
        tsne = np.loadtxt(tsne_file, usecols=(1, 2), delimiter=',', skiprows=1)
        col_attrs["X"] = tsne[:, 0].astype('float32')
        col_attrs["Y"] = tsne[:, 1].astype('float32')

    clusters_file = os.path.join(indir, "outs", "analysis", "clustering", "graphclust", "clusters.csv")
    if os.path.exists(clusters_file):
        labels = np.loadtxt(clusters_file, usecols=(1, ), delimiter=',', skiprows=1)
        col_attrs["ClusterID"] = labels.astype('int') - 1

    path = os.path.join(outdir, sampleid + ".loom")
    create(path, matrix, row_attrs, col_attrs, file_attrs={"Genome": genome})
    return path
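A hypothetical invocation, assuming a standard cellranger run directory and that the helper is exposed at the package level as in recent loompy releases; the paths are placeholders:

    import loompy

    # "/data/sample1" is a placeholder for a cellranger run folder containing "outs"
    path = loompy.create_from_cellranger("/data/sample1", outdir="/data/loom")
    print(path)   # e.g. "/data/loom/sample1.loom"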
Example #11
Source File: loompy.py From loompy with BSD 2-Clause "Simplified" License
def create_from_matrix_market(out_file: str, sample_id: str, layer_paths: Dict[str, str],
                              row_metadata_path: str, column_metadata_path: str,
                              delim: str = "\t", skip_row_headers: bool = False,
                              skip_colums_headers: bool = False,
                              file_attrs: Dict[str, str] = None,
                              matrix_transposed: bool = False) -> None:
    """
    Create a .loom file from .mtx matrix market format

    Args:
        out_file: path to the newly created .loom file (will be overwritten if it exists)
        sample_id: string to use as prefix for cell IDs
        layer_paths: dict mapping layer names to paths to the corresponding matrix file (usually with .mtx extension)
        row_metadata_path: path to the row (usually genes) metadata file
        column_metadata_path: path to the column (usually cells) metadata file
        delim: delimiter used for metadata (default: "\t")
        skip_row_headers: if true, skip first line in rows metadata file
        skip_colums_headers: if true, skip first line in columns metadata file
        file_attrs: dict of global file attributes, or None
        matrix_transposed: if true, the main matrix is transposed

    Remarks:
        layer_paths should typically map the empty string to a matrix market file:
        {"": "path/to/filename.mtx"}. To create a multilayer loom file, map multiple
        named layers {"": "path/to/layer1.mtx", "layer2": "path/to/layer2.mtx"}

        Note: the created file MUST have a main layer named "". If no such layer is given,
        BUT all given layers are the same datatype, then a main layer will be created as
        the sum of the other layers. For example,
        {"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"} will create three layers,
        "", "spliced", and "unspliced", where "" is the sum of the other two.
    """
    layers: Dict[str, Union[np.ndarray, scipy.sparse.coo_matrix]] = {}
    for name, path in layer_paths.items():
        matrix = mmread(path)
        if matrix_transposed:
            matrix = matrix.T
        layers[name] = matrix
    if "" not in layers:
        main_matrix = None
        for name, matrix in layers.items():
            if main_matrix is None:
                main_matrix = matrix.copy()
            else:
                main_matrix = main_matrix + matrix
        layers[""] = main_matrix

    genelines = open(row_metadata_path, "r").readlines()
    bclines = open(column_metadata_path, "r").readlines()

    accession = np.array([x.split("\t")[0] for x in genelines]).astype("str")
    if len(genelines[0].split("\t")) > 1:
        gene = np.array([x.split("\t")[1].strip() for x in genelines]).astype("str")
        row_attrs = {"Accession": accession, "Gene": gene}
    else:
        row_attrs = {"Accession": accession}
    cellids = np.array([sample_id + ":" + x.strip() for x in bclines]).astype("str")
    col_attrs = {"CellID": cellids}

    create(out_file, layers[""], row_attrs, col_attrs, file_attrs=file_attrs)
    if len(layers) > 1:
        with loompy.connect(out_file) as ds:
            for name, layer in layers.items():
                if name == "":
                    continue
                ds[name] = layer
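A sketch of the multilayer case described in the docstring, assuming the function is importable from the loompy namespace like the other create_* helpers; the .mtx and .tsv paths are placeholders. Because no "" layer is given, the main layer is built as the sum of "spliced" and "unspliced":

    import loompy

    # placeholder paths; the "" (main) layer will be created as spliced + unspliced
    loompy.create_from_matrix_market(
        "sample1.loom",
        sample_id="sample1",
        layer_paths={"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"},
        row_metadata_path="genes.tsv",
        column_metadata_path="barcodes.tsv",
        matrix_transposed=False,
    )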
Example #12
Source File: cell_collection.py From altanalyze with Apache License 2.0
def from_cellranger_mtx(mtx_directory, genome=None, returnGenes=False):
    """Creates a CellCollection from a sparse matrix (.mtx and associated files) exported by CellRanger

    Recognizes directories from CellRanger version 2 (files: matrix.mtx, genes.tsv, barcodes.tsv)
    and CellRanger v3 (files: matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz)
    """
    start = time.time()
    coll = CellCollection()
    cellranger_version = 2

    if '.mtx' in mtx_directory:
        mtx_file = mtx_directory  ### Hence an mtx file was directly supplied
        mtx_directory = os.path.abspath(os.path.join(mtx_file, os.pardir))
    else:
        mtx_file = os.path.join(mtx_directory, "matrix.mtx")
        if not os.path.exists(mtx_file):
            cellranger_version = 3
            mtx_file = mtx_file + ".gz"
            if not os.path.exists(mtx_file):
                raise Exception("Directory {} does not contain a recognizable matrix file".format(mtx_directory))
    if '.gz' in mtx_file:
        cellranger_version = 3

    sparse_matrix = io.mmread(mtx_file)
    coll._matrix = sparse_matrix.tocsc()
    coll._gene_ids = np.empty((coll._matrix.shape[0], ), np.object)
    coll._gene_names = np.empty((coll._matrix.shape[0], ), np.object)

    if cellranger_version == 2:
        with open(os.path.join(mtx_directory, "genes.tsv"), "rU") as f:
            idx = 0
            for line in f:
                i, n = line.rstrip().split("\t")
                coll._gene_ids[idx] = i
                coll._gene_names[idx] = n
                idx += 1
        with open(os.path.join(mtx_directory, "barcodes.tsv"), "rU") as f:
            coll._barcodes = np.array([line.rstrip() for line in f])
    else:
        with gzip.open(os.path.join(mtx_directory, "features.tsv.gz"), "rt") as f:
            idx = 0
            indices = []
            for line in f:
                i, n, t = line.rstrip().split("\t")
                coll._gene_ids[idx] = i
                coll._gene_names[idx] = n
                if t == 'Gene Expression':
                    indices.append(idx)
                idx += 1
            coll._filter_genes_by_index(indices)
        with gzip.open(os.path.join(mtx_directory, "barcodes.tsv.gz"), "rt") as f:
            coll._barcodes = np.array([line.rstrip() for line in f])

    if returnGenes:
        """ Do not import the matrix at this point """
        return list(coll._gene_names)

    print('sparse matrix data imported from mtx file in %s seconds' % str(time.time() - start))
    return coll
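A hypothetical call against a CellRanger v3 output folder; the directory name is a placeholder, and how the function is exposed (module-level or attached to CellCollection) depends on the surrounding class, which is not shown here. Passing returnGenes=True returns only the gene names without building the collection:

    # placeholder directory containing matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz
    coll = from_cellranger_mtx("filtered_feature_bc_matrix")
    genes = from_cellranger_mtx("filtered_feature_bc_matrix", returnGenes=True)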