Python scipy.io.mmread() Examples

The following are 12 code examples of scipy.io.mmread(). You can go to the original project or source file by following the links above each example, or check out all available functions and classes of the scipy.io module.
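
Before the project examples, here is a minimal, self-contained round trip with scipy.io.mmwrite() and scipy.io.mmread(); the file name example.mtx is just a placeholder.

from scipy import sparse
from scipy.io import mmread, mmwrite

# write a small random sparse matrix in Matrix Market format
a = sparse.random(5, 4, density=0.3, format="coo", random_state=0)
mmwrite("example.mtx", a)

# mmread returns a coo_matrix for sparse files (and an ndarray for dense array files);
# convert to CSR for row slicing and fast arithmetic
b = mmread("example.mtx").tocsr()
print(b.shape, b.nnz)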
Example #1
Source File: _load_matrix.py    From epiScanpy with BSD 3-Clause "New" or "Revised" License
def read_ATAC_10x(matrix, cell_names='', var_names='', path_file=''):
    """
    Load a sparse matrix (including matrices corresponding to 10x data) as an AnnData object.
    Reads the .mtx file, the .tsv file corresponding to cell_names and the .bed file containing the variable names.

    Parameters
    ----------
    matrix: sparse count matrix

    cell_names: optional, tsv file containing cell names

    var_names: optional, bed file containing the feature names

    Returns
    -------
    AnnData object

    """

    
    # read the Matrix Market file, densify it, and orient cells as rows
    mat = mmread(''.join([path_file, matrix]))
    mat = mat.toarray()
    mat = np.matrix(mat.transpose())
    
    with open(path_file+cell_names) as f:
        barcodes = f.readlines() 
        barcodes = [x[:-1] for x in barcodes]
        
    with open(path_file+var_names) as f:
        var_names = f.readlines()
        var_names = ["_".join(x[:-1].split('\t')) for x in var_names]
        
    adata = ad.AnnData(mat, obs=pd.DataFrame(index=barcodes), var=pd.DataFrame(index=var_names))
    adata.uns['omic'] = 'ATAC'
    
    return adata
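
A hypothetical call, assuming the matrix, barcode and peak files all live in the same directory (the file names below are placeholders):

adata = read_ATAC_10x(
    matrix="matrix.mtx",
    cell_names="barcodes.tsv",
    var_names="peaks.bed",
    path_file="filtered_peak_bc_matrix/",
)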
Example #2
Source File: main.py    From yelp with GNU Lesser General Public License v2.1
def factorize_nmf():
    print('factorizing matrix')

    newsgroups_mmf_file = '/Users/fpena/tmp/nmf_graphlab/newsgroups/newsgroups_matrix.mmf'
    document_term_matrix = mmread(newsgroups_mmf_file)

    factorizer = decomposition.NMF(
        init="nndsvd", n_components=Constants.TOPIC_MODEL_NUM_TOPICS,
        max_iter=Constants.TOPIC_MODEL_ITERATIONS,
        alpha=Constants.NMF_REGULARIZATION,
        l1_ratio=Constants.NMF_REGULARIZATION_RATIO
    )
    document_topic_matrix = \
        factorizer.fit_transform(document_term_matrix)
    topic_term_matrix = factorizer.components_
    # mmwrite(mmf_file, small_matrix)
    # mmwrite(newsgroups_mmf_file, X) 
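
Stripped of the project-specific Constants and paths, the same pattern can be sketched as follows; the file name and hyperparameters are placeholders, not the original project's values.

from scipy.io import mmread
from sklearn import decomposition

# load the sparse document-term matrix and convert to CSR for fitting
document_term_matrix = mmread("newsgroups_matrix.mtx").tocsr()

factorizer = decomposition.NMF(init="nndsvd", n_components=20, max_iter=200)
document_topic_matrix = factorizer.fit_transform(document_term_matrix)
topic_term_matrix = factorizer.components_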
Example #3
Source File: dataset.py    From SCALE with MIT License
def read_mtx(path):
    for filename in glob(path+'/*'):
        basename = os.path.basename(filename)
        if (('count' in basename) or ('matrix' in basename)) and ('mtx' in basename):
            count = mmread(filename).T.tocsr().astype('float32')
        elif 'barcode' in basename:
            barcode = pd.read_csv(filename, sep='\t', header=None)[0].values
        elif 'gene' in basename or 'peak' in basename:
            feature = pd.read_csv(filename, sep='\t', header=None).iloc[:, -1].values

    return count, feature, barcode 
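
Hypothetical usage, assuming a directory whose file names contain 'matrix'/'count', 'barcode', and 'gene' or 'peak' as the function expects (the path is a placeholder):

count, feature, barcode = read_mtx("data/filtered_matrices")
print(count.shape, len(barcode), len(feature))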
Example #4
Source File: read.py    From anndata with BSD 3-Clause "New" or "Revised" License
def read_mtx(filename: PathLike, dtype: str = "float32") -> AnnData:
    """\
    Read `.mtx` file.

    Parameters
    ----------
    filename
        The filename.
    dtype
        Numpy data type.
    """
    from scipy.io import mmread

    # could be rewritten accounting for dtype to be more performant
    X = mmread(fspath(filename)).astype(dtype)
    from scipy.sparse import csr_matrix

    X = csr_matrix(X)
    return AnnData(X, dtype=dtype) 
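
For example (the path is a placeholder):

adata = read_mtx("counts.mtx", dtype="float32")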
Example #5
Source File: train_funcs.py    From BootEA with MIT License
def generate_related_mat(folder, triples1, triples2, ref_ent1, ref_ent2):
    t = time.time()
    if "15" in folder:
        out_related_file = folder + "out_related_mat.npy"
        in_related_file = folder + "in_related_mat.npy"
        if os.path.exists(out_related_file):
            out_related_mat = np.load(out_related_file)
        else:
            out_related_mat = generate_out_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            np.save(out_related_file, out_related_mat)
        if os.path.exists(in_related_file):
            in_related_mat = np.load(in_related_file)
        else:
            in_related_mat = generate_in_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            np.save(in_related_file, in_related_mat)
        related_mat1 = out_related_mat
        # related_mat2 = out_related_mat + in_related_mat
        print("load related mat", round(time.time() - t, 2))
        return related_mat1
    else:
        out_related_file = folder + "out_related_mat.mtx"
        in_related_file = folder + "in_related_mat.mtx"
        if os.path.exists(out_related_file):
            out_related_mat = io.mmread(out_related_file)
        else:
            out_related_mat = generate_out_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            io.mmwrite(out_related_file, sp.sparse.lil_matrix(out_related_mat))
        if os.path.exists(in_related_file):
            in_related_mat = io.mmread(in_related_file)
        else:
            in_related_mat = generate_in_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            io.mmwrite(in_related_file, in_related_mat)
        related_mat1 = out_related_mat
        # related_mat2 = out_related_mat + in_related_mat
        print("load related mat", round(time.time() - t, 2))
        return related_mat1 
Example #6
Source File: mtx.py    From scprep with GNU General Public License v3.0
def load_mtx(mtx_file, cell_axis="row", gene_names=None, cell_names=None, sparse=None):
    """Load a mtx file

    Parameters
    ----------
    mtx_file : str
        The name of the .mtx file to be loaded
    cell_axis : {'row', 'column'}, optional (default: 'row')
        If your data has genes on the rows and cells on the columns, use
        cell_axis='column'
    gene_names : `str`, array-like, or `None` (default: None)
        Expects a filename or an array containing a list of gene symbols or ids
    cell_names : `str`, array-like, or `None` (default: None)
        Expects a filename or an array containing a list of cell barcodes.
    sparse : bool, optional (default: None)
        If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses less memory
        but more CPU.

    Returns
    -------
    data : array-like, shape=[n_samples, n_features]
        If either gene or cell names are given, data will be a pd.DataFrame or
        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a np.ndarray
        or scipy.sparse.spmatrix
    """
    if cell_axis not in ["row", "column", "col"]:
        raise ValueError(
            "cell_axis {} not recognized. Expected 'row' or 'column'".format(cell_axis)
        )
    # Read in mtx file
    data = sio.mmread(mtx_file)
    if cell_axis in ["column", "col"]:
        data = data.T
    data = _matrix_to_data_frame(
        data, gene_names=gene_names, cell_names=cell_names, sparse=sparse
    )
    return data 
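
A hypothetical call for a 10x-style directory where genes are on the rows of the .mtx file, hence cell_axis='column'; all file names are placeholders.

data = load_mtx(
    "matrix.mtx",
    cell_axis="column",
    gene_names="genes.tsv",
    cell_names="barcodes.tsv",
    sparse=True,
)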
Example #7
Source File: embed_func.py    From JAPE with MIT License
def get_all_sim_mat_sparse(folder):
    cross_sim_mat = preprocessing.normalize(io.mmread(folder + 'ents_sim.mtx'), norm='l1')
    kb1_sim_mat = preprocessing.normalize(io.mmread(folder + 'kb1_ents_sim.mtx'), norm='l1')
    kb2_sim_mat = preprocessing.normalize(io.mmread(folder + 'kb2_ents_sim.mtx'), norm='l1')
    return cross_sim_mat, kb1_sim_mat, kb2_sim_mat 
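
The pattern here is simply mmread followed by row-wise L1 normalization; the toy matrix below illustrates what normalize(..., norm='l1') does to each row.

import numpy as np
from scipy import sparse
from sklearn import preprocessing

m = sparse.csr_matrix(np.array([[1.0, 3.0], [0.0, 2.0]]))
# each row is rescaled so its absolute values sum to 1
print(preprocessing.normalize(m, norm="l1").toarray())
# [[0.25 0.75]
#  [0.   1.  ]]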
Example #8
Source File: visAnnos.py    From scMatch with MIT License
def main(testFormat, testDS, annoFile, visMethod):
    #load test data
    print('##########loading test data')
    if testFormat == '10x':
        fileItem = glob.glob(os.path.join(testDS, "matrix.mtx"))[0]
        em = io.mmread(fileItem)
        em = em.tocsr().toarray()
        if os.path.exists(os.path.join(testDS, 'genes.tsv')):
            row = pd.read_table(fileItem[:-10]+"genes.tsv", header=None, index_col=None)
        else:
            row = pd.read_table(fileItem[:-10]+"features.tsv", header=None, index_col=None)
        col = pd.read_table(fileItem[:-10]+"barcodes.tsv", header=None, index_col=None)
        em = pd.DataFrame(em, index=row.T.values[1], columns=col.T.values[0])
        savefolder = testDS
    else:
        em = pd.read_csv(testDS, index_col=0, header=0)
        savefolder = testDS[:-4]
        
    print('##########reducing dimensions')
    cords = CalCords(savefolder, em, visMethod)
    annos = pd.read_csv(annoFile, index_col=0, header=0)
    commonIdx = list(set(cords.index).intersection(set(annos.index)))
    cords = cords.loc[commonIdx]
    annos = annos.loc[commonIdx]
    
    print('##########drawing the scatter plots in the folder: %s' % savefolder)
    DrawScatters(savefolder, annoFile, visMethod, cords, annos)
    print('##########DONE!') 
Example #9
Source File: msm.py    From enspara with GNU General Public License v3.0
def load(cls, path, manifest='manifest.json'):
        '''Load an MSM object from disk into memory.

        Parameters
        ----------
        path : str
            The location of the root directory of the MSM serialization
        manifest : str
            The name of the json manifest file of the MSM directory
            (contains the paths to each other file).
        '''
        if not os.path.isdir(path):
            raise NotImplementedError("MSMs don't handle zip archives yet.")

        with open(os.path.join(path, manifest)) as f:
            fname_dict = json.load(f)

        # decorate fname_dict values with path
        fname_dict = {k: os.path.join(path, v) for k, v in fname_dict.items()}

        with open(fname_dict['config'], 'rb') as f:
            config = pickle.load(f)

        msm = MSM(**config)

        msm.tcounts_ = mmread(fname_dict['tcounts_'])
        msm.tprobs_ = mmread(fname_dict['tprobs_'])
        msm.mapping_ = TrimMapping.load(fname_dict['mapping_'])
        msm.eq_probs_ = np.loadtxt(fname_dict['eq_probs_'])

        return msm 
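
Hypothetical usage, assuming the directory was previously written by the matching save routine so that manifest.json and the files it references exist:

msm = MSM.load("results/msm_lag10")
print(msm.tprobs_.shape)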
Example #10
Source File: loompy.py    From loompy with BSD 2-Clause "Simplified" License
def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -> str:
	"""
	Create a .loom file from 10X Genomics cellranger output

	Args:
		indir (str):	path to the cellranger output folder (the one that contains 'outs')
		outdir (str):	output folder where the new loom file should be saved (defaults to indir)
		genome (str):	genome build to load (e.g. 'mm10'; if None, determine species from outs folder)

	Returns:
		path (str):		Full path to the created loom file.

	Remarks:
		The resulting file will be named ``{sampleID}.loom``, where the sampleID is the one given by cellranger.
	"""
	if outdir is None:
		outdir = indir
	sampleid = os.path.split(os.path.abspath(indir))[-1]
	matrix_folder = os.path.join(indir, 'outs', 'filtered_gene_bc_matrices')
	if os.path.exists(matrix_folder):
		if genome is None:
			genome = [f for f in os.listdir(matrix_folder) if not f.startswith(".")][0]
		matrix_folder = os.path.join(matrix_folder, genome)
		matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).todense()
		genelines = open(os.path.join(matrix_folder, "genes.tsv"), "r").readlines()
		bclines = open(os.path.join(matrix_folder, "barcodes.tsv"), "r").readlines()
	else:  # cellranger V3 file locations
		if genome is None:
			genome = ""  # Genome is not visible from V3 folder
		matrix_folder = os.path.join(indir, 'outs', 'filtered_feature_bc_matrix')
		matrix = mmread(os.path.join(matrix_folder, "matrix.mtx.gz")).todense()
		genelines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "features.tsv.gz"), "r").readlines()]
		bclines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "barcodes.tsv.gz"), "r").readlines()]

	accession = np.array([x.split("\t")[0] for x in genelines]).astype("str")
	gene = np.array([x.split("\t")[1].strip() for x in genelines]).astype("str")
	cellids = np.array([sampleid + ":" + x.strip() for x in bclines]).astype("str")

	col_attrs = {"CellID": cellids}
	row_attrs = {"Accession": accession, "Gene": gene}

	tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "projection.csv")
	# In cellranger V2 the file moved one level deeper
	if not os.path.exists(tsne_file):
		tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "2_components", "projection.csv")
	if os.path.exists(tsne_file):
		tsne = np.loadtxt(tsne_file, usecols=(1, 2), delimiter=',', skiprows=1)
		col_attrs["X"] = tsne[:, 0].astype('float32')
		col_attrs["Y"] = tsne[:, 1].astype('float32')

	clusters_file = os.path.join(indir, "outs", "analysis", "clustering", "graphclust", "clusters.csv")
	if os.path.exists(clusters_file):
		labels = np.loadtxt(clusters_file, usecols=(1, ), delimiter=',', skiprows=1)
		col_attrs["ClusterID"] = labels.astype('int') - 1

	path = os.path.join(outdir, sampleid + ".loom")
	create(path, matrix, row_attrs, col_attrs, file_attrs={"Genome": genome})
	return path 
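
Hypothetical usage (the sample path is a placeholder; by default the .loom file is written into the cellranger output folder):

path = create_from_cellranger("runs/sample123", outdir="loom_files")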
Example #11
Source File: loompy.py    From loompy with BSD 2-Clause "Simplified" License
def create_from_matrix_market(out_file: str, sample_id: str, layer_paths: Dict[str, str], row_metadata_path: str, column_metadata_path: str, delim: str = "\t", skip_row_headers: bool = False, skip_colums_headers: bool = False, file_attrs: Dict[str, str] = None, matrix_transposed: bool = False) -> None:
	"""
	Create a .loom file from .mtx matrix market format

	Args:
		out_file:				path to the newly created .loom file (will be overwritten if it exists)
		sample_id:				string to use as prefix for cell IDs
		layer_paths:			dict mapping layer names to paths to the corresponding matrix file (usually with .mtx extension)
		row_metadata_path:		path to the row (usually genes) metadata file
		column_metadata_path:	path to the column (usually cells) metadata file
		delim:					delimiter used for metadata (default: "\t")
		skip_row_headers:		if true, skip first line in rows metadata file
		skip_colums_headers:	if true, skip first line in columns metadata file
		file_attrs:				dict of global file attributes, or None
		matrix_transposed:		if true, the main matrix is transposed
	
	Remarks:
		layer_paths should typically map the empty string to a matrix market file: {"": "path/to/filename.mtx"}.
		To create a multilayer loom file, map multiple named layers {"": "path/to/layer1.mtx", "layer2": "path/to/layer2.mtx"}
		Note: the created file MUST have a main layer named "". If no such layer is given, BUT all given layers are the same
		datatype, then a main layer will be created as the sum of the other layers. For example, {"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"}
		will create three layers, "", "spliced", and "unspliced", where "" is the sum of the other two.
	"""
	layers: Dict[str, Union[np.ndarray, scipy.sparse.coo_matrix]] = {}

	for name, path in layer_paths.items():
		matrix = mmread(path)
		if matrix_transposed:
			matrix = matrix.T
		layers[name] = matrix
	if "" not in layers:
		main_matrix = None
		for name, matrix in layers.items():
			if main_matrix is None:
				main_matrix = matrix.copy()
			else:
				main_matrix = main_matrix + matrix
		layers[""] = main_matrix

	genelines = open(row_metadata_path, "r").readlines()
	bclines = open(column_metadata_path, "r").readlines()

	accession = np.array([x.split("\t")[0] for x in genelines]).astype("str")
	if(len(genelines[0].split("\t")) > 1):
		gene = np.array([x.split("\t")[1].strip() for x in genelines]).astype("str")
		row_attrs = {"Accession": accession, "Gene": gene}
	else:
		row_attrs = {"Accession": accession}

	cellids = np.array([sample_id + ":" + x.strip() for x in bclines]).astype("str")
	col_attrs = {"CellID": cellids}

	create(out_file, layers[""], row_attrs, col_attrs, file_attrs=file_attrs)

	if len(layers) > 1:
		with loompy.connect(out_file) as ds:
			for name, layer in layers.items():
				if name == "":
					continue
				ds[name] = layer 
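
A hypothetical two-layer call following the remarks above; all paths are placeholders, and the "" main layer is created as the sum of the two named layers.

create_from_matrix_market(
    out_file="sample123.loom",
    sample_id="sample123",
    layer_paths={"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"},
    row_metadata_path="genes.tsv",
    column_metadata_path="barcodes.tsv",
)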
Example #12
Source File: cell_collection.py    From altanalyze with Apache License 2.0
def from_cellranger_mtx(mtx_directory, genome=None, returnGenes=False):

        """
        Creates a CellCollection from a sparse matrix (.mtx and associated files) exported by CellRanger

        Recognizes directories from CellRanger version 2 (files: matrix.mtx, genes.tsv, barcodes.tsv) and
        CellRanger v3 (files: matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz).
        """

        start = time.time()
        coll = CellCollection()
        cellranger_version = 2
        if '.mtx' in mtx_directory:
            mtx_file = mtx_directory ### Hence an mtx file was directly supplied
            mtx_directory = os.path.abspath(os.path.join(mtx_file, os.pardir))
        else:
            mtx_file = os.path.join(mtx_directory, "matrix.mtx")
            
        if not os.path.exists(mtx_file):
            cellranger_version = 3
            mtx_file = mtx_file + ".gz"
            if not os.path.exists(mtx_file):
                raise Exception("Directory {} does not contain a recognizable matrix file".format(mtx_directory))
        if '.gz' in mtx_file:
            cellranger_version = 3
        sparse_matrix = io.mmread(mtx_file)
        coll._matrix = sparse_matrix.tocsc()
        coll._gene_ids = np.empty((coll._matrix.shape[0], ), object)
        coll._gene_names = np.empty((coll._matrix.shape[0], ), object)
        
        if cellranger_version == 2:
            with open(os.path.join(mtx_directory, "genes.tsv"), "rU") as f:
                idx = 0
                for line in f:
                    i, n = line.rstrip().split("\t")
                    coll._gene_ids[idx] = i
                    coll._gene_names[idx] = n
                    idx += 1
            with open(os.path.join(mtx_directory, "barcodes.tsv"), "rU") as f:
                coll._barcodes = np.array( [ line.rstrip() for line in f ] )
        else:
            with gzip.open(os.path.join(mtx_directory, "features.tsv.gz"), "rt") as f:
                idx = 0
                indices = []
                for line in f:
                    i, n, t = line.rstrip().split("\t")
                    coll._gene_ids[idx] = i
                    coll._gene_names[idx] = n
                    if t == 'Gene Expression':
                        indices.append(idx)
                    idx += 1
                coll._filter_genes_by_index(indices)
            with gzip.open(os.path.join(mtx_directory, "barcodes.tsv.gz"), "rt") as f:
                coll._barcodes = np.array( [ line.rstrip() for line in f ] )

        if returnGenes:
            """ Do not import the matrix at this point """
            return list(coll._gene_names)
            
        print('sparse matrix data imported from mtx file in %s seconds' % str(time.time()-start))
        return coll
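
Hypothetical usage, assuming from_cellranger_mtx is exposed as a static or class method on CellCollection (the decorator is not shown in this excerpt) and that the path is a CellRanger output directory:

coll = CellCollection.from_cellranger_mtx("sample123/outs/filtered_feature_bc_matrix")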