Python scipy.io.mmread() Examples
The following are 12 code examples of scipy.io.mmread(), drawn from open-source projects. Each example notes its source file, originating project, and license. You may also want to check out the other available functions and classes of the scipy.io module.
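Before the project examples, here is a minimal sketch of mmread() itself (the path "data.mtx" is only a placeholder). mmread() returns a dense ndarray for array-format files and a sparse coo_matrix for coordinate-format files, which is usually converted to CSR or CSC before indexing:

    from scipy.io import mmread
    from scipy.sparse import coo_matrix

    # "data.mtx" is a placeholder for any Matrix Market file
    m = mmread("data.mtx")
    if isinstance(m, coo_matrix):
        m = m.tocsr()   # COO matrices do not support slicing; convert to CSR first
    print(m.shape, m.dtype)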
Example #1
Source File: _load_matrix.py From epiScanpy with BSD 3-Clause "New" or "Revised" License
def read_ATAC_10x(matrix, cell_names='', var_names='', path_file=''):
    """
    Load sparse matrix (including matrices corresponding to 10x data) as AnnData objects.
    Read the mtx file, the tsv file corresponding to cell_names and the bed file
    containing the variable names.

    Parameters
    ----------
    matrix: sparse count matrix
    cell_names: optional, tsv file containing cell names
    var_names: optional, bed file containing the feature names

    Return
    ------
    AnnData object
    """
    mat = mmread(''.join([path_file, matrix]))
    mat = mat.toarray()
    mat = np.matrix(mat.transpose())

    with open(path_file + cell_names) as f:
        barcodes = f.readlines()
        barcodes = [x[:-1] for x in barcodes]

    with open(path_file + var_names) as f:
        var_names = f.readlines()
        var_names = ["_".join(x[:-1].split('\t')) for x in var_names]

    adata = ad.AnnData(mat, obs=pd.DataFrame(index=barcodes), var=pd.DataFrame(index=var_names))
    adata.uns['omic'] = 'ATAC'
    return(adata)
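A hypothetical call, assuming a 10x-style output folder; the file names below are placeholders, not files shipped with epiScanpy. Note that path_file is simply string-concatenated with each file name, so a trailing slash is required:

    # hypothetical file names; path_file is concatenated, so keep the trailing "/"
    adata = read_ATAC_10x(
        "matrix.mtx",
        cell_names="barcodes.tsv",
        var_names="peaks.bed",
        path_file="filtered_peak_bc_matrix/",
    )
    print(adata)   # AnnData with cells as obs and peaks as var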
Example #2
Source File: main.py From yelp with GNU Lesser General Public License v2.1
def factorize_nmf():
    print('factorizing matrix')

    newsgroups_mmf_file = '/Users/fpena/tmp/nmf_graphlab/newsgroups/newsgroups_matrix.mmf'
    document_term_matrix = mmread(newsgroups_mmf_file)

    factorizer = decomposition.NMF(
        init="nndsvd",
        n_components=Constants.TOPIC_MODEL_NUM_TOPICS,
        max_iter=Constants.TOPIC_MODEL_ITERATIONS,
        alpha=Constants.NMF_REGULARIZATION,
        l1_ratio=Constants.NMF_REGULARIZATION_RATIO
    )
    document_topic_matrix = factorizer.fit_transform(document_term_matrix)
    topic_term_matrix = factorizer.components_
    # mmwrite(mmf_file, small_matrix)
    # mmwrite(newsgroups_mmf_file, X)
Example #3
Source File: dataset.py From SCALE with MIT License
def read_mtx(path):
    for filename in glob(path + '/*'):
        basename = os.path.basename(filename)
        if (('count' in basename) or ('matrix' in basename)) and ('mtx' in basename):
            count = mmread(filename).T.tocsr().astype('float32')
        elif 'barcode' in basename:
            barcode = pd.read_csv(filename, sep='\t', header=None)[0].values
        elif 'gene' in basename or 'peak' in basename:
            feature = pd.read_csv(filename, sep='\t', header=None).iloc[:, -1].values
    return count, feature, barcode
Example #4
Source File: read.py From anndata with BSD 3-Clause "New" or "Revised" License
def read_mtx(filename: PathLike, dtype: str = "float32") -> AnnData:
    """\
    Read `.mtx` file.

    Parameters
    ----------
    filename
        The filename.
    dtype
        Numpy data type.
    """
    from scipy.io import mmread

    # could be rewritten accounting for dtype to be more performant
    X = mmread(fspath(filename)).astype(dtype)
    from scipy.sparse import csr_matrix

    X = csr_matrix(X)
    return AnnData(X, dtype=dtype)
Example #5
Source File: train_funcs.py From BootEA with MIT License
def generate_related_mat(folder, triples1, triples2, ref_ent1, ref_ent2):
    t = time.time()
    if "15" in folder:
        out_related_file = folder + "out_related_mat.npy"
        in_related_file = folder + "in_related_mat.npy"
        if os.path.exists(out_related_file):
            out_related_mat = np.load(out_related_file)
        else:
            out_related_mat = generate_out_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            np.save(out_related_file, out_related_mat)
        if os.path.exists(in_related_file):
            in_related_mat = np.load(in_related_file)
        else:
            in_related_mat = generate_in_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            np.save(in_related_file, in_related_mat)
        related_mat1 = out_related_mat
        # related_mat2 = out_related_mat + in_related_mat
        print("load related mat", round(time.time() - t, 2))
        return related_mat1
    else:
        out_related_file = folder + "out_related_mat.mtx"
        in_related_file = folder + "in_related_mat.mtx"
        if os.path.exists(out_related_file):
            out_related_mat = io.mmread(out_related_file)
        else:
            out_related_mat = generate_out_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            io.mmwrite(out_related_file, sp.sparse.lil_matrix(out_related_mat))
        if os.path.exists(in_related_file):
            in_related_mat = io.mmread(in_related_file)
        else:
            in_related_mat = generate_in_related_mat(triples1, triples2, ref_ent1, ref_ent2)
            io.mmwrite(in_related_file, in_related_mat)
        related_mat1 = out_related_mat
        # related_mat2 = out_related_mat + in_related_mat
        print("load related mat", round(time.time() - t, 2))
        return related_mat1
Example #6
Source File: mtx.py From scprep with GNU General Public License v3.0
def load_mtx(mtx_file, cell_axis="row", gene_names=None, cell_names=None, sparse=None):
    """Load a mtx file

    Parameters
    ----------
    mtx_file : str
        The name of the mtx file to be loaded
    cell_axis : {'row', 'column'}, optional (default: 'row')
        If your data has genes on the rows and cells on the columns,
        use cell_axis='column'
    gene_names : `str`, array-like, or `None` (default: None)
        Expects a filename or an array containing a list of gene symbols or ids
    cell_names : `str`, array-like, or `None` (default: None)
        Expects a filename or an array containing a list of cell barcodes.
    sparse : bool, optional (default: None)
        If True, loads the data as a pd.DataFrame[pd.SparseArray].
        This uses less memory but more CPU.

    Returns
    -------
    data : array-like, shape=[n_samples, n_features]
        If either gene or cell names are given, data will be a pd.DataFrame or
        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a
        np.ndarray or scipy.sparse.spmatrix
    """
    if cell_axis not in ["row", "column", "col"]:
        raise ValueError(
            "cell_axis {} not recognized. Expected 'row' or 'column'".format(cell_axis)
        )
    # Read in mtx file
    data = sio.mmread(mtx_file)
    if cell_axis in ["column", "col"]:
        data = data.T
    data = _matrix_to_data_frame(
        data, gene_names=gene_names, cell_names=cell_names, sparse=sparse
    )
    return data
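A usage sketch, assuming a typical 10x-style layout with genes on the rows of the matrix and cells on the columns; the file names are placeholders:

    import scprep

    # placeholder paths; cell_axis='column' because the file stores genes x cells
    data = scprep.io.load_mtx(
        "matrix.mtx",
        cell_axis="column",
        gene_names="genes.tsv",
        cell_names="barcodes.tsv",
        sparse=True,
    )
    print(data.shape)   # cells x genes DataFrame with sparse columns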
Example #7
Source File: embed_func.py From JAPE with MIT License
def get_all_sim_mat_sparse(folder):
    cross_sim_mat = preprocessing.normalize(io.mmread(folder + 'ents_sim.mtx'), norm='l1')
    kb1_sim_mat = preprocessing.normalize(io.mmread(folder + 'kb1_ents_sim.mtx'), norm='l1')
    kb2_sim_mat = preprocessing.normalize(io.mmread(folder + 'kb2_ents_sim.mtx'), norm='l1')
    return cross_sim_mat, kb1_sim_mat, kb2_sim_mat
Example #8
Source File: visAnnos.py From scMatch with MIT License
def main(testFormat, testDS, annoFile, visMethod):
    # load test data
    print('##########loading test data')
    if testFormat == '10x':
        fileItem = glob.glob(os.path.join(testDS, "matrix.mtx"))[0]
        em = io.mmread(fileItem)
        em = em.tocsr().toarray()
        if os.path.exists(os.path.join(testDS, 'genes.tsv')):
            row = pd.read_table(fileItem[:-10] + "genes.tsv", header=None, index_col=None)
        else:
            row = pd.read_table(fileItem[:-10] + "features.tsv", header=None, index_col=None)
        col = pd.read_table(fileItem[:-10] + "barcodes.tsv", header=None, index_col=None)
        em = pd.DataFrame(em, index=row.T.values[1], columns=col.T.values[0])
        savefolder = testDS
    else:
        em = pd.read_csv(testDS, index_col=0, header=0)
        savefolder = testDS[:-4]

    print('##########reducing dimensions')
    cords = CalCords(savefolder, em, visMethod)

    annos = pd.read_csv(annoFile, index_col=0, header=0)
    commonIdx = set(cords.index).intersection(set(annos.index))
    cords = cords.ix[commonIdx,]
    annos = annos.ix[commonIdx,]

    print('##########drawing the scatter plots in the folder: %s' % savefolder)
    DrawScatters(savefolder, annoFile, visMethod, cords, annos)
    print('##########DONE!')
Example #9
Source File: msm.py From enspara with GNU General Public License v3.0
def load(cls, path, manifest='manifest.json'):
    '''Load an MSM object from disk into memory.

    Parameters
    ----------
    path : str
        The location of the root directory of the MSM serialization
    manifest : str
        The name of the file to save as a json manifest of the MSM
        directory (contains the paths to each other file).
    '''
    if not os.path.isdir(path):
        raise NotImplementedError("MSMs don't handle zip archives yet.")

    with open(os.path.join(path, manifest)) as f:
        fname_dict = json.load(f)

    # decorate fname_dict values with path
    fname_dict = {k: os.path.join(path, v) for k, v in fname_dict.items()}

    with open(fname_dict['config'], 'rb') as f:
        config = pickle.load(f)

    msm = MSM(**config)

    msm.tcounts_ = mmread(fname_dict['tcounts_'])
    msm.tprobs_ = mmread(fname_dict['tprobs_'])
    msm.mapping_ = TrimMapping.load(fname_dict['mapping_'])
    msm.eq_probs_ = np.loadtxt(fname_dict['eq_probs_'])

    return msm
Example #10
Source File: loompy.py From loompy with BSD 2-Clause "Simplified" License
def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -> str:
    """
    Create a .loom file from 10X Genomics cellranger output

    Args:
        indir (str):   path to the cellranger output folder (the one that contains 'outs')
        outdir (str):  output folder where the new loom file should be saved (defaults to indir)
        genome (str):  genome build to load (e.g. 'mm10'; if None, determine species from outs folder)

    Returns:
        path (str):    Full path to the created loom file.

    Remarks:
        The resulting file will be named ``{sampleID}.loom``, where the sampleID is the one given by cellranger.
    """
    if outdir is None:
        outdir = indir
    sampleid = os.path.split(os.path.abspath(indir))[-1]
    matrix_folder = os.path.join(indir, 'outs', 'filtered_gene_bc_matrices')
    if os.path.exists(matrix_folder):
        if genome is None:
            genome = [f for f in os.listdir(matrix_folder) if not f.startswith(".")][0]
        matrix_folder = os.path.join(matrix_folder, genome)
        matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).todense()
        genelines = open(os.path.join(matrix_folder, "genes.tsv"), "r").readlines()
        bclines = open(os.path.join(matrix_folder, "barcodes.tsv"), "r").readlines()
    else:  # cellranger V3 file locations
        if genome is None:
            genome = ""  # Genome is not visible from V3 folder
        matrix_folder = os.path.join(indir, 'outs', 'filtered_feature_bc_matrix')
        matrix = mmread(os.path.join(matrix_folder, "matrix.mtx.gz")).todense()
        genelines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "features.tsv.gz"), "r").readlines()]
        bclines = [l.decode() for l in gzip.open(os.path.join(matrix_folder, "barcodes.tsv.gz"), "r").readlines()]

    accession = np.array([x.split("\t")[0] for x in genelines]).astype("str")
    gene = np.array([x.split("\t")[1].strip() for x in genelines]).astype("str")
    cellids = np.array([sampleid + ":" + x.strip() for x in bclines]).astype("str")

    col_attrs = {"CellID": cellids}
    row_attrs = {"Accession": accession, "Gene": gene}

    tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "projection.csv")
    # In cellranger V2 the file moved one level deeper
    if not os.path.exists(tsne_file):
        tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "2_components", "projection.csv")
    if os.path.exists(tsne_file):
        tsne = np.loadtxt(tsne_file, usecols=(1, 2), delimiter=',', skiprows=1)
        col_attrs["X"] = tsne[:, 0].astype('float32')
        col_attrs["Y"] = tsne[:, 1].astype('float32')

    clusters_file = os.path.join(indir, "outs", "analysis", "clustering", "graphclust", "clusters.csv")
    if os.path.exists(clusters_file):
        labels = np.loadtxt(clusters_file, usecols=(1, ), delimiter=',', skiprows=1)
        col_attrs["ClusterID"] = labels.astype('int') - 1

    path = os.path.join(outdir, sampleid + ".loom")
    create(path, matrix, row_attrs, col_attrs, file_attrs={"Genome": genome})
    return path
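A hypothetical invocation, assuming a standard cellranger run directory and that the helper is exposed at the package level as in recent loompy releases; the paths are placeholders:

    import loompy

    # "/data/sample1" is a placeholder for a cellranger run folder containing "outs"
    path = loompy.create_from_cellranger("/data/sample1", outdir="/data/loom")
    print(path)   # e.g. "/data/loom/sample1.loom"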
Example #11
Source File: loompy.py From loompy with BSD 2-Clause "Simplified" License
def create_from_matrix_market(out_file: str, sample_id: str, layer_paths: Dict[str, str],
                              row_metadata_path: str, column_metadata_path: str,
                              delim: str = "\t", skip_row_headers: bool = False,
                              skip_colums_headers: bool = False,
                              file_attrs: Dict[str, str] = None,
                              matrix_transposed: bool = False) -> None:
    """
    Create a .loom file from .mtx matrix market format

    Args:
        out_file: path to the newly created .loom file (will be overwritten if it exists)
        sample_id: string to use as prefix for cell IDs
        layer_paths: dict mapping layer names to paths to the corresponding matrix file (usually with .mtx extension)
        row_metadata_path: path to the row (usually genes) metadata file
        column_metadata_path: path to the column (usually cells) metadata file
        delim: delimiter used for metadata (default: "\t")
        skip_row_headers: if true, skip first line in rows metadata file
        skip_colums_headers: if true, skip first line in columns metadata file
        file_attrs: dict of global file attributes, or None
        matrix_transposed: if true, the main matrix is transposed

    Remarks:
        layer_paths should typically map the empty string to a matrix market file:
        {"": "path/to/filename.mtx"}. To create a multilayer loom file, map multiple
        named layers {"": "path/to/layer1.mtx", "layer2": "path/to/layer2.mtx"}

        Note: the created file MUST have a main layer named "". If no such layer is given,
        BUT all given layers are the same datatype, then a main layer will be created as
        the sum of the other layers. For example,
        {"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"} will create three layers,
        "", "spliced", and "unspliced", where "" is the sum of the other two.
    """
    layers: Dict[str, Union[np.ndarray, scipy.sparse.coo_matrix]] = {}
    for name, path in layer_paths.items():
        matrix = mmread(path)
        if matrix_transposed:
            matrix = matrix.T
        layers[name] = matrix
    if "" not in layers:
        main_matrix = None
        for name, matrix in layers.items():
            if main_matrix is None:
                main_matrix = matrix.copy()
            else:
                main_matrix = main_matrix + matrix
        layers[""] = main_matrix

    genelines = open(row_metadata_path, "r").readlines()
    bclines = open(column_metadata_path, "r").readlines()

    accession = np.array([x.split("\t")[0] for x in genelines]).astype("str")
    if len(genelines[0].split("\t")) > 1:
        gene = np.array([x.split("\t")[1].strip() for x in genelines]).astype("str")
        row_attrs = {"Accession": accession, "Gene": gene}
    else:
        row_attrs = {"Accession": accession}
    cellids = np.array([sample_id + ":" + x.strip() for x in bclines]).astype("str")
    col_attrs = {"CellID": cellids}

    create(out_file, layers[""], row_attrs, col_attrs, file_attrs=file_attrs)
    if len(layers) > 1:
        with loompy.connect(out_file) as ds:
            for name, layer in layers.items():
                if name == "":
                    continue
                ds[name] = layer
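A sketch of the multilayer case described in the docstring, assuming the function is importable from the loompy namespace like the other create_* helpers; the .mtx and .tsv paths are placeholders. Because no "" layer is given, the main layer is built as the sum of "spliced" and "unspliced":

    import loompy

    # placeholder paths; the "" (main) layer will be created as spliced + unspliced
    loompy.create_from_matrix_market(
        "sample1.loom",
        sample_id="sample1",
        layer_paths={"spliced": "spliced.mtx", "unspliced": "unspliced.mtx"},
        row_metadata_path="genes.tsv",
        column_metadata_path="barcodes.tsv",
        matrix_transposed=False,
    )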
Example #12
Source File: cell_collection.py From altanalyze with Apache License 2.0
def from_cellranger_mtx(mtx_directory, genome=None, returnGenes=False):
    """Creates a CellCollection from a sparse matrix (.mtx and associated files) exported by CellRanger

    Recognizes directories from CellRanger version 2 (files: matrix.mtx, genes.tsv, barcodes.tsv)
    and CellRanger v3 (files: matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz)
    """
    start = time.time()
    coll = CellCollection()
    cellranger_version = 2

    if '.mtx' in mtx_directory:
        mtx_file = mtx_directory  ### Hence an mtx file was directly supplied
        mtx_directory = os.path.abspath(os.path.join(mtx_file, os.pardir))
    else:
        mtx_file = os.path.join(mtx_directory, "matrix.mtx")
        if not os.path.exists(mtx_file):
            cellranger_version = 3
            mtx_file = mtx_file + ".gz"
            if not os.path.exists(mtx_file):
                raise Exception("Directory {} does not contain a recognizable matrix file".format(mtx_directory))
    if '.gz' in mtx_file:
        cellranger_version = 3

    sparse_matrix = io.mmread(mtx_file)
    coll._matrix = sparse_matrix.tocsc()
    coll._gene_ids = np.empty((coll._matrix.shape[0], ), np.object)
    coll._gene_names = np.empty((coll._matrix.shape[0], ), np.object)

    if cellranger_version == 2:
        with open(os.path.join(mtx_directory, "genes.tsv"), "rU") as f:
            idx = 0
            for line in f:
                i, n = line.rstrip().split("\t")
                coll._gene_ids[idx] = i
                coll._gene_names[idx] = n
                idx += 1
        with open(os.path.join(mtx_directory, "barcodes.tsv"), "rU") as f:
            coll._barcodes = np.array([line.rstrip() for line in f])
    else:
        with gzip.open(os.path.join(mtx_directory, "features.tsv.gz"), "rt") as f:
            idx = 0
            indices = []
            for line in f:
                i, n, t = line.rstrip().split("\t")
                coll._gene_ids[idx] = i
                coll._gene_names[idx] = n
                if t == 'Gene Expression':
                    indices.append(idx)
                idx += 1
            coll._filter_genes_by_index(indices)
        with gzip.open(os.path.join(mtx_directory, "barcodes.tsv.gz"), "rt") as f:
            coll._barcodes = np.array([line.rstrip() for line in f])

    if returnGenes:
        """ Do not import the matrix at this point """
        return list(coll._gene_names)

    print('sparse matrix data imported from mtx file in %s seconds' % str(time.time() - start))
    return coll
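A hypothetical call against a CellRanger v3 output folder; the directory name is a placeholder, and how the function is exposed (module-level or attached to CellCollection) depends on the surrounding class, which is not shown here. Passing returnGenes=True returns only the gene names without building the collection:

    # placeholder directory containing matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz
    coll = from_cellranger_mtx("filtered_feature_bc_matrix")
    genes = from_cellranger_mtx("filtered_feature_bc_matrix", returnGenes=True)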