Python pysam.tabix_index() Examples
The following are 16
code examples of pysam.tabix_index().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pysam
, or try the search function
.
Example #1
Source File: prepare_compliance_data.py From ga4gh-server with Apache License 2.0 | 6 votes |
def addVariantSet( self, variantFileName, dataset, referenceSet, ontology, biosamples): inputVcf = os.path.join( self.inputDirectory, variantFileName) outputVcf = os.path.join( self.outputDirectory, variantFileName) shutil.copy(inputVcf, outputVcf) pysam.tabix_index(outputVcf, preset="vcf") variantSet = variants.HtslibVariantSet( dataset, variantFileName.split('_')[1]) variantSet.setReferenceSet(referenceSet) variantSet.populateFromFile( [os.path.abspath(outputVcf + ".gz")], [os.path.abspath(outputVcf + ".gz.tbi")]) variantSet.checkConsistency() for callSet in variantSet.getCallSets(): for biosample in biosamples: if biosample.getLocalId() == callSet.getLocalId(): callSet.setBiosampleId(biosample.getId()) self.repo.insertVariantSet(variantSet) for annotationSet in variantSet.getVariantAnnotationSets(): annotationSet.setOntology(ontology) self.repo.insertVariantAnnotationSet(annotationSet)
Example #2
Source File: matrix.py From pheweb with GNU Affero General Public License v3.0 | 6 votes |
def run(argv): if '-h' in argv or '--help' in argv: print('Make a single large tabixed file of all phenotypes data') exit(1) if should_run(): # we don't need `ffi.new('char[]', ...)` because args are `const` ret = lib.cffi_make_matrix(sites_filepath.encode('utf8'), common_filepaths['pheno']('*').encode('utf8'), matrix_gz_tmp_filepath.encode('utf8')) ret_bytes = ffi.string(ret, maxlen=1000) if ret_bytes != b'ok': raise PheWebError('The portion of `pheweb matrix` written in c++/cffi failed with the message ' + repr(ret_bytes)) os.rename(matrix_gz_tmp_filepath, matrix_gz_filepath) pysam.tabix_index( filename=matrix_gz_filepath, force=True, seq_col=0, start_col=1, end_col=1 # note: these are 0-based, but `/usr/bin/tabix` is 1-based ) else: print('matrix is up-to-date!')
Example #3
Source File: samtools_variants.py From ariba with GNU General Public License v3.0 | 6 votes |
def _make_vcf_and_read_depths_files(self): if not os.path.exists(self.ref_fa + '.fai'): pysam.faidx(self.ref_fa) tmp_vcf = self.vcf_file + '.tmp' with open(tmp_vcf, 'w') as f: print(pysam.mpileup( '-t', 'INFO/AD,INFO/ADF,INFO/ADR', '-L', '99999999', '-A', '-f', self.ref_fa, '-u', '-v', self.bam, ), end='', file=f) got = vcfcall_ariba.vcfcall_ariba(tmp_vcf, self.outprefix, self.min_var_read_depth, self.min_second_var_read_depth, self.max_allele_freq) if got != 0: raise Error('Error parsing vcf file. Cannot contine') pysam.tabix_compress(self.outprefix + '.read_depths', self.read_depths_file) pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1) os.unlink(self.outprefix + '.read_depths') os.unlink(tmp_vcf)
Example #4
Source File: tabix.py From svviz with MIT License | 5 votes |
def ensureIndexed(bedPath, preset="bed", trySorting=True): if not bedPath.endswith(".gz"): if not os.path.exists(bedPath+".gz"): logging.info("bgzf compressing {}".format(bedPath)) pysam.tabix_compress(bedPath, bedPath+".gz") if not os.path.exists(bedPath+".gz"): raise Exception("Failed to create compress {preset} file for {file}; make sure the {preset} file is " "sorted and the directory is writeable".format(preset=preset, file=bedPath)) bedPath += ".gz" if not os.path.exists(bedPath+".tbi"): logging.info("creating tabix index for {}".format(bedPath)) pysam.tabix_index(bedPath, preset=preset) if not os.path.exists(bedPath+".tbi"): raise Exception("Failed to create tabix index file for {file}; make sure the {preset} file is " "sorted and the directory is writeable".format(preset=preset, file=bedPath)) line = next(pysam.Tabixfile(bedPath).fetch()) if len(line.strip().split("\t")) < 6 and preset == "bed": raise AnnotationError("BED files need to have at least 6 (tab-delimited) fields (including " "chrom, start, end, name, score, strand; score is unused)") if len(line.strip().split("\t")) < 9 and preset == "gff": raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields") return bedPath # def sortFile(uncompressedPath, preset): # if preset == "bed": # fields = {"chrom":0, "start":1, "end":2} # elif preset == "gff": # fields = {"chrom":0, "start":3, "end":4} # sortCommand = "sort -k{chrom}V -k{start}n -k{end}n".format(**fields) # tabixCommand = "{sort} {path} | bgzip > {path}.gz".format(sort=sortCommand, path=uncompressedPath) # logging.info("Trying to sort input annotation file with command:") # logging.info(" {}".format(tabixCommand)) # subprocess.check_call(tabixCommand, shell=True)
Example #5
Source File: get_ins.py From NucleoATAC with MIT License | 5 votes |
def get_ins(args, bases = 50000, splitsize = 1000): """function to get insertions """ if not args.out: if args.bed is None: args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1]) else: args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1]) if args.bed is None: chrs = read_chrom_sizes_from_bam(args.bam) chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize) sets = chunks.split(items = bases/splitsize) else: chunks = ChunkList.read(args.bed) chunks.merge() sets = chunks.split(bases = bases) maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks]))) pool1 = mp.Pool(processes = max(1,args.cores-1)) out_handle = open(args.out + '.ins.bedgraph','w') out_handle.close() write_queue = mp.JoinableQueue(maxsize = maxQueueSize) write_process = mp.Process(target = _writeIns, args=(write_queue, args.out)) write_process.start() for j in sets: if args.smooth: tmp = pool1.map(_insHelperSmooth, zip(j,itertools.repeat(args))) else: tmp = pool1.map(_insHelper, zip(j,itertools.repeat(args))) for track in tmp: write_queue.put(track) pool1.close() pool1.join() write_queue.put('STOP') write_process.join() pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True) shell_command('rm ' + args.out + '.ins.bedgraph') pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
Example #6
Source File: get_cov.py From NucleoATAC with MIT License | 5 votes |
def get_cov(args, bases = 50000, splitsize = 1000): """function to get coverages """ if not args.out: if args.bed is None: args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1]) else: args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1]) if args.bed is None: chrs = read_chrom_sizes_from_bam(args.bam) chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize) sets = chunks.split(items = bases/splitsize) else: chunks = ChunkList.read(args.bed) chunks.merge() sets = chunks.split(bases = bases) maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks]))) pool1 = mp.Pool(processes = max(1,args.cores-1)) out_handle = open(args.out + '.cov.bedgraph','w') out_handle.close() write_queue = mp.JoinableQueue(maxsize = maxQueueSize) write_process = mp.Process(target = _writeCov, args=(write_queue, args.out)) write_process.start() for j in sets: tmp = pool1.map(_covHelper, zip(j,itertools.repeat(args))) for track in tmp: write_queue.put(track) pool1.close() pool1.join() write_queue.put('STOP') write_process.join() pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force = True) shell_command('rm ' + args.out + '.cov.bedgraph') pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset = "bed", force = True)
Example #7
Source File: bedgraph.py From NucleoATAC with MIT License | 5 votes |
def tabix_bedgraph(bedgraph): pysam.tabix_compress(bedgraph,bedgraph+'.gz') pysam.tabix_index(bedgraph+'.gz', seq_col=0, start_col=1, end_col=2, zerobased=True)
Example #8
Source File: make_bias_track.py From NucleoATAC with MIT License | 5 votes |
def make_bias_track(args, bases = 500000, splitsize = 1000): """function to compute bias track """ if args.out is None: if args.bed is not None: args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1]) else: args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1]) params = _BiasParams(args.fasta, args.pwm) if args.bed is None: chunks = ChunkList.convertChromSizes(params.chrs, splitsize = splitsize) sets = chunks.split(items = bases/splitsize) else: chunks = ChunkList.read(args.bed) chunks.checkChroms(params.chrs.keys()) chunks.merge() sets = chunks.split(bases = bases) maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks]))) pool = mp.Pool(processes = max(1,args.cores-1)) out_handle = open(args.out + '.Scores.bedgraph','w') out_handle.close() write_queue = mp.JoinableQueue(maxsize = maxQueueSize) write_process = mp.Process(target = _writeBias, args=(write_queue, args.out)) write_process.start() for j in sets: tmp = pool.map(_biasHelper, zip(j,itertools.repeat(params))) for track in tmp: write_queue.put(track) pool.close() pool.join() write_queue.put('STOP') write_process.join() pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force = True) shell_command('rm ' + args.out + '.Scores.bedgraph') pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset = "bed", force = True)
Example #9
Source File: diff_occ.py From NucleoATAC with MIT License | 5 votes |
def run_diff(args, bases = 500000): """run differential occupancy calling """ chrs = read_chrom_sizes_from_bam(args.bam) pwm = PWM.open(args.pwm) chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank + args.upper/2 + max(pwm.up,pwm.down)) chunks.merge() maxQueueSize = max(2,int(100 * bases / np.mean([chunk.length() for chunk in chunks]))) #get fragmentsizes fragment_dist1 = FragmentMixDistribution(0, upper = args.upper) fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes1).get(0,args.upper)) fragment_dist1.modelNFR() fragment_dist2 = FragmentMixDistribution(0, upper = args.upper) fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes2).get(0,args.upper)) fragment_dist2.modelNFR() params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ, flank = args.flank, bam = args.bam, ci = args.confidence_interval) sets = chunks.split(bases = bases) pool1 = mp.Pool(processes = max(1,args.cores-1)) diff_handle = open(args.out + '.occdiff.bed','w') diff_handle.close() diff_queue = mp.JoinableQueue() diff_process = mp.Process(target = _writeDiff, args=(diff_queue, args.out)) diff_process.start() nuc_dist = np.zeros(args.upper) for j in sets: tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params))) for result in tmp: diff_queue.put(result[1]) pool1.close() pool1.join() diff_queue.put('STOP') diff_process.join() pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz',force = True) shell_command('rm ' + args.out + '.occdiff.bed') pysam.tabix_index(args.out + '.occdiff.bed.gz', preset = "bed", force = True)
Example #10
Source File: vcf_utils.py From metasv with BSD 2-Clause "Simplified" License | 5 votes |
def merge_vcfs(in_vcfs_dir, contigs, out_vcf): logger.info("Mergings per-chromosome VCFs from %s" % in_vcfs_dir) header_done = False out_vcf_file = open(out_vcf, "w") for contig in contigs: chr_vcf = os.path.join(in_vcfs_dir, "%s.vcf.gz" % contig.name) if os.path.isfile(chr_vcf): chr_tabix_file = pysam.Tabixfile(chr_vcf) if not header_done: print_header(chr_tabix_file.header, out_vcf_file) for entry in chr_tabix_file.fetch(): out_vcf_file.write("%s\n" % entry) chr_tabix_file.close() out_vcf_file.close() pysam.tabix_index(out_vcf, force=True, preset="vcf")
Example #11
Source File: svtool_to_vcf.py From metasv with BSD 2-Clause "Simplified" License | 5 votes |
def convert_svtool_to_vcf(file_name, sample, out_vcf, toolname, reference, sort=False, index=False): vcf_template_reader = get_template() vcf_template_reader.samples = [sample] vcf_fd = open(out_vcf, "w") if out_vcf is not None else sys.stdout vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader) reference_handle = pysam.Fastafile(reference) if reference else None reference_contigs = get_contigs(reference) if sort: if not reference_contigs: logger.warn("Chromosomes will be sorted in lexicographic order since reference is missing") else: logger.info("Chromosomes will be sorted by the reference order") vcf_records = [] for tool_record in tool_to_reader[toolname](file_name, reference_handle=reference_handle): vcf_record = tool_record.to_vcf_record(sample) if vcf_record is None: continue if sort: vcf_records.append(vcf_record) else: vcf_writer.write_record(vcf_record) if sort: if reference_contigs: contigs_order_dict = {contig.name: index for (index, contig) in enumerate(reference_contigs)} vcf_records.sort( cmp=lambda x, y: cmp((contigs_order_dict[x.CHROM], x.POS), (contigs_order_dict[y.CHROM], y.POS))) else: vcf_records.sort(cmp=lambda x, y: cmp((x.CHROM, x.POS), (y.CHROM, y.POS))) for vcf_record in vcf_records: vcf_writer.write_record(vcf_record) vcf_writer.close() if out_vcf and index: pysam.tabix_index(out_vcf, force=True, preset='vcf')
Example #12
Source File: file_utils.py From pheweb with GNU Affero General Public License v3.0 | 5 votes |
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path): make_basedir(ivf_path) tmp_path = get_tmp_path(ivf_path) pysam.tabix_compress(vf_path, tmp_path, force=True) os.rename(tmp_path, ivf_path) pysam.tabix_index( filename=ivf_path, force=True, seq_col=0, start_col=1, end_col=1, # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`. line_skip=1, # skip header )
Example #13
Source File: read_store.py From ariba with GNU General Public License v3.0 | 5 votes |
def _compress_and_index_file(infile, log_fh=None): if log_fh is not None: print('Compressing file', infile, file=log_fh, flush=True) pysam.tabix_compress(infile, infile + '.gz') pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1)
Example #14
Source File: demography.py From momi2 with GNU General Public License v3.0 | 4 votes |
def simulate_vcf(self, out_prefix, mutation_rate, recombination_rate, length, chrom_name=1, ploidy=1, random_seed=None, force=False, print_aa=True): out_prefix = os.path.expanduser(out_prefix) vcf_name = out_prefix + ".vcf" bed_name = out_prefix + ".bed" for fname in (vcf_name, bed_name): if not force and os.path.isfile(fname): raise FileExistsError( "{} exists and force=False".format(fname)) if np.any(self.sampled_n % ploidy != 0): raise ValueError("Sampled alleles per population must be" " integer multiple of ploidy") with open(bed_name, "w") as bed_f: print(chrom_name, 0, length, sep="\t", file=bed_f) with open(vcf_name, "w") as vcf_f: treeseq = self.simulate_trees( mutation_rate=mutation_rate, recombination_rate=recombination_rate, length=length, num_replicates=1, random_seed=random_seed) print("##fileformat=VCFv4.2", file=vcf_f) print('##source="VCF simulated by momi2 using' ' msprime backend"', file=vcf_f) print("##contig=<ID={chrom_name},length={length}>".format( chrom_name=chrom_name, length=length), file=vcf_f) print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">', file=vcf_f) print('##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">', file=vcf_f) n_samples = int(np.sum(self.sampled_n) / ploidy) fields = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"] for pop, n in zip(self.sampled_pops, self.sampled_n): for i in range(int(n / ploidy)): fields.append("{}_{}".format(pop, i)) print(*fields, sep="\t", file=vcf_f) loc = next(treeseq) if print_aa: info_str = "AA=A" else: info_str = "." for v in loc.variants(): gt = np.reshape(v.genotypes, (n_samples, ploidy)) print(chrom_name, int(np.floor(v.position)), ".", "A", "T", ".", ".", info_str, "GT", *["|".join(map(str, sample)) for sample in gt], sep="\t", file=vcf_f) pysam.tabix_index(vcf_name, preset="vcf", force=force)
Example #15
Source File: run_nuc.py From NucleoATAC with MIT License | 4 votes |
def run_nuc(args): """run occupancy calling """ vmat = VMat.open(args.vmat) if args.fasta: chrs = read_chrom_sizes_from_fasta(args.fasta) else: chrs = read_chrom_sizes_from_bam(args.bam) pwm = PWM.open(args.pwm) chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = vmat.mat.shape[1] + vmat.upper/2 + max(pwm.up,pwm.down) + args.nuc_sep/2, min_length = args.nuc_sep * 2) chunks.slop(chrs, up = args.nuc_sep/2, down = args.nuc_sep/2) chunks.merge() maxQueueSize = args.cores*10 if args.sizes is not None: fragment_dist = FragmentSizes.open(args.sizes) else: fragment_dist = FragmentSizes(0, upper = vmat.upper) fragment_dist.calculateSizes(args.bam, chunks) params = NucParameters(vmat = vmat, fragmentsizes = fragment_dist, bam = args.bam, fasta = args.fasta, pwm = args.pwm, occ_track = args.occ_track, sd = args.sd, nonredundant_sep = args.nuc_sep, redundant_sep = args.redundant_sep, min_z = args.min_z, min_lr = args.min_lr , atac = args.atac) sets = chunks.split(items = args.cores*5) pool1 = mp.Pool(processes = max(1,args.cores-1)) if args.write_all: outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth', 'nucleoatac_background','nucleoatac_raw'] else: outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth'] handles = {} write_queues = {} write_processes = {} for i in outputs: if i not in ['nucpos','nucpos.redundant','nfrpos']: handles[i] = open(args.out + '.'+i+'.bedgraph','w') else: handles[i] = open(args.out + '.'+i+'.bed','w') handles[i].close() write_queues[i] = mp.JoinableQueue(maxsize = maxQueueSize) write_processes[i] = mp.Process(target = _writeFuncs[i], args=(write_queues[i], args.out)) write_processes[i].start() for j in sets: tmp = pool1.map(_nucHelper, zip(j,itertools.repeat(params))) for result in tmp: for i in outputs: write_queues[i].put(result[i]) pool1.close() pool1.join() for i in outputs: write_queues[i].put('STOP') for i in outputs: write_processes[i].join() if i not in ['nucpos','nucpos.redundant']: pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.' + i + '.bedgraph.gz',force = True) shell_command('rm ' + args.out + '.' + i + '.bedgraph') pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset = "bed", force = True) else: pysam.tabix_compress(args.out + '.' + i + '.bed', args.out + '.' + i + '.bed.gz',force = True) shell_command('rm ' + args.out + '.' + i + '.bed') pysam.tabix_index(args.out + '.' + i + '.bed.gz', preset = "bed", force = True)
Example #16
Source File: run_nfr.py From NucleoATAC with MIT License | 4 votes |
def run_nfr(args): """run nfr calling """ if args.bam is None and args.ins_track is None: raise Exception("Must supply either bam file or insertion track") if not args.out: args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3]) if args.fasta is not None: chrs_fasta = read_chrom_sizes_from_fasta(args.fasta) pwm = PWM.open(args.pwm) chunks = ChunkList.read(args.bed, chromDict = chrs_fasta, min_offset = max(pwm.up, pwm.down)) else: chunks = ChunkList.read(args.bed) if args.bam is not None: chrs_bam = read_chrom_sizes_from_bam(args.bam) chunks.checkChroms(chrs_bam, chrom_source = "BAM file") chunks.merge() maxQueueSize = args.cores * 10 params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper, fasta = args.fasta, pwm = args.pwm) sets = chunks.split(items = args.cores * 5) pool1 = mp.Pool(processes = max(1,args.cores-1)) nfr_handle = open(args.out + '.nfrpos.bed','w') nfr_handle.close() nfr_queue = mp.JoinableQueue() nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out)) nfr_process.start() if params.ins_track is None: ins_handle = open(args.out + '.ins.bedgraph','w') ins_handle.close() ins_queue = mp.JoinableQueue() ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out)) ins_process.start() for j in sets: tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params))) for result in tmp: if params.ins_track is None: nfr_queue.put(result[0]) ins_queue.put(result[1]) else: nfr_queue.put(result) pool1.close() pool1.join() nfr_queue.put('STOP') nfr_process.join() if params.ins_track is None: ins_queue.put('STOP') ins_process.join() pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True) shell_command('rm ' + args.out + '.nfrpos.bed') pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True) if params.ins_track is None: pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True) shell_command('rm ' + args.out + '.ins.bedgraph') pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)