Python pysam.tabix_compress() Examples

The following are 11 code examples of pysam.tabix_compress(). They are drawn from open-source projects; the project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the pysam module.
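For orientation before the project examples: pysam.tabix_compress(infile, outfile) writes a bgzip-compressed copy of a plain-text file, and pysam.tabix_index() builds the .tbi index that makes region queries possible. A minimal sketch of the round trip (the file name regions.bed is hypothetical, and the file must be coordinate-sorted):

import pysam

# bgzip-compress a sorted, tab-delimited file; force=True overwrites
# any existing output
pysam.tabix_compress('regions.bed', 'regions.bed.gz', force=True)

# build the index; preset="bed" means chrom/start/end are columns 1-3
pysam.tabix_index('regions.bed.gz', preset='bed', force=True)

# query a region (0-based, half-open coordinates) from the indexed file
with pysam.TabixFile('regions.bed.gz') as tbx:
    for line in tbx.fetch('chr1', 0, 1000):
        print(line)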
Example #1
Source File: samtools_variants.py    From ariba with GNU General Public License v3.0
def _make_vcf_and_read_depths_files(self):
        if not os.path.exists(self.ref_fa + '.fai'):
            pysam.faidx(self.ref_fa)

        tmp_vcf = self.vcf_file + '.tmp'
        with open(tmp_vcf, 'w') as f:
            print(pysam.mpileup(
                '-t', 'INFO/AD,INFO/ADF,INFO/ADR',
                '-L', '99999999',
                '-A',
                '-f', self.ref_fa,
                '-u',
                '-v',
                self.bam,
            ), end='', file=f)

        got = vcfcall_ariba.vcfcall_ariba(tmp_vcf, self.outprefix, self.min_var_read_depth, self.min_second_var_read_depth, self.max_allele_freq)
        if got != 0:
            raise Error('Error parsing vcf file. Cannot continue')

        pysam.tabix_compress(self.outprefix + '.read_depths', self.read_depths_file)
        pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
        os.unlink(self.outprefix + '.read_depths')
        os.unlink(tmp_vcf) 
Example #2
Source File: tabix.py    From svviz with MIT License
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath+".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath+".gz")
            if not os.path.exists(bedPath+".gz"):
                raise Exception("Failed to create compress {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(preset=preset, file=bedPath))
        bedPath += ".gz"
    if not os.path.exists(bedPath+".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath+".tbi"):
            raise Exception("Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset, file=bedPath))

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError("BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)")
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath


# def sortFile(uncompressedPath, preset):
#     if preset == "bed":
#         fields = {"chrom":0, "start":1, "end":2}
#     elif preset == "gff":
#         fields = {"chrom":0, "start":3, "end":4}

#     sortCommand = "sort -k{chrom}V -k{start}n -k{end}n".format(**fields)

#     tabixCommand = "{sort} {path} | bgzip > {path}.gz".format(sort=sortCommand, path=uncompressedPath)

#     logging.info("Trying to sort input annotation file with command:")
#     logging.info("  {}".format(tabixCommand))

#     subprocess.check_call(tabixCommand, shell=True) 
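A typical call to the helper above, with a hypothetical path (note that ensureIndexed returns the path of the bgzipped file, which may differ from the path passed in):

# compresses and indexes annotations.bed if needed; returns "annotations.bed.gz"
indexedPath = ensureIndexed("annotations.bed", preset="bed")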
Example #3
Source File: get_ins.py    From NucleoATAC with MIT License
def get_ins(args, bases = 50000, splitsize = 1000):
    """function to get insertions

    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.ins.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeIns, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        if args.smooth:
            tmp = pool1.map(_insHelperSmooth, zip(j,itertools.repeat(args)))
        else:
            tmp = pool1.map(_insHelper, zip(j,itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.ins.bedgraph')
    pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True) 
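The compress-then-remove-then-index sequence that closes this function (and recurs in the NucleoATAC examples below) can likely be collapsed in recent pysam versions: pysam.tabix_index() will bgzip an uncompressed input itself, appending .gz to the name, and newer releases expose a keep_original flag for retaining the plain-text file. A sketch under that assumption:

# assuming a recent pysam: tabix_index() compresses the plain bedgraph
# in place (removing the original by default) and writes the .tbi index
pysam.tabix_index(args.out + '.ins.bedgraph', preset="bed", force=True)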
Example #4
Source File: get_cov.py    From NucleoATAC with MIT License
def get_cov(args, bases = 50000, splitsize = 1000):
    """function to get coverages

    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.cov.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeCov, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j,itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset = "bed", force = True) 
Example #5
Source File: bedgraph.py    From NucleoATAC with MIT License
def tabix_bedgraph(bedgraph):
    pysam.tabix_compress(bedgraph,bedgraph+'.gz')
    pysam.tabix_index(bedgraph+'.gz', seq_col=0, start_col=1, end_col=2, zerobased=True) 
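The zerobased=True flag tells tabix that start coordinates in the file are 0-based, as in BED/bedgraph. Once indexed, the track can be queried by region; a small sketch, assuming a hypothetical input file track.bedgraph:

tabix_bedgraph('track.bedgraph')

# fetch(reference, start, end) uses 0-based, half-open coordinates
with pysam.TabixFile('track.bedgraph.gz') as tbx:
    for line in tbx.fetch('chr1', 10000, 20000):
        chrom, start, end, value = line.split('\t')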
Example #6
Source File: make_bias_track.py    From NucleoATAC with MIT License
def make_bias_track(args, bases = 500000, splitsize = 1000):
    """function to compute bias track

    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.checkChroms(params.chrs.keys())
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2,int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes = max(1,args.cores-1))
    out_handle = open(args.out + '.Scores.bedgraph','w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeBias, args=(write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j,itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset = "bed", force = True) 
Example #7
Source File: diff_occ.py    From NucleoATAC with MIT License
def run_diff(args, bases = 500000):
    """run differential occupancy calling

    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank + args.upper/2 + max(pwm.up,pwm.down))
    chunks.merge()
    maxQueueSize = max(2,int(100 * bases / np.mean([chunk.length() for chunk in chunks])))
    #get fragmentsizes
    fragment_dist1 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes1).get(0,args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes2).get(0,args.upper))
    fragment_dist2.modelNFR()
    # note: the original source referenced an undefined `fragment_dist`;
    # `fragment_dist1` is assumed here so the call can run
    params = OccupancyParameters(fragment_dist1, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ,
            flank = args.flank, bam = args.bam, ci = args.confidence_interval)
    sets = chunks.split(bases = bases)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    diff_handle = open(args.out + '.occdiff.bed','w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target = _writeDiff, args=(diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz',force = True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset = "bed", force = True) 
Example #8
Source File: file_utils.py    From pheweb with GNU Affero General Public License v3.0
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)

    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1, # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1, # skip header
    ) 
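As the inline comment notes, pysam numbers columns from 0 while the tabix command line numbers them from 1, so the call above corresponds roughly to tabix -s 1 -b 2 -e 2 -S 1 on the command line. Reading a region back from the indexed file (chromosome name and coordinates hypothetical):

# query the indexed variant file by region
with pysam.TabixFile(ivf_path) as tbx:
    for row in tbx.fetch('1', 1000000, 2000000):
        fields = row.split('\t')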
Example #9
Source File: read_store.py    From ariba with GNU General Public License v3.0
def _compress_and_index_file(infile, log_fh=None):
        if log_fh is not None:
            print('Compressing file', infile, file=log_fh, flush=True)
        pysam.tabix_compress(infile, infile + '.gz')
        pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1) 
Example #10
Source File: run_nuc.py    From NucleoATAC with MIT License
def run_nuc(args):
    """run occupancy calling

    """
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = vmat.mat.shape[1] + vmat.upper/2 + max(pwm.up,pwm.down) + args.nuc_sep/2, min_length = args.nuc_sep * 2)
    chunks.slop(chrs, up = args.nuc_sep/2, down = args.nuc_sep/2)
    chunks.merge()
    maxQueueSize = args.cores*10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        fragment_dist = FragmentSizes(0, upper = vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat = vmat, fragmentsizes = fragment_dist, bam = args.bam, fasta = args.fasta, pwm = args.pwm,
                           occ_track = args.occ_track,
                           sd = args.sd, nonredundant_sep = args.nuc_sep, redundant_sep = args.redundant_sep,
                           min_z = args.min_z, min_lr = args.min_lr , atac = args.atac)
    sets = chunks.split(items = args.cores*5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    if args.write_all:
        outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth',
                       'nucleoatac_background','nucleoatac_raw']
    else:
        outputs = ['nucpos','nucpos.redundant','nucleoatac_signal','nucleoatac_signal.smooth']
    handles = {}
    write_queues = {}
    write_processes = {}
    for i in outputs:
        if i not in ['nucpos','nucpos.redundant','nfrpos']:
            handles[i] = open(args.out + '.'+i+'.bedgraph','w')
        else:
            handles[i] = open(args.out + '.'+i+'.bed','w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize = maxQueueSize)
        write_processes[i] = mp.Process(target = _writeFuncs[i], args=(write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')
    for i in outputs:
        write_processes[i].join()
        if i not in ['nucpos','nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out +  '.' + i + '.bedgraph.gz',force = True)
            shell_command('rm ' + args.out +  '.' + i + '.bedgraph')
            pysam.tabix_index(args.out +  '.' + i + '.bedgraph.gz', preset = "bed", force = True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed', args.out +  '.' + i + '.bed.gz',force = True)
            shell_command('rm ' + args.out +  '.' + i + '.bed')
            pysam.tabix_index(args.out +  '.' + i + '.bed.gz', preset = "bed", force = True) 
Example #11
Source File: run_nfr.py    From NucleoATAC with MIT License
def run_nfr(args):
    """run nfr calling

    """
    if args.bam is None and args.ins_track is None:
        raise Exception("Must supply either bam file or insertion track")
    if not args.out:
        args.out = '.'.join(os.path.basename(args.calls).split('.')[0:-3])
    if args.fasta is not None:
        chrs_fasta = read_chrom_sizes_from_fasta(args.fasta)
        pwm = PWM.open(args.pwm)
        chunks = ChunkList.read(args.bed, chromDict = chrs_fasta, min_offset = max(pwm.up, pwm.down))
    else:
        chunks = ChunkList.read(args.bed)
    if args.bam is not None:
        chrs_bam = read_chrom_sizes_from_bam(args.bam)
        chunks.checkChroms(chrs_bam, chrom_source = "BAM file") 
    chunks.merge()
    maxQueueSize = args.cores * 10 
    params = NFRParameters(args.occ_track, args.calls, args.ins_track, args.bam, max_occ = args.max_occ, max_occ_upper = args.max_occ_upper,
                            fasta = args.fasta, pwm = args.pwm)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1,args.cores-1))
    nfr_handle = open(args.out + '.nfrpos.bed','w')
    nfr_handle.close()
    nfr_queue = mp.JoinableQueue()
    nfr_process = mp.Process(target = _writeNFR, args=(nfr_queue, args.out))
    nfr_process.start()
    if params.ins_track is None:
        ins_handle = open(args.out + '.ins.bedgraph','w')
        ins_handle.close()
        ins_queue = mp.JoinableQueue()
        ins_process = mp.Process(target = _writeIns, args=(ins_queue, args.out))
        ins_process.start()
    for j in sets:
        tmp = pool1.map(_nfrHelper, zip(j,itertools.repeat(params)))
        for result in tmp:
            if params.ins_track is None:
                nfr_queue.put(result[0])
                ins_queue.put(result[1])
            else:
                nfr_queue.put(result)
    pool1.close()
    pool1.join()
    nfr_queue.put('STOP')
    nfr_process.join()
    if params.ins_track is None:
        ins_queue.put('STOP')
        ins_process.join()
    pysam.tabix_compress(args.out + '.nfrpos.bed', args.out + '.nfrpos.bed.gz',force = True)
    shell_command('rm ' + args.out + '.nfrpos.bed')
    pysam.tabix_index(args.out + '.nfrpos.bed.gz', preset = "bed", force = True)
    if params.ins_track is None:
        pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
        shell_command('rm ' + args.out + '.ins.bedgraph')
        pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)