Python pysam.sort() Examples

The following are 28 code examples of pysam.sort(), collected from open-source projects. The project and source file for each example are noted above its code. You may also want to check out the other available functions and classes of the pysam module.
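As a quick orientation before the examples: modern pysam delegates sort() to samtools sort, so the output file is named with the -o flag. A minimal sketch (the file names here are invented placeholders):

import pysam

# Coordinate-sort an alignment file and index the result.
pysam.sort("-o", "sorted.bam", "input.bam")
pysam.index("sorted.bam")

# Sort by read name instead (-n), using 4 threads (-@):
pysam.sort("-n", "-@", "4", "-o", "namesorted.bam", "input.bam")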
Example #1
Source File: filter.py    From slamdunk with GNU Affero General Public License v3.0
def bamSort(outputBAM, log, newHeader, verbose):

    tmp = outputBAM + "_tmp"
    if newHeader is not None:
        pyOutputBAM = pysam.AlignmentFile(outputBAM, "rb")
        pyTmp = pysam.AlignmentFile(tmp, "wb", header=newHeader)
        for read in pyOutputBAM:
            pyTmp.write(read)
        pyOutputBAM.close()
        pyTmp.close()
    else:
        os.rename(outputBAM, tmp)

    #run(" ".join(["samtools", "sort", "-@", str(threads) , tmp, replaceExtension(outFile, "")]), log, verbose=verbose, dry=dry)
    run(" ".join(["samtools sort", "-o", outputBAM, tmp]), log, verbose=verbose, dry=False)
    #pysam.sort(tmp, outputBAM)  # @UndefinedVariable
    removeFile(tmp) 
Example #2
Source File: tepid.py    From TEPID with GNU General Public License v3.0
def check_name_sorted(bam, p, prefix):
    """
    Sort bam file by name unless it is already name-sorted
    """
    test_head = pysam.AlignmentFile(bam, 'rb')
    p = str(p)
    try:
        test_head.header['HD']['SO']
    except KeyError:
        print('  sorting bam file')
        pysam.sort('-@', p, '-n', bam, prefix + 'sorted.temp')
        os.remove(bam)
        os.rename(prefix + 'sorted.temp.bam', bam)
    else:
        if test_head.header['HD']['SO'] == 'queryname':
            pass
        else:
            print('  sorting bam file')
            pysam.sort('-@', p, '-n', bam, prefix + 'sorted.temp')
            os.remove(bam)
            os.rename(prefix + 'sorted.temp.bam', bam)
    test_head.close() 
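Note that this example (like Examples #8 and #14 below) uses the legacy calling convention, in which the final argument is an output prefix and samtools appends ".bam", which is why the code renames 'sorted.temp.bam' afterwards. With current pysam/samtools the output is named explicitly; an equivalent of the call above would be:

# Modern equivalent of pysam.sort('-@', p, '-n', bam, prefix + 'sorted.temp'):
pysam.sort('-@', p, '-n', '-o', prefix + 'sorted.temp.bam', bam)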
Example #3
Source File: sambamconverter.py    From READemption with ISC License
def sam_to_bam(self, sam_path, bam_path_prefix):
        if self._sam_file_is_empty(sam_path) is True:
            # pysam will generate an error if an empty SAM file is
            # converted. Because of this, an empty BAM file with the
            # same header information is generated from scratch instead.
            self._generate_empty_bam_file(sam_path, bam_path_prefix)
            # Remove SAM file
            os.remove(sam_path)
            return
        temp_unsorted_bam_path = self._temp_unsorted_bam_path(
            bam_path_prefix)
        # Generate unsorted BAM file
        pysam.samtools.view(
            "-b", "-o{}".format(temp_unsorted_bam_path), sam_path,
            catch_stdout=False)
        # Generate sorted BAM file
        pysam.sort(temp_unsorted_bam_path, "-o", bam_path_prefix + ".bam")
        # Generate index for BAM file
        pysam.index("%s.bam" % bam_path_prefix)
        # Remove unsorted BAM file
        os.remove(temp_unsorted_bam_path)
        # Remove SAM file
        os.remove(sam_path) 
Example #4
Source File: utils.py    From smallrnaseq with GNU General Public License v3.0
def sam_to_bam(filename):
    """Convert sam to bam"""

    import pysam
    infile = pysam.AlignmentFile(filename, "r")
    name = os.path.splitext(filename)[0]+'.bam'
    bamfile = pysam.AlignmentFile(name, "wb", template=infile)
    for s in infile:
        bamfile.write(s)
    infile.close()
    # close the write handle so the BAM is complete before sorting
    bamfile.close()
    pysam.sort("-o", name, name)
    pysam.index(name)
    return name
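Sorting a file onto its own path, as above, works because samtools sort reads the entire input (spilling to temporary chunks if needed) before writing the output. If you would rather not rely on that, a more defensive variant sorts to a temporary name first (a sketch; the temp naming is invented):

import os
import pysam

tmp = name + '.sorting.bam'   # invented temporary name
pysam.sort('-o', tmp, name)
os.replace(tmp, name)         # atomically move the sorted file over the original
pysam.index(name)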
Example #5
Source File: PipelineIntervals.py    From CGATPipelines with MIT License
def createGenomeWindows(genome, outfile, windows):

    statement = '''
        cgat windows2gff
            --genome=%(genome)s
            --output-format=bed
            --fixed-width-windows=%(windows)s
            --log=%(outfile)s.log |
        bedtools sort -i stdin |
        gzip > %(outfile)s
        '''
    P.run() 
Example #6
Source File: PipelinePeakcalling.py    From CGATPipelines with MIT License
def mergeSortIndex(bamfiles, out):
    '''
    Merge bamfiles into a single sorted, indexed outfile.
    Generates and runs a command line statement.

    Example Statement:
    samtools merge ctmpljriXY.bam K9-13-1_filtered.bam K9-13-2_filtered.bam
    K9-13-3_filtered.bam;
    samtools sort ctmpljriXY.bam -o ctmpYH6llm.bam;
    samtools index ctmpYH6llm.bam;
    mv ctmpYH6llm.bam 13_Heart_pooled_filtered.bam;
    mv ctmpYH6llm.bam.bai 13_Heart_pooled_filtered.bam.bai;

    Parameters
    ----------
    bamfiles: list
        list of paths to bam files to merge
    out: str
        path to output file

    '''
    infiles = " ".join(bamfiles)
    T1 = P.getTempFilename(".")
    T2 = P.getTempFilename(".")
    statement = """samtools merge %(T1)s.bam %(infiles)s;
    samtools sort %(T1)s.bam -o %(T2)s.bam;
    samtools index %(T2)s.bam;
    mv %(T2)s.bam %(out)s;
    mv %(T2)s.bam.bai %(out)s.bai""" % locals()
    P.run()
    os.remove("%s.bam" % T1)
    os.remove(T1)
    os.remove(T2) 
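The same merge, sort, index chain can also be driven through pysam instead of a shell statement. A rough sketch, not part of the CGAT pipeline API (the helper name and temp naming are invented):

import os
import pysam

def merge_sort_index_pysam(bamfiles, out):
    """Merge BAM files, coordinate-sort the result, and index it."""
    tmp = out + '.unsorted.bam'        # invented temporary name
    pysam.merge('-f', tmp, *bamfiles)  # samtools merge
    pysam.sort('-o', out, tmp)         # coordinate sort
    pysam.index(out)                   # writes out + '.bai'
    os.remove(tmp)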
Example #7
Source File: simulator.py    From slamdunk with GNU Affero General Public License v3.0
def prepareBED(bed, slamSimBed, minLength):
    utrs = []
    for utr in BedIterator(bed):
        utrs.append(utr)

    utrs.sort(key=lambda x: (x.name, -x.getLength()))

    outBed = open(slamSimBed, "w")

    partList = []
    lastUtr = None
    for utr in utrs:
        if utr.hasStrand() and utr.hasNonEmptyName():
            currentUtr = utr.name
            if currentUtr == lastUtr:
                partList.append(utr)
            else:
                if lastUtr is not None:
                    printUTR(partList[0], outBed, minLength)
                partList = [utr]
            lastUtr = currentUtr
        else:
            print("Warning: Invalid BED entry found: " + str(utr))

    if lastUtr is not None:
        printUTR(partList[0], outBed, minLength)

    outBed.close() 
Example #8
Source File: tepid.py    From TEPID with GNU General Public License v3.0
def check_bam(bam, p, prefix, make_new_index=False):
    """
    Sort and index bam file
    returns dictionary of chromosome names and lengths
    """
    # check if sorted
    test_head = pysam.AlignmentFile(bam, 'rb')
    chrom_sizes = {}
    p = str(p)
    for i in test_head.header['SQ']:
        chrom_sizes[i['SN']] = int(i['LN'])
    try:
        test_head.header['HD']['SO']
    except KeyError:
        print('  sorting bam file')
        pysam.sort('-@', p, bam, prefix + 'sorted.temp')
        os.remove(bam)
        os.rename(prefix + 'sorted.temp.bam', bam)
    else:
        if test_head.header['HD']['SO'] == 'coordinate':
            pass
        else:
            print('  sorting bam file')
            pysam.sort('-@', p, bam, prefix + 'sorted.temp')
            os.remove(bam)
            os.rename(prefix + 'sorted.temp.bam', bam)
    test_head.close()
    # check if indexed
    if '{}.bai'.format(bam) in os.listdir('.') and make_new_index is False:
        pass
    else:
        print('  indexing bam file')
        pysam.index(bam)
    return chrom_sizes 
Example #9
Source File: PipelinePeakcalling.py    From CGATPipelines with MIT License
def sortIndex(bamfile):
    '''
    Sorts and indexes a bam file.
    Generates and runs a command line statement.

    Example Statement:
    samtools sort K9-10-1_filtered.bam -o ctmpvHoczK.bam;
    samtools index ctmpvHoczK.bam;
    mv ctmpvHoczK.bam K9-10-1_filtered.bam;
    mv ctmpvHoczK.bam.bai K9-10-1_filtered.bam.bai

    The input bam file is replaced by the sorted bam file.

    Parameters
    ----------
    bamfile: str
        path to bam file to sort and index

    '''
    T1 = P.getTempFilename(".")
    bamfile = P.snip(bamfile)
    statement = """
    samtools sort %(bamfile)s.bam -o %(T1)s.bam;
    samtools index %(T1)s.bam;
    mv %(T1)s.bam %(bamfile)s.bam;
    mv %(T1)s.bam.bai %(bamfile)s.bam.bai""" % locals()
    P.run()
    os.remove(T1) 
Example #10
Source File: tepid.py    From TEPID with GNU General Public License v3.0
def refine(options):
    """
    Refine TE insertion and deletion calls within a group of related samples
    Use indel calls from other samples in the group, inspect areas of the genome in samples where
    indel was not called, and look for evidence of the same indel with much lower
    read count threshold
    """
    te = pybedtools.BedTool(options.te).sort()
    names = readNames(options.all_samples)
    if options.insertions is not False:
        insertions = getOtherLines(names, options.insertions)
    if options.deletions is not False:
        deletions = getOtherLines(names, options.deletions)  # format ([data], [inverse_accessions])
    print "Processing "+options.name
    chrom_sizes = check_bam(options.conc, options.proc, options.prefix)
    check_bam(options.split, options.proc, options.prefix, make_new_index=True)
    cov = calc_cov(options.conc, 100000, 120000)
    concordant = pysam.AlignmentFile(options.conc, 'rb')
    split_alignments = pysam.AlignmentFile(options.split, 'rb')
    name_indexed = pysam.IndexedReads(split_alignments)
    name_indexed.build()
    if options.deletions is not False:
        print "  checking deletions"
        process_missed(deletions, "deletion", concordant, split_alignments, name_indexed, options.name, te, cov/5, chrom_sizes, options.prefix)
    else:
        pass
    if options.insertions is not False:
        print "  checking insertions"
        process_missed(insertions, "insertion", concordant, split_alignments, name_indexed, options.name, te, cov/10, chrom_sizes, options.prefix)
    else:
        pass 
Example #11
Source File: tepid.py    From TEPID with GNU General Public License v3.0
def check_te_overlaps_dels(te, bamfile, te_list, prefix):
    pybedtools.BedTool(bamfile).bam_to_bed().saveas(prefix + 'split.temp')
    convert_split_pairbed(prefix + 'split.temp', prefix + 'split_bedpe.temp')
    split = pybedtools.BedTool(prefix + 'split_bedpe.temp').each(append_origin, word='split').saveas()
    create_deletion_coords(split, prefix + 'second_pass_del_coords.temp')
    dels = pybedtools.BedTool(prefix + 'second_pass_del_coords.temp').sort()
    intersections = dels.intersect(te, wb=True, nonamecheck=True, sorted=True)
    os.remove(prefix + 'second_pass_del_coords.temp')
    reads = []
    for r in intersections:
        if r[-3] in te_list:
            reads.append(r[3])
        else:
            pass
    return reads 
Example #12
Source File: mock_data.py    From medaka with Mozilla Public License 2.0
def create_simple_bam(fname, calls):
    """Create a small bam file with RLE encoding coded in the qscores."""
    ref_len = len(simple_data['ref'])

    header = {'HD': {'VN': '1.0'},
              'SQ': [{'LN': ref_len, 'SN': 'ref'}]}

    tmp_file = '{}.tmp'.format(fname)
    with pysam.AlignmentFile(
            tmp_file, 'wb', reference_names=['ref', ],
            reference_lengths=[ref_len, ], header=header) as output:
        for index, basecall in enumerate(calls):
            a = common.initialise_alignment(
                query_name=basecall['query_name'],
                reference_id=0,
                reference_start=0,
                query_sequence=basecall['seq'],
                cigarstring=basecall['cigarstring'],
                flag=basecall['flag'],
                query_qualities=basecall['quality'],
                tags=basecall['tags'])

            output.write(a)

    pysam.sort("-o", fname, tmp_file)
    os.remove(tmp_file)
    pysam.index(fname) 
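The write-to-temp, sort, index sequence above is the standard way to produce an indexable BAM, since pysam.index requires coordinate-sorted input. As a reusable helper it might look like this (a sketch; the function name is invented):

import os
import pysam

def finalise_bam(tmp_path, final_path):
    """Coordinate-sort a freshly written BAM and index the result."""
    pysam.sort('-o', final_path, tmp_path)
    os.remove(tmp_path)       # discard the unsorted intermediate
    pysam.index(final_path)   # requires coordinate-sorted input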
Example #13
Source File: PipelinePeakcalling.py    From CGATPipelines with MIT License
def createGenomeWindows(genome, outfile, windows):

    statement = '''
        cgat windows2gff
            --genome=%(genome)s
            --output-format=bed
            --fixed-width-windows=%(windows)s
            --log=%(outfile)s.log |
        bedtools sort -i stdin |
        gzip > %(outfile)s
        '''
    P.run() 
Example #14
Source File: app.py    From svviz with MIT License
def saveReads(dataHub, nameExtra=None):
    if dataHub.args.save_reads:
        logging.info("* Saving relevant reads *")
        for i, sample in enumerate(dataHub):
            outbam_path = dataHub.args.save_reads
            if not outbam_path.endswith(".bam"):
                outbam_path += ".bam"

            if len(dataHub.samples) > 1:
                logging.debug("Using i = {}".format(i))
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(i))

            if nameExtra is not None:
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(nameExtra))

            logging.info("  Outpath: {}".format(outbam_path))

            # print out just the reads we're interested for use later
            bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam)
            for read in sample.reads:
                bam_small.write(read)

            for read in sample.readStatistics.reads:
                bam_small.write(read)

            bam_small.close()
            sorted_path = outbam_path.replace(".bam", ".sorted")
            pysam.sort(outbam_path, sorted_path)
            pysam.index(sorted_path+".bam") 
Example #15
Source File: process_sams.py    From TALON with MIT License
def partition_reads(sam_files, datasets, tmp_dir = "talon_tmp/", n_threads = 0):
    """ Use bedtools merge to create non-overlapping intervals from all of the
        transcripts in a series of SAM/BAM files. Then, iterate over the intervals
        to extract all reads inside of them from the pysam object.
       
        Returns:
            - List of lists: sublists contain pysam reads from a given interval
            - List of tuple intervals
            - filename of merged bam file (to keep track of the header)
           """

    merged_bam = preprocess_sam(sam_files, datasets, tmp_dir = tmp_dir, 
                                n_threads = n_threads)

    try:
        all_reads = pybedtools.BedTool(merged_bam).bam_to_bed()
    except Exception as e:
        print(e)
        raise RuntimeError("Problem opening sam file %s" % (merged_bam))

    # Must sort the Bedtool object
    sorted_reads = all_reads.sort()
    intervals = sorted_reads.merge(d = 100000000)

    # Now open each sam file using pysam and extract the reads
    coords = []
    read_groups = []
    with pysam.AlignmentFile(merged_bam) as bam:  # type: pysam.AlignmentFile
        for interval in intervals:
            reads = get_reads_in_interval(bam, interval.chrom,
                                          interval.start, interval.end)
            read_groups.append(reads)
            coords.append((interval.chrom, interval.start + 1, interval.end))

    return read_groups, coords, merged_bam 
Example #16
Source File: process_sams.py    From TALON with MIT License
def preprocess_sam(sam_files, datasets, tmp_dir = "talon_tmp/", n_threads = 0):
    """ Copy and rename the provided SAM/BAM file(s), merge them, and index.
        This is necessary in order to use Pybedtools commands on the reads.
        The renaming is necessary in order to label the reads according to
        their dataset."""

    # Create the tmp dir
    os.system("mkdir -p %s " % (tmp_dir))

    # Copy and rename SAM files with dataset names to ensure correct RG tags
    renamed_sams = []
    for sam, dataset in zip(sam_files, datasets):
        suffix = "." + sam.split(".")[-1]
        if suffix == ".sam":
            bam_copy = tmp_dir + dataset + "_unsorted.bam"
            convert_to_bam(sam, bam_copy)
            sam = bam_copy
        sorted_bam = tmp_dir + dataset + ".bam"
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam)
        renamed_sams.append(sorted_bam)

    merged_bam = tmp_dir + "merged.bam"
    merge_args = [merged_bam] + renamed_sams + ["-f", "-r", "-@", str(n_threads)]
    # index_args = [merged_bam, "-@", str(n_threads)]

    # Merge datasets and use -r option to include a read group tag
    try:
        pysam.merge(*merge_args)
        pysam.index(merged_bam)
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] Merged input SAM/BAM files" % (ts))
    except:
        raise RuntimeError(("Problem merging and indexing SAM/BAM files. "
                            "Check your file paths and make sure that all "
                            "files have headers."))
    return merged_bam 
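A hypothetical invocation, with file and dataset names invented for illustration:

# Two datasets, one still in SAM format; all names are placeholders.
merged = preprocess_sam(["brain_rep1.sam", "brain_rep2.bam"],
                        ["brain_rep1", "brain_rep2"],
                        tmp_dir="talon_tmp/", n_threads=4)
# merged == "talon_tmp/merged.bam": sorted, indexed, and carrying per-dataset
# RG tags (samtools merge -r infers them from the input file names).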
Example #17
Source File: utils.py    From smallrnaseq with GNU General Public License v3.0
def featurecounts(samfile, gtffile):
    """Count aligned features with the featureCounts program.
       Returns: a dataframe of counts"""

    params = '-T 5 -t exon -g transcript_id'
    cmd = 'featureCounts %s -a %s -o counts.txt %s' %(params, gtffile, samfile)
    print (cmd)
    result = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
    counts = pd.read_csv('counts.txt', sep='\t', comment='#')
    counts = counts.rename(columns={samfile:'reads'})
    # DataFrame.sort was removed from pandas; sort_values is the replacement
    counts = counts.sort_values('reads', ascending=False)
    return counts 
Example #18
Source File: utils.py    From smallrnaseq with GNU General Public License v3.0
def print_read_stack(reads, refseq=None, cutoff=0, by=None, label=''):
    """Print local read alignments from a sam file against the mapped sequence.
       Args:
        reads: dataframe of read counts with position info
        refseq: sequence segment or gene that reads are mapped to
        cutoff: don't display reads with <= cutoff counts
        by: column to sort reads by, e.g. 'reads', 'start' or 'end'
       Returns:
        string of printed read stacks to display or write to a file
    """

    if refseq is not None:
        seqlen = len(refseq)
    else:
        seqlen = reads.end.max()
    reads = reads[reads.reads>=cutoff]
    if by is not None:
        reads = reads.sort_values(by, ascending=False)

    l = len(reads)
    if l==0: return
    s = ''
    s += '%s unique reads\n' %l
    s += '-------------------------------------\n'
    s += refseq+'\n'
    for idx,r in reads.iterrows():
        seq = r.seq
        count = r.reads
        pos = find_subseq(refseq, seq)
        if pos == -1: continue
        i = len(seq)+pos
        s += "{:>{w}} ({c})\n".format(seq,w=i,c=count)
    s += '\n'
    return s 
Example #19
Source File: PipelinePeakcalling.py    From CGATPipelines with MIT License
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''build a BAM file suitable for peak calling.

    This method uses bedtools_.

    Infiles are merged and unmapped reads removed.

    Arguments
    ---------
    infiles : list
        List of :term:`bam` formatted files.
    outfile : string
        Filename of output file in :term:`bam` format.
    dedup : int
        If True, remove duplicate reads using Picard.
    mask : string
        If given, `mask` should be a :term:`bed` formatted
        file. Reads falling into these regions are removed.
    '''

    # open the infiles, if more than one merge and sort first using samtools.
    statement = []

    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (" ".join(infiles)))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        job_memory = "16G"
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        >& %(outfile)s.picardlog''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    # do not link, as local scratch will not be available on shared
    # node
    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run() 
Example #20
Source File: PipelinePeakcalling.py    From CGATPipelines with MIT License
def bedGraphToBigwig(infile, contigsfile, outfile,
                     remove=True, sort_bedGraph=False):
    '''convert a bedgraph file to a bigwig file.

    The bedgraph file is deleted on success.

    Arguments
    ---------
    infile : string
        Input file in :term:`bedgraph` format. Must be sorted: sort -k1,1 -k2,2n
    contigsfile : string
        file of chromosome sizes containing two columns: name and length
    outfile : string
        Filename of output file in :term:`bigwig` format.
    remove : boolean
        remove bedgraph file after successful conversion. Default True
    sort_bedGraph : boolean
        sort the bedgraph file before conversion. Default False

    if not os.path.exists(infile):
        raise OSError("bedgraph file %s does not exist" % infile)

    if not os.path.exists(contigsfile):
        raise OSError("contig size file %s does not exist" % contigsfile)

    if sort_bedGraph:
        statement = '''sorted_bdg=`mktemp -p %(local_tmpdir)s`;
                           checkpoint;
                           sort -k1,1 -k2,2n %(infile)s > $sorted_bdg;
                           checkpoint;
                           bedGraphToBigWig $sorted_bdg
                                            %(contigsfile)s %(outfile)s;
                           checkpoint;
                           rm $sorted_bdg;
                        '''

    else:
        statement = '''
                          bedGraphToBigWig %(infile)s %(contigsfile)s %(outfile)s
                        '''

    P.run()

    if os.path.exists(outfile) and not IOTools.isEmpty(outfile):
        os.remove(infile) 
Example #21
Source File: PipelineChipseq.py    From CGATPipelines with MIT License
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    ''' Make a BAM file suitable for peak calling.

        Infiles are merged and unmapped reads removed. 

        If specified, duplicate reads are removed.
        This step uses Picard.

        If a mask is specified, reads falling within
        the mask are filtered out. 

        This uses bedtools.

        The mask is a quicksect object containing
        the regions from which reads are to be excluded.
    '''

    # open the infiles, if more than one merge and sort first using samtools.

    samfiles = []
    num_reads = 0
    nfiles = 0

    statement = []

    tmpfile = P.getTempFilename(".")

    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (" ".join(infiles)))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run()

Example #22
Source File: PipelineIntervals.py    From CGATPipelines with MIT License
def bedGraphToBigwig(infile, contigsfile, outfile,
                     remove=True, sort_bedGraph=False):
    '''convert a bedgraph file to a bigwig file.

    The bedgraph file is deleted on success.

    Arguments
    ---------
    infile : string
        Input file in :term:`bedgraph` format. Must be sorted: sort -k1,1 -k2,2n
    contigsfile : string
        file of chromosome sizes containing two columns: name and length
    outfile : string
        Filename of output file in :term:`bigwig` format.
    remove : boolean
        remove bedgraph file after successful conversion. Default True
    sort_bedGraph : boolean
        sort the bedgraph file before conversion. Default False
    '''

    if not os.path.exists(infile):
        raise OSError("bedgraph file %s does not exist" % infile)

    if not os.path.exists(contigsfile):
        raise OSError("contig size file %s does not exist" % contigsfile)

    if sort_bedGraph:
        statement = '''sorted_bdg=`mktemp -p %(local_tmpdir)s`;
                           checkpoint;
                           sort -k1,1 -k2,2n %(infile)s > $sorted_bdg;
                           checkpoint;
                           bedGraphToBigWig $sorted_bdg
                                            %(contigsfile)s %(outfile)s;
                           checkpoint;
                           rm $sorted_bdg;
                        '''

    else:
        statement = '''
                          bedGraphToBigWig %(infile)s %(contigsfile)s %(outfile)s
                        '''

    P.run()

    if os.path.exists(outfile) and not IOTools.isEmpty(outfile):
        os.remove(infile) 
Example #23
Source File: PipelineIntervals.py    From CGATPipelines with MIT License
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''build a BAM file suitable for peak calling.

    This method uses bedtools_.

    Infiles are merged and unmapped reads removed.

    Arguments
    ---------
    infiles : list
        List of :term:`bam` formatted files.
    outfile : string
        Filename of output file in :term:`bam` format.
    dedup : int
        If True, remove duplicate reads using Picard.
    mask : string
        If given, `mask` should be a :term:`bed` formatted
        file. Reads falling into these regions are removed.
    '''

    # open the infiles, if more than one merge and sort first using samtools.
    statement = []

    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (" ".join(infiles)))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        job_memory = "16G"
        statement.append('''picard MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        >& %(outfile)s.picardlog''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    # do not link, as local scratch will not be available on shared
    # node
    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run() 
Example #24
Source File: tepid.py    From TEPID with GNU General Public License v3.0
def process_merged(infile, outfile, sd):
    """
    take merged coordinates and filter out those where multiple non-nested TEs insert into same locus
    resolve cases where insertion site is within another TE
    """
    disc = ['disc_forward', 'disc_reverse']
    with open(infile, 'r') as inf, open(outfile, 'w+') as outf:
        for line in inf:
            line = line.rsplit()
            te_chroms = line[3].split(',')
            te_names = line[7].split(',')
            r2_chrom = line[9].split(',')  # TE read, at end of file. Should be two clusters for split reads if TE large enough
            r2_start = line[10].split(',')
            r2Starts = [int(x) for x in r2_start]
            r2_stop = line[11].split(',')
            r2Stops = [int(x) for x in r2_stop]

            # create list of tuples to record read positions
            r2 = []
            for x in range(len(r2_chrom)):
                r2.append((r2_chrom[x], r2Starts[x], r2Stops[x], te_names[x]))
            # now sort reads by chr, start
            r2 = sorted(r2, key=lambda x: (x[0], x[1]))
            r2_coords = _condense_coords(r2, 10)

            starts = line[4].split(',')
            starts = [int(x) for x in starts]
            stops = line[5].split(',')
            stops = [int(x) for x in stops]
            te_reads = []
            for x in range(len(te_chroms)):
                te_reads.append((te_chroms[x], starts[x], stops[x], te_names[x]))
            te_reads = sorted(te_reads, key=lambda x: (x[0], x[1]))

            # filter where there are at least two clusters of reads in the TE if the TE length is more than 200 bp and read is split
            if (len(r2_coords) >= 2) or sd in disc or (abs(max(stops) - min(starts)) <= 200):
                coords = _condense_coords(te_reads, 1000)
                if len(coords) > 1:
                    crd, max_reads, total_smaller_clusters, dubs = get_main_cluster(coords)
                    te_name = ','.join(crd[3])
                else:
                    crd = coords[0][0:3]
                    max_reads = coords[0][3]
                    te_name = ','.join(coords[0][4])
                    total_smaller_clusters = 0
                    dubs = 0
                if total_smaller_clusters >= (max_reads - 2) or dubs >= 2:
                    pass
                else:
                    outf.write('{ch}\t{sta}\t{stp}\t{tec}\t{tesa}\t{tesp}\t{rds}\t{nm}\t{cnt}\t{sd}\n'.format(ch=line[0],
                                                                                                        sta=line[1],
                                                                                                        stp=line[2],
                                                                                                        tec=crd[0],
                                                                                                        tesa=crd[1],
                                                                                                        tesp=crd[2],
                                                                                                        rds=line[6],
                                                                                                        nm=te_name,
                                                                                                        cnt=line[8],
                                                                                                        sd=sd))
            else:
                pass 
Example #25
Source File: test_rle.py    From medaka with Mozilla Public License 2.0
def setUpClass(cls):
        """Create temporary files and bam file

        Ref     T  T  A  A    C  T  T  T  G
        Read1         A  A    C  T  T  T  G
        Read2      T  A  A  A C  T  T  T  G
        """
        cls.bam_input = tempfile.NamedTemporaryFile(suffix='.bam').name
        cls.bam_output = tempfile.NamedTemporaryFile(suffix='.bam').name
        cls.ref_fname = tempfile.NamedTemporaryFile(suffix='.fasta').name

        with open(cls.ref_fname, 'w') as fasta:
            fasta.write('>ref\n')
            fasta.write('TTAACTTTG\n')

        header = {
            'HD': {'VN': '1.0'},
            'SQ': [{'LN': 9, 'SN': 'ref'}, ]}

        basecalls = {
            'read1': {
                'query_name': 'read1',
                'reference_id': 0,
                'reference_start': 2,
                'query_sequence': 'AACTTTG',
                'cigarstring': '7=',
                'flag': 0,
                'mapping_quality': 50},
            'read2': {
                'query_name': 'read2',
                'reference_id': 0,
                'reference_start': 1,
                'query_sequence': 'TAAACTTTG',
                'cigarstring': '3=1I5=',
                'flag': 0,
                'mapping_quality': 50}}

        tmp_file = '{}.tmp'.format(cls.bam_input)
        with pysam.AlignmentFile(tmp_file, 'wb', header=header) as bam:
            for basecall in basecalls.values():
                record = common.initialise_alignment(**basecall)
                bam.write(record)

        pysam.sort("-o", cls.bam_input, tmp_file)
        os.remove(tmp_file)
        pysam.index(cls.bam_input) 
Example #26
Source File: rle.py    From medaka with Mozilla Public License 2.0
def _compress_bam(bam_input, bam_output, ref_fname,
                  fast5_dir=None, summary=None, regions=None, threads=1):
    """Compress a bam into run length encoding (RLE).

    :param bam_input: str, name of the bam file to be compressed
    :param bam_output: str, name of the bam to be produced
    :param ref_fname: str, reference filename, used to produce bam_input
    :param fast5_dir: str, root directory to find the fast5 files
    :param summary: str, filename of a summary name, with the columns
        to link the read id to the fast5 file containing it. Must contain
        columns 'read_id' and 'filename'
    :param regions: list, genomic regions to be extracted
    :param threads: int, number of workers to be used

    :returns: None
    """
    regions = medaka.common.get_regions(bam_input, regions)
    ref_fasta = pysam.FastaFile(ref_fname)

    # If fast5_dir is passed, create an index
    if fast5_dir:
        with open(summary) as input_file:
            # Summary files can be huge, avoid loading to memory
            col_names = input_file.readline().replace('#', '').split()
            col_filename = col_names.index('filename')
            col_readid = col_names.index('read_id')

            file_index = dict(
                (line.split()[col_readid], line.split()[col_filename])
                for line in input_file)
    else:
        file_index = None

    with pysam.AlignmentFile(bam_input, 'r') as alignments_bam:
        tmp_output = '{}.tmp'.format(bam_output)
        with pysam.AlignmentFile(
                tmp_output, 'wb', header=alignments_bam.header) as output:
            for region in regions:
                bam_current = alignments_bam.fetch(
                    reference=region.ref_name,
                    start=region.start,
                    end=region.end)
                ref_sequence = ref_fasta.fetch(region.ref_name)
                ref_rle = RLEConverter(ref_sequence)
                func = functools.partial(
                    _compress_alignment,
                    ref_rle=ref_rle,
                    fast5_dir=fast5_dir,
                    file_index=file_index)
                with concurrent.futures.ThreadPoolExecutor(
                        max_workers=threads) as executor:
                    for chunk in medaka.common.grouper(bam_current, 100):
                        for new_alignment in executor.map(func, chunk):
                            if new_alignment is not None:
                                output.write(new_alignment)

        pysam.sort("-o", bam_output, tmp_output)
        os.remove(tmp_output)
        pysam.index(bam_output) 
Example #27
Source File: segemehl.py    From READemption with ISC License
def align_reads(
        self, read_file_or_pair, index_file, fasta_files, output_file,
            hit_strategy=1, accuracy=95, evalue=5.0, threads=1, split=False,
            segemehl_format=False, order=False, nonmatch_file=None,
            other_parameters=None, paired_end=False):
        if not paired_end:
            assert type(read_file_or_pair) == str
            segemehl_call = [
                self._segemehl_bin,
                "--query", read_file_or_pair]
        else:
            assert type(read_file_or_pair) == list
            segemehl_call = [
                self._segemehl_bin,
                "--query", read_file_or_pair[0],
                "--mate", read_file_or_pair[1]]
        segemehl_call += [
            "--index", index_file,
            "--database"] + fasta_files + [
            "--outfile", output_file,
            "--bamabafixoida",
            "--hitstrategy", str(hit_strategy),
            "--accuracy", str(accuracy),
            "--evalue", str(evalue),
            "--threads", str(threads)]
        if segemehl_format:
            segemehl_call.append("--SEGEMEHL")
        if order is True:
            segemehl_call.append("--order")
        if split is True:
            segemehl_call.append("--splits")
        if nonmatch_file:
            segemehl_call += ["--nomatchfilename", nonmatch_file]
        if self._show_progress is False:
            #segemehl_call += ["--silent"]
            pass
        if other_parameters:
            segemehl_call.append(other_parameters)
        # Discard standard error output
        if self._show_progress is False:
            with open(os.devnull, "w") as devnull:
                call(segemehl_call, stderr=devnull)
        else:
            call(segemehl_call)
        # Discard unmapped reads for further analysis.
        # Unmapped reads are stored in the folder output/align/unaligned_reads.
        tmp_filtered_output_file = f"{output_file}_filtered"
        pysam.view("-b",
                   "-F", "4",
                   "-o", tmp_filtered_output_file,
                   output_file,
                   catch_stdout=False)
        os.rename(tmp_filtered_output_file, output_file)

        tmp_sorted_outfile = f"{output_file}_sorted"
        pysam.sort("-o", tmp_sorted_outfile, output_file)
        os.rename(tmp_sorted_outfile, output_file)
        pysam.index(output_file) 
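The post-processing at the end, dropping unmapped reads (SAM flag 0x4) and then sorting and indexing, is a generally useful chain. A standalone sketch of the same steps (the helper name is invented):

import os
import pysam

def filter_sort_index(bam_path):
    """Remove unmapped reads from a BAM, coordinate-sort it, and index it."""
    filtered = bam_path + '.filtered'
    # -F 4 excludes records whose unmapped flag (0x4) is set
    pysam.view('-b', '-F', '4', '-o', filtered, bam_path, catch_stdout=False)
    pysam.sort('-o', bam_path, filtered)
    os.remove(filtered)
    pysam.index(bam_path)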
Example #28
Source File: talon_label_reads.py    From TALON with MIT License
def split_reads_by_chrom(sam_file, tmp_dir = "tmp_label_reads", n_threads = 1):
    """ Reads a SAM/BAM file and splits the reads into one file per chromosome.
        Returns a list of the resulting filenames."""

    ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    print("[ %s ] Splitting SAM by chromosome..." % (ts))

    tmp_dir = tmp_dir + "/raw"
    os.system("mkdir -p %s" %(tmp_dir))

    if sam_file.endswith(".sam"):
        # Convert to bam
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] -----Converting to bam...." % (ts))
        bam_file = tmp_dir + "/all_reads.bam"
        pysam.view("-b", "-S", "-@", str(n_threads), "-o", bam_file, sam_file, 
                   catch_stdout=False)
    elif sam_file.endswith(".bam"):
        bam_file = sam_file
    else:
        raise ValueError("Please provide a .sam or .bam file")

    # Index the file if no index exists
    if not os.path.isfile(bam_file + ".bai"):
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] -----Sorting and indexing..." % (ts))
        sorted_bam = tmp_dir + "/all_reads.sorted.bam"
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, bam_file)
        bam_file = sorted_bam
        pysam.index(bam_file)
        
    # Open bam file
    tmp_dir += "/chroms"
    os.system("mkdir -p %s" %(tmp_dir))
    read_files = []
    ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    print("[ %s ] -----Writing chrom files..." % (ts))
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over chromosomes and write a reads file for each
        chromosomes = [ x.contig for x in bam.get_index_statistics() \
                        if x.mapped > 0 ]
        for chrom in chromosomes:
           records = bam.fetch(chrom)
           fname = tmp_dir + "/" + chrom + ".sam"
           with pysam.AlignmentFile(fname, "w", template = bam) as o: 
               for record in records:
                   o.write(record)
           read_files.append(fname)

    return read_files