Python Examples of pysam.index

Source File: pipeline_cufflinks_optimization.py From CGATPipelines with MIT License

6 votes

def linkBamToWorkingDirs(infiles, outfile):
    '''
    symlink the bam file and index to the working directories
    for execution of the transcript building pipeline
    '''

    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile), os.path.join(directory, bamfile))
        os.symlink(
            os.path.abspath(indexfile), os.path.join(directory, indexfile))
    updateFile(outfile)

###################################################################
###################################################################
###################################################################

Source File: download_example_data.py From ga4gh-server with Apache License 2.0

6 votes

def createBamHeader(self, baseHeader):
        """
        Creates a new bam header based on the specified header from the
        parent BAM file.
        """
        header = dict(baseHeader)
        newSequences = []
        for index, referenceInfo in enumerate(header['SQ']):
            if index < self.numChromosomes:
                referenceName = referenceInfo['SN']
                # The sequence dictionary in the BAM file has to match up
                # with the sequence ids in the data, so we must be sure
                # that these still match up.
                assert referenceName == self.chromosomes[index]
                newReferenceInfo = {
                    'AS': self.referenceSetName,
                    'SN': referenceName,
                    'LN': 0,  # FIXME
                    'UR': 'http://example.com',
                    'M5': 'dbb6e8ece0b5de29da56601613007c2a',  # FIXME
                    'SP': 'Human'
                }
                newSequences.append(newReferenceInfo)
        header['SQ'] = newSequences
        return header

Source File: utils.py From smallrnaseq with GNU General Public License v3.0

6 votes

def get_aligned_reads(samfile, collapsed=None, readcounts=None):
    """Get all aligned reads from a sam file into a pandas dataframe"""

    sam = HTSeq.SAM_Reader(str(samfile))
    f=[]
    for a in sam:
        if a.aligned == True:
            seq = a.read.seq.decode()
            f.append((seq,a.read.name,a.iv.chrom,a.iv.start,a.iv.end,a.iv.strand))
        #else:
        #    f.append((seq,a.read.name,'_unmapped'))
    counts = pd.DataFrame(f, columns=['seq','read','name','start','end','strand'])
    counts['length'] = counts.seq.str.len()
    counts = counts.drop(['read'],1)
    if collapsed is not None:
        readcounts = read_collapsed_file(collapsed)
    if readcounts is not None:
        counts = counts.merge(readcounts, on='seq')
        counts['align_id'] = counts.index
    return counts

Source File: utils.py From GelReportModels with Apache License 2.0

6 votes

def parseArgs(self):
        description = "{} to {} conversion tool".format(
            self.inputFileFormat, self.outputFileFormat)
        parser = argparse.ArgumentParser(
            description=description)
        inputHelpText = "the name of the {} file to read".format(
            self.inputFileFormat)
        parser.add_argument(
            "inputFile", help=inputHelpText)
        outputHelpText = "the name of the {} file to write".format(
            self.outputFileFormat)
        defaultOutputFilePath = "out.{}".format(
            self.outputFileFormat.lower())
        parser.add_argument(
            "--outputFile", "-o", default=defaultOutputFilePath,
            help=outputHelpText)
        parser.add_argument(
            "--numLines", "-n", default=10,
            help="the number of lines to write")
        parser.add_argument(
            "--skipIndexing", default=False, action='store_true',
            help="don't create an index file")
        args = parser.parse_args()
        self.args = args

Source File: sambamconverter.py From READemption with ISC License

6 votes

def sam_to_bam(self, sam_path, bam_path_prefix):
        if self._sam_file_is_empty(sam_path) is True:
            # pysam will generate an error if an emtpy SAM file will
            # be converted. Due to this an empty bam file with the
            # same header information will be generated from scratch
            self._generate_empty_bam_file(sam_path, bam_path_prefix)
            # Remove SAM file
            os.remove(sam_path)
            return
        temp_unsorted_bam_path = self._temp_unsorted_bam_path(
            bam_path_prefix)
        # Generate unsorted BAM file
        pysam.samtools.view(
            "-b", "-o{}".format(temp_unsorted_bam_path), sam_path,
            catch_stdout=False)
        # Generate sorted BAM file
        pysam.sort(temp_unsorted_bam_path, "-o", bam_path_prefix + ".bam")
        # Generate index for BAM file
        pysam.index("%s.bam" % bam_path_prefix)
        # Remove unsorted BAM file
        os.remove(temp_unsorted_bam_path)
        # Remove SAM file
        os.remove(sam_path)

Source File: rle.py From medaka with Mozilla Public License 2.0

6 votes

def rlebam(args):
    """Entry point for merging run length information for fast5s to bam."""
    logger = medaka.common.get_named_logger('BAMDecor')
    read_index = medaka.common.read_key_value_tsv(args.read_index)
    logger.info("Found {} read in index\n".format(len(read_index)))

    def _ingress():
        for line in sys.stdin:
            if line[0] == '@':
                yield line.rstrip(), None, None, None
            else:
                read_id, flag, _ = line.split('\t', 2)
                is_rev = bool(int(flag) & 16)
                fname = read_index[read_id]
                yield line.rstrip(), read_id, is_rev, fname

    with concurrent.futures.ProcessPoolExecutor(
            max_workers=args.workers) as executor:
        for results in executor.map(
                _rle_bam_hdf_worker, _ingress(), chunksize=10):
            print(results)

Source File: smolecule.py From medaka with Mozilla Public License 2.0

6 votes

def write_bam(fname, alignments, header, bam=True):
    """Write a `.bam` file for a set of alignments.

    :param fname: output filename.
    :param alignments: a list of `Alignment` tuples.
    :param header: bam header
    :param bam: write bam, else sam

    """
    mode = 'wb' if bam else 'w'
    with pysam.AlignmentFile(fname, mode, header=header) as fh:
        for ref_id, subreads in enumerate(alignments):
            for aln in sorted(subreads, key=lambda x: x.rstart):
                a = pysam.AlignedSegment()
                a.reference_id = ref_id
                a.query_name = aln.qname
                a.query_sequence = aln.seq
                a.reference_start = aln.rstart
                a.cigarstring = aln.cigar
                a.flag = aln.flag
                a.mapping_quality = 60
                fh.write(a)
    if mode == 'wb':
        pysam.index(fname)

Source File: BAMtoJunctionBED_alt.py From altanalyze with Apache License 2.0

6 votes

def exportIndexes(input_dir):
    import unique
    bam_dirs = unique.read_directory(input_dir)
    print 'Building BAM index files',
    for file in bam_dirs:
        if string.lower(file[-4:]) == '.bam':
            bam_dir = input_dir+'/'+file
            bamf = pysam.AlignmentFile(bam_dir, "rb" )
            ### Is there an indexed .bai for the BAM? Check.
            try:
                for entry in bamf.fetch():
                    codes = map(lambda x: x[0],entry.cigar)
                    break
            except Exception:
                ### Make BAM Indexv lciv9df8scivx 
                print '.',
                bam_dir = str(bam_dir)
                #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
                pysam.index(bam_dir)

Source File: BAMtoJunctionBED.py From altanalyze with Apache License 2.0

6 votes

def exportIndexes(input_dir):
    import unique
    bam_dirs = unique.read_directory(input_dir)
    print 'Building BAM index files',
    for file in bam_dirs:
        if string.lower(file[-4:]) == '.bam':
            bam_dir = input_dir+'/'+file
            bamf = pysam.Samfile(bam_dir, "rb" )
            ### Is there an indexed .bai for the BAM? Check.
            try:
                for entry in bamf.fetch():
                    codes = map(lambda x: x[0],entry.cigar)
                    break
            except Exception:
                ### Make BAM Indexv lciv9df8scivx 
                print '.',
                bam_dir = str(bam_dir)
                #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
                pysam.index(bam_dir)
                bamf = pysam.Samfile(bam_dir, "rb" )

Source File: Opossum.py From Opossum with GNU General Public License v3.0

6 votes

def CigarIndex(cigar, pos) :

	position = 0
	k = 0 # cigar index
	addpos = pos # extra positions

	for cigartype, cigarlength in cigar :

		position += cigarlength

		if position < pos : 
			k += 1
			addpos -= cigarlength
		else : return k, addpos



# Converts string of cigar symbol characters into the format used by Pysam

Source File: PipelinePeakcalling.py From CGATPipelines with MIT License

6 votes

def getMacsPeakShiftEstimate(infile):
    '''
    Parses the peak shift estimate file from the estimateInsertSize
    function and returns the fragment size, which is used in the macs2
    postprocessing steps as the "offset" for bed2table
    Parameters
    ----------
    infile: str
        path to input file
    '''

    with IOTools.openFile(infile, "r") as inf:

        header = inf.readline().strip().split("\t")
        values = inf.readline().strip().split("\t")

    fragment_size_mean_ix = header.index("fragmentsize_mean")
    fragment_size = int(float(values[fragment_size_mean_ix]))

    return fragment_size

Source File: PipelinePeakcalling.py From CGATPipelines with MIT License

6 votes

def makeBamLink(currentname, newname):
    '''
    Makes soft links to an existing bam file and its index - used instead
    of copying files.
    Generates and runs a command line statement.

    Parameters:
    currentname: str
        path to original file
    newname: str
        path to link location
    '''
    cwd = os.getcwd()
    os.system("""
    ln -s %(cwd)s/%(currentname)s %(cwd)s/%(newname)s;
    ln -s %(cwd)s/%(currentname)s.bai %(cwd)s/%(newname)s.bai;
    """ % locals())

Source File: PipelineIDR.py From CGATPipelines with MIT License

6 votes

def mergeBams(infile_list, outfile):
    infile_list = " ".join(infile_list)
    job_memory = "5G"
    statement = ("samtools merge - %(infile_list)s"
                 " | samtools sort - -o %(outfile)s"
                 " 2>%(outfile)s.log;"
                 " checkpoint;"
                 " samtools index %(outfile)s"
                 " 2>%(outfile)s.bai.log")
    P.run()

##########################################################################
##########################################################################
##########################################################################
# Run Peak Calling For IDR
##########################################################################

Source File: download_example_data.py From ga4gh-server with Apache License 2.0

5 votes

def _downloadBam(self, sample):
        samplePath = '{}/alignment/'.format(sample)
        study = self.studyMap[sample]
        sourceFileName = (
            '{}.mapped.ILLUMINA.bwa.{}.'
            'low_coverage.20120522.bam'.format(sample, study))
        destFileName = os.path.join(
            self.dirName, "{}.bam".format(sample))
        baseUrl = self.getBamBaseUrl()
        sampleUrl = os.path.join(baseUrl, samplePath, sourceFileName)
        indexUrl = sampleUrl + ".bai"
        localIndexFile = os.path.join(self.tempDir, sourceFileName + ".bai")
        self._downloadIndex(indexUrl, localIndexFile)
        remoteFile = pysam.AlignmentFile(
            sampleUrl, filepath_index=localIndexFile)
        header = self.createBamHeader(remoteFile.header)
        self.log("Writing '{}'".format(destFileName))
        localFile = pysam.AlignmentFile(
            destFileName, 'wb', header=header)
        for chromosome in self.chromosomes:
            self.log("chromosome {}".format(chromosome))
            iterator = remoteFile.fetch(
                chromosome.encode('utf-8'),
                start=self.chromMinMax.getMinPos(chromosome),
                end=self.chromMinMax.getMaxPos(chromosome))
            for index, record in enumerate(iterator):
                # We only write records where we have the references
                # for the next mate. TODO we should take the positions
                # of these reads into account later when calculating
                # our reference bounds.
                if record.next_reference_id < self.numChromosomes:
                    if index >= self.maxReads:
                        break
                    localFile.write(record)
            self.log("{} records written".format(index))
        remoteFile.close()
        localFile.close()
        self.log("Indexing '{}'".format(destFileName))
        pysam.index(destFileName.encode('utf-8'))
        self.bamFilePaths.append(
            (destFileName, destFileName + ".bai"))

Source File: PipelinePeakcalling.py From CGATPipelines with MIT License

5 votes

def mergeSortIndex(bamfiles, out):
    '''
    Merge bamfiles into a single sorted, indexed outfile.
    Generates and runs a command line statement.

    Example Statement:
    samtools merge ctmpljriXY.bam K9-13-1_filtered.bam K9-13-2_filtered.bam
    K9-13-3_filtered.bam;
    samtools sort ctmpljriXY.bam -o ctmpYH6llm.bam;
    samtools index ctmpYH6llm.bam;
    mv ctmpYH6llm.bam 13_Heart_pooled_filtered.bam;
    mv ctmpYH6llm.bam.bai 13_Heart_pooled_filtered.bam.bai;

    Parameters
    ----------
    bamfiles: list
        list of paths to bam files to merge
    out: str
        path to output file

    '''
    infiles = " ".join(bamfiles)
    T1 = P.getTempFilename(".")
    T2 = P.getTempFilename(".")
    statement = """samtools merge %(T1)s.bam %(infiles)s;
    samtools sort %(T1)s.bam -o %(T2)s.bam;
    samtools index %(T2)s.bam;
    mv %(T2)s.bam %(out)s;
    mv %(T2)s.bam.bai %(out)s.bai""" % locals()
    P.run()
    os.remove("%s.bam" % T1)
    os.remove(T1)
    os.remove(T2)

Source File: utils.py From GelReportModels with Apache License 2.0

5 votes

def convert(self):
        # set flags
        if self.inputFileFormat == AlignmentFileConstants.SAM:
            inputFlags = "r"
        elif self.inputFileFormat == AlignmentFileConstants.BAM:
            inputFlags = "rb"
        if self.outputFileFormat == AlignmentFileConstants.SAM:
            outputFlags = "wh"
        elif self.outputFileFormat == AlignmentFileConstants.BAM:
            outputFlags = "wb"
        # open files
        inputFile = pysam.AlignmentFile(
            self.args.inputFile, inputFlags)
        outputFile = pysam.AlignmentFile(
            self.args.outputFile, outputFlags, header=inputFile.header)
        outputFilePath = outputFile.filename
        log("Creating alignment file '{}'".format(outputFilePath))
        # write new file
        for _ in range(self.args.numLines):
            alignedSegment = inputFile.next()
            outputFile.write(alignedSegment)
        # clean up
        inputFile.close()
        outputFile.close()
        # create index file
        if (not self.args.skipIndexing and
                self.outputFileFormat == AlignmentFileConstants.BAM):
            indexFilePath = "{}.{}".format(
                outputFilePath, AlignmentFileConstants.BAI.lower())
            log("Creating index file '{}'".format(indexFilePath))
            pysam.index(outputFilePath)

Source File: pipeline_cufflinks_optimization.py From CGATPipelines with MIT License

5 votes

def indexBam(infile, outfile):
    '''
    index the reduced bam file
    '''
    pysam.index(infile)

###################################################################
###################################################################
###################################################################

Source File: app.py From svviz with MIT License

5 votes

def saveReads(dataHub, nameExtra=None):
    if dataHub.args.save_reads:
        logging.info("* Saving relevant reads *")
        for i, sample in enumerate(dataHub):
            outbam_path = dataHub.args.save_reads
            if not outbam_path.endswith(".bam"):
                outbam_path += ".bam"

            if len(dataHub.samples) > 1:
                logging.debug("Using i = {}".format(i))
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(i))

            if nameExtra is not None:
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(nameExtra))

            logging.info("  Outpath: {}".format(outbam_path))

            # print out just the reads we're interested for use later
            bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam)
            for read in sample.reads:
                bam_small.write(read)

            for read in sample.readStatistics.reads:
                bam_small.write(read)

            bam_small.close()
            sorted_path = outbam_path.replace(".bam", ".sorted")
            pysam.sort(outbam_path, sorted_path)
            pysam.index(sorted_path+".bam")

Source File: download_example_data.py From ga4gh-server with Apache License 2.0

5 votes

def _downloadIndex(self, indexUrl, localIndexFile):
        self.log("Downloading index from {} to {}".format(
            indexUrl, localIndexFile))
        response = urllib2.urlopen(indexUrl)
        with open(localIndexFile, "w") as destFile:
            destFile.write(response.read())

Source File: PipelineChipseq.py From CGATPipelines with MIT License

5 votes

def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to given number of counts
       by random sampling
    '''
    infile, countfile = infiles

    pysam_in = pysam.Samfile(infile, "rb")

    fh = IOTools.openFile(countfile, "r")
    readcount = int(fh.read())
    fh.close()

    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))

Source File: PipelinePeakcalling.py From CGATPipelines with MIT License

5 votes

def normalize(infile, larger_nreads, outfile, smaller_nreads):
    threshold = float(smaller_nreads) / float(larger_nreads)

    pysam_in = pysam.Samfile(infile, "rb")
    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    for read in pysam_in.fetch():
        if random.random() <= threshold:
            pysam_out.write(read)

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    return outfile

Source File: PipelineIntervals.py From CGATPipelines with MIT License

5 votes

def buildSimpleNormalizedBAM(bamfile, outfile, nreads):
    '''normalize a bam file to given number of counts
       by random sampling

    This method assumes that unmapped reads and multi-mapping
    reads have been removed from the :term:`bam` file.

    Arguments
    ---------
    bamfile : string
        Input file in :term:`bam` format.
    outfile : string
        Filename of output file in :term:`bam` format.
    nreads :int
        Number of reads to normalize to.
    '''

    readcount = BamTools.getNumReads(bamfile)

    pysam_in = pysam.Samfile(bamfile, "rb")

    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))

Source File: misc.py From slamdunk with GNU Affero General Public License v3.0

5 votes

def pysamIndex(outputBam):
    pysam.index(outputBam)  # @UndefinedVariable

Source File: tcounter.py From slamdunk with GNU Affero General Public License v3.0

5 votes

def pysamIndex(outputBam):
    pysam.index(outputBam)  # @UndefinedVariable

Source File: Opossum.py From Opossum with GNU General Public License v3.0

5 votes

def ReadCigarLength(cigar, k=100) :

	if k < len(cigar) :
		cigarlist = cigar[:(k+1)]
	else :
		cigarlist = cigar

	l = sum([c[1] for c in cigarlist])
		
	return l


# Input cigar and pos denoting index within a cigar transformed into a string sequence.
# Return what is the index in the original cigar and how many extra positions from the beginning
# of this index is the current position.

Source File: Opossum.py From Opossum with GNU General Public License v3.0

5 votes

def UpdateFlag(flag) :

	if (flag & 16) == 16 : return 16
	else : return 0


# Returns the cigar length when cigar is expressed base by base,
# up to the index k of cigarlist (default is a very large index)

Source File: Opossum.py From Opossum with GNU General Public License v3.0

5 votes

def MatchingBases(md, cigar, FromBeginning) :

	if not FromBeginning : 
		md = md[::-1]
		cigar = cigar[::-1]

	i = 0 # md index
	number = ''
	mlen = len(md)

	while i < mlen and md[i] in '0123456789' :
		number += md[i]
		i += 1

	if not number : number = '0'
	if FromBeginning : number = int(number)
	else : number = int(number[::-1])

	# Go through cigar to find possible inserts
	clen = 0	
	for cigartype, cigarlength in cigar :
		if cigartype in [1,3] :
			break	
		elif cigartype == 5 :
			pass
		else :
			clen += cigarlength

	if number < clen :
		number += AddClips(cigar, True) # cigar has been reversed here!

	return min(number, clen)


# Takes flag as input and returns whether alignment is primary (True) or not (False).
# If additional input parameter is set to True, then also check whether read is properly
# paired (definition of properly paired depends on mapper).

Source File: alignment_file.py From ga4gh-server with Apache License 2.0

5 votes

def parseArgs(self):
        description = "{} to {} conversion tool".format(
            self.inputFileFormat, self.outputFileFormat)
        parser = argparse.ArgumentParser(
            description=description)
        inputHelpText = "the name of the {} file to read".format(
            self.inputFileFormat)
        parser.add_argument(
            "inputFile", help=inputHelpText)
        outputHelpText = "the name of the {} file to write".format(
            self.outputFileFormat)
        defaultOutputFilePath = "out.{}".format(
            self.outputFileFormat.lower())
        parser.add_argument(
            "--outputFile", "-o", default=defaultOutputFilePath,
            help=outputHelpText)
        parser.add_argument(
            "--numLines", "-n", default=10,
            help="the number of lines to write")
        parser.add_argument(
            "--skipIndexing", default=False, action='store_true',
            help="don't create an index file")
        args = parser.parse_args()
        self.args = args

Source File: alignment_file.py From ga4gh-server with Apache License 2.0

5 votes

def convert(self):
        # set flags
        if self.inputFileFormat == AlignmentFileConstants.SAM:
            inputFlags = "r"
        elif self.inputFileFormat == AlignmentFileConstants.BAM:
            inputFlags = "rb"
        if self.outputFileFormat == AlignmentFileConstants.SAM:
            outputFlags = "wh"
        elif self.outputFileFormat == AlignmentFileConstants.BAM:
            outputFlags = "wb"
        # open files
        inputFile = pysam.AlignmentFile(
            self.args.inputFile, inputFlags)
        outputFile = pysam.AlignmentFile(
            self.args.outputFile, outputFlags, header=inputFile.header)
        outputFilePath = outputFile.filename
        utils.log("Creating alignment file '{}'".format(outputFilePath))
        # write new file
        for _ in xrange(self.args.numLines):
            alignedSegment = inputFile.next()
            outputFile.write(alignedSegment)
        # clean up
        inputFile.close()
        outputFile.close()
        # create index file
        if (not self.args.skipIndexing and
                self.outputFileFormat == AlignmentFileConstants.BAM):
            indexFilePath = "{}.{}".format(
                outputFilePath, AlignmentFileConstants.BAI.lower())
            utils.log("Creating index file '{}'".format(indexFilePath))
            pysam.index(outputFilePath)

Source File: PipelinePeakcalling.py From CGATPipelines with MIT License

5 votes

def getContigSizes(idx):
    idx_file = pd.read_csv(idx, "\t",
                           header=None,
                           names=["Name", "Length", "Mapped", "Unmapped"])

    contigs = idx_file[['Name', 'Length']]

    contig_file = idx.replace(".idxstats", ".contig")

    contigs.to_csv(contig_file, sep="\t", header=None, index=None)

    return contig_file

Python pysam.index() Examples