Python pysam.index() Examples
The following are 30
code examples of pysam.index().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pysam
, or try the search function
.
Example #1
Source File: pipeline_cufflinks_optimization.py From CGATPipelines with MIT License | 6 votes |
def linkBamToWorkingDirs(infiles, outfile): ''' symlink the bam file and index to the working directories for execution of the transcript building pipeline ''' bamfile = P.snip(infiles[0], ".bai") indexfile = infiles[0] directories = [P.snip(logfile, ".log") for logfile in infiles[1]] for directory in directories: os.symlink(os.path.abspath(bamfile), os.path.join(directory, bamfile)) os.symlink( os.path.abspath(indexfile), os.path.join(directory, indexfile)) updateFile(outfile) ################################################################### ################################################################### ###################################################################
Example #2
Source File: download_example_data.py From ga4gh-server with Apache License 2.0 | 6 votes |
def createBamHeader(self, baseHeader): """ Creates a new bam header based on the specified header from the parent BAM file. """ header = dict(baseHeader) newSequences = [] for index, referenceInfo in enumerate(header['SQ']): if index < self.numChromosomes: referenceName = referenceInfo['SN'] # The sequence dictionary in the BAM file has to match up # with the sequence ids in the data, so we must be sure # that these still match up. assert referenceName == self.chromosomes[index] newReferenceInfo = { 'AS': self.referenceSetName, 'SN': referenceName, 'LN': 0, # FIXME 'UR': 'http://example.com', 'M5': 'dbb6e8ece0b5de29da56601613007c2a', # FIXME 'SP': 'Human' } newSequences.append(newReferenceInfo) header['SQ'] = newSequences return header
Example #3
Source File: utils.py From smallrnaseq with GNU General Public License v3.0 | 6 votes |
def get_aligned_reads(samfile, collapsed=None, readcounts=None): """Get all aligned reads from a sam file into a pandas dataframe""" sam = HTSeq.SAM_Reader(str(samfile)) f=[] for a in sam: if a.aligned == True: seq = a.read.seq.decode() f.append((seq,a.read.name,a.iv.chrom,a.iv.start,a.iv.end,a.iv.strand)) #else: # f.append((seq,a.read.name,'_unmapped')) counts = pd.DataFrame(f, columns=['seq','read','name','start','end','strand']) counts['length'] = counts.seq.str.len() counts = counts.drop(['read'],1) if collapsed is not None: readcounts = read_collapsed_file(collapsed) if readcounts is not None: counts = counts.merge(readcounts, on='seq') counts['align_id'] = counts.index return counts
Example #4
Source File: utils.py From GelReportModels with Apache License 2.0 | 6 votes |
def parseArgs(self): description = "{} to {} conversion tool".format( self.inputFileFormat, self.outputFileFormat) parser = argparse.ArgumentParser( description=description) inputHelpText = "the name of the {} file to read".format( self.inputFileFormat) parser.add_argument( "inputFile", help=inputHelpText) outputHelpText = "the name of the {} file to write".format( self.outputFileFormat) defaultOutputFilePath = "out.{}".format( self.outputFileFormat.lower()) parser.add_argument( "--outputFile", "-o", default=defaultOutputFilePath, help=outputHelpText) parser.add_argument( "--numLines", "-n", default=10, help="the number of lines to write") parser.add_argument( "--skipIndexing", default=False, action='store_true', help="don't create an index file") args = parser.parse_args() self.args = args
Example #5
Source File: sambamconverter.py From READemption with ISC License | 6 votes |
def sam_to_bam(self, sam_path, bam_path_prefix): if self._sam_file_is_empty(sam_path) is True: # pysam will generate an error if an emtpy SAM file will # be converted. Due to this an empty bam file with the # same header information will be generated from scratch self._generate_empty_bam_file(sam_path, bam_path_prefix) # Remove SAM file os.remove(sam_path) return temp_unsorted_bam_path = self._temp_unsorted_bam_path( bam_path_prefix) # Generate unsorted BAM file pysam.samtools.view( "-b", "-o{}".format(temp_unsorted_bam_path), sam_path, catch_stdout=False) # Generate sorted BAM file pysam.sort(temp_unsorted_bam_path, "-o", bam_path_prefix + ".bam") # Generate index for BAM file pysam.index("%s.bam" % bam_path_prefix) # Remove unsorted BAM file os.remove(temp_unsorted_bam_path) # Remove SAM file os.remove(sam_path)
Example #6
Source File: rle.py From medaka with Mozilla Public License 2.0 | 6 votes |
def rlebam(args): """Entry point for merging run length information for fast5s to bam.""" logger = medaka.common.get_named_logger('BAMDecor') read_index = medaka.common.read_key_value_tsv(args.read_index) logger.info("Found {} read in index\n".format(len(read_index))) def _ingress(): for line in sys.stdin: if line[0] == '@': yield line.rstrip(), None, None, None else: read_id, flag, _ = line.split('\t', 2) is_rev = bool(int(flag) & 16) fname = read_index[read_id] yield line.rstrip(), read_id, is_rev, fname with concurrent.futures.ProcessPoolExecutor( max_workers=args.workers) as executor: for results in executor.map( _rle_bam_hdf_worker, _ingress(), chunksize=10): print(results)
Example #7
Source File: smolecule.py From medaka with Mozilla Public License 2.0 | 6 votes |
def write_bam(fname, alignments, header, bam=True): """Write a `.bam` file for a set of alignments. :param fname: output filename. :param alignments: a list of `Alignment` tuples. :param header: bam header :param bam: write bam, else sam """ mode = 'wb' if bam else 'w' with pysam.AlignmentFile(fname, mode, header=header) as fh: for ref_id, subreads in enumerate(alignments): for aln in sorted(subreads, key=lambda x: x.rstart): a = pysam.AlignedSegment() a.reference_id = ref_id a.query_name = aln.qname a.query_sequence = aln.seq a.reference_start = aln.rstart a.cigarstring = aln.cigar a.flag = aln.flag a.mapping_quality = 60 fh.write(a) if mode == 'wb': pysam.index(fname)
Example #8
Source File: BAMtoJunctionBED_alt.py From altanalyze with Apache License 2.0 | 6 votes |
def exportIndexes(input_dir): import unique bam_dirs = unique.read_directory(input_dir) print 'Building BAM index files', for file in bam_dirs: if string.lower(file[-4:]) == '.bam': bam_dir = input_dir+'/'+file bamf = pysam.AlignmentFile(bam_dir, "rb" ) ### Is there an indexed .bai for the BAM? Check. try: for entry in bamf.fetch(): codes = map(lambda x: x[0],entry.cigar) break except Exception: ### Make BAM Indexv lciv9df8scivx print '.', bam_dir = str(bam_dir) #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False pysam.index(bam_dir)
Example #9
Source File: BAMtoJunctionBED.py From altanalyze with Apache License 2.0 | 6 votes |
def exportIndexes(input_dir): import unique bam_dirs = unique.read_directory(input_dir) print 'Building BAM index files', for file in bam_dirs: if string.lower(file[-4:]) == '.bam': bam_dir = input_dir+'/'+file bamf = pysam.Samfile(bam_dir, "rb" ) ### Is there an indexed .bai for the BAM? Check. try: for entry in bamf.fetch(): codes = map(lambda x: x[0],entry.cigar) break except Exception: ### Make BAM Indexv lciv9df8scivx print '.', bam_dir = str(bam_dir) #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False pysam.index(bam_dir) bamf = pysam.Samfile(bam_dir, "rb" )
Example #10
Source File: Opossum.py From Opossum with GNU General Public License v3.0 | 6 votes |
def CigarIndex(cigar, pos) : position = 0 k = 0 # cigar index addpos = pos # extra positions for cigartype, cigarlength in cigar : position += cigarlength if position < pos : k += 1 addpos -= cigarlength else : return k, addpos # Converts string of cigar symbol characters into the format used by Pysam
Example #11
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License | 6 votes |
def getMacsPeakShiftEstimate(infile): ''' Parses the peak shift estimate file from the estimateInsertSize function and returns the fragment size, which is used in the macs2 postprocessing steps as the "offset" for bed2table Parameters ---------- infile: str path to input file ''' with IOTools.openFile(infile, "r") as inf: header = inf.readline().strip().split("\t") values = inf.readline().strip().split("\t") fragment_size_mean_ix = header.index("fragmentsize_mean") fragment_size = int(float(values[fragment_size_mean_ix])) return fragment_size
Example #12
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License | 6 votes |
def makeBamLink(currentname, newname): ''' Makes soft links to an existing bam file and its index - used instead of copying files. Generates and runs a command line statement. Parameters: currentname: str path to original file newname: str path to link location ''' cwd = os.getcwd() os.system(""" ln -s %(cwd)s/%(currentname)s %(cwd)s/%(newname)s; ln -s %(cwd)s/%(currentname)s.bai %(cwd)s/%(newname)s.bai; """ % locals())
Example #13
Source File: PipelineIDR.py From CGATPipelines with MIT License | 6 votes |
def mergeBams(infile_list, outfile): infile_list = " ".join(infile_list) job_memory = "5G" statement = ("samtools merge - %(infile_list)s" " | samtools sort - -o %(outfile)s" " 2>%(outfile)s.log;" " checkpoint;" " samtools index %(outfile)s" " 2>%(outfile)s.bai.log") P.run() ########################################################################## ########################################################################## ########################################################################## # Run Peak Calling For IDR ##########################################################################
Example #14
Source File: download_example_data.py From ga4gh-server with Apache License 2.0 | 5 votes |
def _downloadBam(self, sample): samplePath = '{}/alignment/'.format(sample) study = self.studyMap[sample] sourceFileName = ( '{}.mapped.ILLUMINA.bwa.{}.' 'low_coverage.20120522.bam'.format(sample, study)) destFileName = os.path.join( self.dirName, "{}.bam".format(sample)) baseUrl = self.getBamBaseUrl() sampleUrl = os.path.join(baseUrl, samplePath, sourceFileName) indexUrl = sampleUrl + ".bai" localIndexFile = os.path.join(self.tempDir, sourceFileName + ".bai") self._downloadIndex(indexUrl, localIndexFile) remoteFile = pysam.AlignmentFile( sampleUrl, filepath_index=localIndexFile) header = self.createBamHeader(remoteFile.header) self.log("Writing '{}'".format(destFileName)) localFile = pysam.AlignmentFile( destFileName, 'wb', header=header) for chromosome in self.chromosomes: self.log("chromosome {}".format(chromosome)) iterator = remoteFile.fetch( chromosome.encode('utf-8'), start=self.chromMinMax.getMinPos(chromosome), end=self.chromMinMax.getMaxPos(chromosome)) for index, record in enumerate(iterator): # We only write records where we have the references # for the next mate. TODO we should take the positions # of these reads into account later when calculating # our reference bounds. if record.next_reference_id < self.numChromosomes: if index >= self.maxReads: break localFile.write(record) self.log("{} records written".format(index)) remoteFile.close() localFile.close() self.log("Indexing '{}'".format(destFileName)) pysam.index(destFileName.encode('utf-8')) self.bamFilePaths.append( (destFileName, destFileName + ".bai"))
Example #15
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License | 5 votes |
def mergeSortIndex(bamfiles, out): ''' Merge bamfiles into a single sorted, indexed outfile. Generates and runs a command line statement. Example Statement: samtools merge ctmpljriXY.bam K9-13-1_filtered.bam K9-13-2_filtered.bam K9-13-3_filtered.bam; samtools sort ctmpljriXY.bam -o ctmpYH6llm.bam; samtools index ctmpYH6llm.bam; mv ctmpYH6llm.bam 13_Heart_pooled_filtered.bam; mv ctmpYH6llm.bam.bai 13_Heart_pooled_filtered.bam.bai; Parameters ---------- bamfiles: list list of paths to bam files to merge out: str path to output file ''' infiles = " ".join(bamfiles) T1 = P.getTempFilename(".") T2 = P.getTempFilename(".") statement = """samtools merge %(T1)s.bam %(infiles)s; samtools sort %(T1)s.bam -o %(T2)s.bam; samtools index %(T2)s.bam; mv %(T2)s.bam %(out)s; mv %(T2)s.bam.bai %(out)s.bai""" % locals() P.run() os.remove("%s.bam" % T1) os.remove(T1) os.remove(T2)
Example #16
Source File: utils.py From GelReportModels with Apache License 2.0 | 5 votes |
def convert(self): # set flags if self.inputFileFormat == AlignmentFileConstants.SAM: inputFlags = "r" elif self.inputFileFormat == AlignmentFileConstants.BAM: inputFlags = "rb" if self.outputFileFormat == AlignmentFileConstants.SAM: outputFlags = "wh" elif self.outputFileFormat == AlignmentFileConstants.BAM: outputFlags = "wb" # open files inputFile = pysam.AlignmentFile( self.args.inputFile, inputFlags) outputFile = pysam.AlignmentFile( self.args.outputFile, outputFlags, header=inputFile.header) outputFilePath = outputFile.filename log("Creating alignment file '{}'".format(outputFilePath)) # write new file for _ in range(self.args.numLines): alignedSegment = inputFile.next() outputFile.write(alignedSegment) # clean up inputFile.close() outputFile.close() # create index file if (not self.args.skipIndexing and self.outputFileFormat == AlignmentFileConstants.BAM): indexFilePath = "{}.{}".format( outputFilePath, AlignmentFileConstants.BAI.lower()) log("Creating index file '{}'".format(indexFilePath)) pysam.index(outputFilePath)
Example #17
Source File: pipeline_cufflinks_optimization.py From CGATPipelines with MIT License | 5 votes |
def indexBam(infile, outfile): ''' index the reduced bam file ''' pysam.index(infile) ################################################################### ################################################################### ###################################################################
Example #18
Source File: app.py From svviz with MIT License | 5 votes |
def saveReads(dataHub, nameExtra=None): if dataHub.args.save_reads: logging.info("* Saving relevant reads *") for i, sample in enumerate(dataHub): outbam_path = dataHub.args.save_reads if not outbam_path.endswith(".bam"): outbam_path += ".bam" if len(dataHub.samples) > 1: logging.debug("Using i = {}".format(i)) outbam_path = outbam_path.replace(".bam", ".{}.bam".format(i)) if nameExtra is not None: outbam_path = outbam_path.replace(".bam", ".{}.bam".format(nameExtra)) logging.info(" Outpath: {}".format(outbam_path)) # print out just the reads we're interested for use later bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam) for read in sample.reads: bam_small.write(read) for read in sample.readStatistics.reads: bam_small.write(read) bam_small.close() sorted_path = outbam_path.replace(".bam", ".sorted") pysam.sort(outbam_path, sorted_path) pysam.index(sorted_path+".bam")
Example #19
Source File: download_example_data.py From ga4gh-server with Apache License 2.0 | 5 votes |
def _downloadIndex(self, indexUrl, localIndexFile): self.log("Downloading index from {} to {}".format( indexUrl, localIndexFile)) response = urllib2.urlopen(indexUrl) with open(localIndexFile, "w") as destFile: destFile.write(response.read())
Example #20
Source File: PipelineChipseq.py From CGATPipelines with MIT License | 5 votes |
def buildSimpleNormalizedBAM(infiles, outfile, nreads): '''normalize a bam file to given number of counts by random sampling ''' infile, countfile = infiles pysam_in = pysam.Samfile(infile, "rb") fh = IOTools.openFile(countfile, "r") readcount = int(fh.read()) fh.close() threshold = float(nreads) / float(readcount) pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in) # iterate over mapped reads thinning by the threshold ninput, noutput = 0, 0 for read in pysam_in.fetch(): ninput += 1 if random.random() <= threshold: pysam_out.write(read) noutput += 1 pysam_in.close() pysam_out.close() pysam.index(outfile) E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" % (ninput, noutput, 100.0 * noutput / ninput, nreads))
Example #21
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License | 5 votes |
def normalize(infile, larger_nreads, outfile, smaller_nreads): threshold = float(smaller_nreads) / float(larger_nreads) pysam_in = pysam.Samfile(infile, "rb") pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in) for read in pysam_in.fetch(): if random.random() <= threshold: pysam_out.write(read) pysam_in.close() pysam_out.close() pysam.index(outfile) return outfile
Example #22
Source File: PipelineIntervals.py From CGATPipelines with MIT License | 5 votes |
def buildSimpleNormalizedBAM(bamfile, outfile, nreads): '''normalize a bam file to given number of counts by random sampling This method assumes that unmapped reads and multi-mapping reads have been removed from the :term:`bam` file. Arguments --------- bamfile : string Input file in :term:`bam` format. outfile : string Filename of output file in :term:`bam` format. nreads :int Number of reads to normalize to. ''' readcount = BamTools.getNumReads(bamfile) pysam_in = pysam.Samfile(bamfile, "rb") threshold = float(nreads) / float(readcount) pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in) # iterate over mapped reads thinning by the threshold ninput, noutput = 0, 0 for read in pysam_in.fetch(): ninput += 1 if random.random() <= threshold: pysam_out.write(read) noutput += 1 pysam_in.close() pysam_out.close() pysam.index(outfile) E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" % (ninput, noutput, 100.0 * noutput / ninput, nreads))
Example #23
Source File: misc.py From slamdunk with GNU Affero General Public License v3.0 | 5 votes |
def pysamIndex(outputBam): pysam.index(outputBam) # @UndefinedVariable
Example #24
Source File: tcounter.py From slamdunk with GNU Affero General Public License v3.0 | 5 votes |
def pysamIndex(outputBam): pysam.index(outputBam) # @UndefinedVariable
Example #25
Source File: Opossum.py From Opossum with GNU General Public License v3.0 | 5 votes |
def ReadCigarLength(cigar, k=100) : if k < len(cigar) : cigarlist = cigar[:(k+1)] else : cigarlist = cigar l = sum([c[1] for c in cigarlist]) return l # Input cigar and pos denoting index within a cigar transformed into a string sequence. # Return what is the index in the original cigar and how many extra positions from the beginning # of this index is the current position.
Example #26
Source File: Opossum.py From Opossum with GNU General Public License v3.0 | 5 votes |
def UpdateFlag(flag) : if (flag & 16) == 16 : return 16 else : return 0 # Returns the cigar length when cigar is expressed base by base, # up to the index k of cigarlist (default is a very large index)
Example #27
Source File: Opossum.py From Opossum with GNU General Public License v3.0 | 5 votes |
def MatchingBases(md, cigar, FromBeginning) : if not FromBeginning : md = md[::-1] cigar = cigar[::-1] i = 0 # md index number = '' mlen = len(md) while i < mlen and md[i] in '0123456789' : number += md[i] i += 1 if not number : number = '0' if FromBeginning : number = int(number) else : number = int(number[::-1]) # Go through cigar to find possible inserts clen = 0 for cigartype, cigarlength in cigar : if cigartype in [1,3] : break elif cigartype == 5 : pass else : clen += cigarlength if number < clen : number += AddClips(cigar, True) # cigar has been reversed here! return min(number, clen) # Takes flag as input and returns whether alignment is primary (True) or not (False). # If additional input parameter is set to True, then also check whether read is properly # paired (definition of properly paired depends on mapper).
Example #28
Source File: alignment_file.py From ga4gh-server with Apache License 2.0 | 5 votes |
def parseArgs(self): description = "{} to {} conversion tool".format( self.inputFileFormat, self.outputFileFormat) parser = argparse.ArgumentParser( description=description) inputHelpText = "the name of the {} file to read".format( self.inputFileFormat) parser.add_argument( "inputFile", help=inputHelpText) outputHelpText = "the name of the {} file to write".format( self.outputFileFormat) defaultOutputFilePath = "out.{}".format( self.outputFileFormat.lower()) parser.add_argument( "--outputFile", "-o", default=defaultOutputFilePath, help=outputHelpText) parser.add_argument( "--numLines", "-n", default=10, help="the number of lines to write") parser.add_argument( "--skipIndexing", default=False, action='store_true', help="don't create an index file") args = parser.parse_args() self.args = args
Example #29
Source File: alignment_file.py From ga4gh-server with Apache License 2.0 | 5 votes |
def convert(self): # set flags if self.inputFileFormat == AlignmentFileConstants.SAM: inputFlags = "r" elif self.inputFileFormat == AlignmentFileConstants.BAM: inputFlags = "rb" if self.outputFileFormat == AlignmentFileConstants.SAM: outputFlags = "wh" elif self.outputFileFormat == AlignmentFileConstants.BAM: outputFlags = "wb" # open files inputFile = pysam.AlignmentFile( self.args.inputFile, inputFlags) outputFile = pysam.AlignmentFile( self.args.outputFile, outputFlags, header=inputFile.header) outputFilePath = outputFile.filename utils.log("Creating alignment file '{}'".format(outputFilePath)) # write new file for _ in xrange(self.args.numLines): alignedSegment = inputFile.next() outputFile.write(alignedSegment) # clean up inputFile.close() outputFile.close() # create index file if (not self.args.skipIndexing and self.outputFileFormat == AlignmentFileConstants.BAM): indexFilePath = "{}.{}".format( outputFilePath, AlignmentFileConstants.BAI.lower()) utils.log("Creating index file '{}'".format(indexFilePath)) pysam.index(outputFilePath)
Example #30
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License | 5 votes |
def getContigSizes(idx): idx_file = pd.read_csv(idx, "\t", header=None, names=["Name", "Length", "Mapped", "Unmapped"]) contigs = idx_file[['Name', 'Length']] contig_file = idx.replace(".idxstats", ".contig") contigs.to_csv(contig_file, sep="\t", header=None, index=None) return contig_file