Python pysam.sort() Examples
The following are 28 code examples of pysam.sort(), drawn from open-source projects. The source file, project, and license are listed above each example. You may also want to check out all available functions and classes of the pysam module.
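As a quick orientation before the project examples: pysam.sort() forwards its arguments to the samtools sort command line, so in current pysam the options and the input path are passed as separate string arguments. A minimal sketch with placeholder file names ("input.bam", "sorted.bam"):

import pysam

# Coordinate-sort into a new file, then index it.
pysam.sort("-o", "sorted.bam", "input.bam")
pysam.index("sorted.bam")

# Sort by read name (queryname order) using 4 threads.
pysam.sort("-n", "-@", "4", "-o", "name_sorted.bam", "input.bam")

Several of the older examples below instead use the legacy samtools convention, in which the final positional argument is an output prefix rather than an input path (see the note after Example #2).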
Example #1
Source File: filter.py From slamdunk with GNU Affero General Public License v3.0
def bamSort(outputBAM, log, newHeader, verbose):
    tmp = outputBAM + "_tmp"

    if(newHeader != None):
        pyOutputBAM = pysam.AlignmentFile(outputBAM, "rb")
        pyTmp = pysam.AlignmentFile(tmp, "wb", header=newHeader)
        for read in pyOutputBAM:
            pyTmp.write(read)
        pyOutputBAM.close()
        pyTmp.close()
    else:
        os.rename(outputBAM, tmp)

    # run(" ".join(["samtools", "sort", "-@", str(threads), tmp, replaceExtension(outFile, "")]), log, verbose=verbose, dry=dry)
    run(" ".join(["samtools sort", "-o", outputBAM, tmp]), log, verbose=verbose, dry=False)
    # pysam.sort(tmp, outputBAM)  # @UndefinedVariable

    removeFile(tmp)
Example #2
Source File: tepid.py From TEPID with GNU General Public License v3.0
def check_name_sorted(bam, p, prefix):
    """
    Sort bam file by name if position sorted
    """
    test_head = pysam.AlignmentFile(bam, 'rb')
    p = str(p)
    try:
        test_head.header['HD']['SO']
    except KeyError:
        print '  sorting bam file'
        pysam.sort('-@', p, '-n', bam, prefix + 'sorted.temp')
        os.remove(bam)
        os.rename(prefix + 'sorted.temp.bam', bam)
    else:
        if test_head.header['HD']['SO'] == 'queryname':
            pass
        else:
            print '  sorting bam file'
            pysam.sort('-@', p, '-n', bam, prefix + 'sorted.temp')
            os.remove(bam)
            os.rename(prefix + 'sorted.temp.bam', bam)
    test_head.close()
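The pysam.sort() calls above use the legacy samtools (pre-1.0) calling convention, in which the last positional argument is an output prefix and samtools appends ".bam" to it; that is why the function renames prefix + 'sorted.temp.bam' afterwards. A rough sketch of the equivalent under current pysam, assuming the same bam, p, and prefix variables from the example above:

import os
import pysam

# Name-sort straight to the final path; with samtools >= 1.0 no output
# prefix or rename of an implicitly created ".bam" file is needed.
sorted_temp = prefix + 'sorted.temp.bam'
pysam.sort('-n', '-@', str(p), '-o', sorted_temp, bam)
os.remove(bam)
os.rename(sorted_temp, bam)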
Example #3
Source File: sambamconverter.py From READemption with ISC License
def sam_to_bam(self, sam_path, bam_path_prefix):
    if self._sam_file_is_empty(sam_path) is True:
        # pysam will generate an error if an empty SAM file is
        # converted. Due to this, an empty BAM file with the
        # same header information is generated from scratch.
        self._generate_empty_bam_file(sam_path, bam_path_prefix)
        # Remove SAM file
        os.remove(sam_path)
        return
    temp_unsorted_bam_path = self._temp_unsorted_bam_path(bam_path_prefix)
    # Generate unsorted BAM file
    pysam.samtools.view(
        "-b", "-o{}".format(temp_unsorted_bam_path), sam_path,
        catch_stdout=False)
    # Generate sorted BAM file
    pysam.sort(temp_unsorted_bam_path, "-o", bam_path_prefix + ".bam")
    # Generate index for BAM file
    pysam.index("%s.bam" % bam_path_prefix)
    # Remove unsorted BAM file
    os.remove(temp_unsorted_bam_path)
    # Remove SAM file
    os.remove(sam_path)
Example #4
Source File: utils.py From smallrnaseq with GNU General Public License v3.0
def sam_to_bam(filename):
    """Convert sam to bam"""

    import pysam
    infile = pysam.AlignmentFile(filename, "r")
    name = os.path.splitext(filename)[0] + '.bam'
    bamfile = pysam.AlignmentFile(name, "wb", template=infile)
    for s in infile:
        bamfile.write(s)
    pysam.sort("-o", name, name)
    pysam.index(name)
    bamfile = pysam.AlignmentFile(name, "rb")
    return
Example #5
Source File: PipelineIntervals.py From CGATPipelines with MIT License
def createGenomeWindows(genome, outfile, windows):
    statement = '''
        cgat windows2gff
            --genome=%(genome)s
            --output-format=bed
            --fixed-width-windows=%(windows)s
            --log=%(outfile)s.log
        | bedtools sort -i stdin
        | gzip
        > %(outfile)s
    '''
    P.run()
Example #6
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License
def mergeSortIndex(bamfiles, out):
    '''
    Merge bamfiles into a single sorted, indexed outfile.

    Generates and runs a command line statement.

    Example Statement:
    samtools merge ctmpljriXY.bam K9-13-1_filtered.bam
        K9-13-2_filtered.bam K9-13-3_filtered.bam;
    samtools sort ctmpljriXY.bam -o ctmpYH6llm.bam;
    samtools index ctmpYH6llm.bam;
    mv ctmpYH6llm.bam 13_Heart_pooled_filtered.bam;
    mv ctmpYH6llm.bam.bai 13_Heart_pooled_filtered.bam.bai;

    Parameters
    ----------
    bamfiles: list
        list of paths to bam files to merge
    out: str
        path to output file
    '''
    infiles = " ".join(bamfiles)
    T1 = P.getTempFilename(".")
    T2 = P.getTempFilename(".")
    statement = """samtools merge %(T1)s.bam %(infiles)s;
                   samtools sort %(T1)s.bam -o %(T2)s.bam;
                   samtools index %(T2)s.bam;
                   mv %(T2)s.bam %(out)s;
                   mv %(T2)s.bam.bai %(out)s.bai""" % locals()
    P.run()
    os.remove("%s.bam" % T1)
    os.remove(T1)
    os.remove(T2)
Example #7
Source File: simulator.py From slamdunk with GNU Affero General Public License v3.0
def prepareBED(bed, slamSimBed, minLength):
    utrs = []
    for utr in BedIterator(bed):
        utrs.append(utr)

    utrs.sort(key=lambda x: (x.name, -x.getLength()))

    outBed = open(slamSimBed, "w")

    partList = []
    lastUtr = None
    for utr in utrs:
        if utr.hasStrand() and utr.hasNonEmptyName():
            currentUtr = utr.name
            if currentUtr == lastUtr:
                partList.append(utr)
            else:
                if(not lastUtr is None):
                    printUTR(partList[0], outBed, minLength)
                partList = [utr]
            lastUtr = currentUtr
        else:
            print("Warning: Invalid BED entry found: " + str(utr))

    if(not lastUtr is None):
        printUTR(partList[0], outBed, minLength)

    outBed.close()
Example #8
Source File: tepid.py From TEPID with GNU General Public License v3.0
def check_bam(bam, p, prefix, make_new_index=False):
    """
    Sort and index bam file
    returns dictionary of chromosome names and lengths
    """
    # check if sorted
    test_head = pysam.AlignmentFile(bam, 'rb')
    chrom_sizes = {}
    p = str(p)
    for i in test_head.header['SQ']:
        chrom_sizes[i['SN']] = int(i['LN'])
    try:
        test_head.header['HD']['SO']
    except KeyError:
        print '  sorting bam file'
        pysam.sort('-@', p, bam, prefix + 'sorted.temp')
        os.remove(bam)
        os.rename(prefix + 'sorted.temp.bam', bam)
    else:
        if test_head.header['HD']['SO'] == 'coordinate':
            pass
        else:
            print '  sorting bam file'
            pysam.sort('-@', p, bam, prefix + 'sorted.temp')
            os.remove(bam)
            os.rename(prefix + 'sorted.temp.bam', bam)
    test_head.close()
    # check if indexed
    if '{}.bai'.format(bam) in os.listdir('.') and make_new_index is False:
        pass
    else:
        print '  indexing bam file'
        pysam.index(bam)
    return chrom_sizes
Example #9
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License
def sortIndex(bamfile):
    '''
    Sorts and indexes a bam file.

    Generates and runs a command line statement.

    Example Statement:
    samtools sort K9-10-1_filtered.bam -o ctmpvHoczK.bam;
    samtools index ctmpvHoczK.bam;
    mv ctmpvHoczK.bam K9-10-1_filtered.bam;
    mv ctmpvHoczK.bam.bai K9-10-1_filtered.bam.bai

    The input bam file is replaced by the sorted bam file.

    Parameters
    ----------
    bamfile: str
        path to bam file to sort and index
    '''
    T1 = P.getTempFilename(".")
    bamfile = P.snip(bamfile)
    statement = """
    samtools sort %(bamfile)s.bam -o %(T1)s.bam;
    samtools index %(T1)s.bam;
    mv %(T1)s.bam %(bamfile)s.bam;
    mv %(T1)s.bam.bai %(bamfile)s.bam.bai""" % locals()
    P.run()
    os.remove(T1)
Example #10
Source File: tepid.py From TEPID with GNU General Public License v3.0
def refine(options):
    """
    Refine TE insertion and deletion calls within a group of related samples
    Use indel calls from other samples in the group, inspect areas of the
    genome in samples where indel was not called, and look for evidence of
    the same indel with much lower read count threshold
    """
    te = pybedtools.BedTool(options.te).sort()
    names = readNames(options.all_samples)
    if options.insertions is not False:
        insertions = getOtherLines(names, options.insertions)
    if options.deletions is not False:
        deletions = getOtherLines(names, options.deletions)  # format ([data], [inverse_accessions])
    print "Processing "+options.name
    chrom_sizes = check_bam(options.conc, options.proc, options.prefix)
    check_bam(options.split, options.proc, options.prefix, make_new_index=True)
    cov = calc_cov(options.conc, 100000, 120000)
    concordant = pysam.AlignmentFile(options.conc, 'rb')
    split_alignments = pysam.AlignmentFile(options.split, 'rb')
    name_indexed = pysam.IndexedReads(split_alignments)
    name_indexed.build()
    if options.deletions is not False:
        print "  checking deletions"
        process_missed(deletions, "deletion", concordant, split_alignments,
                       name_indexed, options.name, te, cov/5, chrom_sizes,
                       options.prefix)
    else:
        pass
    if options.insertions is not False:
        print "  checking insertions"
        process_missed(insertions, "insertion", concordant, split_alignments,
                       name_indexed, options.name, te, cov/10, chrom_sizes,
                       options.prefix)
    else:
        pass
Example #11
Source File: tepid.py From TEPID with GNU General Public License v3.0
def check_te_overlaps_dels(te, bamfile, te_list, prefix):
    pybedtools.BedTool(bamfile).bam_to_bed().saveas(prefix + 'split.temp')
    convert_split_pairbed(prefix + 'split.temp', prefix + 'split_bedpe.temp')
    split = pybedtools.BedTool(prefix + 'split_bedpe.temp').each(append_origin, word='split').saveas()
    create_deletion_coords(split, prefix + 'second_pass_del_coords.temp')
    dels = pybedtools.BedTool(prefix + 'second_pass_del_coords.temp').sort()
    intersections = dels.intersect(te, wb=True, nonamecheck=True, sorted=True)
    os.remove(prefix + 'second_pass_del_coords.temp')
    reads = []
    for r in intersections:
        if r[-3] in te_list:
            reads.append(r[3])
        else:
            pass
    return reads
Example #12
Source File: mock_data.py From medaka with Mozilla Public License 2.0
def create_simple_bam(fname, calls):
    """Create a small bam file with RLE encoding coded in the qscores."""
    ref_len = len(simple_data['ref'])

    header = {'HD': {'VN': '1.0'},
              'SQ': [{'LN': ref_len, 'SN': 'ref'}]}

    tmp_file = '{}.tmp'.format(fname)
    with pysam.AlignmentFile(
            tmp_file, 'wb', reference_names=['ref', ],
            reference_lengths=[ref_len, ], header=header) as output:
        for index, basecall in enumerate(calls):
            a = common.initialise_alignment(
                query_name=basecall['query_name'],
                reference_id=0,
                reference_start=0,
                query_sequence=basecall['seq'],
                cigarstring=basecall['cigarstring'],
                flag=basecall['flag'],
                query_qualities=basecall['quality'],
                tags=basecall['tags'])
            output.write(a)
    pysam.sort("-o", fname, tmp_file)
    os.remove(tmp_file)
    pysam.index(fname)
Example #13
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License
def createGenomeWindows(genome, outfile, windows):
    statement = '''
        cgat windows2gff
            --genome=%(genome)s
            --output-format=bed
            --fixed-width-windows=%(windows)s
            --log=%(outfile)s.log
        | bedtools sort -i stdin
        | gzip
        > %(outfile)s
    '''
    P.run()
Example #14
Source File: app.py From svviz with MIT License
def saveReads(dataHub, nameExtra=None):
    if dataHub.args.save_reads:
        logging.info("* Saving relevant reads *")
        for i, sample in enumerate(dataHub):
            outbam_path = dataHub.args.save_reads
            if not outbam_path.endswith(".bam"):
                outbam_path += ".bam"

            if len(dataHub.samples) > 1:
                logging.debug("Using i = {}".format(i))
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(i))

            if nameExtra is not None:
                outbam_path = outbam_path.replace(".bam", ".{}.bam".format(nameExtra))

            logging.info("  Outpath: {}".format(outbam_path))

            # print out just the reads we're interested for use later
            bam_small = pysam.Samfile(outbam_path, "wb", template=sample.bam)
            for read in sample.reads:
                bam_small.write(read)

            for read in sample.readStatistics.reads:
                bam_small.write(read)

            bam_small.close()
            sorted_path = outbam_path.replace(".bam", ".sorted")
            pysam.sort(outbam_path, sorted_path)
            pysam.index(sorted_path + ".bam")
Example #15
Source File: process_sams.py From TALON with MIT License
def partition_reads(sam_files, datasets, tmp_dir="talon_tmp/", n_threads=0):
    """ Use bedtools merge to create non-overlapping intervals from all of the
        transcripts in a series of SAM/BAM files. Then, iterate over the
        intervals to extract all reads inside of them from the pysam object.

        Returns:
            - List of lists: sublists contain pysam reads from a given interval
            - List of tuple intervals
            - filename of merged bam file (to keep track of the header)
    """
    merged_bam = preprocess_sam(sam_files, datasets, tmp_dir=tmp_dir,
                                n_threads=n_threads)

    try:
        all_reads = pybedtools.BedTool(merged_bam).bam_to_bed()
    except Exception as e:
        print(e)
        raise RuntimeError("Problem opening sam file %s" % (merged_bam))

    # Must sort the Bedtool object
    sorted_reads = all_reads.sort()
    intervals = sorted_reads.merge(d=100000000)

    # Now open each sam file using pysam and extract the reads
    coords = []
    read_groups = []
    with pysam.AlignmentFile(merged_bam) as bam:  # type: pysam.AlignmentFile
        for interval in intervals:
            reads = get_reads_in_interval(bam, interval.chrom,
                                          interval.start, interval.end)
            read_groups.append(reads)
            coords.append((interval.chrom, interval.start + 1, interval.end))

    return read_groups, coords, merged_bam
Example #16
Source File: process_sams.py From TALON with MIT License
def preprocess_sam(sam_files, datasets, tmp_dir="talon_tmp/", n_threads=0):
    """ Copy and rename the provided SAM/BAM file(s), merge them, and index.
        This is necessary in order to use Pybedtools commands on the reads.
        The renaming is necessary in order to label the reads according to
        their dataset."""

    # Create the tmp dir
    os.system("mkdir -p %s " % (tmp_dir))

    # Copy and rename SAM files with dataset names to ensure correct RG tags
    renamed_sams = []
    for sam, dataset in zip(sam_files, datasets):
        suffix = "." + sam.split(".")[-1]
        if suffix == ".sam":
            bam_copy = tmp_dir + dataset + "_unsorted.bam"
            convert_to_bam(sam, bam_copy)
            sam = bam_copy
        sorted_bam = tmp_dir + dataset + ".bam"
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, sam)
        renamed_sams.append(sorted_bam)

    merged_bam = tmp_dir + "merged.bam"
    merge_args = [merged_bam] + renamed_sams + ["-f", "-r", "-@", str(n_threads)]
    # index_args = [merged_bam, "-@", str(n_threads)]

    # Merge datasets and use -r option to include a read group tag
    try:
        pysam.merge(*merge_args)
        pysam.index(merged_bam)
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] Merged input SAM/BAM files" % (ts))
    except:
        raise RuntimeError(("Problem merging and indexing SAM/BAM files. "
                            "Check your file paths and make sure that all "
                            "files have headers."))
    return merged_bam
Example #17
Source File: utils.py From smallrnaseq with GNU General Public License v3.0
def featurecounts(samfile, gtffile):
    """Count aligned features with the featureCounts program.
       Returns: a dataframe of counts"""

    params = '-T 5 -t exon -g transcript_id'
    cmd = 'featureCounts %s -a %s -o counts.txt %s' % (params, gtffile, samfile)
    print (cmd)
    result = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
    counts = pd.read_csv('counts.txt', sep='\t', comment='#')
    counts = counts.rename(columns={samfile: 'reads'})
    counts = counts.sort('reads', ascending=False)
    return counts
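One caveat: DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20, so with a current pandas the final sorting step would be written with sort_values() instead:

# Modern pandas replacement for the removed DataFrame.sort() call above.
counts = counts.sort_values('reads', ascending=False)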
Example #18
Source File: utils.py From smallrnaseq with GNU General Public License v3.0
def print_read_stack(reads, refseq=None, cutoff=0, by=None, label=''):
    """Print local read alignments from a sam file against the mapped sequence.

       Args:
           reads: dataframe of read counts with position info
           refseq: sequence segment or gene that reads are mapped to
           cutoff: don't display read with <=cutoff counts
           by: column to sort reads by, e.g. 'reads', 'start' or 'end'
       Returns:
           string of printed read stacks to display or write to a file
    """

    if refseq != None:
        seqlen = len(refseq)
    else:
        seqlen = reads.end.max()
    f = None
    reads = reads[reads.reads >= cutoff]
    if by is not None:
        reads = reads.sort_values(by, ascending=False)

    l = len(reads)
    if l == 0:
        return
    s = ''
    s += '%s unique reads\n' % l
    s += '-------------------------------------\n'
    s += refseq + '\n'
    for idx, r in reads.iterrows():
        seq = r.seq
        count = r.reads
        pos = find_subseq(refseq, seq)
        if pos == -1:
            continue
        i = len(seq) + pos
        s += "{:>{w}} ({c})\n".format(seq, w=i, c=count)
    s += '\n'
    return s
Example #19
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''build a BAM file suitable for peak calling.

    This method uses bedtools_.

    Infiles are merged and unmapped reads removed.

    Arguments
    ---------
    infiles : list
        List of :term:`bam` formatted files.
    outfile : string
        Filename of output file in :term:`bam` format.
    dedup : int
        If True, remove duplicate reads using Picard.
    mask : string
        If given, `mask` should be a :term:`bed` formatted file.
        Reads falling into these regions are removed.
    '''
    # open the infiles, if more than one merge and sort first using samtools.
    statement = []
    if len(infiles) > 1 and isinstance(infiles, str) == 0:
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (infiles.join(" ")))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        job_memory = "16G"
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        >& %(outfile)s.picardlog''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    # do not link, as local scratch will not be available on shared
    # node
    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run()
Example #20
Source File: PipelinePeakcalling.py From CGATPipelines with MIT License
def bedGraphToBigwig(infile, contigsfile, outfile, remove=True,
                     sort_bedGraph=False):
    '''convert a bedgraph file to a bigwig file.

    The bedgraph file is deleted on success.

    Arguments
    ---------
    infile : string
        Input file in term `bedgraph` format.
        Must be sorted: sort -k1,1 -k2,2
    contigsfile : string
        file of chromosome size containing two columns
        c=name and length
    outfile : string
        Filename of output file in `bigwig` format.
    remove : boolean
        remove bedgraph file after successful conversion. Default True
    '''
    if not os.path.exists(infile):
        raise OSError("bedgraph file %s does not exist" % infile)

    if not os.path.exists(contigsfile):
        raise OSError("contig size file %s does not exist" % contigsfile)

    if sort_bedGraph:
        statement = '''sorted_bdg=`mktemp -p %(local_tmpdir)s`;
                       checkpoint;
                       sort -k1,1 -k2,2n %(infile)s > $sorted_bdg;
                       checkpoint;
                       bedGraphToBigWig $sorted_bdg %(contigsfile)s %(outfile)s;
                       checkpoint;
                       rm $sorted_bdg;
                    '''
    else:
        statement = '''
            bedGraphToBigWig %(infile)s %(contigsfile)s %(outfile)s
        '''
    P.run()

    if os.path.exists(outfile) and not IOTools.isEmpty(outfile):
        os.remove(infile)
Example #21
Source File: PipelineChipseq.py From CGATPipelines with MIT License
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''
    Make a BAM file suitable for peak calling.

    Infiles are merged and unmapped reads removed.

    If specified, duplicate reads are removed.
    This method uses Picard.

    If a mask is specified, reads falling within the mask are
    filtered out. This uses bedtools.

    The mask is a quicksect object containing the regions from
    which reads are to be excluded.
    '''
    # open the infiles, if more than one merge and sort first using samtools.
    samfiles = []
    num_reads = 0
    nfiles = 0

    statement = []

    tmpfile = P.getTempFilename(".")

    if len(infiles) > 1 and isinstance(infiles, str) == 0:
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (infiles.join(" ")))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run()
Example #22
Source File: PipelineIntervals.py From CGATPipelines with MIT License
def bedGraphToBigwig(infile, contigsfile, outfile, remove=True,
                     sort_bedGraph=False):
    '''convert a bedgraph file to a bigwig file.

    The bedgraph file is deleted on success.

    Arguments
    ---------
    infile : string
        Input file in term `bedgraph` format.
        Must be sorted: sort -k1,1 -k2,2
    contigsfile : string
        file of chromosome size containing two columns
        c=name and length
    outfile : string
        Filename of output file in `bigwig` format.
    remove : boolean
        remove bedgraph file after successful conversion. Default True
    '''
    if not os.path.exists(infile):
        raise OSError("bedgraph file %s does not exist" % infile)

    if not os.path.exists(contigsfile):
        raise OSError("contig size file %s does not exist" % contigsfile)

    if sort_bedGraph:
        statement = '''sorted_bdg=`mktemp -p %(local_tmpdir)s`;
                       checkpoint;
                       sort -k1,1 -k2,2n %(infile)s > $sorted_bdg;
                       checkpoint;
                       bedGraphToBigWig $sorted_bdg %(contigsfile)s %(outfile)s;
                       checkpoint;
                       rm $sorted_bdg;
                    '''
    else:
        statement = '''
            bedGraphToBigWig %(infile)s %(contigsfile)s %(outfile)s
        '''
    P.run()

    if os.path.exists(outfile) and not IOTools.isEmpty(outfile):
        os.remove(infile)
Example #23
Source File: PipelineIntervals.py From CGATPipelines with MIT License
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''build a BAM file suitable for peak calling.

    This method uses bedtools_.

    Infiles are merged and unmapped reads removed.

    Arguments
    ---------
    infiles : list
        List of :term:`bam` formatted files.
    outfile : string
        Filename of output file in :term:`bam` format.
    dedup : int
        If True, remove duplicate reads using Picard.
    mask : string
        If given, `mask` should be a :term:`bed` formatted file.
        Reads falling into these regions are removed.
    '''
    # open the infiles, if more than one merge and sort first using samtools.
    statement = []
    if len(infiles) > 1 and isinstance(infiles, str) == 0:
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (infiles.join(" ")))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        job_memory = "16G"
        statement.append('''picard MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        >& %(outfile)s.picardlog''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    # do not link, as local scratch will not be available on shared
    # node
    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run()
Example #24
Source File: tepid.py From TEPID with GNU General Public License v3.0
def process_merged(infile, outfile, sd):
    """
    take merged coordinates and filter out those where multiple
    non-nested TEs insert into same locus
    resolve cases where insertion site is within another TE
    """
    disc = ['disc_forward', 'disc_reverse']
    with open(infile, 'r') as inf, open(outfile, 'w+') as outf:
        for line in inf:
            line = line.rsplit()
            te_chroms = line[3].split(',')
            te_names = line[7].split(',')
            # TE read, at end of file. Should be two clusters for split reads if TE large enough
            r2_chrom = line[9].split(',')
            r2_start = line[10].split(',')
            r2Starts = [int(x) for x in r2_start]
            r2_stop = line[11].split(',')
            r2Stops = [int(x) for x in r2_stop]
            # create list of tuples to record read positions
            r2 = []
            for x in xrange(len(r2_chrom)):
                r2.append((r2_chrom[x], r2Starts[x], r2Stops[x], te_names[x]))
            # now sort reads by chr, start
            r2 = sorted(r2, key=lambda x: (x[0], x[1]))
            r2_coords = _condense_coords(r2, 10)
            starts = line[4].split(',')
            starts = [int(x) for x in starts]
            stops = line[5].split(',')
            stops = [int(x) for x in stops]
            te_reads = []
            for x in xrange(len(te_chroms)):
                te_reads.append((te_chroms[x], starts[x], stops[x], te_names[x]))
            te_reads = sorted(te_reads, key=lambda x: (x[0], x[1]))
            # filter where there are at least two clusters of reads in the TE
            # if the TE length is more than 200 bp and read is split
            if (len(r2_coords) >= 2) or sd in disc or (abs(max(stops) - min(starts)) <= 200):
                coords = _condense_coords(te_reads, 1000)
                if len(coords) > 1:
                    crd, max_reads, total_smaller_clusters, dubs = get_main_cluster(coords)
                    te_name = ','.join(crd[3])
                else:
                    crd = coords[0][0:3]
                    max_reads = coords[0][3]
                    te_name = ','.join(coords[0][4])
                    total_smaller_clusters = 0
                    dubs = 0
                if total_smaller_clusters >= (max_reads - 2) or dubs >= 2:
                    pass
                else:
                    outf.write('{ch}\t{sta}\t{stp}\t{tec}\t{tesa}\t{tesp}\t{rds}\t{nm}\t{cnt}\t{sd}\n'.format(
                        ch=line[0], sta=line[1], stp=line[2],
                        tec=crd[0], tesa=crd[1], tesp=crd[2],
                        rds=line[6], nm=te_name, cnt=line[8], sd=sd))
            else:
                pass
Example #25
Source File: test_rle.py From medaka with Mozilla Public License 2.0
def setUpClass(cls):
    """Create temporary files and bam file

    Ref    T T A A   C T T T G
    Read1      A A   C T T T G
    Read2    T A A A C T T T G
    """
    cls.bam_input = tempfile.NamedTemporaryFile(suffix='.bam').name
    cls.bam_output = tempfile.NamedTemporaryFile(suffix='.bam').name
    cls.ref_fname = tempfile.NamedTemporaryFile(suffix='.fasta').name
    with open(cls.ref_fname, 'w') as fasta:
        fasta.write('>ref\n')
        fasta.write('TTAACTTTG\n')

    header = {
        'HD': {'VN': '1.0'},
        'SQ': [{'LN': 9, 'SN': 'ref'}, ]}
    basecalls = {
        'read1': {
            'query_name': 'read1',
            'reference_id': 0,
            'reference_start': 2,
            'query_sequence': 'AACTTTG',
            'cigarstring': '7=',
            'flag': 0,
            'mapping_quality': 50},
        'read2': {
            'query_name': 'read2',
            'reference_id': 0,
            'reference_start': 1,
            'query_sequence': 'TAAACTTTG',
            'cigarstring': '3=1I5=',
            'flag': 0,
            'mapping_quality': 50}}

    tmp_file = '{}.tmp'.format(cls.bam_input)
    with pysam.AlignmentFile(tmp_file, 'wb', header=header) as bam:
        for basecall in basecalls.values():
            record = common.initialise_alignment(**basecall)
            bam.write(record)

    pysam.sort("-o", cls.bam_input, tmp_file)
    os.remove(tmp_file)
    pysam.index(cls.bam_input)
Example #26
Source File: rle.py From medaka with Mozilla Public License 2.0
def _compress_bam(bam_input, bam_output, ref_fname,
                  fast5_dir=None, summary=None, regions=None, threads=1):
    """Compress a bam into run length encoding (RLE).

    :param bam_input: str, name of the bam file to be compressed
    :param bam_output: str, name of the bam to be produced
    :param ref_fname: str, reference filename, used to produce bam_input
    :param fast5_dir: str, root directory to find the fast5 files
    :param summary: str, filename of a summary file, with the columns to link
        the read id to the fast5 file containing it.
        Must contain columns 'read_id' and 'filename'
    :param regions: list, genomic regions to be extracted
    :param threads: int, number of workers to be used

    :returns: None
    """
    regions = medaka.common.get_regions(bam_input, regions)
    ref_fasta = pysam.FastaFile(ref_fname)

    # If fast5_dir is passed, create an index
    if fast5_dir:
        with open(summary) as input_file:
            # Summary files can be huge, avoid loading to memory
            col_names = input_file.readline().replace('#', '').split()
            col_filename = col_names.index('filename')
            col_readid = col_names.index('read_id')
            file_index = dict(
                (line.split()[col_readid], line.split()[col_filename])
                for line in input_file)
    else:
        file_index = None

    with pysam.AlignmentFile(bam_input, 'r') as alignments_bam:
        tmp_output = '{}.tmp'.format(bam_output)
        with pysam.AlignmentFile(
                tmp_output, 'wb', header=alignments_bam.header) as output:
            for region in regions:
                bam_current = alignments_bam.fetch(
                    reference=region.ref_name,
                    start=region.start,
                    end=region.end)
                ref_sequence = ref_fasta.fetch(region.ref_name)
                ref_rle = RLEConverter(ref_sequence)
                func = functools.partial(
                    _compress_alignment,
                    ref_rle=ref_rle,
                    fast5_dir=fast5_dir,
                    file_index=file_index)
                with concurrent.futures.ThreadPoolExecutor(
                        max_workers=threads) as executor:
                    for chunk in medaka.common.grouper(bam_current, 100):
                        for new_alignment in executor.map(func, chunk):
                            if new_alignment is not None:
                                output.write(new_alignment)

    pysam.sort("-o", bam_output, tmp_output)
    os.remove(tmp_output)
    pysam.index(bam_output)
Example #27
Source File: segemehl.py From READemption with ISC License
def align_reads(
        self, read_file_or_pair, index_file, fasta_files, output_file,
        hit_strategy=1, accuracy=95, evalue=5.0, threads=1, split=False,
        segemehl_format=False, order=False, nonmatch_file=None,
        other_parameters=None, paired_end=False):
    if not paired_end:
        assert type(read_file_or_pair) == str
        segemehl_call = [
            self._segemehl_bin, "--query", read_file_or_pair]
    else:
        assert type(read_file_or_pair) == list
        segemehl_call = [
            self._segemehl_bin,
            "--query", read_file_or_pair[0],
            "--mate", read_file_or_pair[1]]
    segemehl_call += [
        "--index", index_file,
        "--database"] + fasta_files + [
        "--outfile", output_file,
        "--bamabafixoida",
        "--hitstrategy", str(hit_strategy),
        "--accuracy", str(accuracy),
        "--evalue", str(evalue),
        "--threads", str(threads)]
    if segemehl_format:
        segemehl_call.append("--SEGEMEHL")
    if order is True:
        segemehl_call.append("--order")
    if split is True:
        segemehl_call.append("--splits")
    if nonmatch_file:
        segemehl_call += ["--nomatchfilename", nonmatch_file]
    if self._show_progress is False:
        # segemehl_call += ["--silent"]
        pass
    if other_parameters:
        segemehl_call.append(other_parameters)
    # Discard standard error output
    if self._show_progress is False:
        with open(os.devnull, "w") as devnull:
            call(segemehl_call, stderr=devnull)
    else:
        call(segemehl_call)
    # Discard unmapped reads for further analysis.
    # Unmapped reads are stored in the folder output/align/unaligned_reads.
    tmp_filtered_output_file = f"{output_file}_filtered"
    pysam.view("-b", "-F", "4", "-o", tmp_filtered_output_file,
               output_file, catch_stdout=False)
    os.rename(tmp_filtered_output_file, output_file)
    tmp_sorted_outfile = f"{output_file}_sorted"
    pysam.sort("-o", tmp_sorted_outfile, output_file)
    os.rename(tmp_sorted_outfile, output_file)
    pysam.index(output_file)
Example #28
Source File: talon_label_reads.py From TALON with MIT License
def split_reads_by_chrom(sam_file, tmp_dir="tmp_label_reads", n_threads=1):
    """ Reads a SAM/BAM file and splits the reads into one file per
        chromosome. Returns a list of the resulting filenames."""

    ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    print("[ %s ] Splitting SAM by chromosome..." % (ts))

    tmp_dir = tmp_dir + "/raw"
    os.system("mkdir -p %s" % (tmp_dir))

    if sam_file.endswith(".sam"):
        # Convert to bam
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] -----Converting to bam...." % (ts))
        bam_file = tmp_dir + "/all_reads.bam"
        pysam.view("-b", "-S", "-@", str(n_threads), "-o", bam_file,
                   sam_file, catch_stdout=False)
    elif sam_file.endswith(".bam"):
        bam_file = sam_file
    else:
        raise ValueError("Please provide a .sam or .bam file")

    # Index the file if no index exists
    if not os.path.isfile(bam_file + ".bai"):
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] -----Sorting and indexing..." % (ts))
        sorted_bam = tmp_dir + "/all_reads.sorted.bam"
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, bam_file)
        bam_file = sorted_bam
        pysam.index(bam_file)

    # Open bam file
    tmp_dir += "/chroms"
    os.system("mkdir -p %s" % (tmp_dir))
    read_files = []

    ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
    print("[ %s ] -----Writing chrom files..." % (ts))
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over chromosomes and write a reads file for each
        chromosomes = [x.contig for x in bam.get_index_statistics()
                       if x.mapped > 0]
        for chrom in chromosomes:
            records = bam.fetch(chrom)
            fname = tmp_dir + "/" + chrom + ".sam"
            with pysam.AlignmentFile(fname, "w", template=bam) as o:
                for record in records:
                    o.write(record)
            read_files.append(fname)
    return read_files