Python pysam.Samfile() Examples
The following are 30
code examples of pysam.Samfile().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pysam
, or try the search function
.
Example #1
Source File: hints_db.py From Comparative-Annotation-Toolkit with Apache License 2.0 | 7 votes |
def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome): """ Make sure that this BAM is actually aligned to this fasta. Every sequence should be the same length. Sequences can exist in the reference that do not exist in the BAM, but not the other way around. """ handle = pysam.Samfile(bam_path, 'rb') bam_sequences = {(n, s) for n, s in zip(*[handle.references, handle.lengths])} difference = bam_sequences - fasta_sequences if len(difference) > 0: base_err = 'Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}.' err = base_err.format(bam_path, genome, ','.join(['-'.join(map(str, x)) for x in difference])) raise UserException(err) missing_seqs = fasta_sequences - bam_sequences if len(missing_seqs) > 0: base_msg = 'BAM {} does not have the following sequence/length pairs in its header: {}.' msg = base_msg.format(bam_path, ','.join(['-'.join(map(str, x)) for x in missing_seqs])) logger.warning(msg)
Example #2
Source File: hicBuildMatrix.py From HiCExplorer with GNU General Public License v3.0 | 7 votes |
def get_chrom_sizes(bam_handle): """ return the list of chromosome names and their size from the bam file The return value is a list of the form [('chr1', 2343434), ('chr2', 43432432)] >>> test = Tester() >>> get_chrom_sizes(pysam.Samfile(test.bam_file_1, 'rb')) [('contig-1', 7125), ('contig-2', 3345)] """ # in some cases there are repeated entries in # the bam file. Thus, I first convert to dict, # then to list. list_chrom_sizes = OrderedDict(zip(bam_handle.references, bam_handle.lengths)) return list(list_chrom_sizes.items())
Example #3
Source File: BAMtoJunctionBED.py From altanalyze with Apache License 2.0 | 6 votes |
def exportIndexes(input_dir): import unique bam_dirs = unique.read_directory(input_dir) print 'Building BAM index files', for file in bam_dirs: if string.lower(file[-4:]) == '.bam': bam_dir = input_dir+'/'+file bamf = pysam.Samfile(bam_dir, "rb" ) ### Is there an indexed .bai for the BAM? Check. try: for entry in bamf.fetch(): codes = map(lambda x: x[0],entry.cigar) break except Exception: ### Make BAM Indexv lciv9df8scivx print '.', bam_dir = str(bam_dir) #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False pysam.index(bam_dir) bamf = pysam.Samfile(bam_dir, "rb" )
Example #4
Source File: test_snptable.py From WASP with Apache License 2.0 | 6 votes |
def test_get_overlapping_snps_softclip(self): """Test that soft-clipped part of read is not used""" data = Data() data.setup() # write a single read with softclipping on left end sam_file = open(data.sam_filename, "w") data.write_sam_header(sam_file) data.write_sam_read(sam_file, cigar="10S20M") sam_file.close() sam_file = pysam.Samfile(data.sam_filename) read = next(sam_file) snp_tab = snptable.SNPTable() snp_tab.read_file(data.snp_filename) snp_idx, snp_read_pos, \ indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) # check that overlapping SNPs are found and in correct locations assert len(snp_idx) == 2 assert snp_idx[0] == 0 assert snp_idx[1] == 1 assert snp_read_pos[0] == 20 assert snp_read_pos[1] == 30
Example #5
Source File: test_snptable.py From WASP with Apache License 2.0 | 6 votes |
def test_get_overlapping_indel(self): """Test that indels can be correctly obtained""" data = Data() data.snp_list = [(10, "A", "-")] data.setup() # write a single read with match sam_file = open(data.sam_filename, "w") data.write_sam_header(sam_file) data.write_sam_read(sam_file, cigar="30M") sam_file.close() sam_file = pysam.Samfile(data.sam_filename) read = next(sam_file) snp_tab = snptable.SNPTable() snp_tab.read_file(data.snp_filename) snp_idx, snp_read_pos, \ indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) # check that overlapping indel found in correct location assert len(snp_idx) == 0 assert len(indel_idx) == 1 assert indel_idx[0] == 0 assert indel_read_pos[0] == 10
Example #6
Source File: rmdup_pe.py From WASP with Apache License 2.0 | 6 votes |
def main(input_bam, output_bam): if input_bam.endswith(".sam") or input_bam.endswith("sam.gz"): infile = pysam.Samfile(input_bam, "r") else: # assume binary BAM file infile = pysam.Samfile(input_bam, "rb") if output_bam.endswith(".sam"): # output in text SAM format outfile = pysam.Samfile(output_bam, "w", template=infile) elif output_bam.endswith(".bam"): # output in binary compressed BAM format outfile = pysam.Samfile(output_bam, "wb", template=infile) else: raise ValueError("name of output file must end with .bam or .sam") filter_reads(infile, outfile) infile.close() outfile.close()
Example #7
Source File: classDef.py From PePr with GNU General Public License v3.0 | 6 votes |
def check_bampe_sorted(filename, input_dir): infile = pysam.Samfile(input_dir + filename, 'rb') count_list = [] count = 1 pre_name = '' for idx,line in enumerate(infile.fetch(until_eof = True)): if line.query_name == pre_name: count += 1 else: count_list.append(count) count = 1 pre_name = line.query_name if idx == 999: break count1 = len([i for i in count_list if i==1]) ratio = float(count1)/sum(count_list) #print filename, ratio if ratio > 0.8: warning("%s may not be sorted by read name. Please check.",filename)
Example #8
Source File: fileParser.py From PePr with GNU General Public License v3.0 | 6 votes |
def parse_bam_for_f_r(filename, input_dir): num = 0 forward = {} reverse = {} infile =pysam.Samfile(input_dir+filename, 'rb') for line in infile.fetch(until_eof = True): num += 1 if num % 10000000 == 0 : print ("{0:,} lines processed in {1}".format(num, filename)) if line.is_unmapped is False: chr = infile.getrname(line.tid) if line.is_reverse is False: try: forward[chr].append(line.reference_start) except KeyError: forward[chr] = array.array('i',[line.reference_start]) else: try: reverse[chr].append(line.reference_end) except KeyError: reverse[chr] = array.array('i',[line.reference_end]) return forward,reverse
Example #9
Source File: initialize.py From PePr with GNU General Public License v3.0 | 6 votes |
def get_chr_info_bam(parameter, filename): chr_info = {} num = 0 infile = pysam.Samfile(parameter.input_directory+filename, 'rb') for line in infile.fetch(until_eof=True): num += 1 if num %10000000 == 0: print("{0:,} lines processed in {1}".format(num, filename)) if line.is_unmapped is False: chr = infile.getrname(line.tid) try: chr_info[chr] = max(chr_info[chr], line.pos) except KeyError: chr_info[chr] = line.pos parameter.chr_info = chr_info return
Example #10
Source File: initialize.py From PePr with GNU General Public License v3.0 | 6 votes |
def get_read_length_from_bam(parameter): for filename in parameter.get_filenames(): with pysam.Samfile(parameter.input_directory+filename, 'rb') as infile: length_list = [] num = 0 #for idx in range(1000): # line = infile.fetch(until_eof=True).__next__() for line in infile.fetch(until_eof=True): num += 1 if num == 1000: break try: length_list.append(line.query_length) # rlen is deprecated starting from pysam v0.9 except AttributeError: length_list.append(line.rlen) length = max(length_list) parameter.read_length_dict[filename] = length return
Example #11
Source File: find_peak_mode.py From PePr with GNU General Public License v3.0 | 6 votes |
def parse_bam_for_f_r(filename): num = 0 forward = {} reverse = {} infile =pysam.Samfile(filename, 'rb') line = infile.fetch(until_eof=True).__next__() length = line.alen for line in infile.fetch(until_eof = True): num += 1 if num % 1000000 == 0 : print ("{0:,} lines processed in {1}".format(num, filename)) if line.is_unmapped is False: chr = infile.getrname(line.tid) if line.is_reverse is False: try: forward[chr].append(line.pos) except KeyError: forward[chr] = array.array('i',[line.pos]) else: try: reverse[chr].append(line.pos) except KeyError: reverse[chr] = array.array('i',[line.pos]) return forward,reverse, length
Example #12
Source File: mapping_test.py From iva with GNU General Public License v3.0 | 6 votes |
def test_soft_clipped(self): '''Test soft_clipped''' expected = [ (5, 0), (0, 0), (0, 0), (0, 5), (0, 0), None, (0, 0), (0, 0), (2, 0), (0, 1), None, None, (1, 1), (0, 1) ] sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test.smalt.out.bam'), "rb") i = 0 for sam in sam_reader.fetch(until_eof=True): self.assertEqual(mapping.soft_clipped(sam), expected[i]) i += 1
Example #13
Source File: mapping_test.py From iva with GNU General Public License v3.0 | 6 votes |
def test_get_pair_type(self): '''Test get_pair_type''' expected = [ (mapping.CAN_EXTEND_LEFT, mapping.KEEP), (mapping.KEEP, mapping.CAN_EXTEND_RIGHT), (mapping.KEEP, mapping.KEEP), (mapping.NOT_USEFUL, mapping.NOT_USEFUL), (mapping.CAN_EXTEND_LEFT, mapping.KEEP), (mapping.BOTH_UNMAPPED, mapping.BOTH_UNMAPPED), (mapping.NOT_USEFUL, mapping.NOT_USEFUL) ] sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test.smalt.out.bam'), "rb") previous_sam = None i = 0 for sam in sam_reader.fetch(until_eof=True): if previous_sam is None: previous_sam = sam continue types = mapping.get_pair_type(previous_sam, sam, 190, 1000, min_clip=2) self.assertEqual(types, expected[i]) i += 1 previous_sam = None
Example #14
Source File: assembly_test.py From iva with GNU General Public License v3.0 | 6 votes |
def test_map_reads(self): '''test _map_reads''' a = assembly.Assembly(contigs_file=os.path.join(data_dir, 'assembly_test.fa')) reads_prefix = os.path.join(data_dir, 'assembly_test.to_map') out_prefix = 'tmp.assembly_test.out' a._map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', out_prefix) # different smalt version output slightly different BAMs. Some columns # should never change, so check just those ones def get_sam_columns(bamfile): sams = [] sam_reader = pysam.Samfile(bamfile, "rb") for sam in sam_reader.fetch(until_eof=True): if sam.is_unmapped: refname = None else: refname = sam_reader.getrname(sam.tid) sams.append((sam.qname, sam.flag, refname, sam.pos, sam.cigar, sam.seq)) return sams expected = get_sam_columns(os.path.join(data_dir, 'assembly_test.mapped.bam')) got = get_sam_columns(out_prefix + '.bam') self.assertListEqual(expected, got) os.unlink(out_prefix + '.bam')
Example #15
Source File: dedup.py From UMI-tools with MIT License | 6 votes |
def detect_bam_features(bamfile, n_entries=1000): ''' read the first n entries in the bam file and identify the tags available detecting multimapping ''' inbam = pysam.Samfile(bamfile) inbam = inbam.fetch(until_eof=True) tags = ["NH", "X0", "XT"] available_tags = {x: 1 for x in tags} for n, read in enumerate(inbam): if n > n_entries: break if read.is_unmapped: continue else: for tag in tags: if not read.has_tag(tag): available_tags[tag] = 0 return available_tags
Example #16
Source File: test_identify_locus.py From STRetch with MIT License | 6 votes |
def test_indel_size_allele_freq(bamfile, n, expected): """Test indel corretly identified""" region = (70713514, 70713561) chrom = 'chr13' bam = pysam.Samfile(bamfile, 'rb') all_indels = {} for read in bam.fetch(): try: all_indels[read.query_name] = indel_size(read, region, chrom) #print(read.is_secondary) except ValueError: continue print(all_indels) all_indels_list = [all_indels[x] for x in all_indels] alleles_by_frequency = allele_freq(all_indels_list, n) assert alleles_by_frequency == expected
Example #17
Source File: genewisequanti.py From READemption with ISC License | 6 votes |
def _quantify(self, read_alignment_path, annotation_path, output_path, fraction_calc_method, pseudocounts=False): sam = pysam.Samfile(read_alignment_path) gff3_parser = Gff3Parser() output_fh = open(output_path, "w") output_fh.write("#" + "\t".join(_gff_field_descriptions() + ["sense", "antisense"]) + "\n") for entry in gff3_parser.entries(open(annotation_path)): if _entry_to_use(entry, self._allowed_features) is False: continue if pseudocounts is False: sum_sense = 0 sum_antisense = 0 else: sum_sense = 1 sum_antisense = 1 for alignment in self._overlapping_alignments(sam, entry): fraction = fraction_calc_method(alignment) if self._same_strand(entry, alignment): sum_sense += fraction else: sum_antisense += fraction output_fh.write(str(entry) + "\t" + str(sum_sense) + "\t" + str(sum_antisense) + "\n")
Example #18
Source File: readalignerstats.py From READemption with ISC License | 6 votes |
def _count_aligned_reads_and_alignments( self, read_alignment_result_bam_path): bam = pysam.Samfile(read_alignment_result_bam_path) stats_per_ref = defaultdict(dict) no_of_hits_per_read_freq = {} for ref_id in bam.references: self._init_counting_dict(stats_per_ref, ref_id) for entry in bam.fetch(): ref_id = bam.getrname(entry.tid) try: self._count_alignment( entry, ref_id, stats_per_ref, no_of_hits_per_read_freq) except KeyError: sys.stderr.write( "SAM entry with unspecified reference found! Stoping\n") sys.exit(2) self._stats["stats_per_reference"] = stats_per_ref for ref_id, stats in stats_per_ref.items(): stats_per_ref[ref_id][ "no_of_hits_per_read_and_freqs"] = self._calc_down_to_read( stats_per_ref[ref_id]["no_of_hits_per_read_and_freqs"]) self._stats["stats_total"] = self._sum_countings(stats_per_ref)
Example #19
Source File: bam_alignment.py From pybamview with MIT License | 6 votes |
def __init__(self, _bamfiles, _reffile): self.bamfiles = _bamfiles self.bamreaders = [] for bam in self.bamfiles: try: br = pysam.Samfile(bam, "rb") self.bamreaders.append(br) except: sys.stderr.write("ERROR: could not open %s. Is this a valid bam file?\n"%bam) if _reffile != "": try: self.reference = pyfaidx.Fasta(_reffile, as_raw=True) except: self.reference = None else: self.reference = None self.alignment_grid = None self.read_groups = self.LoadRGDictionary()
Example #20
Source File: run_hmms.py From ViFi with GNU General Public License v3.0 | 6 votes |
def prepare_unmapped_sequences(options): start_time = time.time() counter = 0; bam = pysam.Samfile(options.bamfile, 'rb') fas = open('%s/temp/unmapped.fas' % options.directory, 'wb') map = open('%s/temp/unmapped.map' % options.directory, 'wb') for read in bam: if read.is_unmapped and not read.mate_is_unmapped and bam.references[read.rnext].find('chr') == 0: fas.write('>read_%d\n%s\n' % (counter, read.seq)) map.write('%s\tread_%d\n' % (read.qname, counter)) counter+=1 fas.close() map.close() bam.close() end_time = time.time() - start_time; print "Prepared sequences for searching against HMMs: %fs" % end_time
Example #21
Source File: mapping_test.py From iva with GNU General Public License v3.0 | 6 votes |
def test_can_extend(self): '''Test can_extend''' expected = [ (True, False), (False, False), (False, False), (False, True), (False, False), (False, False), (False, False), (False, False), (True, False), (False, False), (False, False), (False, False), (False, False), (False, False), ] sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test.smalt.out.bam'), "rb") i = 0 for sam in sam_reader.fetch(until_eof=True): self.assertEqual(mapping._can_extend(sam, 190, min_clip=2), expected[i]) i += 1
Example #22
Source File: QC_stats_Final.py From ngs_pipeline with MIT License | 5 votes |
def _count_reads(bam): stats = { 'total_reads' : 0, 'mapped_reads' : 0 } with pysam.Samfile(bam, 'rb') as bam_file: for read in bam_file: stats['total_reads'] += 1 if (not read.is_unmapped): stats['mapped_reads'] += 1 stats['percent_mapped'] = (float(stats['mapped_reads']) / float(stats['total_reads'])) * 100 stats['percent_mapped'] = "%2.2F" % stats['percent_mapped'] return stats
Example #23
Source File: sambamconverter.py From READemption with ISC License | 5 votes |
def _generate_empty_bam_file(self, sam_path, bam_path_prefix): samfile = pysam.Samfile(sam_path, "r") bamfile = pysam.Samfile( "%s.bam" % bam_path_prefix, "wb", header=samfile.header) bamfile.close() samfile.close() pysam.index("%s.bam" % bam_path_prefix)
Example #24
Source File: hints_db.py From Comparative-Annotation-Toolkit with Apache License 2.0 | 5 votes |
def bam_is_paired(bam_path, num_reads=20000, paired_cutoff=0.75): """ Infers the paired-ness of a bam file. """ sam = pysam.Samfile(bam_path) count = 0 for rec in itertools.islice(sam, num_reads): if rec.is_paired: count += 1 if tools.mathOps.format_ratio(count, num_reads) > 0.75: return True elif tools.mathOps.format_ratio(count, num_reads) < 1 - paired_cutoff: return False else: raise UserException("Unable to infer pairing from bamfile {}".format(bam_path))
Example #25
Source File: coveragecalculator.py From READemption with ISC License | 5 votes |
def _open_bam_file(self, bam_file): return pysam.Samfile(bam_file)
Example #26
Source File: misc.py From Comparative-Annotation-Toolkit with Apache License 2.0 | 5 votes |
def is_bam(path): """Checks if a path is a BAMfile""" try: pysam.Samfile(path) except IOError: raise RuntimeError('Path {} does not exist'.format(path)) except ValueError: return False return True
Example #27
Source File: crossalignfilter.py From READemption with ISC License | 5 votes |
def write_crossmapping_free_bam(self): with pysam.Samfile(self._input_bam) as input_bam: with pysam.Samfile(self._output_bam, "wb", header=input_bam.header) as output_bam: for alignment in input_bam.fetch(): if not alignment.qname in self._crossmapped_reads: output_bam.write(alignment) pysam.index(self._output_bam)
Example #28
Source File: crossalignfilter.py From READemption with ISC License | 5 votes |
def _check_replicon_existance(self): found_all = True with pysam.Samfile(self._input_bam) as bam: for replicon_ids in self._orgs_and_replicon_ids.values(): for replicon_id in replicon_ids: if not replicon_id in bam.references: sys.stderr.write( "\"%s\" not found in BAM header.\n" % replicon_id) found_all = False if not found_all: raise RepliconIdNotInBam
Example #29
Source File: bam_alignment.py From pybamview with MIT License | 5 votes |
def GetDefaultLocation(bamfiles): """ Return default location to jump to if no location given. Look at the first read we see and go there. If no reads aligned, return 'error' Args: bamfiles (list): A list with paths to bamfiles Returns: position (string): A string with chromosome and position """ default_chrom = None default_pos = None aligned = False position = 'error' for bam in bamfiles: try: br = pysam.Samfile(bam, "rb") except: sys.stderr.write("ERROR: Could not open %s. Is this a valid bam file?\n"%bam) continue # Peak at the first hundred reads read_count = 0 while not (aligned or read_count > 100): try: aligned_read = br.next() except StopIteration: continue if not aligned_read.is_unmapped: default_chrom = br.getrname(aligned_read.tid) default_pos = str(aligned_read.pos) aligned = True position = ':'.join([default_chrom, default_pos]) break else: read_count += 1 return position
Example #30
Source File: bioutilities.py From Haystack with GNU Affero General Public License v3.0 | 5 votes |
def calculate_profile_matrix_bed_bam(bed_filename,sam_filename,window_size=5000, resolution=50, fragment_length=200, use_strand=False,return_coordinates=False): cs=Coordinate.bed_to_coordinates(bed_filename) cs=Coordinate.coordinates_of_intervals_around_center(cs, window_size) samfile = pysam.Samfile(sam_filename) n_bins=(window_size/resolution)+1 profile_matrix=np.zeros((len(cs),n_bins)) for idx_c,c in enumerate(cs): n_start=max(0,c.bpcenter-window_size/2) n_end=c.bpcenter+window_size/2 for rd in samfile.fetch(c.chr_id, n_start,n_end): if rd.is_reverse: rd_st=rd.positions[-1]-fragment_length-1 rd_end=rd.positions[-1] else: rd_st=rd.positions[0] rd_end=max(rd.positions[-1],rd.positions[1]+fragment_length) bin_idx_st=max(0,1+(rd_st-n_start)/resolution) bin_idx_en=min(n_bins,1+ (rd_end-n_start)/resolution) #print rd.positions[1],rd.positions[-1],rd_st, rd_end,rd_end-rd_st,rd_st-n_start,rd_end-n_start,bin_idx_st,bin_idx_en profile_matrix[idx_c,bin_idx_st:bin_idx_en]+=1 if use_strand and c.strand == '-': profile_matrix[idx_c,:]=profile_matrix[idx_c,::-1] if return_coordinates: return profile_matrix,samfile.mapped,cs else: return profile_matrix,samfile.mapped