Python pysam.AlignmentFile() Examples
The following are 30
code examples of pysam.AlignmentFile().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pysam
, or try the search function
.
Example #1
Source File: count.py From SVclone with BSD 3-Clause "New" or "Revised" License | 9 votes |
def write_anomalous_read_to_bam(bam,split_reads,span_reads,anom_reads,out): print('Writing anom reads to file') split_reads = np.unique(split_reads['query_name']) span_reads = np.unique(span_reads['query_name']) anom_reads = np.unique(anom_reads['query_name']) # need to filter out any reads that were at any point marked as valid supporting reads anom_reads = np.array([x for x in anom_reads if x not in split_reads]) anom_reads = np.array([x for x in anom_reads if x not in span_reads]) bamf = pysam.AlignmentFile(bam, "rb") index = pysam.IndexedReads(bamf) index.build() anom_bam = pysam.AlignmentFile("%s_anom_reads.bam" % out, "wb", template=bamf) for read_name in anom_reads: for read in index.find(read_name): anom_bam.write(read) anom_bam.close()
Example #2
Source File: common.py From wub with Mozilla Public License 2.0 | 6 votes |
def pysam_open(alignment_file, in_format='BAM'): """Open SAM/BAM file using pysam. :param alignment_file: Input file. :param in_format: Format (SAM or BAM). :returns: pysam.AlignmentFile :rtype: pysam.AlignmentFile """ if in_format == 'BAM': mode = "rb" elif in_format == 'SAM': mode = "r" else: raise Exception("Invalid format: {}".format(in_format)) aln_iter = pysam.AlignmentFile(alignment_file, mode) return aln_iter
Example #3
Source File: test_parse_custom_SAM_tags.py From TALON with MIT License | 6 votes |
def test_parse_custom_SAM_tags(): """ Test that custom SAM tags are handled as expected """ sam_file = "input_files/test_parse_custom_SAM_tags/toy_reads.sam" with pysam.AlignmentFile(sam_file, "rb") as sam: for sam_record in sam: fraction_As, custom_label, allelic_label, \ start_support, end_support = talon.parse_custom_SAM_tags(sam_record) if sam_record.query_name == "read_1": assert round(fraction_As,1) == 0.2 assert custom_label == "yes" assert allelic_label == "paternal" assert start_support == "yes" assert end_support == "no" elif sam_record.query_name == "read_4": assert fraction_As == custom_label == allelic_label == None assert start_support == end_support == None else: pytest.fail("Did not recognize read name")
Example #4
Source File: test_preprocess_sam_dataset_labeling.py From TALON with MIT License | 6 votes |
def test_read_labels(self): """ Given two sam files and two corresponding dataset labels, ensure that the merged BAM file preserves the RB tags assigned by pysam.merge """ sams = ["input_files/preprocess_sam/read1.sam", "input_files/preprocess_sam/read2.sam" ] datasets = ["dataset1", "dataset2"] tmp_dir = "scratch/test_read_labels/test1/" merged_bam = procsam.preprocess_sam(sams, datasets, tmp_dir = tmp_dir) with pysam.AlignmentFile(merged_bam) as bam: for entry in bam: if entry.query_name == "read_1": assert entry.get_tag("RG") == "dataset1" elif entry.query_name == "read_2": assert entry.get_tag("RG") == "dataset1" elif entry.query_name == "read_3": assert entry.get_tag("RG") == "dataset2" else: pytest.fail("Unexpected read encountered")
Example #5
Source File: bamtools.py From SVclone with BSD 3-Clause "New" or "Revised" License | 6 votes |
def estimateTagSize(bamfile, alignments=10, multiple="error"): '''estimate tag size from first alignments in file.''' samfile = pysam.AlignmentFile(bamfile) sizes = [read.rlen for read in samfile.head(alignments)] mi, ma = min(sizes), max(sizes) if mi == 0 and ma == 0: sizes = [read.inferred_length for read in samfile.head(alignments)] # remove 0 sizes (unaligned reads?) sizes = [x for x in sizes if x > 0] mi, ma = min(sizes), max(sizes) if mi != ma: if multiple == "error": raise ValueError('multiple tag sizes in %s: %s' % (bamfile, sizes)) elif multiple == "mean": mi = int(sum(sizes) / len(sizes)) return mi
Example #6
Source File: bamtools.py From SVclone with BSD 3-Clause "New" or "Revised" License | 6 votes |
def estimateInsertSizeDistribution(bamfile, alignments=50000): '''estimate insert size from first alignments in bam file. returns mean and stddev of insert sizes. ''' assert isPaired(bamfile), \ 'can only estimate insert size from' \ 'paired bam files' samfile = pysam.AlignmentFile(bamfile) # only get positive to avoid double counting inserts = numpy.array( [read.tlen for read in samfile.head(alignments) if read.is_proper_pair and read.tlen > 0]) insert_mean, insert_std = numpy.mean(inserts), numpy.std(inserts) print('Insert mean of %f, with standard deviation of %f inferred' % (insert_mean, insert_std)) if insert_mean > 10000 or insert_std > 1000 or \ insert_mean < 1 or insert_std < 1: print('''WARNING: anomalous insert sizes detected. Please double check or consider setting values manually.''') return insert_mean, insert_std
Example #7
Source File: bamtools.py From SVclone with BSD 3-Clause "New" or "Revised" License | 6 votes |
def isPaired(bamfile, alignments=1000): '''check if a *bamfile* contains paired end data The method reads at most the first *alignments* and returns True if any of the alignments are paired. ''' samfile = pysam.AlignmentFile(bamfile) n = 0 for read in samfile: if read.is_paired: break n += 1 if n == alignments: break samfile.close() return n != alignments
Example #8
Source File: annotate.py From SVclone with BSD 3-Clause "New" or "Revised" License | 6 votes |
def retrieve_loc_reads(sv, bam, max_dep, threshold): bp_dtype = [('chrom', '<U20'), ('start', int), ('end', int), ('dir', '<U1')] sv_id, chr1, pos1, dir1, chr2, pos2, dir2, \ sv_class, orig_id, orig_pos1, orig_pos2 = [h[0] for h in dtypes.sv_dtype] bp1 = np.array((sv[chr1], sv[pos1]-(threshold*2), \ sv[pos1]+(threshold*2), sv[dir1]), dtype=bp_dtype) bp2 = np.array((sv[chr2], sv[pos2]-(threshold*2), \ sv[pos2]+(threshold*2), sv[dir2]), dtype=bp_dtype) bamf = pysam.AlignmentFile(bam, "rb") loc1_reads, err_code1 = count.get_loc_reads(bp1, bamf, max_dep) loc2_reads, err_code2 = count.get_loc_reads(bp2, bamf, max_dep) bamf.close() sv_class = str(sv['classification']) if err_code1 == 1 or err_code2 == 1: sv['classification'] = 'HIDEP' if sv_class == '' else sv_class+';HIDEP' elif err_code1 == 2 or err_code2 == 2: sv['classification'] = 'READ_FETCH_FAILED' if sv_class == '' \ else sv_class+';READ_FETCH_FAILED' return sv, loc1_reads, loc2_reads, err_code1, err_code2
Example #9
Source File: process_sams.py From TALON with MIT License | 6 votes |
def write_reads_to_file(read_groups, intervals, header_template, tmp_dir = "talon_tmp/"): """ For each read group, iterate over the reads and write them to a file named for the interval they belong to. This step is necessary because Pysam objects cannot be pickled. """ tmp_dir = tmp_dir + "interval_files/" os.system("mkdir -p %s " % (tmp_dir)) outbam_names = [] with pysam.AlignmentFile(header_template, "rb") as template: for group, interval in zip(read_groups, intervals): fname = tmp_dir + "_".join([str(x) for x in interval]) + ".bam" outbam_names.append(fname) with pysam.AlignmentFile(fname, "wb", template = template) as f: for read in group: f.write(read) return outbam_names
Example #10
Source File: sample_info.py From grocsvs with MIT License | 6 votes |
def get_bam_coverages(options, sample, dataset): window_skip = int(1e6) window_size = 1000 skip_at_ends = int(1e6) bam = pysam.AlignmentFile(dataset.bam) print "getting coverages" coverages = [] count = 0 for chrom, start, end in get_search_regions(options.reference, window_skip, window_size, skip_at_ends): coverages.append(bam.count(chrom, start, end)) if count % 1000 == 0: print count count += 1 return coverages
Example #11
Source File: bam_cov.py From basenji with Apache License 2.0 | 6 votes |
def learn_shift_pair(self, bam_file): """ Learn the optimal fragment shift from paired end fragments. """ t0 = time.time() print('Learning shift from paired-end sequences.', end='', flush=True) # read proper pair template lengths template_lengths = [] for align in pysam.AlignmentFile(bam_file): if align.is_proper_pair and align.is_read1: template_lengths.append(abs(align.template_length)) # compute mean self.shift_forward = int(np.round(np.mean(template_lengths) / 2)) print(' Done in %ds.' % (time.time() - t0)) print('Shift: %d' % self.shift_forward, flush=True)
Example #12
Source File: genotyping.py From grocsvs with MIT License | 6 votes |
def count_ref_reads(bampath, reference, chrom, start, end): ref_counts = numpy.zeros(end-start) non_ref_counts = numpy.zeros(end-start) bam = pysam.AlignmentFile(bampath) # stepper = "all" skips dupe, unmapped, secondary, and qcfail reads start = max(0, start) for col in bam.pileup(chrom, start, end, truncate=True, stepper="all"): refnuc = reference.fasta[chrom][col.reference_pos].upper() nuc_counts = collections.Counter() for read in col.pileups: if read.query_position is None: # nuc_counts["indel"] += 1 pass else: nuc_counts[read.alignment.query_sequence[read.query_position]] += 1 ref_counts[col.reference_pos-start] = nuc_counts[refnuc] non_ref_counts[col.reference_pos-start] = sum(nuc_counts.values()) - nuc_counts[refnuc] return ref_counts, non_ref_counts
Example #13
Source File: bamops.py From anvio with GNU General Public License v3.0 | 6 votes |
def __init__(self, *args): """A class that is essentially pysam.AlignmentFile, with some added bonuses This class inherits pysam.AlignmentFile and adds a little flair. Init such an object the way you would an AlignmentFile, i.e. bam = bamops.BAMFileObject(path_to_bam) """ self.input_bam_path = args[0] filesnpaths.is_file_exists(self.input_bam_path) try: pysam.AlignmentFile.__init__(self) except ValueError as e: raise ConfigError('Are you sure "%s" is a BAM file? Because samtools is not happy with it: """%s"""' % (self.input_bam_path, e)) try: self.mapped except ValueError: raise ConfigError("It seems the BAM file is not indexed. See 'anvi-init-bam' script.")
Example #14
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def peek_umi_only(self, bamfile: str, lines: int=30) -> None: """Peeks for umi into the samfile to determine if it is a cellranger or dropseq file """ logging.debug(f"Peeking into {bamfile}") fin = pysam.AlignmentFile(bamfile) # type: pysam.AlignmentFile cellranger: int = 0 dropseq: int = 0 failed: int = 0 for i, read in enumerate(fin): if read.is_unmapped: continue if read.has_tag("UB"): cellranger += 1 elif read.has_tag("XM"): dropseq += 1 else: logging.warn(f"Not found cell and umi barcode in entry {i} of the bam file") failed += 1 if cellranger > lines: self.umibarcode_str = "UB" break elif dropseq > lines: self.umibarcode_str = "XM" break elif failed > 5 * lines: raise IOError("The bam file does not contain umi barcodes appropriatelly formatted. If you are runnin UMI-less data you should use the -U flag.") else: pass fin.close()
Example #15
Source File: process_sams.py From TALON with MIT License | 5 votes |
def convert_to_bam(sam, bam): """ Convert provided sam file to bam file (provided name). """ try: infile = pysam.AlignmentFile(sam, "r") outfile = pysam.AlignmentFile(bam, "wb", template=infile) for s in infile: outfile.write(s) except Exception as e: print(e) raise RuntimeError("Problem converting sam file '%s' to bam." % (sam))
Example #16
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def peek(self, bamfile: str, lines: int=1000) -> None: """Peeks into the samfile to determine if it is a cellranger or dropseq file """ logging.debug(f"Peeking into {bamfile}") fin = pysam.AlignmentFile(bamfile) # type: pysam.AlignmentFile cellranger: int = 0 dropseq: int = 0 failed: int = 0 for i, read in enumerate(fin): if read.is_unmapped: continue if read.has_tag("CB") and read.has_tag("UB"): cellranger += 1 elif read.has_tag("XC") and read.has_tag("XM"): dropseq += 1 else: logging.warn(f"Not found cell and umi barcode in entry {i} of the bam file") failed += 1 if cellranger > lines: self.cellbarcode_str = "CB" self.umibarcode_str = "UB" break elif dropseq > lines: self.cellbarcode_str = "XC" self.umibarcode_str = "XM" break elif failed > 5 * lines: raise IOError("The bam file does not contain cell and umi barcodes appropriatelly formatted. If you are runnin UMI-less data you should use the -U flag.") else: pass fin.close()
Example #17
Source File: call_readclouds.py From grocsvs with MIT License | 5 votes |
def __init__(self, bam_path, max_dist, min_end_mapq=30, require_one_mapq=60): self.bam = pysam.AlignmentFile(bam_path) self.max_dist = max_dist self.min_end_mapq = min_end_mapq self.require_one_mapq = require_one_mapq self._cur_chrom = None self._barcodes_to_reads = {} self._barcodes_to_haplotypes = {} self._recently_observed_bcs = set() self.cur_start = 0
Example #18
Source File: BtIO.py From blobtools with GNU General Public License v3.0 | 5 votes |
def parseBam(infile, set_of_blobs, estimate_cov): # no_base_cov_flag [deprecated] reads_total, reads_mapped = 0, 0 with pysam.AlignmentFile(infile) as aln: reads_total, reads_mapped = checkBam(aln, set_of_blobs) if estimate_cov: base_cov_dict, read_cov_dict = estimate_coverage(aln, set_of_blobs) else: base_cov_dict, read_cov_dict = calculate_coverage(aln, reads_mapped, set_of_blobs) return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Example #19
Source File: common.py From pycoQC with GNU General Public License v3.0 | 5 votes |
def expand_file_names(fn, bam_check=False): """""" # Try to expand file name to list if isinstance(fn, list): if len(fn) ==1: fn_list=glob(fn[0]) else: fn_list = [] for f in fn: fn_list.extend(glob(f)) elif isinstance(fn, str): fn_list=glob(fn) else: raise pycoQCError ("{} has to be either a file or a regular expression or a list of files".format(fn)) # Verify that files are readable if not fn_list: raise pycoQCError("No files found in {}".format(fn)) for f in fn_list: if not is_readable_file (f): raise pycoQCError("Cannot read file {}".format(f)) # Extra checks for bam files if bam_check: with ps.AlignmentFile(f, "rb") as bam: if not bam.has_index(): raise pycoQCError("No index found for bam file: {}. Please index with samtools index".format(f)) if not bam.header['HD']['SO'] == 'coordinate': raise pycoQCError("Bam file not sorted: {}. Please sort with samtools sort".format(f)) return fn_list
Example #20
Source File: get_counts.py From NucleoATAC with MIT License | 5 votes |
def get_counts(args): """function to get fragment sizes """ if args.out is None: args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1]) chunks = ChunkList.read(args.bed) mat = np.zeros(len(chunks), dtype=np.int) bamHandle = AlignmentFile(args.bam) j = 0 for chunk in chunks: for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - args.upper), chunk.end + args.upper): if read.is_proper_pair and not read.is_reverse: if args.atac: #get left position l_pos = read.pos + 4 #get insert size #correct by 8 base pairs to be inserion to insertion ilen = abs(read.template_length) - 8 else: l_pos = read.pos ilen = abs(read.template_length) r_pos = l_pos + ilen - 1 if _between(ilen, args.lower, args.upper) and (_between(l_pos, chunk.start, chunk.end) or _between(r_pos, chunk.start, chunk.end)): mat[j] += 1 j += 1 bamHandle.close() np.savetxt(args.out + ".counts.txt.gz", mat, delimiter="\n", fmt='%i')
Example #21
Source File: bubbleprocesser.py From AfterQC with MIT License | 5 votes |
def statFileBam(self, filename, queue): print("start: " + filename + "\n") #we apply lazy import here for pysam, since this module need to be installed manually and it is not used for most case import pysam reader = pysam.AlignmentFile(filename) records = [] while True: try: read = reader.next() except StopIteration: break if read == None: break poly, count = self.countPoly(read.seq) if poly != None: #illumina sequence name line format #@<instrument>:<run number>:<flowcell ID>:<lane>:<tile_no>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence> items = read.qname.split(":") if len(items) < 7: continue lane = items[3] tile_no = items[4] surface = tile_no[0] swath = tile_no[1] camera = tile_no[2] tile = tile_no[3:] xpos = items[5] ypos = items[6] record = [int(lane), int(surface), int(swath), int(camera), int(tile), int(xpos), int(ypos), ord(poly), count, int(tile_no)] records.append(record) queue.put(records) reader.close() print("finished " +filename + " with " + str(len(records)) + " polyX records")
Example #22
Source File: mate_pair_evidence.py From grocsvs with MIT License | 5 votes |
def run(self): self.logger.log("running!") path = self.evidence_step.outpaths(final=True)["evidence"] evidence = pandas.read_table(path) evidence["chromx"] = evidence["chromx"].astype("string") evidence["chromy"] = evidence["chromy"].astype("string") sample_info = self.options.sample_info(self.sample.name) dataset_info = sample_info[self.dataset.id] insert_size_dict = dataset_info["insert_sizes"] bam = pysam.AlignmentFile(self.dataset.bam) short_frag_evidence = [] print "@@@@", get_max_insert_size(insert_size_dict) # for i, j in numpy.transpose(numpy.where(binom_ps<1e-4)): for i, row in evidence.iterrows(): # if common_counts[i,j] < 20: continue if row["shared"] < 20: continue chromx, x, orientationx = row["chromx"], int(row["new_x"]), row["orientation"][0] chromy, y, orientationy = row["chromy"], int(row["new_y"]), row["orientation"][1] cur_evidence = get_evidence(chromx, x, chromy, y, orientationx, orientationy, bam, get_max_insert_size(insert_size_dict), 5000) short_frag_evidence.append( [chromx, row["original_x"], orientationx, chromy, row["original_y"], orientationy] + list(cur_evidence)) columns = ["chromx", "x", "orientationx", "chromy", "y", "orientationy", "common", "total"] df = pandas.DataFrame(short_frag_evidence, columns=columns) outpath = self.outpaths(final=False)["mate_pair_evidence"] df.to_csv(outpath, sep="\t", index=False)
Example #23
Source File: test_preprocess_sam_dataset_labeling.py From TALON with MIT License | 5 votes |
def test_unsorted_sam_file(self): """ Make sure that the function can handle a SAM input that has not been sorted """ sams = ["input_files/chr11_and_Tcf3/BC017.sam"] datasets = ["BC017"] tmp_dir = "scratch/test_read_labels/test2/" merged_bam = procsam.preprocess_sam(sams, datasets, tmp_dir = tmp_dir) with pysam.AlignmentFile(merged_bam) as bam: # Check the first read for entry in bam: assert entry.query_name == "m54284_180814_002203/19268005/ccs" break
Example #24
Source File: utils.py From smallrnaseq with GNU General Public License v3.0 | 5 votes |
def sam_to_bam(filename): """Convert sam to bam""" import pysam infile = pysam.AlignmentFile(filename, "r") name = os.path.splitext(filename)[0]+'.bam' bamfile = pysam.AlignmentFile(name, "wb", template=infile) for s in infile: bamfile.write(s) pysam.sort("-o", name, name) pysam.index(name) bamfile = pysam.AlignmentFile(name, "rb") return
Example #25
Source File: singlesample.py From svtyper with MIT License | 5 votes |
def open_alignment_file(afile, reference_fasta): fd = None if afile.endswith('.bam'): fd = pysam.AlignmentFile(afile, mode='rb') elif afile.endswith('.cram'): fd = pysam.AlignmentFile(afile, mode='rc', reference_filename=reference_fasta) else: die('Error: %s is not a valid alignment file (*.bam or *.cram)\n' % afile) return fd
Example #26
Source File: process_sams.py From TALON with MIT License | 5 votes |
def get_reads_in_interval(sam, chrom, start, end): """ Given an open pysam.AlignmentFile, return only the reads that overlap the provided interval. Note that this means there may be reads that extend beyond the bounds of the interval. """ iterator = sam.fetch(chrom, start, end) reads = [ x for x in iterator ] return reads
Example #27
Source File: test_find_intersecting_snps.py From WASP with Apache License 2.0 | 5 votes |
def test_remap_secondary_alignments(self): """Test that secondary alignments don't appear in remap files""" test_data = Data() test_data.setup() test_data.index_genome_bowtie2() test_data.map_single_bowtie2() test_data.sam2bam() find_intersecting_snps.main(test_data.bam_filename, is_paired_end=False, is_sorted=False, snp_dir=test_data.snp_dir) # # Verify to.remap bam is the same as the input bam file. # old_lines = read_bam(test_data.bam_filename) new_lines = read_bam(test_data.bam_remap_filename) assert old_lines == new_lines # now, let's alter the flag of the read and # rerun find_intersecting_snps bam = pysam.Samfile(test_data.bam_filename, "rb") new_bam_file = test_data.bam_filename[:-4] + "2.bam" new_bam = pysam.AlignmentFile(new_bam_file, "wb", template=bam) for read in bam: read.flag = 256 new_bam.write(read) bam.close() new_bam.close() # replace old bam file os.rename(new_bam_file, test_data.bam_filename) find_intersecting_snps.main(test_data.bam_filename, snp_dir=test_data.snp_dir, is_paired_end=False, is_sorted=False) # now, the remap file shouldn't have any reads lines = read_bam(test_data.bam_remap_filename) assert len(lines) == 1 assert lines[0] == '' test_data.cleanup()
Example #28
Source File: test_find_intersecting_snps.py From WASP with Apache License 2.0 | 5 votes |
def test_remap_supplementary_alignments(self): """Test that supplementary alignments don't appear in remap files""" test_data = Data() test_data.setup() test_data.index_genome_bowtie2() test_data.map_single_bowtie2() test_data.sam2bam() find_intersecting_snps.main(test_data.bam_filename, is_paired_end=False, is_sorted=False, snp_dir=test_data.snp_dir) # # Verify to.remap bam is the same as the input bam file. # old_lines = read_bam(test_data.bam_filename) new_lines = read_bam(test_data.bam_remap_filename) assert old_lines == new_lines # now, let's alter the flag of the read and # rerun find_intersecting_snps bam = pysam.Samfile(test_data.bam_filename, "rb") new_bam_file = test_data.bam_filename[:-4] + "2.bam" new_bam = pysam.AlignmentFile(new_bam_file, "wb", template=bam) for read in bam: read.flag = 2048 new_bam.write(read) bam.close() new_bam.close() # replace old bam file os.rename(new_bam_file, test_data.bam_filename) find_intersecting_snps.main(test_data.bam_filename, snp_dir=test_data.snp_dir, is_paired_end=False, is_sorted=False) # now, the remap file shouldn't have any reads lines = read_bam(test_data.bam_remap_filename) assert len(lines) == 1 assert lines[0] == ''
Example #29
Source File: bamops.py From anvio with GNU General Public License v3.0 | 5 votes |
def _fetch_iterator(self, bam, contig_name, start, end): """Uses standard pysam fetch iterator from AlignmentFile objects, ignores unmapped reads""" for read in bam.fetch(contig_name, start, end): if read.cigartuples is None or read.query_sequence is None: # This read either has no associated cigar string or no query sequence. If cigar # string is None, this means it did not align but is in the BAM file anyways, or the # mapping software decided it did not want to include a cigar string for this read. # The origin of why query sequence would be None is unkown. continue yield read
Example #30
Source File: process_sams.py From TALON with MIT License | 5 votes |
def partition_reads(sam_files, datasets, tmp_dir = "talon_tmp/", n_threads = 0): """ Use bedtools merge to create non-overlapping intervals from all of the transcripts in a series of SAM/BAM files. Then, iterate over the intervals to extract all reads inside of them from the pysam object. Returns: - List of lists: sublists contain pysam reads from a given interval - List of tuple intervals - filename of merged bam file (to keep track of the header) """ merged_bam = preprocess_sam(sam_files, datasets, tmp_dir = tmp_dir, n_threads = n_threads) try: all_reads = pybedtools.BedTool(merged_bam).bam_to_bed() except Exception as e: print(e) raise RuntimeError("Problem opening sam file %s" % (merged_bam)) # Must sort the Bedtool object sorted_reads = all_reads.sort() intervals = sorted_reads.merge(d = 100000000) # Now open each sam file using pysam and extract the reads coords = [] read_groups = [] with pysam.AlignmentFile(merged_bam) as bam: # type: pysam.AlignmentFile for interval in intervals: reads = get_reads_in_interval(bam, interval.chrom, interval.start, interval.end) read_groups.append(reads) coords.append((interval.chrom, interval.start + 1, interval.end)) return read_groups, coords, merged_bam