Python Examples of pysam.AlignmentFile

Source File: count.py From SVclone with BSD 3-Clause "New" or "Revised" License

9 votes

def write_anomalous_read_to_bam(bam,split_reads,span_reads,anom_reads,out):
    print('Writing anom reads to file')
    split_reads = np.unique(split_reads['query_name'])
    span_reads = np.unique(span_reads['query_name'])
    anom_reads = np.unique(anom_reads['query_name'])

    # need to filter out any reads that were at any point marked as valid supporting reads
    anom_reads = np.array([x for x in anom_reads if x not in split_reads])
    anom_reads = np.array([x for x in anom_reads if x not in span_reads])

    bamf = pysam.AlignmentFile(bam, "rb")
    index = pysam.IndexedReads(bamf)
    index.build()
    anom_bam = pysam.AlignmentFile("%s_anom_reads.bam" % out, "wb", template=bamf)
    for read_name in anom_reads:
        for read in index.find(read_name):
            anom_bam.write(read)
    anom_bam.close()

Source File: common.py From wub with Mozilla Public License 2.0

6 votes

def pysam_open(alignment_file, in_format='BAM'):
    """Open SAM/BAM file using pysam.

    :param alignment_file: Input file.
    :param in_format: Format (SAM or BAM).
    :returns: pysam.AlignmentFile
    :rtype: pysam.AlignmentFile
    """
    if in_format == 'BAM':
        mode = "rb"
    elif in_format == 'SAM':
        mode = "r"
    else:
        raise Exception("Invalid format: {}".format(in_format))

    aln_iter = pysam.AlignmentFile(alignment_file, mode)
    return aln_iter

Source File: test_parse_custom_SAM_tags.py From TALON with MIT License

6 votes

def test_parse_custom_SAM_tags():
    """ Test that custom SAM tags are handled as expected """
   
    sam_file = "input_files/test_parse_custom_SAM_tags/toy_reads.sam"
    with pysam.AlignmentFile(sam_file, "rb") as sam: 
        for sam_record in sam:
            fraction_As, custom_label, allelic_label, \
            start_support, end_support = talon.parse_custom_SAM_tags(sam_record)
            if sam_record.query_name == "read_1":
                assert round(fraction_As,1) == 0.2
                assert custom_label == "yes"
                assert allelic_label == "paternal"
                assert start_support == "yes"
                assert end_support == "no"
            elif sam_record.query_name == "read_4":
                assert fraction_As == custom_label == allelic_label == None
                assert start_support == end_support == None
            else:
                pytest.fail("Did not recognize read name")

Source File: test_preprocess_sam_dataset_labeling.py From TALON with MIT License

6 votes

def test_read_labels(self):
        """ Given two sam files and two corresponding dataset labels, ensure 
            that the merged BAM file preserves the RB tags assigned by 
            pysam.merge """

        sams = ["input_files/preprocess_sam/read1.sam", 
               "input_files/preprocess_sam/read2.sam" ]
        datasets = ["dataset1", "dataset2"]
        tmp_dir = "scratch/test_read_labels/test1/"

        merged_bam = procsam.preprocess_sam(sams, datasets, tmp_dir = tmp_dir)

        with pysam.AlignmentFile(merged_bam) as bam:
            for entry in bam:
                if entry.query_name == "read_1":
                    assert entry.get_tag("RG") == "dataset1"
                elif entry.query_name == "read_2":
                    assert entry.get_tag("RG") == "dataset1"
                elif entry.query_name == "read_3":
                    assert entry.get_tag("RG") == "dataset2"
                else:
                    pytest.fail("Unexpected read encountered")

Source File: bamtools.py From SVclone with BSD 3-Clause "New" or "Revised" License

6 votes

def estimateTagSize(bamfile,
                    alignments=10,
                    multiple="error"):
    '''estimate tag size from first alignments in file.'''
    samfile = pysam.AlignmentFile(bamfile)
    sizes = [read.rlen for read in samfile.head(alignments)]
    mi, ma = min(sizes), max(sizes)

    if mi == 0 and ma == 0:
        sizes = [read.inferred_length for read in samfile.head(alignments)]
        # remove 0 sizes (unaligned reads?)
        sizes = [x for x in sizes if x > 0]
        mi, ma = min(sizes), max(sizes)

    if mi != ma:
        if multiple == "error":
            raise ValueError('multiple tag sizes in %s: %s' % (bamfile, sizes))
        elif multiple == "mean":
            mi = int(sum(sizes) / len(sizes))

    return mi

Source File: bamtools.py From SVclone with BSD 3-Clause "New" or "Revised" License

6 votes

def estimateInsertSizeDistribution(bamfile,
                                   alignments=50000):
    '''estimate insert size from first alignments in bam file.

    returns mean and stddev of insert sizes.
    '''

    assert isPaired(bamfile), \
        'can only estimate insert size from' \
        'paired bam files'

    samfile = pysam.AlignmentFile(bamfile)
    # only get positive to avoid double counting
    inserts = numpy.array(
        [read.tlen for read in samfile.head(alignments)
         if read.is_proper_pair and read.tlen > 0])
    insert_mean, insert_std = numpy.mean(inserts), numpy.std(inserts)
    print('Insert mean of %f, with standard deviation of %f inferred' % 
            (insert_mean, insert_std))
    if insert_mean > 10000 or insert_std > 1000 or \
        insert_mean < 1 or insert_std < 1:
            print('''WARNING: anomalous insert sizes detected. Please 
              double check or consider setting values manually.''')
    return insert_mean, insert_std

Source File: bamtools.py From SVclone with BSD 3-Clause "New" or "Revised" License

6 votes

def isPaired(bamfile, alignments=1000):
    '''check if a *bamfile* contains paired end data

    The method reads at most the first *alignments* and returns
    True if any of the alignments are paired.
    '''

    samfile = pysam.AlignmentFile(bamfile)
    n = 0
    for read in samfile:
        if read.is_paired:
            break
        n += 1
        if n == alignments:
            break

    samfile.close()

    return n != alignments

Source File: annotate.py From SVclone with BSD 3-Clause "New" or "Revised" License

6 votes

def retrieve_loc_reads(sv, bam, max_dep, threshold):
    bp_dtype = [('chrom', '<U20'), ('start', int), ('end', int), ('dir', '<U1')]

    sv_id, chr1, pos1, dir1, chr2, pos2, dir2, \
        sv_class, orig_id, orig_pos1, orig_pos2 = [h[0] for h in dtypes.sv_dtype]
    bp1 = np.array((sv[chr1], sv[pos1]-(threshold*2), \
                    sv[pos1]+(threshold*2), sv[dir1]), dtype=bp_dtype)
    bp2 = np.array((sv[chr2], sv[pos2]-(threshold*2), \
                    sv[pos2]+(threshold*2), sv[dir2]), dtype=bp_dtype)

    bamf = pysam.AlignmentFile(bam, "rb")
    loc1_reads, err_code1 = count.get_loc_reads(bp1, bamf, max_dep)
    loc2_reads, err_code2 = count.get_loc_reads(bp2, bamf, max_dep)
    bamf.close()

    sv_class = str(sv['classification'])
    if err_code1 == 1 or err_code2 == 1:
        sv['classification'] = 'HIDEP' if sv_class == '' else sv_class+';HIDEP'
    elif err_code1 == 2 or err_code2 == 2:
        sv['classification'] = 'READ_FETCH_FAILED' if sv_class == '' \
                                                    else sv_class+';READ_FETCH_FAILED'

    return sv, loc1_reads, loc2_reads, err_code1, err_code2

Source File: process_sams.py From TALON with MIT License

6 votes

def write_reads_to_file(read_groups, intervals, header_template, tmp_dir = "talon_tmp/"):
    """ For each read group, iterate over the reads and write them to a file
        named for the interval they belong to. This step is necessary because
        Pysam objects cannot be pickled. """
 
    tmp_dir = tmp_dir + "interval_files/"
    os.system("mkdir -p %s " % (tmp_dir))
 
    outbam_names = []
    with pysam.AlignmentFile(header_template, "rb") as template:
        for group, interval in zip(read_groups, intervals):
            fname = tmp_dir + "_".join([str(x) for x in interval]) + ".bam"
            outbam_names.append(fname)
            with pysam.AlignmentFile(fname, "wb", template = template) as f:
                for read in group:
                    f.write(read)
        
    return outbam_names

Source File: sample_info.py From grocsvs with MIT License

6 votes

def get_bam_coverages(options, sample, dataset):
    window_skip = int(1e6)
    window_size = 1000
    skip_at_ends = int(1e6)

    bam = pysam.AlignmentFile(dataset.bam)

    print "getting coverages"
    coverages = []
    count = 0
    for chrom, start, end in get_search_regions(options.reference, 
                                                window_skip, window_size, skip_at_ends):
        coverages.append(bam.count(chrom, start, end))

        if count % 1000 == 0:
            print count
        count += 1
    return coverages

Source File: bam_cov.py From basenji with Apache License 2.0

6 votes

def learn_shift_pair(self, bam_file):
    """ Learn the optimal fragment shift from paired end fragments. """

    t0 = time.time()
    print('Learning shift from paired-end sequences.', end='', flush=True)

    # read proper pair template lengths
    template_lengths = []
    for align in pysam.AlignmentFile(bam_file):
      if align.is_proper_pair and align.is_read1:
        template_lengths.append(abs(align.template_length))

    # compute mean
    self.shift_forward = int(np.round(np.mean(template_lengths) / 2))

    print(' Done in %ds.' % (time.time() - t0))
    print('Shift: %d' % self.shift_forward, flush=True)

Source File: genotyping.py From grocsvs with MIT License

6 votes

def count_ref_reads(bampath, reference, chrom, start, end):
    ref_counts = numpy.zeros(end-start)
    non_ref_counts = numpy.zeros(end-start)

    bam = pysam.AlignmentFile(bampath)

    # stepper = "all" skips dupe, unmapped, secondary, and qcfail reads
    start = max(0, start)
    
    for col in bam.pileup(chrom, start, end, truncate=True, stepper="all"):
        refnuc = reference.fasta[chrom][col.reference_pos].upper()
        nuc_counts = collections.Counter()
        
        for read in col.pileups:
            if read.query_position is None:
                # nuc_counts["indel"] += 1
                pass
            else:
                nuc_counts[read.alignment.query_sequence[read.query_position]] += 1
    
        ref_counts[col.reference_pos-start] = nuc_counts[refnuc]
        non_ref_counts[col.reference_pos-start] = sum(nuc_counts.values()) - nuc_counts[refnuc]

    return ref_counts, non_ref_counts

Source File: bamops.py From anvio with GNU General Public License v3.0

6 votes

def __init__(self, *args):
        """A class that is essentially pysam.AlignmentFile, with some added bonuses

        This class inherits pysam.AlignmentFile and adds a little flair. Init such an object the
        way you would an AlignmentFile, i.e. bam = bamops.BAMFileObject(path_to_bam)
        """

        self.input_bam_path = args[0]
        filesnpaths.is_file_exists(self.input_bam_path)

        try:
            pysam.AlignmentFile.__init__(self)
        except ValueError as e:
            raise ConfigError('Are you sure "%s" is a BAM file? Because samtools is not happy with it: """%s"""' % (self.input_bam_path, e))

        try:
            self.mapped
        except ValueError:
            raise ConfigError("It seems the BAM file is not indexed. See 'anvi-init-bam' script.")

Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License

5 votes

def peek_umi_only(self, bamfile: str, lines: int=30) -> None:
        """Peeks for umi into the samfile to determine if it is a cellranger or dropseq file
        """
        logging.debug(f"Peeking into {bamfile}")
        fin = pysam.AlignmentFile(bamfile)  # type: pysam.AlignmentFile
        cellranger: int = 0
        dropseq: int = 0
        failed: int = 0
        for i, read in enumerate(fin):
            if read.is_unmapped:
                continue
            if read.has_tag("UB"):
                cellranger += 1
            elif read.has_tag("XM"):
                dropseq += 1
            else:
                logging.warn(f"Not found cell and umi barcode in entry {i} of the bam file")
                failed += 1
            if cellranger > lines:
                self.umibarcode_str = "UB"
                break
            elif dropseq > lines:
                self.umibarcode_str = "XM"
                break
            elif failed > 5 * lines:
                raise IOError("The bam file does not contain umi barcodes appropriatelly formatted. If you are runnin UMI-less data you should use the -U flag.")
            else:
                pass
        fin.close()

Source File: process_sams.py From TALON with MIT License

5 votes

def convert_to_bam(sam, bam):
    """ Convert provided sam file to bam file (provided name).  """
    
    try:
        infile = pysam.AlignmentFile(sam, "r")
        outfile = pysam.AlignmentFile(bam, "wb", template=infile)
        for s in infile:
            outfile.write(s)

    except Exception as e:
        print(e)
        raise RuntimeError("Problem converting sam file '%s' to bam." % (sam))

Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License

5 votes

def peek(self, bamfile: str, lines: int=1000) -> None:
        """Peeks into the samfile to determine if it is a cellranger or dropseq file
        """
        logging.debug(f"Peeking into {bamfile}")
        fin = pysam.AlignmentFile(bamfile)  # type: pysam.AlignmentFile
        cellranger: int = 0
        dropseq: int = 0
        failed: int = 0
        for i, read in enumerate(fin):
            if read.is_unmapped:
                continue
            if read.has_tag("CB") and read.has_tag("UB"):
                cellranger += 1
            elif read.has_tag("XC") and read.has_tag("XM"):
                dropseq += 1
            else:
                logging.warn(f"Not found cell and umi barcode in entry {i} of the bam file")
                failed += 1
            if cellranger > lines:
                self.cellbarcode_str = "CB"
                self.umibarcode_str = "UB"
                break
            elif dropseq > lines:
                self.cellbarcode_str = "XC"
                self.umibarcode_str = "XM"
                break
            elif failed > 5 * lines:
                raise IOError("The bam file does not contain cell and umi barcodes appropriatelly formatted. If you are runnin UMI-less data you should use the -U flag.")
            else:
                pass
        fin.close()

Source File: call_readclouds.py From grocsvs with MIT License

5 votes

def __init__(self, bam_path, max_dist, min_end_mapq=30, require_one_mapq=60):
        self.bam = pysam.AlignmentFile(bam_path)
        self.max_dist = max_dist
        self.min_end_mapq = min_end_mapq
        self.require_one_mapq = require_one_mapq

        self._cur_chrom = None
        self._barcodes_to_reads = {}
        self._barcodes_to_haplotypes = {}
        self._recently_observed_bcs = set()

        self.cur_start = 0

Source File: BtIO.py From blobtools with GNU General Public License v3.0

5 votes

def parseBam(infile, set_of_blobs, estimate_cov):
    # no_base_cov_flag [deprecated]
    reads_total, reads_mapped = 0, 0
    with pysam.AlignmentFile(infile) as aln:
        reads_total, reads_mapped = checkBam(aln, set_of_blobs)
        if estimate_cov:
            base_cov_dict, read_cov_dict = estimate_coverage(aln, set_of_blobs)
        else:
            base_cov_dict, read_cov_dict = calculate_coverage(aln, reads_mapped, set_of_blobs)
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict

Source File: common.py From pycoQC with GNU General Public License v3.0

5 votes

def expand_file_names(fn, bam_check=False):
    """"""
    # Try to expand file name to list
    if isinstance(fn, list):
        if len(fn) ==1:
            fn_list=glob(fn[0])
        else:
            fn_list = []
            for f in fn:
                fn_list.extend(glob(f))
    elif isinstance(fn, str):
        fn_list=glob(fn)
    else:
        raise pycoQCError ("{} has to be either a file or a regular expression or a list of files".format(fn))

    # Verify that files are readable
    if not fn_list:
        raise pycoQCError("No files found in {}".format(fn))
    for f in fn_list:
        if not is_readable_file (f):
            raise pycoQCError("Cannot read file {}".format(f))
        # Extra checks for bam files
        if bam_check:
            with ps.AlignmentFile(f, "rb") as bam:
                if not bam.has_index():
                    raise pycoQCError("No index found for bam file: {}. Please index with samtools index".format(f))
                if not bam.header['HD']['SO'] == 'coordinate':
                    raise pycoQCError("Bam file not sorted: {}. Please sort with samtools sort".format(f))
    return fn_list

Source File: get_counts.py From NucleoATAC with MIT License

5 votes

def get_counts(args):
    """function to get fragment sizes

    """
    if args.out is None:
        args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])  
    chunks = ChunkList.read(args.bed)
    mat = np.zeros(len(chunks), dtype=np.int)
    bamHandle = AlignmentFile(args.bam)
    j = 0
    for chunk in chunks:
        for read in bamHandle.fetch(chunk.chrom, max(0, chunk.start - args.upper), chunk.end + args.upper):
            if read.is_proper_pair and not read.is_reverse:
                if args.atac:
                    #get left position
                    l_pos = read.pos + 4
                    #get insert size
                    #correct by 8 base pairs to be inserion to insertion
                    ilen = abs(read.template_length) - 8
                else:
                    l_pos = read.pos
                    ilen = abs(read.template_length)
                r_pos = l_pos + ilen - 1
                if _between(ilen, args.lower, args.upper) and (_between(l_pos, chunk.start, chunk.end) or _between(r_pos, chunk.start, chunk.end)):
                    mat[j] += 1
        j += 1
    bamHandle.close()
    np.savetxt(args.out + ".counts.txt.gz", mat, delimiter="\n", fmt='%i')

Source File: bubbleprocesser.py From AfterQC with MIT License

5 votes

def statFileBam(self, filename, queue):
        print("start: " + filename + "\n")
        #we apply lazy import here for pysam, since this module need to be installed manually and it is not used for most case
        import pysam
        reader = pysam.AlignmentFile(filename)
        records = []
        while True:
            try:
                read = reader.next()
            except StopIteration:
                break
            if read == None:
                break
            poly, count = self.countPoly(read.seq)
            
            if poly != None:
                
                #illumina sequence name line format
                #@<instrument>:<run number>:<flowcell ID>:<lane>:<tile_no>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>
                
                items = read.qname.split(":")
                if len(items) < 7:
                    continue
                lane = items[3]
                tile_no = items[4]
                surface = tile_no[0]
                swath = tile_no[1]
                camera = tile_no[2]
                tile = tile_no[3:]
                xpos = items[5]
                ypos = items[6]
                record = [int(lane), int(surface), int(swath), int(camera), int(tile), int(xpos), int(ypos), ord(poly), count, int(tile_no)]
                records.append(record)

        queue.put(records)
        reader.close()
        print("finished " +filename + " with " + str(len(records)) + " polyX records")

Source File: mate_pair_evidence.py From grocsvs with MIT License

5 votes

def run(self):
        self.logger.log("running!")

        path = self.evidence_step.outpaths(final=True)["evidence"]
        evidence = pandas.read_table(path)
        evidence["chromx"] = evidence["chromx"].astype("string")
        evidence["chromy"] = evidence["chromy"].astype("string")
        
        sample_info = self.options.sample_info(self.sample.name)
        dataset_info = sample_info[self.dataset.id]

        insert_size_dict = dataset_info["insert_sizes"]
        bam = pysam.AlignmentFile(self.dataset.bam)
        short_frag_evidence = []

        print "@@@@", get_max_insert_size(insert_size_dict)
        # for i, j in numpy.transpose(numpy.where(binom_ps<1e-4)):
        for i, row in evidence.iterrows():
            # if common_counts[i,j] < 20: continue
            if row["shared"] < 20: continue
            chromx, x, orientationx = row["chromx"], int(row["new_x"]), row["orientation"][0]
            chromy, y, orientationy = row["chromy"], int(row["new_y"]), row["orientation"][1]

            cur_evidence = get_evidence(chromx, x, chromy, y, orientationx, orientationy, 
                bam, get_max_insert_size(insert_size_dict), 5000)

            short_frag_evidence.append(
                [chromx, row["original_x"], orientationx,
                 chromy, row["original_y"], orientationy]
                 + list(cur_evidence))

        columns = ["chromx", "x", "orientationx", "chromy", "y", "orientationy",
                   "common", "total"]
        df = pandas.DataFrame(short_frag_evidence, columns=columns)

        outpath = self.outpaths(final=False)["mate_pair_evidence"]
        df.to_csv(outpath, sep="\t", index=False)

Source File: test_preprocess_sam_dataset_labeling.py From TALON with MIT License

5 votes

def test_unsorted_sam_file(self):
        """ Make sure that the function can handle a SAM input that has not 
            been sorted """

        sams = ["input_files/chr11_and_Tcf3/BC017.sam"]
        datasets = ["BC017"]      
        tmp_dir = "scratch/test_read_labels/test2/"

        merged_bam = procsam.preprocess_sam(sams, datasets, tmp_dir = tmp_dir)
        with pysam.AlignmentFile(merged_bam) as bam:
            # Check the first read
            for entry in bam:
                assert entry.query_name == "m54284_180814_002203/19268005/ccs" 
                break

Source File: utils.py From smallrnaseq with GNU General Public License v3.0

5 votes

def sam_to_bam(filename):
    """Convert sam to bam"""

    import pysam
    infile = pysam.AlignmentFile(filename, "r")
    name = os.path.splitext(filename)[0]+'.bam'
    bamfile = pysam.AlignmentFile(name, "wb", template=infile)
    for s in infile:
        bamfile.write(s)
    pysam.sort("-o", name, name)
    pysam.index(name)
    bamfile = pysam.AlignmentFile(name, "rb")
    return

Source File: singlesample.py From svtyper with MIT License

5 votes

def open_alignment_file(afile, reference_fasta):
    fd = None
    if afile.endswith('.bam'):
        fd = pysam.AlignmentFile(afile, mode='rb')
    elif afile.endswith('.cram'):
        fd = pysam.AlignmentFile(afile, mode='rc', reference_filename=reference_fasta)
    else:
        die('Error: %s is not a valid alignment file (*.bam or *.cram)\n' % afile)

    return fd

Source File: process_sams.py From TALON with MIT License

5 votes

def get_reads_in_interval(sam, chrom, start, end):
    """ Given an open pysam.AlignmentFile, return only the reads that overlap
        the provided interval. Note that this means there may be reads that
        extend beyond the bounds of the interval. """
    iterator = sam.fetch(chrom, start, end)
    reads = [ x for x in iterator ]
    return reads

Source File: test_find_intersecting_snps.py From WASP with Apache License 2.0

5 votes

def test_remap_secondary_alignments(self):
        """Test that secondary alignments don't appear in remap files"""
        test_data = Data()
        test_data.setup()
        test_data.index_genome_bowtie2()
        test_data.map_single_bowtie2()
        test_data.sam2bam()
        find_intersecting_snps.main(test_data.bam_filename,
                                    is_paired_end=False,
                                    is_sorted=False,
                                    snp_dir=test_data.snp_dir)
        #
        # Verify to.remap bam is the same as the input bam file.
        #
        old_lines = read_bam(test_data.bam_filename)
        new_lines = read_bam(test_data.bam_remap_filename)
        assert old_lines == new_lines

        # now, let's alter the flag of the read and
        # rerun find_intersecting_snps
        bam = pysam.Samfile(test_data.bam_filename, "rb")
        new_bam_file = test_data.bam_filename[:-4] + "2.bam"
        new_bam = pysam.AlignmentFile(new_bam_file, "wb", template=bam)
        for read in bam:
            read.flag = 256
            new_bam.write(read)
        bam.close()
        new_bam.close()
        # replace old bam file
        os.rename(new_bam_file, test_data.bam_filename)

        find_intersecting_snps.main(test_data.bam_filename,
                                    snp_dir=test_data.snp_dir, is_paired_end=False,
                                    is_sorted=False)

        # now, the remap file shouldn't have any reads
        lines = read_bam(test_data.bam_remap_filename)
        assert len(lines) == 1
        assert lines[0] == ''

        test_data.cleanup()

Source File: test_find_intersecting_snps.py From WASP with Apache License 2.0

5 votes

def test_remap_supplementary_alignments(self):
        """Test that supplementary alignments don't appear in remap files"""
        test_data = Data()
        test_data.setup()
        test_data.index_genome_bowtie2()
        test_data.map_single_bowtie2()
        test_data.sam2bam()
        find_intersecting_snps.main(test_data.bam_filename,
                                    is_paired_end=False,
                                    is_sorted=False,
                                    snp_dir=test_data.snp_dir)
        #
        # Verify to.remap bam is the same as the input bam file.
        #
        old_lines = read_bam(test_data.bam_filename)
        new_lines = read_bam(test_data.bam_remap_filename)
        assert old_lines == new_lines

        # now, let's alter the flag of the read and
        # rerun find_intersecting_snps
        bam = pysam.Samfile(test_data.bam_filename, "rb")
        new_bam_file = test_data.bam_filename[:-4] + "2.bam"
        new_bam = pysam.AlignmentFile(new_bam_file, "wb", template=bam)
        for read in bam:
            read.flag = 2048
            new_bam.write(read)
        bam.close()
        new_bam.close()
        # replace old bam file
        os.rename(new_bam_file, test_data.bam_filename)

        find_intersecting_snps.main(test_data.bam_filename,
                                    snp_dir=test_data.snp_dir, is_paired_end=False,
                                    is_sorted=False)

        # now, the remap file shouldn't have any reads
        lines = read_bam(test_data.bam_remap_filename)
        assert len(lines) == 1
        assert lines[0] == ''

Source File: bamops.py From anvio with GNU General Public License v3.0

5 votes

def _fetch_iterator(self, bam, contig_name, start, end):
        """Uses standard pysam fetch iterator from AlignmentFile objects, ignores unmapped reads"""

        for read in bam.fetch(contig_name, start, end):
            if read.cigartuples is None or read.query_sequence is None:
                # This read either has no associated cigar string or no query sequence. If cigar
                # string is None, this means it did not align but is in the BAM file anyways, or the
                # mapping software decided it did not want to include a cigar string for this read.
                # The origin of why query sequence would be None is unkown.
                continue

            yield read

Source File: process_sams.py From TALON with MIT License

5 votes

def partition_reads(sam_files, datasets, tmp_dir = "talon_tmp/", n_threads = 0):
    """ Use bedtools merge to create non-overlapping intervals from all of the
        transcripts in a series of SAM/BAM files. Then, iterate over the intervals
        to extract all reads inside of them from the pysam object.
       
        Returns:
            - List of lists: sublists contain pysam reads from a given interval
            - List of tuple intervals
            - filename of merged bam file (to keep track of the header)
           """

    merged_bam = preprocess_sam(sam_files, datasets, tmp_dir = tmp_dir, 
                                n_threads = n_threads)

    try:
        all_reads = pybedtools.BedTool(merged_bam).bam_to_bed()
    except Exception as e:
        print(e)
        raise RuntimeError("Problem opening sam file %s" % (merged_bam))

    # Must sort the Bedtool object
    sorted_reads = all_reads.sort()
    intervals = sorted_reads.merge(d = 100000000)

    # Now open each sam file using pysam and extract the reads
    coords = []
    read_groups = []
    with pysam.AlignmentFile(merged_bam) as bam:  # type: pysam.AlignmentFile
        for interval in intervals:
            reads = get_reads_in_interval(bam, interval.chrom,
                                          interval.start, interval.end)
            read_groups.append(reads)
            coords.append((interval.chrom, interval.start + 1, interval.end))

    return read_groups, coords, merged_bam

Python pysam.AlignmentFile() Examples