Python Examples of pysam.Samfile

Source File: hints_db.py From Comparative-Annotation-Toolkit with Apache License 2.0

7 votes

def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome):
    """
    Make sure that this BAM is actually aligned to this fasta. Every sequence should be the same length. Sequences
    can exist in the reference that do not exist in the BAM, but not the other way around.
    """
    handle = pysam.Samfile(bam_path, 'rb')
    bam_sequences = {(n, s) for n, s in zip(*[handle.references, handle.lengths])}
    difference = bam_sequences - fasta_sequences
    if len(difference) > 0:
        base_err = 'Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}.'
        err = base_err.format(bam_path, genome, ','.join(['-'.join(map(str, x)) for x in difference]))
        raise UserException(err)
    missing_seqs = fasta_sequences - bam_sequences
    if len(missing_seqs) > 0:
        base_msg = 'BAM {} does not have the following sequence/length pairs in its header: {}.'
        msg = base_msg.format(bam_path, ','.join(['-'.join(map(str, x)) for x in missing_seqs]))
        logger.warning(msg)

Source File: hicBuildMatrix.py From HiCExplorer with GNU General Public License v3.0

7 votes

def get_chrom_sizes(bam_handle):
    """
    return the list of chromosome names and their
    size from the bam file
    The return value is a list of the form
    [('chr1', 2343434), ('chr2', 43432432)]

    >>> test = Tester()
    >>> get_chrom_sizes(pysam.Samfile(test.bam_file_1, 'rb'))
    [('contig-1', 7125), ('contig-2', 3345)]
    """

    # in some cases there are repeated entries in
    # the bam file. Thus, I first convert to dict,
    # then to list.
    list_chrom_sizes = OrderedDict(zip(bam_handle.references,
                                       bam_handle.lengths))
    return list(list_chrom_sizes.items())

Source File: BAMtoJunctionBED.py From altanalyze with Apache License 2.0

6 votes

def exportIndexes(input_dir):
    import unique
    bam_dirs = unique.read_directory(input_dir)
    print 'Building BAM index files',
    for file in bam_dirs:
        if string.lower(file[-4:]) == '.bam':
            bam_dir = input_dir+'/'+file
            bamf = pysam.Samfile(bam_dir, "rb" )
            ### Is there an indexed .bai for the BAM? Check.
            try:
                for entry in bamf.fetch():
                    codes = map(lambda x: x[0],entry.cigar)
                    break
            except Exception:
                ### Make BAM Indexv lciv9df8scivx 
                print '.',
                bam_dir = str(bam_dir)
                #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
                pysam.index(bam_dir)
                bamf = pysam.Samfile(bam_dir, "rb" )

Source File: test_snptable.py From WASP with Apache License 2.0

6 votes

def test_get_overlapping_snps_softclip(self):
        """Test that soft-clipped part of read is not used"""
        data = Data()
        data.setup()

        # write a single read with softclipping on left end
        sam_file = open(data.sam_filename, "w")
        data.write_sam_header(sam_file)
        data.write_sam_read(sam_file, cigar="10S20M")
        sam_file.close()
        
        sam_file = pysam.Samfile(data.sam_filename)
        read = next(sam_file)

        snp_tab = snptable.SNPTable()
        snp_tab.read_file(data.snp_filename)
        snp_idx, snp_read_pos, \
            indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read)

        # check that overlapping SNPs are found and in correct locations
        assert len(snp_idx) == 2
        assert snp_idx[0] == 0
        assert snp_idx[1] == 1
        assert snp_read_pos[0] == 20
        assert snp_read_pos[1] == 30

Source File: test_snptable.py From WASP with Apache License 2.0

6 votes

def test_get_overlapping_indel(self):
        """Test that indels can be correctly obtained"""
        data = Data()
        data.snp_list = [(10, "A", "-")]
        data.setup()

        # write a single read with match
        sam_file = open(data.sam_filename, "w")
        data.write_sam_header(sam_file)
        data.write_sam_read(sam_file, cigar="30M")
        sam_file.close()
        
        sam_file = pysam.Samfile(data.sam_filename)
        read = next(sam_file)

        snp_tab = snptable.SNPTable()
        snp_tab.read_file(data.snp_filename)
        snp_idx, snp_read_pos, \
            indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read)

        # check that overlapping indel found in correct location
        assert len(snp_idx) == 0
        assert len(indel_idx) == 1
        assert indel_idx[0] == 0
        assert indel_read_pos[0] == 10

Source File: rmdup_pe.py From WASP with Apache License 2.0

6 votes

def main(input_bam, output_bam):
    if input_bam.endswith(".sam") or input_bam.endswith("sam.gz"):
        infile = pysam.Samfile(input_bam, "r")
    else:
        # assume binary BAM file
        infile = pysam.Samfile(input_bam, "rb")

    if output_bam.endswith(".sam"):
        # output in text SAM format
        outfile = pysam.Samfile(output_bam, "w", template=infile)
    elif output_bam.endswith(".bam"):
        # output in binary compressed BAM format
        outfile = pysam.Samfile(output_bam, "wb", template=infile)
    else:
        raise ValueError("name of output file must end with .bam or .sam")

    filter_reads(infile, outfile)

    infile.close()
    outfile.close()

Source File: classDef.py From PePr with GNU General Public License v3.0

6 votes

def check_bampe_sorted(filename, input_dir):
        infile = pysam.Samfile(input_dir + filename, 'rb')
        count_list = []
        count = 1
        pre_name = ''
        for idx,line in enumerate(infile.fetch(until_eof = True)):
            if line.query_name == pre_name:
                count += 1
            else:
                count_list.append(count)
                count = 1
                pre_name = line.query_name
            if idx == 999:
                break

        count1 = len([i for i in count_list if i==1])
        ratio = float(count1)/sum(count_list)
        #print filename, ratio
        if ratio > 0.8:
            warning("%s may not be sorted by read name. Please check.",filename)

Source File: fileParser.py From PePr with GNU General Public License v3.0

6 votes

def parse_bam_for_f_r(filename, input_dir):
    num = 0
    forward = {}
    reverse = {}

    infile =pysam.Samfile(input_dir+filename, 'rb')
    for line in infile.fetch(until_eof = True):
        num += 1
        if num % 10000000 == 0 :
            print ("{0:,} lines processed in {1}".format(num, filename))
        if line.is_unmapped is False:
            chr = infile.getrname(line.tid)
            if line.is_reverse is False:
                try: forward[chr].append(line.reference_start)
                except KeyError:
                    forward[chr] = array.array('i',[line.reference_start])
            else:
                try: reverse[chr].append(line.reference_end)
                except KeyError:
                    reverse[chr] = array.array('i',[line.reference_end])
    return forward,reverse

Source File: initialize.py From PePr with GNU General Public License v3.0

6 votes

def get_chr_info_bam(parameter, filename):
    chr_info = {}
    num = 0
    infile = pysam.Samfile(parameter.input_directory+filename, 'rb')
    for line in infile.fetch(until_eof=True):
        num += 1
        if num %10000000 == 0:
            print("{0:,} lines processed in {1}".format(num, filename))
        if line.is_unmapped is False:
            chr = infile.getrname(line.tid)
            try: 
                chr_info[chr] = max(chr_info[chr], line.pos)
            except KeyError: 
                chr_info[chr] = line.pos
    parameter.chr_info = chr_info
    return

Source File: initialize.py From PePr with GNU General Public License v3.0

6 votes

def get_read_length_from_bam(parameter):
    for filename in parameter.get_filenames():
        with pysam.Samfile(parameter.input_directory+filename, 'rb') as infile:
            length_list = []
            num = 0
            #for idx in range(1000):
            #    line = infile.fetch(until_eof=True).__next__()
            for line in infile.fetch(until_eof=True):
                num += 1
                if num == 1000:
                    break
                try:
                    length_list.append(line.query_length)
                # rlen is deprecated starting from pysam v0.9
                except AttributeError: 
                    length_list.append(line.rlen)
            length = max(length_list)
        parameter.read_length_dict[filename] = length
    return

Source File: find_peak_mode.py From PePr with GNU General Public License v3.0

6 votes

def parse_bam_for_f_r(filename):
    num = 0
    forward = {}
    reverse = {}

    infile =pysam.Samfile(filename, 'rb')
    line = infile.fetch(until_eof=True).__next__()
    length = line.alen
    for line in infile.fetch(until_eof = True):
        num += 1
        if num % 1000000 == 0 :
            print ("{0:,} lines processed in {1}".format(num, filename))
        if line.is_unmapped is False:
            chr = infile.getrname(line.tid)
            if line.is_reverse is False:
                try: forward[chr].append(line.pos)
                except KeyError:
                    forward[chr] = array.array('i',[line.pos])
            else:
                try: reverse[chr].append(line.pos)
                except KeyError:
                    reverse[chr] = array.array('i',[line.pos])
    return forward,reverse, length

Source File: mapping_test.py From iva with GNU General Public License v3.0

6 votes

def test_soft_clipped(self):
        '''Test soft_clipped'''
        expected = [
            (5, 0),
            (0, 0),
            (0, 0),
            (0, 5),
            (0, 0),
            None,
            (0, 0),
            (0, 0),
            (2, 0),
            (0, 1),
            None,
            None,
            (1, 1),
            (0, 1)
        ]

        sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test.smalt.out.bam'), "rb")
        i = 0
        for sam in sam_reader.fetch(until_eof=True):
            self.assertEqual(mapping.soft_clipped(sam), expected[i])
            i += 1

Source File: mapping_test.py From iva with GNU General Public License v3.0

6 votes

def test_get_pair_type(self):
        '''Test get_pair_type'''
        expected = [
            (mapping.CAN_EXTEND_LEFT, mapping.KEEP),
            (mapping.KEEP, mapping.CAN_EXTEND_RIGHT),
            (mapping.KEEP, mapping.KEEP),
            (mapping.NOT_USEFUL, mapping.NOT_USEFUL),
            (mapping.CAN_EXTEND_LEFT, mapping.KEEP),
            (mapping.BOTH_UNMAPPED, mapping.BOTH_UNMAPPED),
            (mapping.NOT_USEFUL, mapping.NOT_USEFUL)
        ]

        sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test.smalt.out.bam'), "rb")
        previous_sam = None
        i = 0
        for sam in sam_reader.fetch(until_eof=True):
            if previous_sam is None:
                previous_sam = sam
                continue

            types = mapping.get_pair_type(previous_sam, sam, 190, 1000, min_clip=2)
            self.assertEqual(types, expected[i])
            i += 1
            previous_sam = None

Source File: assembly_test.py From iva with GNU General Public License v3.0

6 votes

def test_map_reads(self):
        '''test _map_reads'''
        a = assembly.Assembly(contigs_file=os.path.join(data_dir, 'assembly_test.fa'))
        reads_prefix = os.path.join(data_dir, 'assembly_test.to_map')
        out_prefix = 'tmp.assembly_test.out'
        a._map_reads(reads_prefix + '_1.fastq', reads_prefix + '_2.fastq', out_prefix)

        # different smalt version output slightly different BAMs. Some columns
        # should never change, so check just those ones
        def get_sam_columns(bamfile):
            sams = []
            sam_reader = pysam.Samfile(bamfile, "rb")
            for sam in sam_reader.fetch(until_eof=True):
                if sam.is_unmapped:
                    refname = None
                else:
                    refname = sam_reader.getrname(sam.tid)
                sams.append((sam.qname, sam.flag, refname, sam.pos, sam.cigar, sam.seq))
            return sams

        expected = get_sam_columns(os.path.join(data_dir, 'assembly_test.mapped.bam'))
        got = get_sam_columns(out_prefix + '.bam')
        self.assertListEqual(expected, got)
        os.unlink(out_prefix + '.bam')

Source File: dedup.py From UMI-tools with MIT License

6 votes

def detect_bam_features(bamfile, n_entries=1000):
    ''' read the first n entries in the bam file and identify the tags
    available detecting multimapping '''

    inbam = pysam.Samfile(bamfile)
    inbam = inbam.fetch(until_eof=True)

    tags = ["NH", "X0", "XT"]
    available_tags = {x: 1 for x in tags}

    for n, read in enumerate(inbam):
        if n > n_entries:
            break

        if read.is_unmapped:
            continue

        else:
            for tag in tags:
                if not read.has_tag(tag):
                    available_tags[tag] = 0

    return available_tags

Source File: test_identify_locus.py From STRetch with MIT License

6 votes

def test_indel_size_allele_freq(bamfile, n, expected):
    """Test indel corretly identified"""
    region = (70713514, 70713561)
    chrom = 'chr13'
    bam = pysam.Samfile(bamfile, 'rb')
    all_indels = {}
    for read in bam.fetch():
       try:
           all_indels[read.query_name] = indel_size(read, region, chrom)
           #print(read.is_secondary)
       except ValueError:
           continue
    print(all_indels)
    all_indels_list = [all_indels[x] for x in all_indels]
    alleles_by_frequency = allele_freq(all_indels_list, n)
    assert alleles_by_frequency == expected

Source File: genewisequanti.py From READemption with ISC License

6 votes

def _quantify(self, read_alignment_path, annotation_path, output_path,
                  fraction_calc_method, pseudocounts=False):
        sam = pysam.Samfile(read_alignment_path)
        gff3_parser = Gff3Parser()
        output_fh = open(output_path, "w")
        output_fh.write("#" + "\t".join(_gff_field_descriptions()
                                        + ["sense", "antisense"]) + "\n")
        for entry in gff3_parser.entries(open(annotation_path)):
            if _entry_to_use(entry, self._allowed_features) is False:
                continue
            if pseudocounts is False:
                sum_sense = 0
                sum_antisense = 0
            else:
                sum_sense = 1
                sum_antisense = 1
            for alignment in self._overlapping_alignments(sam, entry):
                fraction = fraction_calc_method(alignment)
                if self._same_strand(entry, alignment):
                    sum_sense += fraction
                else:
                    sum_antisense += fraction
            output_fh.write(str(entry) + "\t" + str(sum_sense) + "\t" +
                            str(sum_antisense) + "\n")

Source File: readalignerstats.py From READemption with ISC License

6 votes

def _count_aligned_reads_and_alignments(
            self, read_alignment_result_bam_path):
        bam = pysam.Samfile(read_alignment_result_bam_path)
        stats_per_ref = defaultdict(dict)
        no_of_hits_per_read_freq = {}
        for ref_id in bam.references:
            self._init_counting_dict(stats_per_ref, ref_id)
        for entry in bam.fetch():
            ref_id = bam.getrname(entry.tid)
            try:
                self._count_alignment(
                    entry, ref_id, stats_per_ref, no_of_hits_per_read_freq)
            except KeyError:
                sys.stderr.write(
                    "SAM entry with unspecified reference found! Stoping\n")
                sys.exit(2)
        self._stats["stats_per_reference"] = stats_per_ref
        for ref_id, stats in stats_per_ref.items():
            stats_per_ref[ref_id][
                "no_of_hits_per_read_and_freqs"] = self._calc_down_to_read(
                stats_per_ref[ref_id]["no_of_hits_per_read_and_freqs"])
        self._stats["stats_total"] = self._sum_countings(stats_per_ref)

Source File: bam_alignment.py From pybamview with MIT License

6 votes

def __init__(self, _bamfiles, _reffile):
        self.bamfiles = _bamfiles
        self.bamreaders = []
        for bam in self.bamfiles:
            try:
                br = pysam.Samfile(bam, "rb")
                self.bamreaders.append(br)
            except:
                sys.stderr.write("ERROR: could not open %s. Is this a valid bam file?\n"%bam)
        if _reffile != "":
            try:
                self.reference = pyfaidx.Fasta(_reffile, as_raw=True)
            except:
                self.reference = None
        else: self.reference = None
        self.alignment_grid = None
        self.read_groups = self.LoadRGDictionary()

Source File: run_hmms.py From ViFi with GNU General Public License v3.0

6 votes

def prepare_unmapped_sequences(options):
  start_time = time.time() 
  counter = 0;
  bam = pysam.Samfile(options.bamfile, 'rb')
  fas = open('%s/temp/unmapped.fas' % options.directory, 'wb')
  map = open('%s/temp/unmapped.map' % options.directory, 'wb')
  
  for read in bam:
    if read.is_unmapped and not read.mate_is_unmapped and bam.references[read.rnext].find('chr') == 0:
      fas.write('>read_%d\n%s\n' % (counter, read.seq))
      map.write('%s\tread_%d\n' % (read.qname, counter))
      counter+=1
  fas.close()
  map.close()
  bam.close()
  end_time = time.time() - start_time;
  print "Prepared sequences for searching against HMMs: %fs" % end_time

Source File: mapping_test.py From iva with GNU General Public License v3.0

6 votes

def test_can_extend(self):
        '''Test can_extend'''
        expected = [
            (True, False),
            (False, False),
            (False, False),
            (False, True),
            (False, False),
            (False, False),
            (False, False),
            (False, False),
            (True, False),
            (False, False),
            (False, False),
            (False, False),
            (False, False),
            (False, False),

        ]

        sam_reader = pysam.Samfile(os.path.join(data_dir, 'mapping_test.smalt.out.bam'), "rb")
        i = 0
        for sam in sam_reader.fetch(until_eof=True):
            self.assertEqual(mapping._can_extend(sam, 190, min_clip=2), expected[i])
            i += 1

Source File: QC_stats_Final.py From ngs_pipeline with MIT License

5 votes

def _count_reads(bam):

    stats = { 'total_reads' : 0, 'mapped_reads' : 0 }

    with pysam.Samfile(bam, 'rb') as bam_file:
        for read in bam_file:
            stats['total_reads'] += 1
            if (not read.is_unmapped):
                stats['mapped_reads'] += 1

    stats['percent_mapped'] = (float(stats['mapped_reads']) / float(stats['total_reads'])) * 100
    stats['percent_mapped'] = "%2.2F" % stats['percent_mapped']

    return stats

Source File: sambamconverter.py From READemption with ISC License

5 votes

def _generate_empty_bam_file(self, sam_path, bam_path_prefix):
        samfile = pysam.Samfile(sam_path, "r")
        bamfile = pysam.Samfile(
            "%s.bam" % bam_path_prefix, "wb", header=samfile.header)
        bamfile.close()
        samfile.close()
        pysam.index("%s.bam" % bam_path_prefix)

Source File: hints_db.py From Comparative-Annotation-Toolkit with Apache License 2.0

5 votes

def bam_is_paired(bam_path, num_reads=20000, paired_cutoff=0.75):
    """
    Infers the paired-ness of a bam file.
    """
    sam = pysam.Samfile(bam_path)
    count = 0
    for rec in itertools.islice(sam, num_reads):
        if rec.is_paired:
            count += 1
    if tools.mathOps.format_ratio(count, num_reads) > 0.75:
        return True
    elif tools.mathOps.format_ratio(count, num_reads) < 1 - paired_cutoff:
        return False
    else:
        raise UserException("Unable to infer pairing from bamfile {}".format(bam_path))

Source File: coveragecalculator.py From READemption with ISC License

5 votes

def _open_bam_file(self, bam_file):
        return pysam.Samfile(bam_file)

Source File: misc.py From Comparative-Annotation-Toolkit with Apache License 2.0

5 votes

def is_bam(path):
    """Checks if a path is a BAMfile"""
    try:
        pysam.Samfile(path)
    except IOError:
        raise RuntimeError('Path {} does not exist'.format(path))
    except ValueError:
        return False
    return True

Source File: crossalignfilter.py From READemption with ISC License

5 votes

def write_crossmapping_free_bam(self):
        with pysam.Samfile(self._input_bam) as input_bam:
            with pysam.Samfile(self._output_bam, "wb",
                               header=input_bam.header) as output_bam:
                for alignment in input_bam.fetch():
                    if not alignment.qname in self._crossmapped_reads:
                        output_bam.write(alignment)
        pysam.index(self._output_bam)

Source File: crossalignfilter.py From READemption with ISC License

5 votes

def _check_replicon_existance(self):
        found_all = True
        with pysam.Samfile(self._input_bam) as bam:
            for replicon_ids in self._orgs_and_replicon_ids.values():
                for replicon_id in replicon_ids:
                    if not replicon_id in bam.references:
                        sys.stderr.write(
                            "\"%s\" not found in BAM header.\n" % replicon_id)
                        found_all = False
        if not found_all:
            raise RepliconIdNotInBam

Source File: bam_alignment.py From pybamview with MIT License

5 votes

def GetDefaultLocation(bamfiles):
    """
    Return default location to jump to if no location given.
    Look at the first read we see and go there.
    If no reads aligned, return 'error'

    Args:
        bamfiles (list): A list with paths to bamfiles

    Returns:
        position (string): A string with chromosome and position

    """
    default_chrom = None
    default_pos = None
    aligned = False
    position = 'error'
    for bam in bamfiles:
        try:
            br = pysam.Samfile(bam, "rb")
        except:
            sys.stderr.write("ERROR: Could not open %s. Is this a valid bam file?\n"%bam)
            continue
        # Peak at the first hundred reads
        read_count = 0
        while not (aligned or read_count > 100):
            try:
                aligned_read = br.next()
            except StopIteration:
                continue
            if not aligned_read.is_unmapped:
                default_chrom = br.getrname(aligned_read.tid)
                default_pos = str(aligned_read.pos)
                aligned = True
                position = ':'.join([default_chrom, default_pos])
                break
            else:
                read_count += 1

    return position

Source File: bioutilities.py From Haystack with GNU Affero General Public License v3.0

5 votes

def calculate_profile_matrix_bed_bam(bed_filename,sam_filename,window_size=5000, resolution=50, fragment_length=200, use_strand=False,return_coordinates=False):
    cs=Coordinate.bed_to_coordinates(bed_filename)
    cs=Coordinate.coordinates_of_intervals_around_center(cs, window_size)
    samfile = pysam.Samfile(sam_filename)
    n_bins=(window_size/resolution)+1
    
    profile_matrix=np.zeros((len(cs),n_bins))

    for idx_c,c in enumerate(cs):
        n_start=max(0,c.bpcenter-window_size/2)
        n_end=c.bpcenter+window_size/2

        for rd in samfile.fetch(c.chr_id, n_start,n_end):

            if rd.is_reverse:
                rd_st=rd.positions[-1]-fragment_length-1
                rd_end=rd.positions[-1]
            else:
                rd_st=rd.positions[0]
                rd_end=max(rd.positions[-1],rd.positions[1]+fragment_length)
             
            bin_idx_st=max(0,1+(rd_st-n_start)/resolution)
            bin_idx_en=min(n_bins,1+ (rd_end-n_start)/resolution)

            #print rd.positions[1],rd.positions[-1],rd_st, rd_end,rd_end-rd_st,rd_st-n_start,rd_end-n_start,bin_idx_st,bin_idx_en
            
            profile_matrix[idx_c,bin_idx_st:bin_idx_en]+=1
        
        if use_strand and c.strand == '-':
            profile_matrix[idx_c,:]=profile_matrix[idx_c,::-1]
            
    if return_coordinates:
        return profile_matrix,samfile.mapped,cs
    else:    
        return profile_matrix,samfile.mapped

Python pysam.Samfile() Examples