Python pysam.AlignedSegment() Examples

The following are 30 code examples of pysam.AlignedSegment(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pysam , or try the search function .
Example #1
Source File: bamops.py    From anvio with GNU General Public License v3.0 6 votes vote down vote up
def __init__(self, read):
        """Class for manipulating reads

        Parameters
        ==========
        read : pysam.AlignedSegment
        """

        # redefine all properties of interest explicitly from pysam.AlignedSegment object as
        # attributes of this class. The reason for this is that some of the AlignedSegment
        # attributes have no __set__ methods, so are read only. Since this class is designed to
        # modify some of these attributes, and since we want to maintain consistency across
        # attributes, all attributes of interest are redefined here
        self.cigartuples = np.array(read.cigartuples)
        self.query_sequence = np.frombuffer(read.query_sequence.encode('ascii'), np.uint8)
        self.reference_start = read.reference_start
        self.reference_end = read.reference_end

        if read.has_tag('MD'):
            self.reference_sequence = np.frombuffer(read.get_reference_sequence().upper().encode('ascii'), np.uint8)
        else:
            self.reference_sequence = np.array([ord('N')] * (self.reference_end - self.reference_start))

        # See self.vectorize
        self.v = None 
Example #2
Source File: simulator.py    From slamdunk with GNU Affero General Public License v3.0 6 votes vote down vote up
def printFastaEntry(sequence, name, index, conversions, readOutSAM, conversionRate):
    #a = pysam.AlignedSegment()
    print(name + "_" + str(index) + "_" + str(conversions),
          "4",
          "*",
          "0",
          "0",
          "*",
          "*",
          "0",
          "0",
          sequence,
          "F" * len(sequence),
          "TC:i:" + str(conversions),
          "ID:i:" + str(index),
          "CR:f" + str(conversionRate),
           file=readOutSAM, sep="\t") 
Example #3
Source File: smolecule.py    From medaka with Mozilla Public License 2.0 6 votes vote down vote up
def write_bam(fname, alignments, header, bam=True):
    """Write a `.bam` file for a set of alignments.

    :param fname: output filename.
    :param alignments: a list of `Alignment` tuples.
    :param header: bam header
    :param bam: write bam, else sam

    """
    mode = 'wb' if bam else 'w'
    with pysam.AlignmentFile(fname, mode, header=header) as fh:
        for ref_id, subreads in enumerate(alignments):
            for aln in sorted(subreads, key=lambda x: x.rstart):
                a = pysam.AlignedSegment()
                a.reference_id = ref_id
                a.query_name = aln.qname
                a.query_sequence = aln.seq
                a.reference_start = aln.rstart
                a.cigarstring = aln.cigar
                a.flag = aln.flag
                a.mapping_quality = 60
                fh.write(a)
    if mode == 'wb':
        pysam.index(fname) 
Example #4
Source File: transcript.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def __initialize_with_bed12(self, transcript_row: BED12):

        """
        :param transcript_row:
        :type transcript_row: pysam.AlignedSegment
        :return:
        """

        if transcript_row.header is True:
            raise InvalidTranscript("I cannot initialise a valid transcript with a header (ie empty) BED line.")

        self.chrom = transcript_row.chrom
        self.name = self.id = transcript_row.name
        self.start, self.end = transcript_row.start, transcript_row.end
        self.score = transcript_row.score
        self.strand = transcript_row.strand
        exon_starts = np.array([_ + self.start for _ in transcript_row.block_starts])
        exon_ends = exon_starts + np.array(transcript_row.block_sizes) - 1
        isizes = exon_starts[1:] - exon_ends[:-1]  # This functions also for monoexonic
        if np.where(isizes < 0)[0].size > 0:
            raise InvalidTranscript("Overlapping exons found for {}!".format(self.id))
        if np.where(isizes <= 1)[0].size > 0:
            self.logger.debug("Merging touching exons")
            exon_starts = np.concatenate((np.array([exon_starts[0]]), exon_starts[np.where(isizes > 1)[0] + 1]))
            exon_ends = np.concatenate((exon_ends[np.where(isizes > 1)[0]], np.array([exon_ends[-1]])))

        self.add_exons(list(zip(list(exon_starts), list(exon_ends))))
        self.parent = getattr(transcript_row, "parent", transcript_row.id)
        self.source = getattr(transcript_row, "source", None)
        # Now we have to calculate the CDS
        cds = []
        if transcript_row.coding is True:
            for exon in self.exons:
                if exon[1] >= transcript_row.thick_start and exon[0] <= transcript_row.thick_end:
                    cds.append((int(max(exon[0], transcript_row.thick_start)),
                                int(min(exon[1], transcript_row.thick_end))))
            self.add_exons(cds, features="CDS")
        self.finalize() 
Example #5
Source File: transcript.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def __initialize_with_line(self, transcript_row):
        """
        Private method to copy the necessary attributes from
        an external GTF/GFF3 row.
        :param transcript_row:
        :return:
        """

        if isinstance(transcript_row, (str, bytes)):
            if isinstance(transcript_row, bytes):
                transcript_row = transcript_row.decode()
            _ = GffLine(transcript_row)
            if _.header is False and _.is_transcript is True and _.id is not None:
                transcript_row = _
            else:
                _ = GtfLine(transcript_row)
                if _.header is False and _.is_transcript is True and _.id is not None:
                    transcript_row = _
                else:
                    _ = BED12(transcript_row)
                    if _.header is False and _.name is not None:
                        transcript_row = _

        if isinstance(transcript_row, (GffLine, GtfLine)):
            self.__initialize_with_gf(transcript_row)
        elif isinstance(transcript_row, BED12):
            self.__initialize_with_bed12(transcript_row)
        elif isinstance(transcript_row, pysam.AlignedSegment):
            self.__initialize_with_bam(transcript_row)
        else:
            raise TypeError("Invalid data type: {0}".format(type(transcript_row))) 
Example #6
Source File: Opossum.py    From Opossum with GNU General Public License v3.0 5 votes vote down vote up
def CreateReadObject(read, newseq, newqual, newcigar, startread, basetag=[]) :

	a = pysam.AlignedSegment()
	a.query_name = read.query_name
	a.query_sequence = newseq
	a.query_qualities = pysam.qualitystring_to_array(newqual)
	a.cigar = newcigar
	a.reference_start = startread

	# If (Star) mapper has assigned a value of 255 to mapping quality,
	# change it to 50
	mapqual = read.mapping_quality 
	if mapqual == 255 :
		a.mapping_quality = 50
	else :
		a.mapping_quality = mapqual

	a.reference_id = read.reference_id

	# If read has RG read group tag, keep it
	try :
		r_RG = read.get_tag('RG')
		a.tags = ()
		a.set_tag('RG', r_RG)
	except :
		a.tags = ()

	a.next_reference_id = -1
	a.next_reference_start = -1
	a.template_length = 0
	a.flag = UpdateFlag(read.flag)

	return a


# Goes through the given cigar list and reports how many times cigarType is equal to 3
# Optional field is finalpos, which is cutoff position for counting the splits (relative to start pos)
# Default value for finalpos is something very large 
Example #7
Source File: test_rle.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def test_unmapped_return_None(self):
        """Unmapped or secondary reads are skipped"""
        expected = None
        alignment = pysam.AlignedSegment()
        alignment.is_unmapped = True
        got = medaka.rle._compress_alignment(alignment, None)
        self.assertEqual(expected, got) 
Example #8
Source File: common.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def initialise_alignment(
        query_name, reference_id, reference_start,
        query_sequence, cigarstring, flag, mapping_quality=60,
        query_qualities=None, tags=None):
    """Create a `Pysam.AlignedSegment` object.

    :param query_name: name of the query sequence
    :param reference_id: index to the reference name
    :param reference_start: 0-based index of first leftmost reference
        coordinate
    :param query_sequence: read sequence bases, including those soft clipped
    :param cigarstring: cigar string representing the alignment of query
        and reference
    :param flag: bitwise flag representing some properties of the alignment
        (see SAM format)
    :param mapping_quality: optional quality of the mapping or query to
        reference
    :param query_qualities: optional base qualities of the query, including
        soft-clipped ones!

    :returns: `pysam.AlignedSegment` object
    """
    if tags is None:
        tags = dict()

    a = pysam.AlignedSegment()
    a.query_name = query_name
    a.reference_id = reference_id
    a.reference_start = reference_start
    a.query_sequence = query_sequence
    a.cigarstring = cigarstring
    a.flag = flag
    a.mapping_quality = mapping_quality
    if query_qualities is not None:
        a.query_qualities = query_qualities

    for tag_name, tag_value in tags.items():
        a.set_tag(tag_name, tag_value)

    return a 
Example #9
Source File: labels.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def _alignment_to_pairs(self, aln):
        """Convert `pysam.AlignedSegment` to aligned pairs."""
        seq = aln.query_sequence
        for qpos, rpos in aln.get_aligned_pairs():
            yield rpos, seq[qpos].upper() if qpos is not None else '*' 
Example #10
Source File: labels.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def _alignment_to_pairs(self, aln):
        """Convert `pysam.AlignedSegment` to aligned pairs."""
        seq = aln.query_sequence
        for qpos, rpos in aln.get_aligned_pairs():
            yield rpos, seq[qpos].upper() if qpos is not None else '*' 
Example #11
Source File: labels.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def encode(self, truth_alns):
        """Convert truth alignment(s) to array of intermediate representation.

        In most cases the intermediate representation consists of integers.

        :param truth_alns: tuple of `pysam.AlignedSegment` s for each haplotype
            spanning the same genomic range.

        :returns: tuple(positions, training_vectors)

            - positions: numpy structured array with 'major'
              (reference position index) and 'minor'
              (trailing insertion index) fields.

            - encoded: nd.array of encoded labels

        .. note ::
            It is generally the case that the returned encoded labels must be
            padded with encoded gap labels when aligned to corresponding
            training feature data.

        """
        # Labels is a list of tuples with alleles ('A', ), ('A', 'C'), ('C', 3)
        positions, labels = self._alignments_to_labels(truth_alns)

        # Encoded is an array of integers
        encoded = self._labels_to_encoded_labels(labels)

        return positions, encoded 
Example #12
Source File: labels.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def _alignment_to_pairs(self, aln):
        """Convert `pysam.AlignedSegment` to aligned pairs.""" 
Example #13
Source File: labels.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def __init__(self, alignment):
        """Create a `TruthAlignment` list from an `AlignedSegment`.

        :param alignment: `pysam.AlignedSegment`.
        """
        self.aln = alignment  # so we can get positions and labels later
        # initialise start and end (which might be moved)
        self.start = self.aln.reference_start  # zero-based
        self.end = self.aln.reference_end
        self.is_kept = True
        self.logger = medaka.common.get_named_logger('TruthAlign') 
Example #14
Source File: find_indels.py    From pomoxis with Mozilla Public License 2.0 5 votes vote down vote up
def get_trimmed_pairs(aln):
    """Trim aligned pairs to the alignment.

    :param aln: `pysam.AlignedSegment` object
    :yields pairs:
    """
    def pos_is_none(x):
        return x[1] is None or x[0] is None

    for qp, rp in itertools.dropwhile(pos_is_none, aln.get_aligned_pairs()):
        if (rp == aln.reference_end or qp == aln.query_alignment_end):
            break
        yield qp, rp 
Example #15
Source File: util.py    From pomoxis with Mozilla Public License 2.0 5 votes vote down vote up
def get_trimmed_pairs(aln):
    """Trim aligned pairs to the alignment.

    :param aln: `pysam.AlignedSegment` object
    :yields pairs:
    """

    pairs = get_pairs(aln)
    for pair in itertools.dropwhile(lambda x: x.rpos is None or x.qpos is None, pairs):
        if (pair.rpos == aln.reference_end or pair.qpos == aln.query_alignment_end):
            break
        yield pair 
Example #16
Source File: util.py    From pomoxis with Mozilla Public License 2.0 5 votes vote down vote up
def get_pairs(aln):
    """Return generator of pairs.

    :param aln: `pysam.AlignedSegment` object.
    :returns: generator of `AlignPos` objects.
    """
    seq = aln.query_sequence
    pairs = (AlignPos(qpos=qp,
                      qbase=seq[qp] if qp is not None else '-',
                      rpos=rp,
                      rbase=rb if rp is not None else '-'
                      )
             for qp, rp, rb in aln.get_aligned_pairs(with_seq=True)
             )
    return pairs 
Example #17
Source File: hg19util.py    From ViFi with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, line, start=-1, end=-1, strand=1,
        file_format='', bamfile=None, info=''):
        self.info = ""
        self.file_format = file_format
        if type(line) == pysam.AlignedRead or type(line) == pysam.AlignedSegment:
            self.load_pysamread(line, bamfile)
        elif start == -1:
            self.load_line(line, file_format)
        elif end == -1:
            self.load_pos(line, start, start, strand)
        else:
            self.load_pos(line, start, end, strand)
        if len(info) > 0:
            self.info = info 
Example #18
Source File: bamops.py    From anvio with GNU General Public License v3.0 5 votes vote down vote up
def get_blocks(self):
        """Mimic the get_blocks function from AlignedSegment.

        Notes
        =====
        - Takes roughly 200us
        """

        blocks = []
        block_start = self.reference_start
        block_length = 0

        for _, length, consumes_read, consumes_ref in iterate_cigartuples(self.cigartuples, constants.cigar_consumption):
            if consumes_read and consumes_ref:
                block_length += length

            elif consumes_read and not consumes_ref:
                if block_length:
                    blocks.append((block_start, block_start + block_length))

                block_start = block_start + block_length
                block_length = 0

            elif not consumes_read and consumes_ref:
                if block_length:
                    blocks.append((block_start, block_start + block_length))

                block_start = block_start + block_length + length
                block_length = 0

            else:
                pass

        if block_length:
            blocks.append((block_start, block_start + block_length))

        return blocks 
Example #19
Source File: SVIM_COLLECT.py    From svim with GNU General Public License v3.0 5 votes vote down vote up
def bam_iterator(bam):
    """Returns an iterator for the given SAM/BAM file (must be query-sorted).
    In each call, the alignments of a single read are yielded as a 3-tuple: (list of primary pysam.AlignedSegment, list of supplementary pysam.AlignedSegment, list of secondary pysam.AlignedSegment)."""
    alignments = bam.fetch(until_eof=True)
    current_aln = next(alignments)
    current_read_name = current_aln.query_name
    current_prim = []
    current_suppl = []
    current_sec = []
    if current_aln.is_secondary:
        current_sec.append(current_aln)
    elif current_aln.is_supplementary:
        current_suppl.append(current_aln)
    else:
        current_prim.append(current_aln)
    while True:
        try:
            next_aln = next(alignments)
            next_read_name = next_aln.query_name
            if next_read_name != current_read_name:
                yield (current_prim, current_suppl, current_sec)
                current_read_name = next_read_name
                current_prim = []
                current_suppl = []
                current_sec = []
            if next_aln.is_secondary:
                current_sec.append(next_aln)
            elif next_aln.is_supplementary:
                current_suppl.append(next_aln)
            else:
                current_prim.append(next_aln)
        except StopIteration:
            break
    yield (current_prim, current_suppl, current_sec) 
Example #20
Source File: counter.py    From velocyto.py with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _bam_id_barcode(self, read: pysam.AlignedSegment) -> str:
        return f"{self._current_bamfile}" 
Example #21
Source File: counter.py    From velocyto.py with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _extension_chr(self, read: pysam.AlignedSegment) -> str:
        return read.get_tag(self.umibarcode_str) + f"_{read.rname}:{read.reference_start // 10000000}"  # catch the error 
Example #22
Source File: counter.py    From velocyto.py with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _placeolder_umi(self, read: pysam.AlignedSegment) -> str:
        return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(vcy.PLACEHOLDER_UMI_LEN)) 
Example #23
Source File: counter.py    From velocyto.py with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _extension_Gene(self, read: pysam.AlignedSegment) -> str:
        try:
            return read.get_tag(self.umibarcode_str) + "_" + read.get_tag("GX")  # catch the error
        except KeyError:
            return read.get_tag(self.umibarcode_str) + "_withoutGX" 
Example #24
Source File: counter.py    From velocyto.py with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _extension_Nbp(self, read: pysam.AlignedSegment) -> str:
        return read.get_tag(self.umibarcode_str) + read.query_alignment_sequence[:self.umi_bp] 
Example #25
Source File: counter.py    From velocyto.py with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _no_extension(self, read: pysam.AlignedSegment) -> str:
        return read.get_tag(self.umibarcode_str) 
Example #26
Source File: transcript_utils.py    From TALON with MIT License 5 votes vote down vote up
def get_introns(sam_record: pysam.AlignedSegment, start, cigar):
    """ Locates the jI field in a list of SAM fields or computes
        it from the CIGAR string and start position if it isn't found.
        Note that positions refer to start and endpoints of introns, not exons,
        so adjustments are needed to avoid an off-by-one error if you want exons.

        Example jI strings:
            no introns: jI:B:i,-1
            two introns: jI:B:i,167936516,167951806,167951862,167966628
        Args:
            sam_record: a pysam AlignedSegment
            start: The start position of the transcript with respect to the
            forward strand
            cigar: SAM CIGAR string describing match operations to the reference
            genome
        Returns:
            intron_list: intron starts and ends in a list (sorted order)
    """
    try:
        intron_list = sam_record.get_tag("jI").tolist()
    except KeyError:
        jI = compute_jI(start, cigar)
        intron_list = [int(x) for x in jI.split(",")[1:]]

    if intron_list[0] == -1:
        return []
    else:
        return intron_list 
Example #27
Source File: transcript_utils.py    From TALON with MIT License 5 votes vote down vote up
def check_read_quality(sam_record: pysam.AlignedSegment, run_info):
    """ Process an individual sam read and return quality attributes. """
    read_ID = sam_record.query_name
    flag = sam_record.flag
    cigar = sam_record.cigarstring
    seq = sam_record.query
    read_length = sam_record.query_length
    dataset = sam_record.get_tag('RG')
 
    # Only use uniquely mapped transcripts
    if flag not in [0, 16]:
        return [dataset, read_ID, 0, 0, read_length, "NA", "NA"]

    # Only use reads that are greater than or equal to length threshold
    if read_length < run_info.min_length:
        return [dataset, read_ID, 0, 1, read_length, "NA", "NA"]

    # Locate the MD field of the sam transcript
    try:
        md_tag = sam_record.get_tag('MD')
    except KeyError:
        raise ValueError("SAM transcript %s lacks an MD tag" % read_ID)

    # Only use reads where alignment coverage and identity exceed
    # cutoffs
    coverage = compute_alignment_coverage(cigar)
    identity = compute_alignment_identity(md_tag, seq)

    if coverage < run_info.min_coverage or \
       identity < run_info.min_identity:
        return [dataset, read_ID, 0, 1, read_length, coverage, identity]

    # At this point, the read has passed the quality control
    return [dataset, read_ID, 1, 1, read_length, coverage, identity] 
Example #28
Source File: talon.py    From TALON with MIT License 5 votes vote down vote up
def parse_custom_SAM_tags(sam_record: pysam.AlignedSegment):
    """ Looks for the following tags in the read. Will be set to None if no tag
        is found 
            fA: fraction As in the 10-bp interval following the alignment end
            lC: custom label (type = string)
            lA: custom allele label (type = string)
            tS: flag indicating start site support (type = string)
            tE: flag indicating end site support (typ = string)
    """
    try:
        fraction_As = sam_record.get_tag("fA")
    except:
        fraction_As = None
    try:
        custom_label = sam_record.get_tag("lC")
    except:
        custom_label = None
    try:
        allelic_label = sam_record.get_tag("lA")
    except:
        allelic_label = None
    try:
        start_support = sam_record.get_tag("tS")
    except:
        start_support = None
    try:
        end_support = sam_record.get_tag("tE")
    except:
        end_support = None

    return fraction_As, custom_label, allelic_label, start_support, end_support 
Example #29
Source File: talon_label_reads.py    From TALON with MIT License 5 votes vote down vote up
def compute_transcript_end(transcript=pysam.AlignedSegment):
    """ Compute the position of the final transcript base relative to the genome,
        taking strand into account. Position is 1-based. """

    strand = "-" if transcript.is_reverse else "+"
    if strand == '+':
        return transcript.reference_end
    if strand == '-':
        return transcript.reference_start + 1 # (make 1-based) 
Example #30
Source File: fragments.py    From sinto with MIT License 4 votes vote down vote up
def updateFragmentDict(
    fragments, segment, min_mapq, cellbarcode, readname_barcode, cells, max_dist
):
    """Update dictionary of ATAC fragments
    Takes a new aligned segment and adds information to the dictionary,
    returns a modified version of the dictionary

    Positions are 0-based
    Reads aligned to the + strand are shifted +4 bp
    Reads aligned to the - strand are shifted -5 bp

    Parameters
    ----------
    fragments : dict
        A dictionary containing ATAC fragment information
    segment : pysam.AlignedSegment
        An aligned segment
    min_mapq : int
        Minimum MAPQ to retain fragment
    cellbarcode : str
       Tag used for cell barcode. Default is CB (used by cellranger)
    readname_barcode : regex
        A compiled regex for matching cell barcode in read name. If None,
        use the read tags.
    cells : list
        List of cells to retain. If None, retain all cells found.
    max_dist : int
        Maximum allowed distance between fragment start and end sites
    """
    # because the cell barcode is not stored with each read pair (only one of the pair)
    # we need to look for each read separately rather than using the mate cigar / mate postion information
    if readname_barcode is not None:
        re_match = readname_barcode.match(segment.qname)
        cell_barcode = re_match.group()
    else:
        cell_barcode, _ = utils.scan_tags(segment.tags, cb=cellbarcode)
    if cells is not None and cell_barcode is not None:
        if cell_barcode not in cells:
            return fragments
    mapq = segment.mapping_quality
    if mapq < min_mapq:
        return fragments
    chromosome = segment.reference_name
    qname = segment.query_name
    rstart = segment.reference_start
    rend = segment.reference_end
    qstart = segment.query_alignment_start
    is_reverse = segment.is_reverse
    if (rend is None) or (rstart is None):
        return fragments
    # correct for soft clipping
    rstart = rstart + qstart
    # correct for 9 bp Tn5 shift
    if is_reverse:
        rend = rend - 5
    else:
        rstart = rstart + 4
    fragments = addToFragments(
        fragments, qname, chromosome, rstart, rend, cell_barcode, is_reverse, max_dist
    )
    return fragments