Python Examples of pysam.AlignedSegment

Source File: bamops.py From anvio with GNU General Public License v3.0

6 votes

def __init__(self, read):
        """Copy the properties of interest from a pysam read onto this object.

        Some pysam.AlignedSegment attributes have no __set__ method and are
        therefore read only. Since this class is designed to modify some of
        them, and to keep all attributes consistent with each other, every
        attribute of interest is redefined here as a plain attribute.

        Parameters
        ==========
        read : pysam.AlignedSegment
            The read to copy from. Its `cigartuples`, `query_sequence`,
            `reference_start` and `reference_end` are read, and
            `get_reference_sequence()` is called when the read has an MD tag.
        """

        self.cigartuples = np.array(read.cigartuples)
        # sequences are stored as arrays of ASCII codes (uint8), not as strings
        self.query_sequence = np.frombuffer(read.query_sequence.encode('ascii'), np.uint8)
        self.reference_start = read.reference_start
        self.reference_end = read.reference_end

        if read.has_tag('MD'):
            self.reference_sequence = np.frombuffer(read.get_reference_sequence().upper().encode('ascii'), np.uint8)
        else:
            # Without an MD tag, fall back to a placeholder of 'N's spanning
            # the aligned reference range. The dtype is given explicitly so
            # this branch yields uint8 like the branch above.
            self.reference_sequence = np.array(
                [ord('N')] * (self.reference_end - self.reference_start),
                dtype=np.uint8,
            )

        # See self.vectorize
        self.v = None

Source File: simulator.py From slamdunk with GNU Affero General Public License v3.0

6 votes

def printFastaEntry(sequence, name, index, conversions, readOutSAM, conversionRate):
    """Write one read as a tab-separated, unmapped SAM record.

    Despite the name, the record written is a SAM line, not FASTA.

    :param sequence: read bases; also determines the length of the
        placeholder quality string.
    :param name: read name prefix.
    :param index: read index, appended to the name and stored in the ID tag.
    :param conversions: conversion count, appended to the name and stored in
        the TC tag.
    :param readOutSAM: writable text file object receiving the record.
    :param conversionRate: conversion rate, stored in the CR tag.
    """
    fields = (
        name + "_" + str(index) + "_" + str(conversions),  # QNAME
        "4",                                               # FLAG (unmapped)
        "*",                                               # RNAME
        "0",                                               # POS
        "0",                                               # MAPQ
        "*",                                               # CIGAR
        "*",                                               # RNEXT
        "0",                                               # PNEXT
        "0",                                               # TLEN
        sequence,                                          # SEQ
        "F" * len(sequence),                               # QUAL (constant)
        "TC:i:" + str(conversions),
        "ID:i:" + str(index),
        # Optional fields are TAG:TYPE:VALUE; the colon after the type was
        # missing here, which produced a malformed tag.
        "CR:f:" + str(conversionRate),
    )
    print(*fields, file=readOutSAM, sep="\t")

Source File: smolecule.py From medaka with Mozilla Public License 2.0

6 votes

def write_bam(fname, alignments, header, bam=True):
    """Write alignments to a `.bam` (or `.sam`) file.

    The position of each group within `alignments` is used as the reference
    id of its records, and records within a group are written in order of
    reference start. Every record gets a mapping quality of 60.

    :param fname: output filename.
    :param alignments: a list of lists of `Alignment` tuples, one inner
        list per reference.
    :param header: bam header
    :param bam: write bam (and index it afterwards), else write sam

    """
    if bam:
        mode = 'wb'
    else:
        mode = 'w'

    with pysam.AlignmentFile(fname, mode, header=header) as out_file:
        for ref_index, group in enumerate(alignments):
            ordered = sorted(group, key=lambda x: x.rstart)
            for aln in ordered:
                record = pysam.AlignedSegment()
                record.reference_id = ref_index
                record.query_name = aln.qname
                record.query_sequence = aln.seq
                record.reference_start = aln.rstart
                record.cigarstring = aln.cigar
                record.flag = aln.flag
                record.mapping_quality = 60
                out_file.write(record)

    # only the binary format is indexed
    if bam:
        pysam.index(fname)

Source File: transcript.py From mikado with GNU Lesser General Public License v3.0

5 votes

def __initialize_with_bed12(self, transcript_row: BED12):

        """
        Private method to populate this transcript from a BED12 row.

        Copies chromosome, name/id, start, end, score and strand from the row,
        converts its blocks into exons (merging exons that touch), adds CDS
        segments when the row is coding, then finalizes the transcript.

        :param transcript_row: the BED12 row to initialise from.
        :type transcript_row: BED12
        :raises InvalidTranscript: if the row is a header line, or if its
            blocks overlap.
        :return:
        """

        if transcript_row.header is True:
            raise InvalidTranscript("I cannot initialise a valid transcript with a header (ie empty) BED line.")

        self.chrom = transcript_row.chrom
        self.name = self.id = transcript_row.name
        self.start, self.end = transcript_row.start, transcript_row.end
        self.score = transcript_row.score
        self.strand = transcript_row.strand
        # Block starts are offsets from the transcript start; shift them to
        # absolute coordinates. Ends are computed as inclusive (start + size - 1).
        exon_starts = np.array([_ + self.start for _ in transcript_row.block_starts])
        exon_ends = exon_starts + np.array(transcript_row.block_sizes) - 1
        # Distance from each exon's end to the next exon's start. With a single
        # exon both slices are empty, so the checks below are simply skipped.
        isizes = exon_starts[1:] - exon_ends[:-1]  # This functions also for monoexonic
        if np.where(isizes < 0)[0].size > 0:
            raise InvalidTranscript("Overlapping exons found for {}!".format(self.id))
        if np.where(isizes <= 1)[0].size > 0:
            self.logger.debug("Merging touching exons")
            # Keep only the boundaries whose distance is > 1: the first start and
            # the last end are always kept, every other start/end pair separated
            # by a distance <= 1 is dropped, which fuses the two exons.
            exon_starts = np.concatenate((np.array([exon_starts[0]]), exon_starts[np.where(isizes > 1)[0] + 1]))
            exon_ends = np.concatenate((exon_ends[np.where(isizes > 1)[0]], np.array([exon_ends[-1]])))

        self.add_exons(list(zip(list(exon_starts), list(exon_ends))))
        # NOTE: the getattr default is evaluated eagerly, so transcript_row.id is
        # always read here, even when the row has a "parent" attribute.
        self.parent = getattr(transcript_row, "parent", transcript_row.id)
        self.source = getattr(transcript_row, "source", None)
        # Now we have to calculate the CDS
        cds = []
        if transcript_row.coding is True:
            for exon in self.exons:
                # Clip each exon that intersects [thick_start, thick_end] to that interval.
                if exon[1] >= transcript_row.thick_start and exon[0] <= transcript_row.thick_end:
                    cds.append((int(max(exon[0], transcript_row.thick_start)),
                                int(min(exon[1], transcript_row.thick_end))))
            self.add_exons(cds, features="CDS")
        self.finalize()

Source File: transcript.py From mikado with GNU Lesser General Public License v3.0

5 votes

def __initialize_with_line(self, transcript_row):
        """
        Private method that initialises the transcript from an external row.

        A `str`/`bytes` row is parsed first, trying GFF, then GTF, then BED12;
        the first parser yielding a usable record wins. The resulting object
        is then handed to the initialiser matching its type.
        :param transcript_row: a `str`/`bytes` line, or an already parsed
            GffLine, GtfLine, BED12 or pysam.AlignedSegment.
        :raises TypeError: if the row (after parsing) is of none of the
            supported types.
        :return:
        """

        if isinstance(transcript_row, (str, bytes)):
            if isinstance(transcript_row, bytes):
                text = transcript_row.decode()
            else:
                text = transcript_row

            parsed = GffLine(text)
            recognised = (parsed.header is False and parsed.is_transcript is True
                          and parsed.id is not None)
            if not recognised:
                parsed = GtfLine(text)
                recognised = (parsed.header is False and parsed.is_transcript is True
                              and parsed.id is not None)
            if not recognised:
                parsed = BED12(text)
                recognised = parsed.header is False and parsed.name is not None

            # When no parser recognised the line, the (decoded) text is kept and
            # rejected by the type dispatch below.
            transcript_row = parsed if recognised else text

        if isinstance(transcript_row, (GffLine, GtfLine)):
            self.__initialize_with_gf(transcript_row)
            return
        if isinstance(transcript_row, BED12):
            self.__initialize_with_bed12(transcript_row)
            return
        if isinstance(transcript_row, pysam.AlignedSegment):
            self.__initialize_with_bam(transcript_row)
            return
        raise TypeError("Invalid data type: {0}".format(type(transcript_row)))

Source File: Opossum.py From Opossum with GNU General Public License v3.0

5 votes

def CreateReadObject(read, newseq, newqual, newcigar, startread, basetag=None) :
	"""Build a new pysam.AlignedSegment derived from `read`.

	The name, reference id and mapping quality come from `read`; the
	sequence, qualities, cigar and start position are the supplied new
	values. Mate information is cleared and only the RG tag is carried over.

	:param read: source read (pysam.AlignedSegment).
	:param newseq: query sequence for the new read.
	:param newqual: quality string for the new read.
	:param newcigar: cigar for the new read, assigned to `cigar`.
	:param startread: reference start position for the new read.
	:param basetag: unused here; kept for backward compatibility.
	:returns: the new pysam.AlignedSegment.
	"""

	a = pysam.AlignedSegment()
	a.query_name = read.query_name
	a.query_sequence = newseq
	a.query_qualities = pysam.qualitystring_to_array(newqual)
	a.cigar = newcigar
	a.reference_start = startread

	# If (Star) mapper has assigned a value of 255 to mapping quality,
	# change it to 50
	mapqual = read.mapping_quality
	a.mapping_quality = 50 if mapqual == 255 else mapqual

	a.reference_id = read.reference_id

	# Start from an empty tag set; if the read has an RG read group tag, keep it.
	# Checking for the tag explicitly (rather than a bare `except`) avoids
	# silently swallowing unrelated errors.
	a.tags = ()
	if read.has_tag('RG') :
		a.set_tag('RG', read.get_tag('RG'))

	# The new read carries no mate information
	a.next_reference_id = -1
	a.next_reference_start = -1
	a.template_length = 0
	a.flag = UpdateFlag(read.flag)

	return a


# Goes through the given cigar list and reports how many times cigarType is equal to 3
# Optional field is finalpos, which is cutoff position for counting the splits (relative to start pos)
# Default value for finalpos is something very large

Source File: test_rle.py From medaka with Mozilla Public License 2.0

5 votes

def test_unmapped_return_None(self):
        """An alignment flagged as unmapped compresses to `None`."""
        unmapped = pysam.AlignedSegment()
        unmapped.is_unmapped = True
        result = medaka.rle._compress_alignment(unmapped, None)
        self.assertEqual(None, result)

Source File: common.py From medaka with Mozilla Public License 2.0

5 votes

def initialise_alignment(
        query_name, reference_id, reference_start,
        query_sequence, cigarstring, flag, mapping_quality=60,
        query_qualities=None, tags=None):
    """Build a `pysam.AlignedSegment` from its individual fields.

    :param query_name: name of the query sequence
    :param reference_id: index to the reference name
    :param reference_start: 0-based index of first leftmost reference
        coordinate
    :param query_sequence: read sequence bases, including those soft clipped
    :param cigarstring: cigar string representing the alignment of query
        and reference
    :param flag: bitwise flag representing some properties of the alignment
        (see SAM format)
    :param mapping_quality: optional quality of the mapping or query to
        reference
    :param query_qualities: optional base qualities of the query, including
        soft-clipped ones!
    :param tags: optional mapping of tag name to tag value; each entry is
        set on the alignment.

    :returns: `pysam.AlignedSegment` object
    """
    tags = {} if tags is None else tags

    segment = pysam.AlignedSegment()
    segment.query_name = query_name
    segment.reference_id = reference_id
    segment.reference_start = reference_start
    segment.query_sequence = query_sequence
    segment.cigarstring = cigarstring
    segment.flag = flag
    segment.mapping_quality = mapping_quality
    # qualities are left untouched unless explicitly provided
    if query_qualities is not None:
        segment.query_qualities = query_qualities

    for name, value in tags.items():
        segment.set_tag(name, value)

    return segment

Source File: labels.py From medaka with Mozilla Public License 2.0

5 votes

def _alignment_to_pairs(self, aln):
        """Yield `(reference position, base)` for each aligned pair.

        The base is the upper-cased query base, or `'*'` where the pair has
        no query position.

        :param aln: `pysam.AlignedSegment`.
        """
        bases = aln.query_sequence
        for query_index, ref_index in aln.get_aligned_pairs():
            if query_index is None:
                symbol = '*'
            else:
                symbol = bases[query_index].upper()
            yield ref_index, symbol

Source File: labels.py From medaka with Mozilla Public License 2.0

5 votes

def _alignment_to_pairs(self, aln):
        """Generate aligned pairs from a `pysam.AlignedSegment`.

        Each item is `(rpos, base)`, where `base` is the upper-cased query
        base at that pair, or `'*'` when the pair has no query position.
        """
        query = aln.query_sequence
        for pair in aln.get_aligned_pairs():
            qpos, rpos = pair
            base = '*' if qpos is None else query[qpos].upper()
            yield rpos, base

Source File: labels.py From medaka with Mozilla Public License 2.0

5 votes

def encode(self, truth_alns):
        """Encode truth alignment(s) into their intermediate representation.

        In most cases the intermediate representation consists of integers.

        :param truth_alns: tuple of `pysam.AlignedSegment` s for each haplotype
            spanning the same genomic range.

        :returns: tuple(positions, encoded)

            - positions: numpy structured array with 'major'
              (reference position index) and 'minor'
              (trailing insertion index) fields.

            - encoded: nd.array of encoded labels

        .. note ::
            It is generally the case that the returned encoded labels must be
            padded with encoded gap labels when aligned to corresponding
            training feature data.

        """
        # First stage: per-position labels, a list of tuples with alleles,
        # e.g. ('A', ), ('A', 'C'), ('C', 3)
        positions, allele_labels = self._alignments_to_labels(truth_alns)

        # Second stage: labels converted to an array of integers
        return positions, self._labels_to_encoded_labels(allele_labels)

Source File: labels.py From medaka with Mozilla Public License 2.0

5 votes

def _alignment_to_pairs(self, aln):
        """Convert `pysam.AlignedSegment` to aligned pairs.

        :param aln: `pysam.AlignedSegment` to convert.
        """
        # NOTE(review): no statements follow the docstring in this excerpt, so
        # as shown the function returns None -- confirm against the full file.

Source File: labels.py From medaka with Mozilla Public License 2.0

5 votes

def __init__(self, alignment):
        """Wrap an `AlignedSegment` as a `TruthAlignment`.

        The alignment is kept so positions and labels can be obtained later.
        `start` and `end` begin as the alignment's reference bounds and might
        be moved afterwards.

        :param alignment: `pysam.AlignedSegment`.
        """
        self.aln = alignment
        # zero-based start; both bounds are read back from the stored alignment
        self.start, self.end = self.aln.reference_start, self.aln.reference_end
        self.is_kept = True
        self.logger = medaka.common.get_named_logger('TruthAlign')

Source File: find_indels.py From pomoxis with Mozilla Public License 2.0

5 votes

def get_trimmed_pairs(aln):
    """Yield aligned pairs restricted to the aligned part of `aln`.

    Leading pairs are skipped until the first pair with both a query and a
    reference position. Iteration stops at the first pair whose reference
    position equals `aln.reference_end` or whose query position equals
    `aln.query_alignment_end`.

    :param aln: `pysam.AlignedSegment` object
    :yields pairs:
    """
    in_alignment = False
    for qp, rp in aln.get_aligned_pairs():
        if not in_alignment:
            if rp is None or qp is None:
                # still in the leading run of half-defined pairs
                continue
            in_alignment = True
        if rp == aln.reference_end or qp == aln.query_alignment_end:
            return
        yield qp, rp

Source File: util.py From pomoxis with Mozilla Public License 2.0

5 votes

def get_trimmed_pairs(aln):
    """Yield the pairs of `aln` restricted to its aligned part.

    Leading pairs lacking a reference or a query position are dropped, and
    iteration stops at the first pair reaching `aln.reference_end` or
    `aln.query_alignment_end`.

    :param aln: `pysam.AlignedSegment` object
    :yields pairs:
    """

    def half_defined(pair):
        return pair.rpos is None or pair.qpos is None

    for pair in itertools.dropwhile(half_defined, get_pairs(aln)):
        reached_ref_end = pair.rpos == aln.reference_end
        if reached_ref_end or pair.qpos == aln.query_alignment_end:
            return
        yield pair

Source File: util.py From pomoxis with Mozilla Public License 2.0

5 votes

def get_pairs(aln):
    """Build a generator of `AlignPos` objects for an alignment.

    Query and reference bases are replaced by `'-'` where the corresponding
    position is missing from a pair.

    :param aln: `pysam.AlignedSegment` object.
    :returns: generator of `AlignPos` objects.
    """
    query_bases = aln.query_sequence

    def to_align_pos(qp, rp, rb):
        query_base = '-' if qp is None else query_bases[qp]
        ref_base = '-' if rp is None else rb
        return AlignPos(qpos=qp, qbase=query_base, rpos=rp, rbase=ref_base)

    # A generator expression keeps the original timing: the aligned pairs are
    # requested right away, the AlignPos objects are built lazily.
    return (
        to_align_pos(qp, rp, rb)
        for qp, rp, rb in aln.get_aligned_pairs(with_seq=True)
    )

Source File: hg19util.py From ViFi with GNU General Public License v3.0

5 votes

def __init__(self, line, start=-1, end=-1, strand=1,
        file_format='', bamfile=None, info=''):
        """Initialise from a pysam read, a text line, or explicit positions.

        :param line: a pysam.AlignedRead / pysam.AlignedSegment, or the value
            passed as first argument to `load_line` / `load_pos`.
        :param start: start position; -1 (default) means "parse `line` with
            `load_line`" unless `line` is a pysam read.
        :param end: end position; -1 (default) means "same as `start`".
        :param strand: strand passed to `load_pos`.
        :param file_format: stored on the instance and passed to `load_line`.
        :param bamfile: passed to `load_pysamread` for pysam reads.
        :param info: if non-empty, stored as `self.info` after loading.
        """
        self.info = ""
        self.file_format = file_format
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of the pysam read types.
        if isinstance(line, (pysam.AlignedRead, pysam.AlignedSegment)):
            self.load_pysamread(line, bamfile)
        elif start == -1:
            self.load_line(line, file_format)
        else:
            # a missing end collapses the interval onto its start
            self.load_pos(line, start, start if end == -1 else end, strand)
        # Applied last so an explicit `info` takes precedence over anything
        # assigned while loading; a falsy value (including None) is ignored.
        if info:
            self.info = info

Source File: bamops.py From anvio with GNU General Public License v3.0

5 votes

def get_blocks(self):
        """Mimic the get_blocks function from AlignedSegment.

        Returns a list of (start, end) reference intervals, one per run of
        operations that consume both read and reference, uninterrupted by an
        operation that consumes only one of the two.

        Notes
        =====
        - Takes roughly 200us
        """

        blocks = []
        start = self.reference_start
        run_length = 0

        for _, length, consumes_read, consumes_ref in iterate_cigartuples(self.cigartuples, constants.cigar_consumption):
            if consumes_read and consumes_ref:
                # extends the current block
                run_length += length
                continue

            if not consumes_read and not consumes_ref:
                # consumes neither: no effect on blocks
                continue

            # Exactly one of read/reference is consumed: the current block ends
            if run_length:
                blocks.append((start, start + run_length))

            start = start + run_length
            if consumes_ref:
                # reference-only operations also advance the next block start
                start = start + length
            run_length = 0

        if run_length:
            blocks.append((start, start + run_length))

        return blocks

Source File: SVIM_COLLECT.py From svim with GNU General Public License v3.0

5 votes

def bam_iterator(bam):
    """Returns an iterator for the given SAM/BAM file (must be query-sorted).

    In each call, the alignments of a single read are yielded as a 3-tuple:
    (list of primary pysam.AlignedSegment, list of supplementary
    pysam.AlignedSegment, list of secondary pysam.AlignedSegment).

    An alignment flagged as secondary is counted as secondary even if it is
    also flagged as supplementary. A file without alignments yields nothing;
    previously the unguarded `next()` raised StopIteration inside the
    generator, which surfaces as RuntimeError under PEP 479.
    """
    current_read_name = None
    current_prim, current_suppl, current_sec = [], [], []
    seen_any = False

    for aln in bam.fetch(until_eof=True):
        # A change of query name closes the group of the previous read
        if seen_any and aln.query_name != current_read_name:
            yield (current_prim, current_suppl, current_sec)
            current_prim, current_suppl, current_sec = [], [], []
        current_read_name = aln.query_name
        seen_any = True

        if aln.is_secondary:
            current_sec.append(aln)
        elif aln.is_supplementary:
            current_suppl.append(aln)
        else:
            current_prim.append(aln)

    # Flush the last group, unless there were no alignments at all
    if seen_any:
        yield (current_prim, current_suppl, current_sec)