Python pysam.AlignedSegment() Examples
The following are 30
code examples of pysam.AlignedSegment().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pysam
, or try the search function
.
Example #1
Source File: bamops.py From anvio with GNU General Public License v3.0 | 6 votes |
def __init__(self, read): """Class for manipulating reads Parameters ========== read : pysam.AlignedSegment """ # redefine all properties of interest explicitly from pysam.AlignedSegment object as # attributes of this class. The reason for this is that some of the AlignedSegment # attributes have no __set__ methods, so are read only. Since this class is designed to # modify some of these attributes, and since we want to maintain consistency across # attributes, all attributes of interest are redefined here self.cigartuples = np.array(read.cigartuples) self.query_sequence = np.frombuffer(read.query_sequence.encode('ascii'), np.uint8) self.reference_start = read.reference_start self.reference_end = read.reference_end if read.has_tag('MD'): self.reference_sequence = np.frombuffer(read.get_reference_sequence().upper().encode('ascii'), np.uint8) else: self.reference_sequence = np.array([ord('N')] * (self.reference_end - self.reference_start)) # See self.vectorize self.v = None
Example #2
Source File: simulator.py From slamdunk with GNU Affero General Public License v3.0 | 6 votes |
def printFastaEntry(sequence, name, index, conversions, readOutSAM, conversionRate): #a = pysam.AlignedSegment() print(name + "_" + str(index) + "_" + str(conversions), "4", "*", "0", "0", "*", "*", "0", "0", sequence, "F" * len(sequence), "TC:i:" + str(conversions), "ID:i:" + str(index), "CR:f" + str(conversionRate), file=readOutSAM, sep="\t")
Example #3
Source File: smolecule.py From medaka with Mozilla Public License 2.0 | 6 votes |
def write_bam(fname, alignments, header, bam=True): """Write a `.bam` file for a set of alignments. :param fname: output filename. :param alignments: a list of `Alignment` tuples. :param header: bam header :param bam: write bam, else sam """ mode = 'wb' if bam else 'w' with pysam.AlignmentFile(fname, mode, header=header) as fh: for ref_id, subreads in enumerate(alignments): for aln in sorted(subreads, key=lambda x: x.rstart): a = pysam.AlignedSegment() a.reference_id = ref_id a.query_name = aln.qname a.query_sequence = aln.seq a.reference_start = aln.rstart a.cigarstring = aln.cigar a.flag = aln.flag a.mapping_quality = 60 fh.write(a) if mode == 'wb': pysam.index(fname)
Example #4
Source File: transcript.py From mikado with GNU Lesser General Public License v3.0 | 5 votes |
def __initialize_with_bed12(self, transcript_row: BED12): """ :param transcript_row: :type transcript_row: pysam.AlignedSegment :return: """ if transcript_row.header is True: raise InvalidTranscript("I cannot initialise a valid transcript with a header (ie empty) BED line.") self.chrom = transcript_row.chrom self.name = self.id = transcript_row.name self.start, self.end = transcript_row.start, transcript_row.end self.score = transcript_row.score self.strand = transcript_row.strand exon_starts = np.array([_ + self.start for _ in transcript_row.block_starts]) exon_ends = exon_starts + np.array(transcript_row.block_sizes) - 1 isizes = exon_starts[1:] - exon_ends[:-1] # This functions also for monoexonic if np.where(isizes < 0)[0].size > 0: raise InvalidTranscript("Overlapping exons found for {}!".format(self.id)) if np.where(isizes <= 1)[0].size > 0: self.logger.debug("Merging touching exons") exon_starts = np.concatenate((np.array([exon_starts[0]]), exon_starts[np.where(isizes > 1)[0] + 1])) exon_ends = np.concatenate((exon_ends[np.where(isizes > 1)[0]], np.array([exon_ends[-1]]))) self.add_exons(list(zip(list(exon_starts), list(exon_ends)))) self.parent = getattr(transcript_row, "parent", transcript_row.id) self.source = getattr(transcript_row, "source", None) # Now we have to calculate the CDS cds = [] if transcript_row.coding is True: for exon in self.exons: if exon[1] >= transcript_row.thick_start and exon[0] <= transcript_row.thick_end: cds.append((int(max(exon[0], transcript_row.thick_start)), int(min(exon[1], transcript_row.thick_end)))) self.add_exons(cds, features="CDS") self.finalize()
Example #5
Source File: transcript.py From mikado with GNU Lesser General Public License v3.0 | 5 votes |
def __initialize_with_line(self, transcript_row): """ Private method to copy the necessary attributes from an external GTF/GFF3 row. :param transcript_row: :return: """ if isinstance(transcript_row, (str, bytes)): if isinstance(transcript_row, bytes): transcript_row = transcript_row.decode() _ = GffLine(transcript_row) if _.header is False and _.is_transcript is True and _.id is not None: transcript_row = _ else: _ = GtfLine(transcript_row) if _.header is False and _.is_transcript is True and _.id is not None: transcript_row = _ else: _ = BED12(transcript_row) if _.header is False and _.name is not None: transcript_row = _ if isinstance(transcript_row, (GffLine, GtfLine)): self.__initialize_with_gf(transcript_row) elif isinstance(transcript_row, BED12): self.__initialize_with_bed12(transcript_row) elif isinstance(transcript_row, pysam.AlignedSegment): self.__initialize_with_bam(transcript_row) else: raise TypeError("Invalid data type: {0}".format(type(transcript_row)))
Example #6
Source File: Opossum.py From Opossum with GNU General Public License v3.0 | 5 votes |
def CreateReadObject(read, newseq, newqual, newcigar, startread, basetag=[]) : a = pysam.AlignedSegment() a.query_name = read.query_name a.query_sequence = newseq a.query_qualities = pysam.qualitystring_to_array(newqual) a.cigar = newcigar a.reference_start = startread # If (Star) mapper has assigned a value of 255 to mapping quality, # change it to 50 mapqual = read.mapping_quality if mapqual == 255 : a.mapping_quality = 50 else : a.mapping_quality = mapqual a.reference_id = read.reference_id # If read has RG read group tag, keep it try : r_RG = read.get_tag('RG') a.tags = () a.set_tag('RG', r_RG) except : a.tags = () a.next_reference_id = -1 a.next_reference_start = -1 a.template_length = 0 a.flag = UpdateFlag(read.flag) return a # Goes through the given cigar list and reports how many times cigarType is equal to 3 # Optional field is finalpos, which is cutoff position for counting the splits (relative to start pos) # Default value for finalpos is something very large
Example #7
Source File: test_rle.py From medaka with Mozilla Public License 2.0 | 5 votes |
def test_unmapped_return_None(self): """Unmapped or secondary reads are skipped""" expected = None alignment = pysam.AlignedSegment() alignment.is_unmapped = True got = medaka.rle._compress_alignment(alignment, None) self.assertEqual(expected, got)
Example #8
Source File: common.py From medaka with Mozilla Public License 2.0 | 5 votes |
def initialise_alignment( query_name, reference_id, reference_start, query_sequence, cigarstring, flag, mapping_quality=60, query_qualities=None, tags=None): """Create a `Pysam.AlignedSegment` object. :param query_name: name of the query sequence :param reference_id: index to the reference name :param reference_start: 0-based index of first leftmost reference coordinate :param query_sequence: read sequence bases, including those soft clipped :param cigarstring: cigar string representing the alignment of query and reference :param flag: bitwise flag representing some properties of the alignment (see SAM format) :param mapping_quality: optional quality of the mapping or query to reference :param query_qualities: optional base qualities of the query, including soft-clipped ones! :returns: `pysam.AlignedSegment` object """ if tags is None: tags = dict() a = pysam.AlignedSegment() a.query_name = query_name a.reference_id = reference_id a.reference_start = reference_start a.query_sequence = query_sequence a.cigarstring = cigarstring a.flag = flag a.mapping_quality = mapping_quality if query_qualities is not None: a.query_qualities = query_qualities for tag_name, tag_value in tags.items(): a.set_tag(tag_name, tag_value) return a
Example #9
Source File: labels.py From medaka with Mozilla Public License 2.0 | 5 votes |
def _alignment_to_pairs(self, aln): """Convert `pysam.AlignedSegment` to aligned pairs.""" seq = aln.query_sequence for qpos, rpos in aln.get_aligned_pairs(): yield rpos, seq[qpos].upper() if qpos is not None else '*'
Example #10
Source File: labels.py From medaka with Mozilla Public License 2.0 | 5 votes |
def _alignment_to_pairs(self, aln): """Convert `pysam.AlignedSegment` to aligned pairs.""" seq = aln.query_sequence for qpos, rpos in aln.get_aligned_pairs(): yield rpos, seq[qpos].upper() if qpos is not None else '*'
Example #11
Source File: labels.py From medaka with Mozilla Public License 2.0 | 5 votes |
def encode(self, truth_alns): """Convert truth alignment(s) to array of intermediate representation. In most cases the intermediate representation consists of integers. :param truth_alns: tuple of `pysam.AlignedSegment` s for each haplotype spanning the same genomic range. :returns: tuple(positions, training_vectors) - positions: numpy structured array with 'major' (reference position index) and 'minor' (trailing insertion index) fields. - encoded: nd.array of encoded labels .. note :: It is generally the case that the returned encoded labels must be padded with encoded gap labels when aligned to corresponding training feature data. """ # Labels is a list of tuples with alleles ('A', ), ('A', 'C'), ('C', 3) positions, labels = self._alignments_to_labels(truth_alns) # Encoded is an array of integers encoded = self._labels_to_encoded_labels(labels) return positions, encoded
Example #12
Source File: labels.py From medaka with Mozilla Public License 2.0 | 5 votes |
def _alignment_to_pairs(self, aln): """Convert `pysam.AlignedSegment` to aligned pairs."""
Example #13
Source File: labels.py From medaka with Mozilla Public License 2.0 | 5 votes |
def __init__(self, alignment): """Create a `TruthAlignment` list from an `AlignedSegment`. :param alignment: `pysam.AlignedSegment`. """ self.aln = alignment # so we can get positions and labels later # initialise start and end (which might be moved) self.start = self.aln.reference_start # zero-based self.end = self.aln.reference_end self.is_kept = True self.logger = medaka.common.get_named_logger('TruthAlign')
Example #14
Source File: find_indels.py From pomoxis with Mozilla Public License 2.0 | 5 votes |
def get_trimmed_pairs(aln): """Trim aligned pairs to the alignment. :param aln: `pysam.AlignedSegment` object :yields pairs: """ def pos_is_none(x): return x[1] is None or x[0] is None for qp, rp in itertools.dropwhile(pos_is_none, aln.get_aligned_pairs()): if (rp == aln.reference_end or qp == aln.query_alignment_end): break yield qp, rp
Example #15
Source File: util.py From pomoxis with Mozilla Public License 2.0 | 5 votes |
def get_trimmed_pairs(aln): """Trim aligned pairs to the alignment. :param aln: `pysam.AlignedSegment` object :yields pairs: """ pairs = get_pairs(aln) for pair in itertools.dropwhile(lambda x: x.rpos is None or x.qpos is None, pairs): if (pair.rpos == aln.reference_end or pair.qpos == aln.query_alignment_end): break yield pair
Example #16
Source File: util.py From pomoxis with Mozilla Public License 2.0 | 5 votes |
def get_pairs(aln): """Return generator of pairs. :param aln: `pysam.AlignedSegment` object. :returns: generator of `AlignPos` objects. """ seq = aln.query_sequence pairs = (AlignPos(qpos=qp, qbase=seq[qp] if qp is not None else '-', rpos=rp, rbase=rb if rp is not None else '-' ) for qp, rp, rb in aln.get_aligned_pairs(with_seq=True) ) return pairs
Example #17
Source File: hg19util.py From ViFi with GNU General Public License v3.0 | 5 votes |
def __init__(self, line, start=-1, end=-1, strand=1, file_format='', bamfile=None, info=''): self.info = "" self.file_format = file_format if type(line) == pysam.AlignedRead or type(line) == pysam.AlignedSegment: self.load_pysamread(line, bamfile) elif start == -1: self.load_line(line, file_format) elif end == -1: self.load_pos(line, start, start, strand) else: self.load_pos(line, start, end, strand) if len(info) > 0: self.info = info
Example #18
Source File: bamops.py From anvio with GNU General Public License v3.0 | 5 votes |
def get_blocks(self): """Mimic the get_blocks function from AlignedSegment. Notes ===== - Takes roughly 200us """ blocks = [] block_start = self.reference_start block_length = 0 for _, length, consumes_read, consumes_ref in iterate_cigartuples(self.cigartuples, constants.cigar_consumption): if consumes_read and consumes_ref: block_length += length elif consumes_read and not consumes_ref: if block_length: blocks.append((block_start, block_start + block_length)) block_start = block_start + block_length block_length = 0 elif not consumes_read and consumes_ref: if block_length: blocks.append((block_start, block_start + block_length)) block_start = block_start + block_length + length block_length = 0 else: pass if block_length: blocks.append((block_start, block_start + block_length)) return blocks
Example #19
Source File: SVIM_COLLECT.py From svim with GNU General Public License v3.0 | 5 votes |
def bam_iterator(bam): """Returns an iterator for the given SAM/BAM file (must be query-sorted). In each call, the alignments of a single read are yielded as a 3-tuple: (list of primary pysam.AlignedSegment, list of supplementary pysam.AlignedSegment, list of secondary pysam.AlignedSegment).""" alignments = bam.fetch(until_eof=True) current_aln = next(alignments) current_read_name = current_aln.query_name current_prim = [] current_suppl = [] current_sec = [] if current_aln.is_secondary: current_sec.append(current_aln) elif current_aln.is_supplementary: current_suppl.append(current_aln) else: current_prim.append(current_aln) while True: try: next_aln = next(alignments) next_read_name = next_aln.query_name if next_read_name != current_read_name: yield (current_prim, current_suppl, current_sec) current_read_name = next_read_name current_prim = [] current_suppl = [] current_sec = [] if next_aln.is_secondary: current_sec.append(next_aln) elif next_aln.is_supplementary: current_suppl.append(next_aln) else: current_prim.append(next_aln) except StopIteration: break yield (current_prim, current_suppl, current_sec)
Example #20
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def _bam_id_barcode(self, read: pysam.AlignedSegment) -> str: return f"{self._current_bamfile}"
Example #21
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def _extension_chr(self, read: pysam.AlignedSegment) -> str: return read.get_tag(self.umibarcode_str) + f"_{read.rname}:{read.reference_start // 10000000}" # catch the error
Example #22
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def _placeolder_umi(self, read: pysam.AlignedSegment) -> str: return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(vcy.PLACEHOLDER_UMI_LEN))
Example #23
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def _extension_Gene(self, read: pysam.AlignedSegment) -> str: try: return read.get_tag(self.umibarcode_str) + "_" + read.get_tag("GX") # catch the error except KeyError: return read.get_tag(self.umibarcode_str) + "_withoutGX"
Example #24
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def _extension_Nbp(self, read: pysam.AlignedSegment) -> str: return read.get_tag(self.umibarcode_str) + read.query_alignment_sequence[:self.umi_bp]
Example #25
Source File: counter.py From velocyto.py with BSD 2-Clause "Simplified" License | 5 votes |
def _no_extension(self, read: pysam.AlignedSegment) -> str: return read.get_tag(self.umibarcode_str)
Example #26
Source File: transcript_utils.py From TALON with MIT License | 5 votes |
def get_introns(sam_record: pysam.AlignedSegment, start, cigar): """ Locates the jI field in a list of SAM fields or computes it from the CIGAR string and start position if it isn't found. Note that positions refer to start and endpoints of introns, not exons, so adjustments are needed to avoid an off-by-one error if you want exons. Example jI strings: no introns: jI:B:i,-1 two introns: jI:B:i,167936516,167951806,167951862,167966628 Args: sam_record: a pysam AlignedSegment start: The start position of the transcript with respect to the forward strand cigar: SAM CIGAR string describing match operations to the reference genome Returns: intron_list: intron starts and ends in a list (sorted order) """ try: intron_list = sam_record.get_tag("jI").tolist() except KeyError: jI = compute_jI(start, cigar) intron_list = [int(x) for x in jI.split(",")[1:]] if intron_list[0] == -1: return [] else: return intron_list
Example #27
Source File: transcript_utils.py From TALON with MIT License | 5 votes |
def check_read_quality(sam_record: pysam.AlignedSegment, run_info): """ Process an individual sam read and return quality attributes. """ read_ID = sam_record.query_name flag = sam_record.flag cigar = sam_record.cigarstring seq = sam_record.query read_length = sam_record.query_length dataset = sam_record.get_tag('RG') # Only use uniquely mapped transcripts if flag not in [0, 16]: return [dataset, read_ID, 0, 0, read_length, "NA", "NA"] # Only use reads that are greater than or equal to length threshold if read_length < run_info.min_length: return [dataset, read_ID, 0, 1, read_length, "NA", "NA"] # Locate the MD field of the sam transcript try: md_tag = sam_record.get_tag('MD') except KeyError: raise ValueError("SAM transcript %s lacks an MD tag" % read_ID) # Only use reads where alignment coverage and identity exceed # cutoffs coverage = compute_alignment_coverage(cigar) identity = compute_alignment_identity(md_tag, seq) if coverage < run_info.min_coverage or \ identity < run_info.min_identity: return [dataset, read_ID, 0, 1, read_length, coverage, identity] # At this point, the read has passed the quality control return [dataset, read_ID, 1, 1, read_length, coverage, identity]
Example #28
Source File: talon.py From TALON with MIT License | 5 votes |
def parse_custom_SAM_tags(sam_record: pysam.AlignedSegment): """ Looks for the following tags in the read. Will be set to None if no tag is found fA: fraction As in the 10-bp interval following the alignment end lC: custom label (type = string) lA: custom allele label (type = string) tS: flag indicating start site support (type = string) tE: flag indicating end site support (typ = string) """ try: fraction_As = sam_record.get_tag("fA") except: fraction_As = None try: custom_label = sam_record.get_tag("lC") except: custom_label = None try: allelic_label = sam_record.get_tag("lA") except: allelic_label = None try: start_support = sam_record.get_tag("tS") except: start_support = None try: end_support = sam_record.get_tag("tE") except: end_support = None return fraction_As, custom_label, allelic_label, start_support, end_support
Example #29
Source File: talon_label_reads.py From TALON with MIT License | 5 votes |
def compute_transcript_end(transcript=pysam.AlignedSegment): """ Compute the position of the final transcript base relative to the genome, taking strand into account. Position is 1-based. """ strand = "-" if transcript.is_reverse else "+" if strand == '+': return transcript.reference_end if strand == '-': return transcript.reference_start + 1 # (make 1-based)
Example #30
Source File: fragments.py From sinto with MIT License | 4 votes |
def updateFragmentDict( fragments, segment, min_mapq, cellbarcode, readname_barcode, cells, max_dist ): """Update dictionary of ATAC fragments Takes a new aligned segment and adds information to the dictionary, returns a modified version of the dictionary Positions are 0-based Reads aligned to the + strand are shifted +4 bp Reads aligned to the - strand are shifted -5 bp Parameters ---------- fragments : dict A dictionary containing ATAC fragment information segment : pysam.AlignedSegment An aligned segment min_mapq : int Minimum MAPQ to retain fragment cellbarcode : str Tag used for cell barcode. Default is CB (used by cellranger) readname_barcode : regex A compiled regex for matching cell barcode in read name. If None, use the read tags. cells : list List of cells to retain. If None, retain all cells found. max_dist : int Maximum allowed distance between fragment start and end sites """ # because the cell barcode is not stored with each read pair (only one of the pair) # we need to look for each read separately rather than using the mate cigar / mate postion information if readname_barcode is not None: re_match = readname_barcode.match(segment.qname) cell_barcode = re_match.group() else: cell_barcode, _ = utils.scan_tags(segment.tags, cb=cellbarcode) if cells is not None and cell_barcode is not None: if cell_barcode not in cells: return fragments mapq = segment.mapping_quality if mapq < min_mapq: return fragments chromosome = segment.reference_name qname = segment.query_name rstart = segment.reference_start rend = segment.reference_end qstart = segment.query_alignment_start is_reverse = segment.is_reverse if (rend is None) or (rstart is None): return fragments # correct for soft clipping rstart = rstart + qstart # correct for 9 bp Tn5 shift if is_reverse: rend = rend - 5 else: rstart = rstart + 4 fragments = addToFragments( fragments, qname, chromosome, rstart, rend, cell_barcode, is_reverse, max_dist ) return fragments