Python pysam.TabixFile() Examples

The following are 14 code examples of pysam.TabixFile(), drawn from open-source projects. The source file and originating project are noted above each example. You may also want to check out all available functions/classes of the module pysam.
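Most of the examples below follow the same basic pattern: open a bgzip-compressed, tabix-indexed file with pysam.TabixFile() and pull records for a genomic region with fetch(). A minimal sketch of that pattern (the file name and region are placeholders):

import pysam

tbx = pysam.TabixFile("example.bed.gz")  # hypothetical bgzip-compressed BED with a .tbi index
for row in tbx.fetch("chr1", 1000, 2000):  # coordinates are 0-based, half-open
    print(row)  # each record is returned as a raw line by default
tbx.close()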
Example #1
Source File: BedGraphTrack.py    From pyGenomeTracks with GNU General Public License v3.0
def __init__(self, properties_dict):
        super(BedGraphTrack, self).__init__(properties_dict)
        self.load_file()

        self.tbx2 = None
        self.interval_tree2 = None

        if 'second_file' in self.properties['operation']:
            if self.properties['second_file'] is None:
                raise InputError("operation: {} requires to set the parameter"
                                 " second_file."
                                 "".format(self.properties['operation']))
            else:
                if self.properties['second_file'].endswith(".bgz"):
                    # from the tabix file it is not possible to know the
                    # global min and max
                    try:
                        self.tbx2 = pysam.TabixFile(self.properties['second_file'])
                    except IOError:
                        self.interval_tree2, __, __ = file_to_intervaltree(self.properties['second_file'])
                # load the file as an interval tree
                else:
                    self.interval_tree2, __, __ = file_to_intervaltree(self.properties['second_file']) 
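The IOError branch above fires when the .bgz file has no usable index. If you control the input, the index can be built up front with pysam.tabix_index(); a minimal sketch, assuming a hypothetical uncompressed bedgraph file scores.bedgraph:

import pysam

# Compresses scores.bedgraph with bgzip and writes a .tbi index next to it;
# returns the path of the compressed file.
indexed_path = pysam.tabix_index("scores.bedgraph", preset="bed", force=True)
tbx = pysam.TabixFile(indexed_path)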
Example #2
Source File: NFRCalling.py    From NucleoATAC with MIT License
def findNFRs(self):
        """find NFR regions"""
        region = np.ones(self.length())
        tbx = pysam.TabixFile(self.params.calls)
        nucs = []
        if self.chrom in tbx.contigs:
            for row in tbx.fetch(self.chrom, self.start, self.end, parser = pysam.asTuple()):
                nucs.append(int(row[1]))
        for j in range(1, len(nucs)):
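            # an NFR candidate spans the gap between flanking nucleosome calls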
            left = nucs[j-1] + 73
            right = nucs[j] - 72
            if right <= left:
                continue
            candidate = NFR(left, right, self)
            if candidate.min_upper < self.params.max_occ_upper and candidate.occ < self.params.max_occ:
                self.nfrs.append(candidate) 
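Passing parser=pysam.asTuple() makes fetch() yield tuple-like row proxies whose columns can be indexed directly (row[1] above is the start column). A minimal sketch against a hypothetical indexed BED file:

import pysam

tbx = pysam.TabixFile("calls.bed.gz")  # hypothetical nucleosome-call file
for row in tbx.fetch("chr1", 0, 100000, parser=pysam.asTuple()):
    chrom, start = row[0], int(row[1])  # all fields come back as strings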
Example #3
Source File: samtools_variants.py    From ariba with GNU General Public License v3.0
def _get_read_depths(cls, read_depths_file, sequence_name, position):
        '''Returns total read depth and depth of reads supporting alternative (if present)'''
        assert os.path.exists(read_depths_file)
        assert os.path.exists(read_depths_file + '.tbi')
        tbx = pysam.TabixFile(read_depths_file)
        try:
            rows = [x for x in tbx.fetch(sequence_name, position, position + 1)]
        except ValueError:
            return None

        if len(rows) > 1: # happens with indels: multiple lines for the same reference base
            test_rows = [x for x in rows if x.rstrip().split()[3] != '.']
            if len(test_rows) != 1:
                rows = [rows[-1]]
            else:
                rows = test_rows

        if len(rows) == 1:
            r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
            bases = ref_base if alt_base == '.' else ref_base + ',' + alt_base
            return bases, int(ref_counts), alt_counts
        else:
            return None 
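The fetch(sequence_name, position, position + 1) call works because pysam regions are 0-based and half-open, so the interval covers exactly one reference position. A minimal sketch with a hypothetical depths file:

import pysam

tbx = pysam.TabixFile("depths.tsv.gz")  # hypothetical tabix-indexed table
rows = list(tbx.fetch("contig1", 99, 100))  # [99, 100) = the 100th base only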
Example #4
Source File: bedtrack.py    From genomeview with MIT License
def fetch_from_tabix(path, chrom, start, end):
    import pysam

    bed = pysam.TabixFile(path)

    chrom = match_chrom_format(chrom, bed.contigs)
    for locus in bed.fetch(chrom, start, end):
        locus = locus.split()
        yield tx_from_bedfields(locus) 
Example #5
Source File: feature.py    From igv-reports with MIT License
def __init__(self, path):

        tabix = None
        if path.endswith(".gz"):
            # Might be a tabix file
            try:
                tabix = pysam.TabixFile(path)
            except Exception:
                tabix = None

        if tabix:
            self.reader = _Tabix(tabix)
        else:
            self.reader = _NonIndexed(path) 
Example #6
Source File: BedGraphTrack.py    From pyGenomeTracks with GNU General Public License v3.0
def load_file(self):
        self.tbx = None
        # try to load a tabix file if available
        if self.properties['file'].endswith(".bgz"):
            # from the tabix file it is not possible to know the
            # global min and max
            try:
                self.tbx = pysam.TabixFile(self.properties['file'])
            except IOError:
                self.interval_tree, __, __ = file_to_intervaltree(self.properties['file'])
        # load the file as an interval tree
        else:
            self.interval_tree, __, __ = file_to_intervaltree(self.properties['file'])
        self.num_fields = None 
Example #7
Source File: test_clinvar_match.py    From CharGer with GNU General Public License v3.0
def clinvar_tabix(test_root):
    return TabixFile(
        str(
            test_root.joinpath("examples/annotations/clinvar_chrom_22_only.b37.tsv.gz")
        ),
        encoding="utf8",
    ) 
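This function reads like a pytest fixture whose @pytest.fixture decorator was dropped during extraction (that is an assumption). A hedged usage sketch, with an invented test name and region:

def test_clinvar_chr22(clinvar_tabix):  # hypothetical test using the fixture
    rows = list(clinvar_tabix.fetch("22", 16000000, 16100000))  # invented region
    assert isinstance(rows, list)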
Example #8
Source File: call_readclouds.py    From grocsvs with MIT License
def load_fragments(options, sample, dataset, chrom=None, start=None, end=None, usecols=None, 
                   min_reads_per_frag=1):
    if start is not None:
        if start < 0:
            raise Exception("start coord is negative: {}:{}-{}".format(chrom, start, end))
    if end is not None:
        if start >= end:
            raise Exception("end coord is before start: {}:{}-{}".format(chrom, start, end))

    readclouds_path = os.path.join(
        options.results_dir,
        "CombineReadcloudsStep",
        "readclouds.{}.{}.tsv.gz".format(sample.name, dataset.id))

    tabix = pysam.TabixFile(readclouds_path)
    
    if chrom is not None and chrom not in tabix.contigs:
        #print("MISSING:", chrom)
        return pandas.DataFrame(columns="chrom start_pos end_pos bc num_reads obs_len hap".split())
    
    if usecols is not None and "num_reads" not in usecols:
        usecols.append("num_reads")
        
    s = io.StringIO("\n".join(tabix.fetch(chrom, start, end)))
    readclouds = pandas.read_table(s, header=None, names=Readcloud._fields, usecols=usecols)
    readclouds["chrom"] = readclouds["chrom"].astype("string")
    
    if min_reads_per_frag > 0:
        readclouds = readclouds.loc[readclouds["num_reads"]>min_reads_per_frag]

    return readclouds 
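The fetch-into-pandas round trip above condenses to the following sketch in modern Python; the path and region are placeholders, and the column names mirror the Readcloud fields used above:

import io
import pandas
import pysam

tabix = pysam.TabixFile("readclouds.tsv.gz")  # hypothetical indexed TSV
buf = io.StringIO("\n".join(tabix.fetch("chr1", 0, 1000000)))
df = pandas.read_csv(buf, sep="\t", header=None,
                     names=["chrom", "start_pos", "end_pos", "bc", "num_reads", "obs_len", "hap"])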
Example #9
Source File: file_utils.py    From pheweb with GNU Affero General Public License v3.0
def IndexedVariantFileReader(phenocode):
    filepath = common_filepaths['pheno_gz'](phenocode)
    with read_gzip(filepath) as f:
        reader = csv.reader(f, dialect='pheweb-internal-dialect')
        fields = next(reader)
    if fields[0].startswith('#'): # previous version of PheWeb commented the header line
        fields[0] = fields[0][1:]
    for field in fields:
        assert field in conf.parse.per_variant_fields or field in conf.parse.per_assoc_fields, field
    colidxs = {field: idx for idx, field in enumerate(fields)}
    with pysam.TabixFile(filepath, parser=None) as tabix_file:
        yield _ivfr(tabix_file, colidxs) 
Example #10
Source File: file_utils.py    From pheweb with GNU Affero General Public License v3.0
def context(self):
        with pysam.TabixFile(self._filepath, parser=None) as tabix_file:
            yield _mr(tabix_file, self._colidxs, self._colidxs_for_pheno, self._info_for_pheno) 
Example #11
Source File: samplot.py    From samplot with MIT License
def get_tabix_iter(chrm, start, end, datafile):
    """Gets an iterator from a tabix BED/GFF/GFF3 file

    Used to avoid chrX vs. X notation issues when extracting data from
    annotation files
    """
    tbx = pysam.TabixFile(datafile)
    itr = None
    try:
        itr = tbx.fetch(chrm, max(0, start - 1000), end + 1000)
    except ValueError:
        # try and account for chr/no chr prefix
        if chrm[:3] == "chr":
            chrm = chrm[3:]
        else:
            chrm = "chr" + chrm

        try:
            itr = tbx.fetch(chrm, max(0, start - 1000), end + 1000)
        except ValueError as e:
            print(
                "Warning: Could not fetch {}:{}-{} from {}".format(chrm, start, end, datafile),
                file=sys.stderr,
            )
            print(e)
    return itr
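A usage sketch for the helper above; the annotation file is a placeholder, and the iterator may be None when both chromosome spellings fail:

itr = get_tabix_iter("chr1", 100000, 200000, "annotations.bed.gz")  # hypothetical file
if itr is not None:
    for line in itr:
        fields = line.split("\t")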


Example #12
Source File: datasets.py    From grocsvs with MIT License
def validate(self):
        assert os.path.exists(self.bam), "missing bam file '{}' for sample '{}' and dataset '{}'".format(
                                         self.bam, self.sample.name, self.id)


    # @staticmethod
    # def from_longranger_dir(self, longranger_dir):
    #     fragments = os.path.join(longranger_dir,
    #         "PHASER_SVCALLER_CS/PHASER_SVCALLER/_REPORTER/"
    #         "REPORT_SINGLE_PARTITION/fork0/files/fragments.h5")

    #     bam = os.path.join(longranger_dir,
    #         "PHASER_SVCALLER_CS/PHASER_SVCALLER/ATTACH_PHASING/"
    #         "fork0/files/phased_possorted_bam.bam")

    #     phased_fragments = os.path.join(longranger_dir,
    #         "10XSARCOMAC1/PHASER_SVCALLER_CS/PHASER_SVCALLER/"
    #         "_SNPINDEL_PHASER/PHASE_SNPINDELS/fork0/files/"
    #         "fragment_phasing.tsv.gz")

    #     self.validate()

    #     return TenXDataset(bam, fragments, phased_fragments)

    # def load_phased_fragments(self, chrom=None, start=None, end=None):

    #     columns = ["chrom", "start_pos", "end_pos", "phase_set", "ps_start", 
    #                "ps_end", "bc", "h0", "h1", "hmix", "unkn"]

    #     try:
    #         tabix = pysam.TabixFile(self.phased_fragments)
    #         s = StringIO.StringIO("\n".join(tabix.fetch(chrom, start, end)))
    #         frags = pandas.read_table(s)
    #         frags.columns = columns
    #     except (IOError, ValueError):
    #         frags = pandas.DataFrame(columns=columns)
        
    #     return frags

    # def load_fragments(self, chrom=None, start=None, end=None):
    #     tabix = pysam.TabixFile()

        # try:
        #     fragments = utilities.read_data_frame(self.fragments)
        #     goodbcs = utilities.get_good_barcodes(fragments)
        #     fragments = fragments.loc[fragments["bc"].isin(goodbcs)]
        #     # fragments = fragments.loc[fragments["num_reads"]>5]
        #     if chrom is not None:
        #         fragments = fragments.loc[fragments["chrom"]==chrom]

        #     return fragments
        # except:
        #     logging.exception("Unable to load fragments from fragments file "
        #         "'{}'".format(self.fragments))
        #     raise 
Example #13
Source File: _ingest.py    From cooler with BSD 3-Clause "New" or "Revised" License
def __init__(
        self,
        filepath,
        chromsizes,
        bins,
        map=map,
        n_chunks=1,
        is_one_based=False,
        **kwargs
    ):
        try:
            import pysam
        except ImportError:
            raise ImportError("pysam is required to read tabix files")

        import dill
        import pickle

        dill.settings["protocol"] = pickle.HIGHEST_PROTOCOL

        self._map = map
        self.n_chunks = n_chunks
        self.is_one_based = bool(is_one_based)
        self.C2 = kwargs.pop("C2", 3)
        self.P2 = kwargs.pop("P2", 4)

        # all requested contigs will be placed in the output matrix
        self.gs = GenomeSegmentation(chromsizes, bins)

        # find available contigs in the contact list
        self.filepath = filepath
        self.n_records = None
        with pysam.TabixFile(filepath, "r", encoding="ascii") as f:
            try:
                self.file_contigs = [c.decode("ascii") for c in f.contigs]
            except AttributeError:
                self.file_contigs = f.contigs
            if not len(self.file_contigs):
                raise RuntimeError("No reference sequences found.")

        # warn about requested contigs not seen in the contact list
        for chrom in self.gs.contigs:
            if chrom not in self.file_contigs:
                warnings.warn(
                    "Did not find contig " + " '{}' in contact list file.".format(chrom)
                )

        warnings.warn(
            "NOTE: When using the Tabix aggregator, make sure the order of "
            "chromosomes in the provided chromsizes agrees with the chromosome "
            "ordering of read ends in the contact list file."
        ) 
Example #14
Source File: read_store.py    From ariba with GNU General Public License v3.0
def get_reads(self, cluster_name, out1, out2=None, fasta=False, log_fh=None, wanted_ids=None):
        total_reads = 0
        total_bases = 0

        if log_fh is not None:
            print('Getting reads for', cluster_name, 'from', self.outfile, file=log_fh)
        tabix_file = pysam.TabixFile(self.outfile)
        f_out1 = pyfastaq.utils.open_file_write(out1)
        if out2 is None:
            f_out2 = f_out1
        else:
            f_out2 = pyfastaq.utils.open_file_write(out2)

        for line in tabix_file.fetch(reference=cluster_name):
            cluster, number, seq, qual = line.rstrip().split()
            number = int(number)
            if wanted_ids is not None:
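                # map the second read of a pair (even number) back to the
                # odd number that identifies the pair in wanted_ids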
                new_number = number if number % 2 else number - 1
                if new_number not in wanted_ids:
                    continue

            if number % 2 == 0:
                if fasta:
                    print('>' + str(number - 1) + '/2', seq, sep='\n', file=f_out2)
                else:
                    print('@' + str(number - 1) + '/2', seq, '+', qual, sep='\n', file=f_out2)
            else:
                if fasta:
                    print('>' + str(number) + '/1', seq, sep='\n', file=f_out1)
                else:
                    print('@' + str(number) + '/1', seq, '+', qual, sep='\n', file=f_out1)

            total_reads += 1
            total_bases += len(qual)

        pyfastaq.utils.close(f_out1)
        if out2 is not None:
            pyfastaq.utils.close(f_out2)
        if log_fh is not None:
            print('Finished getting reads for', cluster_name, 'from', self.outfile, file=log_fh)

        return total_reads, total_bases