Python pysam.TabixFile() Examples

The following are 14 code examples of pysam.TabixFile(), drawn from open-source projects. The source file and originating project are noted above each example. You may also want to check out all available functions/classes of the module pysam.
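Most of the examples below follow the same basic pattern: open a bgzip-compressed, tabix-indexed file with pysam.TabixFile() and pull records for a genomic region with fetch(). A minimal sketch of that pattern (the file name and region are placeholders):

import pysam

tbx = pysam.TabixFile("example.bed.gz")  # hypothetical bgzip-compressed BED with a .tbi index
for row in tbx.fetch("chr1", 1000, 2000):  # coordinates are 0-based, half-open
    print(row)  # each record is returned as a raw line by default
tbx.close()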
Example #1
Source File: BedGraphTrack.py    From pyGenomeTracks with GNU General Public License v3.0
def __init__(self, properties_dict):
        super(BedGraphTrack, self).__init__(properties_dict)
        self.load_file()

        self.tbx2 = None
        self.interval_tree2 = None

        if 'second_file' in self.properties['operation']:
            if self.properties['second_file'] is None:
                raise InputError("operation: {} requires to set the parameter"
                                 " second_file."
                                 "".format(self.properties['operation']))
            else:
                if self.properties['second_file'].endswith(".bgz"):
                    # from the tabix file it is not possible to know the
                    # global min and max
                    try:
                        self.tbx2 = pysam.TabixFile(self.properties['second_file'])
                    except IOError:
                        self.interval_tree2, __, __ = file_to_intervaltree(self.properties['second_file'])
                # load the file as an interval tree
                else:
                    self.interval_tree2, __, __ = file_to_intervaltree(self.properties['second_file']) 
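The IOError branch above fires when the .bgz file has no usable index. If you control the input, the index can be built up front with pysam.tabix_index(); a minimal sketch, assuming a hypothetical uncompressed bedgraph file scores.bedgraph:

import pysam

# Compresses scores.bedgraph with bgzip and writes a .tbi index next to it;
# returns the path of the compressed file.
indexed_path = pysam.tabix_index("scores.bedgraph", preset="bed", force=True)
tbx = pysam.TabixFile(indexed_path)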
Example #2
Source File: NFRCalling.py    From NucleoATAC with MIT License
def findNFRs(self):
        """find NFR regions"""
        region = np.ones(self.length())
        tbx = pysam.TabixFile(self.params.calls)
        nucs = []
        if self.chrom in tbx.contigs:
            for row in tbx.fetch(self.chrom, self.start, self.end, parser = pysam.asTuple()):
                nucs.append(int(row[1]))
        for j in range(1, len(nucs)):
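            # an NFR candidate spans the gap between flanking nucleosome calls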
            left = nucs[j-1] + 73
            right = nucs[j] - 72
            if right <= left:
                continue
            candidate = NFR(left, right, self)
            if candidate.min_upper < self.params.max_occ_upper and candidate.occ < self.params.max_occ:
                self.nfrs.append(candidate) 
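Passing parser=pysam.asTuple() makes fetch() yield tuple-like row proxies whose columns can be indexed directly (row[1] above is the start column). A minimal sketch against a hypothetical indexed BED file:

import pysam

tbx = pysam.TabixFile("calls.bed.gz")  # hypothetical nucleosome-call file
for row in tbx.fetch("chr1", 0, 100000, parser=pysam.asTuple()):
    chrom, start = row[0], int(row[1])  # all fields come back as strings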
Example #3
Source File: samtools_variants.py    From ariba with GNU General Public License v3.0
def _get_read_depths(cls, read_depths_file, sequence_name, position):
        '''Returns total read depth and depth of reads supporting alternative (if present)'''
        assert os.path.exists(read_depths_file)
        assert os.path.exists(read_depths_file + '.tbi')
        tbx = pysam.TabixFile(read_depths_file)
        try:
            rows = [x for x in tbx.fetch(sequence_name, position, position + 1)]
        except ValueError:
            return None

        if len(rows) > 1: # happens with indels: multiple lines for the same reference base
            test_rows = [x for x in rows if x.rstrip().split()[3] != '.']
            if len(test_rows) != 1:
                rows = [rows[-1]]
            else:
                rows = test_rows

        if len(rows) == 1:
            r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
            bases = ref_base if alt_base == '.' else ref_base + ',' + alt_base
            return bases, int(ref_counts), alt_counts
        else:
            return None 
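The fetch(sequence_name, position, position + 1) call works because pysam regions are 0-based and half-open, so the interval covers exactly one reference position. A minimal sketch with a hypothetical depths file:

import pysam

tbx = pysam.TabixFile("depths.tsv.gz")  # hypothetical tabix-indexed table
rows = list(tbx.fetch("contig1", 99, 100))  # [99, 100) = the 100th base only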
Example #4
Source File: bedtrack.py    From genomeview with MIT License
def fetch_from_tabix(path, chrom, start, end):
    import pysam

    bed = pysam.TabixFile(path)

    chrom = match_chrom_format(chrom, bed.contigs)
    for locus in bed.fetch(chrom, start, end):
        locus = locus.split()
        yield tx_from_bedfields(locus) 
Example #5
Source File: feature.py    From igv-reports with MIT License
def __init__(self, path):

        tabix = None
        if path.endswith(".gz"):
            # Might be a tabix file
            try:
                tabix = pysam.TabixFile(path)
            except Exception:
                tabix = None

        if tabix:
            self.reader = _Tabix(tabix)
        else:
            self.reader = _NonIndexed(path) 
Example #6
Source File: BedGraphTrack.py    From pyGenomeTracks with GNU General Public License v3.0
def load_file(self):
        self.tbx = None
        # try to load a tabix file if available
        if self.properties['file'].endswith(".bgz"):
            # from the tabix file it is not possible to know the
            # global min and max
            try:
                self.tbx = pysam.TabixFile(self.properties['file'])
            except IOError:
                self.interval_tree, __, __ = file_to_intervaltree(self.properties['file'])
        # load the file as an interval tree
        else:
            self.interval_tree, __, __ = file_to_intervaltree(self.properties['file'])
        self.num_fields = None 
Example #7
Source File: test_clinvar_match.py    From CharGer with GNU General Public License v3.0
def clinvar_tabix(test_root):
    return TabixFile(
        str(
            test_root.joinpath("examples/annotations/clinvar_chrom_22_only.b37.tsv.gz")
        ),
        encoding="utf8",
    ) 
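This function reads like a pytest fixture whose @pytest.fixture decorator was dropped during extraction (that is an assumption). A hedged usage sketch, with an invented test name and region:

def test_clinvar_chr22(clinvar_tabix):  # hypothetical test using the fixture
    rows = list(clinvar_tabix.fetch("22", 16000000, 16100000))  # invented region
    assert isinstance(rows, list)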
Example #8
Source File: call_readclouds.py    From grocsvs with MIT License
def load_fragments(options, sample, dataset, chrom=None, start=None, end=None, usecols=None, 
                   min_reads_per_frag=1):
    if start is not None:
        if start < 0:
            raise Exception("start coord is negative: {}:{}-{}".format(chrom, start, end))
    if end is not None:
        if start >= end:
            raise Exception("end coord is before start: {}:{}-{}".format(chrom, start, end))

    readclouds_path = os.path.join(
        options.results_dir,
        "CombineReadcloudsStep",
        "readclouds.{}.{}.tsv.gz".format(sample.name, dataset.id))

    tabix = pysam.TabixFile(readclouds_path)
    
    if chrom is not None and chrom not in tabix.contigs:
        #print("MISSING:", chrom)
        return pandas.DataFrame(columns="chrom start_pos end_pos bc num_reads obs_len hap".split())
    
    if usecols is not None and "num_reads" not in usecols:
        usecols.append("num_reads")
        
    s = io.StringIO("\n".join(tabix.fetch(chrom, start, end)))
    readclouds = pandas.read_table(s, header=None, names=Readcloud._fields, usecols=usecols)
    readclouds["chrom"] = readclouds["chrom"].astype("string")
    
    if min_reads_per_frag > 0:
        readclouds = readclouds.loc[readclouds["num_reads"]>min_reads_per_frag]

    return readclouds 
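The fetch-into-pandas round trip above condenses to the following sketch in modern Python; the path and region are placeholders, and the column names mirror the Readcloud fields used above:

import io
import pandas
import pysam

tabix = pysam.TabixFile("readclouds.tsv.gz")  # hypothetical indexed TSV
buf = io.StringIO("\n".join(tabix.fetch("chr1", 0, 1000000)))
df = pandas.read_csv(buf, sep="\t", header=None,
                     names=["chrom", "start_pos", "end_pos", "bc", "num_reads", "obs_len", "hap"])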
Example #9
Source File: file_utils.py    From pheweb with GNU Affero General Public License v3.0
def IndexedVariantFileReader(phenocode):
    filepath = common_filepaths['pheno_gz'](phenocode)
    with read_gzip(filepath) as f:
        reader = csv.reader(f, dialect='pheweb-internal-dialect')
        fields = next(reader)
    if fields[0].startswith('#'): # previous version of PheWeb commented the header line
        fields[0] = fields[0][1:]
    for field in fields:
        assert field in conf.parse.per_variant_fields or field in conf.parse.per_assoc_fields, field
    colidxs = {field: idx for idx, field in enumerate(fields)}
    with pysam.TabixFile(filepath, parser=None) as tabix_file:
        yield _ivfr(tabix_file, colidxs) 
Example #10
Source File: file_utils.py    From pheweb with GNU Affero General Public License v3.0
def context(self):
        with pysam.TabixFile(self._filepath, parser=None) as tabix_file:
            yield _mr(tabix_file, self._colidxs, self._colidxs_for_pheno, self._info_for_pheno) 
Example #11
Source File: samplot.py    From samplot with MIT License
def get_tabix_iter(chrm, start, end, datafile):
    """Gets an iterator from a tabix BED/GFF/GFF3 file

    Used to avoid chrX vs. X notation issues when extracting data from
    annotation files
    """
    tbx = pysam.TabixFile(datafile)
    itr = None
    try:
        itr = tbx.fetch(chrm, max(0, start - 1000), end + 1000)
    except ValueError:
        # try and account for chr/no chr prefix
        if chrm[:3] == "chr":
            chrm = chrm[3:]
        else:
            chrm = "chr" + chrm

        try:
            itr = tbx.fetch(chrm, max(0, start - 1000), end + 1000)
        except ValueError as e:
            print(
                "Warning: Could not fetch {}:{}-{} from {}".format(chrm, start, end, datafile),
                file=sys.stderr,
            )
            print(e)
    return itr
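A usage sketch for the helper above; the annotation file is a placeholder, and the iterator may be None when both chromosome spellings fail:

itr = get_tabix_iter("chr1", 100000, 200000, "annotations.bed.gz")  # hypothetical file
if itr is not None:
    for line in itr:
        fields = line.split("\t")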


Example #12
Source File: datasets.py    From grocsvs with MIT License
def validate(self):
        assert os.path.exists(self.bam), "missing bam file '{}' for sample '{}' and dataset '{}'".format(
                                         self.bam, self.sample.name, self.id)


    # @staticmethod
    # def from_longranger_dir(self, longranger_dir):
    #     fragments = os.path.join(longranger_dir,
    #         "PHASER_SVCALLER_CS/PHASER_SVCALLER/_REPORTER/"
    #         "REPORT_SINGLE_PARTITION/fork0/files/fragments.h5")

    #     bam = os.path.join(longranger_dir,
    #         "PHASER_SVCALLER_CS/PHASER_SVCALLER/ATTACH_PHASING/"
    #         "fork0/files/phased_possorted_bam.bam")

    #     phased_fragments = os.path.join(longranger_dir,
    #         "10XSARCOMAC1/PHASER_SVCALLER_CS/PHASER_SVCALLER/"
    #         "_SNPINDEL_PHASER/PHASE_SNPINDELS/fork0/files/"
    #         "fragment_phasing.tsv.gz")

    #     self.validate()

    #     return TenXDataset(bam, fragments, phased_fragments)

    # def load_phased_fragments(self, chrom=None, start=None, end=None):

    #     columns = ["chrom", "start_pos", "end_pos", "phase_set", "ps_start", 
    #                "ps_end", "bc", "h0", "h1", "hmix", "unkn"]

    #     try:
    #         tabix = pysam.TabixFile(self.phased_fragments)
    #         s = StringIO.StringIO("\n".join(tabix.fetch(chrom, start, end)))
    #         frags = pandas.read_table(s)
    #         frags.columns = columns
    #     except (IOError, ValueError):
    #         frags = pandas.DataFrame(columns=columns)
        
    #     return frags

    # def load_fragments(self, chrom=None, start=None, end=None):
    #     tabix = pysam.TabixFile()

        # try:
        #     fragments = utilities.read_data_frame(self.fragments)
        #     goodbcs = utilities.get_good_barcodes(fragments)
        #     fragments = fragments.loc[fragments["bc"].isin(goodbcs)]
        #     # fragments = fragments.loc[fragments["num_reads"]>5]
        #     if chrom is not None:
        #         fragments = fragments.loc[fragments["chrom"]==chrom]

        #     return fragments
        # except:
        #     logging.exception("Unable to load fragments from fragments file "
        #         "'{}'".format(self.fragments))
        #     raise 
Example #13
Source File: _ingest.py    From cooler with BSD 3-Clause "New" or "Revised" License
def __init__(
        self,
        filepath,
        chromsizes,
        bins,
        map=map,
        n_chunks=1,
        is_one_based=False,
        **kwargs
    ):
        try:
            import pysam
        except ImportError:
            raise ImportError("pysam is required to read tabix files")

        import dill
        import pickle

        dill.settings["protocol"] = pickle.HIGHEST_PROTOCOL

        self._map = map
        self.n_chunks = n_chunks
        self.is_one_based = bool(is_one_based)
        self.C2 = kwargs.pop("C2", 3)
        self.P2 = kwargs.pop("P2", 4)

        # all requested contigs will be placed in the output matrix
        self.gs = GenomeSegmentation(chromsizes, bins)

        # find available contigs in the contact list
        self.filepath = filepath
        self.n_records = None
        with pysam.TabixFile(filepath, "r", encoding="ascii") as f:
            try:
                self.file_contigs = [c.decode("ascii") for c in f.contigs]
            except AttributeError:
                self.file_contigs = f.contigs
            if not len(self.file_contigs):
                raise RuntimeError("No reference sequences found.")

        # warn about requested contigs not seen in the contact list
        for chrom in self.gs.contigs:
            if chrom not in self.file_contigs:
                warnings.warn(
                    "Did not find contig " + " '{}' in contact list file.".format(chrom)
                )

        warnings.warn(
            "NOTE: When using the Tabix aggregator, make sure the order of "
            "chromosomes in the provided chromsizes agrees with the chromosome "
            "ordering of read ends in the contact list file."
        ) 
Example #14
Source File: read_store.py    From ariba with GNU General Public License v3.0
def get_reads(self, cluster_name, out1, out2=None, fasta=False, log_fh=None, wanted_ids=None):
        total_reads = 0
        total_bases = 0

        if log_fh is not None:
            print('Getting reads for', cluster_name, 'from', self.outfile, file=log_fh)
        tabix_file = pysam.TabixFile(self.outfile)
        f_out1 = pyfastaq.utils.open_file_write(out1)
        if out2 is None:
            f_out2 = f_out1
        else:
            f_out2 = pyfastaq.utils.open_file_write(out2)

        for line in tabix_file.fetch(reference=cluster_name):
            cluster, number, seq, qual = line.rstrip().split()
            number = int(number)
            if wanted_ids is not None:
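                # map the second read of a pair (even number) back to the
                # odd number that identifies the pair in wanted_ids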
                new_number = number if number % 2 else number - 1
                if new_number not in wanted_ids:
                    continue

            if number % 2 == 0:
                if fasta:
                    print('>' + str(number - 1) + '/2', seq, sep='\n', file=f_out2)
                else:
                    print('@' + str(number - 1) + '/2', seq, '+', qual, sep='\n', file=f_out2)
            else:
                if fasta:
                    print('>' + str(number) + '/1', seq, sep='\n', file=f_out1)
                else:
                    print('@' + str(number) + '/1', seq, '+', qual, sep='\n', file=f_out1)

            total_reads += 1
            total_bases += len(qual)

        pyfastaq.utils.close(f_out1)
        if out2 is not None:
            pyfastaq.utils.close(f_out2)
        if log_fh is not None:
            print('Finished getting reads for', cluster_name, 'from', self.outfile, file=log_fh)

        return total_reads, total_bases