Python pysam.FastaFile() Examples

The following are 30 code examples of pysam.FastaFile(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pysam , or try the search function .
Example #1
Source File: bindetect_functions.py    From TOBIAS with MIT License 6 votes vote down vote up
def get_gc_content(regions, fasta):
	""" Get GC content from regions in fasta """
	nuc_count = {"T":0, "t":0, "A":0, "a":0, "G":1, "g":1, "C":1, "c":1}

	gc = 0
	total = 0
	fasta_obj = pysam.FastaFile(fasta)
	for region in regions:
		seq = fasta_obj.fetch(region.chrom, region.start, region.end)
		gc += sum([nuc_count.get(nuc, 0.5) for nuc in seq])
		total += region.end - region.start
	fasta_obj.close()
	gc_content = gc / float(total)

	return(gc_content)


#---------------------------------------------------------------------------------------------------------#
#------------------------------------------- Main functions ----------------------------------------------#
#---------------------------------------------------------------------------------------------------------# 
Example #2
Source File: fasta.py    From igv-reports with MIT License 6 votes vote down vote up
def get_data(fasta_file,region=None):

    if None == region:

        with open(fasta_file,"r") as f:

            return(f.read())

    else :

        if isinstance(region,str):
            region = regions.parse_region(region)

        chr = region["chr"]
        start = region["start"] - 1
        end = region["end"]

        fasta = pysam.FastaFile(fasta_file)

        slice_seq = fasta.fetch(chr, start, end)

        fasta.close()

        return slice_seq 
Example #3
Source File: util.py    From pomoxis with Mozilla Public License 2.0 6 votes vote down vote up
def reverse_bed():
    """Convert bed-file coordinates to coordinates on the reverse strand."""
    parser = argparse.ArgumentParser(
        prog='reverse_bed',
        description='Convert bed-file coordinates to coordinates on the reverse strand.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('bed_in', help='Input bed file.')
    parser.add_argument('ref_fasta', help='Input reference fasta file.')
    parser.add_argument('bed_out', help='Output bed file.')
    args = parser.parse_args()

    fasta = pysam.FastaFile(args.ref_fasta)
    lengths = dict(zip(fasta.references, fasta.lengths))
    d = pd.read_csv(args.bed_in, sep='\t', names=['chrom', 'start', 'stop'])

    d['chrom_length'] = d['chrom'].map(lambda x: lengths[x])
    d['rc_stop'] = d['chrom_length'] - d['start']
    d['rc_start'] = d['chrom_length'] - d['stop']
    d['chrom_rc'] = d['chrom'] + '_rc'
    d[['chrom_rc', 'rc_start', 'rc_stop']].to_csv(args.bed_out, index=False, header=False, sep='\t') 
Example #4
Source File: backend.py    From genomelake with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def extract_fasta_to_file(fasta, output_dir, mode="bcolz", overwrite=False):
    assert mode in _array_writer

    makedirs(output_dir, exist_ok=overwrite)
    fasta_file = FastaFile(fasta)
    file_shapes = {}
    for chrom, size in zip(fasta_file.references, fasta_file.lengths):
        data = np.zeros((size, NUM_SEQ_CHARS), dtype=np.float32)
        seq = fasta_file.fetch(chrom)
        one_hot_encode_sequence(seq, data)
        file_shapes[chrom] = data.shape
        _array_writer[mode](data, os.path.join(output_dir, chrom))

    with open(os.path.join(output_dir, "metadata.json"), "w") as fp:
        json.dump(
            {
                "file_shapes": file_shapes,
                "type": "array_{}".format(mode),
                "source": fasta,
            },
            fp,
        ) 
Example #5
Source File: orf_test.py    From mikado with GNU Lesser General Public License v3.0 6 votes vote down vote up
def test_neg_in_serialise(self):

        line = "tr_c114_g1_i1.mrna1.89\tProdigal_v2.6.3\tCDS\t2\t205\t28.7\t-\t0\t\
ID=85_1;partial=11;start_type=Edge;rbs_motif=None;rbs_spacer=None;gc_cont=0.407;\
conf=99.86;score=28.71;cscore=27.10;sscore=1.61;rscore=0.00;uscore=0.00;tscore=1.61;"
        line = GFF.GffLine(line)
        self.assertFalse(line.header)
        self.assertIsNotNone(line.id)
        logger = create_default_logger("test_neg_in_serialise", "DEBUG")
        fasta = pkg_resources.resource_filename("Mikado.tests", "mikado_prepared.fasta")
        fai = pysam.FastaFile(fasta)
        bed = bed12.BED12(line,
                          logger=logger,
                          max_regression=0.1,
                          start_adjustment=True,
                          fasta_index=fai,
                          transcriptomic=True)
        self.assertFalse(bed.invalid, bed.invalid_reason) 
Example #6
Source File: test_system_calls.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")) 
Example #7
Source File: fasta_utils.py    From brie with Apache License 2.0 5 votes vote down vote up
def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file) 
Example #8
Source File: fasta_utils.py    From models with MIT License 5 votes vote down vote up
def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file) 
Example #9
Source File: SlamSeqFile.py    From slamdunk with GNU Affero General Public License v3.0 5 votes vote down vote up
def __init__(self, bamFile, referenceFile, snps):
        self._bamFile = pysam.AlignmentFile(bamFile, "rb")

        # Get version from BAM file
        self.bamVersion = "None"
        if('PG' in self._bamFile.header):
            for pg in self._bamFile.header['PG']:
                if pg['ID'] == "slamdunk":
                    self.bamVersion = pg['VN']

        self._referenceFile = pysam.FastaFile(referenceFile)
        self._snps = snps 
Example #10
Source File: checking.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def __setstate__(self, state):
        self.__dict__.update(state)
        create_queue_logger(self)
        self.fasta = pysam.FastaFile(self.__fasta) 
Example #11
Source File: bed12.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def __set_fasta_index(fasta_index):
        if isinstance(fasta_index, dict):
            # check that this is a bona fide dictionary ...
            assert isinstance(
                fasta_index[numpy.random.choice(fasta_index.keys(), 1)],
                Bio.SeqRecord.SeqRecord)
        elif fasta_index is not None:
            if isinstance(fasta_index, (str, bytes)):
                if isinstance(fasta_index, bytes):
                    fasta_index = fasta_index.decode()
                assert os.path.exists(fasta_index)
                fasta_index = pysam.FastaFile(fasta_index)
            else:
                assert isinstance(fasta_index, pysam.FastaFile), type(fasta_index)
        return fasta_index 
Example #12
Source File: test_modifications.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")) 
Example #13
Source File: test_modifications.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_basic_padding(self):

        logger = create_null_logger("test_basic_padding")
        logger.setLevel("INFO")
        template = self.reference.copy()
        template.id = "AT5G66600.3_exp"
        template.strip_cds()
        template.unfinalize()

        template.remove_exon((26575000, 26575410))  # First exon
        template.start = 26574650
        template.add_exon((26574970, 26575410))  # New exon, template at 5'
        template.add_exon((26574650, 26574820))  # New UTR exon

        template.remove_exon((26578519, 26578625))  # Last exon
        template.end = 26579700
        template.add_exon((26578519, 26578725))
        template.add_exon((26579325, 26579700))
        
        template.finalize()
        fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

        new5 = expand_transcript(self.reference, None, template, fai, logger)
        self.assertIn((26574970, 26575410), new5.exons)
        self.assertIn((26574650, 26574820), new5.exons)
        self.assertEqual(template.start, new5.start)
        self.assertEqual(self.reference.end, new5.end)

        new3 = expand_transcript(self.reference, template, None, fai, logger)
        self.assertIn((26578519, 26578725), new3.exons)
        self.assertIn((26579325, 26579700), new3.exons)
        self.assertEqual(self.reference.start, new3.start)
        self.assertEqual(template.end, new5.end)

        new53 = expand_transcript(self.reference, template, template, fai, logger)
        self.assertIn((26574970, 26575410), new53.exons)
        self.assertIn((26574650, 26574820), new53.exons)
        self.assertIn((26578519, 26578725), new53.exons)
        self.assertIn((26579325, 26579700), new53.exons)
        self.assertEqual(template.start, new53.start)
        self.assertEqual(template.end, new53.end) 
Example #14
Source File: test_system_calls.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")) 
Example #15
Source File: tcounter.py    From slamdunk with GNU Affero General Public License v3.0 5 votes vote down vote up
def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual, outputBAMPrefix, conversionThreshold, log):

    ref = pysam.FastaFile(referenceFile)

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, snps)

    samFile = pysam.AlignmentFile(bam, "rb")

    chromosomes = testFile.getChromosomes()

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    backgroundReadFile = pysam.AlignmentFile(backgroundReadFileName, "wb", template=samFile)
    tcReadFile = pysam.AlignmentFile(tcReadFileName, "wb", template=samFile)

    tcReadDict = dict()

    for chromosome in chromosomes:

        readIterator = testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold)

        for read in readIterator:
            if (read.isTcRead) :
                tcReadDict[read.name] = 0

    for read in samFile.fetch():
        if read.query_name in tcReadDict:
            tcReadFile.write(read)
        else:
            backgroundReadFile.write(read)

    backgroundReadFile.close()
    tcReadFile.close()

    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName) 
Example #16
Source File: test_system_calls.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")) 
Example #17
Source File: prepare_misc_test.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):

        cls.fasta = pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")
        cls.fai = pysam.FastaFile(cls.fasta) 
Example #18
Source File: locus_test.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")) 
Example #19
Source File: locus_test.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")) 
Example #20
Source File: test_transcript_checker.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def setUpClass(cls):

        # Prepare the genome
        cls.fasta = pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")
        # cls.fasta_temp = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".fa.gz")
        # cls.fasta_temp.write(pkg_resources.resource_stream("Mikado.tests", "chr5.fas.gz").read())
        cls.fasta = pysam.FastaFile(cls.fasta)
        cls._model = Transcript()
        cls._model.chrom = "Chr5"
        cls._model.start, cls._model.end, cls._model.strand, cls._model.id, cls._model.parent = (
            26584797, 26595528, "+", "c58_g1_i3.mrna1.19", "c58_g1_i3.path1.19"
        )
        cls._model.add_exons([(26584797, 26584879),
                              (26585220, 26585273),
                              (26585345, 26585889),
                              (26585982, 26586294),
                              (26586420, 26586524),
                              (26586638, 26586850),
                              (26586934, 26586996),
                              (26587084, 26587202),
                              (26587287, 26587345),
                              (26587427, 26587472),
                              (26595411, 26595528)])
        cls._model.finalize()
        cls._exons = [cls.fasta.fetch(cls._model.chrom, line[0] - 1,  line[1]) for line in sorted(cls._model.exons)]
        # We need the whole genomic fragment
        cls._model_fasta = cls.fasta.fetch(cls._model.chrom, cls._model.start - 1, cls._model.end) 
Example #21
Source File: blast_serialiser.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def __determine_sequences(self, query_seqs, target_seqs):

        """Private method to assign the sequence file variables
        if necessary.
        :param query_seqs:
        :param target_seqs:
        :return:
        """

        if isinstance(query_seqs, str):
            assert os.path.exists(query_seqs)
            self.query_seqs = pysam.FastaFile(query_seqs)
        elif query_seqs is None:
            self.query_seqs = None
        else:
            self.logger.warn("Query type: %s", type(query_seqs))
            # assert "SeqIO.index" in repr(query_seqs)
            self.query_seqs = query_seqs

        self.target_seqs = []
        for target in target_seqs:
            if not os.path.exists(target):
                raise ValueError("{} not found!".format(target))
            self.target_seqs.append(pysam.FastaFile(target))

        return 
Example #22
Source File: orf.py    From mikado with GNU Lesser General Public License v3.0 5 votes vote down vote up
def line_parser_func(handle, fai, send_queue):
    fai = pysam.FastaFile(fai)
    for num, line in enumerate(open(handle)):
        if line[0] == "#":
            send_queue.put((num, line, None))
        else:
            _f = line.split("\t")
            if _f[0] not in fai:
                seq = None
            else:
                seq = zlib.compress(fai[line.split("\t")[0]].encode(), 1)
            send_queue.put_nowait((num, line, seq))

    send_queue.put("EXIT") 
Example #23
Source File: fabgz.py    From biocommons.seqrepo with Apache License 2.0 5 votes vote down vote up
def close(self):
        if self._fh:
            self._fh.close()
            self._fh = None
            subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
            os.rename(self._basepath + ".gz", self.filename)

            # open file with FastaFile to create indexes, then make all read-only
            _fh = FastaFile(self.filename)
            _fh.close()
            os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

            _logger.info("{} written; added {} sequences".format(self.filename, len(self._added))) 
Example #24
Source File: fasta_utils.py    From models with MIT License 5 votes vote down vote up
def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file) 
Example #25
Source File: dataloader.py    From models with MIT License 5 votes vote down vote up
def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file) 
Example #26
Source File: fasta_utils.py    From models with MIT License 5 votes vote down vote up
def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file) 
Example #27
Source File: genomesource.py    From genomeview with MIT License 5 votes vote down vote up
def fasta(self):
        if self._fasta is None:
            # import pyfaidx
            # self._fasta = pyfaidx.Fasta(self.path, as_raw=True)
            import pysam
            self._fasta = pysam.FastaFile(self.path)
        return self._fasta 
Example #28
Source File: test_references.py    From ga4gh-server with Apache License 2.0 5 votes vote down vote up
def __init__(self, referenceSetId, fastaFile):
        super(ReferenceSetTest, self).__init__(referenceSetId, fastaFile)
        self._fastaFile = pysam.FastaFile(fastaFile) 
Example #29
Source File: references.py    From ga4gh-server with Apache License 2.0 5 votes vote down vote up
def openFile(self, dataFile):
        return pysam.FastaFile(dataFile) 
Example #30
Source File: extractors.py    From genomelake with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, datafile, use_strand=False, **kwargs):
        """Fasta file extractor
        
        NOTE: The extractor is not thread-save.
        If you with to use it with multiprocessing,
        create a new extractor object in each process.
        
        Args:
          datafile (str): path to the bigwig file
          use_strand (bool): if True, the extracted sequence
            is reverse complemented in case interval.strand == "-"
        """
        super(FastaExtractor, self).__init__(datafile, **kwargs)
        self.use_strand = use_strand
        self.fasta = FastaFile(self._datafile)