Python Examples of pysam.FastaFile

Source File: bindetect_functions.py From TOBIAS with MIT License

6 votes

def get_gc_content(regions, fasta):
	""" Get GC content from regions in fasta """
	nuc_count = {"T":0, "t":0, "A":0, "a":0, "G":1, "g":1, "C":1, "c":1}

	gc = 0
	total = 0
	fasta_obj = pysam.FastaFile(fasta)
	for region in regions:
		seq = fasta_obj.fetch(region.chrom, region.start, region.end)
		gc += sum([nuc_count.get(nuc, 0.5) for nuc in seq])
		total += region.end - region.start
	fasta_obj.close()
	gc_content = gc / float(total)

	return(gc_content)


#---------------------------------------------------------------------------------------------------------#
#------------------------------------------- Main functions ----------------------------------------------#
#---------------------------------------------------------------------------------------------------------#

Source File: fasta.py From igv-reports with MIT License

6 votes

def get_data(fasta_file,region=None):

    if None == region:

        with open(fasta_file,"r") as f:

            return(f.read())

    else :

        if isinstance(region,str):
            region = regions.parse_region(region)

        chr = region["chr"]
        start = region["start"] - 1
        end = region["end"]

        fasta = pysam.FastaFile(fasta_file)

        slice_seq = fasta.fetch(chr, start, end)

        fasta.close()

        return slice_seq

Source File: util.py From pomoxis with Mozilla Public License 2.0

6 votes

def reverse_bed():
    """Convert bed-file coordinates to coordinates on the reverse strand."""
    parser = argparse.ArgumentParser(
        prog='reverse_bed',
        description='Convert bed-file coordinates to coordinates on the reverse strand.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('bed_in', help='Input bed file.')
    parser.add_argument('ref_fasta', help='Input reference fasta file.')
    parser.add_argument('bed_out', help='Output bed file.')
    args = parser.parse_args()

    fasta = pysam.FastaFile(args.ref_fasta)
    lengths = dict(zip(fasta.references, fasta.lengths))
    d = pd.read_csv(args.bed_in, sep='\t', names=['chrom', 'start', 'stop'])

    d['chrom_length'] = d['chrom'].map(lambda x: lengths[x])
    d['rc_stop'] = d['chrom_length'] - d['start']
    d['rc_start'] = d['chrom_length'] - d['stop']
    d['chrom_rc'] = d['chrom'] + '_rc'
    d[['chrom_rc', 'rc_start', 'rc_stop']].to_csv(args.bed_out, index=False, header=False, sep='\t')

Source File: backend.py From genomelake with BSD 3-Clause "New" or "Revised" License

6 votes

def extract_fasta_to_file(fasta, output_dir, mode="bcolz", overwrite=False):
    assert mode in _array_writer

    makedirs(output_dir, exist_ok=overwrite)
    fasta_file = FastaFile(fasta)
    file_shapes = {}
    for chrom, size in zip(fasta_file.references, fasta_file.lengths):
        data = np.zeros((size, NUM_SEQ_CHARS), dtype=np.float32)
        seq = fasta_file.fetch(chrom)
        one_hot_encode_sequence(seq, data)
        file_shapes[chrom] = data.shape
        _array_writer[mode](data, os.path.join(output_dir, chrom))

    with open(os.path.join(output_dir, "metadata.json"), "w") as fp:
        json.dump(
            {
                "file_shapes": file_shapes,
                "type": "array_{}".format(mode),
                "source": fasta,
            },
            fp,
        )

Source File: orf_test.py From mikado with GNU Lesser General Public License v3.0

6 votes

def test_neg_in_serialise(self):

        line = "tr_c114_g1_i1.mrna1.89\tProdigal_v2.6.3\tCDS\t2\t205\t28.7\t-\t0\t\
ID=85_1;partial=11;start_type=Edge;rbs_motif=None;rbs_spacer=None;gc_cont=0.407;\
conf=99.86;score=28.71;cscore=27.10;sscore=1.61;rscore=0.00;uscore=0.00;tscore=1.61;"
        line = GFF.GffLine(line)
        self.assertFalse(line.header)
        self.assertIsNotNone(line.id)
        logger = create_default_logger("test_neg_in_serialise", "DEBUG")
        fasta = pkg_resources.resource_filename("Mikado.tests", "mikado_prepared.fasta")
        fai = pysam.FastaFile(fasta)
        bed = bed12.BED12(line,
                          logger=logger,
                          max_regression=0.1,
                          start_adjustment=True,
                          fasta_index=fai,
                          transcriptomic=True)
        self.assertFalse(bed.invalid, bed.invalid_reason)

Source File: test_system_calls.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

Source File: fasta_utils.py From brie with Apache License 2.0

5 votes

def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file)

Source File: fasta_utils.py From models with MIT License

5 votes

def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file)

Source File: SlamSeqFile.py From slamdunk with GNU Affero General Public License v3.0

5 votes

def __init__(self, bamFile, referenceFile, snps):
        self._bamFile = pysam.AlignmentFile(bamFile, "rb")

        # Get version from BAM file
        self.bamVersion = "None"
        if('PG' in self._bamFile.header):
            for pg in self._bamFile.header['PG']:
                if pg['ID'] == "slamdunk":
                    self.bamVersion = pg['VN']

        self._referenceFile = pysam.FastaFile(referenceFile)
        self._snps = snps

Source File: checking.py From mikado with GNU Lesser General Public License v3.0

5 votes

def __setstate__(self, state):
        self.__dict__.update(state)
        create_queue_logger(self)
        self.fasta = pysam.FastaFile(self.__fasta)

Source File: bed12.py From mikado with GNU Lesser General Public License v3.0

5 votes

def __set_fasta_index(fasta_index):
        if isinstance(fasta_index, dict):
            # check that this is a bona fide dictionary ...
            assert isinstance(
                fasta_index[numpy.random.choice(fasta_index.keys(), 1)],
                Bio.SeqRecord.SeqRecord)
        elif fasta_index is not None:
            if isinstance(fasta_index, (str, bytes)):
                if isinstance(fasta_index, bytes):
                    fasta_index = fasta_index.decode()
                assert os.path.exists(fasta_index)
                fasta_index = pysam.FastaFile(fasta_index)
            else:
                assert isinstance(fasta_index, pysam.FastaFile), type(fasta_index)
        return fasta_index

Source File: test_modifications.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

Source File: test_modifications.py From mikado with GNU Lesser General Public License v3.0

5 votes

def test_basic_padding(self):

        logger = create_null_logger("test_basic_padding")
        logger.setLevel("INFO")
        template = self.reference.copy()
        template.id = "AT5G66600.3_exp"
        template.strip_cds()
        template.unfinalize()

        template.remove_exon((26575000, 26575410))  # First exon
        template.start = 26574650
        template.add_exon((26574970, 26575410))  # New exon, template at 5'
        template.add_exon((26574650, 26574820))  # New UTR exon

        template.remove_exon((26578519, 26578625))  # Last exon
        template.end = 26579700
        template.add_exon((26578519, 26578725))
        template.add_exon((26579325, 26579700))
        
        template.finalize()
        fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

        new5 = expand_transcript(self.reference, None, template, fai, logger)
        self.assertIn((26574970, 26575410), new5.exons)
        self.assertIn((26574650, 26574820), new5.exons)
        self.assertEqual(template.start, new5.start)
        self.assertEqual(self.reference.end, new5.end)

        new3 = expand_transcript(self.reference, template, None, fai, logger)
        self.assertIn((26578519, 26578725), new3.exons)
        self.assertIn((26579325, 26579700), new3.exons)
        self.assertEqual(self.reference.start, new3.start)
        self.assertEqual(template.end, new5.end)

        new53 = expand_transcript(self.reference, template, template, fai, logger)
        self.assertIn((26574970, 26575410), new53.exons)
        self.assertIn((26574650, 26574820), new53.exons)
        self.assertIn((26578519, 26578725), new53.exons)
        self.assertIn((26579325, 26579700), new53.exons)
        self.assertEqual(template.start, new53.start)
        self.assertEqual(template.end, new53.end)

Source File: test_system_calls.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

Source File: tcounter.py From slamdunk with GNU Affero General Public License v3.0

5 votes

def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual, outputBAMPrefix, conversionThreshold, log):

    ref = pysam.FastaFile(referenceFile)

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, snps)

    samFile = pysam.AlignmentFile(bam, "rb")

    chromosomes = testFile.getChromosomes()

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    backgroundReadFile = pysam.AlignmentFile(backgroundReadFileName, "wb", template=samFile)
    tcReadFile = pysam.AlignmentFile(tcReadFileName, "wb", template=samFile)

    tcReadDict = dict()

    for chromosome in chromosomes:

        readIterator = testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold)

        for read in readIterator:
            if (read.isTcRead) :
                tcReadDict[read.name] = 0

    for read in samFile.fetch():
        if read.query_name in tcReadDict:
            tcReadFile.write(read)
        else:
            backgroundReadFile.write(read)

    backgroundReadFile.close()
    tcReadFile.close()

    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName)

Source File: test_system_calls.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

Source File: prepare_misc_test.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):

        cls.fasta = pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")
        cls.fai = pysam.FastaFile(cls.fasta)

Source File: locus_test.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

Source File: locus_test.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):
        cls.fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz"))

Source File: test_transcript_checker.py From mikado with GNU Lesser General Public License v3.0

5 votes

def setUpClass(cls):

        # Prepare the genome
        cls.fasta = pkg_resources.resource_filename("Mikado.tests", "chr5.fas.gz")
        # cls.fasta_temp = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".fa.gz")
        # cls.fasta_temp.write(pkg_resources.resource_stream("Mikado.tests", "chr5.fas.gz").read())
        cls.fasta = pysam.FastaFile(cls.fasta)
        cls._model = Transcript()
        cls._model.chrom = "Chr5"
        cls._model.start, cls._model.end, cls._model.strand, cls._model.id, cls._model.parent = (
            26584797, 26595528, "+", "c58_g1_i3.mrna1.19", "c58_g1_i3.path1.19"
        )
        cls._model.add_exons([(26584797, 26584879),
                              (26585220, 26585273),
                              (26585345, 26585889),
                              (26585982, 26586294),
                              (26586420, 26586524),
                              (26586638, 26586850),
                              (26586934, 26586996),
                              (26587084, 26587202),
                              (26587287, 26587345),
                              (26587427, 26587472),
                              (26595411, 26595528)])
        cls._model.finalize()
        cls._exons = [cls.fasta.fetch(cls._model.chrom, line[0] - 1,  line[1]) for line in sorted(cls._model.exons)]
        # We need the whole genomic fragment
        cls._model_fasta = cls.fasta.fetch(cls._model.chrom, cls._model.start - 1, cls._model.end)

Source File: blast_serialiser.py From mikado with GNU Lesser General Public License v3.0

5 votes

def __determine_sequences(self, query_seqs, target_seqs):

        """Private method to assign the sequence file variables
        if necessary.
        :param query_seqs:
        :param target_seqs:
        :return:
        """

        if isinstance(query_seqs, str):
            assert os.path.exists(query_seqs)
            self.query_seqs = pysam.FastaFile(query_seqs)
        elif query_seqs is None:
            self.query_seqs = None
        else:
            self.logger.warn("Query type: %s", type(query_seqs))
            # assert "SeqIO.index" in repr(query_seqs)
            self.query_seqs = query_seqs

        self.target_seqs = []
        for target in target_seqs:
            if not os.path.exists(target):
                raise ValueError("{} not found!".format(target))
            self.target_seqs.append(pysam.FastaFile(target))

        return

Source File: orf.py From mikado with GNU Lesser General Public License v3.0

5 votes

def line_parser_func(handle, fai, send_queue):
    fai = pysam.FastaFile(fai)
    for num, line in enumerate(open(handle)):
        if line[0] == "#":
            send_queue.put((num, line, None))
        else:
            _f = line.split("\t")
            if _f[0] not in fai:
                seq = None
            else:
                seq = zlib.compress(fai[line.split("\t")[0]].encode(), 1)
            send_queue.put_nowait((num, line, seq))

    send_queue.put("EXIT")

Source File: fabgz.py From biocommons.seqrepo with Apache License 2.0

5 votes

def close(self):
        if self._fh:
            self._fh.close()
            self._fh = None
            subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
            os.rename(self._basepath + ".gz", self.filename)

            # open file with FastaFile to create indexes, then make all read-only
            _fh = FastaFile(self.filename)
            _fh.close()
            os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

            _logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))

Source File: fasta_utils.py From models with MIT License

5 votes

def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file)

Source File: dataloader.py From models with MIT License

5 votes

def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file)

Source File: fasta_utils.py From models with MIT License

5 votes

def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file)

Source File: genomesource.py From genomeview with MIT License

5 votes

def fasta(self):
        if self._fasta is None:
            # import pyfaidx
            # self._fasta = pyfaidx.Fasta(self.path, as_raw=True)
            import pysam
            self._fasta = pysam.FastaFile(self.path)
        return self._fasta

Source File: test_references.py From ga4gh-server with Apache License 2.0

5 votes

def __init__(self, referenceSetId, fastaFile):
        super(ReferenceSetTest, self).__init__(referenceSetId, fastaFile)
        self._fastaFile = pysam.FastaFile(fastaFile)

Source File: references.py From ga4gh-server with Apache License 2.0

5 votes

def openFile(self, dataFile):
        return pysam.FastaFile(dataFile)

Source File: extractors.py From genomelake with BSD 3-Clause "New" or "Revised" License

5 votes

def __init__(self, datafile, use_strand=False, **kwargs):
        """Fasta file extractor
        
        NOTE: The extractor is not thread-save.
        If you with to use it with multiprocessing,
        create a new extractor object in each process.
        
        Args:
          datafile (str): path to the bigwig file
          use_strand (bool): if True, the extracted sequence
            is reverse complemented in case interval.strand == "-"
        """
        super(FastaExtractor, self).__init__(datafile, **kwargs)
        self.use_strand = use_strand
        self.fasta = FastaFile(self._datafile)

Python pysam.FastaFile() Examples