Python gffutils.create_db() Examples
The following are 16
code examples of gffutils.create_db().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
gffutils
, or try the search function
.
Example #1
Source File: extract_features.py From Liftoff with GNU General Public License v3.0 | 6 votes |
def create_feature_db_connections(g_arg, db_arg, infer_transcripts, infer_genes): if infer_transcripts is True: disable_transcripts = False else: disable_transcripts = True if infer_genes is True: disable_genes = False else: disable_genes = True if db_arg is None: feature_db = gffutils.create_db(g_arg, g_arg + "_db", merge_strategy="create_unique", force=True, disable_infer_transcripts=disable_transcripts, disable_infer_genes=disable_genes, verbose=True) feature_db_name = db_arg else: feature_db = gffutils.FeatureDB(db_arg) feature_db_name = db_arg return feature_db, feature_db_name
Example #2
Source File: gtf.py From outrigger with BSD 3-Clause "New" or "Revised" License | 6 votes |
def create_db(gtf_filename, db_filename=None): db_filename = ':memory:' if db_filename is None else db_filename db = gffutils.create_db( gtf_filename, db_filename, merge_strategy='merge', id_spec={'gene': 'gene_id', 'transcript': 'transcript_id', 'exon': 'location_id', 'CDS': 'location_id', 'start_codon': 'location_id', 'stop_codon': 'location_id', 'UTR': 'location_id'}, transform=transform, force=True, verbose=True, disable_infer_genes=True, disable_infer_transcripts=True, force_merge_fields=['source']) maybe_analyze(db) return db
Example #3
Source File: dataloader.py From models with MIT License | 5 votes |
def __init__(self, gtf_file, fasta_file, disable_infer_transcripts=True, disable_infer_genes=True): self.gtf_file = gtf_file self.fasta_file = fasta_file self.fasta_extractor = None self.db = gffutils.create_db(gtf_file, ":memory:", disable_infer_transcripts=disable_infer_transcripts, disable_infer_genes=disable_infer_genes) self.start_codons = list(self.db.features_of_type("start_codon")) self.input_transform = OneHot()
Example #4
Source File: dataloader.py From models with MIT License | 5 votes |
def _get_introns(self): """ Get introns from the specified seq and annotation files. :return: List of all detected introns as gffutils.Feature objects. """ # create a gffutils database self.db = gffutils.create_db(data=self.gtf_file, dbfn=":memory:", force=True, id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'}, disable_infer_transcripts=True, disable_infer_genes=True, verbose=False, merge_strategy="merge") if not self.create_introns: # load introns from gtf, don't create them introns = list(self.db.features_of_type('intron', order_by=('seqid', 'start', 'end'))) # exons are sorted start-coord. asc. self._add_SOI(introns) return introns exons = list(self.db.features_of_type('exon', order_by=('seqid', 'start', 'end'))) # exons are sorted start-coord. asc. # group exons in a dict by gene id transcript_to_exon = self._get_tr_to_exon_dict(exons) collected_introns = self._build_introns(transcript_to_exon) self._add_SOI(collected_introns) return collected_introns
Example #5
Source File: dataloader.py From models with MIT License | 5 votes |
def _get_introns(self): """ Get introns from the specified seq and annotation files. :return: List of all detected introns as gffutils.Feature objects. """ # create a gffutils database self.db = gffutils.create_db(data=self.gtf_file, dbfn=":memory:", force=True, id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'}, disable_infer_transcripts=True, disable_infer_genes=True, verbose=False, merge_strategy="merge") if not self.create_introns: # load introns from gtf, don't create them introns = list(self.db.features_of_type('intron', order_by=('seqid', 'start', 'end'))) # exons are sorted start-coord. asc. self._add_SOI(introns) return introns exons = list(self.db.features_of_type('exon', order_by=('seqid', 'start', 'end'))) # exons are sorted start-coord. asc. # group exons in a dict by gene id transcript_to_exon = self._get_tr_to_exon_dict(exons) collected_introns = self._build_introns(transcript_to_exon) self._add_SOI(collected_introns) return collected_introns
Example #6
Source File: getRightStrand.py From LoReAn with MIT License | 5 votes |
def add_removed_evm(pasa, exon, wd): """ here the clusters of sequence from the same locus are prepared """ db_evm = gffutils.create_db(pasa, ':memory:', merge_strategy='create_unique', keep_order=True) ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")] db_gmap = gffutils.create_db(exon, ':memory:', merge_strategy='create_unique', keep_order=True) ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")] ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("mRNA")] uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap] uniq_gene = [gene.attributes["ID"][0] for mrna in uniq_evm for gene in db_evm.parents(mrna)] uniq = list(set(uniq_gene)) outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd) gff_out_s = gffwriter.GFFWriter(outfile.name) for name in uniq: for i in db_evm.children(name, order_by='start'): gff_out_s.write_rec(i) gff_out_s.write_rec(db_evm[name]) for name in ids_gmap_full: for i in db_gmap.children(name, order_by='start'): gff_out_s.write_rec(i) gff_out_s.write_rec(db_gmap[name]) gff_out_s.close() return outfile.name #final_evm="/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/exonerate/exonerate_step4.final.4e4aoxwb.gff3" #genome = "/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/split/scaffold3.fasta.masked.rename.fasta"
Example #7
Source File: collectOnly.py From LoReAn with MIT License | 5 votes |
def add_EVM(final_update, wd, consensus_mapped_gff3): """ """ db_evm = gffutils.create_db(final_update, ':memory:', merge_strategy='create_unique', keep_order=True) ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")] db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:', merge_strategy='create_unique', keep_order=True) ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")] ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("gene")] uniq_evm = [evm for evm in ids_evm if not evm in ids_gmap] mRNA = [] for evm in uniq_evm: for line in db_evm.parents(evm, order_by='start'): mRNA.append(line.attributes["ID"][0]) mRNA_uniq = list(set(mRNA)) outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.", suffix=".gff3", dir=wd) gff_out_s = gffwriter.GFFWriter(outfile.name) for name in mRNA_uniq: for i in db_evm.children(name, order_by='start'): gff_out_s.write_rec(i) gff_out_s.write_rec(db_evm[name]) for name in ids_gmap_full: for i in db_gmap.children(name, order_by='start'): gff_out_s.write_rec(i) gff_out_s.write_rec(db_gmap[name]) gff_out_s.close() return outfile.name #return outfile_out.name
Example #8
Source File: extractGeneRegionFromGMLGenes.py From panaroo with MIT License | 5 votes |
def get_gene_coordinates_single( gff_file, contig, gene, l ): #Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the 2 genes gff = open(gff_file).read() #Import the GFF file split = gff.split("##FASTA") geneCoordinates = [] #Will be filled with the coordinates of the 2 genes with StringIO(split[1] ) as temp_fasta: #Import the sequences from the GFF as fasta sequences = list(SeqIO.parse(temp_fasta, "fasta")) parsed_gff = gffutils.create_db(clean_gff_string(split[0]), dbfn=":memory:", force=True, keep_order=True, from_string=True) for geneSample in parsed_gff.all_features( featuretype=()): #Iterate through the genes if geneSample.id == gene: #Check if the gene is one of the genes of interest geneCoordinates.append(geneSample.start) geneCoordinates.append(geneSample.stop) firstPosition = min( geneCoordinates ) - l + 1 #The first position to be extracted from the contig endPosition = max( geneCoordinates) + l #The end position to be extracted from the contig if firstPosition < 0: firstPosition = 0 if endPosition > len(sequences[int(contig)].seq): endPosition = len(sequences[int(contig)].seq) extractSequence = sequences[int(contig)].seq[ firstPosition: endPosition] #Assign to the sequence in the region of interest return extractSequence
Example #9
Source File: readGtf.py From pyGenomeTracks with GNU General Public License v3.0 | 4 votes |
def __init__(self, file_path, prefered_name="transcript_name", merge_transcripts=True): """ :param file_path: the path of the gtf file :return: """ self.file_type = 'bed12' # list of bed fields self.fields = ['chromosome', 'start', 'end', 'name', 'score', 'strand', 'thick_start', 'thick_end', 'rgb', 'block_count', 'block_sizes', 'block_starts'] self.BedInterval = collections.namedtuple('BedInterval', self.fields) # I think the name which should be written # should be the transcript_name # But we can change it to gene_name self.prefered_name = prefered_name self.merge_transcripts = merge_transcripts # Will process the gtf to get one item per transcript: # This will create a database: try: self.db = gffutils.create_db(file_path, ':memory:') except ValueError as ve: if "No lines parsed" in str(ve): self.length = 0 self.all_transcripts = open(file_path, 'r') else: raise InputError("This is not a gtf file.") else: if self.merge_transcripts: self.length = len([i for i in self.db.features_of_type("gene")]) self.all_transcripts = self.db.features_of_type("gene", order_by='start') else: self.length = len([i for i in self.db.features_of_type("transcript")]) self.all_transcripts = self.db.features_of_type("transcript", order_by='start')
Example #10
Source File: splicing.py From kipoiseq with MIT License | 4 votes |
def generate_exons(gtf_file, overhang=(100, 100), # overhang from the exon gtf_db_path=":memory:", out_file=None, disable_infer_transcripts=True, disable_infer_genes=True, firstLastNoExtend=True, source_filter=None): """ Build IntervalTree object from gtf file for one feature unit (e.g. gene, exon). If give out_file, pickle it. Args: gtf_file: gtf format file or pickled Intervaltree object. overhang: flanking intron length to take along with exon. Corresponding to left (acceptor side) and right (donor side) gtf_db_path: (optional) gtf database path. Database for one gtf file only need to be created once out_file: (optional) file path to store the pickled Intervaltree obejct. Next time run it can be given to `gtf_file` disable_infer_transcripts: option to disable infering transcripts. Can be True if the gtf file has transcripts annotated. disable_infer_genes: option to disable infering genes. Can be True if the gtf file has genes annotated. firstLastNoExtend: if True, overhang is not taken for 5' of the first exon, or 3' of the last exon of a gene. source_filter: gene source filters, such as "protein_coding" filter for protein coding genes """ try: gtf_db = gffutils.interface.FeatureDB(gtf_db_path) except ValueError: gtf_db = gffutils.create_db( gtf_file, gtf_db_path, disable_infer_transcripts=disable_infer_transcripts, disable_infer_genes=disable_infer_genes) genes = gtf_db.features_of_type('gene') default_overhang = overhang for gene in genes: if source_filter is not None: if gene.source != source_filter: continue for exon in gtf_db.children(gene, featuretype='exon'): isLast = False # track whether is last exon if firstLastNoExtend: if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start): overhang = (overhang[0], 0) isLast = True elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end): overhang = (0, overhang[1]) exon = ExonInterval.from_feature(exon, overhang) exon.isLast = isLast overhang = default_overhang yield exon
Example #11
Source File: mapping.py From LoReAn with MIT License | 4 votes |
def longest_cds(gff_file, gff_filerc, verbose, wd, filename): db = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform) dbrc = gffutils.create_db(gff_filerc, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform) list_mrna = [mRNA.attributes["ID"][0] for mRNA in db.features_of_type('mRNA')] list_mrna_rc = [mRNA.attributes["ID"][0] for mRNA in dbrc.features_of_type('mRNA')] list_all = list(set(list_mrna + list_mrna_rc)) list_db = [] list_db_rc = [] for mrna_id in list_all: cds_len = [int(i.end) - int(i.start) for i in db.children(mrna_id, featuretype='CDS', order_by='start')] cds_len_rc = [int(i.end) - int(i.start) for i in dbrc.children(mrna_id, featuretype='CDS', order_by='start')] if cds_len == cds_len_rc: list_db.append(mrna_id) elif cds_len > cds_len_rc: list_db.append(mrna_id) else: list_db_rc.append(mrna_id) gff_out = gffwriter.GFFWriter(filename) for evm in list_db: if evm in list_mrna: for i in db.children(evm, featuretype='CDS', order_by='start'): gff_out.write_rec(i) i = db[evm] gff_out.write_rec(i) for i in db.parents(evm, featuretype='gene', order_by='start'): gff_out.write_rec(i) for i in db.children(evm, featuretype='exon', order_by='start'): gff_out.write_rec(i) for evm in list_db_rc: if evm in list_mrna_rc: for i in dbrc.children(evm, featuretype='CDS', order_by='start'): gff_out.write_rec(i) i = dbrc[evm] gff_out.write_rec(i) for i in dbrc.parents(evm, featuretype='gene', order_by='start'): gff_out.write_rec(i) for i in dbrc.children(evm, featuretype='exon', order_by='start'): gff_out.write_rec(i) gff_out.close() if verbose: print(filename) return filename
Example #12
Source File: getRightStrand.py From LoReAn with MIT License | 4 votes |
def genename_last(gff_filename, prefix, verbose, wd, dict_ref_name, step): global prefix_name prefix_name = prefix out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd) err = tempfile.NamedTemporaryFile(delete=False, mode="w") gt_com = GT_GFF3 % gff_filename if verbose: sys.stderr.write('Executing: %s\n\n' % gt_com) gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True) gt_call.communicate() db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform_name) gene_count = 0 list_mrna = [mRNA.attributes["ID"][0] for mRNA in db1.features_of_type('mRNA')] out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread", suffix=".gff3", dir=wd) gff_out = gffwriter.GFFWriter(out_gff.name) gene_name = [] for evm in list_mrna: for i in db1.children(evm, featuretype='CDS', order_by='start'): if i.chrom in dict_ref_name: i.chrom = dict_ref_name[i.chrom] gff_out.write_rec(i) i = db1[evm] if i.chrom in dict_ref_name: i.chrom = dict_ref_name[i.chrom] gff_out.write_rec(i) for i in db1.parents(evm, featuretype='gene', order_by='start'): if i.chrom in dict_ref_name: i.chrom = dict_ref_name[i.chrom] id_gene = i.attributes['ID'][0] if not id_gene in gene_name: gff_out.write_rec(i) gene_name.append(id_gene) for i in db1.children(evm, featuretype='exon', order_by='start'): if i.chrom in dict_ref_name: i.chrom = dict_ref_name[i.chrom] gff_out.write_rec(i) gff_out.close() if "pasa" in step: out_name = os.path.join(wd, "Final.evm.update.gff3") with open(out_name, "w") as fh: gt_com = GT_RETAINID % out_gff.name if verbose: sys.stderr.write('Executing: %s\n\n' % gt_com) gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True) gt_call.communicate() if "lorean" in step: out_name = os.path.join(wd, "Final.LoReAn.update.gff3") with open(out_name, "w") as fh: gt_com = GT_RETAINID % out_gff.name if verbose: sys.stderr.write('Executing: %s\n\n' % gt_com) gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True) gt_call.communicate() if verbose: print(out_name) return out_name
Example #13
Source File: getRightStrand.py From LoReAn with MIT License | 4 votes |
def pep_seq(myFasta, final_evm): fasta = {} db = gffutils.create_db(final_evm, ':memory:', merge_strategy="create_unique", keep_order=True) gff_file = final_evm + ".gff3" gff_out = gffwriter.GFFWriter(gff_file) for t in db.features_of_type('mRNA', order_by='start'): position = [] seq_combined = '' j = 0 for i in db.children(t, featuretype='CDS', order_by='start'): j += 1 if j == 1: pphase = i[7] seq = i.sequence(myFasta, use_strand=False) seq_combined += seq position = position + [i.start,i.stop] seq_combined = SeqRecord(Seq(seq_combined, generic_dna)) #print(t.attributes["ID"][0]) #print(seq_combined.seq) if t.strand == '-': pphase = i[7] seq_combined = seq_combined.reverse_complement() if pphase == "0" or pphase == ".": seq_transl = seq_combined.translate() elif pphase == "1": seq_transl = seq_combined[1:].translate() elif pphase == "2": seq_transl = seq_combined[2:].translate() seq_transl.description = position seq_transl.id = t.id position = sorted(position) t.start = position[0] t.stop = position[-1] fasta[str(seq_transl.seq)] = [seq_transl, t] count = 0 for key in fasta: for i in db.parents(fasta[key][1], featuretype='gene', order_by='start'): gff_out.write_rec(i) i.start = fasta[key][1].start i.stop = fasta[key][1].stop gff_out.write_rec(fasta[key][1]) for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'): gff_out.write_rec(i) for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'): count += 1 i.featuretype="exon" if "ID" in i.attributes: i.attributes["ID"][0] = i.attributes["ID"][0] + "-" + str(count) else: i.attributes["ID"] = ["exon" + "-" + str(count)] gff_out.write_rec(i) return (gff_file)
Example #14
Source File: extractGeneRegionFromGMLGenes.py From panaroo with MIT License | 4 votes |
def get_gene_coordinates( gff_file, contig, genes, gene1, l ): #Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the 2 genes gff = open(gff_file).read() #Import the GFF file split = gff.split("##FASTA") geneCoordinates = [] #Will be filled with the coordinates of the 2 genes with StringIO(split[1] ) as temp_fasta: #Import the sequences from the GFF as fasta sequences = list(SeqIO.parse(temp_fasta, "fasta")) parsed_gff = gffutils.create_db(clean_gff_string(split[0]), dbfn=":memory:", force=True, keep_order=True, from_string=True) for geneSample in parsed_gff.all_features( featuretype=()): #Iterate through the genes if geneSample.id == genes[0] or geneSample.id == genes[ 1]: #Check if the gene is one of the genes of interest geneCoordinates.append(geneSample.start) geneCoordinates.append(geneSample.stop) if geneSample.id == gene1: #Check if the gene is gene 1 strand = geneSample.strand firstPosition = min( geneCoordinates ) - l + 1 #The first position to be extracted from the contig endPosition = max( geneCoordinates) + l #The end position to be extracted from the contig if firstPosition < 0: firstPosition = 0 if endPosition > len(sequences[int(contig)].seq): endPosition = len(sequences[int(contig)].seq) if strand == "+": #Check if the sequence needs to be reverse complemented extractSequence = sequences[int(contig)].seq[ firstPosition: endPosition] #Assign to the sequence in the region of interest else: extractSequence = sequences[int( contig)].seq[firstPosition:endPosition].reverse_complement() return extractSequence
Example #15
Source File: run_ppanggolin.py From panaroo with MIT License | 4 votes |
def post_process_fmt(input_files, out_dir): # get mapping between GFF ids and ppanggolin ids id_mapping = {} for f in input_files: prefix = os.path.splitext(os.path.basename(f))[0] #Split file and parse gff_file = open(f, 'r') lines = gff_file.read() split = lines.split('##FASTA') if len(split) != 2: print("Problem reading GFF3 file: ", gff_file.name) raise RuntimeError("Error reading prokka input!") parsed_gff = gff.create_db(clean_gff_string(split[0]), dbfn=":memory:", force=True, keep_order=True, from_string=True) gene_count = 0 for entry in parsed_gff.all_features(featuretype=()): if "CDS" not in entry.featuretype: continue ppanggolin_id = prefix + "_CDS_" + str(gene_count).zfill(4) id_mapping[ppanggolin_id] = entry.id gene_count += 1 gff_file.close() id_mapping[''] = '' with open(out_dir + "ppanggolin_gene_presence_absence.csv", 'w') as outfile: with open(out_dir + "matrix.csv", 'r') as infile: outfile.write(next(infile)) for line in infile: line = line.strip().split(",") for i in range(14, len(line)): line[i] = ";".join([id_mapping[g.strip('"')] for g in line[i].split()]) outfile.write(",".join(line) + "\n") return
Example #16
Source File: run_pirate.py From panaroo with MIT License | 4 votes |
def post_process_fmt(input_files, pirate_dir, out_dir): file_id_maps = {} for f in input_files: #Split file and parse gff_file = open(f, 'r') lines = gff_file.read() split = lines.split('##FASTA') if len(split) != 2: print("Problem reading GFF3 file: ", gff_file.name) raise RuntimeError("Error reading prokka input!") parsed_gff = gff.create_db(clean_gff_string(split[0]), dbfn=":memory:", force=True, keep_order=True, from_string=True) gene_count = 1 count_to_id = {} for entry in parsed_gff.all_features(featuretype=()): if "CDS" not in entry.featuretype: continue count_to_id[gene_count] = entry.id gene_count += 1 file_id_maps[os.path.splitext(os.path.basename(f))[0].replace( "-", "_")] = count_to_id gff_file.close() with open(pirate_dir + "PIRATE.gene_families.ordered.tsv", 'r') as infile: with open(out_dir + "pirate_gene_presence_absence.csv", 'w') as outfile: line = next(infile) line = line.strip().split("\t") outfile.write("\t".join([line[0]] + line[22:]) + "\n") for line in infile: line = line.strip().split("\t") for i in range(22, len(line)): if line[i] == "": continue # currently take the first as we do in panaroo for this format. May want to redo. line[i] = line[i].split(";")[0] sample = "_".join(line[i].split("_")[:-1]) gene_num = int(line[i].split("_")[-1]) line[i] = file_id_maps[sample][gene_num] outfile.write("\t".join([line[0]] + line[22:]) + "\n") return