Python gffutils.create_db() Examples

The following are 16 code examples of gffutils.create_db(), taken from open-source projects. The original project, source file, and license are noted above each example. You may also want to look at the other available functions and classes of the gffutils module.
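Before the project excerpts, here is a minimal sketch of the pattern they all share: parse an annotation file into a sqlite3-backed database, then query it through the returned FeatureDB. The file name and feature type are placeholders rather than something taken from the projects below.

import gffutils

# Build the database; dbfn=":memory:" keeps it in RAM instead of writing a file to disk.
db = gffutils.create_db("annotation.gff3", dbfn=":memory:", force=True,
                        keep_order=True, merge_strategy="merge")

# Query features of a given type and inspect their coordinates.
for gene in db.features_of_type("gene"):
    print(gene.id, gene.seqid, gene.start, gene.end, gene.strand)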
Example #1
Source File: extract_features.py    From Liftoff with GNU General Public License v3.0
def create_feature_db_connections(g_arg, db_arg, infer_transcripts, infer_genes):
    disable_transcripts = not infer_transcripts
    disable_genes = not infer_genes
    if db_arg is None:
        feature_db = gffutils.create_db(g_arg, g_arg + "_db", merge_strategy="create_unique", force=True,
                                 disable_infer_transcripts=disable_transcripts, disable_infer_genes=disable_genes, verbose=True)
        feature_db_name = g_arg + "_db"  # the database was just created at this path; db_arg is None in this branch
    else:
        feature_db = gffutils.FeatureDB(db_arg)
        feature_db_name = db_arg
    return feature_db, feature_db_name 
Example #2
Source File: gtf.py    From outrigger with BSD 3-Clause "New" or "Revised" License
def create_db(gtf_filename, db_filename=None):
    db_filename = ':memory:' if db_filename is None else db_filename

    db = gffutils.create_db(
        gtf_filename,
        db_filename,
        merge_strategy='merge',
        id_spec={'gene': 'gene_id', 'transcript': 'transcript_id',
                 'exon': 'location_id', 'CDS': 'location_id',
                 'start_codon': 'location_id',
                 'stop_codon': 'location_id', 'UTR': 'location_id'},
        transform=transform,
        force=True,
        verbose=True,
        disable_infer_genes=True,
        disable_infer_transcripts=True,
        force_merge_fields=['source'])
    maybe_analyze(db)
    return db 
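The transform callable and the maybe_analyze helper referenced above are defined elsewhere in outrigger and are not part of this excerpt. As a rough, hypothetical illustration (not outrigger's actual implementation), a transform is a function that gffutils applies to each feature while building the database; a sketch that fills in the location_id attribute used by id_spec might look like:

def transform(feature):
    # Hypothetical sketch: give coordinate-based features a deterministic
    # "location_id" attribute so id_spec can use it as their primary key.
    if feature.featuretype in ('exon', 'CDS', 'start_codon', 'stop_codon', 'UTR'):
        feature.attributes['location_id'] = ['{}:{}:{}-{}:{}'.format(
            feature.featuretype, feature.seqid, feature.start, feature.stop,
            feature.strand)]
    return feature

maybe_analyze presumably runs sqlite's ANALYZE on the finished database to speed up subsequent queries; gffutils provides FeatureDB.analyze() for that purpose.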
Example #3
Source File: dataloader.py    From models with MIT License
def __init__(self, gtf_file, fasta_file,
                 disable_infer_transcripts=True,
                 disable_infer_genes=True):
        self.gtf_file = gtf_file
        self.fasta_file = fasta_file

        self.fasta_extractor = None

        self.db = gffutils.create_db(gtf_file, ":memory:",
                                     disable_infer_transcripts=disable_infer_transcripts,
                                     disable_infer_genes=disable_infer_genes)
        self.start_codons = list(self.db.features_of_type("start_codon"))
        self.input_transform = OneHot() 
Example #4
Source File: dataloader.py    From models with MIT License
def _get_introns(self):
        """
        Get introns from the specified seq and annotation files.

        :return: List of all detected introns as gffutils.Feature objects.
        """
        # create a gffutils database
        self.db = gffutils.create_db(data=self.gtf_file, dbfn=":memory:",
                                     force=True, id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
                                     disable_infer_transcripts=True, disable_infer_genes=True, verbose=False,
                                     merge_strategy="merge")

        if not self.create_introns:
            # load introns from gtf, don't create them
            introns = list(self.db.features_of_type('intron', order_by=('seqid', 'start', 'end')))  # introns are sorted start-coord. asc.
            self._add_SOI(introns)
            return introns

        exons = list(self.db.features_of_type('exon', order_by=('seqid', 'start', 'end')))  # exons are sorted start-coord. asc.

        # group exons in a dict by transcript id
        transcript_to_exon = self._get_tr_to_exon_dict(exons)

        collected_introns = self._build_introns(transcript_to_exon)

        self._add_SOI(collected_introns)

        return collected_introns 
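The helper methods _get_tr_to_exon_dict, _build_introns and _add_SOI are not part of this excerpt. A minimal, hypothetical sketch of the core step, deriving introns as the gaps between consecutive exons of the same transcript, could look like:

import gffutils

def build_introns(transcript_to_exon):
    """Derive intron Features from exons grouped per transcript (sketch only)."""
    introns = []
    for transcript_id, exons in transcript_to_exon.items():
        exons = sorted(exons, key=lambda e: e.start)
        for upstream, downstream in zip(exons, exons[1:]):
            if downstream.start - upstream.end < 2:
                continue  # adjacent or overlapping exons leave no intron
            introns.append(gffutils.Feature(
                seqid=upstream.seqid, source='sketch', featuretype='intron',
                start=upstream.end + 1, end=downstream.start - 1,
                strand=upstream.strand,
                attributes={'transcript_id': [transcript_id]}))
    return introns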
Example #5
Source File: dataloader.py    From models with MIT License
def _get_introns(self):
        """
        Get introns from the specified seq and annotation files.

        :return: List of all detected introns as gffutils.Feature objects.
        """
        # create a gffutils database
        self.db = gffutils.create_db(data=self.gtf_file, dbfn=":memory:",
                                     force=True, id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
                                     disable_infer_transcripts=True, disable_infer_genes=True, verbose=False,
                                     merge_strategy="merge")

        if not self.create_introns:
            # load introns from gtf, don't create them
            introns = list(self.db.features_of_type('intron', order_by=('seqid', 'start', 'end')))  # introns are sorted start-coord. asc.
            self._add_SOI(introns)
            return introns

        exons = list(self.db.features_of_type('exon', order_by=('seqid', 'start', 'end')))  # exons are sorted start-coord. asc.

        # group exons in a dict by transcript id
        transcript_to_exon = self._get_tr_to_exon_dict(exons)

        collected_introns = self._build_introns(transcript_to_exon)

        self._add_SOI(collected_introns)

        return collected_introns 
Example #6
Source File: getRightStrand.py    From LoReAn with MIT License
def add_removed_evm(pasa, exon, wd):
    """
    Here the clusters of sequences from the same locus are prepared.
    """

    db_evm = gffutils.create_db(pasa, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(exon, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("mRNA")]


    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]
    uniq_gene = [gene.attributes["ID"][0] for mrna in uniq_evm for gene in db_evm.parents(mrna)]
    uniq = list(set(uniq_gene))


    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)

    for name in uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()

    return outfile.name




#final_evm="/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/exonerate/exonerate_step4.final.4e4aoxwb.gff3"
#genome = "/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/split/scaffold3.fasta.masked.rename.fasta" 
Example #7
Source File: collectOnly.py    From LoReAn with MIT License
def add_EVM(final_update, wd, consensus_mapped_gff3):

    """
    Collect the EVM models that are not already covered by the consensus-mapped (GMAP)
    models and write them, together with all consensus-mapped models, to a temporary GFF3 file.
    """

    db_evm = gffutils.create_db(final_update, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("gene")]

    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]

    mRNA = []
    for evm in uniq_evm:
        for line in db_evm.parents(evm, order_by='start'):
            mRNA.append(line.attributes["ID"][0])
    mRNA_uniq = list(set(mRNA))
    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)

    for name in mRNA_uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()

    return outfile.name
    #return outfile_out.name 
Example #8
Source File: extractGeneRegionFromGMLGenes.py    From panaroo with MIT License
def get_gene_coordinates_single(gff_file, contig, gene, l):  # Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the gene
    gff = open(gff_file).read()  # Import the GFF file
    split = gff.split("##FASTA")

    geneCoordinates = []  # Will be filled with the coordinates of the gene

    with StringIO(split[1]) as temp_fasta:  # Import the sequences from the GFF as fasta
        sequences = list(SeqIO.parse(temp_fasta, "fasta"))

    parsed_gff = gffutils.create_db(clean_gff_string(split[0]),
                                    dbfn=":memory:",
                                    force=True,
                                    keep_order=True,
                                    from_string=True)

    for geneSample in parsed_gff.all_features(featuretype=()):  # Iterate through the genes
        if geneSample.id == gene:  # Check if this is the gene of interest
            geneCoordinates.append(geneSample.start)
            geneCoordinates.append(geneSample.stop)

    firstPosition = min(geneCoordinates) - l + 1  # The first position to be extracted from the contig
    endPosition = max(geneCoordinates) + l  # The end position to be extracted from the contig
    if firstPosition < 0:
        firstPosition = 0
    if endPosition > len(sequences[int(contig)].seq):
        endPosition = len(sequences[int(contig)].seq)

    extractSequence = sequences[int(contig)].seq[firstPosition:endPosition]  # The sequence in the region of interest

    return extractSequence 
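clean_gff_string is defined elsewhere in panaroo and not shown in this excerpt. A plausible sketch, assuming its job is simply to drop directive lines such as ##sequence-region that would otherwise get in the way when the annotation is parsed from a string:

def clean_gff_string(gff_string):
    """Remove ##sequence-region directives from a GFF string (assumed behaviour)."""
    lines = [line for line in gff_string.splitlines()
             if not line.startswith("##sequence-region")]
    return "\n".join(lines)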
Example #9
Source File: readGtf.py    From pyGenomeTracks with GNU General Public License v3.0
def __init__(self, file_path, prefered_name="transcript_name",
                 merge_transcripts=True):
        """
        :param file_path: the path of the gtf file
        :return:
        """

        self.file_type = 'bed12'

        # list of bed fields
        self.fields = ['chromosome', 'start', 'end',
                       'name', 'score', 'strand',
                       'thick_start', 'thick_end',
                       'rgb', 'block_count',
                       'block_sizes', 'block_starts']

        self.BedInterval = collections.namedtuple('BedInterval', self.fields)
        # By default the name written is the transcript_name,
        # but it can be changed to gene_name.
        self.prefered_name = prefered_name
        self.merge_transcripts = merge_transcripts

        # Will process the gtf to get one item per transcript:
        # This will create a database:
        try:
            self.db = gffutils.create_db(file_path, ':memory:')
        except ValueError as ve:
            if "No lines parsed" in str(ve):
                self.length = 0
                self.all_transcripts = open(file_path, 'r')
            else:
                raise InputError("This is not a gtf file.")
        else:
            if self.merge_transcripts:
                self.length = len([i for i in self.db.features_of_type("gene")])
                self.all_transcripts = self.db.features_of_type("gene",
                                                                order_by='start')
            else:
                self.length = len([i for i in self.db.features_of_type("transcript")])
                self.all_transcripts = self.db.features_of_type("transcript",
                                                                order_by='start') 
Example #10
Source File: splicing.py    From kipoiseq with MIT License
def generate_exons(gtf_file,
                   overhang=(100, 100),  # overhang from the exon
                   gtf_db_path=":memory:",
                   out_file=None,
                   disable_infer_transcripts=True,
                   disable_infer_genes=True,
                   firstLastNoExtend=True,
                   source_filter=None):
    """
    Build an IntervalTree object from a gtf file for one feature unit (e.g. gene, exon). If out_file is given, the result is pickled.
    Args:
        gtf_file: gtf format file or pickled IntervalTree object.
        overhang: flanking intron length to take along with each exon, corresponding to the left (acceptor) and right (donor) side.
        gtf_db_path: (optional) gtf database path. The database for a given gtf file only needs to be created once.
        out_file: (optional) file path to store the pickled IntervalTree object. On later runs it can be passed as `gtf_file`.
        disable_infer_transcripts: option to disable inferring transcripts. Can be True if the gtf file already has transcripts annotated.
        disable_infer_genes: option to disable inferring genes. Can be True if the gtf file already has genes annotated.
        firstLastNoExtend: if True, the overhang is not taken at the 5' end of the first exon or the 3' end of the last exon of a gene.
        source_filter: gene source filter, e.g. "protein_coding" to keep only protein-coding genes.
    """
    try:
        gtf_db = gffutils.interface.FeatureDB(gtf_db_path)
    except ValueError:
        gtf_db = gffutils.create_db(
            gtf_file,
            gtf_db_path,
            disable_infer_transcripts=disable_infer_transcripts,
            disable_infer_genes=disable_infer_genes)

    genes = gtf_db.features_of_type('gene')
    default_overhang = overhang
    for gene in genes:
        if source_filter is not None:
            if gene.source != source_filter:
                continue
        for exon in gtf_db.children(gene, featuretype='exon'):
            isLast = False  # track whether is last exon
            if firstLastNoExtend:
                if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start):
                    overhang = (overhang[0], 0)
                    isLast = True
                elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end):
                    overhang = (0, overhang[1])
            exon = ExonInterval.from_feature(exon, overhang)
            exon.isLast = isLast
            overhang = default_overhang
            yield exon 
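A hypothetical call to the generator above; ExonInterval comes from the surrounding kipoiseq module, and the file names here are placeholders:

# Collect protein-coding exons with 100 nt of flanking intron on each side,
# caching the parsed annotation in an on-disk gffutils database.
exons = list(generate_exons("annotation.gtf",
                            overhang=(100, 100),
                            gtf_db_path="annotation.gtf.db",
                            source_filter="protein_coding"))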
Example #11
Source File: mapping.py    From LoReAn with MIT License
def longest_cds(gff_file, gff_filerc, verbose, wd, filename):
    db = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform)
    dbrc = gffutils.create_db(gff_filerc, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform)
    list_mrna = [mRNA.attributes["ID"][0] for mRNA in db.features_of_type('mRNA')]
    list_mrna_rc = [mRNA.attributes["ID"][0] for mRNA in dbrc.features_of_type('mRNA')]
    list_all = list(set(list_mrna + list_mrna_rc))
    list_db = []
    list_db_rc = []
    for mrna_id in list_all:
        cds_len = [int(i.end) - int(i.start) for i in db.children(mrna_id, featuretype='CDS', order_by='start')]
        cds_len_rc = [int(i.end) - int(i.start) for i in dbrc.children(mrna_id, featuretype='CDS', order_by='start')]
        if cds_len == cds_len_rc:
            list_db.append(mrna_id)
        elif cds_len > cds_len_rc:
            list_db.append(mrna_id)
        else:
            list_db_rc.append(mrna_id)
    gff_out = gffwriter.GFFWriter(filename)
    for evm in list_db:
        if evm in list_mrna:
            for i in db.children(evm, featuretype='CDS', order_by='start'):
                gff_out.write_rec(i)
            i = db[evm]
            gff_out.write_rec(i)
            for i in db.parents(evm, featuretype='gene', order_by='start'):
                gff_out.write_rec(i)
            for i in db.children(evm, featuretype='exon', order_by='start'):
                gff_out.write_rec(i)
    for evm in list_db_rc:
        if evm in list_mrna_rc:
            for i in dbrc.children(evm, featuretype='CDS', order_by='start'):
                gff_out.write_rec(i)
            i = dbrc[evm]
            gff_out.write_rec(i)
            for i in dbrc.parents(evm, featuretype='gene', order_by='start'):
                gff_out.write_rec(i)
            for i in dbrc.children(evm, featuretype='exon', order_by='start'):
                gff_out.write_rec(i)
    gff_out.close()
    if verbose:
        print(filename)
    return filename 
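Note that cds_len and cds_len_rc above are lists of per-CDS lengths, so cds_len > cds_len_rc compares them element by element (lexicographically) rather than by total length. If the intent is to keep the mRNA on the strand with the greater total CDS length, the comparison could instead be written as the following sketch (only if that is indeed the intended behaviour):

if sum(cds_len) >= sum(cds_len_rc):
    list_db.append(mrna_id)
else:
    list_db_rc.append(mrna_id)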
Example #12
Source File: getRightStrand.py    From LoReAn with MIT License
def genename_last(gff_filename, prefix, verbose, wd, dict_ref_name, step):

    global prefix_name
    prefix_name = prefix
    out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_GFF3 % gff_filename
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()

    db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform_name)
    gene_count = 0
    list_mrna = [mRNA.attributes["ID"][0] for mRNA in db1.features_of_type('mRNA')]
    out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread", suffix=".gff3", dir=wd)
    gff_out = gffwriter.GFFWriter(out_gff.name)
    gene_name = []
    for evm in list_mrna:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
        i = db1[evm]
        if i.chrom in dict_ref_name:
            i.chrom = dict_ref_name[i.chrom]
        gff_out.write_rec(i)
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            id_gene = i.attributes['ID'][0]
            if id_gene not in gene_name:
                gff_out.write_rec(i)
                gene_name.append(id_gene)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
    gff_out.close()
    if "pasa" in step:
        out_name = os.path.join(wd, "Final.evm.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if "lorean" in step:
        out_name = os.path.join(wd, "Final.LoReAn.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if verbose:
        print(out_name)
    return out_name 
Example #13
Source File: getRightStrand.py    From LoReAn with MIT License
def pep_seq(myFasta, final_evm):
    fasta = {}
    db = gffutils.create_db(final_evm, ':memory:', merge_strategy="create_unique", keep_order=True)
    gff_file = final_evm + ".gff3"
    gff_out = gffwriter.GFFWriter(gff_file)
    for t in db.features_of_type('mRNA', order_by='start'):
        position = []
        seq_combined = ''
        j = 0
        for i in db.children(t, featuretype='CDS', order_by='start'):
            j += 1
            if j == 1:
                pphase = i[7]
            seq = i.sequence(myFasta, use_strand=False)
            seq_combined += seq
            position = position + [i.start,i.stop]
        seq_combined = SeqRecord(Seq(seq_combined, generic_dna))
        #print(t.attributes["ID"][0])
        #print(seq_combined.seq)
        if t.strand == '-':
            pphase = i[7]
            seq_combined = seq_combined.reverse_complement()
        if pphase == "0" or pphase == ".":
            seq_transl = seq_combined.translate()
        elif pphase == "1":
            seq_transl = seq_combined[1:].translate()
        elif pphase == "2":
            seq_transl = seq_combined[2:].translate()
        seq_transl.description = position
        seq_transl.id = t.id
        position = sorted(position)
        t.start = position[0]
        t.stop = position[-1]
        fasta[str(seq_transl.seq)] = [seq_transl, t]
    count = 0
    for key in fasta:
        for i in db.parents(fasta[key][1], featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
            i.start = fasta[key][1].start
            i.stop = fasta[key][1].stop
        gff_out.write_rec(fasta[key][1])
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            count += 1
            i.featuretype="exon"
            if "ID" in i.attributes:
                i.attributes["ID"][0] = i.attributes["ID"][0] + "-" + str(count)
            else:
                i.attributes["ID"] = ["exon" + "-" + str(count)]
            gff_out.write_rec(i)
    return (gff_file) 
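This example builds Seq objects with generic_dna, which belonged to Bio.Alphabet and was removed in Biopython 1.78. With recent Biopython the same record can be built without an alphabet, for example:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

seq_combined = SeqRecord(Seq(seq_combined))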
Example #14
Source File: extractGeneRegionFromGMLGenes.py    From panaroo with MIT License
def get_gene_coordinates(gff_file, contig, genes, gene1, l):  # Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the 2 genes
    gff = open(gff_file).read()  #Import the GFF file
    split = gff.split("##FASTA")

    geneCoordinates = []  #Will be filled with the coordinates of the 2 genes

    with StringIO(split[1]) as temp_fasta:  # Import the sequences from the GFF as fasta
        sequences = list(SeqIO.parse(temp_fasta, "fasta"))

    parsed_gff = gffutils.create_db(clean_gff_string(split[0]),
                                    dbfn=":memory:",
                                    force=True,
                                    keep_order=True,
                                    from_string=True)

    for geneSample in parsed_gff.all_features(featuretype=()):  # Iterate through the genes
        if geneSample.id == genes[0] or geneSample.id == genes[1]:  # Check if the gene is one of the genes of interest
            geneCoordinates.append(geneSample.start)
            geneCoordinates.append(geneSample.stop)
        if geneSample.id == gene1:  #Check if the gene is gene 1
            strand = geneSample.strand

    firstPosition = min(geneCoordinates) - l + 1  # The first position to be extracted from the contig
    endPosition = max(geneCoordinates) + l  # The end position to be extracted from the contig
    if firstPosition < 0:
        firstPosition = 0
    if endPosition > len(sequences[int(contig)].seq):
        endPosition = len(sequences[int(contig)].seq)

    if strand == "+":  # Check if the sequence needs to be reverse complemented
        extractSequence = sequences[int(contig)].seq[firstPosition:endPosition]  # The sequence in the region of interest
    else:
        extractSequence = sequences[int(contig)].seq[firstPosition:endPosition].reverse_complement()

    return extractSequence 
Example #15
Source File: run_ppanggolin.py    From panaroo with MIT License
def post_process_fmt(input_files, out_dir):

    # get mapping between GFF ids and ppanggolin ids
    id_mapping = {}
    for f in input_files:
        prefix = os.path.splitext(os.path.basename(f))[0]

        #Split file and parse
        gff_file = open(f, 'r')
        lines = gff_file.read()
        split = lines.split('##FASTA')

        if len(split) != 2:
            print("Problem reading GFF3 file: ", gff_file.name)
            raise RuntimeError("Error reading prokka input!")

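        # 'gff' is presumably the gffutils module (e.g. imported as: import gffutils as gff).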
        parsed_gff = gff.create_db(clean_gff_string(split[0]),
                                   dbfn=":memory:",
                                   force=True,
                                   keep_order=True,
                                   from_string=True)

        gene_count = 0
        for entry in parsed_gff.all_features(featuretype=()):
            if "CDS" not in entry.featuretype: continue
            ppanggolin_id = prefix + "_CDS_" + str(gene_count).zfill(4)
            id_mapping[ppanggolin_id] = entry.id
            gene_count += 1

        gff_file.close()
    
    id_mapping[''] = ''

    with open(out_dir + "ppanggolin_gene_presence_absence.csv", 'w') as outfile:
        with open(out_dir + "matrix.csv", 'r') as infile:
            outfile.write(next(infile))
            for line in infile:
                line = line.strip().split(",")
                for i in range(14, len(line)):
                    line[i] = ";".join([id_mapping[g.strip('"')] for g in line[i].split()])
                outfile.write(",".join(line) + "\n")

    return 
Example #16
Source File: run_pirate.py    From panaroo with MIT License
def post_process_fmt(input_files, pirate_dir, out_dir):

    file_id_maps = {}
    for f in input_files:
        #Split file and parse
        gff_file = open(f, 'r')
        lines = gff_file.read()
        split = lines.split('##FASTA')

        if len(split) != 2:
            print("Problem reading GFF3 file: ", gff_file.name)
            raise RuntimeError("Error reading prokka input!")

        parsed_gff = gff.create_db(clean_gff_string(split[0]),
                                   dbfn=":memory:",
                                   force=True,
                                   keep_order=True,
                                   from_string=True)

        gene_count = 1
        count_to_id = {}
        for entry in parsed_gff.all_features(featuretype=()):
            if "CDS" not in entry.featuretype: continue
            count_to_id[gene_count] = entry.id
            gene_count += 1

        file_id_maps[os.path.splitext(os.path.basename(f))[0].replace(
            "-", "_")] = count_to_id

        gff_file.close()

    with open(pirate_dir + "PIRATE.gene_families.ordered.tsv", 'r') as infile:
        with open(out_dir + "pirate_gene_presence_absence.csv",
                  'w') as outfile:
            line = next(infile)
            line = line.strip().split("\t")
            outfile.write("\t".join([line[0]] + line[22:]) + "\n")

            for line in infile:
                line = line.strip().split("\t")
                for i in range(22, len(line)):
                    if line[i] == "": continue
                    # currently take the first as we do in panaroo for this format. May want to redo.
                    line[i] = line[i].split(";")[0]
                    sample = "_".join(line[i].split("_")[:-1])
                    gene_num = int(line[i].split("_")[-1])
                    line[i] = file_id_maps[sample][gene_num]
                outfile.write("\t".join([line[0]] + line[22:]) + "\n")

    return