Python Examples of gffutils.create

Source File: extract_features.py From Liftoff with GNU General Public License v3.0

6 votes

def create_feature_db_connections(g_arg, db_arg, infer_transcripts, infer_genes):
    if infer_transcripts is True:
        disable_transcripts = False
    else:
        disable_transcripts = True
    if infer_genes is True:
        disable_genes = False
    else:
        disable_genes = True
    if db_arg is None:
        feature_db = gffutils.create_db(g_arg, g_arg + "_db", merge_strategy="create_unique", force=True,
                                 disable_infer_transcripts=disable_transcripts, disable_infer_genes=disable_genes, verbose=True)
        feature_db_name = db_arg
    else:
        feature_db = gffutils.FeatureDB(db_arg)
        feature_db_name = db_arg
    return feature_db, feature_db_name

Source File: gtf.py From outrigger with BSD 3-Clause "New" or "Revised" License

6 votes

def create_db(gtf_filename, db_filename=None):
    db_filename = ':memory:' if db_filename is None else db_filename

    db = gffutils.create_db(
        gtf_filename,
        db_filename,
        merge_strategy='merge',
        id_spec={'gene': 'gene_id', 'transcript': 'transcript_id',
                 'exon': 'location_id', 'CDS': 'location_id',
                 'start_codon': 'location_id',
                 'stop_codon': 'location_id', 'UTR': 'location_id'},
        transform=transform,
        force=True,
        verbose=True,
        disable_infer_genes=True,
        disable_infer_transcripts=True,
        force_merge_fields=['source'])
    maybe_analyze(db)
    return db

Source File: dataloader.py From models with MIT License

5 votes

def __init__(self, gtf_file, fasta_file,
                 disable_infer_transcripts=True,
                 disable_infer_genes=True):
        self.gtf_file = gtf_file
        self.fasta_file = fasta_file

        self.fasta_extractor = None

        self.db = gffutils.create_db(gtf_file, ":memory:",
                                     disable_infer_transcripts=disable_infer_transcripts,
                                     disable_infer_genes=disable_infer_genes)
        self.start_codons = list(self.db.features_of_type("start_codon"))
        self.input_transform = OneHot()

Source File: dataloader.py From models with MIT License

5 votes

def _get_introns(self):
        """
        Get introns from the specified seq and annotation files.

        :return: List of all detected introns as gffutils.Feature objects.
        """
        # create a gffutils database
        self.db = gffutils.create_db(data=self.gtf_file, dbfn=":memory:",
                                     force=True, id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
                                     disable_infer_transcripts=True, disable_infer_genes=True, verbose=False,
                                     merge_strategy="merge")

        if not self.create_introns:
            # load introns from gtf, don't create them
            introns = list(self.db.features_of_type('intron', order_by=('seqid', 'start', 'end')))  # exons are sorted start-coord. asc.
            self._add_SOI(introns)
            return introns

        exons = list(self.db.features_of_type('exon', order_by=('seqid', 'start', 'end')))  # exons are sorted start-coord. asc.

        # group exons in a dict by gene id
        transcript_to_exon = self._get_tr_to_exon_dict(exons)

        collected_introns = self._build_introns(transcript_to_exon)

        self._add_SOI(collected_introns)

        return collected_introns

Source File: dataloader.py From models with MIT License

5 votes

def _get_introns(self):
        """
        Get introns from the specified seq and annotation files.

        :return: List of all detected introns as gffutils.Feature objects.
        """
        # create a gffutils database
        self.db = gffutils.create_db(data=self.gtf_file, dbfn=":memory:",
                                     force=True, id_spec={'gene': 'gene_id', 'transcript': 'transcript_id'},
                                     disable_infer_transcripts=True, disable_infer_genes=True, verbose=False,
                                     merge_strategy="merge")

        if not self.create_introns:
            # load introns from gtf, don't create them
            introns = list(self.db.features_of_type('intron', order_by=('seqid', 'start', 'end')))  # exons are sorted start-coord. asc.
            self._add_SOI(introns)
            return introns

        exons = list(self.db.features_of_type('exon', order_by=('seqid', 'start', 'end')))  # exons are sorted start-coord. asc.

        # group exons in a dict by gene id
        transcript_to_exon = self._get_tr_to_exon_dict(exons)

        collected_introns = self._build_introns(transcript_to_exon)

        self._add_SOI(collected_introns)

        return collected_introns

Source File: getRightStrand.py From LoReAn with MIT License

5 votes

def add_removed_evm(pasa, exon, wd):
    """
    here the clusters of sequence from the same locus are prepared
    """

    db_evm = gffutils.create_db(pasa, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(exon, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("mRNA")]


    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]
    uniq_gene = [gene.attributes["ID"][0] for mrna in uniq_evm for gene in db_evm.parents(mrna)]
    uniq = list(set(uniq_gene))


    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)

    for name in uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()

    return outfile.name




#final_evm="/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/exonerate/exonerate_step4.final.4e4aoxwb.gff3"
#genome = "/home/lfaino/lfainoData/lorean/LoReAn_Example/Crispa/LoReAn_crispa/run/split/scaffold3.fasta.masked.rename.fasta"

Source File: collectOnly.py From LoReAn with MIT License

5 votes

def add_EVM(final_update, wd, consensus_mapped_gff3):

    """
    """

    db_evm = gffutils.create_db(final_update, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [gene.attributes["ID"][0] for gene in db_evm.features_of_type("mRNA")]

    db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    ids_gmap = [gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("gene")]

    uniq_evm = [evm for evm in ids_evm if not evm in ids_gmap]

    mRNA = []
    for evm in uniq_evm:
        for line in db_evm.parents(evm, order_by='start'):
            mRNA.append(line.attributes["ID"][0])
    mRNA_uniq = list(set(mRNA))
    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)

    for name in mRNA_uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()

    return outfile.name
    #return outfile_out.name

Source File: extractGeneRegionFromGMLGenes.py From panaroo with MIT License

5 votes

def get_gene_coordinates_single(
    gff_file, contig, gene, l
):  #Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the 2 genes
    gff = open(gff_file).read()  #Import the GFF file
    split = gff.split("##FASTA")

    geneCoordinates = []  #Will be filled with the coordinates of the 2 genes

    with StringIO(split[1]
                  ) as temp_fasta:  #Import the sequences from the GFF as fasta
        sequences = list(SeqIO.parse(temp_fasta, "fasta"))

    parsed_gff = gffutils.create_db(clean_gff_string(split[0]),
                                    dbfn=":memory:",
                                    force=True,
                                    keep_order=True,
                                    from_string=True)

    for geneSample in parsed_gff.all_features(
            featuretype=()):  #Iterate through the genes
        if geneSample.id == gene:  #Check if the gene is one of the genes of interest
            geneCoordinates.append(geneSample.start)
            geneCoordinates.append(geneSample.stop)

    firstPosition = min(
        geneCoordinates
    ) - l + 1  #The first position to be extracted from the contig
    endPosition = max(
        geneCoordinates) + l  #The end position to be extracted from the contig
    if firstPosition < 0:
        firstPosition = 0
    if endPosition > len(sequences[int(contig)].seq):
        endPosition = len(sequences[int(contig)].seq)

    extractSequence = sequences[int(contig)].seq[
        firstPosition:
        endPosition]  #Assign to the sequence in the region of interest

    return extractSequence

Source File: readGtf.py From pyGenomeTracks with GNU General Public License v3.0

4 votes

def __init__(self, file_path, prefered_name="transcript_name",
                 merge_transcripts=True):
        """
        :param file_path: the path of the gtf file
        :return:
        """

        self.file_type = 'bed12'

        # list of bed fields
        self.fields = ['chromosome', 'start', 'end',
                       'name', 'score', 'strand',
                       'thick_start', 'thick_end',
                       'rgb', 'block_count',
                       'block_sizes', 'block_starts']

        self.BedInterval = collections.namedtuple('BedInterval', self.fields)
        # I think the name which should be written
        # should be the transcript_name
        # But we can change it to gene_name
        self.prefered_name = prefered_name
        self.merge_transcripts = merge_transcripts

        # Will process the gtf to get one item per transcript:
        # This will create a database:
        try:
            self.db = gffutils.create_db(file_path, ':memory:')
        except ValueError as ve:
            if "No lines parsed" in str(ve):
                self.length = 0
                self.all_transcripts = open(file_path, 'r')
            else:
                raise InputError("This is not a gtf file.")
        else:
            if self.merge_transcripts:
                self.length = len([i for i in self.db.features_of_type("gene")])
                self.all_transcripts = self.db.features_of_type("gene",
                                                                order_by='start')
            else:
                self.length = len([i for i in self.db.features_of_type("transcript")])
                self.all_transcripts = self.db.features_of_type("transcript",
                                                                order_by='start')

Source File: splicing.py From kipoiseq with MIT License

4 votes

def generate_exons(gtf_file,
                   overhang=(100, 100),  # overhang from the exon
                   gtf_db_path=":memory:",
                   out_file=None,
                   disable_infer_transcripts=True,
                   disable_infer_genes=True,
                   firstLastNoExtend=True,
                   source_filter=None):
    """
    Build IntervalTree object from gtf file for one feature unit (e.g. gene, exon). If give out_file, pickle it.
    Args:
        gtf_file: gtf format file or pickled Intervaltree object.
        overhang: flanking intron length to take along with exon. Corresponding to left (acceptor side) and right (donor side)
        gtf_db_path: (optional) gtf database path. Database for one gtf file only need to be created once
        out_file: (optional) file path to store the pickled Intervaltree obejct. Next time run it can be given to `gtf_file`
        disable_infer_transcripts: option to disable infering transcripts. Can be True if the gtf file has transcripts annotated.
        disable_infer_genes: option to disable infering genes. Can be True if the gtf file has genes annotated.
        firstLastNoExtend: if True, overhang is not taken for 5' of the first exon, or 3' of the last exon of a gene.
        source_filter: gene source filters, such as "protein_coding" filter for protein coding genes
    """
    try:
        gtf_db = gffutils.interface.FeatureDB(gtf_db_path)
    except ValueError:
        gtf_db = gffutils.create_db(
            gtf_file,
            gtf_db_path,
            disable_infer_transcripts=disable_infer_transcripts,
            disable_infer_genes=disable_infer_genes)

    genes = gtf_db.features_of_type('gene')
    default_overhang = overhang
    for gene in genes:
        if source_filter is not None:
            if gene.source != source_filter:
                continue
        for exon in gtf_db.children(gene, featuretype='exon'):
            isLast = False  # track whether is last exon
            if firstLastNoExtend:
                if (gene.strand == "+" and exon.end == gene.end) or (gene.strand == "-" and exon.start == gene.start):
                    overhang = (overhang[0], 0)
                    isLast = True
                elif (gene.strand == "+" and exon.start == gene.start) or (gene.strand == "-" and exon.end == gene.end):
                    overhang = (0, overhang[1])
            exon = ExonInterval.from_feature(exon, overhang)
            exon.isLast = isLast
            overhang = default_overhang
            yield exon

Source File: mapping.py From LoReAn with MIT License

4 votes

def longest_cds(gff_file, gff_filerc, verbose, wd, filename):
    db = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform)
    dbrc = gffutils.create_db(gff_filerc, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform)
    list_mrna = [mRNA.attributes["ID"][0] for mRNA in db.features_of_type('mRNA')]
    list_mrna_rc = [mRNA.attributes["ID"][0] for mRNA in dbrc.features_of_type('mRNA')]
    list_all = list(set(list_mrna + list_mrna_rc))
    list_db = []
    list_db_rc = []
    for mrna_id in list_all:
        cds_len = [int(i.end) - int(i.start) for i in db.children(mrna_id, featuretype='CDS', order_by='start')]
        cds_len_rc = [int(i.end) - int(i.start) for i in dbrc.children(mrna_id, featuretype='CDS', order_by='start')]
        if cds_len == cds_len_rc:
            list_db.append(mrna_id)
        elif cds_len > cds_len_rc:
            list_db.append(mrna_id)
        else:
            list_db_rc.append(mrna_id)
    gff_out = gffwriter.GFFWriter(filename)
    for evm in list_db:
        if evm in list_mrna:
            for i in db.children(evm, featuretype='CDS', order_by='start'):
                gff_out.write_rec(i)
            i = db[evm]
            gff_out.write_rec(i)
            for i in db.parents(evm, featuretype='gene', order_by='start'):
                gff_out.write_rec(i)
            for i in db.children(evm, featuretype='exon', order_by='start'):
                gff_out.write_rec(i)
    for evm in list_db_rc:
        if evm in list_mrna_rc:
            for i in dbrc.children(evm, featuretype='CDS', order_by='start'):
                gff_out.write_rec(i)
            i = dbrc[evm]
            gff_out.write_rec(i)
            for i in dbrc.parents(evm, featuretype='gene', order_by='start'):
                gff_out.write_rec(i)
            for i in dbrc.children(evm, featuretype='exon', order_by='start'):
                gff_out.write_rec(i)
    gff_out.close()
    if verbose:
        print(filename)
    return filename

Source File: getRightStrand.py From LoReAn with MIT License

4 votes

def genename_last(gff_filename, prefix, verbose, wd, dict_ref_name, step):

    global prefix_name
    prefix_name = prefix
    out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_GFF3 % gff_filename
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()

    db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique', keep_order=True, transform=transform_name)
    gene_count = 0
    list_mrna = [mRNA.attributes["ID"][0] for mRNA in db1.features_of_type('mRNA')]
    out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread", suffix=".gff3", dir=wd)
    gff_out = gffwriter.GFFWriter(out_gff.name)
    gene_name = []
    for evm in list_mrna:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
        i = db1[evm]
        if i.chrom in dict_ref_name:
            i.chrom = dict_ref_name[i.chrom]
        gff_out.write_rec(i)
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            id_gene = i.attributes['ID'][0]
            if not id_gene in gene_name:
                gff_out.write_rec(i)
                gene_name.append(id_gene)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
    gff_out.close()
    if "pasa" in step:
        out_name = os.path.join(wd, "Final.evm.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if "lorean" in step:
        out_name = os.path.join(wd, "Final.LoReAn.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if verbose:
        print(out_name)
    return out_name

Source File: getRightStrand.py From LoReAn with MIT License

4 votes

def pep_seq(myFasta, final_evm):
    fasta = {}
    db = gffutils.create_db(final_evm, ':memory:', merge_strategy="create_unique", keep_order=True)
    gff_file = final_evm + ".gff3"
    gff_out = gffwriter.GFFWriter(gff_file)
    for t in db.features_of_type('mRNA', order_by='start'):
        position = []
        seq_combined = ''
        j = 0
        for i in db.children(t, featuretype='CDS', order_by='start'):
            j += 1
            if j == 1:
                pphase = i[7]
            seq = i.sequence(myFasta, use_strand=False)
            seq_combined += seq
            position = position + [i.start,i.stop]
        seq_combined = SeqRecord(Seq(seq_combined, generic_dna))
        #print(t.attributes["ID"][0])
        #print(seq_combined.seq)
        if t.strand == '-':
            pphase = i[7]
            seq_combined = seq_combined.reverse_complement()
        if pphase == "0" or pphase == ".":
            seq_transl = seq_combined.translate()
        elif pphase == "1":
            seq_transl = seq_combined[1:].translate()
        elif pphase == "2":
            seq_transl = seq_combined[2:].translate()
        seq_transl.description = position
        seq_transl.id = t.id
        position = sorted(position)
        t.start = position[0]
        t.stop = position[-1]
        fasta[str(seq_transl.seq)] = [seq_transl, t]
    count = 0
    for key in fasta:
        for i in db.parents(fasta[key][1], featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
            i.start = fasta[key][1].start
            i.stop = fasta[key][1].stop
        gff_out.write_rec(fasta[key][1])
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            count += 1
            i.featuretype="exon"
            if "ID" in i.attributes:
                i.attributes["ID"][0] = i.attributes["ID"][0] + "-" + str(count)
            else:
                i.attributes["ID"] = ["exon" + "-" + str(count)]
            gff_out.write_rec(i)
    return (gff_file)

Source File: extractGeneRegionFromGMLGenes.py From panaroo with MIT License

4 votes

def get_gene_coordinates(
    gff_file, contig, genes, gene1, l
):  #Takes a GFF file, splits into CDS and FASTA and identifies the coordinates of the 2 genes
    gff = open(gff_file).read()  #Import the GFF file
    split = gff.split("##FASTA")

    geneCoordinates = []  #Will be filled with the coordinates of the 2 genes

    with StringIO(split[1]
                  ) as temp_fasta:  #Import the sequences from the GFF as fasta
        sequences = list(SeqIO.parse(temp_fasta, "fasta"))

    parsed_gff = gffutils.create_db(clean_gff_string(split[0]),
                                    dbfn=":memory:",
                                    force=True,
                                    keep_order=True,
                                    from_string=True)

    for geneSample in parsed_gff.all_features(
            featuretype=()):  #Iterate through the genes
        if geneSample.id == genes[0] or geneSample.id == genes[
                1]:  #Check if the gene is one of the genes of interest
            geneCoordinates.append(geneSample.start)
            geneCoordinates.append(geneSample.stop)
        if geneSample.id == gene1:  #Check if the gene is gene 1
            strand = geneSample.strand

    firstPosition = min(
        geneCoordinates
    ) - l + 1  #The first position to be extracted from the contig
    endPosition = max(
        geneCoordinates) + l  #The end position to be extracted from the contig
    if firstPosition < 0:
        firstPosition = 0
    if endPosition > len(sequences[int(contig)].seq):
        endPosition = len(sequences[int(contig)].seq)

    if strand == "+":  #Check if the sequence needs to be reverse complemented
        extractSequence = sequences[int(contig)].seq[
            firstPosition:
            endPosition]  #Assign to the sequence in the region of interest
    else:
        extractSequence = sequences[int(
            contig)].seq[firstPosition:endPosition].reverse_complement()

    return extractSequence

Source File: run_ppanggolin.py From panaroo with MIT License

4 votes

def post_process_fmt(input_files, out_dir):

    # get mapping between GFF ids and ppanggolin ids
    id_mapping = {}
    for f in input_files:
        prefix = os.path.splitext(os.path.basename(f))[0]

        #Split file and parse
        gff_file = open(f, 'r')
        lines = gff_file.read()
        split = lines.split('##FASTA')

        if len(split) != 2:
            print("Problem reading GFF3 file: ", gff_file.name)
            raise RuntimeError("Error reading prokka input!")

        parsed_gff = gff.create_db(clean_gff_string(split[0]),
                                   dbfn=":memory:",
                                   force=True,
                                   keep_order=True,
                                   from_string=True)

        gene_count = 0
        for entry in parsed_gff.all_features(featuretype=()):
            if "CDS" not in entry.featuretype: continue
            ppanggolin_id = prefix + "_CDS_" + str(gene_count).zfill(4)
            id_mapping[ppanggolin_id] = entry.id
            gene_count += 1

        gff_file.close()
    
    id_mapping[''] = ''

    with open(out_dir + "ppanggolin_gene_presence_absence.csv", 'w') as outfile:
        with open(out_dir + "matrix.csv", 'r') as infile:
            outfile.write(next(infile))
            for line in infile:
                line = line.strip().split(",")
                for i in range(14, len(line)):
                    line[i] = ";".join([id_mapping[g.strip('"')] for g in line[i].split()])
                outfile.write(",".join(line) + "\n")

    return

Source File: run_pirate.py From panaroo with MIT License

4 votes

def post_process_fmt(input_files, pirate_dir, out_dir):

    file_id_maps = {}
    for f in input_files:
        #Split file and parse
        gff_file = open(f, 'r')
        lines = gff_file.read()
        split = lines.split('##FASTA')

        if len(split) != 2:
            print("Problem reading GFF3 file: ", gff_file.name)
            raise RuntimeError("Error reading prokka input!")

        parsed_gff = gff.create_db(clean_gff_string(split[0]),
                                   dbfn=":memory:",
                                   force=True,
                                   keep_order=True,
                                   from_string=True)

        gene_count = 1
        count_to_id = {}
        for entry in parsed_gff.all_features(featuretype=()):
            if "CDS" not in entry.featuretype: continue
            count_to_id[gene_count] = entry.id
            gene_count += 1

        file_id_maps[os.path.splitext(os.path.basename(f))[0].replace(
            "-", "_")] = count_to_id

        gff_file.close()

    with open(pirate_dir + "PIRATE.gene_families.ordered.tsv", 'r') as infile:
        with open(out_dir + "pirate_gene_presence_absence.csv",
                  'w') as outfile:
            line = next(infile)
            line = line.strip().split("\t")
            outfile.write("\t".join([line[0]] + line[22:]) + "\n")

            for line in infile:
                line = line.strip().split("\t")
                for i in range(22, len(line)):
                    if line[i] == "": continue
                    # currently take the first as we do in panaroo for this format. May want to redo.
                    line[i] = line[i].split(";")[0]
                    sample = "_".join(line[i].split("_")[:-1])
                    gene_num = int(line[i].split("_")[-1])
                    line[i] = file_id_maps[sample][gene_num]
                outfile.write("\t".join([line[0]] + line[22:]) + "\n")

    return

Python gffutils.create_db() Examples