Python Examples of Bio.SeqRecord.SeqRecord

Source File: prep-genome.py From kevlar with MIT License

7 votes

def main(args):
    for record in SeqIO.parse(args.infile, 'fasta'):
        if args.discard:
            if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0:
                continue

        subseqcounter = 0
        printlog(args.debug, "DEBUG: convert to upper case", record.id)
        sequence = str(record.seq).upper()
        printlog(args.debug, "DEBUG: split seq by Ns", record.id)
        subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength]
        printlog(args.debug, "DEBUG: print subseqs", record.id)
        for subseq in subseqs:
            subseqcounter += 1
            subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter)
            subrecord = SeqRecord(Seq(subseq), subid, '', '')
            SeqIO.write(subrecord, args.outfile, 'fasta')

Source File: util.py From deepbgc with MIT License

7 votes

def create_faux_record_from_proteins(proteins, id):
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    record = SeqRecord(seq=Seq(''), id=id)
    start = 0
    end = 0
    max_protein_id_len = 45
    for protein in proteins:
        nucl_length = len(protein.seq) * 3
        end += nucl_length
        feature = SeqFeature(
            location=FeatureLocation(start, end, strand=1),
            type="CDS",
            qualifiers={
                'protein_id': [protein.id[:max_protein_id_len]],
                'translation': [str(protein.seq)]
            }
        )
        start += nucl_length
        record.features.append(feature)
    return record

Source File: test_error_model.py From InSilicoSeq with MIT License

6 votes

def test_mut_sequence():
    random.seed(42)
    np.random.seed(42)

    err_mod = basic.BasicErrorModel()

    read = SeqRecord(
        Seq(str('AAAAA' * 25),
            IUPAC.unambiguous_dna
            ),
        id='read_1',
        description='test read'
    )
    read.letter_annotations["phred_quality"] = [5] * 125
    read.seq = err_mod.mut_sequence(read, 'forward')
    assert str(read.seq[:10]) == 'AAAACAGAAA'

Source File: multithreadLargeFasta.py From LoReAn with MIT License

6 votes

def single_fasta(ref, wd_split):
    """
    From a fasta file make single files with each sequence
    """

    fasta_file = open(ref, 'r')
    single_fasta_list = []
    count = 0
    dict_ref_name = {}
    ref_rename = ref + ".rename.fasta"
    with open(ref_rename, "w") as fh:
        for record in SeqIO.parse(fasta_file, "fasta"):
            count += 1
            new_name = "seq" + str(count)
            dict_ref_name[new_name] = record.id
            new_rec = SeqRecord(record.seq, new_name, '', '')
            fasta_name = wd_split + '/' + new_name + '.fasta'
            single_fasta_list.append(fasta_name)
            output_handle = open(fasta_name, "w")
            SeqIO.write(new_rec, output_handle, "fasta")
            SeqIO.write(new_rec, fh, "fasta")
            output_handle.close()
    return single_fasta_list, dict_ref_name, ref_rename

Source File: AMRHitHSP.py From staramr with Apache License 2.0

6 votes

def get_seq_record(self):
        """
        Gets a SeqRecord for this hit.
        :return: A SeqRecord for this hit.
        """
        return SeqRecord(Seq(self.get_genome_contig_hsp_seq()), id=self.get_amr_gene_id(),
                         description=(
                             'isolate: {}, contig: {}, contig_start: {}, contig_end: {}, database_gene_start: {},'
                             ' database_gene_end: {}, hsp/length: {}/{}, pid: {:0.2f}%, plength: {:0.2f}%').format(
                             self.get_genome_id(),
                             self.get_genome_contig_id(),
                             self.get_genome_contig_start(),
                             self.get_genome_contig_end(),
                             self.get_amr_gene_start(),
                             self.get_amr_gene_end(),
                             self.get_hsp_length(),
                             self.get_amr_gene_length(),
                             self.get_pid(),
                             self.get_plength()))

Source File: proteinAlign.py From LoReAn with MIT License

6 votes

def transeq(data):
    dummy = int(data[1])
    record = data[0]
    if dummy == 0:
        prot = (translate_frameshifted(record.seq[0:]))
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand0plus"))
    if dummy == 1:
        prot = (translate_frameshifted(record.seq[1:]))  # second frame
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand1plus"))
    if dummy == 2:
        prot = (translate_frameshifted(record.seq[2:]))  # third frame
        prot_rec =(SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand2plus"))
    if dummy == 3:
        prot = (translate_frameshifted(reverse_complement(record.seq)))  # negative first frame
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand0minus"))
    if dummy == 4:
        prot = (translate_frameshifted(reverse_complement(record.seq[:len(record.seq) - 1])))  # negative second frame
        prot_rec =(SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand1minus"))
    if dummy == 5:
        prot = (translate_frameshifted(reverse_complement(record.seq[:len(record.seq) - 2])))  # negative third frame
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand2minus"))
    return(prot_rec)

Source File: generate_output.py From panaroo with MIT License

6 votes

def generate_pan_genome_reference(G, output_dir, split_paralogs=False):

    # need to treat paralogs differently?
    centroids = set()
    records = []

    for node in G.nodes():
        if not split_paralogs and G.nodes[node]['centroid'][0] in centroids:
            continue
        records.append(
            SeqRecord(Seq(max(G.nodes[node]['dna'], key=lambda x: len(x)),
                          generic_dna),
                      id=G.nodes[node]['name'],
                      description=""))
        for centroid in G.nodes[node]['centroid']:
            centroids.add(centroid)

    with open(output_dir + "pan_genome_reference.fa", 'w') as outfile:
        SeqIO.write(records, outfile, "fasta")

    return

Source File: prokka.py From panaroo with MIT License

6 votes

def translate_sequences(sequence_dic):
    protein_list = []
    for strain_id in sequence_dic:
        sequence_record = sequence_dic[strain_id]
        if (len(sequence_record.seq) % 3) != 0:
            raise ValueError(
                "Coding sequence not divisible by 3, is it complete?!")
        protien_sequence = translate(str(sequence_record.seq))
        if protien_sequence[-1] == "*":
            protien_sequence = protien_sequence[0:-1]
        if "*" in protien_sequence:
            print(sequence_record)
            print(protien_sequence)
            # raise ValueError("Premature stop codon in a gene!")
        protein_record = SeqRecord(Seq(protien_sequence),
                                   id=strain_id,
                                   description=strain_id)
        protein_list.append(protein_record)
    return protein_list

Source File: intronerate.py From HybPiper with GNU General Public License v3.0

6 votes

def remove_exons(gff_filename,supercontig_filename,mode="all"):
    '''Given a supercontig and corresponding annotation, remove the exon sequences. In "intron" mode, only return sequences specifically annotated as introns'''
    exon_starts = []
    exon_ends = []
    gff = open(gff_filename).readlines()
    for line in gff:
        line = line.rstrip().split("\t")
        if len(line) > 2:
            if line[2] == "exon":
                exon_starts.append(int(line[3]))
                exon_ends.append(int(line[4]))
    supercontig = SeqIO.read(supercontig_filename,'fasta')
    exonless_contig = SeqRecord(Seq(''),id=supercontig.id)
    start = 0
    for exon in range(len(exon_starts)):
        exonless_contig += supercontig[start:exon_starts[exon]-1] 
        start = exon_ends[exon]
    exonless_contig += supercontig[start:]    
    exonless_contig.description = ''
    return exonless_contig

Source File: targeted_mut_fasta_corrected.py From DiscoSnp with GNU Affero General Public License v3.0

6 votes

def targeted_mut(fasta_file, index):
    
    genome = SeqIO.parse(fasta_file, "fasta")
    fasta_file_mut = ("{}_mut".format(fasta_file))
    nb_mut = 0
    sequences_mut = []
  
    for record in genome:
        chr = record.id
        mutable_seq = record.seq.tomutable()
        #print(mutable_seq[0])
        if chr in index: 
            for pos in index[chr]:
                if mutable_seq[pos - 1] == index[chr][pos]["ref"]:
                    mutable_seq[pos - 1] = index[chr][pos]["alt"]
                    nb_mut += 1
       
        record_mut=SeqRecord(mutable_seq, record.id,'','')
        sequences_mut.append(record_mut)
    SeqIO.write(sequences_mut, fasta_file_mut, "fasta")

    print("{} bases have been mutated".format(nb_mut))

Source File: test_align.py From augur with GNU Affero General Public License v3.0

6 votes

def test_prune_seqs_matching_alignment(self):
        sequence = {
            "seq1": SeqRecord(Seq("GTAC"), name="seq1"),
            "seq2": SeqRecord(Seq("CGTT"), name="seq2"),
            "seq3": SeqRecord(Seq("TAGC"), name="seq3"),
        }
        alignment = MultipleSeqAlignment(
            [
                SeqRecord(Seq("GTAC"), name="seq1"),
                SeqRecord(Seq("TAGC"), name="seq3"),
            ]
        )
        
        result = align.prune_seqs_matching_alignment(sequence.values(), alignment)
        assert [r.name for r in result] == ["seq2"]
        for r in result:
            assert r.seq == sequence[r.name].seq

Source File: import_beast.py From augur with GNU Affero General Public License v3.0

6 votes

def fake_alignment(T):
    """
    Fake alignment to appease treetime when only using it for naming nodes...
    This is lifted from refine.py and ideally could be imported

    Parameters
    -------
    T : <class 'Bio.Phylo.BaseTree.Tree'>

    Returns
    -------
    <class 'Bio.Align.MultipleSeqAlignment'>
    """
    from Bio import SeqRecord, Seq, Align
    seqs = []
    for n in T.get_terminals():
        seqs.append(SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description=''))
    aln = Align.MultipleSeqAlignment(seqs)
    return aln

Source File: seqprop.py From ssbio with MIT License

6 votes

def seq(self, s):
        if not s:
            self._seq = None

        elif self.sequence_file:
            raise ValueError('{}: unable to set sequence, sequence file is associated with this object'.format(self.id))

        elif type(s) == str or type(s) == Seq:
            self._seq = ssbio.protein.sequence.utils.cast_to_seq(obj=s)

        # If a SeqRecord, copy all attributes
        elif type(s) == SeqRecord:
            self._seq = s.seq
            if self.name == '<unknown name>':
                self.name = s.name
            if self.description == '<unknown description>':
                self.description = s.description
            if not self.dbxrefs:
                self.dbxrefs = s.dbxrefs
            if not self.features:
                self.features = s.features
            if not self.annotations:
                self.annotations = s.annotations
            if not self.letter_annotations:
                self.letter_annotations = s.letter_annotations

Source File: trim_lowercase.py From pacbio_variant_caller with MIT License

6 votes

def trim_lowercase(input_filename, output_filename, keep_completely_lowercased_sequences):
    """
    Trim all lowercase letters from the given input FASTA file, remove records
    that were completely lowercase, and write the trimmed FASTA to the given
    output file.
    """
    trimmed_records = []
    for record in SeqIO.parse(input_filename, "fasta"):
        # Remove all lowercase letters in the sequence for this record.
        trimmed_sequence = re.sub("(^[a-z]+|[a-z]+$)", "", str(record.seq))

        # If any uppercase sequence remains, create a new record for it.
        if len(trimmed_sequence) > 0:
            trimmed_records.append(SeqRecord(Seq(trimmed_sequence), id=record.id, description=""))
        elif keep_completely_lowercased_sequences:
            trimmed_records.append(SeqRecord(Seq(str(record.seq)), id=record.id, description=""))

    # Write out all trimmed records to the given output file.
    SeqIO.write(trimmed_records, output_filename, "fasta")

Source File: utils.py From ssbio with MIT License

6 votes

def cast_to_seq(obj, alphabet=IUPAC.extended_protein):
    """Return a Seq representation of a string or SeqRecord object.

    Args:
        obj (str, Seq, SeqRecord): Sequence string or Biopython SeqRecord object
        alphabet: See Biopython SeqRecord docs

    Returns:
        Seq: Seq representation of the sequence

    """

    if isinstance(obj, Seq):
        return obj
    if isinstance(obj, SeqRecord):
        return obj.seq
    if isinstance(obj, str):
        obj = obj.upper()
        return Seq(obj, alphabet)
    else:
        raise ValueError('Must provide a string, Seq, or SeqRecord object.')

Source File: utils.py From ssbio with MIT License

6 votes

def cast_to_str(obj):
    """Return a string representation of a Seq or SeqRecord.

    Args:
        obj (str, Seq, SeqRecord): Biopython Seq or SeqRecord

    Returns:
        str: String representation of the sequence

    """

    if isinstance(obj, str):
        return obj
    if isinstance(obj, Seq):
        return str(obj)
    if isinstance(obj, SeqRecord):
        return str(obj.seq)
    else:
        raise ValueError('Must provide a string, Seq, or SeqRecord object.')

Source File: promoters.py From antismash with GNU Affero General Public License v3.0

6 votes

def write_promoters_to_file(output_dir: str, prefix: str, promoters: List[Promoter]) -> None:
    """ Write the promoters to the given file using the given prefix.

        The prefix helps in having separate files for multi-record inputs with
        a singular output directory.
    """
    # positions file
    pos_handle = open(os.path.join(output_dir, prefix + "_promoter_positions.csv"), "w")
    pos_handle.write("\t".join(["#", "promoter", "start", "end", "length"]) + "\n")
    # sequences file
    seq_handle = open(os.path.join(output_dir, prefix + "_promoter_sequences.fasta"), "w")

    for i, promoter in enumerate(promoters):
        # write promoter positions to file
        pos_handle.write("\t".join(map(str, [i + 1, promoter.get_id(),
                                             promoter.start + 1, promoter.end + 1,
                                             len(promoter)])) + "\n")

        # write promoter sequences to file
        SeqIO.write(SeqRecord(promoter.seq, id=promoter.get_id(),
                    description="length={}bp".format(len(promoter))),
                    seq_handle,
                    "fasta")

Source File: intronerate.py From HybPiper with GNU General Public License v3.0

6 votes

def make_intron_supercontig(contig_info,gene,prefix,add_N = False):
    cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta'))
    intron_supercontig = SeqRecord(Seq(''))
    for i in contig_info:
        if i[5] == "(+)":
            intron_supercontig += cap3contigs[i[0]]
        elif i[5] == "(-)":
            intron_supercontig += cap3contigs[i[0]].reverse_complement()    
        else:
            sys.stderr.write("Strandedness not found!")
            sys.exit(1)
        if add_N and i != contig_info[-1]:
            intron_supercontig += "NNNNNNNNNN"    
    intron_supercontig.id = '{}-{}'.format(prefix,gene)
    intron_supercontig.description = ''
    SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta')

Source File: seq.py From wub with Mozilla Public License 2.0

6 votes

def count_records(input_object, format='fasta'):
    """Count SeqRecord objects from a file in the specified format.

    :param input_object: A file object or a file name.
    :param format: Input format (fasta by default).
    :returns: Number of records in input file.
    :rtype: int

    """
    handle = input_object
    if type(handle) == str:
        handle = open(handle, "rU")
    counter = 0
    for _ in SeqIO.parse(handle, format):
        counter += 1
    return counter

Source File: test_circular_conversion.py From antismash with GNU Affero General Public License v3.0

5 votes

def setUp(self):
        self.seqrec = SeqRecord(UnknownSeq(21))
        loc = CompoundLocation([FeatureLocation(12, 15, strand=1),
                                FeatureLocation(18, 21, strand=1),
                                FeatureLocation(0, 3, strand=1),
                                FeatureLocation(6, 9, strand=1)],
                                operator="join")
        self.seqcds = SeqFeature(loc, type="CDS")
        self.seqgene = SeqFeature(loc, type="gene")
        self.seqrec.annotations["topology"] = "circular"

Source File: record.py From antismash with GNU Affero General Public License v3.0

5 votes

def to_biopython(self) -> SeqRecord:
        """Returns a Bio.SeqRecord instance of the record"""
        features = self.get_all_features()
        bio_features = []  # type: List[SeqFeature]
        for feature in sorted(features):
            bio_features.extend(feature.to_biopython())
        return SeqRecord(self.seq, id=self._record.id, name=self._record.name,
                         description=self._record.description,
                         dbxrefs=self.dbxrefs, features=bio_features,
                         annotations=self._record.annotations,
                         letter_annotations=self._record.letter_annotations)

Source File: record.py From antismash with GNU Affero General Public License v3.0

5 votes

def __getattr__(self, attr: str) -> Any:
        # passthroughs to the original SeqRecord
        if attr in ["id", "seq", "description", "name", "annotations", "dbxrefs"]:
            return getattr(self._record, attr)
        if attr in Record.__slots__:
            return self.__getattribute__(attr)
        raise AttributeError("Record has no attribute '%s'" % attr)

Source File: seq.py From wub with Mozilla Public License 2.0

5 votes

def read_seq_records(input_object, format='fasta'):
    """Read SeqRecord objects from a file in the specified format.

    :param input_object: A file object or a file name.
    :param format: Input format (fasta by default).
    :returns: A dictionary with the parsed SeqRecord objects.
    :rtype: generator

    """
    handle = input_object
    if type(handle) == str:
        handle = open(handle, "rU")
    return SeqIO.parse(handle, format)

Source File: test_circular_conversion.py From antismash with GNU Affero General Public License v3.0

5 votes

def setUp(self):
        self.seqrec = SeqRecord(UnknownSeq(21))
        loc = CompoundLocation([FeatureLocation(12, 21, strand=1),
                                FeatureLocation(0, 3, strand=1),
                                FeatureLocation(6, 9, strand=1)],
                                operator="join")
        self.seqcds = SeqFeature(loc, type="CDS")
        self.seqgene = SeqFeature(loc, type="gene")
        self.seqrec.annotations["topology"] = "circular"

Source File: seq.py From wub with Mozilla Public License 2.0

5 votes

def read_seq_records_dict(input_object, format='fasta'):
    """Read SeqRecord objects to a dictionary from a file in the specified format.

    :param input_object: A file object or a file name.
    :param format: Input format (fasta by default).
    :returns: An iterator of SeqRecord objects.
    :rtype: dict

    """
    handle = input_object
    if type(handle) == str:
        handle = open(handle, "rU")
    return SeqIO.to_dict(SeqIO.parse(handle, format))

Source File: seq.py From wub with Mozilla Public License 2.0

5 votes

def record_lengths(input_iter):
    """Return lengths of SeqRecord obejcts in the input iterator.

    :param input_iter: An iterator of SeqRecord objects.
    :returns: An ordered dictionary with the lengths of the SeqRecord objects.
    :rtype: OrderedDict

    """
    lengths = OrderedDict((record.id, len(record)) for record in input_iter)
    return lengths

Source File: transcriptAssembly.py From LoReAn with MIT License

5 votes

def bamtofastq(bam, verbose):
    fasta = bam + ".fasta"
    in_file = open(bam, 'r')
    in_sam = Reader(in_file)
    with open(fasta, "w") as output_handle:
        for line in in_sam:
            if line.mapped:
                record = SeqRecord(Seq(str(line.seq)),name = str(line.qname))
                SeqIO.write(record, output_handle, "fasta")
    return fasta

Source File: ancestralReconstruction.py From popgen-stats with MIT License

5 votes

def concatenate_genes(coreGenes):
    """ Using Biopython, concatenate ancestral genes into one file"""
    print "Concatenating ancestral reconstructions"
    def parse_lazarus_output(coreGene):
        ancRecFile = open("%s/ancestral/ancestor.out.txt" % coreGene, "r")
        for i,line in enumerate(ancRecFile):
            if i == 13:
                record = SeqRecord(Seq(line.strip(), IUPAC.ambiguous_dna), 
                    id=coreGene, description="")
        return record
    pool = ThreadPool(args.threads)
    recs = pool.map(parse_lazarus_output, coreGenes)
    SeqIO.write(recs, "ancestralGenes.fa", "fasta")

Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0

5 votes

def get_unique_names(gene_dict):
    '''Given the dictionary of SeqRecord dictionaries, return a list of the unique sequence headers'''
    all_names = []
    for gene in gene_dict:
        all_names += list(gene_dict[gene].keys())
    return set(all_names)

Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0

5 votes

def insert_sequences(gene_dict,unique_names):
    '''Given the dictionary of dictionaries, insert blank sequences if any are missing for a gene'''
    inserted_sequences = 0
    for gene in gene_dict:
        for name in unique_names:
            if name not in gene_dict[gene]:
                gene_length = len(next(iter(gene_dict[gene].values())))
                gene_dict[gene][name] = SeqRecord(Seq("-"*gene_length),id=name)
                inserted_sequences += 1
    sys.stderr.write("{} Empty sequences inserted across all genes.\n".format(inserted_sequences))            
    return gene_dict

Python Bio.SeqRecord.SeqRecord() Examples