Python Bio.SeqRecord.SeqRecord() Examples

The following are 30 code examples of Bio.SeqRecord.SeqRecord(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Bio.SeqRecord , or try the search function .
Example #1
Source File: prep-genome.py    From kevlar with MIT License 7 votes vote down vote up
def main(args):
    for record in SeqIO.parse(args.infile, 'fasta'):
        if args.discard:
            if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0:
                continue

        subseqcounter = 0
        printlog(args.debug, "DEBUG: convert to upper case", record.id)
        sequence = str(record.seq).upper()
        printlog(args.debug, "DEBUG: split seq by Ns", record.id)
        subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength]
        printlog(args.debug, "DEBUG: print subseqs", record.id)
        for subseq in subseqs:
            subseqcounter += 1
            subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter)
            subrecord = SeqRecord(Seq(subseq), subid, '', '')
            SeqIO.write(subrecord, args.outfile, 'fasta') 
Example #2
Source File: util.py    From deepbgc with MIT License 7 votes vote down vote up
def create_faux_record_from_proteins(proteins, id):
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    record = SeqRecord(seq=Seq(''), id=id)
    start = 0
    end = 0
    max_protein_id_len = 45
    for protein in proteins:
        nucl_length = len(protein.seq) * 3
        end += nucl_length
        feature = SeqFeature(
            location=FeatureLocation(start, end, strand=1),
            type="CDS",
            qualifiers={
                'protein_id': [protein.id[:max_protein_id_len]],
                'translation': [str(protein.seq)]
            }
        )
        start += nucl_length
        record.features.append(feature)
    return record 
Example #3
Source File: test_error_model.py    From InSilicoSeq with MIT License 6 votes vote down vote up
def test_mut_sequence():
    random.seed(42)
    np.random.seed(42)

    err_mod = basic.BasicErrorModel()

    read = SeqRecord(
        Seq(str('AAAAA' * 25),
            IUPAC.unambiguous_dna
            ),
        id='read_1',
        description='test read'
    )
    read.letter_annotations["phred_quality"] = [5] * 125
    read.seq = err_mod.mut_sequence(read, 'forward')
    assert str(read.seq[:10]) == 'AAAACAGAAA' 
Example #4
Source File: multithreadLargeFasta.py    From LoReAn with MIT License 6 votes vote down vote up
def single_fasta(ref, wd_split):
    """
    From a fasta file make single files with each sequence
    """

    fasta_file = open(ref, 'r')
    single_fasta_list = []
    count = 0
    dict_ref_name = {}
    ref_rename = ref + ".rename.fasta"
    with open(ref_rename, "w") as fh:
        for record in SeqIO.parse(fasta_file, "fasta"):
            count += 1
            new_name = "seq" + str(count)
            dict_ref_name[new_name] = record.id
            new_rec = SeqRecord(record.seq, new_name, '', '')
            fasta_name = wd_split + '/' + new_name + '.fasta'
            single_fasta_list.append(fasta_name)
            output_handle = open(fasta_name, "w")
            SeqIO.write(new_rec, output_handle, "fasta")
            SeqIO.write(new_rec, fh, "fasta")
            output_handle.close()
    return single_fasta_list, dict_ref_name, ref_rename 
Example #5
Source File: AMRHitHSP.py    From staramr with Apache License 2.0 6 votes vote down vote up
def get_seq_record(self):
        """
        Gets a SeqRecord for this hit.
        :return: A SeqRecord for this hit.
        """
        return SeqRecord(Seq(self.get_genome_contig_hsp_seq()), id=self.get_amr_gene_id(),
                         description=(
                             'isolate: {}, contig: {}, contig_start: {}, contig_end: {}, database_gene_start: {},'
                             ' database_gene_end: {}, hsp/length: {}/{}, pid: {:0.2f}%, plength: {:0.2f}%').format(
                             self.get_genome_id(),
                             self.get_genome_contig_id(),
                             self.get_genome_contig_start(),
                             self.get_genome_contig_end(),
                             self.get_amr_gene_start(),
                             self.get_amr_gene_end(),
                             self.get_hsp_length(),
                             self.get_amr_gene_length(),
                             self.get_pid(),
                             self.get_plength())) 
Example #6
Source File: proteinAlign.py    From LoReAn with MIT License 6 votes vote down vote up
def transeq(data):
    dummy = int(data[1])
    record = data[0]
    if dummy == 0:
        prot = (translate_frameshifted(record.seq[0:]))
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand0plus"))
    if dummy == 1:
        prot = (translate_frameshifted(record.seq[1:]))  # second frame
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand1plus"))
    if dummy == 2:
        prot = (translate_frameshifted(record.seq[2:]))  # third frame
        prot_rec =(SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand2plus"))
    if dummy == 3:
        prot = (translate_frameshifted(reverse_complement(record.seq)))  # negative first frame
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand0minus"))
    if dummy == 4:
        prot = (translate_frameshifted(reverse_complement(record.seq[:len(record.seq) - 1])))  # negative second frame
        prot_rec =(SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand1minus"))
    if dummy == 5:
        prot = (translate_frameshifted(reverse_complement(record.seq[:len(record.seq) - 2])))  # negative third frame
        prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand2minus"))
    return(prot_rec) 
Example #7
Source File: generate_output.py    From panaroo with MIT License 6 votes vote down vote up
def generate_pan_genome_reference(G, output_dir, split_paralogs=False):

    # need to treat paralogs differently?
    centroids = set()
    records = []

    for node in G.nodes():
        if not split_paralogs and G.nodes[node]['centroid'][0] in centroids:
            continue
        records.append(
            SeqRecord(Seq(max(G.nodes[node]['dna'], key=lambda x: len(x)),
                          generic_dna),
                      id=G.nodes[node]['name'],
                      description=""))
        for centroid in G.nodes[node]['centroid']:
            centroids.add(centroid)

    with open(output_dir + "pan_genome_reference.fa", 'w') as outfile:
        SeqIO.write(records, outfile, "fasta")

    return 
Example #8
Source File: prokka.py    From panaroo with MIT License 6 votes vote down vote up
def translate_sequences(sequence_dic):
    protein_list = []
    for strain_id in sequence_dic:
        sequence_record = sequence_dic[strain_id]
        if (len(sequence_record.seq) % 3) != 0:
            raise ValueError(
                "Coding sequence not divisible by 3, is it complete?!")
        protien_sequence = translate(str(sequence_record.seq))
        if protien_sequence[-1] == "*":
            protien_sequence = protien_sequence[0:-1]
        if "*" in protien_sequence:
            print(sequence_record)
            print(protien_sequence)
            # raise ValueError("Premature stop codon in a gene!")
        protein_record = SeqRecord(Seq(protien_sequence),
                                   id=strain_id,
                                   description=strain_id)
        protein_list.append(protein_record)
    return protein_list 
Example #9
Source File: intronerate.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def remove_exons(gff_filename,supercontig_filename,mode="all"):
    '''Given a supercontig and corresponding annotation, remove the exon sequences. In "intron" mode, only return sequences specifically annotated as introns'''
    exon_starts = []
    exon_ends = []
    gff = open(gff_filename).readlines()
    for line in gff:
        line = line.rstrip().split("\t")
        if len(line) > 2:
            if line[2] == "exon":
                exon_starts.append(int(line[3]))
                exon_ends.append(int(line[4]))
    supercontig = SeqIO.read(supercontig_filename,'fasta')
    exonless_contig = SeqRecord(Seq(''),id=supercontig.id)
    start = 0
    for exon in range(len(exon_starts)):
        exonless_contig += supercontig[start:exon_starts[exon]-1] 
        start = exon_ends[exon]
    exonless_contig += supercontig[start:]    
    exonless_contig.description = ''
    return exonless_contig 
Example #10
Source File: targeted_mut_fasta_corrected.py    From DiscoSnp with GNU Affero General Public License v3.0 6 votes vote down vote up
def targeted_mut(fasta_file, index):
    
    genome = SeqIO.parse(fasta_file, "fasta")
    fasta_file_mut = ("{}_mut".format(fasta_file))
    nb_mut = 0
    sequences_mut = []
  
    for record in genome:
        chr = record.id
        mutable_seq = record.seq.tomutable()
        #print(mutable_seq[0])
        if chr in index: 
            for pos in index[chr]:
                if mutable_seq[pos - 1] == index[chr][pos]["ref"]:
                    mutable_seq[pos - 1] = index[chr][pos]["alt"]
                    nb_mut += 1
       
        record_mut=SeqRecord(mutable_seq, record.id,'','')
        sequences_mut.append(record_mut)
    SeqIO.write(sequences_mut, fasta_file_mut, "fasta")

    print("{} bases have been mutated".format(nb_mut)) 
Example #11
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_prune_seqs_matching_alignment(self):
        sequence = {
            "seq1": SeqRecord(Seq("GTAC"), name="seq1"),
            "seq2": SeqRecord(Seq("CGTT"), name="seq2"),
            "seq3": SeqRecord(Seq("TAGC"), name="seq3"),
        }
        alignment = MultipleSeqAlignment(
            [
                SeqRecord(Seq("GTAC"), name="seq1"),
                SeqRecord(Seq("TAGC"), name="seq3"),
            ]
        )
        
        result = align.prune_seqs_matching_alignment(sequence.values(), alignment)
        assert [r.name for r in result] == ["seq2"]
        for r in result:
            assert r.seq == sequence[r.name].seq 
Example #12
Source File: import_beast.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def fake_alignment(T):
    """
    Fake alignment to appease treetime when only using it for naming nodes...
    This is lifted from refine.py and ideally could be imported

    Parameters
    -------
    T : <class 'Bio.Phylo.BaseTree.Tree'>

    Returns
    -------
    <class 'Bio.Align.MultipleSeqAlignment'>
    """
    from Bio import SeqRecord, Seq, Align
    seqs = []
    for n in T.get_terminals():
        seqs.append(SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description=''))
    aln = Align.MultipleSeqAlignment(seqs)
    return aln 
Example #13
Source File: seqprop.py    From ssbio with MIT License 6 votes vote down vote up
def seq(self, s):
        if not s:
            self._seq = None

        elif self.sequence_file:
            raise ValueError('{}: unable to set sequence, sequence file is associated with this object'.format(self.id))

        elif type(s) == str or type(s) == Seq:
            self._seq = ssbio.protein.sequence.utils.cast_to_seq(obj=s)

        # If a SeqRecord, copy all attributes
        elif type(s) == SeqRecord:
            self._seq = s.seq
            if self.name == '<unknown name>':
                self.name = s.name
            if self.description == '<unknown description>':
                self.description = s.description
            if not self.dbxrefs:
                self.dbxrefs = s.dbxrefs
            if not self.features:
                self.features = s.features
            if not self.annotations:
                self.annotations = s.annotations
            if not self.letter_annotations:
                self.letter_annotations = s.letter_annotations 
Example #14
Source File: trim_lowercase.py    From pacbio_variant_caller with MIT License 6 votes vote down vote up
def trim_lowercase(input_filename, output_filename, keep_completely_lowercased_sequences):
    """
    Trim all lowercase letters from the given input FASTA file, remove records
    that were completely lowercase, and write the trimmed FASTA to the given
    output file.
    """
    trimmed_records = []
    for record in SeqIO.parse(input_filename, "fasta"):
        # Remove all lowercase letters in the sequence for this record.
        trimmed_sequence = re.sub("(^[a-z]+|[a-z]+$)", "", str(record.seq))

        # If any uppercase sequence remains, create a new record for it.
        if len(trimmed_sequence) > 0:
            trimmed_records.append(SeqRecord(Seq(trimmed_sequence), id=record.id, description=""))
        elif keep_completely_lowercased_sequences:
            trimmed_records.append(SeqRecord(Seq(str(record.seq)), id=record.id, description=""))

    # Write out all trimmed records to the given output file.
    SeqIO.write(trimmed_records, output_filename, "fasta") 
Example #15
Source File: utils.py    From ssbio with MIT License 6 votes vote down vote up
def cast_to_seq(obj, alphabet=IUPAC.extended_protein):
    """Return a Seq representation of a string or SeqRecord object.

    Args:
        obj (str, Seq, SeqRecord): Sequence string or Biopython SeqRecord object
        alphabet: See Biopython SeqRecord docs

    Returns:
        Seq: Seq representation of the sequence

    """

    if isinstance(obj, Seq):
        return obj
    if isinstance(obj, SeqRecord):
        return obj.seq
    if isinstance(obj, str):
        obj = obj.upper()
        return Seq(obj, alphabet)
    else:
        raise ValueError('Must provide a string, Seq, or SeqRecord object.') 
Example #16
Source File: utils.py    From ssbio with MIT License 6 votes vote down vote up
def cast_to_str(obj):
    """Return a string representation of a Seq or SeqRecord.

    Args:
        obj (str, Seq, SeqRecord): Biopython Seq or SeqRecord

    Returns:
        str: String representation of the sequence

    """

    if isinstance(obj, str):
        return obj
    if isinstance(obj, Seq):
        return str(obj)
    if isinstance(obj, SeqRecord):
        return str(obj.seq)
    else:
        raise ValueError('Must provide a string, Seq, or SeqRecord object.') 
Example #17
Source File: promoters.py    From antismash with GNU Affero General Public License v3.0 6 votes vote down vote up
def write_promoters_to_file(output_dir: str, prefix: str, promoters: List[Promoter]) -> None:
    """ Write the promoters to the given file using the given prefix.

        The prefix helps in having separate files for multi-record inputs with
        a singular output directory.
    """
    # positions file
    pos_handle = open(os.path.join(output_dir, prefix + "_promoter_positions.csv"), "w")
    pos_handle.write("\t".join(["#", "promoter", "start", "end", "length"]) + "\n")
    # sequences file
    seq_handle = open(os.path.join(output_dir, prefix + "_promoter_sequences.fasta"), "w")

    for i, promoter in enumerate(promoters):
        # write promoter positions to file
        pos_handle.write("\t".join(map(str, [i + 1, promoter.get_id(),
                                             promoter.start + 1, promoter.end + 1,
                                             len(promoter)])) + "\n")

        # write promoter sequences to file
        SeqIO.write(SeqRecord(promoter.seq, id=promoter.get_id(),
                    description="length={}bp".format(len(promoter))),
                    seq_handle,
                    "fasta") 
Example #18
Source File: intronerate.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def make_intron_supercontig(contig_info,gene,prefix,add_N = False):
    cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta'))
    intron_supercontig = SeqRecord(Seq(''))
    for i in contig_info:
        if i[5] == "(+)":
            intron_supercontig += cap3contigs[i[0]]
        elif i[5] == "(-)":
            intron_supercontig += cap3contigs[i[0]].reverse_complement()    
        else:
            sys.stderr.write("Strandedness not found!")
            sys.exit(1)
        if add_N and i != contig_info[-1]:
            intron_supercontig += "NNNNNNNNNN"    
    intron_supercontig.id = '{}-{}'.format(prefix,gene)
    intron_supercontig.description = ''
    SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta') 
Example #19
Source File: seq.py    From wub with Mozilla Public License 2.0 6 votes vote down vote up
def count_records(input_object, format='fasta'):
    """Count SeqRecord objects from a file in the specified format.

    :param input_object: A file object or a file name.
    :param format: Input format (fasta by default).
    :returns: Number of records in input file.
    :rtype: int

    """
    handle = input_object
    if type(handle) == str:
        handle = open(handle, "rU")
    counter = 0
    for _ in SeqIO.parse(handle, format):
        counter += 1
    return counter 
Example #20
Source File: test_circular_conversion.py    From antismash with GNU Affero General Public License v3.0 5 votes vote down vote up
def setUp(self):
        self.seqrec = SeqRecord(UnknownSeq(21))
        loc = CompoundLocation([FeatureLocation(12, 15, strand=1),
                                FeatureLocation(18, 21, strand=1),
                                FeatureLocation(0, 3, strand=1),
                                FeatureLocation(6, 9, strand=1)],
                                operator="join")
        self.seqcds = SeqFeature(loc, type="CDS")
        self.seqgene = SeqFeature(loc, type="gene")
        self.seqrec.annotations["topology"] = "circular" 
Example #21
Source File: record.py    From antismash with GNU Affero General Public License v3.0 5 votes vote down vote up
def to_biopython(self) -> SeqRecord:
        """Returns a Bio.SeqRecord instance of the record"""
        features = self.get_all_features()
        bio_features = []  # type: List[SeqFeature]
        for feature in sorted(features):
            bio_features.extend(feature.to_biopython())
        return SeqRecord(self.seq, id=self._record.id, name=self._record.name,
                         description=self._record.description,
                         dbxrefs=self.dbxrefs, features=bio_features,
                         annotations=self._record.annotations,
                         letter_annotations=self._record.letter_annotations) 
Example #22
Source File: record.py    From antismash with GNU Affero General Public License v3.0 5 votes vote down vote up
def __getattr__(self, attr: str) -> Any:
        # passthroughs to the original SeqRecord
        if attr in ["id", "seq", "description", "name", "annotations", "dbxrefs"]:
            return getattr(self._record, attr)
        if attr in Record.__slots__:
            return self.__getattribute__(attr)
        raise AttributeError("Record has no attribute '%s'" % attr) 
Example #23
Source File: seq.py    From wub with Mozilla Public License 2.0 5 votes vote down vote up
def read_seq_records(input_object, format='fasta'):
    """Read SeqRecord objects from a file in the specified format.

    :param input_object: A file object or a file name.
    :param format: Input format (fasta by default).
    :returns: A dictionary with the parsed SeqRecord objects.
    :rtype: generator

    """
    handle = input_object
    if type(handle) == str:
        handle = open(handle, "rU")
    return SeqIO.parse(handle, format) 
Example #24
Source File: test_circular_conversion.py    From antismash with GNU Affero General Public License v3.0 5 votes vote down vote up
def setUp(self):
        self.seqrec = SeqRecord(UnknownSeq(21))
        loc = CompoundLocation([FeatureLocation(12, 21, strand=1),
                                FeatureLocation(0, 3, strand=1),
                                FeatureLocation(6, 9, strand=1)],
                                operator="join")
        self.seqcds = SeqFeature(loc, type="CDS")
        self.seqgene = SeqFeature(loc, type="gene")
        self.seqrec.annotations["topology"] = "circular" 
Example #25
Source File: seq.py    From wub with Mozilla Public License 2.0 5 votes vote down vote up
def read_seq_records_dict(input_object, format='fasta'):
    """Read SeqRecord objects to a dictionary from a file in the specified format.

    :param input_object: A file object or a file name.
    :param format: Input format (fasta by default).
    :returns: An iterator of SeqRecord objects.
    :rtype: dict

    """
    handle = input_object
    if type(handle) == str:
        handle = open(handle, "rU")
    return SeqIO.to_dict(SeqIO.parse(handle, format)) 
Example #26
Source File: seq.py    From wub with Mozilla Public License 2.0 5 votes vote down vote up
def record_lengths(input_iter):
    """Return lengths of SeqRecord obejcts in the input iterator.

    :param input_iter: An iterator of SeqRecord objects.
    :returns: An ordered dictionary with the lengths of the SeqRecord objects.
    :rtype: OrderedDict

    """
    lengths = OrderedDict((record.id, len(record)) for record in input_iter)
    return lengths 
Example #27
Source File: transcriptAssembly.py    From LoReAn with MIT License 5 votes vote down vote up
def bamtofastq(bam, verbose):
    fasta = bam + ".fasta"
    in_file = open(bam, 'r')
    in_sam = Reader(in_file)
    with open(fasta, "w") as output_handle:
        for line in in_sam:
            if line.mapped:
                record = SeqRecord(Seq(str(line.seq)),name = str(line.qname))
                SeqIO.write(record, output_handle, "fasta")
    return fasta 
Example #28
Source File: ancestralReconstruction.py    From popgen-stats with MIT License 5 votes vote down vote up
def concatenate_genes(coreGenes):
    """ Using Biopython, concatenate ancestral genes into one file"""
    print "Concatenating ancestral reconstructions"
    def parse_lazarus_output(coreGene):
        ancRecFile = open("%s/ancestral/ancestor.out.txt" % coreGene, "r")
        for i,line in enumerate(ancRecFile):
            if i == 13:
                record = SeqRecord(Seq(line.strip(), IUPAC.ambiguous_dna), 
                    id=coreGene, description="")
        return record
    pool = ThreadPool(args.threads)
    recs = pool.map(parse_lazarus_output, coreGenes)
    SeqIO.write(recs, "ancestralGenes.fa", "fasta") 
Example #29
Source File: fasta_merge.py    From HybPiper with GNU General Public License v3.0 5 votes vote down vote up
def get_unique_names(gene_dict):
    '''Given the dictionary of SeqRecord dictionaries, return a list of the unique sequence headers'''
    all_names = []
    for gene in gene_dict:
        all_names += list(gene_dict[gene].keys())
    return set(all_names) 
Example #30
Source File: fasta_merge.py    From HybPiper with GNU General Public License v3.0 5 votes vote down vote up
def insert_sequences(gene_dict,unique_names):
    '''Given the dictionary of dictionaries, insert blank sequences if any are missing for a gene'''
    inserted_sequences = 0
    for gene in gene_dict:
        for name in unique_names:
            if name not in gene_dict[gene]:
                gene_length = len(next(iter(gene_dict[gene].values())))
                gene_dict[gene][name] = SeqRecord(Seq("-"*gene_length),id=name)
                inserted_sequences += 1
    sys.stderr.write("{} Empty sequences inserted across all genes.\n".format(inserted_sequences))            
    return gene_dict