Python Examples of Bio.Seq.Seq

Source File: util.py From deepbgc with MIT License

7 votes

def create_faux_record_from_proteins(proteins, id):
    """Build a sequence-less SeqRecord that lays the given proteins end to end as CDS features.

    Every protein receives a forward-strand ``CDS`` feature whose span is three
    times its amino-acid length. Spans are contiguous and the first one starts
    at position 0. The record's own sequence is left empty.

    :param proteins: Iterable of objects exposing ``.id`` and ``.seq``.
    :param id: Identifier assigned to the returned record.
    :return: SeqRecord carrying one CDS feature per protein.
    """
    from Bio.Seq import Seq
    from Bio.SeqFeature import FeatureLocation, SeqFeature
    from Bio.SeqRecord import SeqRecord

    # Protein ids longer than this are truncated in the ``protein_id`` qualifier.
    id_limit = 45
    faux_record = SeqRecord(seq=Seq(''), id=id)
    cursor = 0
    for protein in proteins:
        stop = cursor + len(protein.seq) * 3
        qualifiers = {
            'protein_id': [protein.id[:id_limit]],
            'translation': [str(protein.seq)],
        }
        cds = SeqFeature(
            location=FeatureLocation(cursor, stop, strand=1),
            type="CDS",
            qualifiers=qualifiers,
        )
        faux_record.features.append(cds)
        # The next feature begins exactly where this one ended.
        cursor = stop
    return faux_record

Source File: prep-genome.py From kevlar with MIT License

7 votes

def main(args):
    """Split every FASTA record of ``args.infile`` into unambiguous chunks and write them out.

    Records whose id matches any pattern in ``args.discard`` are skipped. Each
    remaining sequence is upper-cased and cut at every run of characters other
    than A, C, G or T. Pieces longer than ``args.minlength`` are written to
    ``args.outfile`` in FASTA format as ``<record id>_chunk_<n>``, with ``n``
    restarting at 1 for every record.
    """
    for record in SeqIO.parse(args.infile, 'fasta'):
        # Every discard pattern is tried against the id; one hit is enough to skip.
        if args.discard and [rx for rx in args.discard if re.match(rx, record.id)]:
            continue

        printlog(args.debug, "DEBUG: convert to upper case", record.id)
        sequence = str(record.seq).upper()
        printlog(args.debug, "DEBUG: split seq by Ns", record.id)
        chunks = [piece for piece in re.split('[^ACGT]+', sequence) if len(piece) > args.minlength]
        printlog(args.debug, "DEBUG: print subseqs", record.id)
        for number, chunk in enumerate(chunks, 1):
            chunk_id = '{:s}_chunk_{:d}'.format(record.id, number)
            SeqIO.write(SeqRecord(Seq(chunk), chunk_id, '', ''), args.outfile, 'fasta')

Source File: util.py From Azimuth with BSD 3-Clause "New" or "Revised" License

6 votes

def convert_to_thirty_one(guide_seq, gene, strand):
    '''
    Given a guide sequence, a gene name, and strand (e.g. "sense"), return a 31mer string which is our 30mer,
    plus one more at the end.

    The guide is searched for in the reverse complement of the gene sequence
    (the guide itself is reverse-complemented first when strand is "sense").
    On a hit, the match together with the single base that precedes it in that
    orientation is returned as a str, reverse-complemented back for "sense" guides.

    If the guide cannot be found, a message is printed and the reverse-complemented
    gene sequence with 'A' appended is returned (a Seq object, not a str).
    '''
    guide_seq = Seq.Seq(guide_seq)
    gene_seq = Seq.Seq(get_gene_sequence(gene)).reverse_complement()
    if strand == 'sense':
        guide_seq = guide_seq.reverse_complement()
    ind = gene_seq.find(guide_seq)
    if ind == -1:
        # A single parenthesised argument prints identically on Python 2 and Python 3.
        print("returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene))
        # NOTE(review): the message suggests guide + 'A' was intended, but the whole
        # gene sequence is what gets returned; left unchanged pending confirmation.
        return gene_seq + 'A'
    assert gene_seq[ind:(ind+len(guide_seq))] == guide_seq, "match not right"
    # Extending the slice to the right (ind+len+1) looks correct but is wrong because of the
    # strand frame of reference; starting one base earlier is what yields the extra trailing base.
    # NOTE(review): when ind == 0 the start index becomes -1 and the slice wraps around,
    # which most likely yields an empty or wrong result -- confirm how callers handle it.
    new_mer = gene_seq[(ind-1):(ind+len(guide_seq))]
    if strand == 'sense':
        new_mer = new_mer.reverse_complement()
    return str(new_mer)

Source File: prokka.py From panaroo with MIT License

6 votes

def translate_sequences(sequence_dic):
    """Translate every coding sequence in *sequence_dic* into a protein SeqRecord.

    A single trailing stop codon ("*") is removed from each translation. A
    translation that still contains "*" afterwards is only reported by printing
    the source record and the protein; it is kept in the output.

    Args:
        sequence_dic: Mapping of strain id to a record exposing ``.seq``.

    Returns:
        list: One protein SeqRecord per entry, with the strain id as both id
        and description.

    Raises:
        ValueError: If a coding sequence length is not a multiple of three.
    """
    protein_list = []
    for strain_id, sequence_record in sequence_dic.items():
        if (len(sequence_record.seq) % 3) != 0:
            raise ValueError(
                "Coding sequence not divisible by 3, is it complete?!")
        protein_sequence = translate(str(sequence_record.seq))
        # endswith() rather than [-1]: an empty coding sequence translates to an
        # empty protein, and indexing it would raise IndexError.
        if protein_sequence.endswith("*"):
            protein_sequence = protein_sequence[:-1]
        if "*" in protein_sequence:
            # Premature stop codon: reported, deliberately not treated as fatal.
            print(sequence_record)
            print(protein_sequence)
        protein_record = SeqRecord(Seq(protein_sequence),
                                   id=strain_id,
                                   description=strain_id)
        protein_list.append(protein_record)
    return protein_list

Source File: generate_output.py From panaroo with MIT License

6 votes

def generate_pan_genome_reference(G, output_dir, split_paralogs=False):
    """Write one representative DNA sequence per graph node to ``pan_genome_reference.fa``.

    For each node the longest entry of its ``'dna'`` attribute is written, using
    the node's ``'name'`` as the FASTA id. Unless ``split_paralogs`` is true, a
    node is skipped when its first centroid was already seen on an earlier node.

    Args:
        G: Graph whose nodes carry ``'centroid'``, ``'dna'`` and ``'name'`` attributes.
        output_dir: Directory that receives the FASTA file.
        split_paralogs: When true, no node is skipped on account of a shared centroid.
    """
    import os

    # need to treat paralogs differently?
    centroids = set()
    records = []

    for node in G.nodes():
        node_data = G.nodes[node]
        if not split_paralogs and node_data['centroid'][0] in centroids:
            continue
        records.append(
            SeqRecord(Seq(max(node_data['dna'], key=len), generic_dna),
                      id=node_data['name'],
                      description=""))
        centroids.update(node_data['centroid'])

    # os.path.join gives the same path as before when output_dir ends with a
    # separator, and a correct one when it does not.
    with open(os.path.join(output_dir, "pan_genome_reference.fa"), 'w') as outfile:
        SeqIO.write(records, outfile, "fasta")

Source File: primer_vcf.py From VCF-kit with MIT License

6 votes

def fetch_sequence(self, use_template):
        """
            Fetches sequence surrounding variant for REF, ALT, or given sample.

            "REF" extracts the region from the reference with samtools faidx;
            "ALT" or None additionally pipes it through bcftools consensus; any
            other value is passed to bcftools consensus as the sample name.
            Returns an empty Seq when the command or the parsing fails.
        """
        sample_flag = ""
        if use_template == "REF":
            command = "samtools faidx {self.reference_file} {self.region}"
        elif use_template == "ALT" or use_template is None:
            command = "samtools faidx {self.reference_file} {self.region} | bcftools consensus {self.filename}"
        else:
            sample_flag = "--sample=" + use_template  # Get use_template for sample.
            command = "samtools faidx {self.reference_file} {self.region} | bcftools consensus {sample_flag} {self.filename}"
        # The templates refer to the local names `self` and `sample_flag`.
        command = command.format(**locals())
        # NOTE(review): the command runs through the shell with the file names, region and
        # sample name interpolated unescaped -- confirm these never carry untrusted input.
        try:
            seq = check_output(command, shell=True)
            if isinstance(seq, bytes):
                # On Python 3 the output is bytes; without decoding, ''.join() below
                # fails and every call silently falls back to the empty sequence.
                seq = seq.decode()
            # Drop the FASTA header line and join the remaining sequence lines.
            seq = Seq(''.join(seq.splitlines()[1:]), DNA_SET)
        except Exception:
            # Still best-effort, but KeyboardInterrupt/SystemExit are no longer swallowed.
            seq = Seq('')
        return seq

Source File: test_align.py From augur with GNU Affero General Public License v3.0

6 votes

def test_prune_seqs_matching_alignment(self):
        """Pruning against an alignment should leave only the sequences missing from it."""
        bases_by_name = (("seq1", "GTAC"), ("seq2", "CGTT"), ("seq3", "TAGC"))
        sequence = {
            name: SeqRecord(Seq(bases), name=name)
            for name, bases in bases_by_name
        }
        # seq1 and seq3 are already in the alignment, so only seq2 should be returned.
        alignment = MultipleSeqAlignment(
            [
                SeqRecord(Seq(bases), name=name)
                for name, bases in bases_by_name
                if name != "seq2"
            ]
        )

        result = align.prune_seqs_matching_alignment(sequence.values(), alignment)
        assert [r.name for r in result] == ["seq2"]
        for pruned in result:
            assert pruned.seq == sequence[pruned.name].seq

Source File: util.py From Azimuth with BSD 3-Clause "New" or "Revised" License

6 votes

def guide_positional_features(guide_seq, gene, strand):
    """
    Given a guide sequence, a gene name, and strand (e.g. "sense"), return the (absolute) nucleotide cut position, and the percent amino acid.
    From John's email:
    the cut site is always 3nts upstream of the NGG PAM:
    5' - 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 <cut> 18 19 20 N G G - 3'
    To calculate percent protein, we determined what amino acid number was being cut and just divided by the total number of amino acids. In the case where the cutsite was between two amino acid codons, I believe we rounded down

    NOTE: the computation described above is not implemented yet. As written, the
    function returns "" when the guide cannot be located in the gene and raises
    NotImplementedError otherwise.
    """

    guide_seq = Seq.Seq(guide_seq)
    gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
    if strand == 'sense':
        guide_seq = guide_seq.reverse_complement()
    ind = gene_seq.find(guide_seq)
    if ind == -1:
        # A single parenthesised argument prints identically on Python 2 and Python 3.
        # The message now states what is actually returned (an empty string, not None).
        print("returning '', could not find guide %s in gene %s" % (guide_seq, gene))
        return ""
    assert gene_seq[ind:(ind+len(guide_seq))] == guide_seq, "match not right"
    ## now get what we want from this:
    # The interactive debugger breakpoint that used to sit here was removed so
    # callers get the exception below instead of a hung process.
    raise NotImplementedError("incomplete implentation for now")

Source File: utils.py From ssbio with MIT License

6 votes

def cast_to_str(obj):
    """Convert a sequence-like object to a plain string.

    Args:
        obj (str, Seq, SeqRecord): Plain string, Biopython Seq, or Biopython SeqRecord

    Returns:
        str: The input itself when it is already a string, otherwise the string
        form of the Seq or of the SeqRecord's ``seq``

    Raises:
        ValueError: If ``obj`` is none of the accepted types

    """

    if isinstance(obj, str):
        return obj
    if isinstance(obj, Seq):
        return str(obj)
    if not isinstance(obj, SeqRecord):
        raise ValueError('Must provide a string, Seq, or SeqRecord object.')
    return str(obj.seq)

Source File: utils.py From ssbio with MIT License

6 votes

def cast_to_seq(obj, alphabet=IUPAC.extended_protein):
    """Convert a sequence-like object to a Biopython Seq.

    Args:
        obj (str, Seq, SeqRecord): Sequence string, Biopython Seq, or Biopython SeqRecord
        alphabet: Alphabet given to the new Seq; only used when ``obj`` is a string

    Returns:
        Seq: ``obj`` itself for a Seq, its ``seq`` attribute for a SeqRecord, or a
        new Seq built from the upper-cased string

    Raises:
        ValueError: If ``obj`` is none of the accepted types

    """

    if isinstance(obj, Seq):
        return obj
    if isinstance(obj, SeqRecord):
        return obj.seq
    if not isinstance(obj, str):
        raise ValueError('Must provide a string, Seq, or SeqRecord object.')
    return Seq(obj.upper(), alphabet)

Source File: seqprop.py From ssbio with MIT License

6 votes

def seq(self):
        """Seq: Sequence read from the sequence file when one is set, otherwise the copy held in memory"""

        if not self.sequence_file:
            # No file to read from: hand back whatever is stored, logging which case applies.
            if self._seq:
                status = '{}: reading sequence from memory'
            else:
                status = '{}: no sequence stored in memory'
            log.debug(status.format(self.id))
            return self._seq

        source_path = copy(self.sequence_path)
        log.debug('{}: reading sequence from sequence file {}'.format(self.id, source_path))
        return SeqIO.read(source_path, 'fasta').seq

Source File: AMRHitHSP.py From staramr with Apache License 2.0

6 votes

def get_seq_record(self):
        """
        Builds the SeqRecord describing this hit.
        :return: A SeqRecord holding the genome contig HSP sequence, identified by the AMR gene id, whose
                 description lists the isolate, contig coordinates, database gene coordinates and match statistics.
        """
        hsp_sequence = Seq(self.get_genome_contig_hsp_seq())
        gene_id = self.get_amr_gene_id()
        description_template = (
            'isolate: {}, contig: {}, contig_start: {}, contig_end: {}, database_gene_start: {},'
            ' database_gene_end: {}, hsp/length: {}/{}, pid: {:0.2f}%, plength: {:0.2f}%')
        # Values in the order the template consumes them.
        summary_values = (
            self.get_genome_id(),
            self.get_genome_contig_id(),
            self.get_genome_contig_start(),
            self.get_genome_contig_end(),
            self.get_amr_gene_start(),
            self.get_amr_gene_end(),
            self.get_hsp_length(),
            self.get_amr_gene_length(),
            self.get_pid(),
            self.get_plength(),
        )
        return SeqRecord(hsp_sequence, id=gene_id,
                         description=description_template.format(*summary_values))

Source File: primer3.py From VCF-kit with MIT License

6 votes

def __init__(self, primer_values, template, reference, left_primer = True):
        """Store the primer attributes, locate the primer on the template and count its BLAST copies.

        Every key of ``primer_values`` becomes an attribute of the instance; the
        code below relies on one of them being ``SEQUENCE``. ``START`` is the
        result of ``str.find`` on the template (so -1 when the primer is not
        present) and ``END`` is ``START`` plus the primer length.
        """
        for attribute, value in primer_values.items():
            setattr(self, attribute, value)
        self.left = left_primer
        template_text = str(template)
        if left_primer:
            target = self.SEQUENCE
        else:
            # Reverse - complement right primer to find its location
            target = str(Seq(self.SEQUENCE).reverse_complement())
        self.START = template_text.find(target)
        self.END = self.START + len(self.SEQUENCE)

        # Blast primer sequence
        b = blast(reference, num_alignments = 10, word_size = 14)
        self.unique_copies = b.check_primer(self.SEQUENCE)

Source File: proteinAlign.py From LoReAn with MIT License

6 votes

def transeq(data):
    """Translate a nucleotide record in one of the six reading frames.

    ``data`` is a pair ``(record, frame)``. Frames 0-2 translate the forward
    strand after skipping 0, 1 or 2 leading bases. Frames 3-5 translate the
    reverse complement after dropping 0, 1 or 2 trailing bases of the forward
    sequence. The returned protein SeqRecord has the id
    ``<record id>_strand<offset><plus|minus>``.

    Raises:
        ValueError: If the frame is not in the range 0-5 (this used to surface
            as an UnboundLocalError at the return statement).
    """
    record = data[0]
    frame = int(data[1])
    if not 0 <= frame <= 5:
        raise ValueError("reading frame must be between 0 and 5, got {}".format(frame))

    offset = frame % 3
    if frame < 3:
        strand = "plus"
        nucleotides = record.seq[offset:]
    else:
        strand = "minus"
        # Trimming the forward 3' end shifts the frame on the reverse strand.
        nucleotides = reverse_complement(record.seq[:len(record.seq) - offset])

    prot = translate_frameshifted(nucleotides)
    return SeqRecord(Seq(prot, IUPAC.protein),
                     id=record.id + "_strand{}{}".format(offset, strand))

Source File: intronerate.py From HybPiper with GNU General Public License v3.0

6 votes

def remove_exons(gff_filename,supercontig_filename,mode="all"):
    '''Given a supercontig and corresponding annotation, remove the exon sequences.

    Rows of the tab-separated annotation whose third column is "exon" supply the
    1-based inclusive start and end coordinates (fourth and fifth columns) of the
    stretches to cut out. Everything between the exons, plus the sequence before
    the first and after the last exon, is concatenated into the returned
    SeqRecord, which keeps the supercontig id and has an empty description.

    NOTE(review): `mode` is accepted but never consulted here, so an "intron"-only
    mode is not implemented by this function.
    '''
    exons = []
    # The context manager closes the annotation file; it used to be left open.
    with open(gff_filename) as gff:
        for line in gff:
            fields = line.rstrip().split("\t")
            if len(fields) > 2 and fields[2] == "exon":
                exons.append((int(fields[3]), int(fields[4])))
    supercontig = SeqIO.read(supercontig_filename,'fasta')
    exonless_contig = SeqRecord(Seq(''),id=supercontig.id)
    start = 0
    for exon_start, exon_end in exons:
        # exon_start is 1-based, so the slice ends just before the exon's first base.
        exonless_contig += supercontig[start:exon_start-1]
        start = exon_end
    exonless_contig += supercontig[start:]
    exonless_contig.description = ''
    return exonless_contig

Source File: intronerate.py From HybPiper with GNU General Public License v3.0

6 votes

def make_intron_supercontig(contig_info,gene,prefix,add_N = False):
    """Concatenate the listed contigs into one supercontig and write it as FASTA.

    Contigs are looked up by ``i[0]`` in ``../<gene>_contigs.fasta`` and appended
    in the order given, reverse-complemented when ``i[5]`` is "(-)". With
    ``add_N`` a spacer of ten N is inserted between consecutive contigs. The
    result is written to ``sequences/intron/<gene>_supercontig.fasta`` with the
    id ``<prefix>-<gene>``.

    Exits the process with status 1 if a contig's strand is neither "(+)" nor "(-)".
    """
    cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta'))
    intron_supercontig = SeqRecord(Seq(''))
    last_index = len(contig_info) - 1
    for index, i in enumerate(contig_info):
        if i[5] == "(+)":
            intron_supercontig += cap3contigs[i[0]]
        elif i[5] == "(-)":
            intron_supercontig += cap3contigs[i[0]].reverse_complement()
        else:
            sys.stderr.write("Strandedness not found!")
            sys.exit(1)
        # Compare positions, not values: an earlier row that happens to equal the
        # last row must still be followed by a spacer.
        if add_N and index != last_index:
            intron_supercontig += "NNNNNNNNNN"
    intron_supercontig.id = '{}-{}'.format(prefix,gene)
    intron_supercontig.description = ''
    SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta')

Source File: test_translate.py From augur with GNU Affero General Public License v3.0

6 votes

def test_translate_feature(self):
        '''
        translate_feature should turn each nucleotide sequence of an alignment dictionary into its amino acid string
        '''
        # name -> (nucleotides, expected amino acids)
        # Codons per https://en.wikipedia.org/wiki/DNA_codon_table
        cases = {
            'seq1': ("TTTCTTATGGTCGTA", 'FLMVV'),
            'seq2': ("TCTTCAACTGCTACA", 'SSTAT'),
            'seq3': ("CATAATGAATATAAT", 'HNEYN'),
        }
        aln = {name: Seq(case[0]) for name, case in cases.items()}
        expected_translations = {name: case[1] for name, case in cases.items()}
        # The feature spans all 15 bases of every sequence.
        feature = SeqFeature(FeatureLocation(0, 15), type="domain")

        assert translate.translate_feature(aln, feature) == expected_translations

    # TODO: test_vcf_feature, assign_aa_vcf, assign_aa_fasta
    # Unclear how to emulate inputs (TreeTime dict, tree)

Source File: import_beast.py From augur with GNU Affero General Public License v3.0

6 votes

def fake_alignment(T):
    """
    Build a placeholder alignment with one identical 'ACGT' sequence per tree tip,
    for use when treetime is only needed to name nodes.
    This is lifted from refine.py and ideally could be imported

    Parameters
    -------
    T : <class 'Bio.Phylo.BaseTree.Tree'>

    Returns
    -------
    <class 'Bio.Align.MultipleSeqAlignment'>
    """
    from Bio import SeqRecord, Seq, Align
    placeholder_records = [
        SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=tip.name, name=tip.name, description='')
        for tip in T.get_terminals()
    ]
    return Align.MultipleSeqAlignment(placeholder_records)

Source File: antibody.py From abstar with MIT License

5 votes

def _vdj_germ_aa(self):
        """Return the germline VDJ region translated to an amino acid string.

        Translation starts at ``v_rf_offset``; trailing bases that do not fill
        a complete codon from that offset are left out.
        """
        leftover = len(self.vdj_germ_nt[self.v_rf_offset:]) % 3
        in_frame_end = len(self.vdj_germ_nt) - leftover
        coding_nt = self.vdj_germ_nt[self.v_rf_offset:in_frame_end]
        return str(Seq(coding_nt, generic_dna).translate())

Source File: transcriptAssembly.py From LoReAn with MIT License

5 votes

def bamtofastq(bam, verbose):
    """Write the sequence of every mapped read in *bam* to ``<bam>.fasta``.

    The input is opened in text mode and iterated through ``Reader``; reads whose
    ``mapped`` attribute is false are skipped.

    :param bam: Path of the alignment file to read.
    :param verbose: Not used by this function.
    :return: Path of the FASTA file that was written.
    """
    fasta = bam + ".fasta"
    # Both handles are closed on exit; the input handle used to be left open.
    with open(bam, 'r') as in_file:
        in_sam = Reader(in_file)
        with open(fasta, "w") as output_handle:
            for line in in_sam:
                if line.mapped:
                    read_name = str(line.qname)
                    # The FASTA writer builds the header from id/description, not name;
                    # with only name set every read was written as "<unknown id>".
                    record = SeqRecord(Seq(str(line.seq)), id=read_name,
                                       name=read_name, description='')
                    SeqIO.write(record, output_handle, "fasta")
    return fasta

Source File: germline.py From abstar with MIT License

5 votes

def _get_aa_sequence(self):
        """Translate ``coding_region`` (as generic DNA) and return the resulting Seq."""
        coding_dna = Seq(self.coding_region, generic_dna)
        return coding_dna.translate()

Source File: coordinate_mapper.py From cDNA_Cupcake with BSD 3-Clause Clear License

5 votes

def consistute_genome_seq_from_exons(genome_dict, _chr, exons, strand):
    """
    Concatenate the exon slices of one chromosome into a plain string.

    genome_dict is expected to be SeqReaders.LazyFastaReader
    exons is a list of [Interval(start, end)]

    Exons are joined in the order given. For strand '+' the joined sequence is
    returned as-is; for any other strand value its reverse complement is returned.
    """
    genome_seq = genome_dict[_chr].seq
    # join() builds the result in one pass instead of repeated string +=.
    seq = ''.join(str(genome_seq[e.start:e.end]) for e in exons)

    if strand == '+':
        return seq
    # str() replaces Seq.tostring(), which no longer exists in current Biopython.
    return str(Seq(seq).reverse_complement())

Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0

5 votes

def myTranslate(nucl):
    """Translate a raw nucleotide string and return the amino acids as a plain string."""
    return str(Seq(nucl).translate())

Source File: SeqReaders.py From cDNA_Cupcake with BSD 3-Clause Clear License

5 votes

def __getitem__(self, k):
        """Return the record stored under key *k* as a SeqRecord with phred qualities.

        The file is positioned at the offset recorded for *k*, then three lines
        are read: the sequence, a separator starting with '+', and the quality
        string. Qualities are decoded as ``ord(char) - 33``.

        Raises Exception if *k* is not a known key.
        """
        if k not in self.d:
            raise Exception("key {0} not in dictionary!".format(k))
        self.f.seek(self.d[k])

        sequence = self.f.readline().strip()
        # Read the separator outside the assert: under `python -O` asserts are
        # stripped, and the readline() would vanish with it, so the '+' line
        # would then be parsed as the quality string.
        separator = self.f.readline()
        assert separator.startswith('+')
        qualstr = self.f.readline().strip()
        return SeqRecord(seq=Seq(sequence), id=k, \
                         letter_annotations={'phred_quality':[ord(x)-33 for x in qualstr]})

Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0

5 votes

def insert_sequences(gene_dict,unique_names):
    '''Fill every gene's dictionary with an all-gap record for each name it lacks.

    The gap record is as long as the first sequence already stored for that gene.
    The number of records added is reported on stderr and the same (mutated)
    dictionary is returned.
    '''
    added = 0
    for gene, records in gene_dict.items():
        for name in unique_names:
            if name in records:
                continue
            template = next(iter(records.values()))
            records[name] = SeqRecord(Seq("-"*len(template)),id=name)
            added += 1
    sys.stderr.write("{} Empty sequences inserted across all genes.\n".format(added))
    return gene_dict

Source File: simulate_errors.py From wub with Mozilla Public License 2.0

5 votes

def simulate_errors(input_iter, error_rate, error_weights):
    """Yield each input SeqRecord with its sequence replaced by an error-simulated one.

    The records are modified in place and yielded lazily, one per input record.

    :param input_iter: Iterator of SeqRecord objects.
    :param error_rate: Total error rate of substitutions, insertions and deletions.
    :param error_weights: Relative frequency of substitutions,insertions,deletions.
    :returns: Generator of SeqRecord objects.
    :rtype: generator
    """
    for rec in input_iter:
        simulation = sim_seq.simulate_sequencing_errors(rec.seq, error_rate, error_weights)
        rec.seq = Seq(simulation.seq)
        yield rec

Source File: add_errors.py From wub with Mozilla Public License 2.0

5 votes

def add_fixed_errors(input_iter, nr_errors, error_type):
    """Yield each input SeqRecord with a fixed number of errors added to its sequence.

    The records are modified in place and yielded lazily, one per input record.

    :param input_iter: Iterator of SeqRecord objects.
    :param nr_errors: Number of errors to introduce.
    :param error_type: Error type: substitution, insertion or deletion.
    :returns: Generator of SeqRecord objects.
    :rtype: generator
    """
    for rec in input_iter:
        rec.seq = Seq(sim_seq.add_errors(rec.seq, nr_errors, error_type))
        yield rec

Source File: align.py From augur with GNU Affero General Public License v3.0

5 votes

def make_gaps_ambiguous(aln):
    '''
    Replace every gap character '-' by 'N' in each sequence of the alignment, so
    that TreeTime treats those positions as fully ambiguous and fills in the most
    likely state. The alignment is modified in place; nothing is returned.

    Parameters
    ----------
    aln : MultipleSeqAlign
        Biopython Alignment
    '''
    for record in aln:
        ungapped = str(record.seq).replace('-', 'N')
        # Keep the alphabet of the sequence being replaced.
        record.seq = Seq.Seq(ungapped, alphabet=record.seq.alphabet)

Source File: SeqReaders.py From cDNA_Cupcake with BSD 3-Clause Clear License

5 votes

def __getitem__(self, k):
        """Return the sequence stored under key *k* as a SeqRecord.

        Starting at the file offset recorded for *k*, stripped lines are
        collected until a line starting with '>' or the end of the file.

        Raises Exception if *k* is not a known key.
        """
        if k not in self.d:
            raise Exception("key {0} not in dictionary!".format(k))
        self.f.seek(self.d[k])
        pieces = []
        for line in self.f:
            if line.startswith('>'):
                # Header of the next record: this sequence is complete.
                break
            pieces.append(line.strip())
        return SeqRecord(seq=Seq(''.join(pieces)), id=k)

Source File: sequencevalidator.py From CAMISIM with Apache License 2.0

5 votes

def validate_sequence(self, sequence, key=None, silent=False):
		"""
			Check that a sequence is non-empty and contains only letters of its own alphabet

			@param sequence: sequence
			@type sequence: Seq
			@param key: Optional label; when given it is quoted at the start of the empty-sequence error message and passed on to validate_characters
			@type key: basestring | None
			@param silent: If True, no error message will be made
			@type silent: bool

			@return: True if valid
			@rtype: bool
		"""
		assert isinstance(sequence, Seq)
		assert isinstance(silent, bool)

		if len(sequence) <= 0:
			if not silent:
				prefix = "'{}' ".format(key) if key else ""
				self._logger.error("{}Empty sequence".format(prefix))
			return False

		# The upper-cased sequence is checked against the letters of its own alphabet.
		characters_ok = self.validate_characters(
			sequence.upper(), legal_alphabet=sequence.alphabet.letters, key=key, silent=silent)
		return bool(characters_ok)

Python Bio.Seq.Seq() Examples