Python Bio.SeqRecord.SeqRecord() Examples
The following are 30
code examples of Bio.SeqRecord.SeqRecord().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
Bio.SeqRecord
, or try the search function
.
Example #1
Source File: prep-genome.py From kevlar with MIT License | 7 votes |
def main(args): for record in SeqIO.parse(args.infile, 'fasta'): if args.discard: if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0: continue subseqcounter = 0 printlog(args.debug, "DEBUG: convert to upper case", record.id) sequence = str(record.seq).upper() printlog(args.debug, "DEBUG: split seq by Ns", record.id) subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength] printlog(args.debug, "DEBUG: print subseqs", record.id) for subseq in subseqs: subseqcounter += 1 subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter) subrecord = SeqRecord(Seq(subseq), subid, '', '') SeqIO.write(subrecord, args.outfile, 'fasta')
Example #2
Source File: util.py From deepbgc with MIT License | 7 votes |
def create_faux_record_from_proteins(proteins, id): from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from Bio.SeqFeature import SeqFeature, FeatureLocation record = SeqRecord(seq=Seq(''), id=id) start = 0 end = 0 max_protein_id_len = 45 for protein in proteins: nucl_length = len(protein.seq) * 3 end += nucl_length feature = SeqFeature( location=FeatureLocation(start, end, strand=1), type="CDS", qualifiers={ 'protein_id': [protein.id[:max_protein_id_len]], 'translation': [str(protein.seq)] } ) start += nucl_length record.features.append(feature) return record
Example #3
Source File: test_error_model.py From InSilicoSeq with MIT License | 6 votes |
def test_mut_sequence(): random.seed(42) np.random.seed(42) err_mod = basic.BasicErrorModel() read = SeqRecord( Seq(str('AAAAA' * 25), IUPAC.unambiguous_dna ), id='read_1', description='test read' ) read.letter_annotations["phred_quality"] = [5] * 125 read.seq = err_mod.mut_sequence(read, 'forward') assert str(read.seq[:10]) == 'AAAACAGAAA'
Example #4
Source File: multithreadLargeFasta.py From LoReAn with MIT License | 6 votes |
def single_fasta(ref, wd_split): """ From a fasta file make single files with each sequence """ fasta_file = open(ref, 'r') single_fasta_list = [] count = 0 dict_ref_name = {} ref_rename = ref + ".rename.fasta" with open(ref_rename, "w") as fh: for record in SeqIO.parse(fasta_file, "fasta"): count += 1 new_name = "seq" + str(count) dict_ref_name[new_name] = record.id new_rec = SeqRecord(record.seq, new_name, '', '') fasta_name = wd_split + '/' + new_name + '.fasta' single_fasta_list.append(fasta_name) output_handle = open(fasta_name, "w") SeqIO.write(new_rec, output_handle, "fasta") SeqIO.write(new_rec, fh, "fasta") output_handle.close() return single_fasta_list, dict_ref_name, ref_rename
Example #5
Source File: AMRHitHSP.py From staramr with Apache License 2.0 | 6 votes |
def get_seq_record(self): """ Gets a SeqRecord for this hit. :return: A SeqRecord for this hit. """ return SeqRecord(Seq(self.get_genome_contig_hsp_seq()), id=self.get_amr_gene_id(), description=( 'isolate: {}, contig: {}, contig_start: {}, contig_end: {}, database_gene_start: {},' ' database_gene_end: {}, hsp/length: {}/{}, pid: {:0.2f}%, plength: {:0.2f}%').format( self.get_genome_id(), self.get_genome_contig_id(), self.get_genome_contig_start(), self.get_genome_contig_end(), self.get_amr_gene_start(), self.get_amr_gene_end(), self.get_hsp_length(), self.get_amr_gene_length(), self.get_pid(), self.get_plength()))
Example #6
Source File: proteinAlign.py From LoReAn with MIT License | 6 votes |
def transeq(data): dummy = int(data[1]) record = data[0] if dummy == 0: prot = (translate_frameshifted(record.seq[0:])) prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand0plus")) if dummy == 1: prot = (translate_frameshifted(record.seq[1:])) # second frame prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand1plus")) if dummy == 2: prot = (translate_frameshifted(record.seq[2:])) # third frame prot_rec =(SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand2plus")) if dummy == 3: prot = (translate_frameshifted(reverse_complement(record.seq))) # negative first frame prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand0minus")) if dummy == 4: prot = (translate_frameshifted(reverse_complement(record.seq[:len(record.seq) - 1]))) # negative second frame prot_rec =(SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand1minus")) if dummy == 5: prot = (translate_frameshifted(reverse_complement(record.seq[:len(record.seq) - 2]))) # negative third frame prot_rec = (SeqRecord(Seq(prot, IUPAC.protein), id=record.id + "_strand2minus")) return(prot_rec)
Example #7
Source File: generate_output.py From panaroo with MIT License | 6 votes |
def generate_pan_genome_reference(G, output_dir, split_paralogs=False): # need to treat paralogs differently? centroids = set() records = [] for node in G.nodes(): if not split_paralogs and G.nodes[node]['centroid'][0] in centroids: continue records.append( SeqRecord(Seq(max(G.nodes[node]['dna'], key=lambda x: len(x)), generic_dna), id=G.nodes[node]['name'], description="")) for centroid in G.nodes[node]['centroid']: centroids.add(centroid) with open(output_dir + "pan_genome_reference.fa", 'w') as outfile: SeqIO.write(records, outfile, "fasta") return
Example #8
Source File: prokka.py From panaroo with MIT License | 6 votes |
def translate_sequences(sequence_dic): protein_list = [] for strain_id in sequence_dic: sequence_record = sequence_dic[strain_id] if (len(sequence_record.seq) % 3) != 0: raise ValueError( "Coding sequence not divisible by 3, is it complete?!") protien_sequence = translate(str(sequence_record.seq)) if protien_sequence[-1] == "*": protien_sequence = protien_sequence[0:-1] if "*" in protien_sequence: print(sequence_record) print(protien_sequence) # raise ValueError("Premature stop codon in a gene!") protein_record = SeqRecord(Seq(protien_sequence), id=strain_id, description=strain_id) protein_list.append(protein_record) return protein_list
Example #9
Source File: intronerate.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def remove_exons(gff_filename,supercontig_filename,mode="all"): '''Given a supercontig and corresponding annotation, remove the exon sequences. In "intron" mode, only return sequences specifically annotated as introns''' exon_starts = [] exon_ends = [] gff = open(gff_filename).readlines() for line in gff: line = line.rstrip().split("\t") if len(line) > 2: if line[2] == "exon": exon_starts.append(int(line[3])) exon_ends.append(int(line[4])) supercontig = SeqIO.read(supercontig_filename,'fasta') exonless_contig = SeqRecord(Seq(''),id=supercontig.id) start = 0 for exon in range(len(exon_starts)): exonless_contig += supercontig[start:exon_starts[exon]-1] start = exon_ends[exon] exonless_contig += supercontig[start:] exonless_contig.description = '' return exonless_contig
Example #10
Source File: targeted_mut_fasta_corrected.py From DiscoSnp with GNU Affero General Public License v3.0 | 6 votes |
def targeted_mut(fasta_file, index): genome = SeqIO.parse(fasta_file, "fasta") fasta_file_mut = ("{}_mut".format(fasta_file)) nb_mut = 0 sequences_mut = [] for record in genome: chr = record.id mutable_seq = record.seq.tomutable() #print(mutable_seq[0]) if chr in index: for pos in index[chr]: if mutable_seq[pos - 1] == index[chr][pos]["ref"]: mutable_seq[pos - 1] = index[chr][pos]["alt"] nb_mut += 1 record_mut=SeqRecord(mutable_seq, record.id,'','') sequences_mut.append(record_mut) SeqIO.write(sequences_mut, fasta_file_mut, "fasta") print("{} bases have been mutated".format(nb_mut))
Example #11
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_prune_seqs_matching_alignment(self): sequence = { "seq1": SeqRecord(Seq("GTAC"), name="seq1"), "seq2": SeqRecord(Seq("CGTT"), name="seq2"), "seq3": SeqRecord(Seq("TAGC"), name="seq3"), } alignment = MultipleSeqAlignment( [ SeqRecord(Seq("GTAC"), name="seq1"), SeqRecord(Seq("TAGC"), name="seq3"), ] ) result = align.prune_seqs_matching_alignment(sequence.values(), alignment) assert [r.name for r in result] == ["seq2"] for r in result: assert r.seq == sequence[r.name].seq
Example #12
Source File: import_beast.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def fake_alignment(T): """ Fake alignment to appease treetime when only using it for naming nodes... This is lifted from refine.py and ideally could be imported Parameters ------- T : <class 'Bio.Phylo.BaseTree.Tree'> Returns ------- <class 'Bio.Align.MultipleSeqAlignment'> """ from Bio import SeqRecord, Seq, Align seqs = [] for n in T.get_terminals(): seqs.append(SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description='')) aln = Align.MultipleSeqAlignment(seqs) return aln
Example #13
Source File: seqprop.py From ssbio with MIT License | 6 votes |
def seq(self, s): if not s: self._seq = None elif self.sequence_file: raise ValueError('{}: unable to set sequence, sequence file is associated with this object'.format(self.id)) elif type(s) == str or type(s) == Seq: self._seq = ssbio.protein.sequence.utils.cast_to_seq(obj=s) # If a SeqRecord, copy all attributes elif type(s) == SeqRecord: self._seq = s.seq if self.name == '<unknown name>': self.name = s.name if self.description == '<unknown description>': self.description = s.description if not self.dbxrefs: self.dbxrefs = s.dbxrefs if not self.features: self.features = s.features if not self.annotations: self.annotations = s.annotations if not self.letter_annotations: self.letter_annotations = s.letter_annotations
Example #14
Source File: trim_lowercase.py From pacbio_variant_caller with MIT License | 6 votes |
def trim_lowercase(input_filename, output_filename, keep_completely_lowercased_sequences): """ Trim all lowercase letters from the given input FASTA file, remove records that were completely lowercase, and write the trimmed FASTA to the given output file. """ trimmed_records = [] for record in SeqIO.parse(input_filename, "fasta"): # Remove all lowercase letters in the sequence for this record. trimmed_sequence = re.sub("(^[a-z]+|[a-z]+$)", "", str(record.seq)) # If any uppercase sequence remains, create a new record for it. if len(trimmed_sequence) > 0: trimmed_records.append(SeqRecord(Seq(trimmed_sequence), id=record.id, description="")) elif keep_completely_lowercased_sequences: trimmed_records.append(SeqRecord(Seq(str(record.seq)), id=record.id, description="")) # Write out all trimmed records to the given output file. SeqIO.write(trimmed_records, output_filename, "fasta")
Example #15
Source File: utils.py From ssbio with MIT License | 6 votes |
def cast_to_seq(obj, alphabet=IUPAC.extended_protein): """Return a Seq representation of a string or SeqRecord object. Args: obj (str, Seq, SeqRecord): Sequence string or Biopython SeqRecord object alphabet: See Biopython SeqRecord docs Returns: Seq: Seq representation of the sequence """ if isinstance(obj, Seq): return obj if isinstance(obj, SeqRecord): return obj.seq if isinstance(obj, str): obj = obj.upper() return Seq(obj, alphabet) else: raise ValueError('Must provide a string, Seq, or SeqRecord object.')
Example #16
Source File: utils.py From ssbio with MIT License | 6 votes |
def cast_to_str(obj): """Return a string representation of a Seq or SeqRecord. Args: obj (str, Seq, SeqRecord): Biopython Seq or SeqRecord Returns: str: String representation of the sequence """ if isinstance(obj, str): return obj if isinstance(obj, Seq): return str(obj) if isinstance(obj, SeqRecord): return str(obj.seq) else: raise ValueError('Must provide a string, Seq, or SeqRecord object.')
Example #17
Source File: promoters.py From antismash with GNU Affero General Public License v3.0 | 6 votes |
def write_promoters_to_file(output_dir: str, prefix: str, promoters: List[Promoter]) -> None: """ Write the promoters to the given file using the given prefix. The prefix helps in having separate files for multi-record inputs with a singular output directory. """ # positions file pos_handle = open(os.path.join(output_dir, prefix + "_promoter_positions.csv"), "w") pos_handle.write("\t".join(["#", "promoter", "start", "end", "length"]) + "\n") # sequences file seq_handle = open(os.path.join(output_dir, prefix + "_promoter_sequences.fasta"), "w") for i, promoter in enumerate(promoters): # write promoter positions to file pos_handle.write("\t".join(map(str, [i + 1, promoter.get_id(), promoter.start + 1, promoter.end + 1, len(promoter)])) + "\n") # write promoter sequences to file SeqIO.write(SeqRecord(promoter.seq, id=promoter.get_id(), description="length={}bp".format(len(promoter))), seq_handle, "fasta")
Example #18
Source File: intronerate.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def make_intron_supercontig(contig_info,gene,prefix,add_N = False): cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta')) intron_supercontig = SeqRecord(Seq('')) for i in contig_info: if i[5] == "(+)": intron_supercontig += cap3contigs[i[0]] elif i[5] == "(-)": intron_supercontig += cap3contigs[i[0]].reverse_complement() else: sys.stderr.write("Strandedness not found!") sys.exit(1) if add_N and i != contig_info[-1]: intron_supercontig += "NNNNNNNNNN" intron_supercontig.id = '{}-{}'.format(prefix,gene) intron_supercontig.description = '' SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta')
Example #19
Source File: seq.py From wub with Mozilla Public License 2.0 | 6 votes |
def count_records(input_object, format='fasta'): """Count SeqRecord objects from a file in the specified format. :param input_object: A file object or a file name. :param format: Input format (fasta by default). :returns: Number of records in input file. :rtype: int """ handle = input_object if type(handle) == str: handle = open(handle, "rU") counter = 0 for _ in SeqIO.parse(handle, format): counter += 1 return counter
Example #20
Source File: test_circular_conversion.py From antismash with GNU Affero General Public License v3.0 | 5 votes |
def setUp(self): self.seqrec = SeqRecord(UnknownSeq(21)) loc = CompoundLocation([FeatureLocation(12, 15, strand=1), FeatureLocation(18, 21, strand=1), FeatureLocation(0, 3, strand=1), FeatureLocation(6, 9, strand=1)], operator="join") self.seqcds = SeqFeature(loc, type="CDS") self.seqgene = SeqFeature(loc, type="gene") self.seqrec.annotations["topology"] = "circular"
Example #21
Source File: record.py From antismash with GNU Affero General Public License v3.0 | 5 votes |
def to_biopython(self) -> SeqRecord: """Returns a Bio.SeqRecord instance of the record""" features = self.get_all_features() bio_features = [] # type: List[SeqFeature] for feature in sorted(features): bio_features.extend(feature.to_biopython()) return SeqRecord(self.seq, id=self._record.id, name=self._record.name, description=self._record.description, dbxrefs=self.dbxrefs, features=bio_features, annotations=self._record.annotations, letter_annotations=self._record.letter_annotations)
Example #22
Source File: record.py From antismash with GNU Affero General Public License v3.0 | 5 votes |
def __getattr__(self, attr: str) -> Any: # passthroughs to the original SeqRecord if attr in ["id", "seq", "description", "name", "annotations", "dbxrefs"]: return getattr(self._record, attr) if attr in Record.__slots__: return self.__getattribute__(attr) raise AttributeError("Record has no attribute '%s'" % attr)
Example #23
Source File: seq.py From wub with Mozilla Public License 2.0 | 5 votes |
def read_seq_records(input_object, format='fasta'): """Read SeqRecord objects from a file in the specified format. :param input_object: A file object or a file name. :param format: Input format (fasta by default). :returns: A dictionary with the parsed SeqRecord objects. :rtype: generator """ handle = input_object if type(handle) == str: handle = open(handle, "rU") return SeqIO.parse(handle, format)
Example #24
Source File: test_circular_conversion.py From antismash with GNU Affero General Public License v3.0 | 5 votes |
def setUp(self): self.seqrec = SeqRecord(UnknownSeq(21)) loc = CompoundLocation([FeatureLocation(12, 21, strand=1), FeatureLocation(0, 3, strand=1), FeatureLocation(6, 9, strand=1)], operator="join") self.seqcds = SeqFeature(loc, type="CDS") self.seqgene = SeqFeature(loc, type="gene") self.seqrec.annotations["topology"] = "circular"
Example #25
Source File: seq.py From wub with Mozilla Public License 2.0 | 5 votes |
def read_seq_records_dict(input_object, format='fasta'): """Read SeqRecord objects to a dictionary from a file in the specified format. :param input_object: A file object or a file name. :param format: Input format (fasta by default). :returns: An iterator of SeqRecord objects. :rtype: dict """ handle = input_object if type(handle) == str: handle = open(handle, "rU") return SeqIO.to_dict(SeqIO.parse(handle, format))
Example #26
Source File: seq.py From wub with Mozilla Public License 2.0 | 5 votes |
def record_lengths(input_iter): """Return lengths of SeqRecord obejcts in the input iterator. :param input_iter: An iterator of SeqRecord objects. :returns: An ordered dictionary with the lengths of the SeqRecord objects. :rtype: OrderedDict """ lengths = OrderedDict((record.id, len(record)) for record in input_iter) return lengths
Example #27
Source File: transcriptAssembly.py From LoReAn with MIT License | 5 votes |
def bamtofastq(bam, verbose): fasta = bam + ".fasta" in_file = open(bam, 'r') in_sam = Reader(in_file) with open(fasta, "w") as output_handle: for line in in_sam: if line.mapped: record = SeqRecord(Seq(str(line.seq)),name = str(line.qname)) SeqIO.write(record, output_handle, "fasta") return fasta
Example #28
Source File: ancestralReconstruction.py From popgen-stats with MIT License | 5 votes |
def concatenate_genes(coreGenes): """ Using Biopython, concatenate ancestral genes into one file""" print "Concatenating ancestral reconstructions" def parse_lazarus_output(coreGene): ancRecFile = open("%s/ancestral/ancestor.out.txt" % coreGene, "r") for i,line in enumerate(ancRecFile): if i == 13: record = SeqRecord(Seq(line.strip(), IUPAC.ambiguous_dna), id=coreGene, description="") return record pool = ThreadPool(args.threads) recs = pool.map(parse_lazarus_output, coreGenes) SeqIO.write(recs, "ancestralGenes.fa", "fasta")
Example #29
Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0 | 5 votes |
def get_unique_names(gene_dict): '''Given the dictionary of SeqRecord dictionaries, return a list of the unique sequence headers''' all_names = [] for gene in gene_dict: all_names += list(gene_dict[gene].keys()) return set(all_names)
Example #30
Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0 | 5 votes |
def insert_sequences(gene_dict,unique_names): '''Given the dictionary of dictionaries, insert blank sequences if any are missing for a gene''' inserted_sequences = 0 for gene in gene_dict: for name in unique_names: if name not in gene_dict[gene]: gene_length = len(next(iter(gene_dict[gene].values()))) gene_dict[gene][name] = SeqRecord(Seq("-"*gene_length),id=name) inserted_sequences += 1 sys.stderr.write("{} Empty sequences inserted across all genes.\n".format(inserted_sequences)) return gene_dict