Python Bio.SeqIO.parse() Examples
The following are 30
code examples of Bio.SeqIO.parse().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
Bio.SeqIO
, or try the search function
.
Example #1
Source File: prep-genome.py From kevlar with MIT License | 7 votes |
def main(args): for record in SeqIO.parse(args.infile, 'fasta'): if args.discard: if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0: continue subseqcounter = 0 printlog(args.debug, "DEBUG: convert to upper case", record.id) sequence = str(record.seq).upper() printlog(args.debug, "DEBUG: split seq by Ns", record.id) subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength] printlog(args.debug, "DEBUG: print subseqs", record.id) for subseq in subseqs: subseqcounter += 1 subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter) subrecord = SeqRecord(Seq(subseq), subid, '', '') SeqIO.write(subrecord, args.outfile, 'fasta')
Example #2
Source File: serialize_fasta.py From tape-neurips2019 with MIT License | 6 votes |
def fasta_to_tfrecord(filename, vocab=PFAM_VOCAB): proteins_to_embed = [] for record in SeqIO.parse(filename, 'fasta'): sequence = record.seq description = record.description int_sequence = [] for aa in sequence: if aa in string.whitespace: raise ValueError("whitespace found in sequence for {}".format(description)) int_sequence.append(vocab.get(aa)) protein_context = to_features(protein_length=len(int_sequence), id=description.encode('UTF-8')) protein_features = to_sequence_features(primary=int_sequence) example = tf.train.SequenceExample(context=protein_context, feature_lists=protein_features) proteins_to_embed.append(example) new_filename = filename.rsplit('.', maxsplit=1)[0] + '.tfrecord' print('Writing tfrecord file {}'.format(new_filename)) with tf.python_io.TFRecordWriter(new_filename) as writer: for example in proteins_to_embed: writer.write(example.SerializeToString())
Example #3
Source File: test_mask.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_e2e_fasta_beginning_end_sites(self, fasta_file, bed_file, out_file, sequences, argparser): from_beginning = 3 from_end = 1 arg_sites = [5, 12] expected_removals = sorted(set(TEST_BED_SEQUENCE + [s - 1 for s in arg_sites])) print(expected_removals) args = argparser("-s %s -o %s --mask %s --mask-from-beginning %s --mask-from-end %s --mask-sites %s" % ( fasta_file, out_file, bed_file, from_beginning, from_end, " ".join(str(s) for s in arg_sites))) mask.run(args) output = SeqIO.parse(out_file, "fasta") for record in output: reference = str(sequences[record.id].seq) masked_seq = str(record.seq) assert masked_seq[:from_beginning] == "N" * from_beginning assert masked_seq[-from_end:] == "N" * from_end for idx, site in enumerate(masked_seq[from_beginning:-from_end], from_beginning): if idx in expected_removals: assert site == "N" else: assert site == reference[idx]
Example #4
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file): """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made.""" aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None) assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said" assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly" # Alignment file should be unchanged aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta")) assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment" for seq in existing_aln: assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly" assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly" # test sequences should be unchanged assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said" seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta")) for seq in test_seqs: assert seq in seq_output, "Some test sequences unexpectedly dropped" assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly" assert seq_output.keys() == test_seqs.keys()
Example #5
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file): """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference is added to the existing alignment and no other changes are made.""" aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file) assert ref_name == ref_seq.id, "Didn't return strain name from refrence file" assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said" assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment" # Alignment file should have the reference added aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta")) assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment" for seq in existing_aln: assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly" assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly" # test sequences should be unchanged assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said" seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta")) for seq in test_seqs: assert seq in seq_output, "Some test sequences unexpectedly dropped" assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly" assert seq_output.keys() == test_seqs.keys()
Example #6
Source File: bio_utils.py From models with MIT License | 6 votes |
def read_fasta(mirna_fasta_file, mrna_fasta_file): handle = open(mirna_fasta_file, "rU") mirna_list = list(SeqIO.parse(handle, "fasta")) handle.close() handle = open(mrna_fasta_file, "rU") mrna_list = list(SeqIO.parse(handle, "fasta")) handle.close() mirna_ids = [] mirna_sequences = [] mrna_ids = [] mrna_sequences = [] for i in range(len(mirna_list)): mirna_ids.append(str(mirna_list[i].id)) mirna_sequences.append(str(mirna_list[i].seq)) for i in range(len(mrna_list)): mrna_ids.append(str(mrna_list[i].id)) mrna_sequences.append(str(mrna_list[i].seq)) return (mirna_ids, mirna_sequences, mrna_ids, mrna_sequences)
Example #7
Source File: genomics.py From deepchem with MIT License | 6 votes |
def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"): """ Loads a sequence file and returns an array of one-hot sequences. Parameters ---------- fname: str Filename of fasta file. file_type: str The type of file encoding to process, e.g. fasta or fastq, this is passed to Biopython.SeqIO.parse. letters: str The set of letters that the sequences consist of, e.g. ATCG. Returns ------- np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1). """ from Bio import SeqIO sequences = SeqIO.parse(fname, file_type) return seq_one_hot_encode(sequences, letters)
Example #8
Source File: util.py From InSilicoSeq with MIT License | 6 votes |
def count_records(fasta_file): """Count the number of records in a fasta file and return a list of recods id Args: fasta_file (string): the path to a fasta file Returns: list: a list of record ids """ logger = logging.getLogger(__name__) record_list = [] for record in SeqIO.parse(fasta_file, "fasta"): record_list.append(record.id) try: assert len(record_list) != 0 except AssertionError as e: logger.error( 'Failed to find records in genome(s) file:%s' % fasta_file) sys.exit(1) else: return record_list
Example #9
Source File: ref_coord.py From NucDiff with Mozilla Public License 2.0 | 6 votes |
def READ_FASTA_ENTRY(file_name): fasta_sequences=[] sequence_name=[] full_names_dict={} sequences_dict={} if os.stat(file_name)[6]!=0: #not empty fh = open(file_name, "r") for record in SeqIO.parse(fh, "fasta"): short_name=str(record.id).split(' ')[0] sequences_dict[short_name]=str(record.seq) return sequences_dict #------------------------------------------------------------------------------------------
Example #10
Source File: general.py From NucDiff with Mozilla Public License 2.0 | 6 votes |
def READ_FASTA_ENTRY(file_name): fasta_sequences=[] sequence_name=[] full_names_dict={} sequences_dict={} if os.stat(file_name)[6]!=0: #not empty fh = open(file_name, "r") for record in SeqIO.parse(fh, "fasta"): short_name=str(record.id).split(' ')[0] sequence_name.append(short_name) full_names_dict[short_name]=str(record.id) fasta_sequences.append(str(record.seq)) sequences_dict[short_name]=str(record.seq) return sequences_dict,fasta_sequences, sequence_name, full_names_dict
Example #11
Source File: __init__.py From PHEnix with GNU General Public License v3.0 | 6 votes |
def _read_reference(self, reference): """Read in the reference from the file into a dictionary. Parameters ---------- reference: str Path to the reference. Returns ------- reference: dict Dictionary with chromosome - sequence mapping. """ if reference is None: self._reference = None else: self._reference = {} with open(reference) as reference_fp: for record in SeqIO.parse(reference_fp, "fasta"): self._reference[record.id] = list(record.seq)
Example #12
Source File: update.py From funannotate with BSD 2-Clause "Simplified" License | 6 votes |
def gbk2interlap(input): ''' function to parse GBK file, construct scaffold/gene interlap dictionary and funannotate standard annotation dictionary ''' inter = defaultdict(InterLap) Genes = {} with open(input, 'r') as filein: for record in SeqIO.parse(filein, 'genbank'): for f in record.features: if f.type == 'gene': locusTag, ID, Parent = lib.getID(f, f.type) start = int(f.location.nofuzzy_start) end = int(f.location.nofuzzy_end) inter[record.id].add((start, end, locusTag)) lib.gb_feature_add2dict(f, record, Genes) return inter, Genes
Example #13
Source File: populate.py From phageParser with MIT License | 6 votes |
def addspacerstodict(gendict, sfile): spacerrecords = SeqIO.parse(sfile, 'fasta') for record in spacerrecords: accessions = record.name.split('|') sequence = str(record.seq) for acc in accessions: acc_elems = acc.split('_') order = acc_elems[-1] acc_id = '_'.join(acc_elems[:-1]) try: if 'Spacers' in gendict[acc_id]: gendict[acc_id]['Spacers'][order] = sequence else: gendict[acc_id]['Spacers'] = {order: sequence} except KeyError: print('Error on accession id: %s' % acc_id) return gendict
Example #14
Source File: update.py From funannotate with BSD 2-Clause "Simplified" License | 6 votes |
def pasa_transcript2gene(input): # modify kallisto ouput to map gene names to each mRNA ID so you know what locus they have come from mRNADict = {} # since mRNA is unique, parse the transcript file which has mRNAID geneID in header with open(input, 'r') as transin: for line in transin: if line.startswith('>'): line = line.rstrip() line = line.replace('>', '') cols = line.split(' ') mRNAID = cols[0] geneID = cols[1] location = cols[-1] if not mRNAID in mRNADict: mRNADict[mRNAID] = (geneID, location) return mRNADict
Example #15
Source File: CRISPRessoCountCORE.py From CRISPResso with GNU Affero General Public License v3.0 | 6 votes |
def filter_se_fastq_by_qual(fastq_filename,output_filename=None,min_bp_quality=20,min_single_bp_quality=0): if fastq_filename.endswith('.gz'): fastq_handle=gzip.open(fastq_filename) else: fastq_handle=open(fastq_filename) if not output_filename: output_filename=fastq_filename.replace('.fastq','').replace('.gz','')+'_filtered.fastq.gz' try: fastq_filtered_outfile=gzip.open(output_filename,'w+') for record in SeqIO.parse(fastq_handle, "fastq"): if np.array(record.letter_annotations["phred_quality"]).mean()>=min_bp_quality \ and np.array(record.letter_annotations["phred_quality"]).min()>=min_single_bp_quality: fastq_filtered_outfile.write(record.format('fastq')) except: raise Exception('Error handling the fastq_filtered_outfile') return output_filename
Example #16
Source File: organism_name_update.py From phageParser with MIT License | 6 votes |
def fetch_names(id_list): organism_names = {} # Doing 100 by 100 to make sure requests to NCBI are not too big for i in range(0, len(id_list), 100): j = i + 100 if j >= len(id_list): j = len(id_list) sys.stderr.write( "Fetching entries from %s to %s from GenBank\n" % (i, j)) sys.stderr.flush() result_handle = Entrez.efetch(db=db, rettype="gb", id=id_list[i:j]) # Populate result per organism name for record in parse(result_handle, 'genbank'): # Using NCBI name, which should match accession number passed organism_names[record.name] = record.annotations['organism'] return organism_names
Example #17
Source File: CRISPRessoCORE.py From CRISPResso with GNU Affero General Public License v3.0 | 6 votes |
def filter_se_fastq_by_qual(fastq_filename,output_filename=None,min_bp_quality=20,min_single_bp_quality=0): if fastq_filename.endswith('.gz'): fastq_handle=gzip.open(fastq_filename) else: fastq_handle=open(fastq_filename) if not output_filename: output_filename=fastq_filename.replace('.fastq','').replace('.gz','')+'_filtered.fastq.gz' try: fastq_filtered_outfile=gzip.open(output_filename,'w+') for record in SeqIO.parse(fastq_handle, "fastq"): if np.array(record.letter_annotations["phred_quality"]).mean()>=min_bp_quality \ and np.array(record.letter_annotations["phred_quality"]).min()>=min_single_bp_quality: fastq_filtered_outfile.write(record.format('fastq')) except: raise Exception('Error handling the fastq_filtered_outfile') return output_filename
Example #18
Source File: EukRep_tests.py From EukRep with MIT License | 6 votes |
def compare_outputs(reference_db, test_db, out_name): reference_ids = {} print('Expect the following sequences to be predicted to be eukaryotic:') for record in SeqIO.parse(reference_db, "fasta"): reference_ids[record.id] = 0 print(record.id) #Check for enexptected output print('The following sequences were predicted to be eukaryotic:') for record in SeqIO.parse(test_db, "fasta"): print(record.id) if record.id not in reference_ids: print('\nUnexpected scaffold %s in %s' % (record.id, out_name)) return True else: reference_ids[record.id] = 1 #Check for missing output for scaffold in reference_ids: if reference_ids[scaffold] is 0: print('\nMissing scaffold %s in %s' % (scaffold, out_name)) return True return False
Example #19
Source File: find_fasta_gaps.py From smrtsv2 with MIT License | 6 votes |
def find_gaps(input_filename): # Load the original FASTA sequence. fasta = SeqIO.parse(input_filename, "fasta") for record in fasta: gap_start = None # Not in a gap for i in _range(len(record)): if record.seq[i].upper() == "N": if gap_start is None: gap_start = i else: if gap_start is not None: print("\t".join(map(str, (record.id, gap_start, i)))) gap_start = None if gap_start is not None: print("\t".join(map(str, (record.id, gap_start, len(record)))))
Example #20
Source File: pipe.py From singlem with GNU General Public License v3.0 | 6 votes |
def _align_proteins_to_hmm(self, protein_sequences, hmm_file): '''hmmalign proteins to hmm, and return an alignment object Parameters ---------- protein_sequences: generator / list of tuple(name,sequence) objects from SeqReader(). ''' cmd = "hmmalign '{}' /dev/stdin".format(hmm_file) output = extern.run(cmd, stdin=''.join([ ">{}\n{}\n".format(s[0], s[1]) for s in protein_sequences])) protein_alignment = [] for record in SeqIO.parse(StringIO(output), 'stockholm'): protein_alignment.append(AlignedProteinSequence(record.name, str(record.seq))) if len(protein_alignment) > 0: logging.debug("Read in %i aligned sequences e.g. %s %s" % ( len(protein_alignment), protein_alignment[0].name, protein_alignment[0].seq)) else: logging.debug("No aligned sequences found for this HMM") return protein_alignment
Example #21
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_prepare_no_alignment_with_ref_file(self, test_file, test_seqs, ref_file, ref_seq, out_file): _, output_fn, ref_name = align.prepare([test_file,], None, out_file, None, ref_file) assert ref_name == ref_seq.id, "Didn't return strain name from refrence file" assert os.path.isfile(output_fn), "Didn't write sequences where it said" output = list(SeqIO.parse(output_fn, "fasta")) # order matters assert output[0].id == ref_seq.id, "Reference sequence is not the first sequence in ouput file!" output_names = {record.name for record in output} assert all(name in output_names for name in test_seqs), "Some test sequences dropped unexpectedly" for record in output[1:]: assert record.seq == test_seqs[record.id].seq, "Some test sequences changed unexpectedly"
Example #22
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_prepare_no_alignment_or_ref(self, test_file, test_seqs, out_file): _, output, _ = align.prepare([test_file,], None, out_file, None, None) assert os.path.isfile(output), "Didn't write sequences where it said" for name, seq in SeqIO.to_dict(SeqIO.parse(output, "fasta")).items(): assert seq.seq == test_seqs[name].seq
Example #23
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_prepare_no_alignment_with_ref_name(self, test_with_ref, test_seqs, ref_seq, out_file): _, output_fn, _ = align.prepare([test_with_ref,], None, out_file, ref_seq.id, None) assert os.path.isfile(output_fn), "Didn't write sequences where it said" output = SeqIO.to_dict(SeqIO.parse(output_fn, "fasta")) assert output[ref_seq.id].seq == ref_seq.seq, "Reference sequence was not added to test sequences" for seq in test_seqs: assert seq in output, "Some test sequences dropped unexpectedly" assert output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
Example #24
Source File: processSam.py From miRge with GNU General Public License v3.0 | 5 votes |
def decorateSam(infTmp1, infTmp2, *arg): # The parameters of this function are as follows: # unmapped_mirna_SRR944031_vs_representative_seq.sam unmapped_mirna_SRR944031.fa mapped_mirna_SRR944031_vs_genome_sorted_clusters_overlap14_trimmed_orig.fa seqDic = {} for record in SeqIO.parse(infTmp2, "fasta"): seqDic.update({record.id:str(record.seq)}) outf = open(infTmp1[:-4]+"_modified.sam","w") if len(arg) == 1: clusterSeqDic = {} for record in SeqIO.parse(arg[0], "fasta"): clusterSeqDic.update({record.id:str(record.seq)}) with open(infTmp1,"r") as inf: for line in inf: if line[0] == '@': outf.write(line) else: miRNAName = line.strip().split('\t')[0] clusterRNAName = line.strip().split('\t')[2] if clusterRNAName in clusterSeqDic.keys(): outf.write('\t'.join([miRNAName, miRNAName.split('_')[-1], seqDic[miRNAName], clusterSeqDic[clusterRNAName]])+'\t'+'\t'.join(line.strip().split('\t')[1:])+'\n') else: outf.write('\t'.join([miRNAName, miRNAName.split('_')[-1], seqDic[miRNAName], '*'])+'\t'+'\t'.join(line.strip().split('\t')[1:])+'\n') else: with open(infTmp1,"r") as inf: for line in inf: if line[0] == '@': outf.write(line) else: miRNAName = line.strip().split('\t')[0] outf.write('\t'.join([miRNAName, miRNAName.split('_')[-1], seqDic[miRNAName]])+'\t'+'\t'.join(line.strip().split('\t')[1:])+'\n') outf.close()
Example #25
Source File: CRISPRessoCORE.py From CRISPResso with GNU Affero General Public License v3.0 | 5 votes |
def get_ids_reads_to_remove(fastq_filename,min_bp_quality=20,min_single_bp_quality=0): ids_to_remove=set() if fastq_filename.endswith('.gz'): fastq_handle=gzip.open(fastq_filename) else: fastq_handle=open(fastq_filename) for record in SeqIO.parse(fastq_handle, "fastq"): if np.array(record.letter_annotations["phred_quality"]).mean()<min_bp_quality \ or np.array(record.letter_annotations["phred_quality"]).min()<min_single_bp_quality: ids_to_remove.add(record.id) return ids_to_remove
Example #26
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def argparser(): """Provide an easy way to test command line arguments""" parser = argparse.ArgumentParser() align.register_arguments(parser) def parse(args): return parser.parse_args(args.split(" ")) return parse
Example #27
Source File: test_mask.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_e2e_fasta_mask_file(self, fasta_file, mask_file, sequences, argparser): args = argparser("-s %s --mask %s" % (fasta_file, mask_file)) mask.run(args) output = SeqIO.parse(fasta_file,"fasta") for record in output: reference = sequences[record.id].seq for idx, site in enumerate(record.seq): if idx in TEST_MASK_SEQUENCE: assert site == "N" else: assert site == reference[idx]
Example #28
Source File: update.py From funannotate with BSD 2-Clause "Simplified" License | 5 votes |
def gff2interlap(input, fasta): ''' function to parse GFF3 file, construct scaffold/gene interlap dictionary and funannotate standard annotation dictionary ''' inter = defaultdict(InterLap) Genes = {} Genes = lib.gff2dict(input, fasta, Genes) for k, v in natsorted(list(Genes.items())): inter[v['contig']].add((v['location'][0], v['location'][1], k)) return inter, Genes
Example #29
Source File: test_mask.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_mask_fasta_from_beginning_and_end_too_long(self, fasta_file, out_file, beginning, end): mask.mask_fasta([], fasta_file, out_file, mask_from_beginning=beginning, mask_from_end=end) output = SeqIO.parse(out_file, "fasta") for record in output: assert record.seq == "N" * len(record.seq)
Example #30
Source File: test_mask.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_e2e_fasta_minimal(self, fasta_file, bed_file, sequences, argparser): args = argparser("-s %s --mask %s" % (fasta_file, bed_file)) mask.run(args) output = SeqIO.parse(fasta_file,"fasta") for record in output: reference = sequences[record.id].seq for idx, site in enumerate(record.seq): if idx in TEST_BED_SEQUENCE: assert site == "N" else: assert site == reference[idx]