Python Bio.SeqIO.to_dict() Examples
The following are 30
code examples of Bio.SeqIO.to_dict().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
Bio.SeqIO
, or try the search function
.
Example #1
Source File: paralog_investigator.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def extract_paralogs(gene,prefix): putative_paralog_ids = list(set([x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))])) try: chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip() except IOError: return 0 exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta')) if not os.path.isdir(os.path.join(gene,prefix,'paralogs')): os.mkdir(os.path.join(gene,prefix,"paralogs")) seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids] for seq in range(len(seqs_to_write)): if seqs_to_write[seq].id == chosen_paralog: seqs_to_write[seq].id = "{}.{}".format(prefix,"main") else: seqs_to_write[seq].id = "{}.{}".format(prefix,seq) SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta') return len(seqs_to_write)
Example #2
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 6 votes |
def testResfinderBetaLactamDelMiddleSuccess(self): file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-del-middle.fsa") files = [file] self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0) resfinder_results = self.amr_detection.get_resfinder_results() self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result') result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.33, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths') hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-del-middle.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict( SeqIO.parse(path.join(self.test_data_dir, 'resfinder_beta-lactam-blaIMP-42-del-middle.fsa'), 'fasta')) logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq) logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq) self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq, "records don't match")
Example #3
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 6 votes |
def testResfinderBetaLactamInsStartSuccess(self): file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-ins-start.fsa") files = [file] self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0) resfinder_results = self.amr_detection.get_resfinder_results() self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result') result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.73, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths') hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-ins-start.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict( SeqIO.parse(path.join(self.test_data_dir, 'beta-lactam-blaIMP-42-mut-2.fsa'), 'fasta')) logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq) logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq) self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq, "records don't match")
Example #4
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 6 votes |
def testResfinderBetaLactamDelStartSuccess(self): file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-del-start.fsa") files = [file] self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0) resfinder_results = self.amr_detection.get_resfinder_results() self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result') result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertAlmostEqual(result['%Identity'].iloc[0], 100.00, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 91.90, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '681/741', msg='Wrong lengths') hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-del-start.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq, "records don't match")
Example #5
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 6 votes |
def testResfinderBetaLactamInsMiddleSuccess(self): file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-ins-middle.fsa") files = [file] self.amr_detection.run_amr_detection(files, 97, 99, 99, 90,0,0,0,0,0) resfinder_results = self.amr_detection.get_resfinder_results() self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result') result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertAlmostEqual(result['%Identity'].iloc[0], 98.14, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 101.62, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '753/741', msg='Wrong lengths') hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-ins-middle.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict( SeqIO.parse(path.join(self.test_data_dir, 'beta-lactam-blaIMP-42-ins-middle.fsa'), 'fasta')) logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq) logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq) self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq.upper(), records['blaIMP-42_1_AB753456'].seq, "records don't match")
Example #6
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 6 votes |
def testResfinderBetaLactam2MutationsSuccess(self): file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa") files = [file] self.amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0) resfinder_results = self.amr_detection.get_resfinder_results() self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result') result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.73, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths') self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone, meropenem', 'Wrong phenotype') hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq, "records don't match")
Example #7
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 6 votes |
def testResfinderExcludeNonMatches(self): amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, self.blast_handler, self.pointfinder_drug_table, self.pointfinder_database, include_negative_results=False, output_dir=self.outdir.name) file_beta_lactam = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa") file_non_match = path.join(self.test_data_dir, "non-match.fsa") files = [file_beta_lactam, file_non_match] amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0) summary_results = amr_detection.get_summary_results() self.assertEqual(len(summary_results.index), 1, 'Wrong number of rows in result') hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file_beta_lactam, 'fasta')) self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq, "records don't match")
Example #8
Source File: intronerate.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def make_intron_supercontig(contig_info,gene,prefix,add_N = False): cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta')) intron_supercontig = SeqRecord(Seq('')) for i in contig_info: if i[5] == "(+)": intron_supercontig += cap3contigs[i[0]] elif i[5] == "(-)": intron_supercontig += cap3contigs[i[0]].reverse_complement() else: sys.stderr.write("Strandedness not found!") sys.exit(1) if add_N and i != contig_info[-1]: intron_supercontig += "NNNNNNNNNN" intron_supercontig.id = '{}-{}'.format(prefix,gene) intron_supercontig.description = '' SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta')
Example #9
Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def initial_exonerate(proteinfilename, assemblyfilename,prefix): """Conduct exonerate search, returns a dictionary of results. Using the ryo option in exonerate, the header should contain all the useful information.""" logger = logging.getLogger("pipeline") outputfilename = "%s/exonerate_results.fasta" %prefix exonerate_ryo = '">%ti,%qi,%qab,%qae,%pi,(%tS),%tab,%tae\\n%tcs\\n"' exonerate_command = "exonerate -m protein2genome --showalignment no --showvulgar no -V 0 --ryo %s %s %s >%s" % (exonerate_ryo,proteinfilename,assemblyfilename,outputfilename) logger.debug(exonerate_command) #print exonerate_ryo #proc = subprocess.Popen(['exonerate','-m','protein2genome','--showalignment','no','-V','0','--showvulgar','no','--ryo',exonerate_ryo,proteinfilename,assemblyfilename]) proc = subprocess.call(exonerate_command,shell=True) protHitsCount = 0 #proc.wait() records = SeqIO.to_dict(SeqIO.parse(outputfilename,'fasta')) #proc.stdout.close() return records
Example #10
Source File: test_AMRDetectionPlasmid.py From staramr with Apache License 2.0 | 6 votes |
def testPlasmidfinderNameSuccess(self): file = path.join(self.test_data_dir, "test-plasmids-seq.fsa") files = [file] self.amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0) plasmidfinder_results = self.amr_detection.get_plasmidfinder_results() self.assertEqual(len(plasmidfinder_results.index), 1, 'Wrong number of rows in result') result = plasmidfinder_results[plasmidfinder_results['Plasmid'] == "IncW"] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertAlmostEqual(result['%Identity'].iloc[0], 100.00, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['Accession'].iloc[0], 'EF633507', msg='Wrong accession') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '243/243', msg='Wrong lengths') hit_file = path.join(self.outdir.name, 'plasmidfinder_test-plasmids-seq.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) self.assertEqual(expected_records['IncW_1__EF633507'].seq, records['IncW_1__EF633507'].seq, "records don't match")
Example #11
Source File: cov.py From methgo with MIT License | 6 votes |
def get_ctxnum(reffile): """ Get the number of CG/CHG/CHH from a reference genome FASTA file """ with open(reffile) as infile: fasta = SeqIO.to_dict(SeqIO.parse(infile, 'fasta')) for chr in fasta: fasta[chr] = str(fasta[chr].seq).upper() num_cg = 0 num_chg = 0 num_chh = 0 for chr in fasta: num_cg += len([match.start() for match in re.finditer(r'(?=(CG))', fasta[chr])]) num_cg += len([match.start()-1 for match in re.finditer(r'(?<=(CG))', fasta[chr])]) num_chg += len([match.start() for match in re.finditer(r'(?=(C[ACT]G))', fasta[chr])]) num_chg += len([match.start()-1 for match in re.finditer(r'(?<=(C[AGT]G))', fasta[chr])]) num_chh += len([match.start() for match in re.finditer(r'(?=(C[ACT][ACT]))', fasta[chr])]) num_chh += len([match.start()-1 for match in re.finditer(r'(?<=([AGT][AGT]G))', fasta[chr])]) return num_cg, num_chg, num_chh
Example #12
Source File: PSVLocations.py From SDA with MIT License | 6 votes |
def checkPsvs(df): print("Checking if PSVs appear in assemblies", file=sys.stderr) from Bio import SeqIO recs = SeqIO.to_dict(SeqIO.parse(args.check, "fasta")) groups = df.groupby(by ="ccid") for name, group in groups: # skip if we cannot find fasta entry if(name not in recs): continue rec = recs[name] for idx, row in group.iterrows(): pos = row["qpos"] alt = row["truealt"] recalt = rec.seq[pos].upper() #print(alt, recalt, pos, name, file=sys.stderr) assert alt == recalt, "PSV called inccorectly at {}:{}, {} instead of {}".format(name,pos, alt, recalt)
Example #13
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file): """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference is added to the existing alignment and no other changes are made.""" aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file) assert ref_name == ref_seq.id, "Didn't return strain name from refrence file" assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said" assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment" # Alignment file should have the reference added aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta")) assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment" for seq in existing_aln: assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly" assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly" # test sequences should be unchanged assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said" seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta")) for seq in test_seqs: assert seq in seq_output, "Some test sequences unexpectedly dropped" assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly" assert seq_output.keys() == test_seqs.keys()
Example #14
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file): """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made.""" aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None) assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said" assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly" # Alignment file should be unchanged aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta")) assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment" for seq in existing_aln: assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly" assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly" # test sequences should be unchanged assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said" seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta")) for seq in test_seqs: assert seq in seq_output, "Some test sequences unexpectedly dropped" assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly" assert seq_output.keys() == test_seqs.keys()
Example #15
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 5 votes |
def testPointfinderSalmonella_16S_rrSD_C1065T_Success(self): pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella') blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, self.blast_out.name) amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, self.pointfinder_drug_table, pointfinder_database, output_dir=self.outdir.name) file = path.join(self.test_data_dir, "16S_rrsD-1T1065.fsa") files = [file] amr_detection.run_amr_detection(files, 99, 99, 90, 90,1000,1000000,1000,300,1000) pointfinder_results = amr_detection.get_pointfinder_results() self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') #Make sure that the quality module passes when we pass 1000,1000000,1000,300,1000 for the quality checking parameters summary_results = amr_detection.get_summary_results() self.assertEqual('Passed', summary_results['Quality Module'].iloc[0], 'Quality result not equal') result = pointfinder_results[pointfinder_results['Gene'] == '16S_rrsD (C1065T)'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertEqual(result.index[0], '16S_rrsD-1T1065', msg='Wrong file') self.assertEqual(result['Type'].iloc[0], 'nucleotide', msg='Wrong type') self.assertEqual(result['Position'].iloc[0], 1065, msg='Wrong codon position') self.assertEqual(result['Mutation'].iloc[0], 'C -> T', msg='Wrong mutation') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.94, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '1544/1544', msg='Wrong lengths') self.assertEqual(result['Predicted Phenotype'].iloc[0], 'spectinomycin', 'Wrong phenotype') hit_file = path.join(self.outdir.name, 'pointfinder_16S_rrsD-1T1065.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) self.assertEqual(expected_records['16S_rrsD'].seq.upper(), records['16S_rrsD'].seq.upper(), "records don't match")
Example #16
Source File: dbotu.py From amplicon_sequencing_pipeline with MIT License | 5 votes |
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None): ''' seq_table: pandas.DataFrame Samples on the columns; sequences on the rows records: index of Bio.Seq Indexed, unaligned input sequences. This could come from BioPython's SeqIO.to_dict or SeqIO.index. max_dist: float genetic distance cutoff above which a sequence will not be merged into an OTU min_fold: float Multiply the sequence's abundance by this fold to get the minimum abundance of an OTU for merging threshold_pval: float P-value below which a sequence will not be merged into an OTU log: filehandle Log file reporting the abundance, genetic, and distribution checks. ''' self.seq_table = seq_table self.records = records self.max_dist = max_dist self.min_fold = min_fold self.threshold_pval = threshold_pval self.log = log # get a list of the names of the sequences in order of their (decreasing) abundance self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False) # check that all sequence IDs in the table are in the fasta missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records] if len(missing_ids) > 0: raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids)) # initialize OTU information self.membership = {} self.otus = []
Example #17
Source File: met.py From methgo with MIT License | 5 votes |
def const_ctxstr(reffile): """ Construct methylation context strings from a reference genome FASTA file """ with open(reffile) as infile: fasta = SeqIO.to_dict(SeqIO.parse(infile, 'fasta')) for chr in fasta: fasta[chr] = str(fasta[chr].seq).upper() ctxstr = {} for chr in fasta: ctxstr[chr] = ['-']*len(fasta[chr]) cg = [match.start() for match in re.finditer(r'(?=(CG))', fasta[chr])] for pos in cg: ctxstr[chr][pos] = 'X' chg = [match.start() for match in re.finditer(r'(?=(C[ACT]G))', fasta[chr])] for pos in chg: ctxstr[chr][pos] = 'Y' chh = [match.start() for match in re.finditer(r'(?=(C[ACT][ACT]))', fasta[chr])] for pos in chh: ctxstr[chr][pos] = 'Z' rcg = [match.start()-1 for match in re.finditer(r'(?<=(CG))', fasta[chr])] for pos in rcg: ctxstr[chr][pos] = 'x' rchg = [match.start()-1 for match in re.finditer(r'(?<=(C[AGT]G))', fasta[chr])] for pos in rchg: ctxstr[chr][pos] = 'y' rchh = [match.start()-1 for match in re.finditer(r'(?<=([AGT][AGT]G))', fasta[chr])] for pos in rchh: ctxstr[chr][pos] = 'z' for chr in ctxstr: ctxstr[chr] = ''.join(ctxstr[chr]) return ctxstr
Example #18
Source File: splice_cycle.py From Cogent with BSD 3-Clause Clear License | 5 votes |
def precycle_kmer_adjustment(kmer_size): """ Even with detect_and_replace_cycle (which only looks at repeat kmers within a seq), there can be cycles in the graph. So instead, try to detect k-mer re-usage and do simple tests using networkx.simple_cycles to see if we can eliminate them as much as possible. """ seqdict = SeqIO.to_dict(SeqIO.parse(open('in.trimmed.fa'), 'fasta')) kmer_usage = defaultdict(lambda: defaultdict(lambda: [])) # kmer -> (seqid, i) for r in SeqIO.parse(open('in.trimmed.fa'),'fasta'): for i in range(len(r.seq)-kmer_size): kmer_usage[str(r.seq)[i:i+kmer_size]][r.id].append(i) max_kmer_needed = [] for kmer,v in kmer_usage.items(): for seqid, indices in v.items(): if len(indices) > 1: # kmer appeared more than once in the same sequence max_kmer_needed.append(max_common_sequence_length(seqdict[seqid], indices, kmer_size)) if len(max_kmer_needed) == 0: log.info("K-mer in-seq cycle detection: None found at k={0}".format(kmer_size)) return kmer_size else: min_adjusted_k = min(max_kmer_needed) max_adjusted_k = max(max_kmer_needed) mean_adjusted_k = sum(max_kmer_needed)*1./len(max_kmer_needed) log.info("K-mer in-seq cycle detection: {0} in-seq cycle found with max common \ sequence analysis showing min: {1}, max: {2}, mean: {3}".format(\ len(max_kmer_needed), min_adjusted_k, max_adjusted_k, mean_adjusted_k)) return min(100, max_adjusted_k)
Example #19
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 5 votes |
def testPointfinderSalmonellaA67PSuccess(self): pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella') blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, self.blast_out.name) amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, self.pointfinder_drug_table, pointfinder_database, output_dir=self.outdir.name) file = path.join(self.test_data_dir, "gyrA-A67P.fsa") files = [file] amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0) pointfinder_results = amr_detection.get_pointfinder_results() self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertEqual(result.index[0], 'gyrA-A67P', msg='Wrong file') self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type') self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position') self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths') self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R, nalidixic acid', 'Wrong phenotype') hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")
Example #20
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 5 votes |
def testPointfinderSalmonellaA67PSuccessNoPhenotype(self): pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella') blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, self.blast_out.name) amr_detection = AMRDetection(self.resfinder_database, blast_handler, pointfinder_database, output_dir=self.outdir.name) file = path.join(self.test_data_dir, "gyrA-A67P.fsa") files = [file] amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0) pointfinder_results = amr_detection.get_pointfinder_results() self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertEqual(result.index[0], 'gyrA-A67P', msg='Wrong file') self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type') self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position') self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths') self.assertFalse('Predicted Phenotype' in result.columns, 'Should not exist Predicted Phenotype column') hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")
Example #21
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 5 votes |
def testPointfinderSalmonellaA67PReverseComplementSuccess(self): pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella') blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, self.blast_out.name) amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, self.pointfinder_drug_table, pointfinder_database, output_dir=self.outdir.name) file = path.join(self.test_data_dir, "gyrA-A67P-rc.fsa") files = [file] amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0) pointfinder_results = amr_detection.get_pointfinder_results() self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') #Make sure that the quality module fails when we pass 0,0,0,0,0 for the quality checking parameters summary_results = amr_detection.get_summary_results() self.assertEqual('Failed', summary_results['Quality Module'].iloc[0], 'Quality result not equal') result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertEqual(result.index[0], 'gyrA-A67P-rc', msg='Wrong file') self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type') self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position') self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths') self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R, nalidixic acid', 'Wrong phenotype') hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P-rc.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(path.join(self.test_data_dir, "gyrA-A67P.fsa"), 'fasta')) self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")
Example #22
Source File: fasta_spliter.py From deepcontact with MIT License | 5 votes |
def save_seqs(self): if not os.path.exists(self.config['path']['id_list']): self.save_id_list() util.make_dir_if_not_exist(self.config['path']['sequence_dir']) sequences = SeqIO.to_dict(self.all_sequences()) with open(self.config['path']['id_list']) as f_in: for line in f_in: seq_id = line.strip().split()[0] seq_record = sequences[seq_id] with open(os.path.join(self.config['path']['sequence_dir'], seq_record.id + '.fasta'), 'w') as f_out: SeqIO.write(seq_record, f_out, 'fasta')
Example #23
Source File: LocalAssembly3.py From SDA with MIT License | 5 votes |
def addSeqs(self): fasta = self.mydir + self.asm + ".assemblies.fasta" Seqs = SeqIO.to_dict(SeqIO.parse(fasta, "fasta")) toadd = [] for fastaid in self.all["query_name"]: if(fastaid in Seqs): toadd.append(Seqs[fastaid].seq) else: toadd.append("NA") self.all["seq"] = toadd
Example #24
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 5 votes |
def testResfinderIncludeNonMatches(self): amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, self.blast_handler, self.pointfinder_drug_table, self.pointfinder_database, include_negative_results=True, output_dir=self.outdir.name) file_beta_lactam = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa") file_non_match = path.join(self.test_data_dir, "non-match.fsa") files = [file_beta_lactam, file_non_match] amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0) summary_results = amr_detection.get_summary_results() self.assertEqual(len(summary_results.index), 2, 'Wrong number of rows in result') result_beta_lactam = summary_results.loc['beta-lactam-blaIMP-42-mut-2'] self.assertTrue(isinstance(result_beta_lactam, pd.Series), 'Wrong type of results returned') self.assertEqual(result_beta_lactam['Genotype'], 'blaIMP-42', 'Wrong genotype') result_sensitive = summary_results.loc['non-match'] self.assertTrue(isinstance(result_sensitive, pd.Series), 'Wrong type of results returned') self.assertEqual(result_sensitive['Genotype'], 'None', 'Wrong genotype') hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file_beta_lactam, 'fasta')) self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq, "records don't match")
Example #25
Source File: split-clusters.py From zika-pipeline with MIT License | 5 votes |
def main(): records = SeqIO.to_dict(SeqIO.parse(open(sys.argv[1]), 'fasta')) reader = csv.DictReader(sys.stdin, dialect="excel-tab") clusters = list(reader) groups = set([c['group'] for c in clusters]) for group in groups: print "cluster%s\t%s-cluster%s" % (group, sys.argv[1], group) with open('%s-cluster%s' %(sys.argv[1], group), 'w') as fout: SeqIO.write([records[i['node']] for i in clusters if i['group'] == group], fout, 'fasta')
Example #26
Source File: test_AMRDetection.py From staramr with Apache License 2.0 | 5 votes |
def testPointfinderCampylobacterA70TSuccess(self): pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'campylobacter') blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2, self.blast_out.name) amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler, self.pointfinder_drug_table, pointfinder_database, output_dir=self.outdir.name) file = path.join(self.test_data_dir, "gyrA-A70T.fsa") files = [file] amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0) pointfinder_results = amr_detection.get_pointfinder_results() self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result') result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A70T)'] self.assertEqual(len(result.index), 1, 'Wrong number of results detected') self.assertEqual(result.index[0], 'gyrA-A70T', msg='Wrong file') self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type') self.assertEqual(result['Position'].iloc[0], 70, msg='Wrong codon position') self.assertEqual(result['Mutation'].iloc[0], 'GCC -> ACC (A -> T)', msg='Wrong mutation') self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid') self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap') self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2592/2592', msg='Wrong lengths') self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R', 'Wrong phenotype') hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A70T.fsa') records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta')) self.assertEqual(len(records), 1, 'Wrong number of hit records') expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta')) self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")
Example #27
Source File: ground_truth.py From deepcontact with MIT License | 5 votes |
def __init__(self, config): self.config = config if 'astral206' in self.config['ground_truth'] and self.config['ground_truth']['astral206']: self.original_sequences = SeqIO.to_dict( SeqIO.parse(self.config['ground_truth']['original_sequence'], 'fasta'))
Example #28
Source File: VariantPhaser.py From cDNA_Cupcake with BSD 3-Clause Clear License | 5 votes |
def phase_variant(self, sam_filename, input_fa_or_fq, output_prefix, partial_ok=False): """ :param sam_filename: CCS SAM filename. Can be unsorted. :param input_fa_or_fq: Input CCS fasta/fastq filename. :param output_prefix: Output prefix. Writes to xxx.log. :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions. For each alignment: 1. discard if did not map to the strand expected 2. discard if did not map to the full range of variants (unless <partial_ok> is True) 3. discard if at var positions have non-called bases (outliers) """ f_log = open(output_prefix+'.log', 'w') seq_dict = SeqIO.to_dict(SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq))) for r in GMAPSAMReader(sam_filename, True, query_len_dict=dict((k, len(seq_dict[k].seq)) for k in seq_dict)): if r.sID == '*': f_log.write("Ignore {0} because: unmapped.\n".format(r.qID)) continue if r.flag.strand != self.vc.expected_strand: f_log.write("Ignore {0} because: strand is {1}.\n".format(r.qID, r.flag.strand)) continue # ignore if not partial_ok and (r.sStart > self.min_var_pos or r.sEnd < self.max_var_pos): f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(r.qID, r.sStart+1, r.sEnd)) continue i, msg = self.match_haplotype(r, str(seq_dict[r.qID].seq).upper(), partial_ok) if i is None: # read is rejected for reason listed in <msg> f_log.write("Ignore {0} because: {1}.\n".format(r.qID, msg)) continue else: f_log.write("{0} phased: haplotype {1}={2}\n".format(r.qID, i, self.haplotypes[i])) print("{0} has haplotype {1}:{2}".format(r.qID, i, self.haplotypes[i])) self.seq_hap_info[r.qID] = i
Example #29
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_postprocess_fill_gaps(self, existing_file, existing_aln, ref_seq, fill_gaps): """Postprocess should make the gaps ambiguous only if requested""" align.postprocess(existing_file, None, True, fill_gaps) output = SeqIO.to_dict(SeqIO.parse(existing_file, "fasta")) for name, record in output.items(): for idx, site in enumerate(existing_aln[name].seq): if site == "-": assert (record.seq[idx] == "N") == fill_gaps
Example #30
Source File: test_filter.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def test_filter_run_with_query(self, tmpdir, fasta_fn, argparser): """Test that filter --query works as expected""" out_fn = str(tmpdir / "out.fasta") meta_fn = write_metadata(tmpdir, (("strain","location","quality"), ("SEQ_1","colorado","good"), ("SEQ_2","colorado","bad"), ("SEQ_3","nevada","good"))) args = argparser('-s %s --metadata %s -o %s --query "location==\'colorado\'"' % (fasta_fn, meta_fn, out_fn)) augur.filter.run(args) output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta")) assert list(output.keys()) == ["SEQ_1", "SEQ_2"]