Python Bio.SeqIO.to_dict() Examples

The following are 30 code examples of Bio.SeqIO.to_dict(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Bio.SeqIO , or try the search function .
Example #1
Source File: paralog_investigator.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def extract_paralogs(gene,prefix):
    
    putative_paralog_ids = list(set([x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))]))
    try:
        chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip()
    except IOError:
        return 0
    
    exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta'))
    
    if not os.path.isdir(os.path.join(gene,prefix,'paralogs')):
        os.mkdir(os.path.join(gene,prefix,"paralogs"))
    seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids]
    
    for seq in range(len(seqs_to_write)):
        if seqs_to_write[seq].id == chosen_paralog:
            seqs_to_write[seq].id = "{}.{}".format(prefix,"main")
            
        else:
            seqs_to_write[seq].id = "{}.{}".format(prefix,seq)
    
    SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta')
    
    return len(seqs_to_write) 
Example #2
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 6 votes vote down vote up
def testResfinderBetaLactamDelMiddleSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-del-middle.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.33, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-del-middle.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(
            SeqIO.parse(path.join(self.test_data_dir, 'resfinder_beta-lactam-blaIMP-42-del-middle.fsa'), 'fasta'))
        logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq)
        logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq)
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match") 
Example #3
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 6 votes vote down vote up
def testResfinderBetaLactamInsStartSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-ins-start.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.73, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-ins-start.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(
            SeqIO.parse(path.join(self.test_data_dir, 'beta-lactam-blaIMP-42-mut-2.fsa'), 'fasta'))
        logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq)
        logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq)
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match") 
Example #4
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 6 votes vote down vote up
def testResfinderBetaLactamDelStartSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-del-start.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 100.00, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 91.90, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '681/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-del-start.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match") 
Example #5
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 6 votes vote down vote up
def testResfinderBetaLactamInsMiddleSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-ins-middle.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 97, 99, 99, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 98.14, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 101.62, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '753/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-ins-middle.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(
            SeqIO.parse(path.join(self.test_data_dir, 'beta-lactam-blaIMP-42-ins-middle.fsa'), 'fasta'))
        logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq)
        logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq)
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq.upper(), records['blaIMP-42_1_AB753456'].seq,
                         "records don't match") 
Example #6
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 6 votes vote down vote up
def testResfinderBetaLactam2MutationsSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.73, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0],
                         'ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone, meropenem',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match") 
Example #7
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 6 votes vote down vote up
def testResfinderExcludeNonMatches(self):
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, self.blast_handler,
                                               self.pointfinder_drug_table, self.pointfinder_database,
                                               include_negative_results=False, output_dir=self.outdir.name)
        file_beta_lactam = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa")
        file_non_match = path.join(self.test_data_dir, "non-match.fsa")
        files = [file_beta_lactam, file_non_match]
        amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        summary_results = amr_detection.get_summary_results()
        self.assertEqual(len(summary_results.index), 1, 'Wrong number of rows in result')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file_beta_lactam, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match") 
Example #8
Source File: intronerate.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def make_intron_supercontig(contig_info,gene,prefix,add_N = False):
    cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta'))
    intron_supercontig = SeqRecord(Seq(''))
    for i in contig_info:
        if i[5] == "(+)":
            intron_supercontig += cap3contigs[i[0]]
        elif i[5] == "(-)":
            intron_supercontig += cap3contigs[i[0]].reverse_complement()    
        else:
            sys.stderr.write("Strandedness not found!")
            sys.exit(1)
        if add_N and i != contig_info[-1]:
            intron_supercontig += "NNNNNNNNNN"    
    intron_supercontig.id = '{}-{}'.format(prefix,gene)
    intron_supercontig.description = ''
    SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta') 
Example #9
Source File: exonerate_hits.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def initial_exonerate(proteinfilename, assemblyfilename,prefix):
    """Conduct exonerate search, returns a dictionary of results.
    Using the ryo option in exonerate, the header should contain all the useful information."""
    logger = logging.getLogger("pipeline")
    
    outputfilename = "%s/exonerate_results.fasta" %prefix
    exonerate_ryo = '">%ti,%qi,%qab,%qae,%pi,(%tS),%tab,%tae\\n%tcs\\n"'
    exonerate_command = "exonerate -m protein2genome --showalignment no --showvulgar no -V 0 --ryo %s %s %s >%s" % (exonerate_ryo,proteinfilename,assemblyfilename,outputfilename)
    
    logger.debug(exonerate_command)
    #print exonerate_ryo
    #proc = subprocess.Popen(['exonerate','-m','protein2genome','--showalignment','no','-V','0','--showvulgar','no','--ryo',exonerate_ryo,proteinfilename,assemblyfilename])
    proc = subprocess.call(exonerate_command,shell=True)
    protHitsCount = 0
    #proc.wait()
    records = SeqIO.to_dict(SeqIO.parse(outputfilename,'fasta'))
    #proc.stdout.close()
    
    return records 
Example #10
Source File: test_AMRDetectionPlasmid.py    From staramr with Apache License 2.0 6 votes vote down vote up
def testPlasmidfinderNameSuccess(self):
        file = path.join(self.test_data_dir, "test-plasmids-seq.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        plasmidfinder_results = self.amr_detection.get_plasmidfinder_results()
        self.assertEqual(len(plasmidfinder_results.index), 1, 'Wrong number of rows in result')

        result = plasmidfinder_results[plasmidfinder_results['Plasmid'] == "IncW"]

        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 100.00, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['Accession'].iloc[0], 'EF633507', msg='Wrong accession')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '243/243', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'plasmidfinder_test-plasmids-seq.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['IncW_1__EF633507'].seq, records['IncW_1__EF633507'].seq,
                         "records don't match") 
Example #11
Source File: cov.py    From methgo with MIT License 6 votes vote down vote up
def get_ctxnum(reffile):
    """
    Get the number of CG/CHG/CHH from a reference genome FASTA file
    """
    with open(reffile) as infile:
        fasta = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))
        for chr in fasta:
            fasta[chr] = str(fasta[chr].seq).upper()
    num_cg = 0
    num_chg = 0
    num_chh = 0
    for chr in fasta:
        num_cg += len([match.start() for match in re.finditer(r'(?=(CG))', fasta[chr])])
        num_cg += len([match.start()-1 for match in re.finditer(r'(?<=(CG))', fasta[chr])])
        num_chg += len([match.start() for match in re.finditer(r'(?=(C[ACT]G))', fasta[chr])])
        num_chg += len([match.start()-1 for match in re.finditer(r'(?<=(C[AGT]G))', fasta[chr])])
        num_chh += len([match.start() for match in re.finditer(r'(?=(C[ACT][ACT]))', fasta[chr])])
        num_chh += len([match.start()-1 for match in re.finditer(r'(?<=([AGT][AGT]G))', fasta[chr])])
    return num_cg, num_chg, num_chh 
Example #12
Source File: PSVLocations.py    From SDA with MIT License 6 votes vote down vote up
def checkPsvs(df):
	print("Checking if PSVs appear in assemblies", file=sys.stderr)
	from Bio import SeqIO
	recs = SeqIO.to_dict(SeqIO.parse(args.check, "fasta"))
	groups = df.groupby(by ="ccid")
	for name, group in groups:
		# skip if we cannot find fasta entry 
		if(name not in recs):
			continue 
		rec = recs[name]
		for idx, row in group.iterrows():
			pos = row["qpos"]
			alt = row["truealt"]
			recalt = rec.seq[pos].upper()
			#print(alt, recalt, pos, name, file=sys.stderr)
			assert alt == recalt, "PSV called inccorectly at {}:{}, {} instead of {}".format(name,pos, alt, recalt) 
Example #13
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference
        is added to the existing alignment and no other changes are made."""
        aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file)
        assert ref_name == ref_seq.id, "Didn't return strain name from refrence file"
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment"
        # Alignment file should have the reference added
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys() 
Example #14
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made."""
        aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None)
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly"
        # Alignment file should be unchanged
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys() 
Example #15
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 5 votes vote down vote up
def testPointfinderSalmonella_16S_rrSD_C1065T_Success(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "16S_rrsD-1T1065.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,1000,1000000,1000,300,1000)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        #Make sure that the quality module passes when we pass 1000,1000000,1000,300,1000 for the quality checking parameters
        summary_results = amr_detection.get_summary_results()
        self.assertEqual('Passed', summary_results['Quality Module'].iloc[0], 'Quality result not equal')

        result = pointfinder_results[pointfinder_results['Gene'] == '16S_rrsD (C1065T)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], '16S_rrsD-1T1065', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'nucleotide', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 1065, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'C -> T', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.94, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '1544/1544', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'spectinomycin',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_16S_rrsD-1T1065.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['16S_rrsD'].seq.upper(), records['16S_rrsD'].seq.upper(),
                         "records don't match") 
Example #16
Source File: dbotu.py    From amplicon_sequencing_pipeline with MIT License 5 votes vote down vote up
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
        '''
        seq_table: pandas.DataFrame
          Samples on the columns; sequences on the rows
        records: index of Bio.Seq
          Indexed, unaligned input sequences. This could come from BioPython's
          SeqIO.to_dict or SeqIO.index.
        max_dist: float
          genetic distance cutoff above which a sequence will not be merged into an OTU
        min_fold: float
          Multiply the sequence's abundance by this fold to get the minimum abundance
          of an OTU for merging
        threshold_pval: float
          P-value below which a sequence will not be merged into an OTU
        log: filehandle
          Log file reporting the abundance, genetic, and distribution checks.
        '''
        self.seq_table = seq_table
        self.records = records
        self.max_dist = max_dist
        self.min_fold = min_fold
        self.threshold_pval = threshold_pval
        self.log = log

        # get a list of the names of the sequences in order of their (decreasing) abundance
        self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

        # check that all sequence IDs in the table are in the fasta
        missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
        if len(missing_ids) > 0:
            raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

        # initialize OTU information
        self.membership = {}
        self.otus = [] 
Example #17
Source File: met.py    From methgo with MIT License 5 votes vote down vote up
def const_ctxstr(reffile):
    """
    Construct methylation context strings from a reference genome FASTA file
    """
    with open(reffile) as infile:
        fasta = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))
        for chr in fasta:
            fasta[chr] = str(fasta[chr].seq).upper()
    ctxstr = {}
    for chr in fasta:
        ctxstr[chr] = ['-']*len(fasta[chr])
        cg = [match.start() for match in re.finditer(r'(?=(CG))', fasta[chr])]
        for pos in cg:
            ctxstr[chr][pos] = 'X'
        chg = [match.start() for match in re.finditer(r'(?=(C[ACT]G))', fasta[chr])]
        for pos in chg:
            ctxstr[chr][pos] = 'Y'
        chh = [match.start() for match in re.finditer(r'(?=(C[ACT][ACT]))', fasta[chr])]
        for pos in chh:
            ctxstr[chr][pos] = 'Z'
        rcg = [match.start()-1 for match in re.finditer(r'(?<=(CG))', fasta[chr])]
        for pos in rcg:
            ctxstr[chr][pos] = 'x'
        rchg = [match.start()-1 for match in re.finditer(r'(?<=(C[AGT]G))', fasta[chr])]
        for pos in rchg:
            ctxstr[chr][pos] = 'y'
        rchh = [match.start()-1 for match in re.finditer(r'(?<=([AGT][AGT]G))', fasta[chr])]
        for pos in rchh:
            ctxstr[chr][pos] = 'z'
    for chr in ctxstr:
        ctxstr[chr] = ''.join(ctxstr[chr])
    return ctxstr 
Example #18
Source File: splice_cycle.py    From Cogent with BSD 3-Clause Clear License 5 votes vote down vote up
def precycle_kmer_adjustment(kmer_size):
    """
    Even with detect_and_replace_cycle (which only looks at repeat kmers within a seq),
    there can be cycles in the graph. So instead, try to detect k-mer re-usage and do simple
    tests using networkx.simple_cycles to see if we can eliminate them as much as possible.
    """
    seqdict = SeqIO.to_dict(SeqIO.parse(open('in.trimmed.fa'), 'fasta'))
    kmer_usage = defaultdict(lambda: defaultdict(lambda: [])) # kmer -> (seqid, i)
    for r in SeqIO.parse(open('in.trimmed.fa'),'fasta'):
        for i in range(len(r.seq)-kmer_size):
            kmer_usage[str(r.seq)[i:i+kmer_size]][r.id].append(i)

    max_kmer_needed = []
    for kmer,v in kmer_usage.items():
        for seqid, indices in v.items():
            if len(indices) > 1: # kmer appeared more than once in the same sequence
                max_kmer_needed.append(max_common_sequence_length(seqdict[seqid], indices, kmer_size))
    if len(max_kmer_needed) == 0:
        log.info("K-mer in-seq cycle detection: None found at k={0}".format(kmer_size))
        return kmer_size
    else:
        min_adjusted_k = min(max_kmer_needed)
        max_adjusted_k = max(max_kmer_needed)
        mean_adjusted_k = sum(max_kmer_needed)*1./len(max_kmer_needed)
        log.info("K-mer in-seq cycle detection: {0} in-seq cycle found with max common \
        sequence analysis showing min: {1}, max: {2}, mean: {3}".format(\
            len(max_kmer_needed), min_adjusted_k, max_adjusted_k, mean_adjusted_k))

        return min(100, max_adjusted_k) 
Example #19
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 5 votes vote down vote up
def testPointfinderSalmonellaA67PSuccess(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A67P.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A67P', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R, nalidixic acid',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match") 
Example #20
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 5 votes vote down vote up
def testPointfinderSalmonellaA67PSuccessNoPhenotype(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetection(self.resfinder_database, blast_handler, pointfinder_database,
                                     output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A67P.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A67P', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths')
        self.assertFalse('Predicted Phenotype' in result.columns, 'Should not exist Predicted Phenotype column')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match") 
Example #21
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 5 votes vote down vote up
def testPointfinderSalmonellaA67PReverseComplementSuccess(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A67P-rc.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        #Make sure that the quality module fails when we pass 0,0,0,0,0 for the quality checking parameters
        summary_results = amr_detection.get_summary_results()
        self.assertEqual('Failed', summary_results['Quality Module'].iloc[0], 'Quality result not equal')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A67P-rc', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R, nalidixic acid',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P-rc.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(path.join(self.test_data_dir, "gyrA-A67P.fsa"), 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match") 
Example #22
Source File: fasta_spliter.py    From deepcontact with MIT License 5 votes vote down vote up
def save_seqs(self):
        if not os.path.exists(self.config['path']['id_list']):
            self.save_id_list()
        util.make_dir_if_not_exist(self.config['path']['sequence_dir'])

        sequences = SeqIO.to_dict(self.all_sequences())
        with open(self.config['path']['id_list']) as f_in:
            for line in f_in:
                seq_id = line.strip().split()[0]
                seq_record = sequences[seq_id]
                with open(os.path.join(self.config['path']['sequence_dir'], seq_record.id + '.fasta'), 'w') as f_out:
                    SeqIO.write(seq_record, f_out, 'fasta') 
Example #23
Source File: LocalAssembly3.py    From SDA with MIT License 5 votes vote down vote up
def addSeqs(self):
		fasta = self.mydir + self.asm + ".assemblies.fasta"
		Seqs = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
		toadd = []
		for fastaid in self.all["query_name"]:
			if(fastaid in Seqs):
				toadd.append(Seqs[fastaid].seq)
			else:
				toadd.append("NA")
		self.all["seq"] = toadd 
Example #24
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 5 votes vote down vote up
def testResfinderIncludeNonMatches(self):
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, self.blast_handler,
                                               self.pointfinder_drug_table, self.pointfinder_database,
                                               include_negative_results=True, output_dir=self.outdir.name)
        file_beta_lactam = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa")
        file_non_match = path.join(self.test_data_dir, "non-match.fsa")
        files = [file_beta_lactam, file_non_match]
        amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        summary_results = amr_detection.get_summary_results()
        self.assertEqual(len(summary_results.index), 2, 'Wrong number of rows in result')

        result_beta_lactam = summary_results.loc['beta-lactam-blaIMP-42-mut-2']
        self.assertTrue(isinstance(result_beta_lactam, pd.Series), 'Wrong type of results returned')
        self.assertEqual(result_beta_lactam['Genotype'], 'blaIMP-42', 'Wrong genotype')

        result_sensitive = summary_results.loc['non-match']
        self.assertTrue(isinstance(result_sensitive, pd.Series), 'Wrong type of results returned')
        self.assertEqual(result_sensitive['Genotype'], 'None', 'Wrong genotype')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file_beta_lactam, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match") 
Example #25
Source File: split-clusters.py    From zika-pipeline with MIT License 5 votes vote down vote up
def main():
        records = SeqIO.to_dict(SeqIO.parse(open(sys.argv[1]), 'fasta'))

	reader = csv.DictReader(sys.stdin, dialect="excel-tab")
	clusters = list(reader)

	groups = set([c['group'] for c in clusters])

	for group in groups:
		print "cluster%s\t%s-cluster%s" % (group, sys.argv[1], group)
		with open('%s-cluster%s' %(sys.argv[1], group), 'w') as fout:
			SeqIO.write([records[i['node']] for i in clusters if i['group'] == group], fout, 'fasta') 
Example #26
Source File: test_AMRDetection.py    From staramr with Apache License 2.0 5 votes vote down vote up
def testPointfinderCampylobacterA70TSuccess(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'campylobacter')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A70T.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A70T)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A70T', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 70, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> ACC (A -> T)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2592/2592', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R', 'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A70T.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match") 
Example #27
Source File: ground_truth.py    From deepcontact with MIT License 5 votes vote down vote up
def __init__(self, config):
        self.config = config
        if 'astral206' in self.config['ground_truth'] and self.config['ground_truth']['astral206']:
            self.original_sequences = SeqIO.to_dict(
                    SeqIO.parse(self.config['ground_truth']['original_sequence'], 'fasta')) 
Example #28
Source File: VariantPhaser.py    From cDNA_Cupcake with BSD 3-Clause Clear License 5 votes vote down vote up
def phase_variant(self, sam_filename, input_fa_or_fq, output_prefix, partial_ok=False):
        """
        :param sam_filename: CCS SAM filename. Can be unsorted.
        :param input_fa_or_fq: Input CCS fasta/fastq filename.
        :param output_prefix: Output prefix. Writes to xxx.log.
        :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions.

        For each alignment:
        1. discard if did not map to the strand expected
        2. discard if did not map to the full range of variants (unless <partial_ok> is True)
        3. discard if at var positions have non-called bases (outliers)
        """
        f_log = open(output_prefix+'.log', 'w')

        seq_dict = SeqIO.to_dict(SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))
        for r in GMAPSAMReader(sam_filename, True, query_len_dict=dict((k, len(seq_dict[k].seq)) for k in seq_dict)):
            if r.sID == '*':
                f_log.write("Ignore {0} because: unmapped.\n".format(r.qID))
                continue
            if r.flag.strand != self.vc.expected_strand:
                f_log.write("Ignore {0} because: strand is {1}.\n".format(r.qID, r.flag.strand))
                continue # ignore
            if not partial_ok and (r.sStart > self.min_var_pos or r.sEnd < self.max_var_pos):
                f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(r.qID, r.sStart+1, r.sEnd))
                continue

            i, msg = self.match_haplotype(r, str(seq_dict[r.qID].seq).upper(), partial_ok)
            if i is None: # read is rejected for reason listed in <msg>
                f_log.write("Ignore {0} because: {1}.\n".format(r.qID, msg))
                continue
            else:
                f_log.write("{0} phased: haplotype {1}={2}\n".format(r.qID, i, self.haplotypes[i]))
                print("{0} has haplotype {1}:{2}".format(r.qID, i, self.haplotypes[i]))
                self.seq_hap_info[r.qID] = i 
Example #29
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_postprocess_fill_gaps(self, existing_file, existing_aln, ref_seq, fill_gaps):
        """Postprocess should make the gaps ambiguous only if requested"""
        align.postprocess(existing_file, None, True, fill_gaps)
        output = SeqIO.to_dict(SeqIO.parse(existing_file, "fasta"))
        for name, record in output.items():
            for idx, site in enumerate(existing_aln[name].seq):
                if site == "-":
                    assert (record.seq[idx] == "N") == fill_gaps 
Example #30
Source File: test_filter.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_filter_run_with_query(self, tmpdir, fasta_fn, argparser):
        """Test that filter --query works as expected"""
        out_fn = str(tmpdir / "out.fasta")
        meta_fn = write_metadata(tmpdir, (("strain","location","quality"),
                                          ("SEQ_1","colorado","good"),
                                          ("SEQ_2","colorado","bad"),
                                          ("SEQ_3","nevada","good")))
        args = argparser('-s %s --metadata %s -o %s --query "location==\'colorado\'"'
                         % (fasta_fn, meta_fn, out_fn))
        augur.filter.run(args)
        output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
        assert list(output.keys()) == ["SEQ_1", "SEQ_2"]