Python Examples of Bio.SeqIO.to

Source File: paralog_investigator.py From HybPiper with GNU General Public License v3.0

6 votes

def extract_paralogs(gene,prefix):
    
    putative_paralog_ids = list(set([x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))]))
    try:
        chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip()
    except IOError:
        return 0
    
    exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta'))
    
    if not os.path.isdir(os.path.join(gene,prefix,'paralogs')):
        os.mkdir(os.path.join(gene,prefix,"paralogs"))
    seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids]
    
    for seq in range(len(seqs_to_write)):
        if seqs_to_write[seq].id == chosen_paralog:
            seqs_to_write[seq].id = "{}.{}".format(prefix,"main")
            
        else:
            seqs_to_write[seq].id = "{}.{}".format(prefix,seq)
    
    SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta')
    
    return len(seqs_to_write)

Source File: test_AMRDetection.py From staramr with Apache License 2.0

6 votes

def testResfinderBetaLactamDelMiddleSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-del-middle.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.33, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-del-middle.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(
            SeqIO.parse(path.join(self.test_data_dir, 'resfinder_beta-lactam-blaIMP-42-del-middle.fsa'), 'fasta'))
        logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq)
        logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq)
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match")

Source File: test_AMRDetection.py From staramr with Apache License 2.0

6 votes

def testResfinderBetaLactamInsStartSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-ins-start.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.73, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-ins-start.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(
            SeqIO.parse(path.join(self.test_data_dir, 'beta-lactam-blaIMP-42-mut-2.fsa'), 'fasta'))
        logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq)
        logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq)
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match")

Source File: test_AMRDetection.py From staramr with Apache License 2.0

6 votes

def testResfinderBetaLactamDelStartSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-del-start.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 91, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 100.00, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 91.90, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '681/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-del-start.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match")

Source File: test_AMRDetection.py From staramr with Apache License 2.0

6 votes

def testResfinderBetaLactamInsMiddleSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-ins-middle.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 97, 99, 99, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 98.14, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 101.62, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '753/741', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-ins-middle.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(
            SeqIO.parse(path.join(self.test_data_dir, 'beta-lactam-blaIMP-42-ins-middle.fsa'), 'fasta'))
        logger.debug("expected_seq=%s", expected_records['blaIMP-42_1_AB753456'].seq)
        logger.debug("actual_seq=%s", records['blaIMP-42_1_AB753456'].seq)
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq.upper(), records['blaIMP-42_1_AB753456'].seq,
                         "records don't match")

Source File: test_AMRDetection.py From staramr with Apache License 2.0

6 votes

def testResfinderBetaLactam2MutationsSuccess(self):
        file = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        resfinder_results = self.amr_detection.get_resfinder_results()
        self.assertEqual(len(resfinder_results.index), 1, 'Wrong number of rows in result')

        result = resfinder_results[resfinder_results['Gene'] == 'blaIMP-42']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.73, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '741/741', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0],
                         'ampicillin, amoxicillin/clavulanic acid, cefoxitin, ceftriaxone, meropenem',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match")

Source File: test_AMRDetection.py From staramr with Apache License 2.0

6 votes

def testResfinderExcludeNonMatches(self):
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, self.blast_handler,
                                               self.pointfinder_drug_table, self.pointfinder_database,
                                               include_negative_results=False, output_dir=self.outdir.name)
        file_beta_lactam = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa")
        file_non_match = path.join(self.test_data_dir, "non-match.fsa")
        files = [file_beta_lactam, file_non_match]
        amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        summary_results = amr_detection.get_summary_results()
        self.assertEqual(len(summary_results.index), 1, 'Wrong number of rows in result')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file_beta_lactam, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match")

Source File: intronerate.py From HybPiper with GNU General Public License v3.0

6 votes

def make_intron_supercontig(contig_info,gene,prefix,add_N = False):
    cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta'))
    intron_supercontig = SeqRecord(Seq(''))
    for i in contig_info:
        if i[5] == "(+)":
            intron_supercontig += cap3contigs[i[0]]
        elif i[5] == "(-)":
            intron_supercontig += cap3contigs[i[0]].reverse_complement()    
        else:
            sys.stderr.write("Strandedness not found!")
            sys.exit(1)
        if add_N and i != contig_info[-1]:
            intron_supercontig += "NNNNNNNNNN"    
    intron_supercontig.id = '{}-{}'.format(prefix,gene)
    intron_supercontig.description = ''
    SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta')

Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0

6 votes

def initial_exonerate(proteinfilename, assemblyfilename,prefix):
    """Conduct exonerate search, returns a dictionary of results.
    Using the ryo option in exonerate, the header should contain all the useful information."""
    logger = logging.getLogger("pipeline")
    
    outputfilename = "%s/exonerate_results.fasta" %prefix
    exonerate_ryo = '">%ti,%qi,%qab,%qae,%pi,(%tS),%tab,%tae\\n%tcs\\n"'
    exonerate_command = "exonerate -m protein2genome --showalignment no --showvulgar no -V 0 --ryo %s %s %s >%s" % (exonerate_ryo,proteinfilename,assemblyfilename,outputfilename)
    
    logger.debug(exonerate_command)
    #print exonerate_ryo
    #proc = subprocess.Popen(['exonerate','-m','protein2genome','--showalignment','no','-V','0','--showvulgar','no','--ryo',exonerate_ryo,proteinfilename,assemblyfilename])
    proc = subprocess.call(exonerate_command,shell=True)
    protHitsCount = 0
    #proc.wait()
    records = SeqIO.to_dict(SeqIO.parse(outputfilename,'fasta'))
    #proc.stdout.close()
    
    return records

Source File: test_AMRDetectionPlasmid.py From staramr with Apache License 2.0

6 votes

def testPlasmidfinderNameSuccess(self):
        file = path.join(self.test_data_dir, "test-plasmids-seq.fsa")
        files = [file]
        self.amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        plasmidfinder_results = self.amr_detection.get_plasmidfinder_results()
        self.assertEqual(len(plasmidfinder_results.index), 1, 'Wrong number of rows in result')

        result = plasmidfinder_results[plasmidfinder_results['Plasmid'] == "IncW"]

        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 100.00, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['Accession'].iloc[0], 'EF633507', msg='Wrong accession')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '243/243', msg='Wrong lengths')

        hit_file = path.join(self.outdir.name, 'plasmidfinder_test-plasmids-seq.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['IncW_1__EF633507'].seq, records['IncW_1__EF633507'].seq,
                         "records don't match")

Source File: cov.py From methgo with MIT License

6 votes

def get_ctxnum(reffile):
    """
    Get the number of CG/CHG/CHH from a reference genome FASTA file
    """
    with open(reffile) as infile:
        fasta = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))
        for chr in fasta:
            fasta[chr] = str(fasta[chr].seq).upper()
    num_cg = 0
    num_chg = 0
    num_chh = 0
    for chr in fasta:
        num_cg += len([match.start() for match in re.finditer(r'(?=(CG))', fasta[chr])])
        num_cg += len([match.start()-1 for match in re.finditer(r'(?<=(CG))', fasta[chr])])
        num_chg += len([match.start() for match in re.finditer(r'(?=(C[ACT]G))', fasta[chr])])
        num_chg += len([match.start()-1 for match in re.finditer(r'(?<=(C[AGT]G))', fasta[chr])])
        num_chh += len([match.start() for match in re.finditer(r'(?=(C[ACT][ACT]))', fasta[chr])])
        num_chh += len([match.start()-1 for match in re.finditer(r'(?<=([AGT][AGT]G))', fasta[chr])])
    return num_cg, num_chg, num_chh

Source File: PSVLocations.py From SDA with MIT License

6 votes

def checkPsvs(df):
	print("Checking if PSVs appear in assemblies", file=sys.stderr)
	from Bio import SeqIO
	recs = SeqIO.to_dict(SeqIO.parse(args.check, "fasta"))
	groups = df.groupby(by ="ccid")
	for name, group in groups:
		# skip if we cannot find fasta entry 
		if(name not in recs):
			continue 
		rec = recs[name]
		for idx, row in group.iterrows():
			pos = row["qpos"]
			alt = row["truealt"]
			recalt = rec.seq[pos].upper()
			#print(alt, recalt, pos, name, file=sys.stderr)
			assert alt == recalt, "PSV called inccorectly at {}:{}, {} instead of {}".format(name,pos, alt, recalt)

Source File: test_align.py From augur with GNU Affero General Public License v3.0

6 votes

def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference
        is added to the existing alignment and no other changes are made."""
        aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file)
        assert ref_name == ref_seq.id, "Didn't return strain name from refrence file"
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment"
        # Alignment file should have the reference added
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys()

Source File: test_align.py From augur with GNU Affero General Public License v3.0

6 votes

def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made."""
        aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None)
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly"
        # Alignment file should be unchanged
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys()

Source File: test_AMRDetection.py From staramr with Apache License 2.0

5 votes

def testPointfinderSalmonella_16S_rrSD_C1065T_Success(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "16S_rrsD-1T1065.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,1000,1000000,1000,300,1000)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        #Make sure that the quality module passes when we pass 1000,1000000,1000,300,1000 for the quality checking parameters
        summary_results = amr_detection.get_summary_results()
        self.assertEqual('Passed', summary_results['Quality Module'].iloc[0], 'Quality result not equal')

        result = pointfinder_results[pointfinder_results['Gene'] == '16S_rrsD (C1065T)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], '16S_rrsD-1T1065', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'nucleotide', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 1065, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'C -> T', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.94, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '1544/1544', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'spectinomycin',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_16S_rrsD-1T1065.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['16S_rrsD'].seq.upper(), records['16S_rrsD'].seq.upper(),
                         "records don't match")

Source File: dbotu.py From amplicon_sequencing_pipeline with MIT License

5 votes

def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
        '''
        seq_table: pandas.DataFrame
          Samples on the columns; sequences on the rows
        records: index of Bio.Seq
          Indexed, unaligned input sequences. This could come from BioPython's
          SeqIO.to_dict or SeqIO.index.
        max_dist: float
          genetic distance cutoff above which a sequence will not be merged into an OTU
        min_fold: float
          Multiply the sequence's abundance by this fold to get the minimum abundance
          of an OTU for merging
        threshold_pval: float
          P-value below which a sequence will not be merged into an OTU
        log: filehandle
          Log file reporting the abundance, genetic, and distribution checks.
        '''
        self.seq_table = seq_table
        self.records = records
        self.max_dist = max_dist
        self.min_fold = min_fold
        self.threshold_pval = threshold_pval
        self.log = log

        # get a list of the names of the sequences in order of their (decreasing) abundance
        self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

        # check that all sequence IDs in the table are in the fasta
        missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
        if len(missing_ids) > 0:
            raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

        # initialize OTU information
        self.membership = {}
        self.otus = []

Source File: met.py From methgo with MIT License

5 votes

def const_ctxstr(reffile):
    """
    Construct methylation context strings from a reference genome FASTA file
    """
    with open(reffile) as infile:
        fasta = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))
        for chr in fasta:
            fasta[chr] = str(fasta[chr].seq).upper()
    ctxstr = {}
    for chr in fasta:
        ctxstr[chr] = ['-']*len(fasta[chr])
        cg = [match.start() for match in re.finditer(r'(?=(CG))', fasta[chr])]
        for pos in cg:
            ctxstr[chr][pos] = 'X'
        chg = [match.start() for match in re.finditer(r'(?=(C[ACT]G))', fasta[chr])]
        for pos in chg:
            ctxstr[chr][pos] = 'Y'
        chh = [match.start() for match in re.finditer(r'(?=(C[ACT][ACT]))', fasta[chr])]
        for pos in chh:
            ctxstr[chr][pos] = 'Z'
        rcg = [match.start()-1 for match in re.finditer(r'(?<=(CG))', fasta[chr])]
        for pos in rcg:
            ctxstr[chr][pos] = 'x'
        rchg = [match.start()-1 for match in re.finditer(r'(?<=(C[AGT]G))', fasta[chr])]
        for pos in rchg:
            ctxstr[chr][pos] = 'y'
        rchh = [match.start()-1 for match in re.finditer(r'(?<=([AGT][AGT]G))', fasta[chr])]
        for pos in rchh:
            ctxstr[chr][pos] = 'z'
    for chr in ctxstr:
        ctxstr[chr] = ''.join(ctxstr[chr])
    return ctxstr

Source File: splice_cycle.py From Cogent with BSD 3-Clause Clear License

5 votes

def precycle_kmer_adjustment(kmer_size):
    """
    Even with detect_and_replace_cycle (which only looks at repeat kmers within a seq),
    there can be cycles in the graph. So instead, try to detect k-mer re-usage and do simple
    tests using networkx.simple_cycles to see if we can eliminate them as much as possible.
    """
    seqdict = SeqIO.to_dict(SeqIO.parse(open('in.trimmed.fa'), 'fasta'))
    kmer_usage = defaultdict(lambda: defaultdict(lambda: [])) # kmer -> (seqid, i)
    for r in SeqIO.parse(open('in.trimmed.fa'),'fasta'):
        for i in range(len(r.seq)-kmer_size):
            kmer_usage[str(r.seq)[i:i+kmer_size]][r.id].append(i)

    max_kmer_needed = []
    for kmer,v in kmer_usage.items():
        for seqid, indices in v.items():
            if len(indices) > 1: # kmer appeared more than once in the same sequence
                max_kmer_needed.append(max_common_sequence_length(seqdict[seqid], indices, kmer_size))
    if len(max_kmer_needed) == 0:
        log.info("K-mer in-seq cycle detection: None found at k={0}".format(kmer_size))
        return kmer_size
    else:
        min_adjusted_k = min(max_kmer_needed)
        max_adjusted_k = max(max_kmer_needed)
        mean_adjusted_k = sum(max_kmer_needed)*1./len(max_kmer_needed)
        log.info("K-mer in-seq cycle detection: {0} in-seq cycle found with max common \
        sequence analysis showing min: {1}, max: {2}, mean: {3}".format(\
            len(max_kmer_needed), min_adjusted_k, max_adjusted_k, mean_adjusted_k))

        return min(100, max_adjusted_k)

Source File: test_AMRDetection.py From staramr with Apache License 2.0

5 votes

def testPointfinderSalmonellaA67PSuccess(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A67P.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A67P', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R, nalidixic acid',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")

Source File: test_AMRDetection.py From staramr with Apache License 2.0

5 votes

def testPointfinderSalmonellaA67PSuccessNoPhenotype(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetection(self.resfinder_database, blast_handler, pointfinder_database,
                                     output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A67P.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A67P', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths')
        self.assertFalse('Predicted Phenotype' in result.columns, 'Should not exist Predicted Phenotype column')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")

Source File: test_AMRDetection.py From staramr with Apache License 2.0

5 votes

def testPointfinderSalmonellaA67PReverseComplementSuccess(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'salmonella')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A67P-rc.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        #Make sure that the quality module fails when we pass 0,0,0,0,0 for the quality checking parameters
        summary_results = amr_detection.get_summary_results()
        self.assertEqual('Failed', summary_results['Quality Module'].iloc[0], 'Quality result not equal')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A67P)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A67P-rc', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 67, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> CCC (A -> P)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2637/2637', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R, nalidixic acid',
                         'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A67P-rc.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(path.join(self.test_data_dir, "gyrA-A67P.fsa"), 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")

Source File: fasta_spliter.py From deepcontact with MIT License

5 votes

def save_seqs(self):
        if not os.path.exists(self.config['path']['id_list']):
            self.save_id_list()
        util.make_dir_if_not_exist(self.config['path']['sequence_dir'])

        sequences = SeqIO.to_dict(self.all_sequences())
        with open(self.config['path']['id_list']) as f_in:
            for line in f_in:
                seq_id = line.strip().split()[0]
                seq_record = sequences[seq_id]
                with open(os.path.join(self.config['path']['sequence_dir'], seq_record.id + '.fasta'), 'w') as f_out:
                    SeqIO.write(seq_record, f_out, 'fasta')

Source File: LocalAssembly3.py From SDA with MIT License

5 votes

def addSeqs(self):
		fasta = self.mydir + self.asm + ".assemblies.fasta"
		Seqs = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
		toadd = []
		for fastaid in self.all["query_name"]:
			if(fastaid in Seqs):
				toadd.append(Seqs[fastaid].seq)
			else:
				toadd.append("NA")
		self.all["seq"] = toadd

Source File: test_AMRDetection.py From staramr with Apache License 2.0

5 votes

def testResfinderIncludeNonMatches(self):
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, self.blast_handler,
                                               self.pointfinder_drug_table, self.pointfinder_database,
                                               include_negative_results=True, output_dir=self.outdir.name)
        file_beta_lactam = path.join(self.test_data_dir, "beta-lactam-blaIMP-42-mut-2.fsa")
        file_non_match = path.join(self.test_data_dir, "non-match.fsa")
        files = [file_beta_lactam, file_non_match]
        amr_detection.run_amr_detection(files, 99, 90, 90, 90,0,0,0,0,0)

        summary_results = amr_detection.get_summary_results()
        self.assertEqual(len(summary_results.index), 2, 'Wrong number of rows in result')

        result_beta_lactam = summary_results.loc['beta-lactam-blaIMP-42-mut-2']
        self.assertTrue(isinstance(result_beta_lactam, pd.Series), 'Wrong type of results returned')
        self.assertEqual(result_beta_lactam['Genotype'], 'blaIMP-42', 'Wrong genotype')

        result_sensitive = summary_results.loc['non-match']
        self.assertTrue(isinstance(result_sensitive, pd.Series), 'Wrong type of results returned')
        self.assertEqual(result_sensitive['Genotype'], 'None', 'Wrong genotype')

        hit_file = path.join(self.outdir.name, 'resfinder_beta-lactam-blaIMP-42-mut-2.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file_beta_lactam, 'fasta'))
        self.assertEqual(expected_records['blaIMP-42_1_AB753456'].seq, records['blaIMP-42_1_AB753456'].seq,
                         "records don't match")

Source File: split-clusters.py From zika-pipeline with MIT License

5 votes

def main():
        records = SeqIO.to_dict(SeqIO.parse(open(sys.argv[1]), 'fasta'))

	reader = csv.DictReader(sys.stdin, dialect="excel-tab")
	clusters = list(reader)

	groups = set([c['group'] for c in clusters])

	for group in groups:
		print "cluster%s\t%s-cluster%s" % (group, sys.argv[1], group)
		with open('%s-cluster%s' %(sys.argv[1], group), 'w') as fout:
			SeqIO.write([records[i['node']] for i in clusters if i['group'] == group], fout, 'fasta')

Source File: test_AMRDetection.py From staramr with Apache License 2.0

5 votes

def testPointfinderCampylobacterA70TSuccess(self):
        pointfinder_database = PointfinderBlastDatabase(self.pointfinder_dir, 'campylobacter')
        blast_handler = JobHandler({'resfinder': self.resfinder_database, 'pointfinder': pointfinder_database}, 2,
                                     self.blast_out.name)
        amr_detection = AMRDetectionResistance(self.resfinder_database, self.resfinder_drug_table, blast_handler,
                                               self.pointfinder_drug_table, pointfinder_database,
                                               output_dir=self.outdir.name)

        file = path.join(self.test_data_dir, "gyrA-A70T.fsa")
        files = [file]
        amr_detection.run_amr_detection(files, 99, 99, 90, 90,0,0,0,0,0)

        pointfinder_results = amr_detection.get_pointfinder_results()
        self.assertEqual(len(pointfinder_results.index), 1, 'Wrong number of rows in result')

        result = pointfinder_results[pointfinder_results['Gene'] == 'gyrA (A70T)']
        self.assertEqual(len(result.index), 1, 'Wrong number of results detected')
        self.assertEqual(result.index[0], 'gyrA-A70T', msg='Wrong file')
        self.assertEqual(result['Type'].iloc[0], 'codon', msg='Wrong type')
        self.assertEqual(result['Position'].iloc[0], 70, msg='Wrong codon position')
        self.assertEqual(result['Mutation'].iloc[0], 'GCC -> ACC (A -> T)', msg='Wrong mutation')
        self.assertAlmostEqual(result['%Identity'].iloc[0], 99.96, places=2, msg='Wrong pid')
        self.assertAlmostEqual(result['%Overlap'].iloc[0], 100.00, places=2, msg='Wrong overlap')
        self.assertEqual(result['HSP Length/Total Length'].iloc[0], '2592/2592', msg='Wrong lengths')
        self.assertEqual(result['Predicted Phenotype'].iloc[0], 'ciprofloxacin I/R', 'Wrong phenotype')

        hit_file = path.join(self.outdir.name, 'pointfinder_gyrA-A70T.fsa')
        records = SeqIO.to_dict(SeqIO.parse(hit_file, 'fasta'))

        self.assertEqual(len(records), 1, 'Wrong number of hit records')

        expected_records = SeqIO.to_dict(SeqIO.parse(file, 'fasta'))
        self.assertEqual(expected_records['gyrA'].seq.upper(), records['gyrA'].seq.upper(), "records don't match")

Source File: ground_truth.py From deepcontact with MIT License

5 votes

def __init__(self, config):
        self.config = config
        if 'astral206' in self.config['ground_truth'] and self.config['ground_truth']['astral206']:
            self.original_sequences = SeqIO.to_dict(
                    SeqIO.parse(self.config['ground_truth']['original_sequence'], 'fasta'))

Source File: VariantPhaser.py From cDNA_Cupcake with BSD 3-Clause Clear License

5 votes

def phase_variant(self, sam_filename, input_fa_or_fq, output_prefix, partial_ok=False):
        """
        :param sam_filename: CCS SAM filename. Can be unsorted.
        :param input_fa_or_fq: Input CCS fasta/fastq filename.
        :param output_prefix: Output prefix. Writes to xxx.log.
        :param partial_ok: default False. if True, (CCS) reads don't need to cover all SNP positions.

        For each alignment:
        1. discard if did not map to the strand expected
        2. discard if did not map to the full range of variants (unless <partial_ok> is True)
        3. discard if at var positions have non-called bases (outliers)
        """
        f_log = open(output_prefix+'.log', 'w')

        seq_dict = SeqIO.to_dict(SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))
        for r in GMAPSAMReader(sam_filename, True, query_len_dict=dict((k, len(seq_dict[k].seq)) for k in seq_dict)):
            if r.sID == '*':
                f_log.write("Ignore {0} because: unmapped.\n".format(r.qID))
                continue
            if r.flag.strand != self.vc.expected_strand:
                f_log.write("Ignore {0} because: strand is {1}.\n".format(r.qID, r.flag.strand))
                continue # ignore
            if not partial_ok and (r.sStart > self.min_var_pos or r.sEnd < self.max_var_pos):
                f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(r.qID, r.sStart+1, r.sEnd))
                continue

            i, msg = self.match_haplotype(r, str(seq_dict[r.qID].seq).upper(), partial_ok)
            if i is None: # read is rejected for reason listed in <msg>
                f_log.write("Ignore {0} because: {1}.\n".format(r.qID, msg))
                continue
            else:
                f_log.write("{0} phased: haplotype {1}={2}\n".format(r.qID, i, self.haplotypes[i]))
                print("{0} has haplotype {1}:{2}".format(r.qID, i, self.haplotypes[i]))
                self.seq_hap_info[r.qID] = i

Source File: test_align.py From augur with GNU Affero General Public License v3.0

5 votes

def test_postprocess_fill_gaps(self, existing_file, existing_aln, ref_seq, fill_gaps):
        """Postprocess should make the gaps ambiguous only if requested"""
        align.postprocess(existing_file, None, True, fill_gaps)
        output = SeqIO.to_dict(SeqIO.parse(existing_file, "fasta"))
        for name, record in output.items():
            for idx, site in enumerate(existing_aln[name].seq):
                if site == "-":
                    assert (record.seq[idx] == "N") == fill_gaps

Source File: test_filter.py From augur with GNU Affero General Public License v3.0

5 votes

def test_filter_run_with_query(self, tmpdir, fasta_fn, argparser):
        """Test that filter --query works as expected"""
        out_fn = str(tmpdir / "out.fasta")
        meta_fn = write_metadata(tmpdir, (("strain","location","quality"),
                                          ("SEQ_1","colorado","good"),
                                          ("SEQ_2","colorado","bad"),
                                          ("SEQ_3","nevada","good")))
        args = argparser('-s %s --metadata %s -o %s --query "location==\'colorado\'"'
                         % (fasta_fn, meta_fn, out_fn))
        augur.filter.run(args)
        output = SeqIO.to_dict(SeqIO.parse(out_fn, "fasta"))
        assert list(output.keys()) == ["SEQ_1", "SEQ_2"]

Python Bio.SeqIO.to_dict() Examples