Python Bio.SeqIO.write() Examples
The following are 30
code examples of Bio.SeqIO.write().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
Bio.SeqIO
, or try the search function
.
Example #1
Source File: prep-genome.py From kevlar with MIT License | 7 votes |
def main(args): for record in SeqIO.parse(args.infile, 'fasta'): if args.discard: if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0: continue subseqcounter = 0 printlog(args.debug, "DEBUG: convert to upper case", record.id) sequence = str(record.seq).upper() printlog(args.debug, "DEBUG: split seq by Ns", record.id) subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength] printlog(args.debug, "DEBUG: print subseqs", record.id) for subseq in subseqs: subseqcounter += 1 subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter) subrecord = SeqRecord(Seq(subseq), subid, '', '') SeqIO.write(subrecord, args.outfile, 'fasta')
Example #2
Source File: fasta.py From ssbio with MIT License | 6 votes |
def write_fasta_file(seq_records, outname, outdir=None, outext='.faa', force_rerun=False): """Write a FASTA file for a SeqRecord or a list of SeqRecord objects. Args: seq_records (SeqRecord, list): SeqRecord or a list of SeqRecord objects outname: Name of the output file which will have outext appended to it outdir: Path to directory to output sequences to outext: Extension of FASTA file, default ".faa" force_rerun: If file should be overwritten if it exists Returns: str: Path to output FASTA file. """ if not outdir: outdir = '' outfile = ssbio.utils.outfile_maker(inname='', outname=outname, outdir=outdir, outext=outext) if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile): SeqIO.write(seq_records, outfile, "fasta") return outfile
Example #3
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file): """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made.""" aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None) assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said" assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly" # Alignment file should be unchanged aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta")) assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment" for seq in existing_aln: assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly" assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly" # test sequences should be unchanged assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said" seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta")) for seq in test_seqs: assert seq in seq_output, "Some test sequences unexpectedly dropped" assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly" assert seq_output.keys() == test_seqs.keys()
Example #4
Source File: test_align.py From augur with GNU Affero General Public License v3.0 | 6 votes |
def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file): """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference is added to the existing alignment and no other changes are made.""" aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file) assert ref_name == ref_seq.id, "Didn't return strain name from refrence file" assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said" assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment" # Alignment file should have the reference added aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta")) assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment" for seq in existing_aln: assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly" assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly" # test sequences should be unchanged assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said" seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta")) for seq in test_seqs: assert seq in seq_output, "Some test sequences unexpectedly dropped" assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly" assert seq_output.keys() == test_seqs.keys()
Example #5
Source File: depth_calculator.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def depth_summary(all_genes,prefix): """Given the coding.depth file, calculate the average depth across the recovered sequence.""" depths = {g:0 for g in all_genes} gene = "ZZZZ" gene_depths = [] for line in open("coding.depth"): line = line.split() if line[0].split("-")[-1] == gene: gene_depths.append(int(line[2])) else: if gene == "ZZZZ": gene = line[0].split("-")[-1] gene_depths = [int(line[2])] else: mean_depth = sum(gene_depths)/float(len(gene_depths)) depths[gene] = mean_depth gene_depths = [] gene = line[0].split("-")[-1] output_list = [str(depths[key]) for key in sorted(depths)] sys.stdout.write("{}\t{}\n".format(prefix,"\t".join(output_list)))
Example #6
Source File: paralog_investigator.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def extract_paralogs(gene,prefix): putative_paralog_ids = list(set([x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))])) try: chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip() except IOError: return 0 exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta')) if not os.path.isdir(os.path.join(gene,prefix,'paralogs')): os.mkdir(os.path.join(gene,prefix,"paralogs")) seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids] for seq in range(len(seqs_to_write)): if seqs_to_write[seq].id == chosen_paralog: seqs_to_write[seq].id = "{}.{}".format(prefix,"main") else: seqs_to_write[seq].id = "{}.{}".format(prefix,seq) SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta') return len(seqs_to_write)
Example #7
Source File: protein.py From ssbio with MIT License | 6 votes |
def write_all_sequences_file(self, outname, outdir=None): """Write all the stored sequences as a single FASTA file. By default, sets IDs to model gene IDs. Args: outname (str): Name of the output FASTA file without the extension outdir (str): Path to output directory for the file, default is the sequences directory """ if not outdir: outdir = self.sequence_dir if not outdir: raise ValueError('Output directory must be specified') outfile = op.join(outdir, outname + '.faa') SeqIO.write(self.sequences, outfile, "fasta") log.info('{}: wrote all protein sequences to file'.format(outfile)) return outfile
Example #8
Source File: write.py From phylopandas with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _write_doc_template(schema): s = """Write to {} format. Parameters ---------- filename : str File to write {} string to. If no filename is given, a {} string will be returned. sequence_col : str (default='sequence') Sequence column name in DataFrame. id_col : str (default='id') ID column name in DataFrame id_only : bool (default=False) If True, use only the ID column to label sequences in fasta. """.format(schema, schema, schema) return s
Example #9
Source File: seq.py From wub with Mozilla Public License 2.0 | 6 votes |
def base_complement(k): """ Return complement of base. Performs the subsitutions: A<=>T, C<=>G, X=>X for both upper and lower case. The return value is identical to the argument for all other values. :param k: A base. :returns: Complement of base. :rtype: str """ try: return comp[k] except KeyError: sys.stderr.write( "WARNING: No reverse complement for {} found, returning argument.".format(k)) return k
Example #10
Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def concatenate_sequences(gene_dict,fastafiles,unique_names): '''Given a dictionary of dictionaries with complete sampling in each gene, write out concatenated sequences to stdout. Returns a list of partition lengths.''' new_seq_dict = {} partition_lengths = [] for gene in fastafiles: for name in unique_names: try: new_seq_dict[name] += gene_dict[gene][name] except KeyError: new_seq_dict[name] = gene_dict[gene][name] partition_lengths.append(len(next(iter(gene_dict[gene].values())))) for final_seq in new_seq_dict: SeqIO.write(new_seq_dict[final_seq],sys.stdout,'fasta') final_seq_length = len(new_seq_dict[final_seq]) sys.stderr.write("Final conatenated sequence length: {}\n".format(final_seq_length)) return partition_lengths
Example #11
Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def raxml_partition(fastafiles,partition_lengths,partition_type): '''Generate a raxml partition file for the given fastafiles. User specifies the partition type''' gene_start = 1 partition_file = open("partition.raxml",'w') if partition_type == 'CODON': for g in range(len(fastafiles)): codon3_start = gene_start + 2 codon3_end = gene_start + partition_lengths[g] - 1 codon1_end = codon3_end - 2 codon2_start = gene_start + 1 codon2_end = codon3_end - 1 partition_file.write("{},{}{}={}-{}\\3,{}-{}\\3\n".format("DNA",fastafiles[g],"12",gene_start,codon1_end,codon2_start,codon2_end)) partition_file.write("{},{}{}={}-{}\\3\n".format("DNA",fastafiles[g],"3",codon3_start,codon3_end)) gene_start = codon3_end + 1 else: for g in range(len(fastafiles)): gene_end = gene_start + partition_lengths[g] - 1 partition_file.write("{},{}={}-{}\n".format(partition_type,fastafiles[g],gene_start,gene_end)) gene_start = gene_end + 1 partition_file.close()
Example #12
Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def supercontig_exonerate(supercontig,protseq,prefix,thresh=55): """Given a long, joined contig and a protein sequence, return the exonerate hit(s)""" logger = logging.getLogger("pipeline") exonerate_ryo = '>%ti,%qi,%qab,%qae,%pi,(%tS)\\n%tcs\\n' temp_prot_filename = "%s/temp.prot.fa"%prefix temp_contig_filename = "%s/temp.contig.fa"%prefix SeqIO.write(protseq,temp_prot_filename,'fasta') SeqIO.write(supercontig,temp_contig_filename,'fasta') logger.debug("Conducting exonerate search on supercontig") proc = subprocess.Popen(['exonerate','-m','protein2genome','--showalignment','no','-V','0','--showvulgar','no','--ryo',exonerate_ryo,temp_prot_filename,temp_contig_filename],stdout=subprocess.PIPE,universal_newlines=True) proc.wait() #print proc.stdout.readlines() supercontig_cds = [i for i in SeqIO.parse(proc.stdout,'fasta') if float(i.id.split(",")[4])>thresh] logger.debug("Supercontig lengths: %s" % " ".join([str(len(x.seq)) for x in supercontig_cds])) return supercontig_cds
Example #13
Source File: fastx_grep.py From bioinformatics_primer with MIT License | 6 votes |
def main(): """Make a jazz noise here""" args = get_args() regex = re.compile(args.pattern) out_fh = args.outfile or sys.stdout checked, took = 0, 0 for fh in args.file: in_format = args.format or guess_format(fh.name) for rec in SeqIO.parse(fh, in_format): checked += 1 out_fmt = args.out_format or args.format if any(map(regex.search, [rec.id, rec.description])): took += 1 SeqIO.write(rec, out_fh, args.out_format or in_format) print(f'Done, checked {checked}, took {took}.', file=sys.stderr) # --------------------------------------------------
Example #14
Source File: intronerate.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def make_intron_supercontig(contig_info,gene,prefix,add_N = False): cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta')) intron_supercontig = SeqRecord(Seq('')) for i in contig_info: if i[5] == "(+)": intron_supercontig += cap3contigs[i[0]] elif i[5] == "(-)": intron_supercontig += cap3contigs[i[0]].reverse_complement() else: sys.stderr.write("Strandedness not found!") sys.exit(1) if add_N and i != contig_info[-1]: intron_supercontig += "NNNNNNNNNN" intron_supercontig.id = '{}-{}'.format(prefix,gene) intron_supercontig.description = '' SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta')
Example #15
Source File: distribute_targets.py From HybPiper with GNU General Public License v3.0 | 6 votes |
def main(): parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("-d","--delimiter",help="Field separating FASTA ids for multiple sequences per target. Default is '-' . For no delimeter, write None", default="-") parser.add_argument("baitfile",help="FASTA file containing bait sequences") parser.add_argument("--blastx",help="tabular blastx results file, used to select the best target for each gene",default=None) parser.add_argument("--bam",help="BAM file from BWA search, alternative to the BLASTx method",default=None) parser.add_argument("--target",help="Choose this version of the target always",default=None) parser.add_argument("--unpaired",help="Indicate whether to expect a file containing results from unpaired reads.",action="store_true",default=False) parser.add_argument("--exclude",help="Do not use any sequence with the specified string as the chosen target.",default=None) args = parser.parse_args() if args.blastx: besthits = tailored_target_blast(args.blastx,args.exclude) translate = False if args.bam: translate = True besthits = tailored_target_bwa(args.bam,args.unpaired,args.exclude) distribute_targets(args.baitfile,dirs=True,delim=args.delimiter,besthits=besthits,translate=translate,target=args.target)
Example #16
Source File: app.py From InSilicoSeq with MIT License | 6 votes |
def model_from_bam(args): """Main function for the `iss model` submodule This submodule write all variables necessary for building an ErrorModel to args.output + .npz Args: args (object): the command-line arguments from argparse """ logger = logging.getLogger(__name__) logger.debug('iss version %s' % __version__) logger.debug('Using verbose logger') try: # try to import bam module and write model data to file logger.info('Starting iss model') from iss import bam except ImportError as e: logger.error('Failed to import bam module: %s' % e) sys.exit(1) else: logger.info('Using KDE ErrorModel') bam.to_model(args.bam, args.output) logger.info('Model generation complete')
Example #17
Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0 | 5 votes |
def write_exonerate_stats(contig_id_list,prefix): '''Given a list of IDs from initial exonerate search, write info to a standard file''' with open("{}/exonerate_stats.csv".format(prefix),'w') as exonerate_statsfile: exonerate_statsfile.write("\n".join(contig_id_list)+'\n')
Example #18
Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0 | 5 votes |
def paralog_test(exonerate_hits,prot,prefix): """Gives a warning if there are multiple hits of long length to the same protein""" logger = logging.getLogger("pipeline") protlength = len(prot) hitlengths = [abs(int(x.split(",")[2]) - int(x.split(",")[3])) for x in exonerate_hits["assemblyHits"]] logger.debug("protein length: {}".format(protlength)) logger.debug("Hit lengths:") logger.debug(hitlengths) longhits = [x > 0.75*protlength for x in hitlengths] if sum(longhits) > 1: sys.stderr.write("WARNING: Multiple long-length exonerate hits for {}. Check for paralogs!\n".format(prot.id)) with open("{}/paralog_warning.txt".format(prefix),'w') as pw: for hit in range(len(exonerate_hits["assemblyHits"])): if longhits[hit]: pw.write(prot.id+ "\t"+exonerate_hits["assemblyHits"][hit] + "\n")
Example #19
Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0 | 5 votes |
def report_no_sequences(protname): sys.stderr.write("No valid sequences remain for {}!\n".format(protname))
Example #20
Source File: solution.py From bioinformatics_primer with MIT License | 5 votes |
def main(): args = get_args() exclude = get_excluded(args.exclude, args.summary) out_dir = args.out_dir if not os.path.isdir(out_dir): os.mkdir(out_dir) tax_id = dict() with open(args.summary, 'r') as sum_fh: sum_hdr = read_split(sum_fh.readline()) for line in sum_fh: #info = dict(zip(sum_hdr, read_split(line))) #tax_id[ info['readID'] ] = info['taxID'] dat = read_split(line) tax_id[ dat[0] ] = dat[2] took = 0 skipped = 0 basename = os.path.basename(args.fasta) out_file = os.path.join(out_dir, basename) exclude_fh = open(os.path.join(args.exclude_dir, basename), 'wt') \ if args.exclude_dir else None with open(out_file, 'w') as out_fh: for seq in SeqIO.parse(args.fasta, "fasta"): tax = tax_id[ seq.id ] if seq.id in tax_id else '0' if tax in exclude: skipped += 1 if not exclude_fh is None: SeqIO.write(seq, exclude_fh, "fasta") else: took += 1 SeqIO.write(seq, out_fh, "fasta") print("Done, took {}, skipped {}, see output {}".format( took, skipped, out_file)) # --------------------------------------------------
Example #21
Source File: distribute_targets.py From HybPiper with GNU General Public License v3.0 | 5 votes |
def tailored_target_blast(blastxfilename,exclude=None): """Determine, for each protein, the 'best' target protein, by tallying up the blastx hit scores.""" blastxfile = open(blastxfilename) hitcounts = {} for result in blastxfile: result = result.split() hitname = result[1].split("-") bitscore = float(result[-1]) try: protname = hitname[1] except IndexError: raise IndexError("Gene name not found! FASTA headers should be formatted like this:\n >SpeciesName-GeneName\n") taxon = hitname[0] if exclude and exclude in taxon: continue else: if protname in hitcounts: if taxon in hitcounts[protname]: hitcounts[protname][taxon] += bitscore else: hitcounts[protname][taxon] = bitscore else: hitcounts[protname] = {taxon:1} #For each protein, find the taxon with the highest total hit bitscore. besthits = {} besthit_counts = {} for prot in hitcounts: top_taxon = sorted(iter(hitcounts[prot].keys()), key = lambda k: hitcounts[prot][k], reverse=True)[0] besthits[prot] = top_taxon if top_taxon in besthit_counts: besthit_counts[top_taxon] += 1 else: besthit_counts[top_taxon] = 1 tallyfile = open("bait_tallies.txt",'w') for x in besthit_counts: tallyfile.write("{}\t{}\n".format(x, besthit_counts[x])) tallyfile.close() return besthits
Example #22
Source File: depth_calculator.py From HybPiper with GNU General Public License v3.0 | 5 votes |
def merge_seqs(genelist,prefix,file_type="coding"): '''Given a list of gene sequences, retreive the sequences and generate a single FASTA file.''' with open("{}_sequences.fasta".format(file_type),"w") as outfile: for gene in genelist: if file_type == "coding": seqfile = "{}/{}/sequences/FNA/{}.FNA".format(gene,prefix,gene) else: seqfile = "{}/{}/sequences/intron/{}_supercontig.fasta".format(gene,prefix,gene) seq = SeqIO.read(seqfile,'fasta') seq.id = seq.id + "-" + gene SeqIO.write(seq,outfile,'fasta')
Example #23
Source File: solution.py From bioinformatics_primer with MIT License | 5 votes |
def get_args(): parser = argparse.ArgumentParser(description='Filter FASTA with Centrifuge') parser.add_argument('-f', '--fasta', help='fasta file', type=str, metavar='FILE', required=True) parser.add_argument('-s', '--summary', help='Centrifuge summary file', type=str, metavar='FILE', required=True) parser.add_argument('-e', '--exclude', metavar='IDS_NAMES', required=True, help='Comma-separated list of taxIDs/names to exclude') parser.add_argument('-o', '--out_dir', help='Output directory', type=str, metavar='DIR', default='filtered') parser.add_argument('-x', '--exclude_dir', metavar='DIR', required=True, help='File name to write excluded') return parser.parse_args() # --------------------------------------------------
Example #24
Source File: solution.py From bioinformatics_primer with MIT License | 5 votes |
def main(): """Make a jazz noise here""" args = get_args() out_dir = args.outdir pct_gc = args.pct_gc if not os.path.isdir(out_dir): os.makedirs(out_dir) if not 0 < pct_gc <= 100: die('--pct_gc "{}" must be between 0 and 100'.format(pct_gc)) num_seqs = 0 for i, file in enumerate(args.fasta, start=1): if not os.path.isfile(file): warn('"{}" is not a file'.format(file)) continue print('{:3}: {}'.format(i, os.path.basename(file))) base, ext = os.path.splitext(os.path.basename(file)) high_file = os.path.join(out_dir, ''.join([base, '_high', ext])) low_file = os.path.join(out_dir, ''.join([base, '_low', ext])) high_fh = open(high_file, 'wt') low_fh = open(low_file, 'wt') for rec in SeqIO.parse(file, 'fasta'): num_seqs += 1 bases = Counter(rec.seq.upper()) gc = bases.get('G', 0) + bases.get('C', 0) pct = int((gc / len(rec.seq)) * 100) SeqIO.write(rec, low_fh if pct < pct_gc else high_fh, 'fasta') print('Done, wrote {} sequence{} to out dir "{}"'.format( num_seqs, '' if num_seqs == 1 else 's', out_dir)) # --------------------------------------------------
Example #25
Source File: solution.py From bioinformatics_primer with MIT License | 5 votes |
def main(): """main""" args = get_args() input_file = args.input out_file = args.output keyword = args.keyword.lower() skip = set(map(str.lower, args.skip)) if not os.path.isfile(input_file): die('"{}" is not a file'.format(input_file)) print('Processing "{}"'.format(input_file)) num_skipped = 0 num_taken = 0 with open(out_file, "w") as out_fh: for record in SeqIO.parse(input_file, "swiss"): annot = record.annotations if skip and 'taxonomy' in annot: taxa = set(map(str.lower, annot['taxonomy'])) if skip.intersection(taxa): num_skipped += 1 continue if 'keywords' in annot: kw = set(map(str.lower, annot['keywords'])) if keyword in kw: num_taken += 1 SeqIO.write(record, out_fh, 'fasta') else: num_skipped += 1 print('Done, skipped {} and took {}. See output in "{}".'.format( num_skipped, num_taken, out_file)) # --------------------------------------------------
Example #26
Source File: solution.py From bioinformatics_primer with MIT License | 5 votes |
def main(): """Make a jazz noise here""" args = get_args() out_dir = args.outdir if not os.path.isdir(out_dir): os.makedirs(out_dir) ext = args.extension if not ext.startswith('.'): ext = '.' + ext for i, fq in enumerate(args.fastq, start=1): basename = os.path.basename(fq) root, _ = os.path.splitext(basename) print('{:3}: {}'.format(i, basename)) out_file = os.path.join(out_dir, root + ext) out_fh = open(out_file, 'wt') for record in SeqIO.parse(fq, 'fastq'): SeqIO.write(record, out_fh, 'fasta') print('Done, see output in "{}".'.format(out_dir)) # --------------------------------------------------
Example #27
Source File: solution.py From bioinformatics_primer with MIT License | 5 votes |
def process(file, input_format, output_format, out_dir, seqs_per_file): """ Spilt file into smaller files into out_dir Optionally convert to output format Return number of sequences written """ if not os.path.isfile(file): warn('"{}" is not valid'.format(file)) return 0 basename, ext = os.path.splitext(os.path.basename(file)) out_fh = None i = 0 num_written = 0 nfile = 0 for record in SeqIO.parse(file, input_format): if i == seqs_per_file: i = 0 if out_fh is not None: out_fh.close() out_fh = None i += 1 num_written += 1 if out_fh is None: nfile += 1 path = os.path.join(out_dir, basename + '.' + '{:04d}'.format(nfile) + ext) out_fh = open(path, 'wt') SeqIO.write(record, out_fh, output_format) return num_written # --------------------------------------------------
Example #28
Source File: cdhit_unclustered.py From bioinformatics_primer with MIT License | 5 votes |
def get_unclustered(cluster_file, proteins_file, out_file): """Find the unclustered proteins in the cd-hit output""" if not os.path.isfile(cluster_file): die('cdhit "{}" is not a file'.format(cluster_file)) logging.debug('Parsing "{}"'.format(cluster_file)) clustered = set([rec.id for rec in SeqIO.parse(cluster_file, 'fasta')]) # Alternate (longer) way: # clustered = set() # for rec in SeqIO.parse(cluster_file, 'fasta'): # clustered.add(rec.id) logging.debug('Will write to "{}"'.format(out_file)) out_fh = open(out_file, 'wt') num_total = 0 num_unclustered = 0 for rec in SeqIO.parse(proteins_file, 'fasta'): num_total += 1 prot_id = re.sub(r'\|.*', '', rec.id) if not prot_id in clustered: num_unclustered += 1 SeqIO.write(rec, out_fh, 'fasta') logging.debug( 'Finished writing unclustered proteins'.format(num_unclustered)) return (num_unclustered, num_total) # --------------------------------------------------
Example #29
Source File: solution.py From bioinformatics_primer with MIT License | 5 votes |
def main(): """main""" args = get_args() out_dir = args.outdir if out_dir and not os.path.isdir(out_dir): os.makedirs(out_dir) for fnum, file in enumerate(args.file): if not os.path.isfile(file): warn('"{}" is not a file'.format(file)) continue filename = os.path.basename(file) base, ext = os.path.splitext(filename) forward = open(os.path.join(out_dir, base + '_1' + ext), 'wt') reverse = open(os.path.join(out_dir, base + '_2' + ext), 'wt') print("{:3d}: {}".format(fnum + 1, filename)) num_seqs = 0 for i, rec in enumerate(SeqIO.parse(file, 'fasta')): SeqIO.write(rec, forward if i % 2 == 0 else reverse, 'fasta') num_seqs += 1 print('\tSplit {:,d} sequences to dir "{}"'.format(num_seqs, out_dir)) # --------------------------------------------------
Example #30
Source File: align.py From augur with GNU Affero General Public License v3.0 | 5 votes |
def write_seqs(seqs, fname): """A wrapper around SeqIO.write with error handling""" try: SeqIO.write(seqs, fname, 'fasta') except FileNotFoundError: raise AlignmentError('ERROR: Couldn\'t write "{}" -- perhaps the directory doesn\'t exist?'.format(fname))