Python Bio.SeqIO.write() Examples

The following are 30 code examples of Bio.SeqIO.write(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Bio.SeqIO , or try the search function .
Example #1
Source File: prep-genome.py    From kevlar with MIT License 7 votes vote down vote up
def main(args):
    for record in SeqIO.parse(args.infile, 'fasta'):
        if args.discard:
            if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0:
                continue

        subseqcounter = 0
        printlog(args.debug, "DEBUG: convert to upper case", record.id)
        sequence = str(record.seq).upper()
        printlog(args.debug, "DEBUG: split seq by Ns", record.id)
        subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength]
        printlog(args.debug, "DEBUG: print subseqs", record.id)
        for subseq in subseqs:
            subseqcounter += 1
            subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter)
            subrecord = SeqRecord(Seq(subseq), subid, '', '')
            SeqIO.write(subrecord, args.outfile, 'fasta') 
Example #2
Source File: fasta.py    From ssbio with MIT License 6 votes vote down vote up
def write_fasta_file(seq_records, outname, outdir=None, outext='.faa', force_rerun=False):
    """Write a FASTA file for a SeqRecord or a list of SeqRecord objects.

    Args:
        seq_records (SeqRecord, list): SeqRecord or a list of SeqRecord objects
        outname: Name of the output file which will have outext appended to it
        outdir: Path to directory to output sequences to
        outext: Extension of FASTA file, default ".faa"
        force_rerun: If file should be overwritten if it exists

    Returns:
        str: Path to output FASTA file.

    """

    if not outdir:
        outdir = ''
    outfile = ssbio.utils.outfile_maker(inname='', outname=outname, outdir=outdir, outext=outext)

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        SeqIO.write(seq_records, outfile, "fasta")

    return outfile 
Example #3
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made."""
        aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None)
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly"
        # Alignment file should be unchanged
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys() 
Example #4
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference
        is added to the existing alignment and no other changes are made."""
        aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file)
        assert ref_name == ref_seq.id, "Didn't return strain name from refrence file"
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment"
        # Alignment file should have the reference added
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys() 
Example #5
Source File: depth_calculator.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def depth_summary(all_genes,prefix):
    """Given the coding.depth file, calculate the average depth across the recovered sequence."""
    depths = {g:0 for g in all_genes}
    gene = "ZZZZ"
    gene_depths = []
    for line in open("coding.depth"):
        line = line.split()
        if line[0].split("-")[-1] == gene:
            gene_depths.append(int(line[2]))
        else:
            if gene == "ZZZZ":
                gene = line[0].split("-")[-1]
                gene_depths = [int(line[2])]
            else:
                mean_depth = sum(gene_depths)/float(len(gene_depths))
                depths[gene] = mean_depth
                gene_depths = []
            gene = line[0].split("-")[-1]
            
    output_list = [str(depths[key]) for key in sorted(depths)]                    
    sys.stdout.write("{}\t{}\n".format(prefix,"\t".join(output_list))) 
Example #6
Source File: paralog_investigator.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def extract_paralogs(gene,prefix):
    
    putative_paralog_ids = list(set([x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))]))
    try:
        chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip()
    except IOError:
        return 0
    
    exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta'))
    
    if not os.path.isdir(os.path.join(gene,prefix,'paralogs')):
        os.mkdir(os.path.join(gene,prefix,"paralogs"))
    seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids]
    
    for seq in range(len(seqs_to_write)):
        if seqs_to_write[seq].id == chosen_paralog:
            seqs_to_write[seq].id = "{}.{}".format(prefix,"main")
            
        else:
            seqs_to_write[seq].id = "{}.{}".format(prefix,seq)
    
    SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta')
    
    return len(seqs_to_write) 
Example #7
Source File: protein.py    From ssbio with MIT License 6 votes vote down vote up
def write_all_sequences_file(self, outname, outdir=None):
        """Write all the stored sequences as a single FASTA file. By default, sets IDs to model gene IDs.

        Args:
            outname (str): Name of the output FASTA file without the extension
            outdir (str): Path to output directory for the file, default is the sequences directory

        """

        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')

        outfile = op.join(outdir, outname + '.faa')
        SeqIO.write(self.sequences, outfile, "fasta")

        log.info('{}: wrote all protein sequences to file'.format(outfile))
        return outfile 
Example #8
Source File: write.py    From phylopandas with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _write_doc_template(schema):
    s = """Write to {} format.

    Parameters
    ----------
    filename : str
        File to write {} string to. If no filename is given, a {} string
        will be returned.

    sequence_col : str (default='sequence')
        Sequence column name in DataFrame.

    id_col : str (default='id')
        ID column name in DataFrame

    id_only : bool (default=False)
        If True, use only the ID column to label sequences in fasta.
    """.format(schema, schema, schema)
    return s 
Example #9
Source File: seq.py    From wub with Mozilla Public License 2.0 6 votes vote down vote up
def base_complement(k):
    """ Return complement of base.

    Performs the subsitutions: A<=>T, C<=>G, X=>X for both upper and lower
    case. The return value is identical to the argument for all other values.

    :param k: A base.
    :returns: Complement of base.
    :rtype: str

    """
    try:
        return comp[k]
    except KeyError:
        sys.stderr.write(
            "WARNING: No reverse complement for {} found, returning argument.".format(k))
        return k 
Example #10
Source File: fasta_merge.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def concatenate_sequences(gene_dict,fastafiles,unique_names):
    '''Given a dictionary of dictionaries with complete sampling in each gene, write out concatenated sequences to stdout. Returns a list of partition lengths.'''    
    new_seq_dict = {}
    partition_lengths = []
    for gene in fastafiles:
        for name in unique_names:
            try:
                new_seq_dict[name] += gene_dict[gene][name]
            except KeyError:
                new_seq_dict[name] = gene_dict[gene][name]
        partition_lengths.append(len(next(iter(gene_dict[gene].values()))))
    for final_seq in new_seq_dict:
        SeqIO.write(new_seq_dict[final_seq],sys.stdout,'fasta')            
    final_seq_length = len(new_seq_dict[final_seq])
    sys.stderr.write("Final conatenated sequence length: {}\n".format(final_seq_length))
    return partition_lengths 
Example #11
Source File: fasta_merge.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def raxml_partition(fastafiles,partition_lengths,partition_type):
    '''Generate a raxml partition file for the given fastafiles. User specifies the partition type'''
    gene_start = 1
    partition_file = open("partition.raxml",'w')
    
    if partition_type == 'CODON':
        for g in range(len(fastafiles)):
            codon3_start = gene_start + 2
            codon3_end = gene_start + partition_lengths[g] - 1
            codon1_end = codon3_end - 2
            codon2_start = gene_start + 1
            codon2_end = codon3_end - 1
            partition_file.write("{},{}{}={}-{}\\3,{}-{}\\3\n".format("DNA",fastafiles[g],"12",gene_start,codon1_end,codon2_start,codon2_end))
            partition_file.write("{},{}{}={}-{}\\3\n".format("DNA",fastafiles[g],"3",codon3_start,codon3_end))
            gene_start = codon3_end + 1
    else:
        for g in range(len(fastafiles)):
            gene_end = gene_start + partition_lengths[g] - 1
            partition_file.write("{},{}={}-{}\n".format(partition_type,fastafiles[g],gene_start,gene_end))
            gene_start = gene_end + 1
        partition_file.close() 
Example #12
Source File: exonerate_hits.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def supercontig_exonerate(supercontig,protseq,prefix,thresh=55):
    """Given a long, joined contig and a protein sequence, return the exonerate hit(s)"""
    logger = logging.getLogger("pipeline")

    exonerate_ryo = '>%ti,%qi,%qab,%qae,%pi,(%tS)\\n%tcs\\n'
    temp_prot_filename = "%s/temp.prot.fa"%prefix
    temp_contig_filename =  "%s/temp.contig.fa"%prefix
    SeqIO.write(protseq,temp_prot_filename,'fasta')
    SeqIO.write(supercontig,temp_contig_filename,'fasta')
    logger.debug("Conducting exonerate search on supercontig")
    proc = subprocess.Popen(['exonerate','-m','protein2genome','--showalignment','no','-V','0','--showvulgar','no','--ryo',exonerate_ryo,temp_prot_filename,temp_contig_filename],stdout=subprocess.PIPE,universal_newlines=True)

    proc.wait()
    #print proc.stdout.readlines()
    supercontig_cds = [i for i in SeqIO.parse(proc.stdout,'fasta') if float(i.id.split(",")[4])>thresh]
    logger.debug("Supercontig lengths: %s" % " ".join([str(len(x.seq)) for x in supercontig_cds]))
    return supercontig_cds 
Example #13
Source File: fastx_grep.py    From bioinformatics_primer with MIT License 6 votes vote down vote up
def main():
    """Make a jazz noise here"""

    args = get_args()
    regex = re.compile(args.pattern)
    out_fh = args.outfile or sys.stdout
    checked, took = 0, 0

    for fh in args.file:
        in_format = args.format or guess_format(fh.name)
        for rec in SeqIO.parse(fh, in_format):
            checked += 1
            out_fmt = args.out_format or args.format
            if any(map(regex.search, [rec.id, rec.description])):
                took += 1
                SeqIO.write(rec, out_fh, args.out_format or in_format)

    print(f'Done, checked {checked}, took {took}.', file=sys.stderr)


# -------------------------------------------------- 
Example #14
Source File: intronerate.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def make_intron_supercontig(contig_info,gene,prefix,add_N = False):
    cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta'))
    intron_supercontig = SeqRecord(Seq(''))
    for i in contig_info:
        if i[5] == "(+)":
            intron_supercontig += cap3contigs[i[0]]
        elif i[5] == "(-)":
            intron_supercontig += cap3contigs[i[0]].reverse_complement()    
        else:
            sys.stderr.write("Strandedness not found!")
            sys.exit(1)
        if add_N and i != contig_info[-1]:
            intron_supercontig += "NNNNNNNNNN"    
    intron_supercontig.id = '{}-{}'.format(prefix,gene)
    intron_supercontig.description = ''
    SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta') 
Example #15
Source File: distribute_targets.py    From HybPiper with GNU General Public License v3.0 6 votes vote down vote up
def main():
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-d","--delimiter",help="Field separating FASTA ids for multiple sequences per target. Default is '-' . For no delimeter, write None", default="-")
    parser.add_argument("baitfile",help="FASTA file containing bait sequences")
    parser.add_argument("--blastx",help="tabular blastx results file, used to select the best target for each gene",default=None)
    parser.add_argument("--bam",help="BAM file from BWA search, alternative to the BLASTx method",default=None)
    parser.add_argument("--target",help="Choose this version of the target always",default=None)
    parser.add_argument("--unpaired",help="Indicate whether to expect a file containing results from unpaired reads.",action="store_true",default=False)
    parser.add_argument("--exclude",help="Do not use any sequence with the specified string as the chosen target.",default=None)
    args = parser.parse_args()
    
    if args.blastx:
        besthits = tailored_target_blast(args.blastx,args.exclude)
        translate = False            
    if args.bam:
        translate = True
        besthits = tailored_target_bwa(args.bam,args.unpaired,args.exclude)
    distribute_targets(args.baitfile,dirs=True,delim=args.delimiter,besthits=besthits,translate=translate,target=args.target) 
Example #16
Source File: app.py    From InSilicoSeq with MIT License 6 votes vote down vote up
def model_from_bam(args):
    """Main function for the `iss model` submodule

    This submodule write all variables necessary for building an ErrorModel
    to args.output + .npz

    Args:
        args (object): the command-line arguments from argparse
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    try:  # try to import bam module and write model data to file
        logger.info('Starting iss model')
        from iss import bam
    except ImportError as e:
        logger.error('Failed to import bam module: %s' % e)
        sys.exit(1)
    else:
        logger.info('Using KDE ErrorModel')
        bam.to_model(args.bam, args.output)
        logger.info('Model generation complete') 
Example #17
Source File: exonerate_hits.py    From HybPiper with GNU General Public License v3.0 5 votes vote down vote up
def write_exonerate_stats(contig_id_list,prefix):
    '''Given a list of IDs from initial exonerate search, write info to a standard file'''
    with open("{}/exonerate_stats.csv".format(prefix),'w') as exonerate_statsfile:
        exonerate_statsfile.write("\n".join(contig_id_list)+'\n') 
Example #18
Source File: exonerate_hits.py    From HybPiper with GNU General Public License v3.0 5 votes vote down vote up
def paralog_test(exonerate_hits,prot,prefix):
    """Gives a warning if there are multiple hits of long length to the same protein"""
    logger = logging.getLogger("pipeline")
    protlength = len(prot)
    hitlengths = [abs(int(x.split(",")[2]) - int(x.split(",")[3])) for x in exonerate_hits["assemblyHits"]]
    logger.debug("protein length: {}".format(protlength))
    logger.debug("Hit lengths:")
    logger.debug(hitlengths)
    longhits = [x > 0.75*protlength for x in hitlengths]
    if sum(longhits) > 1:
        sys.stderr.write("WARNING: Multiple long-length exonerate hits for {}. Check for paralogs!\n".format(prot.id))
        with open("{}/paralog_warning.txt".format(prefix),'w') as pw:
            for hit in range(len(exonerate_hits["assemblyHits"])):
                if longhits[hit]:
                    pw.write(prot.id+ "\t"+exonerate_hits["assemblyHits"][hit] + "\n") 
Example #19
Source File: exonerate_hits.py    From HybPiper with GNU General Public License v3.0 5 votes vote down vote up
def report_no_sequences(protname):
    sys.stderr.write("No valid sequences remain for {}!\n".format(protname)) 
Example #20
Source File: solution.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def main():
    args    = get_args()
    exclude = get_excluded(args.exclude, args.summary)
    out_dir = args.out_dir

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    tax_id = dict()
    with open(args.summary, 'r') as sum_fh:
        sum_hdr = read_split(sum_fh.readline())
        for line in sum_fh:
            #info = dict(zip(sum_hdr, read_split(line)))
            #tax_id[ info['readID'] ] = info['taxID']
            dat = read_split(line)
            tax_id[ dat[0] ] = dat[2]

    took       = 0
    skipped    = 0
    basename   = os.path.basename(args.fasta)
    out_file   = os.path.join(out_dir, basename)
    exclude_fh = open(os.path.join(args.exclude_dir, basename), 'wt') \
                 if args.exclude_dir else None

    with open(out_file, 'w') as out_fh:
        for seq in SeqIO.parse(args.fasta, "fasta"):
            tax = tax_id[ seq.id ] if seq.id in tax_id else '0'
            if tax in exclude:
                skipped += 1
                if not exclude_fh is None:
                    SeqIO.write(seq, exclude_fh, "fasta")
            else:
                took += 1
                SeqIO.write(seq, out_fh, "fasta")

    print("Done, took {}, skipped {}, see output {}".format(
        took, skipped, out_file))

# -------------------------------------------------- 
Example #21
Source File: distribute_targets.py    From HybPiper with GNU General Public License v3.0 5 votes vote down vote up
def tailored_target_blast(blastxfilename,exclude=None):
    """Determine, for each protein, the 'best' target protein, by tallying up the blastx hit scores."""
    blastxfile = open(blastxfilename)
    
    hitcounts = {}
    for result in blastxfile:
        result = result.split()
        hitname = result[1].split("-")
        bitscore = float(result[-1])
        try: 
            protname = hitname[1]
        except IndexError:
            raise IndexError("Gene name not found! FASTA headers should be formatted like this:\n >SpeciesName-GeneName\n")
        taxon = hitname[0]
        if exclude and exclude in taxon:
            continue
        else:
            if protname in hitcounts:
                if taxon in hitcounts[protname]:
                    hitcounts[protname][taxon] += bitscore
                else:
                    hitcounts[protname][taxon] = bitscore
            else:
                hitcounts[protname] = {taxon:1}
    #For each protein, find the taxon with the highest total hit bitscore.
    besthits = {}
    besthit_counts = {}
    for prot in hitcounts:
        top_taxon = sorted(iter(hitcounts[prot].keys()), key = lambda k: hitcounts[prot][k], reverse=True)[0]
        besthits[prot] = top_taxon
        if top_taxon in besthit_counts:
            besthit_counts[top_taxon] += 1
        else:
            besthit_counts[top_taxon] = 1
    tallyfile = open("bait_tallies.txt",'w')
    for x in besthit_counts:
        tallyfile.write("{}\t{}\n".format(x, besthit_counts[x]))
    tallyfile.close()
    return besthits 
Example #22
Source File: depth_calculator.py    From HybPiper with GNU General Public License v3.0 5 votes vote down vote up
def merge_seqs(genelist,prefix,file_type="coding"):
    '''Given a list of gene sequences, retreive the sequences and generate a single FASTA file.'''
    with open("{}_sequences.fasta".format(file_type),"w") as outfile:
        for gene in genelist:
            if file_type == "coding":
                seqfile = "{}/{}/sequences/FNA/{}.FNA".format(gene,prefix,gene)
            else:
                seqfile = "{}/{}/sequences/intron/{}_supercontig.fasta".format(gene,prefix,gene)    
            
            seq = SeqIO.read(seqfile,'fasta')
            seq.id = seq.id + "-" + gene
            SeqIO.write(seq,outfile,'fasta') 
Example #23
Source File: solution.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def get_args():
    parser = argparse.ArgumentParser(description='Filter FASTA with Centrifuge')
    parser.add_argument('-f', '--fasta', help='fasta file',
            type=str, metavar='FILE', required=True)
    parser.add_argument('-s', '--summary', help='Centrifuge summary file',
            type=str, metavar='FILE', required=True)
    parser.add_argument('-e', '--exclude', metavar='IDS_NAMES', required=True,
            help='Comma-separated list of taxIDs/names to exclude')
    parser.add_argument('-o', '--out_dir', help='Output directory',
            type=str, metavar='DIR', default='filtered')
    parser.add_argument('-x', '--exclude_dir', metavar='DIR', required=True,
            help='File name to write excluded')
    return parser.parse_args()

# -------------------------------------------------- 
Example #24
Source File: solution.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def main():
    """Make a jazz noise here"""
    args = get_args()
    out_dir = args.outdir
    pct_gc = args.pct_gc

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    if not 0 < pct_gc <= 100:
        die('--pct_gc "{}" must be between 0 and 100'.format(pct_gc))

    num_seqs = 0
    for i, file in enumerate(args.fasta, start=1):
        if not os.path.isfile(file):
            warn('"{}" is not a file'.format(file))
            continue

        print('{:3}: {}'.format(i, os.path.basename(file)))

        base, ext = os.path.splitext(os.path.basename(file))
        high_file = os.path.join(out_dir, ''.join([base, '_high', ext]))
        low_file = os.path.join(out_dir, ''.join([base, '_low', ext]))

        high_fh = open(high_file, 'wt')
        low_fh = open(low_file, 'wt')

        for rec in SeqIO.parse(file, 'fasta'):
            num_seqs += 1
            bases = Counter(rec.seq.upper())
            gc = bases.get('G', 0) + bases.get('C', 0)
            pct = int((gc / len(rec.seq)) * 100)
            SeqIO.write(rec, low_fh if pct < pct_gc else high_fh, 'fasta')

    print('Done, wrote {} sequence{} to out dir "{}"'.format(
        num_seqs, '' if num_seqs == 1 else 's', out_dir))


# -------------------------------------------------- 
Example #25
Source File: solution.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def main():
    """main"""
    args = get_args()
    input_file = args.input
    out_file = args.output
    keyword = args.keyword.lower()
    skip = set(map(str.lower, args.skip))

    if not os.path.isfile(input_file):
        die('"{}" is not a file'.format(input_file))

    print('Processing "{}"'.format(input_file))

    num_skipped = 0
    num_taken = 0
    with open(out_file, "w") as out_fh:
        for record in SeqIO.parse(input_file, "swiss"):
            annot = record.annotations
            if skip and 'taxonomy' in annot:
                taxa = set(map(str.lower, annot['taxonomy']))
                if skip.intersection(taxa):
                    num_skipped += 1
                    continue

            if 'keywords' in annot:
                kw = set(map(str.lower, annot['keywords']))

                if keyword in kw:
                    num_taken += 1
                    SeqIO.write(record, out_fh, 'fasta')
                else:
                    num_skipped += 1

    print('Done, skipped {} and took {}. See output in "{}".'.format(
        num_skipped, num_taken, out_file))


# -------------------------------------------------- 
Example #26
Source File: solution.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def main():
    """Make a jazz noise here"""
    args = get_args()
    out_dir = args.outdir

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    ext = args.extension
    if not ext.startswith('.'):
        ext = '.' + ext

    for i, fq in enumerate(args.fastq, start=1):
        basename = os.path.basename(fq)
        root, _ = os.path.splitext(basename)
        print('{:3}: {}'.format(i, basename))

        out_file = os.path.join(out_dir, root + ext)
        out_fh = open(out_file, 'wt')

        for record in SeqIO.parse(fq, 'fastq'):
            SeqIO.write(record, out_fh, 'fasta')

    print('Done, see output in "{}".'.format(out_dir))

# -------------------------------------------------- 
Example #27
Source File: solution.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def process(file, input_format, output_format, out_dir, seqs_per_file):
    """
    Spilt file into smaller files into out_dir
    Optionally convert to output format
    Return number of sequences written
    """
    if not os.path.isfile(file):
        warn('"{}" is not valid'.format(file))
        return 0

    basename, ext = os.path.splitext(os.path.basename(file))
    out_fh = None
    i = 0
    num_written = 0
    nfile = 0
    for record in SeqIO.parse(file, input_format):
        if i == seqs_per_file:
            i = 0
            if out_fh is not None:
                out_fh.close()
                out_fh = None

        i += 1
        num_written += 1
        if out_fh is None:
            nfile += 1
            path = os.path.join(out_dir,
                                basename + '.' + '{:04d}'.format(nfile) + ext)
            out_fh = open(path, 'wt')

        SeqIO.write(record, out_fh, output_format)

    return num_written


# -------------------------------------------------- 
Example #28
Source File: cdhit_unclustered.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def get_unclustered(cluster_file, proteins_file, out_file):
    """Find the unclustered proteins in the cd-hit output"""

    if not os.path.isfile(cluster_file):
        die('cdhit "{}" is not a file'.format(cluster_file))

    logging.debug('Parsing "{}"'.format(cluster_file))

    clustered = set([rec.id for rec in SeqIO.parse(cluster_file, 'fasta')])

    # Alternate (longer) way:
    # clustered = set()
    # for rec in SeqIO.parse(cluster_file, 'fasta'):
    #     clustered.add(rec.id)

    logging.debug('Will write to "{}"'.format(out_file))
    out_fh = open(out_file, 'wt')
    num_total = 0
    num_unclustered = 0

    for rec in SeqIO.parse(proteins_file, 'fasta'):
        num_total += 1
        prot_id = re.sub(r'\|.*', '', rec.id)
        if not prot_id in clustered:
            num_unclustered += 1
            SeqIO.write(rec, out_fh, 'fasta')

    logging.debug(
        'Finished writing unclustered proteins'.format(num_unclustered))

    return (num_unclustered, num_total)


# -------------------------------------------------- 
Example #29
Source File: solution.py    From bioinformatics_primer with MIT License 5 votes vote down vote up
def main():
    """main"""
    args = get_args()
    out_dir = args.outdir

    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for fnum, file in enumerate(args.file):
        if not os.path.isfile(file):
            warn('"{}" is not a file'.format(file))
            continue

        filename = os.path.basename(file)
        base, ext = os.path.splitext(filename)
        forward = open(os.path.join(out_dir, base + '_1' + ext), 'wt')
        reverse = open(os.path.join(out_dir, base + '_2' + ext), 'wt')

        print("{:3d}: {}".format(fnum + 1, filename))

        num_seqs = 0
        for i, rec in enumerate(SeqIO.parse(file, 'fasta')):
            SeqIO.write(rec, forward if i % 2 == 0 else reverse, 'fasta')
            num_seqs += 1

        print('\tSplit {:,d} sequences to dir "{}"'.format(num_seqs, out_dir))


# -------------------------------------------------- 
Example #30
Source File: align.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def write_seqs(seqs, fname):
    """A wrapper around SeqIO.write with error handling"""
    try:
        SeqIO.write(seqs, fname, 'fasta')
    except FileNotFoundError:
        raise AlignmentError('ERROR: Couldn\'t write "{}" -- perhaps the directory doesn\'t exist?'.format(fname))