Python Examples of Bio.SeqIO.write

Source File: prep-genome.py From kevlar with MIT License

7 votes

def main(args):
    for record in SeqIO.parse(args.infile, 'fasta'):
        if args.discard:
            if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0:
                continue

        subseqcounter = 0
        printlog(args.debug, "DEBUG: convert to upper case", record.id)
        sequence = str(record.seq).upper()
        printlog(args.debug, "DEBUG: split seq by Ns", record.id)
        subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength]
        printlog(args.debug, "DEBUG: print subseqs", record.id)
        for subseq in subseqs:
            subseqcounter += 1
            subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter)
            subrecord = SeqRecord(Seq(subseq), subid, '', '')
            SeqIO.write(subrecord, args.outfile, 'fasta')

Source File: fasta.py From ssbio with MIT License

6 votes

def write_fasta_file(seq_records, outname, outdir=None, outext='.faa', force_rerun=False):
    """Write a FASTA file for a SeqRecord or a list of SeqRecord objects.

    Args:
        seq_records (SeqRecord, list): SeqRecord or a list of SeqRecord objects
        outname: Name of the output file which will have outext appended to it
        outdir: Path to directory to output sequences to
        outext: Extension of FASTA file, default ".faa"
        force_rerun: If file should be overwritten if it exists

    Returns:
        str: Path to output FASTA file.

    """

    if not outdir:
        outdir = ''
    outfile = ssbio.utils.outfile_maker(inname='', outname=outname, outdir=outdir, outext=outext)

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        SeqIO.write(seq_records, outfile, "fasta")

    return outfile

Source File: test_align.py From augur with GNU Affero General Public License v3.0

6 votes

def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made."""
        aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None)
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly"
        # Alignment file should be unchanged
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys()

Source File: test_align.py From augur with GNU Affero General Public License v3.0

6 votes

def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference
        is added to the existing alignment and no other changes are made."""
        aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file)
        assert ref_name == ref_seq.id, "Didn't return strain name from refrence file"
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment"
        # Alignment file should have the reference added
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys()

Source File: depth_calculator.py From HybPiper with GNU General Public License v3.0

6 votes

def depth_summary(all_genes,prefix):
    """Given the coding.depth file, calculate the average depth across the recovered sequence."""
    depths = {g:0 for g in all_genes}
    gene = "ZZZZ"
    gene_depths = []
    for line in open("coding.depth"):
        line = line.split()
        if line[0].split("-")[-1] == gene:
            gene_depths.append(int(line[2]))
        else:
            if gene == "ZZZZ":
                gene = line[0].split("-")[-1]
                gene_depths = [int(line[2])]
            else:
                mean_depth = sum(gene_depths)/float(len(gene_depths))
                depths[gene] = mean_depth
                gene_depths = []
            gene = line[0].split("-")[-1]
            
    output_list = [str(depths[key]) for key in sorted(depths)]                    
    sys.stdout.write("{}\t{}\n".format(prefix,"\t".join(output_list)))

Source File: paralog_investigator.py From HybPiper with GNU General Public License v3.0

6 votes

def extract_paralogs(gene,prefix):
    
    putative_paralog_ids = list(set([x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))]))
    try:
        chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip()
    except IOError:
        return 0
    
    exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta'))
    
    if not os.path.isdir(os.path.join(gene,prefix,'paralogs')):
        os.mkdir(os.path.join(gene,prefix,"paralogs"))
    seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids]
    
    for seq in range(len(seqs_to_write)):
        if seqs_to_write[seq].id == chosen_paralog:
            seqs_to_write[seq].id = "{}.{}".format(prefix,"main")
            
        else:
            seqs_to_write[seq].id = "{}.{}".format(prefix,seq)
    
    SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta')
    
    return len(seqs_to_write)

Source File: protein.py From ssbio with MIT License

6 votes

def write_all_sequences_file(self, outname, outdir=None):
        """Write all the stored sequences as a single FASTA file. By default, sets IDs to model gene IDs.

        Args:
            outname (str): Name of the output FASTA file without the extension
            outdir (str): Path to output directory for the file, default is the sequences directory

        """

        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')

        outfile = op.join(outdir, outname + '.faa')
        SeqIO.write(self.sequences, outfile, "fasta")

        log.info('{}: wrote all protein sequences to file'.format(outfile))
        return outfile

Source File: write.py From phylopandas with BSD 3-Clause "New" or "Revised" License

6 votes

def _write_doc_template(schema):
    s = """Write to {} format.

    Parameters
    ----------
    filename : str
        File to write {} string to. If no filename is given, a {} string
        will be returned.

    sequence_col : str (default='sequence')
        Sequence column name in DataFrame.

    id_col : str (default='id')
        ID column name in DataFrame

    id_only : bool (default=False)
        If True, use only the ID column to label sequences in fasta.
    """.format(schema, schema, schema)
    return s

Source File: seq.py From wub with Mozilla Public License 2.0

6 votes

def base_complement(k):
    """ Return complement of base.

    Performs the subsitutions: A<=>T, C<=>G, X=>X for both upper and lower
    case. The return value is identical to the argument for all other values.

    :param k: A base.
    :returns: Complement of base.
    :rtype: str

    """
    try:
        return comp[k]
    except KeyError:
        sys.stderr.write(
            "WARNING: No reverse complement for {} found, returning argument.".format(k))
        return k

Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0

6 votes

def concatenate_sequences(gene_dict,fastafiles,unique_names):
    '''Given a dictionary of dictionaries with complete sampling in each gene, write out concatenated sequences to stdout. Returns a list of partition lengths.'''    
    new_seq_dict = {}
    partition_lengths = []
    for gene in fastafiles:
        for name in unique_names:
            try:
                new_seq_dict[name] += gene_dict[gene][name]
            except KeyError:
                new_seq_dict[name] = gene_dict[gene][name]
        partition_lengths.append(len(next(iter(gene_dict[gene].values()))))
    for final_seq in new_seq_dict:
        SeqIO.write(new_seq_dict[final_seq],sys.stdout,'fasta')            
    final_seq_length = len(new_seq_dict[final_seq])
    sys.stderr.write("Final conatenated sequence length: {}\n".format(final_seq_length))
    return partition_lengths

Source File: fasta_merge.py From HybPiper with GNU General Public License v3.0

6 votes

def raxml_partition(fastafiles,partition_lengths,partition_type):
    '''Generate a raxml partition file for the given fastafiles. User specifies the partition type'''
    gene_start = 1
    partition_file = open("partition.raxml",'w')
    
    if partition_type == 'CODON':
        for g in range(len(fastafiles)):
            codon3_start = gene_start + 2
            codon3_end = gene_start + partition_lengths[g] - 1
            codon1_end = codon3_end - 2
            codon2_start = gene_start + 1
            codon2_end = codon3_end - 1
            partition_file.write("{},{}{}={}-{}\\3,{}-{}\\3\n".format("DNA",fastafiles[g],"12",gene_start,codon1_end,codon2_start,codon2_end))
            partition_file.write("{},{}{}={}-{}\\3\n".format("DNA",fastafiles[g],"3",codon3_start,codon3_end))
            gene_start = codon3_end + 1
    else:
        for g in range(len(fastafiles)):
            gene_end = gene_start + partition_lengths[g] - 1
            partition_file.write("{},{}={}-{}\n".format(partition_type,fastafiles[g],gene_start,gene_end))
            gene_start = gene_end + 1
        partition_file.close()

Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0

6 votes

def supercontig_exonerate(supercontig,protseq,prefix,thresh=55):
    """Given a long, joined contig and a protein sequence, return the exonerate hit(s)"""
    logger = logging.getLogger("pipeline")

    exonerate_ryo = '>%ti,%qi,%qab,%qae,%pi,(%tS)\\n%tcs\\n'
    temp_prot_filename = "%s/temp.prot.fa"%prefix
    temp_contig_filename =  "%s/temp.contig.fa"%prefix
    SeqIO.write(protseq,temp_prot_filename,'fasta')
    SeqIO.write(supercontig,temp_contig_filename,'fasta')
    logger.debug("Conducting exonerate search on supercontig")
    proc = subprocess.Popen(['exonerate','-m','protein2genome','--showalignment','no','-V','0','--showvulgar','no','--ryo',exonerate_ryo,temp_prot_filename,temp_contig_filename],stdout=subprocess.PIPE,universal_newlines=True)

    proc.wait()
    #print proc.stdout.readlines()
    supercontig_cds = [i for i in SeqIO.parse(proc.stdout,'fasta') if float(i.id.split(",")[4])>thresh]
    logger.debug("Supercontig lengths: %s" % " ".join([str(len(x.seq)) for x in supercontig_cds]))
    return supercontig_cds

Source File: fastx_grep.py From bioinformatics_primer with MIT License

6 votes

def main():
    """Make a jazz noise here"""

    args = get_args()
    regex = re.compile(args.pattern)
    out_fh = args.outfile or sys.stdout
    checked, took = 0, 0

    for fh in args.file:
        in_format = args.format or guess_format(fh.name)
        for rec in SeqIO.parse(fh, in_format):
            checked += 1
            out_fmt = args.out_format or args.format
            if any(map(regex.search, [rec.id, rec.description])):
                took += 1
                SeqIO.write(rec, out_fh, args.out_format or in_format)

    print(f'Done, checked {checked}, took {took}.', file=sys.stderr)


# --------------------------------------------------

Source File: intronerate.py From HybPiper with GNU General Public License v3.0

6 votes

def make_intron_supercontig(contig_info,gene,prefix,add_N = False):
    cap3contigs = SeqIO.to_dict(SeqIO.parse("../{}_contigs.fasta".format(gene),'fasta'))
    intron_supercontig = SeqRecord(Seq(''))
    for i in contig_info:
        if i[5] == "(+)":
            intron_supercontig += cap3contigs[i[0]]
        elif i[5] == "(-)":
            intron_supercontig += cap3contigs[i[0]].reverse_complement()    
        else:
            sys.stderr.write("Strandedness not found!")
            sys.exit(1)
        if add_N and i != contig_info[-1]:
            intron_supercontig += "NNNNNNNNNN"    
    intron_supercontig.id = '{}-{}'.format(prefix,gene)
    intron_supercontig.description = ''
    SeqIO.write(intron_supercontig,'sequences/intron/{}_supercontig.fasta'.format(gene),'fasta')

Source File: distribute_targets.py From HybPiper with GNU General Public License v3.0

6 votes

def main():
    parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-d","--delimiter",help="Field separating FASTA ids for multiple sequences per target. Default is '-' . For no delimeter, write None", default="-")
    parser.add_argument("baitfile",help="FASTA file containing bait sequences")
    parser.add_argument("--blastx",help="tabular blastx results file, used to select the best target for each gene",default=None)
    parser.add_argument("--bam",help="BAM file from BWA search, alternative to the BLASTx method",default=None)
    parser.add_argument("--target",help="Choose this version of the target always",default=None)
    parser.add_argument("--unpaired",help="Indicate whether to expect a file containing results from unpaired reads.",action="store_true",default=False)
    parser.add_argument("--exclude",help="Do not use any sequence with the specified string as the chosen target.",default=None)
    args = parser.parse_args()
    
    if args.blastx:
        besthits = tailored_target_blast(args.blastx,args.exclude)
        translate = False            
    if args.bam:
        translate = True
        besthits = tailored_target_bwa(args.bam,args.unpaired,args.exclude)
    distribute_targets(args.baitfile,dirs=True,delim=args.delimiter,besthits=besthits,translate=translate,target=args.target)

Source File: app.py From InSilicoSeq with MIT License

6 votes

def model_from_bam(args):
    """Main function for the `iss model` submodule

    This submodule write all variables necessary for building an ErrorModel
    to args.output + .npz

    Args:
        args (object): the command-line arguments from argparse
    """
    logger = logging.getLogger(__name__)
    logger.debug('iss version %s' % __version__)
    logger.debug('Using verbose logger')

    try:  # try to import bam module and write model data to file
        logger.info('Starting iss model')
        from iss import bam
    except ImportError as e:
        logger.error('Failed to import bam module: %s' % e)
        sys.exit(1)
    else:
        logger.info('Using KDE ErrorModel')
        bam.to_model(args.bam, args.output)
        logger.info('Model generation complete')

Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0

5 votes

def write_exonerate_stats(contig_id_list,prefix):
    '''Given a list of IDs from initial exonerate search, write info to a standard file'''
    with open("{}/exonerate_stats.csv".format(prefix),'w') as exonerate_statsfile:
        exonerate_statsfile.write("\n".join(contig_id_list)+'\n')

Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0

5 votes

def paralog_test(exonerate_hits,prot,prefix):
    """Gives a warning if there are multiple hits of long length to the same protein"""
    logger = logging.getLogger("pipeline")
    protlength = len(prot)
    hitlengths = [abs(int(x.split(",")[2]) - int(x.split(",")[3])) for x in exonerate_hits["assemblyHits"]]
    logger.debug("protein length: {}".format(protlength))
    logger.debug("Hit lengths:")
    logger.debug(hitlengths)
    longhits = [x > 0.75*protlength for x in hitlengths]
    if sum(longhits) > 1:
        sys.stderr.write("WARNING: Multiple long-length exonerate hits for {}. Check for paralogs!\n".format(prot.id))
        with open("{}/paralog_warning.txt".format(prefix),'w') as pw:
            for hit in range(len(exonerate_hits["assemblyHits"])):
                if longhits[hit]:
                    pw.write(prot.id+ "\t"+exonerate_hits["assemblyHits"][hit] + "\n")

Source File: exonerate_hits.py From HybPiper with GNU General Public License v3.0

5 votes

def report_no_sequences(protname):
    sys.stderr.write("No valid sequences remain for {}!\n".format(protname))

Source File: solution.py From bioinformatics_primer with MIT License

5 votes

def main():
    args    = get_args()
    exclude = get_excluded(args.exclude, args.summary)
    out_dir = args.out_dir

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    tax_id = dict()
    with open(args.summary, 'r') as sum_fh:
        sum_hdr = read_split(sum_fh.readline())
        for line in sum_fh:
            #info = dict(zip(sum_hdr, read_split(line)))
            #tax_id[ info['readID'] ] = info['taxID']
            dat = read_split(line)
            tax_id[ dat[0] ] = dat[2]

    took       = 0
    skipped    = 0
    basename   = os.path.basename(args.fasta)
    out_file   = os.path.join(out_dir, basename)
    exclude_fh = open(os.path.join(args.exclude_dir, basename), 'wt') \
                 if args.exclude_dir else None

    with open(out_file, 'w') as out_fh:
        for seq in SeqIO.parse(args.fasta, "fasta"):
            tax = tax_id[ seq.id ] if seq.id in tax_id else '0'
            if tax in exclude:
                skipped += 1
                if not exclude_fh is None:
                    SeqIO.write(seq, exclude_fh, "fasta")
            else:
                took += 1
                SeqIO.write(seq, out_fh, "fasta")

    print("Done, took {}, skipped {}, see output {}".format(
        took, skipped, out_file))

# --------------------------------------------------

Source File: distribute_targets.py From HybPiper with GNU General Public License v3.0

5 votes

def tailored_target_blast(blastxfilename,exclude=None):
    """Determine, for each protein, the 'best' target protein, by tallying up the blastx hit scores."""
    blastxfile = open(blastxfilename)
    
    hitcounts = {}
    for result in blastxfile:
        result = result.split()
        hitname = result[1].split("-")
        bitscore = float(result[-1])
        try: 
            protname = hitname[1]
        except IndexError:
            raise IndexError("Gene name not found! FASTA headers should be formatted like this:\n >SpeciesName-GeneName\n")
        taxon = hitname[0]
        if exclude and exclude in taxon:
            continue
        else:
            if protname in hitcounts:
                if taxon in hitcounts[protname]:
                    hitcounts[protname][taxon] += bitscore
                else:
                    hitcounts[protname][taxon] = bitscore
            else:
                hitcounts[protname] = {taxon:1}
    #For each protein, find the taxon with the highest total hit bitscore.
    besthits = {}
    besthit_counts = {}
    for prot in hitcounts:
        top_taxon = sorted(iter(hitcounts[prot].keys()), key = lambda k: hitcounts[prot][k], reverse=True)[0]
        besthits[prot] = top_taxon
        if top_taxon in besthit_counts:
            besthit_counts[top_taxon] += 1
        else:
            besthit_counts[top_taxon] = 1
    tallyfile = open("bait_tallies.txt",'w')
    for x in besthit_counts:
        tallyfile.write("{}\t{}\n".format(x, besthit_counts[x]))
    tallyfile.close()
    return besthits

Source File: depth_calculator.py From HybPiper with GNU General Public License v3.0

5 votes

def merge_seqs(genelist,prefix,file_type="coding"):
    '''Given a list of gene sequences, retreive the sequences and generate a single FASTA file.'''
    with open("{}_sequences.fasta".format(file_type),"w") as outfile:
        for gene in genelist:
            if file_type == "coding":
                seqfile = "{}/{}/sequences/FNA/{}.FNA".format(gene,prefix,gene)
            else:
                seqfile = "{}/{}/sequences/intron/{}_supercontig.fasta".format(gene,prefix,gene)    
            
            seq = SeqIO.read(seqfile,'fasta')
            seq.id = seq.id + "-" + gene
            SeqIO.write(seq,outfile,'fasta')

Source File: solution.py From bioinformatics_primer with MIT License

5 votes

def get_args():
    parser = argparse.ArgumentParser(description='Filter FASTA with Centrifuge')
    parser.add_argument('-f', '--fasta', help='fasta file',
            type=str, metavar='FILE', required=True)
    parser.add_argument('-s', '--summary', help='Centrifuge summary file',
            type=str, metavar='FILE', required=True)
    parser.add_argument('-e', '--exclude', metavar='IDS_NAMES', required=True,
            help='Comma-separated list of taxIDs/names to exclude')
    parser.add_argument('-o', '--out_dir', help='Output directory',
            type=str, metavar='DIR', default='filtered')
    parser.add_argument('-x', '--exclude_dir', metavar='DIR', required=True,
            help='File name to write excluded')
    return parser.parse_args()

# --------------------------------------------------

Source File: solution.py From bioinformatics_primer with MIT License

5 votes

def main():
    """Make a jazz noise here"""
    args = get_args()
    out_dir = args.outdir
    pct_gc = args.pct_gc

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    if not 0 < pct_gc <= 100:
        die('--pct_gc "{}" must be between 0 and 100'.format(pct_gc))

    num_seqs = 0
    for i, file in enumerate(args.fasta, start=1):
        if not os.path.isfile(file):
            warn('"{}" is not a file'.format(file))
            continue

        print('{:3}: {}'.format(i, os.path.basename(file)))

        base, ext = os.path.splitext(os.path.basename(file))
        high_file = os.path.join(out_dir, ''.join([base, '_high', ext]))
        low_file = os.path.join(out_dir, ''.join([base, '_low', ext]))

        high_fh = open(high_file, 'wt')
        low_fh = open(low_file, 'wt')

        for rec in SeqIO.parse(file, 'fasta'):
            num_seqs += 1
            bases = Counter(rec.seq.upper())
            gc = bases.get('G', 0) + bases.get('C', 0)
            pct = int((gc / len(rec.seq)) * 100)
            SeqIO.write(rec, low_fh if pct < pct_gc else high_fh, 'fasta')

    print('Done, wrote {} sequence{} to out dir "{}"'.format(
        num_seqs, '' if num_seqs == 1 else 's', out_dir))


# --------------------------------------------------

Source File: solution.py From bioinformatics_primer with MIT License

5 votes

def main():
    """main"""
    args = get_args()
    input_file = args.input
    out_file = args.output
    keyword = args.keyword.lower()
    skip = set(map(str.lower, args.skip))

    if not os.path.isfile(input_file):
        die('"{}" is not a file'.format(input_file))

    print('Processing "{}"'.format(input_file))

    num_skipped = 0
    num_taken = 0
    with open(out_file, "w") as out_fh:
        for record in SeqIO.parse(input_file, "swiss"):
            annot = record.annotations
            if skip and 'taxonomy' in annot:
                taxa = set(map(str.lower, annot['taxonomy']))
                if skip.intersection(taxa):
                    num_skipped += 1
                    continue

            if 'keywords' in annot:
                kw = set(map(str.lower, annot['keywords']))

                if keyword in kw:
                    num_taken += 1
                    SeqIO.write(record, out_fh, 'fasta')
                else:
                    num_skipped += 1

    print('Done, skipped {} and took {}. See output in "{}".'.format(
        num_skipped, num_taken, out_file))


# --------------------------------------------------

Source File: solution.py From bioinformatics_primer with MIT License

5 votes

def main():
    """Make a jazz noise here"""
    args = get_args()
    out_dir = args.outdir

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    ext = args.extension
    if not ext.startswith('.'):
        ext = '.' + ext

    for i, fq in enumerate(args.fastq, start=1):
        basename = os.path.basename(fq)
        root, _ = os.path.splitext(basename)
        print('{:3}: {}'.format(i, basename))

        out_file = os.path.join(out_dir, root + ext)
        out_fh = open(out_file, 'wt')

        for record in SeqIO.parse(fq, 'fastq'):
            SeqIO.write(record, out_fh, 'fasta')

    print('Done, see output in "{}".'.format(out_dir))

# --------------------------------------------------

Source File: solution.py From bioinformatics_primer with MIT License

5 votes

def process(file, input_format, output_format, out_dir, seqs_per_file):
    """
    Spilt file into smaller files into out_dir
    Optionally convert to output format
    Return number of sequences written
    """
    if not os.path.isfile(file):
        warn('"{}" is not valid'.format(file))
        return 0

    basename, ext = os.path.splitext(os.path.basename(file))
    out_fh = None
    i = 0
    num_written = 0
    nfile = 0
    for record in SeqIO.parse(file, input_format):
        if i == seqs_per_file:
            i = 0
            if out_fh is not None:
                out_fh.close()
                out_fh = None

        i += 1
        num_written += 1
        if out_fh is None:
            nfile += 1
            path = os.path.join(out_dir,
                                basename + '.' + '{:04d}'.format(nfile) + ext)
            out_fh = open(path, 'wt')

        SeqIO.write(record, out_fh, output_format)

    return num_written


# --------------------------------------------------

Source File: cdhit_unclustered.py From bioinformatics_primer with MIT License

5 votes

def get_unclustered(cluster_file, proteins_file, out_file):
    """Find the unclustered proteins in the cd-hit output"""

    if not os.path.isfile(cluster_file):
        die('cdhit "{}" is not a file'.format(cluster_file))

    logging.debug('Parsing "{}"'.format(cluster_file))

    clustered = set([rec.id for rec in SeqIO.parse(cluster_file, 'fasta')])

    # Alternate (longer) way:
    # clustered = set()
    # for rec in SeqIO.parse(cluster_file, 'fasta'):
    #     clustered.add(rec.id)

    logging.debug('Will write to "{}"'.format(out_file))
    out_fh = open(out_file, 'wt')
    num_total = 0
    num_unclustered = 0

    for rec in SeqIO.parse(proteins_file, 'fasta'):
        num_total += 1
        prot_id = re.sub(r'\|.*', '', rec.id)
        if not prot_id in clustered:
            num_unclustered += 1
            SeqIO.write(rec, out_fh, 'fasta')

    logging.debug(
        'Finished writing unclustered proteins'.format(num_unclustered))

    return (num_unclustered, num_total)


# --------------------------------------------------

Source File: solution.py From bioinformatics_primer with MIT License

5 votes

def main():
    """main"""
    args = get_args()
    out_dir = args.outdir

    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for fnum, file in enumerate(args.file):
        if not os.path.isfile(file):
            warn('"{}" is not a file'.format(file))
            continue

        filename = os.path.basename(file)
        base, ext = os.path.splitext(filename)
        forward = open(os.path.join(out_dir, base + '_1' + ext), 'wt')
        reverse = open(os.path.join(out_dir, base + '_2' + ext), 'wt')

        print("{:3d}: {}".format(fnum + 1, filename))

        num_seqs = 0
        for i, rec in enumerate(SeqIO.parse(file, 'fasta')):
            SeqIO.write(rec, forward if i % 2 == 0 else reverse, 'fasta')
            num_seqs += 1

        print('\tSplit {:,d} sequences to dir "{}"'.format(num_seqs, out_dir))


# --------------------------------------------------

Source File: align.py From augur with GNU Affero General Public License v3.0

5 votes

def write_seqs(seqs, fname):
    """A wrapper around SeqIO.write with error handling"""
    try:
        SeqIO.write(seqs, fname, 'fasta')
    except FileNotFoundError:
        raise AlignmentError('ERROR: Couldn\'t write "{}" -- perhaps the directory doesn\'t exist?'.format(fname))

Python Bio.SeqIO.write() Examples