Python write fasta

Source File: seq_io.py From GTDBTk with GNU General Public License v3.0

7 votes

def write_fasta(seqs, fasta_file, wrap=80):
    """Save a mapping of sequences to disk in FASTA format.

    Parameters
    ----------
    seqs : dict[seq_id] -> seq
        Sequences keyed by their identifier.
    fasta_file : str
        Destination path; the file is opened in write mode.
    wrap: int
        Maximum number of AA/NT written on each sequence line.
    """
    with open(fasta_file, 'w') as out:
        for seq_id, sequence in seqs.items():
            out.write('>{}\n'.format(seq_id))
            # One output line per window of `wrap` residues.
            out.writelines(
                '{}\n'.format(sequence[start:start + wrap])
                for start in range(0, len(sequence), wrap)
            )

Source File: dna2proteins.py From dna2proteins with MIT License

7 votes

def write_fasta(dictionary, filename):
    """
    Takes a dictionary and writes it to a fasta file.
    Must specify the filename when calling the function.

    Each key is written verbatim as the header line (no '>' is added
    here, so keys presumably already carry it) and each value is wrapped
    to 60 characters per line. A confirmation message is printed once the
    file has been written.
    """

    import textwrap
    with open(filename, "w") as outfile:
        for key, value in dictionary.items():
            outfile.write(key + "\n")
            outfile.write("\n".join(textwrap.wrap(value, 60)))
            outfile.write("\n")

    # Call form works on Python 2 and 3; the bare `print "..."` statement
    # is a SyntaxError on Python 3.
    print("Success! File written")

## Swaps DNA sequences for proteins

Source File: seq_io.py From SqueezeMeta with GNU General Public License v3.0

6 votes

def write_fasta(seqs, output_file):
    """Write sequences to fasta file.

    If the output file name ends with '.gz',
    it will be compressed using gzip.

    Parameters
    ----------
    seqs : dict[seq_id] -> seq
        Sequences indexed by sequence id.
    output_file : str
        Name of fasta file to produce.
    """

    if output_file.endswith('.gz'):
        # Text mode: the loop below writes str, which a binary-mode gzip
        # handle rejects with TypeError on Python 3.
        fout = gzip.open(output_file, 'wt')
    else:
        fout = open(output_file, 'w')

    # Close the handle even if a write fails part-way through.
    try:
        for seq_id, seq in seqs.items():
            fout.write('>' + seq_id + '\n')
            fout.write(seq + '\n')
    finally:
        fout.close()

Source File: fasta.py From ssbio with MIT License

6 votes

def write_fasta_file(seq_records, outname, outdir=None, outext='.faa', force_rerun=False):
    """Save a SeqRecord, or a list of SeqRecord objects, as a FASTA file.

    Args:
        seq_records (SeqRecord, list): SeqRecord or a list of SeqRecord objects
        outname: Base name of the output file; outext is appended to it
        outdir: Directory to write into; a false value is passed on as ''
        outext: Extension of the FASTA file, default ".faa"
        force_rerun: Forwarded to ssbio.utils.force_rerun, which decides
            whether the file is (re)written

    Returns:
        str: Path to output FASTA file.

    """
    target = ssbio.utils.outfile_maker(inname='', outname=outname, outdir=outdir or '', outext=outext)

    should_write = ssbio.utils.force_rerun(flag=force_rerun, outfile=target)
    if should_write:
        SeqIO.write(seq_records, target, "fasta")

    return target

Source File: seq_io.py From catch with MIT License

6 votes

def write_probe_fasta(probes, out_fn):
    """Save probe sequences as FASTA records.

    Every probe produces two lines: a header line and then the whole
    sequence on a single line. The header is probe.Probe.header when
    that is set (truthy); otherwise it is 'probe_' followed by
    probe.Probe.identifier().

    Args:
        probes: list of instances of probe.Probe
        out_fn: path to FASTA file to write
    """
    with open(out_fn, 'w') as out:
        for probe in probes:
            label = probe.header or 'probe_%s' % probe.identifier()
            out.write('>' + label + '\n')
            out.write(probe.seq_str + '\n')

Source File: IceIterative2.py From cDNA_Cupcake with BSD 3-Clause Clear License

6 votes

def write_in_fasta(self, cid, write_all=False):
        """
        Write the input FASTA for cluster cid and return the file's path.

        The path is whatever self.clusterInFa(cid) returns. When write_all
        is True every read id in self.uc[cid] is written; otherwise a random
        sample of at most self.dagcon_in_fa_subsample ids is written.
        """
        out_path = op.join(self.clusterInFa(cid))
        members = self.uc[cid]
        if write_all:
            chosen = members
        else:
            # Never ask for more reads than the cluster holds.
            sample_size = min(self.dagcon_in_fa_subsample, len(members))
            chosen = random.sample(members, sample_size)
        with open(out_path, 'w') as handle:
            for read_id in chosen:
                record = self.seq_dict[read_id]
                handle.write(">{0}\n{1}\n".format(read_id, record.sequence))
        return out_path

Source File: seq_parser.py From GetOrganelle with GNU General Public License v3.0

6 votes

def write_fasta_with_list(out_dir, matrix, overwrite):
    """Write named sequences, stored as sequences of characters, to a FASTA file.

    :param out_dir: path of the output file (a file path despite the name).
    :param matrix: indexable with matrix[0] = sequence names, matrix[1] =
        sequences (each a sliceable of strings, joined with '' when written)
        and matrix[2] = line width; a false width disables wrapping.
    :param overwrite: when false and out_dir already exists, '_' is inserted
        before the last '.' of the name until an unused path is found.
    """
    if not overwrite:
        while os.path.exists(out_dir):
            out_dir = '.'.join(out_dir.split('.')[:-1]) + '_.' + out_dir.split('.')[-1]
    names, seqs, width = matrix[0], matrix[1], matrix[2]
    # The context manager closes the file even if a write raises.
    with open(out_dir, 'w') as fasta_file:
        for i, name in enumerate(names):
            fasta_file.write('>' + name + '\n')
            seq = seqs[i]
            if width:
                # max(..., 1) keeps the single blank line that has always
                # been emitted for an empty sequence.
                for start in range(0, max(len(seq), 1), width):
                    fasta_file.write(''.join(seq[start:start + width]) + '\n')
            else:
                fasta_file.write(''.join(seq) + '\n')


# deprecated since GetOrganelle 1.6.3

Source File: check_annotations.py From GetOrganelle with GNU General Public License v3.0

6 votes

def write_fasta(out_dir, matrix, overwrite):
    """Write named sequence strings to a FASTA file.

    :param out_dir: path of the output file (a file path despite the name).
    :param matrix: indexable with matrix[0] = sequence names, matrix[1] =
        sequence strings and matrix[2] = line width; a false width disables
        wrapping.
    :param overwrite: when false and out_dir already exists, '_' is inserted
        before the last '.' of the name until an unused path is found.
    """
    if not overwrite:
        while os.path.exists(out_dir):
            out_dir = '.'.join(out_dir.split('.')[:-1])+'_.'+out_dir.split('.')[-1]
    names, seqs, width = matrix[0], matrix[1], matrix[2]
    # The context manager closes the file even if a write raises.
    with open(out_dir, 'w') as fasta_file:
        for i, name in enumerate(names):
            fasta_file.write('>'+name+'\n')
            seq = seqs[i]
            if width:
                # max(..., 1) keeps the single blank line that has always
                # been emitted for an empty sequence.
                for start in range(0, max(len(seq), 1), width):
                    fasta_file.write(seq[start:start + width]+'\n')
            else:
                fasta_file.write(seq+'\n')

Source File: stitch.py From medaka with Mozilla Public License 2.0

5 votes

def write_fasta(filename, contigs):
    """Save (name, sequence) pairs as FASTA records.

    Each pair becomes a '>' header line followed by the full sequence on
    one line.

    :param filename: output filename.
    :param contigs: tuples of the form (sequence name, base sequence).

    """
    with open(filename, 'w') as handle:
        handle.writelines(
            '>{}\n{}\n'.format(label, bases) for label, bases in contigs
        )

Source File: FileIO.py From cDNA_Cupcake with BSD 3-Clause Clear License

5 votes

def write_preClusterSet_to_fasta(pCS, output_filename, fasta_d):
    """
    Write one FASTA record per cluster id in pCS.S.

    For every cid a representative is drawn with random.choice from
    pCS.S[cid].members and looked up in fasta_d; the record's id becomes
    the header and its seq the sequence line.
    """
    with open(output_filename, 'w') as out:
        for cid in pCS.S:
            rep_id = random.choice(pCS.S[cid].members)
            rep = fasta_d[rep_id]
            out.write(">{0}\n{1}\n".format(rep.id, rep.seq))

Source File: util.py From picrust2 with GNU General Public License v3.0

5 votes

def write_fasta(seq, outfile):
    """Write a dict of sequences to a FASTA file, ordered by sequence id.

    Args:
        seq: mapping of sequence id (str) to sequence (str).
        outfile: path of the file to write.
    """
    # The context manager closes the file even if a write raises.
    with open(outfile, "w") as out_fasta:
        # Look through sequence ids (sorted alphabetically so output file is
        # reproducible).
        for s in sorted(seq):
            out_fasta.write(">" + s + "\n")
            out_fasta.write(seq[s] + "\n")

Source File: concoct_csv_to_fasta.py From EdwardsLab with MIT License

5 votes

def write_fasta_files(faf, odir, bins, maxb, verbose=False):
    """
    Read the sequences from faf and write them into a set of files in odir.

    One file bin_<i>.fna is opened for every i in 0..maxb; each sequence is
    appended to the file of its bin, and files that received no sequence are
    deleted at the end.

    :param faf: The source fasta file
    :param odir: the output directory (created if missing)
    :param bins: the hash of contigs -> bin
    :param maxb: the maximum bin number
    :param verbose: more output
    :return: nada
    """

    # exist_ok avoids the check-then-create race and also creates parents.
    os.makedirs(odir, exist_ok=True)

    paths = [os.path.join(odir, f"bin_{i}.fna") for i in range(maxb+1)]
    outputfiles = []
    written_to = set()

    # try/finally so every handle opened so far is closed even when reading
    # or writing fails part-way through.
    try:
        for path in paths:
            outputfiles.append(open(path, 'w'))

        for fa, seq in stream_fasta(faf, True):
            # The bin lookup uses only the first space-separated token of
            # the header; the full header is what gets written.
            faid = fa.split(" ")[0]
            if faid not in bins:
                if verbose:
                    sys.stderr.write(f"Sequence {faid} not found in a bin\n")
                continue
            outputfiles[bins[faid]].write(">{}\n{}\n".format(fa, seq))
            written_to.add(bins[faid])
    finally:
        for o in outputfiles:
            o.close()

    # Drop the placeholder files of bins that never received a sequence.
    for i, path in enumerate(paths):
        if i not in written_to:
            os.remove(path)

Source File: fasta.py From antismash with GNU Affero General Public License v3.0

5 votes

def write_fasta(names: List[str], seqs: List[str], filename: str) -> None:
    """ Writes name/sequence pairs to file in FASTA format

        Names and sequences are paired positionally with zip, so surplus
        entries in the longer list are ignored.

        Arguments:
            names: a list of sequence identifiers
            seqs: a list of sequences as strings
            filename: the filename to write the FASTA formatted data to

        Returns:
            None
    """
    # The context manager closes the file even if a write raises.
    with open(filename, "w") as out_file:
        for name, seq in zip(names, seqs):
            out_file.write(">%s\n%s\n" % (name, seq))

Source File: run_glimmerhmm.py From antismash with GNU Affero General Public License v3.0

5 votes

def write_search_fasta(record: Record) -> str:
    """ Writes a record in FASTA format to '<record id>.fasta', a relative
        path (so it lands in the current directory).

        Returns:
            the name of the file created
    """
    out_name = f"{record.id}.fasta"
    with open(out_name, 'w') as out:
        biopython_records = [record.to_biopython()]
        seqio.write(biopython_records, out, 'fasta')
    return out_name

Source File: utils.py From wgd with GNU General Public License v3.0

5 votes

def write_fasta(seq_dict, output_file):
    """
    Save a sequence dictionary as a fasta file, one line per sequence.

    :param seq_dict: sequence dictionary, see :py:func:`read_fasta`
    :param output_file: output file name
    :return: the output file name that was passed in
    """
    with open(output_file, 'w') as handle:
        for seq_id, sequence in seq_dict.items():
            handle.write(''.join(('>', seq_id, '\n')))
            handle.write(''.join((sequence, '\n')))
    return output_file

Source File: assembly.py From dnaplotlib with MIT License

5 votes

def write_to_fasta(entries, col_length = 20) :
    """Format (name, sequence) pairs as FASTA text and return it as a string.

    Each sequence is cut into lines of col_length characters joined by
    '\\n'; the records themselves are joined by '\\r\\n'. Nothing is written
    to disk and no trailing newline is appended.
    """
    def _format_record(seq_name, nts):
        # Wrap one sequence and prepend its header line.
        wrapped = '\n'.join(
            nts[start:start + col_length]
            for start in range(0, len(nts), col_length)
        )
        return '>%s\n%s' % (seq_name, wrapped)

    return '\r\n'.join(_format_record(seq_name, nts) for seq_name, nts in entries)

Source File: tb.py From ariba with GNU General Public License v3.0

5 votes

def write_prepareref_fasta_file(outfile, gene_coords, genes_need_upstream, genes_non_upstream, upstream_before=100, upstream_after=100):
    '''Writes fasta file to be used with -f option of prepareref.

    Sequences are cut from the reference 'NC_000962.3', loaded from
    NC_000962.3.fa.gz under data_dir (a name defined outside this function).

    outfile: path of the FASTA file to write.
    gene_coords: mapping gene name -> dict with 'start' and 'end' keys
        (presumably 0-based positions on the reference; confirm with caller).
    genes_need_upstream: genes for which a window around 'start' is written,
        with '_upstream' appended to the record id.
    genes_non_upstream: genes for which the start..end span is written.
    upstream_before, upstream_after: window sizes on either side of 'start'.

    When start is not less than end the slice is reverse-complemented.
    '''
    tmp_dict = {}
    fasta_in = os.path.join(data_dir, 'NC_000962.3.fa.gz')
    # file_to_dict fills tmp_dict in place; only this one entry is used.
    pyfastaq.tasks.file_to_dict(fasta_in, tmp_dict)
    ref_seq = tmp_dict['NC_000962.3']

    with open(outfile, 'w') as f:
        for gene in genes_non_upstream:
            start = gene_coords[gene]['start']
            end = gene_coords[gene]['end']
            if start < end:
                # +1 makes the slice include the base at 'end'.
                gene_fa = pyfastaq.sequences.Fasta(gene, ref_seq[start:end+1])
            else:
                # Coordinates are swapped: slice end..start inclusive, then
                # reverse-complement.
                gene_fa = pyfastaq.sequences.Fasta(gene, ref_seq[end:start+1])
                gene_fa.revcomp()

            print(gene_fa, file=f)

        for gene in genes_need_upstream:
            start = gene_coords[gene]['start']
            end = gene_coords[gene]['end']
            if start < end:
                # Window of upstream_before bases before 'start' and
                # upstream_after bases from 'start' onwards.
                gene_fa = pyfastaq.sequences.Fasta(gene, ref_seq[start - upstream_before:start + upstream_after])
            else:
                # Mirrored window (before/after swapped, shifted by one),
                # then reverse-complemented.
                gene_fa = pyfastaq.sequences.Fasta(gene, ref_seq[start - upstream_after + 1:start + upstream_before + 1])
                gene_fa.revcomp()

            gene_fa.id += '_upstream'
            print(gene_fa, file=f)

Source File: reference_data.py From ariba with GNU General Public License v3.0

5 votes

def write_seqs_to_fasta(self, outfile, names):
        """Print self.sequence(name) to outfile for each name, in sorted order.

        The file is opened and closed through pyfastaq.utils.
        """
        handle = pyfastaq.utils.open_file_write(outfile)
        ordered_names = sorted(names)

        for seq_name in ordered_names:
            print(self.sequence(seq_name), file=handle)

        pyfastaq.utils.close(handle)

Source File: SequenceSearcher.py From biskit with GNU General Public License v3.0

5 votes

def writeFasta( self, frecords, fastaOut ):
        """
        Create fasta file for given set of records.

        Each record is written as the string returned by its
        format('fasta') method; the target path is resolved with T.absfile.

        @param frecords: list of Bio.Blast.Records
        @type  frecords: [Bio.Blast.Record]
        @param fastaOut: file name
        @type  fastaOut: str
        """
        ## context manager closes the file even if a record fails to format
        with open( T.absfile(fastaOut), 'w' ) as f:
            for r in frecords:
                f.write( r.format('fasta') )  ## note better use direct SeqIO

Source File: SequenceSearcher.py From biskit with GNU General Public License v3.0

5 votes

def writeFastaClustered( self, fastaOut=None ):
        """
        Save the non-redundant (clustered) template sequences as fasta,
        delegating the actual writing to L{writeFasta}.

        @param fastaOut: target fasta file; a false value selects
                         outFolder + L{F_FASTA_NR}
        @type  fastaOut: str
        """
        if not fastaOut:
            fastaOut = self.outFolder + self.F_FASTA_NR

        records = self.getClusteredRecords()
        self.writeFasta( records, fastaOut )

Source File: SequenceSearcher.py From biskit with GNU General Public License v3.0

5 votes

def writeFastaAll( self, fastaOut=None ):
        """
        Save every record in self.frecords as fasta, delegating the actual
        writing to L{writeFasta}.

        @param fastaOut: target fasta file; a false value selects
                         outFolder + L{F_FASTA_ALL}
        @type  fastaOut: str OR None
        """
        if not fastaOut:
            fastaOut = self.outFolder + self.F_FASTA_ALL

        self.writeFasta( self.frecords, fastaOut )

Source File: download.py From fauna with GNU Affero General Public License v3.0

5 votes

def write_fasta(self, viruses, fname, sep='|', fasta_fields=['strain', 'virus', 'accession'], **kwargs):
        """Write virus records to a FASTA file.

        The header of each record is the values of fasta_fields joined by
        sep; a field that is absent from the record or set to None is
        written as '?'. The sequence line is the record's 'sequence' entry.

        viruses: iterable of dict-like records, each with a 'sequence' key.
        fname: path of the file to write.
        sep: separator placed between header fields.
        fasta_fields: names of the fields that make up the header (the
            default list is only read, never mutated).
        kwargs: accepted and ignored.

        If the file cannot be opened, 'ERROR' is printed and the process
        exits with status 2.
        """
        try:
            handle = open(fname, 'w')
        except IOError:
            print('ERROR'); sys.exit(2)
        # Only reached when open() succeeded; the context manager closes the
        # handle even if a record is malformed and a write raises.
        with handle:
            for virus in viruses:
                fields = [str(virus[field]) if (field in virus and virus[field] is not None) else '?' for field in fasta_fields]
                handle.write(">"+sep.join(fields)+'\n')
                handle.write(virus['sequence'] + "\n")

Source File: seqUtils.py From SqueezeMeta with GNU General Public License v3.0

5 votes

def writeFasta(seqs, outputFile):
    '''Write sequences to FASTA file.

    seqs maps sequence id (str) to sequence (str). If outputFile ends with
    '.gz' the output is gzip-compressed. Each sequence is written on a
    single line.
    '''
    if outputFile.endswith('.gz'):
        # Text mode: the loop below writes str, which a binary-mode gzip
        # handle rejects with TypeError on Python 3.
        fout = gzip.open(outputFile, 'wt')
    else:
        fout = open(outputFile, 'w')

    # Close the handle even if a write fails part-way through.
    try:
        for seqId, seq in seqs.items():
            fout.write('>' + seqId + '\n')
            fout.write(seq + '\n')
    finally:
        fout.close()

Source File: seed.py From iva with GNU General Public License v3.0

5 votes

def write_fasta(self, filename, name):
        """Write self.seq to filename as a single FASTA record headed by name.

        The file is opened and closed through pyfastaq.utils.
        """
        out = pyfastaq.utils.open_file_write(filename)
        header = '>' + name
        print(header, file=out)
        print(self.seq, file=out)
        pyfastaq.utils.close(out)

Source File: FileIO.py From cDNA_Cupcake with BSD 3-Clause Clear License

5 votes

def write_select_seqs_to_fasta(fasta_filename, seqids, output_filename, mode='w'):
    """
    Write the records named in seqids, read from fasta_filename, to fasta:
    ID --- the record's id
    Seq -- the record's sequence

    fasta_filename is opened with LazyFastaReader and indexed by sequence id.
    mode is the file mode for output_filename ('w' overwrites; 'a' is
    presumably meant for appending to an existing file).
    """
    # Read from the file the caller asked for rather than a hard-coded name.
    d = LazyFastaReader(fasta_filename)
    with open(output_filename, mode) as f:
        # Loop over the requested ids; without the loop the lookup variable
        # is undefined and the call fails with NameError.
        for x in seqids:
            r = d[x]
            f.write(">{0}\n{1}\n".format(r.id, r.seq))

Source File: FileIO.py From cDNA_Cupcake with BSD 3-Clause Clear License

5 votes

def write_seqids_to_fasta(seqids, output_filename, fasta_d):
    """
    Write the records of fasta_d selected by seqids to fasta:
    ID --- the record's id
    Seq -- the record's sequence

    Records are written in the order of seqids, one sequence line each.
    """
    with open(output_filename, 'w') as out:
        # Lazy lookup: each record is fetched right before it is written.
        for record in (fasta_d[seqid] for seqid in seqids):
            out.write(">{0}\n{1}\n".format(record.id, record.seq))

Source File: bio.py From Comparative-Annotation-Toolkit with Apache License 2.0

5 votes

def write_fasta(path_or_handle, name, seq, chunk_size=100, validate=None):
    """Writes out one fasta record.

    :param path_or_handle: a path (str), opened here with opengz and closed
        again afterwards, or an already open writable handle, which is left
        open for the caller.
    :param name: record name, written after '>'.
    :param seq: sequence string.
    :param chunk_size: number of characters per sequence line.
    :param validate: None to skip validation, or 'DNA' / 'protein' to check
        every character against the matching alphabet. Any other value
        yields an empty alphabet, so every character is rejected.
    :raises RuntimeError: if seq is not a string or fails validation.

    All checks run before the output is opened, so a rejected sequence
    never creates or truncates the target file.
    """
    # Explicit raise instead of assert: asserts are stripped under -O.
    if not isinstance(seq, str):
        raise RuntimeError("Sequence is not unicode or string")
    # Compare by equality; identity ('is') on string literals depends on
    # interning and fails for strings built at runtime.
    if validate == 'DNA':
        valid_chars = set('ACGTUYSWKMBDHVNacgtuyswkmbdhvn.-*')
    elif validate == 'protein':
        valid_chars = set('ABCDEFGHIKLMPQSRTVWXYZUabcdefghiklmpqsrtvwxyzuNn.-*')
    else:
        valid_chars = set()
    if validate is not None:
        bad_chars = {x for x in seq if x not in valid_chars}
        if bad_chars:
            raise RuntimeError("Invalid FASTA character(s) seen in fasta sequence: {}".format(bad_chars))
    owns_handle = isinstance(path_or_handle, str)
    fh = opengz(path_or_handle, 'w') if owns_handle else path_or_handle
    try:
        fh.write(">%s\n" % name)
        for i in range(0, len(seq), chunk_size):
            fh.write("%s\n" % seq[i:i+chunk_size])
    finally:
        # Only close what was opened here.
        if owns_handle:
            fh.close()

Source File: simBench.py From V-pipe with Apache License 2.0

5 votes

def write_fasta(haplotype_seqs, outdir):
    """Write each haplotype to its own FASTA file and concatenate them.

    Haplotype number i is written to outdir/haplotype<i>.fasta with the
    header 'haplotype<i>'. Afterwards the per-haplotype files are passed to
    sh.cat, with its output directed to outdir/haplotypes.fasta.
    """
    fasta_record = collections.namedtuple("fasta_record", "id seq")
    output_files = []
    for idx in range(len(haplotype_seqs)):
        haplotype_id = "haplotype" + str(idx)
        record = fasta_record(id=haplotype_id, seq=haplotype_seqs[idx])
        output_file = os.path.join(outdir, haplotype_id + ".fasta")
        output_files.append(output_file)

        with open(output_file, 'w') as outfile:
            outfile.write(">{}\n{}\n".format(record.id, record.seq))

    combined = os.path.join(outdir, "haplotypes.fasta")
    sh.cat(output_files, _out=combined)

Source File: __main__.py From vamb with MIT License

5 votes

def write_fasta(outdir, clusterspath, fastapath, contignames, contiglengths, minfasta, logfile):
    """Write one FASTA file per sufficiently large cluster into outdir/bins.

    Clusters are read from clusterspath; a cluster is kept when the summed
    length of its contigs (looked up via contignames/contiglengths) is at
    least minfasta. Only the contigs of kept clusters are loaded from
    fastapath, and the writing itself is delegated to
    vamb.vambtools.write_bins. Progress and timing go to logfile.
    """
    begintime = time.time()

    log('\nWriting FASTA files', logfile)
    log('Minimum FASTA size: {}'.format(minfasta), logfile, 1)

    lengthof = dict(zip(contignames, contiglengths))

    with open(clusterspath) as file:
        clusters = vamb.vambtools.read_clusters(file)

    filtered_clusters = {
        cluster: contigs
        for cluster, contigs in clusters.items()
        if sum(lengthof[contig] for contig in contigs) >= minfasta
    }

    # Drop the references that are no longer needed before the FASTA is loaded.
    del lengthof, clusters
    keep = {contig for contigs in filtered_clusters.values() for contig in contigs}

    with vamb.vambtools.Reader(fastapath, 'rb') as file:
        fastadict = vamb.vambtools.loadfasta(file, keep=keep)

    bindir = os.path.join(outdir, "bins")
    vamb.vambtools.write_bins(bindir, filtered_clusters, fastadict, maxbins=None)

    ncontigs = sum(len(contigs) for contigs in filtered_clusters.values())
    nfiles = len(filtered_clusters)
    print('', file=logfile)
    log('Wrote {} contigs to {} FASTA files'.format(ncontigs, nfiles), logfile, 1)

    elapsed = round(time.time() - begintime, 2)
    log('Wrote FASTA in {} seconds'.format(elapsed), logfile, 1)

Source File: trees_msa.py From OrthoFinder with GNU General Public License v3.0

5 votes

def WriteSeqsToFasta(self, seqs, outFilename):
        """Write the stored sequences for seqs to outFilename.

        Each item's ToString() gives its name; names are ordered with
        self.SortSeqs. For a name found in self.SeqLists a '>' header line
        is written followed by the stored value exactly as it is (no
        newline is appended here). A name that is missing is reported on
        stdout and skipped.
        """
        with open(outFilename, 'w') as handle:
            ordered = self.SortSeqs([s.ToString() for s in seqs])
            for name in ordered:
                if name not in self.SeqLists:
                    print(("ERROR: %s not found" % name))
                    continue
                handle.write(">%s\n" % name)
                handle.write(self.SeqLists[name])

Source File: trees_msa.py From OrthoFinder with GNU General Public License v3.0

5 votes

def WriteFastaFiles(self, fastaWriter, ogs, idDict, qBoth):
        """Write one FASTA file per orthogroup through fastaWriter.

        The files named by GetFastaFilename(i, True) are written with
        WriteSeqsToFasta_withNewAccessions unless the one for orthogroup 0
        already exists. When qBoth is true the files named by
        GetFastaFilename(i) are written as well, with WriteSeqsToFasta.
        """
        # The results ones are now written by default after orthogroups, check they're not already there
        results_exist = os.path.exists(self.GetFastaFilename(0, True))
        if not results_exist:
            for index, orthogroup in enumerate(ogs):
                target = self.GetFastaFilename(index, True)
                fastaWriter.WriteSeqsToFasta_withNewAccessions(orthogroup, target, idDict)
        if not qBoth:
            return
        for index, orthogroup in enumerate(ogs):
            fastaWriter.WriteSeqsToFasta(orthogroup, self.GetFastaFilename(index))

Source File: read_utils.py From SVE with GNU General Public License v3.0

5 votes

def write_fasta_by_chrom(ss, chrom_fasta_dir, chrom_base=''):
    """Write every sequence in ss to its own file and return the paths.

    The file for a sequence s is chrom_fasta_dir/<chrom_base><s.name>.fa;
    the content is produced by s.write_to_fasta_file(handle).
    """
    written = []
    for seq in ss:
        path = chrom_fasta_dir + '/' + chrom_base + seq.name + '.fa'
        written.append(path)
        with open(path, 'w') as handle:
            seq.write_to_fasta_file(handle)
    return written

Source File: read_utils.py From SVE with GNU General Public License v3.0

5 votes

def write_fasta_mask(M,json_path):
    """Serialise the mask M to json_path as JSON and return True."""
    with open(json_path,'w') as handle:
        json.dump(M, handle)
    return True

#compute an expectation given randomly distributed short reads for the RD windows (hist bins)

Source File: read_utils.py From SVE with GNU General Public License v3.0

5 votes

def write_fasta(seqs, fasta_path):
    """Write sequence objects to fasta_path and return True.

    seqs is either a list of objects, written in list order, or a dict of
    name -> object, written in the order of the names left-padded with
    zeros to a common width (so '2' sorts before '10'). Every object must
    provide write_to_fasta_file(handle). Any other type of seqs leaves an
    empty file behind.
    """
    with open(fasta_path, 'w') as fasta:
        # isinstance also accepts list/dict subclasses such as OrderedDict.
        if isinstance(seqs, list):
            for seq in seqs: seq.write_to_fasta_file(fasta)
        elif isinstance(seqs, dict):
            # Computed once here rather than inside the sort key, where it
            # was re-evaluated for every key.
            width = max([len(k) for k in seqs]) if seqs else 0
            for k in sorted(seqs,key=lambda x: x.zfill(width)):
                seqs[k].write_to_fasta_file(fasta)
    return True

#ss is a HTSeq Sequence list?

Source File: fasta.py From ssbio with MIT License

5 votes

def write_seq_as_temp_fasta(seq):
    """Save a single sequence as 'temp.faa' in the system temporary directory.

    The sequence is cast to a SeqRecord with id 'tempfasta' and written via
    write_fasta_file with force_rerun=True.

    Args:
        seq (str, Seq, SeqRecord): Sequence string, Biopython Seq or SeqRecord object

    Returns:
        str: Path to temporary FASTA file (located in system temporary files directory)

    """
    record = ssbio.protein.sequence.utils.cast_to_seq_record(seq, id='tempfasta')
    temp_dir = tempfile.gettempdir()
    return write_fasta_file(seq_records=record, outname='temp', outdir=temp_dir, force_rerun=True)

Source File: fasta.py From ssbio with MIT License

5 votes

def write_fasta_file_from_dict(indict, outname, outdir=None, outext='.faa', force_rerun=False):
    """Save a dictionary of ID -> sequence string as a FASTA file.

    Args:
        indict: Input dictionary with keys as IDs and values as sequence strings
        outname: Base name of the output file; outext is appended to it
        outdir: Directory to write into; a false value is passed on as ''
        outext: Extension of the FASTA file, default ".faa"
        force_rerun: Forwarded to ssbio.utils.force_rerun, which decides
            whether the file is (re)written

    Returns:
        str: Path to output FASTA file.

    """
    target = ssbio.utils.outfile_maker(inname='', outname=outname, outdir=outdir or '', outext=outext)

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=target):
        # Every value becomes a SeqRecord carrying its key as the id.
        records = [
            ssbio.protein.sequence.utils.cast_to_seq_record(sequence, id=seq_id)
            for seq_id, sequence in indict.items()
        ]
        SeqIO.write(records, target, "fasta")

    return target

Source File: toilInterface.py From Comparative-Annotation-Toolkit with Apache License 2.0

5 votes

def write_fasta_to_filestore(toil, fasta_local_path):
    """
    Import a fasta and its companion .gdx and .flat files into the fileStore.
    The companions are expected next to the fasta, at $path.gdx and $path.flat.
    :param toil: Toil context manager
    :param fasta_local_path: Path to local fasta to load.
    :return: Tuple of fileStore IDs, in the order fasta, fasta_gdx, fasta_flat
    """
    def _import(local_path):
        # Import one local file and wrap the result in a FileID for that path.
        return FileID.forPath(toil.importFile('file:///' + local_path), local_path)

    fasta_file_id = _import(fasta_local_path)
    gdx_file_id = _import(fasta_local_path + '.gdx')
    flat_file_id = _import(fasta_local_path + '.flat')
    return fasta_file_id, gdx_file_id, flat_file_id

Source File: model.py From pmx with GNU Lesser General Public License v3.0

4 votes

def writeFASTA( self, filename, title = ""):
        """Write the sequence of every chain to filename in FASTA format.

        filename: path of the file to write.
        title: header text; when empty, self.title with each run of
            whitespace replaced by '_' is used.

        A model with exactly one chain gets the header '> <title>';
        otherwise each chain gets '> <title>_chain_<chain id>'.
        """
        if not title: title = '_'.join(self.title.split())
        # fp.write replaces the Python-2-only `print >>fp` form, and the
        # context manager closes the file, which was previously left open.
        with open(filename,"w") as fp:
            if len(self.chains) == 1:
                fp.write('> %s\n' % title)
                fp.write(str(self.chains[0].get_sequence()) + '\n')
            else:
                for chain in self.chains:
                    fp.write('> %s_chain_%s\n' % (title, chain.id ))
                    fp.write(str(chain.get_sequence()) + '\n')
                
    

##     def writeGRO( self, filename, title = ''):
##         fp = open(filename,'w')
##         if self.unity == 'nm': fac = 1.
##         else: fac = 0.1
##         if not title:
##             title = self.title
##         print >>fp, title
##         print >>fp, "%5d" % len(self.atoms)
##         if self.atoms[0].v[0] != 0.000 : bVel = True
##         else: bVel = False
##         if bVel:
##             gro_format = "%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"
##         else:
##             gro_format = "%8.3f%8.3f%8.3f"
##         for atom in self.atoms:
##             resid = (atom.resnr)%100000
##             at_id = (atom.id)%100000
##             ff = "%5d%-5.5s%5.5s%5d" % (resid, atom.resname, atom.name, at_id)
##             if bVel:
##                 ff+=gro_format % (atom.x[XX]*fac, atom.x[YY]*fac, atom.x[ZZ]*fac,
##                                   atom.v[XX], atom.v[YY], atom.v[ZZ])
##             else:
##                 ff+=gro_format % (atom.x[XX]*fac, atom.x[YY]*fac, atom.x[ZZ]*fac )
##             print >>fp, ff
            
##         if self.box[XX][YY] or self.box[XX][ZZ] or self.box[YY][XX] or \
##                self.box[YY][ZZ] or self.box[ZZ][XX] or self.box[ZZ][YY]:
##             bTric = False
##             ff = "%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f"
##         else:
##             bTric = True
##             ff = "%10.5f%10.5f%10.5f"
##         if bTric:
##             print >>fp, ff % (self.box[XX][XX],self.box[YY][YY],self.box[ZZ][ZZ])
##         else:
##             print >>fp, ff % (self.box[XX][XX],self.box[YY][YY],self.box[ZZ][ZZ],
##                               self.box[XX][YY],self.box[XX][ZZ],self.box[YY][XX],
##                               self.box[YY][ZZ],self.box[ZZ][XX],self.box[ZZ][YY])
##         fp.close()

Source File: tree.py From augur with GNU Affero General Public License v3.0

4 votes

def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
    """Write a FASTA holding only the informative sites and return its path.

    compress_seq: dict with the keys 'sequences' (name -> per-position
        lookup), 'reference' (per-position lookup) and 'positions'
        (iterable of positions to examine).
    alignment: path whose directory receives 'informative_sites.fasta'.
    stripFile: optional file passed to load_mask_sites; the positions it
        yields are skipped.

    A position is kept when, ignoring '-' and 'N', at least two different
    values occur and it is not the case that there are exactly two values
    with one of them occurring only once.

    Relies on np, os and load_mask_sites from outside this function.
    """
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    sequences = compress_seq['sequences']
    ref = compress_seq['reference']
    positions = compress_seq['positions']

    #If want to exclude sites from initial treebuild, read in here
    strip_pos = load_mask_sites(stripFile) if stripFile else []

    #Get sequence names
    seqNames = list(sequences.keys())

    #Check non-ref sites to see if informative
    printPositionMap = False    #If true, prints file mapping Fasta position to real position
    sites = []   # one entry per kept position: that position's value in every sequence
    pos = []     # "<1-based index in output>\t<original position>" per kept position

    for key in positions:
        if key not in strip_pos:
            pattern = []
            for k in sequences.keys():
                #looping try/except is faster than list comprehension
                # A sequence with no entry for this position takes the
                # reference value.
                try:
                    pattern.append(sequences[k][key])
                except KeyError:
                    pattern.append(ref[key])
            origPattern = list(pattern)
            if '-' in pattern or 'N' in pattern:
                #remove gaps/Ns to see if otherwise informative
                pattern = [value for value in origPattern if value != '-' and value != 'N']
            # un[0] = distinct values, un[1] = how often each occurs
            un = np.unique(pattern, return_counts=True)
            #If not all - or N, not all same base, and >1 differing base, append
            if len(un[0])!=0 and len(un[0])!=1 and not (len(un[0])==2 and min(un[1])==1):
                sites.append(origPattern)
                pos.append("\t".join([str(len(pos)+1),str(key)]))

    #Rotate and convert to SeqRecord
    # sites is indexed (position, sequence); rot90 turns it so that each row
    # is one sequence, but with the last sequence first, hence the reversed
    # name list below.
    # NOTE(review): if no position qualifies, sites is empty and rot90
    # appears to reject the resulting 1-D array; confirm whether callers
    # can hit this.
    sites = np.asarray(sites)
    align = np.rot90(sites)
    seqNamesCorr = list(reversed(seqNames))
    toFasta = [ SeqRecord(id=seqNamesCorr[i], seq=Seq("".join(align[i])), description='') for i in range(len(sequences.keys()))]

    fasta_file = os.path.join(os.path.dirname(alignment), 'informative_sites.fasta')

    #now output this as fasta to read into raxml or iqtree
    SeqIO.write(toFasta, fasta_file, 'fasta')

    #If want a position map, print:
    # printPositionMap is hard-coded to False above, so this branch is
    # currently never taken.
    if printPositionMap:
        with open(fasta_file+".positions.txt", 'w', encoding='utf-8') as the_file:
            the_file.write("\n".join(pos))

    return fasta_file