Python write fasta

39 Python code examples related to "write fasta" are shown below. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: seq_io.py    From GTDBTk with GNU General Public License v3.0 7 votes vote down vote up
def write_fasta(seqs, fasta_file, wrap=80):
    """Write a mapping of sequences to a FASTA file with wrapped lines.

    Parameters
    ----------
    seqs : dict[seq_id] -> seq
        Sequences keyed by their identifier.
    fasta_file : str
        Destination path; an existing file is overwritten.
    wrap : int
        Maximum number of AA/NT characters written per sequence line.
    """
    with open(fasta_file, 'w') as out:
        for seq_id, sequence in seqs.items():
            out.write('>{}\n'.format(seq_id))
            # One output line per `wrap`-sized slice of the sequence.
            out.writelines(
                '{}\n'.format(sequence[start:start + wrap])
                for start in range(0, len(sequence), wrap)
            )
Example 2
Source File: dna2proteins.py    From dna2proteins with MIT License 7 votes vote down vote up
def write_fasta(dictionary, filename):
    """
    Write a dictionary of sequences to a fasta file.

    Each key is written verbatim as the header line (no '>' is added here,
    so presumably the keys already carry it -- confirm with the caller),
    followed by the value wrapped to 60 characters per line.
    The filename must be specified when calling the function.
    """

    import textwrap
    with open(filename, "w") as outfile:
        for key, value in dictionary.items():
            outfile.write(key + "\n")
            outfile.write("\n".join(textwrap.wrap(value, 60)))
            outfile.write("\n")

    # The parenthesised single-argument form prints the same text on
    # Python 2 and Python 3; the bare print statement only parses on 2.
    print("Success! File written")

## Swaps DNA sequences for proteins
Example 3
Source File: seq_io.py    From SqueezeMeta with GNU General Public License v3.0 6 votes vote down vote up
def write_fasta(seqs, output_file):
    """Write sequences to fasta file.

    If the output file has the extension '.gz',
    it will be compressed using gzip.

    Parameters
    ----------
    seqs : dict[seq_id] -> seq
        Sequences indexed by sequence id.
    output_file : str
        Name of fasta file to produce.
    """

    compressed = output_file.endswith('.gz')
    if compressed:
        fout = gzip.open(output_file, 'wb')
    else:
        fout = open(output_file, 'w')

    try:
        for seq_id, seq in viewitems(seqs):
            record = '>' + seq_id + '\n' + seq + '\n'
            # The gzip handle is opened in binary mode and needs bytes on
            # Python 3; the plain handle is a text file and takes str.
            fout.write(record.encode('utf-8') if compressed else record)
    finally:
        # Close the handle even when a write fails part-way through.
        fout.close()
Example 4
Source File: fasta.py    From ssbio with MIT License 6 votes vote down vote up
def write_fasta_file(seq_records, outname, outdir=None, outext='.faa', force_rerun=False):
    """Write one SeqRecord, or a list of them, to a FASTA file.

    Args:
        seq_records (SeqRecord, list): SeqRecord or a list of SeqRecord objects
        outname: Name of the output file; outext is appended to it
        outdir: Directory to write into; a falsy value is replaced by ''
        outext: Extension of the FASTA file, default ".faa"
        force_rerun: Flag handed to ssbio.utils.force_rerun, which decides
            whether the file is (re)written

    Returns:
        str: Path to output FASTA file.

    """

    target_dir = outdir if outdir else ''
    outfile = ssbio.utils.outfile_maker(inname='', outname=outname, outdir=target_dir, outext=outext)

    should_write = ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile)
    if should_write:
        SeqIO.write(seq_records, outfile, "fasta")

    return outfile
Example 5
Source File: seq_io.py    From catch with MIT License 6 votes vote down vote up
def write_probe_fasta(probes, out_fn):
    """Write probe sequences to a FASTA file.

    Each probe produces a header line followed by its sequence on a single
    line. The header is probe.header when that is truthy; otherwise it is
    'probe_' followed by the value of probe.identifier().

    Args:
        probes: list of instances of probe.Probe
        out_fn: path to FASTA file to write
    """
    with open(out_fn, 'w') as out:
        for probe in probes:
            if not probe.header:
                header_line = '>probe_%s\n' % probe.identifier()
            else:
                header_line = '>' + probe.header + '\n'
            out.write(header_line)
            out.write(probe.seq_str + '\n')
Example 6
Source File: IceIterative2.py    From cDNA_Cupcake with BSD 3-Clause Clear License 6 votes vote down vote up
def write_in_fasta(self, cid, write_all=False):
        """
        Write the input FASTA file for cluster cid and return its path.
        The path is the one given by self.clusterInFa(cid).
        If write_all is True, every sequence id in self.uc[cid] is written.
        Otherwise only a random subsample of at most
        self.dagcon_in_fa_subsample ids is written.
        """
        fasta_path = op.join(self.clusterInFa(cid))
        members = self.uc[cid]
        if not write_all:
            sample_size = min(self.dagcon_in_fa_subsample, len(members))
            members = random.sample(members, sample_size)
        with open(fasta_path, 'w') as out:
            for member in members:
                record = self.seq_dict[member]
                out.write(">{0}\n{1}\n".format(member, record.sequence))
        return fasta_path
Example 7
Source File: seq_parser.py    From GetOrganelle with GNU General Public License v3.0 6 votes vote down vote up
def write_fasta_with_list(out_dir, matrix, overwrite):
    """Write named sequences to a FASTA file, optionally wrapping lines.

    Args:
        out_dir: Path of the output file (a file path, despite the name).
        matrix: Indexable with three items: matrix[0] is the list of
            sequence names, matrix[1] the matching list of sequences (each
            an iterable of string pieces that are joined with ''), and
            matrix[2] the line width. A falsy width disables wrapping.
        overwrite: When false and the path already exists, '_' is appended
            to the file stem until an unused name is found.
    """
    if not overwrite:
        # splitext only looks at the last path component, so a name without
        # an extension becomes 'name_' instead of having '_.' put in front
        # of the whole path.
        while os.path.exists(out_dir):
            stem, ext = os.path.splitext(out_dir)
            out_dir = stem + '_' + ext
    names, seqs, width = matrix[0], matrix[1], matrix[2]
    with open(out_dir, 'w') as fasta_file:
        for i, name in enumerate(names):
            seq = seqs[i]
            fasta_file.write('>' + name + '\n')
            if width:
                # max(..., 1) keeps one (empty) line for an empty sequence,
                # as the unwrapped branch does.
                for start in range(0, max(len(seq), 1), width):
                    fasta_file.write(''.join(seq[start:start + width]) + '\n')
            else:
                fasta_file.write(''.join(seq) + '\n')


# deprecated since GetOrganelle 1.6.3 
Example 8
Source File: check_annotations.py    From GetOrganelle with GNU General Public License v3.0 6 votes vote down vote up
def write_fasta(out_dir, matrix, overwrite):
    """Write named sequences to a FASTA file, optionally wrapping lines.

    Args:
        out_dir: Path of the output file (a file path, despite the name).
        matrix: Indexable with three items: matrix[0] is the list of
            sequence names, matrix[1] the matching list of sequence strings,
            and matrix[2] the line width. A falsy width disables wrapping.
        overwrite: When false and the path already exists, '_' is appended
            to the file stem until an unused name is found.
    """
    if not overwrite:
        # splitext only looks at the last path component, so a name without
        # an extension becomes 'name_' instead of having '_.' put in front
        # of the whole path.
        while os.path.exists(out_dir):
            stem, ext = os.path.splitext(out_dir)
            out_dir = stem + '_' + ext
    names, seqs, width = matrix[0], matrix[1], matrix[2]
    with open(out_dir, 'w') as fasta_file:
        for i, name in enumerate(names):
            seq = seqs[i]
            fasta_file.write('>' + name + '\n')
            if width:
                # max(..., 1) keeps one (empty) line for an empty sequence,
                # as the unwrapped branch does.
                for start in range(0, max(len(seq), 1), width):
                    fasta_file.write(seq[start:start + width] + '\n')
            else:
                fasta_file.write(seq + '\n')
Example 9
Source File: stitch.py    From medaka with Mozilla Public License 2.0 5 votes vote down vote up
def write_fasta(filename, contigs):
    """Write (name, sequence) tuples to a fasta file, one record per tuple.

    :param filename: output filename.
    :param contigs: iterable of (sequence name, base sequence) tuples.

    """
    with open(filename, 'w') as handle:
        handle.writelines(
            '>{}\n{}\n'.format(contig_name, bases)
            for contig_name, bases in contigs
        )
Example 10
Source File: FileIO.py    From cDNA_Cupcake with BSD 3-Clause Clear License 5 votes vote down vote up
def write_preClusterSet_to_fasta(pCS, output_filename, fasta_d):
    """
    Write one representative record per cluster to a fasta file.

    For every cid in pCS.S a member id is picked at random from
    pCS.S[cid].members and looked up in fasta_d; the record's id becomes
    the header and its seq the sequence line.
    """
    with open(output_filename, 'w') as out:
        for cid in pCS.S:
            chosen_id = random.choice(pCS.S[cid].members)
            rep = fasta_d[chosen_id]
            out.write(">{0}\n{1}\n".format(rep.id, rep.seq))
Example 11
Source File: util.py    From picrust2 with GNU General Public License v3.0 5 votes vote down vote up
def write_fasta(seq, outfile):
    """Write a dict of sequences to a FASTA file, ordered by sequence id.

    Args:
        seq: mapping of sequence id (str) to sequence string.
        outfile: path of the file to write; it is overwritten if present.
    """
    # The context manager closes the file even if a write raises.
    with open(outfile, "w") as out_fasta:
        # Look through sequence ids (sorted alphabetically so output file is
        # reproducible).
        for s in sorted(seq.keys()):
            out_fasta.write(">" + s + "\n")
            out_fasta.write(seq[s] + "\n")
Example 12
Source File: concoct_csv_to_fasta.py    From EdwardsLab with MIT License 5 votes vote down vote up
def write_fasta_files(faf, odir, bins, maxb, verbose=False):
    """
    Read the sequences from faf and write them into a set of files in odir.

    One file bin_<i>.fna is created for every i in 0..maxb; files that end
    up with no sequences are removed again at the end.

    :param faf: The source fasta file
    :param odir: the output directory (created if it does not exist)
    :param bins: the hash of contigs -> bin
    :param maxb: the maximum bin number
    :param verbose: more output
    :return: nada
    """
    from contextlib import ExitStack

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(odir, exist_ok=True)

    bin_paths = [os.path.join(odir, f"bin_{i}.fna") for i in range(maxb + 1)]
    written_to = set()

    # ExitStack closes every bin file even when reading or writing fails.
    with ExitStack() as stack:
        outputfiles = [stack.enter_context(open(path, 'w')) for path in bin_paths]

        for fa, seq in stream_fasta(faf, True):
            # The bin lookup uses the header up to the first space.
            faid = fa.split(" ")[0]
            if faid not in bins:
                if verbose:
                    sys.stderr.write(f"Sequence {faid} not found in a bin\n")
                continue
            bin_number = bins[faid]
            outputfiles[bin_number].write(">{}\n{}\n".format(fa, seq))
            written_to.add(bin_number)

    for i, path in enumerate(bin_paths):
        if i not in written_to:
            os.remove(path)
Example 13
Source File: fasta.py    From antismash with GNU Affero General Public License v3.0 5 votes vote down vote up
def write_fasta(names: List[str], seqs: List[str], filename: str) -> None:
    """ Writes name/sequence pairs to file in FASTA format

        Names and sequences are paired positionally with zip, so surplus
        entries in the longer list are ignored.

        Arguments:
            names: a list of sequence identifiers
            seqs: a list of sequences as strings
            filename: the filename to write the FASTA formatted data to

        Returns:
            None
    """
    # The context manager closes the file even if a write raises.
    with open(filename, "w") as out_file:
        for name, seq in zip(names, seqs):
            out_file.write(">%s\n%s\n" % (name, seq))
Example 14
Source File: run_glimmerhmm.py    From antismash with GNU Affero General Public License v3.0 5 votes vote down vote up
def write_search_fasta(record: Record) -> str:
    """ Writes the record, converted to biopython form, as FASTA into a
        file named '<record.id>.fasta' in the current directory.

        Arguments:
            record: the record to write

        Returns:
            the name of the file created
    """
    filename = "{}.fasta".format(record.id)
    with open(filename, 'w') as out_handle:
        biopython_records = [record.to_biopython()]
        seqio.write(biopython_records, out_handle, 'fasta')
    return filename
Example 15
Source File: utils.py    From wgd with GNU General Public License v3.0 5 votes vote down vote up
def write_fasta(seq_dict, output_file):
    """
    Write every id/sequence pair of a dictionary to a fasta file.

    :param seq_dict: sequence dictionary, see :py:func:`read_fasta`
    :param output_file: output file name
    :return: the output file name that was passed in
    """
    with open(output_file, 'w') as handle:
        for seq_id, sequence in seq_dict.items():
            handle.write('>' + seq_id + '\n')
            handle.write(sequence + '\n')
    return output_file
Example 16
Source File: assembly.py    From dnaplotlib with MIT License 5 votes vote down vote up
def write_to_fasta(entries, col_length=20):
    """Return FASTA-formatted text for (name, sequence) pairs.

    Each sequence is split into lines of col_length characters joined by
    '\\n'; the records themselves are joined by '\\r\\n'. Nothing is written
    to disk.
    """
    records = []
    for seq_name, nts in entries:
        chunks = [
            nts[start:start + col_length]
            for start in range(0, len(nts), col_length)
        ]
        body = '\n'.join(chunks)
        records.append('>%s\n%s' % (seq_name, body))
    return '\r\n'.join(records)
Example 17
Source File: tb.py    From ariba with GNU General Public License v3.0 5 votes vote down vote up
def write_prepareref_fasta_file(outfile, gene_coords, genes_need_upstream, genes_non_upstream, upstream_before=100, upstream_after=100):
    '''Writes fasta file to be used with -f option of prepareref.

    Sequences are sliced out of the NC_000962.3 reference loaded from
    data_dir. Genes in genes_non_upstream are written from start to end
    (inclusive); genes in genes_need_upstream are written as a window
    around start, with '_upstream' appended to their id. When start is not
    smaller than end the record is reverse-complemented.'''
    seqs_by_name = {}
    ref_path = os.path.join(data_dir, 'NC_000962.3.fa.gz')
    pyfastaq.tasks.file_to_dict(ref_path, seqs_by_name)
    ref_seq = seqs_by_name['NC_000962.3']

    with open(outfile, 'w') as out:
        for gene in genes_non_upstream:
            coords = gene_coords[gene]
            start, end = coords['start'], coords['end']
            on_forward = start < end
            if on_forward:
                bases = ref_seq[start:end+1]
            else:
                bases = ref_seq[end:start+1]
            record = pyfastaq.sequences.Fasta(gene, bases)
            if not on_forward:
                record.revcomp()

            print(record, file=out)

        for gene in genes_need_upstream:
            coords = gene_coords[gene]
            start, end = coords['start'], coords['end']
            on_forward = start < end
            if on_forward:
                bases = ref_seq[start - upstream_before:start + upstream_after]
            else:
                bases = ref_seq[start - upstream_after + 1:start + upstream_before + 1]
            record = pyfastaq.sequences.Fasta(gene, bases)
            if not on_forward:
                record.revcomp()

            record.id += '_upstream'
            print(record, file=out)
Example 18
Source File: reference_data.py    From ariba with GNU General Public License v3.0 5 votes vote down vote up
def write_seqs_to_fasta(self, outfile, names):
        """Write the sequences with the given names to outfile, sorted by name.

        Each name is resolved with self.sequence(name) and the result is
        printed to the file opened by pyfastaq.utils.open_file_write.
        """
        f_out = pyfastaq.utils.open_file_write(outfile)

        try:
            for name in sorted(names):
                print(self.sequence(name), file=f_out)
        finally:
            # Close the handle even if a lookup or write fails.
            pyfastaq.utils.close(f_out)
Example 19
Source File: SequenceSearcher.py    From biskit with GNU General Public License v3.0 5 votes vote down vote up
def writeFasta( self, frecords, fastaOut ):
        """
        Create fasta file for given set of records.

        @param frecords: list of Bio.Blast.Records
        @type  frecords: [Bio.Blast.Record]
        @param fastaOut: file name
        @type  fastaOut: str
        """
        # The context manager closes the file even if formatting or
        # writing a record raises.
        with open( T.absfile(fastaOut), 'w' ) as f:
            for r in frecords:
                f.write( r.format('fasta') )  ## note better use direct SeqIO
Example 20
Source File: SequenceSearcher.py    From biskit with GNU General Public License v3.0 5 votes vote down vote up
def writeFastaClustered( self, fastaOut=None ):
        """
        Write the records returned by getClusteredRecords() to a fasta file.

        @param fastaOut: target file for the non-redundant fasta records;
                         a falsy value selects outFolder + F_FASTA_NR
                         (default: L{F_FASTA_NR})
        @type  fastaOut: str
        """
        if not fastaOut:
            fastaOut = self.outFolder + self.F_FASTA_NR

        records = self.getClusteredRecords()
        self.writeFasta( records, fastaOut )
Example 21
Source File: SequenceSearcher.py    From biskit with GNU General Public License v3.0 5 votes vote down vote up
def writeFastaAll( self, fastaOut=None ):
        """
        Write every record in self.frecords to a fasta file.

        @param fastaOut: target file for all fasta records; a falsy value
                         selects outFolder + F_FASTA_ALL
                         (default: L{F_FASTA_ALL})
        @type  fastaOut: str OR None
        """
        if not fastaOut:
            fastaOut = self.outFolder + self.F_FASTA_ALL
        self.writeFasta( self.frecords, fastaOut )
Example 22
Source File: download.py    From fauna with GNU Affero General Public License v3.0 5 votes vote down vote up
def write_fasta(self, viruses, fname, sep='|', fasta_fields=['strain', 'virus', 'accession'], **kwargs):
        """Write viruses to a FASTA file with headers built from chosen fields.

        Args:
            viruses: iterable of dict-like records; each needs a 'sequence'
                entry.
            fname: path of the file to write.
            sep: separator placed between the header fields.
            fasta_fields: field names making up the header. A field that is
                missing or None is written as '?'. The default list is only
                read, never modified.
            **kwargs: accepted and ignored.

        Exits the process with status 2 (after printing 'ERROR') when the
        file cannot be opened.
        """
        try:
            handle = open(fname, 'w')
        except IOError:
            print('ERROR'); sys.exit(2)
        else:
            # `with` closes the handle even if a record is malformed.
            with handle:
                for virus in viruses:
                    fields = [str(virus[field]) if (field in virus and virus[field] is not None) else '?' for field in fasta_fields]
                    handle.write(">"+sep.join(fields)+'\n')
                    handle.write(virus['sequence'] + "\n")
Example 23
Source File: seqUtils.py    From SqueezeMeta with GNU General Public License v3.0 5 votes vote down vote up
def writeFasta(seqs, outputFile):
    '''Write sequences to a FASTA file.

    seqs maps sequence ids to sequence strings. When outputFile ends in
    '.gz' the output is gzip-compressed.
    '''
    compressed = outputFile.endswith('.gz')
    if compressed:
        fout = gzip.open(outputFile, 'wb')
    else:
        fout = open(outputFile, 'w')

    try:
        for seqId, seq in seqs.items():
            record = '>' + seqId + '\n' + seq + '\n'
            # The gzip handle is opened in binary mode and needs bytes on
            # Python 3; the plain handle is a text file and takes str.
            fout.write(record.encode('utf-8') if compressed else record)
    finally:
        # Close the handle even when a write fails part-way through.
        fout.close()
Example 24
Source File: seed.py    From iva with GNU General Public License v3.0 5 votes vote down vote up
def write_fasta(self, filename, name):
        """Write self.seq to filename as a single FASTA record headed by name."""
        f = pyfastaq.utils.open_file_write(filename)
        try:
            print('>' + name, file=f)
            print(self.seq, file=f)
        finally:
            # Close the handle even if writing fails.
            pyfastaq.utils.close(f)
Example 25
Source File: FileIO.py    From cDNA_Cupcake with BSD 3-Clause Clear License 5 votes vote down vote up
def write_select_seqs_to_fasta(fasta_filename, seqids, output_filename, mode='w'):
    """
    Copy the selected records of a fasta file into another file.

    fasta_filename  --- source fasta, opened with LazyFastaReader
    seqids          --- ids of the records to write, looked up by indexing
                        the reader
    output_filename --- file to write to
    mode            --- mode used to open output_filename ('w' by default)
    """
    # Read from the file the caller named instead of a hard-coded path.
    d = LazyFastaReader(fasta_filename)
    with open(output_filename, mode) as f:
        for x in seqids:
            r = d[x]
            f.write(">{0}\n{1}\n".format(r.id, r.seq))
Example 26
Source File: FileIO.py    From cDNA_Cupcake with BSD 3-Clause Clear License 5 votes vote down vote up
def write_seqids_to_fasta(seqids, output_filename, fasta_d):
    """
    Write the records for the given sequence ids to a fasta file.

    Each id is looked up in fasta_d; the record's id is written as the
    header and its seq as the sequence line.
    """
    with open(output_filename, 'w') as out:
        records = (fasta_d[seqid] for seqid in seqids)
        out.writelines(
            ">{0}\n{1}\n".format(rec.id, rec.seq) for rec in records
        )
Example 27
Source File: bio.py    From Comparative-Annotation-Toolkit with Apache License 2.0 5 votes vote down vote up
def write_fasta(path_or_handle, name, seq, chunk_size=100, validate=None):
    """Write one FASTA record, wrapping the sequence at chunk_size characters.

    :param path_or_handle: a path (str), which is opened with opengz and
        closed again here (per the original note, a path ending in gz is
        gzipped), or an already open writable handle, which is left open.
    :param name: header text written after '>'.
    :param seq: the sequence as a string.
    :param chunk_size: number of characters per sequence line.
    :param validate: None to skip validation, 'DNA' or 'protein' to check
        seq against that alphabet. Any other value validates against an
        empty alphabet, so every non-empty sequence is rejected.
    :raises RuntimeError: if seq is not a string, or validation finds
        characters outside the alphabet. Both checks run before anything
        is opened or written.
    """
    # Compare by value: identity (`is`) on string literals only works by
    # accident of interning.
    if validate == 'DNA':
        # IUPAC nucleotide codes, including R (purine).
        valid_chars = set('ACGTURYSWKMBDHVNacgturyswkmbdhvn.-*')
    elif validate == 'protein':
        valid_chars = set('ABCDEFGHIKLMPQSRTVWXYZUabcdefghiklmpqsrtvwxyzuNn.-*')
    else:
        valid_chars = set()
    # Explicit raises instead of assert, which is stripped under -O.
    if not isinstance(seq, str):
        raise RuntimeError("Sequence is not unicode or string")
    if validate is not None:
        bad_chars = {x for x in seq if x not in valid_chars}
        if bad_chars:
            raise RuntimeError("Invalid FASTA character(s) seen in fasta sequence: {}".format(bad_chars))
    opened_here = isinstance(path_or_handle, str)
    fh = opengz(path_or_handle, 'w') if opened_here else path_or_handle
    try:
        fh.write(">%s\n" % name)
        for i in range(0, len(seq), chunk_size):
            fh.write("%s\n" % seq[i:i+chunk_size])
    finally:
        # Only close what this function opened.
        if opened_here:
            fh.close()
Example 28
Source File: simBench.py    From V-pipe with Apache License 2.0 5 votes vote down vote up
def write_fasta(haplotype_seqs, outdir):
    """Write each haplotype to its own FASTA file and concatenate them.

    Haplotype number idx goes to <outdir>/haplotype<idx>.fasta with the
    header 'haplotype<idx>'. The per-haplotype files are then passed to
    sh.cat, whose output goes to <outdir>/haplotypes.fasta.
    """
    output_files = []
    for idx in range(len(haplotype_seqs)):
        haplotype_id = "haplotype" + str(idx)
        sequence = haplotype_seqs[idx]
        output_file = os.path.join(outdir, haplotype_id + ".fasta")
        output_files.append(output_file)

        with open(output_file, 'w') as outfile:
            outfile.write(">{}\n{}\n".format(haplotype_id, sequence))

    combined_path = os.path.join(outdir, "haplotypes.fasta")
    sh.cat(output_files, _out=combined_path)
Example 29
Source File: __main__.py    From vamb with MIT License 5 votes vote down vote up
def write_fasta(outdir, clusterspath, fastapath, contignames, contiglengths, minfasta, logfile):
    """Write FASTA bins for every cluster whose total length reaches minfasta.

    Clusters are read from clusterspath, their sizes are summed from
    contiglengths (paired with contignames), and the contigs of the
    remaining clusters are loaded from fastapath and handed to
    vamb.vambtools.write_bins, targeting <outdir>/bins. Progress and timing
    are reported through log() to logfile.
    """
    begintime = time.time()

    log('\nWriting FASTA files', logfile)
    log('Minimum FASTA size: {}'.format(minfasta), logfile, 1)

    lengthof = dict(zip(contignames, contiglengths))

    with open(clusterspath) as file:
        clusters = vamb.vambtools.read_clusters(file)

    # Keep only the clusters whose summed contig length is large enough.
    filtered_clusters = {
        cluster: contigs
        for cluster, contigs in clusters.items()
        if sum(lengthof[contig] for contig in contigs) >= minfasta
    }

    del lengthof, clusters
    keep = {contig for contigs in filtered_clusters.values() for contig in contigs}

    with vamb.vambtools.Reader(fastapath, 'rb') as file:
        fastadict = vamb.vambtools.loadfasta(file, keep=keep)

    bins_dir = os.path.join(outdir, "bins")
    vamb.vambtools.write_bins(bins_dir, filtered_clusters, fastadict, maxbins=None)

    ncontigs = sum(len(contigs) for contigs in filtered_clusters.values())
    nfiles = len(filtered_clusters)
    print('', file=logfile)
    log('Wrote {} contigs to {} FASTA files'.format(ncontigs, nfiles), logfile, 1)

    elapsed = round(time.time() - begintime, 2)
    log('Wrote FASTA in {} seconds'.format(elapsed), logfile, 1)
Example 30
Source File: trees_msa.py    From OrthoFinder with GNU General Public License v3.0 5 votes vote down vote up
def WriteSeqsToFasta(self, seqs, outFilename):
        """Write the stored sequences for the given seqs to outFilename.

        The ToString() names of seqs are ordered with self.SortSeqs. For each
        name found in self.SeqLists a header line and the stored value are
        written (the value is written as-is, with no newline added); names
        that are missing are reported with an ERROR message on stdout.
        """
        with open(outFilename, 'w') as out:
            ordered = self.SortSeqs([s.ToString() for s in seqs])
            for name in ordered:
                if name not in self.SeqLists:
                    print(("ERROR: %s not found" % name))
                    continue
                out.write(">%s\n" % name)
                out.write(self.SeqLists[name])
Example 31
Source File: trees_msa.py    From OrthoFinder with GNU General Public License v3.0 5 votes vote down vote up
def WriteFastaFiles(self, fastaWriter, ogs, idDict, qBoth):
        """Write one FASTA file per orthogroup through fastaWriter.

        The files named by GetFastaFilename(i, True) are written with new
        accessions unless the first of them already exists. When qBoth is
        true the files named by GetFastaFilename(i) are written as well.
        """
        # The results ones are now written by default after orthogroups, check they're not already there
        results_present = os.path.exists(self.GetFastaFilename(0, True))
        if not results_present:
            for index, orthogroup in enumerate(ogs):
                target = self.GetFastaFilename(index, True)
                fastaWriter.WriteSeqsToFasta_withNewAccessions(orthogroup, target, idDict)
        if not qBoth:
            return
        for index, orthogroup in enumerate(ogs):
            fastaWriter.WriteSeqsToFasta(orthogroup, self.GetFastaFilename(index))
Example 32
Source File: read_utils.py    From SVE with GNU General Public License v3.0 5 votes vote down vote up
def write_fasta_by_chrom(ss, chrom_fasta_dir, chrom_base=''):
    """Write each sequence in ss to its own .fa file and return the paths.

    The file for sequence s is chrom_fasta_dir + '/' + chrom_base + s.name
    + '.fa'; its content is produced by s.write_to_fasta_file(handle).
    """
    paths = []
    for chrom in ss:
        path = chrom_fasta_dir + '/' + chrom_base + chrom.name + '.fa'
        paths.append(path)
        with open(path, 'w') as handle:
            chrom.write_to_fasta_file(handle)
    return paths
Example 33
Source File: read_utils.py    From SVE with GNU General Public License v3.0 5 votes vote down vote up
def write_fasta_mask(M, json_path):
    """Serialise M as JSON into json_path and return True."""
    with open(json_path, 'w') as handle:
        json.dump(M, handle)
    return True

#compute an expectation given randomly distributed short reads for the RD windows (hist bins) 
Example 34
Source File: read_utils.py    From SVE with GNU General Public License v3.0 5 votes vote down vote up
def write_fasta(seqs, fasta_path):
    """Write sequence objects to fasta_path and return True.

    seqs is either a list of objects, written in list order, or a dict of
    name -> object, written in the order of the names left-padded with
    zeros to a common width. Each object writes itself through its
    write_to_fasta_file(handle) method. Any other type of seqs leaves the
    file empty.
    """
    with open(fasta_path, 'w') as fasta:
        # isinstance also accepts list/dict subclasses (e.g. OrderedDict),
        # which an exact type check would silently skip.
        if isinstance(seqs, list):
            for seq in seqs:
                seq.write_to_fasta_file(fasta)
        elif isinstance(seqs, dict):
            # Compute the padding width once instead of once per key.
            width = max(len(k) for k in seqs) if seqs else 0
            for k in sorted(seqs, key=lambda x: x.zfill(width)):
                seqs[k].write_to_fasta_file(fasta)
        return True

#ss is a HTSeq Sequence list? 
Example 35
Source File: fasta.py    From ssbio with MIT License 5 votes vote down vote up
def write_seq_as_temp_fasta(seq):
    """Write a single sequence to a FASTA file in the system temp directory.

    The sequence is cast to a SeqRecord with the id 'tempfasta' and written
    through write_fasta_file with outname 'temp' and force_rerun=True.

    Args:
        seq (str, Seq, SeqRecord): Sequence string, Biopython Seq or SeqRecord object

    Returns:
        str: Path to temporary FASTA file (located in system temporary files directory)

    """
    record = ssbio.protein.sequence.utils.cast_to_seq_record(seq, id='tempfasta')
    tmp_dir = tempfile.gettempdir()
    return write_fasta_file(seq_records=record, outname='temp', outdir=tmp_dir, force_rerun=True)
Example 36
Source File: fasta.py    From ssbio with MIT License 5 votes vote down vote up
def write_fasta_file_from_dict(indict, outname, outdir=None, outext='.faa', force_rerun=False):
    """Write a dictionary of IDs and sequence strings to a FASTA file.

    Args:
        indict: Input dictionary with keys as IDs and values as sequence strings
        outname: Name of the output file; outext is appended to it
        outdir: Directory to write into; a falsy value is replaced by ''
        outext: Extension of the FASTA file, default ".faa"
        force_rerun: Flag handed to ssbio.utils.force_rerun, which decides
            whether the file is (re)written

    Returns:
        str: Path to output FASTA file.

    """

    target_dir = outdir if outdir else ''
    outfile = ssbio.utils.outfile_maker(inname='', outname=outname, outdir=target_dir, outext=outext)

    if ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        # Cast every (id, sequence) pair to a SeqRecord before writing.
        records = [
            ssbio.protein.sequence.utils.cast_to_seq_record(seq_str, id=seq_id)
            for seq_id, seq_str in indict.items()
        ]
        SeqIO.write(records, outfile, "fasta")

    return outfile
Example 37
Source File: toilInterface.py    From Comparative-Annotation-Toolkit with Apache License 2.0 5 votes vote down vote up
def write_fasta_to_filestore(toil, fasta_local_path):
    """
    Import a fasta and its companion .gdx and .flat files into the fileStore.
    The companions are expected next to the fasta, at $path.gdx and $path.flat.
    :param toil: Toil context manager
    :param fasta_local_path: Path to local fasta to load.
    :return: Tuple of fileStore IDs for fasta, fasta_gdx, fasta_flat
    """
    local_paths = (
        fasta_local_path,
        fasta_local_path + '.gdx',
        fasta_local_path + '.flat',
    )
    fasta_file_id, gdx_file_id, flat_file_id = [
        FileID.forPath(toil.importFile('file:///' + path), path)
        for path in local_paths
    ]
    return fasta_file_id, gdx_file_id, flat_file_id
Example 38
Source File: model.py    From pmx with GNU Lesser General Public License v3.0 4 votes vote down vote up
def writeFASTA( self, filename, title = ""):
        """Write the sequence of every chain to filename in FASTA format.

        title is used for the header lines; when empty, self.title with its
        whitespace runs replaced by '_' is used. A single chain is written
        under '> <title>'; several chains are written one record each under
        '> <title>_chain_<chain id>'.
        """
        # fp.write works on Python 2 and 3 (the `print >>fp` form is
        # Python 2 only) and `with` makes sure the file gets closed.
        with open(filename,"w") as fp:
            if not title: title = '_'.join(self.title.split())
            if len(self.chains) == 1:
                fp.write('> %s\n' % title)
                fp.write('%s\n' % (self.chains[0].get_sequence(),))
            else:
                for chain in self.chains:
                    fp.write('> %s_chain_%s\n' % (title, chain.id ))
                    fp.write('%s\n' % (chain.get_sequence(),))
                
    

##     def writeGRO( self, filename, title = ''):
##         fp = open(filename,'w')
##         if self.unity == 'nm': fac = 1.
##         else: fac = 0.1
##         if not title:
##             title = self.title
##         print >>fp, title
##         print >>fp, "%5d" % len(self.atoms)
##         if self.atoms[0].v[0] != 0.000 : bVel = True
##         else: bVel = False
##         if bVel:
##             gro_format = "%8.3f%8.3f%8.3f%8.4f%8.4f%8.4f"
##         else:
##             gro_format = "%8.3f%8.3f%8.3f"
##         for atom in self.atoms:
##             resid = (atom.resnr)%100000
##             at_id = (atom.id)%100000
##             ff = "%5d%-5.5s%5.5s%5d" % (resid, atom.resname, atom.name, at_id)
##             if bVel:
##                 ff+=gro_format % (atom.x[XX]*fac, atom.x[YY]*fac, atom.x[ZZ]*fac,
##                                   atom.v[XX], atom.v[YY], atom.v[ZZ])
##             else:
##                 ff+=gro_format % (atom.x[XX]*fac, atom.x[YY]*fac, atom.x[ZZ]*fac )
##             print >>fp, ff
            
##         if self.box[XX][YY] or self.box[XX][ZZ] or self.box[YY][XX] or \
##                self.box[YY][ZZ] or self.box[ZZ][XX] or self.box[ZZ][YY]:
##             bTric = False
##             ff = "%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f"
##         else:
##             bTric = True
##             ff = "%10.5f%10.5f%10.5f"
##         if bTric:
##             print >>fp, ff % (self.box[XX][XX],self.box[YY][YY],self.box[ZZ][ZZ])
##         else:
##             print >>fp, ff % (self.box[XX][XX],self.box[YY][YY],self.box[ZZ][ZZ],
##                               self.box[XX][YY],self.box[XX][ZZ],self.box[YY][XX],
##                               self.box[YY][ZZ],self.box[ZZ][XX],self.box[ZZ][YY])
##         fp.close() 
Example 39
Source File: tree.py    From augur with GNU Affero General Public License v3.0 4 votes vote down vote up
def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
    """Write the informative sites of a compressed alignment to a FASTA file.

    For every position in compress_seq['positions'] that is not in the strip
    list, the base of each sequence is collected, falling back to the
    reference base when a sequence has no entry for that position. Ignoring
    '-' and 'N', a site is kept when it shows at least two distinct bases and
    is not the case of exactly two distinct bases where one occurs only once.

    Args:
        compress_seq: dict with the keys 'sequences', 'reference' and
            'positions'. 'sequences' maps a sequence name to something
            indexable by position that raises KeyError for positions it does
            not store (presumably only differences from the reference are
            stored -- confirm with the caller).
        alignment: path whose directory receives the output file.
        stripFile: optional file passed to load_mask_sites; the positions it
            returns are skipped.

    Returns:
        Path of the written 'informative_sites.fasta'.
    """
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    sequences = compress_seq['sequences']
    ref = compress_seq['reference']
    positions = compress_seq['positions']

    # Sites to exclude from the initial tree build are read in here, if given
    strip_pos = load_mask_sites(stripFile) if stripFile else []

    # Get sequence names
    seqNames = list(sequences.keys())

    # Check non-ref sites to see if informative
    printPositionMap = False    # If true, prints file mapping Fasta position to real position
    sites = []
    pos = []

    for key in positions:
        if key not in strip_pos:
            pattern = []
            for k in sequences.keys():
                # looping try/except is faster than list comprehension
                try:
                    pattern.append(sequences[k][key])
                except KeyError:
                    # No entry for this sequence at this position: use the reference base
                    pattern.append(ref[key])
            origPattern = list(pattern)
            if '-' in pattern or 'N' in pattern:
                # remove gaps/Ns to see if otherwise informative
                pattern = [value for value in origPattern if value != '-' and value != 'N']
            # un[0] holds the distinct bases, un[1] how often each occurs
            un = np.unique(pattern, return_counts=True)
            # If not all - or N, not all same base, and >1 differing base, append
            if len(un[0])!=0 and len(un[0])!=1 and not (len(un[0])==2 and min(un[1])==1):
                sites.append(origPattern)
                # 1-based index within the output, tab, original position
                pos.append("\t".join([str(len(pos)+1),str(key)]))

    # Rotate and convert to SeqRecord: `sites` has one row per kept site and
    # one column per sequence. np.rot90 turns it so each row is one sequence,
    # with the rows in reverse column order, hence the reversed name list.
    sites = np.asarray(sites)
    align = np.rot90(sites)
    seqNamesCorr = list(reversed(seqNames))
    toFasta = [ SeqRecord(id=seqNamesCorr[i], seq=Seq("".join(align[i])), description='') for i in range(len(sequences.keys()))]

    fasta_file = os.path.join(os.path.dirname(alignment), 'informative_sites.fasta')

    # now output this as fasta to read into raxml or iqtree
    SeqIO.write(toFasta, fasta_file, 'fasta')

    # If want a position map, print:
    if printPositionMap:
        with open(fasta_file+".positions.txt", 'w', encoding='utf-8') as the_file:
            the_file.write("\n".join(pos))

    return fasta_file