Python Bio.SeqIO.read() Examples

The following are 30 code examples of Bio.SeqIO.read(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the Bio.SeqIO module.
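
For orientation, here is a minimal sketch of a typical Bio.SeqIO.read() call; the file name example.fasta is only a placeholder, and the call assumes the file contains exactly one record (SeqIO.read() raises a ValueError otherwise, in which case SeqIO.parse() is the appropriate function).

from Bio import SeqIO

# Read the single record from a FASTA file (placeholder path).
record = SeqIO.read("example.fasta", "fasta")

print(record.id)          # identifier from the FASTA header line
print(len(record.seq))    # sequence length
print(record.seq[:60])    # first 60 letters of the sequence
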
Example #1
Source File: uniprot.py    From ssbio with MIT License
def seq(self):
        """Seq: Get the Seq object from the sequence file, metadata file, or in memory"""

        if self.sequence_file:
            log.debug('{}: reading sequence from sequence file {}'.format(self.id, self.sequence_path))
            tmp_sr = SeqIO.read(self.sequence_path, 'fasta')
            return tmp_sr.seq

        elif self.metadata_file:
            log.debug('{}: reading sequence from metadata file {}'.format(self.id, self.metadata_path))
            tmp_sr = SeqIO.read(self.metadata_path, 'uniprot-xml')
            return tmp_sr.seq

        else:
            if not self._seq:
                log.debug('{}: no sequence stored in memory'.format(self.id))
            else:
                log.debug('{}: reading sequence from memory'.format(self.id))

            return self._seq 
Example #2
Source File: align.py    From augur with GNU Affero General Public License v3.0
def prettify_alignment(aln):
    '''
    Converts all bases to uppercase and removes the reverse-complement prefix (_R_) added automatically by the alignment program.
    This modifies the alignment in place.

    Parameters
    ----------
    aln : Bio.Align.MultipleSeqAlignment
        Biopython Alignment
    '''
    for seq in aln:
        seq.seq = seq.seq.upper()
        # AlignIO.read repeats the ID in the name and description field
        if seq.id.startswith("_R_"):
            seq.id = seq.id[3:]
            print("Sequence \"{}\" was reverse-complemented by the alignment program.".format(seq.id))
        if seq.name.startswith("_R_"):
            seq.name = seq.name[3:]
        if seq.description.startswith("_R_"):
            seq.description = seq.description[3:] 
Example #3
Source File: intronerate.py    From HybPiper with GNU General Public License v3.0
def remove_exons(gff_filename,supercontig_filename,mode="all"):
    '''Given a supercontig and corresponding annotation, remove the exon sequences. In "intron" mode, only return sequences specifically annotated as introns'''
    exon_starts = []
    exon_ends = []
    gff = open(gff_filename).readlines()
    for line in gff:
        line = line.rstrip().split("\t")
        if len(line) > 2:
            if line[2] == "exon":
                exon_starts.append(int(line[3]))
                exon_ends.append(int(line[4]))
    supercontig = SeqIO.read(supercontig_filename,'fasta')
    exonless_contig = SeqRecord(Seq(''),id=supercontig.id)
    start = 0
    for exon in range(len(exon_starts)):
        exonless_contig += supercontig[start:exon_starts[exon]-1] 
        start = exon_ends[exon]
    exonless_contig += supercontig[start:]    
    exonless_contig.description = ''
    return exonless_contig 
Example #4
Source File: abi.py    From TBProfiler with GNU General Public License v3.0
def __init__(self,in_obj,prefix):
        if isinstance(in_obj[0],str):
            self.filenames = in_obj
            self.records = [SeqIO.read(x,'abi') for x in self.filenames]
            self.prefix = prefix
        elif isinstance(in_obj[0],SeqIO.SeqRecord):
            self.records = in_obj
            self.prefix = prefix
        self.quals = {}
        for rec in self.records:
            self.quals[rec.name] = [ord(x) for x in rec.annotations["abif_raw"]["PCON1"]]
        self.trimmed_coords = {}
        for rec in self.records:
            trim_seq = str(SeqIO.AbiIO._abi_trim(rec).seq)
            start = str(rec.seq).index(trim_seq)
            end = start+len(trim_seq)
            self.trimmed_coords[rec.name] = list(range(start,end)) 
Example #5
Source File: utils.py    From augur with GNU Affero General Public License v3.0
def load_mask_sites(mask_file):
    """Load masking sites from either a BED file or a masking file.

    Parameters
    ----------
    mask_file: str
        Path to the BED or masking file

    Returns
    -------
    list[int]
        Sorted list of unique zero-indexed sites
    """
    if mask_file.lower().endswith(".bed"):
        mask_sites = read_bed_file(mask_file)
    else:
        mask_sites = read_mask_file(mask_file)
    print("%d masking sites read from %s" % (len(mask_sites), mask_file))
    return mask_sites 
Example #6
Source File: utils.py    From augur with GNU Affero General Public License v3.0
def annotate_parents_for_tree(tree):
    """Annotate each node in the given tree with its parent.

    >>> import io
    >>> tree = Bio.Phylo.read(io.StringIO("(A, (B, C))"), "newick")
    >>> not any([hasattr(node, "parent") for node in tree.find_clades()])
    True
    >>> tree = annotate_parents_for_tree(tree)
    >>> tree.root.parent is None
    True
    >>> all([hasattr(node, "parent") for node in tree.find_clades()])
    True
    """
    tree.root.parent = None
    for node in tree.find_clades(order="level"):
        for child in node.clades:
            child.parent = node

    # Return the tree.
    return tree 
Example #7
Source File: sequence_data.py    From treetime with MIT License
def ref(self, in_ref):
        """
        Parameters
        ----------
        in_ref : file name, str, Bio.Seq.Seq, Bio.SeqRecord.SeqRecord
            reference sequence; it will be read and stored as a byte array
        """
        read_from_file=False
        if in_ref and isfile(in_ref):
            for fmt in ['fasta', 'genbank']:
                try:
                    in_ref = SeqIO.read(in_ref, fmt)
                    self.logger("SequenceData: loaded reference sequence as %s format"%fmt,1)
                    read_from_file=True
                    break
                except:
                    continue
            if not read_from_file:
                raise TypeError('SequenceData.ref: reference sequence file %s could not be parsed; only fasta and genbank formats are supported.'%in_ref)

        if in_ref:
            self._ref = seq2array(in_ref, fill_overhangs=False, word_length=self.word_length)
            self.full_length = self._ref.shape[0]
            self.compressed_to_full_sequence_map = None
            self.multiplicity = None 
Example #8
Source File: SequenceSearcher.py    From biskit with GNU General Public License v3.0
def __copyFileHandle(self, result_handle, fname ):
        """
        Copy the blast result to a file. The input file handle will be
        empty afterwards! Use the returned string handle instead.
    
        @param result_handle: file handle returned by NCBI* blast runners
        @type  result_handle: file
        @param fname : absolute path to output file
        @type  fname : str
    
        @return: a fresh 'file' (string) handle with the output
        @rtype : cStringIO.StringI
        """
        str_results = result_handle.read()
        try:
            save_file = open( fname, 'w' )
            save_file.write( str_results )
            save_file.close()
        finally:
            return cStringIO.StringIO( str_results ) 
Example #9
Source File: fasta.py    From ssbio with MIT License
def fasta_files_equal(seq_file1, seq_file2):
    """Check equality of a FASTA file to another FASTA file

    Args:
        seq_file1: Path to a FASTA file
        seq_file2: Path to another FASTA file

    Returns:
        bool: If the sequences are the same

    """

    # Load already set representative sequence
    seq1 = SeqIO.read(open(seq_file1), 'fasta')

    # Load kegg sequence
    seq2 = SeqIO.read(open(seq_file2), 'fasta')

    # Test equality
    if str(seq1.seq) == str(seq2.seq):
        return True
    else:
        return False 
Example #10
Source File: uniprot.py    From ssbio with MIT License
def metadata_path_unset(self):
        """Copy features to memory and remove the association of the metadata file."""
        if not self.metadata_file:
            raise IOError('No metadata file to unset')

        log.debug('{}: reading from metadata file {}'.format(self.id, self.metadata_path))
        tmp_sr = SeqIO.read(self.metadata_path, 'uniprot-xml')
        tmp_feats = tmp_sr.features

        # TODO: should this be in separate unset functions?
        self.metadata_dir = None
        self.metadata_file = None
        self.features = tmp_feats

        if self.sequence_file:
            tmp_sr = tmp_sr.seq
            self.sequence_dir = None
            self.sequence_file = None
            self.seq = tmp_sr 
Example #11
Source File: uniprot.py    From ssbio with MIT License
def features(self):
        """list: Get the features from the feature file, metadata file, or in memory"""
        if self.feature_file:
            log.debug('{}: reading features from feature file {}'.format(self.id, self.feature_path))
            with open(self.feature_path) as handle:
                feats = list(GFF.parse(handle))
                if len(feats) > 1:
                    log.warning('Too many sequences in GFF')
                else:
                    return feats[0].features

        elif self.metadata_file:
            log.debug('{}: reading features from metadata file {}'.format(self.id, self.metadata_path))
            tmp_sr = SeqIO.read(self.metadata_path, 'uniprot-xml')
            return tmp_sr.features

        else:
            return self._features 
Example #12
Source File: avian_flu_upload.py    From fauna with GNU Affero General Public License v3.0
def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.grouping_upload_fields = ['vtype', 'subtype', 'lineage']
        # patterns from the subtype and lineage fields in the GISAID fasta file
        self.patterns = {('a / h1n1', 'pdm09'): ('a', 'h1n1', 'seasonal_h1n1pdm'),
                    ('a / h1n2', ''): ('a', 'h1n2', None),
                    ('a / h1n2', 'seasonal'): ('a', 'h1n2', 'seasonal_h1n2'),
                    ('a / h2n2', ''): ('a', 'h2n2', None),
                    ('a / h3n2', ''): ('a', 'h3n2', 'seasonal_h3n2'),
                    ('a / h3n2', 'seasonal'): ('a', 'h3n2', 'seasonal_h3n2'),
                    ('a / h3n3', ''): ('a', 'h3n3', None),
                    ('a / h5n1', ''): ('a', 'h5n1', None),
                    ('a / h5n6', ''): ('a', 'h5n6', None),
                    ('a / h6n1', ''): ('a', 'h6n1', None),
                    ('a / h7n1', ''): ('a', 'h7n1', None),
                    ('a / h7n2', ''): ('a', 'h7n2', None),
                    ('a / h7n3', ''): ('a', 'h7n3', None),
                    ('a / h7n7', ''): ('a', 'h7n7', None),
                    ('a / h7n9', ''): ('a', 'h7n9', None),
                    ('a / h9n2', ''): ('a', 'h9n2', None),
                    ('a / h10n7', ''): ('a', 'h10n7', None),
                    ('a / h10n8', ''): ('a', 'h10n8', None),
                    ('a / h11', ''): ('a', 'h11', None),
                    ('b / h0n0', 'victoria'): ('b', None, 'seasonal_vic'),
                    ('b / h0n0', 'yamagata'): ('b', None, 'seasonal_yam'),
                    ('b', 'victoria'): ('b', None, 'seasonal_vic'),
                    ('b', 'yamagata'): ('b', None, 'seasonal_yam'),
                    ('h5n1',''): ('a', 'h5n1', None),
                    ('h7n9',''): ('a', 'h7n9', None),
                    ('h9n2',''): ('a', 'h9n2', None)}
        self.outgroups = {lineage: SeqIO.read('source-data/'+lineage+'_outgroup.gb', 'genbank') for lineage in ['H3N2', 'H1N1pdm', 'Vic', 'Yam']}
        self.outgroup_patterns = {'H3N2': ('a', 'h3n2', 'seasonal_h3n2'),
                                  'H1N1': ('a', 'h1n1', 'seasonal_h1n1'),
                                  'H1N1pdm': ('a', 'h1n1', 'seasonal_h1n1pdm'),
                                  'Vic': ('b', None, 'seasonal_vic'),
                                  'Yam': ('b', None, 'seasonal_yam')}
        self.strain_fix_fname = "source-data/flu_strain_name_fix.tsv"
        self.location_fix_fname = "source-data/flu_location_fix.tsv"
        self.virus_to_sequence_transfer_fields = ['submission_date']
        self.fix = set() 
Example #13
Source File: SequenceSearcher.py    From biskit with GNU General Public License v3.0
def fastaRecordFromId_remote( self, id ):
        """
        experimental: fetch fasta records from remote database
        """
        from Bio import Entrez
        from Bio import SeqIO
        Entrez.email = "A.N.Other@example.com"
        if self.verbose: self.log.add_nobreak( 'r' )
        handle = Entrez.efetch(db="protein", rettype="fasta", id=id)
        frecord = SeqIO.read(handle, "fasta")
        frecord.id = str(id)
        handle.close()
        return frecord 
Example #14
Source File: flu_update.py    From fauna with GNU Affero General Public License v3.0
def __init__(self, **kwargs):
        update.__init__(self, **kwargs)
        flu_upload.__init__(self, **kwargs)
        self.outgroups = {lineage: SeqIO.read('source-data/'+lineage+'_outgroup.gb', 'genbank') for lineage in ['H3N2', 'H1N1', 'H1N1pdm', 'Vic', 'Yam']}
        self.outgroup_patterns = {'H3N2': ('a', 'h3n2', 'seasonal_h3n2'),
                                  'H1N1': ('a', 'h1n1', 'seasonal_h1n1'),
                                  'H1N1pdm': ('a', 'h1n1', 'seasonal_h1n1pdm'),
                                  'Vic': ('b', None, 'seasonal_vic'),
                                  'Yam': ('b', None, 'seasonal_yam')} 
Example #15
Source File: models.py    From mykrobe with MIT License
def _parse_genbank(self, genbank):
        d = {}
        with open(genbank, 'r') as infile:
            for feat in SeqIO.read(infile, "genbank").features:
                if feat.type == "gene":
                    try:
                        name = feat.qualifiers.get(
                            "gene",
                            feat.qualifiers.get("db_xref"))[0]
                    except TypeError:
                        pass
                    else:
                        # SeqIO converts to 0-based
                        strand = int(feat.location.strand)
                        if strand == 1:
                            forward = True
                        elif strand == -1:
                            forward = False
                        else:
                            raise ValueError("Strand must be 1 or -1")
                        d[name] = Gene(
                            name,
                            self.reference,
                            start=feat.location.start + 1,
                            end=feat.location.end,
                            forward=forward)
        return d 
Example #16
Source File: flu_upload.py    From fauna with GNU Affero General Public License v3.0
def __init__(self, **kwargs):
        upload.__init__(self, **kwargs)
        self.grouping_upload_fields = ['vtype', 'subtype', 'lineage']
        # patterns from the subtype and lineage fields in the GISAID fasta file
        self.patterns = {('a / h1n1', 'pdm09'): ('a', 'h1n1', 'seasonal_h1n1pdm'),
                    ('a / h1n2', ''): ('a', 'h1n2', None),
                    ('a / h1n2', 'seasonal'): ('a', 'h1n2', 'seasonal_h1n2'),
                    ('a / h2n2', ''): ('a', 'h2n2', None),
                    ('a / h3n2', ''): ('a', 'h3n2', 'seasonal_h3n2'),
                    ('a / h3n2', 'seasonal'): ('a', 'h3n2', 'seasonal_h3n2'),
                    ('a / h3n3', ''): ('a', 'h3n3', None),
                    ('a / h5n1', ''): ('a', 'h5n1', None),
                    ('a / h5n6', ''): ('a', 'h5n6', None),
                    ('a / h6n1', ''): ('a', 'h6n1', None),
                    ('a / h7n1', ''): ('a', 'h7n1', None),
                    ('a / h7n2', ''): ('a', 'h7n2', None),
                    ('a / h7n3', ''): ('a', 'h7n3', None),
                    ('a / h7n7', ''): ('a', 'h7n7', None),
                    ('a / h7n9', ''): ('a', 'h7n9', None),
                    ('a / h9n2', ''): ('a', 'h9n2', None),
                    ('a / h10n7', ''): ('a', 'h10n7', None),
                    ('a / h10n8', ''): ('a', 'h10n8', None),
                    ('a / h11', ''): ('a', 'h11', None),
                    ('b / h0n0', 'victoria'): ('b', None, 'seasonal_vic'),
                    ('b / h0n0', 'yamagata'): ('b', None, 'seasonal_yam'),
                    ('b', 'victoria'): ('b', None, 'seasonal_vic'),
                    ('b', 'yamagata'): ('b', None, 'seasonal_yam')}
        self.outgroups = {lineage: SeqIO.read('source-data/'+lineage+'_outgroup.gb', 'genbank') for lineage in ['H3N2', 'H1N1pdm', 'Vic', 'Yam']}
        self.outgroup_patterns = {'H3N2': ('a', 'h3n2', 'seasonal_h3n2'),
                                  'H1N1': ('a', 'h1n1', 'seasonal_h1n1'),
                                  'H1N1pdm': ('a', 'h1n1', 'seasonal_h1n1pdm'),
                                  'Vic': ('b', None, 'seasonal_vic'),
                                  'Yam': ('b', None, 'seasonal_yam')}
        self.strain_fix_fname = "source-data/flu_strain_name_fix.tsv"
        self.location_fix_fname = "source-data/flu_location_fix.tsv"
        self.location_label_fix_fname = "source-data/flu_fix_location_label.tsv"
        self.virus_to_sequence_transfer_fields = ['submission_date']
        self.fix = set() 
Example #17
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def get_tax_id(self, query_name):
        """to get data from ncbi taxomomy, we need to have the taxid. we can
        get that by passing the species name to esearch, which will return
        the tax id"""
        query_name = query_name.replace(' ', "+").strip()
        Entrez.email = 'A.N.Other@example.com'
        search = Entrez.esearch(term=query_name, db="taxonomy", retmode="xml")
        record = Entrez.read(search)
        return record['IdList'][0] if record['IdList'] else None 
Example #18
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def fetchContentsByIDs(self, IDs, base=None, proportion=None, processSig=None):
        contents = ""
        for num, ID in enumerate(IDs):
            ID_path = self.fetchRecordPath(ID)
            with open(ID_path, encoding="utf-8", errors='ignore') as f:
                contents += f.read()
            if processSig:
                processSig.emit(base + (num+1)*proportion/len(IDs))
        return contents

    # def fetchIDsByContents(self, contents):
    #     '''Note: this ID is the locus ID'''
    #     rgx = re.compile(r"(?sm)LOCUS {7}(\S+).+?^//\s*?(?=LOCUS|$)")
    #     return rgx.findall(contents) 
Example #19
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def merge_file_contents(self, files, base=None, proportion=None, processSig=None):
        all_content = ""
        for num, file in enumerate(files):
            with open(file, encoding="utf-8", errors='ignore') as f:
                all_content += f.read()
            if processSig:
                processSig.emit(base + (num+1)*proportion/len(files))
        return all_content 
Example #20
Source File: handleGB.py    From PhyloSuite with GNU General Public License v3.0
def fetchCBS(self):
        dict_ = {"1":"0", "2":"1", "5":"4", "9":"7"}
        out = ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"]
        if str(self.codeTable) not in dict_: return out  ## remember to change this
        code = dict_[str(self.codeTable)]
        os.chdir(self.path)
        infile = "codonW_infile.fas"
        outfile = "codonW_outfile.txt"
        blkfile = "codonW_blk.txt"
        errorfile = "codonW_error.fas"
        with open(self.path + os.sep + infile, "w", encoding="utf-8") as f:
            f.write(">seq\n%s\n"%self.seq)
        command = '"%s" "%s" "%s" "%s" -all_indices -nomenu -silent -noblk -code %s'%(self.codonW, infile, outfile, blkfile, code)
        # print(command)
        popen = self.factory.init_popen(command)
        try:
            while True:
                try:
                    out_line = popen.stdout.readline().decode("utf-8", errors='ignore')
                except UnicodeDecodeError:
                    out_line = popen.stdout.readline().decode("gbk", errors='ignore')
                if out_line == "" and popen.poll() is not None:
                    break
        except: pass
        ## read the output results
        if not os.path.exists(self.path + os.sep + outfile):
            with open(errorfile, "a", encoding="utf-8") as f2:
                f2.write(command + "\n" + self.seq + "\n")
            # print("error seq.:", self.seq)
        else:
            with open(self.path + os.sep + outfile, encoding="utf-8", errors="ignore") as f1:
                content = f1.read()
            try:
                list_ = content.split("\n")[1].split("\t")
                out = list_[5:9] + list_[11:15]
            except IndexError:
                with open(errorfile, "a", encoding="utf-8") as f2:
                    f2.write(command + "\n" + self.seq + "\n")
        for num, i in enumerate(out):
            if (not self.is_float(i)) and (not self.is_int(i)): out[num] = "NA"
        return out 
Example #21
Source File: flu_upload.py    From fauna with GNU Affero General Public License v3.0
def align_flu(self, doc, min_score_percentage=0.85, **kwargs):
        '''
        align with sequence from outgroup to determine subtype and lineage
        :return: True if determined grouping, False otherwise
        '''
        try:
            scores = []
            from Bio.Seq import Seq
            from Bio.SeqRecord import SeqRecord
            from Bio.Alphabet import IUPAC
            from Bio import AlignIO
            record = SeqRecord(Seq(doc['sequence'],
                               IUPAC.ambiguous_dna),
                               id=doc['strain'])
            for olineage, oseq in self.outgroups.items():
                SeqIO.write([oseq, record], "temp_in.fasta", "fasta")
                os.system("mafft --auto temp_in.fasta > temp_out.fasta 2>tmp")
                tmp_aln = np.array(AlignIO.read('temp_out.fasta', 'fasta'))
                scores.append((olineage, (tmp_aln[0]==tmp_aln[1]).sum()))
            scores.sort(key = lambda x:x[1], reverse=True)
            if scores[0][1]>min_score_percentage*len(record.seq):
                print("Lineage based on similarity:", scores[0][0], doc['strain'], len(record.seq), scores)
                return self.outgroup_patterns[scores[0][0]]
            else:
                print("Couldn't parse virus subtype and lineage from aligning sequence: ", doc['strain'], len(record.seq), scores)
                return None
        except:
            print("Alignment failed: " + doc['strain'])
            return None 
Example #22
Source File: sf_miscellaneous.py    From pan-genome-analysis with GNU General Public License v3.0
def gbk_seq(each_gb_path, gbk_name):
    # get sequence from gbk file
    gb = each_gb_path + gbk_name
    genome = SeqIO.read(gb,'genbank')
    allseq = genome.seq
    return allseq 
Example #23
Source File: intronerate.py    From HybPiper with GNU General Public License v3.0
def parse_gff(filename):
    '''Parse a GFF file created for a single gene, return a list of lists containing the annotation info'''
    with open(filename) as gff_file:
        gff_dump = gff_file.read()
        gff_split = gff_dump.split("# --- END OF GFF DUMP ---")
        raw_hits = [x.split('\n') for x in gff_split[:-1]]
        hits = []
        for h in raw_hits:
            new_hit = []
            for line in h:
                if not line.startswith("#") and not line == "":
                    new_hit.append(line.rstrip().split('\t'))
            hits.append(new_hit)
    return hits 
Example #24
Source File: paralog_retriever.py    From HybPiper with GNU General Public License v3.0
def retrieve_seqs(path,name,gene):
    seqs_to_write = None
    if os.path.isdir(os.path.join(path,name,gene,name,'paralogs')):
        seqs_to_write = [x for x in SeqIO.parse(os.path.join(path,name,gene,name,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta')]
        num_seqs = str(len(seqs_to_write))
    elif os.path.isfile(os.path.join(path,name,gene,name,'sequences','FNA','{}.FNA'.format(gene))):
        seqs_to_write = SeqIO.read(os.path.join(path,name,gene,name,'sequences','FNA','{}.FNA'.format(gene)),'fasta')
        num_seqs = "1"
    
    if seqs_to_write:
        SeqIO.write(seqs_to_write,sys.stdout,'fasta')
    else:
        num_seqs = "0"
    
    return num_seqs 
Example #25
Source File: depth_calculator.py    From HybPiper with GNU General Public License v3.0
def merge_seqs(genelist,prefix,file_type="coding"):
    '''Given a list of gene names, retrieve the corresponding sequences and generate a single FASTA file.'''
    with open("{}_sequences.fasta".format(file_type),"w") as outfile:
        for gene in genelist:
            if file_type == "coding":
                seqfile = "{}/{}/sequences/FNA/{}.FNA".format(gene,prefix,gene)
            else:
                seqfile = "{}/{}/sequences/intron/{}_supercontig.fasta".format(gene,prefix,gene)    
            
            seq = SeqIO.read(seqfile,'fasta')
            seq.id = seq.id + "-" + gene
            SeqIO.write(seq,outfile,'fasta') 
Example #26
Source File: align.py    From augur with GNU Affero General Public License v3.0
def read_alignment(fname):
    try:
        return AlignIO.read(fname, 'fasta')
    except Exception as error:
        raise AlignmentError("\nERROR: Problem reading in {}: {}".format(fname, str(error))) 
Example #27
Source File: align.py    From augur with GNU Affero General Public License v3.0
def read_sequences(*fnames):
    """return list of sequences from all fnames"""
    seqs = {}
    try:
        for fname in fnames:
            for record in SeqIO.parse(fname, 'fasta'):
                if record.name in seqs and record.seq != seqs[record.name].seq:
                    raise AlignmentError("Detected duplicate input strains \"%s\" but the sequences are different." % record.name)
                    # if the same sequence then we can proceed (and we only take one)
                seqs[record.name] = record
    except FileNotFoundError:
        raise AlignmentError("\nCannot read sequences -- make sure the file %s exists and contains sequences in fasta format" % fname)
    except ValueError as error:
        raise AlignmentError("\nERROR: Problem reading in {}: {}".format(fname, str(error)))
    return list(seqs.values()) 
Example #28
Source File: export_v2.py    From augur with GNU Affero General Public License v3.0
def set_description(data_json, cmd_line_description_file):
    """
    Read Markdown file provided by *cmd_line_description_file* and set
    `meta.description` in *data_json* to the text provided.
    """
    try:
        with open(cmd_line_description_file, encoding='utf-8') as description_file:
            markdown_text = description_file.read()
            data_json['meta']['description'] = markdown_text
    except FileNotFoundError:
        fatal("Provided desciption file {} does not exist".format(cmd_line_description_file)) 
Example #29
Source File: export_v2.py    From augur with GNU Affero General Public License v3.0
def get_root_sequence(root_node, ref=None, translations=None):
    '''
    create a json structure that contains the sequence of the root, both as
    nucleotide and as translations. This allows look-up of the sequence for
    all states, including those that are not variable.

    Parameters
    ----------
    root_node : dict
        data associated with the node
    ref : str, optional
        filename of the root sequence
    translations : str, optional
        file name of translations

    Returns
    -------
    dict
        dict of nucleotide sequence and translations
    '''
    root_sequence = {}
    if ref and translations:
        from Bio import SeqIO
        refseq = SeqIO.read(ref, 'fasta')
        root_sequence['nuc']=str(refseq.seq)
        for gene in SeqIO.parse(translations, 'fasta'):
            root_sequence[gene.id] = str(gene.seq)
    else:
        root_sequence["nuc"] = root_node["sequence"]
        root_sequence.update(root_node["aa_sequences"])

    return root_sequence 
Example #30
Source File: uniprot.py    From ssbio with MIT License
def metadata_path(self, m_path):
        """Provide pointers to the paths of the metadata file

        Args:
            m_path: Path to metadata file

        """
        if not m_path:
            self.metadata_dir = None
            self.metadata_file = None

        else:
            if not op.exists(m_path):
                raise OSError('{}: file does not exist!'.format(m_path))

            if not op.dirname(m_path):
                self.metadata_dir = '.'
            else:
                self.metadata_dir = op.dirname(m_path)
            self.metadata_file = op.basename(m_path)

            # Parse the metadata file using Biopython's built in SeqRecord parser
            # Just updating IDs and stuff
            tmp_sr = SeqIO.read(self.metadata_path, 'uniprot-xml')
            parsed = parse_uniprot_xml_metadata(tmp_sr)
            self.update(parsed, overwrite=True)