Python Bio.SeqIO.parse() Examples

The following are 30 code examples of Bio.SeqIO.parse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Bio.SeqIO , or try the search function .
Example #1
Source File: prep-genome.py    From kevlar with MIT License 7 votes vote down vote up
def main(args):
    for record in SeqIO.parse(args.infile, 'fasta'):
        if args.discard:
            if sum([1 for rx in args.discard if re.match(rx, record.id)]) > 0:
                continue

        subseqcounter = 0
        printlog(args.debug, "DEBUG: convert to upper case", record.id)
        sequence = str(record.seq).upper()
        printlog(args.debug, "DEBUG: split seq by Ns", record.id)
        subseqs = [ss for ss in re.split('[^ACGT]+', sequence) if len(ss) > args.minlength]
        printlog(args.debug, "DEBUG: print subseqs", record.id)
        for subseq in subseqs:
            subseqcounter += 1
            subid = '{:s}_chunk_{:d}'.format(record.id, subseqcounter)
            subrecord = SeqRecord(Seq(subseq), subid, '', '')
            SeqIO.write(subrecord, args.outfile, 'fasta') 
Example #2
Source File: serialize_fasta.py    From tape-neurips2019 with MIT License 6 votes vote down vote up
def fasta_to_tfrecord(filename, vocab=PFAM_VOCAB):
    proteins_to_embed = []
    for record in SeqIO.parse(filename, 'fasta'):
        sequence = record.seq
        description = record.description

        int_sequence = []
        for aa in sequence:
            if aa in string.whitespace:
                raise ValueError("whitespace found in sequence for {}".format(description))
            int_sequence.append(vocab.get(aa))

        protein_context = to_features(protein_length=len(int_sequence), id=description.encode('UTF-8'))
        protein_features = to_sequence_features(primary=int_sequence)

        example = tf.train.SequenceExample(context=protein_context, feature_lists=protein_features)
        proteins_to_embed.append(example)

    new_filename = filename.rsplit('.', maxsplit=1)[0] + '.tfrecord'

    print('Writing tfrecord file {}'.format(new_filename))

    with tf.python_io.TFRecordWriter(new_filename) as writer:
        for example in proteins_to_embed:
            writer.write(example.SerializeToString()) 
Example #3
Source File: test_mask.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_e2e_fasta_beginning_end_sites(self, fasta_file, bed_file, out_file, sequences, argparser):
        from_beginning = 3
        from_end = 1
        arg_sites = [5, 12]
        expected_removals = sorted(set(TEST_BED_SEQUENCE + [s - 1 for s in arg_sites]))
        print(expected_removals)
        args = argparser("-s %s -o %s --mask %s --mask-from-beginning %s --mask-from-end %s --mask-sites %s" % (
                         fasta_file, out_file, bed_file, from_beginning, from_end, " ".join(str(s) for s in arg_sites)))
        mask.run(args)
        output = SeqIO.parse(out_file, "fasta")
        for record in output:
            reference = str(sequences[record.id].seq)
            masked_seq = str(record.seq)
            assert masked_seq[:from_beginning] == "N" * from_beginning
            assert masked_seq[-from_end:] == "N" * from_end
            for idx, site in enumerate(masked_seq[from_beginning:-from_end], from_beginning):
                if idx in expected_removals:
                    assert site == "N"
                else:
                    assert site == reference[idx] 
Example #4
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_prepare_with_alignment_with_ref_name(self, test_file, test_seqs, existing_with_ref, existing_aln, ref_seq, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence name, no changes are made."""
        aln_outfile, seqs_outfile, _ = align.prepare([test_file,], existing_with_ref, out_file, ref_seq.id, None)
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile == existing_with_ref, "Rewrote the alignment file unexpectedly"
        # Alignment file should be unchanged
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence dropped from alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys() 
Example #5
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 6 votes vote down vote up
def test_prepare_with_alignment_with_ref_seq(self, test_file, test_seqs, existing_file, existing_aln, ref_seq, ref_file, out_file):
        """Test that, given a set of test sequences, an existing alignment, and a reference sequence, the reference
        is added to the existing alignment and no other changes are made."""
        aln_outfile, seqs_outfile, ref_name = align.prepare([test_file,], existing_file, out_file, None, ref_file)
        assert ref_name == ref_seq.id, "Didn't return strain name from refrence file"
        assert os.path.isfile(aln_outfile), "Didn't write existing alignment where it said"
        assert aln_outfile != existing_aln, "Unexpectedly overwrote existing alignment"
        # Alignment file should have the reference added
        aln_output = SeqIO.to_dict(SeqIO.parse(aln_outfile, "fasta"))
        assert aln_output[ref_seq.id].seq == ref_seq.seq, "Reference sequence not added to alignment"
        for seq in existing_aln:
            assert seq in aln_output, "Some existing alignment sequences dropped unexpectedly"
            assert aln_output[seq].seq == existing_aln[seq].seq, "Some existing alignment sequences changed unexpectedly"
        # test sequences should be unchanged
        assert os.path.isfile(seqs_outfile), "Didn't write test sequences where it said"
        seq_output = SeqIO.to_dict(SeqIO.parse(seqs_outfile, "fasta"))
        for seq in test_seqs:
            assert seq in seq_output, "Some test sequences unexpectedly dropped"
            assert seq_output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly"
        assert seq_output.keys() == test_seqs.keys() 
Example #6
Source File: bio_utils.py    From models with MIT License 6 votes vote down vote up
def read_fasta(mirna_fasta_file, mrna_fasta_file):
    handle = open(mirna_fasta_file, "rU")
    mirna_list = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    handle = open(mrna_fasta_file, "rU")
    mrna_list = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    mirna_ids = []
    mirna_sequences = []
    mrna_ids = []
    mrna_sequences = []

    for i in range(len(mirna_list)):
        mirna_ids.append(str(mirna_list[i].id))
        mirna_sequences.append(str(mirna_list[i].seq))

    for i in range(len(mrna_list)):
        mrna_ids.append(str(mrna_list[i].id))
        mrna_sequences.append(str(mrna_list[i].seq))

    return (mirna_ids, mirna_sequences, mrna_ids, mrna_sequences) 
Example #7
Source File: genomics.py    From deepchem with MIT License 6 votes vote down vote up
def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
  """
  Loads a sequence file and returns an array of one-hot sequences.

  Parameters
  ----------
  fname: str
    Filename of fasta file.
  file_type: str
    The type of file encoding to process, e.g. fasta or fastq, this
    is passed to Biopython.SeqIO.parse.
  letters: str
    The set of letters that the sequences consist of, e.g. ATCG.

  Returns
  -------
  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
  """

  from Bio import SeqIO

  sequences = SeqIO.parse(fname, file_type)
  return seq_one_hot_encode(sequences, letters) 
Example #8
Source File: util.py    From InSilicoSeq with MIT License 6 votes vote down vote up
def count_records(fasta_file):
    """Count the number of records in a fasta file and return a list of
    recods id

    Args:
        fasta_file (string): the path to a fasta file

    Returns:
        list: a list of record ids
    """
    logger = logging.getLogger(__name__)
    record_list = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        record_list.append(record.id)
    try:
        assert len(record_list) != 0
    except AssertionError as e:
        logger.error(
            'Failed to find records in genome(s) file:%s' % fasta_file)
        sys.exit(1)
    else:
        return record_list 
Example #9
Source File: ref_coord.py    From NucDiff with Mozilla Public License 2.0 6 votes vote down vote up
def READ_FASTA_ENTRY(file_name):
    fasta_sequences=[]
    sequence_name=[]
    full_names_dict={}

    sequences_dict={}

    
    if os.stat(file_name)[6]!=0: #not empty
       
        fh = open(file_name, "r")
        for record in SeqIO.parse(fh, "fasta"):
            short_name=str(record.id).split(' ')[0]
            sequences_dict[short_name]=str(record.seq)

        
    return sequences_dict



#------------------------------------------------------------------------------------------ 
Example #10
Source File: general.py    From NucDiff with Mozilla Public License 2.0 6 votes vote down vote up
def READ_FASTA_ENTRY(file_name):
    fasta_sequences=[]
    sequence_name=[]
    full_names_dict={}

    sequences_dict={}

    
    if os.stat(file_name)[6]!=0: #not empty
       
        fh = open(file_name, "r")
        for record in SeqIO.parse(fh, "fasta"):
            short_name=str(record.id).split(' ')[0]
            sequence_name.append(short_name)
            full_names_dict[short_name]=str(record.id)

            fasta_sequences.append(str(record.seq))
            sequences_dict[short_name]=str(record.seq)

        
    return sequences_dict,fasta_sequences, sequence_name, full_names_dict 
Example #11
Source File: __init__.py    From PHEnix with GNU General Public License v3.0 6 votes vote down vote up
def _read_reference(self, reference):
        """Read in the reference from the file into a dictionary.

        Parameters
        ----------
        reference: str
            Path to the reference.

        Returns
        -------
        reference: dict
            Dictionary with chromosome - sequence mapping.
        """

        if reference is None:
            self._reference = None
        else:
            self._reference = {}

            with open(reference) as reference_fp:
                for record in SeqIO.parse(reference_fp, "fasta"):
                    self._reference[record.id] = list(record.seq) 
Example #12
Source File: update.py    From funannotate with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def gbk2interlap(input):
    '''
    function to parse GBK file, construct scaffold/gene interlap dictionary and funannotate standard annotation dictionary
    '''
    inter = defaultdict(InterLap)
    Genes = {}
    with open(input, 'r') as filein:
        for record in SeqIO.parse(filein, 'genbank'):
            for f in record.features:
                if f.type == 'gene':
                    locusTag, ID, Parent = lib.getID(f, f.type)
                    start = int(f.location.nofuzzy_start)
                    end = int(f.location.nofuzzy_end)
                    inter[record.id].add((start, end, locusTag))
                lib.gb_feature_add2dict(f, record, Genes)
    return inter, Genes 
Example #13
Source File: populate.py    From phageParser with MIT License 6 votes vote down vote up
def addspacerstodict(gendict, sfile):
    spacerrecords = SeqIO.parse(sfile, 'fasta')
    for record in spacerrecords:
        accessions = record.name.split('|')
        sequence = str(record.seq)
        for acc in accessions:
            acc_elems = acc.split('_')
            order = acc_elems[-1]
            acc_id = '_'.join(acc_elems[:-1])
            try:
                if 'Spacers' in gendict[acc_id]:
                    gendict[acc_id]['Spacers'][order] = sequence
                else:
                    gendict[acc_id]['Spacers'] = {order: sequence}
            except KeyError:
                print('Error on accession id:  %s' % acc_id)
    return gendict 
Example #14
Source File: update.py    From funannotate with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def pasa_transcript2gene(input):
    # modify kallisto ouput to map gene names to each mRNA ID so you know what locus they have come from
    mRNADict = {}
    # since mRNA is unique, parse the transcript file which has mRNAID geneID in header
    with open(input, 'r') as transin:
        for line in transin:
            if line.startswith('>'):
                line = line.rstrip()
                line = line.replace('>', '')
                cols = line.split(' ')
                mRNAID = cols[0]
                geneID = cols[1]
                location = cols[-1]
                if not mRNAID in mRNADict:
                    mRNADict[mRNAID] = (geneID, location)
    return mRNADict 
Example #15
Source File: CRISPRessoCountCORE.py    From CRISPResso with GNU Affero General Public License v3.0 6 votes vote down vote up
def filter_se_fastq_by_qual(fastq_filename,output_filename=None,min_bp_quality=20,min_single_bp_quality=0):

        if fastq_filename.endswith('.gz'):
                fastq_handle=gzip.open(fastq_filename)
        else:
                fastq_handle=open(fastq_filename)

        if not output_filename:
                output_filename=fastq_filename.replace('.fastq','').replace('.gz','')+'_filtered.fastq.gz'

        try: 
            fastq_filtered_outfile=gzip.open(output_filename,'w+')

            for record in SeqIO.parse(fastq_handle, "fastq"):
                if np.array(record.letter_annotations["phred_quality"]).mean()>=min_bp_quality \
                and np.array(record.letter_annotations["phred_quality"]).min()>=min_single_bp_quality:
                    fastq_filtered_outfile.write(record.format('fastq'))
        except:
                raise Exception('Error handling the fastq_filtered_outfile')

 
        return output_filename 
Example #16
Source File: organism_name_update.py    From phageParser with MIT License 6 votes vote down vote up
def fetch_names(id_list):
    organism_names = {}

    # Doing 100 by 100 to make sure requests to NCBI are not too big
    for i in range(0, len(id_list), 100):
        j = i + 100
        if j >= len(id_list):
            j = len(id_list)

        sys.stderr.write(
            "Fetching entries from %s to %s from GenBank\n" % (i, j))
        sys.stderr.flush()
        result_handle = Entrez.efetch(db=db, rettype="gb", id=id_list[i:j])

        # Populate result per organism name
        for record in parse(result_handle, 'genbank'):
            # Using NCBI name, which should match accession number passed
            organism_names[record.name] = record.annotations['organism']

    return organism_names 
Example #17
Source File: CRISPRessoCORE.py    From CRISPResso with GNU Affero General Public License v3.0 6 votes vote down vote up
def filter_se_fastq_by_qual(fastq_filename,output_filename=None,min_bp_quality=20,min_single_bp_quality=0):

        if fastq_filename.endswith('.gz'):
                fastq_handle=gzip.open(fastq_filename)
        else:
                fastq_handle=open(fastq_filename)

        if not output_filename:
                output_filename=fastq_filename.replace('.fastq','').replace('.gz','')+'_filtered.fastq.gz'

        try:
            fastq_filtered_outfile=gzip.open(output_filename,'w+')

            for record in SeqIO.parse(fastq_handle, "fastq"):
                if np.array(record.letter_annotations["phred_quality"]).mean()>=min_bp_quality \
                and np.array(record.letter_annotations["phred_quality"]).min()>=min_single_bp_quality:
                    fastq_filtered_outfile.write(record.format('fastq'))
        except:
                raise Exception('Error handling the fastq_filtered_outfile')


        return output_filename 
Example #18
Source File: EukRep_tests.py    From EukRep with MIT License 6 votes vote down vote up
def compare_outputs(reference_db, test_db, out_name):
    reference_ids = {}
    print('Expect the following sequences to be predicted to be eukaryotic:')
    for record in SeqIO.parse(reference_db, "fasta"):
        reference_ids[record.id] = 0
        print(record.id)

    #Check for enexptected output
    print('The following sequences were predicted to be eukaryotic:')
    for record in SeqIO.parse(test_db, "fasta"):
        print(record.id)
        if record.id not in reference_ids:
            print('\nUnexpected scaffold %s in %s' % (record.id, out_name))
            return True
        else:
            reference_ids[record.id] = 1

    #Check for missing output
    for scaffold in reference_ids:
        if reference_ids[scaffold] is 0:
            print('\nMissing scaffold %s in %s' % (scaffold, out_name))
            return True
    return False 
Example #19
Source File: find_fasta_gaps.py    From smrtsv2 with MIT License 6 votes vote down vote up
def find_gaps(input_filename):
    # Load the original FASTA sequence.
    fasta = SeqIO.parse(input_filename, "fasta")

    for record in fasta:
        gap_start = None  # Not in a gap

        for i in _range(len(record)):

            if record.seq[i].upper() == "N":
                if gap_start is None:
                    gap_start = i
            else:
                if gap_start is not None:
                    print("\t".join(map(str, (record.id, gap_start, i))))
                    gap_start = None

        if gap_start is not None:
            print("\t".join(map(str, (record.id, gap_start, len(record))))) 
Example #20
Source File: pipe.py    From singlem with GNU General Public License v3.0 6 votes vote down vote up
def _align_proteins_to_hmm(self, protein_sequences, hmm_file):
        '''hmmalign proteins to hmm, and return an alignment object

        Parameters
        ----------
        protein_sequences: generator / list of tuple(name,sequence) objects
        from SeqReader().

        '''
        cmd = "hmmalign '{}' /dev/stdin".format(hmm_file)
        output = extern.run(cmd, stdin=''.join([
            ">{}\n{}\n".format(s[0], s[1]) for s in protein_sequences]))
        protein_alignment = []
        for record in SeqIO.parse(StringIO(output), 'stockholm'):
            protein_alignment.append(AlignedProteinSequence(record.name, str(record.seq)))
        if len(protein_alignment) > 0:
            logging.debug("Read in %i aligned sequences e.g. %s %s" % (
                len(protein_alignment),
                protein_alignment[0].name,
                protein_alignment[0].seq))
        else:
            logging.debug("No aligned sequences found for this HMM")
        return protein_alignment 
Example #21
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_prepare_no_alignment_with_ref_file(self, test_file, test_seqs, ref_file, ref_seq, out_file):
        _, output_fn, ref_name = align.prepare([test_file,], None, out_file, None, ref_file)
        assert ref_name == ref_seq.id, "Didn't return strain name from refrence file"
        assert os.path.isfile(output_fn), "Didn't write sequences where it said"
        output = list(SeqIO.parse(output_fn, "fasta")) # order matters
        assert output[0].id == ref_seq.id, "Reference sequence is not the first sequence in ouput file!"
        output_names = {record.name for record in output}
        assert all(name in output_names for name in test_seqs), "Some test sequences dropped unexpectedly"
        for record in output[1:]:
            assert record.seq == test_seqs[record.id].seq, "Some test sequences changed unexpectedly" 
Example #22
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_prepare_no_alignment_or_ref(self, test_file, test_seqs, out_file):
        _, output, _ = align.prepare([test_file,], None, out_file, None, None)
        assert os.path.isfile(output), "Didn't write sequences where it said"
        for name, seq in SeqIO.to_dict(SeqIO.parse(output, "fasta")).items():
            assert seq.seq == test_seqs[name].seq 
Example #23
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_prepare_no_alignment_with_ref_name(self, test_with_ref, test_seqs, ref_seq, out_file):
        _, output_fn, _ = align.prepare([test_with_ref,], None, out_file, ref_seq.id, None)
        assert os.path.isfile(output_fn), "Didn't write sequences where it said"
        output = SeqIO.to_dict(SeqIO.parse(output_fn, "fasta"))
        assert output[ref_seq.id].seq == ref_seq.seq, "Reference sequence was not added to test sequences"
        for seq in test_seqs:
            assert seq in output, "Some test sequences dropped unexpectedly"
            assert output[seq].seq == test_seqs[seq].seq, "Some test sequences changed unexpectedly" 
Example #24
Source File: processSam.py    From miRge with GNU General Public License v3.0 5 votes vote down vote up
def decorateSam(infTmp1, infTmp2, *arg):
	# The parameters of this function are as follows:
	# unmapped_mirna_SRR944031_vs_representative_seq.sam unmapped_mirna_SRR944031.fa mapped_mirna_SRR944031_vs_genome_sorted_clusters_overlap14_trimmed_orig.fa
		seqDic = {}
		for record in SeqIO.parse(infTmp2, "fasta"):
			seqDic.update({record.id:str(record.seq)})
		outf = open(infTmp1[:-4]+"_modified.sam","w")
		if len(arg) == 1:
			clusterSeqDic = {}
			for record in SeqIO.parse(arg[0], "fasta"):
				clusterSeqDic.update({record.id:str(record.seq)})
			with open(infTmp1,"r") as inf:
				for line in inf:
					if line[0] == '@':
						outf.write(line)
					else:
						miRNAName = line.strip().split('\t')[0]
						clusterRNAName = line.strip().split('\t')[2]
						if clusterRNAName in clusterSeqDic.keys():
							outf.write('\t'.join([miRNAName, miRNAName.split('_')[-1], seqDic[miRNAName], clusterSeqDic[clusterRNAName]])+'\t'+'\t'.join(line.strip().split('\t')[1:])+'\n')
						else:
							outf.write('\t'.join([miRNAName, miRNAName.split('_')[-1], seqDic[miRNAName], '*'])+'\t'+'\t'.join(line.strip().split('\t')[1:])+'\n')
		else:
			with open(infTmp1,"r") as inf:
				for line in inf:
					if line[0] == '@':
						outf.write(line)
					else:
						miRNAName = line.strip().split('\t')[0]
						outf.write('\t'.join([miRNAName, miRNAName.split('_')[-1], seqDic[miRNAName]])+'\t'+'\t'.join(line.strip().split('\t')[1:])+'\n')
		outf.close() 
Example #25
Source File: CRISPRessoCORE.py    From CRISPResso with GNU Affero General Public License v3.0 5 votes vote down vote up
def get_ids_reads_to_remove(fastq_filename,min_bp_quality=20,min_single_bp_quality=0):
    ids_to_remove=set()
    if fastq_filename.endswith('.gz'):
        fastq_handle=gzip.open(fastq_filename)
    else:
        fastq_handle=open(fastq_filename)

    for record in SeqIO.parse(fastq_handle, "fastq"):
        if np.array(record.letter_annotations["phred_quality"]).mean()<min_bp_quality \
        or np.array(record.letter_annotations["phred_quality"]).min()<min_single_bp_quality:
            ids_to_remove.add(record.id)

    return ids_to_remove 
Example #26
Source File: test_align.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def argparser():
    """Provide an easy way to test command line arguments"""
    parser = argparse.ArgumentParser()
    align.register_arguments(parser)
    def parse(args):
        return parser.parse_args(args.split(" "))
    return parse 
Example #27
Source File: test_mask.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_e2e_fasta_mask_file(self, fasta_file, mask_file, sequences, argparser):
        args = argparser("-s %s --mask %s" % (fasta_file, mask_file))
        mask.run(args)
        output = SeqIO.parse(fasta_file,"fasta")
        for record in output:
            reference = sequences[record.id].seq
            for idx, site in enumerate(record.seq):
                if idx in TEST_MASK_SEQUENCE:
                    assert site == "N"
                else:
                    assert site == reference[idx] 
Example #28
Source File: update.py    From funannotate with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def gff2interlap(input, fasta):
    '''
    function to parse GFF3 file, construct scaffold/gene interlap dictionary and funannotate standard annotation dictionary
    '''
    inter = defaultdict(InterLap)
    Genes = {}
    Genes = lib.gff2dict(input, fasta, Genes)
    for k, v in natsorted(list(Genes.items())):
        inter[v['contig']].add((v['location'][0], v['location'][1], k))
    return inter, Genes 
Example #29
Source File: test_mask.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_mask_fasta_from_beginning_and_end_too_long(self, fasta_file, out_file, beginning, end):
        mask.mask_fasta([], fasta_file, out_file, mask_from_beginning=beginning, mask_from_end=end)
        output = SeqIO.parse(out_file, "fasta")
        for record in output:
            assert record.seq == "N" * len(record.seq) 
Example #30
Source File: test_mask.py    From augur with GNU Affero General Public License v3.0 5 votes vote down vote up
def test_e2e_fasta_minimal(self, fasta_file, bed_file, sequences, argparser):
        args = argparser("-s %s --mask %s" % (fasta_file, bed_file))
        mask.run(args)
        output = SeqIO.parse(fasta_file,"fasta")
        for record in output:
            reference = sequences[record.id].seq
            for idx, site in enumerate(record.seq):
                if idx in TEST_BED_SEQUENCE:
                    assert site == "N"
                else:
                    assert site == reference[idx]