Python pysam.Fastafile() Examples
The following are 14
code examples of pysam.Fastafile().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pysam
, or try the search function
.
Example #1
Source File: basenji_motifs_denovo.py From basenji with Apache License 2.0 | 6 votes |
def filter_seqlets(seqlet_acts, seqlet_intervals, genome_fasta_file, end_distance=100, verbose=True): """ Filter seqlets by valid chromosome coordinates. """ # read chromosome lengths chr_lengths = {} genome_fasta_open = pysam.Fastafile(genome_fasta_file) for chrom in genome_fasta_open.references: chr_lengths[chrom] = genome_fasta_open.get_reference_length(chrom) genome_fasta_open.close() # check coordinates filter_mask = np.zeros(len(seqlet_intervals), dtype='bool') for si, seq_int in enumerate(seqlet_intervals): left_valid = (seq_int.start > end_distance) right_valid = (seq_int.end + end_distance < chr_lengths[seq_int.chr]) filter_mask[si] = left_valid and right_valid if verbose: print('Removing %d seqlets near chromosome ends.' % (len(seqlet_intervals) - filter_mask.sum())) # filter seqlet_acts = seqlet_acts[filter_mask] seqlet_intervals = [seq_int for si, seq_int in enumerate(seqlet_intervals) if filter_mask[si]] return seqlet_acts, seqlet_intervals
Example #2
Source File: genome.py From basenji with Apache License 2.0 | 6 votes |
def load_chromosomes(genome_file): """ Load genome segments from either a FASTA file or chromosome length table. """ # is genome_file FASTA or (chrom,start,end) table? file_fasta = (open(genome_file).readline()[0] == '>') chrom_segments = {} if file_fasta: fasta_open = pysam.Fastafile(genome_file) for i in range(len(fasta_open.references)): chrom_segments[fasta_open.references[i]] = [(0, fasta_open.lengths[i])] fasta_open.close() else: for line in open(genome_file): a = line.split() chrom_segments[a[0]] = [(0, int(a[1]))] return chrom_segments
Example #3
Source File: test_data.py From basenji with Apache License 2.0 | 6 votes |
def test_seqs(self): """Test that the one hot coded sequences match.""" # read sequence coordinates seqs_bed_file = '%s/sequences.bed' % self.out_dir seq_coords = read_seq_coords(seqs_bed_file) # read one hot coding from TF Records train_tfrs_str = '%s/tfrecords/train-0.tfr' % self.out_dir seqs_1hot, _ = self.read_tfrecords(train_tfrs_str) # open FASTA fasta_open = pysam.Fastafile(self.fasta_file) # check random sequences seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32) for si in seq_indexes: sc = seq_coords[si] seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end) seq_1hot_dna = hot1_dna(seqs_1hot[si]) self.assertEqual(seq_fasta, seq_1hot_dna)
Example #4
Source File: dbmatch.py From vgraph with Apache License 2.0 | 6 votes |
def match_database(args): """Match a genome to a database of alleles.""" refs = Fastafile(expanduser(args.reference)) db = VariantFile(expanduser(args.database)) sample = VariantFile(expanduser(args.sample)) format_meta, info_meta = build_new_metadata(db, sample) with VariantFile(args.output, 'w', header=sample.header) as out: for superlocus, matches in generate_matches(refs, sample, db, args): for allele_locus, allele, match in matches: # Annotate results of search status, times = translate_match(match) suffix = '_' + status for locus in allele_locus: annotate_info(locus, allele, info_meta, suffix, times) annotate_format(locus, allele, format_meta, suffix, times) for locus in sorted(superlocus, key=NormalizedLocus.record_order_key): out.write(locus.record)
Example #5
Source File: basenji_data_gene.py From basenji with Apache License 2.0 | 5 votes |
def sufficient_sequence(fasta_file, genes_df, seq_length, n_allowed_pct): """Return boolean mask specifying genes with sufficient sequence.""" # open FASTA fasta_open = pysam.Fastafile(fasta_file) # initialize gene boolean gene_valid = np.ones(genes_df.shape[0], dtype='bool') gi = 0 for gene in genes_df.itertuples(): chr_len = fasta_open.get_reference_length(gene.chr) mid_pos = (gene.start + gene.end) // 2 seq_start = mid_pos - seq_length//2 seq_end = seq_start + seq_length # count requested N's n_requested = 0 if seq_start < 0: n_requested += -seq_start if seq_end > chr_len: n_requested += seq_end - chr_len if n_requested > 0: if n_requested/seq_length < n_allowed_pct: print('Allowing %s with %d Ns' % (gene.Index, n_requested)) else: print('Skipping %s with %d Ns' % (gene.Index, n_requested)) gene_valid[gi] = False gi += 1 fasta_open.close() return gene_valid
Example #6
Source File: test_data2.py From basenji with Apache License 2.0 | 5 votes |
def test_seqs(self): """Test that the one hot coded sequences match.""" for gi in range(2): # read sequence coordinates seqs_bed_file = '%s/sequences%d.bed' % (self.out_dir, gi) seq_coords = read_seq_coords(seqs_bed_file) # read one hot coding from TF Records train_tfrs_str = '%s/tfrecords/train-%d-0.tfr' % (self.out_dir, gi) seqs_1hot, _, genomes = self.read_tfrecords(train_tfrs_str) # check genome self.assertEqual(len(np.unique(genomes)), 1) self.assertEqual(genomes[0], gi) # open FASTA fasta_open = pysam.Fastafile(self.fasta_files[gi]) # check random sequences seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32) for si in seq_indexes: sc = seq_coords[si] seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper() seq_1hot_dna = hot1_dna(seqs_1hot[si]) self.assertEqual(seq_fasta, seq_1hot_dna)
Example #7
Source File: vgraph.py From vgraph with Apache License 2.0 | 5 votes |
def normalize(args): """Normalize variants.""" refs = Fastafile(expanduser(args.reference)) variants = VariantFile(args.sample) with VariantFile(args.output, 'w', header=variants.header) as out: # Create parallel locus iterator by chromosome for _, ref, loci in records_by_chromosome(refs, [variants], [None], args): loci = sort_almost_sorted(loci[0], key=NormalizedLocus.left_order_key) for locus in loci: record = locus.record start = locus.left.start stop = locus.left.stop alleles = locus.left.alleles if '' in alleles: pad = ref[start - 1:start] start -= 1 alleles = [pad + a for a in alleles] record.alleles = alleles record.start = start record.stop = stop out.write(record)
Example #8
Source File: svtool_to_vcf.py From metasv with BSD 2-Clause "Simplified" License | 5 votes |
def convert_svtool_to_vcf(file_name, sample, out_vcf, toolname, reference, sort=False, index=False): vcf_template_reader = get_template() vcf_template_reader.samples = [sample] vcf_fd = open(out_vcf, "w") if out_vcf is not None else sys.stdout vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader) reference_handle = pysam.Fastafile(reference) if reference else None reference_contigs = get_contigs(reference) if sort: if not reference_contigs: logger.warn("Chromosomes will be sorted in lexicographic order since reference is missing") else: logger.info("Chromosomes will be sorted by the reference order") vcf_records = [] for tool_record in tool_to_reader[toolname](file_name, reference_handle=reference_handle): vcf_record = tool_record.to_vcf_record(sample) if vcf_record is None: continue if sort: vcf_records.append(vcf_record) else: vcf_writer.write_record(vcf_record) if sort: if reference_contigs: contigs_order_dict = {contig.name: index for (index, contig) in enumerate(reference_contigs)} vcf_records.sort( cmp=lambda x, y: cmp((contigs_order_dict[x.CHROM], x.POS), (contigs_order_dict[y.CHROM], y.POS))) else: vcf_records.sort(cmp=lambda x, y: cmp((x.CHROM, x.POS), (y.CHROM, y.POS))) for vcf_record in vcf_records: vcf_writer.write_record(vcf_record) vcf_writer.close() if out_vcf and index: pysam.tabix_index(out_vcf, force=True, preset='vcf')
Example #9
Source File: basenji_hdf5_cluster.py From basenji with Apache License 2.0 | 4 votes |
def segments_1hot(fasta_file, segments, seq_length, stride): """ Read and 1-hot code sequences in their segment batches. Args fasta_file: FASTA genome segments: list of (chrom,start,end) genomic segments to read seq_length: sequence length to break them into stride: distance to advance each sequence Returns: seqs_1hot: You know. seqs_segments: list of (chrom,start,end) sequence segments """ # open fasta fasta = pysam.Fastafile(fasta_file) # initialize 1-hot coding list seqs_1hot = [] # segment corresponding to each sequence seqs_segments = [] for chrom, seg_start, seg_end in segments: # read sequence seg_seq = fasta.fetch(chrom, seg_start, seg_end) # break up into batchable sequences (as above in bigwig_batch) bstart = 0 bend = bstart + seq_length while bend < len(seg_seq): # append seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend])) seqs_segments.append((chrom, seg_start + bstart, seg_start + bend)) # update bstart += stride bend += stride return np.array(seqs_1hot), seqs_segments ################################################################################
Example #10
Source File: basenji_hdf5_single.py From basenji with Apache License 2.0 | 4 votes |
def segments_1hot(fasta_file, segments, seq_length, stride): """ Read and 1-hot code sequences in their segment batches. Args fasta_file: FASTA genome segments: list of (chrom,start,end) genomic segments to read seq_length: sequence length to break them into Returns: seqs_1hot: You know. seqs_segments: list of (chrom,start,end) sequence segments """ # open fasta fasta = pysam.Fastafile(fasta_file) # initialize 1-hot coding list seqs_1hot = [] # segment corresponding to each sequence seqs_segments = [] for chrom, seg_start, seg_end in segments: # read sequence seg_seq = fasta.fetch(chrom, seg_start, seg_end) # break up into batchable sequences (as above in bigwig_batch) bstart = 0 bend = bstart + seq_length while bend < len(seg_seq): # append seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend])) seqs_segments.append((chrom, seg_start + bstart, seg_start + bend)) # update bstart += stride bend += stride return np.array(seqs_1hot), seqs_segments ################################################################################
Example #11
Source File: bam_cov.py From basenji with Apache License 2.0 | 4 votes |
def __init__(self, chrom_lengths, stranded=False, smooth_sd=32, clip_max=None, clip_max_multi=2, shift_center=False, shift_forward=0, shift_reverse=0, mapq_t=1, fasta_file=None): self.stranded = stranded if self.stranded: # model + and - strand of each chromosome self.chrom_lengths = OrderedDict() for chrom, clen in chrom_lengths.items(): self.chrom_lengths['%s+'%chrom] = clen for chrom, clen in chrom_lengths.items(): self.chrom_lengths['%s-'%chrom] = clen else: self.chrom_lengths = chrom_lengths self.genome_length = sum(self.chrom_lengths.values()) self.unique_counts = np.zeros(self.genome_length, dtype='uint8') self.active_blocks = None self.smooth_sd = smooth_sd self.shift_center = shift_center self.shift_forward = shift_forward self.shift_reverse = shift_reverse self.clip_max = clip_max self.clip_max_multi = clip_max_multi self.adaptive_cdf = 0.01 self.adaptive_t = {} self.mapq_t = mapq_t self.fasta = None if fasta_file is not None: self.fasta = pysam.Fastafile(fasta_file) self.gc_model = None
Example #12
Source File: hdf5_bed.py From basenji with Apache License 2.0 | 4 votes |
def main(): usage = 'usage: %prog [options] <hdf5_file> <bed_file>' parser = OptionParser(usage) parser.add_option( '-f', dest='fasta_file', default='%s/assembly/hg19.fa' % os.environ['HG19'], help='FASTA file [Default: %default]') parser.add_option( '-n', dest='check_num', default=100, type='int', help='Number of sequences to check [Default: %default]') (options, args) = parser.parse_args() if len(args) != 2: parser.error('Must provide HDF5 and BED files') else: hdf5_file = args[0] bed_file = args[1] fasta = pysam.Fastafile(options.fasta_file) hdf5_in = h5py.File(hdf5_file) si = 0 for line in open(bed_file): a = line.split() if a[-1] == 'train': chrom = a[0] start = int(a[1]) end = int(a[2]) bed_seq = fasta.fetch(chrom, start, end).upper() hdf5_seq = basenji.dna_io.hot1_dna(hdf5_in['train_in'][si:si + 1])[0] print(bed_seq[:10], len(bed_seq)) assert (bed_seq == hdf5_seq) si += 1 if si > options.check_num: break ################################################################################ # __main__ ################################################################################
Example #13
Source File: dbmatch.py From vgraph with Apache License 2.0 | 4 votes |
def match_database2(args): """Match a genome to a database of alleles.""" refs = Fastafile(expanduser(args.reference)) db = VariantFile(expanduser(args.database)) sample = VariantFile(expanduser(args.sample)) try: sample_name = sample.header.samples[args.name] except TypeError: sample_name = args.name if db.index is None: raise ValueError('database file must be indexed') if sample.index is None: raise ValueError('sample file must be indexed') # Open tabluar output file, if requested table = None if args.table: tablefile = open(args.table, 'w') if args.table != '-' else sys.stdout table = csv.writer(tablefile, delimiter='\t', lineterminator='\n') write_table_header(table) update_info_header(sample.header) with VariantFile(args.output, 'w', header=sample.header) as out: for superlocus, matches in generate_matches(refs, sample, db, args): clear_info_fields(superlocus) for allele_locus, allele, match in matches: dbvar = allele.record var_id = dbvar.id or f'{dbvar.chrom}_{dbvar.start+1}_{dbvar.stop}_{dbvar.alts[0]}' status, times = translate_match(match) for locus in allele_locus: info = locus.record.info info[status] = info.get(status, ()) + (var_id, ) * times write_table_row(table, sample_name, var_id, allele_locus, status, match) for locus in sorted(superlocus, key=NormalizedLocus.record_order_key): out.write(locus.record)
Example #14
Source File: repmatch.py From vgraph with Apache License 2.0 | 4 votes |
def match_replicates(args): """Match a genome against another presumably identical genome (i.e. replicates).""" refs = Fastafile(expanduser(args.reference)) in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]] out_vars = make_outputs(in_vars, args.out1, args.out2) match_status_map = {True: '=', False: 'X', None: '.'} # Create parallel locus iterator by chromosome for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args): # Create superloci by taking the union of overlapping loci across all of the locus streams loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci] superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop')) # Proceed by superlocus for _, _, (super1, super2) in superloci: super1.sort(key=NormalizedLocus.natural_order_key) super2.sort(key=NormalizedLocus.natural_order_key) super_start, super_stop = get_superlocus_bounds([super1, super2]) print('-' * 80) print(f'{chrom}:[{super_start:d}-{super_stop:d}):') print() for i, superlocus in enumerate([super1, super2], 1): for locus in superlocus: lstart = locus.start lstop = locus.stop lref = locus.ref or '-' indices = locus.allele_indices sep = '|' if locus.phased else '/' geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices) print(f' NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}') print() match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug) match_status = match_status_map[match] print(f' MATCH={match_status} TYPE={match_type}') print() write_match(out_vars[0], super1, args.name1, match_status, match_type) write_match(out_vars[1], super2, args.name2, match_status, match_type) for i, superlocus in enumerate([super1, super2], 1): for locus in superlocus: print(f' VCF{i:d}: {locus.record}', end='') print() for out_var in out_vars: if out_var is not None: out_var.close()