Python pysam.Fastafile() Examples

The following are 14 code examples of pysam.Fastafile(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pysam , or try the search function .
Example #1
Source File: basenji_motifs_denovo.py    From basenji with Apache License 2.0 6 votes vote down vote up
def filter_seqlets(seqlet_acts, seqlet_intervals, genome_fasta_file, end_distance=100, verbose=True):
    """ Filter seqlets by valid chromosome coordinates. """

    # read chromosome lengths
    chr_lengths = {}
    genome_fasta_open = pysam.Fastafile(genome_fasta_file)
    for chrom in genome_fasta_open.references:
        chr_lengths[chrom] = genome_fasta_open.get_reference_length(chrom)
    genome_fasta_open.close()

    # check coordinates
    filter_mask = np.zeros(len(seqlet_intervals), dtype='bool')
    for si, seq_int in enumerate(seqlet_intervals):
        left_valid = (seq_int.start > end_distance)
        right_valid = (seq_int.end + end_distance < chr_lengths[seq_int.chr])
        filter_mask[si] = left_valid and right_valid

    if verbose:
        print('Removing %d seqlets near chromosome ends.' % (len(seqlet_intervals) - filter_mask.sum()))

    # filter
    seqlet_acts = seqlet_acts[filter_mask]
    seqlet_intervals = [seq_int for si, seq_int in enumerate(seqlet_intervals) if filter_mask[si]]

    return seqlet_acts, seqlet_intervals 
Example #2
Source File: genome.py    From basenji with Apache License 2.0 6 votes vote down vote up
def load_chromosomes(genome_file):
  """ Load genome segments from either a FASTA file or
          chromosome length table. """

  # is genome_file FASTA or (chrom,start,end) table?
  file_fasta = (open(genome_file).readline()[0] == '>')

  chrom_segments = {}

  if file_fasta:
    fasta_open = pysam.Fastafile(genome_file)
    for i in range(len(fasta_open.references)):
      chrom_segments[fasta_open.references[i]] = [(0, fasta_open.lengths[i])]
    fasta_open.close()

  else:
    for line in open(genome_file):
      a = line.split()
      chrom_segments[a[0]] = [(0, int(a[1]))]

  return chrom_segments 
Example #3
Source File: test_data.py    From basenji with Apache License 2.0 6 votes vote down vote up
def test_seqs(self):
    """Test that the one hot coded sequences match."""
    # read sequence coordinates
    seqs_bed_file = '%s/sequences.bed' % self.out_dir
    seq_coords = read_seq_coords(seqs_bed_file)

    # read one hot coding from TF Records
    train_tfrs_str = '%s/tfrecords/train-0.tfr' % self.out_dir
    seqs_1hot, _ = self.read_tfrecords(train_tfrs_str)

    # open FASTA
    fasta_open = pysam.Fastafile(self.fasta_file)

    # check random sequences
    seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
    for si in seq_indexes:
      sc = seq_coords[si]

      seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end)
      seq_1hot_dna = hot1_dna(seqs_1hot[si])
      self.assertEqual(seq_fasta, seq_1hot_dna) 
Example #4
Source File: dbmatch.py    From vgraph with Apache License 2.0 6 votes vote down vote up
def match_database(args):
    """Match a genome to a database of alleles."""
    refs   = Fastafile(expanduser(args.reference))
    db     = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    format_meta, info_meta = build_new_metadata(db, sample)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            for allele_locus, allele, match in matches:
                # Annotate results of search
                status, times = translate_match(match)
                suffix = '_' + status

                for locus in allele_locus:
                    annotate_info(locus, allele, info_meta, suffix, times)
                    annotate_format(locus, allele, format_meta, suffix, times)

            for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                out.write(locus.record) 
Example #5
Source File: basenji_data_gene.py    From basenji with Apache License 2.0 5 votes vote down vote up
def sufficient_sequence(fasta_file, genes_df, seq_length, n_allowed_pct):
  """Return boolean mask specifying genes with sufficient sequence."""

  # open FASTA
  fasta_open = pysam.Fastafile(fasta_file)

  # initialize gene boolean
  gene_valid = np.ones(genes_df.shape[0], dtype='bool')

  gi = 0
  for gene in genes_df.itertuples():
    chr_len = fasta_open.get_reference_length(gene.chr)
    mid_pos = (gene.start + gene.end) // 2
    seq_start = mid_pos - seq_length//2
    seq_end = seq_start + seq_length

    # count requested N's
    n_requested = 0
    if seq_start < 0:
      n_requested += -seq_start
    if seq_end > chr_len:
      n_requested += seq_end - chr_len

    if n_requested > 0:
      if n_requested/seq_length < n_allowed_pct:                
        print('Allowing %s with %d Ns' % (gene.Index, n_requested))
      else:
        print('Skipping %s with %d Ns' % (gene.Index, n_requested))
        gene_valid[gi] = False

    gi += 1

  fasta_open.close()

  return gene_valid 
Example #6
Source File: test_data2.py    From basenji with Apache License 2.0 5 votes vote down vote up
def test_seqs(self):
    """Test that the one hot coded sequences match."""
    for gi in range(2):
      # read sequence coordinates
      seqs_bed_file = '%s/sequences%d.bed' % (self.out_dir, gi)
      seq_coords = read_seq_coords(seqs_bed_file)

      # read one hot coding from TF Records
      train_tfrs_str = '%s/tfrecords/train-%d-0.tfr' % (self.out_dir, gi)
      seqs_1hot, _, genomes = self.read_tfrecords(train_tfrs_str)

      # check genome
      self.assertEqual(len(np.unique(genomes)), 1)
      self.assertEqual(genomes[0], gi)

      # open FASTA
      fasta_open = pysam.Fastafile(self.fasta_files[gi])

      # check random sequences
      seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
      for si in seq_indexes:
        sc = seq_coords[si]

        seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper()
        seq_1hot_dna = hot1_dna(seqs_1hot[si])
        self.assertEqual(seq_fasta, seq_1hot_dna) 
Example #7
Source File: vgraph.py    From vgraph with Apache License 2.0 5 votes vote down vote up
def normalize(args):
    """Normalize variants."""
    refs = Fastafile(expanduser(args.reference))
    variants = VariantFile(args.sample)

    with VariantFile(args.output, 'w', header=variants.header) as out:
        # Create parallel locus iterator by chromosome
        for _, ref, loci in records_by_chromosome(refs, [variants], [None], args):
            loci = sort_almost_sorted(loci[0], key=NormalizedLocus.left_order_key)

            for locus in loci:
                record  = locus.record
                start   = locus.left.start
                stop    = locus.left.stop
                alleles = locus.left.alleles

                if '' in alleles:
                    pad = ref[start - 1:start]
                    start -= 1
                    alleles = [pad + a for a in alleles]

                record.alleles = alleles
                record.start   = start
                record.stop    = stop

                out.write(record) 
Example #8
Source File: svtool_to_vcf.py    From metasv with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def convert_svtool_to_vcf(file_name, sample, out_vcf, toolname, reference, sort=False, index=False):
    vcf_template_reader = get_template()
    vcf_template_reader.samples = [sample]

    vcf_fd = open(out_vcf, "w") if out_vcf is not None else sys.stdout
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    reference_handle = pysam.Fastafile(reference) if reference else None
    reference_contigs = get_contigs(reference)
    if sort:
        if not reference_contigs:
            logger.warn("Chromosomes will be sorted in lexicographic order since reference is missing")
        else:
            logger.info("Chromosomes will be sorted by the reference order")

    vcf_records = []
    for tool_record in tool_to_reader[toolname](file_name, reference_handle=reference_handle):
        vcf_record = tool_record.to_vcf_record(sample)
        if vcf_record is None:
            continue
        if sort:
            vcf_records.append(vcf_record)
        else:
            vcf_writer.write_record(vcf_record)

    if sort:
        if reference_contigs:
            contigs_order_dict = {contig.name: index for (index, contig) in enumerate(reference_contigs)}
            vcf_records.sort(
                cmp=lambda x, y: cmp((contigs_order_dict[x.CHROM], x.POS), (contigs_order_dict[y.CHROM], y.POS)))
        else:
            vcf_records.sort(cmp=lambda x, y: cmp((x.CHROM, x.POS), (y.CHROM, y.POS)))

        for vcf_record in vcf_records:
            vcf_writer.write_record(vcf_record)
    vcf_writer.close()
    if out_vcf and index:
        pysam.tabix_index(out_vcf, force=True, preset='vcf') 
Example #9
Source File: basenji_hdf5_cluster.py    From basenji with Apache License 2.0 4 votes vote down vote up
def segments_1hot(fasta_file, segments, seq_length, stride):
  """ Read and 1-hot code sequences in their segment batches.

    Args
     fasta_file: FASTA genome
     segments: list of (chrom,start,end) genomic segments to read
     seq_length: sequence length to break them into
     stride: distance to advance each sequence

    Returns:
     seqs_1hot: You know.
     seqs_segments: list of (chrom,start,end) sequence segments
    """

  # open fasta
  fasta = pysam.Fastafile(fasta_file)

  # initialize 1-hot coding list
  seqs_1hot = []

  # segment corresponding to each sequence
  seqs_segments = []

  for chrom, seg_start, seg_end in segments:
    # read sequence
    seg_seq = fasta.fetch(chrom, seg_start, seg_end)

    # break up into batchable sequences (as above in bigwig_batch)
    bstart = 0
    bend = bstart + seq_length
    while bend < len(seg_seq):
      # append
      seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))

      seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))

      # update
      bstart += stride
      bend += stride

  return np.array(seqs_1hot), seqs_segments


################################################################################ 
Example #10
Source File: basenji_hdf5_single.py    From basenji with Apache License 2.0 4 votes vote down vote up
def segments_1hot(fasta_file, segments, seq_length, stride):
  """ Read and 1-hot code sequences in their segment batches.

    Args
     fasta_file: FASTA genome
     segments: list of (chrom,start,end) genomic segments to read
     seq_length: sequence length to break them into

    Returns:
     seqs_1hot: You know.
     seqs_segments: list of (chrom,start,end) sequence segments
    """

  # open fasta
  fasta = pysam.Fastafile(fasta_file)

  # initialize 1-hot coding list
  seqs_1hot = []

  # segment corresponding to each sequence
  seqs_segments = []

  for chrom, seg_start, seg_end in segments:
    # read sequence
    seg_seq = fasta.fetch(chrom, seg_start, seg_end)

    # break up into batchable sequences (as above in bigwig_batch)
    bstart = 0
    bend = bstart + seq_length
    while bend < len(seg_seq):
      # append
      seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))

      seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))

      # update
      bstart += stride
      bend += stride

  return np.array(seqs_1hot), seqs_segments


################################################################################ 
Example #11
Source File: bam_cov.py    From basenji with Apache License 2.0 4 votes vote down vote up
def __init__(self,
               chrom_lengths,
               stranded=False,
               smooth_sd=32,
               clip_max=None,
               clip_max_multi=2,
               shift_center=False,
               shift_forward=0,
               shift_reverse=0,
               mapq_t=1,
               fasta_file=None):

    self.stranded = stranded
    if self.stranded:
      # model + and - strand of each chromosome
      self.chrom_lengths = OrderedDict()
      for chrom, clen in chrom_lengths.items():
        self.chrom_lengths['%s+'%chrom] = clen
      for chrom, clen in chrom_lengths.items():
        self.chrom_lengths['%s-'%chrom] = clen

    else:
      self.chrom_lengths = chrom_lengths

    self.genome_length = sum(self.chrom_lengths.values())
    self.unique_counts = np.zeros(self.genome_length, dtype='uint8')
    self.active_blocks = None

    self.smooth_sd = smooth_sd

    self.shift_center = shift_center
    self.shift_forward = shift_forward
    self.shift_reverse = shift_reverse

    self.clip_max = clip_max
    self.clip_max_multi = clip_max_multi

    self.adaptive_cdf = 0.01
    self.adaptive_t = {}

    self.mapq_t = mapq_t

    self.fasta = None
    if fasta_file is not None:
      self.fasta = pysam.Fastafile(fasta_file)

    self.gc_model = None 
Example #12
Source File: hdf5_bed.py    From basenji with Apache License 2.0 4 votes vote down vote up
def main():
  usage = 'usage: %prog [options] <hdf5_file> <bed_file>'
  parser = OptionParser(usage)
  parser.add_option(
      '-f',
      dest='fasta_file',
      default='%s/assembly/hg19.fa' % os.environ['HG19'],
      help='FASTA file [Default: %default]')
  parser.add_option(
      '-n',
      dest='check_num',
      default=100,
      type='int',
      help='Number of sequences to check [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide HDF5 and BED files')
  else:
    hdf5_file = args[0]
    bed_file = args[1]

  fasta = pysam.Fastafile(options.fasta_file)
  hdf5_in = h5py.File(hdf5_file)

  si = 0
  for line in open(bed_file):
    a = line.split()
    if a[-1] == 'train':
      chrom = a[0]
      start = int(a[1])
      end = int(a[2])

      bed_seq = fasta.fetch(chrom, start, end).upper()
      hdf5_seq = basenji.dna_io.hot1_dna(hdf5_in['train_in'][si:si + 1])[0]

      print(bed_seq[:10], len(bed_seq))
      assert (bed_seq == hdf5_seq)

    si += 1
    if si > options.check_num:
      break


################################################################################
# __main__
################################################################################ 
Example #13
Source File: dbmatch.py    From vgraph with Apache License 2.0 4 votes vote down vote up
def match_database2(args):
    """Match a genome to a database of alleles."""
    refs   = Fastafile(expanduser(args.reference))
    db     = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    try:
        sample_name = sample.header.samples[args.name]
    except TypeError:
        sample_name = args.name

    if db.index is None:
        raise ValueError('database file must be indexed')
    if sample.index is None:
        raise ValueError('sample file must be indexed')

    # Open tabluar output file, if requested
    table = None
    if args.table:
        tablefile = open(args.table, 'w') if args.table != '-' else sys.stdout
        table = csv.writer(tablefile, delimiter='\t', lineterminator='\n')
        write_table_header(table)

    update_info_header(sample.header)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            clear_info_fields(superlocus)

            for allele_locus, allele, match in matches:
                dbvar  = allele.record
                var_id = dbvar.id or f'{dbvar.chrom}_{dbvar.start+1}_{dbvar.stop}_{dbvar.alts[0]}'

                status, times = translate_match(match)

                for locus in allele_locus:
                    info = locus.record.info
                    info[status] = info.get(status, ()) + (var_id, ) * times

                write_table_row(table, sample_name, var_id, allele_locus, status, match)

            for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                out.write(locus.record) 
Example #14
Source File: repmatch.py    From vgraph with Apache License 2.0 4 votes vote down vote up
def match_replicates(args):
    """Match a genome against another presumably identical genome (i.e. replicates)."""
    refs     = Fastafile(expanduser(args.reference))
    in_vars  = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    out_vars = make_outputs(in_vars, args.out1, args.out2)

    match_status_map = {True: '=', False: 'X', None: '.'}

    # Create parallel locus iterator by chromosome
    for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
        # Create superloci by taking the union of overlapping loci across all of the locus streams
        loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci]
        superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

        # Proceed by superlocus
        for _, _, (super1, super2) in superloci:
            super1.sort(key=NormalizedLocus.natural_order_key)
            super2.sort(key=NormalizedLocus.natural_order_key)

            super_start, super_stop = get_superlocus_bounds([super1, super2])

            print('-' * 80)
            print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
            print()

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    lstart = locus.start
                    lstop = locus.stop
                    lref = locus.ref or '-'
                    indices = locus.allele_indices
                    sep = '|' if locus.phased else '/'
                    geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices)
                    print(f'  NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}')
            print()

            match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug)
            match_status = match_status_map[match]

            print(f'    MATCH={match_status} TYPE={match_type}')
            print()

            write_match(out_vars[0], super1, args.name1, match_status, match_type)
            write_match(out_vars[1], super2, args.name2, match_status, match_type)

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    print(f'  VCF{i:d}: {locus.record}', end='')
            print()

    for out_var in out_vars:
        if out_var is not None:
            out_var.close()