Python Examples of pysam.Fastafile

Source File: basenji_motifs_denovo.py From basenji with Apache License 2.0

6 votes

def filter_seqlets(seqlet_acts, seqlet_intervals, genome_fasta_file, end_distance=100, verbose=True):
    """ Filter seqlets by valid chromosome coordinates. """

    # read chromosome lengths
    chr_lengths = {}
    genome_fasta_open = pysam.Fastafile(genome_fasta_file)
    for chrom in genome_fasta_open.references:
        chr_lengths[chrom] = genome_fasta_open.get_reference_length(chrom)
    genome_fasta_open.close()

    # check coordinates
    filter_mask = np.zeros(len(seqlet_intervals), dtype='bool')
    for si, seq_int in enumerate(seqlet_intervals):
        left_valid = (seq_int.start > end_distance)
        right_valid = (seq_int.end + end_distance < chr_lengths[seq_int.chr])
        filter_mask[si] = left_valid and right_valid

    if verbose:
        print('Removing %d seqlets near chromosome ends.' % (len(seqlet_intervals) - filter_mask.sum()))

    # filter
    seqlet_acts = seqlet_acts[filter_mask]
    seqlet_intervals = [seq_int for si, seq_int in enumerate(seqlet_intervals) if filter_mask[si]]

    return seqlet_acts, seqlet_intervals

Source File: genome.py From basenji with Apache License 2.0

6 votes

def load_chromosomes(genome_file):
  """ Load genome segments from either a FASTA file or
          chromosome length table. """

  # is genome_file FASTA or (chrom,start,end) table?
  file_fasta = (open(genome_file).readline()[0] == '>')

  chrom_segments = {}

  if file_fasta:
    fasta_open = pysam.Fastafile(genome_file)
    for i in range(len(fasta_open.references)):
      chrom_segments[fasta_open.references[i]] = [(0, fasta_open.lengths[i])]
    fasta_open.close()

  else:
    for line in open(genome_file):
      a = line.split()
      chrom_segments[a[0]] = [(0, int(a[1]))]

  return chrom_segments

Source File: test_data.py From basenji with Apache License 2.0

6 votes

def test_seqs(self):
    """Test that the one hot coded sequences match."""
    # read sequence coordinates
    seqs_bed_file = '%s/sequences.bed' % self.out_dir
    seq_coords = read_seq_coords(seqs_bed_file)

    # read one hot coding from TF Records
    train_tfrs_str = '%s/tfrecords/train-0.tfr' % self.out_dir
    seqs_1hot, _ = self.read_tfrecords(train_tfrs_str)

    # open FASTA
    fasta_open = pysam.Fastafile(self.fasta_file)

    # check random sequences
    seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
    for si in seq_indexes:
      sc = seq_coords[si]

      seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end)
      seq_1hot_dna = hot1_dna(seqs_1hot[si])
      self.assertEqual(seq_fasta, seq_1hot_dna)

Source File: dbmatch.py From vgraph with Apache License 2.0

6 votes

def match_database(args):
    """Match a genome to a database of alleles."""
    refs   = Fastafile(expanduser(args.reference))
    db     = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    format_meta, info_meta = build_new_metadata(db, sample)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            for allele_locus, allele, match in matches:
                # Annotate results of search
                status, times = translate_match(match)
                suffix = '_' + status

                for locus in allele_locus:
                    annotate_info(locus, allele, info_meta, suffix, times)
                    annotate_format(locus, allele, format_meta, suffix, times)

            for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                out.write(locus.record)

Source File: basenji_data_gene.py From basenji with Apache License 2.0

5 votes

def sufficient_sequence(fasta_file, genes_df, seq_length, n_allowed_pct):
  """Return boolean mask specifying genes with sufficient sequence."""

  # open FASTA
  fasta_open = pysam.Fastafile(fasta_file)

  # initialize gene boolean
  gene_valid = np.ones(genes_df.shape[0], dtype='bool')

  gi = 0
  for gene in genes_df.itertuples():
    chr_len = fasta_open.get_reference_length(gene.chr)
    mid_pos = (gene.start + gene.end) // 2
    seq_start = mid_pos - seq_length//2
    seq_end = seq_start + seq_length

    # count requested N's
    n_requested = 0
    if seq_start < 0:
      n_requested += -seq_start
    if seq_end > chr_len:
      n_requested += seq_end - chr_len

    if n_requested > 0:
      if n_requested/seq_length < n_allowed_pct:                
        print('Allowing %s with %d Ns' % (gene.Index, n_requested))
      else:
        print('Skipping %s with %d Ns' % (gene.Index, n_requested))
        gene_valid[gi] = False

    gi += 1

  fasta_open.close()

  return gene_valid

Source File: test_data2.py From basenji with Apache License 2.0

5 votes

def test_seqs(self):
    """Test that the one hot coded sequences match."""
    for gi in range(2):
      # read sequence coordinates
      seqs_bed_file = '%s/sequences%d.bed' % (self.out_dir, gi)
      seq_coords = read_seq_coords(seqs_bed_file)

      # read one hot coding from TF Records
      train_tfrs_str = '%s/tfrecords/train-%d-0.tfr' % (self.out_dir, gi)
      seqs_1hot, _, genomes = self.read_tfrecords(train_tfrs_str)

      # check genome
      self.assertEqual(len(np.unique(genomes)), 1)
      self.assertEqual(genomes[0], gi)

      # open FASTA
      fasta_open = pysam.Fastafile(self.fasta_files[gi])

      # check random sequences
      seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
      for si in seq_indexes:
        sc = seq_coords[si]

        seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper()
        seq_1hot_dna = hot1_dna(seqs_1hot[si])
        self.assertEqual(seq_fasta, seq_1hot_dna)

Source File: vgraph.py From vgraph with Apache License 2.0

5 votes

def normalize(args):
    """Normalize variants."""
    refs = Fastafile(expanduser(args.reference))
    variants = VariantFile(args.sample)

    with VariantFile(args.output, 'w', header=variants.header) as out:
        # Create parallel locus iterator by chromosome
        for _, ref, loci in records_by_chromosome(refs, [variants], [None], args):
            loci = sort_almost_sorted(loci[0], key=NormalizedLocus.left_order_key)

            for locus in loci:
                record  = locus.record
                start   = locus.left.start
                stop    = locus.left.stop
                alleles = locus.left.alleles

                if '' in alleles:
                    pad = ref[start - 1:start]
                    start -= 1
                    alleles = [pad + a for a in alleles]

                record.alleles = alleles
                record.start   = start
                record.stop    = stop

                out.write(record)

Source File: svtool_to_vcf.py From metasv with BSD 2-Clause "Simplified" License

5 votes

def convert_svtool_to_vcf(file_name, sample, out_vcf, toolname, reference, sort=False, index=False):
    vcf_template_reader = get_template()
    vcf_template_reader.samples = [sample]

    vcf_fd = open(out_vcf, "w") if out_vcf is not None else sys.stdout
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    reference_handle = pysam.Fastafile(reference) if reference else None
    reference_contigs = get_contigs(reference)
    if sort:
        if not reference_contigs:
            logger.warn("Chromosomes will be sorted in lexicographic order since reference is missing")
        else:
            logger.info("Chromosomes will be sorted by the reference order")

    vcf_records = []
    for tool_record in tool_to_reader[toolname](file_name, reference_handle=reference_handle):
        vcf_record = tool_record.to_vcf_record(sample)
        if vcf_record is None:
            continue
        if sort:
            vcf_records.append(vcf_record)
        else:
            vcf_writer.write_record(vcf_record)

    if sort:
        if reference_contigs:
            contigs_order_dict = {contig.name: index for (index, contig) in enumerate(reference_contigs)}
            vcf_records.sort(
                cmp=lambda x, y: cmp((contigs_order_dict[x.CHROM], x.POS), (contigs_order_dict[y.CHROM], y.POS)))
        else:
            vcf_records.sort(cmp=lambda x, y: cmp((x.CHROM, x.POS), (y.CHROM, y.POS)))

        for vcf_record in vcf_records:
            vcf_writer.write_record(vcf_record)
    vcf_writer.close()
    if out_vcf and index:
        pysam.tabix_index(out_vcf, force=True, preset='vcf')

Source File: basenji_hdf5_cluster.py From basenji with Apache License 2.0

4 votes

def segments_1hot(fasta_file, segments, seq_length, stride):
  """ Read and 1-hot code sequences in their segment batches.

    Args
     fasta_file: FASTA genome
     segments: list of (chrom,start,end) genomic segments to read
     seq_length: sequence length to break them into
     stride: distance to advance each sequence

    Returns:
     seqs_1hot: You know.
     seqs_segments: list of (chrom,start,end) sequence segments
    """

  # open fasta
  fasta = pysam.Fastafile(fasta_file)

  # initialize 1-hot coding list
  seqs_1hot = []

  # segment corresponding to each sequence
  seqs_segments = []

  for chrom, seg_start, seg_end in segments:
    # read sequence
    seg_seq = fasta.fetch(chrom, seg_start, seg_end)

    # break up into batchable sequences (as above in bigwig_batch)
    bstart = 0
    bend = bstart + seq_length
    while bend < len(seg_seq):
      # append
      seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))

      seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))

      # update
      bstart += stride
      bend += stride

  return np.array(seqs_1hot), seqs_segments


################################################################################

Source File: basenji_hdf5_single.py From basenji with Apache License 2.0

4 votes

def segments_1hot(fasta_file, segments, seq_length, stride):
  """ Read and 1-hot code sequences in their segment batches.

    Args
     fasta_file: FASTA genome
     segments: list of (chrom,start,end) genomic segments to read
     seq_length: sequence length to break them into

    Returns:
     seqs_1hot: You know.
     seqs_segments: list of (chrom,start,end) sequence segments
    """

  # open fasta
  fasta = pysam.Fastafile(fasta_file)

  # initialize 1-hot coding list
  seqs_1hot = []

  # segment corresponding to each sequence
  seqs_segments = []

  for chrom, seg_start, seg_end in segments:
    # read sequence
    seg_seq = fasta.fetch(chrom, seg_start, seg_end)

    # break up into batchable sequences (as above in bigwig_batch)
    bstart = 0
    bend = bstart + seq_length
    while bend < len(seg_seq):
      # append
      seqs_1hot.append(dna_io.dna_1hot(seg_seq[bstart:bend]))

      seqs_segments.append((chrom, seg_start + bstart, seg_start + bend))

      # update
      bstart += stride
      bend += stride

  return np.array(seqs_1hot), seqs_segments


################################################################################

Source File: bam_cov.py From basenji with Apache License 2.0

4 votes

def __init__(self,
               chrom_lengths,
               stranded=False,
               smooth_sd=32,
               clip_max=None,
               clip_max_multi=2,
               shift_center=False,
               shift_forward=0,
               shift_reverse=0,
               mapq_t=1,
               fasta_file=None):

    self.stranded = stranded
    if self.stranded:
      # model + and - strand of each chromosome
      self.chrom_lengths = OrderedDict()
      for chrom, clen in chrom_lengths.items():
        self.chrom_lengths['%s+'%chrom] = clen
      for chrom, clen in chrom_lengths.items():
        self.chrom_lengths['%s-'%chrom] = clen

    else:
      self.chrom_lengths = chrom_lengths

    self.genome_length = sum(self.chrom_lengths.values())
    self.unique_counts = np.zeros(self.genome_length, dtype='uint8')
    self.active_blocks = None

    self.smooth_sd = smooth_sd

    self.shift_center = shift_center
    self.shift_forward = shift_forward
    self.shift_reverse = shift_reverse

    self.clip_max = clip_max
    self.clip_max_multi = clip_max_multi

    self.adaptive_cdf = 0.01
    self.adaptive_t = {}

    self.mapq_t = mapq_t

    self.fasta = None
    if fasta_file is not None:
      self.fasta = pysam.Fastafile(fasta_file)

    self.gc_model = None

Source File: hdf5_bed.py From basenji with Apache License 2.0

4 votes

def main():
  usage = 'usage: %prog [options] <hdf5_file> <bed_file>'
  parser = OptionParser(usage)
  parser.add_option(
      '-f',
      dest='fasta_file',
      default='%s/assembly/hg19.fa' % os.environ['HG19'],
      help='FASTA file [Default: %default]')
  parser.add_option(
      '-n',
      dest='check_num',
      default=100,
      type='int',
      help='Number of sequences to check [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide HDF5 and BED files')
  else:
    hdf5_file = args[0]
    bed_file = args[1]

  fasta = pysam.Fastafile(options.fasta_file)
  hdf5_in = h5py.File(hdf5_file)

  si = 0
  for line in open(bed_file):
    a = line.split()
    if a[-1] == 'train':
      chrom = a[0]
      start = int(a[1])
      end = int(a[2])

      bed_seq = fasta.fetch(chrom, start, end).upper()
      hdf5_seq = basenji.dna_io.hot1_dna(hdf5_in['train_in'][si:si + 1])[0]

      print(bed_seq[:10], len(bed_seq))
      assert (bed_seq == hdf5_seq)

    si += 1
    if si > options.check_num:
      break


################################################################################
# __main__
################################################################################

Source File: dbmatch.py From vgraph with Apache License 2.0

4 votes

def match_database2(args):
    """Match a genome to a database of alleles."""
    refs   = Fastafile(expanduser(args.reference))
    db     = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    try:
        sample_name = sample.header.samples[args.name]
    except TypeError:
        sample_name = args.name

    if db.index is None:
        raise ValueError('database file must be indexed')
    if sample.index is None:
        raise ValueError('sample file must be indexed')

    # Open tabluar output file, if requested
    table = None
    if args.table:
        tablefile = open(args.table, 'w') if args.table != '-' else sys.stdout
        table = csv.writer(tablefile, delimiter='\t', lineterminator='\n')
        write_table_header(table)

    update_info_header(sample.header)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            clear_info_fields(superlocus)

            for allele_locus, allele, match in matches:
                dbvar  = allele.record
                var_id = dbvar.id or f'{dbvar.chrom}_{dbvar.start+1}_{dbvar.stop}_{dbvar.alts[0]}'

                status, times = translate_match(match)

                for locus in allele_locus:
                    info = locus.record.info
                    info[status] = info.get(status, ()) + (var_id, ) * times

                write_table_row(table, sample_name, var_id, allele_locus, status, match)

            for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                out.write(locus.record)

Source File: repmatch.py From vgraph with Apache License 2.0

4 votes

def match_replicates(args):
    """Match a genome against another presumably identical genome (i.e. replicates)."""
    refs     = Fastafile(expanduser(args.reference))
    in_vars  = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    out_vars = make_outputs(in_vars, args.out1, args.out2)

    match_status_map = {True: '=', False: 'X', None: '.'}

    # Create parallel locus iterator by chromosome
    for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
        # Create superloci by taking the union of overlapping loci across all of the locus streams
        loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci]
        superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

        # Proceed by superlocus
        for _, _, (super1, super2) in superloci:
            super1.sort(key=NormalizedLocus.natural_order_key)
            super2.sort(key=NormalizedLocus.natural_order_key)

            super_start, super_stop = get_superlocus_bounds([super1, super2])

            print('-' * 80)
            print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
            print()

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    lstart = locus.start
                    lstop = locus.stop
                    lref = locus.ref or '-'
                    indices = locus.allele_indices
                    sep = '|' if locus.phased else '/'
                    geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices)
                    print(f'  NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}')
            print()

            match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug)
            match_status = match_status_map[match]

            print(f'    MATCH={match_status} TYPE={match_type}')
            print()

            write_match(out_vars[0], super1, args.name1, match_status, match_type)
            write_match(out_vars[1], super2, args.name2, match_status, match_type)

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    print(f'  VCF{i:d}: {locus.record}', end='')
            print()

    for out_var in out_vars:
        if out_var is not None:
            out_var.close()

Python pysam.Fastafile() Examples