Python Examples of pysam.VariantFile

Source File: VCF_remove_phase.py From helen with MIT License

10 votes

def fix_vcf(input_vcf, output_vcf):
    vcf_in = pysam.VariantFile(input_vcf)
    vcf_out = pysam.VariantFile(output_vcf, 'w', header=vcf_in.header)
    records = vcf_in.fetch()
    PS_dictionary = defaultdict(str)
    PS_value = 100

    for record in records:
        for sample in record.samples:
            input_ps = str(record.samples[sample]['PS'])

            if input_ps not in PS_dictionary:
                PS_dictionary = str(PS_value)
                PS_value += 50

            record.samples[sample]['PS'] = str(PS_value)
            vcf_out.write(record)

Source File: varianttable.py From igv-reports with MIT License

6 votes

def __init__(self, vcfFile, info_columns = None, info_columns_prefixes = None, sample_columns = None):

        vcf = pysam.VariantFile(vcfFile)

        self.info_fields =  info_columns or []
        self.info_field_prefixes = info_columns_prefixes or []
        self.sample_fields = sample_columns or []
        self.variants = []
        self.features = []   #Bed-like features

        for unique_id, var in enumerate(vcf.fetch()):
            self.variants.append((var, unique_id))
            chr = var.chrom
            start = var.pos - 1
            end = start + 1       #TODO -- handle structure variants and deletions > 1 base
            self.features.append((Feature(chr, start, end, ''), unique_id))

Source File: variants.py From ga4gh-server with Apache License 2.0

6 votes

def _populateFromVariantFile(self, varFile, dataUrl, indexFile):
        """
        Populates the instance variables of this VariantSet from the specified
        pysam VariantFile object.
        """
        if varFile.index is None:
            raise exceptions.NotIndexedException(dataUrl)
        for chrom in varFile.index:
            # Unlike Tabix indices, CSI indices include all contigs defined
            # in the BCF header.  Thus we must test each one to see if
            # records exist or else they are likely to trigger spurious
            # overlapping errors.
            chrom, _, _ = self.sanitizeVariantFileFetch(chrom)
            if not isEmptyIter(varFile.fetch(chrom)):
                if chrom in self._chromFileMap:
                    raise exceptions.OverlappingVcfException(dataUrl, chrom)
            self._chromFileMap[chrom] = dataUrl, indexFile
        self._updateMetadata(varFile)
        self._updateCallSetIds(varFile)
        self._updateVariantAnnotationSets(varFile, dataUrl)

Source File: dbmatch.py From vgraph with Apache License 2.0

6 votes

def match_database(args):
    """Match a genome to a database of alleles."""
    refs   = Fastafile(expanduser(args.reference))
    db     = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    format_meta, info_meta = build_new_metadata(db, sample)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            for allele_locus, allele, match in matches:
                # Annotate results of search
                status, times = translate_match(match)
                suffix = '_' + status

                for locus in allele_locus:
                    annotate_info(locus, allele, info_meta, suffix, times)
                    annotate_format(locus, allele, format_meta, suffix, times)

            for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                out.write(locus.record)

Source File: emerald.py From basenji with Apache License 2.0

5 votes

def fetch(self, chrm, pos_start, pos_end, return_samples=False):
    vcf_file = '%s.%s.vcf.gz' % (self.pop_vcf_stem, chrm)
    vcf_open = VariantFile(vcf_file, drop_samples=(not return_samples))
    return vcf_open.fetch(chrm, pos_start, pos_end)

Source File: vcf.py From igv-reports with MIT License

5 votes

def __init__(self, path):

        self.file = pysam.VariantFile(path)

Source File: variants.py From ga4gh-server with Apache License 2.0

5 votes

def populateFromFile(self, dataUrls, indexFiles):
        """
        Populates this variant set using the specified lists of data
        files and indexes. These must be in the same order, such that
        the jth index file corresponds to the jth data file.
        """
        assert len(dataUrls) == len(indexFiles)
        for dataUrl, indexFile in zip(dataUrls, indexFiles):
            varFile = pysam.VariantFile(dataUrl, index_filename=indexFile)
            try:
                self._populateFromVariantFile(varFile, dataUrl, indexFile)
            finally:
                varFile.close()

Source File: variants.py From ga4gh-server with Apache License 2.0

5 votes

def checkConsistency(self):
        """
        Perform consistency check on the variant set
        """
        for referenceName, (dataUrl, indexFile) in self._chromFileMap.items():
            varFile = pysam.VariantFile(dataUrl, index_filename=indexFile)
            try:
                for chrom in varFile.index:
                    chrom, _, _ = self.sanitizeVariantFileFetch(chrom)
                    if not isEmptyIter(varFile.fetch(chrom)):
                        self._checkMetadata(varFile)
                        self._checkCallSetIds(varFile)
            finally:
                varFile.close()

Source File: variants.py From ga4gh-server with Apache License 2.0

5 votes

def getNumVariants(self):
        """
        Returns the total number of variants in this VariantSet.
        """
        # TODO How do we get the number of records in a VariantFile?
        return 0

Source File: variants.py From ga4gh-server with Apache License 2.0

5 votes

def openFile(self, dataUrlIndexFilePair):
        dataUrl, indexFile = dataUrlIndexFilePair
        return pysam.VariantFile(dataUrl, index_filename=indexFile)

Source File: repmatch.py From vgraph with Apache License 2.0

5 votes

def make_outputs(in_vars, out1, out2):
    """Make output files."""
    out_vars = [None, None]

    if out1:
        in_vars[0].header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)')
        in_vars[0].header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)')
        out_vars[0] = VariantFile(out1, 'w', header=in_vars[0].header)

    if out2:
        in_vars[1].header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)')
        in_vars[1].header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)')
        out_vars[1] = VariantFile(out2, 'w', header=in_vars[1].header)

    return out_vars

Source File: vgraph.py From vgraph with Apache License 2.0

5 votes

def normalize(args):
    """Normalize variants."""
    refs = Fastafile(expanduser(args.reference))
    variants = VariantFile(args.sample)

    with VariantFile(args.output, 'w', header=variants.header) as out:
        # Create parallel locus iterator by chromosome
        for _, ref, loci in records_by_chromosome(refs, [variants], [None], args):
            loci = sort_almost_sorted(loci[0], key=NormalizedLocus.left_order_key)

            for locus in loci:
                record  = locus.record
                start   = locus.left.start
                stop    = locus.left.stop
                alleles = locus.left.alleles

                if '' in alleles:
                    pad = ref[start - 1:start]
                    start -= 1
                    alleles = [pad + a for a in alleles]

                record.alleles = alleles
                record.start   = start
                record.stop    = stop

                out.write(record)

Source File: test_vcf.py From tskit with MIT License

5 votes

def ts_to_pysam(ts, *args, **kwargs):
    """
    Returns a pysam VariantFile for the specified tree sequence and arguments.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        vcf_path = os.path.join(temp_dir, "file.vcf")
        with open(vcf_path, "w") as f:
            ts.write_vcf(f, *args, **kwargs)
        yield pysam.VariantFile(vcf_path)

Source File: genotype.py From smrtsv2 with MIT License

4 votes

def vcf_header_lines(vcf_file_name):
    """
    Get header lines for the genotype output VCF from the input variant-call VCF file.

    :param vcf_file_name: Name of a variant VCF file.

    :return: A list of VCF headers as strings with newlines. Does not include the column heading line at the end
        of the headers.
    """

    header_list = list()

    with pysam.VariantFile(vcf_file_name) as vcf_file:
        # Add VCF version if missing
        header_list.append(vcf_version(vcf_file))

        # Set file date
        header_list.append('##fileDate={}\n'.format(time.strftime("%Y%m%d")))

        # Set source
        header_list.extend(vcf_get_source_list(vcf_file))
        header_list.append('##source=SMRTSV_Genotyper_{}\n'.format(smrtsvlib.__version__))

        # Get header elements excluding FORMAT tags
        for header_element in vcf_file.header.records:

            # Replace FORMAT tags
            if header_element.type == 'FORMAT':
                continue

            # Source and date handled
            if header_element.type == 'GENERIC' and header_element.key.lower() in {'fileformat', 'source', 'filedate'}:
                continue

            # Write record
            header_list.append(str(header_element))

        # Add FORMAT tags written by the genotyper
        header_list.extend(vcf_get_format_tags())

    # Return header lines
    return header_list

Source File: dbmatch.py From vgraph with Apache License 2.0

4 votes

def match_database2(args):
    """Match a genome to a database of alleles."""
    refs   = Fastafile(expanduser(args.reference))
    db     = VariantFile(expanduser(args.database))
    sample = VariantFile(expanduser(args.sample))

    try:
        sample_name = sample.header.samples[args.name]
    except TypeError:
        sample_name = args.name

    if db.index is None:
        raise ValueError('database file must be indexed')
    if sample.index is None:
        raise ValueError('sample file must be indexed')

    # Open tabluar output file, if requested
    table = None
    if args.table:
        tablefile = open(args.table, 'w') if args.table != '-' else sys.stdout
        table = csv.writer(tablefile, delimiter='\t', lineterminator='\n')
        write_table_header(table)

    update_info_header(sample.header)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        for superlocus, matches in generate_matches(refs, sample, db, args):
            clear_info_fields(superlocus)

            for allele_locus, allele, match in matches:
                dbvar  = allele.record
                var_id = dbvar.id or f'{dbvar.chrom}_{dbvar.start+1}_{dbvar.stop}_{dbvar.alts[0]}'

                status, times = translate_match(match)

                for locus in allele_locus:
                    info = locus.record.info
                    info[status] = info.get(status, ()) + (var_id, ) * times

                write_table_row(table, sample_name, var_id, allele_locus, status, match)

            for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                out.write(locus.record)

Source File: repmatch.py From vgraph with Apache License 2.0

4 votes

def match_replicates(args):
    """Match a genome against another presumably identical genome (i.e. replicates)."""
    refs     = Fastafile(expanduser(args.reference))
    in_vars  = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    out_vars = make_outputs(in_vars, args.out1, args.out2)

    match_status_map = {True: '=', False: 'X', None: '.'}

    # Create parallel locus iterator by chromosome
    for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
        # Create superloci by taking the union of overlapping loci across all of the locus streams
        loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci]
        superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

        # Proceed by superlocus
        for _, _, (super1, super2) in superloci:
            super1.sort(key=NormalizedLocus.natural_order_key)
            super2.sort(key=NormalizedLocus.natural_order_key)

            super_start, super_stop = get_superlocus_bounds([super1, super2])

            print('-' * 80)
            print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
            print()

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    lstart = locus.start
                    lstop = locus.stop
                    lref = locus.ref or '-'
                    indices = locus.allele_indices
                    sep = '|' if locus.phased else '/'
                    geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices)
                    print(f'  NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}')
            print()

            match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug)
            match_status = match_status_map[match]

            print(f'    MATCH={match_status} TYPE={match_type}')
            print()

            write_match(out_vars[0], super1, args.name1, match_status, match_type)
            write_match(out_vars[1], super2, args.name2, match_status, match_type)

            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    print(f'  VCF{i:d}: {locus.record}', end='')
            print()

    for out_var in out_vars:
        if out_var is not None:
            out_var.close()

Python pysam.VariantFile() Examples