Python pysam.VariantFile() Examples
The following are 16
code examples of pysam.VariantFile().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pysam
, or try the search function
.
Example #1
Source File: VCF_remove_phase.py From helen with MIT License | 10 votes |
def fix_vcf(input_vcf, output_vcf): vcf_in = pysam.VariantFile(input_vcf) vcf_out = pysam.VariantFile(output_vcf, 'w', header=vcf_in.header) records = vcf_in.fetch() PS_dictionary = defaultdict(str) PS_value = 100 for record in records: for sample in record.samples: input_ps = str(record.samples[sample]['PS']) if input_ps not in PS_dictionary: PS_dictionary = str(PS_value) PS_value += 50 record.samples[sample]['PS'] = str(PS_value) vcf_out.write(record)
Example #2
Source File: varianttable.py From igv-reports with MIT License | 6 votes |
def __init__(self, vcfFile, info_columns = None, info_columns_prefixes = None, sample_columns = None): vcf = pysam.VariantFile(vcfFile) self.info_fields = info_columns or [] self.info_field_prefixes = info_columns_prefixes or [] self.sample_fields = sample_columns or [] self.variants = [] self.features = [] #Bed-like features for unique_id, var in enumerate(vcf.fetch()): self.variants.append((var, unique_id)) chr = var.chrom start = var.pos - 1 end = start + 1 #TODO -- handle structure variants and deletions > 1 base self.features.append((Feature(chr, start, end, ''), unique_id))
Example #3
Source File: variants.py From ga4gh-server with Apache License 2.0 | 6 votes |
def _populateFromVariantFile(self, varFile, dataUrl, indexFile): """ Populates the instance variables of this VariantSet from the specified pysam VariantFile object. """ if varFile.index is None: raise exceptions.NotIndexedException(dataUrl) for chrom in varFile.index: # Unlike Tabix indices, CSI indices include all contigs defined # in the BCF header. Thus we must test each one to see if # records exist or else they are likely to trigger spurious # overlapping errors. chrom, _, _ = self.sanitizeVariantFileFetch(chrom) if not isEmptyIter(varFile.fetch(chrom)): if chrom in self._chromFileMap: raise exceptions.OverlappingVcfException(dataUrl, chrom) self._chromFileMap[chrom] = dataUrl, indexFile self._updateMetadata(varFile) self._updateCallSetIds(varFile) self._updateVariantAnnotationSets(varFile, dataUrl)
Example #4
Source File: dbmatch.py From vgraph with Apache License 2.0 | 6 votes |
def match_database(args): """Match a genome to a database of alleles.""" refs = Fastafile(expanduser(args.reference)) db = VariantFile(expanduser(args.database)) sample = VariantFile(expanduser(args.sample)) format_meta, info_meta = build_new_metadata(db, sample) with VariantFile(args.output, 'w', header=sample.header) as out: for superlocus, matches in generate_matches(refs, sample, db, args): for allele_locus, allele, match in matches: # Annotate results of search status, times = translate_match(match) suffix = '_' + status for locus in allele_locus: annotate_info(locus, allele, info_meta, suffix, times) annotate_format(locus, allele, format_meta, suffix, times) for locus in sorted(superlocus, key=NormalizedLocus.record_order_key): out.write(locus.record)
Example #5
Source File: emerald.py From basenji with Apache License 2.0 | 5 votes |
def fetch(self, chrm, pos_start, pos_end, return_samples=False): vcf_file = '%s.%s.vcf.gz' % (self.pop_vcf_stem, chrm) vcf_open = VariantFile(vcf_file, drop_samples=(not return_samples)) return vcf_open.fetch(chrm, pos_start, pos_end)
Example #6
Source File: vcf.py From igv-reports with MIT License | 5 votes |
def __init__(self, path): self.file = pysam.VariantFile(path)
Example #7
Source File: variants.py From ga4gh-server with Apache License 2.0 | 5 votes |
def populateFromFile(self, dataUrls, indexFiles): """ Populates this variant set using the specified lists of data files and indexes. These must be in the same order, such that the jth index file corresponds to the jth data file. """ assert len(dataUrls) == len(indexFiles) for dataUrl, indexFile in zip(dataUrls, indexFiles): varFile = pysam.VariantFile(dataUrl, index_filename=indexFile) try: self._populateFromVariantFile(varFile, dataUrl, indexFile) finally: varFile.close()
Example #8
Source File: variants.py From ga4gh-server with Apache License 2.0 | 5 votes |
def checkConsistency(self): """ Perform consistency check on the variant set """ for referenceName, (dataUrl, indexFile) in self._chromFileMap.items(): varFile = pysam.VariantFile(dataUrl, index_filename=indexFile) try: for chrom in varFile.index: chrom, _, _ = self.sanitizeVariantFileFetch(chrom) if not isEmptyIter(varFile.fetch(chrom)): self._checkMetadata(varFile) self._checkCallSetIds(varFile) finally: varFile.close()
Example #9
Source File: variants.py From ga4gh-server with Apache License 2.0 | 5 votes |
def getNumVariants(self): """ Returns the total number of variants in this VariantSet. """ # TODO How do we get the number of records in a VariantFile? return 0
Example #10
Source File: variants.py From ga4gh-server with Apache License 2.0 | 5 votes |
def openFile(self, dataUrlIndexFilePair): dataUrl, indexFile = dataUrlIndexFilePair return pysam.VariantFile(dataUrl, index_filename=indexFile)
Example #11
Source File: repmatch.py From vgraph with Apache License 2.0 | 5 votes |
def make_outputs(in_vars, out1, out2): """Make output files.""" out_vars = [None, None] if out1: in_vars[0].header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)') in_vars[0].header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)') out_vars[0] = VariantFile(out1, 'w', header=in_vars[0].header) if out2: in_vars[1].header.formats.add('BD', '1', 'String', 'Match decision for call (match: =, mismatch: X, error: N)') in_vars[1].header.formats.add('BK', '1', 'String', 'Sub-type for match decision (trivial: T, haplotype: H, error: N)') out_vars[1] = VariantFile(out2, 'w', header=in_vars[1].header) return out_vars
Example #12
Source File: vgraph.py From vgraph with Apache License 2.0 | 5 votes |
def normalize(args): """Normalize variants.""" refs = Fastafile(expanduser(args.reference)) variants = VariantFile(args.sample) with VariantFile(args.output, 'w', header=variants.header) as out: # Create parallel locus iterator by chromosome for _, ref, loci in records_by_chromosome(refs, [variants], [None], args): loci = sort_almost_sorted(loci[0], key=NormalizedLocus.left_order_key) for locus in loci: record = locus.record start = locus.left.start stop = locus.left.stop alleles = locus.left.alleles if '' in alleles: pad = ref[start - 1:start] start -= 1 alleles = [pad + a for a in alleles] record.alleles = alleles record.start = start record.stop = stop out.write(record)
Example #13
Source File: test_vcf.py From tskit with MIT License | 5 votes |
def ts_to_pysam(ts, *args, **kwargs): """ Returns a pysam VariantFile for the specified tree sequence and arguments. """ with tempfile.TemporaryDirectory() as temp_dir: vcf_path = os.path.join(temp_dir, "file.vcf") with open(vcf_path, "w") as f: ts.write_vcf(f, *args, **kwargs) yield pysam.VariantFile(vcf_path)
Example #14
Source File: genotype.py From smrtsv2 with MIT License | 4 votes |
def vcf_header_lines(vcf_file_name): """ Get header lines for the genotype output VCF from the input variant-call VCF file. :param vcf_file_name: Name of a variant VCF file. :return: A list of VCF headers as strings with newlines. Does not include the column heading line at the end of the headers. """ header_list = list() with pysam.VariantFile(vcf_file_name) as vcf_file: # Add VCF version if missing header_list.append(vcf_version(vcf_file)) # Set file date header_list.append('##fileDate={}\n'.format(time.strftime("%Y%m%d"))) # Set source header_list.extend(vcf_get_source_list(vcf_file)) header_list.append('##source=SMRTSV_Genotyper_{}\n'.format(smrtsvlib.__version__)) # Get header elements excluding FORMAT tags for header_element in vcf_file.header.records: # Replace FORMAT tags if header_element.type == 'FORMAT': continue # Source and date handled if header_element.type == 'GENERIC' and header_element.key.lower() in {'fileformat', 'source', 'filedate'}: continue # Write record header_list.append(str(header_element)) # Add FORMAT tags written by the genotyper header_list.extend(vcf_get_format_tags()) # Return header lines return header_list
Example #15
Source File: dbmatch.py From vgraph with Apache License 2.0 | 4 votes |
def match_database2(args): """Match a genome to a database of alleles.""" refs = Fastafile(expanduser(args.reference)) db = VariantFile(expanduser(args.database)) sample = VariantFile(expanduser(args.sample)) try: sample_name = sample.header.samples[args.name] except TypeError: sample_name = args.name if db.index is None: raise ValueError('database file must be indexed') if sample.index is None: raise ValueError('sample file must be indexed') # Open tabluar output file, if requested table = None if args.table: tablefile = open(args.table, 'w') if args.table != '-' else sys.stdout table = csv.writer(tablefile, delimiter='\t', lineterminator='\n') write_table_header(table) update_info_header(sample.header) with VariantFile(args.output, 'w', header=sample.header) as out: for superlocus, matches in generate_matches(refs, sample, db, args): clear_info_fields(superlocus) for allele_locus, allele, match in matches: dbvar = allele.record var_id = dbvar.id or f'{dbvar.chrom}_{dbvar.start+1}_{dbvar.stop}_{dbvar.alts[0]}' status, times = translate_match(match) for locus in allele_locus: info = locus.record.info info[status] = info.get(status, ()) + (var_id, ) * times write_table_row(table, sample_name, var_id, allele_locus, status, match) for locus in sorted(superlocus, key=NormalizedLocus.record_order_key): out.write(locus.record)
Example #16
Source File: repmatch.py From vgraph with Apache License 2.0 | 4 votes |
def match_replicates(args): """Match a genome against another presumably identical genome (i.e. replicates).""" refs = Fastafile(expanduser(args.reference)) in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]] out_vars = make_outputs(in_vars, args.out1, args.out2) match_status_map = {True: '=', False: 'X', None: '.'} # Create parallel locus iterator by chromosome for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args): # Create superloci by taking the union of overlapping loci across all of the locus streams loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key) for l in loci] superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop')) # Proceed by superlocus for _, _, (super1, super2) in superloci: super1.sort(key=NormalizedLocus.natural_order_key) super2.sort(key=NormalizedLocus.natural_order_key) super_start, super_stop = get_superlocus_bounds([super1, super2]) print('-' * 80) print(f'{chrom}:[{super_start:d}-{super_stop:d}):') print() for i, superlocus in enumerate([super1, super2], 1): for locus in superlocus: lstart = locus.start lstop = locus.stop lref = locus.ref or '-' indices = locus.allele_indices sep = '|' if locus.phased else '/' geno = sep.join(locus.alleles[a] or '-' if a is not None else '.' for a in indices) print(f' NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}') print() match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug) match_status = match_status_map[match] print(f' MATCH={match_status} TYPE={match_type}') print() write_match(out_vars[0], super1, args.name1, match_status, match_type) write_match(out_vars[1], super2, args.name2, match_status, match_type) for i, superlocus in enumerate([super1, super2], 1): for locus in superlocus: print(f' VCF{i:d}: {locus.record}', end='') print() for out_var in out_vars: if out_var is not None: out_var.close()