Python cyvcf2.VCF Examples
The following are 30
code examples of cyvcf2.VCF().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
cyvcf2
, or try the search function
.
Example #1
Source File: variant.py From CharGer with GNU General Public License v3.0 | 6 votes |
def get_vep_csq_fields(cls: Type[V], vcf_raw_headers: List[str]) -> List[str]: """Extract the CSQ fields VEP output in the given VCF.""" # Get CSQ spec # Reverse the header order because the newer header appears later try: csq_info_header = next( l for l in reversed(vcf_raw_headers) if l.startswith("##INFO=<ID=CSQ,") ) except StopIteration: raise ValueError(f"Cannot find CSQ format in the VCF header") m = re.search(r"Format: ([\w\|]+)['\"]", csq_info_header) if m: csq_format = m.group(1) else: raise ValueError( f"Cannot parse the CSQ field format from its INFO VCF header: {csq_info_header}" ) csq_fields = csq_format.split("|") return csq_fields
Example #2
Source File: variant.py From CharGer with GNU General Public License v3.0 | 6 votes |
def read_vcf(cls: Type[V], path: Path) -> Generator[V, None, None]: """ Read VCF record from `path`. This function walks through each variant record in the given VCF using :class:`cyvcf2.VCF <cyvcf2.cyvcf2.VCF>`, and yields the record as a :class:`Variant` object. See also :meth:`read_and_parse_vcf` to read and parse the VCF. Args: path: Path to the VCF. Returns: An generator walking through all variants per record. """ with closing(VCF(str(path))) as vcf: for cy_variant in vcf: variant = cls.from_cyvcf2(cy_variant) yield variant
Example #3
Source File: heatmap_survivor_SV_calls.py From nano-snakemake with MIT License | 6 votes |
def main(vcf): variants = VCF(vcf) samples = variants.samples identifiers = {i: set() for i in samples} for v in variants: for sample, call in zip(samples, v.gt_types): if is_variant(call): identifiers[sample].add(v.ID) df = pd.DataFrame(index=samples, columns=samples) for i_sample, i_values in identifiers.items(): for j_sample, j_values in identifiers.items(): df.loc[i_sample, j_sample] = len(i_values & j_values) try: proper_names = [i.split('/')[1].split('.')[0] for i in samples] except IndexError: proper_names = samples sns.heatmap(data=df, annot=True, fmt="d", linewidths=0.5, xticklabels=proper_names, yticklabels=proper_names) plt.savefig("SV-calls_heatmap.png", bbox_inches="tight")
Example #4
Source File: fix_SVLEN.py From nano-snakemake with MIT License | 6 votes |
def main(): args = get_args() vcf = VCF(args.vcf) w = Writer(args.output, vcf) for v in vcf: if v.INFO["SVTYPE"] == "DEL": if not v.INFO["SVLEN"] < 0: v.INFO["SVLEN"] = - v.INFO["SVLEN"] w.write_record(v)
Example #5
Source File: filter_small_variants.py From nano-snakemake with MIT License | 5 votes |
def main(): args = get_args() vcf_in = VCF(args.vcf) vcf_out = Writer(args.output, vcf_in) for v in vcf_in: if v.INFO["SVLEN"] > 49: vcf_out.write_record(v) vcf_in.close() vcf_out.close()
Example #6
Source File: test_parse_headers.py From scout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_parse_rank_results_header(variant_clinical_file): ## GIVEN a vcf object vcf_obj = VCF(variant_clinical_file) ## WHEN parsing the rank results header rank_results_header = parse_rank_results_header(vcf_obj) ## THEN assert the header is returned correct assert isinstance(rank_results_header, list) assert rank_results_header assert "Consequence" in rank_results_header
Example #7
Source File: test_parse_headers.py From scout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_parse_vep_header(variant_clinical_file): ## GIVEN a vcf object vcf_obj = VCF(variant_clinical_file) ## WHEN parsing the vep header vep_header = parse_vep_header(vcf_obj) ## THEN assert the header is returned correct assert isinstance(vep_header, list) assert vep_header assert "ALLELE" in vep_header
Example #8
Source File: test_parse_genotype.py From scout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_parse_cnvnator(): """docstring for test_parse_cnvnator""" vcf_obj = VCF(one_cnvnator) for variant in vcf_obj: assert variant
Example #9
Source File: get_file_info.py From puzzle with MIT License | 5 votes |
def get_variant_type(variant_source): """Try to find out what type of variants that exists in a variant source Args: variant_source (str): Path to variant source source_mode (str): 'vcf' or 'gemini' Returns: variant_type (str): 'sv' or 'snv' """ file_type = get_file_type(variant_source) variant_type = 'sv' if file_type == 'vcf': variants = VCF(variant_source) elif file_type == 'gemini': variants = GeminiQuery(variant_source) gemini_query = "SELECT * from variants" variants.run(gemini_query) # Check 1000 first variants, if anyone is a snv we set the variant_type # to 'snv' for i,variant in enumerate(variants): if file_type == 'vcf': if variant.is_snp: variant_type = 'snv' elif file_type == 'gemini': if variant['type'] == 'snp': variant_type = 'snv' if i > 1000: break return variant_type
Example #10
Source File: variant_mixin.py From puzzle with MIT License | 5 votes |
def variant(self, case_id, variant_id): """Return a specific variant. Args: case_id (str): Path to vcf file variant_id (str): A variant id Returns: variant (Variant): The variant object for the given id """ case_obj = self.case(case_id=case_id) vcf_file_path = case_obj.variant_source self.head = get_header(vcf_file_path) self.vep_header = self.head.vep_columns self.snpeff_header = self.head.snpeff_columns handle = VCF(vcf_file_path) for index, variant in enumerate(handle): index += 1 line_id = get_variant_id(variant_line=str(variant)).lstrip('chrCHR') if line_id == variant_id: return self._format_variants( variant=variant, index=index, case_obj=case_obj, add_all_info=True ) return None
Example #11
Source File: sv-upset-plot.py From nano-snakemake with MIT License | 5 votes |
def make_sets(vcf, names): """From the merged SV file, return pd.Series of overlapping sets""" calls = defaultdict(int) for v in VCF(vcf): calls[gt_types_to_binary_comparison(v.gt_types)] += 1 tf_array = [[True, False]] * len(list(calls.keys())[0]) index = pd.MultiIndex.from_product(tf_array, names=names) values = [calls[''.join([str(int(j)) for j in i])] for i in index] return pd.Series(values, index=index)
Example #12
Source File: zygosity-accuracy.py From nano-snakemake with MIT License | 5 votes |
def zygosity_accuracy(vcff): """ First level of the dict is the "truth" call, second level is the "test" """ zygosities = dict(het=dict(het=0, hom=0, nocall=0 ), hom=dict(het=0, hom=0, nocall=0 ), nocall=dict(het=0, hom=0, nocall=0 ), ) for v in VCF(vcff): zyg_truth, zyg_test = [get_zygosity(gt) for gt in v.gt_types] zygosities[zyg_truth][zyg_test] += 1 zygs = ["nocall", "het", "hom"] df = pd.DataFrame(index=zygs, columns=zygs) for tr in zygs: for te in zygs: df.loc[tr, te] = zygosities[tr][te] print(df)
Example #13
Source File: fix_alt_genotype.py From nano-snakemake with MIT License | 5 votes |
def main(): args = get_args() vcf = VCF(args.vcf) output = Writer(args.output, vcf) incorrect = 0 for v in vcf: if v.REF == v.ALT[0] and v.INFO["SVTYPE"] == "DEL": v.ALT = "<DEL>" incorrect += 1 output.write_record(v) print("Fixed {} positions".format(incorrect)) output.close() vcf.close()
Example #14
Source File: fix_reference_genotype.py From nano-snakemake with MIT License | 5 votes |
def main(): args = get_args() genome = Fasta(args.genome) vcf = VCF(args.vcf) output = Writer(args.output, vcf) incorrect_reference = 0 for v in vcf: ref_nucl = get_reference_nucleotide(v.CHROM, v.start, genome) if v.REF != ref_nucl: v.REF = ref_nucl incorrect_reference += 1 output.write_record(v) print("Fixed {} positions".format(incorrect_reference)) output.close() vcf.close()
Example #15
Source File: test_variant_loader.py From scout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_load_variants_high_treshold(real_populated_database, case_obj): ## GIVEN a populated database and a case adapter = real_populated_database ## WHEN Trying to load variants where no one are above the rank score treshold vcf = VCF(case_obj["vcf_files"]["vcf_sv"]) rankscore_treshold = 1000000 individual_positions = {"ADM1059A2": 0, "ADM1059A1": 1, "ADM1059A3": 2} ## THEN assert that no variants are inserted nr_inserted = adapter._load_variants( vcf, "clinical", case_obj, individual_positions, rankscore_treshold, "cut000" ) assert nr_inserted == 0
Example #16
Source File: precision-recall.py From nano-snakemake with MIT License | 5 votes |
def make_venn(vcf, outname="venn.png"): positions = [[], []] for v in VCF(vcf): if not v.CHROM == 'chrEBV': for index, call in enumerate(v.gt_types): if is_variant(call): positions[index].append( "{}:{}-{}".format(v.CHROM, v.start, v.INFO.get('SVTYPE'))) identifier_sets = [set(i) for i in positions] venn2(identifier_sets, set_labels=('Truth', 'Test')) plt.savefig(outname) plt.close() return identifier_sets
Example #17
Source File: precision-recall.py From nano-snakemake with MIT License | 5 votes |
def bar_chart(vcf, outname="stacked_bar.png"): """ Make a stacked bar chart for length of the SV split by validation status This ignores zygosity. """ len_dict = {"True": [], "False": [], "Missed": []} for v in VCF(vcf): if not v.INFO.get('SVTYPE') == 'TRA' and abs(v.INFO.get('AVGLEN')) >= 50: calls = [is_variant(call) for call in v.gt_types] if calls == [True, True]: len_dict['True'].append(v.INFO.get('AVGLEN')) elif calls == [False, True]: len_dict['False'].append(v.INFO.get('AVGLEN')) elif calls == [True, False]: len_dict['Missed'].append(v.INFO.get('AVGLEN')) plt.subplot(2, 1, 1) plt.hist(x=np.array(list(len_dict.values())), bins=[i for i in range(0, 2000, 10)], stacked=True, histtype='bar', label=list(len_dict.keys())) plt.xlabel('Length of structural variant') plt.ylabel('Number of variants') plt.legend(frameon=False, fontsize="small") plt.subplot(2, 1, 2) plt.hist(x=np.array(list(len_dict.values())), bins=[i for i in range(0, 20000, 100)], stacked=True, histtype='bar', label=list(len_dict.keys()), log=True) plt.xlabel('Length of structural variant') plt.ylabel('Number of variants') plt.legend(frameon=False, fontsize="small") plt.tight_layout() plt.savefig(outname) plt.close()
Example #18
Source File: precision-recall-curve-qualities.py From nano-snakemake with MIT License | 5 votes |
def extract_variants(vcf): return pd.DataFrame( data=[ tuple(v.format('CO')) + tuple([is_variant(call) for call in v.gt_types]) + parse_qualities(v.format('QV')) for v in VCF(vcf)], columns=["coords_truth", "coords_test", "truth", "test", "quals_truth", "quals_test"])
Example #19
Source File: split-vcf-by-type.py From nano-snakemake with MIT License | 5 votes |
def main(): args = get_args() vars = defaultdict(list) vcf = VCF(args.vcf) output = args.vcf.replace('.vcf', '') for v in vcf: vars[v.INFO.get('SVTYPE')].append(v) for k, varlist in vars.items(): w = Writer(output + '_' + k.replace('/', '') + '.vcf', vcf) for v in varlist: w.write_record(v)
Example #20
Source File: SV-length-plot.py From nano-snakemake with MIT License | 5 votes |
def main(): args = get_args() len_dict = defaultdict(list) vcf = VCF(args.vcf) if len(vcf.samples) > 1: sys.stderr.write("\n\nWarning: this script does not support multiple samples in a vcf.\n") sys.stderr.write("Plotting and counting only for {}.".format(vcf.samples[0])) for v in vcf: if is_variant(v.gt_types) and not v.INFO.get('SVTYPE') == 'TRA': try: if get_svlen(v) >= 50: len_dict[get_svtype(v)].append(get_svlen(v)) except TypeError: if v.INFO.get('SVTYPE') == 'INV': if (v.end - v.start) >= 50: len_dict[get_svtype(v)].append(v.end - v.start) elif v.INFO.get('SVTYPE') == 'BND': try: len_dict['BND'].append(abs(v.INFO.get('MATEDIST'))) except TypeError: len_dict['BND'].append(0) else: sys.stderr.write("Exception when parsing variant:\n{}\n\n".format(v)) len_dict["parse_error"].append(0) with open(args.counts, 'w') as counts: counts.write("Number of nucleotides affected by SV:\n") for svtype, lengths in len_dict.items(): counts.write("{}:\t{} variants\t{}bp\n".format( svtype, len(lengths), sum(lengths))) make_plot(dict_of_lengths=len_dict, output=args.output)
Example #21
Source File: SV-length-plot.py From nano-snakemake with MIT License | 5 votes |
def is_variant(gt_types): """Check if a variant position qualifies as a variant 0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT If the VCF does not contain genotypes/zygosities, and hence gt_types is empty, then assume the position is indeed a variant allele""" if not gt_types: return True else: if gt_types[0] in [1, 3]: return True else: return False
Example #22
Source File: SV-carriers-plot.py From nano-snakemake with MIT License | 5 votes |
def main(): args = get_args() variants = VCF(args.vcf) plt.hist(x=[get_carrier_number(v.gt_types) for v in variants], bins=range(10), align="left") plt.xlabel('Number of carriers') plt.xticks(range(1, len(variants.samples) + 1), range(1, len(variants.samples) + 1)) plt.ylabel('Number of variants') plt.savefig(args.output)
Example #23
Source File: conftest.py From scout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def one_cancer_manta_SV_variant(request, vep_94_manta_annotated_SV_variants_file): LOG.info("Return one parsed cancer SV variant") variant_parser = VCF(vep_94_manta_annotated_SV_variants_file) variant = next(variant_parser) return variant
Example #24
Source File: gather.py From models with MIT License | 5 votes |
def get_vep_scores(vcf_name, vep_vcf_key="CSQ", sel_vep_keys=["phyloP46way_placental", "phyloP46way_primate", "CADD_phred", "CADD_raw"]): vcf_fh = cyvcf2.VCF(vcf_name) # get the correct elements for hdr in vcf_fh.header_iter(): hdr_info = hdr.info() if 'ID' in hdr_info: if hdr_info['ID'] == vep_vcf_key: vep_keys = hdr_info['Description'].split(": ")[-1].rstrip('"').split("|") break sel_vep_elms = [vep_keys.index(k) for k in sel_vep_keys] info_tags = [] entries = [] # Iterate over all entries and extract the `info_tag` if set, otherwise return all INFO tags for rec in vcf_fh: info_dict = dict(rec.INFO) if vep_vcf_key in info_dict: vep_entries = info_dict[vep_vcf_key].split(",")[0].split("|") variant_uid = ":".join([rec.CHROM, str(rec.POS), rec.REF, rec.ALT[0]]) vals = [vep_entries[i] for i in sel_vep_elms] entries.append(pd.Series([vep_entries[i] for i in sel_vep_elms], name = variant_uid, index = sel_vep_keys)) # Turn into a data frame df = pd.DataFrame(entries,) df = df.replace("", "nan").astype(float) # dedup df = df.loc[~pd.Series(df.index.values).duplicated().values,:] return df
Example #25
Source File: test_hemi.py From cyvcf2 with MIT License | 5 votes |
def test_hemi(): """ make sure that we are getting the correct gt_types for hemizygous variants """ for p in (HEM_PATH, VCF_PATH): vcf = VCF(p) for v in vcf: check_var(v)
Example #26
Source File: variant.py From CharGer with GNU General Public License v3.0 | 5 votes |
def from_cyvcf2(cls: Type[V], variant: CyVCF2Variant) -> V: """ Create one Variant object based on the given :class:`cyvcf2.Variant <cyvcf2.cyvcf2.Variant>` VCF record. """ return cls( chrom=variant.CHROM, start_pos=variant.start + 1, end_pos=variant.end, ref_allele=variant.REF, alt_allele=variant.ALT[0], id=variant.ID, filter=variant.FILTER, info=dict(variant.INFO), )
Example #27
Source File: variant.py From CharGer with GNU General Public License v3.0 | 5 votes |
def get_vep_version(cls: Type[V], vcf_raw_headers: List[str]) -> str: """Extract the VEP version in the given VCF.""" # Find VEP version # Reverse the header order because the newer header appears later try: vep_header = next( l for l in reversed(vcf_raw_headers) if l.startswith("##VEP=") ) vep_version = re.match(r"^##VEP=['\"]?v(\d+)['\"]?", vep_header).group(1) # type: ignore except (StopIteration, AttributeError): logger.warning(f"Cannot find VEP version in the VCF header") vep_version = "UNKNOWN" return vep_version
Example #28
Source File: vcf.py From kipoiseq with MIT License | 5 votes |
def __init__(self, *args, **kwargs): from cyvcf2 import VCF super(MultiSampleVCF, self).__init__(*args, **kwargs, strict_gt=True) self.sample_mapping = dict(zip(self.samples, range(len(self.samples))))
Example #29
Source File: conftest.py From scout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def one_variant(request, variant_clinical_file): LOG.info("Return one parsed variant") variant_parser = VCF(variant_clinical_file) variant = next(variant_parser) return variant
Example #30
Source File: conftest.py From scout with BSD 3-Clause "New" or "Revised" License | 5 votes |
def one_vep97_annotated_variant(request, vep_97_annotated_variant_clinical_file): LOG.info("Return one parsed variant") variant_parser = VCF(vep_97_annotated_variant_clinical_file) variant = next(variant_parser) return variant