Python cyvcf2.VCF Examples

The following are 30 code examples of cyvcf2.VCF(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module cyvcf2 , or try the search function .
Example #1
Source File: variant.py    From CharGer with GNU General Public License v3.0 6 votes vote down vote up
def get_vep_csq_fields(cls: Type[V], vcf_raw_headers: List[str]) -> List[str]:
        """Extract the CSQ fields VEP output in the given VCF."""
        # Get CSQ spec
        # Reverse the header order because the newer header appears later
        try:
            csq_info_header = next(
                l for l in reversed(vcf_raw_headers) if l.startswith("##INFO=<ID=CSQ,")
            )
        except StopIteration:
            raise ValueError(f"Cannot find CSQ format in the VCF header")
        m = re.search(r"Format: ([\w\|]+)['\"]", csq_info_header)
        if m:
            csq_format = m.group(1)
        else:
            raise ValueError(
                f"Cannot parse the CSQ field format from its INFO VCF header: {csq_info_header}"
            )
        csq_fields = csq_format.split("|")
        return csq_fields 
Example #2
Source File: variant.py    From CharGer with GNU General Public License v3.0 6 votes vote down vote up
def read_vcf(cls: Type[V], path: Path) -> Generator[V, None, None]:
        """
        Read VCF record from `path`.

        This function walks through each variant record in the given VCF using :class:`cyvcf2.VCF
        <cyvcf2.cyvcf2.VCF>`, and yields the record as a :class:`Variant` object.

        See also :meth:`read_and_parse_vcf` to read and parse the VCF.

        Args:
            path: Path to the VCF.

        Returns:
            An generator walking through all variants per record.

        """
        with closing(VCF(str(path))) as vcf:
            for cy_variant in vcf:
                variant = cls.from_cyvcf2(cy_variant)
                yield variant 
Example #3
Source File: heatmap_survivor_SV_calls.py    From nano-snakemake with MIT License 6 votes vote down vote up
def main(vcf):
    variants = VCF(vcf)
    samples = variants.samples
    identifiers = {i: set() for i in samples}
    for v in variants:
        for sample, call in zip(samples, v.gt_types):
            if is_variant(call):
                identifiers[sample].add(v.ID)

    df = pd.DataFrame(index=samples, columns=samples)
    for i_sample, i_values in identifiers.items():
        for j_sample, j_values in identifiers.items():
            df.loc[i_sample, j_sample] = len(i_values & j_values)
    try:
        proper_names = [i.split('/')[1].split('.')[0] for i in samples]
    except IndexError:
        proper_names = samples
    sns.heatmap(data=df,
                annot=True,
                fmt="d",
                linewidths=0.5,
                xticklabels=proper_names,
                yticklabels=proper_names)
    plt.savefig("SV-calls_heatmap.png", bbox_inches="tight") 
Example #4
Source File: fix_SVLEN.py    From nano-snakemake with MIT License 6 votes vote down vote up
def main():
    args = get_args()
    vcf = VCF(args.vcf)
    w = Writer(args.output, vcf)
    for v in vcf:
        if v.INFO["SVTYPE"] == "DEL":
            if not v.INFO["SVLEN"] < 0:
                v.INFO["SVLEN"] = - v.INFO["SVLEN"]
        w.write_record(v) 
Example #5
Source File: filter_small_variants.py    From nano-snakemake with MIT License 5 votes vote down vote up
def main():
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        if v.INFO["SVLEN"] > 49:
            vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close() 
Example #6
Source File: test_parse_headers.py    From scout with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_parse_rank_results_header(variant_clinical_file):
    ## GIVEN a vcf object
    vcf_obj = VCF(variant_clinical_file)
    ## WHEN parsing the rank results header
    rank_results_header = parse_rank_results_header(vcf_obj)
    ## THEN assert the header is returned correct
    assert isinstance(rank_results_header, list)
    assert rank_results_header
    assert "Consequence" in rank_results_header 
Example #7
Source File: test_parse_headers.py    From scout with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_parse_vep_header(variant_clinical_file):
    ## GIVEN a vcf object
    vcf_obj = VCF(variant_clinical_file)
    ## WHEN parsing the vep header
    vep_header = parse_vep_header(vcf_obj)
    ## THEN assert the header is returned correct
    assert isinstance(vep_header, list)
    assert vep_header
    assert "ALLELE" in vep_header 
Example #8
Source File: test_parse_genotype.py    From scout with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_parse_cnvnator():
    """docstring for test_parse_cnvnator"""
    vcf_obj = VCF(one_cnvnator)
    for variant in vcf_obj:
        assert variant 
Example #9
Source File: get_file_info.py    From puzzle with MIT License 5 votes vote down vote up
def get_variant_type(variant_source):
    """Try to find out what type of variants that exists in a variant source
    
        Args:
            variant_source (str): Path to variant source
            source_mode (str): 'vcf' or 'gemini'
        
        Returns:
            variant_type (str): 'sv' or 'snv'
    """
    file_type = get_file_type(variant_source)
    variant_type = 'sv'
    if file_type == 'vcf':
        variants = VCF(variant_source)
    elif file_type == 'gemini':
        variants = GeminiQuery(variant_source)
        gemini_query = "SELECT * from variants"
        variants.run(gemini_query)
    # Check 1000 first variants, if anyone is a snv we set the variant_type
    # to 'snv'
    for i,variant in enumerate(variants):
        if file_type == 'vcf':
            if variant.is_snp:
                variant_type = 'snv'
        elif file_type == 'gemini':
            if variant['type'] == 'snp':
                variant_type = 'snv'
            
        if i > 1000:
            break
    
    return variant_type 
Example #10
Source File: variant_mixin.py    From puzzle with MIT License 5 votes vote down vote up
def variant(self, case_id, variant_id):
        """Return a specific variant.

            Args:
                case_id (str): Path to vcf file
                variant_id (str): A variant id

            Returns:
                variant (Variant): The variant object for the given id
        """
        case_obj = self.case(case_id=case_id)
        vcf_file_path = case_obj.variant_source
        self.head = get_header(vcf_file_path)

        self.vep_header = self.head.vep_columns
        self.snpeff_header = self.head.snpeff_columns

        handle = VCF(vcf_file_path)

        for index, variant in enumerate(handle):
            index += 1
            line_id = get_variant_id(variant_line=str(variant)).lstrip('chrCHR')
            if line_id == variant_id:
                return self._format_variants(
                    variant=variant,
                    index=index,
                    case_obj=case_obj,
                    add_all_info=True
                    )

        return None 
Example #11
Source File: sv-upset-plot.py    From nano-snakemake with MIT License 5 votes vote down vote up
def make_sets(vcf, names):
    """From the merged SV file, return pd.Series of overlapping sets"""
    calls = defaultdict(int)
    for v in VCF(vcf):
        calls[gt_types_to_binary_comparison(v.gt_types)] += 1
    tf_array = [[True, False]] * len(list(calls.keys())[0])
    index = pd.MultiIndex.from_product(tf_array, names=names)
    values = [calls[''.join([str(int(j)) for j in i])] for i in index]
    return pd.Series(values, index=index) 
Example #12
Source File: zygosity-accuracy.py    From nano-snakemake with MIT License 5 votes vote down vote up
def zygosity_accuracy(vcff):
    """
    First level of the dict is the "truth" call, second level is the "test"
    """
    zygosities = dict(het=dict(het=0,
                               hom=0,
                               nocall=0
                               ),
                      hom=dict(het=0,
                               hom=0,
                               nocall=0
                               ),
                      nocall=dict(het=0,
                                  hom=0,
                                  nocall=0
                                  ),
                      )
    for v in VCF(vcff):
        zyg_truth, zyg_test = [get_zygosity(gt) for gt in v.gt_types]
        zygosities[zyg_truth][zyg_test] += 1
    zygs = ["nocall", "het", "hom"]
    df = pd.DataFrame(index=zygs, columns=zygs)
    for tr in zygs:
        for te in zygs:
            df.loc[tr, te] = zygosities[tr][te]
    print(df) 
Example #13
Source File: fix_alt_genotype.py    From nano-snakemake with MIT License 5 votes vote down vote up
def main():
    args = get_args()
    vcf = VCF(args.vcf)
    output = Writer(args.output, vcf)
    incorrect = 0
    for v in vcf:
        if v.REF == v.ALT[0] and v.INFO["SVTYPE"] == "DEL":
            v.ALT = "<DEL>"
            incorrect += 1
        output.write_record(v)
    print("Fixed {} positions".format(incorrect))
    output.close()
    vcf.close() 
Example #14
Source File: fix_reference_genotype.py    From nano-snakemake with MIT License 5 votes vote down vote up
def main():
    args = get_args()
    genome = Fasta(args.genome)
    vcf = VCF(args.vcf)
    output = Writer(args.output, vcf)
    incorrect_reference = 0
    for v in vcf:
        ref_nucl = get_reference_nucleotide(v.CHROM, v.start, genome)
        if v.REF != ref_nucl:
            v.REF = ref_nucl
            incorrect_reference += 1
        output.write_record(v)
    print("Fixed {} positions".format(incorrect_reference))
    output.close()
    vcf.close() 
Example #15
Source File: test_variant_loader.py    From scout with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_load_variants_high_treshold(real_populated_database, case_obj):
    ## GIVEN a populated database and a case
    adapter = real_populated_database
    ## WHEN Trying to load variants where no one are above the rank score treshold
    vcf = VCF(case_obj["vcf_files"]["vcf_sv"])
    rankscore_treshold = 1000000
    individual_positions = {"ADM1059A2": 0, "ADM1059A1": 1, "ADM1059A3": 2}
    ## THEN assert that no variants are inserted
    nr_inserted = adapter._load_variants(
        vcf, "clinical", case_obj, individual_positions, rankscore_treshold, "cut000"
    )
    assert nr_inserted == 0 
Example #16
Source File: precision-recall.py    From nano-snakemake with MIT License 5 votes vote down vote up
def make_venn(vcf, outname="venn.png"):
    positions = [[], []]
    for v in VCF(vcf):
        if not v.CHROM == 'chrEBV':
            for index, call in enumerate(v.gt_types):
                if is_variant(call):
                    positions[index].append(
                        "{}:{}-{}".format(v.CHROM, v.start, v.INFO.get('SVTYPE')))
    identifier_sets = [set(i) for i in positions]
    venn2(identifier_sets, set_labels=('Truth', 'Test'))
    plt.savefig(outname)
    plt.close()
    return identifier_sets 
Example #17
Source File: precision-recall.py    From nano-snakemake with MIT License 5 votes vote down vote up
def bar_chart(vcf, outname="stacked_bar.png"):
    """
    Make a stacked bar chart for length of the SV split by validation status
    This ignores zygosity.
    """
    len_dict = {"True": [], "False": [], "Missed": []}
    for v in VCF(vcf):
        if not v.INFO.get('SVTYPE') == 'TRA' and abs(v.INFO.get('AVGLEN')) >= 50:
            calls = [is_variant(call) for call in v.gt_types]
            if calls == [True, True]:
                len_dict['True'].append(v.INFO.get('AVGLEN'))
            elif calls == [False, True]:
                len_dict['False'].append(v.INFO.get('AVGLEN'))
            elif calls == [True, False]:
                len_dict['Missed'].append(v.INFO.get('AVGLEN'))
    plt.subplot(2, 1, 1)
    plt.hist(x=np.array(list(len_dict.values())),
             bins=[i for i in range(0, 2000, 10)],
             stacked=True,
             histtype='bar',
             label=list(len_dict.keys()))
    plt.xlabel('Length of structural variant')
    plt.ylabel('Number of variants')
    plt.legend(frameon=False,
               fontsize="small")
    plt.subplot(2, 1, 2)
    plt.hist(x=np.array(list(len_dict.values())),
             bins=[i for i in range(0, 20000, 100)],
             stacked=True,
             histtype='bar',
             label=list(len_dict.keys()),
             log=True)
    plt.xlabel('Length of structural variant')
    plt.ylabel('Number of variants')
    plt.legend(frameon=False,
               fontsize="small")
    plt.tight_layout()
    plt.savefig(outname)
    plt.close() 
Example #18
Source File: precision-recall-curve-qualities.py    From nano-snakemake with MIT License 5 votes vote down vote up
def extract_variants(vcf):
    return pd.DataFrame(
        data=[
            tuple(v.format('CO')) +
            tuple([is_variant(call) for call in v.gt_types]) +
            parse_qualities(v.format('QV')) for v in VCF(vcf)],
        columns=["coords_truth", "coords_test", "truth", "test", "quals_truth", "quals_test"]) 
Example #19
Source File: split-vcf-by-type.py    From nano-snakemake with MIT License 5 votes vote down vote up
def main():
    args = get_args()
    vars = defaultdict(list)
    vcf = VCF(args.vcf)
    output = args.vcf.replace('.vcf', '')
    for v in vcf:
        vars[v.INFO.get('SVTYPE')].append(v)
    for k, varlist in vars.items():
        w = Writer(output + '_' + k.replace('/', '') + '.vcf', vcf)
        for v in varlist:
            w.write_record(v) 
Example #20
Source File: SV-length-plot.py    From nano-snakemake with MIT License 5 votes vote down vote up
def main():
    args = get_args()
    len_dict = defaultdict(list)
    vcf = VCF(args.vcf)
    if len(vcf.samples) > 1:
        sys.stderr.write("\n\nWarning: this script does not support multiple samples in a vcf.\n")
        sys.stderr.write("Plotting and counting only for {}.".format(vcf.samples[0]))
    for v in vcf:
        if is_variant(v.gt_types) and not v.INFO.get('SVTYPE') == 'TRA':
            try:
                if get_svlen(v) >= 50:
                    len_dict[get_svtype(v)].append(get_svlen(v))
            except TypeError:
                if v.INFO.get('SVTYPE') == 'INV':
                    if (v.end - v.start) >= 50:
                        len_dict[get_svtype(v)].append(v.end - v.start)
                elif v.INFO.get('SVTYPE') == 'BND':
                    try:
                        len_dict['BND'].append(abs(v.INFO.get('MATEDIST')))
                    except TypeError:
                        len_dict['BND'].append(0)
                else:
                    sys.stderr.write("Exception when parsing variant:\n{}\n\n".format(v))
                    len_dict["parse_error"].append(0)
    with open(args.counts, 'w') as counts:
        counts.write("Number of nucleotides affected by SV:\n")
        for svtype, lengths in len_dict.items():
            counts.write("{}:\t{} variants\t{}bp\n".format(
                svtype, len(lengths), sum(lengths)))
    make_plot(dict_of_lengths=len_dict,
              output=args.output) 
Example #21
Source File: SV-length-plot.py    From nano-snakemake with MIT License 5 votes vote down vote up
def is_variant(gt_types):
    """Check if a variant position qualifies as a variant

    0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
    If the VCF does not contain genotypes/zygosities, and hence gt_types is empty,
    then assume the position is indeed a variant allele"""
    if not gt_types:
        return True
    else:
        if gt_types[0] in [1, 3]:
            return True
        else:
            return False 
Example #22
Source File: SV-carriers-plot.py    From nano-snakemake with MIT License 5 votes vote down vote up
def main():
    args = get_args()
    variants = VCF(args.vcf)
    plt.hist(x=[get_carrier_number(v.gt_types) for v in variants],
             bins=range(10),
             align="left")
    plt.xlabel('Number of carriers')
    plt.xticks(range(1, len(variants.samples) + 1), range(1, len(variants.samples) + 1))
    plt.ylabel('Number of variants')
    plt.savefig(args.output) 
Example #23
Source File: conftest.py    From scout with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def one_cancer_manta_SV_variant(request, vep_94_manta_annotated_SV_variants_file):
    LOG.info("Return one parsed cancer SV variant")
    variant_parser = VCF(vep_94_manta_annotated_SV_variants_file)

    variant = next(variant_parser)
    return variant 
Example #24
Source File: gather.py    From models with MIT License 5 votes vote down vote up
def get_vep_scores(vcf_name,
                   vep_vcf_key="CSQ",
                   sel_vep_keys=["phyloP46way_placental",
                                 "phyloP46way_primate",
                                 "CADD_phred",
                                 "CADD_raw"]):
    vcf_fh = cyvcf2.VCF(vcf_name)
    # get the correct elements
    for hdr in vcf_fh.header_iter():
        hdr_info = hdr.info()
        if 'ID' in hdr_info:
            if hdr_info['ID'] == vep_vcf_key:
                vep_keys = hdr_info['Description'].split(": ")[-1].rstrip('"').split("|")
                break
    sel_vep_elms = [vep_keys.index(k) for k in sel_vep_keys]
    info_tags = []
    entries = []
    # Iterate over all entries and extract the `info_tag` if set, otherwise return all INFO tags
    for rec in vcf_fh:
        info_dict = dict(rec.INFO)
        if vep_vcf_key in info_dict:
            vep_entries = info_dict[vep_vcf_key].split(",")[0].split("|")
            variant_uid = ":".join([rec.CHROM, str(rec.POS), rec.REF, rec.ALT[0]])
            vals = [vep_entries[i] for i in sel_vep_elms]
            entries.append(pd.Series([vep_entries[i] for i in sel_vep_elms], name = variant_uid, index = sel_vep_keys))
    # Turn into a data frame
    df = pd.DataFrame(entries,)
    df = df.replace("", "nan").astype(float)
    # dedup
    df = df.loc[~pd.Series(df.index.values).duplicated().values,:]
    return df 
Example #25
Source File: test_hemi.py    From cyvcf2 with MIT License 5 votes vote down vote up
def test_hemi():
    """
    make sure that we are getting the correct gt_types
    for hemizygous variants
    """

    for p in (HEM_PATH, VCF_PATH):
        vcf = VCF(p)

        for v in vcf:
            check_var(v) 
Example #26
Source File: variant.py    From CharGer with GNU General Public License v3.0 5 votes vote down vote up
def from_cyvcf2(cls: Type[V], variant: CyVCF2Variant) -> V:
        """
        Create one Variant object based on the given
        :class:`cyvcf2.Variant <cyvcf2.cyvcf2.Variant>` VCF record.
        """
        return cls(
            chrom=variant.CHROM,
            start_pos=variant.start + 1,
            end_pos=variant.end,
            ref_allele=variant.REF,
            alt_allele=variant.ALT[0],
            id=variant.ID,
            filter=variant.FILTER,
            info=dict(variant.INFO),
        ) 
Example #27
Source File: variant.py    From CharGer with GNU General Public License v3.0 5 votes vote down vote up
def get_vep_version(cls: Type[V], vcf_raw_headers: List[str]) -> str:
        """Extract the VEP version in the given VCF."""
        # Find VEP version
        # Reverse the header order because the newer header appears later
        try:
            vep_header = next(
                l for l in reversed(vcf_raw_headers) if l.startswith("##VEP=")
            )
            vep_version = re.match(r"^##VEP=['\"]?v(\d+)['\"]?", vep_header).group(1)  # type: ignore
        except (StopIteration, AttributeError):
            logger.warning(f"Cannot find VEP version in the VCF header")
            vep_version = "UNKNOWN"
        return vep_version 
Example #28
Source File: vcf.py    From kipoiseq with MIT License 5 votes vote down vote up
def __init__(self, *args, **kwargs):
        from cyvcf2 import VCF
        super(MultiSampleVCF, self).__init__(*args, **kwargs, strict_gt=True)
        self.sample_mapping = dict(zip(self.samples, range(len(self.samples)))) 
Example #29
Source File: conftest.py    From scout with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def one_variant(request, variant_clinical_file):
    LOG.info("Return one parsed variant")
    variant_parser = VCF(variant_clinical_file)

    variant = next(variant_parser)
    return variant 
Example #30
Source File: conftest.py    From scout with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def one_vep97_annotated_variant(request, vep_97_annotated_variant_clinical_file):
    LOG.info("Return one parsed variant")
    variant_parser = VCF(vep_97_annotated_variant_clinical_file)

    variant = next(variant_parser)
    return variant