Python Examples of cyvcf2.VCF

Source File: variant.py From CharGer with GNU General Public License v3.0

6 votes

def get_vep_csq_fields(cls: Type[V], vcf_raw_headers: List[str]) -> List[str]:
        """Extract the CSQ fields VEP output in the given VCF."""
        # Get CSQ spec
        # Reverse the header order because the newer header appears later
        try:
            csq_info_header = next(
                l for l in reversed(vcf_raw_headers) if l.startswith("##INFO=<ID=CSQ,")
            )
        except StopIteration:
            raise ValueError(f"Cannot find CSQ format in the VCF header")
        m = re.search(r"Format: ([\w\|]+)['\"]", csq_info_header)
        if m:
            csq_format = m.group(1)
        else:
            raise ValueError(
                f"Cannot parse the CSQ field format from its INFO VCF header: {csq_info_header}"
            )
        csq_fields = csq_format.split("|")
        return csq_fields

Source File: variant.py From CharGer with GNU General Public License v3.0

6 votes

def read_vcf(cls: Type[V], path: Path) -> Generator[V, None, None]:
        """
        Read VCF record from `path`.

        This function walks through each variant record in the given VCF using :class:`cyvcf2.VCF
        <cyvcf2.cyvcf2.VCF>`, and yields the record as a :class:`Variant` object.

        See also :meth:`read_and_parse_vcf` to read and parse the VCF.

        Args:
            path: Path to the VCF.

        Returns:
            An generator walking through all variants per record.

        """
        with closing(VCF(str(path))) as vcf:
            for cy_variant in vcf:
                variant = cls.from_cyvcf2(cy_variant)
                yield variant

Source File: heatmap_survivor_SV_calls.py From nano-snakemake with MIT License

6 votes

def main(vcf):
    variants = VCF(vcf)
    samples = variants.samples
    identifiers = {i: set() for i in samples}
    for v in variants:
        for sample, call in zip(samples, v.gt_types):
            if is_variant(call):
                identifiers[sample].add(v.ID)

    df = pd.DataFrame(index=samples, columns=samples)
    for i_sample, i_values in identifiers.items():
        for j_sample, j_values in identifiers.items():
            df.loc[i_sample, j_sample] = len(i_values & j_values)
    try:
        proper_names = [i.split('/')[1].split('.')[0] for i in samples]
    except IndexError:
        proper_names = samples
    sns.heatmap(data=df,
                annot=True,
                fmt="d",
                linewidths=0.5,
                xticklabels=proper_names,
                yticklabels=proper_names)
    plt.savefig("SV-calls_heatmap.png", bbox_inches="tight")

Source File: fix_SVLEN.py From nano-snakemake with MIT License

6 votes

def main():
    args = get_args()
    vcf = VCF(args.vcf)
    w = Writer(args.output, vcf)
    for v in vcf:
        if v.INFO["SVTYPE"] == "DEL":
            if not v.INFO["SVLEN"] < 0:
                v.INFO["SVLEN"] = - v.INFO["SVLEN"]
        w.write_record(v)

Source File: filter_small_variants.py From nano-snakemake with MIT License

5 votes

def main():
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        if v.INFO["SVLEN"] > 49:
            vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()

Source File: test_parse_headers.py From scout with BSD 3-Clause "New" or "Revised" License

5 votes

def test_parse_rank_results_header(variant_clinical_file):
    ## GIVEN a vcf object
    vcf_obj = VCF(variant_clinical_file)
    ## WHEN parsing the rank results header
    rank_results_header = parse_rank_results_header(vcf_obj)
    ## THEN assert the header is returned correct
    assert isinstance(rank_results_header, list)
    assert rank_results_header
    assert "Consequence" in rank_results_header

Source File: test_parse_headers.py From scout with BSD 3-Clause "New" or "Revised" License

5 votes

def test_parse_vep_header(variant_clinical_file):
    ## GIVEN a vcf object
    vcf_obj = VCF(variant_clinical_file)
    ## WHEN parsing the vep header
    vep_header = parse_vep_header(vcf_obj)
    ## THEN assert the header is returned correct
    assert isinstance(vep_header, list)
    assert vep_header
    assert "ALLELE" in vep_header

Source File: test_parse_genotype.py From scout with BSD 3-Clause "New" or "Revised" License

5 votes

def test_parse_cnvnator():
    """docstring for test_parse_cnvnator"""
    vcf_obj = VCF(one_cnvnator)
    for variant in vcf_obj:
        assert variant

Source File: get_file_info.py From puzzle with MIT License

5 votes

def get_variant_type(variant_source):
    """Try to find out what type of variants that exists in a variant source
    
        Args:
            variant_source (str): Path to variant source
            source_mode (str): 'vcf' or 'gemini'
        
        Returns:
            variant_type (str): 'sv' or 'snv'
    """
    file_type = get_file_type(variant_source)
    variant_type = 'sv'
    if file_type == 'vcf':
        variants = VCF(variant_source)
    elif file_type == 'gemini':
        variants = GeminiQuery(variant_source)
        gemini_query = "SELECT * from variants"
        variants.run(gemini_query)
    # Check 1000 first variants, if anyone is a snv we set the variant_type
    # to 'snv'
    for i,variant in enumerate(variants):
        if file_type == 'vcf':
            if variant.is_snp:
                variant_type = 'snv'
        elif file_type == 'gemini':
            if variant['type'] == 'snp':
                variant_type = 'snv'
            
        if i > 1000:
            break
    
    return variant_type

Source File: variant_mixin.py From puzzle with MIT License

5 votes

def variant(self, case_id, variant_id):
        """Return a specific variant.

            Args:
                case_id (str): Path to vcf file
                variant_id (str): A variant id

            Returns:
                variant (Variant): The variant object for the given id
        """
        case_obj = self.case(case_id=case_id)
        vcf_file_path = case_obj.variant_source
        self.head = get_header(vcf_file_path)

        self.vep_header = self.head.vep_columns
        self.snpeff_header = self.head.snpeff_columns

        handle = VCF(vcf_file_path)

        for index, variant in enumerate(handle):
            index += 1
            line_id = get_variant_id(variant_line=str(variant)).lstrip('chrCHR')
            if line_id == variant_id:
                return self._format_variants(
                    variant=variant,
                    index=index,
                    case_obj=case_obj,
                    add_all_info=True
                    )

        return None

Source File: sv-upset-plot.py From nano-snakemake with MIT License

5 votes

def make_sets(vcf, names):
    """From the merged SV file, return pd.Series of overlapping sets"""
    calls = defaultdict(int)
    for v in VCF(vcf):
        calls[gt_types_to_binary_comparison(v.gt_types)] += 1
    tf_array = [[True, False]] * len(list(calls.keys())[0])
    index = pd.MultiIndex.from_product(tf_array, names=names)
    values = [calls[''.join([str(int(j)) for j in i])] for i in index]
    return pd.Series(values, index=index)

Source File: zygosity-accuracy.py From nano-snakemake with MIT License

5 votes

def zygosity_accuracy(vcff):
    """
    First level of the dict is the "truth" call, second level is the "test"
    """
    zygosities = dict(het=dict(het=0,
                               hom=0,
                               nocall=0
                               ),
                      hom=dict(het=0,
                               hom=0,
                               nocall=0
                               ),
                      nocall=dict(het=0,
                                  hom=0,
                                  nocall=0
                                  ),
                      )
    for v in VCF(vcff):
        zyg_truth, zyg_test = [get_zygosity(gt) for gt in v.gt_types]
        zygosities[zyg_truth][zyg_test] += 1
    zygs = ["nocall", "het", "hom"]
    df = pd.DataFrame(index=zygs, columns=zygs)
    for tr in zygs:
        for te in zygs:
            df.loc[tr, te] = zygosities[tr][te]
    print(df)

Source File: fix_alt_genotype.py From nano-snakemake with MIT License

5 votes

def main():
    args = get_args()
    vcf = VCF(args.vcf)
    output = Writer(args.output, vcf)
    incorrect = 0
    for v in vcf:
        if v.REF == v.ALT[0] and v.INFO["SVTYPE"] == "DEL":
            v.ALT = "<DEL>"
            incorrect += 1
        output.write_record(v)
    print("Fixed {} positions".format(incorrect))
    output.close()
    vcf.close()

Source File: fix_reference_genotype.py From nano-snakemake with MIT License

5 votes

def main():
    args = get_args()
    genome = Fasta(args.genome)
    vcf = VCF(args.vcf)
    output = Writer(args.output, vcf)
    incorrect_reference = 0
    for v in vcf:
        ref_nucl = get_reference_nucleotide(v.CHROM, v.start, genome)
        if v.REF != ref_nucl:
            v.REF = ref_nucl
            incorrect_reference += 1
        output.write_record(v)
    print("Fixed {} positions".format(incorrect_reference))
    output.close()
    vcf.close()

Source File: test_variant_loader.py From scout with BSD 3-Clause "New" or "Revised" License

5 votes

def test_load_variants_high_treshold(real_populated_database, case_obj):
    ## GIVEN a populated database and a case
    adapter = real_populated_database
    ## WHEN Trying to load variants where no one are above the rank score treshold
    vcf = VCF(case_obj["vcf_files"]["vcf_sv"])
    rankscore_treshold = 1000000
    individual_positions = {"ADM1059A2": 0, "ADM1059A1": 1, "ADM1059A3": 2}
    ## THEN assert that no variants are inserted
    nr_inserted = adapter._load_variants(
        vcf, "clinical", case_obj, individual_positions, rankscore_treshold, "cut000"
    )
    assert nr_inserted == 0

Source File: precision-recall.py From nano-snakemake with MIT License

5 votes

def make_venn(vcf, outname="venn.png"):
    positions = [[], []]
    for v in VCF(vcf):
        if not v.CHROM == 'chrEBV':
            for index, call in enumerate(v.gt_types):
                if is_variant(call):
                    positions[index].append(
                        "{}:{}-{}".format(v.CHROM, v.start, v.INFO.get('SVTYPE')))
    identifier_sets = [set(i) for i in positions]
    venn2(identifier_sets, set_labels=('Truth', 'Test'))
    plt.savefig(outname)
    plt.close()
    return identifier_sets

Source File: precision-recall.py From nano-snakemake with MIT License

5 votes

def bar_chart(vcf, outname="stacked_bar.png"):
    """
    Make a stacked bar chart for length of the SV split by validation status
    This ignores zygosity.
    """
    len_dict = {"True": [], "False": [], "Missed": []}
    for v in VCF(vcf):
        if not v.INFO.get('SVTYPE') == 'TRA' and abs(v.INFO.get('AVGLEN')) >= 50:
            calls = [is_variant(call) for call in v.gt_types]
            if calls == [True, True]:
                len_dict['True'].append(v.INFO.get('AVGLEN'))
            elif calls == [False, True]:
                len_dict['False'].append(v.INFO.get('AVGLEN'))
            elif calls == [True, False]:
                len_dict['Missed'].append(v.INFO.get('AVGLEN'))
    plt.subplot(2, 1, 1)
    plt.hist(x=np.array(list(len_dict.values())),
             bins=[i for i in range(0, 2000, 10)],
             stacked=True,
             histtype='bar',
             label=list(len_dict.keys()))
    plt.xlabel('Length of structural variant')
    plt.ylabel('Number of variants')
    plt.legend(frameon=False,
               fontsize="small")
    plt.subplot(2, 1, 2)
    plt.hist(x=np.array(list(len_dict.values())),
             bins=[i for i in range(0, 20000, 100)],
             stacked=True,
             histtype='bar',
             label=list(len_dict.keys()),
             log=True)
    plt.xlabel('Length of structural variant')
    plt.ylabel('Number of variants')
    plt.legend(frameon=False,
               fontsize="small")
    plt.tight_layout()
    plt.savefig(outname)
    plt.close()

Source File: precision-recall-curve-qualities.py From nano-snakemake with MIT License

5 votes

def extract_variants(vcf):
    return pd.DataFrame(
        data=[
            tuple(v.format('CO')) +
            tuple([is_variant(call) for call in v.gt_types]) +
            parse_qualities(v.format('QV')) for v in VCF(vcf)],
        columns=["coords_truth", "coords_test", "truth", "test", "quals_truth", "quals_test"])

Source File: split-vcf-by-type.py From nano-snakemake with MIT License

5 votes

def main():
    args = get_args()
    vars = defaultdict(list)
    vcf = VCF(args.vcf)
    output = args.vcf.replace('.vcf', '')
    for v in vcf:
        vars[v.INFO.get('SVTYPE')].append(v)
    for k, varlist in vars.items():
        w = Writer(output + '_' + k.replace('/', '') + '.vcf', vcf)
        for v in varlist:
            w.write_record(v)

Source File: SV-length-plot.py From nano-snakemake with MIT License

5 votes

def main():
    args = get_args()
    len_dict = defaultdict(list)
    vcf = VCF(args.vcf)
    if len(vcf.samples) > 1:
        sys.stderr.write("\n\nWarning: this script does not support multiple samples in a vcf.\n")
        sys.stderr.write("Plotting and counting only for {}.".format(vcf.samples[0]))
    for v in vcf:
        if is_variant(v.gt_types) and not v.INFO.get('SVTYPE') == 'TRA':
            try:
                if get_svlen(v) >= 50:
                    len_dict[get_svtype(v)].append(get_svlen(v))
            except TypeError:
                if v.INFO.get('SVTYPE') == 'INV':
                    if (v.end - v.start) >= 50:
                        len_dict[get_svtype(v)].append(v.end - v.start)
                elif v.INFO.get('SVTYPE') == 'BND':
                    try:
                        len_dict['BND'].append(abs(v.INFO.get('MATEDIST')))
                    except TypeError:
                        len_dict['BND'].append(0)
                else:
                    sys.stderr.write("Exception when parsing variant:\n{}\n\n".format(v))
                    len_dict["parse_error"].append(0)
    with open(args.counts, 'w') as counts:
        counts.write("Number of nucleotides affected by SV:\n")
        for svtype, lengths in len_dict.items():
            counts.write("{}:\t{} variants\t{}bp\n".format(
                svtype, len(lengths), sum(lengths)))
    make_plot(dict_of_lengths=len_dict,
              output=args.output)

Source File: SV-length-plot.py From nano-snakemake with MIT License

5 votes

def is_variant(gt_types):
    """Check if a variant position qualifies as a variant

    0,1,2,3==HOM_REF, HET, UNKNOWN, HOM_ALT
    If the VCF does not contain genotypes/zygosities, and hence gt_types is empty,
    then assume the position is indeed a variant allele"""
    if not gt_types:
        return True
    else:
        if gt_types[0] in [1, 3]:
            return True
        else:
            return False

Source File: SV-carriers-plot.py From nano-snakemake with MIT License

5 votes

def main():
    args = get_args()
    variants = VCF(args.vcf)
    plt.hist(x=[get_carrier_number(v.gt_types) for v in variants],
             bins=range(10),
             align="left")
    plt.xlabel('Number of carriers')
    plt.xticks(range(1, len(variants.samples) + 1), range(1, len(variants.samples) + 1))
    plt.ylabel('Number of variants')
    plt.savefig(args.output)

Source File: conftest.py From scout with BSD 3-Clause "New" or "Revised" License

5 votes

def one_cancer_manta_SV_variant(request, vep_94_manta_annotated_SV_variants_file):
    LOG.info("Return one parsed cancer SV variant")
    variant_parser = VCF(vep_94_manta_annotated_SV_variants_file)

    variant = next(variant_parser)
    return variant

Source File: gather.py From models with MIT License

5 votes

def get_vep_scores(vcf_name,
                   vep_vcf_key="CSQ",
                   sel_vep_keys=["phyloP46way_placental",
                                 "phyloP46way_primate",
                                 "CADD_phred",
                                 "CADD_raw"]):
    vcf_fh = cyvcf2.VCF(vcf_name)
    # get the correct elements
    for hdr in vcf_fh.header_iter():
        hdr_info = hdr.info()
        if 'ID' in hdr_info:
            if hdr_info['ID'] == vep_vcf_key:
                vep_keys = hdr_info['Description'].split(": ")[-1].rstrip('"').split("|")
                break
    sel_vep_elms = [vep_keys.index(k) for k in sel_vep_keys]
    info_tags = []
    entries = []
    # Iterate over all entries and extract the `info_tag` if set, otherwise return all INFO tags
    for rec in vcf_fh:
        info_dict = dict(rec.INFO)
        if vep_vcf_key in info_dict:
            vep_entries = info_dict[vep_vcf_key].split(",")[0].split("|")
            variant_uid = ":".join([rec.CHROM, str(rec.POS), rec.REF, rec.ALT[0]])
            vals = [vep_entries[i] for i in sel_vep_elms]
            entries.append(pd.Series([vep_entries[i] for i in sel_vep_elms], name = variant_uid, index = sel_vep_keys))
    # Turn into a data frame
    df = pd.DataFrame(entries,)
    df = df.replace("", "nan").astype(float)
    # dedup
    df = df.loc[~pd.Series(df.index.values).duplicated().values,:]
    return df

Source File: test_hemi.py From cyvcf2 with MIT License

5 votes

def test_hemi():
    """
    make sure that we are getting the correct gt_types
    for hemizygous variants
    """

    for p in (HEM_PATH, VCF_PATH):
        vcf = VCF(p)

        for v in vcf:
            check_var(v)

Source File: variant.py From CharGer with GNU General Public License v3.0

5 votes

def from_cyvcf2(cls: Type[V], variant: CyVCF2Variant) -> V:
        """
        Create one Variant object based on the given
        :class:`cyvcf2.Variant <cyvcf2.cyvcf2.Variant>` VCF record.
        """
        return cls(
            chrom=variant.CHROM,
            start_pos=variant.start + 1,
            end_pos=variant.end,
            ref_allele=variant.REF,
            alt_allele=variant.ALT[0],
            id=variant.ID,
            filter=variant.FILTER,
            info=dict(variant.INFO),
        )

Source File: variant.py From CharGer with GNU General Public License v3.0

5 votes

def get_vep_version(cls: Type[V], vcf_raw_headers: List[str]) -> str:
        """Extract the VEP version in the given VCF."""
        # Find VEP version
        # Reverse the header order because the newer header appears later
        try:
            vep_header = next(
                l for l in reversed(vcf_raw_headers) if l.startswith("##VEP=")
            )
            vep_version = re.match(r"^##VEP=['\"]?v(\d+)['\"]?", vep_header).group(1)  # type: ignore
        except (StopIteration, AttributeError):
            logger.warning(f"Cannot find VEP version in the VCF header")
            vep_version = "UNKNOWN"
        return vep_version

Source File: vcf.py From kipoiseq with MIT License

5 votes

def __init__(self, *args, **kwargs):
        from cyvcf2 import VCF
        super(MultiSampleVCF, self).__init__(*args, **kwargs, strict_gt=True)
        self.sample_mapping = dict(zip(self.samples, range(len(self.samples))))

Source File: conftest.py From scout with BSD 3-Clause "New" or "Revised" License

5 votes

def one_variant(request, variant_clinical_file):
    LOG.info("Return one parsed variant")
    variant_parser = VCF(variant_clinical_file)

    variant = next(variant_parser)
    return variant

Source File: conftest.py From scout with BSD 3-Clause "New" or "Revised" License

5 votes

def one_vep97_annotated_variant(request, vep_97_annotated_variant_clinical_file):
    LOG.info("Return one parsed variant")
    variant_parser = VCF(vep_97_annotated_variant_clinical_file)

    variant = next(variant_parser)
    return variant

Python cyvcf2.VCF Examples