|
| 1 | +#!/usr/bin/env python |
| 2 | +import plotly.express as px |
| 3 | +from cyvcf2 import VCF |
| 4 | +import pandas as pd |
| 5 | +import numpy as np |
| 6 | +from tqdm.auto import tqdm |
| 7 | +import physt |
| 8 | +from physt import h1 as histogram |
| 9 | +import physt.plotting |
| 10 | +physt.plotting.set_default_backend("plotly") |
| 11 | +import sklearn.impute |
| 12 | +import sklearn.decomposition |
| 13 | + |
| 14 | + |
| 15 | +from sys import stdin, stdout, stderr |
| 16 | +from subprocess import Popen, PIPE, DEVNULL |
| 17 | +import argparse |
| 18 | +import math |
| 19 | +import json |
| 20 | +from concurrent.futures import ProcessPoolExecutor, as_completed |
| 21 | +import random |
| 22 | + |
| 23 | + |
| 24 | +def parallel_regions(vcf, chunk=1000000): |
| 25 | + V = VCF(vcf) |
| 26 | + for cname, clen in zip(V.seqnames, V.seqlens): |
| 27 | + for start in range(0, clen, chunk): |
| 28 | + s = start+1 |
| 29 | + e = min(start+chunk, clen) |
| 30 | + yield f"{cname}:{s}-{e}" |
| 31 | + |
| 32 | + |
| 33 | + |
| 34 | +def variant2dict(v, fields=None, genotypes=False): |
| 35 | + dat = {"CHROM": v.CHROM, "POS": v.POS, "REF": v.REF, "ALT": v.ALT, "QUAL": v.QUAL} |
| 36 | + dat["call_rate"] = v.call_rate |
| 37 | + for key, val in v.INFO: |
| 38 | + if isinstance(val, str) and "," in val: |
| 39 | + val = val.split(',') |
| 40 | + if isinstance(val, tuple) or isinstance(val, list): |
| 41 | + for i, alt in enumerate(v.ALT): |
| 42 | + dat[f"INFO_{key}_{i}"] = val[i] |
| 43 | + else: |
| 44 | + dat[f"INFO_{key}"] = val |
| 45 | + if fields: |
| 46 | + dat = {K:V for K, V in dat.items() if K in fields} |
| 47 | + if genotypes: |
| 48 | + dat["genotypes"] = [v[0] + v[1] if v[0] != -1 and v[1] != -1 else float('nan') for v in v.genotypes ] |
| 49 | + return dat |
| 50 | + |
| 51 | + |
| 52 | +def one_chunk_stats(vcf, chunk, fill=True, fields=None, min_maf=0.01, min_call=0.7, subsample=0.01): |
| 53 | + cmd=f"bcftools view -r {chunk} {vcf} -Ou" |
| 54 | + if fill: |
| 55 | + cmd = f"{cmd} | bcftools +fill-tags - -Ou -- -d -t all,F_MISSING" |
| 56 | + variants = [] |
| 57 | + res = [] |
| 58 | + with Popen(cmd, shell=True, stdout=PIPE, stderr=DEVNULL) as proc: |
| 59 | + vcf = VCF(proc.stdout) |
| 60 | + for v in vcf: |
| 61 | + if v.call_rate >= min_call and len(v.ALT) == 1 and v.INFO["MAF"] >= min_maf and random.random() <= subsample: |
| 62 | + variants.append(variant2dict(v, genotypes=True)) |
| 63 | + res.append(variant2dict(v, fields=["INFO_MAF", "call_rate", "INFO_HWE", "INFO_ExcHet", "INFO_AC", "QUAL"])) |
| 64 | + del vcf |
| 65 | + if not res: |
| 66 | + return { |
| 67 | + "chunk": chunk, |
| 68 | + "n_snps": len(res), |
| 69 | + } |
| 70 | + df = pd.DataFrame.from_records(res) |
| 71 | + try: |
| 72 | + return { |
| 73 | + "chunk": chunk, |
| 74 | + "n_snps": len(res), |
| 75 | + "maf": histogram(df.INFO_MAF, "fixed_width", bin_width=0.01, adaptive=True), |
| 76 | + "call_rate": histogram(df.call_rate, "fixed_width", bin_width=0.01, adaptive=True), |
| 77 | + "hwe": histogram(df.INFO_HWE, "fixed_width", bin_width=0.01, adaptive=True), |
| 78 | + "exc_het": histogram(df.INFO_ExcHet, "fixed_width", bin_width=0.01, adaptive=True), |
| 79 | + "ac": histogram(df.INFO_AC, "fixed_width", bin_width=1, adaptive=True), |
| 80 | + "qual": histogram(df.QUAL, "fixed_width", bin_width=1, adaptive=True), |
| 81 | + "subset_variants": variants |
| 82 | + } |
| 83 | + except Exception as exc: |
| 84 | + print(exc.__class__.__name__, str(exc), df, res) |
| 85 | + |
| 86 | + |
| 87 | + |
| 88 | +def update_result(globalres, res, hists = ["maf", "call_rate", "hwe", "exc_het", "ac", "qual"], sums=["n_snps"], lists=["subset_variants",]): |
| 89 | + for listkey in lists: |
| 90 | + if listkey not in res: |
| 91 | + continue |
| 92 | + if listkey not in globalres: |
| 93 | + globalres[listkey] = [] |
| 94 | + globalres[listkey].extend(res[listkey]) |
| 95 | + for sumkey in sums: |
| 96 | + if sumkey not in res: |
| 97 | + continue |
| 98 | + if sumkey not in globalres: |
| 99 | + globalres[sumkey] = 0 |
| 100 | + globalres[sumkey] += res[sumkey] |
| 101 | + for histkey in hists: |
| 102 | + if histkey not in res: |
| 103 | + continue |
| 104 | + if histkey not in globalres: |
| 105 | + globalres[histkey] = res[histkey] |
| 106 | + continue |
| 107 | + globalres[histkey] += res[histkey] |
| 108 | + return globalres |
| 109 | + |
| 110 | + |
| 111 | +def chunkwise_bcftools_stats(vcf, threads=8): |
| 112 | + regions = list(parallel_regions(vcf)) |
| 113 | + global_res = { |
| 114 | + "samples": VCF(vcf).samples |
| 115 | + } |
| 116 | + with tqdm(total=len(regions), unit="chunk") as pbar: |
| 117 | + for i in range(0, len(regions), 1000): |
| 118 | + to=min(len(regions), i+1000) |
| 119 | + with ProcessPoolExecutor(threads) as exc: |
| 120 | + jobs = (exc.submit(one_chunk_stats, vcf, region) for region in regions[i:to]) |
| 121 | + for job in as_completed(jobs): |
| 122 | + pbar.update(1) |
| 123 | + update_result(global_res, job.result()) |
| 124 | + return global_res |
| 125 | + |
| 126 | + |
| 127 | +def genotype_pca(res): |
| 128 | + genomat = np.vstack([x["genotypes"] for x in res]) |
| 129 | + imp = sklearn.impute.SimpleImputer() |
| 130 | + genomat = imp.fit_transform(genomat) |
| 131 | + pc = sklearn.decomposition.PCA() |
| 132 | + gpc = pc.fit_transform(genomat.T) |
| 133 | + return genomat, gpc, pc.explained_variance_ratio_ |
| 134 | + |
| 135 | + |
| 136 | +def generate_report(vcf, threads): |
| 137 | + gres = chunkwise_bcftools_stats(vcf, threads=threads) |
| 138 | + |
| 139 | + fig_maf = gres["maf"].plot() |
| 140 | + fig_maf.update_xaxes(title_text="MAF") |
| 141 | + fig_maf.update_yaxes(title_text="# SNPS") |
| 142 | + fig_maf.update_layout(title_text="MAF Spectrum") |
| 143 | + MAF_CODE=fig_maf.to_html(full_html=False) |
| 144 | + |
| 145 | + fig_cr = gres["call_rate"].plot() |
| 146 | + fig_cr.update_xaxes(title_text="Call Rate") |
| 147 | + fig_cr.update_yaxes(title_text="# SNPS") |
| 148 | + fig_cr.update_layout(title_text="Call Rate Spectrum") |
| 149 | + CALLRATE_CODE = fig_cr.to_html(full_html=False) |
| 150 | + |
| 151 | + fig_hwe = gres["hwe"].plot() |
| 152 | + fig_hwe.update_xaxes(title_text="p(not in HWE)") |
| 153 | + fig_hwe.update_yaxes(title_text="# SNPS") |
| 154 | + fig_hwe.update_layout(title_text="HWE Test") |
| 155 | + HWE_CODE = fig_hwe.to_html(full_html=False) |
| 156 | + |
| 157 | + fig_xh = gres["exc_het"].plot() |
| 158 | + fig_xh.update_xaxes(title_text="p(Excess Heterozygotes)") |
| 159 | + fig_xh.update_yaxes(title_text="# SNPS") |
| 160 | + fig_xh.update_layout(title_text="ExHet Test") |
| 161 | + EXHET_CODE = fig_xh.to_html(full_html=False) |
| 162 | + |
| 163 | + fig_ac = gres["ac"].plot() |
| 164 | + fig_ac.update_xaxes(title_text="Alternate Allele Count") |
| 165 | + fig_ac.update_yaxes(title_text="# SNPS") |
| 166 | + fig_ac.update_layout(title_text="Count of Alt Alleles") |
| 167 | + AC_CODE = fig_ac.to_html(full_html=False) |
| 168 | + |
| 169 | + fig_qual = gres["qual"].plot() |
| 170 | + fig_qual.update_xaxes(title_text="Variant Quality") |
| 171 | + fig_qual.update_yaxes(title_text="# SNPS") |
| 172 | + fig_qual.update_layout(title_text="Variant Quality") |
| 173 | + QUAL_CODE = fig_qual.to_html(full_html=False) |
| 174 | + |
| 175 | + |
| 176 | + x, pc, varex = genotype_pca(gres["subset_variants"]) |
| 177 | + pc_fig = px.scatter( |
| 178 | + x=pc[:,0], |
| 179 | + y=pc[:,1], |
| 180 | + labels={"xy"[i]: f"PC {i+1} ({varex[i]*100:.1f}%)" for i in range(2)}, |
| 181 | + hover_name=gres["samples"], |
| 182 | + title="PCA on imputed subset of high-quality SNPs", |
| 183 | + ) |
| 184 | + PCA_CODE = pc_fig.to_html(full_html=False) |
| 185 | + |
| 186 | + html = f""" |
| 187 | + <html> |
| 188 | + <head> |
| 189 | + <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| 190 | + <meta http-equiv="X-UA-Compatible" content="ie=edge"> |
| 191 | + </head> |
| 192 | + <body> |
| 193 | + <h1>Statistics</h1> |
| 194 | + <h2>Minor Allele Frequency</h2> |
| 195 | + {MAF_CODE} |
| 196 | + <h2>Call Rate</h2> |
| 197 | + {CALLRATE_CODE} |
| 198 | + <h2>Hardy-Weinberg Equilibirum Tests</h2> |
| 199 | + {HWE_CODE} |
| 200 | + <h2>Excess Heterozygosity Tests</h2> |
| 201 | + {EXHET_CODE} |
| 202 | + <h2>Allele Counts</h2> |
| 203 | + {AC_CODE} |
| 204 | + <h2>Variant Quality</h2> |
| 205 | + {QUAL_CODE} |
| 206 | + <h1>PCA</h1> |
| 207 | + {PCA_CODE} |
| 208 | + </body> |
| 209 | + </html> |
| 210 | + """ |
| 211 | + return html |
| 212 | + |
| 213 | + |
| 214 | +def main(argv=None): |
| 215 | + """vcfreport: Prepare a basic html report about a VCF file""" |
| 216 | + ap = argparse.ArgumentParser("blsl vcfreport") |
| 217 | + ap.add_argument("--output", "-o", type=argparse.FileType("w"), required=True, |
| 218 | + help="Output html file") |
| 219 | + ap.add_argument("--threads", "-j", type=int, default=2, |
| 220 | + help="Parallel threads") |
| 221 | + ap.add_argument("vcf", |
| 222 | + help="VCF input file (must be indexed)") |
| 223 | + args=ap.parse_args(argv) |
| 224 | + |
| 225 | + html = generate_report(args.vcf, threads=args.threads) |
| 226 | + args.output.write(html) |
| 227 | + args.output.flush() |
0 commit comments