Skip to content

Commit c35e451

Browse files
committed
new tool: vcfreport
1 parent 371e7db commit c35e451

File tree

3 files changed

+245
-3
lines changed

3 files changed

+245
-3
lines changed

blsl/__init__.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
from sys import argv, exit, stderr
77

8-
__version__ = "0.3.1"
8+
__version__ = "0.4.0"
9+
910

1011
cmds = {}
1112

@@ -89,15 +90,27 @@
8990
cmds["pairs"] = pairs_main
9091

9192

93+
9294
try:
9395
from .vcfstats import main as vcfstats_main
9496
cmds["vcfstats"] = vcfstats_main
97+
except ImportError as exc:
98+
if len(argv) < 2:
99+
print(str(exc), "-- disabling vcfstats command", file=stderr)
95100

101+
try:
96102
from .vcfparallel import main as vcfparallel_main
97103
cmds["vcfparallel"] = vcfparallel_main
98104
except ImportError as exc:
99105
if len(argv) < 2:
100-
print(str(exc), "-- disabling vcfstats command", file=stderr)
106+
print(str(exc), "-- disabling vcfparallel command", file=stderr)
107+
108+
try:
109+
from .vcfreport import main as vcfreport_main
110+
cmds["vcfreport"] = vcfreport_main
111+
except ImportError as exc:
112+
if len(argv) < 2:
113+
print(str(exc), "-- disabling vcfreport command", file=stderr)
101114

102115
from .shannon import main as shannon_main
103116
cmds["shannon-entropy"] = shannon_main
@@ -131,4 +144,3 @@ def main():
131144
mainhelp()
132145
exit(1)
133146
cmds[argv[1]](argv[2:])
134-

blsl/vcfreport.py

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
#!/usr/bin/env python
2+
import plotly.express as px
3+
from cyvcf2 import VCF
4+
import pandas as pd
5+
import numpy as np
6+
from tqdm.auto import tqdm
7+
import physt
8+
from physt import h1 as histogram
9+
import physt.plotting
10+
physt.plotting.set_default_backend("plotly")
11+
import sklearn.impute
12+
import sklearn.decomposition
13+
14+
15+
from sys import stdin, stdout, stderr
16+
from subprocess import Popen, PIPE, DEVNULL
17+
import argparse
18+
import math
19+
import json
20+
from concurrent.futures import ProcessPoolExecutor, as_completed
21+
import random
22+
23+
24+
def parallel_regions(vcf, chunk=1000000):
25+
V = VCF(vcf)
26+
for cname, clen in zip(V.seqnames, V.seqlens):
27+
for start in range(0, clen, chunk):
28+
s = start+1
29+
e = min(start+chunk, clen)
30+
yield f"{cname}:{s}-{e}"
31+
32+
33+
34+
def variant2dict(v, fields=None, genotypes=False):
35+
dat = {"CHROM": v.CHROM, "POS": v.POS, "REF": v.REF, "ALT": v.ALT, "QUAL": v.QUAL}
36+
dat["call_rate"] = v.call_rate
37+
for key, val in v.INFO:
38+
if isinstance(val, str) and "," in val:
39+
val = val.split(',')
40+
if isinstance(val, tuple) or isinstance(val, list):
41+
for i, alt in enumerate(v.ALT):
42+
dat[f"INFO_{key}_{i}"] = val[i]
43+
else:
44+
dat[f"INFO_{key}"] = val
45+
if fields:
46+
dat = {K:V for K, V in dat.items() if K in fields}
47+
if genotypes:
48+
dat["genotypes"] = [v[0] + v[1] if v[0] != -1 and v[1] != -1 else float('nan') for v in v.genotypes ]
49+
return dat
50+
51+
52+
def one_chunk_stats(vcf, chunk, fill=True, fields=None, min_maf=0.01, min_call=0.7, subsample=0.01):
53+
cmd=f"bcftools view -r {chunk} {vcf} -Ou"
54+
if fill:
55+
cmd = f"{cmd} | bcftools +fill-tags - -Ou -- -d -t all,F_MISSING"
56+
variants = []
57+
res = []
58+
with Popen(cmd, shell=True, stdout=PIPE, stderr=DEVNULL) as proc:
59+
vcf = VCF(proc.stdout)
60+
for v in vcf:
61+
if v.call_rate >= min_call and len(v.ALT) == 1 and v.INFO["MAF"] >= min_maf and random.random() <= subsample:
62+
variants.append(variant2dict(v, genotypes=True))
63+
res.append(variant2dict(v, fields=["INFO_MAF", "call_rate", "INFO_HWE", "INFO_ExcHet", "INFO_AC", "QUAL"]))
64+
del vcf
65+
if not res:
66+
return {
67+
"chunk": chunk,
68+
"n_snps": len(res),
69+
}
70+
df = pd.DataFrame.from_records(res)
71+
try:
72+
return {
73+
"chunk": chunk,
74+
"n_snps": len(res),
75+
"maf": histogram(df.INFO_MAF, "fixed_width", bin_width=0.01, adaptive=True),
76+
"call_rate": histogram(df.call_rate, "fixed_width", bin_width=0.01, adaptive=True),
77+
"hwe": histogram(df.INFO_HWE, "fixed_width", bin_width=0.01, adaptive=True),
78+
"exc_het": histogram(df.INFO_ExcHet, "fixed_width", bin_width=0.01, adaptive=True),
79+
"ac": histogram(df.INFO_AC, "fixed_width", bin_width=1, adaptive=True),
80+
"qual": histogram(df.QUAL, "fixed_width", bin_width=1, adaptive=True),
81+
"subset_variants": variants
82+
}
83+
except Exception as exc:
84+
print(exc.__class__.__name__, str(exc), df, res)
85+
86+
87+
88+
def update_result(globalres, res, hists = ["maf", "call_rate", "hwe", "exc_het", "ac", "qual"], sums=["n_snps"], lists=["subset_variants",]):
89+
for listkey in lists:
90+
if listkey not in res:
91+
continue
92+
if listkey not in globalres:
93+
globalres[listkey] = []
94+
globalres[listkey].extend(res[listkey])
95+
for sumkey in sums:
96+
if sumkey not in res:
97+
continue
98+
if sumkey not in globalres:
99+
globalres[sumkey] = 0
100+
globalres[sumkey] += res[sumkey]
101+
for histkey in hists:
102+
if histkey not in res:
103+
continue
104+
if histkey not in globalres:
105+
globalres[histkey] = res[histkey]
106+
continue
107+
globalres[histkey] += res[histkey]
108+
return globalres
109+
110+
111+
def chunkwise_bcftools_stats(vcf, threads=8):
112+
regions = list(parallel_regions(vcf))
113+
global_res = {
114+
"samples": VCF(vcf).samples
115+
}
116+
with tqdm(total=len(regions), unit="chunk") as pbar:
117+
for i in range(0, len(regions), 1000):
118+
to=min(len(regions), i+1000)
119+
with ProcessPoolExecutor(threads) as exc:
120+
jobs = (exc.submit(one_chunk_stats, vcf, region) for region in regions[i:to])
121+
for job in as_completed(jobs):
122+
pbar.update(1)
123+
update_result(global_res, job.result())
124+
return global_res
125+
126+
127+
def genotype_pca(res):
128+
genomat = np.vstack([x["genotypes"] for x in res])
129+
imp = sklearn.impute.SimpleImputer()
130+
genomat = imp.fit_transform(genomat)
131+
pc = sklearn.decomposition.PCA()
132+
gpc = pc.fit_transform(genomat.T)
133+
return genomat, gpc, pc.explained_variance_ratio_
134+
135+
136+
def generate_report(vcf, threads):
137+
gres = chunkwise_bcftools_stats(vcf, threads=threads)
138+
139+
fig_maf = gres["maf"].plot()
140+
fig_maf.update_xaxes(title_text="MAF")
141+
fig_maf.update_yaxes(title_text="# SNPS")
142+
fig_maf.update_layout(title_text="MAF Spectrum")
143+
MAF_CODE=fig_maf.to_html(full_html=False)
144+
145+
fig_cr = gres["call_rate"].plot()
146+
fig_cr.update_xaxes(title_text="Call Rate")
147+
fig_cr.update_yaxes(title_text="# SNPS")
148+
fig_cr.update_layout(title_text="Call Rate Spectrum")
149+
CALLRATE_CODE = fig_cr.to_html(full_html=False)
150+
151+
fig_hwe = gres["hwe"].plot()
152+
fig_hwe.update_xaxes(title_text="p(not in HWE)")
153+
fig_hwe.update_yaxes(title_text="# SNPS")
154+
fig_hwe.update_layout(title_text="HWE Test")
155+
HWE_CODE = fig_hwe.to_html(full_html=False)
156+
157+
fig_xh = gres["exc_het"].plot()
158+
fig_xh.update_xaxes(title_text="p(Excess Heterozygotes)")
159+
fig_xh.update_yaxes(title_text="# SNPS")
160+
fig_xh.update_layout(title_text="ExHet Test")
161+
EXHET_CODE = fig_xh.to_html(full_html=False)
162+
163+
fig_ac = gres["ac"].plot()
164+
fig_ac.update_xaxes(title_text="Alternate Allele Count")
165+
fig_ac.update_yaxes(title_text="# SNPS")
166+
fig_ac.update_layout(title_text="Count of Alt Alleles")
167+
AC_CODE = fig_ac.to_html(full_html=False)
168+
169+
fig_qual = gres["qual"].plot()
170+
fig_qual.update_xaxes(title_text="Variant Quality")
171+
fig_qual.update_yaxes(title_text="# SNPS")
172+
fig_qual.update_layout(title_text="Variant Quality")
173+
QUAL_CODE = fig_qual.to_html(full_html=False)
174+
175+
176+
x, pc, varex = genotype_pca(gres["subset_variants"])
177+
pc_fig = px.scatter(
178+
x=pc[:,0],
179+
y=pc[:,1],
180+
labels={"xy"[i]: f"PC {i+1} ({varex[i]*100:.1f}%)" for i in range(2)},
181+
hover_name=gres["samples"],
182+
title="PCA on imputed subset of high-quality SNPs",
183+
)
184+
PCA_CODE = pc_fig.to_html(full_html=False)
185+
186+
html = f"""
187+
<html>
188+
<head>
189+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
190+
<meta http-equiv="X-UA-Compatible" content="ie=edge">
191+
</head>
192+
<body>
193+
<h1>Statistics</h1>
194+
<h2>Minor Allele Frequency</h2>
195+
{MAF_CODE}
196+
<h2>Call Rate</h2>
197+
{CALLRATE_CODE}
198+
<h2>Hardy-Weinberg Equilibirum Tests</h2>
199+
{HWE_CODE}
200+
<h2>Excess Heterozygosity Tests</h2>
201+
{EXHET_CODE}
202+
<h2>Allele Counts</h2>
203+
{AC_CODE}
204+
<h2>Variant Quality</h2>
205+
{QUAL_CODE}
206+
<h1>PCA</h1>
207+
{PCA_CODE}
208+
</body>
209+
</html>
210+
"""
211+
return html
212+
213+
214+
def main(argv=None):
215+
"""vcfreport: Prepare a basic html report about a VCF file"""
216+
ap = argparse.ArgumentParser("blsl vcfreport")
217+
ap.add_argument("--output", "-o", type=argparse.FileType("w"), required=True,
218+
help="Output html file")
219+
ap.add_argument("--threads", "-j", type=int, default=2,
220+
help="Parallel threads")
221+
ap.add_argument("vcf",
222+
help="VCF input file (must be indexed)")
223+
args=ap.parse_args(argv)
224+
225+
html = generate_report(args.vcf, threads=args.threads)
226+
args.output.write(html)
227+
args.output.flush()

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ dependencies = [
1010
"biopython",
1111
"cyvcf2",
1212
"pandas",
13+
"scikit-learn",
14+
"physt",
15+
"plotly",
1316
]
1417
dynamic = [
1518
"version",

0 commit comments

Comments
 (0)