Skip to content

Commit ebf0215

Browse files
committed
feat(benchmarks): add script to compare C++ and Python backend results (#40)
Introduces a CLI tool to load, index, and align benchmark JSON results from both backends. It displays a side-by-side comparison table showing latency (ms), throughput (tokens/s), and the percentage speedup/slowdown.
1 parent 01ed329 commit ebf0215

1 file changed

Lines changed: 141 additions & 0 deletions

File tree

benchmark/compare.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python3
2+
"""Compare Quadtrix C++ and Python benchmark JSON files."""
3+
4+
from __future__ import annotations
5+
6+
import argparse
7+
import json
8+
from pathlib import Path
9+
from typing import Any
10+
11+
12+
# Default location of benchmark JSON output: a "results" directory that sits
# next to this script, resolved to an absolute path.
DEFAULT_RESULTS = Path(__file__).resolve().parent.joinpath("results")
13+
14+
15+
def load(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the parsed JSON document."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
18+
19+
20+
def index_rows(result: dict[str, Any]) -> dict[tuple[str, str, int, int], dict[str, Any]]:
    """Index benchmark rows by ``(suite, name, batch_size, sequence_length)``.

    Rows are read from ``result["results"]``. Missing or null ``suite`` /
    ``name`` fall back to ``""`` only when the key is absent; missing or
    falsy ``batch_size`` / ``sequence_length`` become ``0``. When several
    rows share a key, the last one wins.

    Args:
        result: Parsed benchmark document.

    Returns:
        Mapping from the 4-tuple key to the row dict it was built from.
    """
    indexed: dict[tuple[str, str, int, int], dict[str, Any]] = {}
    # `or []` also covers an explicit JSON null ("results": null), matching
    # the `or 0` tolerance already applied to the numeric fields below.
    for row in result.get("results") or []:
        key = (
            row.get("suite", ""),
            row.get("name", ""),
            int(row.get("batch_size") or 0),
            int(row.get("sequence_length") or 0),
        )
        indexed[key] = row
    return indexed
31+
32+
33+
def pct(new: float, old: float) -> float:
    """Return the percentage change going from *old* to *new*.

    A zero *old* value yields ``0.0`` instead of dividing by zero.
    """
    return 0.0 if old == 0 else (new - old) / old * 100.0
37+
38+
39+
def compare_backends(cpp_path: Path, python_path: Path) -> int:
    """Print a side-by-side table of C++ and Python benchmark results.

    Rows are matched on the key produced by ``index_rows``; only rows present
    in both files are shown. The last column is the latency change of the
    C++ row relative to the Python row, in percent.

    Returns:
        1 if either file is missing or no rows match, otherwise 0.
    """
    absent = [str(candidate) for candidate in (cpp_path, python_path) if not candidate.exists()]
    if absent:
        print("Missing benchmark result file(s):")
        for entry in absent:
            print(f" {entry}")
        print("Run benchmark/run_all.py first, or pass explicit --cpp/--python paths.")
        return 1

    cpp_doc = load(cpp_path)
    py_doc = load(python_path)
    cpp_index = index_rows(cpp_doc)
    py_index = index_rows(py_doc)

    shared_keys = sorted(cpp_index.keys() & py_index.keys())
    if not shared_keys:
        print("No matching benchmark rows found.")
        return 1

    print("Quadtrix C++ vs Python Benchmark Comparison")
    print(f"C++: {cpp_path}")
    print(f"Python: {python_path}")
    print()
    print(f"{'suite':<12} {'name':<24} {'shape':<10} {'cpp ms':>10} {'py ms':>10} {'cpp tok/s':>12} {'py tok/s':>12} {'latency':>10}")
    print("-" * 110)

    for row_key in shared_keys:
        suite, name, batch, seq = row_key
        cpp_row = cpp_index[row_key]
        py_row = py_index[row_key]

        # Missing or null metrics are shown as 0.0.
        cpp_ms = float(cpp_row.get("avg_ms") or 0.0)
        py_ms = float(py_row.get("avg_ms") or 0.0)
        cpp_tps = float(cpp_row.get("tokens_per_sec") or 0.0)
        py_tps = float(py_row.get("tokens_per_sec") or 0.0)

        if batch or seq:
            shape = f"{batch}x{seq}"
        else:
            shape = "-"

        delta = pct(cpp_ms, py_ms)
        line = (
            f"{suite:<12} {name:<24} {shape:<10} "
            f"{cpp_ms:10.3f} {py_ms:10.3f} {cpp_tps:12.1f} {py_tps:12.1f} {delta:+9.1f}%"
        )
        print(line)
    return 0
80+
81+
82+
def compare_baseline(current_path: Path, baseline_path: Path, threshold_pct: float) -> int:
    """Report latency regressions of the current results against a baseline.

    Rows are matched on the key produced by ``index_rows``. A row counts as a
    regression when its ``avg_ms`` grew by more than ``threshold_pct`` percent
    relative to the baseline row.

    Args:
        current_path: JSON results of the run under test.
        baseline_path: JSON results to compare against.
        threshold_pct: Allowed latency increase, in percent.

    Returns:
        0 when no regression exceeds the threshold, 1 when a file is missing
        or the two files share no rows, 2 when regressions were found.
    """
    missing = [str(path) for path in (current_path, baseline_path) if not path.exists()]
    if missing:
        print("Missing benchmark result file(s):")
        for path in missing:
            print(f" {path}")
        return 1

    current = load(current_path)
    baseline = load(baseline_path)
    current_rows = index_rows(current)
    baseline_rows = index_rows(baseline)
    common = sorted(set(current_rows) & set(baseline_rows))

    # Without any overlapping rows nothing was compared; reporting "no
    # regressions" would let a regression gate pass vacuously. Fail the same
    # way compare_backends does.
    if not common:
        print("No matching benchmark rows found.")
        return 1

    print("Quadtrix Benchmark Baseline Comparison")
    print(f"Current: {current_path}")
    print(f"Baseline: {baseline_path}")
    print()

    regressions = []
    for key in common:
        # Missing or null avg_ms is treated as 0.0.
        current_ms = float(current_rows[key].get("avg_ms") or 0.0)
        baseline_ms = float(baseline_rows[key].get("avg_ms") or 0.0)
        delta = pct(current_ms, baseline_ms)
        if delta > threshold_pct:
            regressions.append((key, delta, baseline_ms, current_ms))

    if not regressions:
        print(f"No latency regressions over {threshold_pct:.1f}%.")
        return 0

    print(f"Latency regressions over {threshold_pct:.1f}%:")
    for key, delta, baseline_ms, current_ms in regressions:
        suite, name, batch, seq = key
        print(
            f" {suite}/{name} {batch}x{seq}: "
            f"{baseline_ms:.3f} ms -> {current_ms:.3f} ms ({delta:+.1f}%)"
        )
    return 2
121+
122+
123+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the comparison tool.

    ``--cpp`` and ``--python`` default to files under ``DEFAULT_RESULTS``;
    ``--current`` and ``--baseline`` default to ``None``; ``--threshold-pct``
    defaults to ``10.0``.
    """
    cli = argparse.ArgumentParser(description="Compare Quadtrix benchmark results.")

    # All path-valued options share the same converter; only flag and default differ.
    path_options = (
        ("--cpp", DEFAULT_RESULTS / "cpp_benchmark.json"),
        ("--python", DEFAULT_RESULTS / "python_benchmark.json"),
        ("--current", None),
        ("--baseline", None),
    )
    for flag, fallback in path_options:
        cli.add_argument(flag, type=Path, default=fallback)

    cli.add_argument("--threshold-pct", type=float, default=10.0)
    return cli.parse_args()
131+
132+
133+
def main() -> int:
    """Run the comparison selected by the command-line options.

    When both ``--current`` and ``--baseline`` are given, the baseline
    regression check runs; when neither is given, the C++ vs Python backend
    comparison runs.

    Returns:
        The exit status of the selected comparison, or 1 when only one of
        ``--current`` / ``--baseline`` was supplied.
    """
    args = parse_args()
    has_current = args.current is not None
    has_baseline = args.baseline is not None

    # A half-specified baseline comparison used to fall through silently to
    # the backend comparison; report the mistake instead.
    if has_current != has_baseline:
        print("--current and --baseline must be given together.")
        return 1

    if has_current:
        return compare_baseline(args.current, args.baseline, args.threshold_pct)
    return compare_backends(args.cpp, args.python)


if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)