From 035962c0735e6387824a06199e50a717179c9047 Mon Sep 17 00:00:00 2001 From: Arye Kogan Date: Sat, 7 Feb 2026 22:59:24 +0200 Subject: [PATCH 1/6] docs: remove redundant rollout and metadata guides --- README.md | 1 - ROLLOUT_CHECKLIST.md | 36 ------------------- docs/guides/AGENTS.md | 1 - docs/guides/repository-metadata.md | 55 ------------------------------ 4 files changed, 93 deletions(-) delete mode 100644 ROLLOUT_CHECKLIST.md delete mode 100644 docs/guides/repository-metadata.md diff --git a/README.md b/README.md index b9d4673..6392f0b 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,6 @@ benchmarks/ - `docs/guides/adding-scenarios.md` - how to add parity scenarios - `docs/guides/benchmark-workflow.md` - benchmark and reporting flow - `docs/guides/benchmark-publication-policy.md` - minimum disclosure for publishable results -- `docs/guides/repository-metadata.md` - repository metadata and positioning **Governance:** - `MAINTAINERS.md` - maintainership roles and triage expectations diff --git a/ROLLOUT_CHECKLIST.md b/ROLLOUT_CHECKLIST.md deleted file mode 100644 index 2c9e47b..0000000 --- a/ROLLOUT_CHECKLIST.md +++ /dev/null @@ -1,36 +0,0 @@ -# OSS Hardening and OMO Enablement - Rollout Checklist - -## Summary -All 16 tasks completed successfully. Work is ready for PR. - -## Branch -- **Branch**: feat/oss-hardening-and-omo-enablement -- **Worktree**: .worktrees/feat-oss-hardening/ -- **Commits**: 7 commits ahead of main - -## Files Changed -1. LICENSE (new) - MIT License -2. MAINTAINERS.md (new) - Governance doc -3. README.md - Badge row, modkit narrative, trust section -4. docs/guides/AGENTS.md - Guide index updates -5. docs/guides/benchmark-publication-policy.md (new) - Publication policy -6. docs/guides/repository-metadata.md (new) - Metadata profile - -## Suggested PR Sequence - -### PR 1: Legal and Baseline Trust - - -### PR 2: Governance and Positioning - - -### PR 3: OMO Setup Guide - - -## Post-Merge Actions -1. Apply GitHub metadata via commands in docs/guides/repository-metadata.md -2. Verify badges render correctly on README -3. Confirm docs/guides navigation works - -## Evidence -See .sisyphus/evidence/task-8-final-gate.txt for verification output. diff --git a/docs/guides/AGENTS.md b/docs/guides/AGENTS.md index aa32981..751150b 100644 --- a/docs/guides/AGENTS.md +++ b/docs/guides/AGENTS.md @@ -15,7 +15,6 @@ Operator-facing guides for parity contract usage, scenario authoring, and benchm | Add or modify scenarios safely | `docs/guides/adding-scenarios.md` | Endpoint-grouped workflow + checklist | | Run benchmark pipeline | `docs/guides/benchmark-workflow.md` | Standard, per-target, and artifact expectations | | Publish benchmark results | `docs/guides/benchmark-publication-policy.md` | Minimum disclosure for publishable results | -| Manage repo metadata | `docs/guides/repository-metadata.md` | Description, topics, and gh commands | | View maintainership info | `MAINTAINERS.md` | Roles, triage SLA, and escalation | diff --git a/docs/guides/repository-metadata.md b/docs/guides/repository-metadata.md deleted file mode 100644 index f896ef8..0000000 --- a/docs/guides/repository-metadata.md +++ /dev/null @@ -1,55 +0,0 @@ -# Repository Metadata Profile - -This document defines the GitHub repository metadata for the `benchmarks` project. Maintaining accurate metadata ensures discoverability and clear positioning within the `go-modkit` ecosystem. 
- -## Metadata Profile - -| Field | Value | -|-------|-------| -| **Description** | Parity-gated benchmark harness for API framework performance comparison. Ensures correctness via declarative contract fixtures before measuring throughput and latency. | -| **Topics** | `go`, `benchmark`, `api-parity`, `performance-testing`, `reproducibility`, `modkit`, `framework-comparison`, `api-contract`, `docker-orchestration`, `quality-gates` | -| **Homepage** | `https://github.com/go-modkit/benchmarks` | - -## Application Checklist - -### Automated (via GitHub CLI) - -If you have the `gh` CLI installed and authenticated, run the following command to apply the profile: - -```bash -gh repo edit go-modkit/benchmarks \ - --description "Parity-gated benchmark harness for API framework performance comparison. Ensures correctness via declarative contract fixtures before measuring throughput and latency." \ - --add-topic "go,benchmark,api-parity,performance-testing,reproducibility,modkit,framework-comparison,api-contract,docker-orchestration,quality-gates" \ - --homepage "https://github.com/go-modkit/benchmarks" -``` - -### Manual Fallback - -If the GitHub CLI is unavailable, follow these steps: - -1. Navigate to the repository on GitHub: [go-modkit/benchmarks](https://github.com/go-modkit/benchmarks) -2. Click on the **Settings** gear icon (or the "About" section edit icon on the main page). -3. In the **Description** field, paste: - > Parity-gated benchmark harness for API framework performance comparison. Ensures correctness via declarative contract fixtures before measuring throughput and latency. -4. In the **Website** field, paste: - > https://github.com/go-modkit/benchmarks -5. In the **Topics** section, add the following tags one by one: - - `go` - - `benchmark` - - `api-parity` - - `performance-testing` - - `reproducibility` - - `modkit` - - `framework-comparison` - - `api-contract` - - `docker-orchestration` - - `quality-gates` -6. Click **Save changes**. 
- -## Verification - -To verify the current metadata, run: - -```bash -gh repo view go-modkit/benchmarks --json description,repositoryTopics,homepage -``` From b18d2c3d9070673ba7976342a69d9bf758bb7ca8 Mon Sep 17 00:00:00 2001 From: Arye Kogan Date: Sun, 8 Feb 2026 00:20:55 +0200 Subject: [PATCH 2/6] test: expand parity and benchmark script test coverage --- .bats-version | 1 + Makefile | 38 +++- cmd/parity-test/main_test.go | 187 ++++++++++++++++++ pytest.ini | 5 + tests/__init__.py | 0 tests/fixtures/README.md | 34 ++++ tests/fixtures/raw/modkit-invalid-json.json | 11 ++ .../fixtures/raw/modkit-missing-required.json | 5 + tests/fixtures/raw/modkit-ok.json | 87 ++++++++ tests/fixtures/raw/nestjs-skipped-health.json | 7 + tests/fixtures/raw/wire-skipped-parity.json | 7 + tests/fixtures/summary/expected-report.md | 32 +++ tests/fixtures/summary/expected-summary.json | 59 ++++++ tests/integration/README.md | 12 ++ tests/integration/helpers/load.bash | 15 ++ tests/integration/test_run_scripts.bats | 21 ++ tests/unit/__init__.py | 0 tests/unit/conftest.py | 30 +++ tests/unit/script_loader.py | 14 ++ tests/unit/test_benchmark_measure.py | 23 +++ tests/unit/test_benchmark_quality_check.py | 20 ++ tests/unit/test_environment_manifest.py | 30 +++ tests/unit/test_generate_report.py | 55 ++++++ tests/unit/test_validate_result_schemas.py | 68 +++++++ 24 files changed, 759 insertions(+), 2 deletions(-) create mode 100644 .bats-version create mode 100644 pytest.ini create mode 100644 tests/__init__.py create mode 100644 tests/fixtures/README.md create mode 100644 tests/fixtures/raw/modkit-invalid-json.json create mode 100644 tests/fixtures/raw/modkit-missing-required.json create mode 100644 tests/fixtures/raw/modkit-ok.json create mode 100644 tests/fixtures/raw/nestjs-skipped-health.json create mode 100644 tests/fixtures/raw/wire-skipped-parity.json create mode 100644 tests/fixtures/summary/expected-report.md create mode 100644 tests/fixtures/summary/expected-summary.json create mode 100644 tests/integration/README.md create mode 100644 tests/integration/helpers/load.bash create mode 100644 tests/integration/test_run_scripts.bats create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/conftest.py create mode 100644 tests/unit/script_loader.py create mode 100644 tests/unit/test_benchmark_measure.py create mode 100644 tests/unit/test_benchmark_quality_check.py create mode 100644 tests/unit/test_environment_manifest.py create mode 100644 tests/unit/test_generate_report.py create mode 100644 tests/unit/test_validate_result_schemas.py diff --git a/.bats-version b/.bats-version new file mode 100644 index 0000000..1cac385 --- /dev/null +++ b/.bats-version @@ -0,0 +1 @@ +1.11.0 diff --git a/Makefile b/Makefile index 94c0571..80e49fe 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ SHELL := /bin/sh PYTHON ?= python3 GO ?= go +PYTEST ?= pytest +BATS ?= bats GOPATH ?= $(shell $(GO) env GOPATH) GO_PATCH_COVER ?= $(GOPATH)/bin/go-patch-cover MODULES = $(shell find . 
-type f -name "go.mod" -not -path "*/.*/*" -not -path "*/vendor/*" -exec dirname {} \;) -.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-coverage test-patch-coverage tools parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check report-disclaimer-check methodology-changelog-check publication-sync-check +.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-go test-python test-shell test-scripts test-coverage test-coverage-go test-coverage-python test-patch-coverage tools parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check report-disclaimer-check methodology-changelog-check publication-sync-check benchmark: bash scripts/run-all.sh @@ -32,9 +34,34 @@ report: $(PYTHON) scripts/generate-report.py test: + $(MAKE) test-go + $(MAKE) test-scripts + +test-go: $(GO) test ./... +test-python: + @if ! command -v $(PYTEST) >/dev/null 2>&1; then \ + echo "pytest not found; install with: $(PYTHON) -m pip install pytest pytest-cov"; \ + exit 1; \ + fi + $(PYTEST) tests/unit + +test-shell: + @if ! command -v $(BATS) >/dev/null 2>&1; then \ + echo "bats not found; install bats-core before running shell tests"; \ + exit 1; \ + fi + $(BATS) tests/integration + +test-scripts: + $(MAKE) test-python + $(MAKE) test-shell + test-coverage: + $(MAKE) test-coverage-go + +test-coverage-go: @mkdir -p .coverage @echo "mode: atomic" > .coverage/coverage.out @for mod in $(MODULES); do \ @@ -44,10 +71,17 @@ test-coverage: tail -n +2 $$mod/profile.out >> .coverage/coverage.out; \ rm $$mod/profile.out; \ fi; \ - done + done @printf "\nTotal Coverage:\n" @$(GO) tool cover -func=.coverage/coverage.out | grep "total:" +test-coverage-python: + @if ! command -v $(PYTEST) >/dev/null 2>&1; then \ + echo "pytest not found; install with: $(PYTHON) -m pip install pytest pytest-cov"; \ + exit 1; \ + fi + $(PYTEST) tests/unit --cov=scripts --cov-report=term-missing + test-patch-coverage: tools test-coverage @echo "Comparing against origin/main..." 
@git diff -U0 --no-color origin/main...HEAD > .coverage/diff.patch diff --git a/cmd/parity-test/main_test.go b/cmd/parity-test/main_test.go index d6c8226..dfd9634 100644 --- a/cmd/parity-test/main_test.go +++ b/cmd/parity-test/main_test.go @@ -2,9 +2,13 @@ package main import ( "encoding/json" + "net/http" + "net/http/httptest" "os" "path/filepath" + "strings" "testing" + "time" ) func TestMatchStringValue_Tokens(t *testing.T) { @@ -127,3 +131,186 @@ func TestParityFixtures_AreWellFormed(t *testing.T) { } } } + +func TestSeedTarget_DisabledEndpoint(t *testing.T) { + t.Parallel() + + if err := seedTarget("http://example.invalid", "does-not-matter", "", time.Second); err != nil { + t.Fatalf("expected nil when seed endpoint disabled, got %v", err) + } +} + +func TestSeedTarget_MissingSeedFileIsNoop(t *testing.T) { + t.Parallel() + + tmp := t.TempDir() + if err := seedTarget("http://example.invalid", tmp, "/debug/parity/seed", time.Second); err != nil { + t.Fatalf("expected nil when seed.json missing, got %v", err) + } +} + +func TestSeedTarget_HTTPFailure(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + tmp := t.TempDir() + seedPath := filepath.Join(tmp, "seed.json") + if err := os.WriteFile(seedPath, []byte(`{"users":[]}`), 0o644); err != nil { + t.Fatalf("failed writing seed file: %v", err) + } + + err := seedTarget(server.URL, tmp, "/debug/parity/seed", time.Second) + if err == nil { + t.Fatal("expected error for non-2xx seed response") + } + if !strings.Contains(err.Error(), "seed request returned 500") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestRunScenario_StatusMismatch(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"ok":true}`)) + })) + defer server.Close() + + s := Scenario{ + Name: "status mismatch", + Request: RequestSpec{Path: "/health"}, + Response: ResponseSpec{Status: http.StatusCreated, Body: map[string]interface{}{"ok": true}}, + } + + err := runScenario(&http.Client{Timeout: time.Second}, server.URL, s) + if err == nil || !strings.Contains(err.Error(), "status 200 != 201") { + t.Fatalf("expected status mismatch error, got %v", err) + } +} + +func TestRunScenario_HeaderMismatch(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-Version", "v1") + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"ok":true}`)) + })) + defer server.Close() + + s := Scenario{ + Name: "header mismatch", + Request: RequestSpec{Path: "/health"}, + Response: ResponseSpec{ + Status: http.StatusOK, + Headers: map[string]string{"X-Version": "v2"}, + Body: map[string]interface{}{"ok": true}, + }, + } + + err := runScenario(&http.Client{Timeout: time.Second}, server.URL, s) + if err == nil || !strings.Contains(err.Error(), "header X-Version mismatch") { + t.Fatalf("expected header mismatch error, got %v", err) + } +} + +func TestRunScenario_InvalidJSONResponse(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("not-json")) + })) + defer server.Close() + + s := 
Scenario{ + Name: "invalid json", + Request: RequestSpec{Path: "/health"}, + Response: ResponseSpec{Status: http.StatusOK, Body: map[string]interface{}{"ok": true}}, + } + + err := runScenario(&http.Client{Timeout: time.Second}, server.URL, s) + if err == nil || !strings.Contains(err.Error(), "invalid response JSON") { + t.Fatalf("expected invalid JSON error, got %v", err) + } +} + +func TestRunScenario_DefaultMethodIsGET(t *testing.T) { + t.Parallel() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + t.Fatalf("expected default method GET, got %s", r.Method) + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"ok":true}`)) + })) + defer server.Close() + + s := Scenario{ + Name: "default method", + Request: RequestSpec{Path: "/health"}, + Response: ResponseSpec{Status: http.StatusOK, Body: map[string]interface{}{"ok": true}}, + } + + if err := runScenario(&http.Client{Timeout: time.Second}, server.URL, s); err != nil { + t.Fatalf("unexpected runScenario error: %v", err) + } +} + +func TestMatchStringValue_InterpolatedTokensRejectInvalidSegments(t *testing.T) { + t.Parallel() + + if matchStringValue("/users/@any_number/events/@is_iso8601", "/users/abc/events/2025-01-01T00:00:00Z") { + t.Fatal("expected non-numeric segment to fail @any_number") + } + if matchStringValue("/users/@any_number/events/@is_iso8601", "/users/123/events/not-a-date") { + t.Fatal("expected non-iso segment to fail @is_iso8601") + } +} + +func TestParityFixtures_NegativeContractCases(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + scenario Scenario + }{ + { + name: "empty request path", + scenario: Scenario{ + Name: "bad-path", + Request: RequestSpec{ + Path: "", + }, + Response: ResponseSpec{Status: 200}, + }, + }, + { + name: "invalid response status", + scenario: Scenario{ + Name: "bad-status", + Request: RequestSpec{Path: "/health"}, + Response: ResponseSpec{Status: 0}, + }, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if tc.scenario.Request.Path != "" && tc.scenario.Response.Status > 0 { + t.Fatalf("expected invalid fixture contract for case %q", tc.name) + } + }) + } +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..2276ad3 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +testpaths = tests/unit +python_files = test_*.py +python_functions = test_* +addopts = -q diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md new file mode 100644 index 0000000..7b031f6 --- /dev/null +++ b/tests/fixtures/README.md @@ -0,0 +1,34 @@ +# Test Fixtures for Benchmark Scripts + +This directory contains deterministic test fixtures for validating the benchmark pipeline. + +## Structure + +``` +tests/fixtures/ +├── raw/ # Raw benchmark result JSON files +├── summary/ # Expected summary.json outputs +└── compose/ # Docker compose test files +``` + +## Fixture Categories + +### Raw Results +- `ok/` - Successful benchmark runs with full metrics +- `skipped/` - Skipped runs (health unavailable, parity failed, etc.) +- `invalid/` - Malformed JSON for error testing + +### Usage +Fixtures are loaded by test files to verify: +1. Schema validation behavior +2. Report generation output +3. Quality check calculations +4. 
Edge case handling + +## Creating New Fixtures + +When adding fixtures: +1. Use realistic but anonymized data +2. Include both happy path and edge cases +3. Document any special conditions in comments +4. Keep fixtures minimal - only include required fields diff --git a/tests/fixtures/raw/modkit-invalid-json.json b/tests/fixtures/raw/modkit-invalid-json.json new file mode 100644 index 0000000..231b727 --- /dev/null +++ b/tests/fixtures/raw/modkit-invalid-json.json @@ -0,0 +1,11 @@ +{ + "schema_version": "raw-v1", + "framework": "modkit", + "target": "http://localhost:3001", + "status": "ok", + "parity": "passed", + "engine": "legacy", + "metric_units": { + "throughput": "requests_per_second" + }, + "benchmark": { "median": { "rps": 600.0 } } diff --git a/tests/fixtures/raw/modkit-missing-required.json b/tests/fixtures/raw/modkit-missing-required.json new file mode 100644 index 0000000..09e27af --- /dev/null +++ b/tests/fixtures/raw/modkit-missing-required.json @@ -0,0 +1,5 @@ +{ + "schema_version": "raw-v1", + "framework": "modkit", + "reason": "fixture intentionally omits required fields" +} diff --git a/tests/fixtures/raw/modkit-ok.json b/tests/fixtures/raw/modkit-ok.json new file mode 100644 index 0000000..0bfc1f0 --- /dev/null +++ b/tests/fixtures/raw/modkit-ok.json @@ -0,0 +1,87 @@ +{ + "schema_version": "raw-v1", + "framework": "modkit", + "target": "http://localhost:3001", + "status": "ok", + "parity": "passed", + "engine": "legacy", + "metric_units": { + "throughput": "requests_per_second", + "latency": "milliseconds", + "memory": "mb", + "cpu": "percent", + "startup": "milliseconds" + }, + "benchmark": { + "endpoint": "/health", + "warmup_requests": 100, + "requests_per_run": 300, + "runs": 3, + "run_stats": [ + { + "requests": 300, + "duration_seconds": 0.5, + "rps": 600.0, + "latency_ms_p50": 1.5, + "latency_ms_p95": 2.5, + "latency_ms_p99": 3.0, + "latency_ms_max": 5.0 + }, + { + "requests": 300, + "duration_seconds": 0.48, + "rps": 625.0, + "latency_ms_p50": 1.4, + "latency_ms_p95": 2.3, + "latency_ms_p99": 2.8, + "latency_ms_max": 4.5 + }, + { + "requests": 300, + "duration_seconds": 0.52, + "rps": 576.9, + "latency_ms_p50": 1.6, + "latency_ms_p95": 2.6, + "latency_ms_p99": 3.1, + "latency_ms_max": 5.2 + } + ], + "quality": { + "policy": { + "outlier_method": "iqr_1.5", + "outlier_thresholds": { + "rps": {"lower": 400.0, "upper": 800.0}, + "latency_ms_p95": {"lower": 1.0, "upper": 4.0} + }, + "variance_thresholds_cv": { + "rps": 0.10, + "latency_ms_p95": 0.20, + "latency_ms_p99": 0.25 + } + }, + "excluded_samples": [], + "effective_runs": 3, + "variance": { + "rps_cv": 0.04, + "latency_ms_p95_cv": 0.06, + "latency_ms_p99_cv": 0.05 + } + }, + "median": { + "rps": 600.0, + "latency_ms_p50": 1.5, + "latency_ms_p95": 2.5, + "latency_ms_p99": 2.8 + } + }, + "docker": { + "container": "modkit-app", + "memory": "50MiB / 100MiB", + "cpu": "12.34%" + }, + "resources_normalized": { + "memory_mb": 50.0, + "cpu_percent": 12.34, + "startup_ms": 15.2 + } +} diff --git a/tests/fixtures/raw/nestjs-skipped-health.json b/tests/fixtures/raw/nestjs-skipped-health.json new file mode 100644 index 0000000..0ef8d34 --- /dev/null +++ b/tests/fixtures/raw/nestjs-skipped-health.json @@ -0,0 +1,7 @@ +{ + "schema_version": "raw-v1", + "framework": "nestjs", + "target": "http://localhost:3002", + "status": "skipped", + "reason": "target health endpoint unavailable" +} diff --git a/tests/fixtures/raw/wire-skipped-parity.json b/tests/fixtures/raw/wire-skipped-parity.json new file mode 100644 index 
0000000..fe07fc5 --- /dev/null +++ b/tests/fixtures/raw/wire-skipped-parity.json @@ -0,0 +1,7 @@ +{ + "schema_version": "raw-v1", + "framework": "wire", + "target": "http://localhost:3004", + "status": "skipped", + "reason": "parity check failed" +} diff --git a/tests/fixtures/summary/expected-report.md b/tests/fixtures/summary/expected-report.md new file mode 100644 index 0000000..48cec37 --- /dev/null +++ b/tests/fixtures/summary/expected-report.md @@ -0,0 +1,32 @@ +# Benchmark Report + +Generated: `2026-01-01T00:00:00+00:00` + +## Overview +- Total targets: 3 +- Successful: 1 +- Skipped: 2 + +## Results + +| Framework | Status | Median RPS | P50 Latency (ms) | P95 Latency (ms) | P99 Latency (ms) | Notes | +|---|---:|---:|---:|---:|---:|---| +| modkit | ok | 600.00 | 1.50 | 2.50 | 2.80 | | +| nestjs | skipped | - | - | - | - | target health endpoint unavailable | +| wire | skipped | - | - | - | - | parity check failed | + +## Fairness Disclaimer + +- Language-vs-framework caveat: cross-language results include runtime and ecosystem effects and must not be treated as framework-only deltas. +- Cross-language baseline: compare implementations with equivalent API behavior, workload profile, and environment constraints before drawing conclusions. + +## Anti-Misinterpretation Guidance + +- Do not rank frameworks across languages as absolute winners; use results as scenario-specific signals. +- Treat large cross-language deltas as prompts for deeper profiling (runtime, I/O, GC, and dependency effects), not as standalone product claims. +- Parity failures invalidate performance interpretation until correctness is restored. + +## Raw Artifacts + +- Raw JSON: `results/latest/raw/*.json` +- Summary JSON: `results/latest/summary.json` diff --git a/tests/fixtures/summary/expected-summary.json b/tests/fixtures/summary/expected-summary.json new file mode 100644 index 0000000..ee69d2f --- /dev/null +++ b/tests/fixtures/summary/expected-summary.json @@ -0,0 +1,59 @@ +{ + "schema_version": "summary-v1", + "generated_at": "2026-01-01T00:00:00+00:00", + "total_targets": 3, + "successful_targets": 1, + "skipped_targets": 2, + "targets": [ + { + "framework": "modkit", + "status": "ok", + "target": "http://localhost:3001", + "reason": null, + "provenance": { + "raw_source": "results/latest/raw/modkit-ok.json" + }, + "uncertainty": { + "rps_cv": 0.04, + "latency_ms_p95_cv": 0.06, + "latency_ms_p99_cv": 0.05 + }, + "median": { + "rps": 600.0, + "latency_ms_p50": 1.5, + "latency_ms_p95": 2.5, + "latency_ms_p99": 2.8 + }, + "resources_normalized": { + "memory_mb": 50.0, + "cpu_percent": 12.34, + "startup_ms": 15.2 + }, + "metric_units": { + "throughput": "requests_per_second", + "latency": "milliseconds", + "memory": "mb", + "cpu": "percent", + "startup": "milliseconds" + } + }, + { + "framework": "nestjs", + "status": "skipped", + "target": "http://localhost:3002", + "reason": "target health endpoint unavailable", + "provenance": { + "raw_source": "results/latest/raw/nestjs-skipped-health.json" + } + }, + { + "framework": "wire", + "status": "skipped", + "target": "http://localhost:3004", + "reason": "parity check failed", + "provenance": { + "raw_source": "results/latest/raw/wire-skipped-parity.json" + } + } + ] +} diff --git a/tests/integration/README.md b/tests/integration/README.md new file mode 100644 index 0000000..9fe6107 --- /dev/null +++ b/tests/integration/README.md @@ -0,0 +1,12 @@ +# Shell Integration Tests + +This directory contains `bats` tests for benchmark orchestration shell scripts. 
+ +Run locally: + +```bash +bats tests/integration +``` + +These tests are intended to validate argument guards and path safety checks without +requiring Docker or live framework services. diff --git a/tests/integration/helpers/load.bash b/tests/integration/helpers/load.bash new file mode 100644 index 0000000..f91be26 --- /dev/null +++ b/tests/integration/helpers/load.bash @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +setup_temp_results() { + export TEST_TMPDIR + TEST_TMPDIR="$(mktemp -d)" + export RESULTS_DIR="$TEST_TMPDIR/results/latest" + export RESULTS_RAW_DIR="$RESULTS_DIR/raw" + mkdir -p "$RESULTS_RAW_DIR" +} + +teardown_temp_results() { + if [ -n "${TEST_TMPDIR:-}" ] && [ -d "$TEST_TMPDIR" ]; then + rm -rf "$TEST_TMPDIR" + fi +} diff --git a/tests/integration/test_run_scripts.bats b/tests/integration/test_run_scripts.bats new file mode 100644 index 0000000..f667fc7 --- /dev/null +++ b/tests/integration/test_run_scripts.bats @@ -0,0 +1,21 @@ +#!/usr/bin/env bats + +load "helpers/load.bash" + +setup() { + setup_temp_results +} + +teardown() { + teardown_temp_results +} + +@test "run-single rejects unknown framework" { + run bash scripts/run-single.sh unknown-framework + [ "$status" -ne 0 ] +} + +@test "run-all rejects raw dir outside results/latest" { + run env RESULTS_RAW_DIR="/tmp" bash scripts/run-all.sh + [ "$status" -ne 0 ] +} diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 0000000..12a9649 --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="session") +def repo_root() -> Path: + return Path(__file__).resolve().parents[2] + + +@pytest.fixture(scope="session", autouse=True) +def scripts_on_path(repo_root: Path) -> None: + scripts_dir = repo_root / "scripts" + if str(scripts_dir) not in sys.path: + sys.path.insert(0, str(scripts_dir)) + + +@pytest.fixture() +def temp_results_dir(tmp_path: Path) -> Path: + path = tmp_path / "results" / "latest" + path.mkdir(parents=True, exist_ok=True) + return path + + +@pytest.fixture(scope="session") +def fixture_root(repo_root: Path) -> Path: + return repo_root / "tests" / "fixtures" diff --git a/tests/unit/script_loader.py b/tests/unit/script_loader.py new file mode 100644 index 0000000..73e4077 --- /dev/null +++ b/tests/unit/script_loader.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path + + +def load_script_module(repo_root: Path, relative_path: str, module_name: str): + path = repo_root / relative_path + spec = importlib.util.spec_from_file_location(module_name, path) + if spec is None or spec.loader is None: + raise RuntimeError(f"failed to load module from {path}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module diff --git a/tests/unit/test_benchmark_measure.py b/tests/unit/test_benchmark_measure.py new file mode 100644 index 0000000..07375ab --- /dev/null +++ b/tests/unit/test_benchmark_measure.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from .script_loader import load_script_module + + +def test_parse_mem_to_mb_and_cpu_percent(repo_root): + mod = load_script_module(repo_root, "scripts/benchmark-measure.py", "benchmark_measure_parse") + + assert mod.parse_mem_to_mb("128MiB / 1GiB") == 128.0 + assert mod.parse_mem_to_mb("1GiB / 2GiB") == 1024.0 + assert 
mod.parse_cpu_percent("12.5%") == 12.5 + + +def test_coefficient_of_variation_and_iqr(repo_root): + mod = load_script_module(repo_root, "scripts/benchmark-measure.py", "benchmark_measure_stats") + + cv = mod.coefficient_of_variation([100.0, 100.0, 100.0]) + assert cv == 0.0 + + outlier_indexes, lower, upper = mod.detect_iqr_outlier_indexes([100.0, 101.0, 99.0, 500.0]) + assert outlier_indexes == {3} + assert lower is not None + assert upper is not None diff --git a/tests/unit/test_benchmark_quality_check.py b/tests/unit/test_benchmark_quality_check.py new file mode 100644 index 0000000..ad73a79 --- /dev/null +++ b/tests/unit/test_benchmark_quality_check.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import pytest + +from .script_loader import load_script_module + + +def test_parse_delta_percent(repo_root): + mod = load_script_module(repo_root, "scripts/benchmark-quality-check.py", "benchmark_quality_delta") + + output = "name old new delta\nBenchmarkbaseline-8 100 115 +15.0%\n" + assert mod.parse_delta_percent(output) == 15.0 + assert mod.parse_delta_percent("no percent here") is None + + +def test_ensure_under_results_rejects_outside_path(repo_root, tmp_path): + mod = load_script_module(repo_root, "scripts/benchmark-quality-check.py", "benchmark_quality_paths") + + with pytest.raises(SystemExit, match="must be under"): + mod.ensure_under_results(tmp_path / "outside.json", "Summary file") diff --git a/tests/unit/test_environment_manifest.py b/tests/unit/test_environment_manifest.py new file mode 100644 index 0000000..a73d058 --- /dev/null +++ b/tests/unit/test_environment_manifest.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import pytest + +from .script_loader import load_script_module + + +def test_parse_service_blocks_extracts_services(repo_root): + mod = load_script_module(repo_root, "scripts/environment-manifest.py", "environment_manifest_blocks") + + text = """ +services: + app: + image: example + cpus: \"1\" + mem_limit: 512m + db: + image: postgres + cpus: \"1\" + mem_limit: 512m +""".strip() + blocks = mod.parse_service_blocks(text) + assert set(blocks.keys()) == {"app", "db"} + + +def test_ensure_under_results_rejects_external_path(repo_root, tmp_path): + mod = load_script_module(repo_root, "scripts/environment-manifest.py", "environment_manifest_paths") + + with pytest.raises(SystemExit, match="Refusing path outside results/latest"): + mod.ensure_under_results(tmp_path / "bad.json") diff --git a/tests/unit/test_generate_report.py b/tests/unit/test_generate_report.py new file mode 100644 index 0000000..d298450 --- /dev/null +++ b/tests/unit/test_generate_report.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import io +import json +import shutil +from contextlib import redirect_stdout + +from .script_loader import load_script_module + + +def test_load_raw_files_skips_malformed_json(repo_root, fixture_root, tmp_path): + mod = load_script_module(repo_root, "scripts/generate-report.py", "generate_report_skip") + + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + shutil.copy(fixture_root / "raw" / "modkit-ok.json", raw_dir / "modkit-ok.json") + shutil.copy(fixture_root / "raw" / "modkit-invalid-json.json", raw_dir / "modkit-invalid-json.json") + + mod.RAW_DIR = raw_dir + buf = io.StringIO() + with redirect_stdout(buf): + rows = mod.load_raw_files() + + assert len(rows) == 1 + assert rows[0]["framework"] == "modkit" + assert "Warning: skipping malformed JSON" in buf.getvalue() + + +def test_build_summary_counts_ok_and_skipped(repo_root, 
fixture_root): + mod = load_script_module(repo_root, "scripts/generate-report.py", "generate_report_summary") + + rows = [ + json.loads((fixture_root / "raw" / "modkit-ok.json").read_text(encoding="utf-8")), + json.loads((fixture_root / "raw" / "nestjs-skipped-health.json").read_text(encoding="utf-8")), + ] + rows[0]["_source_file"] = "modkit-ok.json" + rows[1]["_source_file"] = "nestjs-skipped-health.json" + + summary = mod.build_summary(rows) + assert summary["total_targets"] == 2 + assert summary["successful_targets"] == 1 + assert summary["skipped_targets"] == 1 + + +def test_write_report_outputs_expected_sections(repo_root, fixture_root, tmp_path): + mod = load_script_module(repo_root, "scripts/generate-report.py", "generate_report_write") + + expected = json.loads((fixture_root / "summary" / "expected-summary.json").read_text(encoding="utf-8")) + mod.REPORT_PATH = tmp_path / "report.md" + mod.write_report(expected) + + content = mod.REPORT_PATH.read_text(encoding="utf-8") + assert "## Fairness Disclaimer" in content + assert "| modkit | ok | 600.00" in content + assert "Parity failures invalidate performance interpretation" in content diff --git a/tests/unit/test_validate_result_schemas.py b/tests/unit/test_validate_result_schemas.py new file mode 100644 index 0000000..be135a9 --- /dev/null +++ b/tests/unit/test_validate_result_schemas.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import shutil + +import pytest + +from .script_loader import load_script_module + + +def test_validate_raw_accepts_ok_and_skipped(repo_root, fixture_root, tmp_path): + mod = load_script_module(repo_root, "scripts/validate-result-schemas.py", "validate_result_schemas") + + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + shutil.copy(fixture_root / "raw" / "modkit-ok.json", raw_dir / "modkit-ok.json") + shutil.copy(fixture_root / "raw" / "nestjs-skipped-health.json", raw_dir / "nestjs-skipped-health.json") + + mod.validate_raw(raw_dir, repo_root / "schemas" / "benchmark-raw-v1.schema.json") + + +def test_validate_raw_rejects_missing_required_fields(repo_root, fixture_root, tmp_path): + mod = load_script_module(repo_root, "scripts/validate-result-schemas.py", "validate_result_schemas_missing") + + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + shutil.copy( + fixture_root / "raw" / "modkit-missing-required.json", + raw_dir / "modkit-missing-required.json", + ) + + with pytest.raises(SystemExit, match="required property"): + mod.validate_raw(raw_dir, repo_root / "schemas" / "benchmark-raw-v1.schema.json") + + +def test_load_json_rejects_malformed_json(repo_root, fixture_root): + mod = load_script_module(repo_root, "scripts/validate-result-schemas.py", "validate_result_schemas_invalid") + + with pytest.raises(SystemExit, match="Malformed JSON"): + mod.load_json(fixture_root / "raw" / "modkit-invalid-json.json") + + +def test_validate_summary_rejects_ok_target_without_uncertainty(repo_root, tmp_path): + mod = load_script_module(repo_root, "scripts/validate-result-schemas.py", "validate_result_schemas_summary") + + summary = tmp_path / "summary.json" + summary.write_text( + """ +{ + "schema_version": "summary-v1", + "generated_at": "2026-01-01T00:00:00+00:00", + "total_targets": 1, + "successful_targets": 1, + "skipped_targets": 0, + "targets": [ + { + "framework": "modkit", + "status": "ok", + "target": "http://localhost:3001", + "provenance": {"raw_source": "results/latest/raw/modkit-ok.json"} + } + ] +} +""".strip(), + encoding="utf-8", + ) + + with pytest.raises(SystemExit, match="required 
property"): + mod.validate_summary(summary, repo_root / "schemas" / "benchmark-summary-v1.schema.json") From 4010b35639f72bde1ad53adbc7df03b2d3570623 Mon Sep 17 00:00:00 2001 From: Arye Kogan Date: Sun, 8 Feb 2026 00:21:05 +0200 Subject: [PATCH 3/6] refactor: consolidate benchmark script helpers and schema checks --- scripts/benchlib/__init__.py | 0 scripts/benchlib/io_utils.py | 29 ++++++++++++++++++++ scripts/benchmark-measure.py | 9 +++--- scripts/benchmark-quality-check.py | 18 ++++-------- scripts/environment-manifest.py | 24 +++++----------- scripts/lib.sh | 22 +++++++++++++++ scripts/run-all.sh | 44 ++++-------------------------- scripts/run-single.sh | 26 ++---------------- scripts/validate-result-schemas.py | 23 ++++++++++++++++ stats-policy.json | 29 ++++++++++++++++++++ 10 files changed, 128 insertions(+), 96 deletions(-) create mode 100644 scripts/benchlib/__init__.py create mode 100644 scripts/benchlib/io_utils.py create mode 100644 scripts/lib.sh create mode 100644 stats-policy.json diff --git a/scripts/benchlib/__init__.py b/scripts/benchlib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/benchlib/io_utils.py b/scripts/benchlib/io_utils.py new file mode 100644 index 0000000..1fb6c97 --- /dev/null +++ b/scripts/benchlib/io_utils.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import json +from pathlib import Path + + +def ensure_under_root(path: Path, root: Path, label: str) -> Path: + resolved = path.resolve() + root_resolved = root.resolve() + if resolved == root_resolved or root_resolved in resolved.parents: + return resolved + raise SystemExit(f"{label} must be under {root_resolved}: {resolved}") + + +def read_json(path: Path): + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") + + +def load_json_policy(path: Path, default_on_missing=None): + if not path.exists(): + if default_on_missing is not None: + return default_on_missing + raise SystemExit(f"Policy file not found: {path}") + return json.loads(path.read_text(encoding="utf-8")) diff --git a/scripts/benchmark-measure.py b/scripts/benchmark-measure.py index cb23599..c93e822 100644 --- a/scripts/benchmark-measure.py +++ b/scripts/benchmark-measure.py @@ -13,6 +13,8 @@ import urllib.request from pathlib import Path +from benchlib.io_utils import load_json_policy + UNIT_TO_MB = { "b": 1 / (1024 * 1024), @@ -179,11 +181,10 @@ def measure_hyperfine(repo_root, url, requests, runs): def load_policy(repo_root): - policy_file = repo_root / "stats-policy.yaml" + policy_file = repo_root / "stats-policy.json" if not policy_file.exists(): - return {} - text = policy_file.read_text(encoding="utf-8") - return json.loads(text) + policy_file = repo_root / "stats-policy.yaml" + return load_json_policy(policy_file, default_on_missing={}) def collect_docker_stats(framework): diff --git a/scripts/benchmark-quality-check.py b/scripts/benchmark-quality-check.py index f9b5516..da9b80d 100644 --- a/scripts/benchmark-quality-check.py +++ b/scripts/benchmark-quality-check.py @@ -7,12 +7,14 @@ import tempfile from pathlib import Path +from benchlib.io_utils import ensure_under_root, load_json_policy + REPO_ROOT = Path(__file__).resolve().parent.parent RESULTS_ROOT = (REPO_ROOT / "results" / "latest").resolve() RAW_DIR = REPO_ROOT / "results" / "latest" / "raw" QUALITY_SUMMARY_FILE = REPO_ROOT / "results" / "latest" / 
"benchmark-quality-summary.json" -POLICY_FILE = REPO_ROOT / "stats-policy.yaml" +POLICY_FILE = REPO_ROOT / "stats-policy.json" TOOLING_DIR = REPO_ROOT / "results" / "latest" / "tooling" @@ -30,13 +32,6 @@ def load_raw_rows(raw_dir): return rows -def load_policy(path): - if not path.exists(): - raise SystemExit(f"Policy file not found: {path}") - text = path.read_text(encoding="utf-8") - return json.loads(text) - - def ensure_number(value, label): if isinstance(value, (int, float)): return float(value) @@ -53,10 +48,7 @@ def get_path_value(data, dotted_path): def ensure_under_results(path, label): - resolved = path.resolve() - if resolved == RESULTS_ROOT or RESULTS_ROOT in resolved.parents: - return resolved - raise SystemExit(f"{label} must be under {RESULTS_ROOT}: {resolved}") + return ensure_under_root(path, RESULTS_ROOT, label) def check_stats(args, policy): @@ -371,7 +363,7 @@ def parse_args(): def main(): args = parse_args() - policy = load_policy(args.policy_file) + policy = load_json_policy(args.policy_file) if args.cmd == "stats-check": check_stats(args, policy) diff --git a/scripts/environment-manifest.py b/scripts/environment-manifest.py index e9f2392..7f0e6da 100644 --- a/scripts/environment-manifest.py +++ b/scripts/environment-manifest.py @@ -7,6 +7,8 @@ from datetime import datetime, timezone from pathlib import Path +from benchlib.io_utils import ensure_under_root, read_json, write_json + REQUIRED_VERSION_FIELDS = [ "go", @@ -90,21 +92,13 @@ def runtime_versions(): } -def ensure_parent(path): - path.parent.mkdir(parents=True, exist_ok=True) - - def ensure_under_results(path): - resolved = path.resolve() - if resolved == RESULTS_ROOT or RESULTS_ROOT in resolved.parents: - return - raise SystemExit(f"Refusing path outside results/latest: {path}") + ensure_under_root(path, RESULTS_ROOT, "Refusing path outside results/latest") -def write_json(path, payload): +def write_json_safe(path, payload): ensure_under_results(path) - ensure_parent(path) - path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") + write_json(path, payload) def collect_fingerprint(out_path): @@ -113,7 +107,7 @@ def collect_fingerprint(out_path): "versions": runtime_versions(), "git": git_metadata(), } - write_json(out_path, payload) + write_json_safe(out_path, payload) print(f"Wrote: {out_path}") @@ -165,10 +159,6 @@ def check_limits(compose_path): print("Docker limits check passed") -def read_json(path): - return json.loads(path.read_text(encoding="utf-8")) - - def write_manifest(raw_dir, fingerprint_path, out_path): if not raw_dir.exists(): raise SystemExit(f"Raw results directory not found: {raw_dir}") @@ -203,7 +193,7 @@ def write_manifest(raw_dir, fingerprint_path, out_path): "fingerprint": read_json(fingerprint_path), "targets": rows, } - write_json(out_path, manifest) + write_json_safe(out_path, manifest) print(f"Wrote: {out_path}") diff --git a/scripts/lib.sh b/scripts/lib.sh new file mode 100644 index 0000000..552b226 --- /dev/null +++ b/scripts/lib.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +resolve_path() { + python3 - <<'PY' "$1" +from pathlib import Path +import sys +print(Path(sys.argv[1]).resolve()) +PY +} + +ensure_path_under_root() { + local var_name="$1" + local path_value="$2" + local root_value="$3" + case "$path_value" in + "$root_value"|"$root_value"/*) ;; + *) + echo "$var_name must be under $root_value (got: $path_value)" >&2 + return 1 + ;; + esac +} diff --git a/scripts/run-all.sh b/scripts/run-all.sh index 8cbebf6..586aa81 100755 --- a/scripts/run-all.sh +++ 
b/scripts/run-all.sh @@ -10,14 +10,7 @@ manifest_file="${MANIFEST_FILE:-$results_dir/environment.manifest.json}" script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" repo_root="$(cd "$script_dir/.." && pwd)" results_root="$repo_root/results/latest" - -resolve_path() { - python3 - <<'PY' "$1" -from pathlib import Path -import sys -print(Path(sys.argv[1]).resolve()) -PY -} +source "$script_dir/lib.sh" results_root_abs="$(resolve_path "$results_root")" raw_dir_abs="$(resolve_path "$raw_dir")" @@ -25,37 +18,10 @@ results_dir_abs="$(resolve_path "$results_dir")" fingerprint_file_abs="$(resolve_path "$fingerprint_file")" manifest_file_abs="$(resolve_path "$manifest_file")" -case "$raw_dir_abs" in - "$results_root_abs"|"$results_root_abs"/*) ;; - *) - echo "RESULTS_RAW_DIR must be under $results_root_abs (got: $raw_dir_abs)" >&2 - exit 1 - ;; -esac - -case "$results_dir_abs" in - "$results_root_abs"|"$results_root_abs"/*) ;; - *) - echo "RESULTS_DIR must be under $results_root_abs (got: $results_dir_abs)" >&2 - exit 1 - ;; -esac - -case "$fingerprint_file_abs" in - "$results_root_abs"|"$results_root_abs"/*) ;; - *) - echo "FINGERPRINT_FILE must be under $results_root_abs (got: $fingerprint_file_abs)" >&2 - exit 1 - ;; -esac - -case "$manifest_file_abs" in - "$results_root_abs"|"$results_root_abs"/*) ;; - *) - echo "MANIFEST_FILE must be under $results_root_abs (got: $manifest_file_abs)" >&2 - exit 1 - ;; -esac +ensure_path_under_root "RESULTS_RAW_DIR" "$raw_dir_abs" "$results_root_abs" +ensure_path_under_root "RESULTS_DIR" "$results_dir_abs" "$results_root_abs" +ensure_path_under_root "FINGERPRINT_FILE" "$fingerprint_file_abs" "$results_root_abs" +ensure_path_under_root "MANIFEST_FILE" "$manifest_file_abs" "$results_root_abs" raw_dir="$raw_dir_abs" results_dir="$results_dir_abs" diff --git a/scripts/run-single.sh b/scripts/run-single.sh index cbfd7cf..93ed69e 100755 --- a/scripts/run-single.sh +++ b/scripts/run-single.sh @@ -30,14 +30,7 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" repo_root="$(cd "$script_dir/.." 
&& pwd)" results_root="$repo_root/results/latest" expected_raw_dir="$results_root/raw" - -resolve_path() { - python3 - <<'PY' "$1" -from pathlib import Path -import sys -print(Path(sys.argv[1]).resolve()) -PY -} +source "$script_dir/lib.sh" raw_dir_abs="$(resolve_path "$raw_dir")" results_dir_abs="$(resolve_path "$results_dir")" @@ -56,21 +49,8 @@ if [[ "$results_dir_abs" != "$results_root_abs" ]]; then exit 1 fi -case "$fingerprint_file_abs" in - "$results_root_abs"|"$results_root_abs"/*) ;; - *) - echo "FINGERPRINT_FILE must be under $results_root_abs (got: $fingerprint_file_abs)" >&2 - exit 1 - ;; -esac - -case "$manifest_file_abs" in - "$results_root_abs"|"$results_root_abs"/*) ;; - *) - echo "MANIFEST_FILE must be under $results_root_abs (got: $manifest_file_abs)" >&2 - exit 1 - ;; -esac +ensure_path_under_root "FINGERPRINT_FILE" "$fingerprint_file_abs" "$results_root_abs" +ensure_path_under_root "MANIFEST_FILE" "$manifest_file_abs" "$results_root_abs" raw_dir="$expected_raw_abs" results_dir="$results_root_abs" diff --git a/scripts/validate-result-schemas.py b/scripts/validate-result-schemas.py index 658ae58..ded4bcf 100644 --- a/scripts/validate-result-schemas.py +++ b/scripts/validate-result-schemas.py @@ -3,6 +3,11 @@ import json from pathlib import Path +try: + from jsonschema import Draft202012Validator +except ImportError: # pragma: no cover - exercised in environments without dependency + Draft202012Validator = None + ROOT = Path(__file__).resolve().parent.parent RAW_DIR = ROOT / "results" / "latest" / "raw" @@ -60,6 +65,22 @@ def validate_raw_row(path, payload, schema_version): raise SystemExit(f"Raw schema validation failed for {path}: benchmark.{metric_field} is required") +def validate_jsonschema(payload, schema, path, artifact_label): + if Draft202012Validator is None: + raise SystemExit( + "jsonschema dependency is required for schema validation. 
Install with: python3 -m pip install jsonschema" + ) + + validator = Draft202012Validator(schema) + errors = sorted(validator.iter_errors(payload), key=lambda err: list(err.path)) + if not errors: + return + first = errors[0] + path_suffix = ".".join(str(part) for part in first.path) + field = f" ({path_suffix})" if path_suffix else "" + raise SystemExit(f"{artifact_label} JSON Schema validation failed for {path}{field}: {first.message}") + + def validate_raw(raw_dir, schema_path): schema = load_json(schema_path) schema_version = (schema.get("properties") or {}).get("schema_version", {}).get("const") @@ -72,6 +93,7 @@ def validate_raw(raw_dir, schema_path): for path in files: payload = load_json(path) + validate_jsonschema(payload, schema, path, "Raw") validate_raw_row(path, payload, schema_version) print(f"benchmark-raw-schema-check: validated {len(files)} raw artifact(s)") @@ -87,6 +109,7 @@ def validate_summary(summary_file, schema_path): raise SystemExit(f"Summary file not found: {summary_file}") payload = load_json(summary_file) + validate_jsonschema(payload, schema, summary_file, "Summary") required = ( "schema_version", "generated_at", diff --git a/stats-policy.json b/stats-policy.json new file mode 100644 index 0000000..f28ce72 --- /dev/null +++ b/stats-policy.json @@ -0,0 +1,29 @@ +{ + "quality": { + "required_metrics": [ + "benchmark.median.rps", + "benchmark.median.latency_ms_p50", + "benchmark.median.latency_ms_p95", + "benchmark.median.latency_ms_p99" + ], + "metric_units": { + "throughput": "requests_per_second", + "latency": "milliseconds", + "memory": "mb", + "cpu": "percent", + "startup": "milliseconds" + }, + "variance_thresholds_cv": { + "rps": 0.10, + "latency_ms_p95": 0.20, + "latency_ms_p99": 0.25 + }, + "benchstat": { + "enabled": true, + "baseline_framework": "baseline", + "max_regression_percent": { + "ns_per_op": 15.0 + } + } + } +} From 5a0b5d91115708c5b9188727a297171bc7f72a8f Mon Sep 17 00:00:00 2001 From: Arye Kogan Date: Sun, 8 Feb 2026 00:21:24 +0200 Subject: [PATCH 4/6] chore: add dev bootstrap script and refresh docs --- .github/workflows/ci.yml | 9 ++- .gitignore | 1 + CONTRIBUTING.md | 12 +++ METHODOLOGY.md | 4 +- README.md | 5 ++ docs/architecture.md | 2 +- docs/guides/benchmark-publication-policy.md | 4 +- docs/guides/benchmark-workflow.md | 2 +- scripts/setup-dev-env.sh | 87 +++++++++++++++++++++ 9 files changed, 119 insertions(+), 7 deletions(-) create mode 100755 scripts/setup-dev-env.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dfa8504..7669d58 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,9 +63,16 @@ jobs: - uses: actions/setup-go@v5 with: go-version-file: go.mod - - name: Install benchmark quality tools + - name: Install script unit-test tooling run: | + python3 -m pip install --upgrade pip + python3 -m pip install pytest pytest-cov jsonschema sudo apt-get update + sudo apt-get install -y bats + - name: Run non-network script unit tests + run: make test-scripts + - name: Install benchmark quality tools + run: | sudo apt-get install -y hyperfine go install golang.org/x/perf/cmd/benchstat@latest echo "$(go env GOPATH)/bin" >> "$GITHUB_PATH" diff --git a/.gitignore b/.gitignore index 55a456c..f40df30 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ coverage.out # Python cache/bytecode **/__pycache__/ *.py[cod] +.venv/ # Local worktrees .worktrees/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 98201f6..0f16ea8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,6 +16,16 
@@ This repository validates API parity and runs framework benchmarks. Keep changes - Go 1.25.7+ - Docker + Docker Compose (for local service runs) - GNU Make +- Python 3 +- `pytest` and `pytest-cov` +- `bats` (bats-core) +- `jsonschema` Python package + +Bootstrap local dependencies: + +```bash +bash scripts/setup-dev-env.sh +``` ## Local validation @@ -24,6 +34,7 @@ Run these before opening a PR: ```bash go test ./... make test-coverage +make test-scripts TARGET=http://localhost:3001 bash scripts/parity-check.sh ``` @@ -43,6 +54,7 @@ If you changed benchmark artifacts or report generation, also run schema validat ```bash make benchmark-schema-validate +make ci-benchmark-quality-check ``` ## Pull request process diff --git a/METHODOLOGY.md b/METHODOLOGY.md index 4932a7f..a34f8ef 100644 --- a/METHODOLOGY.md +++ b/METHODOLOGY.md @@ -22,7 +22,7 @@ - shell scripts in `scripts/` for orchestration - `hyperfine` benchmark engine (optional via `BENCH_ENGINE=hyperfine`) - `benchstat` statistical comparison for quality gates -- policy file: `stats-policy.yaml` +- policy file: `stats-policy.json` (`stats-policy.yaml` accepted for backward compatibility) - Python 3 report and normalization tooling in `scripts/` ## Baseline benchmark profile @@ -33,7 +33,7 @@ ## Quality policy -- thresholds and required metrics are defined in `stats-policy.yaml` +- thresholds and required metrics are defined in `stats-policy.json` - `make ci-benchmark-quality-check` enforces policy locally and in CI - benchstat comparisons are evaluated against policy baseline framework (`baseline` by default) - manual CI benchmark runs use bounded workflow inputs (`frameworks` subset, `runs` 1..10, `benchmark_requests` 50..1000) diff --git a/README.md b/README.md index 6392f0b..16540e4 100644 --- a/README.md +++ b/README.md @@ -61,8 +61,13 @@ BENCH_ENGINE=hyperfine make benchmark ## Tooling prerequisites +- One-command local setup: `bash scripts/setup-dev-env.sh` + - Go (for `go test` and `benchstat`) - Python 3 +- pytest + pytest-cov (for Python unit tests) +- bats-core (for shell integration tests) +- jsonschema (for schema validation checks) - hyperfine (optional benchmark engine) - benchstat (`go install golang.org/x/perf/cmd/benchstat@latest`) - go-patch-cover (`go install github.com/seriousben/go-patch-cover/cmd/go-patch-cover@latest`, for `make test-patch-coverage`) diff --git a/docs/architecture.md b/docs/architecture.md index e3bafe8..deb4663 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -38,7 +38,7 @@ results/latest/ benchmark outputs and generated report 2. Run parity checks per target 3. Run load benchmarks for parity-passing targets (`legacy` engine or `hyperfine`) 4. Normalize and save raw outputs -5. Run policy quality gates (`stats-policy.yaml` + benchstat) +5. Run policy quality gates (`stats-policy.json` + benchstat) 6. Build `summary.json` 7. Generate `report.md` diff --git a/docs/guides/benchmark-publication-policy.md b/docs/guides/benchmark-publication-policy.md index 76223be..61d7b28 100644 --- a/docs/guides/benchmark-publication-policy.md +++ b/docs/guides/benchmark-publication-policy.md @@ -16,9 +16,9 @@ Any public claim or report referencing benchmarks from this repository must incl - **Purpose**: Guarantees that the raw metrics and summaries adhere to the versioned data contract. ### 3. Quality Policy Compliance -- **Requirement**: The run must pass `make ci-benchmark-quality-check` as defined in `stats-policy.yaml`. 
+- **Requirement**: The run must pass `make ci-benchmark-quality-check` as defined in `stats-policy.json`. - **Metrics**: Must include at least RPS (median) and Latency (P50, P95, P99). -- **Variance**: Coefficient of Variation (CV) must stay within the thresholds defined in `stats-policy.yaml` (e.g., <10% for RPS). +- **Variance**: Coefficient of Variation (CV) must stay within the thresholds defined in `stats-policy.json` (e.g., <10% for RPS). ### 4. Parity and Skip Transparency - **Requirement**: Any target implementation that failed the parity gate must be disclosed as "Skipped (Parity Failure)". diff --git a/docs/guides/benchmark-workflow.md b/docs/guides/benchmark-workflow.md index 9f050e7..4edaf38 100644 --- a/docs/guides/benchmark-workflow.md +++ b/docs/guides/benchmark-workflow.md @@ -83,7 +83,7 @@ make methodology-changelog-check make publication-sync-check ``` -Quality thresholds and required metrics are versioned in `stats-policy.yaml`. +Quality thresholds and required metrics are versioned in `stats-policy.json` (with `stats-policy.yaml` backward-compatibility support). ## Reproducibility notes diff --git a/scripts/setup-dev-env.sh b/scripts/setup-dev-env.sh new file mode 100755 index 0000000..87a9d68 --- /dev/null +++ b/scripts/setup-dev-env.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(cd "$script_dir/.." && pwd)" + +have_cmd() { + command -v "$1" >/dev/null 2>&1 +} + +log() { + printf '%s\n' "$*" +} + +install_with_brew() { + local pkg="$1" + if have_cmd brew; then + brew list "$pkg" >/dev/null 2>&1 || brew install "$pkg" + return 0 + fi + return 1 +} + +install_with_apt() { + local pkg="$1" + if have_cmd apt-get; then + sudo apt-get update + sudo apt-get install -y "$pkg" + return 0 + fi + return 1 +} + +ensure_tool() { + local cmd="$1" + local brew_pkg="$2" + local apt_pkg="$3" + local manual="$4" + + if have_cmd "$cmd"; then + log "ok: $cmd" + return 0 + fi + + log "installing: $cmd" + if [ -n "$brew_pkg" ] && install_with_brew "$brew_pkg"; then + return 0 + fi + if [ -n "$apt_pkg" ] && install_with_apt "$apt_pkg"; then + return 0 + fi + + log "warning: could not auto-install $cmd. $manual" + return 1 +} + +main() { + cd "$repo_root" + + ensure_tool go "go" "golang" "Install Go manually from https://go.dev/dl/." || true + ensure_tool python3 "python@3.12" "python3" "Install Python 3 manually." || true + ensure_tool make "make" "make" "Install GNU Make manually." || true + ensure_tool docker "docker" "docker.io" "Install Docker Desktop or Docker Engine manually." || true + ensure_tool hyperfine "hyperfine" "hyperfine" "Install hyperfine manually." || true + ensure_tool bats "bats-core" "bats" "Install bats-core manually." || true + + if have_cmd go; then + log "installing Go tools" + go install golang.org/x/perf/cmd/benchstat@latest + go install github.com/seriousben/go-patch-cover/cmd/go-patch-cover@latest + fi + + if have_cmd python3; then + log "setting up Python virtualenv" + if [ ! -d .venv ]; then + python3 -m venv .venv + fi + .venv/bin/pip install --upgrade pip + .venv/bin/pip install pytest pytest-cov jsonschema + fi + + log "done. 
next steps:" + log " 1) PATH=\"$repo_root/.venv/bin:\$PATH\" make test-scripts" + log " 2) make test" +} + +main "$@" From 350c88c27b24737201197a37522104f5c226f563 Mon Sep 17 00:00:00 2001 From: Arye Kogan Date: Sun, 8 Feb 2026 00:36:08 +0200 Subject: [PATCH 5/6] chore: route CI installs through make and setup script modes --- .github/workflows/ci.yml | 13 +---- CONTRIBUTING.md | 11 ++++ Makefile | 20 ++++++- README.md | 3 ++ scripts/setup-dev-env.sh | 114 +++++++++++++++++++++++++++++++++++---- 5 files changed, 138 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7669d58..8bbd11e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -63,19 +63,10 @@ jobs: - uses: actions/setup-go@v5 with: go-version-file: go.mod - - name: Install script unit-test tooling - run: | - python3 -m pip install --upgrade pip - python3 -m pip install pytest pytest-cov jsonschema - sudo apt-get update - sudo apt-get install -y bats + - name: Install script toolchain + run: make setup-dev-env-ci-scripts - name: Run non-network script unit tests run: make test-scripts - - name: Install benchmark quality tools - run: | - sudo apt-get install -y hyperfine - go install golang.org/x/perf/cmd/benchstat@latest - echo "$(go env GOPATH)/bin" >> "$GITHUB_PATH" - name: Run benchmark script smoke run: bash scripts/run-all.sh - name: Generate report from raw results diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f16ea8..9b709d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,6 +27,17 @@ Bootstrap local dependencies: bash scripts/setup-dev-env.sh ``` +Optional targeted setup modes: + +```bash +# CI-friendly install flow +bash scripts/setup-dev-env.sh --ci --subset core,python-test,shell-test,benchmark-tools + +# Makefile wrappers +make setup-dev-env +make setup-dev-env-ci-scripts +``` + ## Local validation Run these before opening a PR: diff --git a/Makefile b/Makefile index 80e49fe..65d62b2 100644 --- a/Makefile +++ b/Makefile @@ -3,11 +3,20 @@ PYTHON ?= python3 GO ?= go PYTEST ?= pytest BATS ?= bats + +ifneq ("$(wildcard .venv/bin/python)","") +PYTHON := .venv/bin/python +endif + +ifneq ("$(wildcard .venv/bin/pytest)","") +PYTEST := .venv/bin/pytest +endif + GOPATH ?= $(shell $(GO) env GOPATH) GO_PATCH_COVER ?= $(GOPATH)/bin/go-patch-cover MODULES = $(shell find . 
-type f -name "go.mod" -not -path "*/.*/*" -not -path "*/vendor/*" -exec dirname {} \;) -.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-go test-python test-shell test-scripts test-coverage test-coverage-go test-coverage-python test-patch-coverage tools parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check report-disclaimer-check methodology-changelog-check publication-sync-check +.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-go test-python test-shell test-scripts test-coverage test-coverage-go test-coverage-python test-patch-coverage tools setup-dev-env setup-dev-env-ci setup-dev-env-ci-scripts parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check report-disclaimer-check methodology-changelog-check publication-sync-check benchmark: bash scripts/run-all.sh @@ -94,6 +103,15 @@ tools: @$(GO) install github.com/seriousben/go-patch-cover/cmd/go-patch-cover@latest @echo "Done: go-patch-cover installed" +setup-dev-env: + bash scripts/setup-dev-env.sh + +setup-dev-env-ci: + bash scripts/setup-dev-env.sh --ci --subset core,python-test,go-tools + +setup-dev-env-ci-scripts: + bash scripts/setup-dev-env.sh --ci --subset core,python-test,shell-test,benchmark-tools + parity-check: TARGET="$(PARITY_TARGET)" bash scripts/parity-check.sh diff --git a/README.md b/README.md index 16540e4..a418352 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,9 @@ BENCH_ENGINE=hyperfine make benchmark ## Tooling prerequisites - One-command local setup: `bash scripts/setup-dev-env.sh` +- CI/subset setup examples: + - `bash scripts/setup-dev-env.sh --ci --subset core,python-test,shell-test,benchmark-tools` + - `make setup-dev-env-ci-scripts` - Go (for `go test` and `benchstat`) - Python 3 diff --git a/scripts/setup-dev-env.sh b/scripts/setup-dev-env.sh index 87a9d68..9fb8f5a 100755 --- a/scripts/setup-dev-env.sh +++ b/scripts/setup-dev-env.sh @@ -4,6 +4,10 @@ set -euo pipefail script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" repo_root="$(cd "$script_dir/.." 
&& pwd)" +mode="local" +subset_csv="all" +apt_updated=0 + have_cmd() { command -v "$1" >/dev/null 2>&1 } @@ -14,6 +18,9 @@ log() { install_with_brew() { local pkg="$1" + if [ "$mode" = "ci" ]; then + return 1 + fi if have_cmd brew; then brew list "$pkg" >/dev/null 2>&1 || brew install "$pkg" return 0 @@ -24,13 +31,80 @@ install_with_brew() { install_with_apt() { local pkg="$1" if have_cmd apt-get; then - sudo apt-get update + if [ "$apt_updated" -eq 0 ]; then + sudo apt-get update + apt_updated=1 + fi sudo apt-get install -y "$pkg" return 0 fi return 1 } +usage() { + cat <<'EOF' +Usage: bash scripts/setup-dev-env.sh [--ci] [--subset LIST] + +Options: + --ci CI-friendly mode (skip Homebrew installs, prefer apt) + --subset LIST Comma-separated install groups + +Subset groups: + all Everything (default) + core go, python3, make + docker docker cli/engine + shell-test bats + benchmark-tools hyperfine + benchstat + python-test venv + pytest + pytest-cov + jsonschema + go-tools go-patch-cover + +Examples: + bash scripts/setup-dev-env.sh + bash scripts/setup-dev-env.sh --subset core,python-test + bash scripts/setup-dev-env.sh --ci --subset python-test,shell-test,benchmark-tools +EOF +} + +has_subset() { + local needle="$1" + if [ "$subset_csv" = "all" ]; then + return 0 + fi + case ",$subset_csv," in + *",$needle,"*) return 0 ;; + *) return 1 ;; + esac +} + +parse_args() { + while [ "$#" -gt 0 ]; do + case "$1" in + --ci) + mode="ci" + ;; + --subset) + shift + if [ "$#" -eq 0 ]; then + echo "missing value for --subset" >&2 + usage + exit 1 + fi + subset_csv="$1" + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac + shift + done +} + ensure_tool() { local cmd="$1" local brew_pkg="$2" @@ -55,22 +129,38 @@ ensure_tool() { } main() { + parse_args "$@" cd "$repo_root" - ensure_tool go "go" "golang" "Install Go manually from https://go.dev/dl/." || true - ensure_tool python3 "python@3.12" "python3" "Install Python 3 manually." || true - ensure_tool make "make" "make" "Install GNU Make manually." || true - ensure_tool docker "docker" "docker.io" "Install Docker Desktop or Docker Engine manually." || true - ensure_tool hyperfine "hyperfine" "hyperfine" "Install hyperfine manually." || true - ensure_tool bats "bats-core" "bats" "Install bats-core manually." || true + if has_subset core; then + ensure_tool go "go" "golang" "Install Go manually from https://go.dev/dl/." || true + ensure_tool python3 "python@3.12" "python3" "Install Python 3 manually." || true + ensure_tool make "make" "make" "Install GNU Make manually." || true + fi + + if has_subset docker; then + ensure_tool docker "docker" "docker.io" "Install Docker Desktop or Docker Engine manually." || true + fi + + if has_subset benchmark-tools; then + ensure_tool hyperfine "hyperfine" "hyperfine" "Install hyperfine manually." || true + fi + + if has_subset shell-test; then + ensure_tool bats "bats-core" "bats" "Install bats-core manually." || true + fi - if have_cmd go; then + if has_subset benchmark-tools && have_cmd go; then log "installing Go tools" go install golang.org/x/perf/cmd/benchstat@latest + fi + + if has_subset go-tools && have_cmd go; then + log "installing Go patch coverage tool" go install github.com/seriousben/go-patch-cover/cmd/go-patch-cover@latest fi - if have_cmd python3; then + if has_subset python-test && have_cmd python3; then log "setting up Python virtualenv" if [ ! 
-d .venv ]; then python3 -m venv .venv @@ -79,8 +169,10 @@ main() { .venv/bin/pip install pytest pytest-cov jsonschema fi - log "done. next steps:" - log " 1) PATH=\"$repo_root/.venv/bin:\$PATH\" make test-scripts" + log "done (mode=$mode, subset=$subset_csv). next steps:" + if has_subset python-test; then + log " 1) PATH=\"$repo_root/.venv/bin:\$PATH\" make test-scripts" + fi log " 2) make test" } From 95192d9b8a1486c8d6bb9c011198d307ca0fc7d3 Mon Sep 17 00:00:00 2001 From: Arye Kogan Date: Sun, 8 Feb 2026 00:42:32 +0200 Subject: [PATCH 6/6] fix: address reviewer-reported test and setup edge cases --- cmd/parity-test/main_test.go | 14 +++++++++++--- scripts/setup-dev-env.sh | 14 +++++++++++--- scripts/validate-result-schemas.py | 5 ++++- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/cmd/parity-test/main_test.go b/cmd/parity-test/main_test.go index dfd9634..8dc2d14 100644 --- a/cmd/parity-test/main_test.go +++ b/cmd/parity-test/main_test.go @@ -245,10 +245,9 @@ func TestRunScenario_InvalidJSONResponse(t *testing.T) { func TestRunScenario_DefaultMethodIsGET(t *testing.T) { t.Parallel() + methodCh := make(chan string, 1) server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.Method != http.MethodGet { - t.Fatalf("expected default method GET, got %s", r.Method) - } + methodCh <- r.Method w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte(`{"ok":true}`)) @@ -264,6 +263,15 @@ func TestRunScenario_DefaultMethodIsGET(t *testing.T) { if err := runScenario(&http.Client{Timeout: time.Second}, server.URL, s); err != nil { t.Fatalf("unexpected runScenario error: %v", err) } + + select { + case got := <-methodCh: + if got != http.MethodGet { + t.Fatalf("expected default method GET, got %s", got) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for request method") + } } func TestMatchStringValue_InterpolatedTokensRejectInvalidSegments(t *testing.T) { diff --git a/scripts/setup-dev-env.sh b/scripts/setup-dev-env.sh index 9fb8f5a..97948ea 100755 --- a/scripts/setup-dev-env.sh +++ b/scripts/setup-dev-env.sh @@ -22,7 +22,11 @@ install_with_brew() { return 1 fi if have_cmd brew; then - brew list "$pkg" >/dev/null 2>&1 || brew install "$pkg" + if ! brew list "$pkg" >/dev/null 2>&1; then + if ! brew install "$pkg"; then + return 1 + fi + fi return 0 fi return 1 @@ -32,10 +36,14 @@ install_with_apt() { local pkg="$1" if have_cmd apt-get; then if [ "$apt_updated" -eq 0 ]; then - sudo apt-get update + if ! sudo apt-get update; then + return 1 + fi apt_updated=1 fi - sudo apt-get install -y "$pkg" + if ! sudo apt-get install -y "$pkg"; then + return 1 + fi return 0 fi return 1 diff --git a/scripts/validate-result-schemas.py b/scripts/validate-result-schemas.py index ded4bcf..7bfa5f5 100644 --- a/scripts/validate-result-schemas.py +++ b/scripts/validate-result-schemas.py @@ -72,7 +72,10 @@ def validate_jsonschema(payload, schema, path, artifact_label): ) validator = Draft202012Validator(schema) - errors = sorted(validator.iter_errors(payload), key=lambda err: list(err.path)) + errors = sorted( + validator.iter_errors(payload), + key=lambda err: [str(part) for part in err.path], + ) if not errors: return first = errors[0]
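
Note on the `scripts/validate-result-schemas.py` hunk above: `jsonschema` error paths mix object keys (strings) and array indices (integers), and sorting lists whose comparison pits an `int` against a `str` raises `TypeError` in Python 3, so stringifying every path segment is presumably what makes the ordering safe. A minimal standalone sketch of that failure mode and of the stringified sort key (illustrative only, not part of the patch; the sample paths are invented):

```python
# Hypothetical error paths in the shape jsonschema reports them:
# object keys are strings, array indices are ints.
paths = [["results", 0, "rps"], ["results", "meta"]]

try:
    sorted(paths)  # comparing 0 (int) with "meta" (str) raises TypeError
except TypeError as exc:
    print("mixed-type sort fails:", exc)

# Converting each segment to str gives a total order without touching the data.
print(sorted(paths, key=lambda p: [str(part) for part in p]))
```

In the common case where every segment is already a string, the stringified key is a no-op, so the first reported error stays the same while the mixed-type case stops crashing.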