Luce-Org · dusterbloom · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/.gitignore b/.gitignore
@@ -79,3 +79,7 @@ fix-plan.md
 # Harness test artifacts
 .harness-work/
 health
+
+# Workdir editor backup suffixes
+*.git-head
+*.pre-pflash-rename
diff --git a/dflash/scripts/eval_quality_compare.py b/dflash/scripts/eval_quality_compare.py
@@ -0,0 +1,166 @@
+"""MT-Bench quality comparator.
+
+Reads all results_*.json in the given directory (or current dir),
+treats baseline_off as reference, and prints a markdown comparison table.
+
+Usage:
+    python eval_quality_compare.py [--dir PATH] [--out PATH]
+"""
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def load_results(path: Path) -> dict[tuple[int, int], str]:
+    """Load one results file as {(question_id, turn): reply}, turn in {1, 2}.
+
+    Decodes as UTF-8 explicitly: JSON is UTF-8 by spec, and the platform
+    default encoding (e.g. cp1252 on Windows) can mis-decode non-ASCII replies.
+    Raises KeyError if a record lacks question_id, turn_1 or turn_2.
+    """
+    with open(path, encoding="utf-8") as f:
+        records = json.load(f)
+    return {(r["question_id"], t): r[f"turn_{t}"] for r in records for t in (1, 2)}
+
+
+def lcp_ratio(a: str, b: str) -> float:
+    """Return common-prefix length over the shorter length; "" vs "" is 1.0."""
+    shorter = min(len(a), len(b))
+    if not shorter:
+        # One side is empty: a perfect match only if both are empty.
+        return float(a == b)
+    # zip stops at the shorter string; first mismatch index = prefix length.
+    prefix = next((i for i, (x, y) in enumerate(zip(a, b)) if x != y), shorter)
+    return prefix / shorter
+
+
+def compare(ref: dict, cand: dict) -> dict:
+    """Score cand against ref over the (question_id, turn) keys both contain.
+
+    Returns exact_match_rate, mean_lcp_ratio, divergence_count, total_pairs
+    and first_5_divergences (up to five (qid, turn, ref[:50], cand[:50])
+    tuples, in sorted-key order). All are zero/empty with no shared keys.
+    """
+    shared = sorted(ref.keys() & cand.keys())
+    total = len(shared)
+    if total == 0:
+        return {"exact_match_rate": 0.0, "mean_lcp_ratio": 0.0,
+                "divergence_count": 0, "total_pairs": 0,
+                "first_5_divergences": []}
+
+    matches, ratio_total = 0, 0.0
+    samples = []
+    for key in shared:
+        expected, actual = ref[key], cand[key]
+        if expected == actual:
+            matches += 1
+        elif len(samples) < 5:
+            # Keep short excerpts of only the earliest mismatches.
+            qid, turn = key
+            samples.append((qid, turn, expected[:50], actual[:50]))
+        ratio_total += lcp_ratio(expected, actual)
+
+    return {"exact_match_rate": matches / total,
+            "mean_lcp_ratio": ratio_total / total,
+            "divergence_count": total - matches, "total_pairs": total,
+            "first_5_divergences": samples}
+
+
+def main() -> int:
+    """Compare every results_*.json in --dir against the baseline_off run.
+
+    Prints a markdown metrics table, plus up to five divergence excerpts per
+    config, to stdout and writes the same text to --out (as UTF-8). When a
+    baseline_off_2 run matches baseline_off on fewer than 99% of pairs, a
+    nondeterminism warning is prepended. Returns 0 on success, or 1 (with a
+    message on stderr) if no results file or no baseline_off file exists.
+    """
+    ap = argparse.ArgumentParser(description="MT-Bench quality comparator")
+    ap.add_argument("--dir", type=Path, default=Path("."),
+                    help="Directory containing results_*.json files")
+    ap.add_argument("--out", type=Path,
+                    default=Path(__file__).parent.parent / "eval/summary.md",
+                    help="Output markdown summary path")
+    args = ap.parse_args()
+
+    result_files = sorted(args.dir.glob("results_*.json"))
+    if not result_files:
+        print(f"ERROR: no results_*.json found in {args.dir}", file=sys.stderr)
+        return 1
+
+    # Map config name -> result file ("results_<name>.json" -> "<name>")
+    configs: dict[str, Path] = {f.stem[len("results_"):]: f for f in result_files}
+
+    if "baseline_off" not in configs:
+        print("ERROR: baseline_off results not found — cannot compare", file=sys.stderr)
+        return 1
+
+    ref = load_results(configs["baseline_off"])
+
+    # One metrics row per config; baseline_off is also scored against itself.
+    rows = [{**compare(ref, load_results(path)), "config": name}
+            for name, path in configs.items()]
+
+    # Sort: baseline_off first, then alphabetical
+    rows.sort(key=lambda r: (r["config"] != "baseline_off", r["config"]))
+
+    # Sanity check: baseline_off_2 vs baseline_off
+    sanity_row = next((r for r in rows if r["config"] == "baseline_off_2"), None)
+    sanity_warning = ""
+    if sanity_row and sanity_row["exact_match_rate"] < 0.99:
+        sanity_warning = (
+            f"WARNING: baseline_off_2 exact_match_rate={sanity_row['exact_match_rate']:.3f} "
+            f"< 0.99 — SERVER IS NONDETERMINISTIC. All other comparisons are suspect.\n\n"
+        )
+
+    # Build markdown table
+    lines = []
+    if sanity_warning:
+        lines.append(f"> {sanity_warning.strip()}\n")
+
+    lines.append("| config | exact_match_rate | mean_lcp_ratio | divergence_count | total_pairs |")
+    lines.append("|--------|-----------------|----------------|-----------------|-------------|")
+    for r in rows:
+        lines.append(
+            f"| {r['config']} "
+            f"| {r['exact_match_rate']:.3f} "
+            f"| {r['mean_lcp_ratio']:.3f} "
+            f"| {r['divergence_count']} "
+            f"| {r['total_pairs']} |"
+        )
+
+    lines.append("")
+    lines.append("## First 5 divergences per config (vs baseline_off)")
+    for r in rows:
+        if r["config"] == "baseline_off" or not r["first_5_divergences"]:
+            continue
+        lines.append(f"\n### {r['config']}")
+        lines.append("| qid | turn | ref (first 50) | cand (first 50) |")
+        lines.append("|-----|------|----------------|-----------------|")
+        for qid, turn, ref50, cand50 in r["first_5_divergences"]:
+            # Escape pipes AFTER repr(): repr() doubles the backslash of a
+            # pre-escaped "\|", which leaves the pipe live and splits the cell.
+            ref50_s, cand50_s = (repr(s.replace("\n", " ")).replace("|", "\\|")
+                                 for s in (ref50, cand50))
+            lines.append(f"| {qid} | {turn} | {ref50_s} | {cand50_s} |")
+
+    table = "\n".join(lines)
+
+    # Print to stdout
+    if sanity_warning:
+        print(f"\n{'!'*70}")
+        print(sanity_warning.strip())
+        print(f"{'!'*70}\n")
+    print(table)
+
+    # Write summary file. Explicit UTF-8: replies may hold non-ASCII text
+    # that the platform default encoding cannot represent.
+    args.out.parent.mkdir(parents=True, exist_ok=True)
+    args.out.write_text(table + "\n", encoding="utf-8")
+    print(f"\nSummary written to {args.out}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # process exit status is main()'s return code
diff --git a/docs/anchor-transitive.md b/docs/anchor-transitive.md
@@ -0,0 +1,15 @@
+# anchor transitive scan
+
+`scan_and_force_transitive` (anchor_scan.cpp) expands the query pool with
+tokens from newly-forced chunks and re-runs `scan_and_force` until a fixed
+point is reached or max_iters (default 3) is hit.
+
+Improves multi-hop retrieval: enables discovery of intermediate context
+chunks whose tokens do not appear in the original query but connect
+query-to-needle via shared rare tokens.
+
+Empirical result: F1=0.628 on LongBench HotpotQA at ee7 + keep=0.15
+(vs uncompressed F1=0.697). This is the ceiling for attention-score-based
+prefill compression on this task; see bench/2026-05-25_longbench_hotpotqa/.
+
+On by default. Disable via PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0.
diff --git a/docs/pflash-adaptive-composition.md b/docs/pflash-adaptive-composition.md
@@ -0,0 +1,18 @@
+# pflash adaptive composition (Design 1)
+
+When pflash compresses a prompt, the target spec-decode verify window must
+cover the entire compressed sequence — otherwise verify sees only the last
+fa_window positions and loses needle context.
+
+`http_server.cpp`: when pflash_compressed, sets
+`req.fa_window_override = effective_prompt.size() + 256`.
+This never caps visibility; pflash already paid compute to pick which tokens
+matter, so every kept token must be visible in verify.
+
+`qwen35_backend.cpp` C2 gate: after prefill, checks whether spec-decode
+arithmetic still earns its drafter cost at the override window size.
+
+- override <= 2 * cfg_.fa_window → spec-decode
+- override >  2 * cfg_.fa_window → AR fallback (fa_window=0, full attention)
+
+Both paths see every kept token. The gate chooses mechanism, not visibility.
diff --git a/docs/pflash-compress-cfg.md b/docs/pflash-compress-cfg.md
@@ -0,0 +1,46 @@
+# pflash compression knobs
+
+All PFLASH_COMPRESS_* and DFLASH_COMPRESS_* env vars are read once per
+request in `compress_cfg_from_env(n_chunks, n_keep)` in qwen3_drafter.cpp.
+
+## anchor_radius adaptive ladder
+
+Prevents the 64K NIAH cliff: at long context the needle text is more likely
+to straddle multiple chunks, and a fixed radius=2 window (5 chunks / ~160
+tokens) loses the back half of the needle.
+
+Default ladder (override via PFLASH_COMPRESS_ANCHOR_RADIUS):
+
+| n_chunks   | anchor_radius |
+|------------|---------------|
+| < 1024     | 2             |
+| 1024-2047  | 4             |
+| >= 2048    | 8             |
+
+## max_anchor_hits adaptive ladder
+
+Same breakpoints as anchor_radius. At long context anchors are sparser, so
+more hits per query token are affordable.
+
+| n_chunks   | max_anchor_hits |
+|------------|-----------------|
+| < 1024     | 8               |
+| 1024-2047  | 16              |
+| >= 2048    | 32              |
+
+## anchor_transitive
+
+On by default. Gated rare-token bridge expands the query pool with tokens
+from newly-forced chunks and re-runs anchor scan to fixed point.
+Improves multi-hop F1 on LongBench HotpotQA (empirically; F1=0.628 ceiling
+at ee7+anchor-transitive on RTX 3090 — see bench/2026-05-25_longbench_hotpotqa/).
+Set PFLASH_COMPRESS_ANCHOR_TRANSITIVE=0 to disable.
+
+## head/tail chunk forcing
+
+Head and tail chunks are force-included before top-K scoring fills the
+remainder. The counts scale with n_keep so top-K always gets at least one
+slot even when head_raw + tail_raw >= n_keep.
+
+Defaults: head=8, tail=24 (override via DFLASH_COMPRESS_HEAD_CHUNKS /
+DFLASH_COMPRESS_TAIL_CHUNKS).
diff --git a/docs/pflash-drafter-template-alignment.md b/docs/pflash-drafter-template-alignment.md
@@ -0,0 +1,95 @@
+# Drafter / target distribution alignment via closed-think prefill
+
+## Problem
+
+PR #274 (adaptive composition) shipped on `feat/pflash-drafter-ee7`, validating
+13× prefill TPS and +47% decode TPS at long context. It surfaced a load-bearing
+ceiling on the dflash decode side: spec-decode `accept_rate` was capped at
+13–21% on the opencode harness and went to 0.0% on a peer-chat call. Composition
+arm decode TPS (24.4 tok/s) therefore stayed below pflash-only (33.0 tok/s) —
+the drafter overhead wasn't amortizing through acceptance.
+
+## Diagnosis (the wrong hypothesis first)
+
+The peer-chat conversation suggested "drafter conditioned on a different chat
+template than the target." Three Phase-1 Explore agents traced the code and
+showed that framing is architecturally wrong:
+
+- Both target and drafter receive the **same** `effective_prompt` token IDs at
+  prefill. The chat template is applied **once** on the target side at
+  `server/src/server/http_server.cpp:996-1014`, tokenized with the target's
+  tokenizer at `:1014`, then flows to both target and drafter via
+  `gen_req.prompt = effective_prompt` at `:1265`.
+- The drafter `dflash-draft-3.6-q4_k_m.gguf` does **not** apply any chat
+  template at runtime. `server/src/draft/draft_gguf_loader.cpp` doesn't read
+  the `tokenizer.chat_template` GGUF metadata key.
+
+A `--draft-chat-template` flag would fix nothing — there is no drafter-side
+template-application code path to redirect.
+
+## Diagnosis (the actual root cause)
+
+The drafter GGUF **does** ship the official Qwen3.6 chat template as
+`tokenizer.chat_template` metadata. That template appends
+`<think>\n\n</think>\n\n` after `<|im_start|>assistant\n` when
+`enable_thinking=false`. The drafter was distilled with that closed-think
+block in its training distribution — it expects every assistant turn it
+predicts to begin with that block.
+
+The target's Unsloth Qwen3-Coder template (`project_unsloth_jinja_template_solves_tool_call`
+in memory) does **not** append that suffix. So at the moment spec-decode
+predicts the next token after `<|im_start|>assistant\n`:
+
+- drafter's distribution expects `<think>` literal tokens
+- target's distribution expects the actual answer
+
+Drafter proposes `<think>...`, target rejects, falls back to AR. Repeat at
+every position. `accept_rate` ≈ 0%.
+
+## Fix
+
+Make the **target's render** match the drafter's training distribution.
+`render_chat_template_jinja` now appends `<think>\n\n</think>\n\n` after a
+bare `<|im_start|>assistant` marker when **all three** of these hold:
+
+1. `arch_hint == ChatFormat::QWEN3` (gated to Qwen3-family — qwen3, qwen35,
+   qwen35moe; Laguna / Gemma4 don't use ChatML tokens and must not be touched)
+2. `!enable_thinking`
+3. The rendered prompt ends with the bare assistant marker (tolerant of
+   trailing whitespace variants: `\n`, `\n\n`, trailing space)
+
+Condition (3) prevents double-appending when a user-supplied template already
+emits the closed-think suffix.
+
+## Multi-arch safety
+
+`chat_format_for_arch()` in `server/src/server/chat_template.cpp` returns:
+- `ChatFormat::QWEN3` for `qwen3`, `qwen35`, `qwen35moe`
+- `ChatFormat::LAGUNA` for `laguna`
+- `ChatFormat::GEMMA4` for `gemma4`
+
+The suffix only fires for `QWEN3`. A new test
+(`test_chat_format_for_arch_qwen35moe_returns_qwen3`) locks the qwen35moe →
+QWEN3 inheritance so a future arch-enum addition doesn't silently flip
+behavior. Tests also lock the Laguna/Gemma4 no-append case and the
+no-double-append guard.
+
+## Expected impact
+
+- `accept_rate` rises above its current 13–21% (0% on peer-chat) on Qwen3.6
+  dense with the Unsloth Qwen3-Coder template. The fix counts as working if
+  peer-chat accept_rate is non-zero AND opencode harness accept_rate is ≥30%
+  on at least 2 of 3 turns from Round 5b D.
+- Composition arm decode TPS rises above pflash-only on long-generation
+  workloads (currently 24.4 vs 33.0; the gap exists because spec-decode
+  amortization is bounded by accept_rate).
+- davide221's qwen35moe `chat CACHE` hang (issue #280) likely has the same
+  root cause via the same code path — qwen35moe inherits ChatFormat::QWEN3
+  and the suffix will fire there too.
+
+## Out of scope
+
+The sibling commits on `fix/qwen36-claude-code-tool-calling` (target-side
+tool-format normalization, scrub/truncate, Anthropic→Qwen tool shape,
+param-name aliasing) ship as PR #276. They are not drafter alignment — they
+are independent target-side tool-formatting improvements.