PrimeIntellect-ai · hallerite · May 25, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/.gitignore b/.gitignore
@@ -31,3 +31,6 @@ coverage.xml
 .idea/
 .vscode/
 *.swp
+
+# agent harness state
+.claude/
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ from transformers import AutoTokenizer
 from renderers import create_renderer
 
 tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
-r = create_renderer(tok, renderer="auto")           # → Qwen3Renderer
+r = create_renderer(tok)                            # → Qwen3Renderer (auto-resolved)
 
 prompt_ids = r.render_ids(
     [{"role": "user", "content": "hi"}],
@@ -71,17 +71,17 @@ Each hand-coded bridge:
 ### Picking a renderer
 
 ```python
-r = create_renderer(tok, renderer="auto")
+r = create_renderer(tok)                # AutoRendererConfig is the implicit default
 ```
 
-Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass `renderer=<name>` explicitly; unknown names fall back to `DefaultRenderer`.
+Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — the same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass an explicit typed config (e.g. `Qwen3RendererConfig()`); unknown text-only names fall back to `DefaultRenderer`, while VLMs that miss the exact-match lookup raise instead of falling back.
 
 ### Pools
 
 ```python
 from renderers import create_renderer_pool
 
-pool = create_renderer_pool("Qwen/Qwen3-8B", renderer="auto", size=16)
+pool = create_renderer_pool("Qwen/Qwen3-8B", size=16)
 with pool.checkout() as r:
     ids = r.render_ids(messages)
 ```
@@ -108,25 +108,50 @@ Empirical delta on Qwen3.5-35B-A3B + mini-swe-agent-plus, step 0:
 
 Each break fragments a rollout into multiple training samples — every fragment re-encodes its prefix, inflating compute roughly linearly with the number of breaks.
 
-## Compaction overrides
+## Typed renderer configs
 
-`create_renderer` and `create_renderer_pool` accept two constructor-only flags:
+Each renderer accepts a typed pydantic config that pins its template-control kwargs at construction. `create_renderer` and `create_renderer_pool` take one optional positional `config` argument (after the tokenizer or model name); omitting it auto-resolves the renderer:
 
 ```python
-preserve_all_thinking: bool = False
-preserve_thinking_between_tool_calls: bool = False
+from renderers import (
+    create_renderer,
+    AutoRendererConfig,
+    Qwen3RendererConfig,
+    GLM5RendererConfig,
+    DefaultRendererConfig,
+)
+
+# Auto-resolve renderer from the tokenizer's model name. Carries the
+# shared preserve_* flags; template kwargs require an explicit choice.
+renderer = create_renderer(tokenizer)
+renderer = create_renderer(tokenizer, AutoRendererConfig(preserve_all_thinking=True))
+
+# Explicit choice — the typed config exposes exactly the fields that
+# renderer's chat template honours.
+renderer = create_renderer(tokenizer, Qwen3RendererConfig(enable_thinking=False))
+renderer = create_renderer(tokenizer, GLM5RendererConfig(clear_thinking=False))
+
+# Default renderer (apply_chat_template fallback) — extra fields are
+# captured via pydantic ``extra="allow"`` and forwarded to the Jinja
+# template; tool / reasoning parsers are typed.
+renderer = create_renderer(
+    tokenizer,
+    DefaultRendererConfig(tool_parser="qwen3", reasoning_parser="think"),
+)
 ```
 
-Defaults preserve byte-identity with the model's chat template. Flipping a flag at construction restores `reasoning_content` the template would otherwise drop:
+Discriminated union: every per-renderer config is a variant of `RendererConfig`, dispatched on the `name` field. Bogus combinations (e.g. `add_vision_id` under `name="qwen3"`) error at construction with a `pydantic.ValidationError`. Downstream pydantic configs (prime-rl orchestrator, verifiers `ClientConfig`) hold a single field typed as `RendererConfig` and inherit the same strict-per-variant validation.
+
+Two shared behaviour flags live on every variant via `BaseRendererConfig`:
 
-- `preserve_all_thinking=True` — every past assistant's reasoning is kept.
-- `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (no-op for current renderers; reserved for future templates that drop it).
+- `preserve_all_thinking=True` — every past assistant's `reasoning_content` is kept, even when the chat template would drop it.
+- `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (the contiguous A-T-…-A block after the most recent `user` message, when it contains at least one `tool` response). A new user turn closes the block and drops its thinking.
 
-The canonical use case is **compaction**. Injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a "past cycle", so template-default rules drop their `reasoning_content` before the summarizer sees it. Build the renderer with `preserve_all_thinking=True` to keep reasoning visible end-to-end on those flows. Both flags only ever *add* tokens vs the template default.
+These OR-compose with template-level toggles (e.g. GLM-5 `clear_thinking`, Nemotron-3 `truncate_history_thinking`): whichever side says "keep" wins. The `preserve_*` flags can only ever *extend* retention — they never override a template kwarg into a "drop" decision. The canonical use case is **compaction**: injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a past cycle, and `preserve_all_thinking=True` keeps reasoning visible end-to-end.
 
 ## `DefaultRenderer`
 
-Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` kwargs (vLLM convention). `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.
+Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` (vLLM convention) plus arbitrary Jinja kwargs via `DefaultRendererConfig`'s `extra="allow"`. `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.
 
 ## Roadmap
 

diff --git a/docs/renderer-config.md b/docs/renderer-config.md
@@ -0,0 +1,163 @@
+# Renderer config
+
+`renderers.RendererConfig` is the typed input to `create_renderer` and
+`create_renderer_pool`. It pins the renderer choice and its template-control
+kwargs at construction.
+
+```python
+from renderers import create_renderer, Qwen35RendererConfig
+
+r = create_renderer(tokenizer, Qwen35RendererConfig(enable_thinking=False))
+```
+
+`RendererConfig` is a pydantic discriminated union (one variant per renderer,
+dispatched on the `name` field). Selecting a variant exposes exactly the
+fields that renderer's chat template honours; anything else raises a
+`pydantic.ValidationError` at construction.
+
+## Per-renderer configs
+
+Each hand-coded renderer has a typed config class with the template kwargs
+its Jinja chat template reads. For example:
+
+| Renderer       | Config class             | Template fields                                                |
+|----------------|--------------------------|----------------------------------------------------------------|
+| Qwen3          | `Qwen3RendererConfig`    | `enable_thinking`                                              |
+| Qwen3.5 / 3.6  | `Qwen35RendererConfig`   | `enable_thinking`, `add_vision_id`                             |
+| Qwen3-VL       | `Qwen3VLRendererConfig`  | `add_vision_id`                                                |
+| GLM-5 / 5.1    | `GLM5RendererConfig`     | `enable_thinking`, `clear_thinking`                            |
+| GLM-4.5        | `GLM45RendererConfig`    | `enable_thinking`                                              |
+| Nemotron-3     | `Nemotron3RendererConfig`| `enable_thinking`, `truncate_history_thinking`                 |
+| Kimi K2.5      | `KimiK25RendererConfig`  | `thinking`                                                     |
+| MiniMax-M2     | `MiniMaxM2RendererConfig`| `model_identity`                                               |
+| Laguna-XS.2    | `LagunaXS2RendererConfig`| `enable_thinking`, `render_assistant_messages_raw`             |
+| gpt-oss        | `GptOssRendererConfig`   | `reasoning_effort`, `conversation_start_date`                  |
+
+Field names mirror the upstream Jinja variable names. Passing
+`Qwen3RendererConfig(add_vision_id=True)` raises — Qwen3 is text-only, so
+the field doesn't exist on its config. Use
+`type(config).template_field_names()` to introspect the fields that mirror
+chat-template kwargs (parity is verified against `apply_chat_template` in
+`tests/test_renderer_config_parity.py`).
+
+Configs are frozen. To override a field, construct a new instance or call
+`config.model_copy(update={...})`.
+
+## Auto-resolution
+
+`create_renderer(tokenizer)` (no config) resolves the renderer from
+`tokenizer.name_or_path` via `MODEL_RENDERER_MAP`:
+
+```python
+r = create_renderer(tokenizer)                                 # AutoRendererConfig() is the default
+r = create_renderer(tokenizer, AutoRendererConfig(preserve_all_thinking=True))
+```
+
+`AutoRendererConfig` carries only the shared `preserve_*` flags. Template
+kwargs depend on the renderer, so overriding them requires naming the
+renderer explicitly:
+
+```python
+r = create_renderer(tokenizer, GLM5RendererConfig(clear_thinking=False))
+```
+
+Auto-resolution fails loudly for VLMs that miss the exact-match lookup —
+`DefaultRenderer` only knows `apply_chat_template` + text tokens, so silently
+falling back for a VLM would produce token streams the trainer can't
+reconstruct. Text-only fine-tunes without a registered renderer fall back to
+`DefaultRenderer` and log the choice at INFO.
+
+## `preserve_*` flags
+
+Every variant carries two renderer-agnostic flags on `BaseRendererConfig`:
+
+- `preserve_all_thinking: bool = False` — re-emit `reasoning_content` on
+  every past assistant turn, even when the chat template would drop it.
+- `preserve_thinking_between_tool_calls: bool = False` — re-emit
+  `reasoning_content` only inside the in-flight tool cycle (the contiguous
+  A-T-…-A block after the most recent `user` message, when it contains at
+  least one `tool` response). A new user turn closes the block and drops
+  its thinking.
+
+These OR-compose with template-level toggles. GLM-5's `clear_thinking` and
+Nemotron-3's `truncate_history_thinking` already gate past thinking; the
+`preserve_*` flags add to that:
+
+| `clear_thinking` | `preserve_all_thinking` | past thinking? |
+|------------------|-------------------------|----------------|
+| `True` (default — drop) | `False` (default) | dropped |
+| `True`           | `True`                  | kept           |
+| `False` (keep)   | `False`                 | kept           |
+| `False`          | `True`                  | kept           |
+
+`preserve_*` can only extend retention, never force a drop. The canonical
+use case is **compaction**: injecting a `user` turn like *"summarize the work
+so far"* puts every prior assistant in a past cycle, and
+`preserve_all_thinking=True` keeps reasoning visible end-to-end.
+
+## `DefaultRendererConfig` accepts arbitrary Jinja kwargs
+
+`DefaultRenderer` wraps `tokenizer.apply_chat_template` for any model that
+doesn't have a hand-coded renderer. Its config sets `extra="allow"`:
+
+```python
+from renderers import create_renderer, DefaultRendererConfig
+
+r = create_renderer(
+    tokenizer,
+    DefaultRendererConfig(
+        tool_parser="qwen3",                # registered in renderers.parsers
+        reasoning_parser="think",
+        enable_thinking=False,              # forwarded to apply_chat_template
+        custom_jinja_kwarg=True,            # ditto
+    ),
+)
+```
+
+`tool_parser` and `reasoning_parser` are typed because they configure
+`DefaultRenderer`'s own parsing pipeline. Every other field lands in
+`model_extra` and `DefaultRenderer._apply` forwards `model_extra` verbatim
+to `apply_chat_template`.
+
+## Downstream integration
+
+Downstream pydantic configs (`prime-rl` orchestrator, `verifiers`
+`ClientConfig`) hold a single field typed as `RendererConfig`:
+
+```python
+from pydantic import BaseModel, Field
+from renderers import AutoRendererConfig, RendererConfig
+
+class ClientConfig(BaseModel):
+    renderer: RendererConfig = Field(default_factory=AutoRendererConfig)
+```
+
+In TOML / YAML, the discriminator routes deserialization:
+
+```toml
+[client.renderer]
+name = "qwen3.5"
+enable_thinking = false
+add_vision_id = true
+preserve_all_thinking = true
+```
+
+Pydantic dispatches on `name = "qwen3.5"` to `Qwen35RendererConfig`. Bogus
+combinations (e.g. `add_vision_id` under `name = "qwen3"`) raise at
+config-load with a clear message naming the offending field and the variant
+that rejected it.
+
+To construct a config from a renderer name string (e.g. from a CLI flag):
+
+```python
+from renderers import config_from_name
+
+cfg = config_from_name("glm-5")           # → GLM5RendererConfig() with defaults
+cfg = config_from_name("auto")            # → None, the implicit "auto" form
+```
+
+## Renaming a renderer is a breaking change
+
+The discriminator key is the renderer name string. Renaming `"qwen3.5"` to
+something else would break any downstream config that references it by
+name. Add new renderers; don't rename existing ones.
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,11 @@ dependencies = [
     # around ``from_pretrained``, so subsequent ``AutoTokenizer`` calls
     # outside the renderers package stay vanilla.
     "fastokens>=0.2.0",
+    # ``BaseRendererConfig`` inherits from ``pydantic_config.BaseConfig`` so
+    # the typed-config surface stays uniform with prime-rl / verifiers config
+    # bases. Transitively brings pydantic, which ``renderers.configs`` also
+    # imports directly.
+    "prime-pydantic-config>=0.3.0.dev83",
 ]
 
 [tool.hatch.version]
@@ -73,7 +78,7 @@ exclude-newer = "7 days"
 # MiniMax-M2's slow→fast tokenizer conversion path. Exempting it from
 # the project-wide 7-day cutoff lets the lockfile pick it up immediately
 # while the rest of the dependency graph stays gated.
-exclude-newer-package = { fastokens = false }
+exclude-newer-package = { fastokens = false, "prime-pydantic-config" = false }
 
 [tool.ty.environment]
 python-version = "3.13"

diff --git a/renderers/__init__.py b/renderers/__init__.py
@@ -38,9 +38,30 @@
     trim_to_turn_close,
 )
 from renderers.client import OverlongPromptError
+from renderers.configs import (
+    AutoRendererConfig,
+    BaseRendererConfig,
+    config_from_name,
+    DefaultRendererConfig,
+    DeepSeekV3RendererConfig,
+    GLM45RendererConfig,
+    GLM51RendererConfig,
+    GLM5RendererConfig,
+    GptOssRendererConfig,
+    KimiK25RendererConfig,
+    KimiK2RendererConfig,
+    LagunaXS2RendererConfig,
+    MiniMaxM2RendererConfig,
+    Nemotron3RendererConfig,
+    Qwen35RendererConfig,
+    Qwen36RendererConfig,
+    Qwen3RendererConfig,
+    Qwen3VLRendererConfig,
+    RendererConfig,
+)
 from renderers.deepseek_v3 import DeepSeekV3Renderer
 from renderers.default import DefaultRenderer
-from renderers.glm5 import GLM5Renderer
+from renderers.glm5 import GLM5Renderer, GLM51Renderer
 from renderers.glm45 import GLM45Renderer
 from renderers.gpt_oss import GptOssRenderer
 from renderers.kimi_k2 import KimiK2Renderer
@@ -54,34 +75,53 @@
 from renderers.qwen36 import Qwen36Renderer
 
 __all__ = [
+    "AutoRendererConfig",
+    "BaseRendererConfig",
     "Content",
     "ContentPart",
     "DeepSeekV3Renderer",
+    "DeepSeekV3RendererConfig",
     "DefaultRenderer",
+    "DefaultRendererConfig",
     "GLM45Renderer",
+    "GLM45RendererConfig",
+    "GLM51Renderer",
+    "GLM51RendererConfig",
     "GLM5Renderer",
+    "GLM5RendererConfig",
     "GptOssRenderer",
+    "GptOssRendererConfig",
     "ImagePart",
-    "KimiK2Renderer",
     "KimiK25Renderer",
+    "KimiK25RendererConfig",
+    "KimiK2Renderer",
+    "KimiK2RendererConfig",
     "LagunaXS2Renderer",
+    "LagunaXS2RendererConfig",
     "MULTIMODAL_MODELS",
     "Message",
     "MiniMaxM2Renderer",
+    "MiniMaxM2RendererConfig",
     "MultiModalData",
     "MultimodalRenderer",
     "Nemotron3Renderer",
+    "Nemotron3RendererConfig",
     "OverlongPromptError",
     "ParsedResponse",
     "ParsedToolCall",
     "PlaceholderRange",
-    "Qwen3Renderer",
-    "Qwen3VLRenderer",
     "Qwen35Renderer",
+    "Qwen35RendererConfig",
     "Qwen36Renderer",
+    "Qwen36RendererConfig",
+    "Qwen3Renderer",
+    "Qwen3RendererConfig",
+    "Qwen3VLRenderer",
+    "Qwen3VLRendererConfig",
     "RenderedConversation",
     "RenderedTokens",
     "Renderer",
+    "RendererConfig",
     "RendererPool",
     "TextPart",
     "ThinkingPart",
@@ -94,6 +134,7 @@
     "attribute_text_segments",
     "build_training_sample",
     "build_trajectory_step",
+    "config_from_name",
     "create_renderer",
     "create_renderer_pool",
     "is_multimodal",