Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b37d3f2
Add renderer chat template kwargs passthrough
eligotts May 23, 2026
0bd7e6d
Reject constructor kwargs in chat template kwargs
eligotts May 23, 2026
d80d4ac
Simplify chat template kwargs validation
eligotts May 23, 2026
7fbf390
Format chat template kwargs changes
eligotts May 23, 2026
b543277
Address chat template kwargs review comments
eligotts May 23, 2026
deb3bdf
Verify chat_template_kwargs parity vs apply_chat_template
hallerite May 24, 2026
c0384a2
Expose every chat-template kwarg the upstream Jinja accepts
hallerite May 24, 2026
f593a29
Refuse bridge when add_vision_id loses prior count
hallerite May 24, 2026
f545378
Apply ruff format to chat-template-kwargs changes
hallerite May 24, 2026
255db59
Replace chat_template_kwargs with typed renderer configs
hallerite May 25, 2026
eb03934
Clean up stale references to the deleted chat_template_kwargs API
hallerite May 25, 2026
d2bcf7e
Strip doc-rot framing — describe current state, not migration history
hallerite May 25, 2026
8c514e0
Rewrite renderer-config doc in the prime-rl / verifiers docs style
hallerite May 25, 2026
0769548
Rename config_for_name → config_from_name
hallerite May 25, 2026
3dab877
Inherit BaseRendererConfig from pydantic_config.BaseConfig
hallerite May 25, 2026
2d74a6b
Trim stale pyproject comments on pydantic and prime-pydantic-config
hallerite May 25, 2026
3e07d7a
Drop direct pydantic dep — get it transitively via prime-pydantic-config
hallerite May 25, 2026
4c9099d
Bump prime-pydantic-config floor to the latest dev release (0.3.0.dev83)
hallerite May 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ coverage.xml
.idea/
.vscode/
*.swp

# agent harness state
.claude/
51 changes: 38 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ from transformers import AutoTokenizer
from renderers import create_renderer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
r = create_renderer(tok, renderer="auto") # → Qwen3Renderer
r = create_renderer(tok) # → Qwen3Renderer (auto-resolved)

prompt_ids = r.render_ids(
[{"role": "user", "content": "hi"}],
Expand Down Expand Up @@ -71,17 +71,17 @@ Each hand-coded bridge:
### Picking a renderer

```python
r = create_renderer(tok, renderer="auto")
r = create_renderer(tok) # AutoRendererConfig is the implicit default
```

Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass `renderer=<name>` explicitly; unknown names fall back to `DefaultRenderer`.
Auto-detect matches `tokenizer.name_or_path` against `MODEL_RENDERER_MAP` by **exact match**. Prefix matching is intentionally off — the same architecture can ship different chat templates (base vs instruct, fine-tune renames). Fine-tunes must pass an explicit typed config (e.g. `Qwen3RendererConfig()`); unknown text-only names fall back to `DefaultRenderer`, while VLMs that miss the exact-match lookup raise instead of falling back.

### Pools

```python
from renderers import create_renderer_pool

pool = create_renderer_pool("Qwen/Qwen3-8B", renderer="auto", size=16)
pool = create_renderer_pool("Qwen/Qwen3-8B", size=16)
with pool.checkout() as r:
ids = r.render_ids(messages)
```
Expand All @@ -108,25 +108,50 @@ Empirical delta on Qwen3.5-35B-A3B + mini-swe-agent-plus, step 0:

Each break fragments a rollout into multiple training samples — every fragment re-encodes its prefix, inflating compute roughly linearly with the number of breaks.

## Compaction overrides
## Typed renderer configs

`create_renderer` and `create_renderer_pool` accept two constructor-only flags:
Each renderer accepts a typed pydantic config that pins its template-control kwargs at construction. `create_renderer` and `create_renderer_pool` take one positional `config` argument:

```python
preserve_all_thinking: bool = False
preserve_thinking_between_tool_calls: bool = False
from renderers import (
create_renderer,
AutoRendererConfig,
Qwen3RendererConfig,
GLM5RendererConfig,
DefaultRendererConfig,
)

# Auto-resolve renderer from the tokenizer's model name. Carries the
# shared preserve_* flags; template kwargs require an explicit choice.
Comment thread
hallerite marked this conversation as resolved.
renderer = create_renderer(tokenizer)
Comment thread
mikasenghaas marked this conversation as resolved.
renderer = create_renderer(tokenizer, AutoRendererConfig(preserve_all_thinking=True))

# Explicit choice — the typed config exposes exactly the fields that
# renderer's chat template honours.
renderer = create_renderer(tokenizer, Qwen3RendererConfig(enable_thinking=False))
renderer = create_renderer(tokenizer, GLM5RendererConfig(clear_thinking=False))

# Default renderer (apply_chat_template fallback) — extra fields are
# captured via pydantic ``extra="allow"`` and forwarded to the Jinja
# template; tool / reasoning parsers are typed.
renderer = create_renderer(
tokenizer,
DefaultRendererConfig(tool_parser="qwen3", reasoning_parser="think"),
)
```

Defaults preserve byte-identity with the model's chat template. Flipping a flag at construction restores `reasoning_content` the template would otherwise drop:
Discriminated union: every per-renderer config is a variant of `RendererConfig`, dispatched on the `name` field. Bogus combinations (e.g. `add_vision_id` under `name="qwen3"`) error at construction with a `pydantic.ValidationError`. Downstream pydantic configs (prime-rl orchestrator, verifiers `ClientConfig`) hold a single field typed as `RendererConfig` and inherit the same strict-per-variant validation.

Two shared behaviour flags live on every variant via `BaseRendererConfig`:

- `preserve_all_thinking=True` — every past assistant's reasoning is kept.
- `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (no-op for current renderers; reserved for future templates that drop it).
- `preserve_all_thinking=True` — every past assistant's `reasoning_content` is kept, even when the chat template would drop it.
Comment thread
hallerite marked this conversation as resolved.
- `preserve_thinking_between_tool_calls=True` — reasoning is kept on assistants in the in-flight tool cycle (post-last-user A-T-…-A block when it contains a tool response). A new user turn closes the block and drops its thinking.

The canonical use case is **compaction**. Injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a "past cycle", so template-default rules drop their `reasoning_content` before the summarizer sees it. Build the renderer with `preserve_all_thinking=True` to keep reasoning visible end-to-end on those flows. Both flags only ever *add* tokens vs the template default.
These OR-compose with template-level toggles (e.g. GLM-5 `clear_thinking`, Nemotron-3 `truncate_history_thinking`): any setting that says "keep" wins. The `preserve_*` flags can only ever *extend* retention — they never force a "drop" that the template kwargs would not already make. The canonical use case is **compaction**: injecting a `user` turn like *"summarize the work so far"* puts every prior assistant in a past cycle, and `preserve_all_thinking=True` keeps reasoning visible end-to-end.

## `DefaultRenderer`

Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` kwargs (vLLM convention). `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.
Fallback for unsupported models. Wraps `apply_chat_template` and accepts `tool_parser` / `reasoning_parser` (vLLM convention) plus arbitrary Jinja kwargs via `DefaultRendererConfig`'s `extra="allow"`. `bridge_to_next_turn` returns `None` because the template's close is unknown, so multi-turn rollouts fall back to full re-render. Implementing a hand-coded renderer is a few hundred lines of Python (`render_ids` + `parse_response` + `bridge_to_next_turn`) and is the only path that closes the failure modes above by construction.

## Roadmap

Expand Down
163 changes: 163 additions & 0 deletions docs/renderer-config.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# Renderer config

`renderers.RendererConfig` is the typed input to `create_renderer` and
`create_renderer_pool`. It pins the renderer choice and its template-control
kwargs at construction.

```python
from renderers import create_renderer, Qwen35RendererConfig

r = create_renderer(tokenizer, Qwen35RendererConfig(enable_thinking=False))
```

`RendererConfig` is a pydantic discriminated union (one variant per renderer,
dispatched on the `name` field). Selecting a variant exposes exactly the
fields that renderer's chat template honours; anything else raises a
`pydantic.ValidationError` at construction.

## Per-renderer configs

Each hand-coded renderer has a typed config class with the template kwargs
its Jinja chat template reads. For example:

| Renderer | Config class | Template fields |
|----------------|--------------------------|----------------------------------------------------------------|
| Qwen3 | `Qwen3RendererConfig` | `enable_thinking` |
| Qwen3.5 / 3.6 | `Qwen35RendererConfig` | `enable_thinking`, `add_vision_id` |
| Qwen3-VL | `Qwen3VLRendererConfig` | `add_vision_id` |
| GLM-5 / 5.1 | `GLM5RendererConfig` | `enable_thinking`, `clear_thinking` |
| GLM-4.5 | `GLM45RendererConfig` | `enable_thinking` |
| Nemotron-3 | `Nemotron3RendererConfig`| `enable_thinking`, `truncate_history_thinking` |
| Kimi K2.5 | `KimiK25RendererConfig` | `thinking` |
| MiniMax-M2 | `MiniMaxM2RendererConfig`| `model_identity` |
| Laguna-XS.2 | `LagunaXS2RendererConfig`| `enable_thinking`, `render_assistant_messages_raw` |
| gpt-oss | `GptOssRendererConfig` | `reasoning_effort`, `conversation_start_date` |

Field names mirror the upstream Jinja variable names. Passing
`Qwen3RendererConfig(add_vision_id=True)` raises — Qwen3 is text-only, so
the field doesn't exist on its config. Use
`type(config).template_field_names()` to introspect the fields that mirror
chat-template kwargs (parity is verified against `apply_chat_template` in
`tests/test_renderer_config_parity.py`).

Configs are frozen. To override a field, construct a new instance or call
`config.model_copy(update={...})`.

## Auto-resolution

`create_renderer(tokenizer)` (no config) resolves the renderer from
`tokenizer.name_or_path` via `MODEL_RENDERER_MAP`:

```python
r = create_renderer(tokenizer) # AutoRendererConfig() is the default
r = create_renderer(tokenizer, AutoRendererConfig(preserve_all_thinking=True))
```

`AutoRendererConfig` carries only the shared `preserve_*` flags. Template
kwargs depend on the renderer, so overriding them requires naming the
renderer explicitly:

```python
r = create_renderer(tokenizer, GLM5RendererConfig(clear_thinking=False))
```

Auto-resolution fails loudly for VLMs that miss the exact-match lookup —
`DefaultRenderer` only knows `apply_chat_template` + text tokens, so silently
falling back for a VLM would produce token streams the trainer can't
reconstruct. Text-only fine-tunes without a registered renderer fall back to
`DefaultRenderer` and log the choice at INFO.

## `preserve_*` flags

Every variant carries two renderer-agnostic flags on `BaseRendererConfig`:

- `preserve_all_thinking: bool = False` — re-emit `reasoning_content` on
every past assistant turn, even when the chat template would drop it.
- `preserve_thinking_between_tool_calls: bool = False` — re-emit
`reasoning_content` only inside the in-flight tool cycle (the contiguous
A-T-…-A block after the most recent `user` message, when it contains at
least one `tool` response). A new user turn closes the block and drops
its thinking.

These OR-compose with template-level toggles. GLM-5's `clear_thinking` and
Nemotron-3's `truncate_history_thinking` already gate past thinking; the
`preserve_*` flags add to that:

| `clear_thinking` | `preserve_all_thinking` | past thinking? |
|------------------|-------------------------|----------------|
| `True` (default — drop) | `False` (default) | dropped |
| `True` | `True` | kept |
| `False` (keep) | `False` | kept |
| `False` | `True` | kept |

`preserve_*` can only extend retention, never force a drop. The canonical
use case is **compaction**: injecting a `user` turn like *"summarize the work
so far"* puts every prior assistant in a past cycle, and
`preserve_all_thinking=True` keeps reasoning visible end-to-end.

## `DefaultRendererConfig` accepts arbitrary Jinja kwargs

`DefaultRenderer` wraps `tokenizer.apply_chat_template` for any model that
doesn't have a hand-coded renderer. Its config sets `extra="allow"`:

```python
from renderers import create_renderer, DefaultRendererConfig

r = create_renderer(
tokenizer,
DefaultRendererConfig(
tool_parser="qwen3", # registered in renderers.parsers
reasoning_parser="think",
enable_thinking=False, # forwarded to apply_chat_template
custom_jinja_kwarg=True, # ditto
),
)
```

`tool_parser` and `reasoning_parser` are typed because they configure
`DefaultRenderer`'s own parsing pipeline. Every other field lands in
`model_extra` and `DefaultRenderer._apply` forwards `model_extra` verbatim
to `apply_chat_template`.

## Downstream integration

Downstream pydantic configs (`prime-rl` orchestrator, `verifiers`
`ClientConfig`) hold a single field typed as `RendererConfig`:

```python
from pydantic import BaseModel, Field
from renderers import AutoRendererConfig, RendererConfig

class ClientConfig(BaseModel):
renderer: RendererConfig = Field(default_factory=AutoRendererConfig)
```

In TOML / YAML, the discriminator routes deserialization:

```toml
[client.renderer]
name = "qwen3.5"
enable_thinking = false
add_vision_id = true
preserve_all_thinking = true
```

Pydantic dispatches on `name = "qwen3.5"` to `Qwen35RendererConfig`. Bogus
combinations (e.g. `add_vision_id` under `name = "qwen3"`) raise at
config-load with a clear message naming the offending field and the variant
that rejected it.

To construct a config from a renderer name string (e.g. from a CLI flag):

```python
from renderers import config_from_name

cfg = config_from_name("glm-5") # → GLM5RendererConfig() with defaults
cfg = config_from_name("auto") # → None, the implicit "auto" form
```

## Renaming a renderer is a breaking change

The discriminator key is the renderer name string. Renaming `"qwen3.5"` to
something else would break any downstream config that references it by
name. Add new renderers; don't rename existing ones.
7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ dependencies = [
# around ``from_pretrained``, so subsequent ``AutoTokenizer`` calls
# outside the renderers package stay vanilla.
"fastokens>=0.2.0",
# ``BaseRendererConfig`` inherits from ``pydantic_config.BaseConfig`` so
# the typed-config surface stays uniform with prime-rl / verifiers config
# bases. Transitively brings pydantic, which ``renderers.configs`` also
# imports directly.
"prime-pydantic-config>=0.3.0.dev83",
]

[tool.hatch.version]
Expand Down Expand Up @@ -73,7 +78,7 @@ exclude-newer = "7 days"
# MiniMax-M2's slow→fast tokenizer conversion path. Exempting it from
# the project-wide 7-day cutoff lets the lockfile pick it up immediately
# while the rest of the dependency graph stays gated.
exclude-newer-package = { fastokens = false }
exclude-newer-package = { fastokens = false, "prime-pydantic-config" = false }

[tool.ty.environment]
python-version = "3.13"
Expand Down
49 changes: 45 additions & 4 deletions renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,30 @@
trim_to_turn_close,
)
from renderers.client import OverlongPromptError
from renderers.configs import (
AutoRendererConfig,
BaseRendererConfig,
config_from_name,
DefaultRendererConfig,
DeepSeekV3RendererConfig,
GLM45RendererConfig,
GLM51RendererConfig,
GLM5RendererConfig,
GptOssRendererConfig,
KimiK25RendererConfig,
KimiK2RendererConfig,
LagunaXS2RendererConfig,
MiniMaxM2RendererConfig,
Nemotron3RendererConfig,
Qwen35RendererConfig,
Qwen36RendererConfig,
Qwen3RendererConfig,
Qwen3VLRendererConfig,
RendererConfig,
)
from renderers.deepseek_v3 import DeepSeekV3Renderer
from renderers.default import DefaultRenderer
from renderers.glm5 import GLM5Renderer
from renderers.glm5 import GLM5Renderer, GLM51Renderer
from renderers.glm45 import GLM45Renderer
from renderers.gpt_oss import GptOssRenderer
from renderers.kimi_k2 import KimiK2Renderer
Expand All @@ -54,34 +75,53 @@
from renderers.qwen36 import Qwen36Renderer

__all__ = [
"AutoRendererConfig",
"BaseRendererConfig",
"Content",
"ContentPart",
"DeepSeekV3Renderer",
"DeepSeekV3RendererConfig",
"DefaultRenderer",
"DefaultRendererConfig",
"GLM45Renderer",
"GLM45RendererConfig",
"GLM51Renderer",
"GLM51RendererConfig",
"GLM5Renderer",
"GLM5RendererConfig",
"GptOssRenderer",
"GptOssRendererConfig",
"ImagePart",
"KimiK2Renderer",
"KimiK25Renderer",
"KimiK25RendererConfig",
"KimiK2Renderer",
"KimiK2RendererConfig",
"LagunaXS2Renderer",
"LagunaXS2RendererConfig",
"MULTIMODAL_MODELS",
"Message",
"MiniMaxM2Renderer",
"MiniMaxM2RendererConfig",
"MultiModalData",
"MultimodalRenderer",
"Nemotron3Renderer",
"Nemotron3RendererConfig",
"OverlongPromptError",
"ParsedResponse",
"ParsedToolCall",
"PlaceholderRange",
"Qwen3Renderer",
"Qwen3VLRenderer",
"Qwen35Renderer",
"Qwen35RendererConfig",
"Qwen36Renderer",
"Qwen36RendererConfig",
"Qwen3Renderer",
"Qwen3RendererConfig",
"Qwen3VLRenderer",
"Qwen3VLRendererConfig",
"RenderedConversation",
"RenderedTokens",
"Renderer",
"RendererConfig",
"RendererPool",
"TextPart",
"ThinkingPart",
Expand All @@ -94,6 +134,7 @@
"attribute_text_segments",
"build_training_sample",
"build_trajectory_step",
"config_from_name",
"create_renderer",
"create_renderer_pool",
"is_multimodal",
Expand Down
Loading
Loading