diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py index 10fcac691..0e14fc63a 100644 --- a/tests/test_renderer_client.py +++ b/tests/test_renderer_client.py @@ -1,3 +1,4 @@ +import asyncio from functools import lru_cache from unittest.mock import patch @@ -81,6 +82,155 @@ def test_renderer_client_uses_renderer_model_name_override(): ) +def test_renderer_client_threads_chat_template_kwargs_into_pool(): + """``sampling_args.extra_body.chat_template_kwargs`` land on the + concrete RendererConfig (resolving Auto/None against + ``MODEL_RENDERER_MAP`` when needed) and are stripped from the params + forwarded to /generate. Covers explicit / Auto / None bases.""" + from renderers import AutoRendererConfig, Qwen3RendererConfig + + bases = [ + Qwen3RendererConfig(enable_thinking=True), + AutoRendererConfig(preserve_all_thinking=True), + None, + ] + for base in bases: + RendererClient._shared_pools.clear() + + client = object.__new__(RendererClient) + client._renderer = None + client._pool_size = 1 + client._config = vf.ClientConfig(client_type="renderer", renderer_config=base) + client._client = object() # type: ignore[attr-defined] + + sentinel_pool = RendererPool.__new__(RendererPool) + captured: dict = {} + + async def _fake_generate(**kwargs): + captured.update(kwargs) + return {"content": "ok"} + + with ( + patch( + "verifiers.clients.renderer_client.create_renderer_pool", + return_value=sentinel_pool, + ) as create_pool_mock, + patch( + "verifiers.clients.renderer_client.generate", + side_effect=_fake_generate, + ), + ): + asyncio.run( + client.get_native_response( + prompt=[{"role": "user", "content": "hi"}], + model="Qwen/Qwen3-8B", + sampling_args={ + "extra_body": { + "chat_template_kwargs": {"enable_thinking": False}, + "top_k": 20, + } + }, + tools=None, + ) + ) + + expected_preserve_all = ( + base.preserve_all_thinking + if isinstance(base, AutoRendererConfig) + else False + ) + create_pool_mock.assert_called_once_with( + "Qwen/Qwen3-8B", + Qwen3RendererConfig( + enable_thinking=False, + preserve_all_thinking=expected_preserve_all, + ), + size=1, + ) + assert captured["sampling_params"] == {"top_k": 20} + + +def test_renderer_client_auto_resolves_against_renderer_model_name_override(): + """When ``ClientConfig.renderer_model_name`` overrides the API request + model, auto-resolution looks up the OVERRIDE in ``MODEL_RENDERER_MAP`` + (it's what loads the tokenizer the renderer holds) — not the request + model, which may not be in the map at all.""" + from renderers import Qwen3RendererConfig + + RendererClient._shared_pools.clear() + + client = object.__new__(RendererClient) + client._renderer = None + client._pool_size = 1 + client._config = vf.ClientConfig( + client_type="renderer", + renderer_model_name="Qwen/Qwen3-8B", # override + ) + client._client = object() # type: ignore[attr-defined] + + sentinel_pool = RendererPool.__new__(RendererPool) + + async def _fake_generate(**kwargs): + return {"content": "ok"} + + with ( + patch( + "verifiers.clients.renderer_client.create_renderer_pool", + return_value=sentinel_pool, + ) as create_pool_mock, + patch("verifiers.clients.renderer_client.generate", side_effect=_fake_generate), + ): + asyncio.run( + client.get_native_response( + prompt=[{"role": "user", "content": "hi"}], + model="r8-smoke", # not in MODEL_RENDERER_MAP + sampling_args={ + "extra_body": {"chat_template_kwargs": {"enable_thinking": False}} + }, + tools=None, + ) + ) + + # Resolves against renderer_model_name (Qwen/Qwen3-8B → "qwen3") rather + # than the request "r8-smoke" (which would fall through to default). + create_pool_mock.assert_called_once_with( + "Qwen/Qwen3-8B", + Qwen3RendererConfig(enable_thinking=False), + size=1, + ) + + +def test_renderer_client_rejects_invalid_chat_template_kwargs(): + """Unknown / mistyped chat_template_kwargs surface as a pydantic + ``ValidationError`` (``extra="forbid"`` on the typed RendererConfig).""" + from pydantic import ValidationError + from renderers import Qwen3RendererConfig + + RendererClient._shared_pools.clear() + + client = object.__new__(RendererClient) + client._renderer = None + client._pool_size = 1 + client._config = vf.ClientConfig( + client_type="renderer", renderer_config=Qwen3RendererConfig() + ) + client._client = object() # type: ignore[attr-defined] + + with pytest.raises(ValidationError, match="enable_thinkng"): + asyncio.run( + client.get_native_response( + prompt=[{"role": "user", "content": "hi"}], + model="Qwen/Qwen3-8B", + sampling_args={ + "extra_body": { + "chat_template_kwargs": {"enable_thinkng": False}, # typo + } + }, + tools=None, + ) + ) + + # Provenance: Eli's review on PR #1068, comment 3150580768. # "RendererClient parses the GPT-OSS assistant tool call into ToolCall(name=...), # but ToolEnv returns ToolMessage with only content/tool_call_id, and diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 68bfa7bf0..b6ea82166 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -18,18 +18,22 @@ from renderers import Message as RendererMessage from renderers import OverlongPromptError as RendererOverlongPromptError from renderers import ( + AutoRendererConfig, MultimodalRenderer, ParsedToolCall, RenderedTokens, Renderer, + RendererConfig, RendererPool, ToolCallParseStatus, ToolSpec, + config_from_name, create_renderer_pool, is_multimodal, ) from renderers import ToolCall as RendererToolCall from renderers import ToolCallFunction +from renderers.base import MODEL_RENDERER_MAP from renderers.client import _maybe_offload, generate from verifiers.clients.client import Client @@ -374,6 +378,53 @@ async def _get_incremental_prompt_ids( return None +def _resolve_renderer_config( + base: RendererConfig | None, + chat_template_kwargs: Mapping[str, Any] | None, + *, + renderer_model: str, +) -> RendererConfig | None: + """Merge ``chat_template_kwargs`` into a typed ``RendererConfig``. + + When ``base`` is ``None`` or ``AutoRendererConfig`` (would auto-resolve + inside ``renderers.create_renderer``), we pull resolution forward via + ``MODEL_RENDERER_MAP`` so kwargs land on the concrete config variant + and pydantic validates them against the actual renderer's schema — + ``AutoRendererConfig`` intentionally carries only ``preserve_*`` and + would reject template kwargs like ``enable_thinking``. ``renderer_model`` + must match what the pool will tokenize with (i.e. + ``ClientConfig.renderer_model_name`` when set, else the request model), + so resolution agrees with the tokenizer the renderer will hold. + + Kwargs override fields with the same name on the (resolved) base. + Typed configs (``extra="forbid"``) reject unknown keys with a + field-path error; ``DefaultRendererConfig`` (``extra="allow"``) keeps + the escape hatch for arbitrary jinja kwargs. + """ + if not chat_template_kwargs: + return base + + # Resolve auto → concrete (mirrors ``renderers._resolve_auto``) so + # ``enable_thinking`` etc. validate against the right schema instead of + # ``AutoRendererConfig``'s minimal one. Carries ``preserve_*`` across. + if base is None or isinstance(base, AutoRendererConfig): + renderer_name = MODEL_RENDERER_MAP.get(renderer_model, "default") + # ``config_from_name`` returns ``None`` only for ``"auto"``, which + # ``MODEL_RENDERER_MAP.get(..., "default")`` excludes — assert for ty. + concrete = config_from_name(renderer_name) + assert concrete is not None + if isinstance(base, AutoRendererConfig): + concrete = concrete.model_copy( + update={ + "preserve_all_thinking": base.preserve_all_thinking, + "preserve_thinking_between_tool_calls": base.preserve_thinking_between_tool_calls, + } + ) + base = cast(RendererConfig, concrete) + + return type(base).model_validate({**base.model_dump(), **chat_template_kwargs}) + + class RendererClient( Client[AsyncOpenAI, list[RendererMessage], dict[str, Any], ToolSpec] ): @@ -419,13 +470,19 @@ async def close(self) -> None: # ── Renderer management ───────────────────────────────────────── - def _get_renderer_or_pool(self, model: str) -> Renderer | RendererPool: + def _get_renderer_or_pool( + self, + model: str, + *, + renderer_config: RendererConfig | None = None, + ) -> Renderer | RendererPool: if self._renderer is not None: return self._renderer - renderer_config = ( - self._config.renderer_config if self._config is not None else None - ) + if renderer_config is None: + renderer_config = ( + self._config.renderer_config if self._config is not None else None + ) renderer_model = ( self._config.renderer_model_name if self._config is not None and self._config.renderer_model_name is not None @@ -477,14 +534,33 @@ async def get_native_response( tools: list[ToolSpec] | None = None, **kwargs: Any, ) -> dict[str, Any]: - renderer = self._get_renderer_or_pool(model) - args = dict(sampling_args) extra_headers = { **dict(args.pop("extra_headers", None) or {}), **dict(kwargs.pop("extra_headers", None) or {}), } sampling_params: dict[str, Any] = dict(args.pop("extra_body", None) or {}) + + # ``chat_template_kwargs`` belong to the renderer, not the engine — + # peel them off the per-request sampling and fold them into the + # typed RendererConfig. Pool cache key already includes the + # effective config so identical kwargs reuse the same renderer. + chat_template_kwargs = sampling_params.pop("chat_template_kwargs", None) + # Auto-resolution must agree with the model the pool will tokenize + # against — ``renderer_model_name`` overrides the request ``model`` + # when set (same precedence ``_get_renderer_or_pool`` uses below). + renderer_model = ( + self._config.renderer_model_name + if self._config is not None and self._config.renderer_model_name is not None + else model + ) + effective_cfg = _resolve_renderer_config( + self._config.renderer_config if self._config is not None else None, + chat_template_kwargs, + renderer_model=renderer_model, + ) + renderer = self._get_renderer_or_pool(model, renderer_config=effective_cfg) + for key in ( "temperature", "top_p",