diff --git a/renderers/base.py b/renderers/base.py index e9805c4..89d6577 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -911,8 +911,8 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No # ``enable_thinking=true`` (open ``\n`` at the gen prompt); # the smaller 0.8B / 2B variants flip the polarity (default # ``enable_thinking=false``, empty ``\n\n\n\n``). - # ``Qwen35Renderer`` auto-detects polarity from the tokenizer's - # chat_template at construction, so all seven sizes are + # ``Qwen35Renderer`` hard-codes this polarity per model + # (``_ENABLE_THINKING_DEFAULTS``), so all seven sizes are # token-for-token parity-tested against their own # ``apply_chat_template`` — including with # ``add_generation_prompt=True``. diff --git a/renderers/qwen35.py b/renderers/qwen35.py index abcacec..b3c6af7 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -66,39 +66,44 @@ ) -def _detect_enable_thinking_default(tokenizer: PreTrainedTokenizer) -> bool: - """Probe the tokenizer's chat template to learn its ``enable_thinking`` - default polarity at the generation-prompt boundary. - - The Qwen3.5 family ships two template variants that differ only in the - polarity of the gated branch: - - * Big sizes (4B / 9B / 35B-A3B / 122B-A10B / 397B-A17B) emit an open - ``\\n`` by default and the empty ``\\n\\n\\n\\n`` - block when ``enable_thinking`` is explicitly false. - * Small sizes (0.8B / 2B) flip the polarity — they emit the empty - block by default and the open ``\\n`` only when - ``enable_thinking`` is explicitly true. - - A one-shot ``apply_chat_template`` call with no flag and a minimal - user message reveals which variant is in use: the empty-block tail - ends with ````, the open-think tail does not. Failing the - probe (no chat_template, exotic config) falls back to the big-model - default of True, which matches every entry in - ``MODEL_RENDERER_MAP`` that routes to ``qwen3.5`` without explicit - polarity awareness. +# Per-model ``enable_thinking`` default, applied when the renderer config +# leaves it ``None``. The Qwen3.5 family ships two chat-template variants +# that differ only in the polarity of the gated thinking branch: +# +# * Big sizes (4B / 9B / 35B-A3B / 122B-A10B / 397B-A17B) default +# ``enable_thinking=true`` — an open ``\n`` at the gen prompt. +# * Small sizes (0.8B / 2B) flip it — default ``false``, emitting the +# empty ``\n\n\n\n`` block. +# +# These are hard-coded (keyed by ``tokenizer.name_or_path``) rather than +# probed from the live ``chat_template``: probing meant calling +# ``apply_chat_template`` at construction, which pulls ``transformers`` onto +# the hot path and breaks bring-your-own-tokenizer use. The values are the +# ground truth pinned by ``tests/test_qwen35_size_coverage.py`` — both the +# polarity assertions and byte-parity against each size's own +# ``apply_chat_template``. +_ENABLE_THINKING_DEFAULTS: dict[str, bool] = { + "Qwen/Qwen3.5-0.8B": False, + "Qwen/Qwen3.5-2B": False, + "Qwen/Qwen3.5-4B": True, + "Qwen/Qwen3.5-9B": True, + "Qwen/Qwen3.5-35B-A3B": True, + "Qwen/Qwen3.5-122B-A10B": True, + "Qwen/Qwen3.5-397B-A17B": True, + # Qwen3.6 extends the Qwen3.5 template; same big-size polarity. + "Qwen/Qwen3.6-35B-A3B": True, +} + + +def _default_enable_thinking(tokenizer) -> bool: + """Hard-coded ``enable_thinking`` default for ``tokenizer``'s model. + + Falls back to ``True`` (the big-model default, and the majority of the + family) for unknown / fine-tuned checkpoints whose ``name_or_path`` isn't + in ``_ENABLE_THINKING_DEFAULTS``; pass an explicit ``enable_thinking=`` to + a small-size fine-tune that needs ``False``. """ - try: - out = tokenizer.apply_chat_template( - [{"role": "user", "content": "x"}], - tokenize=False, - add_generation_prompt=True, - ) - except Exception: - return True - if not isinstance(out, str): - return True - return not out.rstrip().endswith("") + return _ENABLE_THINKING_DEFAULTS.get(getattr(tokenizer, "name_or_path", ""), True) class Qwen35Renderer: @@ -116,13 +121,13 @@ def __init__( self._tokenizer = tokenizer self._processor = processor cfg = config or type(self)._config_cls() - # ``enable_thinking=None`` defers to the tokenizer's chat-template - # default (Instruct → off, Thinking → on). Materialise here so - # downstream reads see a concrete bool; rebind the config with - # the resolved value so introspection sees the same. + # ``enable_thinking=None`` defers to the model's known default (see + # ``_ENABLE_THINKING_DEFAULTS``). Materialise here so downstream reads + # see a concrete bool; rebind the config with the resolved value so + # introspection sees the same. if cfg.enable_thinking is None: cfg = cfg.model_copy( - update={"enable_thinking": _detect_enable_thinking_default(tokenizer)} + update={"enable_thinking": _default_enable_thinking(tokenizer)} ) self.config = cfg diff --git a/tests/test_qwen35_size_coverage.py b/tests/test_qwen35_size_coverage.py index 6bb1161..366b4e7 100644 --- a/tests/test_qwen35_size_coverage.py +++ b/tests/test_qwen35_size_coverage.py @@ -5,9 +5,8 @@ ``enable_thinking=true``); the smaller 0.8B / 2B sizes ship the polarity- flipped variant (default ``enable_thinking=false`` → empty ``\\n\\n\\n\\n`` at the gen-prompt boundary). The renderer -detects polarity from the tokenizer's chat_template at construction, so -both variants render byte-identical to their own -``apply_chat_template``. +hard-codes this polarity per model (``_ENABLE_THINKING_DEFAULTS``), so +both variants render byte-identical to their own ``apply_chat_template``. These tests lock in (a) the exact set of Qwen3.5 sizes in the map and (b) byte parity for every one of them across representative @@ -57,7 +56,7 @@ def test_no_other_qwen35_sizes_silently_added(): # --------------------------------------------------------------------------- -# Polarity auto-detection: 0.8B / 2B flip ``enable_thinking`` default. +# Polarity defaults: 0.8B / 2B flip ``enable_thinking`` default. # --------------------------------------------------------------------------- @@ -73,10 +72,10 @@ def test_no_other_qwen35_sizes_silently_added(): ("Qwen/Qwen3.5-397B-A17B", True), ], ) -def test_qwen35_enable_thinking_polarity_autodetected(qwen35_model, expected_default): - """The renderer's ``_enable_thinking`` resolves to the chat template's - own default when no explicit flag is passed — so big / small sizes - each match their own template at the gen-prompt boundary.""" +def test_qwen35_enable_thinking_polarity_default(qwen35_model, expected_default): + """With no explicit flag, the renderer resolves ``enable_thinking`` from + the hard-coded per-model default — so big / small sizes each match their + own template at the gen-prompt boundary.""" tok = load_tokenizer(qwen35_model) renderer = create_renderer(tok, Qwen35RendererConfig()) assert isinstance(renderer, Qwen35Renderer) @@ -86,6 +85,30 @@ def test_qwen35_enable_thinking_polarity_autodetected(qwen35_model, expected_def ) +def test_construction_does_not_call_apply_chat_template(): + """The ``enable_thinking`` default is hard-coded per model, so building a + ``Qwen35Renderer`` must not probe ``apply_chat_template`` — a + bring-your-own tokenizer with no chat-template support still works.""" + + class _Stub: + name_or_path = "Qwen/Qwen3.5-0.8B" + unk_token_id = -1 + + def convert_tokens_to_ids(self, token): + # Any stable non-unk id per token; the renderer only needs the + # special tokens to resolve to distinct, in-vocab ids. + return abs(hash(token)) % 1_000_000 + 1 + + def apply_chat_template(self, *args, **kwargs): + raise AssertionError( + "apply_chat_template must not be called at construction" + ) + + renderer = Qwen35Renderer(_Stub()) + # 0.8B is a small size → thinking defaults off, from the hard-coded table. + assert renderer.config.enable_thinking is False + + # --------------------------------------------------------------------------- # Byte parity for each in-map Qwen3.5 size. # --------------------------------------------------------------------------- @@ -146,7 +169,7 @@ def test_qwen35_size_parity_with_apply_chat_template( """Each in-map Qwen3.5 size renders byte-identical to its own ``apply_chat_template`` output. Locks in the property that lets us share ``Qwen35Renderer`` across all seven sizes — the polarity - flip on 0.8B / 2B is absorbed by the constructor's auto-detect.""" + flip on 0.8B / 2B is absorbed by the per-model default.""" tok = load_tokenizer(qwen35_model) renderer = create_renderer(tok, Qwen35RendererConfig()) assert isinstance(renderer, Qwen35Renderer)