PrimeIntellect-ai · hallerite · May 27, 2026 · May 27, 2026
diff --git a/renderers/base.py b/renderers/base.py
@@ -911,8 +911,8 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
     # ``enable_thinking=true`` (open ``<think>\n`` at the gen prompt);
     # the smaller 0.8B / 2B variants flip the polarity (default
     # ``enable_thinking=false``, empty ``<think>\n\n</think>\n\n``).
-    # ``Qwen35Renderer`` auto-detects polarity from the tokenizer's
-    # chat_template at construction, so all seven sizes are
+    # ``Qwen35Renderer`` hard-codes this polarity per model
+    # (``_ENABLE_THINKING_DEFAULTS``), so all seven sizes are
     # token-for-token parity-tested against their own
     # ``apply_chat_template`` — including with
     # ``add_generation_prompt=True``.

diff --git a/renderers/qwen35.py b/renderers/qwen35.py
@@ -66,39 +66,44 @@
 )
 
 
-def _detect_enable_thinking_default(tokenizer: PreTrainedTokenizer) -> bool:
-    """Probe the tokenizer's chat template to learn its ``enable_thinking``
-    default polarity at the generation-prompt boundary.
-
-    The Qwen3.5 family ships two template variants that differ only in the
-    polarity of the gated branch:
-
-    * Big sizes (4B / 9B / 35B-A3B / 122B-A10B / 397B-A17B) emit an open
-      ``<think>\\n`` by default and the empty ``<think>\\n\\n</think>\\n\\n``
-      block when ``enable_thinking`` is explicitly false.
-    * Small sizes (0.8B / 2B) flip the polarity — they emit the empty
-      block by default and the open ``<think>\\n`` only when
-      ``enable_thinking`` is explicitly true.
-
-    A one-shot ``apply_chat_template`` call with no flag and a minimal
-    user message reveals which variant is in use: the empty-block tail
-    ends with ``</think>``, the open-think tail does not. Failing the
-    probe (no chat_template, exotic config) falls back to the big-model
-    default of True, which matches every entry in
-    ``MODEL_RENDERER_MAP`` that routes to ``qwen3.5`` without explicit
-    polarity awareness.
+# Per-model ``enable_thinking`` default, applied when the renderer config
+# leaves it ``None``. The Qwen3.5 family ships two chat-template variants
+# that differ only in the polarity of the gated thinking branch:
+#
+#   * Big sizes (4B / 9B / 35B-A3B / 122B-A10B / 397B-A17B) default
+#     ``enable_thinking=true`` — an open ``<think>\n`` at the gen prompt.
+#   * Small sizes (0.8B / 2B) flip it — default ``false``, emitting the
+#     empty ``<think>\n\n</think>\n\n`` block.
+#
+# These are hard-coded (looked up by the exact ``tokenizer.name_or_path``
+# string) rather than probed from the live ``chat_template``: probing meant
+# calling ``apply_chat_template`` at construction, which pulls
+# ``transformers`` onto the hot path and breaks bring-your-own-tokenizer
+# use. The values are the ground truth pinned by
+# ``tests/test_qwen35_size_coverage.py`` — both the polarity assertions and
+# byte-parity against each size's own ``apply_chat_template``.
+_ENABLE_THINKING_DEFAULTS: dict[str, bool] = {
+    "Qwen/Qwen3.5-0.8B": False,
+    "Qwen/Qwen3.5-2B": False,
+    "Qwen/Qwen3.5-4B": True,
+    "Qwen/Qwen3.5-9B": True,
+    "Qwen/Qwen3.5-35B-A3B": True,
+    "Qwen/Qwen3.5-122B-A10B": True,
+    "Qwen/Qwen3.5-397B-A17B": True,
+    # Qwen3.6 extends the Qwen3.5 template; same big-size polarity.
+    "Qwen/Qwen3.6-35B-A3B": True,
+}
+
+
+def _default_enable_thinking(tokenizer) -> bool:
+    """Return the hard-coded ``enable_thinking`` default for ``tokenizer``.
+
+    Exact-match lookup on ``name_or_path``; anything missing from
+    ``_ENABLE_THINKING_DEFAULTS`` (fine-tunes, local paths, no such attribute)
+    gets ``True``, the big-model default and the majority of the family. Pass
+    an explicit ``enable_thinking=`` to a small-size model that needs ``False``.
     """
-    try:
-        out = tokenizer.apply_chat_template(
-            [{"role": "user", "content": "x"}],
-            tokenize=False,
-            add_generation_prompt=True,
-        )
-    except Exception:
-        return True
-    if not isinstance(out, str):
-        return True
-    return not out.rstrip().endswith("</think>")
+    return _ENABLE_THINKING_DEFAULTS.get(getattr(tokenizer, "name_or_path", ""), True)
 
 
 class Qwen35Renderer:
@@ -116,13 +121,13 @@ def __init__(
         self._tokenizer = tokenizer
         self._processor = processor
         cfg = config or type(self)._config_cls()
-        # ``enable_thinking=None`` defers to the tokenizer's chat-template
-        # default (Instruct → off, Thinking → on). Materialise here so
-        # downstream reads see a concrete bool; rebind the config with
-        # the resolved value so introspection sees the same.
+        # ``enable_thinking=None`` defers to the model's known default (see
+        # ``_ENABLE_THINKING_DEFAULTS``). Materialise here so downstream reads
+        # see a concrete bool; rebind the config with the resolved value so
+        # introspection sees the same.
         if cfg.enable_thinking is None:
             cfg = cfg.model_copy(
-                update={"enable_thinking": _detect_enable_thinking_default(tokenizer)}
+                update={"enable_thinking": _default_enable_thinking(tokenizer)}
             )
         self.config = cfg
 

diff --git a/tests/test_qwen35_size_coverage.py b/tests/test_qwen35_size_coverage.py
@@ -5,9 +5,8 @@
 ``enable_thinking=true``); the smaller 0.8B / 2B sizes ship the polarity-
 flipped variant (default ``enable_thinking=false`` → empty
 ``<think>\\n\\n</think>\\n\\n`` at the gen-prompt boundary). The renderer
-detects polarity from the tokenizer's chat_template at construction, so
-both variants render byte-identical to their own
-``apply_chat_template``.
+hard-codes this polarity per model (``_ENABLE_THINKING_DEFAULTS``), so
+both variants render byte-identical to their own ``apply_chat_template``.
 
 These tests lock in (a) the exact set of Qwen3.5 sizes in the map and
 (b) byte parity for every one of them across representative
@@ -57,7 +56,7 @@ def test_no_other_qwen35_sizes_silently_added():
 
 
 # ---------------------------------------------------------------------------
-# Polarity auto-detection: 0.8B / 2B flip ``enable_thinking`` default.
+# Polarity defaults: 0.8B / 2B flip ``enable_thinking`` default.
 # ---------------------------------------------------------------------------
 
 
@@ -73,10 +72,10 @@ def test_no_other_qwen35_sizes_silently_added():
         ("Qwen/Qwen3.5-397B-A17B", True),
     ],
 )
-def test_qwen35_enable_thinking_polarity_autodetected(qwen35_model, expected_default):
-    """The renderer's ``_enable_thinking`` resolves to the chat template's
-    own default when no explicit flag is passed — so big / small sizes
-    each match their own template at the gen-prompt boundary."""
+def test_qwen35_enable_thinking_polarity_default(qwen35_model, expected_default):
+    """With no explicit flag, the renderer resolves ``enable_thinking`` from
+    the hard-coded per-model default — so big / small sizes each match their
+    own template at the gen-prompt boundary."""
     tok = load_tokenizer(qwen35_model)
     renderer = create_renderer(tok, Qwen35RendererConfig())
     assert isinstance(renderer, Qwen35Renderer)
@@ -86,6 +85,30 @@ def test_qwen35_enable_thinking_polarity_autodetected(qwen35_model, expected_def
     )
 
 
+def test_construction_does_not_call_apply_chat_template():
+    """The ``enable_thinking`` default is hard-coded per model, so building a
+    ``Qwen35Renderer`` must not probe ``apply_chat_template`` — a
+    bring-your-own tokenizer with no chat-template support still works."""
+
+    class _Stub:
+        name_or_path = "Qwen/Qwen3.5-0.8B"
+        unk_token_id = -1
+
+        def convert_tokens_to_ids(self, token):
+            # Any non-unk id, stable within this process (str ``hash`` is salted
+            # per run); the renderer only needs distinct, in-vocab special ids.
+            return abs(hash(token)) % 1_000_000 + 1
+
+        def apply_chat_template(self, *args, **kwargs):
+            raise AssertionError(
+                "apply_chat_template must not be called at construction"
+            )
+
+    renderer = Qwen35Renderer(_Stub())
+    # 0.8B is a small size → thinking defaults off, from the hard-coded table.
+    assert renderer.config.enable_thinking is False
+
+
 # ---------------------------------------------------------------------------
 # Byte parity for each in-map Qwen3.5 size.
 # ---------------------------------------------------------------------------
@@ -146,7 +169,7 @@ def test_qwen35_size_parity_with_apply_chat_template(
     """Each in-map Qwen3.5 size renders byte-identical to its own
     ``apply_chat_template`` output. Locks in the property that lets us
     share ``Qwen35Renderer`` across all seven sizes — the polarity
-    flip on 0.8B / 2B is absorbed by the constructor's auto-detect."""
+    flip on 0.8B / 2B is absorbed by the per-model default."""
     tok = load_tokenizer(qwen35_model)
     renderer = create_renderer(tok, Qwen35RendererConfig())
     assert isinstance(renderer, Qwen35Renderer)