Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions renderers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,8 +911,8 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No
# ``enable_thinking=true`` (open ``<think>\n`` at the gen prompt);
# the smaller 0.8B / 2B variants flip the polarity (default
# ``enable_thinking=false``, empty ``<think>\n\n</think>\n\n``).
# ``Qwen35Renderer`` hard-codes this polarity per model
# (``_ENABLE_THINKING_DEFAULTS``), so all seven sizes are
# token-for-token parity-tested against their own
# ``apply_chat_template`` — including with
# ``add_generation_prompt=True``.
Expand Down
79 changes: 42 additions & 37 deletions renderers/qwen35.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,39 +66,44 @@
)


def _detect_enable_thinking_default(tokenizer: PreTrainedTokenizer) -> bool:
"""Probe the tokenizer's chat template to learn its ``enable_thinking``
default polarity at the generation-prompt boundary.

The Qwen3.5 family ships two template variants that differ only in the
polarity of the gated branch:

* Big sizes (4B / 9B / 35B-A3B / 122B-A10B / 397B-A17B) emit an open
``<think>\\n`` by default and the empty ``<think>\\n\\n</think>\\n\\n``
block when ``enable_thinking`` is explicitly false.
* Small sizes (0.8B / 2B) flip the polarity — they emit the empty
block by default and the open ``<think>\\n`` only when
``enable_thinking`` is explicitly true.

A one-shot ``apply_chat_template`` call with no flag and a minimal
user message reveals which variant is in use: the empty-block tail
ends with ``</think>``, the open-think tail does not. Failing the
probe (no chat_template, exotic config) falls back to the big-model
default of True, which matches every entry in
``MODEL_RENDERER_MAP`` that routes to ``qwen3.5`` without explicit
polarity awareness.
# Fallback ``enable_thinking`` value per checkpoint, used whenever the
# renderer config leaves the flag as ``None``. Keys are the tokenizer's
# ``name_or_path``.
#
# The Qwen3.5 chat template comes in two variants whose only difference is
# the polarity of the gated thinking branch:
#
#   * 0.8B / 2B default to ``false`` and emit the empty
#     ``<think>\n\n</think>\n\n`` block at the gen prompt.
#   * 4B / 9B / 35B-A3B / 122B-A10B / 397B-A17B default to ``true`` and
#     emit an open ``<think>\n``.
#
# The table is written out by hand instead of being probed from the live
# ``chat_template``: a probe has to call ``apply_chat_template`` during
# construction, which drags ``transformers`` onto the hot path and breaks
# bring-your-own-tokenizer use. ``tests/test_qwen35_size_coverage.py`` pins
# these values through its polarity assertions and its byte-parity checks
# against each size's own ``apply_chat_template``.
_ENABLE_THINKING_DEFAULTS: dict[str, bool] = {
    # Small sizes: thinking is off unless explicitly requested.
    **dict.fromkeys(
        (
            "Qwen/Qwen3.5-0.8B",
            "Qwen/Qwen3.5-2B",
        ),
        False,
    ),
    # Big sizes: thinking is on unless explicitly disabled.
    **dict.fromkeys(
        (
            "Qwen/Qwen3.5-4B",
            "Qwen/Qwen3.5-9B",
            "Qwen/Qwen3.5-35B-A3B",
            "Qwen/Qwen3.5-122B-A10B",
            "Qwen/Qwen3.5-397B-A17B",
            # Qwen3.6 extends the Qwen3.5 template; same big-size polarity.
            "Qwen/Qwen3.6-35B-A3B",
        ),
        True,
    ),
}


def _default_enable_thinking(tokenizer: object) -> bool:
    """Return the hard-coded ``enable_thinking`` default for ``tokenizer``'s model.

    The value is looked up in ``_ENABLE_THINKING_DEFAULTS`` by the tokenizer's
    ``name_or_path``. The tokenizer is only inspected for that attribute;
    ``apply_chat_template`` is never called, so a bring-your-own tokenizer
    without chat-template support works.

    Args:
        tokenizer: Any object; its ``name_or_path`` attribute, when present,
            selects the table entry.

    Returns:
        The table entry for the model, or ``True`` (the big-model default, and
        the majority of the family) for unknown / fine-tuned checkpoints whose
        ``name_or_path`` isn't in the table. Pass an explicit
        ``enable_thinking=`` to a small-size fine-tune that needs ``False``.
    """
    # A missing ``name_or_path`` maps to "", which is not a table key, so it
    # takes the same ``True`` fallback as an unknown checkpoint.
    model_name = getattr(tokenizer, "name_or_path", "")
    return _ENABLE_THINKING_DEFAULTS.get(model_name, True)


class Qwen35Renderer:
Expand All @@ -116,13 +121,13 @@ def __init__(
self._tokenizer = tokenizer
self._processor = processor
cfg = config or type(self)._config_cls()
# ``enable_thinking=None`` defers to the tokenizer's chat-template
# default (Instruct → off, Thinking → on). Materialise here so
# downstream reads see a concrete bool; rebind the config with
# the resolved value so introspection sees the same.
# ``enable_thinking=None`` defers to the model's known default (see
# ``_ENABLE_THINKING_DEFAULTS``). Materialise here so downstream reads
# see a concrete bool; rebind the config with the resolved value so
# introspection sees the same.
if cfg.enable_thinking is None:
cfg = cfg.model_copy(
update={"enable_thinking": _detect_enable_thinking_default(tokenizer)}
update={"enable_thinking": _default_enable_thinking(tokenizer)}
)
self.config = cfg

Expand Down
41 changes: 32 additions & 9 deletions tests/test_qwen35_size_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
``enable_thinking=true``); the smaller 0.8B / 2B sizes ship the polarity-
flipped variant (default ``enable_thinking=false`` → empty
``<think>\\n\\n</think>\\n\\n`` at the gen-prompt boundary). The renderer
hard-codes this polarity per model (``_ENABLE_THINKING_DEFAULTS``), so
both variants render byte-identical to their own ``apply_chat_template``.

These tests lock in (a) the exact set of Qwen3.5 sizes in the map and
(b) byte parity for every one of them across representative
Expand Down Expand Up @@ -57,7 +56,7 @@ def test_no_other_qwen35_sizes_silently_added():


# ---------------------------------------------------------------------------
# Polarity defaults: 0.8B / 2B flip ``enable_thinking`` default.
# ---------------------------------------------------------------------------


Expand All @@ -73,10 +72,10 @@ def test_no_other_qwen35_sizes_silently_added():
("Qwen/Qwen3.5-397B-A17B", True),
],
)
def test_qwen35_enable_thinking_polarity_autodetected(qwen35_model, expected_default):
"""The renderer's ``_enable_thinking`` resolves to the chat template's
own default when no explicit flag is passed — so big / small sizes
each match their own template at the gen-prompt boundary."""
def test_qwen35_enable_thinking_polarity_default(qwen35_model, expected_default):
"""With no explicit flag, the renderer resolves ``enable_thinking`` from
the hard-coded per-model default — so big / small sizes each match their
own template at the gen-prompt boundary."""
tok = load_tokenizer(qwen35_model)
renderer = create_renderer(tok, Qwen35RendererConfig())
assert isinstance(renderer, Qwen35Renderer)
Expand All @@ -86,6 +85,30 @@ def test_qwen35_enable_thinking_polarity_autodetected(qwen35_model, expected_def
)


def test_construction_does_not_call_apply_chat_template():
    """Building a ``Qwen35Renderer`` must not probe ``apply_chat_template``.

    The ``enable_thinking`` default is hard-coded per model, so a
    bring-your-own tokenizer with no chat-template support still works. The
    stub below fails the test if ``apply_chat_template`` is ever invoked
    during construction.
    """

    class _Stub:
        name_or_path = "Qwen/Qwen3.5-0.8B"
        unk_token_id = -1

        def __init__(self):
            # token -> id, filled lazily in first-seen order.
            self._ids = {}

        def convert_tokens_to_ids(self, token):
            # The renderer only needs the special tokens to resolve to
            # distinct, non-unk ids. Sequential allocation guarantees both
            # and is reproducible across runs, unlike ``hash(token)`` which
            # is salted per process and can collide after a modulo.
            return self._ids.setdefault(token, len(self._ids) + 1)

        def apply_chat_template(self, *args, **kwargs):
            raise AssertionError(
                "apply_chat_template must not be called at construction"
            )

    renderer = Qwen35Renderer(_Stub())
    # 0.8B is a small size → thinking defaults off, from the hard-coded table.
    assert renderer.config.enable_thinking is False


# ---------------------------------------------------------------------------
# Byte parity for each in-map Qwen3.5 size.
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -146,7 +169,7 @@ def test_qwen35_size_parity_with_apply_chat_template(
"""Each in-map Qwen3.5 size renders byte-identical to its own
``apply_chat_template`` output. Locks in the property that lets us
share ``Qwen35Renderer`` across all seven sizes — the polarity
flip on 0.8B / 2B is absorbed by the constructor's auto-detect."""
flip on 0.8B / 2B is absorbed by the per-model default."""
tok = load_tokenizer(qwen35_model)
renderer = create_renderer(tok, Qwen35RendererConfig())
assert isinstance(renderer, Qwen35Renderer)
Expand Down
Loading