From f0435d420324e90511841589a53143f51ba7a86f Mon Sep 17 00:00:00 2001
From: hallerite
Date: Fri, 29 May 2026 00:17:09 +0530
Subject: [PATCH] fix(tokenizer): apply config-build fallback to offset tokenizer too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The earlier fix in #72 wired ``_load_fast_tokenizer_directly`` into
``_load_tokenizer_via_auto`` so ``load_tokenizer`` survives model-config
build failures (e.g. HF RoPE validation rejecting nested
``rope_parameters`` for ``poolside/Laguna-XS.2``). But
``_get_offset_tokenizer`` was calling ``AutoTokenizer.from_pretrained``
directly to keep the fastokens patch out of this path — bypassing the
fallback entirely.

That meant every hand-coded renderer (LagunaXS2, Qwen35, etc.) still
crashed on the first rollout for Laguna-family models: ``render`` →
``emit_text_segments`` → ``attribute_text_segments`` →
``_get_offset_tokenizer`` → raw ``AutoTokenizer.from_pretrained`` →
RoPE validator → ``KeyError``.

Reproduced end-to-end with prime-rl + reverse-text +
``poolside/Laguna-XS.2``: without this patch, every rollout aborts with
the same ``KeyError`` that the change in #72 was supposed to fix. With
it, the first two RL steps complete cleanly (reward 0.36 / 0.31,
~100 tok/sample).

Route through ``_load_tokenizer_via_auto`` instead. Same vanilla path
(no fastokens patching, since that helper doesn't apply it), but now the
``_load_fast_tokenizer_directly`` fallback runs when the model config
build fails.

Co-Authored-By: Claude Opus 4.7 (1M context) --- renderers/base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/renderers/base.py b/renderers/base.py index 5bed116..244f79f 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1587,7 +1587,6 @@ def _get_offset_tokenizer(tokenizer): cached = _offset_tokenizers.get(name_or_path) if cached is not None: return cached - from transformers import AutoTokenizer kwargs: dict[str, Any] = {} revision = TRUSTED_REVISIONS.get(name_or_path) @@ -1597,10 +1596,12 @@ def _get_offset_tokenizer(tokenizer): kwargs = {"trust_remote_code": False} # Explicitly vanilla — we want HF's Rust tokenizer with offset # tracking, not the fastokens shim. ``load_tokenizer`` would - # patch fastokens in by default; calling - # ``AutoTokenizer.from_pretrained`` directly here keeps the - # fastokens patch out of this code path entirely. - offset_tok = AutoTokenizer.from_pretrained(name_or_path, **kwargs) + # patch fastokens in by default; routing through + # ``_load_tokenizer_via_auto`` keeps the fastokens patch out + # of this code path while still applying the config-build + # fallback (RoPE-validation failures on nested + # ``rope_parameters``, etc.). + offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs) if not getattr(offset_tok, "is_fast", False): raise RuntimeError( f"Vanilla tokenizer for {name_or_path!r} is not a fast "