From 30655d6e7b3aa1fe36baf8a4e49edea0650c2d2b Mon Sep 17 00:00:00 2001 From: hallerite Date: Wed, 27 May 2026 21:59:39 +0000 Subject: [PATCH] fix(tokenizer): fall back to direct fast-tokenizer load when model config build fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `AutoTokenizer.from_pretrained` eagerly constructs the *model* config to resolve the tokenizer class — even for a plain `PreTrainedTokenizerFast`. That construction runs HF's RoPE validator, which rejects configs carrying nested `rope_parameters` (e.g. poolside/Laguna-XS.2: `full_attention` / `sliding_attention` blocks with no top-level `rope_theta`) when the config is built outside vLLM's `patch_rope_parameters`. The resulting `KeyError` escapes (AutoTokenizer only catches `ValueError`/`OSError`) and kills the tokenizer load — a modeling-only concern breaking something the tokenizer never needed. renderers needs the tokenizer, not the model. When `AutoTokenizer` fails while building the config, fall back to loading the repo's self-contained `tokenizer.json` directly via `PreTrainedTokenizerFast`, which never touches the model config. The fallback runs under the fastokens patch, so models like Laguna keep the Rust fast-path speedup. Custom `auto_map` tokenizers and repos without a fast tokenizer are left to surface the original error. Co-Authored-By: Claude Opus 4.7 (1M context) --- renderers/base.py | 75 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/renderers/base.py b/renderers/base.py index 89d6577..5bed116 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1089,7 +1089,6 @@ def _patched_load(model_name_or_path: str, **kwargs): path is still discoverable in logs. """ import fastokens - from transformers import AutoTokenizer global _FASTOKENS_ANNOUNCED @@ -1102,13 +1101,72 @@ def _patched_load(model_name_or_path: str, **kwargs): ) _FASTOKENS_ANNOUNCED = True try: - return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs) + return _load_tokenizer_via_auto(model_name_or_path, **kwargs) finally: with _FASTOKENS_PATCH_LOCK: with contextlib.redirect_stdout(io.StringIO()): fastokens.unpatch_transformers() +def _load_fast_tokenizer_directly( + model_name_or_path: str, revision: str | None +) -> Any | None: + """Load a self-contained fast tokenizer without building the model config. + + ``AutoTokenizer.from_pretrained`` eagerly constructs the *model* config to + resolve the tokenizer class — even for a plain ``PreTrainedTokenizerFast``. + That construction can raise on modeling-only concerns the tokenizer never + needs (e.g. RoPE parameter validation for configs that carry nested + ``rope_parameters``). When the repo ships a complete ``tokenizer.json`` and + declares no custom tokenizer, the tokenizer is fully self-describing, so we + load it directly and skip the config detour. + + Returns ``None`` when there's nothing safe to load this way — a custom + ``auto_map`` tokenizer (which must run through ``AutoTokenizer`` with + ``trust_remote_code``) or no fast tokenizer at all — so the caller can + surface its original error instead. + """ + from transformers import PreTrainedTokenizerFast + from transformers.models.auto.tokenization_auto import get_tokenizer_config + + try: + if "auto_map" in get_tokenizer_config(model_name_or_path, revision=revision): + return None + return PreTrainedTokenizerFast.from_pretrained( + model_name_or_path, revision=revision + ) + except Exception: + return None + + +def _load_tokenizer_via_auto(model_name_or_path: str, **kwargs) -> Any: + """``AutoTokenizer.from_pretrained`` with a config-free fallback. + + renderers needs the tokenizer, not the model. If ``AutoTokenizer`` fails + while building the model config it loads to resolve the tokenizer class, + retry by loading the repo's self-contained ``tokenizer.json`` directly. The + original error is re-raised if the repo has no such tokenizer. + """ + from transformers import AutoTokenizer + + try: + return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs) + except Exception as exc: + tok = _load_fast_tokenizer_directly( + model_name_or_path, revision=kwargs.get("revision") + ) + if tok is None: + raise + logger.debug( + "AutoTokenizer.from_pretrained(%r) failed building the model config " + "(%s: %s); loaded the tokenizer directly from tokenizer.json.", + model_name_or_path, + type(exc).__name__, + str(exc)[:160], + ) + return tok + + def load_tokenizer( model_name_or_path: str, *, @@ -1138,9 +1196,14 @@ def load_tokenizer( fastokens raises during the patched load (e.g. an unknown pre-tokenizer type), we automatically retry with the vanilla backend and emit an INFO log. - """ - from transformers import AutoTokenizer + ``AutoTokenizer.from_pretrained`` eagerly builds the model config to + resolve the tokenizer class. If that construction raises on a + modeling-only concern the tokenizer doesn't need (e.g. RoPE + validation for configs with nested ``rope_parameters``), we fall + back to loading the repo's self-contained ``tokenizer.json`` + directly — see ``_load_tokenizer_via_auto``. + """ kwargs: dict[str, Any] = {} revision = TRUSTED_REVISIONS.get(model_name_or_path) if revision is not None: @@ -1149,7 +1212,7 @@ def load_tokenizer( kwargs = {"trust_remote_code": False} if not use_fastokens or model_name_or_path in FASTOKENS_INCOMPATIBLE: - return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs) + return _load_tokenizer_via_auto(model_name_or_path, **kwargs) try: return _patched_load(model_name_or_path, **kwargs) @@ -1162,7 +1225,7 @@ def load_tokenizer( type(exc).__name__, str(exc)[:160], ) - return AutoTokenizer.from_pretrained(model_name_or_path, **kwargs) + return _load_tokenizer_via_auto(model_name_or_path, **kwargs) def _populate_registry():