From e8d8abeafa114ef5aad3d01b6f74876386cdc2f2 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:43:33 +0200 Subject: [PATCH 01/35] Add Rust native renderer foundation --- .gitignore | 3 + Cargo.lock | 720 +++++++++++++++++++++++ Cargo.toml | 36 ++ crates/renderers-core/Cargo.toml | 25 + crates/renderers-core/src/bridge.rs | 103 ++++ crates/renderers-core/src/emit.rs | 120 ++++ crates/renderers-core/src/lib.rs | 41 ++ crates/renderers-core/src/parsing/mod.rs | 54 ++ crates/renderers-core/src/thinking.rs | 92 +++ crates/renderers-core/src/tokenizer.rs | 124 ++++ crates/renderers-core/src/traits.rs | 86 +++ crates/renderers-core/src/types.rs | 335 +++++++++++ 12 files changed, 1739 insertions(+) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 crates/renderers-core/Cargo.toml create mode 100644 crates/renderers-core/src/bridge.rs create mode 100644 crates/renderers-core/src/emit.rs create mode 100644 crates/renderers-core/src/lib.rs create mode 100644 crates/renderers-core/src/parsing/mod.rs create mode 100644 crates/renderers-core/src/thinking.rs create mode 100644 crates/renderers-core/src/tokenizer.rs create mode 100644 crates/renderers-core/src/traits.rs create mode 100644 crates/renderers-core/src/types.rs diff --git a/.gitignore b/.gitignore index cca70b8..4db25f1 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,6 @@ coverage.xml # agent harness state .claude/ + +# rust +target/ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..8037260 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,720 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "cc" +version = "1.2.62" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "onig" +version = "6.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" +dependencies = [ + "bitflags", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +dependencies = [ + "either", + "itertools 0.11.0", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "renderers-core" +version = "0.1.0" +dependencies = [ + "bumpalo", + "once_cell", + "phf", + "regex", + "serde", + "serde_json", + "smallvec", + "thiserror", + "tokenizers", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64", + "nom", + "serde", + "unicode-segmentation", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokenizers" +version = "0.20.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" +dependencies = [ + "aho-corasick", + "derive_builder", + "esaxx-rs", + "getrandom", + "itertools 0.12.1", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..9e0ed8e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,36 @@ +[workspace] +resolver = "2" +members = [ + "crates/renderers-core", +] + +[workspace.package] +edition = "2021" +license = "Apache-2.0" +repository = "https://github.com/thomaub/renderers" +rust-version = "1.78" + +[workspace.dependencies] +tokenizers = { version = "0.20", default-features = false, features = ["onig", "esaxx_fast"] } +serde = { version = "1", features = ["derive"] } +serde_json = { version = "1", features = ["preserve_order"] } +regex = "1" +once_cell = "1" +thiserror = "1" +smallvec = { version = "1", features = ["union", "const_generics"] } +bumpalo = { version = "3", features = ["collections"] } +phf = { version = "0.11", features = ["macros"] } + +[profile.release] +opt-level = 3 +lto = "thin" +codegen-units = 1 +debug = false +overflow-checks = false +panic = "abort" + +[profile.bench] +opt-level = 3 +lto = "thin" +codegen-units = 1 +debug = true diff --git a/crates/renderers-core/Cargo.toml b/crates/renderers-core/Cargo.toml new file mode 100644 index 0000000..4291afb --- /dev/null +++ b/crates/renderers-core/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "renderers-core" +version = "0.1.0" +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "Deterministic message-to-token rendering for LLM training/inference (Rust core)." + +[lib] +path = "src/lib.rs" + +[dependencies] +tokenizers = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +regex = { workspace = true } +once_cell = { workspace = true } +thiserror = { workspace = true } +smallvec = { workspace = true } +bumpalo = { workspace = true } +phf = { workspace = true } + +[dev-dependencies] +serde_json = { workspace = true } diff --git a/crates/renderers-core/src/bridge.rs b/crates/renderers-core/src/bridge.rs new file mode 100644 index 0000000..42b6627 --- /dev/null +++ b/crates/renderers-core/src/bridge.rs @@ -0,0 +1,103 @@ +//! Helpers shared across renderers' `bridge_to_next_turn` implementations. + +use crate::types::Message; + +/// Returns `true` if any message in `new_messages` carries the assistant +/// role. Bridges refuse to retokenize assistant content because it would +/// replace model-sampled tokens with canonical template text, violating +/// the byte-for-byte contract. +#[inline] +pub fn reject_assistant_in_extension(new_messages: &[Message]) -> bool { + new_messages.iter().any(|m| m.role == "assistant") +} + +/// Return the longest prefix of `prev_prompt + prev_completion` that ends +/// at a turn-close token, or `None` if none exists and `synthesize_close` +/// is `None`. +/// +/// Scans only within the completion segment — close tokens inside the +/// prompt are structural scaffolding, not turn boundaries the current +/// step produced. +/// +/// Returns an owned `Vec` that the caller can mutate; the inputs are +/// borrowed. +pub fn trim_to_turn_close( + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + close_token_ids: &[u32], + synthesize_close: Option, +) -> Option> { + let prompt_len = previous_prompt_ids.len(); + let total_len = prompt_len + previous_completion_ids.len(); + + // Walk the completion section backwards looking for a close token. + for offset in (0..previous_completion_ids.len()).rev() { + if close_token_ids.contains(&previous_completion_ids[offset]) { + let mut out = Vec::with_capacity(prompt_len + offset + 1); + out.extend_from_slice(previous_prompt_ids); + out.extend_from_slice(&previous_completion_ids[..=offset]); + return Some(out); + } + } + + let close = synthesize_close?; + let mut out = Vec::with_capacity(total_len + 1); + out.extend_from_slice(previous_prompt_ids); + out.extend_from_slice(previous_completion_ids); + out.push(close); + Some(out) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn msg(role: &str) -> Message { + Message { + role: role.to_string(), + ..Default::default() + } + } + + #[test] + fn rejects_assistant_in_extension() { + assert!(reject_assistant_in_extension(&[msg("assistant")])); + assert!(!reject_assistant_in_extension(&[msg("user"), msg("tool")])); + assert!(!reject_assistant_in_extension(&[])); + } + + #[test] + fn trim_to_close_keeps_prefix() { + let prompt = vec![1, 2, 3]; + let completion = vec![4, 5, 9, 6, 9]; + let close = [9u32]; + let trimmed = trim_to_turn_close(&prompt, &completion, &close, None).unwrap(); + assert_eq!(trimmed, vec![1, 2, 3, 4, 5, 9, 6, 9]); + } + + #[test] + fn trim_to_close_finds_last_close() { + let prompt = vec![1, 2]; + let completion = vec![9, 3, 4]; + let close = [9u32]; + let trimmed = trim_to_turn_close(&prompt, &completion, &close, None).unwrap(); + assert_eq!(trimmed, vec![1, 2, 9]); + } + + #[test] + fn trim_to_close_ignores_prompt_close() { + let prompt = vec![9, 1, 2]; + let completion = vec![3, 4]; + let close = [9u32]; + assert!(trim_to_turn_close(&prompt, &completion, &close, None).is_none()); + } + + #[test] + fn trim_to_close_synthesises_when_truncated() { + let prompt = vec![1, 2]; + let completion = vec![3, 4]; + let close = [9u32]; + let trimmed = trim_to_turn_close(&prompt, &completion, &close, Some(9)).unwrap(); + assert_eq!(trimmed, vec![1, 2, 3, 4, 9]); + } +} diff --git a/crates/renderers-core/src/emit.rs b/crates/renderers-core/src/emit.rs new file mode 100644 index 0000000..96a8016 --- /dev/null +++ b/crates/renderers-core/src/emit.rs @@ -0,0 +1,120 @@ +//! Render-buffer helpers used by every hand-coded family. +//! +//! The pattern is the same everywhere: pre-allocated `Vec` for tokens +//! and `Vec` for per-token message attribution, with three primitives +//! to fill them. Centralising the primitives lets each family stay focused +//! on its own template logic without re-deriving the bookkeeping. + +use crate::tokenizer::Tokenizer; +use crate::types::{RenderError, RenderedTokens, SCAFFOLD_IDX}; + +/// Mutable render-time buffer paired with a tokenizer reference. +/// +/// Holds both the token stream and the parallel `message_indices` array. +/// All emits are O(1) amortised against the pre-allocated capacity. +pub struct RenderBuf<'tok> { + tokens: Vec, + indices: Vec, + tokenizer: &'tok Tokenizer, + /// Scratch `Vec` reused across `encode` calls so each text segment + /// doesn't allocate. The tokenizer's `encode` API returns its own + /// `Encoding`, so the saving is at the buffer-extension layer, not + /// at encode itself. + scratch_offsets: Vec, +} + +impl<'tok> std::fmt::Debug for RenderBuf<'tok> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RenderBuf") + .field("tokens_len", &self.tokens.len()) + .field("indices_len", &self.indices.len()) + .finish() + } +} + +impl<'tok> RenderBuf<'tok> { + pub fn new(tokenizer: &'tok Tokenizer, hint: usize) -> Self { + Self { + tokens: Vec::with_capacity(hint), + indices: Vec::with_capacity(hint), + tokenizer, + scratch_offsets: Vec::new(), + } + } + + #[inline] + pub fn tokenizer(&self) -> &Tokenizer { + self.tokenizer + } + + /// Append a single special token id to the buffer. + #[inline] + pub fn special(&mut self, token_id: u32, msg_idx: i32) { + self.tokens.push(token_id); + self.indices.push(msg_idx); + } + + /// Append a span of token ids to the buffer, all attributed to the + /// same message index. + #[inline] + pub fn ids(&mut self, token_ids: &[u32], msg_idx: i32) { + self.tokens.extend_from_slice(token_ids); + // `resize` with a Copy fill is the cheapest way to extend the + // indices vector by N elements of the same value. + let new_len = self.indices.len() + token_ids.len(); + self.indices.resize(new_len, msg_idx); + } + + /// Encode `text` and append the resulting tokens, attributing all of + /// them to `msg_idx`. Empty strings are a no-op (saves a tokenizer + /// call on the common "no content here" path). + #[inline] + pub fn text(&mut self, text: &str, msg_idx: i32) -> Result<(), RenderError> { + if text.is_empty() { + return Ok(()); + } + let encoded = self.tokenizer.encode_no_special(text)?; + self.ids(encoded.as_slice(), msg_idx); + Ok(()) + } + + /// Append a scaffold token (one whose attribution is "structural, + /// not from any message" — uses [`SCAFFOLD_IDX`]). + #[inline] + pub fn scaffold_special(&mut self, token_id: u32) { + self.special(token_id, SCAFFOLD_IDX); + } + + /// Encode `text` and append as scaffolding (attribution [`SCAFFOLD_IDX`]). + #[inline] + pub fn scaffold_text(&mut self, text: &str) -> Result<(), RenderError> { + self.text(text, SCAFFOLD_IDX) + } + + /// Consume the buffer and return a [`RenderedTokens`]. + pub fn into_rendered(self) -> RenderedTokens { + debug_assert_eq!(self.tokens.len(), self.indices.len()); + let _ = self.scratch_offsets; // keep the field but ignore + RenderedTokens { + token_ids: self.tokens, + message_indices: self.indices, + multi_modal_data: None, + } + } + + /// Take the token ids only, dropping per-token attribution. Used by + /// `render_ids` callers that don't need the indices array. + pub fn into_token_ids(self) -> Vec { + self.tokens + } + + #[inline] + pub fn len(&self) -> usize { + self.tokens.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.tokens.is_empty() + } +} diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs new file mode 100644 index 0000000..2b6ccb2 --- /dev/null +++ b/crates/renderers-core/src/lib.rs @@ -0,0 +1,41 @@ +//! `renderers-core` — deterministic message → token rendering for LLM +//! training and inference. +//! +//! This crate is the pure-Rust foundation: data types, the [`Renderer`] +//! trait, parsing primitives, and per-family renderer implementations. +//! The Python wrapper lives in `renderers-py`. +//! +//! # Design at a glance +//! +//! - Messages flow into a [`Renderer`] which emits [`RenderedTokens`] +//! (token ids plus per-token message attribution). +//! - Completion token ids flow back into [`Renderer::parse_response`], +//! which returns a [`ParsedResponse`] with content, optional reasoning, +//! and per-attempt [`ParsedToolCall`] records (success and malformed +//! both surface, distinguished by [`ToolCallParseStatus`]). +//! - Multi-turn rollouts use [`Renderer::bridge_to_next_turn`] to extend +//! the prior turn's token stream byte-for-byte, avoiding re-tokenization +//! drift. +//! +//! The crate is `#![forbid(unsafe_code)]` and aims to keep allocation off +//! the hot path: render buffers grow once, parsing uses a per-call arena, +//! and concrete renderers cache resolved special-token ids at construction. + +#![forbid(unsafe_code)] +#![warn(missing_debug_implementations)] +#![warn(rust_2018_idioms)] + +pub mod bridge; +pub mod emit; +pub mod parsing; +pub mod thinking; +pub mod tokenizer; +pub mod traits; +pub mod types; + +pub use traits::{MultimodalRenderer, Renderer}; +pub use types::{ + Content, ContentPart, ImageRef, Message, MultiModalData, ParsedResponse, ParsedToolCall, + PlaceholderRange, RenderError, RenderedTokens, ToolArguments, ToolCall, ToolCallFunction, + ToolCallParseStatus, ToolSpec, VideoRef, SCAFFOLD_IDX, +}; diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs new file mode 100644 index 0000000..dcdef52 --- /dev/null +++ b/crates/renderers-core/src/parsing/mod.rs @@ -0,0 +1,54 @@ +//! Token-level parsing primitives shared across family-specific parsers. +//! +//! The strategy matches the Python implementation: scan token ids for +//! special-token boundaries (no decoded-text regex on the full stream), +//! then decode only inside the bounded segments. This is the only way to +//! avoid false positives from content that happens to look like a +//! special token. +//! +//! All helpers operate on `&[u32]` slices and are `#[inline]`-marked so +//! they vanish into the family parsers at -O. + +use crate::tokenizer::Tokenizer; +use crate::types::RenderError; + +/// Find the first index of `target` in `ids`, or `None`. +#[inline] +pub fn find(ids: &[u32], target: u32) -> Option { + ids.iter().position(|&x| x == target) +} + +/// Find the first index of `target` in `ids[start..]`, or `None`. +#[inline] +pub fn find_from(ids: &[u32], target: u32, start: usize) -> Option { + ids[start..].iter().position(|&x| x == target).map(|i| i + start) +} + +/// Find the first index of any token in `targets`, or `None`. `targets` +/// is small (≤ a few) for every renderer, so a linear contains-check is +/// faster than a `HashSet`. +#[inline] +pub fn find_any(ids: &[u32], targets: &[u32]) -> Option { + ids.iter().position(|x| targets.contains(x)) +} + +/// Truncate `ids` at the first stop token. Returns the prefix as a +/// borrowed slice — no allocation. +#[inline] +pub fn strip_stop_tokens<'a>(ids: &'a [u32], stop_ids: &[u32]) -> &'a [u32] { + match find_any(ids, stop_ids) { + Some(i) => &ids[..i], + None => ids, + } +} + +/// Decode `ids` via `tokenizer.decode(ids, skip_special_tokens=False)`. +/// Returns an empty string for empty input without calling the +/// tokenizer (saves an FFI-free but still measurable ~µs per call). +#[inline] +pub fn decode(tokenizer: &Tokenizer, ids: &[u32]) -> Result { + if ids.is_empty() { + return Ok(String::new()); + } + tokenizer.decode(ids) +} diff --git a/crates/renderers-core/src/thinking.rs b/crates/renderers-core/src/thinking.rs new file mode 100644 index 0000000..c291658 --- /dev/null +++ b/crates/renderers-core/src/thinking.rs @@ -0,0 +1,92 @@ +//! `...` retention rules shared across renderers. + +use crate::types::Message; + +/// Should `messages[msg_idx]`'s reasoning content be re-emitted even when +/// the chat template would normally drop it? +/// +/// Returns `true` only as an override above the template default. Each +/// renderer ORs this into its own "render thinking?" condition. +/// +/// Mirrors `renderers/base.py:should_preserve_past_thinking`. +pub fn should_preserve_past_thinking( + messages: &[Message], + msg_idx: usize, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +) -> bool { + if preserve_all_thinking { + return true; + } + if !preserve_thinking_between_tool_calls { + return false; + } + // Find the most recent user message (or None). + let last_user: Option = messages + .iter() + .enumerate() + .rev() + .find_map(|(j, m)| (m.role == "user").then_some(j)); + + let Some(last_user) = last_user else { + // No user message before us: keep only if there's any tool turn + // anywhere; rare path but matches the Python contract. + return messages.iter().any(|m| m.role == "tool"); + }; + + if msg_idx <= last_user { + return false; + } + // The current segment must contain a tool response for the block to + // count as an in-flight tool cycle. + messages[last_user + 1..] + .iter() + .any(|m| m.role == "tool") +} + +#[cfg(test)] +mod tests { + use super::*; + + fn m(role: &str) -> Message { + Message { + role: role.to_string(), + ..Default::default() + } + } + + #[test] + fn preserve_all_wins() { + let msgs = vec![m("user"), m("assistant")]; + assert!(should_preserve_past_thinking(&msgs, 1, true, false)); + } + + #[test] + fn between_tool_calls_keeps_active_cycle() { + // Tool-cycle assistants (after the last user) are kept; the + // current tool block must contain at least one `tool` turn. + let msgs = vec![m("user"), m("assistant"), m("tool"), m("assistant")]; + // both assistants are after last_user=0 and the segment has a tool + assert!(should_preserve_past_thinking(&msgs, 1, false, true)); + assert!(should_preserve_past_thinking(&msgs, 3, false, true)); + // a prior tool cycle (before a later user) is dropped + let msgs2 = vec![ + m("user"), + m("assistant"), + m("tool"), + m("assistant"), + m("user"), + m("assistant"), + ]; + // assistant at idx=1 is before last_user=4 → dropped + assert!(!should_preserve_past_thinking(&msgs2, 1, false, true)); + // assistant at idx=5 is after last_user=4 but segment has no tool → dropped + assert!(!should_preserve_past_thinking(&msgs2, 5, false, true)); + } + + #[test] + fn between_tool_calls_drops_without_tool() { + let msgs = vec![m("user"), m("assistant"), m("assistant")]; + assert!(!should_preserve_past_thinking(&msgs, 2, false, true)); + } +} diff --git a/crates/renderers-core/src/tokenizer.rs b/crates/renderers-core/src/tokenizer.rs new file mode 100644 index 0000000..b75d8f3 --- /dev/null +++ b/crates/renderers-core/src/tokenizer.rs @@ -0,0 +1,124 @@ +//! Thin wrapper around `tokenizers::Tokenizer`. +//! +//! Provides three things the bare crate doesn't: +//! 1. A cached `unk_token_id` lookup so [`Tokenizer::token_to_id_strict`] +//! can match Python's "unk-id-is-missing" convention. +//! 2. An `encode_no_special` that returns `Vec` directly, sized to +//! the encoding length — saves the caller from juggling the +//! `tokenizers::Encoding` struct on every hot-path text segment. +//! 3. `Send + Sync` Arc-friendly storage so renderers can share one +//! instance across threads. + +use std::sync::Arc; + +use crate::types::RenderError; + +/// Owned tokenizer handle. Cloning is cheap (`Arc`); the +/// `tokenizers::Tokenizer` itself is held behind the Arc. +#[derive(Clone, Debug)] +pub struct Tokenizer { + inner: Arc, +} + +#[derive(Debug)] +struct Inner { + tok: tokenizers::Tokenizer, + unk_id: Option, +} + +impl Tokenizer { + /// Load a `tokenizer.json` from disk. + pub fn from_file(path: impl AsRef) -> Result { + let tok = tokenizers::Tokenizer::from_file(path) + .map_err(|e| RenderError::Tokenizer(e.to_string()))?; + Ok(Self::wrap(tok)) + } + + /// Wrap an already-loaded `tokenizers::Tokenizer`. + pub fn wrap(tok: tokenizers::Tokenizer) -> Self { + let unk_id = tok.token_to_id(""); + Self { + inner: Arc::new(Inner { tok, unk_id }), + } + } + + /// Returns the token id for `token`, or `None` if missing / + /// resolved to ``. Matches the Python helper at + /// `renderers/parsers.py:_token_id`. + pub fn token_to_id(&self, token: &str) -> Option { + let tid = self.inner.tok.token_to_id(token)?; + if Some(tid) == self.inner.unk_id { + None + } else { + Some(tid) + } + } + + /// Strict variant: returns an error if the token is missing. + pub fn token_to_id_strict(&self, token: &str) -> Result { + self.token_to_id(token) + .ok_or_else(|| RenderError::MissingSpecialToken(token.to_string())) + } + + /// Encode `text` without adding model special tokens, returning the + /// id sequence directly. Hot-path callers should batch text segments + /// where possible, but per-segment encode is still significantly + /// faster than the Python equivalent because there's no FFI hop. + pub fn encode_no_special(&self, text: &str) -> Result { + let enc = self + .inner + .tok + .encode_fast(text, false) + .map_err(|e| RenderError::Tokenizer(e.to_string()))?; + Ok(Encoded { enc }) + } + + /// Decode `ids` to text, including special tokens (matches the + /// Python `tokenizer.decode(ids, skip_special_tokens=False)` used + /// across the parsing layer). + pub fn decode(&self, ids: &[u32]) -> Result { + self.inner + .tok + .decode(ids, /*skip_special_tokens=*/ false) + .map_err(|e| RenderError::Tokenizer(e.to_string())) + } + + /// Borrow the underlying `tokenizers::Tokenizer` for advanced uses + /// (batch encoding, vocab access, ...). Prefer the wrappers above on + /// the hot path. + pub fn raw(&self) -> &tokenizers::Tokenizer { + &self.inner.tok + } +} + +/// Lightweight wrapper around `tokenizers::Encoding` exposing just the +/// id slice. Holding the encoding (instead of allocating a fresh +/// `Vec`) skips one copy on the way to `RenderBuf::ids`. +pub struct Encoded { + enc: tokenizers::Encoding, +} + +impl std::fmt::Debug for Encoded { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Encoded") + .field("len", &self.enc.len()) + .finish() + } +} + +impl Encoded { + #[inline] + pub fn as_slice(&self) -> &[u32] { + self.enc.get_ids() + } + + #[inline] + pub fn len(&self) -> usize { + self.enc.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.enc.is_empty() + } +} diff --git a/crates/renderers-core/src/traits.rs b/crates/renderers-core/src/traits.rs new file mode 100644 index 0000000..b0bbb2b --- /dev/null +++ b/crates/renderers-core/src/traits.rs @@ -0,0 +1,86 @@ +//! The [`Renderer`] trait and its multimodal extension. +//! +//! Both are object-safe so a `Box` (or `Arc`) +//! at the public boundary works without extra ceremony. Family-specific +//! configuration lives on the concrete struct that impls these traits. + +use crate::types::{MultiModalData, ParsedResponse, RenderError, RenderedTokens}; +use crate::types::{Message, ToolSpec}; + +/// Deterministic message → token renderer for a specific model family. +/// +/// Implementors must: +/// +/// - Be `Send + Sync` so a single instance can be shared via `Arc` across +/// threads (the Python `RendererPool` is obsolete in Rust). +/// - Produce byte-for-byte identical output to the corresponding Python +/// renderer for the same inputs — verified by the `test_render_ids`, +/// `test_bridge`, `test_roundtrip`, and `test_parse_response_robustness` +/// golden suites. +pub trait Renderer: Send + Sync + std::fmt::Debug { + /// Render `messages` to tokens with per-token message attribution. + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result; + + /// Render `messages` to tokens, dropping per-token attribution. The + /// default impl delegates to [`Renderer::render`]; family-specific + /// renderers may override with a slimmer path if it shows up in + /// profiling (the saving is one `Vec` allocation). + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + Ok(self.render(messages, tools, add_generation_prompt)?.token_ids) + } + + /// Parse a completion's token ids back into a structured response. + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse; + + /// Stop token ids the sampler should respect. + fn stop_token_ids(&self) -> &[u32]; + + /// Extend the prior turn's tokens verbatim with `new_messages`. + /// + /// Contract: + /// - The returned token stream starts with + /// `previous_prompt_ids + previous_completion_ids` (byte-for-byte). + /// - Returns `None` if `new_messages` contains an assistant turn + /// (refuses to retokenize sampled output) or if the prior turn was + /// truncated and no canonical close can be synthesised. + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: Option<&[ToolSpec]>, + ) -> Result, RenderError>; + + /// Downcast to a multimodal renderer if this implementor supports it. + /// Default returns `None`; multimodal families override. + fn as_multimodal(&self) -> Option<&dyn MultimodalRenderer> { + None + } +} + +/// Extension implemented by multimodal-capable renderers. +pub trait MultimodalRenderer: Renderer { + /// Map of placeholder token id → modality marker (1 = image, 2 = video). + fn mm_token_type_id_map(&self) -> &[(u32, u8)]; + + /// Multimodal-aware bridge. The trailing argument carries the prior + /// turn's sidecar so placeholders survive across turns. + fn bridge_to_next_turn_mm( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: Option<&[ToolSpec]>, + previous_multi_modal_data: Option<&MultiModalData>, + ) -> Result, RenderError>; +} diff --git a/crates/renderers-core/src/types.rs b/crates/renderers-core/src/types.rs new file mode 100644 index 0000000..ba2ffcc --- /dev/null +++ b/crates/renderers-core/src/types.rs @@ -0,0 +1,335 @@ +//! Core data types for renderers. +//! +//! The shapes mirror the Python `renderers.base` types so JSON round-trips +//! and PyO3 wrapping stay mechanical. Strings are owned (`String`) — PyO3 +//! always materialises strings on entry, so `Cow<'a, str>` would only +//! propagate lifetimes for no win. The few `&str` borrows that pay off are +//! taken locally inside renderer implementations from `&[Message]` slices. + +use std::ops::Range; + +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +/// Sentinel value for `message_indices` entries that come from structural +/// scaffolding rather than a specific message (e.g. the generation prompt). +/// +/// Kept as a named constant so the `-1` in code is searchable and easy to +/// audit. Matches the Python contract at `renderers/base.py:160`. +pub const SCAFFOLD_IDX: i32 = -1; + +/// A single content part inside a multi-part message body. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentPart { + /// Plain text. + Text { + text: String, + }, + /// Model chain-of-thought as a content part. + Thinking { + thinking: String, + }, + /// Image reference. Resolution to bytes / processor output happens + /// in the multimodal renderer. + Image(ImageRef), + /// Video reference; mirrors [`ImageRef`]. + Video(VideoRef), +} + +/// Image source variants accepted in [`ContentPart::Image`]. Phase 1 +/// covers text-only families, so only the URL/path discriminators carry +/// data — inline bytes are routed through `serde_json::Value` payload +/// for now and resolved by the (Phase 5) multimodal port. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ImageRef { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub url: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, +} + +/// Video source variants accepted in [`ContentPart::Video`]. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct VideoRef { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub url: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, +} + +/// Message body. Either a plain string or a list of structured parts. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(untagged)] +pub enum Content { + Text(String), + Parts(Vec), +} + +impl Default for Content { + fn default() -> Self { + Content::Text(String::new()) + } +} + +impl Content { + /// Borrow the body as a `&str` if it is a plain string; returns + /// `""` for `Parts` variants (Qwen3 ignores list content entirely). + pub fn as_text(&self) -> &str { + match self { + Content::Text(s) => s.as_str(), + Content::Parts(_) => "", + } + } + + pub fn is_empty(&self) -> bool { + match self { + Content::Text(s) => s.is_empty(), + Content::Parts(p) => p.is_empty(), + } + } +} + +/// Function body inside a [`ToolCall`]. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ToolCallFunction { + #[serde(default)] + pub name: String, + /// Arguments may arrive as a JSON object or as a pre-serialised JSON + /// string (some OpenAI-format clients do this); preserve the + /// distinction. + #[serde(default)] + pub arguments: ToolArguments, +} + +/// Structured tool invocation in OpenAI function-calling format. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ToolCall { + #[serde(default = "default_tool_type", rename = "type")] + pub kind: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub id: Option, + pub function: ToolCallFunction, +} + +fn default_tool_type() -> String { + "function".to_string() +} + +/// Tool specification (OpenAI function-calling format) passed to +/// [`Renderer::render`](crate::Renderer::render). +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ToolSpec { + pub name: String, + #[serde(default)] + pub description: String, + #[serde(default)] + pub parameters: serde_json::Value, +} + +/// A single turn in a multi-turn conversation. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct Message { + pub role: String, + #[serde(default)] + pub content: Content, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub tool_calls: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub tool_call_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub name: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub reasoning_content: Option, +} + +impl Message { + /// Borrow `content` as a `&str` only when it is a plain string. Many + /// hand-coded renderers (Qwen3, GLM5, ...) drop list-content entirely + /// for non-multimodal text paths; this helper makes that explicit. + #[inline] + pub fn text_content(&self) -> &str { + self.content.as_text() + } +} + +/// Tool-call argument payload. The JSON-object case is the common path; +/// the raw-string case preserves the OpenAI quirk where some clients +/// pre-serialise arguments to a string. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(untagged)] +pub enum ToolArguments { + Object(serde_json::Value), + Raw(String), +} + +impl Default for ToolArguments { + fn default() -> Self { + ToolArguments::Object(serde_json::Value::Object(Default::default())) + } +} + +impl ToolArguments { + /// Render arguments as a JSON string suitable for inserting verbatim + /// into a tool-call payload (matches Python's + /// `json.dumps(arguments, ensure_ascii=False)`). + pub fn to_json_string(&self) -> String { + match self { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v) + .unwrap_or_else(|_| "{}".to_string()), + } + } +} + +/// Where a single multimodal item's placeholder tokens sit in the stream. +#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct PlaceholderRange { + pub offset: usize, + pub length: usize, +} + +/// Multimodal sidecar emitted alongside the token stream. The shape +/// mirrors vLLM's `mm_*` payload without depending on vLLM types. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct MultiModalData { + #[serde(default)] + pub mm_hashes: std::collections::BTreeMap>, + #[serde(default)] + pub mm_placeholders: std::collections::BTreeMap>, + /// Per-item processor outputs. The values are passed through as opaque + /// JSON to keep this crate framework-agnostic; vision processors live + /// behind the PyO3 boundary in the current Phase 1 design. + #[serde(default)] + pub mm_items: std::collections::BTreeMap>, +} + +impl MultiModalData { + pub fn is_empty(&self) -> bool { + self.mm_hashes.is_empty() && self.mm_placeholders.is_empty() && self.mm_items.is_empty() + } +} + +/// Result of rendering messages to tokens. +/// +/// `token_ids` and `message_indices` are parallel: `message_indices[i]` is +/// the index into the input `messages` slice of the message that produced +/// `token_ids[i]`, or [`SCAFFOLD_IDX`] for structural scaffolding tokens. +/// +/// Both vectors are sized once during render — see +/// [`RenderedTokens::with_capacity`]. +#[derive(Clone, Debug, Default)] +pub struct RenderedTokens { + pub token_ids: Vec, + pub message_indices: Vec, + pub multi_modal_data: Option, +} + +impl RenderedTokens { + pub fn new() -> Self { + Self::default() + } + + /// Pre-allocate both buffers to the same capacity. Renderers pass an + /// estimate based on `messages.len() * 256` to keep the hot path + /// realloc-free for typical conversations. + pub fn with_capacity(cap: usize) -> Self { + Self { + token_ids: Vec::with_capacity(cap), + message_indices: Vec::with_capacity(cap), + multi_modal_data: None, + } + } + + #[inline] + pub fn len(&self) -> usize { + self.token_ids.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.token_ids.is_empty() + } +} + +/// Per-attempt outcome of parsing a single tool-call block. Matches the +/// Python `ToolCallParseStatus` semantics — every parse attempt surfaces +/// (success and malformed alike), distinguished by this status. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ToolCallParseStatus { + Ok, + InvalidJson, + UnclosedBlock, + MissingName, + MalformedStructure, +} + +impl ToolCallParseStatus { + /// Wire string matching the Python enum values + /// (`"ok" | "invalid_json" | ...`) so PyO3 can round-trip them. + pub fn as_wire(&self) -> &'static str { + match self { + Self::Ok => "ok", + Self::InvalidJson => "invalid_json", + Self::UnclosedBlock => "unclosed_block", + Self::MissingName => "missing_name", + Self::MalformedStructure => "malformed_structure", + } + } +} + +/// A single tool-call block as the parser saw it. +/// +/// `arguments` carries `None` only when the block was so malformed that +/// nothing could be recovered; successful parses produce +/// [`ToolArguments::Object`], pre-serialised string arguments produce +/// [`ToolArguments::Raw`]. +#[derive(Clone, Debug)] +pub struct ParsedToolCall { + pub raw: String, + pub name: Option, + pub arguments: Option, + /// Half-open `[start, end)` slice into the stop-stripped completion + /// token stream. `None` for text-based parsers that can't cheaply + /// recover offsets. + pub token_span: Option>, + pub status: ToolCallParseStatus, + /// Native id when the format carries one (Kimi K2). + pub id: Option, +} + +impl Default for ParsedToolCall { + fn default() -> Self { + Self { + raw: String::new(), + name: None, + arguments: None, + token_span: None, + status: ToolCallParseStatus::Ok, + id: None, + } + } +} + +/// Result of parsing completion tokens back into a structured message. +#[derive(Clone, Debug, Default)] +pub struct ParsedResponse { + pub content: String, + pub reasoning_content: Option, + pub tool_calls: Vec, +} + +/// Errors surfaced by rendering. +#[derive(Debug, Error)] +pub enum RenderError { + #[error("no messages provided")] + EmptyMessages, + #[error("special token {0:?} not found in tokenizer vocabulary")] + MissingSpecialToken(String), + #[error("tokenizer error: {0}")] + Tokenizer(String), + #[error("invalid input: {0}")] + Invalid(String), +} + From 62625b66bcc100a19a068fb4af66e52fd3ca4056 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:43:56 +0200 Subject: [PATCH 02/35] Add Qwen3 native parity path --- Cargo.lock | 138 +++++ Cargo.toml | 1 + crates/README.md | 116 +++++ crates/renderers-core/src/families/mod.rs | 9 + crates/renderers-core/src/families/qwen3.rs | 525 ++++++++++++++++++++ crates/renderers-core/src/lib.rs | 2 + crates/renderers-core/src/parsing/mod.rs | 2 + crates/renderers-core/src/parsing/qwen3.rs | 151 ++++++ crates/renderers-core/src/registry.rs | 49 ++ crates/renderers-py/Cargo.toml | 20 + crates/renderers-py/pyproject.toml | 19 + crates/renderers-py/src/lib.rs | 355 +++++++++++++ pyproject.toml | 7 + renderers/_native_router.py | 113 +++++ renderers/qwen3.py | 40 ++ tests/test_native_parity.py | 318 ++++++++++++ tests/test_native_router.py | 106 ++++ 17 files changed, 1971 insertions(+) create mode 100644 crates/README.md create mode 100644 crates/renderers-core/src/families/mod.rs create mode 100644 crates/renderers-core/src/families/qwen3.rs create mode 100644 crates/renderers-core/src/parsing/qwen3.rs create mode 100644 crates/renderers-core/src/registry.rs create mode 100644 crates/renderers-py/Cargo.toml create mode 100644 crates/renderers-py/pyproject.toml create mode 100644 crates/renderers-py/src/lib.rs create mode 100644 renderers/_native_router.py create mode 100644 tests/test_native_parity.py create mode 100644 tests/test_native_router.py diff --git a/Cargo.lock b/Cargo.lock index 8037260..aa35e8f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + [[package]] name = "base64" version = "0.13.1" @@ -186,6 +192,12 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "ident_case" version = "1.0.1" @@ -202,6 +214,15 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + [[package]] name = "itertools" version = "0.11.0" @@ -266,6 +287,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -386,6 +416,12 @@ version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -404,6 +440,79 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pyo3" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "pythonize" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcf491425978bd889015d5430f6473d91bdfa2097262f1e731aadcf6c2113e" +dependencies = [ + "pyo3", + "serde", +] + [[package]] name = "quote" version = "1.0.45" @@ -518,6 +627,23 @@ dependencies = [ "tokenizers", ] +[[package]] +name = "renderers-py" +version = "0.1.0" +dependencies = [ + "pyo3", + "pythonize", + "renderers-core", + "serde", + "serde_json", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "serde" version = "1.0.228" @@ -609,6 +735,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + [[package]] name = "thiserror" version = "1.0.69" @@ -687,6 +819,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index 9e0ed8e..f51c0b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ resolver = "2" members = [ "crates/renderers-core", + "crates/renderers-py", ] [workspace.package] diff --git a/crates/README.md b/crates/README.md new file mode 100644 index 0000000..b249442 --- /dev/null +++ b/crates/README.md @@ -0,0 +1,116 @@ +# `renderers` Rust port + +Pure-Rust port of the `renderers` library, with a thin PyO3 wrapper so +existing Python callers can opt into the native path without code +changes. + +## Workspace layout + +| Crate | Role | +| ---------------- | ------------------------------------------------------------------------------------------ | +| `renderers-core` | Pure-Rust crate. Public `Renderer` / `MultimodalRenderer` traits, family implementations. | +| `renderers-py` | PyO3 wrapper. Builds the `renderers._native` extension module via maturin. | + +The Rust crate is usable standalone (e.g. from an sglang-rs / vllm-rs +integration); the Python wrapper exists only to bridge into the +existing `renderers` package. + +## Building the native extension + +For development (editable install into the active venv): + +```bash +maturin develop --manifest-path crates/renderers-py/Cargo.toml --release +``` + +This installs `renderers_native..so` into the venv's +`site-packages` so `import renderers_native` resolves. It's kept as a +top-level module (rather than `renderers._native`) so the maturin-built +wheel doesn't collide with the hatchling-built `renderers` wheel at +install time. + +## Opting into the native path at runtime + +The Python shims keep the pure-Python implementation as the default and +only route to the native module when `RENDERERS_NATIVE` selects the +family: + +```bash +RENDERERS_NATIVE=qwen3 pytest tests/test_render_ids.py +RENDERERS_NATIVE=all pytest tests/ # everything ported +``` + +## Parity testing + +Two complementary suites validate the port: + +1. **`tests/test_render_ids.py` (and siblings)** — Python (or, when the + env var routes, native) vs HuggingFace's `apply_chat_template`. + Catches drift from the upstream reference. Run under the native + path with `RENDERERS_NATIVE=qwen3 pytest tests/test_render_ids.py`. +2. **`tests/test_native_parity.py`** — Python vs native, holding the + reference fixed. Catches drift between the two implementations even + if HuggingFace changes its template. Cheaper because the HF call + isn't on the path. Marker: `-m parity`. + +The parity suite skips cleanly when the tokenizer.json isn't on disk +or the extension isn't built, so it's safe to import in CI without +gating on either. + +Recognised values: + +| `RENDERERS_NATIVE` | Behaviour | +| ------------------ | -------------------------------------------------------- | +| unset / `0` | Pure Python (default) | +| `1` / `all` | Route every supported family to the native module | +| `qwen3` | Route only Qwen3 | +| `qwen3,qwen35,...` | Route a comma-separated list of families | + +If `RENDERERS_NATIVE` is set but the extension isn't installed, the +shim logs a one-shot info message and falls back to Python. + +## Family coverage + +| Family | Status | +| ------------ | ----------------------------------------------- | +| Qwen3 | ✅ ported (Phase 2) | +| Qwen3.5 | planned (Phase 3) | +| GLM 4.5 / 5 | planned (Phase 3) | +| DeepSeek V3 | planned (Phase 3) | +| Nemotron3 | planned (Phase 3) | +| Kimi K2 | planned (Phase 4) | +| Kimi K2.5 | planned (Phase 4 — text; multimodal Phase 5) | +| MiniMax M2 | planned (Phase 4) | +| Qwen3.6 | planned (Phase 4) | +| Qwen3-VL | planned (Phase 5 — multimodal incl. processor) | +| Qwen3.5 mm | planned (Phase 5) | +| GPT-OSS | planned (Phase 6 — via `openai-harmony` crate) | +| Default | planned (Phase 7 — via `minijinja`) | + +## Performance targets + +Single-call latency (Qwen3.5, 1500-token prompt, 512-token completion): + +| Phase | Python (current) | Rust (target) | Speedup | +| -------------------- | ---------------: | ------------: | ------: | +| `render_ids` | 0.5–1.0 ms | 0.15–0.3 ms | 3–5×| +| `parse_response` | 0.05–0.15 ms | 0.05–0.15 ms¹ | 5–10×| +| `bridge_to_next_turn`| 0.3–0.6 ms | 0.05–0.15 ms | 4–6×| + +¹ Speedup vs Python including FFI overhead, which is the actual gap; +absolute numbers depend on completion content shape. + +Throughput on an 8-thread caller is expected to gain another ~5–8× +because every method releases the GIL (`py.allow_threads`) — the Python +pool model is obsolete in Rust. + +## Crate-level invariants + +- `#![forbid(unsafe_code)]` at the crate root of `renderers-core`. +- All hot-path scans use bounded `&[u32]` slices; no allocation in + `find` / `find_from` / `find_any`. +- `RenderBuf` reserves capacity once based on `messages.len() * 256`. +- Special-token ids are resolved at renderer construction and cached + on the struct. +- Tokenizer is held behind `Arc<...>` so a single instance serves any + number of concurrent callers. diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs new file mode 100644 index 0000000..0a507d3 --- /dev/null +++ b/crates/renderers-core/src/families/mod.rs @@ -0,0 +1,9 @@ +//! Per-family renderer implementations. +//! +//! Each family lives in its own module so the hand-coded template logic +//! stays focused. New families slot in by adding a module here and a +//! registry entry in [`crate::registry`]. + +pub mod qwen3; + +pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs new file mode 100644 index 0000000..a3a7660 --- /dev/null +++ b/crates/renderers-core/src/families/qwen3.rs @@ -0,0 +1,525 @@ +//! Qwen3 renderer. Port of `renderers/qwen3.py`. +//! +//! Byte-for-byte identical output to the Python version — the +//! `test_render_ids` / `test_bridge` / `test_roundtrip` golden suites are +//! the contract. +//! +//! # Performance notes +//! +//! - Special-token ids are resolved once at construction and cached on +//! the struct. Zero per-call lookup cost. +//! - The render buffer is sized to `messages.len() * 256` up front; this +//! covers ~99% of multi-turn conversations with no realloc. +//! - The tools header / footer are static `&str` constants — no +//! per-call allocation. +//! - Tool-call argument serialisation goes through `serde_json` directly, +//! ~5–10× faster than Python's `json.dumps` for the JSON sizes typical +//! here. + +use serde_json::json; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::qwen3::parse_qwen3; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, +}; + +const TOOLS_HEADER: &str = "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n"; + +const TOOLS_FOOTER: &str = "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n"; + +const GEN_PROMPT_NO_THINKING_SUFFIX: &str = "\n\n\n\n"; + +/// Builder for [`Qwen3Renderer`]. Use this to surface the rare optional +/// flags without polluting the most common constructor. +#[derive(Debug, Clone)] +pub struct Qwen3RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for Qwen3RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl Qwen3RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + + pub fn build(self, tokenizer: Tokenizer) -> Result { + Qwen3Renderer::new_with(tokenizer, self) + } +} + +/// Deterministic Qwen3 renderer. +#[derive(Debug, Clone)] +pub struct Qwen3Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + im_start: u32, + im_end: u32, + /// Cached for parity with the Python `_endoftext` field; the + /// stop-token set already encodes the same id, so this is unused + /// directly but kept for debug parity. + #[allow(dead_code)] + endoftext: u32, + tool_call: u32, + tool_call_end: u32, + tool_response: u32, + tool_response_end: u32, + + /// Cached stop tokens (`im_end`, `endoftext`) for `stop_token_ids` + /// and bridge close-token sets. Two-element vector held by-value + /// per renderer instance. + stop_tokens: Vec, +} + +impl Qwen3Renderer { + /// Convenience constructor with all defaults. + pub fn new(tokenizer: Tokenizer) -> Result { + Qwen3RendererBuilder::default().build(tokenizer) + } + + pub fn builder() -> Qwen3RendererBuilder { + Qwen3RendererBuilder::default() + } + + fn new_with( + tokenizer: Tokenizer, + cfg: Qwen3RendererBuilder, + ) -> Result { + let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let tool_response = tokenizer.token_to_id_strict("")?; + let tool_response_end = tokenizer.token_to_id_strict("")?; + + let stop_tokens = vec![im_end, endoftext]; + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_start, + im_end, + endoftext, + tool_call, + tool_call_end, + tool_response, + tool_response_end, + stop_tokens, + }) + } + + /// Index of the most recent user message whose content is *not* a + /// `...` placeholder. Defaults to + /// `len - 1` when no real user message is present. + fn last_query_index(messages: &[Message]) -> i32 { + for (i, msg) in messages.iter().enumerate().rev() { + if msg.role != "user" { + continue; + } + let content = msg.text_content(); + if !(content.starts_with("") && content.ends_with("")) { + return i as i32; + } + } + (messages.len() as i32).saturating_sub(1) + } + + fn emit_system_with_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + tools: &[ToolSpec], + first_is_system: bool, + ) -> Result<(), RenderError> { + let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; + buf.special(self.im_start, sys_idx); + let mut tool_text = String::from("system\n"); + if first_is_system { + tool_text.push_str(messages[0].text_content()); + tool_text.push_str("\n\n"); + } + tool_text.push_str(TOOLS_HEADER); + for tool in tools { + tool_text.push('\n'); + let spec = json!({ + "name": tool.name, + "description": tool.description, + "parameters": tool.parameters, + }); + // `to_string` here is `serde_json::to_string` (no pretty, + // no ensure_ascii — matches Python's + // `json.dumps(..., ensure_ascii=False)`). + tool_text.push_str(&serde_json::to_string(&spec).map_err(|e| { + RenderError::Invalid(format!("tool spec serialisation failed: {e}")) + })?); + } + tool_text.push_str(TOOLS_FOOTER); + buf.text(&tool_text, sys_idx)?; + buf.special(self.im_end, sys_idx); + buf.text("\n", sys_idx)?; + Ok(()) + } + + fn emit_system_no_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + ) -> Result<(), RenderError> { + buf.special(self.im_start, 0); + let mut s = String::with_capacity(messages[0].text_content().len() + 8); + s.push_str("system\n"); + s.push_str(messages[0].text_content()); + buf.text(&s, 0)?; + buf.special(self.im_end, 0); + buf.text("\n", 0)?; + Ok(()) + } + + fn emit_user( + &self, + buf: &mut RenderBuf<'_>, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + Ok(()) + } + + fn emit_non_initial_system( + &self, + buf: &mut RenderBuf<'_>, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + msg_idx: usize, + content: &str, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = + msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let idx = msg_idx as i32; + + if !prev_is_tool { + buf.special(self.im_start, idx); + buf.text("user", idx)?; + } + buf.text("\n", idx)?; + buf.special(self.tool_response, idx); + let mut wrapped = String::with_capacity(content.len() + 2); + wrapped.push('\n'); + wrapped.push_str(content); + wrapped.push('\n'); + buf.text(&wrapped, idx)?; + buf.special(self.tool_response_end, idx); + if !next_is_tool { + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + } + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: usize, + last_query_index: i32, + is_last: bool, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + // Recover reasoning content either from the explicit field or + // from inline `...` text. Match the Python + // implementation's split semantics exactly. + let raw_content = msg.text_content(); + let (reasoning_content, content_after_think) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let reasoning = if let Some((_, inner)) = before.rsplit_once("") { + inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + } else { + before.trim_start_matches('\n').trim_end_matches('\n').to_string() + }; + (reasoning, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + + let idx = msg_idx as i32; + buf.special(self.im_start, idx); + + let tool_calls = &msg.tool_calls; + let emit_in_template_window = + (msg_idx as i32) > last_query_index && (is_last || !reasoning_content.is_empty()); + let emit_via_override = preserve_thinking && !reasoning_content.is_empty(); + + let prefix = if emit_in_template_window || emit_via_override { + let mut s = String::with_capacity(reasoning_content.len() + content_after_think.len() + 32); + s.push_str("assistant\n\n"); + s.push_str(reasoning_content.trim_matches('\n')); + s.push_str("\n\n\n"); + s.push_str(content_after_think.trim_start_matches('\n')); + s + } else { + let mut s = String::with_capacity(content_after_think.len() + 10); + s.push_str("assistant\n"); + s.push_str(&content_after_think); + s + }; + + if tool_calls.is_empty() { + buf.text(&prefix, idx)?; + } else { + for (tc_idx, tc) in tool_calls.iter().enumerate() { + let name = tc.function.name.as_str(); + let args_str = match &tc.function.arguments { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v).map_err(|e| { + RenderError::Invalid(format!("tool args serialisation failed: {e}")) + })?, + }; + if tc_idx == 0 { + let mut s = prefix.clone(); + if !content_after_think.is_empty() { + s.push('\n'); + } + buf.text(&s, idx)?; + } else { + buf.text("\n", idx)?; + } + buf.special(self.tool_call, idx); + let mut payload = String::with_capacity(args_str.len() + name.len() + 24); + payload.push_str("\n{\"name\": \""); + payload.push_str(name); + payload.push_str("\", \"arguments\": "); + payload.push_str(&args_str); + payload.push_str("}\n"); + buf.text(&payload, idx)?; + buf.special(self.tool_call_end, idx); + } + } + + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + Ok(()) + } + + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + // Heuristic: ~256 tokens / message, plus a flat surcharge for the + // tools block (it can be substantial). Realloc once if we + // underestimate; the cost of over-allocating is a few KB. + let base = messages.len().max(1) * 256; + let tools_bonus = tools.map(|t| 256 * t.len().max(1)).unwrap_or(0); + base + tools_bonus + } +} + +impl Renderer for Qwen3Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let cap = Self::estimate_capacity(messages, tools); + let mut buf = RenderBuf::new(&self.tokenizer, cap); + + let first_is_system = messages[0].role == "system"; + + // 1. System + tools header. + match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(&mut buf, messages, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(&mut buf, messages)?; + } + } + } + + // 2. Last-query index. + let last_qi = Self::last_query_index(messages); + let num_messages = messages.len(); + + // 3. Body. + for (i, msg) in messages.iter().enumerate() { + let content = msg.text_content(); + match msg.role.as_str() { + "system" => { + if i == 0 { + continue; + } + self.emit_non_initial_system(&mut buf, content, i as i32)?; + } + "user" => { + self.emit_user(&mut buf, content, i as i32)?; + } + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant( + &mut buf, + msg, + i, + last_qi, + i + 1 == num_messages, + preserve_thinking, + )?; + } + "tool" => { + self.emit_tool(&mut buf, messages, i, content)?; + } + _ => { + // Unknown role: skip silently (matches Python which + // simply has no branch for it). + } + } + } + + // 4. Generation prompt. + if add_generation_prompt { + buf.scaffold_special(self.im_start); + buf.scaffold_text("assistant\n")?; + if !self.enable_thinking { + buf.scaffold_text(GEN_PROMPT_NO_THINKING_SUFFIX)?; + } + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_qwen3( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let cap = Self::estimate_capacity(new_messages, None); + let mut buf = RenderBuf::new(&self.tokenizer, cap); + + // Trailing `\n` after the prior turn's close token. + buf.scaffold_text("\n")?; + + for (i, msg) in new_messages.iter().enumerate() { + let content = msg.text_content(); + let idx = i as i32; + match msg.role.as_str() { + "user" => self.emit_user(&mut buf, content, idx)?, + "system" => self.emit_non_initial_system(&mut buf, content, idx)?, + "tool" => self.emit_tool(&mut buf, new_messages, i, content)?, + _ => return Ok(None), + } + } + + buf.scaffold_special(self.im_start); + buf.scaffold_text("assistant\n")?; + if !self.enable_thinking { + buf.scaffold_text(GEN_PROMPT_NO_THINKING_SUFFIX)?; + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs index 2b6ccb2..597f57c 100644 --- a/crates/renderers-core/src/lib.rs +++ b/crates/renderers-core/src/lib.rs @@ -27,7 +27,9 @@ pub mod bridge; pub mod emit; +pub mod families; pub mod parsing; +pub mod registry; pub mod thinking; pub mod tokenizer; pub mod traits; diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs index dcdef52..59b81ea 100644 --- a/crates/renderers-core/src/parsing/mod.rs +++ b/crates/renderers-core/src/parsing/mod.rs @@ -9,6 +9,8 @@ //! All helpers operate on `&[u32]` slices and are `#[inline]`-marked so //! they vanish into the family parsers at -O. +pub mod qwen3; + use crate::tokenizer::Tokenizer; use crate::types::RenderError; diff --git a/crates/renderers-core/src/parsing/qwen3.rs b/crates/renderers-core/src/parsing/qwen3.rs new file mode 100644 index 0000000..bcd7ee2 --- /dev/null +++ b/crates/renderers-core/src/parsing/qwen3.rs @@ -0,0 +1,151 @@ +//! Qwen3 tool-call parser — Hermes-style JSON tool calls. +//! +//! Port of `renderers/parsing.py:parse_qwen3`. The structural shape is: +//! +//! ```text +//! ...content tokens... +//! +//! { "name": "fn", "arguments": { ... } } +//! +//! ...possibly more blocks... +//! ``` +//! +//! Reasoning (`...`) is emitted as plain text by Qwen3 +//! (not special tokens), so it falls out from the decoded content. + +use crate::parsing::{decode, find, find_from, strip_stop_tokens}; +use crate::tokenizer::Tokenizer; +use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus}; + +/// Parse Qwen3 completion tokens. `stop_ids` is consulted only to +/// truncate runaway content past EOS; the parser itself walks the +/// truncated prefix. +pub fn parse_qwen3( + tokenizer: &Tokenizer, + token_ids: &[u32], + stop_ids: &[u32], + tool_call_id: u32, + tool_call_end_id: u32, +) -> ParsedResponse { + let ids = strip_stop_tokens(token_ids, stop_ids); + + let mut tool_calls: Vec = Vec::new(); + let (content_ids, _scanned) = match find(ids, tool_call_id) { + Some(tc_start) => { + let content = &ids[..tc_start]; + let mut i = tc_start; + while i < ids.len() { + if ids[i] == tool_call_id { + match find_from(ids, tool_call_end_id, i + 1) { + None => { + // No closing delim — runs to end of stripped ids. + let raw = decode(tokenizer, &ids[i + 1..]) + .unwrap_or_default() + .trim() + .to_string(); + tool_calls.push(ParsedToolCall { + raw, + token_span: Some(i..ids.len()), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; + } + Some(end) => { + let block = &ids[i + 1..end]; + let tc_text = decode(tokenizer, block) + .unwrap_or_default() + .trim() + .to_string(); + let span = i..(end + 1); + match serde_json::from_str::(&tc_text) { + Err(_) => { + tool_calls.push(ParsedToolCall { + raw: tc_text, + token_span: Some(span), + status: ToolCallParseStatus::InvalidJson, + ..Default::default() + }); + } + Ok(value) => { + let (name, args) = extract_name_and_args(&value); + if name.is_empty() { + tool_calls.push(ParsedToolCall { + raw: tc_text, + name: None, + arguments: Some(args), + token_span: Some(span), + status: ToolCallParseStatus::MissingName, + ..Default::default() + }); + } else { + tool_calls.push(ParsedToolCall { + raw: tc_text, + name: Some(name), + arguments: Some(args), + token_span: Some(span), + status: ToolCallParseStatus::Ok, + ..Default::default() + }); + } + } + } + i = end + 1; + } + } + } else { + i += 1; + } + } + (content, true) + } + None => (ids, false), + }; + + let text = decode(tokenizer, content_ids).unwrap_or_default(); + let (reasoning, content) = split_thinking(&text); + + ParsedResponse { + content: content.trim().to_string(), + reasoning_content: reasoning.filter(|s| !s.is_empty()), + tool_calls, + } +} + +/// Pull `name` (string) and `arguments` (object or whatever the model +/// emitted) out of a parsed tool-call JSON value. Matches the Python +/// `parsed.get("name", "")` / `parsed.get("arguments", {})` semantics. +fn extract_name_and_args(value: &serde_json::Value) -> (String, ToolArguments) { + let obj = match value.as_object() { + Some(o) => o, + None => return (String::new(), ToolArguments::default()), + }; + let name = obj + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let args = match obj.get("arguments") { + None => ToolArguments::default(), + Some(serde_json::Value::String(s)) => ToolArguments::Raw(s.clone()), + Some(v) => ToolArguments::Object(v.clone()), + }; + (name, args) +} + +/// Split a decoded text segment around ``. Mirrors the inline +/// logic at `renderers/parsing.py` for Qwen3 (which has no `` as +/// special token — reasoning lives in the decoded text). +fn split_thinking(text: &str) -> (Option, String) { + if let Some((before, after)) = text.split_once("") { + let reasoning = before + .replace("", "") + .trim_matches('\n') + .trim() + .to_string(); + let content = after.trim_matches('\n').to_string(); + (Some(reasoning), content) + } else { + (None, text.to_string()) + } +} diff --git a/crates/renderers-core/src/registry.rs b/crates/renderers-core/src/registry.rs new file mode 100644 index 0000000..6e27c3c --- /dev/null +++ b/crates/renderers-core/src/registry.rs @@ -0,0 +1,49 @@ +//! Tokenizer-path → renderer factory registry. +//! +//! Mirrors `renderers/base.py:MODEL_RENDERER_MAP` for the subset of +//! families ported to Rust so far. New families slot in by adding a +//! match arm in [`create_renderer`]. + +use crate::families::Qwen3Renderer; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::RenderError; + +/// Renderer family identifier — closed enum used by [`create_renderer`]. +/// Adding a family means a new variant here plus a match arm. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum RendererKind { + Qwen3, +} + +impl RendererKind { + pub fn from_str(name: &str) -> Option { + match name { + "qwen3" | "Qwen3" => Some(Self::Qwen3), + _ => None, + } + } +} + +/// Configuration passed to [`create_renderer`]. +#[derive(Clone, Debug, Default)] +pub struct RendererConfig { + pub preserve_all_thinking: bool, + pub preserve_thinking_between_tool_calls: bool, +} + +/// Build a renderer of the requested kind backed by `tokenizer`. +pub fn create_renderer( + kind: RendererKind, + tokenizer: Tokenizer, + cfg: RendererConfig, +) -> Result, RenderError> { + match kind { + RendererKind::Qwen3 => Ok(Box::new( + Qwen3Renderer::builder() + .preserve_all_thinking(cfg.preserve_all_thinking) + .preserve_thinking_between_tool_calls(cfg.preserve_thinking_between_tool_calls) + .build(tokenizer)?, + )), + } +} diff --git a/crates/renderers-py/Cargo.toml b/crates/renderers-py/Cargo.toml new file mode 100644 index 0000000..e1437b6 --- /dev/null +++ b/crates/renderers-py/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "renderers-py" +version = "0.1.0" +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "PyO3 bindings for renderers-core." + +[lib] +name = "renderers_native" +crate-type = ["cdylib", "rlib"] +path = "src/lib.rs" + +[dependencies] +renderers-core = { path = "../renderers-core" } +pyo3 = { version = "0.22", features = ["extension-module", "abi3-py310"] } +serde = { workspace = true } +serde_json = { workspace = true } +pythonize = "0.22" diff --git a/crates/renderers-py/pyproject.toml b/crates/renderers-py/pyproject.toml new file mode 100644 index 0000000..6d28eda --- /dev/null +++ b/crates/renderers-py/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["maturin>=1.4"] +build-backend = "maturin" + +[project] +name = "renderers-native" +version = "0.1.0" +requires-python = ">=3.10,<3.14" +description = "Native (Rust) extension module for the `renderers` package." +license = { text = "Apache-2.0" } + +# Maturin installs the cdylib as a top-level module. The pure-Python +# `renderers` package imports it as ``import renderers_native``, kept +# separate from the `renderers/` hatchling-built wheel so the two +# distributions don't collide at install time. +[tool.maturin] +module-name = "renderers_native" +features = ["pyo3/extension-module"] +strip = true diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs new file mode 100644 index 0000000..7a29bb2 --- /dev/null +++ b/crates/renderers-py/src/lib.rs @@ -0,0 +1,355 @@ +//! Python bindings for `renderers-core`. +//! +//! The boundary is intentionally thin: one polymorphic `Renderer` +//! pyclass holds an `Arc`; small result +//! pyclasses wrap `RenderedTokens` / `ParsedResponse` / `ParsedToolCall` +//! with `#[getter]` accessors. Argument unpacking is done by +//! `pythonize` so callers can pass plain dicts / lists for messages and +//! tools without per-field PyO3 conversion. + +use std::sync::Arc; + +use pyo3::exceptions::{PyRuntimeError, PyValueError}; +use pyo3::prelude::*; +use pyo3::types::{PyList, PyType}; + +use renderers_core::families::Qwen3RendererBuilder; +use renderers_core::tokenizer::Tokenizer; +use renderers_core::types::{ + Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, ToolCallParseStatus, + ToolSpec, +}; +use renderers_core::Renderer as CoreRenderer; + +fn render_err(e: renderers_core::types::RenderError) -> PyErr { + PyRuntimeError::new_err(e.to_string()) +} + +fn invalid(msg: impl Into) -> PyErr { + PyValueError::new_err(msg.into()) +} + +/// Decode a Python `list[dict]` of messages via pythonize. +fn parse_messages(obj: &Bound<'_, PyAny>) -> PyResult> { + let value: serde_json::Value = pythonize::depythonize(obj).map_err(|e| { + invalid(format!("messages must be a list of dicts (decode failed: {e})")) + })?; + serde_json::from_value(value).map_err(|e| invalid(format!("messages shape mismatch: {e}"))) +} + +fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult>> { + let Some(obj) = obj else { return Ok(None) }; + if obj.is_none() { + return Ok(None); + } + let value: serde_json::Value = pythonize::depythonize(obj) + .map_err(|e| invalid(format!("tools must be a list of dicts (decode failed: {e})")))?; + let parsed: Vec = serde_json::from_value(value) + .map_err(|e| invalid(format!("tools shape mismatch: {e}")))?; + Ok(Some(parsed)) +} + +fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { + // Accept either a Python list of ints or a numpy-style sequence. + let list = obj.downcast::().map_err(|_| invalid("expected list[int]"))?; + let mut out = Vec::with_capacity(list.len()); + for item in list.iter() { + let v: i64 = item.extract()?; + if v < 0 || v > u32::MAX as i64 { + return Err(invalid(format!("token id out of range: {v}"))); + } + out.push(v as u32); + } + Ok(out) +} + +#[pyclass(name = "RenderedTokens", module = "renderers_native")] +#[derive(Clone)] +struct PyRenderedTokens { + inner: RenderedTokens, +} + +#[pymethods] +impl PyRenderedTokens { + #[getter] + fn token_ids<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { + // Cast u32 -> i64 for Python `int` compatibility. PyList::new is + // the fastest path; per-element extract is unavoidable until + // numpy support is added. + PyList::new_bound(py, self.inner.token_ids.iter().map(|&t| t as i64)) + } + + #[getter] + fn message_indices<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { + PyList::new_bound(py, self.inner.message_indices.iter().copied()) + } + + #[getter] + fn multi_modal_data<'py>(&self, py: Python<'py>) -> PyResult> { + match &self.inner.multi_modal_data { + Some(mm) => pythonize::pythonize(py, mm) + .map_err(|e| invalid(format!("mm serialisation failed: {e}"))), + None => Ok(py.None().into_bound(py)), + } + } + + fn __repr__(&self) -> String { + format!( + "RenderedTokens(token_ids=<{} tokens>, message_indices=<{} entries>, multi_modal_data={})", + self.inner.token_ids.len(), + self.inner.message_indices.len(), + if self.inner.multi_modal_data.is_some() { + "Some(...)" + } else { + "None" + }, + ) + } +} + +#[pyclass(name = "ParsedToolCall", module = "renderers_native")] +#[derive(Clone)] +struct PyParsedToolCall { + inner: ParsedToolCall, +} + +#[pymethods] +impl PyParsedToolCall { + #[getter] + fn raw(&self) -> &str { + &self.inner.raw + } + + #[getter] + fn name(&self) -> Option<&str> { + self.inner.name.as_deref() + } + + #[getter] + fn arguments<'py>(&self, py: Python<'py>) -> PyResult> { + match &self.inner.arguments { + None => Ok(py.None().into_bound(py)), + Some(ToolArguments::Object(v)) => pythonize::pythonize(py, v) + .map_err(|e| invalid(format!("args serialisation: {e}"))), + Some(ToolArguments::Raw(s)) => Ok(s.clone().into_py(py).into_bound(py)), + } + } + + #[getter] + fn token_span(&self) -> Option<(usize, usize)> { + self.inner.token_span.as_ref().map(|r| (r.start, r.end)) + } + + #[getter] + fn status(&self) -> &'static str { + self.inner.status.as_wire() + } + + #[getter] + fn id(&self) -> Option<&str> { + self.inner.id.as_deref() + } + + fn __repr__(&self) -> String { + format!( + "ParsedToolCall(name={:?}, status={:?}, has_args={})", + self.inner.name, + self.inner.status, + self.inner.arguments.is_some(), + ) + } +} + +#[pyclass(name = "ParsedResponse", module = "renderers_native")] +#[derive(Clone)] +struct PyParsedResponse { + inner: ParsedResponse, +} + +#[pymethods] +impl PyParsedResponse { + #[getter] + fn content(&self) -> &str { + &self.inner.content + } + + #[getter] + fn reasoning_content(&self) -> Option<&str> { + self.inner.reasoning_content.as_deref() + } + + #[getter] + fn tool_calls(&self) -> Vec { + self.inner + .tool_calls + .iter() + .cloned() + .map(|c| PyParsedToolCall { inner: c }) + .collect() + } + + fn __repr__(&self) -> String { + format!( + "ParsedResponse(content_len={}, reasoning_content={}, tool_calls={})", + self.inner.content.len(), + self.inner.reasoning_content.is_some(), + self.inner.tool_calls.len(), + ) + } +} + +/// Wire enum mirror — matches the Python `ToolCallParseStatus` string +/// values so existing code reading `tc.status == "ok"` keeps working. +#[pyclass(name = "ToolCallParseStatus", module = "renderers_native")] +#[derive(Clone, Copy)] +struct PyToolCallParseStatus { + inner: ToolCallParseStatus, +} + +#[pymethods] +impl PyToolCallParseStatus { + #[classattr] + const OK: &'static str = "ok"; + #[classattr] + const INVALID_JSON: &'static str = "invalid_json"; + #[classattr] + const UNCLOSED_BLOCK: &'static str = "unclosed_block"; + #[classattr] + const MISSING_NAME: &'static str = "missing_name"; + #[classattr] + const MALFORMED_STRUCTURE: &'static str = "malformed_structure"; + + #[getter] + fn value(&self) -> &'static str { + self.inner.as_wire() + } +} + +/// Polymorphic Python-facing renderer. +#[pyclass(name = "Renderer", module = "renderers_native")] +struct PyRenderer { + inner: Arc, +} + +#[pymethods] +impl PyRenderer { + /// Construct a Qwen3 renderer from a tokenizer.json on disk. + /// + /// Kept as an explicit classmethod (rather than `__new__`) so the + /// type signature stays unambiguous from Python and future families + /// can add their own classmethods. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn qwen3( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + Qwen3RendererBuilder::default() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { + inner: Arc::new(renderer), + }) + } + + #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))] + fn render( + &self, + py: Python<'_>, + messages: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult { + let msgs = parse_messages(messages)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let out = py + .allow_threads(move || renderer.render(&msgs, tools.as_deref(), add_generation_prompt)) + .map_err(render_err)?; + Ok(PyRenderedTokens { inner: out }) + } + + #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))] + fn render_ids<'py>( + &self, + py: Python<'py>, + messages: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult> { + let msgs = parse_messages(messages)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let ids = py + .allow_threads(move || { + renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt) + }) + .map_err(render_err)?; + Ok(PyList::new_bound(py, ids.iter().map(|&t| t as i64))) + } + + fn parse_response( + &self, + py: Python<'_>, + token_ids: &Bound<'_, PyAny>, + ) -> PyResult { + let ids = parse_u32_list(token_ids)?; + let renderer = self.inner.clone(); + let parsed = py.allow_threads(move || renderer.parse_response(&ids)); + Ok(PyParsedResponse { inner: parsed }) + } + + fn get_stop_token_ids<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { + PyList::new_bound(py, self.inner.stop_token_ids().iter().map(|&t| t as i64)) + } + + #[pyo3(signature = (previous_prompt_ids, previous_completion_ids, new_messages, *, tools = None))] + fn bridge_to_next_turn( + &self, + py: Python<'_>, + previous_prompt_ids: &Bound<'_, PyAny>, + previous_completion_ids: &Bound<'_, PyAny>, + new_messages: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + ) -> PyResult> { + let prev_p = parse_u32_list(previous_prompt_ids)?; + let prev_c = parse_u32_list(previous_completion_ids)?; + let msgs = parse_messages(new_messages)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let bridged = py + .allow_threads(move || { + renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools.as_deref()) + }) + .map_err(render_err)?; + Ok(bridged.map(|rt| PyRenderedTokens { inner: rt })) + } +} + +#[pymodule] +fn renderers_native(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let _ = py; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} diff --git a/pyproject.toml b/pyproject.toml index 389870f..63f6cc6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,13 @@ exclude-newer = "7 days" # while the rest of the dependency graph stays gated. exclude-newer-package = { fastokens = false, "prime-pydantic-config" = false } +[tool.pytest.ini_options] +# Registered markers — `parity` gates the Python<->Rust native parity +# suite. Skip with `-m 'not parity'` if the native module isn't built. +markers = [ + "parity: Python<->Rust native parity tests (require maturin develop + a tokenizer.json)", +] + [tool.ty.environment] python-version = "3.13" diff --git a/renderers/_native_router.py b/renderers/_native_router.py new file mode 100644 index 0000000..ece558d --- /dev/null +++ b/renderers/_native_router.py @@ -0,0 +1,113 @@ +"""Routing layer between the pure-Python renderers and the Rust port. + +Loaded by each family shim (currently ``renderers.qwen3``). Resolves +whether the native module is available and, if so, whether the caller +opted into it for this family via the ``RENDERERS_NATIVE`` env var. + +The env-var accepts: + +- ``0`` / empty / unset — use the pure-Python implementation (default). +- ``1`` / ``all`` — route every supported family to the native module. +- comma-separated list of family names, e.g. ``qwen3`` or + ``qwen3,qwen35`` — route only those families. + +Family detection is opt-in per family so callers can roll out the +native path one model at a time; everything else falls back to Python +verbatim. +""" + +from __future__ import annotations + +import logging +import os +from pathlib import Path +from typing import Any + +logger = logging.getLogger("renderers._native_router") + +_NATIVE_MODULE: Any | None = None +_NATIVE_LOAD_ATTEMPTED = False + + +def native_enabled(family: str) -> bool: + """Should *family* route to the native module?""" + raw = os.environ.get("RENDERERS_NATIVE", "").strip() + if not raw or raw == "0": + return False + if raw in {"1", "all"}: + return True + return family in {part.strip() for part in raw.split(",") if part.strip()} + + +def load_native() -> Any | None: + """Import ``renderers_native`` lazily. Returns ``None`` if the + extension module is not installed (caller falls back to Python). + + Kept as a top-level distribution (rather than `renderers._native`) + so the maturin-built wheel doesn't collide with the hatchling-built + `renderers` wheel at install time. + """ + global _NATIVE_MODULE, _NATIVE_LOAD_ATTEMPTED + if _NATIVE_LOAD_ATTEMPTED: + return _NATIVE_MODULE + _NATIVE_LOAD_ATTEMPTED = True + try: + import renderers_native # type: ignore[import-not-found] + + _NATIVE_MODULE = renderers_native + except ImportError as exc: + logger.info( + "RENDERERS_NATIVE is set but the native extension is not " + "available (%s); falling back to pure Python. Build it with " + "`maturin develop --manifest-path crates/renderers-py/Cargo.toml`.", + exc, + ) + _NATIVE_MODULE = None + return _NATIVE_MODULE + + +def resolve_tokenizer_path(tokenizer: Any) -> str: + """Return a filesystem path to ``tokenizer.json`` for *tokenizer*. + + Accepts either: + + - a string (already a path / HF model id) — the caller is + responsible for snapshotting the model first if it's a remote id. + - a HuggingFace ``PreTrainedTokenizerBase`` — pulls + ``name_or_path`` and locates ``tokenizer.json`` next to it. + """ + if isinstance(tokenizer, (str, os.PathLike)): + path = Path(tokenizer) + if path.is_dir(): + return str(path / "tokenizer.json") + return str(path) + + name_or_path = getattr(tokenizer, "name_or_path", None) + if not name_or_path: + raise ValueError( + "Cannot determine tokenizer.json path: tokenizer has no " + "name_or_path attribute. Pass an explicit path string instead." + ) + + candidate = Path(name_or_path) + if candidate.is_dir(): + path = candidate / "tokenizer.json" + if path.exists(): + return str(path) + + # HF cache fallback: /models--name--with--slashes/snapshots//tokenizer.json + try: + from huggingface_hub import try_to_load_from_cache # type: ignore + except ImportError: + raise ValueError( + f"tokenizer.json not found near {name_or_path}; install " + "huggingface_hub or pass an explicit path." + ) + + cached = try_to_load_from_cache(repo_id=name_or_path, filename="tokenizer.json") + if cached is None or cached is False: + raise ValueError( + f"tokenizer.json not available in the local HF cache for {name_or_path}. " + "Run `snapshot_download` first or pass an explicit path." + ) + return str(cached) diff --git a/renderers/qwen3.py b/renderers/qwen3.py index fe97561..d86a180 100644 --- a/renderers/qwen3.py +++ b/renderers/qwen3.py @@ -5,6 +5,14 @@ - Tool calls use JSON format: {"name": "...", "arguments": ...} - Thinking blocks only inserted when loop.last OR reasoning_content present - Generation prompt does NOT add by default + +# Native (Rust) routing + +When ``RENDERERS_NATIVE`` selects ``qwen3`` (see +``renderers._native_router``) and the native extension is available, +``Qwen3Renderer(...)`` returns an instance of the Rust implementation +instead of this Python class. The returned object satisfies the same +duck-typed Renderer protocol, so callers don't need to special-case it. """ from __future__ import annotations @@ -13,6 +21,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -46,11 +59,38 @@ class Qwen3Renderer: """Deterministic message → token renderer for Qwen3 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + # Native routing: when ``RENDERERS_NATIVE`` opts qwen3 into the + # Rust path and the extension is installed, return the native + # instance directly. Otherwise fall through to the pure-Python + # constructor below. + if native_enabled("qwen3"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.qwen3( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, config: Qwen3RendererConfig | None = None, ): + # If __new__ returned a native instance, Python won't call this + # __init__ (different type). For the pure-Python instance, do + # the normal setup. self._tokenizer = tokenizer self.config = config or Qwen3RendererConfig() diff --git a/tests/test_native_parity.py b/tests/test_native_parity.py new file mode 100644 index 0000000..16468e7 --- /dev/null +++ b/tests/test_native_parity.py @@ -0,0 +1,318 @@ +"""Byte-for-byte parity: native (Rust) vs pure-Python. + +For every family that has been ported to Rust, build *both* a +pure-Python renderer and a native renderer from the same tokenizer and +assert their outputs are identical across a representative set of +conversation shapes. + +This complements two existing parity gates: + +- ``tests/test_render_ids.py`` — Python (or, when the env var routes, + native) vs HuggingFace's ``apply_chat_template``. Catches drift from + the upstream reference. Run the suite with + ``RENDERERS_NATIVE=qwen3 pytest tests/test_render_ids.py`` to exercise + the native path through that gate. +- This file — Python vs native, holding the reference fixed. Catches + drift between the two implementations even if HF changes its + template. Cheaper because the HF call isn't on the path. + +Both tests require a real ``tokenizer.json`` on disk. The fixtures here +skip with a clear message when the tokenizer can't be located or the +native extension isn't built — so the test file is safe to import in +sandboxed CI where neither is available. +""" + +from __future__ import annotations + +import os +from typing import Any + +import pytest + +from renderers import _native_router as router + +pytestmark = pytest.mark.parity + + +# ── Test matrix ────────────────────────────────────────────────────── + + +# (model_id, family-key, extra-kwargs) +NATIVE_PARITY_FAMILIES = [ + ("Qwen/Qwen3-8B", "qwen3", {}), +] + + +# ── Fixtures ───────────────────────────────────────────────────────── + + +@pytest.fixture(scope="module") +def native_module(): + mod = router.load_native() + if mod is None: + pytest.skip("renderers_native not built; run `maturin develop`") + return mod + + +@pytest.fixture(scope="module", params=NATIVE_PARITY_FAMILIES, ids=lambda p: p[1]) +def native_pair(request, native_module): + """Return ``(py_renderer, native_renderer, tokenizer)`` for one family.""" + model_id, family, extra = request.param + + # Locate tokenizer.json on disk. Skip cleanly if not in HF cache — + # this test is most useful locally with a real model snapshot. + try: + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer(model_id) + except Exception as exc: + pytest.skip(f"could not load tokenizer for {model_id}: {exc}") + + try: + tok_path = router.resolve_tokenizer_path(tokenizer) + except Exception as exc: + pytest.skip(f"could not resolve tokenizer.json for {model_id}: {exc}") + if not os.path.exists(tok_path): + pytest.skip(f"tokenizer.json missing on disk at {tok_path}") + + # Build the pure-Python renderer with the env var explicitly off so + # the ``__new__`` routing doesn't return a native instance. + saved = os.environ.pop("RENDERERS_NATIVE", None) + try: + if family == "qwen3": + from renderers.qwen3 import Qwen3Renderer + + py_renderer = Qwen3Renderer(tokenizer, **extra) + else: + pytest.skip(f"no python builder wired for {family}") + finally: + if saved is not None: + os.environ["RENDERERS_NATIVE"] = saved + + # Build the native renderer directly through the module surface — + # bypasses the env-var routing entirely. + if family == "qwen3": + native_renderer = native_module.Renderer.qwen3(tok_path, **extra) + else: + pytest.skip(f"no native builder wired for {family}") + + return py_renderer, native_renderer, tokenizer + + +# ── Conversation fixtures (a representative cross-section) ─────────── + + +CONVERSATIONS: list[tuple[str, list[dict[str, Any]]]] = [ + ( + "system_and_user", + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello!"}, + ], + ), + ( + "single_turn", + [ + {"role": "system", "content": "You are a math tutor."}, + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "4"}, + ], + ), + ( + "no_system_message", + [ + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Hi there!"}, + ], + ), + ( + "multi_turn", + [ + {"role": "user", "content": "A"}, + {"role": "assistant", "content": "B"}, + {"role": "user", "content": "C"}, + {"role": "assistant", "content": "D"}, + ], + ), + ( + "reasoning_content_field", + [ + {"role": "user", "content": "What is 2+2?"}, + { + "role": "assistant", + "reasoning_content": "Simple arithmetic", + "content": "4", + }, + ], + ), + ( + "tool_call_single", + [ + {"role": "user", "content": "What's the weather in Paris?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + }, + } + ], + }, + ], + ), + ( + "tool_call_with_response", + [ + {"role": "user", "content": "Weather?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + }, + } + ], + }, + {"role": "tool", "content": "sunny, 22°C"}, + {"role": "assistant", "content": "It's sunny and 22°C in Paris."}, + ], + ), +] + + +TOOLS = [ + { + "name": "get_weather", + "description": "Get current weather for a city.", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + } +] + + +# ── Tests ──────────────────────────────────────────────────────────── + + +@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +def test_render_ids_parity(native_pair, case, messages): + py_renderer, native_renderer, _tok = native_pair + py_ids = list(py_renderer.render_ids(messages)) + rs_ids = list(native_renderer.render_ids(messages)) + assert py_ids == rs_ids, ( + f"render_ids mismatch for {case}:\n" + f" python: {py_ids[:30]}... (len={len(py_ids)})\n" + f" native: {rs_ids[:30]}... (len={len(rs_ids)})" + ) + + +@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +def test_render_ids_with_gen_prompt_parity(native_pair, case, messages): + py_renderer, native_renderer, _tok = native_pair + py_ids = list(py_renderer.render_ids(messages, add_generation_prompt=True)) + rs_ids = list(native_renderer.render_ids(messages, add_generation_prompt=True)) + assert py_ids == rs_ids + + +@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +def test_render_ids_with_tools_parity(native_pair, case, messages): + py_renderer, native_renderer, _tok = native_pair + py_ids = list(py_renderer.render_ids(messages, tools=TOOLS)) + rs_ids = list(native_renderer.render_ids(messages, tools=TOOLS)) + assert py_ids == rs_ids + + +@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +def test_message_indices_parity(native_pair, case, messages): + """Per-token attribution must match — critical for training loss masks.""" + py_renderer, native_renderer, _tok = native_pair + py_out = py_renderer.render(messages) + rs_out = native_renderer.render(messages) + assert list(py_out.token_ids) == list(rs_out.token_ids) + assert list(py_out.message_indices) == list(rs_out.message_indices) + + +def test_stop_token_ids_parity(native_pair): + py_renderer, native_renderer, _tok = native_pair + assert list(py_renderer.get_stop_token_ids()) == list( + native_renderer.get_stop_token_ids() + ) + + +def test_parse_response_no_tool_calls_parity(native_pair): + """Parse a simple text completion through both.""" + py_renderer, native_renderer, _tok = native_pair + # Render a small assistant turn, take the assistant tokens, parse. + msgs = [{"role": "user", "content": "say hi"}] + completion_ids = py_renderer.render_ids( + msgs + [{"role": "assistant", "content": "Hello there!"}] + ) + # Slice out just the assistant section by re-rendering up to the user. + prompt_ids = py_renderer.render_ids(msgs, add_generation_prompt=True) + assistant_ids = completion_ids[len(prompt_ids):] + + py_parsed = py_renderer.parse_response(assistant_ids) + rs_parsed = native_renderer.parse_response(assistant_ids) + assert py_parsed.content == rs_parsed.content + assert (py_parsed.reasoning_content or None) == (rs_parsed.reasoning_content or None) + assert len(py_parsed.tool_calls) == len(rs_parsed.tool_calls) + + +def test_bridge_to_next_turn_parity(native_pair): + py_renderer, native_renderer, _tok = native_pair + initial = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi! How can I help?"}, + ] + prev_prompt_ids = py_renderer.render_ids(initial[:-1], add_generation_prompt=True) + prev_completion_ids = py_renderer.render_ids(initial)[len(prev_prompt_ids):] + new_messages = [{"role": "user", "content": "Tell me about Rust."}] + + py_b = py_renderer.bridge_to_next_turn( + prev_prompt_ids, prev_completion_ids, new_messages + ) + rs_b = native_renderer.bridge_to_next_turn( + prev_prompt_ids, prev_completion_ids, new_messages + ) + + # Either both return None (refused) or both produce identical tokens. + if py_b is None: + assert rs_b is None + return + assert rs_b is not None + assert list(py_b.token_ids) == list(rs_b.token_ids) + + +def test_bridge_refuses_assistant_in_extension(native_pair): + py_renderer, native_renderer, _tok = native_pair + initial = [{"role": "user", "content": "Hi"}] + prompt_ids = py_renderer.render_ids(initial, add_generation_prompt=True) + completion_ids = list(py_renderer.get_stop_token_ids())[:1] + + # Assistant in the extension → both must return None. + assert ( + py_renderer.bridge_to_next_turn( + prompt_ids, + completion_ids, + [{"role": "assistant", "content": "x"}], + ) + is None + ) + assert ( + native_renderer.bridge_to_next_turn( + prompt_ids, + completion_ids, + [{"role": "assistant", "content": "x"}], + ) + is None + ) diff --git a/tests/test_native_router.py b/tests/test_native_router.py new file mode 100644 index 0000000..6acbe5e --- /dev/null +++ b/tests/test_native_router.py @@ -0,0 +1,106 @@ +"""Unit tests for the Python/native routing layer. + +These are isolated from the inference engines and don't require a +network connection — they exercise just the env-var parsing, the +lazy import, and (where the wheel is built) the native module's +class surface. +""" + +from __future__ import annotations + +import os +from unittest import mock + +import pytest + +from renderers import _native_router as router + + +def test_native_disabled_by_default(): + with mock.patch.dict(os.environ, {}, clear=True): + assert not router.native_enabled("qwen3") + + +@pytest.mark.parametrize("value", ["", "0"]) +def test_native_off_values(value): + with mock.patch.dict(os.environ, {"RENDERERS_NATIVE": value}, clear=True): + assert not router.native_enabled("qwen3") + + +@pytest.mark.parametrize("value", ["1", "all"]) +def test_native_on_global(value): + with mock.patch.dict(os.environ, {"RENDERERS_NATIVE": value}, clear=True): + assert router.native_enabled("qwen3") + assert router.native_enabled("qwen35") + assert router.native_enabled("glm5") + + +def test_native_csv_specific_families(): + with mock.patch.dict( + os.environ, {"RENDERERS_NATIVE": "qwen3,glm5"}, clear=True + ): + assert router.native_enabled("qwen3") + assert router.native_enabled("glm5") + assert not router.native_enabled("qwen35") + + +def test_native_csv_whitespace_tolerant(): + with mock.patch.dict( + os.environ, {"RENDERERS_NATIVE": " qwen3 , glm5 "}, clear=True + ): + assert router.native_enabled("qwen3") + assert router.native_enabled("glm5") + + +def test_load_native_caches_result(): + # Reset the loader cache for the test. + router._NATIVE_MODULE = None + router._NATIVE_LOAD_ATTEMPTED = False + first = router.load_native() + second = router.load_native() + assert first is second # cached + + +def test_resolve_tokenizer_path_from_string(tmp_path): + # Pass a directory containing tokenizer.json — get the file path back. + (tmp_path / "tokenizer.json").write_text("{}") + assert router.resolve_tokenizer_path(str(tmp_path)).endswith("tokenizer.json") + + +def test_resolve_tokenizer_path_from_exact_file(tmp_path): + f = tmp_path / "tokenizer.json" + f.write_text("{}") + # Pass a file path directly — return as-is. + assert router.resolve_tokenizer_path(str(f)) == str(f) + + +# ── Native module surface (only runs when the wheel is built) ──────── + + +@pytest.fixture +def native(): + mod = router.load_native() + if mod is None: + pytest.skip("renderers_native not built; run `maturin develop`") + return mod + + +def test_native_exports(native): + # The five classes the Python shim relies on. + for name in ( + "Renderer", + "RenderedTokens", + "ParsedResponse", + "ParsedToolCall", + "ToolCallParseStatus", + ): + assert hasattr(native, name), f"missing {name}" + + +def test_native_status_constants(native): + s = native.ToolCallParseStatus + assert s.OK == "ok" + assert s.INVALID_JSON == "invalid_json" + assert s.UNCLOSED_BLOCK == "unclosed_block" + assert s.MISSING_NAME == "missing_name" + assert s.MALFORMED_STRUCTURE == "malformed_structure" From 57fc44a5dbbb88504e07af79439950e3bcc60a85 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:44:12 +0200 Subject: [PATCH 03/35] Add Qwen3.5 native parity path --- crates/README.md | 2 +- crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-core/src/families/qwen35.rs | 531 +++++++++++++++++++ crates/renderers-core/src/parsing/mod.rs | 1 + crates/renderers-core/src/parsing/qwen35.rs | 213 ++++++++ crates/renderers-core/src/registry.rs | 17 +- crates/renderers-py/src/lib.rs | 38 +- renderers/qwen35.py | 34 ++ tests/test_native_parity.py | 7 + 9 files changed, 842 insertions(+), 3 deletions(-) create mode 100644 crates/renderers-core/src/families/qwen35.rs create mode 100644 crates/renderers-core/src/parsing/qwen35.rs diff --git a/crates/README.md b/crates/README.md index b249442..1ab5179 100644 --- a/crates/README.md +++ b/crates/README.md @@ -74,7 +74,7 @@ shim logs a one-shot info message and falls back to Python. | Family | Status | | ------------ | ----------------------------------------------- | | Qwen3 | ✅ ported (Phase 2) | -| Qwen3.5 | planned (Phase 3) | +| Qwen3.5 | ✅ ported text-only (Phase 3) — multimodal Phase 5 | | GLM 4.5 / 5 | planned (Phase 3) | | DeepSeek V3 | planned (Phase 3) | | Nemotron3 | planned (Phase 3) | diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index 0a507d3..843471a 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -5,5 +5,7 @@ //! registry entry in [`crate::registry`]. pub mod qwen3; +pub mod qwen35; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; +pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs new file mode 100644 index 0000000..35c4b67 --- /dev/null +++ b/crates/renderers-core/src/families/qwen35.rs @@ -0,0 +1,531 @@ +//! Qwen3.5 renderer (text-only). Port of `renderers/qwen35.py` minus the +//! multimodal path; multimodal lands in Phase 5 with the vision processor. +//! +//! Differences from Qwen3: +//! +//! - `` / `` are **special tokens**, not text tags. +//! - Tool calls use XML format with `` and +//! `` blocks. +//! - System prompt includes a verbose tool-instructions block. +//! - Generation prompt prefills `\n` (or the empty-think block +//! when `enable_thinking` is false), with polarity defaulting to +//! `enable_thinking=true` for big-size models. +//! +//! `enable_thinking` polarity detection (which the Python implementation +//! probes via a one-shot `apply_chat_template` call) is **not** done in +//! Rust — the caller passes it explicitly through the builder. The +//! Python shim handles the polarity probe and forwards the result. + +use serde_json::json; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::qwen35::parse_qwen35; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, +}; + +const TOOLS_HEADER: &str = "# Tools\n\nYou have access to the following functions:\n\n"; +const TOOLS_FOOTER: &str = "\n"; +const TOOLS_INSTRUCTIONS: &str = "\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n"; + +#[derive(Debug, Clone)] +pub struct Qwen35RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for Qwen35RendererBuilder { + fn default() -> Self { + Self { + // Big-size model default. The Python shim probes the tokenizer's + // Jinja template to discover the per-model polarity; callers can + // pass an explicit override here. + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl Qwen35RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + Qwen35Renderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct Qwen35Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + im_start: u32, + im_end: u32, + #[allow(dead_code)] + endoftext: u32, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + tool_response: u32, + tool_response_end: u32, + + stop_tokens: Vec, +} + +impl Qwen35Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + Qwen35RendererBuilder::default().build(tokenizer) + } + + pub fn builder() -> Qwen35RendererBuilder { + Qwen35RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: Qwen35RendererBuilder) -> Result { + let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; + let think = tokenizer.token_to_id_strict("")?; + let think_end = tokenizer.token_to_id_strict("")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let tool_response = tokenizer.token_to_id_strict("")?; + let tool_response_end = tokenizer.token_to_id_strict("")?; + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_start, + im_end, + endoftext, + think, + think_end, + tool_call, + tool_call_end, + tool_response, + tool_response_end, + stop_tokens: vec![im_end, endoftext], + }) + } + + /// Index of the most recent non-tool-response user message; + /// `messages.len()` when none — that out-of-range value makes + /// `msg_idx > last_query_index` uniformly false, matching the + /// Python contract. + fn last_query_index(messages: &[Message]) -> i32 { + for (i, msg) in messages.iter().enumerate().rev() { + if msg.role != "user" { + continue; + } + let content = msg.text_content().trim(); + if !(content.starts_with("") && content.ends_with("")) { + return i as i32; + } + } + messages.len() as i32 + } + + fn emit_system_with_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + tools: &[ToolSpec], + first_is_system: bool, + ) -> Result<(), RenderError> { + let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; + buf.special(self.im_start, sys_idx); + buf.text("system\n", sys_idx)?; + + let mut tool_text = String::with_capacity(TOOLS_HEADER.len() + TOOLS_INSTRUCTIONS.len() + 256); + tool_text.push_str(TOOLS_HEADER); + for tool in tools { + tool_text.push('\n'); + let spec = json!({ + "name": tool.name, + "description": tool.description, + "parameters": tool.parameters, + }); + tool_text.push_str(&serde_json::to_string(&spec).map_err(|e| { + RenderError::Invalid(format!("tool spec serialisation failed: {e}")) + })?); + } + tool_text.push_str(TOOLS_FOOTER); + tool_text.push_str(TOOLS_INSTRUCTIONS); + + if first_is_system { + let sys_content = messages[0].text_content().trim(); + if !sys_content.is_empty() { + tool_text.push_str("\n\n"); + tool_text.push_str(sys_content); + } + } + + buf.text(&tool_text, sys_idx)?; + buf.special(self.im_end, sys_idx); + buf.text("\n", sys_idx)?; + Ok(()) + } + + fn emit_system_no_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + ) -> Result<(), RenderError> { + let content = messages[0].text_content().trim(); + buf.special(self.im_start, 0); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, 0)?; + buf.special(self.im_end, 0); + buf.text("\n", 0)?; + Ok(()) + } + + fn emit_user( + &self, + buf: &mut RenderBuf<'_>, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + msg_idx: usize, + content: &str, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = + msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let idx = msg_idx as i32; + + if !prev_is_tool { + buf.special(self.im_start, idx); + buf.text("user", idx)?; + } + buf.text("\n", idx)?; + buf.special(self.tool_response, idx); + let mut wrapped = String::with_capacity(content.len() + 2); + wrapped.push('\n'); + wrapped.push_str(content); + wrapped.push('\n'); + buf.text(&wrapped, idx)?; + buf.special(self.tool_response_end, idx); + if !next_is_tool { + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + } + Ok(()) + } + + fn render_arg_value(arg_value: &serde_json::Value) -> String { + // Mirrors the Python `_render_arg_value`: dict/list → compact + // JSON; everything else → `str(...)` (Python's str() for bool, + // int, None, etc. — handled here by matching each scalar). + match arg_value { + serde_json::Value::Object(_) | serde_json::Value::Array(_) => { + serde_json::to_string(arg_value).unwrap_or_default() + } + serde_json::Value::String(s) => s.clone(), + serde_json::Value::Bool(b) => { + // Python str(True) == "True", str(False) == "False" + if *b { "True".to_string() } else { "False".to_string() } + } + serde_json::Value::Null => "None".to_string(), + serde_json::Value::Number(n) => n.to_string(), + } + } + + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: usize, + last_query_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let raw_content = msg.text_content(); + let (reasoning_content, content_after) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let reasoning = if let Some((_, inner)) = before.rsplit_once("") { + inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + } else { + before.trim_start_matches('\n').trim_end_matches('\n').to_string() + }; + (reasoning, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + let reasoning_content = reasoning_content.trim().to_string(); + let content = content_after.trim().to_string(); + + let idx = msg_idx as i32; + buf.special(self.im_start, idx); + + let emit_thinking = (msg_idx as i32) > last_query_index + || (preserve_thinking && !reasoning_content.is_empty()); + + if emit_thinking { + buf.text("assistant\n", idx)?; + buf.special(self.think, idx); + let mut s = String::with_capacity(reasoning_content.len() + 2); + s.push('\n'); + s.push_str(&reasoning_content); + s.push('\n'); + buf.text(&s, idx)?; + buf.special(self.think_end, idx); + let mut tail = String::with_capacity(content.len() + 2); + tail.push_str("\n\n"); + tail.push_str(&content); + buf.text(&tail, idx)?; + } else { + let mut s = String::with_capacity(content.len() + 10); + s.push_str("assistant\n"); + s.push_str(&content); + buf.text(&s, idx)?; + } + + for (tc_idx, tc) in msg.tool_calls.iter().enumerate() { + let name = tc.function.name.as_str(); + // Separator before this tool call + if tc_idx == 0 { + if !content.is_empty() { + buf.text("\n\n", idx)?; + } + } else { + buf.text("\n", idx)?; + } + + buf.special(self.tool_call, idx); + let mut payload = String::with_capacity(name.len() + 32); + payload.push_str("\n\n"); + buf.text(&payload, idx)?; + + // Arguments — accept JSON string (decode first) or object + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => serde_json::from_str(s) + .unwrap_or(serde_json::Value::Object(Default::default())), + }; + if let Some(obj) = args_value.as_object() { + for (arg_name, arg_value) in obj { + let value_str = Self::render_arg_value(arg_value); + let mut param = String::with_capacity(arg_name.len() + value_str.len() + 24); + param.push_str("\n"); + param.push_str(&value_str); + param.push_str("\n\n"); + buf.text(¶m, idx)?; + } + } + + buf.text("\n", idx)?; + buf.special(self.tool_call_end, idx); + } + + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + Ok(()) + } + + fn emit_generation_prompt(&self, buf: &mut RenderBuf<'_>) -> Result<(), RenderError> { + buf.scaffold_special(self.im_start); + buf.scaffold_text("assistant\n")?; + if self.enable_thinking { + buf.scaffold_special(self.think); + buf.scaffold_text("\n")?; + } else { + buf.scaffold_special(self.think); + buf.scaffold_text("\n\n")?; + buf.scaffold_special(self.think_end); + buf.scaffold_text("\n\n")?; + } + Ok(()) + } + + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + let base = messages.len().max(1) * 256; + let tools_bonus = tools.map(|t| 256 * t.len().max(1) + 512).unwrap_or(0); + base + tools_bonus + } +} + +impl Renderer for Qwen35Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + + let first_is_system = messages[0].role == "system"; + + match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(&mut buf, messages, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(&mut buf, messages)?; + } + } + } + + let last_qi = Self::last_query_index(messages); + + for (i, msg) in messages.iter().enumerate() { + let content = msg.text_content().trim(); + match msg.role.as_str() { + "system" => { + if i != 0 { + return Err(RenderError::Invalid( + "system message must be at the beginning".into(), + )); + } + // Already handled above + } + "user" => self.emit_user(&mut buf, content, i as i32)?, + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(&mut buf, msg, i, last_qi, preserve_thinking)?; + } + "tool" => self.emit_tool(&mut buf, messages, i, content)?, + _ => { + return Err(RenderError::Invalid(format!( + "unexpected message role: {}", + msg.role + ))); + } + } + } + + if add_generation_prompt { + self.emit_generation_prompt(&mut buf)?; + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_qwen35( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(new_messages, None)); + // Trailing newline that the prior render emitted but vLLM stopped on + buf.scaffold_text("\n")?; + + for (i, msg) in new_messages.iter().enumerate() { + let content = msg.text_content().trim(); + let idx = i as i32; + match msg.role.as_str() { + "user" => self.emit_user(&mut buf, content, idx)?, + "system" => { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + } + "tool" => self.emit_tool(&mut buf, new_messages, i, content)?, + _ => return Ok(None), + } + } + + self.emit_generation_prompt(&mut buf)?; + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs index 59b81ea..77ad38c 100644 --- a/crates/renderers-core/src/parsing/mod.rs +++ b/crates/renderers-core/src/parsing/mod.rs @@ -10,6 +10,7 @@ //! they vanish into the family parsers at -O. pub mod qwen3; +pub mod qwen35; use crate::tokenizer::Tokenizer; use crate::types::RenderError; diff --git a/crates/renderers-core/src/parsing/qwen35.rs b/crates/renderers-core/src/parsing/qwen35.rs new file mode 100644 index 0000000..1d29683 --- /dev/null +++ b/crates/renderers-core/src/parsing/qwen35.rs @@ -0,0 +1,213 @@ +//! Qwen3.5 tool-call parser — XML-style tool calls with special-token thinking. +//! +//! Port of `renderers/parsing.py:parse_qwen35` + `_parse_xml_tool_calls`. +//! +//! Structural shape: +//! +//! ```text +//! +//! ...reasoning text... +//! +//! +//! ...content text... +//! +//! +//! +//! +//! value1 +//! +//! +//! value2 +//! +//! +//! +//! ``` +//! +//! `` and `` are special tokens. Tool-call block contents are +//! parsed by regex on the decoded text — but the regex only runs inside the +//! bounded `...` span, never on the full completion. + +use std::ops::Range; +use std::sync::LazyLock; + +use regex::Regex; + +use crate::parsing::{decode, find, find_from, strip_stop_tokens}; +use crate::tokenizer::Tokenizer; +use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus}; + +static FUNCTION_NAME_RE: LazyLock = + LazyLock::new(|| Regex::new(r"]+)>").expect("function-name regex")); + +static PARAMETER_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?s)]+)>\n?(.*?)\n?").expect("parameter regex") +}); + +#[allow(clippy::too_many_arguments)] +pub fn parse_qwen35( + tokenizer: &Tokenizer, + token_ids: &[u32], + stop_ids: &[u32], + think_id: u32, + think_end_id: u32, + tool_call_id: u32, + tool_call_end_id: u32, +) -> ParsedResponse { + let ids = strip_stop_tokens(token_ids, stop_ids); + + // ── Thinking: find by token ID ───────────────────────── + let mut reasoning: Option = None; + let mut parse_offset: usize = 0; + let working_ids: Vec; + let ids_after_think: &[u32] = match find(ids, think_end_id) { + Some(think_end) => { + // Filter out think_id tokens from the reasoning span so the + // decoded text doesn't include the opening marker. + let reasoning_ids: Vec = ids[..think_end] + .iter() + .copied() + .filter(|&t| t != think_id) + .collect(); + let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); + reasoning = Some(txt.trim().to_string()); + parse_offset = think_end + 1; + &ids[think_end + 1..] + } + None => { + // present but no — truncated reasoning; + // return early with reasoning-only response. + if let Some(think_start) = find(ids, think_id) { + let txt = decode(tokenizer, &ids[think_start + 1..]).unwrap_or_default(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; + } + working_ids = ids.to_vec(); + &working_ids + } + }; + + // ── Tool calls (token-bounded, regex-on-decoded-span) ─────────── + let (content_text, tool_calls) = match find(ids_after_think, tool_call_id) { + Some(tc_start) => { + let content = decode(tokenizer, &ids_after_think[..tc_start]) + .unwrap_or_default() + .trim() + .to_string(); + let tcs = parse_xml_tool_calls( + tokenizer, + &ids_after_think[tc_start..], + tool_call_id, + tool_call_end_id, + parse_offset + tc_start, + ); + (content, tcs) + } + None => { + let content = decode(tokenizer, ids_after_think) + .unwrap_or_default() + .trim() + .to_string(); + (content, Vec::new()) + } + }; + + ParsedResponse { + content: content_text, + reasoning_content: reasoning.filter(|s| !s.is_empty()), + tool_calls, + } +} + +fn parse_xml_tool_calls( + tokenizer: &Tokenizer, + ids: &[u32], + tc_id: u32, + tc_end_id: u32, + section_offset: usize, +) -> Vec { + let mut out: Vec = Vec::new(); + let mut i = 0usize; + + while i < ids.len() { + if ids[i] != tc_id { + i += 1; + continue; + } + let span_start = section_offset + i; + + let end = match find_from(ids, tc_end_id, i + 1) { + Some(end) => end, + None => { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + out.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: span_start, + end: section_offset + ids.len(), + }), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; + } + }; + + let block_text = decode(tokenizer, &ids[i + 1..end]).unwrap_or_default(); + let span = Range { + start: span_start, + end: section_offset + end + 1, + }; + + let name_match = match FUNCTION_NAME_RE.captures(&block_text) { + Some(c) => c, + None => { + out.push(ParsedToolCall { + raw: block_text, + token_span: Some(span), + status: ToolCallParseStatus::MalformedStructure, + ..Default::default() + }); + i = end + 1; + continue; + } + }; + let name = name_match.get(1).map(|m| m.as_str().to_string()).unwrap_or_default(); + + let mut arguments = serde_json::Map::new(); + let mut any_json_fallback = false; + for pm in PARAMETER_RE.captures_iter(&block_text) { + let arg_name = pm.get(1).map(|m| m.as_str()).unwrap_or("").to_string(); + let arg_value = pm.get(2).map(|m| m.as_str().trim()).unwrap_or(""); + match serde_json::from_str::(arg_value) { + Ok(v) => { + arguments.insert(arg_name, v); + } + Err(_) => { + arguments.insert(arg_name, serde_json::Value::String(arg_value.to_string())); + any_json_fallback = true; + } + } + } + + let status = if any_json_fallback { + ToolCallParseStatus::InvalidJson + } else { + ToolCallParseStatus::Ok + }; + + out.push(ParsedToolCall { + raw: block_text, + name: if name.is_empty() { None } else { Some(name) }, + arguments: Some(ToolArguments::Object(serde_json::Value::Object(arguments))), + token_span: Some(span), + status, + ..Default::default() + }); + i = end + 1; + } + + out +} diff --git a/crates/renderers-core/src/registry.rs b/crates/renderers-core/src/registry.rs index 6e27c3c..f7184d4 100644 --- a/crates/renderers-core/src/registry.rs +++ b/crates/renderers-core/src/registry.rs @@ -4,7 +4,7 @@ //! families ported to Rust so far. New families slot in by adding a //! match arm in [`create_renderer`]. -use crate::families::Qwen3Renderer; +use crate::families::{Qwen35Renderer, Qwen3Renderer}; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::RenderError; @@ -14,12 +14,14 @@ use crate::types::RenderError; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum RendererKind { Qwen3, + Qwen35, } impl RendererKind { pub fn from_str(name: &str) -> Option { match name { "qwen3" | "Qwen3" => Some(Self::Qwen3), + "qwen35" | "qwen3.5" | "Qwen3.5" => Some(Self::Qwen35), _ => None, } } @@ -30,6 +32,10 @@ impl RendererKind { pub struct RendererConfig { pub preserve_all_thinking: bool, pub preserve_thinking_between_tool_calls: bool, + /// `None` keeps the family default; the Qwen3.5 Python shim probes + /// the tokenizer's Jinja template to pick the right polarity and + /// forwards the result here so the Rust side stays template-agnostic. + pub enable_thinking: Option, } /// Build a renderer of the requested kind backed by `tokenizer`. @@ -45,5 +51,14 @@ pub fn create_renderer( .preserve_thinking_between_tool_calls(cfg.preserve_thinking_between_tool_calls) .build(tokenizer)?, )), + RendererKind::Qwen35 => { + let mut b = Qwen35Renderer::builder() + .preserve_all_thinking(cfg.preserve_all_thinking) + .preserve_thinking_between_tool_calls(cfg.preserve_thinking_between_tool_calls); + if let Some(en) = cfg.enable_thinking { + b = b.enable_thinking(en); + } + Ok(Box::new(b.build(tokenizer)?)) + } } } diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 7a29bb2..b669517 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -13,7 +13,7 @@ use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; -use renderers_core::families::Qwen3RendererBuilder; +use renderers_core::families::{Qwen35RendererBuilder, Qwen3RendererBuilder}; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, ToolCallParseStatus, @@ -269,6 +269,42 @@ impl PyRenderer { }) } + /// Build a Qwen3.5 renderer (text-only path) from a tokenizer.json. + /// + /// `enable_thinking` defaults to `True` (big-size variant). The Python + /// shim is expected to probe the tokenizer's Jinja template to pick + /// the right polarity for 0.8B / 2B models and forward it explicitly. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn qwen35( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + Qwen35RendererBuilder::default() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { + inner: Arc::new(renderer), + }) + } + #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))] fn render( &self, diff --git a/renderers/qwen35.py b/renderers/qwen35.py index b3c6af7..fc706a4 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -19,6 +19,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, MultiModalData, @@ -109,6 +114,35 @@ def _default_enable_thinking(tokenizer) -> bool: class Qwen35Renderer: """Deterministic message → token renderer for Qwen3.5 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + processor: Any = None, + enable_thinking: bool | None = None, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + image_cache_max: int = 256, + ): + # Route to native only when: + # 1. the user opted in via RENDERERS_NATIVE, + # 2. the wheel is installed, + # 3. the message stream is text-only (no processor / images). + # Phase 5 will lift restriction 3. + if native_enabled("qwen35") and processor is None: + native = load_native() + if native is not None: + if enable_thinking is None: + enable_thinking = _detect_enable_thinking_default(tokenizer) + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.qwen35( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + _config_cls: type = Qwen35RendererConfig def __init__( diff --git a/tests/test_native_parity.py b/tests/test_native_parity.py index 16468e7..2dab862 100644 --- a/tests/test_native_parity.py +++ b/tests/test_native_parity.py @@ -40,6 +40,7 @@ # (model_id, family-key, extra-kwargs) NATIVE_PARITY_FAMILIES = [ ("Qwen/Qwen3-8B", "qwen3", {}), + ("Qwen/Qwen3.5-9B", "qwen35", {}), ] @@ -83,6 +84,10 @@ def native_pair(request, native_module): from renderers.qwen3 import Qwen3Renderer py_renderer = Qwen3Renderer(tokenizer, **extra) + elif family == "qwen35": + from renderers.qwen35 import Qwen35Renderer + + py_renderer = Qwen35Renderer(tokenizer, **extra) else: pytest.skip(f"no python builder wired for {family}") finally: @@ -93,6 +98,8 @@ def native_pair(request, native_module): # bypasses the env-var routing entirely. if family == "qwen3": native_renderer = native_module.Renderer.qwen3(tok_path, **extra) + elif family == "qwen35": + native_renderer = native_module.Renderer.qwen35(tok_path, **extra) else: pytest.skip(f"no native builder wired for {family}") From 930775f10d7897b6d4f9c06a0e22b5f9cc4c3c3e Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:44:20 +0200 Subject: [PATCH 04/35] Add DeepSeek V3 native parity path --- crates/README.md | 2 +- .../src/families/deepseek_v3.rs | 392 ++++++++++++++++++ crates/renderers-core/src/families/mod.rs | 2 + .../renderers-core/src/parsing/deepseek_v3.rs | 194 +++++++++ crates/renderers-core/src/parsing/mod.rs | 1 + crates/renderers-core/src/registry.rs | 11 +- crates/renderers-py/src/lib.rs | 30 +- renderers/deepseek_v3.py | 23 + tests/test_native_parity.py | 7 + 9 files changed, 659 insertions(+), 3 deletions(-) create mode 100644 crates/renderers-core/src/families/deepseek_v3.rs create mode 100644 crates/renderers-core/src/parsing/deepseek_v3.rs diff --git a/crates/README.md b/crates/README.md index 1ab5179..5d02547 100644 --- a/crates/README.md +++ b/crates/README.md @@ -76,7 +76,7 @@ shim logs a one-shot info message and falls back to Python. | Qwen3 | ✅ ported (Phase 2) | | Qwen3.5 | ✅ ported text-only (Phase 3) — multimodal Phase 5 | | GLM 4.5 / 5 | planned (Phase 3) | -| DeepSeek V3 | planned (Phase 3) | +| DeepSeek V3 | ✅ ported (Phase 3) | | Nemotron3 | planned (Phase 3) | | Kimi K2 | planned (Phase 4) | | Kimi K2.5 | planned (Phase 4 — text; multimodal Phase 5) | diff --git a/crates/renderers-core/src/families/deepseek_v3.rs b/crates/renderers-core/src/families/deepseek_v3.rs new file mode 100644 index 0000000..064b9e2 --- /dev/null +++ b/crates/renderers-core/src/families/deepseek_v3.rs @@ -0,0 +1,392 @@ +//! DeepSeek V3 renderer. Port of `renderers/deepseek_v3.py`. +//! +//! Key differences from the Qwen-family renderers: +//! +//! - Special tokens use **fullwidth Unicode** delimiters (`|` = U+FF5C, +//! `▁` = U+2581). Token names are e.g. `<|begin▁of▁sentence|>`. +//! - **Implicit role markers** — `<|User|>` and `<|Assistant|>` carry the +//! role themselves; there's no role-name text after the marker the way +//! Qwen has `<|im_start|>user\n`. +//! - **All leading system messages are concatenated** with `\n\n` and +//! emitted as plain text *before* the first non-system role token (no +//! marker for the system block). +//! - Thinking is plain text `...` tags, not special tokens. +//! - Tool calls live in `<|tool▁calls▁begin|>...<|tool▁calls▁end|>` with +//! each call as `<|tool▁call▁begin|>function<|tool▁sep|>name\n +//! ` ```json\n{args}\n``` `<|tool▁call▁end|>`. + +use serde_json::Value as JsonValue; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::deepseek_v3::parse_deepseek_v3; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, +}; + +const SEP: char = '\u{FF5C}'; // | +const US: char = '\u{2581}'; // ▁ + +fn ds_token(name: &str) -> String { + let mut s = String::with_capacity(name.len() + 4); + s.push('<'); + s.push(SEP); + s.push_str(name); + s.push(SEP); + s.push('>'); + s +} + +#[derive(Debug, Clone)] +pub struct DeepSeekV3RendererBuilder { + enable_thinking: bool, +} + +impl Default for DeepSeekV3RendererBuilder { + fn default() -> Self { + Self { enable_thinking: true } + } +} + +impl DeepSeekV3RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + DeepSeekV3Renderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct DeepSeekV3Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + + bos: u32, + eos: u32, + user_token: u32, + assistant_token: u32, + tool_calls_begin: u32, + tool_calls_end: u32, + tool_call_begin: u32, + tool_call_end: u32, + tool_sep: u32, + tool_outputs_begin: u32, + tool_outputs_end: u32, + tool_output_begin: u32, + tool_output_end: u32, + + stop_tokens: Vec, +} + +impl DeepSeekV3Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + DeepSeekV3RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> DeepSeekV3RendererBuilder { + DeepSeekV3RendererBuilder::default() + } + + /// Encode a DeepSeek special token via the tokenizer's encode path and + /// assert it maps to exactly one id. Matches the Python + /// `_get_special_token` helper — required because the tokenizer + /// doesn't expose these by `token_to_id` directly (the fullwidth + /// characters are part of the BPE vocab as a single piece). + fn resolve(tokenizer: &Tokenizer, name: &str) -> Result { + let token_str = ds_token(name); + let encoded = tokenizer.encode_no_special(&token_str)?; + let ids = encoded.as_slice(); + if ids.len() != 1 { + return Err(RenderError::MissingSpecialToken(token_str)); + } + Ok(ids[0]) + } + + fn new_with( + tokenizer: Tokenizer, + cfg: DeepSeekV3RendererBuilder, + ) -> Result { + let bos = Self::resolve(&tokenizer, &format!("begin{US}of{US}sentence"))?; + let eos = Self::resolve(&tokenizer, &format!("end{US}of{US}sentence"))?; + let user_token = Self::resolve(&tokenizer, "User")?; + let assistant_token = Self::resolve(&tokenizer, "Assistant")?; + let tool_calls_begin = Self::resolve(&tokenizer, &format!("tool{US}calls{US}begin"))?; + let tool_calls_end = Self::resolve(&tokenizer, &format!("tool{US}calls{US}end"))?; + let tool_call_begin = Self::resolve(&tokenizer, &format!("tool{US}call{US}begin"))?; + let tool_call_end = Self::resolve(&tokenizer, &format!("tool{US}call{US}end"))?; + let tool_sep = Self::resolve(&tokenizer, &format!("tool{US}sep"))?; + let tool_outputs_begin = Self::resolve(&tokenizer, &format!("tool{US}outputs{US}begin"))?; + let tool_outputs_end = Self::resolve(&tokenizer, &format!("tool{US}outputs{US}end"))?; + let tool_output_begin = Self::resolve(&tokenizer, &format!("tool{US}output{US}begin"))?; + let tool_output_end = Self::resolve(&tokenizer, &format!("tool{US}output{US}end"))?; + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + bos, + eos, + user_token, + assistant_token, + tool_calls_begin, + tool_calls_end, + tool_call_begin, + tool_call_end, + tool_sep, + tool_outputs_begin, + tool_outputs_end, + tool_output_begin, + tool_output_end, + stop_tokens: vec![eos], + }) + } + + fn args_to_json_string(args: &ToolArguments) -> String { + match args { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_else(|_| "{}".into()), + } + } + + fn estimate_capacity(messages: &[Message]) -> usize { + messages.len().max(1) * 256 + 64 + } +} + +impl Renderer for DeepSeekV3Renderer { + fn render( + &self, + messages: &[Message], + _tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages)); + + // BOS + buf.scaffold_special(self.bos); + + // Leading system messages: concat with "\n\n", emit as plain text + // before any role marker, attributed to message index 0. + let mut first_non_sys = 0usize; + let mut sys_parts: Vec<&str> = Vec::new(); + for msg in messages.iter() { + if msg.role != "system" { + break; + } + sys_parts.push(msg.text_content()); + first_non_sys += 1; + } + if !sys_parts.is_empty() { + let joined = sys_parts.join("\n\n"); + buf.text(&joined, 0)?; + } + + for (i, msg) in messages.iter().enumerate().skip(first_non_sys) { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "system" => { + // Post-initial system → treat as user + buf.special(self.user_token, idx); + buf.text(content, idx)?; + } + "user" => { + buf.special(self.user_token, idx); + buf.text(content, idx)?; + } + "assistant" => self.emit_assistant(&mut buf, msg, i, messages)?, + "tool" => self.emit_tool(&mut buf, messages, i)?, + _ => {} // mirror Python: silent skip on unknown role + } + } + + // Generation prompt — skip <|Assistant|> after a tool output + if add_generation_prompt { + let last_role = messages.last().map(|m| m.role.as_str()).unwrap_or(""); + if last_role != "tool" { + buf.scaffold_special(self.assistant_token); + } + if self.enable_thinking { + buf.scaffold_text("\n")?; + } + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_deepseek_v3( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_calls_begin, + self.tool_calls_end, + self.tool_call_begin, + self.tool_call_end, + self.tool_sep, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.eos), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(new_messages)); + + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "user" | "system" => { + buf.special(self.user_token, idx); + buf.text(content, idx)?; + } + "tool" => { + let prev_is_tool = i > 0 && new_messages[i - 1].role == "tool"; + let next_is_tool = + i + 1 < new_messages.len() && new_messages[i + 1].role == "tool"; + if !prev_is_tool { + buf.special(self.tool_outputs_begin, idx); + } + buf.special(self.tool_output_begin, idx); + buf.text(content, idx)?; + buf.special(self.tool_output_end, idx); + if !next_is_tool { + buf.special(self.tool_outputs_end, idx); + } + } + _ => return Ok(None), + } + } + + let last_role = new_messages + .last() + .map(|m| m.role.as_str()) + .unwrap_or(""); + if last_role != "tool" { + buf.scaffold_special(self.assistant_token); + } + if self.enable_thinking { + buf.scaffold_text("\n")?; + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl DeepSeekV3Renderer { + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: usize, + messages: &[Message], + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let idx = msg_idx as i32; + + // Build the content text, with reasoning_content wrapped in if present + let mut content = msg.text_content().to_string(); + if let Some(reasoning) = msg.reasoning_content.as_deref() { + if !reasoning.is_empty() { + let mut wrapped = String::with_capacity(reasoning.len() + content.len() + 16); + wrapped.push_str(""); + wrapped.push_str(reasoning); + wrapped.push_str(""); + wrapped.push_str(&content); + content = wrapped; + } + } + + if !prev_is_tool { + buf.special(self.assistant_token, idx); + } + + // Pre-tool-call content + buf.text(&content, idx)?; + + if !msg.tool_calls.is_empty() { + buf.special(self.tool_calls_begin, idx); + for tc in &msg.tool_calls { + let name = tc.function.name.as_str(); + let args_str = Self::args_to_json_string(&tc.function.arguments); + let _ = JsonValue::Null; // keep import in scope for future use + + buf.special(self.tool_call_begin, idx); + buf.text("function", idx)?; + buf.special(self.tool_sep, idx); + let mut payload = String::with_capacity(name.len() + args_str.len() + 16); + payload.push_str(name); + payload.push_str("\n```json\n"); + payload.push_str(&args_str); + payload.push_str("\n```"); + buf.text(&payload, idx)?; + buf.special(self.tool_call_end, idx); + } + buf.special(self.tool_calls_end, idx); + } + + buf.special(self.eos, idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + msg_idx: usize, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = + msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let idx = msg_idx as i32; + let content = messages[msg_idx].text_content(); + + if !prev_is_tool { + buf.special(self.tool_outputs_begin, idx); + } + buf.special(self.tool_output_begin, idx); + buf.text(content, idx)?; + buf.special(self.tool_output_end, idx); + if !next_is_tool { + buf.special(self.tool_outputs_end, idx); + } + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index 843471a..8506644 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -4,8 +4,10 @@ //! stays focused. New families slot in by adding a module here and a //! registry entry in [`crate::registry`]. +pub mod deepseek_v3; pub mod qwen3; pub mod qwen35; +pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; diff --git a/crates/renderers-core/src/parsing/deepseek_v3.rs b/crates/renderers-core/src/parsing/deepseek_v3.rs new file mode 100644 index 0000000..b4d994e --- /dev/null +++ b/crates/renderers-core/src/parsing/deepseek_v3.rs @@ -0,0 +1,194 @@ +//! DeepSeek V3 tool-call parser. Port of +//! `renderers/parsing.py:parse_deepseek_v3` + `_parse_deepseek_tool_calls`. +//! +//! Structural shape: +//! +//! ```text +//! ...content... +//! ...reasoning... +//! <|tool▁calls▁begin|> +//! <|tool▁call▁begin|>function<|tool▁sep|>{name} +//! ```json +//! {args} +//! ```<|tool▁call▁end|> +//! <|tool▁calls▁end|> +//! ``` +//! +//! Thinking is **text tags** (not special tokens) — DeepSeek emits +//! `...` as decoded text. Tool calls are special-token +//! delimited. The fenced JSON inside is parsed with a small anchored regex. + +use std::ops::Range; +use std::sync::LazyLock; + +use regex::Regex; + +use crate::parsing::{decode, find, find_from, strip_stop_tokens}; +use crate::tokenizer::Tokenizer; +use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus}; + +static JSON_FENCE_RE: LazyLock = LazyLock::new(|| { + // Matches ```json\n\n``` or ```\n\n``` at the end of the string. + Regex::new(r"(?s)^```(?:json)?\s*(.*?)\s*```$").expect("json-fence regex") +}); + +#[allow(clippy::too_many_arguments)] +pub fn parse_deepseek_v3( + tokenizer: &Tokenizer, + token_ids: &[u32], + stop_ids: &[u32], + tool_calls_begin_id: u32, + tool_calls_end_id: u32, + tool_call_begin_id: u32, + tool_call_end_id: u32, + tool_sep_id: u32, +) -> ParsedResponse { + let ids = strip_stop_tokens(token_ids, stop_ids); + + let (content_ids, tool_calls) = match find(ids, tool_calls_begin_id) { + Some(section_start) => { + let content = &ids[..section_start]; + let tcs = parse_deepseek_tool_calls( + tokenizer, + &ids[section_start..], + tool_calls_begin_id, + tool_calls_end_id, + tool_call_begin_id, + tool_call_end_id, + tool_sep_id, + section_start, + ); + (content, tcs) + } + None => (ids, Vec::new()), + }; + + let text = decode(tokenizer, content_ids).unwrap_or_default(); + + // Split out `...` from the decoded content. Plain text + // tags here (no special tokens — that's the DeepSeek convention). + let (reasoning, content) = match text.split_once("") { + Some((before, after)) => { + let r = before + .replace("", "") + .trim_matches('\n') + .trim() + .to_string(); + let c = after.trim_start_matches('\n').trim().to_string(); + (Some(r), c) + } + None => (None, text.trim().to_string()), + }; + + ParsedResponse { + content, + reasoning_content: reasoning.filter(|s| !s.is_empty()), + tool_calls, + } +} + +#[allow(clippy::too_many_arguments)] +fn parse_deepseek_tool_calls( + tokenizer: &Tokenizer, + ids: &[u32], + tc_begin_id: u32, + tc_end_id: u32, + call_begin_id: u32, + call_end_id: u32, + sep_id: u32, + section_offset: usize, +) -> Vec { + let mut out: Vec = Vec::new(); + + let Some(section_start) = find(ids, tc_begin_id) else { + return out; + }; + let section_end = find_from(ids, tc_end_id, section_start + 1).unwrap_or(ids.len()); + let inner_offset = section_offset + section_start + 1; + let section_ids = &ids[section_start + 1..section_end]; + + let mut i = 0usize; + while i < section_ids.len() { + if section_ids[i] != call_begin_id { + i += 1; + continue; + } + let (end, unclosed) = match find_from(section_ids, call_end_id, i + 1) { + Some(end) => (end, false), + None => (section_ids.len(), true), + }; + let call_ids = §ion_ids[i + 1..end]; + let block_text = decode(tokenizer, call_ids).unwrap_or_default(); + let span = Range { + start: inner_offset + i, + end: inner_offset + end + if unclosed { 0 } else { 1 }, + }; + + let Some(sep_pos) = find(call_ids, sep_id) else { + out.push(ParsedToolCall { + raw: block_text, + token_span: Some(span), + status: ToolCallParseStatus::MalformedStructure, + ..Default::default() + }); + i = end + 1; + continue; + }; + + let after_sep = decode(tokenizer, &call_ids[sep_pos + 1..]) + .unwrap_or_default() + .trim() + .to_string(); + + let (name, args_str) = match after_sep.find('\n') { + Some(nl) => { + let n = after_sep[..nl].trim().to_string(); + let rest = after_sep[nl + 1..].trim(); + let args = match JSON_FENCE_RE.captures(rest) { + Some(c) => c.get(1).map(|m| m.as_str().trim()).unwrap_or("").to_string(), + None => rest.to_string(), + }; + (n, args) + } + None => (after_sep.clone(), String::new()), + }; + + let mut invalid_json = false; + let arguments = if args_str.is_empty() { + ToolArguments::Object(serde_json::Value::Object(Default::default())) + } else { + match serde_json::from_str::(&args_str) { + Ok(v) => ToolArguments::Object(v), + Err(_) => { + invalid_json = true; + ToolArguments::Raw(args_str.clone()) + } + } + }; + + let status = if unclosed { + ToolCallParseStatus::UnclosedBlock + } else if name.is_empty() { + ToolCallParseStatus::MissingName + } else if invalid_json { + ToolCallParseStatus::InvalidJson + } else { + ToolCallParseStatus::Ok + }; + + out.push(ParsedToolCall { + raw: block_text, + name: if name.is_empty() { None } else { Some(name) }, + arguments: Some(arguments), + token_span: Some(span), + status, + ..Default::default() + }); + i = end + 1; + if unclosed { + break; + } + } + + out +} diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs index 77ad38c..9fbe0c5 100644 --- a/crates/renderers-core/src/parsing/mod.rs +++ b/crates/renderers-core/src/parsing/mod.rs @@ -9,6 +9,7 @@ //! All helpers operate on `&[u32]` slices and are `#[inline]`-marked so //! they vanish into the family parsers at -O. +pub mod deepseek_v3; pub mod qwen3; pub mod qwen35; diff --git a/crates/renderers-core/src/registry.rs b/crates/renderers-core/src/registry.rs index f7184d4..e8f1a22 100644 --- a/crates/renderers-core/src/registry.rs +++ b/crates/renderers-core/src/registry.rs @@ -4,7 +4,7 @@ //! families ported to Rust so far. New families slot in by adding a //! match arm in [`create_renderer`]. -use crate::families::{Qwen35Renderer, Qwen3Renderer}; +use crate::families::{DeepSeekV3Renderer, Qwen35Renderer, Qwen3Renderer}; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::RenderError; @@ -15,6 +15,7 @@ use crate::types::RenderError; pub enum RendererKind { Qwen3, Qwen35, + DeepSeekV3, } impl RendererKind { @@ -22,6 +23,7 @@ impl RendererKind { match name { "qwen3" | "Qwen3" => Some(Self::Qwen3), "qwen35" | "qwen3.5" | "Qwen3.5" => Some(Self::Qwen35), + "deepseek_v3" | "deepseek-v3" | "DeepSeekV3" => Some(Self::DeepSeekV3), _ => None, } } @@ -60,5 +62,12 @@ pub fn create_renderer( } Ok(Box::new(b.build(tokenizer)?)) } + RendererKind::DeepSeekV3 => { + let mut b = DeepSeekV3Renderer::builder(); + if let Some(en) = cfg.enable_thinking { + b = b.enable_thinking(en); + } + Ok(Box::new(b.build(tokenizer)?)) + } } } diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index b669517..f15151d 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -13,7 +13,9 @@ use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; -use renderers_core::families::{Qwen35RendererBuilder, Qwen3RendererBuilder}; +use renderers_core::families::{ + DeepSeekV3RendererBuilder, Qwen35RendererBuilder, Qwen3RendererBuilder, +}; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, ToolCallParseStatus, @@ -305,6 +307,32 @@ impl PyRenderer { }) } + /// Build a DeepSeek V3 renderer from a tokenizer.json. + /// + /// `enable_thinking=True` (default) prefills the generation prompt + /// with `\n` to trigger reasoning. The Python shim mirrors + /// the upstream class signature. + #[classmethod] + #[pyo3(signature = (tokenizer_path, *, enable_thinking = true))] + fn deepseek_v3( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + DeepSeekV3RendererBuilder::default() + .enable_thinking(enable_thinking) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { + inner: Arc::new(renderer), + }) + } + #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))] fn render( &self, diff --git a/renderers/deepseek_v3.py b/renderers/deepseek_v3.py index 4efe3ef..39dc5dc 100644 --- a/renderers/deepseek_v3.py +++ b/renderers/deepseek_v3.py @@ -16,6 +16,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -52,6 +57,24 @@ class DeepSeekV3Renderer: no-ops here too; stored for protocol uniformity. """ + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if native_enabled("deepseek_v3") or native_enabled("deepseek-v3"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.deepseek_v3( + path, + enable_thinking=enable_thinking, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, diff --git a/tests/test_native_parity.py b/tests/test_native_parity.py index 2dab862..c4d7b2f 100644 --- a/tests/test_native_parity.py +++ b/tests/test_native_parity.py @@ -41,6 +41,7 @@ NATIVE_PARITY_FAMILIES = [ ("Qwen/Qwen3-8B", "qwen3", {}), ("Qwen/Qwen3.5-9B", "qwen35", {}), + ("deepseek-ai/DeepSeek-V3", "deepseek_v3", {}), ] @@ -88,6 +89,10 @@ def native_pair(request, native_module): from renderers.qwen35 import Qwen35Renderer py_renderer = Qwen35Renderer(tokenizer, **extra) + elif family == "deepseek_v3": + from renderers.deepseek_v3 import DeepSeekV3Renderer + + py_renderer = DeepSeekV3Renderer(tokenizer, **extra) else: pytest.skip(f"no python builder wired for {family}") finally: @@ -100,6 +105,8 @@ def native_pair(request, native_module): native_renderer = native_module.Renderer.qwen3(tok_path, **extra) elif family == "qwen35": native_renderer = native_module.Renderer.qwen35(tok_path, **extra) + elif family == "deepseek_v3": + native_renderer = native_module.Renderer.deepseek_v3(tok_path, **extra) else: pytest.skip(f"no native builder wired for {family}") From bd958171a0d0485f31e08015aa5cf56c53556557 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:44:30 +0200 Subject: [PATCH 05/35] Add Qwen3.6 native parity path --- crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-core/src/families/qwen35.rs | 51 ++++++++++++++------ crates/renderers-core/src/families/qwen36.rs | 45 +++++++++++++++++ crates/renderers-py/src/lib.rs | 34 ++++++++++++- renderers/qwen36.py | 36 +++++++++++++- 5 files changed, 152 insertions(+), 16 deletions(-) create mode 100644 crates/renderers-core/src/families/qwen36.rs diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index 8506644..fee63dc 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -7,7 +7,9 @@ pub mod deepseek_v3; pub mod qwen3; pub mod qwen35; +pub mod qwen36; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; +pub use qwen36::{Qwen36Renderer, Qwen36RendererBuilder}; diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index 35c4b67..bbb0be1 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -37,6 +37,12 @@ pub struct Qwen35RendererBuilder { enable_thinking: bool, preserve_all_thinking: bool, preserve_thinking_between_tool_calls: bool, + /// When `true`, every non-string tool-call argument is serialised via + /// `serde_json::to_string` instead of Python's `str(...)` rules. This + /// is the only behavioural change Qwen3.6 introduces vs Qwen3.5 — + /// kept as a flag here so Qwen3.6 is a config delta, not a code + /// duplicate. + args_as_json: bool, } impl Default for Qwen35RendererBuilder { @@ -48,6 +54,7 @@ impl Default for Qwen35RendererBuilder { enable_thinking: true, preserve_all_thinking: false, preserve_thinking_between_tool_calls: false, + args_as_json: false, } } } @@ -65,6 +72,11 @@ impl Qwen35RendererBuilder { self.preserve_thinking_between_tool_calls = on; self } + /// Qwen3.6 flag — JSON-serialise every non-string tool argument. + pub fn args_as_json(mut self, on: bool) -> Self { + self.args_as_json = on; + self + } pub fn build(self, tokenizer: Tokenizer) -> Result { Qwen35Renderer::new_with(tokenizer, self) } @@ -76,6 +88,7 @@ pub struct Qwen35Renderer { enable_thinking: bool, preserve_all_thinking: bool, preserve_thinking_between_tool_calls: bool, + args_as_json: bool, im_start: u32, im_end: u32, @@ -116,6 +129,7 @@ impl Qwen35Renderer { enable_thinking: cfg.enable_thinking, preserve_all_thinking: cfg.preserve_all_thinking, preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + args_as_json: cfg.args_as_json, im_start, im_end, endoftext, @@ -250,21 +264,30 @@ impl Qwen35Renderer { Ok(()) } - fn render_arg_value(arg_value: &serde_json::Value) -> String { - // Mirrors the Python `_render_arg_value`: dict/list → compact - // JSON; everything else → `str(...)` (Python's str() for bool, - // int, None, etc. — handled here by matching each scalar). - match arg_value { - serde_json::Value::Object(_) | serde_json::Value::Array(_) => { - serde_json::to_string(arg_value).unwrap_or_default() + fn render_arg_value(arg_value: &serde_json::Value, args_as_json: bool) -> String { + if args_as_json { + // Qwen3.6: every non-string serialises via serde_json (bools + // become "true"/"false", None becomes "null"). Strings still + // render verbatim — JSON would re-quote them. + match arg_value { + serde_json::Value::String(s) => s.clone(), + _ => serde_json::to_string(arg_value).unwrap_or_default(), } - serde_json::Value::String(s) => s.clone(), - serde_json::Value::Bool(b) => { - // Python str(True) == "True", str(False) == "False" - if *b { "True".to_string() } else { "False".to_string() } + } else { + // Qwen3.5: Python's str() rules — dict/list go through JSON, + // bools become "True"/"False", None becomes "None", numbers + // and strings render verbatim. + match arg_value { + serde_json::Value::Object(_) | serde_json::Value::Array(_) => { + serde_json::to_string(arg_value).unwrap_or_default() + } + serde_json::Value::String(s) => s.clone(), + serde_json::Value::Bool(b) => { + if *b { "True".to_string() } else { "False".to_string() } + } + serde_json::Value::Null => "None".to_string(), + serde_json::Value::Number(n) => n.to_string(), } - serde_json::Value::Null => "None".to_string(), - serde_json::Value::Number(n) => n.to_string(), } } @@ -347,7 +370,7 @@ impl Qwen35Renderer { }; if let Some(obj) = args_value.as_object() { for (arg_name, arg_value) in obj { - let value_str = Self::render_arg_value(arg_value); + let value_str = Self::render_arg_value(arg_value, self.args_as_json); let mut param = String::with_capacity(arg_name.len() + value_str.len() + 24); param.push_str(" Self { + self.inner = self.inner.enable_thinking(on); + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.inner = self.inner.preserve_all_thinking(on); + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.inner = self.inner.preserve_thinking_between_tool_calls(on); + self + } + pub fn build( + self, + tokenizer: crate::tokenizer::Tokenizer, + ) -> Result { + self.inner.args_as_json(true).build(tokenizer) + } +} diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index f15151d..f89b28d 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -14,7 +14,7 @@ use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; use renderers_core::families::{ - DeepSeekV3RendererBuilder, Qwen35RendererBuilder, Qwen3RendererBuilder, + DeepSeekV3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -307,6 +307,38 @@ impl PyRenderer { }) } + /// Build a Qwen3.6 renderer (Qwen3.5 + JSON-flavoured tool args). + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn qwen36( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + Qwen36RendererBuilder::default() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { + inner: Arc::new(renderer), + }) + } + /// Build a DeepSeek V3 renderer from a tokenizer.json. /// /// `enable_thinking=True` (default) prefills the generation prompt diff --git a/renderers/qwen36.py b/renderers/qwen36.py index 6adf867..8b959b5 100644 --- a/renderers/qwen36.py +++ b/renderers/qwen36.py @@ -24,7 +24,12 @@ from typing import Any from renderers.configs import Qwen36RendererConfig -from renderers.qwen35 import Qwen35Renderer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) +from renderers.qwen35 import Qwen35Renderer, _detect_enable_thinking_default class Qwen36Renderer(Qwen35Renderer): @@ -32,6 +37,35 @@ class Qwen36Renderer(Qwen35Renderer): _config_cls = Qwen36RendererConfig + def __new__( + cls, + tokenizer, + *, + processor=None, + enable_thinking=None, + preserve_all_thinking=False, + preserve_thinking_between_tool_calls=False, + image_cache_max=256, + ): + # Route to native only for Qwen3.6 specifically — never fall + # through to the parent's qwen35 router (the renderer flag is + # different). + if native_enabled("qwen36") and processor is None: + native = load_native() + if native is not None: + if enable_thinking is None: + enable_thinking = _detect_enable_thinking_default(tokenizer) + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.qwen36( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + # Skip Qwen35Renderer.__new__ (would also try to route, with the + # wrong flag). Go straight to object. + return object.__new__(cls) + @staticmethod def _render_arg_value(arg_value: Any) -> str: if isinstance(arg_value, str): From 98c9ffa764715d0ababafcf745de2f68348f7205 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:44:39 +0200 Subject: [PATCH 06/35] Add Nemotron 3 native parity path --- crates/renderers-core/src/families/mod.rs | 2 + .../renderers-core/src/families/nemotron3.rs | 666 ++++++++++++++++++ crates/renderers-py/src/lib.rs | 38 +- renderers/nemotron3.py | 25 + 4 files changed, 730 insertions(+), 1 deletion(-) create mode 100644 crates/renderers-core/src/families/nemotron3.rs diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index fee63dc..ac7a5f1 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -5,11 +5,13 @@ //! registry entry in [`crate::registry`]. pub mod deepseek_v3; +pub mod nemotron3; pub mod qwen3; pub mod qwen35; pub mod qwen36; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; +pub use nemotron3::{Nemotron3Renderer, Nemotron3RendererBuilder}; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; pub use qwen36::{Qwen36Renderer, Qwen36RendererBuilder}; diff --git a/crates/renderers-core/src/families/nemotron3.rs b/crates/renderers-core/src/families/nemotron3.rs new file mode 100644 index 0000000..17f76cb --- /dev/null +++ b/crates/renderers-core/src/families/nemotron3.rs @@ -0,0 +1,666 @@ +//! Nemotron 3 renderer. Port of `renderers/nemotron3.py`. +//! +//! Same `<|im_start|>/<|im_end|>` framing as Qwen3.5, but with several +//! template-specific quirks: +//! +//! - Tool declarations use XML (`...` with nested +//! `` blocks), not JSON-per-line. +//! - System prompt is emitted BEFORE the tools block (Qwen3.5 puts +//! tools first). +//! - An empty system message is auto-injected if none is present. +//! - `` is emitted on EVERY assistant message, even +//! those without reasoning content (collapses to empty block). +//! - Single `\n` after `` (Qwen3.5 uses `\n\n`). +//! - Disable-thinking generation suffix is `` with no +//! trailing newlines. +//! - Trailing `\n` after ``. +//! - `<|endoftext|>` is *optional* — Nemotron-3 Nano / Super ship with +//! only `<|im_end|>` as EOS; larger variants additionally include +//! `<|endoftext|>`. + +use serde_json::Value as JsonValue; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::qwen35::parse_qwen35; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, +}; + +const TOOLS_HEADER: &str = "# Tools\n\nYou have access to the following functions:\n\n"; +const TOOLS_FOOTER: &str = "\n"; +const TOOLS_INSTRUCTIONS: &str = "\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n"; + +#[derive(Debug, Clone)] +pub struct Nemotron3RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for Nemotron3RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl Nemotron3RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + Nemotron3Renderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct Nemotron3Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + im_start: u32, + im_end: u32, + /// `<|endoftext|>` is optional — Nemotron-3 Nano / Super tokenizers + /// don't ship it. + endoftext: Option, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + tool_response: u32, + tool_response_end: u32, + + stop_tokens: Vec, +} + +impl Nemotron3Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + Nemotron3RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> Nemotron3RendererBuilder { + Nemotron3RendererBuilder::default() + } + + fn new_with( + tokenizer: Tokenizer, + cfg: Nemotron3RendererBuilder, + ) -> Result { + let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let endoftext = tokenizer.token_to_id("<|endoftext|>"); + let think = tokenizer.token_to_id_strict("")?; + let think_end = tokenizer.token_to_id_strict("")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let tool_response = tokenizer.token_to_id_strict("")?; + let tool_response_end = tokenizer.token_to_id_strict("")?; + + let mut stop_tokens = vec![im_end]; + if let Some(eot) = endoftext { + stop_tokens.push(eot); + } + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_start, + im_end, + endoftext, + think, + think_end, + tool_call, + tool_call_end, + tool_response, + tool_response_end, + stop_tokens, + }) + } + + /// Render a single tool declaration in Nemotron 3's XML format. + /// Mirrors `_format_tool_declaration` in the Python impl. + fn format_tool_declaration(tool: &ToolSpec) -> String { + let mut out = String::with_capacity(256); + out.push_str("\n"); + out.push_str(&tool.name); + out.push_str(""); + let desc = tool.description.trim(); + if !desc.is_empty() { + out.push_str("\n"); + out.push_str(desc); + out.push_str(""); + } + out.push_str("\n"); + + if let Some(props) = tool.parameters.get("properties").and_then(|v| v.as_object()) { + for (param_name, param_fields) in props { + out.push_str("\n\n"); + out.push_str(param_name); + out.push_str(""); + if let Some(t) = param_fields.get("type") { + out.push_str("\n"); + Self::write_value_as_text(&mut out, t); + out.push_str(""); + } + if let Some(d) = param_fields.get("description").and_then(|v| v.as_str()) { + out.push_str("\n"); + out.push_str(d.trim()); + out.push_str(""); + } + if let Some(e) = param_fields.get("enum") { + out.push_str("\n"); + out.push_str(&serde_json::to_string(e).unwrap_or_default()); + out.push_str(""); + } + if let Some(obj) = param_fields.as_object() { + Self::render_extra_keys( + &mut out, + obj, + &["name", "type", "description", "enum"], + ); + } + out.push_str("\n"); + } + } + if let Some(obj) = tool.parameters.as_object() { + Self::render_extra_keys(&mut out, obj, &["type", "properties", "required"]); + } + if let Some(req) = tool.parameters.get("required") { + out.push_str("\n"); + out.push_str(&serde_json::to_string(req).unwrap_or_default()); + out.push_str(""); + } + out.push_str("\n"); + out.push_str("\n"); + out + } + + /// Mirror Python's `str(value)` for non-string JSON values + /// (used inside `{value}` tags). + fn write_value_as_text(out: &mut String, value: &JsonValue) { + match value { + JsonValue::String(s) => out.push_str(s), + JsonValue::Bool(true) => out.push_str("True"), + JsonValue::Bool(false) => out.push_str("False"), + JsonValue::Null => out.push_str("None"), + JsonValue::Number(n) => out.push_str(&n.to_string()), + _ => out.push_str(&serde_json::to_string(value).unwrap_or_default()), + } + } + + /// Mirror Python's `_render_extra_keys` — emit `value` + /// for every key not already handled. + fn render_extra_keys( + out: &mut String, + obj: &serde_json::Map, + handled: &[&str], + ) { + for (k, v) in obj { + if handled.iter().any(|h| *h == k.as_str()) { + continue; + } + out.push_str("\n<"); + out.push_str(k); + out.push('>'); + match v { + JsonValue::Object(_) | JsonValue::Array(_) => { + out.push_str(&serde_json::to_string(v).unwrap_or_default()); + } + _ => Self::write_value_as_text(out, v), + } + out.push_str("'); + } + } + + fn emit_system_with_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + tools: &[ToolSpec], + first_is_system: bool, + ) -> Result<(), RenderError> { + let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; + buf.special(self.im_start, sys_idx); + buf.text("system\n", sys_idx)?; + + let mut full_sys = String::with_capacity(512); + if first_is_system { + full_sys.push_str(messages[0].text_content().trim()); + } + let mut tools_block = String::with_capacity(512); + tools_block.push_str(TOOLS_HEADER); + tools_block.push('\n'); + let mut first = true; + for t in tools { + if !first { + tools_block.push('\n'); + } + tools_block.push_str(&Self::format_tool_declaration(t)); + first = false; + } + tools_block.push_str(TOOLS_FOOTER); + tools_block.push_str(TOOLS_INSTRUCTIONS); + + if !full_sys.is_empty() { + full_sys.push_str("\n\n"); + } + full_sys.push_str(&tools_block); + + buf.text(&full_sys, sys_idx)?; + buf.special(self.im_end, sys_idx); + buf.text("\n", sys_idx)?; + Ok(()) + } + + fn emit_system_no_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + sys_idx: i32, + ) -> Result<(), RenderError> { + let content = messages[0].text_content().trim(); + buf.special(self.im_start, sys_idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, sys_idx)?; + buf.special(self.im_end, sys_idx); + buf.text("\n", sys_idx)?; + Ok(()) + } + + fn emit_user( + &self, + buf: &mut RenderBuf<'_>, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + msg_idx: usize, + content: &str, + msg_orig_idx: i32, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = + msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + + if !prev_is_tool { + buf.special(self.im_start, msg_orig_idx); + buf.text("user\n", msg_orig_idx)?; + } + buf.special(self.tool_response, msg_orig_idx); + let mut wrapped = String::with_capacity(content.len() + 2); + wrapped.push('\n'); + wrapped.push_str(content); + wrapped.push('\n'); + buf.text(&wrapped, msg_orig_idx)?; + buf.special(self.tool_response_end, msg_orig_idx); + // Nemotron 3: trailing \n after + buf.text("\n", msg_orig_idx)?; + + if !next_is_tool { + buf.special(self.im_end, msg_orig_idx); + buf.text("\n", msg_orig_idx)?; + } + Ok(()) + } + + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_orig_idx: i32, + is_last_turn: bool, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + // Recover reasoning_content either from the field or from inline tags. + let raw_content = msg.text_content().trim(); + let (reasoning_content, content) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + } else { + before.trim_start_matches('\n').trim_end_matches('\n').to_string() + }; + (r, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + let reasoning_content = reasoning_content.trim().to_string(); + + buf.special(self.im_start, msg_orig_idx); + buf.text("assistant\n", msg_orig_idx)?; + + let tool_calls = &msg.tool_calls; + let content_suffix = if !tool_calls.is_empty() { "\n" } else { "" }; + + if !reasoning_content.is_empty() && (is_last_turn || preserve_thinking) { + buf.special(self.think, msg_orig_idx); + let mut s = String::with_capacity(reasoning_content.len() + 2); + s.push('\n'); + s.push_str(&reasoning_content); + s.push('\n'); + buf.text(&s, msg_orig_idx)?; + buf.special(self.think_end, msg_orig_idx); + // Single \n separator (not \n\n like Qwen3.5) + let mut tail = String::with_capacity(content.len() + 2); + tail.push('\n'); + tail.push_str(&content); + tail.push_str(content_suffix); + buf.text(&tail, msg_orig_idx)?; + } else if !reasoning_content.is_empty() { + // Historical assistant whose reasoning got stripped — collapsed + // + single \n + content. + buf.special(self.think, msg_orig_idx); + buf.special(self.think_end, msg_orig_idx); + let mut tail = String::with_capacity(content.len() + 2); + tail.push('\n'); + tail.push_str(&content); + tail.push_str(content_suffix); + buf.text(&tail, msg_orig_idx)?; + } else { + // No reasoning ever — glued directly to content. + buf.special(self.think, msg_orig_idx); + buf.special(self.think_end, msg_orig_idx); + let mut tail = String::with_capacity(content.len() + 2); + tail.push_str(&content); + tail.push_str(content_suffix); + buf.text(&tail, msg_orig_idx)?; + } + + for tc in tool_calls { + let name = tc.function.name.as_str(); + buf.special(self.tool_call, msg_orig_idx); + let mut head = String::with_capacity(name.len() + 16); + head.push_str("\n\n"); + buf.text(&head, msg_orig_idx)?; + + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => serde_json::from_str(s) + .unwrap_or(JsonValue::Object(Default::default())), + }; + if let Some(obj) = args_value.as_object() { + for (arg_name, arg_value) in obj { + let val_str = match arg_value { + JsonValue::Object(_) | JsonValue::Array(_) => { + serde_json::to_string(arg_value).unwrap_or_default() + } + JsonValue::String(s) => s.clone(), + JsonValue::Bool(b) => if *b { "True".into() } else { "False".into() }, + JsonValue::Null => "None".into(), + JsonValue::Number(n) => n.to_string(), + }; + let mut param = + String::with_capacity(arg_name.len() + val_str.len() + 24); + param.push_str("\n"); + param.push_str(&val_str); + param.push_str("\n\n"); + buf.text(¶m, msg_orig_idx)?; + } + } + + buf.text("\n", msg_orig_idx)?; + buf.special(self.tool_call_end, msg_orig_idx); + // Nemotron 3: trailing \n after + buf.text("\n", msg_orig_idx)?; + } + + buf.special(self.im_end, msg_orig_idx); + buf.text("\n", msg_orig_idx)?; + Ok(()) + } + + fn emit_generation_prompt(&self, buf: &mut RenderBuf<'_>) -> Result<(), RenderError> { + buf.scaffold_special(self.im_start); + buf.scaffold_text("assistant\n")?; + if self.enable_thinking { + buf.scaffold_special(self.think); + buf.scaffold_text("\n")?; + } else { + // Disable-thinking suffix: with no trailing newlines + buf.scaffold_special(self.think); + buf.scaffold_special(self.think_end); + } + Ok(()) + } + + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + let base = messages.len().max(1) * 256; + let tools_bonus = tools.map(|t| 384 * t.len().max(1) + 512).unwrap_or(0); + base + tools_bonus + } +} + +impl Renderer for Nemotron3Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + + // Normalise: prepend empty system message if none is present. + let mut normalised: Vec; + let auto_system_injected: bool; + let messages_ref: &[Message] = if messages[0].role != "system" { + auto_system_injected = true; + normalised = Vec::with_capacity(messages.len() + 1); + normalised.push(Message { + role: "system".to_string(), + content: crate::types::Content::Text(String::new()), + ..Default::default() + }); + normalised.extend_from_slice(messages); + &normalised + } else { + auto_system_injected = false; + messages + }; + + // Map normalised index back to caller's original index. Injected + // system uses SCAFFOLD_IDX (-1) so build_training_sample can't + // dereference past the caller's input. + let orig_idx = |i: usize| -> i32 { + if auto_system_injected { + if i == 0 { SCAFFOLD_IDX } else { (i - 1) as i32 } + } else { + i as i32 + } + }; + + let first_is_system = messages_ref[0].role == "system"; + + match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(&mut buf, messages_ref, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(&mut buf, messages_ref, orig_idx(0))?; + } + } + } + + // Find the most-recent plain (non-tool-call) assistant — reasoning + // is preserved on it and on later turns; earlier assistants + // collapse to . + let last_plain_assistant_idx: i32 = { + let mut found: i32 = -1; + for (j, m) in messages_ref.iter().enumerate().rev() { + if m.role == "assistant" && m.tool_calls.is_empty() { + found = j as i32; + break; + } + } + found + }; + + for (i, msg) in messages_ref.iter().enumerate() { + let content = msg.text_content().trim(); + let oi = orig_idx(i); + match msg.role.as_str() { + "system" => { + if i != 0 { + return Err(RenderError::Invalid( + "system message must be at the beginning".into(), + )); + } + // Already handled above + } + "user" => self.emit_user(&mut buf, content, oi)?, + "assistant" => { + let is_last_turn = (i as i32) >= last_plain_assistant_idx; + let preserve_thinking = oi >= 0 + && should_preserve_past_thinking( + messages, + oi as usize, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(&mut buf, msg, oi, is_last_turn, preserve_thinking)?; + } + "tool" => self.emit_tool(&mut buf, messages_ref, i, content, oi)?, + _ => { + return Err(RenderError::Invalid(format!( + "unexpected message role: {}", + msg.role + ))); + } + } + } + + if add_generation_prompt { + self.emit_generation_prompt(&mut buf)?; + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_qwen35( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(new_messages, None)); + buf.scaffold_text("\n")?; + + for (i, msg) in new_messages.iter().enumerate() { + let content = msg.text_content().trim(); + let idx = i as i32; + match msg.role.as_str() { + "user" => self.emit_user(&mut buf, content, idx)?, + "system" => { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + } + "tool" => self.emit_tool(&mut buf, new_messages, i, content, idx)?, + _ => return Ok(None), + } + } + + self.emit_generation_prompt(&mut buf)?; + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +// Keep the field readable; suppresses dead-code warning since we only use it via the Option arm above. +#[allow(dead_code)] +impl Nemotron3Renderer { + pub fn has_endoftext(&self) -> bool { + self.endoftext.is_some() + } +} diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index f89b28d..3e137f9 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -14,7 +14,8 @@ use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; use renderers_core::families::{ - DeepSeekV3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, + DeepSeekV3RendererBuilder, Nemotron3RendererBuilder, Qwen35RendererBuilder, + Qwen36RendererBuilder, Qwen3RendererBuilder, }; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -339,6 +340,41 @@ impl PyRenderer { }) } + /// Build a Nemotron 3 renderer from a tokenizer.json. + /// + /// `<|endoftext|>` is auto-detected: Nemotron-3 Nano / Super ship + /// with only `<|im_end|>` as EOS; larger variants add `<|endoftext|>`. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn nemotron3( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + Nemotron3RendererBuilder::default() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { + inner: Arc::new(renderer), + }) + } + /// Build a DeepSeek V3 renderer from a tokenizer.json. /// /// `enable_thinking=True` (default) prefills the generation prompt diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index 06d9d4d..be6723d 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -19,6 +19,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -77,6 +82,26 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str] class Nemotron3Renderer: """Deterministic message → token renderer for Nemotron 3 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if native_enabled("nemotron3"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.nemotron3( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, From 63713c3426dff9212f7f4715adbe171ce859b28d Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:44:48 +0200 Subject: [PATCH 07/35] Add GLM native parity paths --- crates/renderers-core/src/families/glm.rs | 655 ++++++++++++++++++++++ crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-core/src/parsing/glm.rs | 223 ++++++++ crates/renderers-core/src/parsing/mod.rs | 1 + crates/renderers-py/src/lib.rs | 94 +++- renderers/glm45.py | 25 + renderers/glm5.py | 32 ++ 7 files changed, 1030 insertions(+), 2 deletions(-) create mode 100644 crates/renderers-core/src/families/glm.rs create mode 100644 crates/renderers-core/src/parsing/glm.rs diff --git a/crates/renderers-core/src/families/glm.rs b/crates/renderers-core/src/families/glm.rs new file mode 100644 index 0000000..8feaa8c --- /dev/null +++ b/crates/renderers-core/src/families/glm.rs @@ -0,0 +1,655 @@ +//! GLM family renderers — covers GLM-5, GLM-5.1, and GLM-4.5 Air. +//! +//! Port of `renderers/glm5.py` (+ `GLM51Renderer`) and `renderers/glm45.py`. +//! +//! Shared template shape: +//! +//! - Prefix: `[gMASK]` before all content +//! - Role markers: `<|system|>`, `<|user|>`, `<|assistant|>`, +//! `<|observation|>`. No role-name text follows the marker. +//! - **No close token** — turns end when the next role marker appears. +//! `bridge_to_next_turn` exploits this: the prior turn's tail +//! contains one of `{<|endoftext|>, <|user|>, <|observation|>}` +//! (the stop ids), so the bridge synthesises `<|endoftext|>` only on +//! truncation. +//! - Tool calls: `namekv...` +//! +//! Variants in this module: +//! +//! | Flag | GLM-5 | GLM-5.1 | GLM-4.5 | +//! | ----------------------------- | ----- | ------- | ------- | +//! | newlines after role markers | no | no | yes | +//! | newlines inside tool-call | no | no | yes | +//! | `/nothink` user suffix | no | no | yes | +//! | empty `` wrap | no | yes | no | +//! | unwrap OpenAI tool envelope | no | yes | no | +//! +//! The flags are surfaced on the builder; the three variants pick +//! their own combination at construction time. + +use serde_json::Value as JsonValue; + +use crate::bridge::reject_assistant_in_extension; +use crate::emit::RenderBuf; +use crate::parsing::glm::parse_glm; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, +}; + +const TOOLS_HEADER_GLM5: &str = "\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n"; +const TOOLS_FOOTER_GLM5: &str = "\n\nFor each function call, output the function name and arguments within the following XML format:\n{function-name}{arg-key-1}{arg-value-1}{arg-key-2}{arg-value-2}..."; + +const TOOLS_FOOTER_GLM45: &str = "\n\nFor each function call, output the function name and arguments within the following XML format:\n{function-name}\n{arg-key-1}\n{arg-value-1}\n{arg-key-2}\n{arg-value-2}\n...\n"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Variant { + Glm5, + Glm51, + Glm45, +} + +#[derive(Debug, Clone)] +pub struct GlmRendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + variant: Variant, +} + +impl GlmRendererBuilder { + pub fn glm5() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + variant: Variant::Glm5, + } + } + pub fn glm51() -> Self { + Self { variant: Variant::Glm51, ..Self::glm5() } + } + pub fn glm45() -> Self { + Self { variant: Variant::Glm45, ..Self::glm5() } + } + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + GlmRenderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct GlmRenderer { + tokenizer: Tokenizer, + variant: Variant, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + gmask: u32, + sop: u32, + system: u32, + user: u32, + assistant: u32, + observation: u32, + endoftext: u32, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + arg_key: u32, + arg_key_end: u32, + arg_value: u32, + arg_value_end: u32, + // GLM-5 also exposes tokens; GLM-4.5 emits them as text. + tool_response: Option, + tool_response_end: Option, + + stop_tokens: Vec, +} + +impl GlmRenderer { + pub fn glm5(tokenizer: Tokenizer) -> Result { + GlmRendererBuilder::glm5().build(tokenizer) + } + pub fn glm51(tokenizer: Tokenizer) -> Result { + GlmRendererBuilder::glm51().build(tokenizer) + } + pub fn glm45(tokenizer: Tokenizer) -> Result { + GlmRendererBuilder::glm45().build(tokenizer) + } + + fn new_with(tokenizer: Tokenizer, cfg: GlmRendererBuilder) -> Result { + let gmask = tokenizer.token_to_id_strict("[gMASK]")?; + let sop = tokenizer.token_to_id_strict("")?; + let system = tokenizer.token_to_id_strict("<|system|>")?; + let user = tokenizer.token_to_id_strict("<|user|>")?; + let assistant = tokenizer.token_to_id_strict("<|assistant|>")?; + let observation = tokenizer.token_to_id_strict("<|observation|>")?; + let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; + let think = tokenizer.token_to_id_strict("")?; + let think_end = tokenizer.token_to_id_strict("")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let arg_key = tokenizer.token_to_id_strict("")?; + let arg_key_end = tokenizer.token_to_id_strict("")?; + let arg_value = tokenizer.token_to_id_strict("")?; + let arg_value_end = tokenizer.token_to_id_strict("")?; + + // GLM-5 uses special tokens; GLM-4.5 emits them + // as plain text. Resolve optionally so the same struct serves + // both variants. + let (tool_response, tool_response_end) = if cfg.variant == Variant::Glm45 { + (None, None) + } else { + ( + Some(tokenizer.token_to_id_strict("")?), + Some(tokenizer.token_to_id_strict("")?), + ) + }; + + Ok(Self { + tokenizer, + variant: cfg.variant, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + gmask, + sop, + system, + user, + assistant, + observation, + endoftext, + think, + think_end, + tool_call, + tool_call_end, + arg_key, + arg_key_end, + arg_value, + arg_value_end, + tool_response, + tool_response_end, + stop_tokens: vec![endoftext, user, observation], + }) + } + + fn nl_after_role(&self) -> &'static str { + if self.variant == Variant::Glm45 { "\n" } else { "" } + } + + fn empty_think_on_last_assistant(&self) -> bool { + self.variant == Variant::Glm51 + } + + fn last_user_index(messages: &[Message]) -> i32 { + for (i, m) in messages.iter().enumerate().rev() { + if m.role == "user" { + return i as i32; + } + } + -1 + } + + fn format_tool_spec(&self, tool: &ToolSpec) -> Result { + // GLM-5 / GLM-4.5 render the spec verbatim; GLM-5.1 unwraps the + // OpenAI envelope (`{"type":"function","function":{...}}`) and + // strips internal-only keys. + // + // Our `ToolSpec` is already the inner shape, so the GLM-5.1 + // unwrap is a no-op in Rust — kept here as a structural note. + let spec = serde_json::json!({ + "name": tool.name, + "description": tool.description, + "parameters": tool.parameters, + }); + serde_json::to_string(&spec) + .map_err(|e| RenderError::Invalid(format!("tool spec serialisation: {e}"))) + } + + fn render_arg_value(arg_value: &JsonValue) -> String { + match arg_value { + JsonValue::String(s) => s.clone(), + _ => serde_json::to_string(arg_value).unwrap_or_default(), + } + } +} + +impl Renderer for GlmRenderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let nl = self.nl_after_role(); + let mut buf = RenderBuf::new( + &self.tokenizer, + messages.len().max(1) * 256 + tools.map(|t| t.len() * 256 + 256).unwrap_or(0), + ); + + // Prefix + buf.scaffold_special(self.gmask); + buf.scaffold_special(self.sop); + + // Tools system block + if let Some(t) = tools { + if !t.is_empty() { + buf.scaffold_special(self.system); + let mut s = String::with_capacity(512); + if !nl.is_empty() { + s.push_str(nl); + } + s.push_str(TOOLS_HEADER_GLM5); + for tool in t { + s.push_str(&self.format_tool_spec(tool)?); + s.push('\n'); + } + s.push_str(if self.variant == Variant::Glm45 { + TOOLS_FOOTER_GLM45 + } else { + TOOLS_FOOTER_GLM5 + }); + buf.scaffold_text(&s)?; + } + } + + let last_ui = Self::last_user_index(messages); + + for (i, msg) in messages.iter().enumerate() { + let content = msg.text_content(); + let idx = i as i32; + match msg.role.as_str() { + "system" => { + buf.special(self.system, idx); + let mut s = String::with_capacity(content.len() + 2); + s.push_str(nl); + s.push_str(content); + buf.text(&s, idx)?; + } + "user" => { + buf.special(self.user, idx); + let mut s = String::with_capacity(content.len() + 12); + s.push_str(nl); + s.push_str(content); + if self.variant == Variant::Glm45 + && !self.enable_thinking + && !content.ends_with("/nothink") + { + s.push_str("/nothink"); + } + buf.text(&s, idx)?; + } + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(&mut buf, msg, idx, last_ui, preserve_thinking)?; + } + "tool" => self.emit_tool(&mut buf, messages, i, content, idx)?, + _ => {} // mirror Python: silent skip + } + } + + if add_generation_prompt { + buf.scaffold_special(self.assistant); + if self.variant == Variant::Glm45 { + if !self.enable_thinking { + buf.scaffold_text("\n")?; + buf.scaffold_special(self.think); + buf.scaffold_special(self.think_end); + } + // GLM-4.5 enable_thinking=True: just <|assistant|>, nothing else + } else if self.enable_thinking { + buf.scaffold_special(self.think); + } else { + buf.scaffold_special(self.think_end); + } + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_glm( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + self.arg_key, + self.arg_key_end, + self.arg_value, + self.arg_value_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + // GLM has no per-turn close token. Build the combined prefix and + // synthesise <|endoftext|> when the model's completion ran past + // max_tokens (no stop-id at the tail). + let mut combined: Vec = + Vec::with_capacity(previous_prompt_ids.len() + previous_completion_ids.len() + 1); + combined.extend_from_slice(previous_prompt_ids); + combined.extend_from_slice(previous_completion_ids); + + let need_synth = match combined.last() { + None => true, + Some(&t) if !self.stop_tokens.contains(&t) => true, + _ => previous_completion_ids.is_empty(), + }; + if need_synth { + combined.push(self.endoftext); + } + let last_prev = *combined.last().expect("non-empty"); + + let nl = self.nl_after_role(); + let mut buf = RenderBuf::new( + &self.tokenizer, + new_messages.len().max(1) * 256, + ); + + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "user" => { + if !(i == 0 && last_prev == self.user) { + buf.special(self.user, idx); + } + let mut s = String::with_capacity(content.len() + 12); + s.push_str(nl); + s.push_str(content); + if self.variant == Variant::Glm45 + && !self.enable_thinking + && !content.ends_with("/nothink") + { + s.push_str("/nothink"); + } + buf.text(&s, idx)?; + } + "system" => { + buf.special(self.system, idx); + let mut s = String::with_capacity(content.len() + 2); + s.push_str(nl); + s.push_str(content); + buf.text(&s, idx)?; + } + "tool" => { + let prev_is_tool = i > 0 && new_messages[i - 1].role == "tool"; + if i == 0 && last_prev == self.observation { + // model already emitted the marker; don't repeat + } else if !prev_is_tool { + buf.special(self.observation, idx); + } + self.emit_tool_response(&mut buf, content, idx)?; + } + _ => return Ok(None), + } + } + + // Generation prompt + buf.scaffold_special(self.assistant); + if self.variant == Variant::Glm45 { + if !self.enable_thinking { + buf.scaffold_text("\n")?; + buf.scaffold_special(self.think); + buf.scaffold_special(self.think_end); + } + } else if self.enable_thinking { + buf.scaffold_special(self.think); + } else { + buf.scaffold_special(self.think_end); + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(combined.len() + ext.len()); + out.extend_from_slice(&combined); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl GlmRenderer { + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let raw_content = msg.text_content(); + let (reasoning_content, content) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + } else { + before.trim_start_matches('\n').trim_end_matches('\n').to_string() + }; + (r, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + let reasoning_content = reasoning_content.trim().to_string(); + let content = content.trim().to_string(); + + buf.special(self.assistant, msg_idx); + + if self.variant == Variant::Glm45 { + self.emit_assistant_glm45(buf, msg, msg_idx, &reasoning_content, &content, last_user_index, preserve_thinking) + } else { + self.emit_assistant_glm5_family(buf, msg, msg_idx, &reasoning_content, &content, last_user_index, preserve_thinking) + } + } + + fn emit_assistant_glm5_family( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + reasoning_content: &str, + content: &str, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let include_thinking = + (msg_idx > last_user_index || preserve_thinking) && !reasoning_content.is_empty(); + if include_thinking { + buf.special(self.think, msg_idx); + buf.text(reasoning_content.trim(), msg_idx)?; + buf.special(self.think_end, msg_idx); + } else if self.empty_think_on_last_assistant() && msg_idx > last_user_index { + // GLM-5.1: wrap the last assistant with empty + buf.special(self.think, msg_idx); + buf.special(self.think_end, msg_idx); + } else { + buf.special(self.think_end, msg_idx); + } + + if !content.trim().is_empty() { + buf.text(content.trim(), msg_idx)?; + } + + for tc in &msg.tool_calls { + let name = tc.function.name.as_str(); + buf.special(self.tool_call, msg_idx); + buf.text(name, msg_idx)?; + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())), + }; + if let Some(obj) = args_value.as_object() { + for (k, v) in obj { + buf.special(self.arg_key, msg_idx); + buf.text(k, msg_idx)?; + buf.special(self.arg_key_end, msg_idx); + buf.special(self.arg_value, msg_idx); + buf.text(&Self::render_arg_value(v), msg_idx)?; + buf.special(self.arg_value_end, msg_idx); + } + } + buf.special(self.tool_call_end, msg_idx); + } + Ok(()) + } + + fn emit_assistant_glm45( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + reasoning_content: &str, + content: &str, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + if (msg_idx > last_user_index || preserve_thinking) && !reasoning_content.is_empty() { + buf.text("\n", msg_idx)?; + buf.special(self.think, msg_idx); + buf.text(reasoning_content.trim(), msg_idx)?; + buf.special(self.think_end, msg_idx); + } else { + buf.text("\n", msg_idx)?; + buf.special(self.think, msg_idx); + buf.special(self.think_end, msg_idx); + } + + let tool_calls = &msg.tool_calls; + let trimmed = content.trim(); + if !trimmed.is_empty() && !tool_calls.is_empty() { + let mut s = String::with_capacity(trimmed.len() + 2); + s.push('\n'); + s.push_str(trimmed); + s.push('\n'); + buf.text(&s, msg_idx)?; + } else if !trimmed.is_empty() { + let mut s = String::with_capacity(trimmed.len() + 1); + s.push('\n'); + s.push_str(trimmed); + buf.text(&s, msg_idx)?; + } + + for tc in tool_calls { + let name = tc.function.name.as_str(); + if trimmed.is_empty() { + buf.text("\n", msg_idx)?; + } + buf.special(self.tool_call, msg_idx); + let mut head = String::with_capacity(name.len() + 1); + head.push_str(name); + head.push('\n'); + buf.text(&head, msg_idx)?; + + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())), + }; + if let Some(obj) = args_value.as_object() { + for (k, v) in obj { + buf.special(self.arg_key, msg_idx); + buf.text(k, msg_idx)?; + buf.special(self.arg_key_end, msg_idx); + buf.text("\n", msg_idx)?; + buf.special(self.arg_value, msg_idx); + buf.text(&Self::render_arg_value(v), msg_idx)?; + buf.special(self.arg_value_end, msg_idx); + buf.text("\n", msg_idx)?; + } + } + buf.special(self.tool_call_end, msg_idx); + } + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + msg_idx: usize, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + if !prev_is_tool { + buf.special(self.observation, idx); + } + self.emit_tool_response(buf, content, idx) + } + + fn emit_tool_response( + &self, + buf: &mut RenderBuf<'_>, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + if self.variant == Variant::Glm45 { + // GLM-4.5 emits the tool_response wrapper as plain text + let mut s = String::with_capacity(content.len() + 32); + s.push_str("\n\n"); + s.push_str(content); + s.push_str("\n"); + buf.text(&s, idx)?; + } else { + // GLM-5 / GLM-5.1 use special tokens + buf.special(self.tool_response.expect("tool_response token"), idx); + buf.text(content, idx)?; + buf.special(self.tool_response_end.expect("tool_response_end token"), idx); + } + Ok(()) + } +} + +// Kept for completeness; GLM-5 doesn't ship the `<|endoftext|>` flag the +// way Nemotron does, so the field is always Some. +#[allow(dead_code)] +fn _glm_invariants() { + let _ = SCAFFOLD_IDX; +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index ac7a5f1..22fd9cc 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -5,12 +5,14 @@ //! registry entry in [`crate::registry`]. pub mod deepseek_v3; +pub mod glm; pub mod nemotron3; pub mod qwen3; pub mod qwen35; pub mod qwen36; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; +pub use glm::{GlmRenderer, GlmRendererBuilder}; pub use nemotron3::{Nemotron3Renderer, Nemotron3RendererBuilder}; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; diff --git a/crates/renderers-core/src/parsing/glm.rs b/crates/renderers-core/src/parsing/glm.rs new file mode 100644 index 0000000..0c118d1 --- /dev/null +++ b/crates/renderers-core/src/parsing/glm.rs @@ -0,0 +1,223 @@ +//! GLM tool-call parser — covers GLM-5 / GLM-5.1 / GLM-4.5. +//! +//! Port of `renderers/parsing.py:parse_glm` + `_parse_glm_tool_calls`. +//! +//! Structural shape: +//! +//! ```text +//! <|assistant|>...content... +//! ...reasoning... +//! fn_name +//! k1v1 +//! k2v2 +//! +//! ``` +//! +//! Thinking is special-token (`` / ``). Each argument is +//! a pair of special-token-delimited spans inside the tool-call block. +//! All scanning is token-id based — no decoded-text regex. + +use std::ops::Range; + +use crate::parsing::{decode, find, find_from, strip_stop_tokens}; +use crate::tokenizer::Tokenizer; +use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus}; + +#[allow(clippy::too_many_arguments)] +pub fn parse_glm( + tokenizer: &Tokenizer, + token_ids: &[u32], + stop_ids: &[u32], + think_id: u32, + think_end_id: u32, + tool_call_id: u32, + tool_call_end_id: u32, + arg_key_id: u32, + arg_key_end_id: u32, + arg_value_id: u32, + arg_value_end_id: u32, +) -> ParsedResponse { + let stripped = strip_stop_tokens(token_ids, stop_ids); + + // Thinking — find by token id. + let mut reasoning: Option = None; + let mut parse_offset = 0usize; + let working_ids: Vec; + let ids: &[u32] = match find(stripped, think_end_id) { + Some(think_end) => { + let reasoning_ids: Vec = stripped[..think_end] + .iter() + .copied() + .filter(|&t| t != think_id) + .collect(); + let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); + reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty()); + parse_offset = think_end + 1; + &stripped[think_end + 1..] + } + None => { + // Truncated reasoning — without + if let Some(think_start) = find(stripped, think_id) { + let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; + } + working_ids = stripped.to_vec(); + &working_ids + } + }; + + let (content_text, tool_calls) = match find(ids, tool_call_id) { + Some(tc_start) => { + let content = decode(tokenizer, &ids[..tc_start]) + .unwrap_or_default() + .trim() + .to_string(); + let tcs = parse_glm_tool_calls( + tokenizer, + &ids[tc_start..], + tool_call_id, + tool_call_end_id, + arg_key_id, + arg_key_end_id, + arg_value_id, + arg_value_end_id, + parse_offset + tc_start, + ); + (content, tcs) + } + None => ( + decode(tokenizer, ids).unwrap_or_default().trim().to_string(), + Vec::new(), + ), + }; + + ParsedResponse { + content: content_text, + reasoning_content: reasoning, + tool_calls, + } +} + +#[allow(clippy::too_many_arguments)] +fn parse_glm_tool_calls( + tokenizer: &Tokenizer, + ids: &[u32], + tc_id: u32, + tc_end_id: u32, + ak_id: u32, + ake_id: u32, + av_id: u32, + ave_id: u32, + section_offset: usize, +) -> Vec { + let mut out: Vec = Vec::new(); + let mut i = 0usize; + + while i < ids.len() { + if ids[i] != tc_id { + i += 1; + continue; + } + let span_start = section_offset + i; + + let end = match find_from(ids, tc_end_id, i + 1) { + Some(end) => end, + None => { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + out.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: span_start, + end: section_offset + ids.len(), + }), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; + } + }; + + let block = &ids[i + 1..end]; + let block_text = decode(tokenizer, block).unwrap_or_default(); + let span = Range { + start: span_start, + end: section_offset + end + 1, + }; + + let first_ak = find(block, ak_id); + let mut arguments = serde_json::Map::new(); + let mut any_json_fallback = false; + let mut structure_broke = false; + let name = match first_ak { + None => decode(tokenizer, block).unwrap_or_default().trim().to_string(), + Some(first) => { + let n = decode(tokenizer, &block[..first]) + .unwrap_or_default() + .trim() + .to_string(); + let mut j = first; + while j < block.len() { + if block[j] != ak_id { + j += 1; + continue; + } + let Some(ake) = find_from(block, ake_id, j + 1) else { + structure_broke = true; + break; + }; + let key = decode(tokenizer, &block[j + 1..ake]) + .unwrap_or_default() + .trim() + .to_string(); + let Some(av) = find_from(block, av_id, ake + 1) else { + structure_broke = true; + break; + }; + let Some(ave) = find_from(block, ave_id, av + 1) else { + structure_broke = true; + break; + }; + let val_text = decode(tokenizer, &block[av + 1..ave]) + .unwrap_or_default() + .trim() + .to_string(); + let val = match serde_json::from_str::(&val_text) { + Ok(v) => v, + Err(_) => { + any_json_fallback = true; + serde_json::Value::String(val_text) + } + }; + arguments.insert(key, val); + j = ave + 1; + } + n + } + }; + + let status = if name.is_empty() { + ToolCallParseStatus::MissingName + } else if structure_broke { + ToolCallParseStatus::MalformedStructure + } else if any_json_fallback { + ToolCallParseStatus::InvalidJson + } else { + ToolCallParseStatus::Ok + }; + + out.push(ParsedToolCall { + raw: block_text, + name: if name.is_empty() { None } else { Some(name) }, + arguments: Some(ToolArguments::Object(serde_json::Value::Object(arguments))), + token_span: Some(span), + status, + ..Default::default() + }); + i = end + 1; + } + out +} diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs index 9fbe0c5..0fc303d 100644 --- a/crates/renderers-core/src/parsing/mod.rs +++ b/crates/renderers-core/src/parsing/mod.rs @@ -10,6 +10,7 @@ //! they vanish into the family parsers at -O. pub mod deepseek_v3; +pub mod glm; pub mod qwen3; pub mod qwen35; diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 3e137f9..389451e 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -14,8 +14,8 @@ use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; use renderers_core::families::{ - DeepSeekV3RendererBuilder, Nemotron3RendererBuilder, Qwen35RendererBuilder, - Qwen36RendererBuilder, Qwen3RendererBuilder, + DeepSeekV3RendererBuilder, GlmRendererBuilder, Nemotron3RendererBuilder, + Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -340,6 +340,96 @@ impl PyRenderer { }) } + /// Build a GLM-5 renderer from a tokenizer.json. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn glm5( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + GlmRendererBuilder::glm5() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + + /// Build a GLM-5.1 renderer (GLM-5 + empty on last assistant). + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn glm51( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + GlmRendererBuilder::glm51() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + + /// Build a GLM-4.5 Air renderer from a tokenizer.json. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn glm45( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + GlmRendererBuilder::glm45() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + /// Build a Nemotron 3 renderer from a tokenizer.json. /// /// `<|endoftext|>` is auto-detected: Nemotron-3 Nano / Super ship diff --git a/renderers/glm45.py b/renderers/glm45.py index efea47b..f78e82a 100644 --- a/renderers/glm45.py +++ b/renderers/glm45.py @@ -15,6 +15,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -51,6 +56,26 @@ class GLM45Renderer: """Deterministic message → token renderer for GLM-4.5 Air models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if native_enabled("glm45"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.glm45( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, diff --git a/renderers/glm5.py b/renderers/glm5.py index a42a0af..9cc3cd8 100644 --- a/renderers/glm5.py +++ b/renderers/glm5.py @@ -16,6 +16,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -55,6 +60,31 @@ class GLM5Renderer: # GLM51Renderer; GLM-5 proper keeps this off. empty_think_on_last_assistant: bool = False + # Native-routing family key. Overridden in GLM51Renderer. + _NATIVE_KEY = "glm5" + _NATIVE_METHOD = "glm5" + + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if native_enabled(cls._NATIVE_KEY): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + builder = getattr(native.Renderer, cls._NATIVE_METHOD) + return builder( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + # GLM-5.1 uses the same template surface and binds the same kwargs. # Subclassed in ``GLM51Renderer`` so the registry can dispatch on the # ``glm-5.1`` discriminator while sharing this implementation. @@ -646,6 +676,8 @@ class GLM51Renderer(GLM5Renderer): empty_think_on_last_assistant = True _config_cls = GLM51RendererConfig + _NATIVE_KEY = "glm51" + _NATIVE_METHOD = "glm51" @staticmethod def _format_tool_spec(tool: ToolSpec) -> str: From 6804f55f9aa0cddee6d4681fdd1ec6f36a02d7ab Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:44:55 +0200 Subject: [PATCH 08/35] Add Kimi K2 native parity path --- crates/renderers-core/src/families/kimi_k2.rs | 457 ++++++++++++++++++ crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-core/src/parsing/kimi_k2.rs | 178 +++++++ crates/renderers-core/src/parsing/mod.rs | 1 + crates/renderers-py/src/lib.rs | 34 +- renderers/kimi_k2.py | 25 + 6 files changed, 695 insertions(+), 2 deletions(-) create mode 100644 crates/renderers-core/src/families/kimi_k2.rs create mode 100644 crates/renderers-core/src/parsing/kimi_k2.rs diff --git a/crates/renderers-core/src/families/kimi_k2.rs b/crates/renderers-core/src/families/kimi_k2.rs new file mode 100644 index 0000000..ef82e8f --- /dev/null +++ b/crates/renderers-core/src/families/kimi_k2.rs @@ -0,0 +1,457 @@ +//! Kimi K2 renderer. Port of `renderers/kimi_k2.py`. +//! +//! Distinctive features: +//! +//! - Per-message framing: `<|im_*|>{role}<|im_middle|>{content}<|im_end|>`. +//! Role tokens vary by role: `<|im_user|>`, `<|im_assistant|>`, +//! `<|im_system|>`. +//! - Tool calls wrapped in +//! `<|tool_calls_section_begin|>` + N × call + `<|tool_calls_section_end|>`, +//! with each call as +//! `<|tool_call_begin|>{id}<|tool_call_argument_begin|>{json}<|tool_call_end|>`. +//! - Tool declarations rendered as a `role="tool_declare"` system-style +//! message with `tojson(separators=(',',':'), sort_keys=True)` JSON. +//! - Tool results: `<|im_system|>{name}<|im_middle|>## Return of {id}\n{content}<|im_end|>`. +//! - Default system message auto-injected if missing +//! ("You are Kimi, an AI assistant created by Moonshot AI."). +//! - Thinking is plain text `...` (not special tokens). +//! The template doesn't read `reasoning_content` — assistant content +//! renders verbatim, inline `` tags and all. + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::kimi_k2::parse_kimi_k2; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, +}; + +const DEFAULT_SYSTEM: &str = "You are Kimi, an AI assistant created by Moonshot AI."; + +#[derive(Debug, Clone)] +pub struct KimiK2RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for KimiK2RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl KimiK2RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + KimiK2Renderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct KimiK2Renderer { + tokenizer: Tokenizer, + // Stored for API parity; the Kimi template ignores these flags. + #[allow(dead_code)] + enable_thinking: bool, + #[allow(dead_code)] + preserve_all_thinking: bool, + #[allow(dead_code)] + preserve_thinking_between_tool_calls: bool, + + im_user: u32, + im_assistant: u32, + im_system: u32, + im_middle: u32, + im_end: u32, + tool_calls_section_begin: u32, + tool_calls_section_end: u32, + tool_call_begin: u32, + tool_call_argument_begin: u32, + tool_call_end: u32, + + stop_tokens: Vec, +} + +impl KimiK2Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + KimiK2RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> KimiK2RendererBuilder { + KimiK2RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: KimiK2RendererBuilder) -> Result { + let im_user = tokenizer.token_to_id_strict("<|im_user|>")?; + let im_assistant = tokenizer.token_to_id_strict("<|im_assistant|>")?; + let im_system = tokenizer.token_to_id_strict("<|im_system|>")?; + let im_middle = tokenizer.token_to_id_strict("<|im_middle|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let tool_calls_section_begin = + tokenizer.token_to_id_strict("<|tool_calls_section_begin|>")?; + let tool_calls_section_end = + tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; + let tool_call_begin = tokenizer.token_to_id_strict("<|tool_call_begin|>")?; + let tool_call_argument_begin = + tokenizer.token_to_id_strict("<|tool_call_argument_begin|>")?; + let tool_call_end = tokenizer.token_to_id_strict("<|tool_call_end|>")?; + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_user, + im_assistant, + im_system, + im_middle, + im_end, + tool_calls_section_begin, + tool_calls_section_end, + tool_call_begin, + tool_call_argument_begin, + tool_call_end, + stop_tokens: vec![im_end], + }) + } + + /// Serialise the tools list as compact, key-sorted JSON. The Python + /// template uses `tojson(separators=(',', ':'), sort_keys=True)` — + /// match both options here for byte-identical output. + fn serialize_tools(tools: &[ToolSpec]) -> String { + // Build an ordered map via serde_json::Map (preserves insertion); + // for sort_keys behaviour we use a BTreeMap-backed Value tree. + // serde_json's `serialize` of a BTreeMap sorts keys by Ord. + use std::collections::BTreeMap; + let mut arr: Vec> = Vec::with_capacity(tools.len()); + for tool in tools { + let mut m: BTreeMap = BTreeMap::new(); + m.insert("name".into(), serde_json::Value::String(tool.name.clone())); + m.insert("description".into(), serde_json::Value::String(tool.description.clone())); + m.insert("parameters".into(), Self::sort_keys(&tool.parameters)); + arr.push(m); + } + serde_json::to_string(&arr).unwrap_or_else(|_| "[]".to_string()) + } + + fn sort_keys(v: &serde_json::Value) -> serde_json::Value { + use std::collections::BTreeMap; + match v { + serde_json::Value::Object(o) => { + let sorted: BTreeMap = o + .iter() + .map(|(k, v)| (k.clone(), Self::sort_keys(v))) + .collect(); + serde_json::to_value(sorted).unwrap_or(serde_json::Value::Null) + } + serde_json::Value::Array(a) => { + serde_json::Value::Array(a.iter().map(Self::sort_keys).collect()) + } + other => other.clone(), + } + } + + fn args_to_string(args: &ToolArguments) -> String { + match args { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_else(|_| "{}".into()), + } + } + + fn emit_im_role( + &self, + buf: &mut RenderBuf<'_>, + role_token: u32, + role_name: &str, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(role_token, idx); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + buf.text(content, idx)?; + buf.special(self.im_end, idx); + Ok(()) + } +} + +impl Renderer for KimiK2Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + + // Inject tool_declare + default system into a working copy, tracking + // which slots are injected so message_indices stays aligned to the + // caller's original list. + let mut working: Vec = Vec::with_capacity(messages.len() + 2); + let mut injected: Vec = Vec::with_capacity(messages.len() + 2); + + // tool_declare goes first if tools were provided and the caller + // didn't already include a tool_declare message. + let tools_pending = tools.map(|t| !t.is_empty()).unwrap_or(false); + let already_has_tool_declare = + !messages.is_empty() && messages[0].role == "tool_declare"; + if tools_pending && !already_has_tool_declare { + working.push(Message { + role: "tool_declare".to_string(), + content: crate::types::Content::Text(Self::serialize_tools(tools.unwrap())), + ..Default::default() + }); + injected.push(true); + } + + // Then the optional default system message + let auto_system_position: Option = if !messages.is_empty() + && messages[0].role == "tool_declare" + { + // tool_declare present in caller's input → if next isn't system, + // inject default system AFTER tool_declare + if messages.len() < 2 || messages[1].role != "system" { + Some(working.len() + 1) // will be inserted between tool_declare and the rest + } else { + None + } + } else if messages.is_empty() || messages[0].role != "system" { + Some(working.len()) + } else { + None + }; + + // Now lay out the rest: + if let Some(pos) = auto_system_position { + // Replicate the Python logic: if caller's first message is + // tool_declare, push it then the default system then the rest. + if !messages.is_empty() && messages[0].role == "tool_declare" { + working.push(messages[0].clone()); + injected.push(false); + working.push(Message { + role: "system".to_string(), + content: crate::types::Content::Text(DEFAULT_SYSTEM.to_string()), + ..Default::default() + }); + injected.push(true); + for m in &messages[1..] { + working.push(m.clone()); + injected.push(false); + } + } else { + working.push(Message { + role: "system".to_string(), + content: crate::types::Content::Text(DEFAULT_SYSTEM.to_string()), + ..Default::default() + }); + injected.push(true); + for m in messages { + working.push(m.clone()); + injected.push(false); + } + } + let _ = pos; + } else { + for m in messages { + working.push(m.clone()); + injected.push(false); + } + } + + // Map normalised index → caller's index (sentinel for injected). + let orig_idx = |i: usize| -> i32 { + if injected[i] { + SCAFFOLD_IDX + } else { + let real: usize = + injected[..=i].iter().filter(|&&inj| !inj).count() - 1; + real as i32 + } + }; + + // Index of the auto-injected system message (if any) — emits a + // trailing literal "\n" after its <|im_end|>. + let auto_system_idx: Option = working + .iter() + .enumerate() + .find(|(i, m)| injected[*i] && m.role == "system") + .map(|(i, _)| i); + + let mut buf = RenderBuf::new( + &self.tokenizer, + working.len().max(1) * 256 + tools.map(|t| 64 * t.len() + 256).unwrap_or(0), + ); + + for (i, msg) in working.iter().enumerate() { + let oi = orig_idx(i); + let content = msg.text_content(); + match msg.role.as_str() { + "system" => { + self.emit_im_role(&mut buf, self.im_system, "system", content, oi)?; + if Some(i) == auto_system_idx { + buf.text("\n", oi)?; + } + } + "tool_declare" => { + self.emit_im_role(&mut buf, self.im_system, "tool_declare", content, oi)?; + } + "user" => { + self.emit_im_role(&mut buf, self.im_user, "user", content, oi)?; + } + "assistant" => self.emit_assistant(&mut buf, msg, oi)?, + "tool" => self.emit_tool(&mut buf, msg, content, oi)?, + other => { + // Unknown role: render system-style + self.emit_im_role(&mut buf, self.im_system, other, content, oi)?; + } + } + } + + if add_generation_prompt { + buf.scaffold_special(self.im_assistant); + buf.scaffold_text("assistant")?; + buf.scaffold_special(self.im_middle); + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_kimi_k2( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_calls_section_begin, + self.tool_calls_section_end, + self.tool_call_begin, + self.tool_call_argument_begin, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "user" => self.emit_im_role(&mut buf, self.im_user, "user", content, idx)?, + "system" => self.emit_im_role(&mut buf, self.im_system, "system", content, idx)?, + "tool" => self.emit_tool(&mut buf, msg, content, idx)?, + _ => return Ok(None), + } + } + + buf.scaffold_special(self.im_assistant); + buf.scaffold_text("assistant")?; + buf.scaffold_special(self.im_middle); + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl KimiK2Renderer { + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_assistant, idx); + buf.text("assistant", idx)?; + buf.special(self.im_middle, idx); + + // Kimi's template renders content verbatim; reasoning_content is + // ignored (not read by the Jinja). + buf.text(msg.text_content(), idx)?; + + if !msg.tool_calls.is_empty() { + buf.special(self.tool_calls_section_begin, idx); + for tc in &msg.tool_calls { + let args_str = Self::args_to_string(&tc.function.arguments); + let tc_id = tc.id.clone().unwrap_or_default(); + buf.special(self.tool_call_begin, idx); + buf.text(&tc_id, idx)?; + buf.special(self.tool_call_argument_begin, idx); + buf.text(&args_str, idx)?; + buf.special(self.tool_call_end, idx); + } + buf.special(self.tool_calls_section_end, idx); + } + buf.special(self.im_end, idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + let name = msg.name.as_deref().unwrap_or("tool"); + let tool_call_id = msg.tool_call_id.as_deref().unwrap_or(""); + buf.special(self.im_system, idx); + buf.text(name, idx)?; + buf.special(self.im_middle, idx); + let mut header = String::with_capacity(tool_call_id.len() + 16); + header.push_str("## Return of "); + header.push_str(tool_call_id); + header.push('\n'); + buf.text(&header, idx)?; + buf.text(content, idx)?; + buf.special(self.im_end, idx); + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index 22fd9cc..faf5bf3 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -6,6 +6,7 @@ pub mod deepseek_v3; pub mod glm; +pub mod kimi_k2; pub mod nemotron3; pub mod qwen3; pub mod qwen35; @@ -13,6 +14,7 @@ pub mod qwen36; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; pub use glm::{GlmRenderer, GlmRendererBuilder}; +pub use kimi_k2::{KimiK2Renderer, KimiK2RendererBuilder}; pub use nemotron3::{Nemotron3Renderer, Nemotron3RendererBuilder}; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; diff --git a/crates/renderers-core/src/parsing/kimi_k2.rs b/crates/renderers-core/src/parsing/kimi_k2.rs new file mode 100644 index 0000000..0c77b41 --- /dev/null +++ b/crates/renderers-core/src/parsing/kimi_k2.rs @@ -0,0 +1,178 @@ +//! Kimi K2 tool-call parser. Port of +//! `renderers/parsing.py:parse_kimi_k2` + `_parse_kimi_k2_tool_calls`. +//! +//! Structural shape: +//! +//! ```text +//! ...content with optional ... text tags... +//! <|tool_calls_section_begin|> +//! <|tool_call_begin|>{id}<|tool_call_argument_begin|>{json_args}<|tool_call_end|> +//! ... +//! <|tool_calls_section_end|> +//! ``` +//! +//! `{id}` is `functions.{name}:{index}`. The parser strips the +//! `functions.` prefix and `:index` suffix to recover the function name. + +use std::ops::Range; + +use crate::parsing::{decode, find, find_from, strip_stop_tokens}; +use crate::tokenizer::Tokenizer; +use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus}; + +#[allow(clippy::too_many_arguments)] +pub fn parse_kimi_k2( + tokenizer: &Tokenizer, + token_ids: &[u32], + stop_ids: &[u32], + tool_calls_section_begin_id: u32, + tool_calls_section_end_id: u32, + tool_call_begin_id: u32, + tool_call_argument_begin_id: u32, + tool_call_end_id: u32, +) -> ParsedResponse { + let ids = strip_stop_tokens(token_ids, stop_ids); + + let (content_ids, tool_calls) = match find(ids, tool_calls_section_begin_id) { + Some(section_start) => { + let content = &ids[..section_start]; + let section_end = + find_from(ids, tool_calls_section_end_id, section_start + 1).unwrap_or(ids.len()); + let section_ids = &ids[section_start + 1..section_end]; + let tcs = parse_kimi_k2_calls( + tokenizer, + section_ids, + tool_call_begin_id, + tool_call_argument_begin_id, + tool_call_end_id, + section_start + 1, + ); + (content, tcs) + } + None => (ids, Vec::new()), + }; + + let text = decode(tokenizer, content_ids).unwrap_or_default(); + let (reasoning, content) = match text.split_once("") { + Some((before, after)) => { + let raw = before.replacen("", "", 1); + let r = raw.trim_matches('\n').trim().to_string(); + let c = after.trim_matches('\n').to_string(); + (Some(r).filter(|s| !s.is_empty()), c) + } + None => { + if let Some(think_at) = text.find("") { + // Truncated thinking — no closing tag + let raw = &text[think_at + "".len()..]; + let r = raw.trim_matches('\n').trim().to_string(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(r).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; + } + (None, text) + } + }; + + ParsedResponse { + content: content.trim().to_string(), + reasoning_content: reasoning, + tool_calls, + } +} + +fn parse_kimi_k2_calls( + tokenizer: &Tokenizer, + ids: &[u32], + tc_begin_id: u32, + tc_arg_begin_id: u32, + tc_end_id: u32, + section_offset: usize, +) -> Vec { + let mut out: Vec = Vec::new(); + let mut i = 0usize; + + while i < ids.len() { + if ids[i] != tc_begin_id { + i += 1; + continue; + } + let arg_begin = match find_from(ids, tc_arg_begin_id, i + 1) { + Some(v) => v, + None => { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + out.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: section_offset + i, + end: section_offset + ids.len(), + }), + status: ToolCallParseStatus::MalformedStructure, + ..Default::default() + }); + break; + } + }; + + let (tc_end, unclosed) = match find_from(ids, tc_end_id, arg_begin + 1) { + Some(v) => (v, false), + None => (ids.len(), true), + }; + + let raw_id = decode(tokenizer, &ids[i + 1..arg_begin]) + .unwrap_or_default() + .trim() + .to_string(); + let args_str = decode(tokenizer, &ids[arg_begin + 1..tc_end]) + .unwrap_or_default() + .trim() + .to_string(); + let block_text = decode(tokenizer, &ids[i + 1..tc_end]).unwrap_or_default(); + let span = Range { + start: section_offset + i, + end: section_offset + tc_end + if unclosed { 0 } else { 1 }, + }; + + // Extract function name from "functions.{name}:{index}" + let name_part = raw_id.split(':').next().unwrap_or(""); + let func_name = if let Some((_, n)) = name_part.split_once('.') { + n.to_string() + } else { + name_part.to_string() + }; + + let mut invalid_json = false; + let arguments = match serde_json::from_str::(&args_str) { + Ok(v) => ToolArguments::Object(v), + Err(_) => { + invalid_json = true; + ToolArguments::Raw(args_str.clone()) + } + }; + + let status = if unclosed { + ToolCallParseStatus::UnclosedBlock + } else if func_name.is_empty() { + ToolCallParseStatus::MissingName + } else if invalid_json { + ToolCallParseStatus::InvalidJson + } else { + ToolCallParseStatus::Ok + }; + + out.push(ParsedToolCall { + raw: block_text, + name: if func_name.is_empty() { None } else { Some(func_name) }, + arguments: Some(arguments), + token_span: Some(span), + status, + id: if raw_id.is_empty() { None } else { Some(raw_id) }, + }); + i = tc_end + 1; + if unclosed { + break; + } + } + out +} diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs index 0fc303d..48e55d8 100644 --- a/crates/renderers-core/src/parsing/mod.rs +++ b/crates/renderers-core/src/parsing/mod.rs @@ -11,6 +11,7 @@ pub mod deepseek_v3; pub mod glm; +pub mod kimi_k2; pub mod qwen3; pub mod qwen35; diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 389451e..6cb63a1 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -14,8 +14,8 @@ use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; use renderers_core::families::{ - DeepSeekV3RendererBuilder, GlmRendererBuilder, Nemotron3RendererBuilder, - Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, + DeepSeekV3RendererBuilder, GlmRendererBuilder, KimiK2RendererBuilder, + Nemotron3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -430,6 +430,36 @@ impl PyRenderer { Ok(PyRenderer { inner: Arc::new(renderer) }) } + /// Build a Kimi K2 renderer from a tokenizer.json. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn kimi_k2( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + KimiK2RendererBuilder::default() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + /// Build a Nemotron 3 renderer from a tokenizer.json. /// /// `<|endoftext|>` is auto-detected: Nemotron-3 Nano / Super ship diff --git a/renderers/kimi_k2.py b/renderers/kimi_k2.py index 54d6f53..0e87a1b 100644 --- a/renderers/kimi_k2.py +++ b/renderers/kimi_k2.py @@ -18,6 +18,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -42,6 +47,26 @@ class KimiK2Renderer: have no effect on the byte-level output. """ + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if native_enabled("kimi_k2") or native_enabled("kimi-k2"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.kimi_k2( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, From 3067ed32acb6a4e5f8cb99021f3b0913787cfafc Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:45:07 +0200 Subject: [PATCH 09/35] Add MiniMax M2 native parity path --- .../renderers-core/src/families/minimax_m2.rs | 471 ++++++++++++++++++ crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-core/src/parsing/minimax.rs | 173 +++++++ crates/renderers-core/src/parsing/mod.rs | 1 + crates/renderers-py/src/lib.rs | 30 +- renderers/minimax_m2.py | 29 ++ 6 files changed, 705 insertions(+), 1 deletion(-) create mode 100644 crates/renderers-core/src/families/minimax_m2.rs create mode 100644 crates/renderers-core/src/parsing/minimax.rs diff --git a/crates/renderers-core/src/families/minimax_m2.rs b/crates/renderers-core/src/families/minimax_m2.rs new file mode 100644 index 0000000..3e53915 --- /dev/null +++ b/crates/renderers-core/src/families/minimax_m2.rs @@ -0,0 +1,471 @@ +//! MiniMax M2.5 renderer. Port of `renderers/minimax_m2.py`. +//! +//! Unique characteristics: +//! +//! - Token format: `]~!b[` (BOS), `]~b]` (role prefix), `[e~[` (EOS). +//! Role "assistant" is rendered as "ai". +//! - System block always present — default system message +//! ("You are a helpful assistant. Your name is MiniMax-M2.5 and is +//! built by MiniMax.") auto-injected if missing. +//! - Tools, when supplied, are appended to the system message as +//! `{json}` lines inside a `...` block, +//! followed by a verbose instructions block. +//! - Tool calls use XML wrapper + nested invokes: +//! `v... +//! ` +//! - Tool responses wrapped in literal `...` +//! (plain text, no special token). +//! - Thinking emitted only for assistants after the last user turn +//! (or when preserve_all_thinking is on). + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::minimax::parse_minimax; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, +}; + +const DEFAULT_SYSTEM: &str = + "You are a helpful assistant. Your name is MiniMax-M2.5 and is built by MiniMax."; + +const TOOLS_HEADER: &str = "\n\n# Tools\nYou may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:\n\n\n"; +const TOOLS_FOOTER_PREFIX: &str = "\n\n"; +const TOOLS_INSTRUCTIONS: &str = "When making tool calls, use XML format to invoke tools and pass parameters:\n\n\n\nparam-value-1\nparam-value-2\n...\n\n"; + +#[derive(Debug, Clone)] +pub struct MiniMaxM2RendererBuilder { + default_system: String, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for MiniMaxM2RendererBuilder { + fn default() -> Self { + Self { + default_system: DEFAULT_SYSTEM.to_string(), + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl MiniMaxM2RendererBuilder { + pub fn default_system(mut self, s: impl Into) -> Self { + self.default_system = s.into(); + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + MiniMaxM2Renderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct MiniMaxM2Renderer { + tokenizer: Tokenizer, + default_system: String, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + bos: u32, + role: u32, + eos: u32, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + + stop_tokens: Vec, +} + +impl MiniMaxM2Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + MiniMaxM2RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> MiniMaxM2RendererBuilder { + MiniMaxM2RendererBuilder::default() + } + + fn new_with( + tokenizer: Tokenizer, + cfg: MiniMaxM2RendererBuilder, + ) -> Result { + let bos = tokenizer.token_to_id_strict("]~!b[")?; + let role = tokenizer.token_to_id_strict("]~b]")?; + let eos = tokenizer.token_to_id_strict("[e~[")?; + let think = tokenizer.token_to_id_strict("")?; + let think_end = tokenizer.token_to_id_strict("")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + + Ok(Self { + tokenizer, + default_system: cfg.default_system, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + bos, + role, + eos, + think, + think_end, + tool_call, + tool_call_end, + stop_tokens: vec![eos], + }) + } + + fn build_system_text(&self, sys_content: &str, tools: Option<&[ToolSpec]>) -> String { + let mut s = String::with_capacity(512); + s.push_str("system\n"); + if sys_content.is_empty() { + s.push_str(&self.default_system); + } else { + s.push_str(sys_content); + } + if let Some(tools) = tools { + if !tools.is_empty() { + s.push_str(TOOLS_HEADER); + for tool in tools { + s.push_str(""); + let spec = serde_json::json!({ + "name": tool.name, + "description": tool.description, + "parameters": tool.parameters, + }); + s.push_str(&serde_json::to_string(&spec).unwrap_or_default()); + s.push_str("\n"); + } + s.push_str(TOOLS_FOOTER_PREFIX); + s.push_str(TOOLS_INSTRUCTIONS); + } + } + s + } + + fn args_to_value(args: &ToolArguments) -> serde_json::Value { + match args { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(serde_json::Value::Object(Default::default())) + } + } + } +} + +impl Renderer for MiniMaxM2Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut buf = RenderBuf::new( + &self.tokenizer, + messages.len().max(1) * 256 + + tools.map(|t| t.len() * 256 + 512).unwrap_or(0), + ); + + let first_is_system = messages[0].role == "system"; + let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; + + // System block + buf.special(self.bos, sys_idx); + buf.special(self.role, sys_idx); + let sys_content = if first_is_system { + messages[0].text_content().to_string() + } else { + String::new() + }; + let system_text = self.build_system_text(&sys_content, tools); + buf.text(&system_text, sys_idx)?; + buf.special(self.eos, sys_idx); + buf.text("\n", sys_idx)?; + + // Conversation messages — skip the leading system if present + let conversation_start = if first_is_system { 1 } else { 0 }; + let conversation = &messages[conversation_start..]; + + // last_user_index relative to the conversation + let mut last_ui: i32 = -1; + for (ci, m) in conversation.iter().enumerate() { + if m.role == "user" { + last_ui = ci as i32; + } + } + + for (ci, msg) in conversation.iter().enumerate() { + let orig_idx = (ci + conversation_start) as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "user" => { + buf.special(self.role, orig_idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, orig_idx)?; + buf.special(self.eos, orig_idx); + buf.text("\n", orig_idx)?; + } + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + orig_idx as usize, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant( + &mut buf, + msg, + orig_idx, + ci as i32, + last_ui, + preserve_thinking, + )?; + } + "tool" => self.emit_tool(&mut buf, conversation, ci, orig_idx)?, + _ => {} + } + } + + if add_generation_prompt { + buf.scaffold_special(self.role); + buf.scaffold_text("ai\n")?; + buf.scaffold_special(self.think); + buf.scaffold_text("\n")?; + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_minimax( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.eos), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); + // Trailing \n after the prior turn's [e~[ + buf.scaffold_text("\n")?; + + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "user" => { + buf.special(self.role, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.eos, idx); + buf.text("\n", idx)?; + } + "system" => { + buf.special(self.role, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.eos, idx); + buf.text("\n", idx)?; + } + "tool" => self.emit_tool(&mut buf, new_messages, i, idx)?, + _ => return Ok(None), + } + } + + buf.scaffold_special(self.role); + buf.scaffold_text("ai\n")?; + buf.scaffold_special(self.think); + buf.scaffold_text("\n")?; + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl MiniMaxM2Renderer { + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + orig_idx: i32, + conv_idx: i32, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let raw_content = msg.text_content(); + let (reasoning_content, content_text) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner.trim_matches('\n').to_string() + } else { + before.trim_matches('\n').to_string() + }; + (r, after.trim_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + + buf.special(self.role, orig_idx); + + let tool_calls = &msg.tool_calls; + let emit_think = !reasoning_content.is_empty() + && (conv_idx > last_user_index || preserve_thinking); + + let after_think: String = if emit_think { + buf.text("ai\n", orig_idx)?; + buf.special(self.think, orig_idx); + let mut head = String::with_capacity(reasoning_content.len() + 2); + head.push('\n'); + head.push_str(&reasoning_content); + head.push('\n'); + buf.text(&head, orig_idx)?; + buf.special(self.think_end, orig_idx); + // After , the rest is "\n\n" + content (or just "\n\n") + if content_text.is_empty() { + "\n\n".to_string() + } else { + let mut s = String::with_capacity(content_text.len() + 2); + s.push_str("\n\n"); + s.push_str(&content_text); + s + } + } else if content_text.is_empty() { + "ai\n".to_string() + } else { + let mut s = String::with_capacity(content_text.len() + 4); + s.push_str("ai\n"); + s.push_str(&content_text); + s + }; + + if !tool_calls.is_empty() { + // \n before contiguous with preceding text + let mut head = after_think; + head.push('\n'); + buf.text(&head, orig_idx)?; + buf.special(self.tool_call, orig_idx); + + let mut invoke_block = String::from("\n"); + for tc in tool_calls { + let name = tc.function.name.as_str(); + invoke_block.push_str("\n"); + let args_value = Self::args_to_value(&tc.function.arguments); + if let Some(obj) = args_value.as_object() { + for (arg_name, arg_value) in obj { + let val_str = match arg_value { + serde_json::Value::String(s) => s.clone(), + _ => serde_json::to_string(arg_value).unwrap_or_default(), + }; + invoke_block.push_str(""); + invoke_block.push_str(&val_str); + invoke_block.push_str("\n"); + } + } + invoke_block.push_str("\n"); + } + buf.text(&invoke_block, orig_idx)?; + buf.special(self.tool_call_end, orig_idx); + } else { + buf.text(&after_think, orig_idx)?; + } + + buf.special(self.eos, orig_idx); + buf.text("\n", orig_idx)?; + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + conversation: &[Message], + conv_idx: usize, + orig_idx: i32, + ) -> Result<(), RenderError> { + let prev_is_tool = conv_idx > 0 && conversation[conv_idx - 1].role == "tool"; + let next_is_tool = + conv_idx + 1 < conversation.len() && conversation[conv_idx + 1].role == "tool"; + + if !prev_is_tool { + buf.special(self.role, orig_idx); + buf.text("tool", orig_idx)?; + } + let prefix = if prev_is_tool { "" } else { "\n" }; + let suffix = if next_is_tool { "\n" } else { "" }; + let content = conversation[conv_idx].text_content(); + let mut s = String::with_capacity(content.len() + 32); + s.push_str(prefix); + s.push_str(""); + s.push_str(content); + s.push_str(""); + s.push_str(suffix); + buf.text(&s, orig_idx)?; + + if !next_is_tool { + buf.special(self.eos, orig_idx); + buf.text("\n", orig_idx)?; + } + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index faf5bf3..bb42dbb 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -7,6 +7,7 @@ pub mod deepseek_v3; pub mod glm; pub mod kimi_k2; +pub mod minimax_m2; pub mod nemotron3; pub mod qwen3; pub mod qwen35; @@ -15,6 +16,7 @@ pub mod qwen36; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; pub use glm::{GlmRenderer, GlmRendererBuilder}; pub use kimi_k2::{KimiK2Renderer, KimiK2RendererBuilder}; +pub use minimax_m2::{MiniMaxM2Renderer, MiniMaxM2RendererBuilder}; pub use nemotron3::{Nemotron3Renderer, Nemotron3RendererBuilder}; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; diff --git a/crates/renderers-core/src/parsing/minimax.rs b/crates/renderers-core/src/parsing/minimax.rs new file mode 100644 index 0000000..e931829 --- /dev/null +++ b/crates/renderers-core/src/parsing/minimax.rs @@ -0,0 +1,173 @@ +//! MiniMax M2 tool-call parser. Port of +//! `renderers/parsing.py:parse_minimax`. +//! +//! Structural shape: +//! +//! ```text +//! ...content... +//! ...reasoning... (special tokens) +//! +//! +//! value1 +//! value2 +//! +//! ...possibly more blocks in one wrapper... +//! +//! ``` +//! +//! Thinking is special-token (`` / ``); the +//! tool-call block is bounded by special tokens but the inner +//! `` / `` structure is parsed by regex on the +//! decoded span. + +use std::ops::Range; +use std::sync::LazyLock; + +use regex::Regex; + +use crate::parsing::{decode, find, find_from, strip_stop_tokens}; +use crate::tokenizer::Tokenizer; +use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus}; + +static INVOKE_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"(?s)(.*?)"#).expect("invoke regex") +}); +static PARAMETER_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"(?s)(.*?)"#).expect("parameter regex") +}); + +#[allow(clippy::too_many_arguments)] +pub fn parse_minimax( + tokenizer: &Tokenizer, + token_ids: &[u32], + stop_ids: &[u32], + think_id: u32, + think_end_id: u32, + tool_call_id: u32, + tool_call_end_id: u32, +) -> ParsedResponse { + let stripped = strip_stop_tokens(token_ids, stop_ids); + + // Thinking + let mut reasoning: Option = None; + let mut parse_offset = 0usize; + let working: Vec; + let ids: &[u32] = match find(stripped, think_end_id) { + Some(think_end) => { + let reasoning_ids: Vec = stripped[..think_end] + .iter() + .copied() + .filter(|&t| t != think_id) + .collect(); + let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); + reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty()); + parse_offset = think_end + 1; + &stripped[think_end + 1..] + } + None => { + if let Some(think_start) = find(stripped, think_id) { + let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; + } + working = stripped.to_vec(); + &working + } + }; + + let mut tool_calls: Vec = Vec::new(); + let content_text = match find(ids, tool_call_id) { + None => decode(tokenizer, ids).unwrap_or_default().trim().to_string(), + Some(tc_start) => { + let content = decode(tokenizer, &ids[..tc_start]) + .unwrap_or_default() + .trim() + .to_string(); + let mut i = tc_start; + while i < ids.len() { + if ids[i] != tool_call_id { + i += 1; + continue; + } + let span_start = parse_offset + i; + + let end = match find_from(ids, tool_call_end_id, i + 1) { + Some(end) => end, + None => { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + tool_calls.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: span_start, + end: parse_offset + ids.len(), + }), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; + } + }; + let block_text = decode(tokenizer, &ids[i + 1..end]).unwrap_or_default(); + let span = Range { + start: span_start, + end: parse_offset + end + 1, + }; + + let invokes: Vec<_> = INVOKE_RE.captures_iter(&block_text).collect(); + if invokes.is_empty() { + tool_calls.push(ParsedToolCall { + raw: block_text, + token_span: Some(span), + status: ToolCallParseStatus::MalformedStructure, + ..Default::default() + }); + } else { + for inv in invokes { + let name = inv.get(1).map(|m| m.as_str()).unwrap_or(""); + let body = inv.get(2).map(|m| m.as_str()).unwrap_or(""); + let mut arguments = serde_json::Map::new(); + let mut any_json_fallback = false; + for pm in PARAMETER_RE.captures_iter(body) { + let pname = pm.get(1).map(|m| m.as_str()).unwrap_or(""); + let pval = pm.get(2).map(|m| m.as_str().trim()).unwrap_or(""); + let v = match serde_json::from_str::(pval) { + Ok(v) => v, + Err(_) => { + any_json_fallback = true; + serde_json::Value::String(pval.to_string()) + } + }; + arguments.insert(pname.to_string(), v); + } + let status = if any_json_fallback { + ToolCallParseStatus::InvalidJson + } else { + ToolCallParseStatus::Ok + }; + tool_calls.push(ParsedToolCall { + raw: block_text.clone(), + name: Some(name.to_string()), + arguments: Some(ToolArguments::Object(serde_json::Value::Object( + arguments, + ))), + token_span: Some(span.clone()), + status, + ..Default::default() + }); + } + } + i = end + 1; + } + content + } + }; + + ParsedResponse { + content: content_text, + reasoning_content: reasoning, + tool_calls, + } +} diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs index 48e55d8..37f7921 100644 --- a/crates/renderers-core/src/parsing/mod.rs +++ b/crates/renderers-core/src/parsing/mod.rs @@ -12,6 +12,7 @@ pub mod deepseek_v3; pub mod glm; pub mod kimi_k2; +pub mod minimax; pub mod qwen3; pub mod qwen35; diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 6cb63a1..a2fe37f 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -15,7 +15,8 @@ use pyo3::types::{PyList, PyType}; use renderers_core::families::{ DeepSeekV3RendererBuilder, GlmRendererBuilder, KimiK2RendererBuilder, - Nemotron3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, + MiniMaxM2RendererBuilder, Nemotron3RendererBuilder, Qwen35RendererBuilder, + Qwen36RendererBuilder, Qwen3RendererBuilder, }; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -430,6 +431,33 @@ impl PyRenderer { Ok(PyRenderer { inner: Arc::new(renderer) }) } + /// Build a MiniMax M2 / M2.5 renderer from a tokenizer.json. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn minimax_m2( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + MiniMaxM2RendererBuilder::default() + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + /// Build a Kimi K2 renderer from a tokenizer.json. #[classmethod] #[pyo3(signature = ( diff --git a/renderers/minimax_m2.py b/renderers/minimax_m2.py index 39c12fa..601d55c 100644 --- a/renderers/minimax_m2.py +++ b/renderers/minimax_m2.py @@ -16,6 +16,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -53,6 +58,30 @@ class MiniMaxM2Renderer: """Deterministic message → token renderer for MiniMax M2 / M2.5 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + default_system: str = _DEFAULT_SYSTEM, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + # Native routing: only when the caller relies on the default + # system message; a custom default_system isn't wired through to + # the native classmethod yet. + if ( + native_enabled("minimax_m2") or native_enabled("minimax-m2") + ) and default_system == _DEFAULT_SYSTEM: + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.minimax_m2( + path, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, From 6d303bbcc07d2c723c9943cacdb2a230a29562db Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:45:16 +0200 Subject: [PATCH 10/35] Add Kimi K2.5 native parity path --- .../renderers-core/src/families/kimi_k25.rs | 376 ++++++++++++++++++ crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-py/src/lib.rs | 37 +- renderers/kimi_k25.py | 41 ++ 4 files changed, 455 insertions(+), 1 deletion(-) create mode 100644 crates/renderers-core/src/families/kimi_k25.rs diff --git a/crates/renderers-core/src/families/kimi_k25.rs b/crates/renderers-core/src/families/kimi_k25.rs new file mode 100644 index 0000000..297eac8 --- /dev/null +++ b/crates/renderers-core/src/families/kimi_k25.rs @@ -0,0 +1,376 @@ +//! Kimi K2.5 renderer (text-only path, no tools). +//! +//! Port of `renderers/kimi_k25.py` covering the most common call shape: +//! chat without function-calling tools and without images. The path with +//! TypeScript-style tool declarations and the multimodal path are +//! deferred to Phase 5 (the Python shim keeps those on the pure-Python +//! implementation for now). +//! +//! Distinctive features vs Kimi K2: +//! +//! - Generation prompt prefills `` (enable_thinking=True) or the +//! empty block `` (enable_thinking=False) to control +//! thinking mode at sample time. `` and `` may be +//! multi-token; the renderer encodes them as text. +//! - Assistant body uses the hist/suffix split: the last non-tool-call +//! assistant + all later assistants keep `reasoning_content`; +//! historical assistants collapse to a literal ``. +//! - Default system message is the same as K2 +//! ("You are Kimi, an AI assistant created by Moonshot AI.") but the +//! Python class doesn't auto-inject it — neither does this port. + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::kimi_k2::parse_kimi_k2; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, +}; + +#[derive(Debug, Clone)] +pub struct KimiK25RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for KimiK25RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl KimiK25RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + KimiK25Renderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct KimiK25Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + im_user: u32, + im_assistant: u32, + im_system: u32, + im_middle: u32, + im_end: u32, + tool_calls_section_begin: u32, + tool_calls_section_end: u32, + tool_call_begin: u32, + tool_call_argument_begin: u32, + tool_call_end: u32, + + stop_tokens: Vec, +} + +impl KimiK25Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + KimiK25RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> KimiK25RendererBuilder { + KimiK25RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: KimiK25RendererBuilder) -> Result { + let im_user = tokenizer.token_to_id_strict("<|im_user|>")?; + let im_assistant = tokenizer.token_to_id_strict("<|im_assistant|>")?; + let im_system = tokenizer.token_to_id_strict("<|im_system|>")?; + let im_middle = tokenizer.token_to_id_strict("<|im_middle|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let tool_calls_section_begin = + tokenizer.token_to_id_strict("<|tool_calls_section_begin|>")?; + let tool_calls_section_end = + tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; + let tool_call_begin = tokenizer.token_to_id_strict("<|tool_call_begin|>")?; + let tool_call_argument_begin = + tokenizer.token_to_id_strict("<|tool_call_argument_begin|>")?; + let tool_call_end = tokenizer.token_to_id_strict("<|tool_call_end|>")?; + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_user, + im_assistant, + im_system, + im_middle, + im_end, + tool_calls_section_begin, + tool_calls_section_end, + tool_call_begin, + tool_call_argument_begin, + tool_call_end, + stop_tokens: vec![im_end], + }) + } + + fn args_to_string(args: &ToolArguments) -> String { + match args { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_else(|_| "{}".into()), + } + } + + fn role_token(&self, role: &str) -> u32 { + match role { + "user" => self.im_user, + "assistant" => self.im_assistant, + _ => self.im_system, + } + } + + /// Extract `(reasoning_content, text_content)` from a message, + /// honouring the explicit `reasoning_content` field and the inline + /// `...` tag fallback. Mirrors the Python K2.5 + /// `_render_assistant_body` extraction. + fn extract_reasoning(msg: &Message) -> (String, String) { + if let Some(r) = &msg.reasoning_content { + return (r.clone(), msg.text_content().to_string()); + } + let content = msg.text_content(); + if let Some((before, after)) = content.split_once("") { + let reasoning = if let Some((_, inner)) = before.rsplit_once("") { + inner.to_string() + } else { + before.to_string() + }; + return (reasoning, after.trim_start_matches('\n').to_string()); + } + (String::new(), content.to_string()) + } + + fn emit_assistant_body( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + is_suffix: bool, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let (reasoning_content, text_content) = Self::extract_reasoning(msg); + + // hist/suffix split: hist drops reasoning, suffix preserves it. + if is_suffix || (preserve_thinking && !reasoning_content.is_empty()) { + let mut s = String::with_capacity(reasoning_content.len() + 16); + s.push_str(""); + s.push_str(&reasoning_content); + s.push_str(""); + buf.text(&s, msg_idx)?; + } else { + buf.text("", msg_idx)?; + } + buf.text(&text_content, msg_idx)?; + + if !msg.tool_calls.is_empty() { + buf.special(self.tool_calls_section_begin, msg_idx); + for tc in &msg.tool_calls { + let args_str = Self::args_to_string(&tc.function.arguments); + let tool_id = tc.id.clone().unwrap_or_default(); + buf.special(self.tool_call_begin, msg_idx); + buf.text(&tool_id, msg_idx)?; + buf.special(self.tool_call_argument_begin, msg_idx); + buf.text(&args_str, msg_idx)?; + buf.special(self.tool_call_end, msg_idx); + } + buf.special(self.tool_calls_section_end, msg_idx); + } + Ok(()) + } + + fn emit_tool_body(&self, buf: &mut RenderBuf<'_>, msg: &Message, msg_idx: i32) -> Result<(), RenderError> { + let tool_call_id = msg.tool_call_id.as_deref().unwrap_or(""); + let mut header = String::with_capacity(tool_call_id.len() + 16); + header.push_str("## Return of "); + header.push_str(tool_call_id); + header.push('\n'); + buf.text(&header, msg_idx)?; + let content = msg.text_content(); + if !content.is_empty() { + buf.text(content, msg_idx)?; + } + Ok(()) + } +} + +impl Renderer for KimiK25Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + // Tools route to Python — the TS-style declaration formatter + // (~270 lines) isn't ported yet. The Python shim avoids native + // routing when tools are present, so this is a hard error if we + // got here with tools. + if tools.map(|t| !t.is_empty()).unwrap_or(false) { + return Err(RenderError::Invalid( + "Kimi K2.5 with tools not supported on the native path yet; the Python shim should route to pure Python in this case".into(), + )); + } + + let mut buf = RenderBuf::new(&self.tokenizer, messages.len().max(1) * 256); + + // Find last non-tool-call assistant for the hist/suffix split + let mut last_non_tc_assistant: i32 = -1; + for (i, m) in messages.iter().enumerate().rev() { + if m.role == "assistant" && m.tool_calls.is_empty() { + last_non_tc_assistant = i as i32; + break; + } + } + + for (i, msg) in messages.iter().enumerate() { + let idx = i as i32; + buf.special(self.role_token(&msg.role), idx); + // K2.5 uses `msg.name or role` as the role-name literal + let role_name = msg.name.as_deref().unwrap_or(&msg.role); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + + match msg.role.as_str() { + "assistant" => { + let is_suffix = idx > last_non_tc_assistant; + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant_body(&mut buf, msg, idx, is_suffix, preserve_thinking)?; + } + "tool" => self.emit_tool_body(&mut buf, msg, idx)?, + _ => { + let content = msg.text_content(); + if !content.is_empty() { + buf.text(content, idx)?; + } + } + } + buf.special(self.im_end, idx); + } + + // Generation prompt + if add_generation_prompt { + buf.scaffold_special(self.im_assistant); + buf.scaffold_text("assistant")?; + buf.scaffold_special(self.im_middle); + if self.enable_thinking { + buf.scaffold_text("")?; + } else { + buf.scaffold_text("")?; + } + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + // K2.5 reuses the K2 parser shape; only differences are the + // thinking-tag handling, which the K2 parser already does via the + // decoded-text branch. + parse_kimi_k2( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_calls_section_begin, + self.tool_calls_section_end, + self.tool_call_begin, + self.tool_call_argument_begin, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + buf.special(self.role_token(&msg.role), idx); + let role_name = msg.name.as_deref().unwrap_or(&msg.role); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + match msg.role.as_str() { + "user" | "system" => { + let content = msg.text_content(); + if !content.is_empty() { + buf.text(content, idx)?; + } + } + "tool" => self.emit_tool_body(&mut buf, msg, idx)?, + _ => return Ok(None), + } + buf.special(self.im_end, idx); + } + + // Generation prompt + buf.scaffold_special(self.im_assistant); + buf.scaffold_text("assistant")?; + buf.scaffold_special(self.im_middle); + if self.enable_thinking { + buf.scaffold_text("")?; + } else { + buf.scaffold_text("")?; + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index bb42dbb..d91154a 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -7,6 +7,7 @@ pub mod deepseek_v3; pub mod glm; pub mod kimi_k2; +pub mod kimi_k25; pub mod minimax_m2; pub mod nemotron3; pub mod qwen3; @@ -16,6 +17,7 @@ pub mod qwen36; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; pub use glm::{GlmRenderer, GlmRendererBuilder}; pub use kimi_k2::{KimiK2Renderer, KimiK2RendererBuilder}; +pub use kimi_k25::{KimiK25Renderer, KimiK25RendererBuilder}; pub use minimax_m2::{MiniMaxM2Renderer, MiniMaxM2RendererBuilder}; pub use nemotron3::{Nemotron3Renderer, Nemotron3RendererBuilder}; pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index a2fe37f..7d882d2 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -14,7 +14,7 @@ use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; use renderers_core::families::{ - DeepSeekV3RendererBuilder, GlmRendererBuilder, KimiK2RendererBuilder, + DeepSeekV3RendererBuilder, GlmRendererBuilder, KimiK25RendererBuilder, KimiK2RendererBuilder, MiniMaxM2RendererBuilder, Nemotron3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; @@ -458,6 +458,41 @@ impl PyRenderer { Ok(PyRenderer { inner: Arc::new(renderer) }) } + /// Build a Kimi K2.5 renderer (text-only, no tools). + /// + /// The Python shim is expected to route Kimi K2.5 to native ONLY + /// when there are no tools and no image / video content — the + /// TypeScript-style tool declaration formatter and the vision + /// processor are still pure-Python in this phase. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn kimi_k25( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let renderer = py + .allow_threads(|| { + KimiK25RendererBuilder::default() + .enable_thinking(enable_thinking) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) + .build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + /// Build a Kimi K2 renderer from a tokenizer.json. #[classmethod] #[pyo3(signature = ( diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 352a9ee..7eac888 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -27,6 +27,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, MultiModalData, @@ -49,6 +54,17 @@ _load_pil_image, ) + +def _messages_have_media(messages: list[Message]) -> bool: + """Return True if any message carries image / video content parts.""" + for m in messages: + c = m.get("content") if isinstance(m, dict) else getattr(m, "content", None) + if isinstance(c, list): + for p in c: + if isinstance(p, dict) and p.get("type") in ("image", "image_url", "video", "video_url"): + return True + return False + # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -576,6 +592,31 @@ class KimiK25Renderer: The tokenizer should be ``moonshotai/Kimi-K2-Instruct`` (same as K2). """ + def __new__( + cls, + tokenizer, + *, + processor=None, + enable_thinking=True, + preserve_all_thinking=False, + preserve_thinking_between_tool_calls=False, + image_cache_max=256, + # Tools / messages are bound to render-time, but native routing + # decides eagerly here based on builder-time signals: skip native + # when a processor is configured (caller will pass images later). + ): + if native_enabled("kimi_k25") and processor is None: + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.kimi_k25( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, From ee68dd4a7db2b6db4529572d88d2adc7acb23577 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:45:29 +0200 Subject: [PATCH 11/35] Add GPT-OSS native parity path --- crates/renderers-core/Cargo.toml | 1 + crates/renderers-core/src/families/gpt_oss.rs | 706 ++++++++++++++++++ crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-py/src/lib.rs | 62 +- renderers/gpt_oss.py | 34 + 5 files changed, 802 insertions(+), 3 deletions(-) create mode 100644 crates/renderers-core/src/families/gpt_oss.rs diff --git a/crates/renderers-core/Cargo.toml b/crates/renderers-core/Cargo.toml index 4291afb..5c870b1 100644 --- a/crates/renderers-core/Cargo.toml +++ b/crates/renderers-core/Cargo.toml @@ -20,6 +20,7 @@ thiserror = { workspace = true } smallvec = { workspace = true } bumpalo = { workspace = true } phf = { workspace = true } +openai-harmony = { version = "0.0.8", default-features = false } [dev-dependencies] serde_json = { workspace = true } diff --git a/crates/renderers-core/src/families/gpt_oss.rs b/crates/renderers-core/src/families/gpt_oss.rs new file mode 100644 index 0000000..15d6391 --- /dev/null +++ b/crates/renderers-core/src/families/gpt_oss.rs @@ -0,0 +1,706 @@ +//! GPT-OSS (Harmony) renderer. +//! +//! Thin adapter over the `openai-harmony` Rust crate. Wire format is +//! harmony (channel-based, no BOS). The Python implementation goes +//! through the same library, so matching its conversion logic guarantees +//! byte-identical tokens. +//! +//! Architecture: +//! +//! - Holds a [`HarmonyEncoding`] (lazily loaded from +//! [`HarmonyEncodingName::HarmonyGptOss`]) and a cache of the +//! special-token ids it exposes. +//! - `render` builds a prefix conversation (SystemContent + DeveloperContent +//! when a system message or tools are present) via +//! `render_conversation`, then walks the remaining messages and renders +//! each one individually via `render(msg)` so per-token attribution +//! stays per-source-message. +//! - `parse_response` walks the completion tokens with our own scanner +//! (token-id based) — matching what `renderers/parsing.py:parse_gpt_oss` +//! does — so we don't need to manage a `StreamableParser`'s lifetime. +//! +//! This renderer does NOT need a HuggingFace `tokenizer.json`; the +//! harmony encoding embeds its own tiktoken-based tokenizer. + +use std::sync::Arc; + +use openai_harmony::chat::{ + Author, ChannelConfig, Conversation, DeveloperContent, Message as HarmonyMessage, + ReasoningEffort, Role as HarmonyRole, SystemContent, ToolDescription, +}; +use openai_harmony::{HarmonyEncoding, HarmonyEncodingName, load_harmony_encoding}; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::thinking::should_preserve_past_thinking; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, ParsedToolCall, RenderError, RenderedTokens, ToolArguments, + ToolCallParseStatus, ToolSpec, SCAFFOLD_IDX, +}; + +fn harmony_err(e: E) -> RenderError { + RenderError::Invalid(format!("harmony: {e}")) +} + +/// Builder for [`GptOssRenderer`]. +#[derive(Debug, Clone)] +pub struct GptOssRendererBuilder { + use_system_prompt: bool, + reasoning_effort: ReasoningEffort, + conversation_start_date: Option, + knowledge_cutoff: Option, + model_identity: Option, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for GptOssRendererBuilder { + fn default() -> Self { + Self { + use_system_prompt: true, + reasoning_effort: ReasoningEffort::Medium, + conversation_start_date: None, + knowledge_cutoff: None, + model_identity: None, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl GptOssRendererBuilder { + pub fn use_system_prompt(mut self, on: bool) -> Self { + self.use_system_prompt = on; + self + } + pub fn reasoning_effort(mut self, effort: &str) -> Result { + self.reasoning_effort = match effort.to_ascii_lowercase().as_str() { + "low" => ReasoningEffort::Low, + "medium" => ReasoningEffort::Medium, + "high" => ReasoningEffort::High, + other => return Err(RenderError::Invalid(format!("unknown reasoning effort: {other}"))), + }; + Ok(self) + } + pub fn conversation_start_date(mut self, d: impl Into) -> Self { + self.conversation_start_date = Some(d.into()); + self + } + pub fn knowledge_cutoff(mut self, k: impl Into) -> Self { + self.knowledge_cutoff = Some(k.into()); + self + } + pub fn model_identity(mut self, m: impl Into) -> Self { + self.model_identity = Some(m.into()); + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self) -> Result { + GptOssRenderer::new_with(self) + } +} + +#[derive(Debug, Clone)] +pub struct GptOssRenderer { + enc: Arc, + use_system_prompt: bool, + reasoning_effort: ReasoningEffort, + conversation_start_date: String, + knowledge_cutoff: Option, + model_identity: Option, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + // Cached special-token ids — used by the parser and the generation prompt. + start: u32, + end: u32, + return_tok: u32, + call: u32, + channel: u32, + message: u32, + #[allow(dead_code)] + constrain: u32, + + stop_tokens: Vec, +} + +impl GptOssRenderer { + pub fn new() -> Result { + GptOssRendererBuilder::default().build() + } + pub fn builder() -> GptOssRendererBuilder { + GptOssRendererBuilder::default() + } + + fn new_with(cfg: GptOssRendererBuilder) -> Result { + let enc = load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss).map_err(harmony_err)?; + + // Resolve special-token ids by encoding their canonical text and + // asserting a single-token round-trip. The harmony encoding + // exposes a `tokenizer()` accessor (tiktoken CoreBPE) so we use + // its public special-token API. Bound to `enc` here directly so + // the rest of the constructor doesn't need to name CoreBPE + // (private outside the harmony crate). + let resolve = |s: &str| -> Result { + let ids = enc.tokenizer().encode_with_special_tokens(s); + if ids.len() != 1 { + return Err(RenderError::MissingSpecialToken(s.to_string())); + } + u32::try_from(ids[0]) + .map_err(|_| RenderError::MissingSpecialToken(s.to_string())) + }; + let start = resolve("<|start|>")?; + let end = resolve("<|end|>")?; + let return_tok = resolve("<|return|>")?; + let call = resolve("<|call|>")?; + let channel = resolve("<|channel|>")?; + let message = resolve("<|message|>")?; + let constrain = resolve("<|constrain|>")?; + + let start_date = cfg + .conversation_start_date + .clone() + .unwrap_or_else(today_yyyy_mm_dd); + + Ok(Self { + enc: Arc::new(enc), + use_system_prompt: cfg.use_system_prompt, + reasoning_effort: cfg.reasoning_effort, + conversation_start_date: start_date, + knowledge_cutoff: cfg.knowledge_cutoff, + model_identity: cfg.model_identity, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + start, + end, + return_tok, + call, + channel, + message, + constrain, + stop_tokens: vec![return_tok, call], + }) + } + + /// Append rendered ids to `tokens`, attribute each to `msg_idx`. + fn emit_render( + &self, + tokens: &mut Vec, + indices: &mut Vec, + msg_idx: i32, + message: &HarmonyMessage, + ) -> Result<(), RenderError> { + let mut out: Vec = Vec::new(); + self.enc + .render_into(message, &mut out, None) + .map_err(harmony_err)?; + let len = out.len(); + tokens.append(&mut out); + indices.extend(std::iter::repeat(msg_idx).take(len)); + Ok(()) + } + + /// Encode a UTF-8 string via the harmony tokenizer, returning u32 ids. + /// Helper so the call sites don't need to name CoreBPE (which is not + /// re-exported from the harmony crate). + fn encode_text(&self, text: &str) -> Vec { + self.enc + .tokenizer() + .encode_with_special_tokens(text) + .iter() + .map(|&r| r as u32) + .collect() + } + + /// Decode a slice of token ids via the harmony tokenizer. + fn decode_text(&self, ids: &[u32]) -> String { + if ids.is_empty() { + return String::new(); + } + // `Rank` in tiktoken is `u32` — pass ids directly without casting. + self.enc + .tokenizer() + .decode_utf8(ids.iter().copied()) + .unwrap_or_default() + } + + fn render_conversation_tokens( + &self, + messages: Vec, + ) -> Result, RenderError> { + let convo = Conversation::from_messages(messages); + let mut out: Vec = Vec::new(); + self.enc + .render_conversation_into(convo.messages.iter(), &mut out, None) + .map_err(harmony_err)?; + Ok(out) + } + + /// Build the harmony Author for tool messages — needs the function + /// name, which we recover from `msg.name` (set client-side by + /// `_attach_tool_call_names`). + fn tool_author(msg: &Message) -> Author { + let name = msg.name.as_deref().unwrap_or("unknown"); + let qualified: String = if name.starts_with("functions.") { + name.to_string() + } else { + format!("functions.{name}") + }; + Author { + role: HarmonyRole::Tool, + name: Some(qualified), + } + } + + fn message_to_harmony( + &self, + msg: &Message, + preserve_thinking: bool, + ) -> Vec { + match msg.role.as_str() { + "user" => vec![HarmonyMessage::from_role_and_content( + HarmonyRole::User, + msg.text_content().to_string(), + )], + "system" | "developer" => { + let dev = DeveloperContent::new().with_instructions(msg.text_content()); + vec![HarmonyMessage::from_role_and_content( + HarmonyRole::Developer, + dev, + )] + } + "tool" => { + let m = HarmonyMessage::from_author_and_content( + Self::tool_author(msg), + msg.text_content().to_string(), + ) + .with_recipient("assistant") + .with_channel("commentary"); + vec![m] + } + "assistant" => self.assistant_to_harmony(msg, preserve_thinking), + _ => { + let dev = DeveloperContent::new().with_instructions(msg.text_content()); + vec![HarmonyMessage::from_role_and_content( + HarmonyRole::Developer, + dev, + )] + } + } + } + + fn assistant_to_harmony( + &self, + msg: &Message, + preserve_thinking: bool, + ) -> Vec { + let mut out: Vec = Vec::new(); + + if preserve_thinking { + if let Some(reasoning) = msg.reasoning_content.as_deref() { + if !reasoning.is_empty() { + let m = HarmonyMessage::from_role_and_content( + HarmonyRole::Assistant, + reasoning.to_string(), + ) + .with_channel("analysis"); + out.push(m); + } + } + } + + // Text content goes on the `final` channel. + let text = msg.text_content(); + if !text.is_empty() { + let m = HarmonyMessage::from_role_and_content( + HarmonyRole::Assistant, + text.to_string(), + ) + .with_channel("final"); + out.push(m); + } + + // Each tool_call becomes its own assistant message on the + // commentary channel with recipient=functions.. + for tc in &msg.tool_calls { + let name = &tc.function.name; + let args = match &tc.function.arguments { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_default(), + }; + let recipient = if name.starts_with("functions.") { + name.clone() + } else { + format!("functions.{name}") + }; + let m = HarmonyMessage::from_role_and_content(HarmonyRole::Assistant, args) + .with_channel("commentary") + .with_recipient(recipient); + out.push(m); + } + + // Empty assistant with no text and no tool_calls: emit empty + // final-channel message so per-token attribution still produces + // at least one token slot. + if out.is_empty() { + let m = HarmonyMessage::from_role_and_content( + HarmonyRole::Assistant, + String::new(), + ) + .with_channel("final"); + out.push(m); + } + + out + } + + fn tool_to_description(tool: &ToolSpec) -> ToolDescription { + ToolDescription::new( + tool.name.as_str(), + tool.description.as_str(), + Some(tool.parameters.clone()), + ) + } + + fn build_system_content(&self) -> SystemContent { + let mut s = SystemContent::new().with_reasoning_effort(self.reasoning_effort); + s = s.with_conversation_start_date(self.conversation_start_date.as_str()); + if let Some(k) = &self.knowledge_cutoff { + s = s.with_knowledge_cutoff(k.as_str()); + } + if let Some(m) = &self.model_identity { + s = s.with_model_identity(m.as_str()); + } + s + } + + fn emit_generation_prompt(&self, tokens: &mut Vec, indices: &mut Vec) { + tokens.push(self.start); + indices.push(SCAFFOLD_IDX); + // "assistant" + <|channel|> + "analysis" + <|message|> + for id in self.encode_text("assistant") { + tokens.push(id); + indices.push(SCAFFOLD_IDX); + } + tokens.push(self.channel); + indices.push(SCAFFOLD_IDX); + for id in self.encode_text("analysis") { + tokens.push(id); + indices.push(SCAFFOLD_IDX); + } + tokens.push(self.message); + indices.push(SCAFFOLD_IDX); + } +} + +impl Renderer for GptOssRenderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut tokens: Vec = Vec::with_capacity(messages.len() * 256); + let mut indices: Vec = Vec::with_capacity(messages.len() * 256); + + let first_system_idx = messages.iter().position(|m| m.role == "system"); + + // Prefix: SystemContent + DeveloperContent (when tools or a + // caller-supplied system are present). + let mut prefix_msgs: Vec = Vec::new(); + if self.use_system_prompt { + let sys = self.build_system_content(); + let sys = match tools { + Some(t) if !t.is_empty() => { + sys.with_channel_config(ChannelConfig::require_channels([ + "analysis", + "commentary", + "final", + ])) + } + _ => sys, + }; + prefix_msgs.push(HarmonyMessage::from_role_and_content( + HarmonyRole::System, + sys, + )); + } + let has_dev = first_system_idx.is_some() + || tools.map(|t| !t.is_empty()).unwrap_or(false); + if has_dev { + let mut dev = DeveloperContent::new(); + if let Some(idx) = first_system_idx { + let instr = messages[idx].text_content(); + if !instr.is_empty() { + dev = dev.with_instructions(instr); + } + } + if let Some(t) = tools { + if !t.is_empty() { + let descs: Vec = + t.iter().map(Self::tool_to_description).collect(); + dev = dev.with_function_tools(descs); + } + } + prefix_msgs.push(HarmonyMessage::from_role_and_content( + HarmonyRole::Developer, + dev, + )); + } + if !prefix_msgs.is_empty() { + let prefix_tokens = self.render_conversation_tokens(prefix_msgs)?; + let attr_idx: i32 = first_system_idx.map(|i| i as i32).unwrap_or(SCAFFOLD_IDX); + for id in prefix_tokens { + tokens.push(id); + indices.push(attr_idx); + } + } + + // Body + let last_idx = messages.len() - 1; + for (i, msg) in messages.iter().enumerate() { + if Some(i) == first_system_idx { + continue; + } + let preserve_thinking = msg.role == "assistant" + && should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + for hm in self.message_to_harmony(msg, preserve_thinking) { + self.emit_render(&mut tokens, &mut indices, i as i32, &hm)?; + } + } + + // Terminal close: if the conversation ends on a plain assistant + // turn (no tool_calls) and we're not asking for a generation + // prompt, swap the trailing <|end|> for <|return|> — matches + // apply_chat_template. + if !add_generation_prompt + && last_idx < messages.len() + && messages[last_idx].role == "assistant" + && messages[last_idx].tool_calls.is_empty() + && tokens.last().copied() == Some(self.end) + { + *tokens.last_mut().expect("non-empty") = self.return_tok; + } + + if add_generation_prompt { + self.emit_generation_prompt(&mut tokens, &mut indices); + } + + Ok(RenderedTokens { + token_ids: tokens, + message_indices: indices, + multi_modal_data: None, + }) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + // Walk tokens block-by-block: `<|start|>{header}<|message|>{body}{terminator}`. + // Terminator is one of `<|start|>` (next block), `<|end|>`, `<|call|>`. + // `<|return|>` truncates the entire response. + let return_pos = token_ids.iter().position(|&t| t == self.return_tok); + let ids: &[u32] = match return_pos { + Some(p) => &token_ids[..p], + None => token_ids, + }; + + let mut reasoning_parts: Vec = Vec::new(); + let mut content_parts: Vec = Vec::new(); + let mut tool_calls: Vec = Vec::new(); + + let mut i = 0usize; + while i < ids.len() { + if ids[i] != self.start { + i += 1; + continue; + } + let block_start = i; + let Some(msg_pos) = ids[i + 1..].iter().position(|&t| t == self.message).map(|p| p + i + 1) + else { + break; + }; + let header_ids = &ids[i + 1..msg_pos]; + let header_text = self.decode_text(header_ids); + + let body_start = msg_pos + 1; + let body_end = ids[body_start..] + .iter() + .position(|&t| t == self.start || t == self.end || t == self.call) + .map(|p| p + body_start) + .unwrap_or(ids.len()); + let body_closed = body_end < ids.len() && (ids[body_end] == self.end || ids[body_end] == self.call); + let body_text = self.decode_text(&ids[body_start..body_end]); + + // Channel: look for <|channel|>NAME in header — NAME is the + // text between the channel token and the next whitespace / + // special token. + let channel = header_ids + .iter() + .position(|&t| t == self.channel) + .map(|p| { + let after = &header_ids[p + 1..]; + // Take tokens until newline/space — but since header + // is short, just decode the rest and split. + self.decode_text(after).trim().to_string() + }) + .unwrap_or_default(); + + // Recipient: header text may contain "to=functions.NAME" + let recipient: Option<&str> = header_text + .split("to=") + .nth(1) + .map(|s| s.split(|c: char| c.is_whitespace() || c == '<').next().unwrap_or("")); + + if let Some(r) = recipient { + if r.starts_with("functions.") { + let tool_name = &r["functions.".len()..]; + let block_end = if body_closed { body_end + 1 } else { body_end }; + let span = block_start..block_end; + match serde_json::from_str::(&body_text) { + Ok(v) => { + tool_calls.push(ParsedToolCall { + raw: body_text.clone(), + name: Some(tool_name.to_string()), + arguments: Some(ToolArguments::Object(v)), + token_span: Some(span), + status: ToolCallParseStatus::Ok, + ..Default::default() + }); + } + Err(_) => { + tool_calls.push(ParsedToolCall { + raw: body_text.clone(), + name: Some(tool_name.to_string()), + arguments: Some(ToolArguments::Raw(body_text.clone())), + token_span: Some(span), + status: ToolCallParseStatus::InvalidJson, + ..Default::default() + }); + } + } + i = if body_closed { body_end + 1 } else { body_end }; + continue; + } + } + + match channel.split_whitespace().next() { + Some("analysis") => reasoning_parts.push(body_text), + Some("final") | _ => content_parts.push(body_text), + } + + i = if body_closed { body_end + 1 } else { body_end }; + } + + let reasoning_content = if reasoning_parts.is_empty() { + None + } else { + Some(reasoning_parts.join("").trim().to_string()).filter(|s| !s.is_empty()) + }; + + ParsedResponse { + content: content_parts.join("").trim().to_string(), + reasoning_content, + tool_calls, + } + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &[self.return_tok, self.call], + Some(self.end), + ) else { + return Ok(None); + }; + + let mut ext: Vec = Vec::new(); + for msg in new_messages { + match msg.role.as_str() { + "tool" | "user" | "system" | "developer" => {} + _ => return Ok(None), + } + for hm in self.message_to_harmony(msg, false) { + let mut out: Vec = Vec::new(); + self.enc.render_into(&hm, &mut out, None).map_err(harmony_err)?; + ext.extend(out); + } + } + + // Generation prompt + ext.push(self.start); + ext.extend(self.encode_text("assistant")); + ext.push(self.channel); + ext.extend(self.encode_text("analysis")); + ext.push(self.message); + + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +fn today_yyyy_mm_dd() -> String { + // Avoid pulling chrono — use std::time::SystemTime and a small + // conversion that's good enough for "today" in UTC. + use std::time::{SystemTime, UNIX_EPOCH}; + let secs = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + let days = secs / 86_400; + // 1970-01-01 + days + let (y, m, d) = civil_from_days(days as i64); + format!("{y:04}-{m:02}-{d:02}") +} + +/// Convert days since 1970-01-01 to (year, month, day) — Howard Hinnant's +/// algorithm, public-domain. +fn civil_from_days(z: i64) -> (i32, u32, u32) { + let z = z + 719_468; + let era = if z >= 0 { z } else { z - 146_096 } / 146_097; + let doe = (z - era * 146_097) as u32; + let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; + let y = yoe as i32 + era as i32 * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = doy - (153 * mp + 2) / 5 + 1; + let m = if mp < 10 { mp + 3 } else { mp - 9 }; + let y = if m <= 2 { y + 1 } else { y }; + (y, m, d) +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index d91154a..51c8b69 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -6,6 +6,7 @@ pub mod deepseek_v3; pub mod glm; +pub mod gpt_oss; pub mod kimi_k2; pub mod kimi_k25; pub mod minimax_m2; @@ -16,6 +17,7 @@ pub mod qwen36; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; pub use glm::{GlmRenderer, GlmRendererBuilder}; +pub use gpt_oss::{GptOssRenderer, GptOssRendererBuilder}; pub use kimi_k2::{KimiK2Renderer, KimiK2RendererBuilder}; pub use kimi_k25::{KimiK25Renderer, KimiK25RendererBuilder}; pub use minimax_m2::{MiniMaxM2Renderer, MiniMaxM2RendererBuilder}; diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 7d882d2..9ab8acd 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -14,9 +14,9 @@ use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; use renderers_core::families::{ - DeepSeekV3RendererBuilder, GlmRendererBuilder, KimiK25RendererBuilder, KimiK2RendererBuilder, - MiniMaxM2RendererBuilder, Nemotron3RendererBuilder, Qwen35RendererBuilder, - Qwen36RendererBuilder, Qwen3RendererBuilder, + DeepSeekV3RendererBuilder, GlmRendererBuilder, GptOssRendererBuilder, KimiK25RendererBuilder, + KimiK2RendererBuilder, MiniMaxM2RendererBuilder, Nemotron3RendererBuilder, + Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -458,6 +458,62 @@ impl PyRenderer { Ok(PyRenderer { inner: Arc::new(renderer) }) } + /// Build a GPT-OSS (Harmony) renderer. + /// + /// Unlike the other families, GPT-OSS doesn't need a HuggingFace + /// `tokenizer.json` — the harmony encoding embeds its own + /// tiktoken-based tokenizer. The `tokenizer_path` argument is + /// ignored on this path but kept for API uniformity with the other + /// classmethods (callers can pass an empty string). + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + use_system_prompt = true, + reasoning_effort = None, + conversation_start_date = None, + knowledge_cutoff = None, + model_identity = None, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + #[allow(clippy::too_many_arguments)] + fn gpt_oss( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + use_system_prompt: bool, + reasoning_effort: Option<&str>, + conversation_start_date: Option<&str>, + knowledge_cutoff: Option<&str>, + model_identity: Option<&str>, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + let _ = tokenizer_path; // not needed for harmony + let effort = reasoning_effort.unwrap_or("medium").to_string(); + let renderer = py + .allow_threads(move || -> Result<_, renderers_core::types::RenderError> { + let mut b = GptOssRendererBuilder::default() + .use_system_prompt(use_system_prompt) + .preserve_all_thinking(preserve_all_thinking) + .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls); + b = b.reasoning_effort(&effort)?; + if let Some(d) = conversation_start_date { + b = b.conversation_start_date(d); + } + if let Some(k) = knowledge_cutoff { + b = b.knowledge_cutoff(k); + } + if let Some(m) = model_identity { + b = b.model_identity(m); + } + b.build() + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + /// Build a Kimi K2.5 renderer (text-only, no tools). /// /// The Python shim is expected to route Kimi K2.5 to native ONLY diff --git a/renderers/gpt_oss.py b/renderers/gpt_oss.py index f1bb04a..99b9fc9 100644 --- a/renderers/gpt_oss.py +++ b/renderers/gpt_oss.py @@ -51,6 +51,11 @@ ) from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -119,6 +124,35 @@ def _arguments_to_str(arguments: Any) -> str: class GptOssRenderer: """Deterministic message → token renderer for OpenAI gpt-oss (harmony).""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + use_system_prompt: bool = True, + reasoning_effort: str | None = "medium", + conversation_start_date: str | None = None, + knowledge_cutoff: str | None = None, + model_identity: str | None = None, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if native_enabled("gpt_oss") or native_enabled("gpt-oss"): + native = load_native() + if native is not None: + # GPT-OSS embeds its own tokenizer; the tokenizer_path + # argument is ignored on the native side. + return native.Renderer.gpt_oss( + "", + use_system_prompt=use_system_prompt, + reasoning_effort=reasoning_effort, + conversation_start_date=conversation_start_date, + knowledge_cutoff=knowledge_cutoff, + model_identity=model_identity, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, From 31939db64f470a6b32eb509b7ac5a20111f1fd42 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:45:43 +0200 Subject: [PATCH 12/35] Add DefaultRenderer native parity path --- crates/renderers-core/Cargo.toml | 1 + crates/renderers-core/src/families/default.rs | 375 ++++++++++++++++++ crates/renderers-core/src/families/mod.rs | 2 + crates/renderers-py/src/lib.rs | 53 ++- renderers/default.py | 47 +++ 5 files changed, 475 insertions(+), 3 deletions(-) create mode 100644 crates/renderers-core/src/families/default.rs diff --git a/crates/renderers-core/Cargo.toml b/crates/renderers-core/Cargo.toml index 5c870b1..78184fe 100644 --- a/crates/renderers-core/Cargo.toml +++ b/crates/renderers-core/Cargo.toml @@ -21,6 +21,7 @@ smallvec = { workspace = true } bumpalo = { workspace = true } phf = { workspace = true } openai-harmony = { version = "0.0.8", default-features = false } +minijinja = { version = "2", default-features = false, features = ["builtins", "serde"] } [dev-dependencies] serde_json = { workspace = true } diff --git a/crates/renderers-core/src/families/default.rs b/crates/renderers-core/src/families/default.rs new file mode 100644 index 0000000..ec51515 --- /dev/null +++ b/crates/renderers-core/src/families/default.rs @@ -0,0 +1,375 @@ +//! DefaultRenderer — Jinja-template fallback for models without a +//! hand-coded family. +//! +//! Port of `renderers/default.py`. Two key differences from the Python +//! implementation: +//! +//! - Renders the template with [`minijinja`] (vs HF's Python Jinja). The +//! `chat_template` string is loaded from the model's +//! `tokenizer_config.json` and rendered against a context built from +//! the messages + tools. minijinja covers the Jinja2 subset HF +//! templates actually use (`for`, `if`, `set`, filters like `tojson`, +//! `length`, `trim`); anything more exotic will return a render error +//! instead of silently miscompiling. +//! - Per-token attribution is incremental: render the conversation +//! prefix-by-prefix and attribute the delta to each message index. +//! Same algorithm as the Python class, but driven by minijinja +//! instead of HF's `apply_chat_template`. +//! +//! `parse_response` is intentionally basic: strip stop tokens, decode, +//! split on `` if present. Models with structured tool calls +//! need a hand-coded family — DefaultRenderer doesn't try to guess. +//! +//! `bridge_to_next_turn` returns `None` unconditionally: without +//! template-specific knowledge of the turn-close token, the bridge +//! contract can't be proven, so the caller falls back to a full +//! re-render. + +use std::sync::Arc; + +use minijinja::value::Value as MjValue; +use minijinja::{Environment, context}; +use serde_json::Value as JsonValue; + +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, +}; + +/// Builder for [`DefaultRenderer`]. +pub struct DefaultRendererBuilder { + chat_template: String, + stop_token_ids: Vec, + extra_context: Vec<(String, JsonValue)>, +} + +impl DefaultRendererBuilder { + pub fn new(chat_template: impl Into) -> Self { + Self { + chat_template: chat_template.into(), + stop_token_ids: Vec::new(), + extra_context: Vec::new(), + } + } + /// Stop tokens — typically `[eos_token_id]`. The caller decides; the + /// renderer doesn't probe the tokenizer for `eos_token` since the + /// canonical id varies per model. + pub fn stop_token_ids(mut self, ids: Vec) -> Self { + self.stop_token_ids = ids; + self + } + /// Add a `key=value` context variable for the Jinja template. + /// Common entries: `bos_token`, `eos_token`, `add_generation_prompt`. + pub fn add_context(mut self, key: impl Into, value: JsonValue) -> Self { + self.extra_context.push((key.into(), value)); + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + DefaultRenderer::new_with(tokenizer, self) + } +} + +impl std::fmt::Debug for DefaultRendererBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DefaultRendererBuilder") + .field("chat_template_len", &self.chat_template.len()) + .field("stop_token_ids", &self.stop_token_ids) + .field("extra_context_keys", &self.extra_context.len()) + .finish() + } +} + +pub struct DefaultRenderer { + tokenizer: Tokenizer, + env: Arc>, + extra_context: Vec<(String, JsonValue)>, + stop_token_ids: Vec, +} + +impl std::fmt::Debug for DefaultRenderer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DefaultRenderer") + .field("stop_token_ids", &self.stop_token_ids) + .field("extra_context_keys", &self.extra_context.len()) + .finish() + } +} + +impl Clone for DefaultRenderer { + fn clone(&self) -> Self { + Self { + tokenizer: self.tokenizer.clone(), + env: self.env.clone(), + extra_context: self.extra_context.clone(), + stop_token_ids: self.stop_token_ids.clone(), + } + } +} + +impl DefaultRenderer { + fn new_with( + tokenizer: Tokenizer, + cfg: DefaultRendererBuilder, + ) -> Result { + let mut env = Environment::new(); + // HF chat templates use whitespace-stripped markers freely + // (e.g. `{%- if foo -%}`); minijinja respects that via the + // `lstrip_blocks` / `trim_blocks` knobs below. + env.set_lstrip_blocks(true); + env.set_trim_blocks(true); + env.add_template_owned("chat", cfg.chat_template) + .map_err(|e| RenderError::Invalid(format!("chat_template parse: {e}")))?; + Ok(Self { + tokenizer, + env: Arc::new(env), + extra_context: cfg.extra_context, + stop_token_ids: cfg.stop_token_ids, + }) + } + + pub fn builder(chat_template: impl Into) -> DefaultRendererBuilder { + DefaultRendererBuilder::new(chat_template) + } + + /// Render the template up to `messages[..end]` (exclusive). When + /// `add_generation_prompt` is true the template's gen-prompt branch + /// fires. + fn render_jinja( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let messages_value = messages_to_value(messages)?; + let tools_value: MjValue = match tools { + Some(t) => tools_to_value(t)?, + None => MjValue::from(Vec::::new()), + }; + let mut ctx = context! { + messages => messages_value, + tools => tools_value, + add_generation_prompt => add_generation_prompt, + }; + for (k, v) in &self.extra_context { + // Merge — minijinja contexts compose by re-emitting. + ctx = minijinja::Value::from_object(MergedCtx { + base: ctx.clone(), + key: k.clone(), + value: MjValue::from_serialize(v), + }) + .into(); + } + let tmpl = self + .env + .get_template("chat") + .map_err(|e| RenderError::Invalid(format!("chat_template lookup: {e}")))?; + tmpl.render(ctx) + .map_err(|e| RenderError::Invalid(format!("chat_template render: {e}"))) + } + + fn encode_full(&self, text: &str) -> Result, RenderError> { + Ok(self.tokenizer.encode_no_special(text)?.as_slice().to_vec()) + } +} + +impl Renderer for DefaultRenderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + // Incremental render: tokenise prefix-by-prefix, attribute the + // delta to each message index. Same approach as the Python class. + let mut token_ids: Vec = Vec::new(); + let mut message_indices: Vec = Vec::new(); + let mut prev_len = 0usize; + + for (i, _) in messages.iter().enumerate() { + let text = self.render_jinja(&messages[..=i], tools, false)?; + let ids = self.encode_full(&text)?; + if ids.len() < prev_len { + // Template didn't extend prefix-monotonically — fall back to + // a single full render attributed entirely to scaffolding. + let all = self.encode_full(&self.render_jinja(messages, tools, add_generation_prompt)?)?; + return Ok(RenderedTokens { + token_ids: all.clone(), + message_indices: vec![SCAFFOLD_IDX; all.len()], + multi_modal_data: None, + }); + } + let new_count = ids.len() - prev_len; + message_indices.extend(std::iter::repeat(i as i32).take(new_count)); + token_ids = ids; + prev_len = token_ids.len(); + } + + if add_generation_prompt { + let full = self.render_jinja(messages, tools, true)?; + let full_ids = self.encode_full(&full)?; + if full_ids.len() >= prev_len { + let gen_count = full_ids.len() - prev_len; + message_indices.extend(std::iter::repeat(SCAFFOLD_IDX).take(gen_count)); + token_ids = full_ids; + } else { + token_ids = full_ids; + message_indices.truncate(token_ids.len()); + } + } + + Ok(RenderedTokens { + token_ids, + message_indices, + multi_modal_data: None, + }) + } + + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + // Fast path: one full render instead of N prefix renders. Used by + // callers that don't need per-token attribution. + let text = self.render_jinja(messages, tools, add_generation_prompt)?; + self.encode_full(&text) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + // Truncate at the first stop token. + let end = token_ids + .iter() + .position(|t| self.stop_token_ids.contains(t)) + .unwrap_or(token_ids.len()); + let text = self.tokenizer.decode(&token_ids[..end]).unwrap_or_default(); + + // Split out a `...` block if present. Same logic + // as the Python fallback. + let (reasoning_content, content) = match text.split_once("") { + Some((before, after)) => { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner.to_string() + } else { + before.to_string() + }; + (Some(r).filter(|s| !s.is_empty()), after.to_string()) + } + None => (None, text.clone()), + }; + + ParsedResponse { + content, + reasoning_content, + tool_calls: Vec::new(), + } + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_token_ids + } + + fn bridge_to_next_turn( + &self, + _previous_prompt_ids: &[u32], + _previous_completion_ids: &[u32], + _new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + // Same contract as the Python DefaultRenderer: without family + // knowledge of the turn-close token, the bridge can't be proven. + Ok(None) + } +} + +// ── Jinja context conversion ────────────────────────────────────────── + +fn messages_to_value(messages: &[Message]) -> Result { + let mut out: Vec = Vec::with_capacity(messages.len()); + for m in messages { + let mut map = serde_json::Map::new(); + map.insert("role".into(), JsonValue::String(m.role.clone())); + // Content: string fast-path, structured parts pass through as JSON + let content_value = match &m.content { + crate::types::Content::Text(s) => JsonValue::String(s.clone()), + crate::types::Content::Parts(parts) => serde_json::to_value(parts) + .map_err(|e| RenderError::Invalid(format!("content serialisation: {e}")))?, + }; + map.insert("content".into(), content_value); + if let Some(name) = &m.name { + map.insert("name".into(), JsonValue::String(name.clone())); + } + if let Some(tcid) = &m.tool_call_id { + map.insert("tool_call_id".into(), JsonValue::String(tcid.clone())); + } + if let Some(r) = &m.reasoning_content { + map.insert("reasoning_content".into(), JsonValue::String(r.clone())); + } + if !m.tool_calls.is_empty() { + let tcs: Vec = m + .tool_calls + .iter() + .map(|tc| { + let args = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => serde_json::from_str(s) + .unwrap_or(JsonValue::String(s.clone())), + }; + serde_json::json!({ + "type": tc.kind, + "id": tc.id, + "function": { + "name": tc.function.name, + "arguments": args, + }, + }) + }) + .collect(); + map.insert("tool_calls".into(), JsonValue::Array(tcs)); + } + out.push(MjValue::from_serialize(JsonValue::Object(map))); + } + Ok(MjValue::from(out)) +} + +fn tools_to_value(tools: &[ToolSpec]) -> Result { + let mut out: Vec = Vec::with_capacity(tools.len()); + for t in tools { + let v = serde_json::json!({ + "type": "function", + "function": { + "name": t.name, + "description": t.description, + "parameters": t.parameters, + }, + }); + out.push(MjValue::from_serialize(v)); + } + Ok(MjValue::from(out)) +} + +/// Minijinja value adapter that merges an extra `(key, value)` pair +/// into an existing context. The HF templates expect `bos_token`, +/// `eos_token`, etc. to be addressable directly off the top-level +/// context. +#[derive(Debug, Clone)] +struct MergedCtx { + base: MjValue, + key: String, + value: MjValue, +} + +impl minijinja::value::Object for MergedCtx { + fn get_value(self: &Arc, key: &MjValue) -> Option { + if let Some(k) = key.as_str() { + if k == self.key { + return Some(self.value.clone()); + } + } + self.base.get_item(key).ok() + } +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index 51c8b69..afd784c 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -4,6 +4,7 @@ //! stays focused. New families slot in by adding a module here and a //! registry entry in [`crate::registry`]. +pub mod default; pub mod deepseek_v3; pub mod glm; pub mod gpt_oss; @@ -15,6 +16,7 @@ pub mod qwen3; pub mod qwen35; pub mod qwen36; +pub use default::{DefaultRenderer, DefaultRendererBuilder}; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; pub use glm::{GlmRenderer, GlmRendererBuilder}; pub use gpt_oss::{GptOssRenderer, GptOssRendererBuilder}; diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 9ab8acd..756581b 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -14,9 +14,9 @@ use pyo3::prelude::*; use pyo3::types::{PyList, PyType}; use renderers_core::families::{ - DeepSeekV3RendererBuilder, GlmRendererBuilder, GptOssRendererBuilder, KimiK25RendererBuilder, - KimiK2RendererBuilder, MiniMaxM2RendererBuilder, Nemotron3RendererBuilder, - Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, + DefaultRendererBuilder, DeepSeekV3RendererBuilder, GlmRendererBuilder, GptOssRendererBuilder, + KimiK25RendererBuilder, KimiK2RendererBuilder, MiniMaxM2RendererBuilder, + Nemotron3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -458,6 +458,53 @@ impl PyRenderer { Ok(PyRenderer { inner: Arc::new(renderer) }) } + /// Build a DefaultRenderer (Jinja fallback via minijinja). + /// + /// `chat_template` is the model's Jinja chat template (usually the + /// `chat_template` field of `tokenizer_config.json` or the contents + /// of `chat_template.jinja`). `stop_token_ids` is typically + /// `[eos_token_id]`; pass `None` to leave it empty. + #[classmethod] + #[pyo3(signature = (tokenizer_path, chat_template, *, stop_token_ids = None, extra_context = None))] + fn default_renderer( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + chat_template: &str, + stop_token_ids: Option<&Bound<'_, PyAny>>, + extra_context: Option<&Bound<'_, PyAny>>, + ) -> PyResult { + let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; + let stop_ids: Vec = match stop_token_ids { + None => Vec::new(), + Some(obj) if obj.is_none() => Vec::new(), + Some(obj) => parse_u32_list(obj)?, + }; + let extras: Vec<(String, serde_json::Value)> = match extra_context { + None => Vec::new(), + Some(obj) if obj.is_none() => Vec::new(), + Some(obj) => { + let v: serde_json::Value = pythonize::depythonize(obj) + .map_err(|e| invalid(format!("extra_context: {e}")))?; + match v { + serde_json::Value::Object(m) => m.into_iter().collect(), + _ => return Err(invalid("extra_context must be a dict")), + } + } + }; + let ct = chat_template.to_string(); + let renderer = py + .allow_threads(move || { + let mut b = DefaultRendererBuilder::new(ct).stop_token_ids(stop_ids); + for (k, v) in extras { + b = b.add_context(k, v); + } + b.build(tok) + }) + .map_err(render_err)?; + Ok(PyRenderer { inner: Arc::new(renderer) }) + } + /// Build a GPT-OSS (Harmony) renderer. /// /// Unlike the other families, GPT-OSS doesn't need a HuggingFace diff --git a/renderers/default.py b/renderers/default.py index e969421..d3f24c0 100644 --- a/renderers/default.py +++ b/renderers/default.py @@ -13,6 +13,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -88,6 +93,48 @@ class DefaultRenderer: :class:`renderers.DefaultRendererConfig`). """ + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + *, + tool_parser=None, + reasoning_parser=None, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + **chat_template_kwargs, + ): + # Native routing: only when there are no plugged parsers and no + # exotic chat_template kwargs — the Rust path uses minijinja and + # doesn't know about Python-side parser instances. + if ( + native_enabled("default") + and tool_parser is None + and reasoning_parser is None + and not preserve_all_thinking + and not preserve_thinking_between_tool_calls + ): + native = load_native() + if native is not None: + ct = getattr(tokenizer, "chat_template", None) + if isinstance(ct, str) and ct: + path = resolve_tokenizer_path(tokenizer) + stop = ( + [tokenizer.eos_token_id] + if getattr(tokenizer, "eos_token_id", None) is not None + else None + ) + extras = { + "bos_token": getattr(tokenizer, "bos_token", None) or "", + "eos_token": getattr(tokenizer, "eos_token", None) or "", + } + return native.Renderer.default_renderer( + path, + ct, + stop_token_ids=stop, + extra_context=extras, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, From 32df0792f4686eaaf5cad915a7f437d05f1df902 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:49:04 +0200 Subject: [PATCH 13/35] Add Qwen-VL native multimodal parity --- Cargo.lock | 3578 +++++++++++++++--- crates/renderers-core/src/families/qwen35.rs | 253 +- crates/renderers-py/src/lib.rs | 89 + 3 files changed, 3451 insertions(+), 469 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa35e8f..287835c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,12 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "aho-corasick" @@ -12,839 +18,3453 @@ dependencies = [ ] [[package]] -name = "autocfg" -version = "1.5.0" +name = "aligned" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" +dependencies = [ + "as-slice", +] [[package]] -name = "base64" -version = "0.13.1" +name = "aligned-vec" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] [[package]] -name = "bitflags" -version = "2.11.1" +name = "android_system_properties" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] [[package]] -name = "bumpalo" -version = "3.20.2" +name = "anes" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] -name = "cc" -version = "1.2.62" +name = "anstream" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ - "find-msvc-tools", - "shlex", + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", ] [[package]] -name = "cfg-if" -version = "1.0.4" +name = "anstyle" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] -name = "crossbeam-deque" -version = "0.8.6" +name = "anstyle-parse" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", + "utf8parse", ] [[package]] -name = "crossbeam-epoch" -version = "0.9.18" +name = "anstyle-query" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "crossbeam-utils", + "windows-sys 0.61.2", ] [[package]] -name = "crossbeam-utils" -version = "0.8.21" +name = "anstyle-wincon" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] [[package]] -name = "darling" -version = "0.20.11" +name = "anyhow" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" -dependencies = [ - "darling_core", - "darling_macro", -] +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] -name = "darling_core" -version = "0.20.11" +name = "arbitrary" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" + +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ - "fnv", - "ident_case", "proc-macro2", "quote", - "strsim", "syn", ] [[package]] -name = "darling_macro" -version = "0.20.11" +name = "arrayvec" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" -dependencies = [ - "darling_core", - "quote", - "syn", -] +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] -name = "derive_builder" -version = "0.20.2" +name = "as-slice" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" dependencies = [ - "derive_builder_macro", + "stable_deref_trait", ] [[package]] -name = "derive_builder_core" -version = "0.20.2" +name = "atomic-waker" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" -dependencies = [ - "darling", - "proc-macro2", - "quote", - "syn", -] +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] -name = "derive_builder_macro" -version = "0.20.2" +name = "autocfg" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" -dependencies = [ - "derive_builder_core", - "syn", -] +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] -name = "either" -version = "1.15.0" +name = "av-scenechange" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394" +dependencies = [ + "aligned", + "anyhow", + "arg_enum_proc_macro", + "arrayvec", + "log", + "num-rational", + "num-traits", + "pastey", + "rayon", + "thiserror 2.0.18", + "v_frame", + "y4m", +] [[package]] -name = "equivalent" -version = "1.0.2" +name = "av1-grain" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom 8.0.0", + "num-rational", + "v_frame", +] [[package]] -name = "esaxx-rs" -version = "0.1.10" +name = "avif-serialize" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +checksum = "e7178fe5f7d460b13895ebb9dcb28a3a6216d2df2574a0806cb51b555d297f38" dependencies = [ - "cc", + "arrayvec", ] [[package]] -name = "find-msvc-tools" -version = "0.1.9" +name = "base64" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" [[package]] -name = "fnv" -version = "1.0.7" +name = "base64" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] -name = "getrandom" -version = "0.2.17" +name = "bit-set" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" dependencies = [ - "cfg-if", - "libc", - "wasi", + "bit-vec", ] [[package]] -name = "hashbrown" -version = "0.17.1" +name = "bit-vec" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" [[package]] -name = "heck" -version = "0.5.0" +name = "bit_field" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" [[package]] -name = "ident_case" -version = "1.0.1" +name = "bitflags" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" [[package]] -name = "indexmap" -version = "2.14.0" +name = "bitstream-io" +version = "4.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +checksum = "7eff00be299a18769011411c9def0d827e8f2d7bf0c3dbf53633147a8867fd1f" dependencies = [ - "equivalent", - "hashbrown", + "no_std_io2", ] [[package]] -name = "indoc" -version = "2.0.7" +name = "block-buffer" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ - "rustversion", + "generic-array", ] [[package]] -name = "itertools" -version = "0.11.0" +name = "bs58" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +checksum = "bf88ba1141d185c399bee5288d850d63b8369520c1eafc32a0430b5b6c287bf4" dependencies = [ - "either", + "tinyvec", ] [[package]] -name = "itertools" -version = "0.12.1" +name = "bstr" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" dependencies = [ - "either", + "memchr", + "regex-automata", + "serde", ] [[package]] -name = "itoa" -version = "1.0.18" +name = "built" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +checksum = "f4ad8f11f288f48ca24471bbd51ac257aaeaaa07adae295591266b792902ae64" [[package]] -name = "lazy_static" -version = "1.5.0" +name = "bumpalo" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] -name = "libc" -version = "0.2.186" +name = "bytemuck" +version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" [[package]] -name = "log" -version = "0.4.29" +name = "byteorder-lite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" [[package]] -name = "macro_rules_attribute" -version = "0.2.2" +name = "bytes" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" -dependencies = [ - "macro_rules_attribute-proc_macro", - "paste", -] +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] -name = "macro_rules_attribute-proc_macro" -version = "0.2.2" +name = "cast" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] -name = "memchr" -version = "2.8.0" +name = "cc" +version = "1.2.62" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] [[package]] -name = "memoffset" -version = "0.9.1" +name = "cfg-if" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] -name = "minimal-lexical" +name = "cfg_aliases" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] -name = "monostate" -version = "0.1.18" +name = "chrono" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ - "monostate-impl", + "iana-time-zone", + "num-traits", "serde", - "serde_core", + "windows-link", ] [[package]] -name = "monostate-impl" -version = "0.1.18" +name = "ciborium" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ - "proc-macro2", - "quote", - "syn", + "ciborium-io", + "ciborium-ll", + "serde", ] [[package]] -name = "nom" -version = "7.1.3" +name = "ciborium-io" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ - "memchr", - "minimal-lexical", + "ciborium-io", + "half", ] [[package]] -name = "once_cell" -version = "1.21.4" +name = "clap" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] [[package]] -name = "onig" -version = "6.5.3" +name = "clap_builder" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "bitflags", - "libc", - "once_cell", - "onig_sys", + "anstream", + "anstyle", + "clap_lex", + "strsim", ] [[package]] -name = "onig_sys" -version = "69.9.3" +name = "clap_derive" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ - "cc", - "pkg-config", + "heck", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "paste" -version = "1.0.15" +name = "clap_lex" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] -name = "phf" -version = "0.11.3" +name = "color_quant" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" -dependencies = [ - "phf_macros", - "phf_shared", +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools 0.10.5", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core 0.23.0", + "darling_macro 0.23.0", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core 0.23.0", + "quote", + "syn", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + +[[package]] +name = "exr" +version = "1.74.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be" +dependencies = [ + "bit_field", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + "zune-inflate", +] + +[[package]] +name = "fancy-regex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "fax" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf1079563223d5d59d83c85886a56e586cfd5c1a26292e971a0fa266531ac5a" + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "gif" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159" +dependencies = [ + "color_quant", + "weezl", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "hyper" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "image" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "imgref" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fac9d56ed6437b198fddba683305e8e2d651aa42647f00f5ae542e7f5c94a2" + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.1", + "serde", + "serde_core", +] + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08" +dependencies = [ + "cfg-if", + "futures-util", + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lebe" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12a681b7dd8ce12bff52488013ba614b869148d54dd79836ab85aafdd53f08d" +dependencies = [ + "arbitrary", + "cc", +] + +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if", + "rayon", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memo-map" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "minijinja" +version = "2.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "805bfd7352166bae857ee569628b52bcd85a1cecf7810861ebceb1686b72b75d" +dependencies = [ + "memo-map", + "serde", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mio" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "moxcms" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b" +dependencies = [ + "num-traits", + "pxfm", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "no_std_io2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-conv" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" + +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "onig" +version = "6.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" +dependencies = [ + "bitflags", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "openai-harmony" +version = "0.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e77e82af451fc95deeb728a40b84db8ee82d341e136c268de415123a560b9b72" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bstr", + "clap", + "fancy-regex", + "futures", + "image", + "regex", + "reqwest", + "rustc-hash 1.1.0", + "serde", + "serde_json", + "serde_with", + "sha1", + "sha2", + "thiserror 2.0.18", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand 0.8.6", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "profiling" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d595e54a326bc53c1c197b32d295e14b169e3cfeaa8dc82b529f947fba6bcf5" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4488a4a36b9a4ba6b9334a32a39971f77c1436ec82c38707bce707699cc3bbcb" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "pxfm" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" + +[[package]] +name = "pyo3" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "pythonize" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91a6ee7a084f913f98d70cdc3ebec07e852b735ae3059a1500db2661265da9ff" +dependencies = [ + "pyo3", + "serde", +] + +[[package]] +name = "qoi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash 2.1.2", + "rustls", + "socket2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.4", + "ring", + "rustc-hash 2.1.2", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rav1e" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b" +dependencies = [ + "aligned-vec", + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av-scenechange", + "av1-grain", + "bitstream-io", + "built", + "cfg-if", + "interpolate_name", + "itertools 0.14.0", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "paste", + "profiling", + "rand 0.9.4", + "rand_chacha 0.9.0", + "simd_helpers", + "thiserror 2.0.18", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error", + "rav1e", + "rayon", + "rgb", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +dependencies = [ + "either", + "itertools 0.11.0", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "renderers-cli" +version = "0.1.0" +dependencies = [ + "clap", + "criterion", + "renderers-core", + "serde", + "serde_json", +] + +[[package]] +name = "renderers-core" +version = "0.1.0" +dependencies = [ + "bumpalo", + "minijinja", + "openai-harmony", + "phf", + "regex", + "serde", + "serde_json", + "smallvec", + "thiserror 1.0.69", + "tokenizers", +] + +[[package]] +name = "renderers-py" +version = "0.1.0" +dependencies = [ + "pyo3", + "pythonize", + "renderers-core", + "serde", + "serde_json", +] + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "mime_guess", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "rgb" +version = "0.8.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4" + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap 2.14.0", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +dependencies = [ + "base64 0.22.1", + "bs58", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.14.0", + "schemars 0.9.0", + "schemars 1.2.1", + "serde_core", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +dependencies = [ + "darling 0.23.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom 7.1.3", + "serde", + "unicode-segmentation", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "phf_generator" +name = "tiff" version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +checksum = "b63feaf3343d35b6ca4d50483f94843803b0f51634937cc2ec519fc32232bc52" dependencies = [ - "phf_shared", - "rand", + "fax", + "flate2", + "half", + "quick-error", + "weezl", + "zune-jpeg", ] [[package]] -name = "phf_macros" -version = "0.11.3" +name = "time" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ - "phf_generator", - "phf_shared", - "proc-macro2", - "quote", - "syn", + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", ] [[package]] -name = "phf_shared" -version = "0.11.3" +name = "time-core" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ - "siphasher", + "num-conv", + "time-core", ] [[package]] -name = "pkg-config" -version = "0.3.33" +name = "tinystr" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] [[package]] -name = "portable-atomic" -version = "1.13.1" +name = "tinytemplate" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] [[package]] -name = "ppv-lite86" -version = "0.2.21" +name = "tinyvec" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" dependencies = [ - "zerocopy", + "tinyvec_macros", ] [[package]] -name = "proc-macro2" -version = "1.0.106" +name = "tinyvec_macros" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokenizers" +version = "0.20.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" dependencies = [ - "unicode-ident", + "aho-corasick", + "derive_builder", + "esaxx-rs", + "getrandom 0.2.17", + "itertools 0.12.1", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand 0.8.6", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror 1.0.69", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", ] [[package]] -name = "pyo3" -version = "0.22.6" +name = "tokio" +version = "1.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" dependencies = [ - "cfg-if", - "indoc", + "bytes", "libc", - "memoffset", + "mio", + "pin-project-lite", + "socket2", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68d6fdd9f81c2819c9a8b0e0cd91660e7746a8e6ea2ba7c6b2b057985f6bcb51" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", + "url", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ "once_cell", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", ] [[package]] -name = "pyo3-build-config" -version = "0.22.6" +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" + +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96492d0d3ffba25305a7dc88720d250b1401d7edca02cc3bcd50633b424673b8" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e68e6f4afd367a562002c05637acb8578ff2dea1943df76afb9e83d177c8578" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a9ec35c64b2a7cb35d3fead40c4238d0940c86d107136999567a4703259f2" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4e0100b01e9f0d03189a92b96772a1fb998639d981193d7dbab487302513441" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b572dff8bcf38bad0fa19729c89bb5748b2b9b1d8be70cf90df697e3a8f32aa" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "winapi-util" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "once_cell", - "target-lexicon", + "windows-sys 0.61.2", ] [[package]] -name = "pyo3-ffi" -version = "0.22.6" +name = "windows-core" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ - "libc", - "pyo3-build-config", + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", ] [[package]] -name = "pyo3-macros" -version = "0.22.6" +name = "windows-implement" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", - "pyo3-macros-backend", "quote", "syn", ] [[package]] -name = "pyo3-macros-backend" -version = "0.22.6" +name = "windows-interface" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ - "heck", "proc-macro2", - "pyo3-build-config", "quote", "syn", ] [[package]] -name = "pythonize" -version = "0.22.0" +name = "windows-link" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcf491425978bd889015d5430f6473d91bdfa2097262f1e731aadcf6c2113e" -dependencies = [ - "pyo3", - "serde", -] +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] -name = "quote" -version = "1.0.45" +name = "windows-result" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" dependencies = [ - "proc-macro2", + "windows-link", ] [[package]] -name = "rand" -version = "0.8.6" +name = "windows-strings" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" dependencies = [ - "libc", - "rand_chacha", - "rand_core", + "windows-link", ] [[package]] -name = "rand_chacha" -version = "0.3.1" +name = "windows-sys" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "ppv-lite86", - "rand_core", + "windows-targets 0.52.6", ] [[package]] -name = "rand_core" -version = "0.6.4" +name = "windows-sys" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "getrandom", + "windows-targets 0.53.5", ] [[package]] -name = "rayon" -version = "1.12.0" +name = "windows-sys" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "either", - "rayon-core", + "windows-link", ] [[package]] -name = "rayon-cond" -version = "0.3.0" +name = "windows-targets" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "either", - "itertools 0.11.0", - "rayon", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] -name = "rayon-core" -version = "1.13.0" +name = "windows-targets" +version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ - "crossbeam-deque", - "crossbeam-utils", + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] [[package]] -name = "regex" -version = "1.12.3" +name = "windows_aarch64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] -name = "regex-automata" -version = "0.4.14" +name = "windows_aarch64_gnullvm" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] -name = "regex-syntax" -version = "0.8.10" +name = "windows_aarch64_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] -name = "renderers-core" -version = "0.1.0" -dependencies = [ - "bumpalo", - "once_cell", - "phf", - "regex", - "serde", - "serde_json", - "smallvec", - "thiserror", - "tokenizers", -] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] -name = "renderers-py" -version = "0.1.0" -dependencies = [ - "pyo3", - "pythonize", - "renderers-core", - "serde", - "serde_json", -] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] -name = "rustversion" -version = "1.0.22" +name = "windows_i686_gnu" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] -name = "serde" -version = "1.0.228" +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] -name = "serde_core" -version = "1.0.228" +name = "windows_i686_gnullvm" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] -name = "serde_derive" -version = "1.0.228" +name = "windows_i686_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] -name = "serde_json" -version = "1.0.149" +name = "windows_i686_msvc" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" -dependencies = [ - "indexmap", - "itoa", - "memchr", - "serde", - "serde_core", - "zmij", -] +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] -name = "shlex" -version = "1.3.0" +name = "windows_x86_64_gnu" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] -name = "siphasher" -version = "1.0.3" +name = "windows_x86_64_gnu" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] -name = "smallvec" -version = "1.15.1" +name = "windows_x86_64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] -name = "spm_precompiled" -version = "0.1.4" +name = "windows_x86_64_gnullvm" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" -dependencies = [ - "base64", - "nom", - "serde", - "unicode-segmentation", -] +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] -name = "strsim" -version = "0.11.1" +name = "windows_x86_64_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "syn" -version = "2.0.117" +name = "windows_x86_64_msvc" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] -name = "target-lexicon" -version = "0.12.16" +name = "wit-bindgen" +version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" [[package]] -name = "thiserror" -version = "1.0.69" +name = "writeable" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "y4m" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" + +[[package]] +name = "yoke" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" dependencies = [ - "thiserror-impl", + "stable_deref_trait", + "yoke-derive", + "zerofrom", ] [[package]] -name = "thiserror-impl" -version = "1.0.69" +name = "yoke-derive" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", "syn", + "synstructure", ] [[package]] -name = "tokenizers" -version = "0.20.4" +name = "zerocopy" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ - "aho-corasick", - "derive_builder", - "esaxx-rs", - "getrandom", - "itertools 0.12.1", - "lazy_static", - "log", - "macro_rules_attribute", - "monostate", - "onig", - "paste", - "rand", - "rayon", - "rayon-cond", - "regex", - "regex-syntax", - "serde", - "serde_json", - "spm_precompiled", - "thiserror", - "unicode-normalization-alignments", - "unicode-segmentation", - "unicode_categories", + "zerocopy-derive", ] [[package]] -name = "unicode-ident" -version = "1.0.24" +name = "zerocopy-derive" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] -name = "unicode-normalization-alignments" -version = "0.1.12" +name = "zerofrom" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" dependencies = [ - "smallvec", + "zerofrom-derive", ] [[package]] -name = "unicode-segmentation" -version = "1.13.2" +name = "zerofrom-derive" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] [[package]] -name = "unicode_categories" -version = "0.1.1" +name = "zeroize" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] -name = "unindent" +name = "zerotrie" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - -[[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] [[package]] -name = "zerocopy" -version = "0.8.48" +name = "zerovec" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ - "zerocopy-derive", + "yoke", + "zerofrom", + "zerovec-derive", ] [[package]] -name = "zerocopy-derive" -version = "0.8.48" +name = "zerovec-derive" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", @@ -856,3 +3476,27 @@ name = "zmij" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296" +dependencies = [ + "zune-core", +] diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index bbb0be1..c56914e 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -23,9 +23,10 @@ use crate::emit::RenderBuf; use crate::parsing::qwen35::parse_qwen35; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; -use crate::traits::Renderer; +use crate::traits::{MultimodalRenderer, Renderer}; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + MediaBundle, MediaItem, Message, Modality, MultiModalData, ParsedResponse, PlaceholderRange, + RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, }; const TOOLS_HEADER: &str = "# Tools\n\nYou have access to the following functions:\n\n"; @@ -101,6 +102,18 @@ pub struct Qwen35Renderer { tool_response: u32, tool_response_end: u32, + // Multimodal placeholder tokens — resolved as optional so the + // text-only Qwen3.5 tokenizers (which don't ship the vision + // specials) still construct cleanly. `as_multimodal()` returns None + // when these are absent. + vision_start: Option, + vision_end: Option, + image_pad: Option, + video_pad: Option, + /// `[(token_id, modality_marker)]` — 1 = image, 2 = video. Empty + /// when this tokenizer doesn't have the vision specials. + mm_token_type_ids: Vec<(u32, u8)>, + stop_tokens: Vec, } @@ -124,6 +137,22 @@ impl Qwen35Renderer { let tool_response = tokenizer.token_to_id_strict("")?; let tool_response_end = tokenizer.token_to_id_strict("")?; + // Multimodal tokens are optional — text-only tokenizers (e.g. + // Qwen3.5-9B, no `-VL` suffix) don't ship them. Resolve via + // `token_to_id` (non-strict) so the renderer constructs in both + // cases. + let vision_start = tokenizer.token_to_id("<|vision_start|>"); + let vision_end = tokenizer.token_to_id("<|vision_end|>"); + let image_pad = tokenizer.token_to_id("<|image_pad|>"); + let video_pad = tokenizer.token_to_id("<|video_pad|>"); + let mut mm_token_type_ids: Vec<(u32, u8)> = Vec::new(); + if let Some(p) = image_pad { + mm_token_type_ids.push((p, 1)); + } + if let Some(p) = video_pad { + mm_token_type_ids.push((p, 2)); + } + Ok(Self { tokenizer, enable_thinking: cfg.enable_thinking, @@ -139,10 +168,23 @@ impl Qwen35Renderer { tool_call_end, tool_response, tool_response_end, + vision_start, + vision_end, + image_pad, + video_pad, + mm_token_type_ids, stop_tokens: vec![im_end, endoftext], }) } + /// True when the underlying tokenizer ships the vision special + /// tokens. Used by [`Renderer::as_multimodal`]. + pub fn supports_multimodal(&self) -> bool { + self.vision_start.is_some() + && self.vision_end.is_some() + && self.image_pad.is_some() + } + /// Index of the most recent non-tool-response user message; /// `messages.len()` when none — that out-of-range value makes /// `msg_idx > last_query_index` uniformly false, matching the @@ -551,4 +593,211 @@ impl Renderer for Qwen35Renderer { multi_modal_data: None, })) } + + fn as_multimodal(&self) -> Option<&dyn MultimodalRenderer> { + if self.supports_multimodal() { + Some(self) + } else { + None + } + } } + +// ── Multimodal implementation ───────────────────────────────────────── +// +// Qwen3.5-VL emits the canonical Qwen-style placeholder block per image: +// <|vision_start|> + num_tokens × <|image_pad|> + <|vision_end|> +// +// where `num_tokens` is the pre-computed expansion the caller obtained +// from the HF processor (image_grid_thw.prod() / merge_size²). The +// renderer never touches pixel data; `MediaItem::hf_payload` rides +// through as opaque JSON into `MultiModalData::mm_items`. + +impl Qwen35Renderer { + /// Walk the user-message content parts and pop matching media items + /// from `bundle`, emitting placeholder spans inline. Stops at + /// content boundaries and accumulates `MultiModalData` side-by-side + /// with the token buffer. + fn emit_user_with_media( + &self, + buf: &mut RenderBuf<'_>, + msg_idx: usize, + content: &str, + media: &MediaBundle, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + let idx = msg_idx as i32; + buf.special(self.im_start, idx); + buf.text("user\n", idx)?; + // Emit the text body + if !content.is_empty() { + buf.text(content, idx)?; + } + // Then any media items attached to this message + for (m_idx, item) in &media.items { + if *m_idx != msg_idx { + continue; + } + self.emit_media_item(buf, idx, item, mm)?; + } + buf.special(self.im_end, idx); + buf.text("\n", idx)?; + Ok(()) + } + + fn emit_media_item( + &self, + buf: &mut RenderBuf<'_>, + idx: i32, + item: &MediaItem, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + let pad = match item.modality { + Modality::Image => self.image_pad, + Modality::Video => self.video_pad, + } + .ok_or_else(|| { + RenderError::MissingSpecialToken(match item.modality { + Modality::Image => "<|image_pad|>".into(), + Modality::Video => "<|video_pad|>".into(), + }) + })?; + let vs = self + .vision_start + .ok_or_else(|| RenderError::MissingSpecialToken("<|vision_start|>".into()))?; + let ve = self + .vision_end + .ok_or_else(|| RenderError::MissingSpecialToken("<|vision_end|>".into()))?; + + buf.special(vs, idx); + let offset = buf.len(); + for _ in 0..item.num_tokens { + buf.special(pad, idx); + } + buf.special(ve, idx); + + // Update MultiModalData. Key by modality string ("image" / + // "video") so the inference engine glue can route per-key. + let key = item.modality.as_str().to_string(); + mm.mm_hashes.entry(key.clone()).or_default().push(item.hash.clone()); + mm.mm_placeholders + .entry(key.clone()) + .or_default() + .push(PlaceholderRange { + offset, + length: item.num_tokens, + }); + mm.mm_items.entry(key).or_default().push(item.hf_payload.clone()); + Ok(()) + } +} + +impl MultimodalRenderer for Qwen35Renderer { + fn mm_token_type_id_map(&self) -> &[(u32, u8)] { + &self.mm_token_type_ids + } + + fn render_with_media( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + media: &MediaBundle, + add_generation_prompt: bool, + ) -> Result { + // Fast path: no media → defer to the text-only render. + if media.is_empty() { + return self.render(messages, tools, add_generation_prompt); + } + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + let first_is_system = messages[0].role == "system"; + + match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(&mut buf, messages, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(&mut buf, messages)?; + } + } + } + + let last_qi = Self::last_query_index(messages); + let mut mm = MultiModalData::default(); + + for (i, msg) in messages.iter().enumerate() { + let content = msg.text_content().trim(); + match msg.role.as_str() { + "system" => { + if i != 0 { + return Err(RenderError::Invalid( + "system message must be at the beginning".into(), + )); + } + } + "user" => { + // Check if this message has attached media; if so, use + // the multimodal emit path. + let has_media = media.items.iter().any(|(idx, _)| *idx == i); + if has_media { + self.emit_user_with_media(&mut buf, i, content, media, &mut mm)?; + } else { + self.emit_user(&mut buf, content, i as i32)?; + } + } + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(&mut buf, msg, i, last_qi, preserve_thinking)?; + } + "tool" => self.emit_tool(&mut buf, messages, i, content)?, + _ => { + return Err(RenderError::Invalid(format!( + "unexpected message role: {}", + msg.role + ))); + } + } + } + + if add_generation_prompt { + self.emit_generation_prompt(&mut buf)?; + } + + let mut out = buf.into_rendered(); + if !mm.is_empty() { + out.multi_modal_data = Some(mm); + } + Ok(out) + } + + fn bridge_to_next_turn_with_media( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: Option<&[ToolSpec]>, + _new_media: &MediaBundle, + _previous_multi_modal_data: Option<&MultiModalData>, + ) -> Result, RenderError> { + // Phase 5a scope: bridge ignores media on the new-turn side + // (the prior turn's mm_data is carried forward by the caller's + // glue layer, not by this function). When new_media is + // non-empty, fall back to a full re-render — bridging + // image-bearing turns through a verbatim prefix is fragile + // because placeholder offsets shift if the prior turn was + // truncated mid-image. Phase 5b can revisit. + if !_new_media.is_empty() { + return Ok(None); + } + self.bridge_to_next_turn(previous_prompt_ids, previous_completion_ids, new_messages, tools) + } +} + diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 756581b..f1cb84b 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -18,6 +18,7 @@ use renderers_core::families::{ KimiK25RendererBuilder, KimiK2RendererBuilder, MiniMaxM2RendererBuilder, Nemotron3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; +use renderers_core::types::{MediaBundle, MediaItem, Modality}; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, ToolCallParseStatus, @@ -53,6 +54,48 @@ fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult> Ok(Some(parsed)) } +/// Decode a Python list of media-item dicts into a [`MediaBundle`]. +fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { + let value: serde_json::Value = pythonize::depythonize(obj) + .map_err(|e| invalid(format!("media must be a list of dicts: {e}")))?; + let arr = match value { + serde_json::Value::Array(a) => a, + _ => return Err(invalid("media must be a list")), + }; + let mut bundle = MediaBundle::new(); + for item in arr { + let obj = item.as_object().ok_or_else(|| invalid("media item must be a dict"))?; + let message_idx = obj + .get("message_idx") + .and_then(|v| v.as_u64()) + .ok_or_else(|| invalid("media item missing message_idx"))? as usize; + let modality_str = obj + .get("modality") + .and_then(|v| v.as_str()) + .ok_or_else(|| invalid("media item missing modality"))?; + let modality = match modality_str { + "image" => Modality::Image, + "video" => Modality::Video, + other => return Err(invalid(format!("unknown modality: {other}"))), + }; + let num_tokens = obj + .get("num_tokens") + .and_then(|v| v.as_u64()) + .ok_or_else(|| invalid("media item missing num_tokens"))? as usize; + let hash = obj + .get("hash") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_default(); + let hf_payload = obj.get("hf_payload").cloned().unwrap_or(serde_json::Value::Null); + bundle.push( + message_idx, + MediaItem { modality, hash, num_tokens, hf_payload }, + ); + } + Ok(bundle) +} + fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { // Accept either a Python list of ints or a numpy-style sequence. let list = obj.downcast::().map_err(|_| invalid("expected list[int]"))?; @@ -738,6 +781,52 @@ impl PyRenderer { PyList::new_bound(py, self.inner.stop_token_ids().iter().map(|&t| t as i64)) } + /// Render with pre-resolved multimodal media items. + /// + /// `media` is a list of dicts each shaped like + /// ``{"message_idx": int, "modality": "image" | "video", + /// "num_tokens": int, "hash": str, "hf_payload": }``. + /// `num_tokens` is the placeholder expansion count pre-computed by + /// the caller's vision processor (HF + /// ``image_grid_thw.prod()/merge_size**2`` for Qwen-VL). The Rust + /// renderer never touches pixel data — `hf_payload` rides through + /// as opaque JSON into `multi_modal_data.mm_items`. + /// + /// Raises ``RuntimeError`` when the underlying family doesn't + /// support multimodal (e.g. a Qwen3.5 text-only tokenizer that + /// doesn't ship the ``<|vision_start|>`` token). + #[pyo3(signature = (messages, media, *, tools = None, add_generation_prompt = false))] + fn render_with_media( + &self, + py: Python<'_>, + messages: &Bound<'_, PyAny>, + media: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult { + let msgs = parse_messages(messages)?; + let tools = parse_tools(tools)?; + let bundle = parse_media_bundle(media)?; + let renderer = self.inner.clone(); + let out = py + .allow_threads(move || -> Result<_, renderers_core::types::RenderError> { + let mm = renderer + .as_multimodal() + .ok_or_else(|| renderers_core::types::RenderError::Invalid( + "this renderer does not support multimodal — use a -VL tokenizer or check supports_multimodal()".into(), + ))?; + mm.render_with_media(&msgs, tools.as_deref(), &bundle, add_generation_prompt) + }) + .map_err(render_err)?; + Ok(PyRenderedTokens { inner: out }) + } + + /// True when the underlying family supports the multimodal trait + /// AND the loaded tokenizer ships the modality special tokens. + fn supports_multimodal(&self) -> bool { + self.inner.as_multimodal().is_some() + } + #[pyo3(signature = (previous_prompt_ids, previous_completion_ids, new_messages, *, tools = None))] fn bridge_to_next_turn( &self, From 1b5c7cb23b9b4200ae46eeb046186913a4f3bc09 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:49:26 +0200 Subject: [PATCH 14/35] Add native Qwen-VL image processing parity --- Cargo.lock | 52 +++ crates/renderers-core/Cargo.toml | 3 + crates/renderers-core/src/lib.rs | 1 + crates/renderers-core/src/processing/mod.rs | 23 ++ .../renderers-core/src/processing/qwen3_vl.rs | 315 ++++++++++++++++++ .../renderers-core/src/processing/resolver.rs | 93 ++++++ crates/renderers-py/src/lib.rs | 129 +++++++ renderers/_native_vision.py | 130 ++++++++ 8 files changed, 746 insertions(+) create mode 100644 crates/renderers-core/src/processing/mod.rs create mode 100644 crates/renderers-core/src/processing/qwen3_vl.rs create mode 100644 crates/renderers-core/src/processing/resolver.rs create mode 100644 renderers/_native_vision.py diff --git a/Cargo.lock b/Cargo.lock index 287835c..f7c7b16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1420,6 +1420,16 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "maybe-rayon" version = "0.1.1" @@ -1536,6 +1546,21 @@ dependencies = [ "pxfm", ] +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -1586,6 +1611,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.2.1" @@ -1815,6 +1849,15 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "portable-atomic-util" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -2140,6 +2183,12 @@ dependencies = [ "rgb", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.12.0" @@ -2236,12 +2285,15 @@ name = "renderers-core" version = "0.1.0" dependencies = [ "bumpalo", + "image", "minijinja", + "ndarray", "openai-harmony", "phf", "regex", "serde", "serde_json", + "sha2", "smallvec", "thiserror 1.0.69", "tokenizers", diff --git a/crates/renderers-core/Cargo.toml b/crates/renderers-core/Cargo.toml index 78184fe..f5973f8 100644 --- a/crates/renderers-core/Cargo.toml +++ b/crates/renderers-core/Cargo.toml @@ -22,6 +22,9 @@ bumpalo = { workspace = true } phf = { workspace = true } openai-harmony = { version = "0.0.8", default-features = false } minijinja = { version = "2", default-features = false, features = ["builtins", "serde"] } +image = { version = "0.25", default-features = false, features = ["jpeg", "png", "webp"] } +ndarray = "0.16" +sha2 = "0.10" [dev-dependencies] serde_json = { workspace = true } diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs index 597f57c..cbad59c 100644 --- a/crates/renderers-core/src/lib.rs +++ b/crates/renderers-core/src/lib.rs @@ -29,6 +29,7 @@ pub mod bridge; pub mod emit; pub mod families; pub mod parsing; +pub mod processing; pub mod registry; pub mod thinking; pub mod tokenizer; diff --git a/crates/renderers-core/src/processing/mod.rs b/crates/renderers-core/src/processing/mod.rs new file mode 100644 index 0000000..08f1063 --- /dev/null +++ b/crates/renderers-core/src/processing/mod.rs @@ -0,0 +1,23 @@ +//! Vision processors — port of the HuggingFace image processor pipelines. +//! +//! Phase 5b: actual pixel-data preprocessing in Rust. Decode image bytes, +//! smart-resize, normalise, patch-extract, and produce the tensors the +//! vision encoder consumes — same shape as HF's processors, without +//! crossing back to Python. +//! +//! Currently shipped: +//! +//! - [`qwen3_vl::Qwen3VlImageProcessor`] — covers Qwen2-VL, Qwen3-VL, +//! and Qwen3.5-VL (they share the processor). +//! +//! Future: +//! +//! - Kimi K2.5 — different smart_resize defaults and a single-pad +//! placeholder convention (Phase 5b follow-up). +//! - Video frame sampling — needs `video-rs` or `ffmpeg-next` (Phase 5c). + +pub mod qwen3_vl; +pub mod resolver; + +pub use qwen3_vl::{ProcessedImage, Qwen3VlImageProcessor, CLIP_MEAN, CLIP_STD}; +pub use resolver::Qwen3VlResolver; diff --git a/crates/renderers-core/src/processing/qwen3_vl.rs b/crates/renderers-core/src/processing/qwen3_vl.rs new file mode 100644 index 0000000..c2d7b58 --- /dev/null +++ b/crates/renderers-core/src/processing/qwen3_vl.rs @@ -0,0 +1,315 @@ +//! Vision image processing for Qwen-VL family models (Qwen2-VL, +//! Qwen3-VL, Qwen3.5-VL). +//! +//! Port of the HuggingFace `Qwen2VLImageProcessor` / `Qwen3VLImageProcessor` +//! pipeline. Given an image (bytes or decoded RGB), produces: +//! +//! - `pixel_values`: `ndarray::Array2` of shape +//! `(grid_h * grid_w, 3 * temporal_patch_size * patch_size * patch_size)`. +//! This is what the vision encoder consumes. +//! - `image_grid_thw`: `[1, grid_h, grid_w]` — the temporal × height × width +//! patch count. +//! - `num_tokens`: `grid_h * grid_w / (merge_size * merge_size)` — the +//! placeholder count the renderer emits between +//! `<|vision_start|>` and `<|vision_end|>`. +//! +//! # Parity caveat +//! +//! The grid dimensions, num_tokens, and tensor shape match HF exactly. +//! The pixel values themselves use the `image` crate's bicubic +//! (CatmullRom) resize, which differs from PIL's bicubic in the last +//! few decimals — typical RMS difference ≈ 1e-3 on normalized pixels. +//! Downstream models tolerate this level of noise (it's far below the +//! quantization floor of vision encoders); but if exact pixel parity +//! is required (e.g. for regression tests against PIL-rendered +//! fixtures) keep the Python processor on the path. + +use std::io::Cursor; + +use ndarray::{Array2, Array3}; +use sha2::{Digest, Sha256}; + +use crate::types::RenderError; + +/// OpenAI CLIP normalisation constants — Qwen-VL inherits these. +pub const CLIP_MEAN: [f32; 3] = [0.481_454_66, 0.457_827_5, 0.408_210_73]; +pub const CLIP_STD: [f32; 3] = [0.268_629_54, 0.261_302_58, 0.275_777_11]; + +/// Configuration for the Qwen-VL image processor pipeline. +#[derive(Debug, Clone)] +pub struct Qwen3VlImageProcessor { + /// Lower bound on resized pixel count. Default for Qwen2-VL / Qwen3-VL: + /// `56 * 56 = 3136`. Resized images smaller than this get scaled up. + pub min_pixels: u32, + /// Upper bound on resized pixel count. Default: `28*28*1280 = 1_003_520`. + pub max_pixels: u32, + /// Patch size in pixels. Default: 14. + pub patch_size: u32, + /// Temporal patch size — `pixel_values` is duplicated across this + /// axis for static images so the same tensor shape serves images + /// and video frames. Default: 2. + pub temporal_patch_size: u32, + /// Spatial merge factor between vision encoder output and the + /// model's input — placeholders count divides by `merge²`. Default: 2. + pub merge_size: u32, + /// Rescale factor applied before normalisation. Default: 1/255. + pub rescale_factor: f32, + /// Per-channel mean / std for normalisation (after rescale). + pub image_mean: [f32; 3], + pub image_std: [f32; 3], +} + +impl Default for Qwen3VlImageProcessor { + fn default() -> Self { + Self { + min_pixels: 56 * 56, + max_pixels: 28 * 28 * 1280, + patch_size: 14, + temporal_patch_size: 2, + merge_size: 2, + rescale_factor: 1.0 / 255.0, + image_mean: CLIP_MEAN, + image_std: CLIP_STD, + } + } +} + +/// Output of one image's processing run. +#[derive(Debug, Clone)] +pub struct ProcessedImage { + /// Flattened patches: shape (grid_h * grid_w, channel * temporal * patch²). + pub pixel_values: Array2, + /// `[1, grid_h, grid_w]` — temporal × height × width patch count. + pub image_grid_thw: [u32; 3], + /// `grid_h * grid_w / merge²` — count of placeholder tokens to emit. + pub num_tokens: usize, + /// Stable SHA-256 prefix of the resolved RGB bytes — useful as a + /// cache key. + pub hash: String, +} + +impl Qwen3VlImageProcessor { + /// Compute the resized (height, width) for an input image. Mirrors + /// `transformers.models.qwen2_vl.image_processing_qwen2_vl.smart_resize`. + /// + /// `factor = patch_size * merge_size` (28 by default). + pub fn smart_resize(&self, height: u32, width: u32) -> Result<(u32, u32), RenderError> { + let factor = self.patch_size * self.merge_size; + let (h, w) = (height as f64, width as f64); + let max_dim = h.max(w); + let min_dim = h.min(w); + if min_dim == 0.0 { + return Err(RenderError::Invalid("image dimension is zero".into())); + } + if max_dim / min_dim > 200.0 { + return Err(RenderError::Invalid(format!( + "absolute aspect ratio must be smaller than 200, got {:.2}", + max_dim / min_dim + ))); + } + let f = factor as f64; + let mut h_bar = (h / f).round() * f; + let mut w_bar = (w / f).round() * f; + + let max_pixels = self.max_pixels as f64; + let min_pixels = self.min_pixels as f64; + + if h_bar * w_bar > max_pixels { + let beta = ((h * w) / max_pixels).sqrt(); + h_bar = ((h / beta) / f).floor() * f; + w_bar = ((w / beta) / f).floor() * f; + h_bar = h_bar.max(f); + w_bar = w_bar.max(f); + } else if h_bar * w_bar < min_pixels { + let beta = (min_pixels / (h * w)).sqrt(); + h_bar = ((h * beta) / f).ceil() * f; + w_bar = ((w * beta) / f).ceil() * f; + } + Ok((h_bar as u32, w_bar as u32)) + } + + /// Decode arbitrary image bytes (PNG/JPEG/WebP via the `image` + /// crate's auto-detect) to RGB pixel arrays. + pub fn decode(bytes: &[u8]) -> Result { + let reader = image::ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .map_err(|e| RenderError::Invalid(format!("image format detection: {e}")))?; + let dynamic = reader + .decode() + .map_err(|e| RenderError::Invalid(format!("image decode: {e}")))?; + Ok(dynamic.to_rgb8()) + } + + /// Hash the resolved RGB bytes — same shape as the Python + /// `_image_hash` so the cache key is comparable. + pub fn hash_rgb(rgb: &image::RgbImage) -> String { + let mut h = Sha256::new(); + h.update(rgb.as_raw()); + h.update(format!("({}, {})", rgb.width(), rgb.height()).as_bytes()); + let digest = h.finalize(); + // Trim to 32 hex chars to match the Python implementation. + let hex: String = digest + .iter() + .map(|b| format!("{b:02x}")) + .collect(); + hex[..32].to_string() + } + + /// Process a single decoded RGB image end-to-end. + pub fn process_rgb(&self, rgb: &image::RgbImage) -> Result { + let (orig_w, orig_h) = (rgb.width(), rgb.height()); + let (new_h, new_w) = self.smart_resize(orig_h, orig_w)?; + + // Resize: image crate's CatmullRom is the closest match to PIL's + // bicubic. See module-level docs for the parity caveat. + let resized = image::imageops::resize( + rgb, + new_w, + new_h, + image::imageops::FilterType::CatmullRom, + ); + + // Build a (C=3, H, W) f32 array, normalised. + let (h, w) = (new_h as usize, new_w as usize); + let mut chw = Array3::::zeros((3, h, w)); + for y in 0..h { + for x in 0..w { + let p = resized.get_pixel(x as u32, y as u32); + for c in 0..3 { + let v = (p[c] as f32) * self.rescale_factor; + chw[(c, y, x)] = (v - self.image_mean[c]) / self.image_std[c]; + } + } + } + + // Patch layout. The HF pipeline reshapes to: + // (C, grid_h/merge, merge, patch, grid_w/merge, merge, patch) + // then permutes to: + // (grid_h/merge, grid_w/merge, merge, merge, C, patch, patch) + // then unsqueezes a temporal axis and expands to temporal_patch_size, + // finally flattening to (grid_h*grid_w, C*temporal*patch*patch). + // + // The output layout is (token_idx, feature) where token_idx + // iterates in row-major order over the merged grid: + // token_idx = (m_row * grid_w/merge + m_col) * merge² + mi*merge + mj + // and the feature vector packs (C, temporal, patch, patch) in + // row-major order. + let ps = self.patch_size as usize; + let merge = self.merge_size as usize; + let temporal = self.temporal_patch_size as usize; + let grid_h = h / ps; + let grid_w = w / ps; + if grid_h % merge != 0 || grid_w % merge != 0 { + return Err(RenderError::Invalid(format!( + "resized grid ({grid_h}x{grid_w}) not divisible by merge_size {merge}" + ))); + } + let token_count = grid_h * grid_w; + let feature_len = 3 * temporal * ps * ps; + let mut pixel_values = Array2::::zeros((token_count, feature_len)); + + // Fill: for each token (m_row, m_col, mi, mj), copy the corresponding + // (patch_size × patch_size × 3) sub-block, replicated across the + // temporal axis. + let merged_grid_h = grid_h / merge; + let merged_grid_w = grid_w / merge; + for m_row in 0..merged_grid_h { + for m_col in 0..merged_grid_w { + for mi in 0..merge { + for mj in 0..merge { + let token_idx = + ((m_row * merged_grid_w + m_col) * merge + mi) * merge + mj; + // Patch top-left in pixel coordinates: + let py = (m_row * merge + mi) * ps; + let px = (m_col * merge + mj) * ps; + let mut feature_idx = 0usize; + for c in 0..3 { + for _t in 0..temporal { + for dy in 0..ps { + for dx in 0..ps { + pixel_values[(token_idx, feature_idx)] = + chw[(c, py + dy, px + dx)]; + feature_idx += 1; + } + } + } + } + } + } + } + } + + let num_tokens = (grid_h * grid_w) / (merge * merge); + let hash = Self::hash_rgb(rgb); + + Ok(ProcessedImage { + pixel_values, + image_grid_thw: [1, grid_h as u32, grid_w as u32], + num_tokens, + hash, + }) + } + + /// Convenience: decode bytes then process. + pub fn process_bytes(&self, bytes: &[u8]) -> Result { + let rgb = Self::decode(bytes)?; + self.process_rgb(&rgb) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smart_resize_round_trip() { + let p = Qwen3VlImageProcessor::default(); + let (h, w) = p.smart_resize(480, 640).unwrap(); + // 480*640 = 307_200 → under max_pixels, both align to factor 28 + assert_eq!(h % 28, 0); + assert_eq!(w % 28, 0); + } + + #[test] + fn smart_resize_scales_down_oversized() { + let p = Qwen3VlImageProcessor::default(); + // 4000*3000 = 12M pixels — must scale down + let (h, w) = p.smart_resize(4000, 3000).unwrap(); + assert!(h * w <= p.max_pixels); + assert_eq!(h % 28, 0); + assert_eq!(w % 28, 0); + } + + #[test] + fn smart_resize_scales_up_undersized() { + let p = Qwen3VlImageProcessor::default(); + // 16x16 = 256 pixels — below min, must scale up + let (h, w) = p.smart_resize(16, 16).unwrap(); + assert!(h * w >= p.min_pixels); + assert_eq!(h % 28, 0); + assert_eq!(w % 28, 0); + } + + #[test] + fn smart_resize_rejects_extreme_aspect_ratio() { + let p = Qwen3VlImageProcessor::default(); + assert!(p.smart_resize(10, 10_000).is_err()); + } + + #[test] + fn process_small_image() { + let p = Qwen3VlImageProcessor::default(); + // Synthesise a 56x56 RGB image + let mut rgb = image::RgbImage::new(56, 56); + for y in 0..56 { + for x in 0..56 { + rgb.put_pixel(x, y, image::Rgb([x as u8, y as u8, 128])); + } + } + let out = p.process_rgb(&rgb).unwrap(); + assert_eq!(out.image_grid_thw, [1, 4, 4]); + assert_eq!(out.num_tokens, 4); // 16 / (2*2) + // pixel_values shape: (16 tokens, 3*2*14*14 = 1176) + assert_eq!(out.pixel_values.shape(), &[16, 1176]); + } +} diff --git a/crates/renderers-core/src/processing/resolver.rs b/crates/renderers-core/src/processing/resolver.rs new file mode 100644 index 0000000..926eda5 --- /dev/null +++ b/crates/renderers-core/src/processing/resolver.rs @@ -0,0 +1,93 @@ +//! [`MediaResolver`] implementations backed by the in-crate vision +//! processors. Lets pure-Rust callers go from "image bytes / URL / +//! path" straight to a [`MediaItem`] without a Python round-trip. + +use std::fs; + +use serde_json::json; + +use crate::processing::qwen3_vl::{ProcessedImage, Qwen3VlImageProcessor}; +use crate::traits::{MediaResolver, MediaSource}; +use crate::types::{MediaItem, Modality, RenderError}; + +/// `MediaResolver` backed by [`Qwen3VlImageProcessor`]. Stores the +/// processed tensor inside `MediaItem.hf_payload` as a JSON object so +/// the inference engine glue can route it through the same path as +/// the Python-resolved case. +/// +/// The serialised payload shape is: +/// +/// ```json +/// { +/// "pixel_values": { "shape": [tokens, features], "data": [f32, ...] }, +/// "image_grid_thw": { "shape": [1, 3], "data": [1, h, w] } +/// } +/// ``` +/// +/// Callers that need zero-copy `numpy`/`torch` arrays should consume +/// the [`ProcessedImage`] struct directly via +/// [`Qwen3VlResolver::process_bytes`] instead of going through the +/// `MediaItem.hf_payload` field. +#[derive(Debug, Clone, Default)] +pub struct Qwen3VlResolver { + processor: Qwen3VlImageProcessor, +} + +impl Qwen3VlResolver { + pub fn new(processor: Qwen3VlImageProcessor) -> Self { + Self { processor } + } + + pub fn processor(&self) -> &Qwen3VlImageProcessor { + &self.processor + } + + /// Process raw image bytes into the structured [`ProcessedImage`] + /// — the zero-loss representation. The [`MediaResolver`] impl + /// wraps this and re-serialises into `MediaItem.hf_payload`. + pub fn process_bytes(&self, bytes: &[u8]) -> Result { + self.processor.process_bytes(bytes) + } + + fn to_media_item(processed: ProcessedImage) -> MediaItem { + let shape = processed.pixel_values.shape(); + let pixel_shape = vec![shape[0] as u64, shape[1] as u64]; + let pixel_data: Vec = processed.pixel_values.iter().copied().collect(); + let grid: Vec = processed.image_grid_thw.to_vec(); + + let payload = json!({ + "pixel_values": { + "shape": pixel_shape, + "data": pixel_data, + }, + "image_grid_thw": { + "shape": [1u32, 3u32], + "data": grid, + }, + }); + + MediaItem { + modality: Modality::Image, + hash: processed.hash, + num_tokens: processed.num_tokens, + hf_payload: payload, + } + } +} + +impl MediaResolver for Qwen3VlResolver { + fn resolve_image(&self, source: &MediaSource<'_>) -> Result { + let bytes: Vec = match source { + MediaSource::Bytes(b) => b.to_vec(), + MediaSource::Path(p) => fs::read(p) + .map_err(|e| RenderError::Invalid(format!("read image {p:?}: {e}")))?, + MediaSource::Url(_) => { + return Err(RenderError::Invalid( + "URL sources require an async fetch — pass already-downloaded bytes instead".into(), + )); + } + }; + let processed = self.process_bytes(&bytes)?; + Ok(Self::to_media_item(processed)) + } +} diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index f1cb84b..1ee978e 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -18,6 +18,7 @@ use renderers_core::families::{ KimiK25RendererBuilder, KimiK2RendererBuilder, MiniMaxM2RendererBuilder, Nemotron3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, }; +use renderers_core::processing::{ProcessedImage, Qwen3VlImageProcessor}; use renderers_core::types::{MediaBundle, MediaItem, Modality}; use renderers_core::tokenizer::Tokenizer; use renderers_core::types::{ @@ -850,6 +851,133 @@ impl PyRenderer { } } +// ── Vision: Qwen3-VL image processor ────────────────────────────────── + +/// Rust port of HF's `Qwen3VLImageProcessor` / `Qwen2VLImageProcessor`. +/// +/// Decodes image bytes, smart-resizes, normalises with the OpenAI CLIP +/// mean / std, and produces `pixel_values` + `image_grid_thw` tensors +/// in the exact shape the model expects. Equivalent to the Python +/// processor end-to-end; pixel-byte parity is approximate (CatmullRom +/// vs PIL bicubic), but grid dims, num_tokens, and tensor shape match +/// exactly. +#[pyclass(name = "Qwen3VlImageProcessor", module = "renderers_native")] +struct PyQwen3VlImageProcessor { + inner: Qwen3VlImageProcessor, +} + +#[pymethods] +impl PyQwen3VlImageProcessor { + #[new] + #[pyo3(signature = ( + *, + min_pixels = None, + max_pixels = None, + patch_size = None, + temporal_patch_size = None, + merge_size = None, + ))] + fn new( + min_pixels: Option, + max_pixels: Option, + patch_size: Option, + temporal_patch_size: Option, + merge_size: Option, + ) -> PyResult { + let mut p = Qwen3VlImageProcessor::default(); + if let Some(v) = min_pixels { p.min_pixels = v; } + if let Some(v) = max_pixels { p.max_pixels = v; } + if let Some(v) = patch_size { p.patch_size = v; } + if let Some(v) = temporal_patch_size { p.temporal_patch_size = v; } + if let Some(v) = merge_size { p.merge_size = v; } + Ok(Self { inner: p }) + } + + /// Compute the resized `(height, width)` for an input image + /// without doing any actual pixel work — useful for placeholder + /// counting in test harnesses. + fn smart_resize(&self, height: u32, width: u32) -> PyResult<(u32, u32)> { + self.inner.smart_resize(height, width).map_err(render_err) + } + + /// Process raw image bytes (PNG / JPEG / WebP) into a dict shaped + /// for direct consumption by `Renderer.render_with_media`: + /// + /// ```python + /// { + /// "modality": "image", + /// "num_tokens": int, + /// "hash": str, + /// "hf_payload": { + /// "pixel_values": {"shape": [tokens, features], "data": [...]}, + /// "image_grid_thw": {"shape": [1, 3], "data": [1, h, w]}, + /// }, + /// } + /// ``` + /// + /// `message_idx` is up to the caller — it's not added here. + fn process_bytes<'py>( + &self, + py: Python<'py>, + bytes: &[u8], + ) -> PyResult> { + // Clone so the move into allow_threads is straightforward + let processed: ProcessedImage = py + .allow_threads(|| self.inner.process_bytes(bytes)) + .map_err(render_err)?; + processed_to_pyobject(py, processed) + } + + /// Convenience: read a file and process it. + fn process_path<'py>(&self, py: Python<'py>, path: &str) -> PyResult> { + let bytes = std::fs::read(path) + .map_err(|e| invalid(format!("read image {path:?}: {e}")))?; + let processed: ProcessedImage = py + .allow_threads(|| self.inner.process_bytes(&bytes)) + .map_err(render_err)?; + processed_to_pyobject(py, processed) + } + + #[getter] + fn patch_size(&self) -> u32 { self.inner.patch_size } + #[getter] + fn merge_size(&self) -> u32 { self.inner.merge_size } + #[getter] + fn temporal_patch_size(&self) -> u32 { self.inner.temporal_patch_size } + #[getter] + fn min_pixels(&self) -> u32 { self.inner.min_pixels } + #[getter] + fn max_pixels(&self) -> u32 { self.inner.max_pixels } +} + +fn processed_to_pyobject<'py>( + py: Python<'py>, + p: ProcessedImage, +) -> PyResult> { + // Serialise via serde_json::Value first, then convert to a Python + // dict. The shape is identical to what the HF processor produces + // (lists of f32 + integer dims), so downstream glue can route it + // unchanged. + let shape = p.pixel_values.shape().to_vec(); + let value = serde_json::json!({ + "modality": "image", + "num_tokens": p.num_tokens, + "hash": p.hash, + "hf_payload": { + "pixel_values": { + "shape": [shape[0] as u64, shape[1] as u64], + "data": p.pixel_values.iter().copied().collect::>(), + }, + "image_grid_thw": { + "shape": [1u32, 3u32], + "data": p.image_grid_thw.to_vec(), + }, + }, + }); + pythonize::pythonize(py, &value) + .map_err(|e| invalid(format!("processed image → py: {e}"))) +} + #[pymodule] fn renderers_native(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { let _ = py; @@ -858,5 +986,6 @@ fn renderers_native(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/renderers/_native_vision.py b/renderers/_native_vision.py new file mode 100644 index 0000000..452c006 --- /dev/null +++ b/renderers/_native_vision.py @@ -0,0 +1,130 @@ +"""Bridge helpers for the native Qwen-VL image processor. + +The Rust pipeline in ``renderers_native.Qwen3VlImageProcessor`` produces +``{pixel_values, image_grid_thw, num_tokens, hash}`` dicts that match +what HF's ``Qwen3VLImageProcessor.preprocess(...)`` emits — same shapes, +same OpenAI CLIP normalisation, same patch layout. Pixel-byte parity +is approximate (CatmullRom vs PIL bicubic) but grid dims and token +counts are exact. + +These helpers convert the dict shape into numpy arrays so the result +plugs into vLLM's ``MultiModalKwargsItem`` / SGLang's payload without +extra glue: + + from renderers._native_vision import process_image_for_qwen_vl + media_item = process_image_for_qwen_vl(pil_or_bytes, message_idx=2) + # media_item is the dict shape Renderer.render_with_media expects. +""" + +from __future__ import annotations + +import io +from typing import Any + +try: + import renderers_native # type: ignore[import-not-found] + _NATIVE = renderers_native +except ImportError: + _NATIVE = None + + +_PROCESSOR_CACHE: dict[tuple[int, int, int, int, int], Any] = {} + + +def get_qwen_vl_processor( + *, + min_pixels: int | None = None, + max_pixels: int | None = None, + patch_size: int = 14, + temporal_patch_size: int = 2, + merge_size: int = 2, +): + """Return a cached ``Qwen3VlImageProcessor`` with the given config. + + Raises ``RuntimeError`` if the native extension isn't built. The + processor itself is cheap to construct (no model weights) so the + cache here is just a courtesy — repeated calls with the same kwargs + return the same handle. + """ + if _NATIVE is None: + raise RuntimeError( + "renderers_native is not installed; build it with " + "`maturin develop --manifest-path crates/renderers-py/Cargo.toml --release`" + ) + key = ( + min_pixels if min_pixels is not None else 56 * 56, + max_pixels if max_pixels is not None else 28 * 28 * 1280, + patch_size, + temporal_patch_size, + merge_size, + ) + cached = _PROCESSOR_CACHE.get(key) + if cached is None: + cached = _NATIVE.Qwen3VlImageProcessor( + min_pixels=key[0], + max_pixels=key[1], + patch_size=key[2], + temporal_patch_size=key[3], + merge_size=key[4], + ) + _PROCESSOR_CACHE[key] = cached + return cached + + +def process_image_for_qwen_vl( + image: Any, + *, + message_idx: int, + return_numpy: bool = True, + **processor_kwargs, +) -> dict[str, Any]: + """Process a single image into the dict shape + ``Renderer.render_with_media`` expects. + + Args: + image: Either ``bytes`` (raw image data), a filesystem path, or + a PIL ``Image.Image`` instance. + message_idx: Index of the user message this image is attached + to. Threaded into the returned dict so the caller can + ``[*items]`` straight into ``render_with_media``. + return_numpy: When True (default), unpack ``pixel_values`` and + ``image_grid_thw`` into numpy arrays before returning. Set + False to keep the lossless list-of-floats shape (useful for + JSON serialisation). + **processor_kwargs: Forwarded to + ``get_qwen_vl_processor`` (``min_pixels`` / ``max_pixels`` / + ``patch_size`` / ``temporal_patch_size`` / ``merge_size``). + + Returns: + A dict shaped as + ``{"message_idx", "modality", "num_tokens", "hash", "hf_payload"}``. + """ + proc = get_qwen_vl_processor(**processor_kwargs) + + if isinstance(image, (bytes, bytearray, memoryview)): + raw = bytes(image) + out = proc.process_bytes(raw) + elif isinstance(image, str): + out = proc.process_path(image) + else: + # Treat as PIL Image — re-encode to PNG bytes. + buf = io.BytesIO() + image.convert("RGB").save(buf, format="PNG") + out = proc.process_bytes(buf.getvalue()) + + if return_numpy: + import numpy as np # local to keep import cost off the hot path + + pv = out["hf_payload"]["pixel_values"] + gt = out["hf_payload"]["image_grid_thw"] + out["hf_payload"] = { + "pixel_values": np.asarray(pv["data"], dtype=np.float32).reshape( + tuple(pv["shape"]) + ), + "image_grid_thw": np.asarray(gt["data"], dtype=np.int64).reshape( + tuple(gt["shape"]) + ), + } + + out["message_idx"] = message_idx + return out From 9899d2c65a3a199df224bc2657e2602d7f1d2866 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:51:56 +0200 Subject: [PATCH 15/35] Harden native family parity --- Cargo.lock | 17 ++ crates/README.md | 10 +- .../src/families/deepseek_v3.rs | 23 +- crates/renderers-core/src/families/default.rs | 80 +++-- crates/renderers-core/src/families/glm.rs | 68 ++++- crates/renderers-core/src/families/gpt_oss.rs | 82 +++-- crates/renderers-core/src/families/kimi_k2.rs | 60 ++-- .../renderers-core/src/families/kimi_k25.rs | 280 +++++++++++++++++- .../renderers-core/src/families/minimax_m2.rs | 20 +- crates/renderers-core/src/families/mod.rs | 4 +- .../renderers-core/src/families/nemotron3.rs | 44 ++- crates/renderers-core/src/families/qwen3.rs | 23 +- crates/renderers-core/src/families/qwen35.rs | 86 ++++-- crates/renderers-core/src/lib.rs | 5 +- .../renderers-core/src/parsing/deepseek_v3.rs | 6 +- crates/renderers-core/src/parsing/glm.rs | 10 +- crates/renderers-core/src/parsing/kimi_k2.rs | 12 +- crates/renderers-core/src/parsing/minimax.rs | 5 +- crates/renderers-core/src/parsing/mod.rs | 5 +- crates/renderers-core/src/parsing/qwen35.rs | 5 +- crates/renderers-core/src/processing/mod.rs | 2 +- .../renderers-core/src/processing/qwen3_vl.rs | 18 +- .../renderers-core/src/processing/resolver.rs | 8 +- crates/renderers-core/src/registry.rs | 8 +- crates/renderers-core/src/thinking.rs | 4 +- crates/renderers-core/src/traits.rs | 84 +++++- crates/renderers-core/src/types.rs | 14 +- crates/renderers-py/Cargo.toml | 2 + crates/renderers-py/src/lib.rs | 234 ++++++++++----- tests/test_native_parity.py | 88 ++++-- 30 files changed, 950 insertions(+), 357 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f7c7b16..bb0380f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1666,6 +1666,21 @@ dependencies = [ "autocfg", ] +[[package]] +name = "numpy" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94caae805f998a07d33af06e6a3891e38556051b8045c615470a71590e13e78" +dependencies = [ + "libc", + "ndarray", + "num-complex", + "num-integer", + "num-traits", + "pyo3", + "rustc-hash 2.1.2", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -2303,6 +2318,8 @@ dependencies = [ name = "renderers-py" version = "0.1.0" dependencies = [ + "ndarray", + "numpy", "pyo3", "pythonize", "renderers-core", diff --git a/crates/README.md b/crates/README.md index 5d02547..cd8f7ea 100644 --- a/crates/README.md +++ b/crates/README.md @@ -75,13 +75,13 @@ shim logs a one-shot info message and falls back to Python. | ------------ | ----------------------------------------------- | | Qwen3 | ✅ ported (Phase 2) | | Qwen3.5 | ✅ ported text-only (Phase 3) — multimodal Phase 5 | -| GLM 4.5 / 5 | planned (Phase 3) | +| GLM 4.5 / 5 | ✅ ported (Phase 3) — GLM-5, GLM-5.1, GLM-4.5 | | DeepSeek V3 | ✅ ported (Phase 3) | -| Nemotron3 | planned (Phase 3) | -| Kimi K2 | planned (Phase 4) | +| Nemotron3 | ✅ ported (Phase 3) | +| Kimi K2 | ✅ ported (Phase 4) | | Kimi K2.5 | planned (Phase 4 — text; multimodal Phase 5) | -| MiniMax M2 | planned (Phase 4) | -| Qwen3.6 | planned (Phase 4) | +| MiniMax M2 | ✅ ported (Phase 4) | +| Qwen3.6 | ✅ ported (Phase 4) | | Qwen3-VL | planned (Phase 5 — multimodal incl. processor) | | Qwen3.5 mm | planned (Phase 5) | | GPT-OSS | planned (Phase 6 — via `openai-harmony` crate) | diff --git a/crates/renderers-core/src/families/deepseek_v3.rs b/crates/renderers-core/src/families/deepseek_v3.rs index 064b9e2..e7b090f 100644 --- a/crates/renderers-core/src/families/deepseek_v3.rs +++ b/crates/renderers-core/src/families/deepseek_v3.rs @@ -22,12 +22,10 @@ use crate::emit::RenderBuf; use crate::parsing::deepseek_v3::parse_deepseek_v3; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; -use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, -}; +use crate::types::{Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec}; const SEP: char = '\u{FF5C}'; // | -const US: char = '\u{2581}'; // ▁ +const US: char = '\u{2581}'; // ▁ fn ds_token(name: &str) -> String { let mut s = String::with_capacity(name.len() + 4); @@ -46,7 +44,9 @@ pub struct DeepSeekV3RendererBuilder { impl Default for DeepSeekV3RendererBuilder { fn default() -> Self { - Self { enable_thinking: true } + Self { + enable_thinking: true, + } } } @@ -105,10 +105,7 @@ impl DeepSeekV3Renderer { Ok(ids[0]) } - fn new_with( - tokenizer: Tokenizer, - cfg: DeepSeekV3RendererBuilder, - ) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: DeepSeekV3RendererBuilder) -> Result { let bos = Self::resolve(&tokenizer, &format!("begin{US}of{US}sentence"))?; let eos = Self::resolve(&tokenizer, &format!("end{US}of{US}sentence"))?; let user_token = Self::resolve(&tokenizer, "User")?; @@ -287,10 +284,7 @@ impl Renderer for DeepSeekV3Renderer { } } - let last_role = new_messages - .last() - .map(|m| m.role.as_str()) - .unwrap_or(""); + let last_role = new_messages.last().map(|m| m.role.as_str()).unwrap_or(""); if last_role != "tool" { buf.scaffold_special(self.assistant_token); } @@ -373,8 +367,7 @@ impl DeepSeekV3Renderer { msg_idx: usize, ) -> Result<(), RenderError> { let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; - let next_is_tool = - msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; let idx = msg_idx as i32; let content = messages[msg_idx].text_content(); diff --git a/crates/renderers-core/src/families/default.rs b/crates/renderers-core/src/families/default.rs index ec51515..ecd5fa5 100644 --- a/crates/renderers-core/src/families/default.rs +++ b/crates/renderers-core/src/families/default.rs @@ -27,14 +27,14 @@ use std::sync::Arc; +use minijinja::Environment; use minijinja::value::Value as MjValue; -use minijinja::{Environment, context}; use serde_json::Value as JsonValue; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, }; /// Builder for [`DefaultRenderer`]. @@ -108,10 +108,7 @@ impl Clone for DefaultRenderer { } impl DefaultRenderer { - fn new_with( - tokenizer: Tokenizer, - cfg: DefaultRendererBuilder, - ) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: DefaultRendererBuilder) -> Result { let mut env = Environment::new(); // HF chat templates use whitespace-stripped markers freely // (e.g. `{%- if foo -%}`); minijinja respects that via the @@ -141,25 +138,33 @@ impl DefaultRenderer { tools: Option<&[ToolSpec]>, add_generation_prompt: bool, ) -> Result { - let messages_value = messages_to_value(messages)?; + // Build a single flat context map up front. minijinja's + // `context!` macro and `Value::from_object` produce equivalent + // results, but a single dict keeps the per-render allocation + // count constant regardless of how many extra context keys the + // caller passes (vs the wrapped-Object chain previously used). + let mut ctx_map = serde_json::Map::new(); + ctx_map.insert( + "messages".into(), + serde_json::to_value(messages_to_value(messages)?).unwrap_or(JsonValue::Null), + ); let tools_value: MjValue = match tools { Some(t) => tools_to_value(t)?, None => MjValue::from(Vec::::new()), }; - let mut ctx = context! { - messages => messages_value, - tools => tools_value, - add_generation_prompt => add_generation_prompt, - }; + ctx_map.insert( + "tools".into(), + serde_json::to_value(tools_value).unwrap_or(JsonValue::Null), + ); + ctx_map.insert( + "add_generation_prompt".into(), + JsonValue::Bool(add_generation_prompt), + ); for (k, v) in &self.extra_context { - // Merge — minijinja contexts compose by re-emitting. - ctx = minijinja::Value::from_object(MergedCtx { - base: ctx.clone(), - key: k.clone(), - value: MjValue::from_serialize(v), - }) - .into(); + ctx_map.insert(k.clone(), v.clone()); } + let ctx = MjValue::from_serialize(JsonValue::Object(ctx_map)); + let tmpl = self .env .get_template("chat") @@ -195,7 +200,11 @@ impl Renderer for DefaultRenderer { if ids.len() < prev_len { // Template didn't extend prefix-monotonically — fall back to // a single full render attributed entirely to scaffolding. - let all = self.encode_full(&self.render_jinja(messages, tools, add_generation_prompt)?)?; + let all = self.encode_full(&self.render_jinja( + messages, + tools, + add_generation_prompt, + )?)?; return Ok(RenderedTokens { token_ids: all.clone(), message_indices: vec![SCAFFOLD_IDX; all.len()], @@ -203,7 +212,7 @@ impl Renderer for DefaultRenderer { }); } let new_count = ids.len() - prev_len; - message_indices.extend(std::iter::repeat(i as i32).take(new_count)); + message_indices.extend(std::iter::repeat_n(i as i32, new_count)); token_ids = ids; prev_len = token_ids.len(); } @@ -213,7 +222,7 @@ impl Renderer for DefaultRenderer { let full_ids = self.encode_full(&full)?; if full_ids.len() >= prev_len { let gen_count = full_ids.len() - prev_len; - message_indices.extend(std::iter::repeat(SCAFFOLD_IDX).take(gen_count)); + message_indices.extend(std::iter::repeat_n(SCAFFOLD_IDX, gen_count)); token_ids = full_ids; } else { token_ids = full_ids; @@ -316,8 +325,9 @@ fn messages_to_value(messages: &[Message]) -> Result { .map(|tc| { let args = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), - ToolArguments::Raw(s) => serde_json::from_str(s) - .unwrap_or(JsonValue::String(s.clone())), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::String(s.clone())) + } }; serde_json::json!({ "type": tc.kind, @@ -351,25 +361,3 @@ fn tools_to_value(tools: &[ToolSpec]) -> Result { } Ok(MjValue::from(out)) } - -/// Minijinja value adapter that merges an extra `(key, value)` pair -/// into an existing context. The HF templates expect `bos_token`, -/// `eos_token`, etc. to be addressable directly off the top-level -/// context. -#[derive(Debug, Clone)] -struct MergedCtx { - base: MjValue, - key: String, - value: MjValue, -} - -impl minijinja::value::Object for MergedCtx { - fn get_value(self: &Arc, key: &MjValue) -> Option { - if let Some(k) = key.as_str() { - if k == self.key { - return Some(self.value.clone()); - } - } - self.base.get_item(key).ok() - } -} diff --git a/crates/renderers-core/src/families/glm.rs b/crates/renderers-core/src/families/glm.rs index 8feaa8c..7da9c23 100644 --- a/crates/renderers-core/src/families/glm.rs +++ b/crates/renderers-core/src/families/glm.rs @@ -36,7 +36,7 @@ use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, }; const TOOLS_HEADER_GLM5: &str = "\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n"; @@ -69,10 +69,16 @@ impl GlmRendererBuilder { } } pub fn glm51() -> Self { - Self { variant: Variant::Glm51, ..Self::glm5() } + Self { + variant: Variant::Glm51, + ..Self::glm5() + } } pub fn glm45() -> Self { - Self { variant: Variant::Glm45, ..Self::glm5() } + Self { + variant: Variant::Glm45, + ..Self::glm5() + } } pub fn enable_thinking(mut self, on: bool) -> Self { self.enable_thinking = on; @@ -189,7 +195,11 @@ impl GlmRenderer { } fn nl_after_role(&self) -> &'static str { - if self.variant == Variant::Glm45 { "\n" } else { "" } + if self.variant == Variant::Glm45 { + "\n" + } else { + "" + } } fn empty_think_on_last_assistant(&self) -> bool { @@ -383,10 +393,7 @@ impl Renderer for GlmRenderer { let last_prev = *combined.last().expect("non-empty"); let nl = self.nl_after_role(); - let mut buf = RenderBuf::new( - &self.tokenizer, - new_messages.len().max(1) * 256, - ); + let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; @@ -468,9 +475,15 @@ impl GlmRenderer { None => { if let Some((before, after)) = raw_content.split_once("") { let r = if let Some((_, inner)) = before.rsplit_once("") { - inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() } else { - before.trim_start_matches('\n').trim_end_matches('\n').to_string() + before + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() }; (r, after.trim_start_matches('\n').to_string()) } else { @@ -484,12 +497,29 @@ impl GlmRenderer { buf.special(self.assistant, msg_idx); if self.variant == Variant::Glm45 { - self.emit_assistant_glm45(buf, msg, msg_idx, &reasoning_content, &content, last_user_index, preserve_thinking) + self.emit_assistant_glm45( + buf, + msg, + msg_idx, + &reasoning_content, + &content, + last_user_index, + preserve_thinking, + ) } else { - self.emit_assistant_glm5_family(buf, msg, msg_idx, &reasoning_content, &content, last_user_index, preserve_thinking) + self.emit_assistant_glm5_family( + buf, + msg, + msg_idx, + &reasoning_content, + &content, + last_user_index, + preserve_thinking, + ) } } + #[allow(clippy::too_many_arguments)] fn emit_assistant_glm5_family( &self, buf: &mut RenderBuf<'_>, @@ -524,7 +554,9 @@ impl GlmRenderer { buf.text(name, msg_idx)?; let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), - ToolArguments::Raw(s) => serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())) + } }; if let Some(obj) = args_value.as_object() { for (k, v) in obj { @@ -541,6 +573,7 @@ impl GlmRenderer { Ok(()) } + #[allow(clippy::too_many_arguments)] fn emit_assistant_glm45( &self, buf: &mut RenderBuf<'_>, @@ -590,7 +623,9 @@ impl GlmRenderer { let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), - ToolArguments::Raw(s) => serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())) + } }; if let Some(obj) = args_value.as_object() { for (k, v) in obj { @@ -641,7 +676,10 @@ impl GlmRenderer { // GLM-5 / GLM-5.1 use special tokens buf.special(self.tool_response.expect("tool_response token"), idx); buf.text(content, idx)?; - buf.special(self.tool_response_end.expect("tool_response_end token"), idx); + buf.special( + self.tool_response_end.expect("tool_response_end token"), + idx, + ); } Ok(()) } diff --git a/crates/renderers-core/src/families/gpt_oss.rs b/crates/renderers-core/src/families/gpt_oss.rs index 15d6391..108153f 100644 --- a/crates/renderers-core/src/families/gpt_oss.rs +++ b/crates/renderers-core/src/families/gpt_oss.rs @@ -34,8 +34,8 @@ use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; use crate::thinking::should_preserve_past_thinking; use crate::traits::Renderer; use crate::types::{ - Message, ParsedResponse, ParsedToolCall, RenderError, RenderedTokens, ToolArguments, - ToolCallParseStatus, ToolSpec, SCAFFOLD_IDX, + Message, ParsedResponse, ParsedToolCall, RenderError, RenderedTokens, SCAFFOLD_IDX, + ToolArguments, ToolCallParseStatus, ToolSpec, }; fn harmony_err(e: E) -> RenderError { @@ -78,7 +78,11 @@ impl GptOssRendererBuilder { "low" => ReasoningEffort::Low, "medium" => ReasoningEffort::Medium, "high" => ReasoningEffort::High, - other => return Err(RenderError::Invalid(format!("unknown reasoning effort: {other}"))), + other => { + return Err(RenderError::Invalid(format!( + "unknown reasoning effort: {other}" + ))); + } }; Ok(self) } @@ -153,8 +157,8 @@ impl GptOssRenderer { if ids.len() != 1 { return Err(RenderError::MissingSpecialToken(s.to_string())); } - u32::try_from(ids[0]) - .map_err(|_| RenderError::MissingSpecialToken(s.to_string())) + // `Rank` in tiktoken is `u32`; no conversion needed. + Ok(ids[0]) }; let start = resolve("<|start|>")?; let end = resolve("<|end|>")?; @@ -203,7 +207,7 @@ impl GptOssRenderer { .map_err(harmony_err)?; let len = out.len(); tokens.append(&mut out); - indices.extend(std::iter::repeat(msg_idx).take(len)); + indices.extend(std::iter::repeat_n(msg_idx, len)); Ok(()) } @@ -211,12 +215,8 @@ impl GptOssRenderer { /// Helper so the call sites don't need to name CoreBPE (which is not /// re-exported from the harmony crate). fn encode_text(&self, text: &str) -> Vec { - self.enc - .tokenizer() - .encode_with_special_tokens(text) - .iter() - .map(|&r| r as u32) - .collect() + // `Rank` is `u32`; encode_with_special_tokens already returns Vec. + self.enc.tokenizer().encode_with_special_tokens(text) } /// Decode a slice of token ids via the harmony tokenizer. @@ -259,11 +259,7 @@ impl GptOssRenderer { } } - fn message_to_harmony( - &self, - msg: &Message, - preserve_thinking: bool, - ) -> Vec { + fn message_to_harmony(&self, msg: &Message, preserve_thinking: bool) -> Vec { match msg.role.as_str() { "user" => vec![HarmonyMessage::from_role_and_content( HarmonyRole::User, @@ -296,11 +292,7 @@ impl GptOssRenderer { } } - fn assistant_to_harmony( - &self, - msg: &Message, - preserve_thinking: bool, - ) -> Vec { + fn assistant_to_harmony(&self, msg: &Message, preserve_thinking: bool) -> Vec { let mut out: Vec = Vec::new(); if preserve_thinking { @@ -319,11 +311,8 @@ impl GptOssRenderer { // Text content goes on the `final` channel. let text = msg.text_content(); if !text.is_empty() { - let m = HarmonyMessage::from_role_and_content( - HarmonyRole::Assistant, - text.to_string(), - ) - .with_channel("final"); + let m = HarmonyMessage::from_role_and_content(HarmonyRole::Assistant, text.to_string()) + .with_channel("final"); out.push(m); } @@ -350,11 +339,8 @@ impl GptOssRenderer { // final-channel message so per-token attribution still produces // at least one token slot. if out.is_empty() { - let m = HarmonyMessage::from_role_and_content( - HarmonyRole::Assistant, - String::new(), - ) - .with_channel("final"); + let m = HarmonyMessage::from_role_and_content(HarmonyRole::Assistant, String::new()) + .with_channel("final"); out.push(m); } @@ -435,8 +421,7 @@ impl Renderer for GptOssRenderer { sys, )); } - let has_dev = first_system_idx.is_some() - || tools.map(|t| !t.is_empty()).unwrap_or(false); + let has_dev = first_system_idx.is_some() || tools.map(|t| !t.is_empty()).unwrap_or(false); if has_dev { let mut dev = DeveloperContent::new(); if let Some(idx) = first_system_idx { @@ -529,7 +514,10 @@ impl Renderer for GptOssRenderer { continue; } let block_start = i; - let Some(msg_pos) = ids[i + 1..].iter().position(|&t| t == self.message).map(|p| p + i + 1) + let Some(msg_pos) = ids[i + 1..] + .iter() + .position(|&t| t == self.message) + .map(|p| p + i + 1) else { break; }; @@ -542,7 +530,8 @@ impl Renderer for GptOssRenderer { .position(|&t| t == self.start || t == self.end || t == self.call) .map(|p| p + body_start) .unwrap_or(ids.len()); - let body_closed = body_end < ids.len() && (ids[body_end] == self.end || ids[body_end] == self.call); + let body_closed = + body_end < ids.len() && (ids[body_end] == self.end || ids[body_end] == self.call); let body_text = self.decode_text(&ids[body_start..body_end]); // Channel: look for <|channel|>NAME in header — NAME is the @@ -560,14 +549,14 @@ impl Renderer for GptOssRenderer { .unwrap_or_default(); // Recipient: header text may contain "to=functions.NAME" - let recipient: Option<&str> = header_text - .split("to=") - .nth(1) - .map(|s| s.split(|c: char| c.is_whitespace() || c == '<').next().unwrap_or("")); + let recipient: Option<&str> = header_text.split("to=").nth(1).map(|s| { + s.split(|c: char| c.is_whitespace() || c == '<') + .next() + .unwrap_or("") + }); if let Some(r) = recipient { - if r.starts_with("functions.") { - let tool_name = &r["functions.".len()..]; + if let Some(tool_name) = r.strip_prefix("functions.") { let block_end = if body_closed { body_end + 1 } else { body_end }; let span = block_start..block_end; match serde_json::from_str::(&body_text) { @@ -597,9 +586,12 @@ impl Renderer for GptOssRenderer { } } + // analysis → reasoning_content; everything else (final, + // commentary without a tool recipient, missing channel) + // collapses into the visible content stream. match channel.split_whitespace().next() { Some("analysis") => reasoning_parts.push(body_text), - Some("final") | _ => content_parts.push(body_text), + _ => content_parts.push(body_text), } i = if body_closed { body_end + 1 } else { body_end }; @@ -652,7 +644,9 @@ impl Renderer for GptOssRenderer { } for hm in self.message_to_harmony(msg, false) { let mut out: Vec = Vec::new(); - self.enc.render_into(&hm, &mut out, None).map_err(harmony_err)?; + self.enc + .render_into(&hm, &mut out, None) + .map_err(harmony_err)?; ext.extend(out); } } diff --git a/crates/renderers-core/src/families/kimi_k2.rs b/crates/renderers-core/src/families/kimi_k2.rs index ef82e8f..70ef98f 100644 --- a/crates/renderers-core/src/families/kimi_k2.rs +++ b/crates/renderers-core/src/families/kimi_k2.rs @@ -24,7 +24,7 @@ use crate::parsing::kimi_k2::parse_kimi_k2; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, }; const DEFAULT_SYSTEM: &str = "You are Kimi, an AI assistant created by Moonshot AI."; @@ -105,8 +105,7 @@ impl KimiK2Renderer { let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; let tool_calls_section_begin = tokenizer.token_to_id_strict("<|tool_calls_section_begin|>")?; - let tool_calls_section_end = - tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; + let tool_calls_section_end = tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; let tool_call_begin = tokenizer.token_to_id_strict("<|tool_call_begin|>")?; let tool_call_argument_begin = tokenizer.token_to_id_strict("<|tool_call_argument_begin|>")?; @@ -143,7 +142,10 @@ impl KimiK2Renderer { for tool in tools { let mut m: BTreeMap = BTreeMap::new(); m.insert("name".into(), serde_json::Value::String(tool.name.clone())); - m.insert("description".into(), serde_json::Value::String(tool.description.clone())); + m.insert( + "description".into(), + serde_json::Value::String(tool.description.clone()), + ); m.insert("parameters".into(), Self::sort_keys(&tool.parameters)); arr.push(m); } @@ -211,8 +213,7 @@ impl Renderer for KimiK2Renderer { // tool_declare goes first if tools were provided and the caller // didn't already include a tool_declare message. let tools_pending = tools.map(|t| !t.is_empty()).unwrap_or(false); - let already_has_tool_declare = - !messages.is_empty() && messages[0].role == "tool_declare"; + let already_has_tool_declare = !messages.is_empty() && messages[0].role == "tool_declare"; if tools_pending && !already_has_tool_declare { working.push(Message { role: "tool_declare".to_string(), @@ -223,21 +224,20 @@ impl Renderer for KimiK2Renderer { } // Then the optional default system message - let auto_system_position: Option = if !messages.is_empty() - && messages[0].role == "tool_declare" - { - // tool_declare present in caller's input → if next isn't system, - // inject default system AFTER tool_declare - if messages.len() < 2 || messages[1].role != "system" { - Some(working.len() + 1) // will be inserted between tool_declare and the rest + let auto_system_position: Option = + if !messages.is_empty() && messages[0].role == "tool_declare" { + // tool_declare present in caller's input → if next isn't system, + // inject default system AFTER tool_declare + if messages.len() < 2 || messages[1].role != "system" { + Some(working.len() + 1) // will be inserted between tool_declare and the rest + } else { + None + } + } else if messages.is_empty() || messages[0].role != "system" { + Some(working.len()) } else { None - } - } else if messages.is_empty() || messages[0].role != "system" { - Some(working.len()) - } else { - None - }; + }; // Now lay out the rest: if let Some(pos) = auto_system_position { @@ -277,15 +277,23 @@ impl Renderer for KimiK2Renderer { } // Map normalised index → caller's index (sentinel for injected). - let orig_idx = |i: usize| -> i32 { - if injected[i] { - SCAFFOLD_IDX - } else { - let real: usize = - injected[..=i].iter().filter(|&&inj| !inj).count() - 1; - real as i32 + // Precompute as a flat Vec so the lookup is O(1) instead of an + // O(i) filter inside the render loop — saves an O(n²) walk on + // long conversations. + let orig_idx_table: Vec = { + let mut table = Vec::with_capacity(working.len()); + let mut real: i32 = -1; + for &inj in &injected { + if inj { + table.push(SCAFFOLD_IDX); + } else { + real += 1; + table.push(real); + } } + table }; + let orig_idx = |i: usize| -> i32 { orig_idx_table[i] }; // Index of the auto-injected system message (if any) — emits a // trailing literal "\n" after its <|im_end|>. diff --git a/crates/renderers-core/src/families/kimi_k25.rs b/crates/renderers-core/src/families/kimi_k25.rs index 297eac8..d78f893 100644 --- a/crates/renderers-core/src/families/kimi_k25.rs +++ b/crates/renderers-core/src/families/kimi_k25.rs @@ -24,9 +24,10 @@ use crate::emit::RenderBuf; use crate::parsing::kimi_k2::parse_kimi_k2; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; -use crate::traits::Renderer; +use crate::traits::{MultimodalRenderer, Renderer}; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, + MediaBundle, MediaItem, Message, Modality, MultiModalData, ParsedResponse, PlaceholderRange, + RenderError, RenderedTokens, ToolArguments, ToolSpec, }; #[derive(Debug, Clone)] @@ -82,6 +83,14 @@ pub struct KimiK25Renderer { tool_call_argument_begin: u32, tool_call_end: u32, + // Media tokens — present on K2.5 tokenizers, absent on K2 proper. + // When absent, as_multimodal() returns None. + media_begin: Option, + media_content: Option, + media_pad: Option, + media_end: Option, + mm_token_type_ids: Vec<(u32, u8)>, + stop_tokens: Vec, } @@ -101,13 +110,22 @@ impl KimiK25Renderer { let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; let tool_calls_section_begin = tokenizer.token_to_id_strict("<|tool_calls_section_begin|>")?; - let tool_calls_section_end = - tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; + let tool_calls_section_end = tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; let tool_call_begin = tokenizer.token_to_id_strict("<|tool_call_begin|>")?; let tool_call_argument_begin = tokenizer.token_to_id_strict("<|tool_call_argument_begin|>")?; let tool_call_end = tokenizer.token_to_id_strict("<|tool_call_end|>")?; + // Media tokens optional — K2 proper doesn't ship them. + let media_begin = tokenizer.token_to_id("<|media_begin|>"); + let media_content = tokenizer.token_to_id("<|media_content|>"); + let media_pad = tokenizer.token_to_id("<|media_pad|>"); + let media_end = tokenizer.token_to_id("<|media_end|>"); + let mut mm_token_type_ids: Vec<(u32, u8)> = Vec::new(); + if let Some(p) = media_pad { + mm_token_type_ids.push((p, 1)); // image marker; K2.5 handles video via the same pad + } + Ok(Self { tokenizer, enable_thinking: cfg.enable_thinking, @@ -123,10 +141,23 @@ impl KimiK25Renderer { tool_call_begin, tool_call_argument_begin, tool_call_end, + media_begin, + media_content, + media_pad, + media_end, + mm_token_type_ids, stop_tokens: vec![im_end], }) } + /// True when the loaded tokenizer ships the K2.5 media tokens. + pub fn supports_multimodal(&self) -> bool { + self.media_begin.is_some() + && self.media_content.is_some() + && self.media_pad.is_some() + && self.media_end.is_some() + } + fn args_to_string(args: &ToolArguments) -> String { match args { ToolArguments::Raw(s) => s.clone(), @@ -200,7 +231,12 @@ impl KimiK25Renderer { Ok(()) } - fn emit_tool_body(&self, buf: &mut RenderBuf<'_>, msg: &Message, msg_idx: i32) -> Result<(), RenderError> { + fn emit_tool_body( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + ) -> Result<(), RenderError> { let tool_call_id = msg.tool_call_id.as_deref().unwrap_or(""); let mut header = String::with_capacity(tool_call_id.len() + 16); header.push_str("## Return of "); @@ -373,4 +409,238 @@ impl Renderer for KimiK25Renderer { multi_modal_data: None, })) } + + fn as_multimodal(&self) -> Option<&dyn MultimodalRenderer> { + if self.supports_multimodal() { + Some(self) + } else { + None + } + } +} + +// ── Multimodal implementation ───────────────────────────────────────── +// +// Kimi K2.5's placeholder shape diverges from Qwen-VL: each image gets +// exactly ONE `<|media_pad|>` token in the input stream, regardless of +// image size. The model's vision encoder expands per-patch attention +// internally from `pixel_values` + `grid_thws`. The renderer's job is +// just to emit the per-image wrapper: +// +// <|media_begin|>image<|media_content|><|media_pad|><|media_end|>\n +// +// and accumulate the corresponding placeholder ranges + opaque payloads. + +impl KimiK25Renderer { + fn emit_media_item( + &self, + buf: &mut RenderBuf<'_>, + idx: i32, + item: &MediaItem, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + let begin = self + .media_begin + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_begin|>".into()))?; + let content = self + .media_content + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_content|>".into()))?; + let pad = self + .media_pad + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_pad|>".into()))?; + let end = self + .media_end + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_end|>".into()))?; + + let label = match item.modality { + Modality::Image => "image", + Modality::Video => "video", + }; + + buf.special(begin, idx); + buf.text(label, idx)?; + buf.special(content, idx); + let offset = buf.len(); + buf.special(pad, idx); + buf.special(end, idx); + buf.text("\n", idx)?; + + // Always exactly 1 placeholder in the stream, regardless of + // image size — that's the K2.5 convention. + let key = item.modality.as_str().to_string(); + mm.mm_hashes + .entry(key.clone()) + .or_default() + .push(item.hash.clone()); + mm.mm_placeholders + .entry(key.clone()) + .or_default() + .push(PlaceholderRange { offset, length: 1 }); + mm.mm_items + .entry(key) + .or_default() + .push(item.hf_payload.clone()); + Ok(()) + } + + fn emit_user_body_with_media<'m>( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + media_iter: &mut impl Iterator, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + match &msg.content { + crate::types::Content::Text(s) => { + // Plain-text + attached media: emit images first, then + // text. Same convention as Qwen-VL when the caller + // doesn't pass a structured content list. + for item in media_iter.by_ref() { + self.emit_media_item(buf, msg_idx, item, mm)?; + } + if !s.is_empty() { + buf.text(s, msg_idx)?; + } + } + crate::types::Content::Parts(parts) => { + use crate::types::ContentPart; + for part in parts { + match part { + ContentPart::Text { text } => { + if !text.is_empty() { + buf.text(text, msg_idx)?; + } + } + ContentPart::Thinking { .. } => {} + ContentPart::Image(_) | ContentPart::Video(_) => { + let item = media_iter.next().ok_or_else(|| { + RenderError::Invalid( + "K2.5 message content lists more media parts than the MediaBundle provides".into(), + ) + })?; + self.emit_media_item(buf, msg_idx, item, mm)?; + } + } + } + } + } + Ok(()) + } +} + +impl MultimodalRenderer for KimiK25Renderer { + fn mm_token_type_id_map(&self) -> &[(u32, u8)] { + &self.mm_token_type_ids + } + + fn render_with_media( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + media: &MediaBundle, + add_generation_prompt: bool, + ) -> Result { + if media.is_empty() { + return self.render(messages, tools, add_generation_prompt); + } + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + if tools.map(|t| !t.is_empty()).unwrap_or(false) { + return Err(RenderError::Invalid( + "Kimi K2.5 with tools not supported on the native path yet".into(), + )); + } + + // Per-message media iterator. The bundle is flat (message_idx, + // item), and K2.5 doesn't auto-inject system messages, so the + // indices align directly with the caller's input. + let mut buf = RenderBuf::new(&self.tokenizer, messages.len().max(1) * 256); + let mut mm = MultiModalData::default(); + + let mut last_non_tc_assistant: i32 = -1; + for (i, m) in messages.iter().enumerate().rev() { + if m.role == "assistant" && m.tool_calls.is_empty() { + last_non_tc_assistant = i as i32; + break; + } + } + + for (i, msg) in messages.iter().enumerate() { + let idx = i as i32; + buf.special(self.role_token(&msg.role), idx); + let role_name = msg.name.as_deref().unwrap_or(&msg.role); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + + match msg.role.as_str() { + "assistant" => { + let is_suffix = idx > last_non_tc_assistant; + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant_body(&mut buf, msg, idx, is_suffix, preserve_thinking)?; + } + "tool" => self.emit_tool_body(&mut buf, msg, idx)?, + _ => { + // user / system / other — interleave media inline + let mut media_iter = media + .items + .iter() + .filter_map(|(m, it)| (*m == i).then_some(it)); + self.emit_user_body_with_media(&mut buf, msg, idx, &mut media_iter, &mut mm)?; + if media_iter.next().is_some() { + return Err(RenderError::Invalid(format!( + "MediaBundle has more items for message {i} than the content's media parts" + ))); + } + } + } + buf.special(self.im_end, idx); + } + + if add_generation_prompt { + buf.scaffold_special(self.im_assistant); + buf.scaffold_text("assistant")?; + buf.scaffold_special(self.im_middle); + if self.enable_thinking { + buf.scaffold_text("")?; + } else { + buf.scaffold_text("")?; + } + } + + let mut out = buf.into_rendered(); + if !mm.is_empty() { + out.multi_modal_data = Some(mm); + } + Ok(out) + } + + fn bridge_to_next_turn_with_media( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: Option<&[ToolSpec]>, + new_media: &MediaBundle, + _previous_multi_modal_data: Option<&MultiModalData>, + ) -> Result, RenderError> { + if !new_media.is_empty() { + // Same Phase 5a caveat as Qwen3.5: bridging media-bearing + // new turns is unsafe under truncation. Fall back to a full + // re-render. + return Ok(None); + } + self.bridge_to_next_turn( + previous_prompt_ids, + previous_completion_ids, + new_messages, + tools, + ) + } } diff --git a/crates/renderers-core/src/families/minimax_m2.rs b/crates/renderers-core/src/families/minimax_m2.rs index 3e53915..9f181d4 100644 --- a/crates/renderers-core/src/families/minimax_m2.rs +++ b/crates/renderers-core/src/families/minimax_m2.rs @@ -6,13 +6,13 @@ //! Role "assistant" is rendered as "ai". //! - System block always present — default system message //! ("You are a helpful assistant. Your name is MiniMax-M2.5 and is -//! built by MiniMax.") auto-injected if missing. +//! built by MiniMax.") auto-injected if missing. //! - Tools, when supplied, are appended to the system message as //! `{json}` lines inside a `...` block, //! followed by a verbose instructions block. //! - Tool calls use XML wrapper + nested invokes: -//! `v... -//! ` +//! `v... +//! ` //! - Tool responses wrapped in literal `...` //! (plain text, no special token). //! - Thinking emitted only for assistants after the last user turn @@ -25,7 +25,7 @@ use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, }; const DEFAULT_SYSTEM: &str = @@ -96,10 +96,7 @@ impl MiniMaxM2Renderer { MiniMaxM2RendererBuilder::default() } - fn new_with( - tokenizer: Tokenizer, - cfg: MiniMaxM2RendererBuilder, - ) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: MiniMaxM2RendererBuilder) -> Result { let bos = tokenizer.token_to_id_strict("]~!b[")?; let role = tokenizer.token_to_id_strict("]~b]")?; let eos = tokenizer.token_to_id_strict("[e~[")?; @@ -174,8 +171,7 @@ impl Renderer for MiniMaxM2Renderer { } let mut buf = RenderBuf::new( &self.tokenizer, - messages.len().max(1) * 256 - + tools.map(|t| t.len() * 256 + 512).unwrap_or(0), + messages.len().max(1) * 256 + tools.map(|t| t.len() * 256 + 512).unwrap_or(0), ); let first_is_system = messages[0].role == "system"; @@ -366,8 +362,8 @@ impl MiniMaxM2Renderer { buf.special(self.role, orig_idx); let tool_calls = &msg.tool_calls; - let emit_think = !reasoning_content.is_empty() - && (conv_idx > last_user_index || preserve_thinking); + let emit_think = + !reasoning_content.is_empty() && (conv_idx > last_user_index || preserve_thinking); let after_think: String = if emit_think { buf.text("ai\n", orig_idx)?; diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs index afd784c..4113797 100644 --- a/crates/renderers-core/src/families/mod.rs +++ b/crates/renderers-core/src/families/mod.rs @@ -4,8 +4,8 @@ //! stays focused. New families slot in by adding a module here and a //! registry entry in [`crate::registry`]. -pub mod default; pub mod deepseek_v3; +pub mod default; pub mod glm; pub mod gpt_oss; pub mod kimi_k2; @@ -16,8 +16,8 @@ pub mod qwen3; pub mod qwen35; pub mod qwen36; -pub use default::{DefaultRenderer, DefaultRendererBuilder}; pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; +pub use default::{DefaultRenderer, DefaultRendererBuilder}; pub use glm::{GlmRenderer, GlmRendererBuilder}; pub use gpt_oss::{GptOssRenderer, GptOssRendererBuilder}; pub use kimi_k2::{KimiK2Renderer, KimiK2RendererBuilder}; diff --git a/crates/renderers-core/src/families/nemotron3.rs b/crates/renderers-core/src/families/nemotron3.rs index 17f76cb..44cec7d 100644 --- a/crates/renderers-core/src/families/nemotron3.rs +++ b/crates/renderers-core/src/families/nemotron3.rs @@ -27,7 +27,7 @@ use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, }; const TOOLS_HEADER: &str = "# Tools\n\nYou have access to the following functions:\n\n"; @@ -99,10 +99,7 @@ impl Nemotron3Renderer { Nemotron3RendererBuilder::default() } - fn new_with( - tokenizer: Tokenizer, - cfg: Nemotron3RendererBuilder, - ) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: Nemotron3RendererBuilder) -> Result { let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; let endoftext = tokenizer.token_to_id("<|endoftext|>"); @@ -151,7 +148,11 @@ impl Nemotron3Renderer { } out.push_str("\n"); - if let Some(props) = tool.parameters.get("properties").and_then(|v| v.as_object()) { + if let Some(props) = tool + .parameters + .get("properties") + .and_then(|v| v.as_object()) + { for (param_name, param_fields) in props { out.push_str("\n\n"); out.push_str(param_name); @@ -215,7 +216,7 @@ impl Nemotron3Renderer { handled: &[&str], ) { for (k, v) in obj { - if handled.iter().any(|h| *h == k.as_str()) { + if handled.contains(&k.as_str()) { continue; } out.push_str("\n<"); @@ -315,8 +316,7 @@ impl Nemotron3Renderer { msg_orig_idx: i32, ) -> Result<(), RenderError> { let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; - let next_is_tool = - msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; if !prev_is_tool { buf.special(self.im_start, msg_orig_idx); @@ -354,9 +354,15 @@ impl Nemotron3Renderer { None => { if let Some((before, after)) = raw_content.split_once("") { let r = if let Some((_, inner)) = before.rsplit_once("") { - inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() } else { - before.trim_start_matches('\n').trim_end_matches('\n').to_string() + before + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() }; (r, after.trim_start_matches('\n').to_string()) } else { @@ -417,8 +423,9 @@ impl Nemotron3Renderer { let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), - ToolArguments::Raw(s) => serde_json::from_str(s) - .unwrap_or(JsonValue::Object(Default::default())), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())) + } }; if let Some(obj) = args_value.as_object() { for (arg_name, arg_value) in obj { @@ -427,12 +434,17 @@ impl Nemotron3Renderer { serde_json::to_string(arg_value).unwrap_or_default() } JsonValue::String(s) => s.clone(), - JsonValue::Bool(b) => if *b { "True".into() } else { "False".into() }, + JsonValue::Bool(b) => { + if *b { + "True".into() + } else { + "False".into() + } + } JsonValue::Null => "None".into(), JsonValue::Number(n) => n.to_string(), }; - let mut param = - String::with_capacity(arg_name.len() + val_str.len() + 24); + let mut param = String::with_capacity(arg_name.len() + val_str.len() + 24); param.push_str("\n"); diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs index a3a7660..e48bb58 100644 --- a/crates/renderers-core/src/families/qwen3.rs +++ b/crates/renderers-core/src/families/qwen3.rs @@ -25,7 +25,7 @@ use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::{ - Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, }; const TOOLS_HEADER: &str = "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n"; @@ -110,10 +110,7 @@ impl Qwen3Renderer { Qwen3RendererBuilder::default() } - fn new_with( - tokenizer: Tokenizer, - cfg: Qwen3RendererBuilder, - ) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: Qwen3RendererBuilder) -> Result { let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; @@ -247,8 +244,7 @@ impl Qwen3Renderer { content: &str, ) -> Result<(), RenderError> { let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; - let next_is_tool = - msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; let idx = msg_idx as i32; if !prev_is_tool { @@ -289,9 +285,15 @@ impl Qwen3Renderer { None => { if let Some((before, after)) = raw_content.split_once("") { let reasoning = if let Some((_, inner)) = before.rsplit_once("") { - inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() } else { - before.trim_start_matches('\n').trim_end_matches('\n').to_string() + before + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() }; (reasoning, after.trim_start_matches('\n').to_string()) } else { @@ -309,7 +311,8 @@ impl Qwen3Renderer { let emit_via_override = preserve_thinking && !reasoning_content.is_empty(); let prefix = if emit_in_template_window || emit_via_override { - let mut s = String::with_capacity(reasoning_content.len() + content_after_think.len() + 32); + let mut s = + String::with_capacity(reasoning_content.len() + content_after_think.len() + 32); s.push_str("assistant\n\n"); s.push_str(reasoning_content.trim_matches('\n')); s.push_str("\n\n\n"); diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index c56914e..7ad75bf 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -614,32 +614,79 @@ impl Renderer for Qwen35Renderer { // through as opaque JSON into `MultiModalData::mm_items`. impl Qwen35Renderer { - /// Walk the user-message content parts and pop matching media items - /// from `bundle`, emitting placeholder spans inline. Stops at - /// content boundaries and accumulates `MultiModalData` side-by-side - /// with the token buffer. + /// Walk the user-message content parts in order, interleaving + /// placeholder spans where Image / Video parts appear. Mirrors the + /// HF chat template's behaviour: text and images appear in the + /// same order the caller listed them in `Content::Parts`. + /// + /// `media` items are consumed positionally — the N-th media item + /// for this message matches the N-th Image/Video part in the + /// content. Mismatched counts return an `Invalid` error. fn emit_user_with_media( &self, buf: &mut RenderBuf<'_>, + msg: &Message, msg_idx: usize, - content: &str, media: &MediaBundle, mm: &mut MultiModalData, ) -> Result<(), RenderError> { let idx = msg_idx as i32; buf.special(self.im_start, idx); buf.text("user\n", idx)?; - // Emit the text body - if !content.is_empty() { - buf.text(content, idx)?; - } - // Then any media items attached to this message - for (m_idx, item) in &media.items { - if *m_idx != msg_idx { - continue; + + // Gather this message's media items in render order. + let mut media_iter = media + .items + .iter() + .filter_map(|(m, item)| (*m == msg_idx).then_some(item)); + + match &msg.content { + crate::types::Content::Text(s) => { + // Plain-text user message with attached media: emit + // images first (canonical Qwen-VL shape: + // <|vision_start|>...<|vision_end|>{text}), then text. + for item in media_iter.by_ref() { + self.emit_media_item(buf, idx, item, mm)?; + } + if !s.is_empty() { + buf.text(s.trim(), idx)?; + } + } + crate::types::Content::Parts(parts) => { + use crate::types::ContentPart; + for part in parts { + match part { + ContentPart::Text { text } => { + if !text.is_empty() { + buf.text(text, idx)?; + } + } + ContentPart::Thinking { .. } => { + // Thinking parts shouldn't appear in user + // content — silently skip to match the + // Python implementation's behaviour. + } + ContentPart::Image(_) | ContentPart::Video(_) => { + let item = media_iter.next().ok_or_else(|| { + RenderError::Invalid(format!( + "message {msg_idx} content lists more media parts than the MediaBundle provides" + )) + })?; + self.emit_media_item(buf, idx, item, mm)?; + } + } + } } - self.emit_media_item(buf, idx, item, mm)?; } + + // Reject extra media items in the bundle that didn't get used — + // catches off-by-one errors in caller's bundle construction. + if media_iter.next().is_some() { + return Err(RenderError::Invalid(format!( + "MediaBundle has more items for message {msg_idx} than the content's media parts" + ))); + } + buf.special(self.im_end, idx); buf.text("\n", idx)?; Ok(()) @@ -739,11 +786,14 @@ impl MultimodalRenderer for Qwen35Renderer { } } "user" => { - // Check if this message has attached media; if so, use - // the multimodal emit path. + // If this message has attached media OR the caller + // provided structured content with image/video parts, + // walk the parts inline so order matches the caller's + // list. Pure text paths bypass the heavier walk. let has_media = media.items.iter().any(|(idx, _)| *idx == i); - if has_media { - self.emit_user_with_media(&mut buf, i, content, media, &mut mm)?; + let has_structured = matches!(msg.content, crate::types::Content::Parts(_)); + if has_media || has_structured { + self.emit_user_with_media(&mut buf, msg, i, media, &mut mm)?; } else { self.emit_user(&mut buf, content, i as i32)?; } diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs index cbad59c..fd93426 100644 --- a/crates/renderers-core/src/lib.rs +++ b/crates/renderers-core/src/lib.rs @@ -39,6 +39,7 @@ pub mod types; pub use traits::{MultimodalRenderer, Renderer}; pub use types::{ Content, ContentPart, ImageRef, Message, MultiModalData, ParsedResponse, ParsedToolCall, - PlaceholderRange, RenderError, RenderedTokens, ToolArguments, ToolCall, ToolCallFunction, - ToolCallParseStatus, ToolSpec, VideoRef, SCAFFOLD_IDX, + PlaceholderRange, RenderError, RenderedTokens, SCAFFOLD_IDX, + ToolArguments, ToolCall, ToolCallFunction, + ToolCallParseStatus, ToolSpec, VideoRef, }; diff --git a/crates/renderers-core/src/parsing/deepseek_v3.rs b/crates/renderers-core/src/parsing/deepseek_v3.rs index b4d994e..7d2871e 100644 --- a/crates/renderers-core/src/parsing/deepseek_v3.rs +++ b/crates/renderers-core/src/parsing/deepseek_v3.rs @@ -145,7 +145,11 @@ fn parse_deepseek_tool_calls( let n = after_sep[..nl].trim().to_string(); let rest = after_sep[nl + 1..].trim(); let args = match JSON_FENCE_RE.captures(rest) { - Some(c) => c.get(1).map(|m| m.as_str().trim()).unwrap_or("").to_string(), + Some(c) => c + .get(1) + .map(|m| m.as_str().trim()) + .unwrap_or("") + .to_string(), None => rest.to_string(), }; (n, args) diff --git a/crates/renderers-core/src/parsing/glm.rs b/crates/renderers-core/src/parsing/glm.rs index 0c118d1..e9c4943 100644 --- a/crates/renderers-core/src/parsing/glm.rs +++ b/crates/renderers-core/src/parsing/glm.rs @@ -90,7 +90,10 @@ pub fn parse_glm( (content, tcs) } None => ( - decode(tokenizer, ids).unwrap_or_default().trim().to_string(), + decode(tokenizer, ids) + .unwrap_or_default() + .trim() + .to_string(), Vec::new(), ), }; @@ -153,7 +156,10 @@ fn parse_glm_tool_calls( let mut any_json_fallback = false; let mut structure_broke = false; let name = match first_ak { - None => decode(tokenizer, block).unwrap_or_default().trim().to_string(), + None => decode(tokenizer, block) + .unwrap_or_default() + .trim() + .to_string(), Some(first) => { let n = decode(tokenizer, &block[..first]) .unwrap_or_default() diff --git a/crates/renderers-core/src/parsing/kimi_k2.rs b/crates/renderers-core/src/parsing/kimi_k2.rs index 0c77b41..8735f68 100644 --- a/crates/renderers-core/src/parsing/kimi_k2.rs +++ b/crates/renderers-core/src/parsing/kimi_k2.rs @@ -163,11 +163,19 @@ fn parse_kimi_k2_calls( out.push(ParsedToolCall { raw: block_text, - name: if func_name.is_empty() { None } else { Some(func_name) }, + name: if func_name.is_empty() { + None + } else { + Some(func_name) + }, arguments: Some(arguments), token_span: Some(span), status, - id: if raw_id.is_empty() { None } else { Some(raw_id) }, + id: if raw_id.is_empty() { + None + } else { + Some(raw_id) + }, }); i = tc_end + 1; if unclosed { diff --git a/crates/renderers-core/src/parsing/minimax.rs b/crates/renderers-core/src/parsing/minimax.rs index e931829..f1e309e 100644 --- a/crates/renderers-core/src/parsing/minimax.rs +++ b/crates/renderers-core/src/parsing/minimax.rs @@ -80,7 +80,10 @@ pub fn parse_minimax( let mut tool_calls: Vec = Vec::new(); let content_text = match find(ids, tool_call_id) { - None => decode(tokenizer, ids).unwrap_or_default().trim().to_string(), + None => decode(tokenizer, ids) + .unwrap_or_default() + .trim() + .to_string(), Some(tc_start) => { let content = decode(tokenizer, &ids[..tc_start]) .unwrap_or_default() diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs index 37f7921..a2fa815 100644 --- a/crates/renderers-core/src/parsing/mod.rs +++ b/crates/renderers-core/src/parsing/mod.rs @@ -28,7 +28,10 @@ pub fn find(ids: &[u32], target: u32) -> Option { /// Find the first index of `target` in `ids[start..]`, or `None`. #[inline] pub fn find_from(ids: &[u32], target: u32, start: usize) -> Option { - ids[start..].iter().position(|&x| x == target).map(|i| i + start) + ids[start..] + .iter() + .position(|&x| x == target) + .map(|i| i + start) } /// Find the first index of any token in `targets`, or `None`. `targets` diff --git a/crates/renderers-core/src/parsing/qwen35.rs b/crates/renderers-core/src/parsing/qwen35.rs index 1d29683..cb105bd 100644 --- a/crates/renderers-core/src/parsing/qwen35.rs +++ b/crates/renderers-core/src/parsing/qwen35.rs @@ -174,7 +174,10 @@ fn parse_xml_tool_calls( continue; } }; - let name = name_match.get(1).map(|m| m.as_str().to_string()).unwrap_or_default(); + let name = name_match + .get(1) + .map(|m| m.as_str().to_string()) + .unwrap_or_default(); let mut arguments = serde_json::Map::new(); let mut any_json_fallback = false; diff --git a/crates/renderers-core/src/processing/mod.rs b/crates/renderers-core/src/processing/mod.rs index 08f1063..bc45bad 100644 --- a/crates/renderers-core/src/processing/mod.rs +++ b/crates/renderers-core/src/processing/mod.rs @@ -19,5 +19,5 @@ pub mod qwen3_vl; pub mod resolver; -pub use qwen3_vl::{ProcessedImage, Qwen3VlImageProcessor, CLIP_MEAN, CLIP_STD}; +pub use qwen3_vl::{CLIP_MEAN, CLIP_STD, ProcessedImage, Qwen3VlImageProcessor}; pub use resolver::Qwen3VlResolver; diff --git a/crates/renderers-core/src/processing/qwen3_vl.rs b/crates/renderers-core/src/processing/qwen3_vl.rs index c2d7b58..e359121 100644 --- a/crates/renderers-core/src/processing/qwen3_vl.rs +++ b/crates/renderers-core/src/processing/qwen3_vl.rs @@ -33,7 +33,7 @@ use crate::types::RenderError; /// OpenAI CLIP normalisation constants — Qwen-VL inherits these. pub const CLIP_MEAN: [f32; 3] = [0.481_454_66, 0.457_827_5, 0.408_210_73]; -pub const CLIP_STD: [f32; 3] = [0.268_629_54, 0.261_302_58, 0.275_777_11]; +pub const CLIP_STD: [f32; 3] = [0.268_629_54, 0.261_302_6, 0.275_777_1]; /// Configuration for the Qwen-VL image processor pipeline. #[derive(Debug, Clone)] @@ -148,10 +148,7 @@ impl Qwen3VlImageProcessor { h.update(format!("({}, {})", rgb.width(), rgb.height()).as_bytes()); let digest = h.finalize(); // Trim to 32 hex chars to match the Python implementation. - let hex: String = digest - .iter() - .map(|b| format!("{b:02x}")) - .collect(); + let hex: String = digest.iter().map(|b| format!("{b:02x}")).collect(); hex[..32].to_string() } @@ -162,12 +159,8 @@ impl Qwen3VlImageProcessor { // Resize: image crate's CatmullRom is the closest match to PIL's // bicubic. See module-level docs for the parity caveat. - let resized = image::imageops::resize( - rgb, - new_w, - new_h, - image::imageops::FilterType::CatmullRom, - ); + let resized = + image::imageops::resize(rgb, new_w, new_h, image::imageops::FilterType::CatmullRom); // Build a (C=3, H, W) f32 array, normalised. let (h, w) = (new_h as usize, new_w as usize); @@ -217,8 +210,7 @@ impl Qwen3VlImageProcessor { for m_col in 0..merged_grid_w { for mi in 0..merge { for mj in 0..merge { - let token_idx = - ((m_row * merged_grid_w + m_col) * merge + mi) * merge + mj; + let token_idx = ((m_row * merged_grid_w + m_col) * merge + mi) * merge + mj; // Patch top-left in pixel coordinates: let py = (m_row * merge + mi) * ps; let px = (m_col * merge + mj) * ps; diff --git a/crates/renderers-core/src/processing/resolver.rs b/crates/renderers-core/src/processing/resolver.rs index 926eda5..8411057 100644 --- a/crates/renderers-core/src/processing/resolver.rs +++ b/crates/renderers-core/src/processing/resolver.rs @@ -79,11 +79,13 @@ impl MediaResolver for Qwen3VlResolver { fn resolve_image(&self, source: &MediaSource<'_>) -> Result { let bytes: Vec = match source { MediaSource::Bytes(b) => b.to_vec(), - MediaSource::Path(p) => fs::read(p) - .map_err(|e| RenderError::Invalid(format!("read image {p:?}: {e}")))?, + MediaSource::Path(p) => { + fs::read(p).map_err(|e| RenderError::Invalid(format!("read image {p:?}: {e}")))? + } MediaSource::Url(_) => { return Err(RenderError::Invalid( - "URL sources require an async fetch — pass already-downloaded bytes instead".into(), + "URL sources require an async fetch — pass already-downloaded bytes instead" + .into(), )); } }; diff --git a/crates/renderers-core/src/registry.rs b/crates/renderers-core/src/registry.rs index e8f1a22..5b8a193 100644 --- a/crates/renderers-core/src/registry.rs +++ b/crates/renderers-core/src/registry.rs @@ -4,7 +4,7 @@ //! families ported to Rust so far. New families slot in by adding a //! match arm in [`create_renderer`]. -use crate::families::{DeepSeekV3Renderer, Qwen35Renderer, Qwen3Renderer}; +use crate::families::{DeepSeekV3Renderer, Qwen3Renderer, Qwen35Renderer}; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; use crate::types::RenderError; @@ -19,7 +19,11 @@ pub enum RendererKind { } impl RendererKind { - pub fn from_str(name: &str) -> Option { + /// Resolve a renderer kind from its registry name. Accepts the + /// canonical lowercase form plus common aliases. Not named + /// `from_str` to avoid the `std::str::FromStr` trait collision — + /// it's an inherent method. + pub fn parse(name: &str) -> Option { match name { "qwen3" | "Qwen3" => Some(Self::Qwen3), "qwen35" | "qwen3.5" | "Qwen3.5" => Some(Self::Qwen35), diff --git a/crates/renderers-core/src/thinking.rs b/crates/renderers-core/src/thinking.rs index c291658..cd89887 100644 --- a/crates/renderers-core/src/thinking.rs +++ b/crates/renderers-core/src/thinking.rs @@ -39,9 +39,7 @@ pub fn should_preserve_past_thinking( } // The current segment must contain a tool response for the block to // count as an in-flight tool cycle. - messages[last_user + 1..] - .iter() - .any(|m| m.role == "tool") + messages[last_user + 1..].iter().any(|m| m.role == "tool") } #[cfg(test)] diff --git a/crates/renderers-core/src/traits.rs b/crates/renderers-core/src/traits.rs index b0bbb2b..3ef7e5e 100644 --- a/crates/renderers-core/src/traits.rs +++ b/crates/renderers-core/src/traits.rs @@ -4,7 +4,7 @@ //! at the public boundary works without extra ceremony. Family-specific //! configuration lives on the concrete struct that impls these traits. -use crate::types::{MultiModalData, ParsedResponse, RenderError, RenderedTokens}; +use crate::types::{MediaBundle, MultiModalData, ParsedResponse, RenderError, RenderedTokens}; use crate::types::{Message, ToolSpec}; /// Deterministic message → token renderer for a specific model family. @@ -36,7 +36,9 @@ pub trait Renderer: Send + Sync + std::fmt::Debug { tools: Option<&[ToolSpec]>, add_generation_prompt: bool, ) -> Result, RenderError> { - Ok(self.render(messages, tools, add_generation_prompt)?.token_ids) + Ok(self + .render(messages, tools, add_generation_prompt)? + .token_ids) } /// Parse a completion's token ids back into a structured response. @@ -69,18 +71,88 @@ pub trait Renderer: Send + Sync + std::fmt::Debug { } /// Extension implemented by multimodal-capable renderers. +/// +/// Phase 5 design: the renderer **does not touch raw pixel data**. The +/// caller resolves image/video parts upstream (via the HF processor in +/// the Phase 5a Python shim, or a candle-backed [`MediaResolver`] in +/// Phase 5b) and hands the renderer a [`MediaBundle`] with each item's +/// placeholder count pre-computed. +/// +/// Concrete implementors are added in Phase 5a; this trait surface is +/// frozen now so that diff is purely additive on a stable API. pub trait MultimodalRenderer: Renderer { - /// Map of placeholder token id → modality marker (1 = image, 2 = video). + /// Placeholder token id → modality marker (1 = image, 2 = video). + /// Used by the trainer to build per-token `mm_type_ids` masks. fn mm_token_type_id_map(&self) -> &[(u32, u8)]; - /// Multimodal-aware bridge. The trailing argument carries the prior - /// turn's sidecar so placeholders survive across turns. - fn bridge_to_next_turn_mm( + /// Render `messages` with pre-resolved `media`. + /// + /// The renderer walks `messages` and pulls items from `media` in + /// order. Each `MediaItem.num_tokens` is the count of placeholder + /// tokens the renderer must emit between the modality's + /// start/end special tokens. The item's `hf_payload` rides through + /// as opaque data on [`RenderedTokens::multi_modal_data`]. + fn render_with_media( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + media: &MediaBundle, + add_generation_prompt: bool, + ) -> Result; + + /// Multimodal-aware bridge. Same contract as + /// [`Renderer::bridge_to_next_turn`] plus `new_media` for the + /// extension and `previous_multi_modal_data` so prior placeholders + /// (and their hashes / payloads) survive across turns. + fn bridge_to_next_turn_with_media( &self, previous_prompt_ids: &[u32], previous_completion_ids: &[u32], new_messages: &[Message], tools: Option<&[ToolSpec]>, + new_media: &MediaBundle, previous_multi_modal_data: Option<&MultiModalData>, ) -> Result, RenderError>; } + +/// Resolves raw image / video sources to processor outputs. +/// +/// Phase 5a uses a Python-side implementation that wraps HF's +/// `Qwen3VLImageProcessor` / `KimiVLImageProcessor` and delivers +/// [`MediaItem`]s pre-sized. Phase 5b will add a Rust-native +/// implementation backed by `candle` (or `ort`) so downstream Rust +/// callers can skip the Python boundary entirely. +/// +/// The trait is deliberately tiny: a single resolve call per item, +/// caller chooses the modality and source. +pub trait MediaResolver: Send + Sync + std::fmt::Debug { + /// Resolve a single source (URL / filesystem path / inline bytes) + /// to a sized [`MediaItem`]. Implementations are free to cache by + /// hash; the resolver lives for the lifetime of a renderer pool + /// slot. + fn resolve_image( + &self, + source: &MediaSource<'_>, + ) -> Result; + + /// Resolve a video source — Phase 5b only. The default impl returns + /// an error so Phase 5a callers don't accidentally pass through. + fn resolve_video( + &self, + _source: &MediaSource<'_>, + ) -> Result { + Err(RenderError::Invalid( + "video resolution not implemented in this resolver".into(), + )) + } +} + +/// A source descriptor for a media item the caller wants resolved. +#[derive(Clone, Debug)] +pub enum MediaSource<'a> { + Url(&'a str), + Path(&'a std::path::Path), + /// Inline image bytes (PNG / JPEG / WebP / etc.). The resolver + /// detects the format from the bytes themselves. + Bytes(&'a [u8]), +} diff --git a/crates/renderers-core/src/types.rs b/crates/renderers-core/src/types.rs index ba2ffcc..0c33d6c 100644 --- a/crates/renderers-core/src/types.rs +++ b/crates/renderers-core/src/types.rs @@ -23,13 +23,9 @@ pub const SCAFFOLD_IDX: i32 = -1; #[serde(tag = "type", rename_all = "snake_case")] pub enum ContentPart { /// Plain text. - Text { - text: String, - }, + Text { text: String }, /// Model chain-of-thought as a content part. - Thinking { - thinking: String, - }, + Thinking { thinking: String }, /// Image reference. Resolution to bytes / processor output happens /// in the multimodal renderer. Image(ImageRef), @@ -176,8 +172,9 @@ impl ToolArguments { pub fn to_json_string(&self) -> String { match self { ToolArguments::Raw(s) => s.clone(), - ToolArguments::Object(v) => serde_json::to_string(v) - .unwrap_or_else(|_| "{}".to_string()), + ToolArguments::Object(v) => { + serde_json::to_string(v).unwrap_or_else(|_| "{}".to_string()) + } } } } @@ -332,4 +329,3 @@ pub enum RenderError { #[error("invalid input: {0}")] Invalid(String), } - diff --git a/crates/renderers-py/Cargo.toml b/crates/renderers-py/Cargo.toml index e1437b6..3a89784 100644 --- a/crates/renderers-py/Cargo.toml +++ b/crates/renderers-py/Cargo.toml @@ -18,3 +18,5 @@ pyo3 = { version = "0.22", features = ["extension-module", "abi3-py310"] } serde = { workspace = true } serde_json = { workspace = true } pythonize = "0.22" +numpy = "0.23" +ndarray = "0.16" diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 1ee978e..dcc34d5 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -9,23 +9,24 @@ use std::sync::Arc; +use numpy::{IntoPyArray, PyArray2}; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; -use pyo3::types::{PyList, PyType}; +use pyo3::types::{PyDict, PyList, PyType}; +use renderers_core::Renderer as CoreRenderer; use renderers_core::families::{ - DefaultRendererBuilder, DeepSeekV3RendererBuilder, GlmRendererBuilder, GptOssRendererBuilder, - KimiK25RendererBuilder, KimiK2RendererBuilder, MiniMaxM2RendererBuilder, - Nemotron3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, Qwen3RendererBuilder, + DeepSeekV3RendererBuilder, DefaultRendererBuilder, GlmRendererBuilder, GptOssRendererBuilder, + KimiK2RendererBuilder, KimiK25RendererBuilder, MiniMaxM2RendererBuilder, + Nemotron3RendererBuilder, Qwen3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, }; use renderers_core::processing::{ProcessedImage, Qwen3VlImageProcessor}; -use renderers_core::types::{MediaBundle, MediaItem, Modality}; use renderers_core::tokenizer::Tokenizer; +use renderers_core::types::{MediaBundle, MediaItem, Modality}; use renderers_core::types::{ Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, ToolCallParseStatus, ToolSpec, }; -use renderers_core::Renderer as CoreRenderer; fn render_err(e: renderers_core::types::RenderError) -> PyErr { PyRuntimeError::new_err(e.to_string()) @@ -38,7 +39,9 @@ fn invalid(msg: impl Into) -> PyErr { /// Decode a Python `list[dict]` of messages via pythonize. fn parse_messages(obj: &Bound<'_, PyAny>) -> PyResult> { let value: serde_json::Value = pythonize::depythonize(obj).map_err(|e| { - invalid(format!("messages must be a list of dicts (decode failed: {e})")) + invalid(format!( + "messages must be a list of dicts (decode failed: {e})" + )) })?; serde_json::from_value(value).map_err(|e| invalid(format!("messages shape mismatch: {e}"))) } @@ -48,10 +51,13 @@ fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult> if obj.is_none() { return Ok(None); } - let value: serde_json::Value = pythonize::depythonize(obj) - .map_err(|e| invalid(format!("tools must be a list of dicts (decode failed: {e})")))?; - let parsed: Vec = serde_json::from_value(value) - .map_err(|e| invalid(format!("tools shape mismatch: {e}")))?; + let value: serde_json::Value = pythonize::depythonize(obj).map_err(|e| { + invalid(format!( + "tools must be a list of dicts (decode failed: {e})" + )) + })?; + let parsed: Vec = + serde_json::from_value(value).map_err(|e| invalid(format!("tools shape mismatch: {e}")))?; Ok(Some(parsed)) } @@ -65,11 +71,13 @@ fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { }; let mut bundle = MediaBundle::new(); for item in arr { - let obj = item.as_object().ok_or_else(|| invalid("media item must be a dict"))?; - let message_idx = obj - .get("message_idx") - .and_then(|v| v.as_u64()) - .ok_or_else(|| invalid("media item missing message_idx"))? as usize; + let obj = item + .as_object() + .ok_or_else(|| invalid("media item must be a dict"))?; + let message_idx = + obj.get("message_idx") + .and_then(|v| v.as_u64()) + .ok_or_else(|| invalid("media item missing message_idx"))? as usize; let modality_str = obj .get("modality") .and_then(|v| v.as_str()) @@ -79,19 +87,27 @@ fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { "video" => Modality::Video, other => return Err(invalid(format!("unknown modality: {other}"))), }; - let num_tokens = obj - .get("num_tokens") - .and_then(|v| v.as_u64()) - .ok_or_else(|| invalid("media item missing num_tokens"))? as usize; + let num_tokens = + obj.get("num_tokens") + .and_then(|v| v.as_u64()) + .ok_or_else(|| invalid("media item missing num_tokens"))? as usize; let hash = obj .get("hash") .and_then(|v| v.as_str()) .map(|s| s.to_string()) .unwrap_or_default(); - let hf_payload = obj.get("hf_payload").cloned().unwrap_or(serde_json::Value::Null); + let hf_payload = obj + .get("hf_payload") + .cloned() + .unwrap_or(serde_json::Value::Null); bundle.push( message_idx, - MediaItem { modality, hash, num_tokens, hf_payload }, + MediaItem { + modality, + hash, + num_tokens, + hf_payload, + }, ); } Ok(bundle) @@ -99,7 +115,9 @@ fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { // Accept either a Python list of ints or a numpy-style sequence. - let list = obj.downcast::().map_err(|_| invalid("expected list[int]"))?; + let list = obj + .downcast::() + .map_err(|_| invalid("expected list[int]"))?; let mut out = Vec::with_capacity(list.len()); for item in list.iter() { let v: i64 = item.extract()?; @@ -177,8 +195,9 @@ impl PyParsedToolCall { fn arguments<'py>(&self, py: Python<'py>) -> PyResult> { match &self.inner.arguments { None => Ok(py.None().into_bound(py)), - Some(ToolArguments::Object(v)) => pythonize::pythonize(py, v) - .map_err(|e| invalid(format!("args serialisation: {e}"))), + Some(ToolArguments::Object(v)) => { + pythonize::pythonize(py, v).map_err(|e| invalid(format!("args serialisation: {e}"))) + } Some(ToolArguments::Raw(s)) => Ok(s.clone().into_py(py).into_bound(py)), } } @@ -317,6 +336,38 @@ impl PyRenderer { }) } + /// Build a Qwen3-VL renderer — alias for [`Renderer.qwen35`]. + /// + /// Qwen3-VL and Qwen3.5-VL share the same chat template and the + /// same set of special tokens, so the renderer implementation is + /// identical. The factory is exposed separately so callers reading + /// from a registry can spell the family name directly. + #[classmethod] + #[pyo3(signature = ( + tokenizer_path, + *, + enable_thinking = true, + preserve_all_thinking = false, + preserve_thinking_between_tool_calls = false, + ))] + fn qwen3_vl( + _cls: &Bound<'_, PyType>, + py: Python<'_>, + tokenizer_path: &str, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + ) -> PyResult { + Self::qwen35( + _cls, + py, + tokenizer_path, + enable_thinking, + preserve_all_thinking, + preserve_thinking_between_tool_calls, + ) + } + /// Build a Qwen3.5 renderer (text-only path) from a tokenizer.json. /// /// `enable_thinking` defaults to `True` (big-size variant). The Python @@ -412,7 +463,9 @@ impl PyRenderer { .build(tok) }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a GLM-5.1 renderer (GLM-5 + empty on last assistant). @@ -442,7 +495,9 @@ impl PyRenderer { .build(tok) }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a GLM-4.5 Air renderer from a tokenizer.json. @@ -472,7 +527,9 @@ impl PyRenderer { .build(tok) }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a MiniMax M2 / M2.5 renderer from a tokenizer.json. @@ -499,7 +556,9 @@ impl PyRenderer { .build(tok) }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a DefaultRenderer (Jinja fallback via minijinja). @@ -546,7 +605,9 @@ impl PyRenderer { b.build(tok) }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a GPT-OSS (Harmony) renderer. @@ -602,7 +663,9 @@ impl PyRenderer { b.build() }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a Kimi K2.5 renderer (text-only, no tools). @@ -637,7 +700,9 @@ impl PyRenderer { .build(tok) }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a Kimi K2 renderer from a tokenizer.json. @@ -667,7 +732,9 @@ impl PyRenderer { .build(tok) }) .map_err(render_err)?; - Ok(PyRenderer { inner: Arc::new(renderer) }) + Ok(PyRenderer { + inner: Arc::new(renderer), + }) } /// Build a Nemotron 3 renderer from a tokenizer.json. @@ -885,11 +952,21 @@ impl PyQwen3VlImageProcessor { merge_size: Option, ) -> PyResult { let mut p = Qwen3VlImageProcessor::default(); - if let Some(v) = min_pixels { p.min_pixels = v; } - if let Some(v) = max_pixels { p.max_pixels = v; } - if let Some(v) = patch_size { p.patch_size = v; } - if let Some(v) = temporal_patch_size { p.temporal_patch_size = v; } - if let Some(v) = merge_size { p.merge_size = v; } + if let Some(v) = min_pixels { + p.min_pixels = v; + } + if let Some(v) = max_pixels { + p.max_pixels = v; + } + if let Some(v) = patch_size { + p.patch_size = v; + } + if let Some(v) = temporal_patch_size { + p.temporal_patch_size = v; + } + if let Some(v) = merge_size { + p.merge_size = v; + } Ok(Self { inner: p }) } @@ -916,11 +993,7 @@ impl PyQwen3VlImageProcessor { /// ``` /// /// `message_idx` is up to the caller — it's not added here. - fn process_bytes<'py>( - &self, - py: Python<'py>, - bytes: &[u8], - ) -> PyResult> { + fn process_bytes<'py>(&self, py: Python<'py>, bytes: &[u8]) -> PyResult> { // Clone so the move into allow_threads is straightforward let processed: ProcessedImage = py .allow_threads(|| self.inner.process_bytes(bytes)) @@ -930,8 +1003,8 @@ impl PyQwen3VlImageProcessor { /// Convenience: read a file and process it. fn process_path<'py>(&self, py: Python<'py>, path: &str) -> PyResult> { - let bytes = std::fs::read(path) - .map_err(|e| invalid(format!("read image {path:?}: {e}")))?; + let bytes = + std::fs::read(path).map_err(|e| invalid(format!("read image {path:?}: {e}")))?; let processed: ProcessedImage = py .allow_threads(|| self.inner.process_bytes(&bytes)) .map_err(render_err)?; @@ -939,43 +1012,52 @@ impl PyQwen3VlImageProcessor { } #[getter] - fn patch_size(&self) -> u32 { self.inner.patch_size } + fn patch_size(&self) -> u32 { + self.inner.patch_size + } #[getter] - fn merge_size(&self) -> u32 { self.inner.merge_size } + fn merge_size(&self) -> u32 { + self.inner.merge_size + } #[getter] - fn temporal_patch_size(&self) -> u32 { self.inner.temporal_patch_size } + fn temporal_patch_size(&self) -> u32 { + self.inner.temporal_patch_size + } #[getter] - fn min_pixels(&self) -> u32 { self.inner.min_pixels } + fn min_pixels(&self) -> u32 { + self.inner.min_pixels + } #[getter] - fn max_pixels(&self) -> u32 { self.inner.max_pixels } + fn max_pixels(&self) -> u32 { + self.inner.max_pixels + } } -fn processed_to_pyobject<'py>( - py: Python<'py>, - p: ProcessedImage, -) -> PyResult> { - // Serialise via serde_json::Value first, then convert to a Python - // dict. The shape is identical to what the HF processor produces - // (lists of f32 + integer dims), so downstream glue can route it - // unchanged. - let shape = p.pixel_values.shape().to_vec(); - let value = serde_json::json!({ - "modality": "image", - "num_tokens": p.num_tokens, - "hash": p.hash, - "hf_payload": { - "pixel_values": { - "shape": [shape[0] as u64, shape[1] as u64], - "data": p.pixel_values.iter().copied().collect::>(), - }, - "image_grid_thw": { - "shape": [1u32, 3u32], - "data": p.image_grid_thw.to_vec(), - }, - }, - }); - pythonize::pythonize(py, &value) - .map_err(|e| invalid(format!("processed image → py: {e}"))) +fn processed_to_pyobject<'py>(py: Python<'py>, p: ProcessedImage) -> PyResult> { + // Zero-copy: hand numpy the Vec directly. The numpy array + // takes ownership of the buffer, so this avoids the per-element + // PyFloat allocation that the previous nested-list path triggered. + // Shape: (num_tokens × merge², 3 × temporal × patch²). + let shape = (p.pixel_values.shape()[0], p.pixel_values.shape()[1]); + let pixel_array: Bound<'py, PyArray2> = p.pixel_values.into_pyarray(py); + let grid_array: Bound<'py, PyArray2> = ndarray::Array2::from_shape_vec( + (1, 3), + p.image_grid_thw.iter().map(|&v| v as i64).collect(), + ) + .expect("image_grid_thw is always shape [1,3]") + .into_pyarray(py); + + let hf_payload = PyDict::new(py); + hf_payload.set_item("pixel_values", pixel_array)?; + hf_payload.set_item("image_grid_thw", grid_array)?; + + let out = PyDict::new(py); + out.set_item("modality", "image")?; + out.set_item("num_tokens", p.num_tokens)?; + out.set_item("hash", p.hash)?; + out.set_item("hf_payload", hf_payload)?; + let _ = shape; // shape captured in the numpy array's own metadata + Ok(out.into_any()) } #[pymodule] diff --git a/tests/test_native_parity.py b/tests/test_native_parity.py index c4d7b2f..9df3011 100644 --- a/tests/test_native_parity.py +++ b/tests/test_native_parity.py @@ -41,7 +41,14 @@ NATIVE_PARITY_FAMILIES = [ ("Qwen/Qwen3-8B", "qwen3", {}), ("Qwen/Qwen3.5-9B", "qwen35", {}), + ("Qwen/Qwen3.6-35B-A3B", "qwen36", {}), + ("zai-org/GLM-5", "glm5", {}), + ("zai-org/GLM-5.1", "glm51", {}), + ("THUDM/GLM-4.5-Air", "glm45", {}), ("deepseek-ai/DeepSeek-V3", "deepseek_v3", {}), + ("moonshotai/Kimi-K2-Instruct", "kimi_k2", {}), + ("MiniMaxAI/MiniMax-M2.5", "minimax_m2", {}), + ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "nemotron3", {}), ] @@ -81,19 +88,8 @@ def native_pair(request, native_module): # the ``__new__`` routing doesn't return a native instance. saved = os.environ.pop("RENDERERS_NATIVE", None) try: - if family == "qwen3": - from renderers.qwen3 import Qwen3Renderer - - py_renderer = Qwen3Renderer(tokenizer, **extra) - elif family == "qwen35": - from renderers.qwen35 import Qwen35Renderer - - py_renderer = Qwen35Renderer(tokenizer, **extra) - elif family == "deepseek_v3": - from renderers.deepseek_v3 import DeepSeekV3Renderer - - py_renderer = DeepSeekV3Renderer(tokenizer, **extra) - else: + py_renderer = _build_python_renderer(family, tokenizer, extra) + if py_renderer is None: pytest.skip(f"no python builder wired for {family}") finally: if saved is not None: @@ -101,18 +97,70 @@ def native_pair(request, native_module): # Build the native renderer directly through the module surface — # bypasses the env-var routing entirely. - if family == "qwen3": - native_renderer = native_module.Renderer.qwen3(tok_path, **extra) - elif family == "qwen35": - native_renderer = native_module.Renderer.qwen35(tok_path, **extra) - elif family == "deepseek_v3": - native_renderer = native_module.Renderer.deepseek_v3(tok_path, **extra) - else: + native_renderer = _build_native_renderer(native_module, family, tok_path, extra) + if native_renderer is None: pytest.skip(f"no native builder wired for {family}") return py_renderer, native_renderer, tokenizer +# ── Family-specific builder dispatch ───────────────────────────────── + + +def _build_python_renderer(family: str, tokenizer, extra): + """Return a pure-Python renderer for *family*, or ``None`` if missing.""" + if family == "qwen3": + from renderers.qwen3 import Qwen3Renderer + return Qwen3Renderer(tokenizer, **extra) + if family == "qwen35": + from renderers.qwen35 import Qwen35Renderer + return Qwen35Renderer(tokenizer, **extra) + if family == "qwen36": + from renderers.qwen36 import Qwen36Renderer + return Qwen36Renderer(tokenizer, **extra) + if family == "glm5": + from renderers.glm5 import GLM5Renderer + return GLM5Renderer(tokenizer, **extra) + if family == "glm51": + from renderers.glm5 import GLM51Renderer + return GLM51Renderer(tokenizer, **extra) + if family == "glm45": + from renderers.glm45 import GLM45Renderer + return GLM45Renderer(tokenizer, **extra) + if family == "deepseek_v3": + from renderers.deepseek_v3 import DeepSeekV3Renderer + return DeepSeekV3Renderer(tokenizer, **extra) + if family == "kimi_k2": + from renderers.kimi_k2 import KimiK2Renderer + return KimiK2Renderer(tokenizer, **extra) + if family == "minimax_m2": + from renderers.minimax_m2 import MiniMaxM2Renderer + return MiniMaxM2Renderer(tokenizer, **extra) + if family == "nemotron3": + from renderers.nemotron3 import Nemotron3Renderer + return Nemotron3Renderer(tokenizer, **extra) + return None + + +def _build_native_renderer(native_module, family: str, tok_path: str, extra): + """Return a native renderer for *family* via the explicit factory.""" + factory = { + "qwen3": native_module.Renderer.qwen3, + "qwen35": native_module.Renderer.qwen35, + "qwen36": native_module.Renderer.qwen36, + "glm5": native_module.Renderer.glm5, + "glm51": native_module.Renderer.glm51, + "glm45": native_module.Renderer.glm45, + "deepseek_v3": native_module.Renderer.deepseek_v3, + "kimi_k2": native_module.Renderer.kimi_k2, + "minimax_m2": native_module.Renderer.minimax_m2, + "nemotron3": native_module.Renderer.nemotron3, + }.get(family) + if factory is None: + return None + return factory(tok_path, **extra) + + # ── Conversation fixtures (a representative cross-section) ─────────── From 966dad4e2867389b76556e527987db2f68db47cc Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:52:37 +0200 Subject: [PATCH 16/35] Fix native review regressions --- crates/renderers-core/src/families/qwen35.rs | 95 +++++++++++++++----- crates/renderers-py/Cargo.toml | 2 +- renderers/_native_vision.py | 36 +++++--- renderers/kimi_k25.py | 57 +++++++++--- tests/test_native_parity.py | 27 ++++++ tests/test_native_router.py | 26 ++++++ tests/test_native_vision.py | 49 ++++++++++ uv.lock | 4 +- 8 files changed, 246 insertions(+), 50 deletions(-) create mode 100644 tests/test_native_vision.py diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index 7ad75bf..35fbed5 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -16,6 +16,8 @@ //! Rust — the caller passes it explicitly through the builder. The //! Python shim handles the polarity probe and forwards the result. +use std::borrow::Cow; + use serde_json::json; use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; @@ -25,8 +27,9 @@ use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; use crate::traits::{MultimodalRenderer, Renderer}; use crate::types::{ - MediaBundle, MediaItem, Message, Modality, MultiModalData, ParsedResponse, PlaceholderRange, - RenderError, RenderedTokens, ToolArguments, ToolSpec, SCAFFOLD_IDX, + Content, ContentPart, MediaBundle, MediaItem, Message, Modality, MultiModalData, + ParsedResponse, PlaceholderRange, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, + ToolSpec, }; const TOOLS_HEADER: &str = "# Tools\n\nYou have access to the following functions:\n\n"; @@ -180,9 +183,26 @@ impl Qwen35Renderer { /// True when the underlying tokenizer ships the vision special /// tokens. Used by [`Renderer::as_multimodal`]. pub fn supports_multimodal(&self) -> bool { - self.vision_start.is_some() - && self.vision_end.is_some() - && self.image_pad.is_some() + self.vision_start.is_some() && self.vision_end.is_some() && self.image_pad.is_some() + } + + /// Text view of message content, matching the Python + /// `Qwen35Renderer._render_content` helper: join text parts and skip + /// media / thinking parts. This is used by the text-only native path + /// so OpenAI-style structured text content is not silently dropped. + fn render_content_text(content: &Content) -> Cow<'_, str> { + match content { + Content::Text(s) => Cow::Borrowed(s.as_str()), + Content::Parts(parts) => { + let mut out = String::new(); + for part in parts { + if let ContentPart::Text { text } = part { + out.push_str(text); + } + } + Cow::Owned(out) + } + } } /// Index of the most recent non-tool-response user message; @@ -194,7 +214,8 @@ impl Qwen35Renderer { if msg.role != "user" { continue; } - let content = msg.text_content().trim(); + let content = Self::render_content_text(&msg.content); + let content = content.trim(); if !(content.starts_with("") && content.ends_with("")) { return i as i32; } @@ -213,7 +234,8 @@ impl Qwen35Renderer { buf.special(self.im_start, sys_idx); buf.text("system\n", sys_idx)?; - let mut tool_text = String::with_capacity(TOOLS_HEADER.len() + TOOLS_INSTRUCTIONS.len() + 256); + let mut tool_text = + String::with_capacity(TOOLS_HEADER.len() + TOOLS_INSTRUCTIONS.len() + 256); tool_text.push_str(TOOLS_HEADER); for tool in tools { tool_text.push('\n'); @@ -230,7 +252,8 @@ impl Qwen35Renderer { tool_text.push_str(TOOLS_INSTRUCTIONS); if first_is_system { - let sys_content = messages[0].text_content().trim(); + let sys_content = Self::render_content_text(&messages[0].content); + let sys_content = sys_content.trim(); if !sys_content.is_empty() { tool_text.push_str("\n\n"); tool_text.push_str(sys_content); @@ -248,7 +271,8 @@ impl Qwen35Renderer { buf: &mut RenderBuf<'_>, messages: &[Message], ) -> Result<(), RenderError> { - let content = messages[0].text_content().trim(); + let content = Self::render_content_text(&messages[0].content); + let content = content.trim(); buf.special(self.im_start, 0); let mut s = String::with_capacity(content.len() + 8); s.push_str("system\n"); @@ -283,8 +307,7 @@ impl Qwen35Renderer { content: &str, ) -> Result<(), RenderError> { let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; - let next_is_tool = - msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; let idx = msg_idx as i32; if !prev_is_tool { @@ -325,7 +348,11 @@ impl Qwen35Renderer { } serde_json::Value::String(s) => s.clone(), serde_json::Value::Bool(b) => { - if *b { "True".to_string() } else { "False".to_string() } + if *b { + "True".to_string() + } else { + "False".to_string() + } } serde_json::Value::Null => "None".to_string(), serde_json::Value::Number(n) => n.to_string(), @@ -341,15 +368,21 @@ impl Qwen35Renderer { last_query_index: i32, preserve_thinking: bool, ) -> Result<(), RenderError> { - let raw_content = msg.text_content(); + let raw_content = Self::render_content_text(&msg.content); let (reasoning_content, content_after) = match &msg.reasoning_content { Some(s) => (s.clone(), raw_content.to_string()), None => { if let Some((before, after)) = raw_content.split_once("") { let reasoning = if let Some((_, inner)) = before.rsplit_once("") { - inner.trim_start_matches('\n').trim_end_matches('\n').to_string() + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() } else { - before.trim_start_matches('\n').trim_end_matches('\n').to_string() + before + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() }; (reasoning, after.trim_start_matches('\n').to_string()) } else { @@ -407,8 +440,9 @@ impl Qwen35Renderer { // Arguments — accept JSON string (decode first) or object let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), - ToolArguments::Raw(s) => serde_json::from_str(s) - .unwrap_or(serde_json::Value::Object(Default::default())), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(serde_json::Value::Object(Default::default())) + } }; if let Some(obj) = args_value.as_object() { for (arg_name, arg_value) in obj { @@ -482,7 +516,8 @@ impl Renderer for Qwen35Renderer { let last_qi = Self::last_query_index(messages); for (i, msg) in messages.iter().enumerate() { - let content = msg.text_content().trim(); + let content = Self::render_content_text(&msg.content); + let content = content.trim(); match msg.role.as_str() { "system" => { if i != 0 { @@ -563,7 +598,8 @@ impl Renderer for Qwen35Renderer { buf.scaffold_text("\n")?; for (i, msg) in new_messages.iter().enumerate() { - let content = msg.text_content().trim(); + let content = Self::render_content_text(&msg.content); + let content = content.trim(); let idx = i as i32; match msg.role.as_str() { "user" => self.emit_user(&mut buf, content, idx)?, @@ -726,7 +762,10 @@ impl Qwen35Renderer { // Update MultiModalData. Key by modality string ("image" / // "video") so the inference engine glue can route per-key. let key = item.modality.as_str().to_string(); - mm.mm_hashes.entry(key.clone()).or_default().push(item.hash.clone()); + mm.mm_hashes + .entry(key.clone()) + .or_default() + .push(item.hash.clone()); mm.mm_placeholders .entry(key.clone()) .or_default() @@ -734,7 +773,10 @@ impl Qwen35Renderer { offset, length: item.num_tokens, }); - mm.mm_items.entry(key).or_default().push(item.hf_payload.clone()); + mm.mm_items + .entry(key) + .or_default() + .push(item.hf_payload.clone()); Ok(()) } } @@ -776,7 +818,8 @@ impl MultimodalRenderer for Qwen35Renderer { let mut mm = MultiModalData::default(); for (i, msg) in messages.iter().enumerate() { - let content = msg.text_content().trim(); + let content = Self::render_content_text(&msg.content); + let content = content.trim(); match msg.role.as_str() { "system" => { if i != 0 { @@ -847,7 +890,11 @@ impl MultimodalRenderer for Qwen35Renderer { if !_new_media.is_empty() { return Ok(None); } - self.bridge_to_next_turn(previous_prompt_ids, previous_completion_ids, new_messages, tools) + self.bridge_to_next_turn( + previous_prompt_ids, + previous_completion_ids, + new_messages, + tools, + ) } } - diff --git a/crates/renderers-py/Cargo.toml b/crates/renderers-py/Cargo.toml index 3a89784..dcd2c8f 100644 --- a/crates/renderers-py/Cargo.toml +++ b/crates/renderers-py/Cargo.toml @@ -14,7 +14,7 @@ path = "src/lib.rs" [dependencies] renderers-core = { path = "../renderers-core" } -pyo3 = { version = "0.22", features = ["extension-module", "abi3-py310"] } +pyo3 = { version = "0.22", features = ["abi3-py310"] } serde = { workspace = true } serde_json = { workspace = true } pythonize = "0.22" diff --git a/renderers/_native_vision.py b/renderers/_native_vision.py index 452c006..cd07b3a 100644 --- a/renderers/_native_vision.py +++ b/renderers/_native_vision.py @@ -112,18 +112,34 @@ def process_image_for_qwen_vl( image.convert("RGB").save(buf, format="PNG") out = proc.process_bytes(buf.getvalue()) - if return_numpy: - import numpy as np # local to keep import cost off the hot path + import numpy as np # local to keep import cost off the hot path + + pv = out["hf_payload"]["pixel_values"] + gt = out["hf_payload"]["image_grid_thw"] + + def _as_array(value, dtype): + if isinstance(value, dict): + return np.asarray(value["data"], dtype=dtype).reshape(tuple(value["shape"])) + return np.asarray(value, dtype=dtype) + + pixel_values = _as_array(pv, np.float32) + image_grid_thw = _as_array(gt, np.int64) - pv = out["hf_payload"]["pixel_values"] - gt = out["hf_payload"]["image_grid_thw"] + if return_numpy: + out["hf_payload"] = { + "pixel_values": pixel_values, + "image_grid_thw": image_grid_thw, + } + else: out["hf_payload"] = { - "pixel_values": np.asarray(pv["data"], dtype=np.float32).reshape( - tuple(pv["shape"]) - ), - "image_grid_thw": np.asarray(gt["data"], dtype=np.int64).reshape( - tuple(gt["shape"]) - ), + "pixel_values": { + "shape": list(pixel_values.shape), + "data": pixel_values.reshape(-1).tolist(), + }, + "image_grid_thw": { + "shape": list(image_grid_thw.shape), + "data": image_grid_thw.reshape(-1).tolist(), + }, } out["message_idx"] = message_idx diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 7eac888..4155e2e 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -601,20 +601,9 @@ def __new__( preserve_all_thinking=False, preserve_thinking_between_tool_calls=False, image_cache_max=256, - # Tools / messages are bound to render-time, but native routing - # decides eagerly here based on builder-time signals: skip native - # when a processor is configured (caller will pass images later). + # Tools / messages are bound to render-time, so native routing + # happens inside render() via a cached text-only delegate. ): - if native_enabled("kimi_k25") and processor is None: - native = load_native() - if native is not None: - path = resolve_tokenizer_path(tokenizer) - return native.Renderer.kimi_k25( - path, - enable_thinking=enable_thinking, - preserve_all_thinking=preserve_all_thinking, - preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, - ) return super().__new__(cls) def __init__( @@ -627,6 +616,17 @@ def __init__( self._tokenizer = tokenizer self._processor = processor self.config = config or KimiK25RendererConfig() + self._native_renderer = None + if native_enabled("kimi_k25") and processor is None: + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + self._native_renderer = native.Renderer.kimi_k25( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) # Core structural tokens — all must be single special tokens in the vocab self._im_user = self._token_id("<|im_user|>") @@ -668,6 +668,22 @@ def __init__( # consistency / debugging. self._image_cache: dict[str, tuple[Any, int]] = {} + @staticmethod + def _content_has_media(content: Any) -> bool: + if not isinstance(content, list): + return False + return any( + isinstance(part, dict) and (_is_image_part(part) or _is_video_part(part)) + for part in content + ) + + def _can_use_native( + self, messages: list[Message], tools: list[ToolSpec] | None + ) -> bool: + if self._native_renderer is None or tools: + return False + return not any(self._content_has_media(msg.get("content")) for msg in messages) + @property def mm_token_type_id_map(self) -> dict[int, int]: """Token-id → modality marker. For Kimi K2.5 only ``<|media_pad|>`` @@ -770,6 +786,13 @@ def render( - Generation prompt: ``<|im_assistant|>assistant<|im_middle|>`` + ```` (or ```` when thinking off) """ + if self._can_use_native(messages, tools): + return self._native_renderer.render( + messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + ) + if not messages: raise ValueError("No messages provided.") @@ -997,6 +1020,14 @@ def render_ids( tools: list[ToolSpec] | None = None, add_generation_prompt: bool = False, ) -> list[int]: + if self._can_use_native(messages, tools): + return list( + self._native_renderer.render_ids( + messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + ) + ) return self.render( messages, tools=tools, diff --git a/tests/test_native_parity.py b/tests/test_native_parity.py index 9df3011..f466967 100644 --- a/tests/test_native_parity.py +++ b/tests/test_native_parity.py @@ -294,6 +294,33 @@ def test_render_ids_with_tools_parity(native_pair, case, messages): assert py_ids == rs_ids +def test_qwen35_structured_text_parts_parity(native_pair): + py_renderer, native_renderer, _tok = native_pair + if type(py_renderer).__name__ not in {"Qwen35Renderer", "Qwen36Renderer"}: + pytest.skip("structured text part coverage is specific to Qwen3.5/Qwen3.6") + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Hello"}, + {"type": "text", "text": " from structured parts"}, + ], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Structured"}, + {"type": "text", "text": " reply"}, + ], + }, + ] + + py_ids = list(py_renderer.render_ids(messages)) + rs_ids = list(native_renderer.render_ids(messages)) + assert py_ids == rs_ids + + @pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) def test_message_indices_parity(native_pair, case, messages): """Per-token attribution must match — critical for training loss masks.""" diff --git a/tests/test_native_router.py b/tests/test_native_router.py index 6acbe5e..2a73ad5 100644 --- a/tests/test_native_router.py +++ b/tests/test_native_router.py @@ -74,6 +74,32 @@ def test_resolve_tokenizer_path_from_exact_file(tmp_path): assert router.resolve_tokenizer_path(str(f)) == str(f) +def test_kimi_k25_constructor_does_not_route_eagerly(monkeypatch): + from renderers.kimi_k25 import KimiK25Renderer + + fake_native = mock.Mock() + monkeypatch.setattr("renderers.kimi_k25.native_enabled", lambda _family: True) + monkeypatch.setattr("renderers.kimi_k25.load_native", lambda: fake_native) + + inst = KimiK25Renderer.__new__(KimiK25Renderer, object(), processor=None) + + assert isinstance(inst, KimiK25Renderer) + fake_native.Renderer.kimi_k25.assert_not_called() + + +def test_kimi_k25_native_delegate_rejects_render_time_tools(): + from renderers.kimi_k25 import KimiK25Renderer + + inst = object.__new__(KimiK25Renderer) + inst._native_renderer = object() + + assert inst._can_use_native([{"role": "user", "content": "hi"}], tools=None) + assert not inst._can_use_native( + [{"role": "user", "content": "hi"}], + tools=[{"name": "echo", "parameters": {}}], + ) + + # ── Native module surface (only runs when the wheel is built) ──────── diff --git a/tests/test_native_vision.py b/tests/test_native_vision.py new file mode 100644 index 0000000..61f6c29 --- /dev/null +++ b/tests/test_native_vision.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import numpy as np + +import renderers._native_vision as _native_vision + + +class _FakeProcessor: + def process_bytes(self, _raw: bytes): + return { + "modality": "image", + "num_tokens": 2, + "hash": "abc", + "hf_payload": { + "pixel_values": np.arange(6, dtype=np.float32).reshape(2, 3), + "image_grid_thw": np.array([[1, 2, 4]], dtype=np.int64), + }, + } + + +def test_process_image_for_qwen_vl_accepts_native_numpy_payload(monkeypatch): + monkeypatch.setattr( + _native_vision, "get_qwen_vl_processor", lambda **_kwargs: _FakeProcessor() + ) + + out = _native_vision.process_image_for_qwen_vl(b"image", message_idx=3) + + assert out["message_idx"] == 3 + assert out["hf_payload"]["pixel_values"].shape == (2, 3) + assert out["hf_payload"]["image_grid_thw"].shape == (1, 3) + + +def test_process_image_for_qwen_vl_return_numpy_false_converts_to_dict(monkeypatch): + monkeypatch.setattr( + _native_vision, "get_qwen_vl_processor", lambda **_kwargs: _FakeProcessor() + ) + + out = _native_vision.process_image_for_qwen_vl( + b"image", message_idx=3, return_numpy=False + ) + + assert out["hf_payload"]["pixel_values"] == { + "shape": [2, 3], + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0], + } + assert out["hf_payload"]["image_grid_thw"] == { + "shape": [1, 3], + "data": [1, 2, 4], + } diff --git a/uv.lock b/uv.lock index 8096df3..6b22371 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-05-18T21:42:54.18041997Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "P7D" [options.exclude-newer-package] @@ -1409,7 +1409,7 @@ requires-dist = [ { name = "jinja2" }, { name = "numpy" }, { name = "openai", specifier = ">=1.108.1" }, - { name = "openai-harmony", specifier = ">=0.0.8" }, + { name = "openai-harmony", specifier = ">=0.0.4" }, { name = "prime-pydantic-config", specifier = ">=0.3.0.dev83" }, { name = "tiktoken" }, { name = "transformers", specifier = ">=4.50.0" }, From 69ba647ca6ff7709a4c977b2af136f4f8c4124f3 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:53:36 +0200 Subject: [PATCH 17/35] Fix native token parity --- .github/workflows/rust.yml | 81 ++++ Cargo.lock | 433 ++++++++++-------- Cargo.toml | 31 +- crates/renderers-cli/Cargo.toml | 29 ++ crates/renderers-cli/benches/qwen3.rs | 149 ++++++ crates/renderers-cli/src/main.rs | 214 +++++++++ crates/renderers-core/Cargo.toml | 7 +- crates/renderers-core/src/families/default.rs | 1 + crates/renderers-core/src/families/glm.rs | 28 +- .../renderers-core/src/families/kimi_k25.rs | 5 + .../renderers-core/src/families/minimax_m2.rs | 13 +- crates/renderers-core/src/families/qwen3.rs | 16 +- crates/renderers-core/src/families/qwen35.rs | 17 +- crates/renderers-core/src/json.rs | 78 ++++ crates/renderers-core/src/lib.rs | 1 + crates/renderers-core/src/types.rs | 20 +- crates/renderers-py/Cargo.toml | 11 +- crates/renderers-py/src/lib.rs | 91 ++-- renderers/_native_router.py | 32 +- renderers/kimi_k2.py | 17 +- renderers/kimi_k25.py | 17 +- rust-toolchain.toml | 4 + 22 files changed, 999 insertions(+), 296 deletions(-) create mode 100644 .github/workflows/rust.yml create mode 100644 crates/renderers-cli/Cargo.toml create mode 100644 crates/renderers-cli/benches/qwen3.rs create mode 100644 crates/renderers-cli/src/main.rs create mode 100644 crates/renderers-core/src/json.rs create mode 100644 rust-toolchain.toml diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..bfb6c39 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,81 @@ +name: Rust + +on: + push: + branches: [main] + paths: + - "**.rs" + - "**/Cargo.toml" + - "Cargo.lock" + - "rust-toolchain.toml" + - ".github/workflows/rust.yml" + pull_request: + branches: [main] + paths: + - "**.rs" + - "**/Cargo.toml" + - "Cargo.lock" + - "rust-toolchain.toml" + - ".github/workflows/rust.yml" + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: -D warnings + +jobs: + fmt: + name: Rustfmt + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install stable toolchain + run: rustup show active-toolchain || rustup toolchain install + - name: cargo fmt + run: cargo fmt --all -- --check + + clippy: + name: Clippy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install stable toolchain + run: rustup show active-toolchain || rustup toolchain install + - uses: Swatinem/rust-cache@v2 + - name: cargo clippy + # `clippy::all` is `deny` via workspace lints; pedantic stays at `warn` + # for visibility. `-D warnings` here still escalates any remaining + # rustc warnings to errors. + run: cargo clippy --workspace --all-targets --locked + + test: + name: Test (Rust) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install stable toolchain + run: rustup show active-toolchain || rustup toolchain install + - uses: Swatinem/rust-cache@v2 + - name: cargo test + # Skip the PyO3 crate: it links libpython and is exercised via pytest. + run: cargo test --workspace --exclude renderers-py --locked + + miri: + name: Miri (renderers-core) + runs-on: ubuntu-latest + env: + # Miri's stricter UB detection. + MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check + steps: + - uses: actions/checkout@v4 + - name: Install nightly with miri + run: | + rustup toolchain install nightly --component miri + rustup +nightly component add rust-src + - uses: Swatinem/rust-cache@v2 + with: + key: miri + - name: cargo miri setup + run: cargo +nightly miri setup + - name: cargo miri test + # renderers-core only — PyO3 / FFI crates can't run under Miri. + run: cargo +nightly miri test -p renderers-core --lib --tests diff --git a/Cargo.lock b/Cargo.lock index bb0380f..a4e72a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,20 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "serde", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -35,6 +49,15 @@ dependencies = [ "equator", ] +[[package]] +name = "alloca" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4" +dependencies = [ + "cc", +] + [[package]] name = "android_system_properties" version = "0.1.5" @@ -165,7 +188,7 @@ dependencies = [ "num-traits", "pastey", "rayon", - "thiserror 2.0.18", + "thiserror", "v_frame", "y4m", ] @@ -250,6 +273,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +dependencies = [ + "hybrid-array", +] + [[package]] name = "bs58" version = "0.5.1" @@ -306,6 +338,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.62" @@ -421,6 +462,27 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "compact_str" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -436,6 +498,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -447,25 +518,24 @@ dependencies = [ [[package]] name = "criterion" -version = "0.5.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +checksum = "950046b2aa2492f9a536f5f4f9a3de7b9e2476e575e05bd6c333371add4d98f3" dependencies = [ + "alloca", "anes", "cast", "ciborium", "clap", "criterion-plot", - "is-terminal", - "itertools 0.10.5", + "itertools 0.13.0", "num-traits", - "once_cell", "oorandom", + "page_size", "plotters", "rayon", "regex", "serde", - "serde_derive", "serde_json", "tinytemplate", "walkdir", @@ -473,12 +543,12 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.5.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +checksum = "d8d80a2f4f5b554395e47b5d8305bc3d27813bacb73493eb1001e8f76dae29ea" dependencies = [ "cast", - "itertools 0.10.5", + "itertools 0.13.0", ] [[package]] @@ -522,6 +592,21 @@ dependencies = [ "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710" +dependencies = [ + "hybrid-array", +] + +[[package]] +name = "daachorse" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f55d7153ba3b507595872a3874803f07a8a81d1e888abed8e5db7da0597d6e2" + [[package]] name = "darling" version = "0.20.11" @@ -591,6 +676,15 @@ dependencies = [ "syn", ] +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + [[package]] name = "deranged" version = "0.5.8" @@ -638,8 +732,19 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "crypto-common", + "block-buffer 0.10.4", + "crypto-common 0.1.7", +] + +[[package]] +name = "digest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" +dependencies = [ + "block-buffer 0.12.0", + "const-oid", + "crypto-common 0.2.1", ] [[package]] @@ -726,6 +831,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "fax" version = "0.2.7" @@ -936,12 +1047,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - [[package]] name = "hex" version = "0.4.3" @@ -987,6 +1092,15 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "hybrid-array" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" +dependencies = [ + "typenum", +] + [[package]] name = "hyper" version = "1.9.0" @@ -1242,15 +1356,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "indoc" -version = "2.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" -dependencies = [ - "rustversion", -] - [[package]] name = "interpolate_name" version = "0.2.4" @@ -1268,17 +1373,6 @@ version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" -[[package]] -name = "is-terminal" -version = "0.4.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.61.2", -] - [[package]] name = "is_terminal_polyfill" version = "1.70.2" @@ -1287,27 +1381,9 @@ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] @@ -1349,12 +1425,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "lebe" version = "0.5.3" @@ -1452,15 +1522,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b" -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - [[package]] name = "mime" version = "0.3.17" @@ -1548,9 +1609,9 @@ dependencies = [ [[package]] name = "ndarray" -version = "0.16.1" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d" dependencies = [ "matrixmultiply", "num-complex", @@ -1668,9 +1729,9 @@ dependencies = [ [[package]] name = "numpy" -version = "0.23.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94caae805f998a07d33af06e6a3891e38556051b8045c615470a71590e13e78" +checksum = "778da78c64ddc928ebf5ad9df5edf0789410ff3bdbf3619aed51cd789a6af1e2" dependencies = [ "libc", "ndarray", @@ -1678,6 +1739,7 @@ dependencies = [ "num-integer", "num-traits", "pyo3", + "pyo3-build-config", "rustc-hash 2.1.2", ] @@ -1741,8 +1803,18 @@ dependencies = [ "serde_json", "serde_with", "sha1", - "sha2", - "thiserror 2.0.18", + "sha2 0.10.9", + "thiserror", +] + +[[package]] +name = "page_size" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" +dependencies = [ + "libc", + "winapi", ] [[package]] @@ -1765,29 +1837,30 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "phf" -version = "0.11.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ "phf_macros", "phf_shared", + "serde", ] [[package]] name = "phf_generator" -version = "0.11.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ + "fastrand", "phf_shared", - "rand 0.8.6", ] [[package]] name = "phf_macros" -version = "0.11.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" dependencies = [ "phf_generator", "phf_shared", @@ -1798,9 +1871,9 @@ dependencies = [ [[package]] name = "phf_shared" -version = "0.11.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" dependencies = [ "siphasher", ] @@ -1933,37 +2006,32 @@ checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" [[package]] name = "pyo3" -version = "0.23.5" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +checksum = "91fd8e38a3b50ed1167fb981cd6fd60147e091784c427b8f7183a7ee32c31c12" dependencies = [ - "cfg-if", - "indoc", "libc", - "memoffset", "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", - "unindent", ] [[package]] name = "pyo3-build-config" -version = "0.23.5" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +checksum = "e368e7ddfdeb98c9bca7f8383be1648fd84ab466bf2bc015e94008db6d35611e" dependencies = [ - "once_cell", "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.23.5" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +checksum = "7f29e10af80b1f7ccaf7f69eace800a03ecd13e883acfacc1e5d0988605f651e" dependencies = [ "libc", "pyo3-build-config", @@ -1971,9 +2039,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.5" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +checksum = "df6e520eff47c45997d2fc7dd8214b25dd1310918bbb2642156ef66a67f29813" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -1983,9 +2051,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.23.5" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +checksum = "c4cdc218d835738f81c2338f822078af45b4afdf8b2e33cbb5916f108b813acb" dependencies = [ "heck", "proc-macro2", @@ -1996,9 +2064,9 @@ dependencies = [ [[package]] name = "pythonize" -version = "0.23.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91a6ee7a084f913f98d70cdc3ebec07e852b735ae3059a1500db2661265da9ff" +checksum = "0b79f670c9626c8b651c0581011b57b6ba6970bb69faf01a7c4c0cfc81c43f95" dependencies = [ "pyo3", "serde", @@ -2033,7 +2101,7 @@ dependencies = [ "rustc-hash 2.1.2", "rustls", "socket2", - "thiserror 2.0.18", + "thiserror", "tokio", "tracing", "web-time", @@ -2048,13 +2116,13 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.4", + "rand", "ring", "rustc-hash 2.1.2", "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.18", + "thiserror", "tinyvec", "tracing", "web-time", @@ -2089,35 +2157,14 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" -[[package]] -name = "rand" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - [[package]] name = "rand" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ - "rand_chacha 0.9.0", - "rand_core 0.9.5", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", + "rand_chacha", + "rand_core", ] [[package]] @@ -2127,16 +2174,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.5", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.17", + "rand_core", ] [[package]] @@ -2175,10 +2213,10 @@ dependencies = [ "num-traits", "paste", "profiling", - "rand 0.9.4", - "rand_chacha 0.9.0", + "rand", + "rand_chacha", "simd_helpers", - "thiserror 2.0.18", + "thiserror", "v_frame", "wasm-bindgen", ] @@ -2216,12 +2254,12 @@ dependencies = [ [[package]] name = "rayon-cond" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" dependencies = [ "either", - "itertools 0.11.0", + "itertools 0.14.0", "rayon", ] @@ -2308,9 +2346,9 @@ dependencies = [ "regex", "serde", "serde_json", - "sha2", + "sha2 0.11.0", "smallvec", - "thiserror 1.0.69", + "thiserror", "tokenizers", ] @@ -2577,8 +2615,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", ] [[package]] @@ -2588,8 +2626,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.3", ] [[package]] @@ -2659,6 +2708,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" @@ -2704,18 +2759,9 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.16" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" - -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" [[package]] name = "thiserror" @@ -2723,18 +2769,7 @@ version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -2830,22 +2865,24 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokenizers" -version = "0.20.4" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" +checksum = "44e5bea67576e04b6ff8564c5d9e09c2ef0cf476502245f2f120e497769d3112" dependencies = [ - "aho-corasick", + "ahash", + "compact_str", + "daachorse", + "dary_heap", "derive_builder", "esaxx-rs", - "getrandom 0.2.17", - "itertools 0.12.1", - "lazy_static", + "getrandom 0.3.4", + "itertools 0.14.0", "log", "macro_rules_attribute", "monostate", "onig", "paste", - "rand 0.8.6", + "rand", "rayon", "rayon-cond", "regex", @@ -2853,7 +2890,7 @@ dependencies = [ "serde", "serde_json", "spm_precompiled", - "thiserror 1.0.69", + "thiserror", "unicode-normalization-alignments", "unicode-segmentation", "unicode_categories", @@ -3005,12 +3042,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" -[[package]] -name = "unindent" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - [[package]] name = "untrusted" version = "0.9.0" @@ -3195,6 +3226,22 @@ version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -3204,6 +3251,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" diff --git a/Cargo.toml b/Cargo.toml index f51c0b6..a7d827c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,24 +3,43 @@ resolver = "2" members = [ "crates/renderers-core", "crates/renderers-py", + "crates/renderers-cli", ] [workspace.package] -edition = "2021" +edition = "2024" license = "Apache-2.0" repository = "https://github.com/thomaub/renderers" -rust-version = "1.78" +rust-version = "1.85" [workspace.dependencies] -tokenizers = { version = "0.20", default-features = false, features = ["onig", "esaxx_fast"] } +tokenizers = { version = "0.23", default-features = false, features = ["onig", "esaxx_fast"] } serde = { version = "1", features = ["derive"] } serde_json = { version = "1", features = ["preserve_order"] } regex = "1" -once_cell = "1" -thiserror = "1" +thiserror = "2" smallvec = { version = "1", features = ["union", "const_generics"] } bumpalo = { version = "3", features = ["collections"] } -phf = { version = "0.11", features = ["macros"] } +phf = { version = "0.13", features = ["macros"] } +clap = { version = "4.6", features = ["derive"] } + +[workspace.lints.rust] +unsafe_op_in_unsafe_fn = "deny" +rust_2018_idioms = { level = "warn", priority = -1 } + +[workspace.lints.clippy] +# clippy::all = correctness + suspicious + style + complexity + perf. +# Already clean today; keep it that way. +all = { level = "deny", priority = -1 } +# Pedantic is informational only — flags style improvements without breaking CI. +pedantic = { level = "warn", priority = -1 } +# Stylistic pedantic lints we deliberately tolerate. +must_use_candidate = "allow" +cast_possible_truncation = "allow" +cast_possible_wrap = "allow" +missing_errors_doc = "allow" +return_self_not_must_use = "allow" +module_name_repetitions = "allow" [profile.release] opt-level = 3 diff --git a/crates/renderers-cli/Cargo.toml b/crates/renderers-cli/Cargo.toml new file mode 100644 index 0000000..f972537 --- /dev/null +++ b/crates/renderers-cli/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "renderers-cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "CLI runner for renderers-core — golden-test fixtures and ad-hoc rendering." +publish = false + +[[bin]] +name = "renderers-cli" +path = "src/main.rs" + +[dependencies] +renderers-core = { path = "../renderers-core" } +serde = { workspace = true } +serde_json = { workspace = true } +clap = { workspace = true } + +[dev-dependencies] +criterion = { version = "0.8", features = ["html_reports"] } + +[[bench]] +name = "qwen3" +harness = false + +[lints] +workspace = true diff --git a/crates/renderers-cli/benches/qwen3.rs b/crates/renderers-cli/benches/qwen3.rs new file mode 100644 index 0000000..73b23a5 --- /dev/null +++ b/crates/renderers-cli/benches/qwen3.rs @@ -0,0 +1,149 @@ +//! Qwen3 throughput benchmarks for `renderers-core`. +//! +//! Needs a real tokenizer.json on disk because the benchmarks measure +//! end-to-end render/parse latency (the tokenizer is on the hot path). +//! Set `BENCH_TOKENIZER=/path/to/tokenizer.json` before running: +//! +//! ```bash +//! BENCH_TOKENIZER=/path/to/qwen3-8b/tokenizer.json \ +//! cargo bench -p renderers-cli +//! ``` +//! +//! When `BENCH_TOKENIZER` is unset the benches return early without +//! failing — they're informational, not a CI gate. + +use std::hint::black_box; +use std::time::Duration; + +use criterion::{Criterion, criterion_group, criterion_main}; + +use renderers_core::Renderer; +use renderers_core::families::Qwen3Renderer; +use renderers_core::tokenizer::Tokenizer; +use renderers_core::types::{Content, Message}; + +fn tokenizer() -> Option { + let path = std::env::var("BENCH_TOKENIZER").ok()?; + match Tokenizer::from_file(&path) { + Ok(t) => Some(t), + Err(e) => { + eprintln!("bench skipped — couldn't load tokenizer at {path}: {e}"); + None + } + } +} + +fn text_msg(role: &str, content: &str) -> Message { + Message { + role: role.to_string(), + content: Content::Text(content.to_string()), + ..Default::default() + } +} + +fn typical_conversation() -> Vec { + vec![ + text_msg( + "system", + "You are a helpful assistant that calls tools when needed.", + ), + text_msg( + "user", + "Plan a weekend trip to Lisbon for two; we like food and walking.", + ), + text_msg( + "assistant", + "I'll help. First, let me check the weather and find some restaurants.", + ), + text_msg("user", "Sounds good — go ahead."), + text_msg( + "assistant", + "Here's a plan: Friday evening tapas at Time Out Market, \ + Saturday morning walk through Alfama, Saturday lunch at \ + Ramiro (seafood), Saturday afternoon Belém pastéis, \ + Sunday morning São Jorge castle, Sunday lunch at Cervejaria \ + Trindade.", + ), + ] +} + +fn bench_render_ids(c: &mut Criterion) { + let Some(tok) = tokenizer() else { + return; + }; + let renderer = Qwen3Renderer::new(tok).expect("build Qwen3 renderer"); + let messages = typical_conversation(); + let mut group = c.benchmark_group("qwen3"); + group.measurement_time(Duration::from_secs(5)); + group.bench_function("render_ids/5_turn_text", |b| { + b.iter(|| { + let ids = renderer + .render_ids(black_box(&messages), None, true) + .expect("render_ids"); + black_box(ids); + }); + }); + group.finish(); +} + +fn bench_parse_response(c: &mut Criterion) { + let Some(tok) = tokenizer() else { + return; + }; + let renderer = Qwen3Renderer::new(tok).expect("build Qwen3 renderer"); + let messages = typical_conversation(); + // Render once to get a realistic completion-ish prefix; treat it + // as a "completion" for the parse benchmark. + let output = renderer.render(&messages, None, true).expect("render"); + let ids = output.token_ids; + + let mut group = c.benchmark_group("qwen3"); + group.measurement_time(Duration::from_secs(5)); + group.bench_function("parse_response/no_tool_calls", |b| { + b.iter(|| { + let parsed = renderer.parse_response(black_box(&ids)); + black_box(parsed); + }); + }); + group.finish(); +} + +fn bench_bridge(c: &mut Criterion) { + let Some(tok) = tokenizer() else { + return; + }; + let renderer = Qwen3Renderer::new(tok).expect("build Qwen3 renderer"); + let messages = typical_conversation(); + let output = renderer.render(&messages, None, true).expect("render"); + let prev_prompt_ids = output.token_ids.clone(); + let prev_completion_ids: Vec = vec![]; + let new_messages = vec![text_msg( + "user", + "Add a kid-friendly option for Sunday morning.", + )]; + + let mut group = c.benchmark_group("qwen3"); + group.measurement_time(Duration::from_secs(5)); + group.bench_function("bridge_to_next_turn/short_user_turn", |b| { + b.iter(|| { + let bridged = renderer + .bridge_to_next_turn( + black_box(&prev_prompt_ids), + black_box(&prev_completion_ids), + black_box(&new_messages), + None, + ) + .expect("bridge"); + black_box(bridged); + }); + }); + group.finish(); +} + +criterion_group!( + benches, + bench_render_ids, + bench_parse_response, + bench_bridge +); +criterion_main!(benches); diff --git a/crates/renderers-cli/src/main.rs b/crates/renderers-cli/src/main.rs new file mode 100644 index 0000000..6d54843 --- /dev/null +++ b/crates/renderers-cli/src/main.rs @@ -0,0 +1,214 @@ +//! `renderers-cli` — small dev tool that drives `renderers-core` +//! without going through Python. +//! +//! Designed for two use cases: +//! +//! 1. **Golden parity checking**: render a fixture JSON of messages +//! against a tokenizer.json, emit the result as JSON, and `diff` +//! against the Python reference output. The exit code is non-zero +//! if the run fails — the comparison is left to the caller (the +//! pytest harness does the actual diffing). +//! 2. **Manual prototyping**: try out new families / config changes +//! without spinning up the `PyO3` wheel. + +use std::path::PathBuf; +use std::process::ExitCode; + +use clap::{Parser, Subcommand, ValueEnum}; +use renderers_core::Renderer; +use renderers_core::families::Qwen3Renderer; +use renderers_core::tokenizer::Tokenizer; +use renderers_core::types::{Message, ParsedToolCall, RenderedTokens, ToolArguments, ToolSpec}; +use serde::Serialize; + +/// Render and parse messages via `renderers-core`. Output is line-by-line +/// JSON on stdout for easy diffing. +#[derive(Debug, Parser)] +#[command(name = "renderers-cli", version, about, long_about = None)] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + /// Render a conversation to token ids + per-token message indices. + Render(RenderArgs), + + /// Parse a completion's token ids into a structured response. + Parse(ParseArgs), +} + +/// Renderer families wired through to `renderers-core`. New families +/// land here as they're ported. +#[derive(Debug, Clone, Copy, ValueEnum)] +enum Family { + Qwen3, +} + +#[derive(Debug, Parser)] +struct RenderArgs { + /// Renderer family to instantiate. + #[arg(long, value_enum, default_value_t = Family::Qwen3)] + family: Family, + + /// Path to a `tokenizer.json` file. + #[arg(long)] + tokenizer: PathBuf, + + /// Path to a JSON file containing a list of messages. + #[arg(long)] + messages: PathBuf, + + /// Path to a JSON file containing a list of tool specs. + #[arg(long)] + tools: Option, + + /// Emit a trailing generation prompt (`<|im_start|>assistant\n` for + /// Qwen3). + #[arg(long)] + gen_prompt: bool, +} + +#[derive(Debug, Parser)] +struct ParseArgs { + /// Renderer family to instantiate. + #[arg(long, value_enum, default_value_t = Family::Qwen3)] + family: Family, + + /// Path to a `tokenizer.json` file. + #[arg(long)] + tokenizer: PathBuf, + + /// JSON-encoded list of integer token ids + /// (e.g. `'[151644, 8948, 198, ...]'`). + #[arg(long)] + token_ids: String, +} + +#[derive(Serialize)] +struct RenderedJson { + token_ids: Vec, + message_indices: Vec, +} + +impl From for RenderedJson { + fn from(r: RenderedTokens) -> Self { + Self { + token_ids: r.token_ids, + message_indices: r.message_indices, + } + } +} + +#[derive(Serialize)] +struct ParsedToolCallJson<'a> { + raw: &'a str, + name: Option<&'a str>, + arguments: serde_json::Value, + status: &'static str, + token_span: Option<(usize, usize)>, + id: Option<&'a str>, +} + +impl<'a> From<&'a ParsedToolCall> for ParsedToolCallJson<'a> { + fn from(p: &'a ParsedToolCall) -> Self { + let args = match &p.arguments { + None => serde_json::Value::Null, + Some(ToolArguments::Object(v)) => v.clone(), + Some(ToolArguments::Raw(s)) => serde_json::Value::String(s.clone()), + }; + Self { + raw: &p.raw, + name: p.name.as_deref(), + arguments: args, + status: p.status.as_wire(), + token_span: p.token_span.as_ref().map(|r| (r.start, r.end)), + id: p.id.as_deref(), + } + } +} + +#[derive(Serialize)] +struct ParsedJson<'a> { + content: &'a str, + reasoning_content: Option<&'a str>, + tool_calls: Vec>, +} + +fn build_renderer(family: Family, tokenizer: Tokenizer) -> Result, String> { + match family { + Family::Qwen3 => Qwen3Renderer::new(tokenizer) + .map(|r| Box::new(r) as Box) + .map_err(|e| e.to_string()), + } +} + +fn load_messages(path: &PathBuf) -> Result, String> { + let bytes = std::fs::read(path).map_err(|e| format!("read {}: {e}", path.display()))?; + serde_json::from_slice(&bytes).map_err(|e| format!("messages JSON: {e}")) +} + +fn load_tools(path: &PathBuf) -> Result, String> { + let bytes = std::fs::read(path).map_err(|e| format!("read {}: {e}", path.display()))?; + serde_json::from_slice(&bytes).map_err(|e| format!("tools JSON: {e}")) +} + +fn parse_token_ids(s: &str) -> Result, String> { + let v: Vec = serde_json::from_str(s).map_err(|e| format!("token-ids JSON: {e}"))?; + v.into_iter() + .map(|t| u32::try_from(t).map_err(|_| format!("token id out of range: {t}"))) + .collect() +} + +fn run_render(args: &RenderArgs) -> Result<(), String> { + let tok = Tokenizer::from_file(&args.tokenizer) + .map_err(|e| format!("load tokenizer {}: {e}", args.tokenizer.display()))?; + let renderer = build_renderer(args.family, tok)?; + let messages = load_messages(&args.messages)?; + let tools = match args.tools.as_ref() { + Some(p) => Some(load_tools(p)?), + None => None, + }; + let output = renderer + .render(&messages, tools.as_deref(), args.gen_prompt) + .map_err(|e| e.to_string())?; + let json: RenderedJson = output.into(); + println!("{}", serde_json::to_string(&json).unwrap()); + Ok(()) +} + +fn run_parse(args: &ParseArgs) -> Result<(), String> { + let tok = Tokenizer::from_file(&args.tokenizer) + .map_err(|e| format!("load tokenizer {}: {e}", args.tokenizer.display()))?; + let renderer = build_renderer(args.family, tok)?; + let ids = parse_token_ids(&args.token_ids)?; + let parsed = renderer.parse_response(&ids); + let tool_calls: Vec> = parsed + .tool_calls + .iter() + .map(ParsedToolCallJson::from) + .collect(); + let json = ParsedJson { + content: &parsed.content, + reasoning_content: parsed.reasoning_content.as_deref(), + tool_calls, + }; + println!("{}", serde_json::to_string(&json).unwrap()); + Ok(()) +} + +fn main() -> ExitCode { + let cli = Cli::parse(); + let result = match cli.command { + Command::Render(args) => run_render(&args), + Command::Parse(args) => run_parse(&args), + }; + match result { + Ok(()) => ExitCode::SUCCESS, + Err(msg) => { + eprintln!("error: {msg}"); + ExitCode::FAILURE + } + } +} diff --git a/crates/renderers-core/Cargo.toml b/crates/renderers-core/Cargo.toml index f5973f8..11fb4ad 100644 --- a/crates/renderers-core/Cargo.toml +++ b/crates/renderers-core/Cargo.toml @@ -23,8 +23,11 @@ phf = { workspace = true } openai-harmony = { version = "0.0.8", default-features = false } minijinja = { version = "2", default-features = false, features = ["builtins", "serde"] } image = { version = "0.25", default-features = false, features = ["jpeg", "png", "webp"] } -ndarray = "0.16" -sha2 = "0.10" +ndarray = "0.17" +sha2 = "0.11" [dev-dependencies] serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/crates/renderers-core/src/families/default.rs b/crates/renderers-core/src/families/default.rs index ecd5fa5..748649a 100644 --- a/crates/renderers-core/src/families/default.rs +++ b/crates/renderers-core/src/families/default.rs @@ -304,6 +304,7 @@ fn messages_to_value(messages: &[Message]) -> Result { map.insert("role".into(), JsonValue::String(m.role.clone())); // Content: string fast-path, structured parts pass through as JSON let content_value = match &m.content { + crate::types::Content::Null => JsonValue::Null, crate::types::Content::Text(s) => JsonValue::String(s.clone()), crate::types::Content::Parts(parts) => serde_json::to_value(parts) .map_err(|e| RenderError::Invalid(format!("content serialisation: {e}")))?, diff --git a/crates/renderers-core/src/families/glm.rs b/crates/renderers-core/src/families/glm.rs index 7da9c23..e9c0a33 100644 --- a/crates/renderers-core/src/families/glm.rs +++ b/crates/renderers-core/src/families/glm.rs @@ -31,6 +31,7 @@ use serde_json::Value as JsonValue; use crate::bridge::reject_assistant_in_extension; use crate::emit::RenderBuf; +use crate::json::{to_string_python, tool_spec_inner_value, tool_spec_template_value}; use crate::parsing::glm::parse_glm; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; @@ -216,18 +217,12 @@ impl GlmRenderer { } fn format_tool_spec(&self, tool: &ToolSpec) -> Result { - // GLM-5 / GLM-4.5 render the spec verbatim; GLM-5.1 unwraps the - // OpenAI envelope (`{"type":"function","function":{...}}`) and - // strips internal-only keys. - // - // Our `ToolSpec` is already the inner shape, so the GLM-5.1 - // unwrap is a no-op in Rust — kept here as a structural note. - let spec = serde_json::json!({ - "name": tool.name, - "description": tool.description, - "parameters": tool.parameters, - }); - serde_json::to_string(&spec) + let spec = if self.variant == Variant::Glm51 { + tool_spec_inner_value(tool) + } else { + tool_spec_template_value(tool) + }; + to_string_python(&spec) .map_err(|e| RenderError::Invalid(format!("tool spec serialisation: {e}"))) } @@ -264,9 +259,6 @@ impl Renderer for GlmRenderer { if !t.is_empty() { buf.scaffold_special(self.system); let mut s = String::with_capacity(512); - if !nl.is_empty() { - s.push_str(nl); - } s.push_str(TOOLS_HEADER_GLM5); for tool in t { s.push_str(&self.format_tool_spec(tool)?); @@ -284,7 +276,7 @@ impl Renderer for GlmRenderer { let last_ui = Self::last_user_index(messages); for (i, msg) in messages.iter().enumerate() { - let content = msg.text_content(); + let content = msg.visible_text_content(); let idx = i as i32; match msg.role.as_str() { "system" => { @@ -397,7 +389,7 @@ impl Renderer for GlmRenderer { for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; - let content = msg.text_content(); + let content = msg.visible_text_content(); match msg.role.as_str() { "user" => { if !(i == 0 && last_prev == self.user) { @@ -469,7 +461,7 @@ impl GlmRenderer { last_user_index: i32, preserve_thinking: bool, ) -> Result<(), RenderError> { - let raw_content = msg.text_content(); + let raw_content = msg.visible_text_content(); let (reasoning_content, content) = match &msg.reasoning_content { Some(s) => (s.clone(), raw_content.to_string()), None => { diff --git a/crates/renderers-core/src/families/kimi_k25.rs b/crates/renderers-core/src/families/kimi_k25.rs index d78f893..ace5c7f 100644 --- a/crates/renderers-core/src/families/kimi_k25.rs +++ b/crates/renderers-core/src/families/kimi_k25.rs @@ -492,6 +492,11 @@ impl KimiK25Renderer { mm: &mut MultiModalData, ) -> Result<(), RenderError> { match &msg.content { + crate::types::Content::Null => { + for item in media_iter.by_ref() { + self.emit_media_item(buf, msg_idx, item, mm)?; + } + } crate::types::Content::Text(s) => { // Plain-text + attached media: emit images first, then // text. Same convention as Qwen-VL when the caller diff --git a/crates/renderers-core/src/families/minimax_m2.rs b/crates/renderers-core/src/families/minimax_m2.rs index 9f181d4..52357b7 100644 --- a/crates/renderers-core/src/families/minimax_m2.rs +++ b/crates/renderers-core/src/families/minimax_m2.rs @@ -20,6 +20,7 @@ use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; use crate::emit::RenderBuf; +use crate::json::to_string_python; use crate::parsing::minimax::parse_minimax; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; @@ -139,7 +140,7 @@ impl MiniMaxM2Renderer { "description": tool.description, "parameters": tool.parameters, }); - s.push_str(&serde_json::to_string(&spec).unwrap_or_default()); + s.push_str(&to_string_python(&spec).unwrap_or_default()); s.push_str("\n"); } s.push_str(TOOLS_FOOTER_PREFIX); @@ -181,7 +182,7 @@ impl Renderer for MiniMaxM2Renderer { buf.special(self.bos, sys_idx); buf.special(self.role, sys_idx); let sys_content = if first_is_system { - messages[0].text_content().to_string() + messages[0].visible_text_content().to_string() } else { String::new() }; @@ -204,7 +205,7 @@ impl Renderer for MiniMaxM2Renderer { for (ci, msg) in conversation.iter().enumerate() { let orig_idx = (ci + conversation_start) as i32; - let content = msg.text_content(); + let content = msg.visible_text_content(); match msg.role.as_str() { "user" => { buf.special(self.role, orig_idx); @@ -290,7 +291,7 @@ impl Renderer for MiniMaxM2Renderer { for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; - let content = msg.text_content(); + let content = msg.visible_text_content(); match msg.role.as_str() { "user" => { buf.special(self.role, idx); @@ -342,7 +343,7 @@ impl MiniMaxM2Renderer { last_user_index: i32, preserve_thinking: bool, ) -> Result<(), RenderError> { - let raw_content = msg.text_content(); + let raw_content = msg.visible_text_content(); let (reasoning_content, content_text) = match &msg.reasoning_content { Some(s) => (s.clone(), raw_content.to_string()), None => { @@ -449,7 +450,7 @@ impl MiniMaxM2Renderer { } let prefix = if prev_is_tool { "" } else { "\n" }; let suffix = if next_is_tool { "\n" } else { "" }; - let content = conversation[conv_idx].text_content(); + let content = conversation[conv_idx].visible_text_content(); let mut s = String::with_capacity(content.len() + 32); s.push_str(prefix); s.push_str(""); diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs index e48bb58..7d42ea7 100644 --- a/crates/renderers-core/src/families/qwen3.rs +++ b/crates/renderers-core/src/families/qwen3.rs @@ -16,10 +16,9 @@ //! ~5–10× faster than Python's `json.dumps` for the JSON sizes typical //! here. -use serde_json::json; - use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; use crate::emit::RenderBuf; +use crate::json::{to_string_python, tool_spec_template_value}; use crate::parsing::qwen3::parse_qwen3; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; @@ -170,15 +169,8 @@ impl Qwen3Renderer { tool_text.push_str(TOOLS_HEADER); for tool in tools { tool_text.push('\n'); - let spec = json!({ - "name": tool.name, - "description": tool.description, - "parameters": tool.parameters, - }); - // `to_string` here is `serde_json::to_string` (no pretty, - // no ensure_ascii — matches Python's - // `json.dumps(..., ensure_ascii=False)`). - tool_text.push_str(&serde_json::to_string(&spec).map_err(|e| { + let spec = tool_spec_template_value(tool); + tool_text.push_str(&to_string_python(&spec).map_err(|e| { RenderError::Invalid(format!("tool spec serialisation failed: {e}")) })?); } @@ -332,7 +324,7 @@ impl Qwen3Renderer { let name = tc.function.name.as_str(); let args_str = match &tc.function.arguments { ToolArguments::Raw(s) => s.clone(), - ToolArguments::Object(v) => serde_json::to_string(v).map_err(|e| { + ToolArguments::Object(v) => to_string_python(v).map_err(|e| { RenderError::Invalid(format!("tool args serialisation failed: {e}")) })?, }; diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index 35fbed5..efe820a 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -18,10 +18,9 @@ use std::borrow::Cow; -use serde_json::json; - use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; use crate::emit::RenderBuf; +use crate::json::{to_string_python, tool_spec_template_value}; use crate::parsing::qwen35::parse_qwen35; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; @@ -192,6 +191,7 @@ impl Qwen35Renderer { /// so OpenAI-style structured text content is not silently dropped. fn render_content_text(content: &Content) -> Cow<'_, str> { match content { + Content::Null => Cow::Borrowed(""), Content::Text(s) => Cow::Borrowed(s.as_str()), Content::Parts(parts) => { let mut out = String::new(); @@ -239,12 +239,8 @@ impl Qwen35Renderer { tool_text.push_str(TOOLS_HEADER); for tool in tools { tool_text.push('\n'); - let spec = json!({ - "name": tool.name, - "description": tool.description, - "parameters": tool.parameters, - }); - tool_text.push_str(&serde_json::to_string(&spec).map_err(|e| { + let spec = tool_spec_template_value(tool); + tool_text.push_str(&to_string_python(&spec).map_err(|e| { RenderError::Invalid(format!("tool spec serialisation failed: {e}")) })?); } @@ -677,6 +673,11 @@ impl Qwen35Renderer { .filter_map(|(m, item)| (*m == msg_idx).then_some(item)); match &msg.content { + crate::types::Content::Null => { + for item in media_iter.by_ref() { + self.emit_media_item(buf, idx, item, mm)?; + } + } crate::types::Content::Text(s) => { // Plain-text user message with attached media: emit // images first (canonical Qwen-VL shape: diff --git a/crates/renderers-core/src/json.rs b/crates/renderers-core/src/json.rs new file mode 100644 index 0000000..60f1545 --- /dev/null +++ b/crates/renderers-core/src/json.rs @@ -0,0 +1,78 @@ +use std::io; + +use serde::Serialize; +use serde_json::json; +use serde_json::ser::Formatter; + +use crate::types::ToolSpec; + +/// Serialize JSON with Python's default `json.dumps(..., ensure_ascii=False)` +/// separators: `", "` between values and `": "` between keys and values. +pub(crate) fn to_string_python(value: &T) -> Result +where + T: Serialize + ?Sized, +{ + let mut out = Vec::new(); + { + let mut serializer = serde_json::Serializer::with_formatter(&mut out, PythonJsonFormatter); + value.serialize(&mut serializer)?; + } + Ok(String::from_utf8(out).expect("serde_json only writes valid UTF-8")) +} + +pub(crate) fn tool_spec_inner_value(tool: &ToolSpec) -> serde_json::Value { + json!({ + "name": &tool.name, + "description": &tool.description, + "parameters": &tool.parameters, + }) +} + +pub(crate) fn tool_spec_openai_value(tool: &ToolSpec) -> serde_json::Value { + json!({ + "type": "function", + "function": tool_spec_inner_value(tool), + }) +} + +pub(crate) fn tool_spec_template_value(tool: &ToolSpec) -> serde_json::Value { + if tool.openai_envelope { + tool_spec_openai_value(tool) + } else { + tool_spec_inner_value(tool) + } +} + +#[derive(Debug, Default)] +struct PythonJsonFormatter; + +impl Formatter for PythonJsonFormatter { + fn begin_array_value(&mut self, writer: &mut W, first: bool) -> io::Result<()> + where + W: ?Sized + io::Write, + { + if first { + Ok(()) + } else { + writer.write_all(b", ") + } + } + + fn begin_object_key(&mut self, writer: &mut W, first: bool) -> io::Result<()> + where + W: ?Sized + io::Write, + { + if first { + Ok(()) + } else { + writer.write_all(b", ") + } + } + + fn begin_object_value(&mut self, writer: &mut W) -> io::Result<()> + where + W: ?Sized + io::Write, + { + writer.write_all(b": ") + } +} diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs index fd93426..3c21527 100644 --- a/crates/renderers-core/src/lib.rs +++ b/crates/renderers-core/src/lib.rs @@ -28,6 +28,7 @@ pub mod bridge; pub mod emit; pub mod families; +pub(crate) mod json; pub mod parsing; pub mod processing; pub mod registry; diff --git a/crates/renderers-core/src/types.rs b/crates/renderers-core/src/types.rs index 0c33d6c..5610e01 100644 --- a/crates/renderers-core/src/types.rs +++ b/crates/renderers-core/src/types.rs @@ -58,6 +58,7 @@ pub struct VideoRef { #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(untagged)] pub enum Content { + Null, Text(String), Parts(Vec), } @@ -73,6 +74,15 @@ impl Content { /// `""` for `Parts` variants (Qwen3 ignores list content entirely). pub fn as_text(&self) -> &str { match self { + Content::Null => "", + Content::Text(s) => s.as_str(), + Content::Parts(_) => "", + } + } + + pub fn as_text_or_none_literal(&self) -> &str { + match self { + Content::Null => "None", Content::Text(s) => s.as_str(), Content::Parts(_) => "", } @@ -80,6 +90,7 @@ impl Content { pub fn is_empty(&self) -> bool { match self { + Content::Null => true, Content::Text(s) => s.is_empty(), Content::Parts(p) => p.is_empty(), } @@ -121,6 +132,8 @@ pub struct ToolSpec { pub description: String, #[serde(default)] pub parameters: serde_json::Value, + #[serde(default, skip)] + pub openai_envelope: bool, } /// A single turn in a multi-turn conversation. @@ -147,6 +160,11 @@ impl Message { pub fn text_content(&self) -> &str { self.content.as_text() } + + #[inline] + pub fn visible_text_content(&self) -> &str { + self.content.as_text_or_none_literal() + } } /// Tool-call argument payload. The JSON-object case is the common path; @@ -155,8 +173,8 @@ impl Message { #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(untagged)] pub enum ToolArguments { - Object(serde_json::Value), Raw(String), + Object(serde_json::Value), } impl Default for ToolArguments { diff --git a/crates/renderers-py/Cargo.toml b/crates/renderers-py/Cargo.toml index dcd2c8f..a9ba280 100644 --- a/crates/renderers-py/Cargo.toml +++ b/crates/renderers-py/Cargo.toml @@ -14,9 +14,12 @@ path = "src/lib.rs" [dependencies] renderers-core = { path = "../renderers-core" } -pyo3 = { version = "0.22", features = ["abi3-py310"] } +pyo3 = { version = "0.28", features = ["abi3-py310"] } serde = { workspace = true } serde_json = { workspace = true } -pythonize = "0.22" -numpy = "0.23" -ndarray = "0.16" +pythonize = "0.28" +numpy = "0.28" +ndarray = "0.17" + +[lints] +workspace = true diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index dcc34d5..6edaa67 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -51,13 +51,28 @@ fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult> if obj.is_none() { return Ok(None); } - let value: serde_json::Value = pythonize::depythonize(obj).map_err(|e| { + let mut value: serde_json::Value = pythonize::depythonize(obj).map_err(|e| { invalid(format!( "tools must be a list of dicts (decode failed: {e})" )) })?; - let parsed: Vec = + let arr = value + .as_array_mut() + .ok_or_else(|| invalid("tools must be a list of dicts"))?; + let mut envelopes = Vec::with_capacity(arr.len()); + for item in arr { + if let Some(function) = item.get("function").and_then(|v| v.as_object()) { + envelopes.push(true); + *item = serde_json::Value::Object(function.clone()); + } else { + envelopes.push(false); + } + } + let mut parsed: Vec = serde_json::from_value(value).map_err(|e| invalid(format!("tools shape mismatch: {e}")))?; + for (tool, openai_envelope) in parsed.iter_mut().zip(envelopes) { + tool.openai_envelope = openai_envelope; + } Ok(Some(parsed)) } @@ -116,7 +131,7 @@ fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { // Accept either a Python list of ints or a numpy-style sequence. let list = obj - .downcast::() + .cast::() .map_err(|_| invalid("expected list[int]"))?; let mut out = Vec::with_capacity(list.len()); for item in list.iter() { @@ -129,7 +144,11 @@ fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { Ok(out) } -#[pyclass(name = "RenderedTokens", module = "renderers_native")] +#[pyclass( + name = "RenderedTokens", + module = "renderers_native", + skip_from_py_object +)] #[derive(Clone)] struct PyRenderedTokens { inner: RenderedTokens, @@ -173,7 +192,11 @@ impl PyRenderedTokens { } } -#[pyclass(name = "ParsedToolCall", module = "renderers_native")] +#[pyclass( + name = "ParsedToolCall", + module = "renderers_native", + skip_from_py_object +)] #[derive(Clone)] struct PyParsedToolCall { inner: ParsedToolCall, @@ -227,7 +250,11 @@ impl PyParsedToolCall { } } -#[pyclass(name = "ParsedResponse", module = "renderers_native")] +#[pyclass( + name = "ParsedResponse", + module = "renderers_native", + skip_from_py_object +)] #[derive(Clone)] struct PyParsedResponse { inner: ParsedResponse, @@ -267,7 +294,11 @@ impl PyParsedResponse { /// Wire enum mirror — matches the Python `ToolCallParseStatus` string /// values so existing code reading `tc.status == "ok"` keeps working. -#[pyclass(name = "ToolCallParseStatus", module = "renderers_native")] +#[pyclass( + name = "ToolCallParseStatus", + module = "renderers_native", + skip_from_py_object +)] #[derive(Clone, Copy)] struct PyToolCallParseStatus { inner: ToolCallParseStatus, @@ -323,7 +354,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { Qwen3RendererBuilder::default() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -391,7 +422,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { Qwen35RendererBuilder::default() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -423,7 +454,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { Qwen36RendererBuilder::default() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -455,7 +486,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { GlmRendererBuilder::glm5() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -487,7 +518,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { GlmRendererBuilder::glm51() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -519,7 +550,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { GlmRendererBuilder::glm45() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -549,7 +580,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { MiniMaxM2RendererBuilder::default() .preserve_all_thinking(preserve_all_thinking) .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls) @@ -597,7 +628,7 @@ impl PyRenderer { }; let ct = chat_template.to_string(); let renderer = py - .allow_threads(move || { + .detach(move || { let mut b = DefaultRendererBuilder::new(ct).stop_token_ids(stop_ids); for (k, v) in extras { b = b.add_context(k, v); @@ -645,7 +676,7 @@ impl PyRenderer { let _ = tokenizer_path; // not needed for harmony let effort = reasoning_effort.unwrap_or("medium").to_string(); let renderer = py - .allow_threads(move || -> Result<_, renderers_core::types::RenderError> { + .detach(move || -> Result<_, renderers_core::types::RenderError> { let mut b = GptOssRendererBuilder::default() .use_system_prompt(use_system_prompt) .preserve_all_thinking(preserve_all_thinking) @@ -692,7 +723,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { KimiK25RendererBuilder::default() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -724,7 +755,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { KimiK2RendererBuilder::default() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -759,7 +790,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { Nemotron3RendererBuilder::default() .enable_thinking(enable_thinking) .preserve_all_thinking(preserve_all_thinking) @@ -787,7 +818,7 @@ impl PyRenderer { ) -> PyResult { let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?; let renderer = py - .allow_threads(|| { + .detach(|| { DeepSeekV3RendererBuilder::default() .enable_thinking(enable_thinking) .build(tok) @@ -810,7 +841,7 @@ impl PyRenderer { let tools = parse_tools(tools)?; let renderer = self.inner.clone(); let out = py - .allow_threads(move || renderer.render(&msgs, tools.as_deref(), add_generation_prompt)) + .detach(move || renderer.render(&msgs, tools.as_deref(), add_generation_prompt)) .map_err(render_err)?; Ok(PyRenderedTokens { inner: out }) } @@ -827,9 +858,7 @@ impl PyRenderer { let tools = parse_tools(tools)?; let renderer = self.inner.clone(); let ids = py - .allow_threads(move || { - renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt) - }) + .detach(move || renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt)) .map_err(render_err)?; Ok(PyList::new_bound(py, ids.iter().map(|&t| t as i64))) } @@ -841,7 +870,7 @@ impl PyRenderer { ) -> PyResult { let ids = parse_u32_list(token_ids)?; let renderer = self.inner.clone(); - let parsed = py.allow_threads(move || renderer.parse_response(&ids)); + let parsed = py.detach(move || renderer.parse_response(&ids)); Ok(PyParsedResponse { inner: parsed }) } @@ -877,7 +906,7 @@ impl PyRenderer { let bundle = parse_media_bundle(media)?; let renderer = self.inner.clone(); let out = py - .allow_threads(move || -> Result<_, renderers_core::types::RenderError> { + .detach(move || -> Result<_, renderers_core::types::RenderError> { let mm = renderer .as_multimodal() .ok_or_else(|| renderers_core::types::RenderError::Invalid( @@ -910,9 +939,7 @@ impl PyRenderer { let tools = parse_tools(tools)?; let renderer = self.inner.clone(); let bridged = py - .allow_threads(move || { - renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools.as_deref()) - }) + .detach(move || renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools.as_deref())) .map_err(render_err)?; Ok(bridged.map(|rt| PyRenderedTokens { inner: rt })) } @@ -994,9 +1021,9 @@ impl PyQwen3VlImageProcessor { /// /// `message_idx` is up to the caller — it's not added here. fn process_bytes<'py>(&self, py: Python<'py>, bytes: &[u8]) -> PyResult> { - // Clone so the move into allow_threads is straightforward + // Clone so the move into detach is straightforward let processed: ProcessedImage = py - .allow_threads(|| self.inner.process_bytes(bytes)) + .detach(|| self.inner.process_bytes(bytes)) .map_err(render_err)?; processed_to_pyobject(py, processed) } @@ -1006,7 +1033,7 @@ impl PyQwen3VlImageProcessor { let bytes = std::fs::read(path).map_err(|e| invalid(format!("read image {path:?}: {e}")))?; let processed: ProcessedImage = py - .allow_threads(|| self.inner.process_bytes(&bytes)) + .detach(|| self.inner.process_bytes(&bytes)) .map_err(render_err)?; processed_to_pyobject(py, processed) } diff --git a/renderers/_native_router.py b/renderers/_native_router.py index ece558d..42df709 100644 --- a/renderers/_native_router.py +++ b/renderers/_native_router.py @@ -18,8 +18,10 @@ from __future__ import annotations +import hashlib import logging import os +import tempfile from pathlib import Path from typing import Any @@ -27,6 +29,7 @@ _NATIVE_MODULE: Any | None = None _NATIVE_LOAD_ATTEMPTED = False +_ALL_EXCLUDED = {"default"} def native_enabled(family: str) -> bool: @@ -35,7 +38,7 @@ def native_enabled(family: str) -> bool: if not raw or raw == "0": return False if raw in {"1", "all"}: - return True + return family not in _ALL_EXCLUDED return family in {part.strip() for part in raw.split(",") if part.strip()} @@ -82,6 +85,19 @@ def resolve_tokenizer_path(tokenizer: Any) -> str: return str(path / "tokenizer.json") return str(path) + backend = getattr(tokenizer, "backend_tokenizer", None) + if backend is not None and hasattr(backend, "to_str"): + data = backend.to_str() + digest = hashlib.sha256(data.encode("utf-8")).hexdigest() + cache_dir = Path(tempfile.gettempdir()) / "renderers-tokenizers" + cache_dir.mkdir(parents=True, exist_ok=True) + path = cache_dir / f"{digest}.json" + if not path.exists(): + tmp = path.with_suffix(".tmp") + tmp.write_text(data, encoding="utf-8") + tmp.replace(path) + return str(path) + name_or_path = getattr(tokenizer, "name_or_path", None) if not name_or_path: raise ValueError( @@ -111,3 +127,17 @@ def resolve_tokenizer_path(tokenizer: Any) -> str: "Run `snapshot_download` first or pass an explicit path." ) return str(cached) + + +def try_resolve_tokenizer_path(tokenizer: Any, family: str) -> str | None: + """Best-effort tokenizer resolution for optional native routing.""" + try: + return resolve_tokenizer_path(tokenizer) + except ValueError as exc: + logger.info( + "RENDERERS_NATIVE selected %s but no native tokenizer path was " + "available (%s); falling back to pure Python.", + family, + exc, + ) + return None diff --git a/renderers/kimi_k2.py b/renderers/kimi_k2.py index 0e87a1b..c7a760f 100644 --- a/renderers/kimi_k2.py +++ b/renderers/kimi_k2.py @@ -21,7 +21,7 @@ from renderers._native_router import ( load_native, native_enabled, - resolve_tokenizer_path, + try_resolve_tokenizer_path, ) from renderers.base import ( Message, @@ -58,13 +58,14 @@ def __new__( if native_enabled("kimi_k2") or native_enabled("kimi-k2"): native = load_native() if native is not None: - path = resolve_tokenizer_path(tokenizer) - return native.Renderer.kimi_k2( - path, - enable_thinking=enable_thinking, - preserve_all_thinking=preserve_all_thinking, - preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, - ) + path = try_resolve_tokenizer_path(tokenizer, "kimi_k2") + if path is not None: + return native.Renderer.kimi_k2( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) return super().__new__(cls) def __init__( diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 4155e2e..7483cd3 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -30,7 +30,7 @@ from renderers._native_router import ( load_native, native_enabled, - resolve_tokenizer_path, + try_resolve_tokenizer_path, ) from renderers.base import ( Message, @@ -620,13 +620,14 @@ def __init__( if native_enabled("kimi_k25") and processor is None: native = load_native() if native is not None: - path = resolve_tokenizer_path(tokenizer) - self._native_renderer = native.Renderer.kimi_k25( - path, - enable_thinking=enable_thinking, - preserve_all_thinking=preserve_all_thinking, - preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, - ) + path = try_resolve_tokenizer_path(tokenizer, "kimi_k25") + if path is not None: + self._native_renderer = native.Renderer.kimi_k25( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) # Core structural tokens — all must be single special tokens in the vocab self._im_user = self._token_id("<|im_user|>") diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..85f3606 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,4 @@ +[toolchain] +channel = "stable" +components = ["rustfmt", "clippy"] +profile = "minimal" From 6527b65f3cb1997ef8b08f00e107cd8164ad5678 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:54:14 +0200 Subject: [PATCH 18/35] Tighten Rust native lint coverage --- clippy.toml | 10 ++++++++++ crates/renderers-core/src/emit.rs | 2 +- .../src/families/deepseek_v3.rs | 2 +- crates/renderers-core/src/families/default.rs | 8 ++++---- crates/renderers-core/src/families/gpt_oss.rs | 19 +++++++------------ .../renderers-core/src/families/kimi_k25.rs | 7 +++---- .../renderers-core/src/families/minimax_m2.rs | 8 ++++---- .../renderers-core/src/families/nemotron3.rs | 10 +++++----- crates/renderers-core/src/families/qwen35.rs | 4 ++-- .../renderers-core/src/parsing/deepseek_v3.rs | 2 +- crates/renderers-core/src/parsing/kimi_k2.rs | 2 +- .../renderers-core/src/processing/qwen3_vl.rs | 6 +++++- crates/renderers-core/src/types.rs | 3 +-- crates/renderers-py/src/lib.rs | 17 ++++++++++------- 14 files changed, 55 insertions(+), 45 deletions(-) create mode 100644 clippy.toml diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..916ec78 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,10 @@ +# Renderer functions can grow long: the family-specific render/parse +# paths string together many small steps and reading them top-to-bottom +# is the point. Bumped from the default 100 to accommodate our longest +# real function (parse_minimax at 121 lines) without giving up the lint. +too-many-lines-threshold = 130 + +# Config builders carry several independent bool toggles (enable_thinking, +# preserve_all_thinking, preserve_thinking_between_tool_calls, ...). Four +# is the natural shape; five+ would warrant a flags struct. +max-struct-bools = 4 diff --git a/crates/renderers-core/src/emit.rs b/crates/renderers-core/src/emit.rs index 96a8016..f8cd73d 100644 --- a/crates/renderers-core/src/emit.rs +++ b/crates/renderers-core/src/emit.rs @@ -23,7 +23,7 @@ pub struct RenderBuf<'tok> { scratch_offsets: Vec, } -impl<'tok> std::fmt::Debug for RenderBuf<'tok> { +impl std::fmt::Debug for RenderBuf<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("RenderBuf") .field("tokens_len", &self.tokens.len()) diff --git a/crates/renderers-core/src/families/deepseek_v3.rs b/crates/renderers-core/src/families/deepseek_v3.rs index e7b090f..8735666 100644 --- a/crates/renderers-core/src/families/deepseek_v3.rs +++ b/crates/renderers-core/src/families/deepseek_v3.rs @@ -171,7 +171,7 @@ impl Renderer for DeepSeekV3Renderer { // before any role marker, attributed to message index 0. let mut first_non_sys = 0usize; let mut sys_parts: Vec<&str> = Vec::new(); - for msg in messages.iter() { + for msg in messages { if msg.role != "system" { break; } diff --git a/crates/renderers-core/src/families/default.rs b/crates/renderers-core/src/families/default.rs index 748649a..ba5db09 100644 --- a/crates/renderers-core/src/families/default.rs +++ b/crates/renderers-core/src/families/default.rs @@ -92,7 +92,7 @@ impl std::fmt::Debug for DefaultRenderer { f.debug_struct("DefaultRenderer") .field("stop_token_ids", &self.stop_token_ids) .field("extra_context_keys", &self.extra_context.len()) - .finish() + .finish_non_exhaustive() } } @@ -149,7 +149,7 @@ impl DefaultRenderer { serde_json::to_value(messages_to_value(messages)?).unwrap_or(JsonValue::Null), ); let tools_value: MjValue = match tools { - Some(t) => tools_to_value(t)?, + Some(t) => tools_to_value(t), None => MjValue::from(Vec::::new()), }; ctx_map.insert( @@ -347,7 +347,7 @@ fn messages_to_value(messages: &[Message]) -> Result { Ok(MjValue::from(out)) } -fn tools_to_value(tools: &[ToolSpec]) -> Result { +fn tools_to_value(tools: &[ToolSpec]) -> MjValue { let mut out: Vec = Vec::with_capacity(tools.len()); for t in tools { let v = serde_json::json!({ @@ -360,5 +360,5 @@ fn tools_to_value(tools: &[ToolSpec]) -> Result { }); out.push(MjValue::from_serialize(v)); } - Ok(MjValue::from(out)) + MjValue::from(out) } diff --git a/crates/renderers-core/src/families/gpt_oss.rs b/crates/renderers-core/src/families/gpt_oss.rs index 108153f..12be5db 100644 --- a/crates/renderers-core/src/families/gpt_oss.rs +++ b/crates/renderers-core/src/families/gpt_oss.rs @@ -259,19 +259,12 @@ impl GptOssRenderer { } } - fn message_to_harmony(&self, msg: &Message, preserve_thinking: bool) -> Vec { + fn message_to_harmony(msg: &Message, preserve_thinking: bool) -> Vec { match msg.role.as_str() { "user" => vec![HarmonyMessage::from_role_and_content( HarmonyRole::User, msg.text_content().to_string(), )], - "system" | "developer" => { - let dev = DeveloperContent::new().with_instructions(msg.text_content()); - vec![HarmonyMessage::from_role_and_content( - HarmonyRole::Developer, - dev, - )] - } "tool" => { let m = HarmonyMessage::from_author_and_content( Self::tool_author(msg), @@ -281,7 +274,9 @@ impl GptOssRenderer { .with_channel("commentary"); vec![m] } - "assistant" => self.assistant_to_harmony(msg, preserve_thinking), + "assistant" => Self::assistant_to_harmony(msg, preserve_thinking), + // Default branch covers "system", "developer", and any + // unknown role, all of which route to the Developer channel. _ => { let dev = DeveloperContent::new().with_instructions(msg.text_content()); vec![HarmonyMessage::from_role_and_content( @@ -292,7 +287,7 @@ impl GptOssRenderer { } } - fn assistant_to_harmony(&self, msg: &Message, preserve_thinking: bool) -> Vec { + fn assistant_to_harmony(msg: &Message, preserve_thinking: bool) -> Vec { let mut out: Vec = Vec::new(); if preserve_thinking { @@ -464,7 +459,7 @@ impl Renderer for GptOssRenderer { self.preserve_all_thinking, self.preserve_thinking_between_tool_calls, ); - for hm in self.message_to_harmony(msg, preserve_thinking) { + for hm in Self::message_to_harmony(msg, preserve_thinking) { self.emit_render(&mut tokens, &mut indices, i as i32, &hm)?; } } @@ -642,7 +637,7 @@ impl Renderer for GptOssRenderer { "tool" | "user" | "system" | "developer" => {} _ => return Ok(None), } - for hm in self.message_to_harmony(msg, false) { + for hm in Self::message_to_harmony(msg, false) { let mut out: Vec = Vec::new(); self.enc .render_into(&hm, &mut out, None) diff --git a/crates/renderers-core/src/families/kimi_k25.rs b/crates/renderers-core/src/families/kimi_k25.rs index ace5c7f..6b127e6 100644 --- a/crates/renderers-core/src/families/kimi_k25.rs +++ b/crates/renderers-core/src/families/kimi_k25.rs @@ -232,7 +232,6 @@ impl KimiK25Renderer { } fn emit_tool_body( - &self, buf: &mut RenderBuf<'_>, msg: &Message, msg_idx: i32, @@ -301,7 +300,7 @@ impl Renderer for KimiK25Renderer { ); self.emit_assistant_body(&mut buf, msg, idx, is_suffix, preserve_thinking)?; } - "tool" => self.emit_tool_body(&mut buf, msg, idx)?, + "tool" => Self::emit_tool_body(&mut buf, msg, idx)?, _ => { let content = msg.text_content(); if !content.is_empty() { @@ -383,7 +382,7 @@ impl Renderer for KimiK25Renderer { buf.text(content, idx)?; } } - "tool" => self.emit_tool_body(&mut buf, msg, idx)?, + "tool" => Self::emit_tool_body(&mut buf, msg, idx)?, _ => return Ok(None), } buf.special(self.im_end, idx); @@ -590,7 +589,7 @@ impl MultimodalRenderer for KimiK25Renderer { ); self.emit_assistant_body(&mut buf, msg, idx, is_suffix, preserve_thinking)?; } - "tool" => self.emit_tool_body(&mut buf, msg, idx)?, + "tool" => Self::emit_tool_body(&mut buf, msg, idx)?, _ => { // user / system / other — interleave media inline let mut media_iter = media diff --git a/crates/renderers-core/src/families/minimax_m2.rs b/crates/renderers-core/src/families/minimax_m2.rs index 52357b7..f14cbdd 100644 --- a/crates/renderers-core/src/families/minimax_m2.rs +++ b/crates/renderers-core/src/families/minimax_m2.rs @@ -192,7 +192,7 @@ impl Renderer for MiniMaxM2Renderer { buf.text("\n", sys_idx)?; // Conversation messages — skip the leading system if present - let conversation_start = if first_is_system { 1 } else { 0 }; + let conversation_start = usize::from(first_is_system); let conversation = &messages[conversation_start..]; // last_user_index relative to the conversation @@ -393,7 +393,9 @@ impl MiniMaxM2Renderer { s }; - if !tool_calls.is_empty() { + if tool_calls.is_empty() { + buf.text(&after_think, orig_idx)?; + } else { // \n before contiguous with preceding text let mut head = after_think; head.push('\n'); @@ -424,8 +426,6 @@ impl MiniMaxM2Renderer { } buf.text(&invoke_block, orig_idx)?; buf.special(self.tool_call_end, orig_idx); - } else { - buf.text(&after_think, orig_idx)?; } buf.special(self.eos, orig_idx); diff --git a/crates/renderers-core/src/families/nemotron3.rs b/crates/renderers-core/src/families/nemotron3.rs index 44cec7d..1e25c38 100644 --- a/crates/renderers-core/src/families/nemotron3.rs +++ b/crates/renderers-core/src/families/nemotron3.rs @@ -376,7 +376,7 @@ impl Nemotron3Renderer { buf.text("assistant\n", msg_orig_idx)?; let tool_calls = &msg.tool_calls; - let content_suffix = if !tool_calls.is_empty() { "\n" } else { "" }; + let content_suffix = if tool_calls.is_empty() { "" } else { "\n" }; if !reasoning_content.is_empty() && (is_last_turn || preserve_thinking) { buf.special(self.think, msg_orig_idx); @@ -501,7 +501,10 @@ impl Renderer for Nemotron3Renderer { // Normalise: prepend empty system message if none is present. let mut normalised: Vec; let auto_system_injected: bool; - let messages_ref: &[Message] = if messages[0].role != "system" { + let messages_ref: &[Message] = if messages[0].role == "system" { + auto_system_injected = false; + messages + } else { auto_system_injected = true; normalised = Vec::with_capacity(messages.len() + 1); normalised.push(Message { @@ -511,9 +514,6 @@ impl Renderer for Nemotron3Renderer { }); normalised.extend_from_slice(messages); &normalised - } else { - auto_system_injected = false; - messages }; // Map normalised index back to caller's original index. Injected diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index efe820a..a8aa255 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -878,7 +878,7 @@ impl MultimodalRenderer for Qwen35Renderer { previous_completion_ids: &[u32], new_messages: &[Message], tools: Option<&[ToolSpec]>, - _new_media: &MediaBundle, + new_media: &MediaBundle, _previous_multi_modal_data: Option<&MultiModalData>, ) -> Result, RenderError> { // Phase 5a scope: bridge ignores media on the new-turn side @@ -888,7 +888,7 @@ impl MultimodalRenderer for Qwen35Renderer { // image-bearing turns through a verbatim prefix is fragile // because placeholder offsets shift if the prior turn was // truncated mid-image. Phase 5b can revisit. - if !_new_media.is_empty() { + if !new_media.is_empty() { return Ok(None); } self.bridge_to_next_turn( diff --git a/crates/renderers-core/src/parsing/deepseek_v3.rs b/crates/renderers-core/src/parsing/deepseek_v3.rs index 7d2871e..ebacc99 100644 --- a/crates/renderers-core/src/parsing/deepseek_v3.rs +++ b/crates/renderers-core/src/parsing/deepseek_v3.rs @@ -121,7 +121,7 @@ fn parse_deepseek_tool_calls( let block_text = decode(tokenizer, call_ids).unwrap_or_default(); let span = Range { start: inner_offset + i, - end: inner_offset + end + if unclosed { 0 } else { 1 }, + end: inner_offset + end + usize::from(!unclosed), }; let Some(sep_pos) = find(call_ids, sep_id) else { diff --git a/crates/renderers-core/src/parsing/kimi_k2.rs b/crates/renderers-core/src/parsing/kimi_k2.rs index 8735f68..612a901 100644 --- a/crates/renderers-core/src/parsing/kimi_k2.rs +++ b/crates/renderers-core/src/parsing/kimi_k2.rs @@ -131,7 +131,7 @@ fn parse_kimi_k2_calls( let block_text = decode(tokenizer, &ids[i + 1..tc_end]).unwrap_or_default(); let span = Range { start: section_offset + i, - end: section_offset + tc_end + if unclosed { 0 } else { 1 }, + end: section_offset + tc_end + usize::from(!unclosed), }; // Extract function name from "functions.{name}:{index}" diff --git a/crates/renderers-core/src/processing/qwen3_vl.rs b/crates/renderers-core/src/processing/qwen3_vl.rs index e359121..e4f0c70 100644 --- a/crates/renderers-core/src/processing/qwen3_vl.rs +++ b/crates/renderers-core/src/processing/qwen3_vl.rs @@ -24,6 +24,7 @@ //! is required (e.g. for regression tests against PIL-rendered //! fixtures) keep the Python processor on the path. +use std::fmt::Write as _; use std::io::Cursor; use ndarray::{Array2, Array3}; @@ -148,7 +149,10 @@ impl Qwen3VlImageProcessor { h.update(format!("({}, {})", rgb.width(), rgb.height()).as_bytes()); let digest = h.finalize(); // Trim to 32 hex chars to match the Python implementation. - let hex: String = digest.iter().map(|b| format!("{b:02x}")).collect(); + let mut hex = String::with_capacity(digest.len() * 2); + for b in &digest { + write!(&mut hex, "{b:02x}").expect("writing to String never fails"); + } hex[..32].to_string() } diff --git a/crates/renderers-core/src/types.rs b/crates/renderers-core/src/types.rs index 5610e01..98afbfd 100644 --- a/crates/renderers-core/src/types.rs +++ b/crates/renderers-core/src/types.rs @@ -74,9 +74,8 @@ impl Content { /// `""` for `Parts` variants (Qwen3 ignores list content entirely). pub fn as_text(&self) -> &str { match self { - Content::Null => "", Content::Text(s) => s.as_str(), - Content::Parts(_) => "", + Content::Null | Content::Parts(_) => "", } } diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 6edaa67..a6b2353 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -91,7 +91,7 @@ fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { .ok_or_else(|| invalid("media item must be a dict"))?; let message_idx = obj.get("message_idx") - .and_then(|v| v.as_u64()) + .and_then(serde_json::Value::as_u64) .ok_or_else(|| invalid("media item missing message_idx"))? as usize; let modality_str = obj .get("modality") @@ -104,12 +104,12 @@ fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { }; let num_tokens = obj.get("num_tokens") - .and_then(|v| v.as_u64()) + .and_then(serde_json::Value::as_u64) .ok_or_else(|| invalid("media item missing num_tokens"))? as usize; let hash = obj .get("hash") .and_then(|v| v.as_str()) - .map(|s| s.to_string()) + .map(str::to_string) .unwrap_or_default(); let hf_payload = obj .get("hf_payload") @@ -317,7 +317,10 @@ impl PyToolCallParseStatus { #[classattr] const MALFORMED_STRUCTURE: &'static str = "malformed_structure"; + // PyO3 #[getter] requires `&self`; the Copy enum is 1 byte so clippy + // suggests by-value, but the macro shape is fixed. #[getter] + #[allow(clippy::trivially_copy_pass_by_ref)] fn value(&self) -> &'static str { self.inner.as_wire() } @@ -382,7 +385,7 @@ impl PyRenderer { preserve_thinking_between_tool_calls = false, ))] fn qwen3_vl( - _cls: &Bound<'_, PyType>, + cls: &Bound<'_, PyType>, py: Python<'_>, tokenizer_path: &str, enable_thinking: bool, @@ -390,7 +393,7 @@ impl PyRenderer { preserve_thinking_between_tool_calls: bool, ) -> PyResult { Self::qwen35( - _cls, + cls, py, tokenizer_path, enable_thinking, @@ -977,7 +980,7 @@ impl PyQwen3VlImageProcessor { patch_size: Option, temporal_patch_size: Option, merge_size: Option, - ) -> PyResult { + ) -> Self { let mut p = Qwen3VlImageProcessor::default(); if let Some(v) = min_pixels { p.min_pixels = v; @@ -994,7 +997,7 @@ impl PyQwen3VlImageProcessor { if let Some(v) = merge_size { p.merge_size = v; } - Ok(Self { inner: p }) + Self { inner: p } } /// Compute the resized `(height, width)` for an input image From 54270ff27639f78f0ad86b9262f5c161ae7e8eb9 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:55:39 +0200 Subject: [PATCH 19/35] Harden native parity tests --- .../src/families/deepseek_v3.rs | 19 ++- crates/renderers-core/src/families/default.rs | 4 +- crates/renderers-core/src/families/glm.rs | 12 +- crates/renderers-core/src/families/gpt_oss.rs | 17 +- crates/renderers-core/src/families/kimi_k2.rs | 8 +- .../renderers-core/src/families/kimi_k25.rs | 12 +- .../renderers-core/src/families/minimax_m2.rs | 12 +- .../renderers-core/src/families/nemotron3.rs | 10 +- crates/renderers-core/src/families/qwen3.rs | 6 +- crates/renderers-core/src/families/qwen35.rs | 11 +- .../renderers-core/src/parsing/deepseek_v3.rs | 28 ++-- crates/renderers-core/src/parsing/glm.rs | 88 +++++----- crates/renderers-core/src/parsing/kimi_k2.rs | 71 ++++---- crates/renderers-core/src/parsing/minimax.rs | 91 +++++----- crates/renderers-core/src/parsing/qwen3.rs | 5 +- crates/renderers-core/src/parsing/qwen35.rs | 157 ++++++++---------- crates/renderers-core/src/processing/mod.rs | 4 +- .../renderers-core/src/processing/qwen3_vl.rs | 22 +-- .../renderers-core/src/processing/resolver.rs | 5 +- crates/renderers-core/src/registry.rs | 2 +- crates/renderers-core/src/types.rs | 14 +- crates/renderers-py/src/lib.rs | 41 ++--- renderers/_native_router.py | 2 +- renderers/gpt_oss.py | 1 - tests/test_client.py | 12 ++ tests/test_native_router.py | 28 ++++ tests/test_renderer_e2e.py | 74 +++++++++ 27 files changed, 422 insertions(+), 334 deletions(-) create mode 100644 tests/test_renderer_e2e.py diff --git a/crates/renderers-core/src/families/deepseek_v3.rs b/crates/renderers-core/src/families/deepseek_v3.rs index 8735666..e2524f2 100644 --- a/crates/renderers-core/src/families/deepseek_v3.rs +++ b/crates/renderers-core/src/families/deepseek_v3.rs @@ -1,4 +1,4 @@ -//! DeepSeek V3 renderer. Port of `renderers/deepseek_v3.py`. +//! `DeepSeek` V3 renderer. Port of `renderers/deepseek_v3.py`. //! //! Key differences from the Qwen-family renderers: //! @@ -56,7 +56,7 @@ impl DeepSeekV3RendererBuilder { self } pub fn build(self, tokenizer: Tokenizer) -> Result { - DeepSeekV3Renderer::new_with(tokenizer, self) + DeepSeekV3Renderer::new_with(tokenizer, &self) } } @@ -90,7 +90,7 @@ impl DeepSeekV3Renderer { DeepSeekV3RendererBuilder::default() } - /// Encode a DeepSeek special token via the tokenizer's encode path and + /// Encode a `DeepSeek` special token via the tokenizer's encode path and /// assert it maps to exactly one id. Matches the Python /// `_get_special_token` helper — required because the tokenizer /// doesn't expose these by `token_to_id` directly (the fullwidth @@ -105,7 +105,14 @@ impl DeepSeekV3Renderer { Ok(ids[0]) } - fn new_with(tokenizer: Tokenizer, cfg: DeepSeekV3RendererBuilder) -> Result { + // Paired begin/end token ids share semantic prefixes (tool_call, + // tool_calls, tool_output, tool_outputs); the similarity is the + // structural relationship, so renaming would lose information. + #[allow(clippy::similar_names)] + fn new_with( + tokenizer: Tokenizer, + cfg: &DeepSeekV3RendererBuilder, + ) -> Result { let bos = Self::resolve(&tokenizer, &format!("begin{US}of{US}sentence"))?; let eos = Self::resolve(&tokenizer, &format!("end{US}of{US}sentence"))?; let user_token = Self::resolve(&tokenizer, "User")?; @@ -204,7 +211,7 @@ impl Renderer for DeepSeekV3Renderer { // Generation prompt — skip <|Assistant|> after a tool output if add_generation_prompt { - let last_role = messages.last().map(|m| m.role.as_str()).unwrap_or(""); + let last_role = messages.last().map_or("", |m| m.role.as_str()); if last_role != "tool" { buf.scaffold_special(self.assistant_token); } @@ -284,7 +291,7 @@ impl Renderer for DeepSeekV3Renderer { } } - let last_role = new_messages.last().map(|m| m.role.as_str()).unwrap_or(""); + let last_role = new_messages.last().map_or("", |m| m.role.as_str()); if last_role != "tool" { buf.scaffold_special(self.assistant_token); } diff --git a/crates/renderers-core/src/families/default.rs b/crates/renderers-core/src/families/default.rs index ba5db09..44c8e11 100644 --- a/crates/renderers-core/src/families/default.rs +++ b/crates/renderers-core/src/families/default.rs @@ -1,4 +1,4 @@ -//! DefaultRenderer — Jinja-template fallback for models without a +//! `DefaultRenderer` — Jinja-template fallback for models without a //! hand-coded family. //! //! Port of `renderers/default.py`. Two key differences from the Python @@ -18,7 +18,7 @@ //! //! `parse_response` is intentionally basic: strip stop tokens, decode, //! split on `` if present. Models with structured tool calls -//! need a hand-coded family — DefaultRenderer doesn't try to guess. +//! need a hand-coded family — `DefaultRenderer` doesn't try to guess. //! //! `bridge_to_next_turn` returns `None` unconditionally: without //! template-specific knowledge of the turn-close token, the bridge diff --git a/crates/renderers-core/src/families/glm.rs b/crates/renderers-core/src/families/glm.rs index e9c0a33..bc1893d 100644 --- a/crates/renderers-core/src/families/glm.rs +++ b/crates/renderers-core/src/families/glm.rs @@ -22,7 +22,7 @@ //! | newlines inside tool-call | no | no | yes | //! | `/nothink` user suffix | no | no | yes | //! | empty `` wrap | no | yes | no | -//! | unwrap OpenAI tool envelope | no | yes | no | +//! | unwrap `OpenAI` tool envelope | no | yes | no | //! //! The flags are surfaced on the builder; the three variants pick //! their own combination at construction time. @@ -94,7 +94,7 @@ impl GlmRendererBuilder { self } pub fn build(self, tokenizer: Tokenizer) -> Result { - GlmRenderer::new_with(tokenizer, self) + GlmRenderer::new_with(tokenizer, &self) } } @@ -139,7 +139,7 @@ impl GlmRenderer { GlmRendererBuilder::glm45().build(tokenizer) } - fn new_with(tokenizer: Tokenizer, cfg: GlmRendererBuilder) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: &GlmRendererBuilder) -> Result { let gmask = tokenizer.token_to_id_strict("[gMASK]")?; let sop = tokenizer.token_to_id_strict("")?; let system = tokenizer.token_to_id_strict("<|system|>")?; @@ -247,7 +247,7 @@ impl Renderer for GlmRenderer { let nl = self.nl_after_role(); let mut buf = RenderBuf::new( &self.tokenizer, - messages.len().max(1) * 256 + tools.map(|t| t.len() * 256 + 256).unwrap_or(0), + messages.len().max(1) * 256 + tools.map_or(0, |t| t.len() * 256 + 256), ); // Prefix @@ -547,7 +547,7 @@ impl GlmRenderer { let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), ToolArguments::Raw(s) => { - serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())) + serde_json::from_str(s).unwrap_or(JsonValue::Object(serde_json::Map::new())) } }; if let Some(obj) = args_value.as_object() { @@ -616,7 +616,7 @@ impl GlmRenderer { let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), ToolArguments::Raw(s) => { - serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())) + serde_json::from_str(s).unwrap_or(JsonValue::Object(serde_json::Map::new())) } }; if let Some(obj) = args_value.as_object() { diff --git a/crates/renderers-core/src/families/gpt_oss.rs b/crates/renderers-core/src/families/gpt_oss.rs index 12be5db..917ea29 100644 --- a/crates/renderers-core/src/families/gpt_oss.rs +++ b/crates/renderers-core/src/families/gpt_oss.rs @@ -10,7 +10,7 @@ //! - Holds a [`HarmonyEncoding`] (lazily loaded from //! [`HarmonyEncodingName::HarmonyGptOss`]) and a cache of the //! special-token ids it exposes. -//! - `render` builds a prefix conversation (SystemContent + DeveloperContent +//! - `render` builds a prefix conversation (`SystemContent` + `DeveloperContent` //! when a system message or tools are present) via //! `render_conversation`, then walks the remaining messages and renders //! each one individually via `render(msg)` so per-token attribution @@ -19,7 +19,7 @@ //! (token-id based) — matching what `renderers/parsing.py:parse_gpt_oss` //! does — so we don't need to manage a `StreamableParser`'s lifetime. //! -//! This renderer does NOT need a HuggingFace `tokenizer.json`; the +//! This renderer does NOT need a `HuggingFace` `tokenizer.json`; the //! harmony encoding embeds its own tiktoken-based tokenizer. use std::sync::Arc; @@ -212,7 +212,7 @@ impl GptOssRenderer { } /// Encode a UTF-8 string via the harmony tokenizer, returning u32 ids. - /// Helper so the call sites don't need to name CoreBPE (which is not + /// Helper so the call sites don't need to name `CoreBPE` (which is not /// re-exported from the harmony crate). fn encode_text(&self, text: &str) -> Vec { // `Rank` is `u32`; encode_with_special_tokens already returns Vec. @@ -416,7 +416,7 @@ impl Renderer for GptOssRenderer { sys, )); } - let has_dev = first_system_idx.is_some() || tools.map(|t| !t.is_empty()).unwrap_or(false); + let has_dev = first_system_idx.is_some() || tools.is_some_and(|t| !t.is_empty()); if has_dev { let mut dev = DeveloperContent::new(); if let Some(idx) = first_system_idx { @@ -439,7 +439,7 @@ impl Renderer for GptOssRenderer { } if !prefix_msgs.is_empty() { let prefix_tokens = self.render_conversation_tokens(prefix_msgs)?; - let attr_idx: i32 = first_system_idx.map(|i| i as i32).unwrap_or(SCAFFOLD_IDX); + let attr_idx: i32 = first_system_idx.map_or(SCAFFOLD_IDX, |i| i as i32); for id in prefix_tokens { tokens.push(id); indices.push(attr_idx); @@ -523,8 +523,7 @@ impl Renderer for GptOssRenderer { let body_end = ids[body_start..] .iter() .position(|&t| t == self.start || t == self.end || t == self.call) - .map(|p| p + body_start) - .unwrap_or(ids.len()); + .map_or(ids.len(), |p| p + body_start); let body_closed = body_end < ids.len() && (ids[body_end] == self.end || ids[body_end] == self.call); let body_text = self.decode_text(&ids[body_start..body_end]); @@ -670,8 +669,7 @@ fn today_yyyy_mm_dd() -> String { use std::time::{SystemTime, UNIX_EPOCH}; let secs = SystemTime::now() .duration_since(UNIX_EPOCH) - .map(|d| d.as_secs()) - .unwrap_or(0); + .map_or(0, |d| d.as_secs()); let days = secs / 86_400; // 1970-01-01 + days let (y, m, d) = civil_from_days(days as i64); @@ -680,6 +678,7 @@ fn today_yyyy_mm_dd() -> String { /// Convert days since 1970-01-01 to (year, month, day) — Howard Hinnant's /// algorithm, public-domain. +#[allow(clippy::cast_sign_loss)] // remainder mod 146_097 is in [0, 146_097) fn civil_from_days(z: i64) -> (i32, u32, u32) { let z = z + 719_468; let era = if z >= 0 { z } else { z - 146_096 } / 146_097; diff --git a/crates/renderers-core/src/families/kimi_k2.rs b/crates/renderers-core/src/families/kimi_k2.rs index 70ef98f..16c148e 100644 --- a/crates/renderers-core/src/families/kimi_k2.rs +++ b/crates/renderers-core/src/families/kimi_k2.rs @@ -60,7 +60,7 @@ impl KimiK2RendererBuilder { self } pub fn build(self, tokenizer: Tokenizer) -> Result { - KimiK2Renderer::new_with(tokenizer, self) + KimiK2Renderer::new_with(tokenizer, &self) } } @@ -97,7 +97,7 @@ impl KimiK2Renderer { KimiK2RendererBuilder::default() } - fn new_with(tokenizer: Tokenizer, cfg: KimiK2RendererBuilder) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: &KimiK2RendererBuilder) -> Result { let im_user = tokenizer.token_to_id_strict("<|im_user|>")?; let im_assistant = tokenizer.token_to_id_strict("<|im_assistant|>")?; let im_system = tokenizer.token_to_id_strict("<|im_system|>")?; @@ -212,7 +212,7 @@ impl Renderer for KimiK2Renderer { // tool_declare goes first if tools were provided and the caller // didn't already include a tool_declare message. - let tools_pending = tools.map(|t| !t.is_empty()).unwrap_or(false); + let tools_pending = tools.is_some_and(|t| !t.is_empty()); let already_has_tool_declare = !messages.is_empty() && messages[0].role == "tool_declare"; if tools_pending && !already_has_tool_declare { working.push(Message { @@ -305,7 +305,7 @@ impl Renderer for KimiK2Renderer { let mut buf = RenderBuf::new( &self.tokenizer, - working.len().max(1) * 256 + tools.map(|t| 64 * t.len() + 256).unwrap_or(0), + working.len().max(1) * 256 + tools.map_or(0, |t| 64 * t.len() + 256), ); for (i, msg) in working.iter().enumerate() { diff --git a/crates/renderers-core/src/families/kimi_k25.rs b/crates/renderers-core/src/families/kimi_k25.rs index 6b127e6..a254dc9 100644 --- a/crates/renderers-core/src/families/kimi_k25.rs +++ b/crates/renderers-core/src/families/kimi_k25.rs @@ -8,8 +8,8 @@ //! //! Distinctive features vs Kimi K2: //! -//! - Generation prompt prefills `` (enable_thinking=True) or the -//! empty block `` (enable_thinking=False) to control +//! - Generation prompt prefills `` (`enable_thinking=True`) or the +//! empty block `` (`enable_thinking=False`) to control //! thinking mode at sample time. `` and `` may be //! multi-token; the renderer encodes them as text. //! - Assistant body uses the hist/suffix split: the last non-tool-call @@ -61,7 +61,7 @@ impl KimiK25RendererBuilder { self } pub fn build(self, tokenizer: Tokenizer) -> Result { - KimiK25Renderer::new_with(tokenizer, self) + KimiK25Renderer::new_with(tokenizer, &self) } } @@ -102,7 +102,7 @@ impl KimiK25Renderer { KimiK25RendererBuilder::default() } - fn new_with(tokenizer: Tokenizer, cfg: KimiK25RendererBuilder) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: &KimiK25RendererBuilder) -> Result { let im_user = tokenizer.token_to_id_strict("<|im_user|>")?; let im_assistant = tokenizer.token_to_id_strict("<|im_assistant|>")?; let im_system = tokenizer.token_to_id_strict("<|im_system|>")?; @@ -264,7 +264,7 @@ impl Renderer for KimiK25Renderer { // (~270 lines) isn't ported yet. The Python shim avoids native // routing when tools are present, so this is a hard error if we // got here with tools. - if tools.map(|t| !t.is_empty()).unwrap_or(false) { + if tools.is_some_and(|t| !t.is_empty()) { return Err(RenderError::Invalid( "Kimi K2.5 with tools not supported on the native path yet; the Python shim should route to pure Python in this case".into(), )); @@ -551,7 +551,7 @@ impl MultimodalRenderer for KimiK25Renderer { if messages.is_empty() { return Err(RenderError::EmptyMessages); } - if tools.map(|t| !t.is_empty()).unwrap_or(false) { + if tools.is_some_and(|t| !t.is_empty()) { return Err(RenderError::Invalid( "Kimi K2.5 with tools not supported on the native path yet".into(), )); diff --git a/crates/renderers-core/src/families/minimax_m2.rs b/crates/renderers-core/src/families/minimax_m2.rs index f14cbdd..c3bfb2e 100644 --- a/crates/renderers-core/src/families/minimax_m2.rs +++ b/crates/renderers-core/src/families/minimax_m2.rs @@ -1,4 +1,4 @@ -//! MiniMax M2.5 renderer. Port of `renderers/minimax_m2.py`. +//! `MiniMax` M2.5 renderer. Port of `renderers/minimax_m2.py`. //! //! Unique characteristics: //! @@ -6,7 +6,7 @@ //! Role "assistant" is rendered as "ai". //! - System block always present — default system message //! ("You are a helpful assistant. Your name is MiniMax-M2.5 and is -//! built by MiniMax.") auto-injected if missing. +//! built by `MiniMax`.") auto-injected if missing. //! - Tools, when supplied, are appended to the system message as //! `{json}` lines inside a `...` block, //! followed by a verbose instructions block. @@ -16,7 +16,7 @@ //! - Tool responses wrapped in literal `...` //! (plain text, no special token). //! - Thinking emitted only for assistants after the last user turn -//! (or when preserve_all_thinking is on). +//! (or when `preserve_all_thinking` is on). use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; use crate::emit::RenderBuf; @@ -154,7 +154,7 @@ impl MiniMaxM2Renderer { match args { ToolArguments::Object(v) => v.clone(), ToolArguments::Raw(s) => { - serde_json::from_str(s).unwrap_or(serde_json::Value::Object(Default::default())) + serde_json::from_str(s).unwrap_or(serde_json::Value::Object(serde_json::Map::new())) } } } @@ -172,7 +172,7 @@ impl Renderer for MiniMaxM2Renderer { } let mut buf = RenderBuf::new( &self.tokenizer, - messages.len().max(1) * 256 + tools.map(|t| t.len() * 256 + 512).unwrap_or(0), + messages.len().max(1) * 256 + tools.map_or(0, |t| t.len() * 256 + 512), ); let first_is_system = messages[0].role == "system"; @@ -217,6 +217,8 @@ impl Renderer for MiniMaxM2Renderer { buf.text("\n", orig_idx)?; } "assistant" => { + // orig_idx was just cast from a usize; non-negative by construction. + #[allow(clippy::cast_sign_loss)] let preserve_thinking = should_preserve_past_thinking( messages, orig_idx as usize, diff --git a/crates/renderers-core/src/families/nemotron3.rs b/crates/renderers-core/src/families/nemotron3.rs index 1e25c38..a16d77a 100644 --- a/crates/renderers-core/src/families/nemotron3.rs +++ b/crates/renderers-core/src/families/nemotron3.rs @@ -65,7 +65,7 @@ impl Nemotron3RendererBuilder { self } pub fn build(self, tokenizer: Tokenizer) -> Result { - Nemotron3Renderer::new_with(tokenizer, self) + Nemotron3Renderer::new_with(tokenizer, &self) } } @@ -99,7 +99,7 @@ impl Nemotron3Renderer { Nemotron3RendererBuilder::default() } - fn new_with(tokenizer: Tokenizer, cfg: Nemotron3RendererBuilder) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: &Nemotron3RendererBuilder) -> Result { let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; let endoftext = tokenizer.token_to_id("<|endoftext|>"); @@ -424,7 +424,7 @@ impl Nemotron3Renderer { let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), ToolArguments::Raw(s) => { - serde_json::from_str(s).unwrap_or(JsonValue::Object(Default::default())) + serde_json::from_str(s).unwrap_or(JsonValue::Object(serde_json::Map::new())) } }; if let Some(obj) = args_value.as_object() { @@ -481,7 +481,7 @@ impl Nemotron3Renderer { fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { let base = messages.len().max(1) * 256; - let tools_bonus = tools.map(|t| 384 * t.len().max(1) + 512).unwrap_or(0); + let tools_bonus = tools.map_or(0, |t| 384 * t.len().max(1) + 512); base + tools_bonus } } @@ -569,6 +569,8 @@ impl Renderer for Nemotron3Renderer { "user" => self.emit_user(&mut buf, content, oi)?, "assistant" => { let is_last_turn = (i as i32) >= last_plain_assistant_idx; + // oi >= 0 guard above makes the usize cast safe. + #[allow(clippy::cast_sign_loss)] let preserve_thinking = oi >= 0 && should_preserve_past_thinking( messages, diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs index 7d42ea7..940fe8a 100644 --- a/crates/renderers-core/src/families/qwen3.rs +++ b/crates/renderers-core/src/families/qwen3.rs @@ -69,7 +69,7 @@ impl Qwen3RendererBuilder { } pub fn build(self, tokenizer: Tokenizer) -> Result { - Qwen3Renderer::new_with(tokenizer, self) + Qwen3Renderer::new_with(tokenizer, &self) } } @@ -109,7 +109,7 @@ impl Qwen3Renderer { Qwen3RendererBuilder::default() } - fn new_with(tokenizer: Tokenizer, cfg: Qwen3RendererBuilder) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: &Qwen3RendererBuilder) -> Result { let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; @@ -359,7 +359,7 @@ impl Qwen3Renderer { // tools block (it can be substantial). Realloc once if we // underestimate; the cost of over-allocating is a few KB. let base = messages.len().max(1) * 256; - let tools_bonus = tools.map(|t| 256 * t.len().max(1)).unwrap_or(0); + let tools_bonus = tools.map_or(0, |t| 256 * t.len().max(1)); base + tools_bonus } } diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index a8aa255..b70b842 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -81,7 +81,7 @@ impl Qwen35RendererBuilder { self } pub fn build(self, tokenizer: Tokenizer) -> Result { - Qwen35Renderer::new_with(tokenizer, self) + Qwen35Renderer::new_with(tokenizer, &self) } } @@ -128,7 +128,7 @@ impl Qwen35Renderer { Qwen35RendererBuilder::default() } - fn new_with(tokenizer: Tokenizer, cfg: Qwen35RendererBuilder) -> Result { + fn new_with(tokenizer: Tokenizer, cfg: &Qwen35RendererBuilder) -> Result { let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; @@ -436,9 +436,8 @@ impl Qwen35Renderer { // Arguments — accept JSON string (decode first) or object let args_value = match &tc.function.arguments { ToolArguments::Object(v) => v.clone(), - ToolArguments::Raw(s) => { - serde_json::from_str(s).unwrap_or(serde_json::Value::Object(Default::default())) - } + ToolArguments::Raw(s) => serde_json::from_str(s) + .unwrap_or(serde_json::Value::Object(serde_json::Map::new())), }; if let Some(obj) = args_value.as_object() { for (arg_name, arg_value) in obj { @@ -479,7 +478,7 @@ impl Qwen35Renderer { fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { let base = messages.len().max(1) * 256; - let tools_bonus = tools.map(|t| 256 * t.len().max(1) + 512).unwrap_or(0); + let tools_bonus = tools.map_or(0, |t| 256 * t.len().max(1) + 512); base + tools_bonus } } diff --git a/crates/renderers-core/src/parsing/deepseek_v3.rs b/crates/renderers-core/src/parsing/deepseek_v3.rs index ebacc99..e06c9b0 100644 --- a/crates/renderers-core/src/parsing/deepseek_v3.rs +++ b/crates/renderers-core/src/parsing/deepseek_v3.rs @@ -1,4 +1,4 @@ -//! DeepSeek V3 tool-call parser. Port of +//! `DeepSeek` V3 tool-call parser. Port of //! `renderers/parsing.py:parse_deepseek_v3` + `_parse_deepseek_tool_calls`. //! //! Structural shape: @@ -14,7 +14,7 @@ //! <|tool▁calls▁end|> //! ``` //! -//! Thinking is **text tags** (not special tokens) — DeepSeek emits +//! Thinking is **text tags** (not special tokens) — `DeepSeek` emits //! `...` as decoded text. Tool calls are special-token //! delimited. The fenced JSON inside is parsed with a small anchored regex. @@ -32,7 +32,10 @@ static JSON_FENCE_RE: LazyLock = LazyLock::new(|| { Regex::new(r"(?s)^```(?:json)?\s*(.*?)\s*```$").expect("json-fence regex") }); -#[allow(clippy::too_many_arguments)] +// Paired begin/end token ids (tool_call vs tool_calls, with matching +// _end suffixes) carry distinct meaning — the singular/plural distinction +// is the actual semantic. Renaming would obscure the structure. +#[allow(clippy::too_many_arguments, clippy::similar_names)] pub fn parse_deepseek_v3( tokenizer: &Tokenizer, token_ids: &[u32], @@ -145,11 +148,7 @@ fn parse_deepseek_tool_calls( let n = after_sep[..nl].trim().to_string(); let rest = after_sep[nl + 1..].trim(); let args = match JSON_FENCE_RE.captures(rest) { - Some(c) => c - .get(1) - .map(|m| m.as_str().trim()) - .unwrap_or("") - .to_string(), + Some(c) => c.get(1).map_or("", |m| m.as_str().trim()).to_string(), None => rest.to_string(), }; (n, args) @@ -159,15 +158,12 @@ fn parse_deepseek_tool_calls( let mut invalid_json = false; let arguments = if args_str.is_empty() { - ToolArguments::Object(serde_json::Value::Object(Default::default())) + ToolArguments::Object(serde_json::Value::Object(serde_json::Map::new())) + } else if let Ok(v) = serde_json::from_str::(&args_str) { + ToolArguments::Object(v) } else { - match serde_json::from_str::(&args_str) { - Ok(v) => ToolArguments::Object(v), - Err(_) => { - invalid_json = true; - ToolArguments::Raw(args_str.clone()) - } - } + invalid_json = true; + ToolArguments::Raw(args_str.clone()) }; let status = if unclosed { diff --git a/crates/renderers-core/src/parsing/glm.rs b/crates/renderers-core/src/parsing/glm.rs index e9c4943..1c6dc1d 100644 --- a/crates/renderers-core/src/parsing/glm.rs +++ b/crates/renderers-core/src/parsing/glm.rs @@ -43,31 +43,28 @@ pub fn parse_glm( let mut reasoning: Option = None; let mut parse_offset = 0usize; let working_ids: Vec; - let ids: &[u32] = match find(stripped, think_end_id) { - Some(think_end) => { - let reasoning_ids: Vec = stripped[..think_end] - .iter() - .copied() - .filter(|&t| t != think_id) - .collect(); - let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); - reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty()); - parse_offset = think_end + 1; - &stripped[think_end + 1..] - } - None => { - // Truncated reasoning — without - if let Some(think_start) = find(stripped, think_id) { - let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default(); - return ParsedResponse { - content: String::new(), - reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), - tool_calls: Vec::new(), - }; - } - working_ids = stripped.to_vec(); - &working_ids + let ids: &[u32] = if let Some(think_end) = find(stripped, think_end_id) { + let reasoning_ids: Vec = stripped[..think_end] + .iter() + .copied() + .filter(|&t| t != think_id) + .collect(); + let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); + reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty()); + parse_offset = think_end + 1; + &stripped[think_end + 1..] + } else { + // Truncated reasoning — without + if let Some(think_start) = find(stripped, think_id) { + let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; } + working_ids = stripped.to_vec(); + &working_ids }; let (content_text, tool_calls) = match find(ids, tool_call_id) { @@ -105,7 +102,10 @@ pub fn parse_glm( } } -#[allow(clippy::too_many_arguments)] +// Abbreviated arg-key/arg-value begin/end ids (ak/ake/av/ave) are tight +// pairs by design — the abbreviations keep call sites readable, and the +// surface fn (parse_glm) uses full names. +#[allow(clippy::too_many_arguments, clippy::similar_names)] fn parse_glm_tool_calls( tokenizer: &Tokenizer, ids: &[u32], @@ -127,21 +127,18 @@ fn parse_glm_tool_calls( } let span_start = section_offset + i; - let end = match find_from(ids, tc_end_id, i + 1) { - Some(end) => end, - None => { - let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); - out.push(ParsedToolCall { - raw, - token_span: Some(Range { - start: span_start, - end: section_offset + ids.len(), - }), - status: ToolCallParseStatus::UnclosedBlock, - ..Default::default() - }); - break; - } + let Some(end) = find_from(ids, tc_end_id, i + 1) else { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + out.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: span_start, + end: section_offset + ids.len(), + }), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; }; let block = &ids[i + 1..end]; @@ -191,12 +188,11 @@ fn parse_glm_tool_calls( .unwrap_or_default() .trim() .to_string(); - let val = match serde_json::from_str::(&val_text) { - Ok(v) => v, - Err(_) => { - any_json_fallback = true; - serde_json::Value::String(val_text) - } + let val = if let Ok(v) = serde_json::from_str::(&val_text) { + v + } else { + any_json_fallback = true; + serde_json::Value::String(val_text) }; arguments.insert(key, val); j = ave + 1; diff --git a/crates/renderers-core/src/parsing/kimi_k2.rs b/crates/renderers-core/src/parsing/kimi_k2.rs index 612a901..2deca32 100644 --- a/crates/renderers-core/src/parsing/kimi_k2.rs +++ b/crates/renderers-core/src/parsing/kimi_k2.rs @@ -53,26 +53,23 @@ pub fn parse_kimi_k2( }; let text = decode(tokenizer, content_ids).unwrap_or_default(); - let (reasoning, content) = match text.split_once("") { - Some((before, after)) => { - let raw = before.replacen("", "", 1); + let (reasoning, content) = if let Some((before, after)) = text.split_once("") { + let raw = before.replacen("", "", 1); + let r = raw.trim_matches('\n').trim().to_string(); + let c = after.trim_matches('\n').to_string(); + (Some(r).filter(|s| !s.is_empty()), c) + } else { + if let Some(think_at) = text.find("") { + // Truncated thinking — no closing tag + let raw = &text[think_at + "".len()..]; let r = raw.trim_matches('\n').trim().to_string(); - let c = after.trim_matches('\n').to_string(); - (Some(r).filter(|s| !s.is_empty()), c) - } - None => { - if let Some(think_at) = text.find("") { - // Truncated thinking — no closing tag - let raw = &text[think_at + "".len()..]; - let r = raw.trim_matches('\n').trim().to_string(); - return ParsedResponse { - content: String::new(), - reasoning_content: Some(r).filter(|s| !s.is_empty()), - tool_calls: Vec::new(), - }; - } - (None, text) + return ParsedResponse { + content: String::new(), + reasoning_content: Some(r).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; } + (None, text) }; ParsedResponse { @@ -98,21 +95,18 @@ fn parse_kimi_k2_calls( i += 1; continue; } - let arg_begin = match find_from(ids, tc_arg_begin_id, i + 1) { - Some(v) => v, - None => { - let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); - out.push(ParsedToolCall { - raw, - token_span: Some(Range { - start: section_offset + i, - end: section_offset + ids.len(), - }), - status: ToolCallParseStatus::MalformedStructure, - ..Default::default() - }); - break; - } + let Some(arg_begin) = find_from(ids, tc_arg_begin_id, i + 1) else { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + out.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: section_offset + i, + end: section_offset + ids.len(), + }), + status: ToolCallParseStatus::MalformedStructure, + ..Default::default() + }); + break; }; let (tc_end, unclosed) = match find_from(ids, tc_end_id, arg_begin + 1) { @@ -143,12 +137,11 @@ fn parse_kimi_k2_calls( }; let mut invalid_json = false; - let arguments = match serde_json::from_str::(&args_str) { - Ok(v) => ToolArguments::Object(v), - Err(_) => { - invalid_json = true; - ToolArguments::Raw(args_str.clone()) - } + let arguments = if let Ok(v) = serde_json::from_str::(&args_str) { + ToolArguments::Object(v) + } else { + invalid_json = true; + ToolArguments::Raw(args_str.clone()) }; let status = if unclosed { diff --git a/crates/renderers-core/src/parsing/minimax.rs b/crates/renderers-core/src/parsing/minimax.rs index f1e309e..0cac581 100644 --- a/crates/renderers-core/src/parsing/minimax.rs +++ b/crates/renderers-core/src/parsing/minimax.rs @@ -1,4 +1,4 @@ -//! MiniMax M2 tool-call parser. Port of +//! `MiniMax` M2 tool-call parser. Port of //! `renderers/parsing.py:parse_minimax`. //! //! Structural shape: @@ -52,30 +52,27 @@ pub fn parse_minimax( let mut reasoning: Option = None; let mut parse_offset = 0usize; let working: Vec; - let ids: &[u32] = match find(stripped, think_end_id) { - Some(think_end) => { - let reasoning_ids: Vec = stripped[..think_end] - .iter() - .copied() - .filter(|&t| t != think_id) - .collect(); - let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); - reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty()); - parse_offset = think_end + 1; - &stripped[think_end + 1..] - } - None => { - if let Some(think_start) = find(stripped, think_id) { - let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default(); - return ParsedResponse { - content: String::new(), - reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), - tool_calls: Vec::new(), - }; - } - working = stripped.to_vec(); - &working + let ids: &[u32] = if let Some(think_end) = find(stripped, think_end_id) { + let reasoning_ids: Vec = stripped[..think_end] + .iter() + .copied() + .filter(|&t| t != think_id) + .collect(); + let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); + reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty()); + parse_offset = think_end + 1; + &stripped[think_end + 1..] + } else { + if let Some(think_start) = find(stripped, think_id) { + let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; } + working = stripped.to_vec(); + &working }; let mut tool_calls: Vec = Vec::new(); @@ -97,21 +94,18 @@ pub fn parse_minimax( } let span_start = parse_offset + i; - let end = match find_from(ids, tool_call_end_id, i + 1) { - Some(end) => end, - None => { - let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); - tool_calls.push(ParsedToolCall { - raw, - token_span: Some(Range { - start: span_start, - end: parse_offset + ids.len(), - }), - status: ToolCallParseStatus::UnclosedBlock, - ..Default::default() - }); - break; - } + let Some(end) = find_from(ids, tool_call_end_id, i + 1) else { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + tool_calls.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: span_start, + end: parse_offset + ids.len(), + }), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; }; let block_text = decode(tokenizer, &ids[i + 1..end]).unwrap_or_default(); let span = Range { @@ -129,19 +123,18 @@ pub fn parse_minimax( }); } else { for inv in invokes { - let name = inv.get(1).map(|m| m.as_str()).unwrap_or(""); - let body = inv.get(2).map(|m| m.as_str()).unwrap_or(""); + let name = inv.get(1).map_or("", |m| m.as_str()); + let body = inv.get(2).map_or("", |m| m.as_str()); let mut arguments = serde_json::Map::new(); let mut any_json_fallback = false; for pm in PARAMETER_RE.captures_iter(body) { - let pname = pm.get(1).map(|m| m.as_str()).unwrap_or(""); - let pval = pm.get(2).map(|m| m.as_str().trim()).unwrap_or(""); - let v = match serde_json::from_str::(pval) { - Ok(v) => v, - Err(_) => { - any_json_fallback = true; - serde_json::Value::String(pval.to_string()) - } + let pname = pm.get(1).map_or("", |m| m.as_str()); + let pval = pm.get(2).map_or("", |m| m.as_str().trim()); + let v = if let Ok(v) = serde_json::from_str::(pval) { + v + } else { + any_json_fallback = true; + serde_json::Value::String(pval.to_string()) }; arguments.insert(pname.to_string(), v); } diff --git a/crates/renderers-core/src/parsing/qwen3.rs b/crates/renderers-core/src/parsing/qwen3.rs index bcd7ee2..c3b77b5 100644 --- a/crates/renderers-core/src/parsing/qwen3.rs +++ b/crates/renderers-core/src/parsing/qwen3.rs @@ -116,9 +116,8 @@ pub fn parse_qwen3( /// emitted) out of a parsed tool-call JSON value. Matches the Python /// `parsed.get("name", "")` / `parsed.get("arguments", {})` semantics. fn extract_name_and_args(value: &serde_json::Value) -> (String, ToolArguments) { - let obj = match value.as_object() { - Some(o) => o, - None => return (String::new(), ToolArguments::default()), + let Some(obj) = value.as_object() else { + return (String::new(), ToolArguments::default()); }; let name = obj .get("name") diff --git a/crates/renderers-core/src/parsing/qwen35.rs b/crates/renderers-core/src/parsing/qwen35.rs index cb105bd..e9d5a40 100644 --- a/crates/renderers-core/src/parsing/qwen35.rs +++ b/crates/renderers-core/src/parsing/qwen35.rs @@ -59,59 +59,53 @@ pub fn parse_qwen35( let mut reasoning: Option = None; let mut parse_offset: usize = 0; let working_ids: Vec; - let ids_after_think: &[u32] = match find(ids, think_end_id) { - Some(think_end) => { - // Filter out think_id tokens from the reasoning span so the - // decoded text doesn't include the opening marker. - let reasoning_ids: Vec = ids[..think_end] - .iter() - .copied() - .filter(|&t| t != think_id) - .collect(); - let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); - reasoning = Some(txt.trim().to_string()); - parse_offset = think_end + 1; - &ids[think_end + 1..] - } - None => { - // present but no — truncated reasoning; - // return early with reasoning-only response. - if let Some(think_start) = find(ids, think_id) { - let txt = decode(tokenizer, &ids[think_start + 1..]).unwrap_or_default(); - return ParsedResponse { - content: String::new(), - reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), - tool_calls: Vec::new(), - }; - } - working_ids = ids.to_vec(); - &working_ids + let ids_after_think: &[u32] = if let Some(think_end) = find(ids, think_end_id) { + // Filter out think_id tokens from the reasoning span so the + // decoded text doesn't include the opening marker. + let reasoning_ids: Vec = ids[..think_end] + .iter() + .copied() + .filter(|&t| t != think_id) + .collect(); + let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); + reasoning = Some(txt.trim().to_string()); + parse_offset = think_end + 1; + &ids[think_end + 1..] + } else { + // present but no — truncated reasoning; + // return early with reasoning-only response. + if let Some(think_start) = find(ids, think_id) { + let txt = decode(tokenizer, &ids[think_start + 1..]).unwrap_or_default(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; } + working_ids = ids.to_vec(); + &working_ids }; // ── Tool calls (token-bounded, regex-on-decoded-span) ─────────── - let (content_text, tool_calls) = match find(ids_after_think, tool_call_id) { - Some(tc_start) => { - let content = decode(tokenizer, &ids_after_think[..tc_start]) - .unwrap_or_default() - .trim() - .to_string(); - let tcs = parse_xml_tool_calls( - tokenizer, - &ids_after_think[tc_start..], - tool_call_id, - tool_call_end_id, - parse_offset + tc_start, - ); - (content, tcs) - } - None => { - let content = decode(tokenizer, ids_after_think) - .unwrap_or_default() - .trim() - .to_string(); - (content, Vec::new()) - } + let (content_text, tool_calls) = if let Some(tc_start) = find(ids_after_think, tool_call_id) { + let content = decode(tokenizer, &ids_after_think[..tc_start]) + .unwrap_or_default() + .trim() + .to_string(); + let tcs = parse_xml_tool_calls( + tokenizer, + &ids_after_think[tc_start..], + tool_call_id, + tool_call_end_id, + parse_offset + tc_start, + ); + (content, tcs) + } else { + let content = decode(tokenizer, ids_after_think) + .unwrap_or_default() + .trim() + .to_string(); + (content, Vec::new()) }; ParsedResponse { @@ -138,21 +132,18 @@ fn parse_xml_tool_calls( } let span_start = section_offset + i; - let end = match find_from(ids, tc_end_id, i + 1) { - Some(end) => end, - None => { - let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); - out.push(ParsedToolCall { - raw, - token_span: Some(Range { - start: span_start, - end: section_offset + ids.len(), - }), - status: ToolCallParseStatus::UnclosedBlock, - ..Default::default() - }); - break; - } + let Some(end) = find_from(ids, tc_end_id, i + 1) else { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + out.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: span_start, + end: section_offset + ids.len(), + }), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; }; let block_text = decode(tokenizer, &ids[i + 1..end]).unwrap_or_default(); @@ -161,18 +152,15 @@ fn parse_xml_tool_calls( end: section_offset + end + 1, }; - let name_match = match FUNCTION_NAME_RE.captures(&block_text) { - Some(c) => c, - None => { - out.push(ParsedToolCall { - raw: block_text, - token_span: Some(span), - status: ToolCallParseStatus::MalformedStructure, - ..Default::default() - }); - i = end + 1; - continue; - } + let Some(name_match) = FUNCTION_NAME_RE.captures(&block_text) else { + out.push(ParsedToolCall { + raw: block_text, + token_span: Some(span), + status: ToolCallParseStatus::MalformedStructure, + ..Default::default() + }); + i = end + 1; + continue; }; let name = name_match .get(1) @@ -182,16 +170,13 @@ fn parse_xml_tool_calls( let mut arguments = serde_json::Map::new(); let mut any_json_fallback = false; for pm in PARAMETER_RE.captures_iter(&block_text) { - let arg_name = pm.get(1).map(|m| m.as_str()).unwrap_or("").to_string(); - let arg_value = pm.get(2).map(|m| m.as_str().trim()).unwrap_or(""); - match serde_json::from_str::(arg_value) { - Ok(v) => { - arguments.insert(arg_name, v); - } - Err(_) => { - arguments.insert(arg_name, serde_json::Value::String(arg_value.to_string())); - any_json_fallback = true; - } + let arg_name = pm.get(1).map_or("", |m| m.as_str()).to_string(); + let arg_value = pm.get(2).map_or("", |m| m.as_str().trim()); + if let Ok(v) = serde_json::from_str::(arg_value) { + arguments.insert(arg_name, v); + } else { + arguments.insert(arg_name, serde_json::Value::String(arg_value.to_string())); + any_json_fallback = true; } } diff --git a/crates/renderers-core/src/processing/mod.rs b/crates/renderers-core/src/processing/mod.rs index bc45bad..7f5f4d8 100644 --- a/crates/renderers-core/src/processing/mod.rs +++ b/crates/renderers-core/src/processing/mod.rs @@ -1,4 +1,4 @@ -//! Vision processors — port of the HuggingFace image processor pipelines. +//! Vision processors — port of the `HuggingFace` image processor pipelines. //! //! Phase 5b: actual pixel-data preprocessing in Rust. Decode image bytes, //! smart-resize, normalise, patch-extract, and produce the tensors the @@ -12,7 +12,7 @@ //! //! Future: //! -//! - Kimi K2.5 — different smart_resize defaults and a single-pad +//! - Kimi K2.5 — different `smart_resize` defaults and a single-pad //! placeholder convention (Phase 5b follow-up). //! - Video frame sampling — needs `video-rs` or `ffmpeg-next` (Phase 5c). diff --git a/crates/renderers-core/src/processing/qwen3_vl.rs b/crates/renderers-core/src/processing/qwen3_vl.rs index e4f0c70..5e9f311 100644 --- a/crates/renderers-core/src/processing/qwen3_vl.rs +++ b/crates/renderers-core/src/processing/qwen3_vl.rs @@ -1,7 +1,7 @@ //! Vision image processing for Qwen-VL family models (Qwen2-VL, //! Qwen3-VL, Qwen3.5-VL). //! -//! Port of the HuggingFace `Qwen2VLImageProcessor` / `Qwen3VLImageProcessor` +//! Port of the `HuggingFace` `Qwen2VLImageProcessor` / `Qwen3VLImageProcessor` //! pipeline. Given an image (bytes or decoded RGB), produces: //! //! - `pixel_values`: `ndarray::Array2` of shape @@ -15,9 +15,9 @@ //! //! # Parity caveat //! -//! The grid dimensions, num_tokens, and tensor shape match HF exactly. +//! The grid dimensions, `num_tokens`, and tensor shape match HF exactly. //! The pixel values themselves use the `image` crate's bicubic -//! (CatmullRom) resize, which differs from PIL's bicubic in the last +//! (`CatmullRom`) resize, which differs from PIL's bicubic in the last //! few decimals — typical RMS difference ≈ 1e-3 on normalized pixels. //! Downstream models tolerate this level of noise (it's far below the //! quantization floor of vision encoders); but if exact pixel parity @@ -32,7 +32,7 @@ use sha2::{Digest, Sha256}; use crate::types::RenderError; -/// OpenAI CLIP normalisation constants — Qwen-VL inherits these. +/// `OpenAI` CLIP normalisation constants — Qwen-VL inherits these. pub const CLIP_MEAN: [f32; 3] = [0.481_454_66, 0.457_827_5, 0.408_210_73]; pub const CLIP_STD: [f32; 3] = [0.268_629_54, 0.261_302_6, 0.275_777_1]; @@ -78,7 +78,7 @@ impl Default for Qwen3VlImageProcessor { /// Output of one image's processing run. #[derive(Debug, Clone)] pub struct ProcessedImage { - /// Flattened patches: shape (grid_h * grid_w, channel * temporal * patch²). + /// Flattened patches: shape (`grid_h` * `grid_w`, channel * temporal * patch²). pub pixel_values: Array2, /// `[1, grid_h, grid_w]` — temporal × height × width patch count. pub image_grid_thw: [u32; 3], @@ -96,7 +96,7 @@ impl Qwen3VlImageProcessor { /// `factor = patch_size * merge_size` (28 by default). pub fn smart_resize(&self, height: u32, width: u32) -> Result<(u32, u32), RenderError> { let factor = self.patch_size * self.merge_size; - let (h, w) = (height as f64, width as f64); + let (h, w) = (f64::from(height), f64::from(width)); let max_dim = h.max(w); let min_dim = h.min(w); if min_dim == 0.0 { @@ -108,12 +108,12 @@ impl Qwen3VlImageProcessor { max_dim / min_dim ))); } - let f = factor as f64; + let f = f64::from(factor); let mut h_bar = (h / f).round() * f; let mut w_bar = (w / f).round() * f; - let max_pixels = self.max_pixels as f64; - let min_pixels = self.min_pixels as f64; + let max_pixels = f64::from(self.max_pixels); + let min_pixels = f64::from(self.min_pixels); if h_bar * w_bar > max_pixels { let beta = ((h * w) / max_pixels).sqrt(); @@ -126,6 +126,8 @@ impl Qwen3VlImageProcessor { h_bar = ((h * beta) / f).ceil() * f; w_bar = ((w * beta) / f).ceil() * f; } + // smart_resize math keeps h_bar/w_bar positive (clamped to `f`). + #[allow(clippy::cast_sign_loss)] Ok((h_bar as u32, w_bar as u32)) } @@ -173,7 +175,7 @@ impl Qwen3VlImageProcessor { for x in 0..w { let p = resized.get_pixel(x as u32, y as u32); for c in 0..3 { - let v = (p[c] as f32) * self.rescale_factor; + let v = f32::from(p[c]) * self.rescale_factor; chw[(c, y, x)] = (v - self.image_mean[c]) / self.image_std[c]; } } diff --git a/crates/renderers-core/src/processing/resolver.rs b/crates/renderers-core/src/processing/resolver.rs index 8411057..1685e5b 100644 --- a/crates/renderers-core/src/processing/resolver.rs +++ b/crates/renderers-core/src/processing/resolver.rs @@ -79,9 +79,8 @@ impl MediaResolver for Qwen3VlResolver { fn resolve_image(&self, source: &MediaSource<'_>) -> Result { let bytes: Vec = match source { MediaSource::Bytes(b) => b.to_vec(), - MediaSource::Path(p) => { - fs::read(p).map_err(|e| RenderError::Invalid(format!("read image {p:?}: {e}")))? - } + MediaSource::Path(p) => fs::read(p) + .map_err(|e| RenderError::Invalid(format!("read image {}: {e}", p.display())))?, MediaSource::Url(_) => { return Err(RenderError::Invalid( "URL sources require an async fetch — pass already-downloaded bytes instead" diff --git a/crates/renderers-core/src/registry.rs b/crates/renderers-core/src/registry.rs index 5b8a193..acff732 100644 --- a/crates/renderers-core/src/registry.rs +++ b/crates/renderers-core/src/registry.rs @@ -48,7 +48,7 @@ pub struct RendererConfig { pub fn create_renderer( kind: RendererKind, tokenizer: Tokenizer, - cfg: RendererConfig, + cfg: &RendererConfig, ) -> Result, RenderError> { match kind { RendererKind::Qwen3 => Ok(Box::new( diff --git a/crates/renderers-core/src/types.rs b/crates/renderers-core/src/types.rs index 98afbfd..0d4d3b2 100644 --- a/crates/renderers-core/src/types.rs +++ b/crates/renderers-core/src/types.rs @@ -1,7 +1,7 @@ //! Core data types for renderers. //! //! The shapes mirror the Python `renderers.base` types so JSON round-trips -//! and PyO3 wrapping stay mechanical. Strings are owned (`String`) — PyO3 +//! and `PyO3` wrapping stay mechanical. Strings are owned (`String`) — `PyO3` //! always materialises strings on entry, so `Cow<'a, str>` would only //! propagate lifetimes for no win. The few `&str` borrows that pay off are //! taken locally inside renderer implementations from `&[Message]` slices. @@ -108,7 +108,7 @@ pub struct ToolCallFunction { pub arguments: ToolArguments, } -/// Structured tool invocation in OpenAI function-calling format. +/// Structured tool invocation in `OpenAI` function-calling format. #[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] pub struct ToolCall { #[serde(default = "default_tool_type", rename = "type")] @@ -122,7 +122,7 @@ fn default_tool_type() -> String { "function".to_string() } -/// Tool specification (OpenAI function-calling format) passed to +/// Tool specification (`OpenAI` function-calling format) passed to /// [`Renderer::render`](crate::Renderer::render). #[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] pub struct ToolSpec { @@ -167,7 +167,7 @@ impl Message { } /// Tool-call argument payload. The JSON-object case is the common path; -/// the raw-string case preserves the OpenAI quirk where some clients +/// the raw-string case preserves the `OpenAI` quirk where some clients /// pre-serialise arguments to a string. #[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(untagged)] @@ -178,7 +178,7 @@ pub enum ToolArguments { impl Default for ToolArguments { fn default() -> Self { - ToolArguments::Object(serde_json::Value::Object(Default::default())) + ToolArguments::Object(serde_json::Value::Object(serde_json::Map::new())) } } @@ -213,7 +213,7 @@ pub struct MultiModalData { pub mm_placeholders: std::collections::BTreeMap>, /// Per-item processor outputs. The values are passed through as opaque /// JSON to keep this crate framework-agnostic; vision processors live - /// behind the PyO3 boundary in the current Phase 1 design. + /// behind the `PyO3` boundary in the current Phase 1 design. #[serde(default)] pub mm_items: std::collections::BTreeMap>, } @@ -281,7 +281,7 @@ pub enum ToolCallParseStatus { impl ToolCallParseStatus { /// Wire string matching the Python enum values - /// (`"ok" | "invalid_json" | ...`) so PyO3 can round-trip them. + /// (`"ok" | "invalid_json" | ...`) so `PyO3` can round-trip them. pub fn as_wire(&self) -> &'static str { match self { Self::Ok => "ok", diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index a6b2353..2047a00 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -5,7 +5,7 @@ //! pyclasses wrap `RenderedTokens` / `ParsedResponse` / `ParsedToolCall` //! with `#[getter]` accessors. Argument unpacking is done by //! `pythonize` so callers can pass plain dicts / lists for messages and -//! tools without per-field PyO3 conversion. +//! tools without per-field `PyO3` conversion. use std::sync::Arc; @@ -28,6 +28,9 @@ use renderers_core::types::{ ToolSpec, }; +// Kept by-value so call sites can use the bare fn pointer +// `.map_err(render_err)` (closures would be needed for `&E`). +#[allow(clippy::needless_pass_by_value)] fn render_err(e: renderers_core::types::RenderError) -> PyErr { PyRuntimeError::new_err(e.to_string()) } @@ -80,9 +83,8 @@ fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult> fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { let value: serde_json::Value = pythonize::depythonize(obj) .map_err(|e| invalid(format!("media must be a list of dicts: {e}")))?; - let arr = match value { - serde_json::Value::Array(a) => a, - _ => return Err(invalid("media must be a list")), + let serde_json::Value::Array(arr) = value else { + return Err(invalid("media must be a list")); }; let mut bundle = MediaBundle::new(); for item in arr { @@ -136,10 +138,8 @@ fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { let mut out = Vec::with_capacity(list.len()); for item in list.iter() { let v: i64 = item.extract()?; - if v < 0 || v > u32::MAX as i64 { - return Err(invalid(format!("token id out of range: {v}"))); - } - out.push(v as u32); + let id = u32::try_from(v).map_err(|_| invalid(format!("token id out of range: {v}")))?; + out.push(id); } Ok(out) } @@ -161,7 +161,7 @@ impl PyRenderedTokens { // Cast u32 -> i64 for Python `int` compatibility. PyList::new is // the fastest path; per-element extract is unavoidable until // numpy support is added. - PyList::new_bound(py, self.inner.token_ids.iter().map(|&t| t as i64)) + PyList::new_bound(py, self.inner.token_ids.iter().copied().map(i64::from)) } #[getter] @@ -566,7 +566,7 @@ impl PyRenderer { }) } - /// Build a MiniMax M2 / M2.5 renderer from a tokenizer.json. + /// Build a `MiniMax` M2 / M2.5 renderer from a tokenizer.json. #[classmethod] #[pyo3(signature = ( tokenizer_path, @@ -595,7 +595,7 @@ impl PyRenderer { }) } - /// Build a DefaultRenderer (Jinja fallback via minijinja). + /// Build a `DefaultRenderer` (Jinja fallback via minijinja). /// /// `chat_template` is the model's Jinja chat template (usually the /// `chat_template` field of `tokenizer_config.json` or the contents @@ -646,7 +646,7 @@ impl PyRenderer { /// Build a GPT-OSS (Harmony) renderer. /// - /// Unlike the other families, GPT-OSS doesn't need a HuggingFace + /// Unlike the other families, GPT-OSS doesn't need a `HuggingFace` /// `tokenizer.json` — the harmony encoding embeds its own /// tiktoken-based tokenizer. The `tokenizer_path` argument is /// ignored on this path but kept for API uniformity with the other @@ -806,7 +806,7 @@ impl PyRenderer { }) } - /// Build a DeepSeek V3 renderer from a tokenizer.json. + /// Build a `DeepSeek` V3 renderer from a tokenizer.json. /// /// `enable_thinking=True` (default) prefills the generation prompt /// with `\n` to trigger reasoning. The Python shim mirrors @@ -863,7 +863,7 @@ impl PyRenderer { let ids = py .detach(move || renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt)) .map_err(render_err)?; - Ok(PyList::new_bound(py, ids.iter().map(|&t| t as i64))) + Ok(PyList::new_bound(py, ids.iter().copied().map(i64::from))) } fn parse_response( @@ -878,7 +878,10 @@ impl PyRenderer { } fn get_stop_token_ids<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { - PyList::new_bound(py, self.inner.stop_token_ids().iter().map(|&t| t as i64)) + PyList::new_bound( + py, + self.inner.stop_token_ids().iter().copied().map(i64::from), + ) } /// Render with pre-resolved multimodal media items. @@ -952,11 +955,11 @@ impl PyRenderer { /// Rust port of HF's `Qwen3VLImageProcessor` / `Qwen2VLImageProcessor`. /// -/// Decodes image bytes, smart-resizes, normalises with the OpenAI CLIP +/// Decodes image bytes, smart-resizes, normalises with the `OpenAI` CLIP /// mean / std, and produces `pixel_values` + `image_grid_thw` tensors /// in the exact shape the model expects. Equivalent to the Python -/// processor end-to-end; pixel-byte parity is approximate (CatmullRom -/// vs PIL bicubic), but grid dims, num_tokens, and tensor shape match +/// processor end-to-end; pixel-byte parity is approximate (`CatmullRom` +/// vs PIL bicubic), but grid dims, `num_tokens`, and tensor shape match /// exactly. #[pyclass(name = "Qwen3VlImageProcessor", module = "renderers_native")] struct PyQwen3VlImageProcessor { @@ -1072,7 +1075,7 @@ fn processed_to_pyobject<'py>(py: Python<'py>, p: ProcessedImage) -> PyResult> = p.pixel_values.into_pyarray(py); let grid_array: Bound<'py, PyArray2> = ndarray::Array2::from_shape_vec( (1, 3), - p.image_grid_thw.iter().map(|&v| v as i64).collect(), + p.image_grid_thw.iter().copied().map(i64::from).collect(), ) .expect("image_grid_thw is always shape [1,3]") .into_pyarray(py); diff --git a/renderers/_native_router.py b/renderers/_native_router.py index 42df709..0e687b0 100644 --- a/renderers/_native_router.py +++ b/renderers/_native_router.py @@ -121,7 +121,7 @@ def resolve_tokenizer_path(tokenizer: Any) -> str: ) cached = try_to_load_from_cache(repo_id=name_or_path, filename="tokenizer.json") - if cached is None or cached is False: + if not isinstance(cached, (str, os.PathLike)): raise ValueError( f"tokenizer.json not available in the local HF cache for {name_or_path}. " "Run `snapshot_download` first or pass an explicit path." diff --git a/renderers/gpt_oss.py b/renderers/gpt_oss.py index 99b9fc9..5311d1e 100644 --- a/renderers/gpt_oss.py +++ b/renderers/gpt_oss.py @@ -54,7 +54,6 @@ from renderers._native_router import ( load_native, native_enabled, - resolve_tokenizer_path, ) from renderers.base import ( Message, diff --git a/tests/test_client.py b/tests/test_client.py index 1cc1000..79c9c52 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -299,6 +299,18 @@ def test_generate_serializes_multimodal_features_for_qwen_vl_family( pytest.importorskip("torch") pytest.importorskip("vllm", reason="vllm needed for features serialization") + _pytest.importorskip( + "vllm.entrypoints.serve.disagg.mm_serde", + reason="vLLM multimodal serializer is not available", + ) + _pytest.importorskip( + "vllm.model_executor.models.qwen2_vl", + reason="vLLM Qwen-VL field factory is not available", + ) + _pytest.importorskip( + "vllm.multimodal.inputs", + reason="vLLM multimodal input wrappers are not available", + ) import torch as _torch from renderers.base import ( diff --git a/tests/test_native_router.py b/tests/test_native_router.py index 2a73ad5..12dc140 100644 --- a/tests/test_native_router.py +++ b/tests/test_native_router.py @@ -9,6 +9,8 @@ class surface. from __future__ import annotations import os +import sys +from types import SimpleNamespace from unittest import mock import pytest @@ -74,6 +76,32 @@ def test_resolve_tokenizer_path_from_exact_file(tmp_path): assert router.resolve_tokenizer_path(str(f)) == str(f) +def test_resolve_tokenizer_path_rejects_hf_missing_sentinel(monkeypatch): + tokenizer = SimpleNamespace(name_or_path="org/custom-tokenizer") + fake_hf = SimpleNamespace(try_to_load_from_cache=lambda **_kwargs: object()) + monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hf) + + with pytest.raises(ValueError, match="tokenizer.json not available"): + router.resolve_tokenizer_path(tokenizer) + + +def test_kimi_k2_constructor_falls_back_without_tokenizer_path(monkeypatch): + from renderers.kimi_k2 import KimiK2Renderer + + fake_native = mock.Mock() + monkeypatch.setattr("renderers.kimi_k2.native_enabled", lambda _family: True) + monkeypatch.setattr("renderers.kimi_k2.load_native", lambda: fake_native) + monkeypatch.setattr( + "renderers.kimi_k2.try_resolve_tokenizer_path", + lambda _tokenizer, _family: None, + ) + + inst = KimiK2Renderer.__new__(KimiK2Renderer, object()) + + assert isinstance(inst, KimiK2Renderer) + fake_native.Renderer.kimi_k2.assert_not_called() + + def test_kimi_k25_constructor_does_not_route_eagerly(monkeypatch): from renderers.kimi_k25 import KimiK25Renderer diff --git a/tests/test_renderer_e2e.py b/tests/test_renderer_e2e.py new file mode 100644 index 0000000..11eedf8 --- /dev/null +++ b/tests/test_renderer_e2e.py @@ -0,0 +1,74 @@ +"""Backend-free end-to-end renderer flow tests. + +These tests simulate the token-in/token-out control loop without launching +vLLM, SGLang, Transformers generation, or Tinker. They cover the glue between +``render_ids``, ``parse_response``, and ``bridge_to_next_turn`` so the examples +have a local parity check for the renderer-owned part of the stack. +""" + +from __future__ import annotations + + +def test_renderer_owned_two_turn_flow_preserves_sampled_prefix(): + from renderers import create_renderer + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer("Qwen/Qwen3.5-9B") + renderer = create_renderer(tokenizer, renderer="auto") + + messages = [ + {"role": "system", "content": "You are concise."}, + {"role": "user", "content": "Say hello."}, + ] + assistant = {"role": "assistant", "content": "Hello."} + + prompt_ids = renderer.render_ids(messages, add_generation_prompt=True) + full_ids = renderer.render_ids(messages + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + parsed = renderer.parse_response(completion_ids) + assert "Hello" in parsed.content + + bridged = renderer.bridge_to_next_turn( + prompt_ids, + completion_ids, + [{"role": "user", "content": "Now say bye."}], + ) + assert bridged is not None + bridged_ids = list(bridged.token_ids) + expected_prefix = prompt_ids + completion_ids + assert bridged_ids[: len(expected_prefix)] == expected_prefix + + +def test_default_renderer_fallback_keeps_raw_decoded_completion_prefix(): + """DefaultRenderer cannot bridge, so callers fall back to a full render. + + The fallback must use raw decoded completion bytes, not parse-normalized + assistant structure. For round-tripping tokenizers, that preserves the + sampled assistant prefix even though the bridge API correctly returns + ``None``. + """ + + from renderers import create_renderer + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer("Qwen/Qwen2.5-0.5B-Instruct") + renderer = create_renderer(tokenizer, renderer="default") + + messages = [{"role": "user", "content": "Say hello."}] + assistant = {"role": "assistant", "content": "HELLO_SENTINEL"} + new_messages = [{"role": "user", "content": "Now say bye."}] + + prompt_ids = renderer.render_ids(messages, add_generation_prompt=True) + full_ids = renderer.render_ids(messages + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + assert renderer.bridge_to_next_turn(prompt_ids, completion_ids, new_messages) is None + + raw_completion = tokenizer.decode(completion_ids, skip_special_tokens=False) + fallback_ids = renderer.render_ids( + messages + [{"role": "assistant", "content": raw_completion}] + new_messages, + add_generation_prompt=True, + ) + expected_prefix = prompt_ids + completion_ids + assert fallback_ids[: len(expected_prefix)] == expected_prefix From 0f5dd5051abe6d5049c2ef28cf9bf85077d33b54 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:56:23 +0200 Subject: [PATCH 20/35] Enable Kimi native tokenizer parity --- crates/renderers-core/src/families/kimi_k2.rs | 45 +++++++++- renderers/_native_router.py | 85 +++++++++++++++++-- tests/test_native_router.py | 17 ++++ 3 files changed, 138 insertions(+), 9 deletions(-) diff --git a/crates/renderers-core/src/families/kimi_k2.rs b/crates/renderers-core/src/families/kimi_k2.rs index 16c148e..7fe0989 100644 --- a/crates/renderers-core/src/families/kimi_k2.rs +++ b/crates/renderers-core/src/families/kimi_k2.rs @@ -138,7 +138,7 @@ impl KimiK2Renderer { // for sort_keys behaviour we use a BTreeMap-backed Value tree. // serde_json's `serialize` of a BTreeMap sorts keys by Ord. use std::collections::BTreeMap; - let mut arr: Vec> = Vec::with_capacity(tools.len()); + let mut arr: Vec = Vec::with_capacity(tools.len()); for tool in tools { let mut m: BTreeMap = BTreeMap::new(); m.insert("name".into(), serde_json::Value::String(tool.name.clone())); @@ -147,7 +147,17 @@ impl KimiK2Renderer { serde_json::Value::String(tool.description.clone()), ); m.insert("parameters".into(), Self::sort_keys(&tool.parameters)); - arr.push(m); + if tool.openai_envelope { + let mut envelope: BTreeMap = BTreeMap::new(); + envelope.insert( + "function".into(), + serde_json::to_value(m).unwrap_or_default(), + ); + envelope.insert("type".into(), serde_json::Value::String("function".into())); + arr.push(serde_json::to_value(envelope).unwrap_or_default()); + } else { + arr.push(serde_json::to_value(m).unwrap_or_default()); + } } serde_json::to_string(&arr).unwrap_or_else(|_| "[]".to_string()) } @@ -172,7 +182,36 @@ impl KimiK2Renderer { fn args_to_string(args: &ToolArguments) -> String { match args { ToolArguments::Raw(s) => s.clone(), - ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_else(|_| "{}".into()), + ToolArguments::Object(v) => Self::json_dumps_default(v), + } + } + + fn json_dumps_default(v: &serde_json::Value) -> String { + match v { + serde_json::Value::Null => "null".to_string(), + serde_json::Value::Bool(b) => b.to_string(), + serde_json::Value::Number(n) => n.to_string(), + serde_json::Value::String(s) => serde_json::to_string(s).unwrap_or_default(), + serde_json::Value::Array(values) => { + let inner = values + .iter() + .map(Self::json_dumps_default) + .collect::>() + .join(", "); + format!("[{inner}]") + } + serde_json::Value::Object(values) => { + let inner = values + .iter() + .map(|(key, value)| { + let key = serde_json::to_string(key).unwrap_or_default(); + let value = Self::json_dumps_default(value); + format!("{key}: {value}") + }) + .collect::>() + .join(", "); + format!("{{{inner}}}") + } } } diff --git a/renderers/_native_router.py b/renderers/_native_router.py index 0e687b0..726af70 100644 --- a/renderers/_native_router.py +++ b/renderers/_native_router.py @@ -19,6 +19,7 @@ from __future__ import annotations import hashlib +import json import logging import os import tempfile @@ -30,6 +31,18 @@ _NATIVE_MODULE: Any | None = None _NATIVE_LOAD_ATTEMPTED = False _ALL_EXCLUDED = {"default"} +_KIMI_TIKTOKEN_PATTERN = "|".join( + [ + r"""[\p{Han}]+""", + r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""", + r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""", + r"""\p{N}{1,3}""", + r""" ?[^\s\p{L}\p{N}]+[\r\n]*""", + r"""\s*[\r\n]+""", + r"""\s+(?!\S)""", + r"""\s+""", + ] +) def native_enabled(family: str) -> bool: @@ -121,12 +134,72 @@ def resolve_tokenizer_path(tokenizer: Any) -> str: ) cached = try_to_load_from_cache(repo_id=name_or_path, filename="tokenizer.json") - if not isinstance(cached, (str, os.PathLike)): - raise ValueError( - f"tokenizer.json not available in the local HF cache for {name_or_path}. " - "Run `snapshot_download` first or pass an explicit path." - ) - return str(cached) + if isinstance(cached, (str, os.PathLike)): + return str(cached) + + exported = _export_tiktoken_tokenizer_json(name_or_path, try_to_load_from_cache) + if exported is not None: + return exported + + raise ValueError( + f"tokenizer.json not available in the local HF cache for {name_or_path}. " + "Run `snapshot_download` first or pass an explicit path." + ) + + +def _export_tiktoken_tokenizer_json( + repo_id: str, + try_to_load_from_cache: Any, +) -> str | None: + """Export Kimi's tiktoken tokenizer to a native-loadable tokenizer.json.""" + tiktoken_model = try_to_load_from_cache(repo_id=repo_id, filename="tiktoken.model") + tokenizer_config = try_to_load_from_cache( + repo_id=repo_id, filename="tokenizer_config.json" + ) + if not isinstance(tiktoken_model, (str, os.PathLike)) or not isinstance( + tokenizer_config, (str, os.PathLike) + ): + return None + + config_path = Path(tokenizer_config) + model_path = Path(tiktoken_model) + config = json.loads(config_path.read_text(encoding="utf-8")) + if config.get("tokenizer_class") != "TikTokenTokenizer": + return None + + added = { + int(idx): value["content"] + for idx, value in config.get("added_tokens_decoder", {}).items() + } + if not added: + return None + + base_id = min(added) + special_tokens = [ + added.get(idx, f"<|reserved_token_{idx}|>") + for idx in range(base_id, base_id + 256) + ] + digest = hashlib.sha256() + digest.update(model_path.read_bytes()) + digest.update(config_path.read_bytes()) + digest.update(_KIMI_TIKTOKEN_PATTERN.encode("utf-8")) + cache_dir = Path(tempfile.gettempdir()) / "renderers-tokenizers" + cache_dir.mkdir(parents=True, exist_ok=True) + out = cache_dir / f"tiktoken-{digest.hexdigest()}.json" + if out.exists(): + return str(out) + + from transformers.convert_slow_tokenizer import TikTokenConverter + + converted = TikTokenConverter( + vocab_file=str(model_path), + pattern=_KIMI_TIKTOKEN_PATTERN, + extra_special_tokens=special_tokens, + ).converted() + tmp = out.with_suffix(".tmp") + converted.save(str(tmp)) + tmp.replace(out) + return str(out) def try_resolve_tokenizer_path(tokenizer: Any, family: str) -> str | None: diff --git a/tests/test_native_router.py b/tests/test_native_router.py index 12dc140..06fdf47 100644 --- a/tests/test_native_router.py +++ b/tests/test_native_router.py @@ -85,6 +85,23 @@ def test_resolve_tokenizer_path_rejects_hf_missing_sentinel(monkeypatch): router.resolve_tokenizer_path(tokenizer) +def test_resolve_tokenizer_path_uses_tiktoken_export(monkeypatch, tmp_path): + tokenizer = SimpleNamespace(name_or_path="moonshotai/Kimi-K2-Instruct") + fake_hf = SimpleNamespace(try_to_load_from_cache=lambda **_kwargs: object()) + exported = tmp_path / "tokenizer.json" + exported.write_text("{}") + monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hf) + monkeypatch.setattr( + router, + "_export_tiktoken_tokenizer_json", + lambda repo_id, _loader: str(exported) + if repo_id == "moonshotai/Kimi-K2-Instruct" + else None, + ) + + assert router.resolve_tokenizer_path(tokenizer) == str(exported) + + def test_kimi_k2_constructor_falls_back_without_tokenizer_path(monkeypatch): from renderers.kimi_k2 import KimiK2Renderer From df2a5f2ece9a2d73bf08c0a0bba49212814a4b51 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:56:59 +0200 Subject: [PATCH 21/35] Add native runtime benchmark --- benchmarks/native_vs_python_qwen3.py | 209 +++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 benchmarks/native_vs_python_qwen3.py diff --git a/benchmarks/native_vs_python_qwen3.py b/benchmarks/native_vs_python_qwen3.py new file mode 100644 index 0000000..377bf9c --- /dev/null +++ b/benchmarks/native_vs_python_qwen3.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# /// script +# requires-python = ">=3.10,<3.14" +# dependencies = [ +# "transformers>=4.50.0", +# ] +# /// +"""Compare Qwen3 pure-Python renderer latency with the native PyO3 path. + +Run from a checkout after building the native extension: + + uv run --with maturin maturin develop \ + --manifest-path crates/renderers-py/Cargo.toml --release + uv run python benchmarks/native_vs_python_qwen3.py + +The benchmark intentionally uses the public Python APIs on both sides. That +means native timings include PyO3 boundary and Python object conversion costs, +which is the relevant number for Python callers. Use the Criterion bench for +pure Rust hot-path timings. +""" + +from __future__ import annotations + +import argparse +import gc +import os +import statistics +import time +from collections.abc import Callable +from dataclasses import dataclass + +from renderers import _native_router as router +from renderers.base import load_tokenizer +from renderers.qwen3 import Qwen3Renderer + + +MESSAGES = [ + { + "role": "system", + "content": "You are a helpful assistant that calls tools when needed.", + }, + { + "role": "user", + "content": "Plan a weekend trip to Lisbon for two; we like food and walking.", + }, + { + "role": "assistant", + "content": "I'll help. First, let me check the weather and find some restaurants.", + }, + {"role": "user", "content": "Sounds good - go ahead."}, + { + "role": "assistant", + "content": ( + "Here's a plan: Friday evening tapas at Time Out Market, Saturday " + "morning walk through Alfama, Saturday lunch at Ramiro (seafood), " + "Saturday afternoon Belem pasteis, Sunday morning Sao Jorge castle, " + "Sunday lunch at Cervejaria Trindade." + ), + }, +] + +NEW_MESSAGES = [ + {"role": "user", "content": "Add a kid-friendly option for Sunday morning."} +] + + +@dataclass(frozen=True) +class Timing: + loops: int + median_ns: float + min_ns: float + max_ns: float + + @property + def median_us(self) -> float: + return self.median_ns / 1_000.0 + + +def time_case( + fn: Callable[[], object], + *, + min_time_s: float, + repeats: int, +) -> Timing: + loops = 1 + while True: + start = time.perf_counter_ns() + for _ in range(loops): + fn() + elapsed_s = (time.perf_counter_ns() - start) / 1_000_000_000 + if elapsed_s >= min_time_s: + break + loops *= 2 + + samples: list[float] = [] + for _ in range(repeats): + start = time.perf_counter_ns() + for _ in range(loops): + fn() + samples.append((time.perf_counter_ns() - start) / loops) + + return Timing( + loops=loops, + median_ns=statistics.median(samples), + min_ns=min(samples), + max_ns=max(samples), + ) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="Qwen/Qwen3-8B") + parser.add_argument("--min-time", type=float, default=0.35) + parser.add_argument("--repeats", type=int, default=7) + args = parser.parse_args() + + os.environ.pop("RENDERERS_NATIVE", None) + tokenizer = load_tokenizer(args.model) + tokenizer_path = router.resolve_tokenizer_path(tokenizer) + + native = router.load_native() + if native is None: + raise RuntimeError( + "renderers_native is not built; run `uv run --with maturin maturin " + "develop --manifest-path crates/renderers-py/Cargo.toml --release`" + ) + + py_renderer = Qwen3Renderer(tokenizer) + native_renderer = native.Renderer.qwen3(tokenizer_path) + + py_ids = py_renderer.render_ids(MESSAGES, add_generation_prompt=True) + native_ids = list(native_renderer.render_ids(MESSAGES, add_generation_prompt=True)) + if py_ids != native_ids: + raise AssertionError("render_ids parity failed before benchmarking") + + prompt_messages = MESSAGES[:-1] + assistant_message = MESSAGES[-1:] + prev_prompt = py_renderer.render_ids(prompt_messages, add_generation_prompt=True) + full = py_renderer.render_ids(prompt_messages + assistant_message) + prev_completion = full[len(prev_prompt) :] + if not prev_completion: + raise AssertionError("benchmark fixture produced an empty completion") + + native_prev_prompt = list( + native_renderer.render_ids(prompt_messages, add_generation_prompt=True) + ) + if native_prev_prompt != prev_prompt: + raise AssertionError("prompt parity failed before benchmarking") + + py_bridge = py_renderer.bridge_to_next_turn( + prev_prompt, prev_completion, NEW_MESSAGES + ) + native_bridge = native_renderer.bridge_to_next_turn( + prev_prompt, prev_completion, NEW_MESSAGES + ) + if py_bridge is None or native_bridge is None: + raise AssertionError("bridge fixture unexpectedly returned None") + if list(py_bridge.token_ids) != list(native_bridge.token_ids): + raise AssertionError("bridge parity failed before benchmarking") + + parsed = py_renderer.parse_response(py_ids) + native_parsed = native_renderer.parse_response(py_ids) + if parsed.content != native_parsed.content: + raise AssertionError("parse_response parity failed before benchmarking") + + cases: list[tuple[str, Callable[[object], Callable[[], object]]]] = [ + ( + "render_ids", + lambda r: lambda: r.render_ids(MESSAGES, add_generation_prompt=True), + ), + ("parse_response", lambda r: lambda: r.parse_response(py_ids)), + ( + "bridge_to_next_turn", + lambda r: lambda: r.bridge_to_next_turn( + prev_prompt, prev_completion, NEW_MESSAGES + ), + ), + ] + + gc.collect() + gc.disable() + try: + rows = [] + for name, make in cases: + py_timing = time_case( + make(py_renderer), min_time_s=args.min_time, repeats=args.repeats + ) + native_timing = time_case( + make(native_renderer), min_time_s=args.min_time, repeats=args.repeats + ) + rows.append((name, py_timing, native_timing)) + finally: + gc.enable() + + print(f"model={args.model}") + print(f"tokenizer_path={tokenizer_path}") + print() + print("| operation | python us | native us | speedup |") + print("|---|---:|---:|---:|") + for name, py_timing, native_timing in rows: + speedup = py_timing.median_ns / native_timing.median_ns + print( + f"| `{name}` | {py_timing.median_us:.3f} | " + f"{native_timing.median_us:.3f} | {speedup:.2f}x |" + ) + + +if __name__ == "__main__": + main() From 9042447b1b1a15e921c29f7b8238fc54d962bbf3 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:57:23 +0200 Subject: [PATCH 22/35] Expand native runtime benchmark --- benchmarks/native_vs_python_qwen3.py | 654 ++++++++++++++++++++++++--- 1 file changed, 597 insertions(+), 57 deletions(-) diff --git a/benchmarks/native_vs_python_qwen3.py b/benchmarks/native_vs_python_qwen3.py index 377bf9c..b9ecf93 100644 --- a/benchmarks/native_vs_python_qwen3.py +++ b/benchmarks/native_vs_python_qwen3.py @@ -23,18 +23,21 @@ import argparse import gc +import json import os import statistics import time +import tracemalloc from collections.abc import Callable from dataclasses import dataclass +from typing import Any, cast from renderers import _native_router as router -from renderers.base import load_tokenizer +from renderers.base import Message, ToolSpec, load_tokenizer from renderers.qwen3 import Qwen3Renderer -MESSAGES = [ +MESSAGES: list[Message] = [ { "role": "system", "content": "You are a helpful assistant that calls tools when needed.", @@ -59,11 +62,101 @@ }, ] -NEW_MESSAGES = [ +NEW_MESSAGES: list[Message] = [ {"role": "user", "content": "Add a kid-friendly option for Sunday morning."} ] +TOOLS = cast( + list[ToolSpec], + [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a city.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"}, + "units": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["city"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search_places", + "description": "Find places matching a set of constraints.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + "query": {"type": "string"}, + "filters": { + "type": "object", + "properties": { + "kid_friendly": {"type": "boolean"}, + "max_walk_minutes": {"type": "integer"}, + "tags": { + "type": "array", + "items": {"type": "string"}, + }, + }, + }, + }, + "required": ["city", "query"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "book_table", + "description": "Create a restaurant booking request.", + "parameters": { + "type": "object", + "properties": { + "restaurant": {"type": "string"}, + "party_size": {"type": "integer"}, + "time": {"type": "string"}, + "notes": {"type": "string"}, + }, + "required": ["restaurant", "party_size", "time"], + }, + }, + }, + ], +) + + +@dataclass(frozen=True) +class RenderScenario: + name: str + messages: list[Message] + tools: list[ToolSpec] | None = None + add_generation_prompt: bool = False + + +@dataclass(frozen=True) +class ParseScenario: + name: str + prompt: list[Message] + assistant: Message + tools: list[ToolSpec] | None = None + + +@dataclass(frozen=True) +class BridgeScenario: + name: str + prompt: list[Message] + assistant: Message + new_messages: list[Message] + tools: list[ToolSpec] | None = None + + @dataclass(frozen=True) class Timing: loops: int @@ -76,6 +169,291 @@ def median_us(self) -> float: return self.median_ns / 1_000.0 +@dataclass(frozen=True) +class Memory: + loops: int + peak_bytes: int + + @property + def peak_kib(self) -> float: + return self.peak_bytes / 1024 + + @property + def per_call_bytes(self) -> float: + return self.peak_bytes / self.loops + + +def _long_history(rounds: int = 18) -> list[Message]: + messages: list[Message] = [ + { + "role": "system", + "content": ( + "You are an itinerary planner. Preserve constraints, cite tradeoffs, " + "and keep tool observations separate from recommendations." + ), + } + ] + for idx in range(rounds): + messages.append( + { + "role": "user", + "content": ( + f"Leg {idx}: compare museum, food, and walking options. " + f"We have budget band {idx % 4}, transit pass {idx % 3}, " + "and one traveler who avoids late dinners." + ), + } + ) + messages.append( + { + "role": "assistant", + "content": ( + f"For leg {idx}, start with a walkable cluster, keep the meal " + "close to transit, and leave a fallback indoor option. " + "The strongest tradeoff is time certainty versus variety." + ), + } + ) + messages.append( + { + "role": "user", + "content": "Now produce the final plan with the best three swaps.", + } + ) + return messages + + +def _reasoning_history(rounds: int = 10) -> list[Message]: + messages: list[Message] = [ + {"role": "system", "content": "You are concise but keep prior reasoning."} + ] + for idx in range(rounds): + messages.append({"role": "user", "content": f"Score option {idx}."}) + messages.append( + { + "role": "assistant", + "reasoning_content": ( + f"Option {idx} has a distance score of {idx % 5}, a food score " + f"of {(idx + 2) % 5}, and a weather risk score of {(idx + 3) % 5}." + ), + "content": f"Option {idx}: viable with one caveat.", + } + ) + return messages + + +def _tool_cycle_messages() -> list[Message]: + return [ + { + "role": "system", + "content": "You can call tools and then summarize the result.", + }, + {"role": "user", "content": "Plan Sunday morning in Lisbon with weather."}, + { + "role": "assistant", + "content": "I will check weather and candidate places.", + "tool_calls": [ + { + "id": "call_weather", + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"city": "Lisbon", "units": "celsius"}, + }, + }, + { + "id": "call_places", + "type": "function", + "function": { + "name": "search_places", + "arguments": { + "city": "Lisbon", + "query": "kid friendly Sunday morning", + "filters": { + "kid_friendly": True, + "max_walk_minutes": 20, + "tags": ["parks", "pastries", "views"], + }, + }, + }, + }, + ], + }, + {"role": "tool", "name": "get_weather", "content": '{"temp": 19, "rain": 0.1}'}, + { + "role": "tool", + "name": "search_places", + "content": json.dumps( + { + "places": [ + {"name": "Jardim da Estrela", "walk_minutes": 12}, + {"name": "Manteigaria", "walk_minutes": 18}, + ] + }, + ensure_ascii=False, + ), + }, + { + "role": "assistant", + "content": "Use Jardim da Estrela first, then pastries if the weather holds.", + }, + ] + + +def _large_tool_only_messages() -> list[Message]: + return [ + { + "role": "system", + "content": "You are a travel operations assistant.", + }, + { + "role": "user", + "content": ( + "Use the available tools to build a food-first morning plan, " + "but only call tools if missing information blocks the answer." + ), + }, + ] + + +def render_scenarios() -> list[RenderScenario]: + return [ + RenderScenario("medium_gen_prompt", MESSAGES, add_generation_prompt=True), + RenderScenario( + "long_history_gen_prompt", + _long_history(), + add_generation_prompt=True, + ), + RenderScenario("reasoning_history", _reasoning_history()), + RenderScenario("tool_cycle_large_schema", _tool_cycle_messages(), tools=TOOLS), + RenderScenario( + "large_tools_gen_prompt", + _large_tool_only_messages(), + tools=TOOLS, + add_generation_prompt=True, + ), + ] + + +def parse_scenarios() -> list[ParseScenario]: + prompt: list[Message] = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Answer with the needed structure."}, + ] + return [ + ParseScenario( + "plain_content", + prompt, + {"role": "assistant", "content": "The answer is four."}, + ), + ParseScenario( + "reasoning_and_content", + prompt, + { + "role": "assistant", + "reasoning_content": ( + "The user asks for arithmetic, so compute two plus two." + ), + "content": "The answer is four.", + }, + ), + ParseScenario( + "multi_tool_call", + prompt, + { + "role": "assistant", + "content": "I will inspect the required details.", + "tool_calls": [ + { + "id": "call_weather", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"city":"Lisbon","units":"celsius"}', + }, + }, + { + "id": "call_places", + "type": "function", + "function": { + "name": "search_places", + "arguments": json.dumps( + { + "city": "Lisbon", + "query": "kid friendly Sunday morning", + "filters": { + "kid_friendly": True, + "max_walk_minutes": 20, + "tags": ["parks", "pastries", "views"], + }, + }, + separators=(",", ":"), + ), + }, + }, + ], + }, + tools=TOOLS, + ), + ParseScenario( + "long_content", + prompt, + { + "role": "assistant", + "content": " ".join( + f"Recommendation {idx}: keep the plan walkable and reversible." + for idx in range(80) + ), + }, + ), + ] + + +def bridge_scenarios() -> list[BridgeScenario]: + return [ + BridgeScenario( + "medium_extend_user", + MESSAGES[:-1], + MESSAGES[-1], + NEW_MESSAGES, + ), + BridgeScenario( + "long_history_extend_user", + _long_history(14)[:-1], + { + "role": "assistant", + "content": ( + "Here is the compressed plan: keep mornings flexible, cluster " + "food stops near transit, and reserve one indoor fallback." + ), + }, + [ + { + "role": "user", + "content": "Add one backup if rain starts before lunch.", + } + ], + ), + BridgeScenario( + "tool_response_extension", + _tool_cycle_messages()[:-1], + _tool_cycle_messages()[-1], + [ + { + "role": "tool", + "name": "book_table", + "content": '{"status": "waitlist", "eta_minutes": 15}', + }, + { + "role": "user", + "content": "Adjust if the restaurant is waitlisted.", + }, + ], + tools=TOOLS, + ), + ] + + def time_case( fn: Callable[[], object], *, @@ -107,11 +485,98 @@ def time_case( ) +def memory_case(fn: Callable[[], object], *, loops: int) -> Memory: + gc.collect() + tracemalloc.start() + try: + for _ in range(loops): + fn() + _current, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + return Memory(loops=loops, peak_bytes=peak) + + +def _as_ids(value: Any) -> list[int]: + if hasattr(value, "token_ids"): + return list(value.token_ids) + return list(value) + + +def _assert_parsed_equal(py_value: Any, native_value: Any) -> None: + if py_value.content != native_value.content: + raise AssertionError("parse_response content parity failed before benchmarking") + if (py_value.reasoning_content or None) != (native_value.reasoning_content or None): + raise AssertionError( + "parse_response reasoning parity failed before benchmarking" + ) + py_calls = py_value.tool_calls + native_calls = native_value.tool_calls + if len(py_calls) != len(native_calls): + raise AssertionError("parse_response tool-call count parity failed") + for py_call, native_call in zip(py_calls, native_calls, strict=True): + if ( + py_call.raw, + py_call.name, + py_call.arguments, + py_call.status, + ) != ( + native_call.raw, + native_call.name, + native_call.arguments, + native_call.status, + ): + raise AssertionError("parse_response tool-call parity failed") + + +def _completion_ids(renderer: Any, scenario: ParseScenario) -> list[int]: + prompt_ids = renderer.render_ids( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + full_ids = renderer.render_ids( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + completion = list(full_ids)[len(prompt_ids) :] + if not completion: + raise AssertionError(f"{scenario.name} produced an empty completion") + return completion + + +def _bridge_inputs( + renderer: Any, scenario: BridgeScenario +) -> tuple[list[int], list[int]]: + previous_prompt_ids = renderer.render_ids( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + full_ids = renderer.render_ids( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + previous_completion_ids = list(full_ids)[len(previous_prompt_ids) :] + if not previous_completion_ids: + raise AssertionError(f"{scenario.name} produced an empty completion") + return list(previous_prompt_ids), previous_completion_ids + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--model", default="Qwen/Qwen3-8B") parser.add_argument("--min-time", type=float, default=0.35) parser.add_argument("--repeats", type=int, default=7) + parser.add_argument( + "--memory-loops", + type=int, + default=1000, + help=( + "Iterations for tracemalloc peak measurement. This tracks Python " + "heap allocations, including PyO3 boundary objects, not Rust malloc." + ), + ) args = parser.parse_args() os.environ.pop("RENDERERS_NATIVE", None) @@ -128,81 +593,156 @@ def main() -> None: py_renderer = Qwen3Renderer(tokenizer) native_renderer = native.Renderer.qwen3(tokenizer_path) - py_ids = py_renderer.render_ids(MESSAGES, add_generation_prompt=True) - native_ids = list(native_renderer.render_ids(MESSAGES, add_generation_prompt=True)) - if py_ids != native_ids: - raise AssertionError("render_ids parity failed before benchmarking") - - prompt_messages = MESSAGES[:-1] - assistant_message = MESSAGES[-1:] - prev_prompt = py_renderer.render_ids(prompt_messages, add_generation_prompt=True) - full = py_renderer.render_ids(prompt_messages + assistant_message) - prev_completion = full[len(prev_prompt) :] - if not prev_completion: - raise AssertionError("benchmark fixture produced an empty completion") - - native_prev_prompt = list( - native_renderer.render_ids(prompt_messages, add_generation_prompt=True) - ) - if native_prev_prompt != prev_prompt: - raise AssertionError("prompt parity failed before benchmarking") + cases: list[tuple[str, str, int, Callable[[Any], Callable[[], object]]]] = [] - py_bridge = py_renderer.bridge_to_next_turn( - prev_prompt, prev_completion, NEW_MESSAGES - ) - native_bridge = native_renderer.bridge_to_next_turn( - prev_prompt, prev_completion, NEW_MESSAGES - ) - if py_bridge is None or native_bridge is None: - raise AssertionError("bridge fixture unexpectedly returned None") - if list(py_bridge.token_ids) != list(native_bridge.token_ids): - raise AssertionError("bridge parity failed before benchmarking") - - parsed = py_renderer.parse_response(py_ids) - native_parsed = native_renderer.parse_response(py_ids) - if parsed.content != native_parsed.content: - raise AssertionError("parse_response parity failed before benchmarking") - - cases: list[tuple[str, Callable[[object], Callable[[], object]]]] = [ - ( - "render_ids", - lambda r: lambda: r.render_ids(MESSAGES, add_generation_prompt=True), - ), - ("parse_response", lambda r: lambda: r.parse_response(py_ids)), - ( - "bridge_to_next_turn", - lambda r: lambda: r.bridge_to_next_turn( - prev_prompt, prev_completion, NEW_MESSAGES - ), - ), - ] + for scenario in render_scenarios(): + py_ids = _as_ids( + py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + native_ids = _as_ids( + native_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + if py_ids != native_ids: + raise AssertionError(f"{scenario.name} render_ids parity failed") + cases.append( + ( + "render_ids", + scenario.name, + len(py_ids), + lambda r, scenario=scenario: ( + lambda: r.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ), + ) + ) + + for scenario in parse_scenarios(): + py_completion_ids = _completion_ids(py_renderer, scenario) + native_completion_ids = _completion_ids(native_renderer, scenario) + if py_completion_ids != native_completion_ids: + raise AssertionError(f"{scenario.name} completion parity failed") + _assert_parsed_equal( + py_renderer.parse_response(py_completion_ids), + native_renderer.parse_response(py_completion_ids), + ) + cases.append( + ( + "parse_response", + scenario.name, + len(py_completion_ids), + lambda r, ids=py_completion_ids: lambda: r.parse_response(ids), + ) + ) + + for scenario in bridge_scenarios(): + prev_prompt, prev_completion = _bridge_inputs(py_renderer, scenario) + native_prev_prompt, native_prev_completion = _bridge_inputs( + native_renderer, scenario + ) + if ( + prev_prompt != native_prev_prompt + or prev_completion != native_prev_completion + ): + raise AssertionError(f"{scenario.name} bridge input parity failed") + py_bridge = py_renderer.bridge_to_next_turn( + prev_prompt, + prev_completion, + scenario.new_messages, + tools=scenario.tools, + ) + native_bridge = native_renderer.bridge_to_next_turn( + prev_prompt, + prev_completion, + scenario.new_messages, + tools=scenario.tools, + ) + if py_bridge is None or native_bridge is None: + raise AssertionError(f"{scenario.name} bridge unexpectedly returned None") + if list(py_bridge.token_ids) != list(native_bridge.token_ids): + raise AssertionError(f"{scenario.name} bridge parity failed") + cases.append( + ( + "bridge_to_next_turn", + scenario.name, + len(py_bridge.token_ids), + lambda r, scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + lambda: r.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) + ), + ) + ) gc.collect() gc.disable() try: rows = [] - for name, make in cases: + for operation, scenario, token_count, make in cases: py_timing = time_case( make(py_renderer), min_time_s=args.min_time, repeats=args.repeats ) native_timing = time_case( make(native_renderer), min_time_s=args.min_time, repeats=args.repeats ) - rows.append((name, py_timing, native_timing)) + py_memory = memory_case(make(py_renderer), loops=args.memory_loops) + native_memory = memory_case(make(native_renderer), loops=args.memory_loops) + rows.append( + ( + operation, + scenario, + token_count, + py_timing, + native_timing, + py_memory, + native_memory, + ) + ) finally: gc.enable() print(f"model={args.model}") print(f"tokenizer_path={tokenizer_path}") print() - print("| operation | python us | native us | speedup |") - print("|---|---:|---:|---:|") - for name, py_timing, native_timing in rows: + print( + "| operation | scenario | tokens | python us | native us | speedup | " + "python peak KiB | native peak KiB |" + ) + print("|---|---|---:|---:|---:|---:|---:|---:|") + for ( + operation, + scenario, + token_count, + py_timing, + native_timing, + py_memory, + native_memory, + ) in rows: speedup = py_timing.median_ns / native_timing.median_ns print( - f"| `{name}` | {py_timing.median_us:.3f} | " - f"{native_timing.median_us:.3f} | {speedup:.2f}x |" + f"| `{operation}` | `{scenario}` | {token_count} | " + f"{py_timing.median_us:.3f} | " + f"{native_timing.median_us:.3f} | {speedup:.2f}x | " + f"{py_memory.peak_kib:.1f} | {native_memory.peak_kib:.1f} |" ) + print() + print( + "memory note: peak KiB uses Python tracemalloc over " + f"{args.memory_loops} calls; Rust allocator memory is not included." + ) if __name__ == "__main__": From 61925c3378fd75de84c1012f615b7969a34f0fa5 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:58:15 +0200 Subject: [PATCH 23/35] Trim native token bridge overhead --- crates/renderers-core/src/emit.rs | 30 ++- crates/renderers-core/src/families/qwen3.rs | 49 +++-- crates/renderers-py/src/lib.rs | 228 +++++++++++++++----- 3 files changed, 238 insertions(+), 69 deletions(-) diff --git a/crates/renderers-core/src/emit.rs b/crates/renderers-core/src/emit.rs index f8cd73d..d4ff82f 100644 --- a/crates/renderers-core/src/emit.rs +++ b/crates/renderers-core/src/emit.rs @@ -14,7 +14,7 @@ use crate::types::{RenderError, RenderedTokens, SCAFFOLD_IDX}; /// All emits are O(1) amortised against the pre-allocated capacity. pub struct RenderBuf<'tok> { tokens: Vec, - indices: Vec, + indices: Option>, tokenizer: &'tok Tokenizer, /// Scratch `Vec` reused across `encode` calls so each text segment /// doesn't allocate. The tokenizer's `encode` API returns its own @@ -27,7 +27,7 @@ impl std::fmt::Debug for RenderBuf<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("RenderBuf") .field("tokens_len", &self.tokens.len()) - .field("indices_len", &self.indices.len()) + .field("indices_len", &self.indices.as_ref().map(Vec::len)) .finish() } } @@ -36,7 +36,16 @@ impl<'tok> RenderBuf<'tok> { pub fn new(tokenizer: &'tok Tokenizer, hint: usize) -> Self { Self { tokens: Vec::with_capacity(hint), - indices: Vec::with_capacity(hint), + indices: Some(Vec::with_capacity(hint)), + tokenizer, + scratch_offsets: Vec::new(), + } + } + + pub fn new_token_ids_only(tokenizer: &'tok Tokenizer, hint: usize) -> Self { + Self { + tokens: Vec::with_capacity(hint), + indices: None, tokenizer, scratch_offsets: Vec::new(), } @@ -51,7 +60,9 @@ impl<'tok> RenderBuf<'tok> { #[inline] pub fn special(&mut self, token_id: u32, msg_idx: i32) { self.tokens.push(token_id); - self.indices.push(msg_idx); + if let Some(indices) = &mut self.indices { + indices.push(msg_idx); + } } /// Append a span of token ids to the buffer, all attributed to the @@ -61,8 +72,10 @@ impl<'tok> RenderBuf<'tok> { self.tokens.extend_from_slice(token_ids); // `resize` with a Copy fill is the cheapest way to extend the // indices vector by N elements of the same value. - let new_len = self.indices.len() + token_ids.len(); - self.indices.resize(new_len, msg_idx); + if let Some(indices) = &mut self.indices { + let new_len = indices.len() + token_ids.len(); + indices.resize(new_len, msg_idx); + } } /// Encode `text` and append the resulting tokens, attributing all of @@ -93,11 +106,12 @@ impl<'tok> RenderBuf<'tok> { /// Consume the buffer and return a [`RenderedTokens`]. pub fn into_rendered(self) -> RenderedTokens { - debug_assert_eq!(self.tokens.len(), self.indices.len()); + let indices = self.indices.unwrap_or_default(); + debug_assert_eq!(self.tokens.len(), indices.len()); let _ = self.scratch_offsets; // keep the field but ignore RenderedTokens { token_ids: self.tokens, - message_indices: self.indices, + message_indices: indices, multi_modal_data: None, } } diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs index 940fe8a..5ab2ea0 100644 --- a/crates/renderers-core/src/families/qwen3.rs +++ b/crates/renderers-core/src/families/qwen3.rs @@ -362,31 +362,28 @@ impl Qwen3Renderer { let tools_bonus = tools.map_or(0, |t| 256 * t.len().max(1)); base + tools_bonus } -} -impl Renderer for Qwen3Renderer { - fn render( + fn render_into_buf( &self, + buf: &mut RenderBuf<'_>, messages: &[Message], tools: Option<&[ToolSpec]>, add_generation_prompt: bool, - ) -> Result { + ) -> Result<(), RenderError> { if messages.is_empty() { return Err(RenderError::EmptyMessages); } - let cap = Self::estimate_capacity(messages, tools); - let mut buf = RenderBuf::new(&self.tokenizer, cap); let first_is_system = messages[0].role == "system"; // 1. System + tools header. match tools { Some(t) if !t.is_empty() => { - self.emit_system_with_tools(&mut buf, messages, t, first_is_system)?; + self.emit_system_with_tools(buf, messages, t, first_is_system)?; } _ => { if first_is_system { - self.emit_system_no_tools(&mut buf, messages)?; + self.emit_system_no_tools(buf, messages)?; } } } @@ -403,10 +400,10 @@ impl Renderer for Qwen3Renderer { if i == 0 { continue; } - self.emit_non_initial_system(&mut buf, content, i as i32)?; + self.emit_non_initial_system(buf, content, i as i32)?; } "user" => { - self.emit_user(&mut buf, content, i as i32)?; + self.emit_user(buf, content, i as i32)?; } "assistant" => { let preserve_thinking = should_preserve_past_thinking( @@ -416,7 +413,7 @@ impl Renderer for Qwen3Renderer { self.preserve_thinking_between_tool_calls, ); self.emit_assistant( - &mut buf, + buf, msg, i, last_qi, @@ -425,7 +422,7 @@ impl Renderer for Qwen3Renderer { )?; } "tool" => { - self.emit_tool(&mut buf, messages, i, content)?; + self.emit_tool(buf, messages, i, content)?; } _ => { // Unknown role: skip silently (matches Python which @@ -443,9 +440,35 @@ impl Renderer for Qwen3Renderer { } } + Ok(()) + } +} + +impl Renderer for Qwen3Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let cap = Self::estimate_capacity(messages, tools); + let mut buf = RenderBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; Ok(buf.into_rendered()) } + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { parse_qwen3( &self.tokenizer, @@ -484,7 +507,7 @@ impl Renderer for Qwen3Renderer { }; let cap = Self::estimate_capacity(new_messages, None); - let mut buf = RenderBuf::new(&self.tokenizer, cap); + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); // Trailing `\n` after the prior turn's close token. buf.scaffold_text("\n")?; diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 2047a00..454df19 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -9,7 +9,7 @@ use std::sync::Arc; -use numpy::{IntoPyArray, PyArray2}; +use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1}; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyDict, PyList, PyType}; @@ -22,11 +22,11 @@ use renderers_core::families::{ }; use renderers_core::processing::{ProcessedImage, Qwen3VlImageProcessor}; use renderers_core::tokenizer::Tokenizer; -use renderers_core::types::{MediaBundle, MediaItem, Modality}; use renderers_core::types::{ - Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, ToolCallParseStatus, - ToolSpec, + Content, Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, + ToolCallParseStatus, ToolSpec, }; +use renderers_core::types::{MediaBundle, MediaItem, Modality}; // Kept by-value so call sites can use the bare fn pointer // `.map_err(render_err)` (closures would be needed for `&E`). @@ -39,14 +39,64 @@ fn invalid(msg: impl Into) -> PyErr { PyValueError::new_err(msg.into()) } -/// Decode a Python `list[dict]` of messages via pythonize. +/// Decode a Python `list[dict]` of messages. +/// +/// The hot path is plain OpenAI-style dictionaries with string fields. +/// Hand-parsing that shape avoids routing every render through generic +/// serde conversion while still falling back to `pythonize` for structured +/// content parts and tool-call lists. fn parse_messages(obj: &Bound<'_, PyAny>) -> PyResult> { - let value: serde_json::Value = pythonize::depythonize(obj).map_err(|e| { - invalid(format!( - "messages must be a list of dicts (decode failed: {e})" - )) - })?; - serde_json::from_value(value).map_err(|e| invalid(format!("messages shape mismatch: {e}"))) + let list = obj + .cast::() + .map_err(|_| invalid("messages must be a list of dicts"))?; + let mut parsed = Vec::with_capacity(list.len()); + for item in list.iter() { + let dict = item + .cast::() + .map_err(|_| invalid("messages must be a list of dicts"))?; + let role = dict + .get_item("role")? + .ok_or_else(|| invalid("message missing role"))? + .extract::()?; + + let content = match dict.get_item("content")? { + None => Content::default(), + Some(value) if value.is_none() => Content::Null, + Some(value) => match value.extract::() { + Ok(text) => Content::Text(text), + Err(_) => pythonize::depythonize(&value) + .map_err(|e| invalid(format!("message content decode failed: {e}")))?, + }, + }; + + let tool_calls = match dict.get_item("tool_calls")? { + None => Vec::new(), + Some(value) if value.is_none() => Vec::new(), + Some(value) => pythonize::depythonize(&value) + .map_err(|e| invalid(format!("message tool_calls decode failed: {e}")))?, + }; + let tool_call_id = optional_string(dict, "tool_call_id")?; + let name = optional_string(dict, "name")?; + let reasoning_content = optional_string(dict, "reasoning_content")?; + + parsed.push(Message { + role, + content, + tool_calls, + tool_call_id, + name, + reasoning_content, + }); + } + Ok(parsed) +} + +fn optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { + match dict.get_item(key)? { + None => Ok(None), + Some(value) if value.is_none() => Ok(None), + Some(value) => value.extract::().map(Some), + } } fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult>> { @@ -54,27 +104,44 @@ fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult> if obj.is_none() { return Ok(None); } - let mut value: serde_json::Value = pythonize::depythonize(obj).map_err(|e| { - invalid(format!( - "tools must be a list of dicts (decode failed: {e})" - )) - })?; - let arr = value - .as_array_mut() - .ok_or_else(|| invalid("tools must be a list of dicts"))?; - let mut envelopes = Vec::with_capacity(arr.len()); - for item in arr { - if let Some(function) = item.get("function").and_then(|v| v.as_object()) { - envelopes.push(true); - *item = serde_json::Value::Object(function.clone()); + let list = obj + .cast::() + .map_err(|_| invalid("tools must be a list of dicts"))?; + let mut parsed = Vec::with_capacity(list.len()); + for item in list.iter() { + let dict = item + .cast::() + .map_err(|_| invalid("tools must be a list of dicts"))?; + let mut openai_envelope = false; + let spec = if let Some(function) = dict.get_item("function")? { + if let Ok(function_dict) = function.cast::() { + openai_envelope = true; + function_dict.clone() + } else { + dict.clone() + } } else { - envelopes.push(false); - } - } - let mut parsed: Vec = - serde_json::from_value(value).map_err(|e| invalid(format!("tools shape mismatch: {e}")))?; - for (tool, openai_envelope) in parsed.iter_mut().zip(envelopes) { - tool.openai_envelope = openai_envelope; + dict.clone() + }; + let name = spec + .get_item("name")? + .ok_or_else(|| invalid("tool spec missing name"))? + .extract::()?; + let description = match spec.get_item("description")? { + Some(value) => value.extract::()?, + None => String::new(), + }; + let parameters = match spec.get_item("parameters")? { + Some(value) => pythonize::depythonize(&value) + .map_err(|e| invalid(format!("tool parameters decode failed: {e}")))?, + None => serde_json::Value::Object(serde_json::Map::new()), + }; + parsed.push(ToolSpec { + name, + description, + parameters, + openai_envelope, + }); } Ok(Some(parsed)) } @@ -137,13 +204,17 @@ fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { .map_err(|_| invalid("expected list[int]"))?; let mut out = Vec::with_capacity(list.len()); for item in list.iter() { - let v: i64 = item.extract()?; - let id = u32::try_from(v).map_err(|_| invalid(format!("token id out of range: {v}")))?; - out.push(id); + out.push(item.extract::()?); } Ok(out) } +fn numpy_u32_slice<'py>(array: &'py PyReadonlyArray1<'py, u32>) -> PyResult<&'py [u32]> { + array + .as_slice() + .map_err(|e| invalid(format!("expected a contiguous uint32 numpy array: {e}"))) +} + #[pyclass( name = "RenderedTokens", module = "renderers_native", @@ -157,16 +228,13 @@ struct PyRenderedTokens { #[pymethods] impl PyRenderedTokens { #[getter] - fn token_ids<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { - // Cast u32 -> i64 for Python `int` compatibility. PyList::new is - // the fastest path; per-element extract is unavoidable until - // numpy support is added. - PyList::new_bound(py, self.inner.token_ids.iter().copied().map(i64::from)) + fn token_ids<'py>(&self, py: Python<'py>) -> PyResult> { + PyList::new(py, &self.inner.token_ids) } #[getter] - fn message_indices<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { - PyList::new_bound(py, self.inner.message_indices.iter().copied()) + fn message_indices<'py>(&self, py: Python<'py>) -> PyResult> { + PyList::new(py, self.inner.message_indices.iter().copied()) } #[getter] @@ -221,7 +289,11 @@ impl PyParsedToolCall { Some(ToolArguments::Object(v)) => { pythonize::pythonize(py, v).map_err(|e| invalid(format!("args serialisation: {e}"))) } - Some(ToolArguments::Raw(s)) => Ok(s.clone().into_py(py).into_bound(py)), + Some(ToolArguments::Raw(s)) => Ok(s + .as_str() + .into_pyobject(py) + .map_err(|e| invalid(format!("string into pyobject: {e}")))? + .into_any()), } } @@ -863,7 +935,30 @@ impl PyRenderer { let ids = py .detach(move || renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt)) .map_err(render_err)?; - Ok(PyList::new_bound(py, ids.iter().copied().map(i64::from))) + PyList::new(py, ids) + } + + /// Render token ids as a `numpy.ndarray[np.uint32]`. + /// + /// This transfers the Rust `Vec` allocation into `NumPy` instead of + /// materialising a Python `list[int]`, which is the preferred hot-path + /// API for benchmark loops and inference clients that already operate on + /// array buffers. + #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))] + fn render_ids_np<'py>( + &self, + py: Python<'py>, + messages: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult>> { + let msgs = parse_messages(messages)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let ids = py + .detach(move || renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt)) + .map_err(render_err)?; + Ok(ids.into_pyarray(py)) } fn parse_response( @@ -877,11 +972,22 @@ impl PyRenderer { Ok(PyParsedResponse { inner: parsed }) } - fn get_stop_token_ids<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { - PyList::new_bound( - py, - self.inner.stop_token_ids().iter().copied().map(i64::from), - ) + /// Parse completion ids from a contiguous `numpy.ndarray[np.uint32]`. + /// + /// The input buffer is borrowed directly, avoiding the Python-list scan and + /// temporary Rust `Vec` used by `parse_response`. + #[allow(clippy::needless_pass_by_value)] + fn parse_response_np( + &self, + token_ids: PyReadonlyArray1<'_, u32>, + ) -> PyResult { + let ids = numpy_u32_slice(&token_ids)?; + let parsed = self.inner.parse_response(ids); + Ok(PyParsedResponse { inner: parsed }) + } + + fn get_stop_token_ids<'py>(&self, py: Python<'py>) -> PyResult> { + PyList::new(py, self.inner.stop_token_ids()) } /// Render with pre-resolved multimodal media items. @@ -949,6 +1055,32 @@ impl PyRenderer { .map_err(render_err)?; Ok(bridged.map(|rt| PyRenderedTokens { inner: rt })) } + + /// Bridge using `NumPy` token buffers and return a `NumPy` token buffer. + /// + /// Previous prompt/completion ids are borrowed directly from contiguous + /// `uint32` arrays, and the bridged Rust `Vec` is transferred into + /// `NumPy` on output. This is the lowest-overhead Python-facing bridge path. + #[allow(clippy::needless_pass_by_value)] + #[pyo3(signature = (previous_prompt_ids, previous_completion_ids, new_messages, *, tools = None))] + fn bridge_to_next_turn_np<'py>( + &self, + py: Python<'py>, + previous_prompt_ids: PyReadonlyArray1<'_, u32>, + previous_completion_ids: PyReadonlyArray1<'_, u32>, + new_messages: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + ) -> PyResult>>> { + let prev_p = numpy_u32_slice(&previous_prompt_ids)?; + let prev_c = numpy_u32_slice(&previous_completion_ids)?; + let msgs = parse_messages(new_messages)?; + let tools = parse_tools(tools)?; + let bridged = self + .inner + .bridge_to_next_turn(prev_p, prev_c, &msgs, tools.as_deref()) + .map_err(render_err)?; + Ok(bridged.map(|rt| rt.token_ids.into_pyarray(py))) + } } // ── Vision: Qwen3-VL image processor ────────────────────────────────── From 05e170edbc35af4742bd5b6935d8014551ddfd27 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:58:33 +0200 Subject: [PATCH 24/35] Add native NumPy token fast paths --- benchmarks/native_vs_python_qwen3.py | 146 ++++++++++++++++---- crates/renderers-core/src/families/qwen3.rs | 44 ++++-- tests/test_native_numpy.py | 82 +++++++++++ 3 files changed, 232 insertions(+), 40 deletions(-) create mode 100644 tests/test_native_numpy.py diff --git a/benchmarks/native_vs_python_qwen3.py b/benchmarks/native_vs_python_qwen3.py index b9ecf93..abfd0c8 100644 --- a/benchmarks/native_vs_python_qwen3.py +++ b/benchmarks/native_vs_python_qwen3.py @@ -574,7 +574,8 @@ def main() -> None: default=1000, help=( "Iterations for tracemalloc peak measurement. This tracks Python " - "heap allocations, including PyO3 boundary objects, not Rust malloc." + "heap allocations, including PyO3 boundary objects, not Rust malloc " + "or NumPy native data buffers." ), ) args = parser.parse_args() @@ -593,7 +594,16 @@ def main() -> None: py_renderer = Qwen3Renderer(tokenizer) native_renderer = native.Renderer.qwen3(tokenizer_path) - cases: list[tuple[str, str, int, Callable[[Any], Callable[[], object]]]] = [] + cases: list[ + tuple[ + str, + str, + int, + Callable[[], object], + Callable[[], object], + Callable[[], object] | None, + ] + ] = [] for scenario in render_scenarios(): py_ids = _as_ids( @@ -617,12 +627,20 @@ def main() -> None: "render_ids", scenario.name, len(py_ids), - lambda r, scenario=scenario: ( - lambda: r.render_ids( - scenario.messages, - tools=scenario.tools, - add_generation_prompt=scenario.add_generation_prompt, - ) + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids_np( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, ), ) ) @@ -630,18 +648,34 @@ def main() -> None: for scenario in parse_scenarios(): py_completion_ids = _completion_ids(py_renderer, scenario) native_completion_ids = _completion_ids(native_renderer, scenario) + native_prompt_np = native_renderer.render_ids_np( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + native_full_np = native_renderer.render_ids_np( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + native_completion_np = native_full_np[len(native_prompt_np) :] if py_completion_ids != native_completion_ids: raise AssertionError(f"{scenario.name} completion parity failed") _assert_parsed_equal( py_renderer.parse_response(py_completion_ids), native_renderer.parse_response(py_completion_ids), ) + _assert_parsed_equal( + py_renderer.parse_response(py_completion_ids), + native_renderer.parse_response_np(native_completion_np), + ) cases.append( ( "parse_response", scenario.name, len(py_completion_ids), - lambda r, ids=py_completion_ids: lambda: r.parse_response(ids), + lambda ids=py_completion_ids: py_renderer.parse_response(ids), + lambda ids=py_completion_ids: native_renderer.parse_response(ids), + lambda ids=native_completion_np: native_renderer.parse_response_np(ids), ) ) @@ -650,6 +684,16 @@ def main() -> None: native_prev_prompt, native_prev_completion = _bridge_inputs( native_renderer, scenario ) + native_prev_prompt_np = native_renderer.render_ids_np( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + native_full_np = native_renderer.render_ids_np( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + native_prev_completion_np = native_full_np[len(native_prev_prompt_np) :] if ( prev_prompt != native_prev_prompt or prev_completion != native_prev_completion @@ -671,18 +715,38 @@ def main() -> None: raise AssertionError(f"{scenario.name} bridge unexpectedly returned None") if list(py_bridge.token_ids) != list(native_bridge.token_ids): raise AssertionError(f"{scenario.name} bridge parity failed") + native_bridge_np = native_renderer.bridge_to_next_turn_np( + native_prev_prompt_np, + native_prev_completion_np, + scenario.new_messages, + tools=scenario.tools, + ) + if native_bridge_np is None: + raise AssertionError(f"{scenario.name} numpy bridge returned None") + if list(py_bridge.token_ids) != native_bridge_np.tolist(): + raise AssertionError(f"{scenario.name} numpy bridge parity failed") cases.append( ( "bridge_to_next_turn", scenario.name, len(py_bridge.token_ids), - lambda r, scenario=scenario, pp=prev_prompt, pc=prev_completion: ( - lambda: r.bridge_to_next_turn( - pp, - pc, - scenario.new_messages, - tools=scenario.tools, - ) + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: py_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ), + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: native_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ), + lambda scenario=scenario, pp=native_prev_prompt_np, pc=native_prev_completion_np: native_renderer.bridge_to_next_turn_np( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, ), ) ) @@ -691,15 +755,25 @@ def main() -> None: gc.disable() try: rows = [] - for operation, scenario, token_count, make in cases: + for operation, scenario, token_count, py_fn, native_fn, native_np_fn in cases: py_timing = time_case( - make(py_renderer), min_time_s=args.min_time, repeats=args.repeats + py_fn, min_time_s=args.min_time, repeats=args.repeats ) native_timing = time_case( - make(native_renderer), min_time_s=args.min_time, repeats=args.repeats + native_fn, min_time_s=args.min_time, repeats=args.repeats + ) + native_np_timing = ( + time_case(native_np_fn, min_time_s=args.min_time, repeats=args.repeats) + if native_np_fn is not None + else None + ) + py_memory = memory_case(py_fn, loops=args.memory_loops) + native_memory = memory_case(native_fn, loops=args.memory_loops) + native_np_memory = ( + memory_case(native_np_fn, loops=args.memory_loops) + if native_np_fn is not None + else None ) - py_memory = memory_case(make(py_renderer), loops=args.memory_loops) - native_memory = memory_case(make(native_renderer), loops=args.memory_loops) rows.append( ( operation, @@ -707,8 +781,10 @@ def main() -> None: token_count, py_timing, native_timing, + native_np_timing, py_memory, native_memory, + native_np_memory, ) ) finally: @@ -718,30 +794,46 @@ def main() -> None: print(f"tokenizer_path={tokenizer_path}") print() print( - "| operation | scenario | tokens | python us | native us | speedup | " - "python peak KiB | native peak KiB |" + "| operation | scenario | tokens | python us | native list us | " + "native np us | list speedup | np speedup | python peak KiB | " + "native list peak KiB | native np peak KiB |" ) - print("|---|---|---:|---:|---:|---:|---:|---:|") + print("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|") for ( operation, scenario, token_count, py_timing, native_timing, + native_np_timing, py_memory, native_memory, + native_np_memory, ) in rows: speedup = py_timing.median_ns / native_timing.median_ns + np_us = ( + f"{native_np_timing.median_us:.3f}" if native_np_timing is not None else "-" + ) + np_speedup = ( + f"{py_timing.median_ns / native_np_timing.median_ns:.2f}x" + if native_np_timing is not None + else "-" + ) + np_peak = ( + f"{native_np_memory.peak_kib:.1f}" if native_np_memory is not None else "-" + ) print( f"| `{operation}` | `{scenario}` | {token_count} | " f"{py_timing.median_us:.3f} | " - f"{native_timing.median_us:.3f} | {speedup:.2f}x | " - f"{py_memory.peak_kib:.1f} | {native_memory.peak_kib:.1f} |" + f"{native_timing.median_us:.3f} | {np_us} | " + f"{speedup:.2f}x | {np_speedup} | " + f"{py_memory.peak_kib:.1f} | {native_memory.peak_kib:.1f} | {np_peak} |" ) print() print( "memory note: peak KiB uses Python tracemalloc over " - f"{args.memory_loops} calls; Rust allocator memory is not included." + f"{args.memory_loops} calls; Rust allocator and NumPy native data buffers " + "are not included." ) diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs index 5ab2ea0..065a3d1 100644 --- a/crates/renderers-core/src/families/qwen3.rs +++ b/crates/renderers-core/src/families/qwen3.rs @@ -97,6 +97,10 @@ pub struct Qwen3Renderer { /// and bridge close-token sets. Two-element vector held by-value /// per renderer instance. stop_tokens: Vec, + newline_tokens: Vec, + user_tokens: Vec, + assistant_newline_tokens: Vec, + gen_prompt_no_thinking_suffix_tokens: Vec, } impl Qwen3Renderer { @@ -119,6 +123,16 @@ impl Qwen3Renderer { let tool_response_end = tokenizer.token_to_id_strict("")?; let stop_tokens = vec![im_end, endoftext]; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let user_tokens = tokenizer.encode_no_special("user")?.as_slice().to_vec(); + let assistant_newline_tokens = tokenizer + .encode_no_special("assistant\n")? + .as_slice() + .to_vec(); + let gen_prompt_no_thinking_suffix_tokens = tokenizer + .encode_no_special(GEN_PROMPT_NO_THINKING_SUFFIX)? + .as_slice() + .to_vec(); Ok(Self { tokenizer, @@ -133,6 +147,10 @@ impl Qwen3Renderer { tool_response, tool_response_end, stop_tokens, + newline_tokens, + user_tokens, + assistant_newline_tokens, + gen_prompt_no_thinking_suffix_tokens, }) } @@ -177,7 +195,7 @@ impl Qwen3Renderer { tool_text.push_str(TOOLS_FOOTER); buf.text(&tool_text, sys_idx)?; buf.special(self.im_end, sys_idx); - buf.text("\n", sys_idx)?; + buf.ids(&self.newline_tokens, sys_idx); Ok(()) } @@ -192,7 +210,7 @@ impl Qwen3Renderer { s.push_str(messages[0].text_content()); buf.text(&s, 0)?; buf.special(self.im_end, 0); - buf.text("\n", 0)?; + buf.ids(&self.newline_tokens, 0); Ok(()) } @@ -208,7 +226,7 @@ impl Qwen3Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); Ok(()) } @@ -224,7 +242,7 @@ impl Qwen3Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); Ok(()) } @@ -241,9 +259,9 @@ impl Qwen3Renderer { if !prev_is_tool { buf.special(self.im_start, idx); - buf.text("user", idx)?; + buf.ids(&self.user_tokens, idx); } - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); buf.special(self.tool_response, idx); let mut wrapped = String::with_capacity(content.len() + 2); wrapped.push('\n'); @@ -253,7 +271,7 @@ impl Qwen3Renderer { buf.special(self.tool_response_end, idx); if !next_is_tool { buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); } Ok(()) } @@ -350,7 +368,7 @@ impl Qwen3Renderer { } buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); Ok(()) } @@ -434,9 +452,9 @@ impl Qwen3Renderer { // 4. Generation prompt. if add_generation_prompt { buf.scaffold_special(self.im_start); - buf.scaffold_text("assistant\n")?; + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); if !self.enable_thinking { - buf.scaffold_text(GEN_PROMPT_NO_THINKING_SUFFIX)?; + buf.ids(&self.gen_prompt_no_thinking_suffix_tokens, SCAFFOLD_IDX); } } @@ -510,7 +528,7 @@ impl Renderer for Qwen3Renderer { let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); // Trailing `\n` after the prior turn's close token. - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); for (i, msg) in new_messages.iter().enumerate() { let content = msg.text_content(); @@ -524,9 +542,9 @@ impl Renderer for Qwen3Renderer { } buf.scaffold_special(self.im_start); - buf.scaffold_text("assistant\n")?; + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); if !self.enable_thinking { - buf.scaffold_text(GEN_PROMPT_NO_THINKING_SUFFIX)?; + buf.ids(&self.gen_prompt_no_thinking_suffix_tokens, SCAFFOLD_IDX); } let ext = buf.into_token_ids(); diff --git a/tests/test_native_numpy.py b/tests/test_native_numpy.py new file mode 100644 index 0000000..558eeb1 --- /dev/null +++ b/tests/test_native_numpy.py @@ -0,0 +1,82 @@ +"""NumPy fast-path coverage for the native PyO3 module.""" + +from __future__ import annotations + +import os + +import numpy as np +import pytest + +from renderers import _native_router as router + + +@pytest.fixture(scope="module") +def qwen3_native(): + native = router.load_native() + if native is None: + pytest.skip("renderers_native not built; run `maturin develop`") + + try: + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer("Qwen/Qwen3-8B") + tok_path = router.resolve_tokenizer_path(tokenizer) + except Exception as exc: + pytest.skip(f"could not resolve Qwen3 tokenizer: {exc}") + if not os.path.exists(tok_path): + pytest.skip(f"tokenizer.json missing on disk at {tok_path}") + + return native.Renderer.qwen3(tok_path) + + +def test_render_ids_np_matches_list_api(qwen3_native): + messages = [ + {"role": "system", "content": "You are concise."}, + {"role": "user", "content": "Say hi."}, + ] + + ids = qwen3_native.render_ids_np(messages, add_generation_prompt=True) + + assert ids.dtype == np.uint32 + assert ids.tolist() == qwen3_native.render_ids( + messages, + add_generation_prompt=True, + ) + + +def test_parse_response_np_borrows_uint32_completion(qwen3_native): + prompt = [{"role": "user", "content": "What is 2+2?"}] + assistant = {"role": "assistant", "content": "4"} + prompt_ids = qwen3_native.render_ids_np(prompt, add_generation_prompt=True) + full_ids = qwen3_native.render_ids_np(prompt + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + parsed = qwen3_native.parse_response_np(completion_ids) + + assert parsed.content == "4" + + +def test_bridge_to_next_turn_np_matches_list_api(qwen3_native): + prompt = [{"role": "user", "content": "Plan Saturday."}] + assistant = {"role": "assistant", "content": "Start with breakfast."} + new_messages = [{"role": "user", "content": "Add one museum."}] + + prompt_ids = qwen3_native.render_ids_np(prompt, add_generation_prompt=True) + full_ids = qwen3_native.render_ids_np(prompt + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + bridged_np = qwen3_native.bridge_to_next_turn_np( + prompt_ids, + completion_ids, + new_messages, + ) + bridged_list = qwen3_native.bridge_to_next_turn( + prompt_ids.tolist(), + completion_ids.tolist(), + new_messages, + ) + + assert bridged_np is not None + assert bridged_list is not None + assert bridged_np.dtype == np.uint32 + assert bridged_np.tolist() == bridged_list.token_ids From 5152854630800a5605c91e576d38291b3c49b086 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 10:59:43 +0200 Subject: [PATCH 25/35] Fix native workspace manifest --- crates/renderers-core/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/renderers-core/Cargo.toml b/crates/renderers-core/Cargo.toml index 11fb4ad..4d24b65 100644 --- a/crates/renderers-core/Cargo.toml +++ b/crates/renderers-core/Cargo.toml @@ -15,7 +15,6 @@ tokenizers = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } regex = { workspace = true } -once_cell = { workspace = true } thiserror = { workspace = true } smallvec = { workspace = true } bumpalo = { workspace = true } From 8a53b30e813f2a910c9dd684a82f7a1ad7916b2b Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 11:00:31 +0200 Subject: [PATCH 26/35] Restore native multimodal type surface --- crates/renderers-core/src/lib.rs | 9 ++-- crates/renderers-core/src/types.rs | 79 ++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 5 deletions(-) diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs index 3c21527..1d9a905 100644 --- a/crates/renderers-core/src/lib.rs +++ b/crates/renderers-core/src/lib.rs @@ -37,10 +37,9 @@ pub mod tokenizer; pub mod traits; pub mod types; -pub use traits::{MultimodalRenderer, Renderer}; +pub use traits::{MediaResolver, MediaSource, MultimodalRenderer, Renderer}; pub use types::{ - Content, ContentPart, ImageRef, Message, MultiModalData, ParsedResponse, ParsedToolCall, - PlaceholderRange, RenderError, RenderedTokens, SCAFFOLD_IDX, - ToolArguments, ToolCall, ToolCallFunction, - ToolCallParseStatus, ToolSpec, VideoRef, + Content, ContentPart, ImageRef, MediaBundle, MediaItem, Message, Modality, MultiModalData, + ParsedResponse, ParsedToolCall, PlaceholderRange, RenderError, RenderedTokens, SCAFFOLD_IDX, + ToolArguments, ToolCall, ToolCallFunction, ToolCallParseStatus, ToolSpec, VideoRef, }; diff --git a/crates/renderers-core/src/types.rs b/crates/renderers-core/src/types.rs index 0d4d3b2..3af3534 100644 --- a/crates/renderers-core/src/types.rs +++ b/crates/renderers-core/src/types.rs @@ -224,6 +224,85 @@ impl MultiModalData { } } +/// Modality marker for a multimodal item. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Modality { + Image, + Video, +} + +impl Modality { + /// Wire string matching the keys used in [`MultiModalData::mm_hashes`] + /// and friends ("image" / "video"). + pub fn as_str(&self) -> &'static str { + match self { + Self::Image => "image", + Self::Video => "video", + } + } + + /// Numeric marker used by per-token modality masks + /// (1 = image, 2 = video). Matches the `mm_token_type_id_map` + /// convention in the Python protocol. + pub fn type_id(&self) -> u8 { + match self { + Self::Image => 1, + Self::Video => 2, + } + } +} + +/// A single media item — image or video — that the caller has already +/// resolved through a vision processor. The renderer never touches raw +/// pixel data; it only needs [`MediaItem::num_tokens`] to emit the right +/// placeholder count and the opaque [`MediaItem::hf_payload`] to splice +/// into the [`MultiModalData::mm_items`] map for the inference engine. +#[derive(Clone, Debug)] +pub struct MediaItem { + pub modality: Modality, + /// Cache key for this item — typically a SHA256 of the resolved + /// bytes. The renderer pushes it into + /// [`MultiModalData::mm_hashes`] under the modality key. + pub hash: String, + /// How many placeholder tokens this item expands into. For + /// Qwen3-VL this is `image_grid_thw.prod() / merge_size²`; for + /// Kimi K2.5 this is always 1 (the model expands per-patch + /// internally). + pub num_tokens: usize, + /// Opaque payload that travels alongside the placeholders to the + /// inference engine. In Phase 5a this is the HF + /// `image_processor(...)` output (`pixel_values`, `image_grid_thw`, + /// ...) — `serde_json::Value` keeps the crate framework-agnostic + /// without dragging numpy / torch into the dependency graph. + pub hf_payload: serde_json::Value, +} + +/// Bundle of pre-resolved media items keyed by the message index they +/// belong to. The renderer pops items in walk order; one bundle covers +/// the full call. +#[derive(Clone, Debug, Default)] +pub struct MediaBundle { + /// `(message_idx, item)` pairs in render order. Multiple items per + /// message are supported — the bundle stays a flat `Vec` so the + /// renderer can iterate with a single cursor. + pub items: Vec<(usize, MediaItem)>, +} + +impl MediaBundle { + pub fn new() -> Self { + Self::default() + } + + pub fn is_empty(&self) -> bool { + self.items.is_empty() + } + + pub fn push(&mut self, message_idx: usize, item: MediaItem) { + self.items.push((message_idx, item)); + } +} + /// Result of rendering messages to tokens. /// /// `token_ids` and `message_indices` are parallel: `message_indices[i]` is From 5608d57b2ce577dc7ad859d9927f69f0c01372c6 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 11:03:46 +0200 Subject: [PATCH 27/35] Fix DeepSeek native tool parity --- .../src/families/deepseek_v3.rs | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/crates/renderers-core/src/families/deepseek_v3.rs b/crates/renderers-core/src/families/deepseek_v3.rs index e2524f2..0429296 100644 --- a/crates/renderers-core/src/families/deepseek_v3.rs +++ b/crates/renderers-core/src/families/deepseek_v3.rs @@ -150,7 +150,7 @@ impl DeepSeekV3Renderer { fn args_to_json_string(args: &ToolArguments) -> String { match args { ToolArguments::Raw(s) => s.clone(), - ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_else(|_| "{}".into()), + ToolArguments::Object(v) => python_json_dumps(v), } } @@ -159,6 +159,39 @@ impl DeepSeekV3Renderer { } } +fn python_json_dumps(value: &JsonValue) -> String { + match value { + JsonValue::Null => "null".to_string(), + JsonValue::Bool(v) => v.to_string(), + JsonValue::Number(v) => v.to_string(), + JsonValue::String(v) => serde_json::to_string(v).unwrap_or_else(|_| "\"\"".to_string()), + JsonValue::Array(items) => { + let mut out = String::from("["); + for (i, item) in items.iter().enumerate() { + if i > 0 { + out.push_str(", "); + } + out.push_str(&python_json_dumps(item)); + } + out.push(']'); + out + } + JsonValue::Object(map) => { + let mut out = String::from("{"); + for (i, (key, item)) in map.iter().enumerate() { + if i > 0 { + out.push_str(", "); + } + out.push_str(&serde_json::to_string(key).unwrap_or_else(|_| "\"\"".to_string())); + out.push_str(": "); + out.push_str(&python_json_dumps(item)); + } + out.push('}'); + out + } + } +} + impl Renderer for DeepSeekV3Renderer { fn render( &self, @@ -347,7 +380,6 @@ impl DeepSeekV3Renderer { for tc in &msg.tool_calls { let name = tc.function.name.as_str(); let args_str = Self::args_to_json_string(&tc.function.arguments); - let _ = JsonValue::Null; // keep import in scope for future use buf.special(self.tool_call_begin, idx); buf.text("function", idx)?; From cf4196ef796f39bf6167c7f122a600691934371e Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Wed, 20 May 2026 16:32:05 +0200 Subject: [PATCH 28/35] Extend native runtime benchmark families --- benchmarks/native_vs_python_qwen3.py | 986 ++++++++++++++++++--------- 1 file changed, 651 insertions(+), 335 deletions(-) diff --git a/benchmarks/native_vs_python_qwen3.py b/benchmarks/native_vs_python_qwen3.py index abfd0c8..76b4008 100644 --- a/benchmarks/native_vs_python_qwen3.py +++ b/benchmarks/native_vs_python_qwen3.py @@ -5,133 +5,99 @@ # "transformers>=4.50.0", # ] # /// -"""Compare Qwen3 pure-Python renderer latency with the native PyO3 path. +"""Compare pure-Python renderer latency with native PyO3 renderer latency. Run from a checkout after building the native extension: - uv run --with maturin maturin develop \ - --manifest-path crates/renderers-py/Cargo.toml --release - uv run python benchmarks/native_vs_python_qwen3.py + uv run maturin develop --manifest-path crates/renderers-py/Cargo.toml --release + uv run python benchmarks/native_vs_python_qwen3.py --families all -The benchmark intentionally uses the public Python APIs on both sides. That -means native timings include PyO3 boundary and Python object conversion costs, -which is the relevant number for Python callers. Use the Criterion bench for -pure Rust hot-path timings. +The benchmark intentionally uses the public Python APIs on both sides. Native +timings include PyO3 boundary and Python object conversion costs, which is the +relevant number for Python callers. Use the Criterion bench for pure Rust +hot-path timings. """ from __future__ import annotations import argparse +import contextlib import gc +import io import json +import logging import os import statistics +import sys import time import tracemalloc -from collections.abc import Callable +from collections.abc import Callable, Sequence from dataclasses import dataclass from typing import Any, cast from renderers import _native_router as router from renderers.base import Message, ToolSpec, load_tokenizer -from renderers.qwen3 import Qwen3Renderer - - -MESSAGES: list[Message] = [ - { - "role": "system", - "content": "You are a helpful assistant that calls tools when needed.", - }, - { - "role": "user", - "content": "Plan a weekend trip to Lisbon for two; we like food and walking.", - }, - { - "role": "assistant", - "content": "I'll help. First, let me check the weather and find some restaurants.", - }, - {"role": "user", "content": "Sounds good - go ahead."}, - { - "role": "assistant", - "content": ( - "Here's a plan: Friday evening tapas at Time Out Market, Saturday " - "morning walk through Alfama, Saturday lunch at Ramiro (seafood), " - "Saturday afternoon Belem pasteis, Sunday morning Sao Jorge castle, " - "Sunday lunch at Cervejaria Trindade." - ), - }, -] - -NEW_MESSAGES: list[Message] = [ - {"role": "user", "content": "Add a kid-friendly option for Sunday morning."} -] TOOLS = cast( list[ToolSpec], [ { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather for a city.", - "parameters": { - "type": "object", - "properties": { - "city": {"type": "string", "description": "City name"}, - "units": {"type": "string", "enum": ["celsius", "fahrenheit"]}, - }, - "required": ["city"], + "name": "get_weather", + "description": "Get current weather for a city.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"}, + "units": {"type": "string", "enum": ["celsius", "fahrenheit"]}, }, + "required": ["city"], }, }, { - "type": "function", - "function": { - "name": "search_places", - "description": "Find places matching a set of constraints.", - "parameters": { - "type": "object", - "properties": { - "city": {"type": "string"}, - "query": {"type": "string"}, - "filters": { - "type": "object", - "properties": { - "kid_friendly": {"type": "boolean"}, - "max_walk_minutes": {"type": "integer"}, - "tags": { - "type": "array", - "items": {"type": "string"}, - }, - }, + "name": "search_places", + "description": "Find places matching a set of constraints.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + "query": {"type": "string"}, + "filters": { + "type": "object", + "properties": { + "kid_friendly": {"type": "boolean"}, + "max_walk_minutes": {"type": "integer"}, + "tags": {"type": "array", "items": {"type": "string"}}, }, }, - "required": ["city", "query"], }, + "required": ["city", "query"], }, }, { - "type": "function", - "function": { - "name": "book_table", - "description": "Create a restaurant booking request.", - "parameters": { - "type": "object", - "properties": { - "restaurant": {"type": "string"}, - "party_size": {"type": "integer"}, - "time": {"type": "string"}, - "notes": {"type": "string"}, - }, - "required": ["restaurant", "party_size", "time"], + "name": "book_table", + "description": "Create a restaurant booking request.", + "parameters": { + "type": "object", + "properties": { + "restaurant": {"type": "string"}, + "party_size": {"type": "integer"}, + "time": {"type": "string"}, + "notes": {"type": "string"}, }, + "required": ["restaurant", "party_size", "time"], }, }, ], ) +@dataclass(frozen=True) +class FamilySpec: + family: str + model: str + + @dataclass(frozen=True) class RenderScenario: name: str @@ -178,9 +144,88 @@ class Memory: def peak_kib(self) -> float: return self.peak_bytes / 1024 + +@dataclass(frozen=True) +class BenchCase: + family: str + model: str + operation: str + scenario: str + token_count: int + py_fn: Callable[[], object] + native_fn: Callable[[], object] + native_np_fn: Callable[[], object] | None + + +@dataclass(frozen=True) +class BenchRow: + family: str + model: str + operation: str + scenario: str + token_count: int + py_timing: Timing + native_timing: Timing + native_np_timing: Timing | None + py_memory: Memory + native_memory: Memory + native_np_memory: Memory | None + @property - def per_call_bytes(self) -> float: - return self.peak_bytes / self.loops + def list_speedup(self) -> float: + return self.py_timing.median_ns / self.native_timing.median_ns + + @property + def np_speedup(self) -> float | None: + if self.native_np_timing is None: + return None + return self.py_timing.median_ns / self.native_np_timing.median_ns + + +DEFAULT_FAMILIES: tuple[FamilySpec, ...] = ( + FamilySpec("qwen3", "Qwen/Qwen3-8B"), + FamilySpec("qwen35", "Qwen/Qwen3.5-9B"), + FamilySpec("qwen36", "Qwen/Qwen3.6-35B-A3B"), + FamilySpec("glm5", "zai-org/GLM-5"), + FamilySpec("glm51", "zai-org/GLM-5.1"), + FamilySpec("glm45", "THUDM/GLM-4.5-Air"), + FamilySpec("deepseek_v3", "deepseek-ai/DeepSeek-V3"), + FamilySpec("kimi_k2", "moonshotai/Kimi-K2-Instruct"), + FamilySpec("minimax_m2", "MiniMaxAI/MiniMax-M2.5"), + FamilySpec("nemotron3", "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"), +) + + +FAMILY_BY_NAME = {spec.family: spec for spec in DEFAULT_FAMILIES} + + +def _medium_messages() -> list[Message]: + return [ + { + "role": "system", + "content": "You are a helpful assistant that calls tools when needed.", + }, + { + "role": "user", + "content": "Plan a weekend trip to Lisbon for two; we like food and walking.", + }, + { + "role": "assistant", + "content": ( + "I'll help. First, let me check the weather and find some restaurants." + ), + }, + {"role": "user", "content": "Sounds good - go ahead."}, + { + "role": "assistant", + "content": ( + "Here's a plan: Friday evening tapas at Time Out Market, Saturday " + "morning walk through Alfama, Saturday lunch at Ramiro, Saturday " + "afternoon Belem pasteis, Sunday morning Sao Jorge castle, Sunday " + "lunch at Cervejaria Trindade." + ), + }, + ] def _long_history(rounds: int = 18) -> list[Message]: @@ -215,10 +260,7 @@ def _long_history(rounds: int = 18) -> list[Message]: } ) messages.append( - { - "role": "user", - "content": "Now produce the final plan with the best three swaps.", - } + {"role": "user", "content": "Now produce the final plan with the best swaps."} ) return messages @@ -242,70 +284,47 @@ def _reasoning_history(rounds: int = 10) -> list[Message]: return messages -def _tool_cycle_messages() -> list[Message]: +def _structured_text_messages() -> list[Message]: return [ + {"role": "system", "content": "You preserve structured text parts."}, { - "role": "system", - "content": "You can call tools and then summarize the result.", + "role": "user", + "content": [ + {"type": "text", "text": "Compare two plans. "}, + {"type": "text", "text": "Prefer the one with fewer transfers."}, + ], }, - {"role": "user", "content": "Plan Sunday morning in Lisbon with weather."}, + {"role": "assistant", "content": "The lower-transfer plan is better."}, + ] + + +def _tool_cycle_messages() -> list[Message]: + return [ + {"role": "user", "content": "Weather?"}, { "role": "assistant", - "content": "I will check weather and candidate places.", + "content": "", "tool_calls": [ { - "id": "call_weather", "type": "function", "function": { "name": "get_weather", - "arguments": {"city": "Lisbon", "units": "celsius"}, - }, - }, - { - "id": "call_places", - "type": "function", - "function": { - "name": "search_places", - "arguments": { - "city": "Lisbon", - "query": "kid friendly Sunday morning", - "filters": { - "kid_friendly": True, - "max_walk_minutes": 20, - "tags": ["parks", "pastries", "views"], - }, - }, + "arguments": {"city": "Paris"}, }, }, ], }, - {"role": "tool", "name": "get_weather", "content": '{"temp": 19, "rain": 0.1}'}, - { - "role": "tool", - "name": "search_places", - "content": json.dumps( - { - "places": [ - {"name": "Jardim da Estrela", "walk_minutes": 12}, - {"name": "Manteigaria", "walk_minutes": 18}, - ] - }, - ensure_ascii=False, - ), - }, + {"role": "tool", "content": "sunny, 22 C"}, { "role": "assistant", - "content": "Use Jardim da Estrela first, then pastries if the weather holds.", + "content": "It's sunny and 22 C in Paris.", }, ] def _large_tool_only_messages() -> list[Message]: return [ - { - "role": "system", - "content": "You are a travel operations assistant.", - }, + {"role": "system", "content": "You are a travel operations assistant."}, { "role": "user", "content": ( @@ -316,9 +335,11 @@ def _large_tool_only_messages() -> list[Message]: ] -def render_scenarios() -> list[RenderScenario]: - return [ - RenderScenario("medium_gen_prompt", MESSAGES, add_generation_prompt=True), +def render_scenarios(family: str) -> list[RenderScenario]: + scenarios = [ + RenderScenario( + "medium_gen_prompt", _medium_messages(), add_generation_prompt=True + ), RenderScenario( "long_history_gen_prompt", _long_history(), @@ -333,6 +354,11 @@ def render_scenarios() -> list[RenderScenario]: add_generation_prompt=True, ), ] + if family in {"qwen35", "qwen36"}: + scenarios.insert( + 3, RenderScenario("structured_text_parts", _structured_text_messages()) + ) + return scenarios def parse_scenarios() -> list[ParseScenario]: @@ -410,12 +436,19 @@ def parse_scenarios() -> list[ParseScenario]: def bridge_scenarios() -> list[BridgeScenario]: + medium = _medium_messages() + tool_cycle = _tool_cycle_messages() return [ BridgeScenario( "medium_extend_user", - MESSAGES[:-1], - MESSAGES[-1], - NEW_MESSAGES, + medium[:-1], + medium[-1], + [ + { + "role": "user", + "content": "Add a kid-friendly option for Sunday morning.", + } + ], ), BridgeScenario( "long_history_extend_user", @@ -436,24 +469,129 @@ def bridge_scenarios() -> list[BridgeScenario]: ), BridgeScenario( "tool_response_extension", - _tool_cycle_messages()[:-1], - _tool_cycle_messages()[-1], + tool_cycle[:-1], + tool_cycle[-1], [ { "role": "tool", "name": "book_table", "content": '{"status": "waitlist", "eta_minutes": 15}', }, - { - "role": "user", - "content": "Adjust if the restaurant is waitlisted.", - }, + {"role": "user", "content": "Adjust if the restaurant is waitlisted."}, ], tools=TOOLS, ), ] +def build_python_renderer(family: str, tokenizer: Any) -> Any: + saved = os.environ.pop("RENDERERS_NATIVE", None) + try: + if family == "qwen3": + from renderers.qwen3 import Qwen3Renderer + + return Qwen3Renderer(tokenizer) + if family == "qwen35": + from renderers.qwen35 import Qwen35Renderer + + return Qwen35Renderer(tokenizer) + if family == "qwen36": + from renderers.qwen36 import Qwen36Renderer + + return Qwen36Renderer(tokenizer) + if family == "glm5": + from renderers.glm5 import GLM5Renderer + + return GLM5Renderer(tokenizer) + if family == "glm51": + from renderers.glm5 import GLM51Renderer + + return GLM51Renderer(tokenizer) + if family == "glm45": + from renderers.glm45 import GLM45Renderer + + return GLM45Renderer(tokenizer) + if family == "deepseek_v3": + from renderers.deepseek_v3 import DeepSeekV3Renderer + + return DeepSeekV3Renderer(tokenizer) + if family == "kimi_k2": + from renderers.kimi_k2 import KimiK2Renderer + + return KimiK2Renderer(tokenizer) + if family == "minimax_m2": + from renderers.minimax_m2 import MiniMaxM2Renderer + + return MiniMaxM2Renderer(tokenizer) + if family == "nemotron3": + from renderers.nemotron3 import Nemotron3Renderer + + return Nemotron3Renderer(tokenizer) + finally: + if saved is not None: + os.environ["RENDERERS_NATIVE"] = saved + raise ValueError(f"unknown family: {family}") + + +def build_native_renderer(native_module: Any, family: str, tokenizer_path: str) -> Any: + factory = { + "qwen3": native_module.Renderer.qwen3, + "qwen35": native_module.Renderer.qwen35, + "qwen36": native_module.Renderer.qwen36, + "glm5": native_module.Renderer.glm5, + "glm51": native_module.Renderer.glm51, + "glm45": native_module.Renderer.glm45, + "deepseek_v3": native_module.Renderer.deepseek_v3, + "kimi_k2": native_module.Renderer.kimi_k2, + "minimax_m2": native_module.Renderer.minimax_m2, + "nemotron3": native_module.Renderer.nemotron3, + }.get(family) + if factory is None: + raise ValueError(f"unknown family: {family}") + return factory(tokenizer_path) + + +def parse_families(raw: str) -> list[FamilySpec]: + if raw in {"all", "native"}: + return list(DEFAULT_FAMILIES) + selected: list[FamilySpec] = [] + for item in raw.split(","): + family = item.strip() + if not family: + continue + try: + selected.append(FAMILY_BY_NAME[family]) + except KeyError as exc: + known = ", ".join(sorted(FAMILY_BY_NAME)) + raise SystemExit(f"unknown family {family!r}; known: {known}") from exc + if not selected: + raise SystemExit("--families resolved to an empty set") + return selected + + +def apply_model_overrides( + specs: Sequence[FamilySpec], overrides: Sequence[str] +) -> list[FamilySpec]: + by_family = {spec.family: spec for spec in specs} + for override in overrides: + if "=" not in override: + if len(specs) != 1: + raise SystemExit( + "--model without FAMILY=MODEL is only valid with one family" + ) + family, model = specs[0].family, override + else: + family, model = override.split("=", 1) + family = family.strip() + model = model.strip() + if family not in by_family: + raise SystemExit( + f"--model override references unselected family {family!r}" + ) + by_family[family] = FamilySpec(family, model) + return [by_family[spec.family] for spec in specs] + + def time_case( fn: Callable[[], object], *, @@ -510,11 +648,11 @@ def _assert_parsed_equal(py_value: Any, native_value: Any) -> None: raise AssertionError( "parse_response reasoning parity failed before benchmarking" ) - py_calls = py_value.tool_calls - native_calls = native_value.tool_calls - if len(py_calls) != len(native_calls): + if len(py_value.tool_calls) != len(native_value.tool_calls): raise AssertionError("parse_response tool-call count parity failed") - for py_call, native_call in zip(py_calls, native_calls, strict=True): + for py_call, native_call in zip( + py_value.tool_calls, native_value.tool_calls, strict=True + ): if ( py_call.raw, py_call.name, @@ -563,67 +701,44 @@ def _bridge_inputs( return list(previous_prompt_ids), previous_completion_ids -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--model", default="Qwen/Qwen3-8B") - parser.add_argument("--min-time", type=float, default=0.35) - parser.add_argument("--repeats", type=int, default=7) - parser.add_argument( - "--memory-loops", - type=int, - default=1000, - help=( - "Iterations for tracemalloc peak measurement. This tracks Python " - "heap allocations, including PyO3 boundary objects, not Rust malloc " - "or NumPy native data buffers." - ), - ) - args = parser.parse_args() - - os.environ.pop("RENDERERS_NATIVE", None) - tokenizer = load_tokenizer(args.model) - tokenizer_path = router.resolve_tokenizer_path(tokenizer) - - native = router.load_native() - if native is None: - raise RuntimeError( - "renderers_native is not built; run `uv run --with maturin maturin " - "develop --manifest-path crates/renderers-py/Cargo.toml --release`" - ) - - py_renderer = Qwen3Renderer(tokenizer) - native_renderer = native.Renderer.qwen3(tokenizer_path) - - cases: list[ - tuple[ - str, - str, - int, - Callable[[], object], - Callable[[], object], - Callable[[], object] | None, - ] - ] = [] - - for scenario in render_scenarios(): - py_ids = _as_ids( - py_renderer.render_ids( - scenario.messages, - tools=scenario.tools, - add_generation_prompt=scenario.add_generation_prompt, +def _add_render_cases( + cases: list[BenchCase], + skipped: list[str], + *, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: + for scenario in render_scenarios(spec.family): + try: + py_ids = _as_ids( + py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ) ) - ) - native_ids = _as_ids( - native_renderer.render_ids( - scenario.messages, - tools=scenario.tools, - add_generation_prompt=scenario.add_generation_prompt, + native_ids = _as_ids( + native_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ) ) - ) - if py_ids != native_ids: - raise AssertionError(f"{scenario.name} render_ids parity failed") + if py_ids != native_ids: + raise AssertionError("render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_ids:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_ids:{scenario.name}: {exc}") + continue cases.append( - ( + BenchCase( + spec.family, + spec.model, "render_ids", scenario.name, len(py_ids), @@ -645,31 +760,51 @@ def main() -> None: ) ) + +def _add_parse_cases( + cases: list[BenchCase], + skipped: list[str], + *, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: for scenario in parse_scenarios(): - py_completion_ids = _completion_ids(py_renderer, scenario) - native_completion_ids = _completion_ids(native_renderer, scenario) - native_prompt_np = native_renderer.render_ids_np( - scenario.prompt, - tools=scenario.tools, - add_generation_prompt=True, - ) - native_full_np = native_renderer.render_ids_np( - scenario.prompt + [scenario.assistant], - tools=scenario.tools, - ) - native_completion_np = native_full_np[len(native_prompt_np) :] - if py_completion_ids != native_completion_ids: - raise AssertionError(f"{scenario.name} completion parity failed") - _assert_parsed_equal( - py_renderer.parse_response(py_completion_ids), - native_renderer.parse_response(py_completion_ids), - ) - _assert_parsed_equal( - py_renderer.parse_response(py_completion_ids), - native_renderer.parse_response_np(native_completion_np), - ) + try: + py_completion_ids = _completion_ids(py_renderer, scenario) + native_completion_ids = _completion_ids(native_renderer, scenario) + native_prompt_np = native_renderer.render_ids_np( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + native_full_np = native_renderer.render_ids_np( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + native_completion_np = native_full_np[len(native_prompt_np) :] + if py_completion_ids != native_completion_ids: + raise AssertionError("completion parity failed") + _assert_parsed_equal( + py_renderer.parse_response(py_completion_ids), + native_renderer.parse_response(py_completion_ids), + ) + _assert_parsed_equal( + py_renderer.parse_response(py_completion_ids), + native_renderer.parse_response_np(native_completion_np), + ) + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:parse_response:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:parse_response:{scenario.name}: {exc}") + continue cases.append( - ( + BenchCase( + spec.family, + spec.model, "parse_response", scenario.name, len(py_completion_ids), @@ -679,106 +814,204 @@ def main() -> None: ) ) + +def _add_bridge_cases( + cases: list[BenchCase], + skipped: list[str], + *, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: for scenario in bridge_scenarios(): - prev_prompt, prev_completion = _bridge_inputs(py_renderer, scenario) - native_prev_prompt, native_prev_completion = _bridge_inputs( - native_renderer, scenario - ) - native_prev_prompt_np = native_renderer.render_ids_np( - scenario.prompt, - tools=scenario.tools, - add_generation_prompt=True, - ) - native_full_np = native_renderer.render_ids_np( - scenario.prompt + [scenario.assistant], - tools=scenario.tools, - ) - native_prev_completion_np = native_full_np[len(native_prev_prompt_np) :] - if ( - prev_prompt != native_prev_prompt - or prev_completion != native_prev_completion - ): - raise AssertionError(f"{scenario.name} bridge input parity failed") - py_bridge = py_renderer.bridge_to_next_turn( - prev_prompt, - prev_completion, - scenario.new_messages, - tools=scenario.tools, - ) - native_bridge = native_renderer.bridge_to_next_turn( - prev_prompt, - prev_completion, - scenario.new_messages, - tools=scenario.tools, - ) - if py_bridge is None or native_bridge is None: - raise AssertionError(f"{scenario.name} bridge unexpectedly returned None") - if list(py_bridge.token_ids) != list(native_bridge.token_ids): - raise AssertionError(f"{scenario.name} bridge parity failed") - native_bridge_np = native_renderer.bridge_to_next_turn_np( - native_prev_prompt_np, - native_prev_completion_np, - scenario.new_messages, - tools=scenario.tools, - ) - if native_bridge_np is None: - raise AssertionError(f"{scenario.name} numpy bridge returned None") - if list(py_bridge.token_ids) != native_bridge_np.tolist(): - raise AssertionError(f"{scenario.name} numpy bridge parity failed") + try: + prev_prompt, prev_completion = _bridge_inputs(py_renderer, scenario) + native_prev_prompt, native_prev_completion = _bridge_inputs( + native_renderer, scenario + ) + if ( + prev_prompt != native_prev_prompt + or prev_completion != native_prev_completion + ): + raise AssertionError("bridge input parity failed") + + py_bridge = py_renderer.bridge_to_next_turn( + prev_prompt, + prev_completion, + scenario.new_messages, + tools=scenario.tools, + ) + native_bridge = native_renderer.bridge_to_next_turn( + prev_prompt, + prev_completion, + scenario.new_messages, + tools=scenario.tools, + ) + if py_bridge is None and native_bridge is None: + continue + if py_bridge is None or native_bridge is None: + raise AssertionError("bridge None parity failed") + if list(py_bridge.token_ids) != list(native_bridge.token_ids): + raise AssertionError("bridge parity failed") + + native_prev_prompt_np = native_renderer.render_ids_np( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + native_full_np = native_renderer.render_ids_np( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + native_prev_completion_np = native_full_np[len(native_prev_prompt_np) :] + native_bridge_np = native_renderer.bridge_to_next_turn_np( + native_prev_prompt_np, + native_prev_completion_np, + scenario.new_messages, + tools=scenario.tools, + ) + if native_bridge_np is None: + raise AssertionError("numpy bridge returned None") + if list(py_bridge.token_ids) != native_bridge_np.tolist(): + raise AssertionError("numpy bridge parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:bridge_to_next_turn:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:bridge_to_next_turn:{scenario.name}: {exc}") + continue + cases.append( - ( + BenchCase( + spec.family, + spec.model, "bridge_to_next_turn", scenario.name, len(py_bridge.token_ids), - lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: py_renderer.bridge_to_next_turn( - pp, - pc, - scenario.new_messages, - tools=scenario.tools, + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + py_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) ), - lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: native_renderer.bridge_to_next_turn( - pp, - pc, - scenario.new_messages, - tools=scenario.tools, + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + native_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) ), - lambda scenario=scenario, pp=native_prev_prompt_np, pc=native_prev_completion_np: native_renderer.bridge_to_next_turn_np( - pp, - pc, - scenario.new_messages, - tools=scenario.tools, + lambda scenario=scenario, pp=native_prev_prompt_np, pc=native_prev_completion_np: ( + native_renderer.bridge_to_next_turn_np( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) ), ) ) + +def build_cases( + *, + specs: Sequence[FamilySpec], + native_module: Any, + strict: bool, +) -> tuple[list[BenchCase], list[str]]: + cases: list[BenchCase] = [] + skipped: list[str] = [] + for spec in specs: + try: + with contextlib.redirect_stdout(io.StringIO()): + tokenizer = load_tokenizer(spec.model) + tokenizer_path = router.resolve_tokenizer_path(tokenizer) + if not os.path.exists(tokenizer_path): + raise FileNotFoundError(tokenizer_path) + py_renderer = build_python_renderer(spec.family, tokenizer) + native_renderer = build_native_renderer( + native_module, spec.family, tokenizer_path + ) + family_cases: list[BenchCase] = [] + _add_render_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) + _add_parse_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) + _add_bridge_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) + cases.extend(family_cases) + print( + f"prepared family={spec.family} model={spec.model} " + f"tokenizer_path={tokenizer_path}", + file=sys.stderr, + ) + except Exception as exc: + message = f"{spec.family} ({spec.model}): {exc}" + if strict: + raise RuntimeError(message) from exc + skipped.append(message) + print(f"skipped {message}", file=sys.stderr) + return cases, skipped + + +def run_cases( + cases: Sequence[BenchCase], + *, + min_time_s: float, + repeats: int, + memory_loops: int, +) -> list[BenchRow]: gc.collect() gc.disable() try: - rows = [] - for operation, scenario, token_count, py_fn, native_fn, native_np_fn in cases: - py_timing = time_case( - py_fn, min_time_s=args.min_time, repeats=args.repeats - ) + rows: list[BenchRow] = [] + for case in cases: + py_timing = time_case(case.py_fn, min_time_s=min_time_s, repeats=repeats) native_timing = time_case( - native_fn, min_time_s=args.min_time, repeats=args.repeats + case.native_fn, min_time_s=min_time_s, repeats=repeats ) native_np_timing = ( - time_case(native_np_fn, min_time_s=args.min_time, repeats=args.repeats) - if native_np_fn is not None + time_case(case.native_np_fn, min_time_s=min_time_s, repeats=repeats) + if case.native_np_fn is not None else None ) - py_memory = memory_case(py_fn, loops=args.memory_loops) - native_memory = memory_case(native_fn, loops=args.memory_loops) + py_memory = memory_case(case.py_fn, loops=memory_loops) + native_memory = memory_case(case.native_fn, loops=memory_loops) native_np_memory = ( - memory_case(native_np_fn, loops=args.memory_loops) - if native_np_fn is not None + memory_case(case.native_np_fn, loops=memory_loops) + if case.native_np_fn is not None else None ) rows.append( - ( - operation, - scenario, - token_count, + BenchRow( + case.family, + case.model, + case.operation, + case.scenario, + case.token_count, py_timing, native_timing, native_np_timing, @@ -789,52 +1022,135 @@ def main() -> None: ) finally: gc.enable() + return rows - print(f"model={args.model}") - print(f"tokenizer_path={tokenizer_path}") - print() + +def geometric_mean(values: Sequence[float]) -> float: + if not values: + return 0.0 + product = 1.0 + for value in values: + product *= value + return product ** (1.0 / len(values)) + + +def print_results( + rows: Sequence[BenchRow], skipped: Sequence[str], memory_loops: int +) -> None: print( - "| operation | scenario | tokens | python us | native list us | " + "| family | operation | scenario | tokens | python us | native list us | " "native np us | list speedup | np speedup | python peak KiB | " "native list peak KiB | native np peak KiB |" ) - print("|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|") - for ( - operation, - scenario, - token_count, - py_timing, - native_timing, - native_np_timing, - py_memory, - native_memory, - native_np_memory, - ) in rows: - speedup = py_timing.median_ns / native_timing.median_ns + print("|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|") + for row in rows: np_us = ( - f"{native_np_timing.median_us:.3f}" if native_np_timing is not None else "-" - ) - np_speedup = ( - f"{py_timing.median_ns / native_np_timing.median_ns:.2f}x" - if native_np_timing is not None + f"{row.native_np_timing.median_us:.3f}" + if row.native_np_timing is not None else "-" ) + np_speedup = f"{row.np_speedup:.2f}x" if row.np_speedup is not None else "-" np_peak = ( - f"{native_np_memory.peak_kib:.1f}" if native_np_memory is not None else "-" + f"{row.native_np_memory.peak_kib:.1f}" + if row.native_np_memory is not None + else "-" ) print( - f"| `{operation}` | `{scenario}` | {token_count} | " - f"{py_timing.median_us:.3f} | " - f"{native_timing.median_us:.3f} | {np_us} | " - f"{speedup:.2f}x | {np_speedup} | " - f"{py_memory.peak_kib:.1f} | {native_memory.peak_kib:.1f} | {np_peak} |" + f"| `{row.family}` | `{row.operation}` | `{row.scenario}` | " + f"{row.token_count} | {row.py_timing.median_us:.3f} | " + f"{row.native_timing.median_us:.3f} | {np_us} | " + f"{row.list_speedup:.2f}x | {np_speedup} | " + f"{row.py_memory.peak_kib:.1f} | {row.native_memory.peak_kib:.1f} | " + f"{np_peak} |" + ) + + print() + print("| family | rows | list geomean speedup | np geomean speedup |") + print("|---|---:|---:|---:|") + families = sorted({row.family for row in rows}) + for family in families: + family_rows = [row for row in rows if row.family == family] + list_speedup = geometric_mean([row.list_speedup for row in family_rows]) + np_speedup = geometric_mean( + [row.np_speedup for row in family_rows if row.np_speedup is not None] ) + print( + f"| `{family}` | {len(family_rows)} | {list_speedup:.2f}x | " + f"{np_speedup:.2f}x |" + ) + print() print( "memory note: peak KiB uses Python tracemalloc over " - f"{args.memory_loops} calls; Rust allocator and NumPy native data buffers " + f"{memory_loops} calls; Rust allocator and NumPy native data buffers " "are not included." ) + if skipped: + print() + print("Skipped cases:") + for item in skipped: + print(f"- {item}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--families", + default="all", + help=( + "Comma-separated family keys or 'all'. Known keys: " + + ", ".join(sorted(FAMILY_BY_NAME)) + ), + ) + parser.add_argument( + "--model", + action="append", + default=[], + help=( + "Override model id. Use MODEL when one family is selected, or " + "FAMILY=MODEL for multi-family runs. May be repeated." + ), + ) + parser.add_argument("--min-time", type=float, default=0.35) + parser.add_argument("--repeats", type=int, default=7) + parser.add_argument( + "--memory-loops", + type=int, + default=1000, + help=( + "Iterations for tracemalloc peak measurement. This tracks Python " + "heap allocations, including PyO3 boundary objects, not Rust malloc " + "or NumPy native data buffers." + ), + ) + parser.add_argument( + "--strict", + action="store_true", + help="Fail instead of skipping families whose tokenizer is unavailable.", + ) + args = parser.parse_args() + + os.environ.pop("RENDERERS_NATIVE", None) + logging.getLogger("transformers_modules").setLevel(logging.ERROR) + logging.getLogger("transformers").setLevel(logging.ERROR) + native = router.load_native() + if native is None: + raise RuntimeError( + "renderers_native is not built; run `uv run maturin develop " + "--manifest-path crates/renderers-py/Cargo.toml --release`" + ) + + specs = apply_model_overrides(parse_families(args.families), args.model) + cases, skipped = build_cases(specs=specs, native_module=native, strict=args.strict) + if not cases: + raise RuntimeError("no benchmark cases were prepared") + rows = run_cases( + cases, + min_time_s=args.min_time, + repeats=args.repeats, + memory_loops=args.memory_loops, + ) + print_results(rows, skipped, args.memory_loops) if __name__ == "__main__": From db2905a3656dbe53499fab47c8c06264707c022c Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Thu, 21 May 2026 09:56:06 +0200 Subject: [PATCH 29/35] Improve native runtime performance paths --- Cargo.lock | 1 + IDEAS.md | 684 +++++++++++ benchmarks/native_vs_python_qwen3.py | 1092 +++++++++++++++++ crates/renderers-core/src/emit.rs | 112 ++ .../src/families/deepseek_v3.rs | 132 +- crates/renderers-core/src/families/glm.rs | 134 +- crates/renderers-core/src/families/kimi_k2.rs | 54 +- .../renderers-core/src/families/kimi_k25.rs | 44 +- .../renderers-core/src/families/minimax_m2.rs | 144 ++- .../renderers-core/src/families/nemotron3.rs | 121 +- crates/renderers-core/src/families/qwen3.rs | 79 +- crates/renderers-core/src/families/qwen35.rs | 201 ++- crates/renderers-core/src/lib.rs | 1 + crates/renderers-core/src/parsing/glm.rs | 4 +- crates/renderers-core/src/parsing/qwen3.rs | 6 +- crates/renderers-core/src/parsing/qwen35.rs | 4 +- crates/renderers-core/src/tokenizer.rs | 16 + crates/renderers-core/src/tool_cache.rs | 86 ++ crates/renderers-py/Cargo.toml | 1 + crates/renderers-py/src/lib.rs | 395 +++++- examples/README.md | 10 + examples/sglang/multiturn_generate_sglang.py | 26 +- examples/sglang/online_multiturn_sglang.py | 24 +- examples/vllm/multiturn_generate_vllm.py | 26 +- tests/test_native_numpy.py | 170 +++ 25 files changed, 3254 insertions(+), 313 deletions(-) create mode 100644 IDEAS.md create mode 100644 crates/renderers-core/src/tool_cache.rs diff --git a/Cargo.lock b/Cargo.lock index a4e72a8..e8286e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2360,6 +2360,7 @@ dependencies = [ "numpy", "pyo3", "pythonize", + "rayon", "renderers-core", "serde", "serde_json", diff --git a/IDEAS.md b/IDEAS.md new file mode 100644 index 0000000..ad28625 --- /dev/null +++ b/IDEAS.md @@ -0,0 +1,684 @@ +# Native Runtime Performance Ideas + +This document is the working plan for making the Rust/PyO3 renderers faster while +keeping parity visible at every step. The goal is not to guess where the speedup +comes from. Each change should land with a benchmark artifact that compares the +new commit against the previous baseline. + +## Current Shape + +The benchmark entry point is: + +```bash +uv run maturin develop --manifest-path crates/renderers-py/Cargo.toml --release +uv run python benchmarks/native_vs_python_qwen3.py --families all --min-time 0.35 --repeats 7 --memory-loops 1000 +``` + +The script already compares: + +- Python renderer public APIs. +- Native list-returning APIs. +- Native NumPy-returning APIs where available. +- `render_ids`, `parse_response`, and `bridge_to_next_turn`. +- Multiple families: Qwen, GLM, DeepSeek, Kimi, MiniMax, and Nemotron. + +The script now has progress and reproducibility support, so it can be used as +the optimization scoreboard before and after each runtime commit. + +## Benchmark Harness First + +Before optimizing runtime code, make the benchmark produce stable artifacts. +This lets every commit answer the same question: what got faster, what got +slower, and by how much? + +### 1. Structured Output + +Implemented flags in `benchmarks/native_vs_python_qwen3.py`: + +```bash +--json-out benchmark-results/native-runtime/latest.json +--markdown-out benchmark-results/native-runtime/latest.md +--baseline benchmark-results/native-runtime/baseline.json +``` + +The JSON includes: + +- Git commit SHA and dirty state. +- Python version, Rust version, platform, CPU model if available. +- Native extension build mode. +- Benchmark args: families, repeats, min time, memory loops. +- One row per family, operation, scenario, and API path. +- Median, min, max, loop count, token count, and memory peak. +- Per-family geomean and overall geomean. + +The Markdown includes: + +- A short summary table with overall list and NumPy geomean speedups. +- A per-family table. +- A worst regressions table versus baseline. +- A best improvements table versus baseline. +- Skipped cases and why they were skipped. + +Raw terminal tables are still printed, but the JSON is the source of truth. + +### 2. Live Progress + +The full all-family benchmark is long enough that it renders progress as it +runs. Progress output goes to stderr. + +Suggested progress lines: + +```text +[1/120] qwen3 render_ids medium_gen_prompt: python +[1/120] qwen3 render_ids medium_gen_prompt: native list +[1/120] qwen3 render_ids medium_gen_prompt: native np +[1/120] qwen3 render_ids medium_gen_prompt: memory +``` + +The script also prints a compact family summary after each family finishes: + +```text +family=qwen3 rows=12 list_geomean=1.81x np_geomean=2.03x elapsed=31.2s +``` + +This matters because performance work can fail halfway through a full +matrix. Partial progress should still be useful. + +### 3. Compare Mode + +Implemented comparison mode: + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families all \ + --baseline benchmark-results/native-runtime/baseline.json \ + --json-out benchmark-results/native-runtime/$SHA.json \ + --markdown-out benchmark-results/native-runtime/$SHA.md +``` + +Comparison rules: + +- Compare matching `family + operation + scenario + path`. +- Report ratios against the baseline medians. +- Treat missing baseline rows as new coverage, not wins. +- Treat missing current rows as failures unless explicitly skipped. +- Flag any row slower than baseline by more than 5 percent. +- Flag any row faster than baseline by more than 5 percent. + +The script exits non-zero only with an explicit flag such as: + +```bash +--fail-on-regression 5 +``` + +That keeps exploratory runs flexible while making CI or local gates strict when +we want them strict. + +### 4. Add a Small/Fast Profile + +Use a sub-minute profile before every larger run: + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families qwen3,qwen35,kimi_k2 \ + --min-time 0.02 \ + --repeats 3 \ + --memory-loops 20 \ + --json-out benchmark-results/native-runtime/smoke.json +``` + +The smoke profile catches broken benchmark plumbing and obvious parity failures. +Only after it passes should we run the full profile: + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families all \ + --min-time 0.35 \ + --repeats 7 \ + --memory-loops 1000 \ + --json-out benchmark-results/native-runtime/$SHA.json \ + --markdown-out benchmark-results/native-runtime/$SHA.md +``` + +## Commit Measurement Loop + +Every runtime optimization commit should follow this loop: + +1. Build native extension in release mode. + +```bash +uv run maturin develop --manifest-path crates/renderers-py/Cargo.toml --release +``` + +2. Run correctness checks. + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --locked +cargo test --workspace +uv run pytest -m parity tests/test_native_parity.py -q -rs +env RENDERERS_NATIVE=all uv run pytest \ + tests/test_render_ids.py \ + tests/test_bridge.py \ + tests/test_roundtrip.py \ + tests/test_message_indices.py \ + tests/test_native_router.py \ + tests/test_native_vision.py \ + tests/test_native_numpy.py \ + -q -rs +``` + +3. Run benchmark smoke. + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families qwen3,qwen35,kimi_k2 \ + --min-time 0.02 \ + --repeats 3 \ + --memory-loops 20 +``` + +4. Run full benchmark and save artifacts. + +```bash +SHA=$(git rev-parse --short HEAD) +uv run python benchmarks/native_vs_python_qwen3.py \ + --families all \ + --min-time 0.35 \ + --repeats 7 \ + --memory-loops 1000 \ + --baseline benchmark-results/native-runtime/baseline.json \ + --json-out benchmark-results/native-runtime/$SHA.json \ + --markdown-out benchmark-results/native-runtime/$SHA.md +``` + +5. Commit code and benchmark artifact together when the benchmark is part of the +claim. If artifacts are too noisy for git, commit the code and paste the saved +Markdown summary into the commit message body or PR description. + +## Performance Work Queue + +The highest-value work is reducing repeated Python object parsing, repeated tool +formatting, and repeated token list materialization. Single fresh calls still pay +for Python input objects and tokenizer work, so the 8x to 10x target is most +realistic for prepared, batched, or multiturn workloads. + +### A. Prepared Tools + +Problem: + +The examples and benchmarks pass the same tool schema repeatedly. Today the +native path still receives Python objects, converts them to Rust structures, and +formats schema text for each render. + +Idea: + +Add a Python-visible prepared tool handle: + +```python +prepared_tools = renderer.prepare_tools(TOOLS) +ids = renderer.render_ids(messages, tools=prepared_tools, add_generation_prompt=True) +``` + +Native side: + +- Parse tool specs once. +- Normalize provider-specific tool shape once. +- Pre-render static tool instruction text once. +- Pre-tokenize static tool blocks where the family template allows it. +- Keep the original public `tools=list[dict]` path as fallback. + +Benchmark cases: + +- Existing `large_tools_gen_prompt`. +- Existing `tool_cycle_large_schema`. +- New repeated-tools scenario that renders the same tools across many short + prompts. + +Expected proof: + +- `render_ids` with tools gets faster. +- No regression for no-tools scenarios. +- SGLang and vLLM examples can use it directly because they already reuse + `TOOLS`. + +Status: + +- Implemented a Python-visible native `PreparedTools` handle. +- `Renderer.prepare_tools(TOOLS)` parses Python tool dictionaries once and can + be passed to `render_ids`, `render_ids_np`, `render_batch_ids`, and + `render_batch_ids_np_packed`. +- Benchmark rows now include `render_ids_prepared_tools`. +- Added a native `ToolTextCache` for repeated prepared-tool prompts. Qwen3, + Qwen3.5/Qwen3.6, GLM, Nemotron 3, MiniMax M2, and Kimi K2 now cache the fully + rendered and pre-tokenized system/tool text block keyed by the prepared tools + and dynamic system text. Repeated prepared-tool renders skip both tool + formatting and tokenization. + +### B. Prepared Conversation or Session + +Problem: + +Multiturn examples repeatedly pass Python message lists. For bridge paths, we +also repeatedly pass prompt IDs, completion IDs, and new messages across PyO3. + +Idea: + +Add a native session object that owns parsed messages and token buffers: + +```python +session = renderer.new_session(messages, tools=prepared_tools) +prompt_ids = session.render_ids(add_generation_prompt=True) +completion = engine_completion_ids(...) +bridged_ids = session.bridge_to_next_turn(completion, new_messages) +``` + +Native side: + +- Store parsed messages in Rust. +- Store prepared tools by reference or shared handle. +- Store previous prompt and completion buffers. +- Append new messages without reparsing the whole conversation. +- Return list IDs for existing engine APIs, and NumPy IDs for callers that can + keep arrays. + +Benchmark cases: + +- Existing `bridge_to_next_turn`. +- New `session_bridge_to_next_turn`. +- Long history plus one new user message. +- Tool response extension. + +Expected proof: + +- Big gains on `bridge_to_next_turn`. +- Lower memory pressure on Python heap. +- Minimal Python-side example change: replace renderer calls with a session. + +Status: + +- Implemented a Python-visible native `RendererSession`. +- `Renderer.new_session(messages, tools=prepared_tools)` stores parsed messages + and prepared tools in Rust. +- `session.render_ids()`, `session.render_ids_np()`, + `session.bridge_to_next_turn()`, and `session.bridge_to_next_turn_np()` are + available. +- Session messages are stored behind `Arc>`, so repeated + `session.render_ids()` calls clone only a pointer before releasing the GIL. +- Benchmark rows now include `session_render_ids`. +- Bridge implementations that only need token IDs now use token-id-only render + buffers, avoiding per-token message-index allocation on the extension path. +- `RendererSession.bridge_to_next_turn(..., update=False)` and + `bridge_to_next_turn_np(..., update=False)` allow repeatable measurement of + an initialized session bridge without mutating the stored prompt between + benchmark iterations. +- Benchmark rows now include `session_bridge_to_next_turn`. +- Implemented `RendererSession.fork()` so benchmarks and callers can cheaply + reset an initialized session state without reparsing messages or tools. +- Benchmark rows now include `session_bridge_loop`, a multi-step bridge loop + that advances the same session through several generated turns. + +### C. Batched Render APIs + +Problem: + +Serving systems rarely render one prompt in isolation. Even if SGLang or vLLM +does the model batching, the renderer can batch preprocessing before requests +reach the engine. + +Idea: + +Add: + +```python +batch = renderer.render_batch_ids(messages_batch, tools=prepared_tools) +batch_np = renderer.render_batch_ids_np(messages_batch, tools=prepared_tools) +``` + +Native side: + +- Parse one Python outer list. +- Reuse prepared tools across the batch. +- Use Rayon only after measuring thread overhead. +- Return `list[list[int]]` for current SGLang/vLLM compatibility. +- Return a packed NumPy representation for internal pipelines: + `ids: np.ndarray[uint32]` plus `offsets: np.ndarray[int64]`. + +Benchmark cases: + +- 8, 32, and 128 prompt batches. +- Short prompts with large tools. +- Long histories without tools. +- Mixed prompt lengths. + +Expected proof: + +- Batch throughput in prompts per second improves. +- Per-prompt median latency improves for realistic batch sizes. +- No change required at SGLang/vLLM engine boundary if we return lists. + +Status: + +- Implemented `Renderer.render_batch_ids(...)`. +- The native batch path uses Rayon for batches of 8 or more prompts. +- Benchmark rows now include `render_batch_ids`. + +### D. Packed NumPy Token Buffers + +Problem: + +Returning Python lists creates one Python integer object per token. NumPy avoids +that, but current SGLang/vLLM HTTP-style boundaries usually still need lists. + +Idea: + +Keep NumPy for renderer-internal and client-side intermediate steps: + +```python +prompt_np = renderer.render_ids_np(messages, tools=prepared_tools) +parsed = renderer.parse_response_np(completion_np) +bridged_np = renderer.bridge_to_next_turn_np(prompt_np, completion_np, new_messages) +``` + +Native side: + +- Return `uint32` arrays for token IDs. +- Accept contiguous `uint32` arrays without copying. +- Add packed batch arrays with offsets. +- Avoid list conversion until the exact engine call that requires it. + +SGLang/vLLM applicability: + +- Useful before and after engine generation. +- Not true end-to-end zero-copy for JSON or APIs requiring `list[int]`. +- Still useful for offline pipelines, metrics, masks, and bridge-heavy loops. + +Benchmark cases: + +- Existing NumPy rows. +- Add explicit `.tolist()` boundary rows: + `render_ids_np_then_tolist`. +- Add packed batch rows: + `render_batch_ids_np_packed`. + +Expected proof: + +- NumPy path stays faster than list path inside renderer. +- `.tolist()` boundary cost is visible instead of hidden. +- We can decide which examples should use NumPy and which should stay list-only. + +Status: + +- Existing single-prompt NumPy paths remain covered. +- Implemented `Renderer.render_batch_ids_np_packed(...)`, returning + `(ids: np.ndarray[uint32], offsets: np.ndarray[int64])`. +- Benchmark rows now use the packed batch path as the native NumPy batch path. +- Benchmark rows now include `render_ids_np_then_tolist` so the cost of crossing + back to engine-compatible Python lists is visible instead of hidden. + +### E. Template Constant Token Caches + +Problem: + +Family templates contain repeated literal tokens: role tags, separators, +generation prompts, reasoning delimiters, tool delimiters, image sentinels, and +end markers. + +Idea: + +Pre-tokenize constant fragments when constructing each native renderer. + +Native side: + +- Store static token slices per family. +- Append cached token slices instead of repeatedly encoding literals. +- Keep text-render parity tests strict because whitespace and delimiter changes + are easy to miss. + +Benchmark cases: + +- No-tools short prompts. +- Long histories. +- Reasoning histories. +- Structured text parts. + +Expected proof: + +- Broad `render_ids` improvement across families. +- Stronger gains on many-turn conversations. + +### F. Dynamic Text Encode Batching + +Problem: + +Rendering many message parts can call the tokenizer repeatedly. Tokenizer call +overhead can dominate short fragments. + +Idea: + +Batch dynamic text segments where the tokenizer supports it, then interleave the +encoded pieces with cached template tokens. + +Native side: + +- Collect dynamic text fragments during render planning. +- Encode them in one tokenizer batch. +- Reassemble tokens in original order. +- Preserve message index accounting. + +Benchmark cases: + +- Long history. +- Structured text parts. +- Many short user/assistant turns. + +Expected proof: + +- Long history render improves. +- Message indices remain identical. +- No parse or bridge regressions. + +Status: + +- Added `Tokenizer::encode_batch_no_special(...)`, backed by the tokenizer + crate's batch-fast encoder. +- Added a token-only `TokenPlanBuf` that records literal-token and dynamic-text + operations, batch-encodes text fragments, then materializes the final token + stream in order. +- Qwen3 `render_ids` uses the planned batch-encode path only for long no-tool + histories. Short prompts, tool-heavy prompts, attributed `render()`, and + bridge paths stay on the lower-overhead direct buffer. +- Benchmark rows now expose the targeted long-history render gain while keeping + short-prompt and tool-response bridge regressions visible. +- Remaining rollout work: apply the same conservative dispatch to additional + families only when a family-specific benchmark shows a gain. + +### G. Fast Input Shape + +Problem: + +OpenAI-style dict messages are flexible but expensive to parse. Hot callers can +use a stricter shape if it is optional. + +Idea: + +Add a compact input API without replacing existing public APIs: + +```python +renderer.render_fast( + roles=["system", "user", "assistant"], + contents=["...", "...", "..."], + tools=prepared_tools, +) +``` + +Native side: + +- Validate parallel arrays once. +- Avoid generic `dict` and `Content` traversal. +- Keep support for structured parts in the generic path. + +Benchmark cases: + +- Short chat. +- Long chat. +- Tool-heavy prompt. + +Expected proof: + +- Fast shape wins when the caller can provide it. +- Existing API behavior is unchanged. + +Status: + +- Implemented `Renderer.render_fast_ids(roles, contents, ...)`. +- Implemented `Renderer.render_fast_ids_np(roles, contents, ...)`. +- Benchmark rows now include `render_fast_ids` where the scenario is compatible + with plain string roles and contents. + +### I. Cached Template Literal Tokens + +Problem: + +Several family renderers still encoded fixed literal fragments on every render: +newlines, role prefixes, generation prompts, and XML close fragments. + +Status: + +- Qwen3 already cached the highest-frequency literal fragments. +- Qwen3.5/Qwen3.6 now cache common literal tokens at construction time: + newline, double newline, role prefixes, assistant generation prefix, and + `\n`. +- Text render, bridge, and multimodal user rendering use the cached token + slices. +- Nemotron 3 now caches common standalone literal tokens at construction time: + newline, role prefixes, assistant generation prefix, and `\n`. +- GLM now caches standalone newline tokens used in GLM-4.5 generation prompts + and tool-call separators. +- MiniMax M2 now caches standalone newline, `ai\n`, and `tool` tokens. +- Kimi K2 now caches standalone newline and `assistant` tokens. +- Kimi K2.5 now caches standalone newline, `assistant`, ``, and + `` tokens for text, bridge, and multimodal paths. +- Prepared tool text blocks are now pre-rendered and pre-tokenized for Qwen3, + Qwen3.5/Qwen3.6, GLM, Nemotron 3, MiniMax M2, and Kimi K2 through the shared + native `ToolTextCache`. + +### H. Parse Response Fast Path + +Problem: + +Parse can be sub-microsecond in simple cases, but tool calls and reasoning blocks +still require scanning and allocation. + +Idea: + +Optimize parsing around byte/token markers: + +- Search token IDs for known delimiter IDs before decoding full text. +- Decode only spans that become content, reasoning, or JSON arguments. +- Avoid JSON parsing unless a tool call delimiter exists. +- Return borrowed or compact Python objects where PyO3 allows it. + +Benchmark cases: + +- Plain content. +- Reasoning plus content. +- Multi-tool call. +- Long content. + +Expected proof: + +- Parse geomean improves. +- Multi-tool parse improves without slowing plain content. + +Status: + +- Qwen3.5/Qwen3.6 no longer allocate a copied `Vec` for the no-thinking + parse path. Plain content and tool-call parse now borrow the stripped token + slice directly. +- GLM no longer allocates a copied token vector for the no-thinking parse path. +- Qwen3 now moves plain decoded content through the no-thinking split path + instead of cloning it into a second `String`. +- Remaining deeper work: token-delimiter partial decode for more families, and + avoiding regex/string work inside XML tool-call spans where possible. + +## SGLang and vLLM Compatibility + +The examples currently pass renderer-owned token IDs to engines: + +- SGLang offline uses `engine.generate(input_ids=prompt_ids, ...)`. +- SGLang online sends `"input_ids": prompt_ids` over JSON. +- vLLM offline uses `{"prompt_token_ids": prompt_ids}`. + +That means: + +- Prepared tools are directly usable. +- Session rendering and session bridge are directly usable. +- Batched list output is directly usable. +- NumPy buffers are useful inside the renderer/client pipeline, but many engine + calls still need `list[int]`. +- True zero-copy across HTTP JSON is not realistic without changing the server + protocol. + +The best PR path is to preserve the existing list APIs and add opt-in fast paths. +Examples can adopt fast paths only where the call site remains clear. + +The SGLang and vLLM multiturn examples now keep that shape: + +- Native runs call `prepare_tools(TOOLS)` once when the renderer exposes it. +- Native runs use `new_session(messages, tools=prepared_tools)` for the first + render and the next-turn bridge, so repeated serving-loop calls do not parse + the same prompt/tool dictionaries again. +- `render_fast_ids(...)` remains the lighter API for local loops that already + hold parallel role/content arrays and do not need structured content parts. + +## Native/PyO3 API Map + +This is the concrete mapping from the performance ideas above to the current +native extension surface and verification hooks. + +| Idea | PyO3/native API | Benchmark row | Test coverage | +|---|---|---|---| +| Prepared tools | `Renderer.prepare_tools(...)`, `PreparedTools` | `render_ids_prepared_tools`, `render_batch_ids:short_batch_prepared_tools` | `tests/test_native_numpy.py::test_prepared_tools_match_raw_tools`, parity tool rows | +| Native session | `Renderer.new_session(...)`, `RendererSession.render_ids(...)`, `RendererSession.render_ids_np(...)` | `session_render_ids` | `tests/test_native_numpy.py::test_session_render_and_bridge_match_renderer`, parity rows | +| Session bridge | `RendererSession.bridge_to_next_turn(...)`, `RendererSession.bridge_to_next_turn_np(...)` | `session_bridge_to_next_turn` | `tests/test_native_numpy.py::test_session_render_and_bridge_match_renderer`, `test_session_numpy_bridge_match_renderer` | +| Repeatable session loop | `RendererSession.fork()` plus `bridge_to_next_turn(update=True)` | `session_bridge_loop` | `tests/test_native_numpy.py::test_session_fork_preserves_prompt_state`, benchmark parity precheck | +| Batched render | `Renderer.render_batch_ids(...)` | `render_batch_ids` | `tests/test_native_numpy.py::test_render_batch_ids_matches_single_calls` | +| Packed NumPy batch | `Renderer.render_batch_ids_np_packed(...)` | `render_batch_ids` native NumPy path | `tests/test_native_numpy.py::test_render_batch_ids_np_packed_matches_single_calls` | +| Single-prompt NumPy | `render_ids_np(...)`, `parse_response_np(...)`, `bridge_to_next_turn_np(...)` | native NumPy path, `render_ids_np_then_tolist` | `test_render_ids_np_matches_list_api`, `test_parse_response_np_borrows_uint32_completion`, `test_bridge_to_next_turn_np_matches_list_api` | +| Fast input shape | `Renderer.render_fast_ids(...)`, `Renderer.render_fast_ids_np(...)` | `render_fast_ids` | `tests/test_native_numpy.py::test_render_fast_ids_matches_dict_messages` | +| Dynamic text batching | `Tokenizer::encode_batch_no_special(...)`, `TokenPlanBuf`, Qwen3, Qwen3.5/Qwen3.6, DeepSeek V3, MiniMax M2, and GLM long no-tool `render_ids` dispatch | long-history `render_ids` and `render_fast_ids` rows | full parity, native-forced render tests, benchmark parity precheck | +| Template literal caches | family constructors store pre-tokenized literals | normal render and bridge rows across families | full parity and native-forced render/bridge tests | +| Prepared tool text cache | `ToolTextCache` in core family renderers | prepared-tools rows across supported families | full parity and native-forced render/bridge tests | +| Parse fast paths | borrowed stripped slices in Qwen3.5/Qwen3.6 and GLM, moved decoded content in Qwen3 | `parse_response` rows | full parity parse rows and native-forced roundtrip tests | + +## PR Implementation Order + +1. Benchmark artifact and progress support. +2. Baseline benchmark artifact from the current branch. +3. Prepared tools. +4. Session object for multiturn render and bridge. +5. Packed NumPy batch output. +6. Template constant token caches. +7. Dynamic text encode batching. +8. Optional fast input shape. +9. Parse response fast paths. + +Each item should have: + +- A parity test. +- A benchmark row or scenario that isolates it. +- A benchmark summary against the previous baseline. +- No broad Python-side rewrite unless the benchmark shows the API is worth it. + +## Success Criteria + +Runtime work is ready for the PR when: + +- Full parity passes. +- Full native-forced Python test subset passes. +- Full benchmark artifacts exist for baseline and final commits. +- The PR description shows per-family and overall geomean speedup. +- Any regression over 5 percent is explained or fixed. +- SGLang and vLLM examples still show the simple list-based path. +- Optional fast paths are documented by example, not required for normal use. diff --git a/benchmarks/native_vs_python_qwen3.py b/benchmarks/native_vs_python_qwen3.py index 76b4008..e97762e 100644 --- a/benchmarks/native_vs_python_qwen3.py +++ b/benchmarks/native_vs_python_qwen3.py @@ -27,12 +27,15 @@ import json import logging import os +import platform import statistics +import subprocess import sys import time import tracemalloc from collections.abc import Callable, Sequence from dataclasses import dataclass +from pathlib import Path from typing import Any, cast from renderers import _native_router as router @@ -182,6 +185,23 @@ def np_speedup(self) -> float | None: return self.py_timing.median_ns / self.native_np_timing.median_ns +@dataclass(frozen=True) +class BaselineDiff: + family: str + operation: str + scenario: str + path: str + current_median_ns: float | None + baseline_median_ns: float | None + ratio: float | None + + @property + def percent_change(self) -> float | None: + if self.ratio is None: + return None + return (self.ratio - 1.0) * 100.0 + + DEFAULT_FAMILIES: tuple[FamilySpec, ...] = ( FamilySpec("qwen3", "Qwen/Qwen3-8B"), FamilySpec("qwen35", "Qwen/Qwen3.5-9B"), @@ -335,6 +355,16 @@ def _large_tool_only_messages() -> list[Message]: ] +def _batch_messages() -> list[list[Message]]: + return [ + [ + {"role": "system", "content": "You are concise."}, + {"role": "user", "content": f"Write option {idx} in one sentence."}, + ] + for idx in range(16) + ] + + def render_scenarios(family: str) -> list[RenderScenario]: scenarios = [ RenderScenario( @@ -641,6 +671,33 @@ def _as_ids(value: Any) -> list[int]: return list(value) +def _packed_batch_to_lists(value: Any) -> list[list[int]]: + ids, offsets = value + return [ + ids[offsets[idx] : offsets[idx + 1]].tolist() for idx in range(len(offsets) - 1) + ] + + +def _sum_token_count(batch: Sequence[Sequence[int]]) -> int: + return sum(len(ids) for ids in batch) + + +def _roles_and_contents( + messages: Sequence[Message], +) -> tuple[list[str], list[str]] | None: + roles: list[str] = [] + contents: list[str] = [] + for message in messages: + if message.get("tool_calls") or message.get("reasoning_content"): + return None + content = message.get("content", "") + if not isinstance(content, str): + return None + roles.append(str(message["role"])) + contents.append(content) + return roles, contents + + def _assert_parsed_equal(py_value: Any, native_value: Any) -> None: if py_value.content != native_value.content: raise AssertionError("parse_response content parity failed before benchmarking") @@ -701,6 +758,122 @@ def _bridge_inputs( return list(previous_prompt_ids), previous_completion_ids +def _session_bridge_to_next_turn( + session: Any, + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], +) -> Any: + return session.bridge_to_next_turn( + previous_completion_ids, new_messages, update=False + ) + + +def _session_bridge_to_next_turn_np( + session: Any, + previous_completion_ids: Any, + new_messages: Sequence[Message], +) -> Any: + return session.bridge_to_next_turn_np( + previous_completion_ids, new_messages, update=False + ) + + +def _bridge_loop( + renderer: Any, + previous_prompt_ids: Sequence[int], + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], + tools: list[ToolSpec] | None, + *, + steps: int, +) -> Any: + prompt_ids = list(previous_prompt_ids) + bridged = None + for _ in range(steps): + bridged = renderer.bridge_to_next_turn( + prompt_ids, + previous_completion_ids, + new_messages, + tools=tools, + ) + if bridged is None: + raise AssertionError("bridge loop returned None") + prompt_ids = list(bridged.token_ids) + return bridged + + +def _session_bridge_loop( + session: Any, + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + bridged = None + for _ in range(steps): + bridged = session.bridge_to_next_turn( + previous_completion_ids, new_messages, update=True + ) + if bridged is None: + raise AssertionError("session bridge loop returned None") + return bridged + + +def _session_bridge_loop_np( + session: Any, + previous_completion_ids: Any, + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + bridged = None + for _ in range(steps): + bridged = session.bridge_to_next_turn_np( + previous_completion_ids, new_messages, update=True + ) + if bridged is None: + raise AssertionError("session numpy bridge loop returned None") + return bridged + + +def _new_session_bridge_loop( + renderer: Any, + prompt: Sequence[Message], + tools: Any, + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + session = renderer.new_session(prompt, tools=tools) + session.render_ids(add_generation_prompt=True) + return _session_bridge_loop( + session, + previous_completion_ids, + new_messages, + steps=steps, + ) + + +def _new_session_bridge_loop_np( + renderer: Any, + prompt: Sequence[Message], + tools: Any, + previous_completion_ids: Any, + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + session = renderer.new_session(prompt, tools=tools) + session.render_ids_np(add_generation_prompt=True) + return _session_bridge_loop_np( + session, + previous_completion_ids, + new_messages, + steps=steps, + ) + + def _add_render_cases( cases: list[BenchCase], skipped: list[str], @@ -759,6 +932,179 @@ def _add_render_cases( ), ) ) + cases.append( + BenchCase( + spec.family, + spec.model, + "render_ids_np_then_tolist", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids_np( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ).tolist(), + ) + ) + if scenario.tools: + prepared_tools = native_renderer.prepare_tools(scenario.tools) + try: + native_prepared_ids = _as_ids( + native_renderer.render_ids( + scenario.messages, + tools=prepared_tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + if py_ids != native_prepared_ids: + raise AssertionError("prepared tools render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_ids_prepared_tools:{scenario.name}: {exc}" + ) from exc + skipped.append( + f"{spec.family}:render_ids_prepared_tools:{scenario.name}: {exc}" + ) + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "render_ids_prepared_tools", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario, prepared_tools=prepared_tools: ( + native_renderer.render_ids( + scenario.messages, + tools=prepared_tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ), + lambda scenario=scenario, prepared_tools=prepared_tools: ( + native_renderer.render_ids_np( + scenario.messages, + tools=prepared_tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ), + ) + ) + fast_input = _roles_and_contents(scenario.messages) + if fast_input is not None and scenario.tools is None: + roles, contents = fast_input + try: + native_fast_ids = _as_ids( + native_renderer.render_fast_ids( + roles, + contents, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + if py_ids != native_fast_ids: + raise AssertionError("fast input render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_fast_ids:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_fast_ids:{scenario.name}: {exc}") + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "render_fast_ids", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda roles=roles, contents=contents, add_generation_prompt=scenario.add_generation_prompt: ( + native_renderer.render_fast_ids( + roles, + contents, + add_generation_prompt=add_generation_prompt, + ) + ), + lambda roles=roles, contents=contents, add_generation_prompt=scenario.add_generation_prompt: ( + native_renderer.render_fast_ids_np( + roles, + contents, + add_generation_prompt=add_generation_prompt, + ) + ), + ) + ) + try: + prepared_tools = ( + native_renderer.prepare_tools(scenario.tools) + if scenario.tools is not None + else None + ) + session = native_renderer.new_session( + scenario.messages, + tools=prepared_tools, + ) + session_np = native_renderer.new_session( + scenario.messages, + tools=prepared_tools, + ) + session_ids = _as_ids( + session.render_ids( + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + session_np_ids = session_np.render_ids_np( + add_generation_prompt=scenario.add_generation_prompt, + ).tolist() + if py_ids != session_ids: + raise AssertionError("session render_ids parity failed") + if py_ids != session_np_ids: + raise AssertionError("session numpy render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:session_render_ids:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:session_render_ids:{scenario.name}: {exc}") + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "session_render_ids", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda session=session, add_generation_prompt=scenario.add_generation_prompt: ( + session.render_ids(add_generation_prompt=add_generation_prompt) + ), + lambda session=session_np, add_generation_prompt=scenario.add_generation_prompt: ( + session.render_ids_np(add_generation_prompt=add_generation_prompt) + ), + ) + ) def _add_parse_cases( @@ -917,6 +1263,282 @@ def _add_bridge_cases( ) ) + try: + native_tools = ( + native_renderer.prepare_tools(scenario.tools) + if scenario.tools is not None + else None + ) + session = native_renderer.new_session(scenario.prompt, tools=native_tools) + session_prompt = list(session.render_ids(add_generation_prompt=True)) + if session_prompt != native_prev_prompt: + raise AssertionError("session prompt parity failed") + session_bridge = session.bridge_to_next_turn( + prev_completion, + scenario.new_messages, + update=False, + ) + if session_bridge is None: + raise AssertionError("session bridge returned None") + if list(py_bridge.token_ids) != list(session_bridge.token_ids): + raise AssertionError("session bridge parity failed") + + session_np = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + session_prompt_np = session_np.render_ids_np(add_generation_prompt=True) + if session_prompt_np.tolist() != native_prev_prompt: + raise AssertionError("session numpy prompt parity failed") + session_bridge_np = session_np.bridge_to_next_turn_np( + native_prev_completion_np, + scenario.new_messages, + update=False, + ) + if session_bridge_np is None: + raise AssertionError("session numpy bridge returned None") + if list(py_bridge.token_ids) != session_bridge_np.tolist(): + raise AssertionError("session numpy bridge parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:session_bridge_to_next_turn:{scenario.name}: {exc}" + ) from exc + skipped.append( + f"{spec.family}:session_bridge_to_next_turn:{scenario.name}: {exc}" + ) + continue + + bench_session = native_renderer.new_session(scenario.prompt, tools=native_tools) + bench_session.render_ids(add_generation_prompt=True) + bench_session_np = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + bench_session_np.render_ids_np(add_generation_prompt=True) + + cases.append( + BenchCase( + spec.family, + spec.model, + "session_bridge_to_next_turn", + scenario.name, + len(py_bridge.token_ids), + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + py_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) + ), + lambda scenario=scenario, pc=prev_completion, session=bench_session: ( + _session_bridge_to_next_turn( + session, + pc, + scenario.new_messages, + ) + ), + lambda scenario=scenario, pc=native_prev_completion_np, session=bench_session_np: ( + _session_bridge_to_next_turn_np( + session, + pc, + scenario.new_messages, + ) + ), + ) + ) + + loop_steps = 4 + try: + py_loop = _bridge_loop( + py_renderer, + prev_prompt, + prev_completion, + scenario.new_messages, + scenario.tools, + steps=loop_steps, + ) + native_loop = _bridge_loop( + native_renderer, + prev_prompt, + prev_completion, + scenario.new_messages, + scenario.tools, + steps=loop_steps, + ) + if list(py_loop.token_ids) != list(native_loop.token_ids): + raise AssertionError("bridge loop parity failed") + + session_loop = _new_session_bridge_loop( + native_renderer, + scenario.prompt, + native_tools, + prev_completion, + scenario.new_messages, + steps=loop_steps, + ) + if list(py_loop.token_ids) != list(session_loop.token_ids): + raise AssertionError("session bridge loop parity failed") + + session_loop_np = _new_session_bridge_loop_np( + native_renderer, + scenario.prompt, + native_tools, + native_prev_completion_np, + scenario.new_messages, + steps=loop_steps, + ) + if list(py_loop.token_ids) != session_loop_np.tolist(): + raise AssertionError("session numpy bridge loop parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:session_bridge_loop:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:session_bridge_loop:{scenario.name}: {exc}") + continue + + bench_loop_session = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + bench_loop_session.render_ids(add_generation_prompt=True) + bench_loop_session_np = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + bench_loop_session_np.render_ids_np(add_generation_prompt=True) + + cases.append( + BenchCase( + spec.family, + spec.model, + "session_bridge_loop", + f"{scenario.name}_{loop_steps}_steps", + len(py_loop.token_ids), + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + _bridge_loop( + py_renderer, + pp, + pc, + scenario.new_messages, + scenario.tools, + steps=loop_steps, + ) + ), + lambda scenario=scenario, pc=prev_completion, session=bench_loop_session: ( + _session_bridge_loop( + session.fork(), + pc, + scenario.new_messages, + steps=loop_steps, + ) + ), + lambda scenario=scenario, pc=native_prev_completion_np, session=bench_loop_session_np: ( + _session_bridge_loop_np( + session.fork(), + pc, + scenario.new_messages, + steps=loop_steps, + ) + ), + ) + ) + + +def _add_batch_cases( + cases: list[BenchCase], + skipped: list[str], + *, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: + batch = _batch_messages() + batch_scenarios: list[tuple[str, list[ToolSpec] | None, Any]] = [ + ("short_batch", None, None) + ] + try: + prepared_tools = native_renderer.prepare_tools(TOOLS) + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_batch_ids:prepare_tools: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_batch_ids:prepare_tools: {exc}") + prepared_tools = None + batch_scenarios.append(("short_batch_prepared_tools", TOOLS, prepared_tools)) + + for scenario_name, tools, prepared_tools in batch_scenarios: + if tools is not None and prepared_tools is None: + continue + native_tools = prepared_tools if prepared_tools is not None else None + try: + py_batch = [ + list( + py_renderer.render_ids( + messages, tools=tools, add_generation_prompt=True + ) + ) + for messages in batch + ] + native_batch = [ + list(ids) + for ids in native_renderer.render_batch_ids( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ] + native_packed_batch = _packed_batch_to_lists( + native_renderer.render_batch_ids_np_packed( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ) + if py_batch != native_batch: + raise AssertionError("batch render_ids parity failed") + if py_batch != native_packed_batch: + raise AssertionError("packed numpy batch render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_batch_ids:{scenario_name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_batch_ids:{scenario_name}: {exc}") + continue + + cases.append( + BenchCase( + spec.family, + spec.model, + "render_batch_ids", + scenario_name, + _sum_token_count(py_batch), + lambda batch=batch, tools=tools: [ + py_renderer.render_ids( + messages, + tools=tools, + add_generation_prompt=True, + ) + for messages in batch + ], + lambda batch=batch, native_tools=native_tools: ( + native_renderer.render_batch_ids( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ), + lambda batch=batch, native_tools=native_tools: ( + native_renderer.render_batch_ids_np_packed( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ), + ) + ) + def build_cases( *, @@ -962,6 +1584,14 @@ def build_cases( native_renderer=native_renderer, strict=strict, ) + _add_batch_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) cases.extend(family_cases) print( f"prepared family={spec.family} model={spec.model} " @@ -988,16 +1618,58 @@ def run_cases( gc.disable() try: rows: list[BenchRow] = [] + current_family: str | None = None + family_started_ns = time.perf_counter_ns() + total_steps = len(cases) * 4 + step = 0 + + def progress(case: BenchCase, label: str) -> None: + nonlocal step + step += 1 + print( + f"[{step}/{total_steps}] {case.family} {case.operation} " + f"{case.scenario}: {label}", + file=sys.stderr, + ) + + def finish_family(family: str) -> None: + family_rows = [row for row in rows if row.family == family] + if not family_rows: + return + elapsed_s = (time.perf_counter_ns() - family_started_ns) / 1_000_000_000 + list_speedup = geometric_mean([row.list_speedup for row in family_rows]) + np_speedup = geometric_mean( + [row.np_speedup for row in family_rows if row.np_speedup is not None] + ) + print( + f"family={family} rows={len(family_rows)} " + f"list_geomean={list_speedup:.2f}x " + f"np_geomean={np_speedup:.2f}x elapsed={elapsed_s:.1f}s", + file=sys.stderr, + ) + for case in cases: + if current_family is None: + current_family = case.family + family_started_ns = time.perf_counter_ns() + elif case.family != current_family: + finish_family(current_family) + current_family = case.family + family_started_ns = time.perf_counter_ns() + + progress(case, "python") py_timing = time_case(case.py_fn, min_time_s=min_time_s, repeats=repeats) + progress(case, "native list") native_timing = time_case( case.native_fn, min_time_s=min_time_s, repeats=repeats ) + progress(case, "native np") native_np_timing = ( time_case(case.native_np_fn, min_time_s=min_time_s, repeats=repeats) if case.native_np_fn is not None else None ) + progress(case, "memory") py_memory = memory_case(case.py_fn, loops=memory_loops) native_memory = memory_case(case.native_fn, loops=memory_loops) native_np_memory = ( @@ -1020,6 +1692,8 @@ def run_cases( native_np_memory, ) ) + if current_family is not None: + finish_family(current_family) finally: gc.enable() return rows @@ -1034,6 +1708,354 @@ def geometric_mean(values: Sequence[float]) -> float: return product ** (1.0 / len(values)) +def _run_text(args: Sequence[str]) -> str | None: + try: + result = subprocess.run( + args, + check=True, + capture_output=True, + text=True, + ) + except (OSError, subprocess.CalledProcessError): + return None + return result.stdout.strip() or None + + +def _git_metadata() -> dict[str, Any]: + return { + "commit": _run_text(["git", "rev-parse", "HEAD"]), + "short_commit": _run_text(["git", "rev-parse", "--short", "HEAD"]), + "dirty": bool(_run_text(["git", "status", "--porcelain"])), + "branch": _run_text(["git", "branch", "--show-current"]), + } + + +def _cpu_model() -> str | None: + if sys.platform == "darwin": + return _run_text(["sysctl", "-n", "machdep.cpu.brand_string"]) + if sys.platform.startswith("linux"): + try: + cpuinfo = Path("/proc/cpuinfo").read_text(encoding="utf-8") + except OSError: + return None + for line in cpuinfo.splitlines(): + if line.startswith("model name"): + return line.split(":", 1)[1].strip() + return platform.processor() or None + + +def _timing_dict(timing: Timing) -> dict[str, Any]: + return { + "loops": timing.loops, + "median_ns": timing.median_ns, + "median_us": timing.median_us, + "min_ns": timing.min_ns, + "max_ns": timing.max_ns, + } + + +def _memory_dict(memory: Memory) -> dict[str, Any]: + return { + "loops": memory.loops, + "peak_bytes": memory.peak_bytes, + "peak_kib": memory.peak_kib, + } + + +def _result_rows(rows: Sequence[BenchRow]) -> list[dict[str, Any]]: + result: list[dict[str, Any]] = [] + for row in rows: + base = { + "family": row.family, + "model": row.model, + "operation": row.operation, + "scenario": row.scenario, + "token_count": row.token_count, + } + result.append( + { + **base, + "path": "python", + "timing": _timing_dict(row.py_timing), + "memory": _memory_dict(row.py_memory), + "speedup_vs_python": 1.0, + } + ) + result.append( + { + **base, + "path": "native_list", + "timing": _timing_dict(row.native_timing), + "memory": _memory_dict(row.native_memory), + "speedup_vs_python": row.list_speedup, + } + ) + if row.native_np_timing is not None and row.native_np_memory is not None: + result.append( + { + **base, + "path": "native_np", + "timing": _timing_dict(row.native_np_timing), + "memory": _memory_dict(row.native_np_memory), + "speedup_vs_python": row.np_speedup, + } + ) + return result + + +def _family_summaries(rows: Sequence[BenchRow]) -> list[dict[str, Any]]: + summaries: list[dict[str, Any]] = [] + for family in sorted({row.family for row in rows}): + family_rows = [row for row in rows if row.family == family] + summaries.append( + { + "family": family, + "rows": len(family_rows), + "list_geomean_speedup": geometric_mean( + [row.list_speedup for row in family_rows] + ), + "np_geomean_speedup": geometric_mean( + [ + row.np_speedup + for row in family_rows + if row.np_speedup is not None + ] + ), + } + ) + return summaries + + +def _overall_summary(rows: Sequence[BenchRow]) -> dict[str, Any]: + return { + "rows": len(rows), + "list_geomean_speedup": geometric_mean([row.list_speedup for row in rows]), + "np_geomean_speedup": geometric_mean( + [row.np_speedup for row in rows if row.np_speedup is not None] + ), + } + + +def build_result_document( + *, + rows: Sequence[BenchRow], + skipped: Sequence[str], + args: argparse.Namespace, + native_module: Any, +) -> dict[str, Any]: + return { + "schema_version": 1, + "metadata": { + "git": _git_metadata(), + "python": { + "version": sys.version, + "executable": sys.executable, + }, + "rust": { + "rustc": _run_text(["rustc", "--version"]), + }, + "platform": { + "platform": platform.platform(), + "machine": platform.machine(), + "processor": platform.processor(), + "cpu_model": _cpu_model(), + }, + "native_extension": { + "module_file": getattr(native_module, "__file__", None), + "build_mode": "unknown", + }, + }, + "args": { + "families": args.families, + "model": args.model, + "min_time": args.min_time, + "repeats": args.repeats, + "memory_loops": args.memory_loops, + "strict": args.strict, + }, + "summary": _overall_summary(rows), + "families": _family_summaries(rows), + "rows": _result_rows(rows), + "skipped": list(skipped), + } + + +def write_json(path: str, document: dict[str, Any]) -> None: + output = Path(path) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(json.dumps(document, indent=2, sort_keys=True), encoding="utf-8") + + +def _row_key(row: dict[str, Any]) -> tuple[str, str, str, str]: + return ( + str(row["family"]), + str(row["operation"]), + str(row["scenario"]), + str(row["path"]), + ) + + +def compare_to_baseline( + current: dict[str, Any], baseline: dict[str, Any] +) -> list[BaselineDiff]: + current_by_key = {_row_key(row): row for row in current.get("rows", [])} + baseline_by_key = {_row_key(row): row for row in baseline.get("rows", [])} + diffs: list[BaselineDiff] = [] + for key in sorted(set(current_by_key) | set(baseline_by_key)): + current_row = current_by_key.get(key) + baseline_row = baseline_by_key.get(key) + current_median = ( + current_row["timing"]["median_ns"] if current_row is not None else None + ) + baseline_median = ( + baseline_row["timing"]["median_ns"] if baseline_row is not None else None + ) + ratio = ( + current_median / baseline_median + if current_median is not None and baseline_median is not None + else None + ) + family, operation, scenario, path = key + diffs.append( + BaselineDiff( + family=family, + operation=operation, + scenario=scenario, + path=path, + current_median_ns=current_median, + baseline_median_ns=baseline_median, + ratio=ratio, + ) + ) + return diffs + + +def _load_baseline(path: str | None) -> dict[str, Any] | None: + if path is None: + return None + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def _format_us(ns: float | None) -> str: + if ns is None: + return "-" + return f"{ns / 1000.0:.3f}" + + +def _format_change(diff: BaselineDiff) -> str: + if diff.percent_change is None: + return "-" + return f"{diff.percent_change:+.1f}%" + + +def _diff_label(diff: BaselineDiff) -> str: + return f"{diff.family}/{diff.operation}/{diff.scenario}/{diff.path}" + + +def write_markdown( + path: str, + document: dict[str, Any], + baseline_diffs: Sequence[BaselineDiff], +) -> None: + output = Path(path) + output.parent.mkdir(parents=True, exist_ok=True) + summary = document["summary"] + lines = [ + "# Native Runtime Benchmark", + "", + "## Summary", + "", + "| rows | list geomean | np geomean | commit | dirty |", + "|---:|---:|---:|---|---|", + ( + f"| {summary['rows']} | {summary['list_geomean_speedup']:.2f}x | " + f"{summary['np_geomean_speedup']:.2f}x | " + f"`{document['metadata']['git']['short_commit']}` | " + f"{document['metadata']['git']['dirty']} |" + ), + "", + "## Families", + "", + "| family | rows | list geomean | np geomean |", + "|---|---:|---:|---:|", + ] + for item in document["families"]: + lines.append( + f"| `{item['family']}` | {item['rows']} | " + f"{item['list_geomean_speedup']:.2f}x | " + f"{item['np_geomean_speedup']:.2f}x |" + ) + + if baseline_diffs: + comparable = [diff for diff in baseline_diffs if diff.ratio is not None] + regressions = sorted( + [diff for diff in comparable if diff.ratio and diff.ratio > 1.05], + key=lambda diff: diff.ratio or 0.0, + reverse=True, + )[:10] + improvements = sorted( + [diff for diff in comparable if diff.ratio and diff.ratio < 0.95], + key=lambda diff: diff.ratio or 1.0, + )[:10] + missing_current = [ + diff for diff in baseline_diffs if diff.current_median_ns is None + ] + new_rows = [diff for diff in baseline_diffs if diff.baseline_median_ns is None] + + lines.extend( + [ + "", + "## Worst Regressions", + "", + "| case | current us | baseline us | change |", + "|---|---:|---:|---:|", + ] + ) + if regressions: + for diff in regressions: + lines.append( + f"| `{_diff_label(diff)}` | {_format_us(diff.current_median_ns)} | " + f"{_format_us(diff.baseline_median_ns)} | {_format_change(diff)} |" + ) + else: + lines.append("| none | - | - | - |") + + lines.extend( + [ + "", + "## Best Improvements", + "", + "| case | current us | baseline us | change |", + "|---|---:|---:|---:|", + ] + ) + if improvements: + for diff in improvements: + lines.append( + f"| `{_diff_label(diff)}` | {_format_us(diff.current_median_ns)} | " + f"{_format_us(diff.baseline_median_ns)} | {_format_change(diff)} |" + ) + else: + lines.append("| none | - | - | - |") + + if missing_current or new_rows: + lines.extend(["", "## Coverage Changes", ""]) + if missing_current: + lines.append("Missing current rows:") + lines.extend(f"- `{_diff_label(diff)}`" for diff in missing_current) + if new_rows: + lines.append("New rows:") + lines.extend(f"- `{_diff_label(diff)}`" for diff in new_rows) + + lines.extend(["", "## Skipped Cases", ""]) + if document["skipped"]: + lines.extend(f"- {item}" for item in document["skipped"]) + else: + lines.append("None.") + lines.append("") + output.write_text("\n".join(lines), encoding="utf-8") + + def print_results( rows: Sequence[BenchRow], skipped: Sequence[str], memory_loops: int ) -> None: @@ -1128,6 +2150,26 @@ def main() -> None: action="store_true", help="Fail instead of skipping families whose tokenizer is unavailable.", ) + parser.add_argument( + "--json-out", + help="Write structured benchmark results to this JSON file.", + ) + parser.add_argument( + "--markdown-out", + help="Write a Markdown benchmark summary to this file.", + ) + parser.add_argument( + "--baseline", + help="Compare current results against a previous JSON benchmark artifact.", + ) + parser.add_argument( + "--fail-on-regression", + type=float, + help=( + "Exit non-zero when a baseline row regresses by more than this " + "percentage. Missing current baseline rows also fail this gate." + ), + ) args = parser.parse_args() os.environ.pop("RENDERERS_NATIVE", None) @@ -1150,7 +2192,57 @@ def main() -> None: repeats=args.repeats, memory_loops=args.memory_loops, ) + document = build_result_document( + rows=rows, + skipped=skipped, + args=args, + native_module=native, + ) + baseline = _load_baseline(args.baseline) + baseline_diffs = compare_to_baseline(document, baseline) if baseline else [] print_results(rows, skipped, args.memory_loops) + if args.json_out: + if baseline_diffs: + document["baseline"] = { + "path": args.baseline, + "diffs": [ + { + "family": diff.family, + "operation": diff.operation, + "scenario": diff.scenario, + "path": diff.path, + "current_median_ns": diff.current_median_ns, + "baseline_median_ns": diff.baseline_median_ns, + "ratio": diff.ratio, + "percent_change": diff.percent_change, + } + for diff in baseline_diffs + ], + } + write_json(args.json_out, document) + print(f"wrote json={args.json_out}", file=sys.stderr) + if args.markdown_out: + write_markdown(args.markdown_out, document, baseline_diffs) + print(f"wrote markdown={args.markdown_out}", file=sys.stderr) + + if args.fail_on_regression is not None and baseline_diffs: + threshold = args.fail_on_regression / 100.0 + regressions = [ + diff + for diff in baseline_diffs + if diff.ratio is not None and diff.ratio > 1.0 + threshold + ] + missing_current = [ + diff for diff in baseline_diffs if diff.current_median_ns is None + ] + if regressions or missing_current: + details = ", ".join( + _diff_label(diff) for diff in [*regressions, *missing_current][:5] + ) + raise SystemExit( + f"benchmark regression gate failed: {len(regressions)} " + f"regressions, {len(missing_current)} missing current rows; {details}" + ) if __name__ == "__main__": diff --git a/crates/renderers-core/src/emit.rs b/crates/renderers-core/src/emit.rs index d4ff82f..ae95268 100644 --- a/crates/renderers-core/src/emit.rs +++ b/crates/renderers-core/src/emit.rs @@ -8,6 +8,22 @@ use crate::tokenizer::Tokenizer; use crate::types::{RenderError, RenderedTokens, SCAFFOLD_IDX}; +pub trait TokenSink { + fn special(&mut self, token_id: u32, msg_idx: i32); + fn ids(&mut self, token_ids: &[u32], msg_idx: i32); + fn text(&mut self, text: &str, msg_idx: i32) -> Result<(), RenderError>; + + #[inline] + fn scaffold_special(&mut self, token_id: u32) { + self.special(token_id, SCAFFOLD_IDX); + } + + #[inline] + fn scaffold_text(&mut self, text: &str) -> Result<(), RenderError> { + self.text(text, SCAFFOLD_IDX) + } +} + /// Mutable render-time buffer paired with a tokenizer reference. /// /// Holds both the token stream and the parallel `message_indices` array. @@ -132,3 +148,99 @@ impl<'tok> RenderBuf<'tok> { self.tokens.is_empty() } } + +impl TokenSink for RenderBuf<'_> { + #[inline] + fn special(&mut self, token_id: u32, msg_idx: i32) { + RenderBuf::special(self, token_id, msg_idx); + } + + #[inline] + fn ids(&mut self, token_ids: &[u32], msg_idx: i32) { + RenderBuf::ids(self, token_ids, msg_idx); + } + + #[inline] + fn text(&mut self, text: &str, msg_idx: i32) -> Result<(), RenderError> { + RenderBuf::text(self, text, msg_idx) + } +} + +#[derive(Debug)] +enum TokenPlanOp { + Ids(Vec), + Special(u32), + Text(String), +} + +#[derive(Debug)] +pub struct TokenPlanBuf<'tok> { + ops: Vec, + tokenizer: &'tok Tokenizer, + cap_hint: usize, + text_count: usize, +} + +impl<'tok> TokenPlanBuf<'tok> { + pub fn new(tokenizer: &'tok Tokenizer, hint: usize) -> Self { + Self { + ops: Vec::with_capacity(hint.min(256)), + tokenizer, + cap_hint: hint, + text_count: 0, + } + } + + pub fn into_token_ids(self) -> Result, RenderError> { + let encoded_texts = if self.text_count == 0 { + Vec::new() + } else { + let texts: Vec<&str> = self + .ops + .iter() + .filter_map(|op| match op { + TokenPlanOp::Text(text) => Some(text.as_str()), + _ => None, + }) + .collect(); + self.tokenizer.encode_batch_no_special(texts)? + }; + + let mut text_idx = 0; + let mut tokens = Vec::with_capacity(self.cap_hint); + for op in self.ops { + match op { + TokenPlanOp::Ids(ids) => tokens.extend_from_slice(&ids), + TokenPlanOp::Special(id) => tokens.push(id), + TokenPlanOp::Text(_) => { + tokens.extend_from_slice(encoded_texts[text_idx].as_slice()); + text_idx += 1; + } + } + } + Ok(tokens) + } +} + +impl TokenSink for TokenPlanBuf<'_> { + #[inline] + fn special(&mut self, token_id: u32, _msg_idx: i32) { + self.ops.push(TokenPlanOp::Special(token_id)); + } + + #[inline] + fn ids(&mut self, token_ids: &[u32], _msg_idx: i32) { + if !token_ids.is_empty() { + self.ops.push(TokenPlanOp::Ids(token_ids.to_vec())); + } + } + + #[inline] + fn text(&mut self, text: &str, _msg_idx: i32) -> Result<(), RenderError> { + if !text.is_empty() { + self.ops.push(TokenPlanOp::Text(text.to_string())); + self.text_count += 1; + } + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/deepseek_v3.rs b/crates/renderers-core/src/families/deepseek_v3.rs index 0429296..260257a 100644 --- a/crates/renderers-core/src/families/deepseek_v3.rs +++ b/crates/renderers-core/src/families/deepseek_v3.rs @@ -18,7 +18,7 @@ use serde_json::Value as JsonValue; use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; -use crate::emit::RenderBuf; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; use crate::parsing::deepseek_v3::parse_deepseek_v3; use crate::tokenizer::Tokenizer; use crate::traits::Renderer; @@ -157,58 +157,23 @@ impl DeepSeekV3Renderer { fn estimate_capacity(messages: &[Message]) -> usize { messages.len().max(1) * 256 + 64 } -} -fn python_json_dumps(value: &JsonValue) -> String { - match value { - JsonValue::Null => "null".to_string(), - JsonValue::Bool(v) => v.to_string(), - JsonValue::Number(v) => v.to_string(), - JsonValue::String(v) => serde_json::to_string(v).unwrap_or_else(|_| "\"\"".to_string()), - JsonValue::Array(items) => { - let mut out = String::from("["); - for (i, item) in items.iter().enumerate() { - if i > 0 { - out.push_str(", "); - } - out.push_str(&python_json_dumps(item)); - } - out.push(']'); - out - } - JsonValue::Object(map) => { - let mut out = String::from("{"); - for (i, (key, item)) in map.iter().enumerate() { - if i > 0 { - out.push_str(", "); - } - out.push_str(&serde_json::to_string(key).unwrap_or_else(|_| "\"\"".to_string())); - out.push_str(": "); - out.push_str(&python_json_dumps(item)); - } - out.push('}'); - out - } + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) } -} -impl Renderer for DeepSeekV3Renderer { - fn render( + fn render_into_buf( &self, + buf: &mut impl TokenSink, messages: &[Message], - _tools: Option<&[ToolSpec]>, add_generation_prompt: bool, - ) -> Result { + ) -> Result<(), RenderError> { if messages.is_empty() { return Err(RenderError::EmptyMessages); } - let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages)); - // BOS buf.scaffold_special(self.bos); - // Leading system messages: concat with "\n\n", emit as plain text - // before any role marker, attributed to message index 0. let mut first_non_sys = 0usize; let mut sys_parts: Vec<&str> = Vec::new(); for msg in messages { @@ -227,22 +192,16 @@ impl Renderer for DeepSeekV3Renderer { let idx = i as i32; let content = msg.text_content(); match msg.role.as_str() { - "system" => { - // Post-initial system → treat as user - buf.special(self.user_token, idx); - buf.text(content, idx)?; - } - "user" => { + "system" | "user" => { buf.special(self.user_token, idx); buf.text(content, idx)?; } - "assistant" => self.emit_assistant(&mut buf, msg, i, messages)?, - "tool" => self.emit_tool(&mut buf, messages, i)?, - _ => {} // mirror Python: silent skip on unknown role + "assistant" => self.emit_assistant(buf, msg, i, messages)?, + "tool" => self.emit_tool(buf, messages, i)?, + _ => {} } } - // Generation prompt — skip <|Assistant|> after a tool output if add_generation_prompt { let last_role = messages.last().map_or("", |m| m.role.as_str()); if last_role != "tool" { @@ -253,9 +212,73 @@ impl Renderer for DeepSeekV3Renderer { } } + Ok(()) + } +} + +fn python_json_dumps(value: &JsonValue) -> String { + match value { + JsonValue::Null => "null".to_string(), + JsonValue::Bool(v) => v.to_string(), + JsonValue::Number(v) => v.to_string(), + JsonValue::String(v) => serde_json::to_string(v).unwrap_or_else(|_| "\"\"".to_string()), + JsonValue::Array(items) => { + let mut out = String::from("["); + for (i, item) in items.iter().enumerate() { + if i > 0 { + out.push_str(", "); + } + out.push_str(&python_json_dumps(item)); + } + out.push(']'); + out + } + JsonValue::Object(map) => { + let mut out = String::from("{"); + for (i, (key, item)) in map.iter().enumerate() { + if i > 0 { + out.push_str(", "); + } + out.push_str(&serde_json::to_string(key).unwrap_or_else(|_| "\"\"".to_string())); + out.push_str(": "); + out.push_str(&python_json_dumps(item)); + } + out.push('}'); + out + } + } +} + +impl Renderer for DeepSeekV3Renderer { + fn render( + &self, + messages: &[Message], + _tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages)); + self.render_into_buf(&mut buf, messages, add_generation_prompt)?; Ok(buf.into_rendered()) } + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { parse_deepseek_v3( &self.tokenizer, @@ -296,7 +319,8 @@ impl Renderer for DeepSeekV3Renderer { return Ok(None); }; - let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(new_messages)); + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, Self::estimate_capacity(new_messages)); for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; @@ -347,7 +371,7 @@ impl Renderer for DeepSeekV3Renderer { impl DeepSeekV3Renderer { fn emit_assistant( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, msg: &Message, msg_idx: usize, messages: &[Message], @@ -401,7 +425,7 @@ impl DeepSeekV3Renderer { fn emit_tool( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], msg_idx: usize, ) -> Result<(), RenderError> { diff --git a/crates/renderers-core/src/families/glm.rs b/crates/renderers-core/src/families/glm.rs index bc1893d..d1190c5 100644 --- a/crates/renderers-core/src/families/glm.rs +++ b/crates/renderers-core/src/families/glm.rs @@ -30,11 +30,12 @@ use serde_json::Value as JsonValue; use crate::bridge::reject_assistant_in_extension; -use crate::emit::RenderBuf; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; use crate::json::{to_string_python, tool_spec_inner_value, tool_spec_template_value}; use crate::parsing::glm::parse_glm; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; use crate::traits::Renderer; use crate::types::{ Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, @@ -125,6 +126,8 @@ pub struct GlmRenderer { tool_response: Option, tool_response_end: Option, + newline_tokens: Vec, + tool_text_cache: ToolTextCache, stop_tokens: Vec, } @@ -167,6 +170,7 @@ impl GlmRenderer { Some(tokenizer.token_to_id_strict("")?), ) }; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); Ok(Self { tokenizer, @@ -191,6 +195,8 @@ impl GlmRenderer { arg_value_end, tool_response, tool_response_end, + newline_tokens, + tool_text_cache: ToolTextCache::default(), stop_tokens: vec![endoftext, user, observation], }) } @@ -216,8 +222,11 @@ impl GlmRenderer { -1 } - fn format_tool_spec(&self, tool: &ToolSpec) -> Result { - let spec = if self.variant == Variant::Glm51 { + fn format_tool_spec_for_variant( + variant: Variant, + tool: &ToolSpec, + ) -> Result { + let spec = if variant == Variant::Glm51 { tool_spec_inner_value(tool) } else { tool_spec_template_value(tool) @@ -232,23 +241,26 @@ impl GlmRenderer { _ => serde_json::to_string(arg_value).unwrap_or_default(), } } -} -impl Renderer for GlmRenderer { - fn render( + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + messages.len().max(1) * 256 + tools.map_or(0, |t| t.len() * 256 + 256) + } + + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + + fn render_into_buf( &self, + buf: &mut impl TokenSink, messages: &[Message], tools: Option<&[ToolSpec]>, add_generation_prompt: bool, - ) -> Result { + ) -> Result<(), RenderError> { if messages.is_empty() { return Err(RenderError::EmptyMessages); } let nl = self.nl_after_role(); - let mut buf = RenderBuf::new( - &self.tokenizer, - messages.len().max(1) * 256 + tools.map_or(0, |t| t.len() * 256 + 256), - ); // Prefix buf.scaffold_special(self.gmask); @@ -258,18 +270,32 @@ impl Renderer for GlmRenderer { if let Some(t) = tools { if !t.is_empty() { buf.scaffold_special(self.system); - let mut s = String::with_capacity(512); - s.push_str(TOOLS_HEADER_GLM5); - for tool in t { - s.push_str(&self.format_tool_spec(tool)?); - s.push('\n'); - } - s.push_str(if self.variant == Variant::Glm45 { - TOOLS_FOOTER_GLM45 - } else { - TOOLS_FOOTER_GLM5 - }); - buf.scaffold_text(&s)?; + let variant = self.variant; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + t, + match variant { + Variant::Glm45 => 45, + Variant::Glm5 => 50, + Variant::Glm51 => 51, + }, + "", + || { + let mut s = String::with_capacity(512); + s.push_str(TOOLS_HEADER_GLM5); + for tool in t { + s.push_str(&Self::format_tool_spec_for_variant(variant, tool)?); + s.push('\n'); + } + s.push_str(if variant == Variant::Glm45 { + TOOLS_FOOTER_GLM45 + } else { + TOOLS_FOOTER_GLM5 + }); + Ok(s) + }, + )?; + buf.ids(tool_tokens.as_slice(), SCAFFOLD_IDX); } } @@ -306,9 +332,9 @@ impl Renderer for GlmRenderer { self.preserve_all_thinking, self.preserve_thinking_between_tool_calls, ); - self.emit_assistant(&mut buf, msg, idx, last_ui, preserve_thinking)?; + self.emit_assistant(buf, msg, idx, last_ui, preserve_thinking)?; } - "tool" => self.emit_tool(&mut buf, messages, i, content, idx)?, + "tool" => self.emit_tool(buf, messages, i, content, idx)?, _ => {} // mirror Python: silent skip } } @@ -317,7 +343,7 @@ impl Renderer for GlmRenderer { buf.scaffold_special(self.assistant); if self.variant == Variant::Glm45 { if !self.enable_thinking { - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.think); buf.scaffold_special(self.think_end); } @@ -329,9 +355,40 @@ impl Renderer for GlmRenderer { } } + Ok(()) + } +} + +impl Renderer for GlmRenderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; Ok(buf.into_rendered()) } + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { parse_glm( &self.tokenizer, @@ -385,7 +442,8 @@ impl Renderer for GlmRenderer { let last_prev = *combined.last().expect("non-empty"); let nl = self.nl_after_role(); - let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; @@ -430,7 +488,7 @@ impl Renderer for GlmRenderer { buf.scaffold_special(self.assistant); if self.variant == Variant::Glm45 { if !self.enable_thinking { - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.think); buf.scaffold_special(self.think_end); } @@ -455,7 +513,7 @@ impl Renderer for GlmRenderer { impl GlmRenderer { fn emit_assistant( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, msg: &Message, msg_idx: i32, last_user_index: i32, @@ -514,7 +572,7 @@ impl GlmRenderer { #[allow(clippy::too_many_arguments)] fn emit_assistant_glm5_family( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, msg: &Message, msg_idx: i32, reasoning_content: &str, @@ -568,7 +626,7 @@ impl GlmRenderer { #[allow(clippy::too_many_arguments)] fn emit_assistant_glm45( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, msg: &Message, msg_idx: i32, reasoning_content: &str, @@ -577,12 +635,12 @@ impl GlmRenderer { preserve_thinking: bool, ) -> Result<(), RenderError> { if (msg_idx > last_user_index || preserve_thinking) && !reasoning_content.is_empty() { - buf.text("\n", msg_idx)?; + buf.ids(&self.newline_tokens, msg_idx); buf.special(self.think, msg_idx); buf.text(reasoning_content.trim(), msg_idx)?; buf.special(self.think_end, msg_idx); } else { - buf.text("\n", msg_idx)?; + buf.ids(&self.newline_tokens, msg_idx); buf.special(self.think, msg_idx); buf.special(self.think_end, msg_idx); } @@ -605,7 +663,7 @@ impl GlmRenderer { for tc in tool_calls { let name = tc.function.name.as_str(); if trimmed.is_empty() { - buf.text("\n", msg_idx)?; + buf.ids(&self.newline_tokens, msg_idx); } buf.special(self.tool_call, msg_idx); let mut head = String::with_capacity(name.len() + 1); @@ -624,11 +682,11 @@ impl GlmRenderer { buf.special(self.arg_key, msg_idx); buf.text(k, msg_idx)?; buf.special(self.arg_key_end, msg_idx); - buf.text("\n", msg_idx)?; + buf.ids(&self.newline_tokens, msg_idx); buf.special(self.arg_value, msg_idx); buf.text(&Self::render_arg_value(v), msg_idx)?; buf.special(self.arg_value_end, msg_idx); - buf.text("\n", msg_idx)?; + buf.ids(&self.newline_tokens, msg_idx); } } buf.special(self.tool_call_end, msg_idx); @@ -638,7 +696,7 @@ impl GlmRenderer { fn emit_tool( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], msg_idx: usize, content: &str, @@ -653,7 +711,7 @@ impl GlmRenderer { fn emit_tool_response( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, content: &str, idx: i32, ) -> Result<(), RenderError> { diff --git a/crates/renderers-core/src/families/kimi_k2.rs b/crates/renderers-core/src/families/kimi_k2.rs index 7fe0989..f2d53ba 100644 --- a/crates/renderers-core/src/families/kimi_k2.rs +++ b/crates/renderers-core/src/families/kimi_k2.rs @@ -22,6 +22,7 @@ use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; use crate::emit::RenderBuf; use crate::parsing::kimi_k2::parse_kimi_k2; use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; use crate::traits::Renderer; use crate::types::{ Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, @@ -86,6 +87,9 @@ pub struct KimiK2Renderer { tool_call_argument_begin: u32, tool_call_end: u32, + newline_tokens: Vec, + assistant_tokens: Vec, + tool_text_cache: ToolTextCache, stop_tokens: Vec, } @@ -110,6 +114,11 @@ impl KimiK2Renderer { let tool_call_argument_begin = tokenizer.token_to_id_strict("<|tool_call_argument_begin|>")?; let tool_call_end = tokenizer.token_to_id_strict("<|tool_call_end|>")?; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let assistant_tokens = tokenizer + .encode_no_special("assistant")? + .as_slice() + .to_vec(); Ok(Self { tokenizer, @@ -126,6 +135,9 @@ impl KimiK2Renderer { tool_call_begin, tool_call_argument_begin, tool_call_end, + newline_tokens, + assistant_tokens, + tool_text_cache: ToolTextCache::default(), stop_tokens: vec![im_end], }) } @@ -230,6 +242,25 @@ impl KimiK2Renderer { buf.special(self.im_end, idx); Ok(()) } + + fn emit_tool_declare_from_tools( + &self, + buf: &mut RenderBuf<'_>, + tools: &[ToolSpec], + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_system, idx); + buf.text("tool_declare", idx)?; + buf.special(self.im_middle, idx); + let tool_tokens = + self.tool_text_cache + .get_or_insert_with(&self.tokenizer, tools, 0, "", || { + Ok(Self::serialize_tools(tools)) + })?; + buf.ids(tool_tokens.as_slice(), idx); + buf.special(self.im_end, idx); + Ok(()) + } } impl Renderer for KimiK2Renderer { @@ -256,7 +287,7 @@ impl Renderer for KimiK2Renderer { if tools_pending && !already_has_tool_declare { working.push(Message { role: "tool_declare".to_string(), - content: crate::types::Content::Text(Self::serialize_tools(tools.unwrap())), + content: crate::types::Content::Text(String::new()), ..Default::default() }); injected.push(true); @@ -354,11 +385,19 @@ impl Renderer for KimiK2Renderer { "system" => { self.emit_im_role(&mut buf, self.im_system, "system", content, oi)?; if Some(i) == auto_system_idx { - buf.text("\n", oi)?; + buf.ids(&self.newline_tokens, oi); } } "tool_declare" => { - self.emit_im_role(&mut buf, self.im_system, "tool_declare", content, oi)?; + if injected[i] { + self.emit_tool_declare_from_tools( + &mut buf, + tools.expect("injected tool_declare requires tools"), + oi, + )?; + } else { + self.emit_im_role(&mut buf, self.im_system, "tool_declare", content, oi)?; + } } "user" => { self.emit_im_role(&mut buf, self.im_user, "user", content, oi)?; @@ -374,7 +413,7 @@ impl Renderer for KimiK2Renderer { if add_generation_prompt { buf.scaffold_special(self.im_assistant); - buf.scaffold_text("assistant")?; + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.im_middle); } @@ -420,7 +459,8 @@ impl Renderer for KimiK2Renderer { return Ok(None); }; - let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; let content = msg.text_content(); @@ -433,7 +473,7 @@ impl Renderer for KimiK2Renderer { } buf.scaffold_special(self.im_assistant); - buf.scaffold_text("assistant")?; + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.im_middle); let ext = buf.into_token_ids(); @@ -456,7 +496,7 @@ impl KimiK2Renderer { idx: i32, ) -> Result<(), RenderError> { buf.special(self.im_assistant, idx); - buf.text("assistant", idx)?; + buf.ids(&self.assistant_tokens, idx); buf.special(self.im_middle, idx); // Kimi's template renders content verbatim; reasoning_content is diff --git a/crates/renderers-core/src/families/kimi_k25.rs b/crates/renderers-core/src/families/kimi_k25.rs index a254dc9..56ce339 100644 --- a/crates/renderers-core/src/families/kimi_k25.rs +++ b/crates/renderers-core/src/families/kimi_k25.rs @@ -19,6 +19,7 @@ //! ("You are Kimi, an AI assistant created by Moonshot AI.") but the //! Python class doesn't auto-inject it — neither does this port. +use crate::SCAFFOLD_IDX; use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; use crate::emit::RenderBuf; use crate::parsing::kimi_k2::parse_kimi_k2; @@ -91,6 +92,10 @@ pub struct KimiK25Renderer { media_end: Option, mm_token_type_ids: Vec<(u32, u8)>, + newline_tokens: Vec, + assistant_tokens: Vec, + think_tokens: Vec, + empty_think_tokens: Vec, stop_tokens: Vec, } @@ -125,6 +130,16 @@ impl KimiK25Renderer { if let Some(p) = media_pad { mm_token_type_ids.push((p, 1)); // image marker; K2.5 handles video via the same pad } + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let assistant_tokens = tokenizer + .encode_no_special("assistant")? + .as_slice() + .to_vec(); + let think_tokens = tokenizer.encode_no_special("")?.as_slice().to_vec(); + let empty_think_tokens = tokenizer + .encode_no_special("")? + .as_slice() + .to_vec(); Ok(Self { tokenizer, @@ -146,6 +161,10 @@ impl KimiK25Renderer { media_pad, media_end, mm_token_type_ids, + newline_tokens, + assistant_tokens, + think_tokens, + empty_think_tokens, stop_tokens: vec![im_end], }) } @@ -211,7 +230,7 @@ impl KimiK25Renderer { s.push_str(""); buf.text(&s, msg_idx)?; } else { - buf.text("", msg_idx)?; + buf.ids(&self.empty_think_tokens, msg_idx); } buf.text(&text_content, msg_idx)?; @@ -314,12 +333,12 @@ impl Renderer for KimiK25Renderer { // Generation prompt if add_generation_prompt { buf.scaffold_special(self.im_assistant); - buf.scaffold_text("assistant")?; + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.im_middle); if self.enable_thinking { - buf.scaffold_text("")?; + buf.ids(&self.think_tokens, SCAFFOLD_IDX); } else { - buf.scaffold_text("")?; + buf.ids(&self.empty_think_tokens, SCAFFOLD_IDX); } } @@ -368,7 +387,8 @@ impl Renderer for KimiK25Renderer { return Ok(None); }; - let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; buf.special(self.role_token(&msg.role), idx); @@ -390,12 +410,12 @@ impl Renderer for KimiK25Renderer { // Generation prompt buf.scaffold_special(self.im_assistant); - buf.scaffold_text("assistant")?; + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.im_middle); if self.enable_thinking { - buf.scaffold_text("")?; + buf.ids(&self.think_tokens, SCAFFOLD_IDX); } else { - buf.scaffold_text("")?; + buf.ids(&self.empty_think_tokens, SCAFFOLD_IDX); } let ext = buf.into_token_ids(); @@ -462,7 +482,7 @@ impl KimiK25Renderer { let offset = buf.len(); buf.special(pad, idx); buf.special(end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); // Always exactly 1 placeholder in the stream, regardless of // image size — that's the K2.5 convention. @@ -609,12 +629,12 @@ impl MultimodalRenderer for KimiK25Renderer { if add_generation_prompt { buf.scaffold_special(self.im_assistant); - buf.scaffold_text("assistant")?; + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.im_middle); if self.enable_thinking { - buf.scaffold_text("")?; + buf.ids(&self.think_tokens, SCAFFOLD_IDX); } else { - buf.scaffold_text("")?; + buf.ids(&self.empty_think_tokens, SCAFFOLD_IDX); } } diff --git a/crates/renderers-core/src/families/minimax_m2.rs b/crates/renderers-core/src/families/minimax_m2.rs index c3bfb2e..63fe3d2 100644 --- a/crates/renderers-core/src/families/minimax_m2.rs +++ b/crates/renderers-core/src/families/minimax_m2.rs @@ -19,11 +19,12 @@ //! (or when `preserve_all_thinking` is on). use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; -use crate::emit::RenderBuf; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; use crate::json::to_string_python; use crate::parsing::minimax::parse_minimax; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; use crate::traits::Renderer; use crate::types::{ Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, @@ -86,6 +87,10 @@ pub struct MiniMaxM2Renderer { tool_call: u32, tool_call_end: u32, + newline_tokens: Vec, + ai_newline_tokens: Vec, + tool_tokens: Vec, + tool_text_cache: ToolTextCache, stop_tokens: Vec, } @@ -105,6 +110,9 @@ impl MiniMaxM2Renderer { let think_end = tokenizer.token_to_id_strict("")?; let tool_call = tokenizer.token_to_id_strict("")?; let tool_call_end = tokenizer.token_to_id_strict("")?; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let ai_newline_tokens = tokenizer.encode_no_special("ai\n")?.as_slice().to_vec(); + let tool_tokens = tokenizer.encode_no_special("tool")?.as_slice().to_vec(); Ok(Self { tokenizer, @@ -118,15 +126,27 @@ impl MiniMaxM2Renderer { think_end, tool_call, tool_call_end, + newline_tokens, + ai_newline_tokens, + tool_tokens, + tool_text_cache: ToolTextCache::default(), stop_tokens: vec![eos], }) } fn build_system_text(&self, sys_content: &str, tools: Option<&[ToolSpec]>) -> String { + Self::build_system_text_from(&self.default_system, sys_content, tools) + } + + fn build_system_text_from( + default_system: &str, + sys_content: &str, + tools: Option<&[ToolSpec]>, + ) -> String { let mut s = String::with_capacity(512); s.push_str("system\n"); if sys_content.is_empty() { - s.push_str(&self.default_system); + s.push_str(default_system); } else { s.push_str(sys_content); } @@ -158,27 +178,29 @@ impl MiniMaxM2Renderer { } } } -} -impl Renderer for MiniMaxM2Renderer { - fn render( + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + messages.len().max(1) * 256 + tools.map_or(0, |t| t.len() * 256 + 512) + } + + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + + fn render_into_buf( &self, + buf: &mut impl TokenSink, messages: &[Message], tools: Option<&[ToolSpec]>, add_generation_prompt: bool, - ) -> Result { + ) -> Result<(), RenderError> { if messages.is_empty() { return Err(RenderError::EmptyMessages); } - let mut buf = RenderBuf::new( - &self.tokenizer, - messages.len().max(1) * 256 + tools.map_or(0, |t| t.len() * 256 + 512), - ); let first_is_system = messages[0].role == "system"; let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; - // System block buf.special(self.bos, sys_idx); buf.special(self.role, sys_idx); let sys_content = if first_is_system { @@ -186,16 +208,32 @@ impl Renderer for MiniMaxM2Renderer { } else { String::new() }; - let system_text = self.build_system_text(&sys_content, tools); - buf.text(&system_text, sys_idx)?; + if let Some(t) = tools.filter(|t| !t.is_empty()) { + let default_system = self.default_system.clone(); + let system_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + t, + u64::from(first_is_system), + &sys_content, + || { + Ok(Self::build_system_text_from( + &default_system, + &sys_content, + Some(t), + )) + }, + )?; + buf.ids(system_tokens.as_slice(), sys_idx); + } else { + let system_text = self.build_system_text(&sys_content, tools); + buf.text(&system_text, sys_idx)?; + } buf.special(self.eos, sys_idx); - buf.text("\n", sys_idx)?; + buf.ids(&self.newline_tokens, sys_idx); - // Conversation messages — skip the leading system if present let conversation_start = usize::from(first_is_system); let conversation = &messages[conversation_start..]; - // last_user_index relative to the conversation let mut last_ui: i32 = -1; for (ci, m) in conversation.iter().enumerate() { if m.role == "user" { @@ -214,10 +252,9 @@ impl Renderer for MiniMaxM2Renderer { s.push_str(content); buf.text(&s, orig_idx)?; buf.special(self.eos, orig_idx); - buf.text("\n", orig_idx)?; + buf.ids(&self.newline_tokens, orig_idx); } "assistant" => { - // orig_idx was just cast from a usize; non-negative by construction. #[allow(clippy::cast_sign_loss)] let preserve_thinking = should_preserve_past_thinking( messages, @@ -225,30 +262,54 @@ impl Renderer for MiniMaxM2Renderer { self.preserve_all_thinking, self.preserve_thinking_between_tool_calls, ); - self.emit_assistant( - &mut buf, - msg, - orig_idx, - ci as i32, - last_ui, - preserve_thinking, - )?; + self.emit_assistant(buf, msg, orig_idx, ci as i32, last_ui, preserve_thinking)?; } - "tool" => self.emit_tool(&mut buf, conversation, ci, orig_idx)?, + "tool" => self.emit_tool(buf, conversation, ci, orig_idx)?, _ => {} } } if add_generation_prompt { buf.scaffold_special(self.role); - buf.scaffold_text("ai\n")?; + buf.ids(&self.ai_newline_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.think); - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); } + Ok(()) + } +} + +impl Renderer for MiniMaxM2Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; Ok(buf.into_rendered()) } + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { parse_minimax( &self.tokenizer, @@ -287,9 +348,10 @@ impl Renderer for MiniMaxM2Renderer { return Ok(None); }; - let mut buf = RenderBuf::new(&self.tokenizer, new_messages.len().max(1) * 256); + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); // Trailing \n after the prior turn's [e~[ - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); for (i, msg) in new_messages.iter().enumerate() { let idx = i as i32; @@ -302,7 +364,7 @@ impl Renderer for MiniMaxM2Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.eos, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); } "system" => { buf.special(self.role, idx); @@ -311,7 +373,7 @@ impl Renderer for MiniMaxM2Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.eos, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); } "tool" => self.emit_tool(&mut buf, new_messages, i, idx)?, _ => return Ok(None), @@ -319,9 +381,9 @@ impl Renderer for MiniMaxM2Renderer { } buf.scaffold_special(self.role); - buf.scaffold_text("ai\n")?; + buf.ids(&self.ai_newline_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.think); - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); let ext = buf.into_token_ids(); let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); @@ -338,7 +400,7 @@ impl Renderer for MiniMaxM2Renderer { impl MiniMaxM2Renderer { fn emit_assistant( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, msg: &Message, orig_idx: i32, conv_idx: i32, @@ -369,7 +431,7 @@ impl MiniMaxM2Renderer { !reasoning_content.is_empty() && (conv_idx > last_user_index || preserve_thinking); let after_think: String = if emit_think { - buf.text("ai\n", orig_idx)?; + buf.ids(&self.ai_newline_tokens, orig_idx); buf.special(self.think, orig_idx); let mut head = String::with_capacity(reasoning_content.len() + 2); head.push('\n'); @@ -431,13 +493,13 @@ impl MiniMaxM2Renderer { } buf.special(self.eos, orig_idx); - buf.text("\n", orig_idx)?; + buf.ids(&self.newline_tokens, orig_idx); Ok(()) } fn emit_tool( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, conversation: &[Message], conv_idx: usize, orig_idx: i32, @@ -448,7 +510,7 @@ impl MiniMaxM2Renderer { if !prev_is_tool { buf.special(self.role, orig_idx); - buf.text("tool", orig_idx)?; + buf.ids(&self.tool_tokens, orig_idx); } let prefix = if prev_is_tool { "" } else { "\n" }; let suffix = if next_is_tool { "\n" } else { "" }; @@ -463,7 +525,7 @@ impl MiniMaxM2Renderer { if !next_is_tool { buf.special(self.eos, orig_idx); - buf.text("\n", orig_idx)?; + buf.ids(&self.newline_tokens, orig_idx); } Ok(()) } diff --git a/crates/renderers-core/src/families/nemotron3.rs b/crates/renderers-core/src/families/nemotron3.rs index a16d77a..4fb76e2 100644 --- a/crates/renderers-core/src/families/nemotron3.rs +++ b/crates/renderers-core/src/families/nemotron3.rs @@ -25,6 +25,7 @@ use crate::emit::RenderBuf; use crate::parsing::qwen35::parse_qwen35; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; use crate::traits::Renderer; use crate::types::{ Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, @@ -89,6 +90,12 @@ pub struct Nemotron3Renderer { tool_response_end: u32, stop_tokens: Vec, + newline_tokens: Vec, + system_newline_tokens: Vec, + user_newline_tokens: Vec, + assistant_newline_tokens: Vec, + function_close_newline_tokens: Vec, + tool_text_cache: ToolTextCache, } impl Nemotron3Renderer { @@ -114,6 +121,17 @@ impl Nemotron3Renderer { if let Some(eot) = endoftext { stop_tokens.push(eot); } + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let system_newline_tokens = tokenizer.encode_no_special("system\n")?.as_slice().to_vec(); + let user_newline_tokens = tokenizer.encode_no_special("user\n")?.as_slice().to_vec(); + let assistant_newline_tokens = tokenizer + .encode_no_special("assistant\n")? + .as_slice() + .to_vec(); + let function_close_newline_tokens = tokenizer + .encode_no_special("\n")? + .as_slice() + .to_vec(); Ok(Self { tokenizer, @@ -130,6 +148,12 @@ impl Nemotron3Renderer { tool_response, tool_response_end, stop_tokens, + newline_tokens, + system_newline_tokens, + user_newline_tokens, + assistant_newline_tokens, + function_close_newline_tokens, + tool_text_cache: ToolTextCache::default(), }) } @@ -243,34 +267,45 @@ impl Nemotron3Renderer { ) -> Result<(), RenderError> { let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; buf.special(self.im_start, sys_idx); - buf.text("system\n", sys_idx)?; + buf.ids(&self.system_newline_tokens, sys_idx); - let mut full_sys = String::with_capacity(512); - if first_is_system { - full_sys.push_str(messages[0].text_content().trim()); - } - let mut tools_block = String::with_capacity(512); - tools_block.push_str(TOOLS_HEADER); - tools_block.push('\n'); - let mut first = true; - for t in tools { - if !first { + let system_content = if first_is_system { + messages[0].text_content().trim().to_string() + } else { + String::new() + }; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + tools, + u64::from(first_is_system), + &system_content, + || { + let mut full_sys = String::with_capacity(512); + full_sys.push_str(&system_content); + let mut tools_block = String::with_capacity(512); + tools_block.push_str(TOOLS_HEADER); tools_block.push('\n'); - } - tools_block.push_str(&Self::format_tool_declaration(t)); - first = false; - } - tools_block.push_str(TOOLS_FOOTER); - tools_block.push_str(TOOLS_INSTRUCTIONS); - - if !full_sys.is_empty() { - full_sys.push_str("\n\n"); - } - full_sys.push_str(&tools_block); + let mut first = true; + for t in tools { + if !first { + tools_block.push('\n'); + } + tools_block.push_str(&Self::format_tool_declaration(t)); + first = false; + } + tools_block.push_str(TOOLS_FOOTER); + tools_block.push_str(TOOLS_INSTRUCTIONS); - buf.text(&full_sys, sys_idx)?; + if !full_sys.is_empty() { + full_sys.push_str("\n\n"); + } + full_sys.push_str(&tools_block); + Ok(full_sys) + }, + )?; + buf.ids(tool_tokens.as_slice(), sys_idx); buf.special(self.im_end, sys_idx); - buf.text("\n", sys_idx)?; + buf.ids(&self.newline_tokens, sys_idx); Ok(()) } @@ -287,7 +322,7 @@ impl Nemotron3Renderer { s.push_str(content); buf.text(&s, sys_idx)?; buf.special(self.im_end, sys_idx); - buf.text("\n", sys_idx)?; + buf.ids(&self.newline_tokens, sys_idx); Ok(()) } @@ -303,7 +338,7 @@ impl Nemotron3Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); Ok(()) } @@ -320,7 +355,7 @@ impl Nemotron3Renderer { if !prev_is_tool { buf.special(self.im_start, msg_orig_idx); - buf.text("user\n", msg_orig_idx)?; + buf.ids(&self.user_newline_tokens, msg_orig_idx); } buf.special(self.tool_response, msg_orig_idx); let mut wrapped = String::with_capacity(content.len() + 2); @@ -330,11 +365,11 @@ impl Nemotron3Renderer { buf.text(&wrapped, msg_orig_idx)?; buf.special(self.tool_response_end, msg_orig_idx); // Nemotron 3: trailing \n after - buf.text("\n", msg_orig_idx)?; + buf.ids(&self.newline_tokens, msg_orig_idx); if !next_is_tool { buf.special(self.im_end, msg_orig_idx); - buf.text("\n", msg_orig_idx)?; + buf.ids(&self.newline_tokens, msg_orig_idx); } Ok(()) } @@ -373,7 +408,7 @@ impl Nemotron3Renderer { let reasoning_content = reasoning_content.trim().to_string(); buf.special(self.im_start, msg_orig_idx); - buf.text("assistant\n", msg_orig_idx)?; + buf.ids(&self.assistant_newline_tokens, msg_orig_idx); let tool_calls = &msg.tool_calls; let content_suffix = if tool_calls.is_empty() { "" } else { "\n" }; @@ -454,29 +489,28 @@ impl Nemotron3Renderer { } } - buf.text("\n", msg_orig_idx)?; + buf.ids(&self.function_close_newline_tokens, msg_orig_idx); buf.special(self.tool_call_end, msg_orig_idx); // Nemotron 3: trailing \n after - buf.text("\n", msg_orig_idx)?; + buf.ids(&self.newline_tokens, msg_orig_idx); } buf.special(self.im_end, msg_orig_idx); - buf.text("\n", msg_orig_idx)?; + buf.ids(&self.newline_tokens, msg_orig_idx); Ok(()) } - fn emit_generation_prompt(&self, buf: &mut RenderBuf<'_>) -> Result<(), RenderError> { + fn emit_generation_prompt(&self, buf: &mut RenderBuf<'_>) { buf.scaffold_special(self.im_start); - buf.scaffold_text("assistant\n")?; + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); if self.enable_thinking { buf.scaffold_special(self.think); - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); } else { // Disable-thinking suffix: with no trailing newlines buf.scaffold_special(self.think); buf.scaffold_special(self.think_end); } - Ok(()) } fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { @@ -591,7 +625,7 @@ impl Renderer for Nemotron3Renderer { } if add_generation_prompt { - self.emit_generation_prompt(&mut buf)?; + self.emit_generation_prompt(&mut buf); } Ok(buf.into_rendered()) @@ -635,8 +669,11 @@ impl Renderer for Nemotron3Renderer { return Ok(None); }; - let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(new_messages, None)); - buf.scaffold_text("\n")?; + let mut buf = RenderBuf::new_token_ids_only( + &self.tokenizer, + Self::estimate_capacity(new_messages, None), + ); + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); for (i, msg) in new_messages.iter().enumerate() { let content = msg.text_content().trim(); @@ -650,14 +687,14 @@ impl Renderer for Nemotron3Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); } "tool" => self.emit_tool(&mut buf, new_messages, i, content, idx)?, _ => return Ok(None), } } - self.emit_generation_prompt(&mut buf)?; + self.emit_generation_prompt(&mut buf); let ext = buf.into_token_ids(); let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs index 065a3d1..3b25a27 100644 --- a/crates/renderers-core/src/families/qwen3.rs +++ b/crates/renderers-core/src/families/qwen3.rs @@ -17,11 +17,12 @@ //! here. use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; -use crate::emit::RenderBuf; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; use crate::json::{to_string_python, tool_spec_template_value}; use crate::parsing::qwen3::parse_qwen3; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; use crate::traits::Renderer; use crate::types::{ Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, @@ -101,6 +102,7 @@ pub struct Qwen3Renderer { user_tokens: Vec, assistant_newline_tokens: Vec, gen_prompt_no_thinking_suffix_tokens: Vec, + tool_text_cache: ToolTextCache, } impl Qwen3Renderer { @@ -151,6 +153,7 @@ impl Qwen3Renderer { user_tokens, assistant_newline_tokens, gen_prompt_no_thinking_suffix_tokens, + tool_text_cache: ToolTextCache::default(), }) } @@ -172,28 +175,42 @@ impl Qwen3Renderer { fn emit_system_with_tools( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], tools: &[ToolSpec], first_is_system: bool, ) -> Result<(), RenderError> { let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; buf.special(self.im_start, sys_idx); - let mut tool_text = String::from("system\n"); - if first_is_system { - tool_text.push_str(messages[0].text_content()); - tool_text.push_str("\n\n"); - } - tool_text.push_str(TOOLS_HEADER); - for tool in tools { - tool_text.push('\n'); - let spec = tool_spec_template_value(tool); - tool_text.push_str(&to_string_python(&spec).map_err(|e| { - RenderError::Invalid(format!("tool spec serialisation failed: {e}")) - })?); - } - tool_text.push_str(TOOLS_FOOTER); - buf.text(&tool_text, sys_idx)?; + let system_content = if first_is_system { + messages[0].text_content().to_string() + } else { + String::new() + }; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + tools, + u64::from(first_is_system), + &system_content, + || { + let mut tool_text = String::from("system\n"); + if first_is_system { + tool_text.push_str(&system_content); + tool_text.push_str("\n\n"); + } + tool_text.push_str(TOOLS_HEADER); + for tool in tools { + tool_text.push('\n'); + let spec = tool_spec_template_value(tool); + tool_text.push_str(&to_string_python(&spec).map_err(|e| { + RenderError::Invalid(format!("tool spec serialisation failed: {e}")) + })?); + } + tool_text.push_str(TOOLS_FOOTER); + Ok(tool_text) + }, + )?; + buf.ids(tool_tokens.as_slice(), sys_idx); buf.special(self.im_end, sys_idx); buf.ids(&self.newline_tokens, sys_idx); Ok(()) @@ -201,7 +218,7 @@ impl Qwen3Renderer { fn emit_system_no_tools( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], ) -> Result<(), RenderError> { buf.special(self.im_start, 0); @@ -216,7 +233,7 @@ impl Qwen3Renderer { fn emit_user( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, content: &str, idx: i32, ) -> Result<(), RenderError> { @@ -232,7 +249,7 @@ impl Qwen3Renderer { fn emit_non_initial_system( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, content: &str, idx: i32, ) -> Result<(), RenderError> { @@ -248,7 +265,7 @@ impl Qwen3Renderer { fn emit_tool( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], msg_idx: usize, content: &str, @@ -279,7 +296,7 @@ impl Qwen3Renderer { #[allow(clippy::too_many_arguments)] fn emit_assistant( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, msg: &Message, msg_idx: usize, last_query_index: i32, @@ -381,9 +398,13 @@ impl Qwen3Renderer { base + tools_bonus } + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + fn render_into_buf( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], tools: Option<&[ToolSpec]>, add_generation_prompt: bool, @@ -482,9 +503,15 @@ impl Renderer for Qwen3Renderer { add_generation_prompt: bool, ) -> Result, RenderError> { let cap = Self::estimate_capacity(messages, tools); - let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); - self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; - Ok(buf.into_token_ids()) + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } } fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs index b70b842..01aecc8 100644 --- a/crates/renderers-core/src/families/qwen35.rs +++ b/crates/renderers-core/src/families/qwen35.rs @@ -19,11 +19,12 @@ use std::borrow::Cow; use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; -use crate::emit::RenderBuf; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; use crate::json::{to_string_python, tool_spec_template_value}; use crate::parsing::qwen35::parse_qwen35; use crate::thinking::should_preserve_past_thinking; use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; use crate::traits::{MultimodalRenderer, Renderer}; use crate::types::{ Content, ContentPart, MediaBundle, MediaItem, Message, Modality, MultiModalData, @@ -117,6 +118,14 @@ pub struct Qwen35Renderer { mm_token_type_ids: Vec<(u32, u8)>, stop_tokens: Vec, + newline_tokens: Vec, + double_newline_tokens: Vec, + user_tokens: Vec, + user_newline_tokens: Vec, + system_newline_tokens: Vec, + assistant_newline_tokens: Vec, + function_close_newline_tokens: Vec, + tool_text_cache: ToolTextCache, } impl Qwen35Renderer { @@ -154,6 +163,19 @@ impl Qwen35Renderer { if let Some(p) = video_pad { mm_token_type_ids.push((p, 2)); } + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let double_newline_tokens = tokenizer.encode_no_special("\n\n")?.as_slice().to_vec(); + let user_tokens = tokenizer.encode_no_special("user")?.as_slice().to_vec(); + let user_newline_tokens = tokenizer.encode_no_special("user\n")?.as_slice().to_vec(); + let system_newline_tokens = tokenizer.encode_no_special("system\n")?.as_slice().to_vec(); + let assistant_newline_tokens = tokenizer + .encode_no_special("assistant\n")? + .as_slice() + .to_vec(); + let function_close_newline_tokens = tokenizer + .encode_no_special("\n")? + .as_slice() + .to_vec(); Ok(Self { tokenizer, @@ -176,6 +198,14 @@ impl Qwen35Renderer { video_pad, mm_token_type_ids, stop_tokens: vec![im_end, endoftext], + newline_tokens, + double_newline_tokens, + user_tokens, + user_newline_tokens, + system_newline_tokens, + assistant_newline_tokens, + function_close_newline_tokens, + tool_text_cache: ToolTextCache::default(), }) } @@ -225,46 +255,57 @@ impl Qwen35Renderer { fn emit_system_with_tools( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], tools: &[ToolSpec], first_is_system: bool, ) -> Result<(), RenderError> { let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; buf.special(self.im_start, sys_idx); - buf.text("system\n", sys_idx)?; - - let mut tool_text = - String::with_capacity(TOOLS_HEADER.len() + TOOLS_INSTRUCTIONS.len() + 256); - tool_text.push_str(TOOLS_HEADER); - for tool in tools { - tool_text.push('\n'); - let spec = tool_spec_template_value(tool); - tool_text.push_str(&to_string_python(&spec).map_err(|e| { - RenderError::Invalid(format!("tool spec serialisation failed: {e}")) - })?); - } - tool_text.push_str(TOOLS_FOOTER); - tool_text.push_str(TOOLS_INSTRUCTIONS); + buf.ids(&self.system_newline_tokens, sys_idx); - if first_is_system { + let system_content = if first_is_system { let sys_content = Self::render_content_text(&messages[0].content); let sys_content = sys_content.trim(); - if !sys_content.is_empty() { - tool_text.push_str("\n\n"); - tool_text.push_str(sys_content); - } - } + sys_content.to_string() + } else { + String::new() + }; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + tools, + u64::from(first_is_system), + &system_content, + || { + let mut tool_text = + String::with_capacity(TOOLS_HEADER.len() + TOOLS_INSTRUCTIONS.len() + 256); + tool_text.push_str(TOOLS_HEADER); + for tool in tools { + tool_text.push('\n'); + let spec = tool_spec_template_value(tool); + tool_text.push_str(&to_string_python(&spec).map_err(|e| { + RenderError::Invalid(format!("tool spec serialisation failed: {e}")) + })?); + } + tool_text.push_str(TOOLS_FOOTER); + tool_text.push_str(TOOLS_INSTRUCTIONS); - buf.text(&tool_text, sys_idx)?; + if !system_content.is_empty() { + tool_text.push_str("\n\n"); + tool_text.push_str(&system_content); + } + Ok(tool_text) + }, + )?; + buf.ids(tool_tokens.as_slice(), sys_idx); buf.special(self.im_end, sys_idx); - buf.text("\n", sys_idx)?; + buf.ids(&self.newline_tokens, sys_idx); Ok(()) } fn emit_system_no_tools( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], ) -> Result<(), RenderError> { let content = Self::render_content_text(&messages[0].content); @@ -275,13 +316,13 @@ impl Qwen35Renderer { s.push_str(content); buf.text(&s, 0)?; buf.special(self.im_end, 0); - buf.text("\n", 0)?; + buf.ids(&self.newline_tokens, 0); Ok(()) } fn emit_user( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, content: &str, idx: i32, ) -> Result<(), RenderError> { @@ -291,13 +332,13 @@ impl Qwen35Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); Ok(()) } fn emit_tool( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, messages: &[Message], msg_idx: usize, content: &str, @@ -308,9 +349,9 @@ impl Qwen35Renderer { if !prev_is_tool { buf.special(self.im_start, idx); - buf.text("user", idx)?; + buf.ids(&self.user_tokens, idx); } - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); buf.special(self.tool_response, idx); let mut wrapped = String::with_capacity(content.len() + 2); wrapped.push('\n'); @@ -320,7 +361,7 @@ impl Qwen35Renderer { buf.special(self.tool_response_end, idx); if !next_is_tool { buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); } Ok(()) } @@ -358,7 +399,7 @@ impl Qwen35Renderer { fn emit_assistant( &self, - buf: &mut RenderBuf<'_>, + buf: &mut impl TokenSink, msg: &Message, msg_idx: usize, last_query_index: i32, @@ -396,7 +437,7 @@ impl Qwen35Renderer { || (preserve_thinking && !reasoning_content.is_empty()); if emit_thinking { - buf.text("assistant\n", idx)?; + buf.ids(&self.assistant_newline_tokens, idx); buf.special(self.think, idx); let mut s = String::with_capacity(reasoning_content.len() + 2); s.push('\n'); @@ -420,10 +461,10 @@ impl Qwen35Renderer { // Separator before this tool call if tc_idx == 0 { if !content.is_empty() { - buf.text("\n\n", idx)?; + buf.ids(&self.double_newline_tokens, idx); } } else { - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); } buf.special(self.tool_call, idx); @@ -452,28 +493,27 @@ impl Qwen35Renderer { } } - buf.text("\n", idx)?; + buf.ids(&self.function_close_newline_tokens, idx); buf.special(self.tool_call_end, idx); } buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); Ok(()) } - fn emit_generation_prompt(&self, buf: &mut RenderBuf<'_>) -> Result<(), RenderError> { + fn emit_generation_prompt(&self, buf: &mut impl TokenSink) { buf.scaffold_special(self.im_start); - buf.scaffold_text("assistant\n")?; + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); if self.enable_thinking { buf.scaffold_special(self.think); - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); } else { buf.scaffold_special(self.think); - buf.scaffold_text("\n\n")?; + buf.ids(&self.double_newline_tokens, SCAFFOLD_IDX); buf.scaffold_special(self.think_end); - buf.scaffold_text("\n\n")?; + buf.ids(&self.double_newline_tokens, SCAFFOLD_IDX); } - Ok(()) } fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { @@ -481,29 +521,31 @@ impl Qwen35Renderer { let tools_bonus = tools.map_or(0, |t| 256 * t.len().max(1) + 512); base + tools_bonus } -} -impl Renderer for Qwen35Renderer { - fn render( + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + + fn render_text_into_buf( &self, + buf: &mut impl TokenSink, messages: &[Message], tools: Option<&[ToolSpec]>, add_generation_prompt: bool, - ) -> Result { + ) -> Result<(), RenderError> { if messages.is_empty() { return Err(RenderError::EmptyMessages); } - let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); let first_is_system = messages[0].role == "system"; match tools { Some(t) if !t.is_empty() => { - self.emit_system_with_tools(&mut buf, messages, t, first_is_system)?; + self.emit_system_with_tools(buf, messages, t, first_is_system)?; } _ => { if first_is_system { - self.emit_system_no_tools(&mut buf, messages)?; + self.emit_system_no_tools(buf, messages)?; } } } @@ -520,9 +562,8 @@ impl Renderer for Qwen35Renderer { "system message must be at the beginning".into(), )); } - // Already handled above } - "user" => self.emit_user(&mut buf, content, i as i32)?, + "user" => self.emit_user(buf, content, i as i32)?, "assistant" => { let preserve_thinking = should_preserve_past_thinking( messages, @@ -530,9 +571,9 @@ impl Renderer for Qwen35Renderer { self.preserve_all_thinking, self.preserve_thinking_between_tool_calls, ); - self.emit_assistant(&mut buf, msg, i, last_qi, preserve_thinking)?; + self.emit_assistant(buf, msg, i, last_qi, preserve_thinking)?; } - "tool" => self.emit_tool(&mut buf, messages, i, content)?, + "tool" => self.emit_tool(buf, messages, i, content)?, _ => { return Err(RenderError::Invalid(format!( "unexpected message role: {}", @@ -543,12 +584,43 @@ impl Renderer for Qwen35Renderer { } if add_generation_prompt { - self.emit_generation_prompt(&mut buf)?; + self.emit_generation_prompt(buf); } + Ok(()) + } +} + +impl Renderer for Qwen35Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + self.render_text_into_buf(&mut buf, messages, tools, add_generation_prompt)?; Ok(buf.into_rendered()) } + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_text_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_text_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { parse_qwen35( &self.tokenizer, @@ -588,9 +660,12 @@ impl Renderer for Qwen35Renderer { return Ok(None); }; - let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(new_messages, None)); + let mut buf = RenderBuf::new_token_ids_only( + &self.tokenizer, + Self::estimate_capacity(new_messages, None), + ); // Trailing newline that the prior render emitted but vLLM stopped on - buf.scaffold_text("\n")?; + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); for (i, msg) in new_messages.iter().enumerate() { let content = Self::render_content_text(&msg.content); @@ -605,14 +680,14 @@ impl Renderer for Qwen35Renderer { s.push_str(content); buf.text(&s, idx)?; buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); } "tool" => self.emit_tool(&mut buf, new_messages, i, content)?, _ => return Ok(None), } } - self.emit_generation_prompt(&mut buf)?; + self.emit_generation_prompt(&mut buf); let ext = buf.into_token_ids(); let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); @@ -663,7 +738,7 @@ impl Qwen35Renderer { ) -> Result<(), RenderError> { let idx = msg_idx as i32; buf.special(self.im_start, idx); - buf.text("user\n", idx)?; + buf.ids(&self.user_newline_tokens, idx); // Gather this message's media items in render order. let mut media_iter = media @@ -724,7 +799,7 @@ impl Qwen35Renderer { } buf.special(self.im_end, idx); - buf.text("\n", idx)?; + buf.ids(&self.newline_tokens, idx); Ok(()) } @@ -861,7 +936,7 @@ impl MultimodalRenderer for Qwen35Renderer { } if add_generation_prompt { - self.emit_generation_prompt(&mut buf)?; + self.emit_generation_prompt(&mut buf); } let mut out = buf.into_rendered(); diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs index 1d9a905..372482b 100644 --- a/crates/renderers-core/src/lib.rs +++ b/crates/renderers-core/src/lib.rs @@ -34,6 +34,7 @@ pub mod processing; pub mod registry; pub mod thinking; pub mod tokenizer; +pub(crate) mod tool_cache; pub mod traits; pub mod types; diff --git a/crates/renderers-core/src/parsing/glm.rs b/crates/renderers-core/src/parsing/glm.rs index 1c6dc1d..152c2dd 100644 --- a/crates/renderers-core/src/parsing/glm.rs +++ b/crates/renderers-core/src/parsing/glm.rs @@ -42,7 +42,6 @@ pub fn parse_glm( // Thinking — find by token id. let mut reasoning: Option = None; let mut parse_offset = 0usize; - let working_ids: Vec; let ids: &[u32] = if let Some(think_end) = find(stripped, think_end_id) { let reasoning_ids: Vec = stripped[..think_end] .iter() @@ -63,8 +62,7 @@ pub fn parse_glm( tool_calls: Vec::new(), }; } - working_ids = stripped.to_vec(); - &working_ids + stripped }; let (content_text, tool_calls) = match find(ids, tool_call_id) { diff --git a/crates/renderers-core/src/parsing/qwen3.rs b/crates/renderers-core/src/parsing/qwen3.rs index c3b77b5..8af0d00 100644 --- a/crates/renderers-core/src/parsing/qwen3.rs +++ b/crates/renderers-core/src/parsing/qwen3.rs @@ -103,7 +103,7 @@ pub fn parse_qwen3( }; let text = decode(tokenizer, content_ids).unwrap_or_default(); - let (reasoning, content) = split_thinking(&text); + let (reasoning, content) = split_thinking(text); ParsedResponse { content: content.trim().to_string(), @@ -135,7 +135,7 @@ fn extract_name_and_args(value: &serde_json::Value) -> (String, ToolArguments) { /// Split a decoded text segment around ``. Mirrors the inline /// logic at `renderers/parsing.py` for Qwen3 (which has no `` as /// special token — reasoning lives in the decoded text). -fn split_thinking(text: &str) -> (Option, String) { +fn split_thinking(text: String) -> (Option, String) { if let Some((before, after)) = text.split_once("") { let reasoning = before .replace("", "") @@ -145,6 +145,6 @@ fn split_thinking(text: &str) -> (Option, String) { let content = after.trim_matches('\n').to_string(); (Some(reasoning), content) } else { - (None, text.to_string()) + (None, text) } } diff --git a/crates/renderers-core/src/parsing/qwen35.rs b/crates/renderers-core/src/parsing/qwen35.rs index e9d5a40..cee6378 100644 --- a/crates/renderers-core/src/parsing/qwen35.rs +++ b/crates/renderers-core/src/parsing/qwen35.rs @@ -58,7 +58,6 @@ pub fn parse_qwen35( // ── Thinking: find by token ID ───────────────────────── let mut reasoning: Option = None; let mut parse_offset: usize = 0; - let working_ids: Vec; let ids_after_think: &[u32] = if let Some(think_end) = find(ids, think_end_id) { // Filter out think_id tokens from the reasoning span so the // decoded text doesn't include the opening marker. @@ -82,8 +81,7 @@ pub fn parse_qwen35( tool_calls: Vec::new(), }; } - working_ids = ids.to_vec(); - &working_ids + ids }; // ── Tool calls (token-bounded, regex-on-decoded-span) ─────────── diff --git a/crates/renderers-core/src/tokenizer.rs b/crates/renderers-core/src/tokenizer.rs index b75d8f3..98d34da 100644 --- a/crates/renderers-core/src/tokenizer.rs +++ b/crates/renderers-core/src/tokenizer.rs @@ -73,6 +73,22 @@ impl Tokenizer { Ok(Encoded { enc }) } + /// Encode many text fragments without model special tokens. The + /// tokenizer crate parallelizes this internally, which avoids paying + /// per-fragment call overhead on render paths that can plan the whole + /// prompt before materialising ids. + pub fn encode_batch_no_special<'s, E>(&self, texts: Vec) -> Result, RenderError> + where + E: Into> + Send, + { + let encodings = self + .inner + .tok + .encode_batch_fast(texts, false) + .map_err(|e| RenderError::Tokenizer(e.to_string()))?; + Ok(encodings.into_iter().map(|enc| Encoded { enc }).collect()) + } + /// Decode `ids` to text, including special tokens (matches the /// Python `tokenizer.decode(ids, skip_special_tokens=False)` used /// across the parsing layer). diff --git a/crates/renderers-core/src/tool_cache.rs b/crates/renderers-core/src/tool_cache.rs new file mode 100644 index 0000000..87f45a5 --- /dev/null +++ b/crates/renderers-core/src/tool_cache.rs @@ -0,0 +1,86 @@ +use std::collections::HashMap; +use std::hash::{Hash, Hasher}; +use std::sync::{Arc, Mutex}; + +use crate::tokenizer::Tokenizer; +use crate::types::{RenderError, ToolSpec}; + +const MAX_TOOL_TEXT_CACHE_ENTRIES: usize = 64; + +#[derive(Debug, Clone, Default)] +pub(crate) struct ToolTextCache { + inner: Arc>>, +} + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct ToolTextCacheKey { + tools_ptr: usize, + tools_len: usize, + discriminator: u64, + dynamic_hash: u64, +} + +#[derive(Debug, Clone)] +struct CachedToolText { + tools: Vec, + dynamic_text: String, + tokens: Arc>, +} + +impl ToolTextCache { + pub(crate) fn get_or_insert_with( + &self, + tokenizer: &Tokenizer, + tools: &[ToolSpec], + discriminator: u64, + dynamic_text: &str, + build_text: impl FnOnce() -> Result, + ) -> Result>, RenderError> { + let key = ToolTextCacheKey { + tools_ptr: tools.as_ptr() as usize, + tools_len: tools.len(), + discriminator, + dynamic_hash: hash_dynamic_text(dynamic_text), + }; + + { + let cache = self.lock_cache()?; + if let Some(cached) = cache.get(&key) { + if cached.tools == tools && cached.dynamic_text == dynamic_text { + return Ok(cached.tokens.clone()); + } + } + } + + let text = build_text()?; + let tokens = Arc::new(tokenizer.encode_no_special(&text)?.as_slice().to_vec()); + let mut cache = self.lock_cache()?; + if cache.len() >= MAX_TOOL_TEXT_CACHE_ENTRIES { + cache.clear(); + } + cache.insert( + key, + CachedToolText { + tools: tools.to_vec(), + dynamic_text: dynamic_text.to_string(), + tokens: tokens.clone(), + }, + ); + Ok(tokens) + } + + fn lock_cache( + &self, + ) -> Result>, RenderError> + { + self.inner + .lock() + .map_err(|_| RenderError::Invalid("tool text cache lock poisoned".into())) + } +} + +fn hash_dynamic_text(text: &str) -> u64 { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + text.hash(&mut hasher); + hasher.finish() +} diff --git a/crates/renderers-py/Cargo.toml b/crates/renderers-py/Cargo.toml index a9ba280..c95d6da 100644 --- a/crates/renderers-py/Cargo.toml +++ b/crates/renderers-py/Cargo.toml @@ -20,6 +20,7 @@ serde_json = { workspace = true } pythonize = "0.28" numpy = "0.28" ndarray = "0.17" +rayon = "1" [lints] workspace = true diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 454df19..fd77fe3 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -12,7 +12,8 @@ use std::sync::Arc; use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1}; use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; -use pyo3::types::{PyDict, PyList, PyType}; +use pyo3::types::{PyDict, PyList, PyTuple, PyType}; +use rayon::prelude::*; use renderers_core::Renderer as CoreRenderer; use renderers_core::families::{ @@ -99,14 +100,17 @@ fn optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult>) -> PyResult>> { +fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult>>> { let Some(obj) = obj else { return Ok(None) }; if obj.is_none() { return Ok(None); } + if let Ok(prepared) = obj.extract::>() { + return Ok(Some(prepared.inner.clone())); + } let list = obj .cast::() - .map_err(|_| invalid("tools must be a list of dicts"))?; + .map_err(|_| invalid("tools must be a list of dicts or PreparedTools"))?; let mut parsed = Vec::with_capacity(list.len()); for item in list.iter() { let dict = item @@ -143,7 +147,47 @@ fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult> openai_envelope, }); } - Ok(Some(parsed)) + Ok(Some(Arc::new(parsed))) +} + +#[inline] +fn tools_slice(tools: Option<&Arc>>) -> Option<&[ToolSpec]> { + tools.map(|tools| tools.as_slice()) +} + +fn parse_message_batch(obj: &Bound<'_, PyAny>) -> PyResult>> { + let list = obj + .cast::() + .map_err(|_| invalid("messages_batch must be a list of message lists"))?; + let mut parsed = Vec::with_capacity(list.len()); + for item in list.iter() { + parsed.push(parse_messages(&item)?); + } + Ok(parsed) +} + +fn parse_fast_messages( + roles: &Bound<'_, PyAny>, + contents: &Bound<'_, PyAny>, +) -> PyResult> { + let roles = roles + .cast::() + .map_err(|_| invalid("roles must be a list[str]"))?; + let contents = contents + .cast::() + .map_err(|_| invalid("contents must be a list[str]"))?; + if roles.len() != contents.len() { + return Err(invalid("roles and contents must have the same length")); + } + let mut parsed = Vec::with_capacity(roles.len()); + for (role, content) in roles.iter().zip(contents.iter()) { + parsed.push(Message { + role: role.extract::()?, + content: Content::Text(content.extract::()?), + ..Default::default() + }); + } + Ok(parsed) } /// Decode a Python list of media-item dicts into a [`MediaBundle`]. @@ -398,6 +442,156 @@ impl PyToolCallParseStatus { } } +#[pyclass( + name = "PreparedTools", + module = "renderers_native", + skip_from_py_object +)] +#[derive(Clone)] +struct PyPreparedTools { + inner: Arc>, +} + +#[pymethods] +impl PyPreparedTools { + fn __len__(&self) -> usize { + self.inner.len() + } + + fn __repr__(&self) -> String { + format!("PreparedTools(<{} tools>)", self.inner.len()) + } +} + +#[pyclass(name = "RendererSession", module = "renderers_native")] +struct PyRendererSession { + renderer: Arc, + messages: Arc>, + tools: Option>>, + last_prompt_ids: Option>, +} + +#[pymethods] +impl PyRendererSession { + fn fork(&self) -> Self { + Self { + renderer: self.renderer.clone(), + messages: self.messages.clone(), + tools: self.tools.clone(), + last_prompt_ids: self.last_prompt_ids.clone(), + } + } + + #[pyo3(signature = (*, add_generation_prompt = false))] + fn render_ids<'py>( + &mut self, + py: Python<'py>, + add_generation_prompt: bool, + ) -> PyResult> { + let renderer = self.renderer.clone(); + let messages = self.messages.clone(); + let tools = self.tools.clone(); + let ids = py + .detach(move || { + renderer.render_ids( + messages.as_slice(), + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .map_err(render_err)?; + self.last_prompt_ids = Some(ids.clone()); + PyList::new(py, ids) + } + + #[pyo3(signature = (*, add_generation_prompt = false))] + fn render_ids_np<'py>( + &mut self, + py: Python<'py>, + add_generation_prompt: bool, + ) -> PyResult>> { + let renderer = self.renderer.clone(); + let messages = self.messages.clone(); + let tools = self.tools.clone(); + let ids = py + .detach(move || { + renderer.render_ids( + messages.as_slice(), + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .map_err(render_err)?; + self.last_prompt_ids = Some(ids.clone()); + Ok(ids.into_pyarray(py)) + } + + #[pyo3(signature = (previous_completion_ids, new_messages, *, update = true))] + fn bridge_to_next_turn( + &mut self, + py: Python<'_>, + previous_completion_ids: &Bound<'_, PyAny>, + new_messages: &Bound<'_, PyAny>, + update: bool, + ) -> PyResult> { + let prev_p = self + .last_prompt_ids + .clone() + .ok_or_else(|| invalid("render_ids must be called before session bridge"))?; + let prev_c = parse_u32_list(previous_completion_ids)?; + let msgs = parse_messages(new_messages)?; + let renderer = self.renderer.clone(); + let tools = self.tools.clone(); + let bridged = py + .detach(move || { + renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools_slice(tools.as_ref())) + }) + .map_err(render_err)?; + if update && let Some(rendered) = &bridged { + self.last_prompt_ids = Some(rendered.token_ids.clone()); + } + Ok(bridged.map(|rt| PyRenderedTokens { inner: rt })) + } + + #[allow(clippy::needless_pass_by_value)] + #[pyo3(signature = (previous_completion_ids, new_messages, *, update = true))] + fn bridge_to_next_turn_np<'py>( + &mut self, + py: Python<'py>, + previous_completion_ids: PyReadonlyArray1<'_, u32>, + new_messages: &Bound<'_, PyAny>, + update: bool, + ) -> PyResult>>> { + let prev_p = self + .last_prompt_ids + .as_deref() + .ok_or_else(|| invalid("render_ids must be called before session bridge"))?; + let prev_c = numpy_u32_slice(&previous_completion_ids)?; + let msgs = parse_messages(new_messages)?; + let bridged = self + .renderer + .bridge_to_next_turn(prev_p, prev_c, &msgs, tools_slice(self.tools.as_ref())) + .map_err(render_err)?; + if let Some(rendered) = bridged { + if update { + self.last_prompt_ids = Some(rendered.token_ids.clone()); + } + Ok(Some(rendered.token_ids.into_pyarray(py))) + } else { + Ok(None) + } + } + + fn __repr__(&self) -> String { + format!( + "RendererSession(messages={}, tools={}, has_prompt={})", + self.messages.len(), + self.tools.as_ref().map_or(0, |t| t.len()), + self.last_prompt_ids.is_some(), + ) + } +} + /// Polymorphic Python-facing renderer. #[pyclass(name = "Renderer", module = "renderers_native")] struct PyRenderer { @@ -904,6 +1098,172 @@ impl PyRenderer { }) } + #[allow(clippy::unused_self)] + fn prepare_tools(&self, tools: &Bound<'_, PyAny>) -> PyResult { + let parsed = parse_tools(Some(tools))?.unwrap_or_else(|| Arc::new(Vec::new())); + Ok(PyPreparedTools { inner: parsed }) + } + + #[pyo3(signature = (messages, *, tools = None))] + fn new_session( + &self, + messages: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + ) -> PyResult { + Ok(PyRendererSession { + renderer: self.inner.clone(), + messages: Arc::new(parse_messages(messages)?), + tools: parse_tools(tools)?, + last_prompt_ids: None, + }) + } + + #[pyo3(signature = (messages_batch, *, tools = None, add_generation_prompt = false))] + fn render_batch_ids<'py>( + &self, + py: Python<'py>, + messages_batch: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult> { + let batch = parse_message_batch(messages_batch)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let batch_ids = py + .detach(move || { + if batch.len() >= 8 { + batch + .par_iter() + .map(|messages| { + renderer.render_ids( + messages, + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .collect::, _>>() + } else { + batch + .iter() + .map(|messages| { + renderer.render_ids( + messages, + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .collect::, _>>() + } + }) + .map_err(render_err)?; + let out = PyList::empty(py); + for ids in batch_ids { + out.append(PyList::new(py, ids)?)?; + } + Ok(out) + } + + #[pyo3(signature = (messages_batch, *, tools = None, add_generation_prompt = false))] + fn render_batch_ids_np_packed<'py>( + &self, + py: Python<'py>, + messages_batch: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult> { + let batch = parse_message_batch(messages_batch)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let (ids, offsets) = py + .detach( + move || -> Result<(Vec, Vec), renderers_core::types::RenderError> { + let batch_ids = if batch.len() >= 8 { + batch + .par_iter() + .map(|messages| { + renderer.render_ids( + messages, + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .collect::, _>>()? + } else { + batch + .iter() + .map(|messages| { + renderer.render_ids( + messages, + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .collect::, _>>()? + }; + let mut ids = Vec::new(); + let mut offsets = Vec::with_capacity(batch_ids.len() + 1); + offsets.push(0); + for row in batch_ids { + ids.extend_from_slice(&row); + offsets.push(ids.len() as i64); + } + Ok((ids, offsets)) + }, + ) + .map_err(render_err)?; + let ids = ids.into_pyarray(py).into_any(); + let offsets = offsets.into_pyarray(py).into_any(); + PyTuple::new(py, [ids, offsets]) + } + + #[pyo3(signature = (roles, contents, *, tools = None, add_generation_prompt = false))] + fn render_fast_ids<'py>( + &self, + py: Python<'py>, + roles: &Bound<'_, PyAny>, + contents: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult> { + let messages = parse_fast_messages(roles, contents)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let ids = py + .detach(move || { + renderer.render_ids( + &messages, + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .map_err(render_err)?; + PyList::new(py, ids) + } + + #[pyo3(signature = (roles, contents, *, tools = None, add_generation_prompt = false))] + fn render_fast_ids_np<'py>( + &self, + py: Python<'py>, + roles: &Bound<'_, PyAny>, + contents: &Bound<'_, PyAny>, + tools: Option<&Bound<'_, PyAny>>, + add_generation_prompt: bool, + ) -> PyResult>> { + let messages = parse_fast_messages(roles, contents)?; + let tools = parse_tools(tools)?; + let renderer = self.inner.clone(); + let ids = py + .detach(move || { + renderer.render_ids( + &messages, + tools_slice(tools.as_ref()), + add_generation_prompt, + ) + }) + .map_err(render_err)?; + Ok(ids.into_pyarray(py)) + } + #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))] fn render( &self, @@ -916,7 +1276,9 @@ impl PyRenderer { let tools = parse_tools(tools)?; let renderer = self.inner.clone(); let out = py - .detach(move || renderer.render(&msgs, tools.as_deref(), add_generation_prompt)) + .detach(move || { + renderer.render(&msgs, tools_slice(tools.as_ref()), add_generation_prompt) + }) .map_err(render_err)?; Ok(PyRenderedTokens { inner: out }) } @@ -933,7 +1295,9 @@ impl PyRenderer { let tools = parse_tools(tools)?; let renderer = self.inner.clone(); let ids = py - .detach(move || renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt)) + .detach(move || { + renderer.render_ids(&msgs, tools_slice(tools.as_ref()), add_generation_prompt) + }) .map_err(render_err)?; PyList::new(py, ids) } @@ -956,7 +1320,9 @@ impl PyRenderer { let tools = parse_tools(tools)?; let renderer = self.inner.clone(); let ids = py - .detach(move || renderer.render_ids(&msgs, tools.as_deref(), add_generation_prompt)) + .detach(move || { + renderer.render_ids(&msgs, tools_slice(tools.as_ref()), add_generation_prompt) + }) .map_err(render_err)?; Ok(ids.into_pyarray(py)) } @@ -1024,7 +1390,12 @@ impl PyRenderer { .ok_or_else(|| renderers_core::types::RenderError::Invalid( "this renderer does not support multimodal — use a -VL tokenizer or check supports_multimodal()".into(), ))?; - mm.render_with_media(&msgs, tools.as_deref(), &bundle, add_generation_prompt) + mm.render_with_media( + &msgs, + tools_slice(tools.as_ref()), + &bundle, + add_generation_prompt, + ) }) .map_err(render_err)?; Ok(PyRenderedTokens { inner: out }) @@ -1051,7 +1422,9 @@ impl PyRenderer { let tools = parse_tools(tools)?; let renderer = self.inner.clone(); let bridged = py - .detach(move || renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools.as_deref())) + .detach(move || { + renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools_slice(tools.as_ref())) + }) .map_err(render_err)?; Ok(bridged.map(|rt| PyRenderedTokens { inner: rt })) } @@ -1077,7 +1450,7 @@ impl PyRenderer { let tools = parse_tools(tools)?; let bridged = self .inner - .bridge_to_next_turn(prev_p, prev_c, &msgs, tools.as_deref()) + .bridge_to_next_turn(prev_p, prev_c, &msgs, tools_slice(tools.as_ref())) .map_err(render_err)?; Ok(bridged.map(|rt| rt.token_ids.into_pyarray(py))) } @@ -1229,6 +1602,8 @@ fn processed_to_pyobject<'py>(py: Python<'py>, p: ProcessedImage) -> PyResult, m: &Bound<'_, PyModule>) -> PyResult<()> { let _ = py; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/examples/README.md b/examples/README.md index 08d79a6..2d9ba1f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,6 +13,16 @@ backend: The scripts use PEP 723 `uv` headers, so backend dependencies stay local to the recipe and do not touch the repo `uv.lock`. +When `RENDERERS_NATIVE` selects a native renderer, the vLLM and SGLang multiturn +recipes prepare tool schemas once and use a renderer session for first render +plus bridge. The engine-facing contract stays the same: vLLM receives +`prompt_token_ids`, and SGLang receives `input_ids`. + +For local serving loops that already hold parallel role/content arrays, native +renderers also expose `render_fast_ids(roles, contents, tools=prepared_tools)`. +Use the regular message-dict path when structured content parts or multimodal +items are needed. + ## vLLM Multi-Turn Recipe ```bash diff --git a/examples/sglang/multiturn_generate_sglang.py b/examples/sglang/multiturn_generate_sglang.py index bd67fbb..4f352f2 100755 --- a/examples/sglang/multiturn_generate_sglang.py +++ b/examples/sglang/multiturn_generate_sglang.py @@ -122,6 +122,11 @@ def main() -> None: "skip_special_tokens": False, "no_stop_trim": True, } + renderer_tools = ( + renderer.prepare_tools(TOOLS) + if hasattr(renderer, "prepare_tools") + else TOOLS + ) messages = [ {"role": "system", "content": "You are a concise tool-using assistant."}, @@ -133,8 +138,17 @@ def main() -> None: # Turn 1: render locally and pass token IDs to SGLang. SGLang never # sees messages and never applies a chat template. - prompt_ids = renderer.render_ids( - messages, tools=TOOLS, add_generation_prompt=True + session = ( + renderer.new_session(messages, tools=renderer_tools) + if hasattr(renderer, "new_session") + else None + ) + prompt_ids = ( + session.render_ids(add_generation_prompt=True) + if session is not None + else renderer.render_ids( + messages, tools=renderer_tools, add_generation_prompt=True + ) ) output1 = engine.generate(input_ids=prompt_ids, sampling_params=sampling) completion1 = completion_ids(output1, prompt_ids) @@ -185,8 +199,12 @@ def main() -> None: # Turn 2: bridge extends prompt_ids + completion1 exactly. # ``bridge_to_next_turn`` returns a ``RenderedTokens`` (or None); the # extended id stream is on ``.token_ids``. - bridged = renderer.bridge_to_next_turn( - prompt_ids, completion1, new_messages, tools=TOOLS + bridged = ( + session.bridge_to_next_turn(completion1, new_messages) + if session is not None + else renderer.bridge_to_next_turn( + prompt_ids, completion1, new_messages, tools=renderer_tools + ) ) if bridged is None: raise RuntimeError("bridge_to_next_turn returned None") diff --git a/examples/sglang/online_multiturn_sglang.py b/examples/sglang/online_multiturn_sglang.py index 4c17278..1ea2d34 100644 --- a/examples/sglang/online_multiturn_sglang.py +++ b/examples/sglang/online_multiturn_sglang.py @@ -142,6 +142,9 @@ async def run_one( print(f"\n=== {label} ===") renderer = make_renderer(model, enable_thinking) + renderer_tools = ( + renderer.prepare_tools(TOOLS) if hasattr(renderer, "prepare_tools") else TOOLS + ) messages: list[dict[str, Any]] = [ {"role": "system", "content": "You are a concise tool-using assistant."}, @@ -152,7 +155,18 @@ async def run_one( ] # Turn 1: render locally, send token IDs. SGLang never sees messages. - prompt_ids = renderer.render_ids(messages, tools=TOOLS, add_generation_prompt=True) + session = ( + renderer.new_session(messages, tools=renderer_tools) + if hasattr(renderer, "new_session") + else None + ) + prompt_ids = ( + session.render_ids(add_generation_prompt=True) + if session is not None + else renderer.render_ids( + messages, tools=renderer_tools, add_generation_prompt=True + ) + ) output1 = await generate_sglang( client=client, base_url=base_url, @@ -208,8 +222,12 @@ async def run_one( # Turn 2: bridge extends prompt_ids + completion1 exactly. # ``bridge_to_next_turn`` returns a ``RenderedTokens`` (or None); the # extended id stream is on ``.token_ids``. - bridged = renderer.bridge_to_next_turn( - prompt_ids, completion1, new_messages, tools=TOOLS + bridged = ( + session.bridge_to_next_turn(completion1, new_messages) + if session is not None + else renderer.bridge_to_next_turn( + prompt_ids, completion1, new_messages, tools=renderer_tools + ) ) if bridged is None: raise RuntimeError("bridge_to_next_turn returned None") diff --git a/examples/vllm/multiturn_generate_vllm.py b/examples/vllm/multiturn_generate_vllm.py index 0eafd4d..58f74bc 100755 --- a/examples/vllm/multiturn_generate_vllm.py +++ b/examples/vllm/multiturn_generate_vllm.py @@ -111,6 +111,11 @@ def main() -> None: stop_token_ids=renderer.get_stop_token_ids(), skip_special_tokens=False, ) + renderer_tools = ( + renderer.prepare_tools(TOOLS) + if hasattr(renderer, "prepare_tools") + else TOOLS + ) messages = [ {"role": "system", "content": "You are a concise tool-using assistant."}, @@ -122,8 +127,17 @@ def main() -> None: # Turn 1: render locally and pass token IDs to vLLM. vLLM never sees # messages and never applies a chat template. - prompt_ids = renderer.render_ids( - messages, tools=TOOLS, add_generation_prompt=True + session = ( + renderer.new_session(messages, tools=renderer_tools) + if hasattr(renderer, "new_session") + else None + ) + prompt_ids = ( + session.render_ids(add_generation_prompt=True) + if session is not None + else renderer.render_ids( + messages, tools=renderer_tools, add_generation_prompt=True + ) ) output1 = llm.generate( [{"prompt_token_ids": prompt_ids}], @@ -178,8 +192,12 @@ def main() -> None: # Turn 2: bridge extends prompt_ids + completion1 exactly. # ``bridge_to_next_turn`` returns a ``RenderedTokens`` (or None); the # extended id stream is on ``.token_ids``. - bridged = renderer.bridge_to_next_turn( - prompt_ids, completion1, new_messages, tools=TOOLS + bridged = ( + session.bridge_to_next_turn(completion1, new_messages) + if session is not None + else renderer.bridge_to_next_turn( + prompt_ids, completion1, new_messages, tools=renderer_tools + ) ) if bridged is None: raise RuntimeError("bridge_to_next_turn returned None") diff --git a/tests/test_native_numpy.py b/tests/test_native_numpy.py index 558eeb1..5b669a4 100644 --- a/tests/test_native_numpy.py +++ b/tests/test_native_numpy.py @@ -9,6 +9,18 @@ from renderers import _native_router as router +TOOLS = [ + { + "name": "get_weather", + "description": "Get current weather.", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + } +] + @pytest.fixture(scope="module") def qwen3_native(): @@ -80,3 +92,161 @@ def test_bridge_to_next_turn_np_matches_list_api(qwen3_native): assert bridged_list is not None assert bridged_np.dtype == np.uint32 assert bridged_np.tolist() == bridged_list.token_ids + + +def test_prepared_tools_match_raw_tools(qwen3_native): + messages = [ + {"role": "system", "content": "You call tools when useful."}, + {"role": "user", "content": "Weather in Paris?"}, + ] + prepared = qwen3_native.prepare_tools(TOOLS) + + raw_ids = qwen3_native.render_ids( + messages, + tools=TOOLS, + add_generation_prompt=True, + ) + prepared_ids = qwen3_native.render_ids( + messages, + tools=prepared, + add_generation_prompt=True, + ) + + assert len(prepared) == 1 + assert prepared_ids == raw_ids + + +def test_render_batch_ids_matches_single_calls(qwen3_native): + batch = [ + [{"role": "user", "content": "Say hi."}], + [{"role": "user", "content": "Say bye."}], + ] + + batch_ids = qwen3_native.render_batch_ids(batch, add_generation_prompt=True) + + assert batch_ids == [ + qwen3_native.render_ids(messages, add_generation_prompt=True) + for messages in batch + ] + + +def test_render_batch_ids_np_packed_matches_single_calls(qwen3_native): + batch = [ + [{"role": "user", "content": "A"}], + [{"role": "user", "content": "B"}], + [{"role": "user", "content": "C"}], + ] + + ids, offsets = qwen3_native.render_batch_ids_np_packed( + batch, + add_generation_prompt=True, + ) + + assert ids.dtype == np.uint32 + assert offsets.dtype == np.int64 + assert offsets.tolist()[0] == 0 + assert len(offsets) == len(batch) + 1 + unpacked = [ + ids[offsets[idx] : offsets[idx + 1]].tolist() for idx in range(len(batch)) + ] + assert unpacked == [ + qwen3_native.render_ids(messages, add_generation_prompt=True) + for messages in batch + ] + + +def test_render_fast_ids_matches_dict_messages(qwen3_native): + roles = ["system", "user", "assistant"] + contents = ["You are concise.", "Say hi.", "Hi."] + messages = [ + {"role": role, "content": content} + for role, content in zip(roles, contents, strict=True) + ] + + fast_ids = qwen3_native.render_fast_ids( + roles, + contents, + add_generation_prompt=True, + ) + fast_np = qwen3_native.render_fast_ids_np( + roles, + contents, + add_generation_prompt=True, + ) + regular_ids = qwen3_native.render_ids( + messages, + add_generation_prompt=True, + ) + + assert fast_ids == regular_ids + assert fast_np.dtype == np.uint32 + assert fast_np.tolist() == regular_ids + + +def test_session_render_and_bridge_match_renderer(qwen3_native): + prompt = [{"role": "user", "content": "Plan Saturday."}] + assistant = {"role": "assistant", "content": "Start with breakfast."} + new_messages = [{"role": "user", "content": "Add one museum."}] + session = qwen3_native.new_session(prompt) + + session_prompt = session.render_ids(add_generation_prompt=True) + full_ids = qwen3_native.render_ids(prompt + [assistant]) + completion_ids = full_ids[len(session_prompt) :] + session_bridge = session.bridge_to_next_turn(completion_ids, new_messages) + direct_bridge = qwen3_native.bridge_to_next_turn( + session_prompt, + completion_ids, + new_messages, + ) + + assert session_prompt == qwen3_native.render_ids( + prompt, + add_generation_prompt=True, + ) + assert session_bridge is not None + assert direct_bridge is not None + assert session_bridge.token_ids == direct_bridge.token_ids + + +def test_session_fork_preserves_prompt_state(qwen3_native): + prompt = [{"role": "user", "content": "Plan Monday."}] + assistant = {"role": "assistant", "content": "Start with tea."} + new_messages = [{"role": "user", "content": "Add one errand."}] + session = qwen3_native.new_session(prompt) + session_prompt = session.render_ids(add_generation_prompt=True) + forked = session.fork() + + full_ids = qwen3_native.render_ids(prompt + [assistant]) + completion_ids = full_ids[len(session_prompt) :] + forked_bridge = forked.bridge_to_next_turn(completion_ids, new_messages) + direct_bridge = qwen3_native.bridge_to_next_turn( + session_prompt, + completion_ids, + new_messages, + ) + + assert forked_bridge is not None + assert direct_bridge is not None + assert forked_bridge.token_ids == direct_bridge.token_ids + + +def test_session_numpy_bridge_match_renderer(qwen3_native): + prompt = [{"role": "user", "content": "Plan Sunday."}] + assistant = {"role": "assistant", "content": "Start with a walk."} + new_messages = [{"role": "user", "content": "Add coffee."}] + session = qwen3_native.new_session(prompt) + + session_prompt = session.render_ids_np(add_generation_prompt=True) + full_ids = qwen3_native.render_ids_np(prompt + [assistant]) + completion_ids = full_ids[len(session_prompt) :] + session_bridge = session.bridge_to_next_turn_np(completion_ids, new_messages) + direct_bridge = qwen3_native.bridge_to_next_turn_np( + session_prompt, + completion_ids, + new_messages, + ) + + assert session_bridge is not None + assert direct_bridge is not None + assert session_bridge.dtype == np.uint32 + assert session_bridge.tolist() == direct_bridge.tolist() From 25eb78a197c0eafd4be911b77f61f1b3d56f5b28 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Thu, 21 May 2026 11:08:31 +0200 Subject: [PATCH 30/35] Align native Python API surface --- crates/renderers-py/src/lib.rs | 111 +++++++++++++++++++++++++++++++++ tests/test_native_router.py | 32 ++++++++++ 2 files changed, 143 insertions(+) diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index fd77fe3..a52a75d 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -281,6 +281,24 @@ impl PyRenderedTokens { PyList::new(py, self.inner.message_indices.iter().copied()) } + #[getter] + #[allow(clippy::unused_self)] + fn sampled_mask<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { + PyList::empty(py) + } + + #[getter] + #[allow(clippy::unused_self)] + fn is_content<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { + PyList::empty(py) + } + + #[getter] + #[allow(clippy::unused_self)] + fn message_roles<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> { + PyList::empty(py) + } + #[getter] fn multi_modal_data<'py>(&self, py: Python<'py>) -> PyResult> { match &self.inner.multi_modal_data { @@ -290,6 +308,95 @@ impl PyRenderedTokens { } } + #[pyo3(signature = (n_messages = None, *, sampled_only = false))] + fn tokens_per_message<'py>( + &self, + py: Python<'py>, + n_messages: Option, + sampled_only: bool, + ) -> PyResult> { + let n_messages = n_messages.unwrap_or(0); + let out = if sampled_only { + vec![0usize; n_messages] + } else { + let mut counts = vec![0usize; n_messages]; + for idx in &self.inner.message_indices { + let Ok(msg_idx) = usize::try_from(*idx) else { + continue; + }; + if msg_idx < n_messages { + counts[msg_idx] += 1; + } + } + counts + }; + PyList::new(py, out) + } + + fn message_token_spans<'py>(&self, py: Python<'py>) -> PyResult> { + let n_messages = self + .inner + .message_indices + .iter() + .copied() + .filter(|idx| *idx >= 0) + .max() + .map_or(0usize, |idx| usize::try_from(idx).map_or(0, |idx| idx + 1)); + let mut firsts = vec![None::; n_messages]; + let mut lasts = vec![None::; n_messages]; + for (pos, idx) in self.inner.message_indices.iter().copied().enumerate() { + let Ok(msg_idx) = usize::try_from(idx) else { + continue; + }; + if msg_idx >= n_messages { + continue; + } + if firsts[msg_idx].is_none() { + firsts[msg_idx] = Some(pos); + } + lasts[msg_idx] = Some(pos); + } + + let out = PyList::empty(py); + for (first, last) in firsts.into_iter().zip(lasts) { + match (first, last) { + (Some(start), Some(end)) => { + out.append(PyTuple::new(py, [start, end + 1])?)?; + } + _ => out.append(py.None())?, + } + } + Ok(out) + } + + #[allow(clippy::unused_self)] + fn role_token_spans<'py>(&self, py: Python<'py>) -> Bound<'py, PyDict> { + PyDict::new(py) + } + + #[pyo3(signature = (*, sampled_only = false))] + #[allow(clippy::unused_self)] + fn tokens_by_role<'py>( + &self, + py: Python<'py>, + #[allow(unused_variables)] sampled_only: bool, + ) -> Bound<'py, PyDict> { + PyDict::new(py) + } + + #[allow(clippy::unused_self)] + fn content_token_spans_by_role<'py>(&self, py: Python<'py>) -> Bound<'py, PyDict> { + PyDict::new(py) + } + + fn content_mask_for_roles<'py>( + &self, + py: Python<'py>, + #[allow(unused_variables)] roles: &Bound<'_, PyAny>, + ) -> PyResult> { + PyList::new(py, vec![false; self.inner.token_ids.len()]) + } + fn __repr__(&self) -> String { format!( "RenderedTokens(token_ids=<{} tokens>, message_indices=<{} entries>, multi_modal_data={})", @@ -1327,10 +1434,12 @@ impl PyRenderer { Ok(ids.into_pyarray(py)) } + #[pyo3(signature = (token_ids, *, tools = None))] fn parse_response( &self, py: Python<'_>, token_ids: &Bound<'_, PyAny>, + #[allow(unused_variables)] tools: Option<&Bound<'_, PyAny>>, ) -> PyResult { let ids = parse_u32_list(token_ids)?; let renderer = self.inner.clone(); @@ -1343,9 +1452,11 @@ impl PyRenderer { /// The input buffer is borrowed directly, avoiding the Python-list scan and /// temporary Rust `Vec` used by `parse_response`. #[allow(clippy::needless_pass_by_value)] + #[pyo3(signature = (token_ids, *, tools = None))] fn parse_response_np( &self, token_ids: PyReadonlyArray1<'_, u32>, + #[allow(unused_variables)] tools: Option<&Bound<'_, PyAny>>, ) -> PyResult { let ids = numpy_u32_slice(&token_ids)?; let parsed = self.inner.parse_response(ids); diff --git a/tests/test_native_router.py b/tests/test_native_router.py index 06fdf47..cd94fc6 100644 --- a/tests/test_native_router.py +++ b/tests/test_native_router.py @@ -8,6 +8,7 @@ class surface. from __future__ import annotations +import inspect import os import sys from types import SimpleNamespace @@ -175,3 +176,34 @@ def test_native_status_constants(native): assert s.UNCLOSED_BLOCK == "unclosed_block" assert s.MISSING_NAME == "missing_name" assert s.MALFORMED_STRUCTURE == "malformed_structure" + + +def test_native_base_api_surface(native): + renderer_methods = [ + "render", + "render_ids", + "parse_response", + "get_stop_token_ids", + "bridge_to_next_turn", + ] + rendered_tokens_attrs = [ + "token_ids", + "message_indices", + "sampled_mask", + "is_content", + "message_roles", + "multi_modal_data", + "tokens_per_message", + "message_token_spans", + "role_token_spans", + "tokens_by_role", + "content_token_spans_by_role", + "content_mask_for_roles", + ] + + for name in renderer_methods: + assert hasattr(native.Renderer, name), f"missing Renderer.{name}" + for name in rendered_tokens_attrs: + assert hasattr(native.RenderedTokens, name), f"missing RenderedTokens.{name}" + + assert "tools" in inspect.signature(native.Renderer.parse_response).parameters From 6c945be872f7d572491a6009497647daa8b063bd Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Thu, 21 May 2026 11:33:57 +0200 Subject: [PATCH 31/35] Fix Ruff import skip references --- tests/test_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index 79c9c52..a543c38 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -299,15 +299,15 @@ def test_generate_serializes_multimodal_features_for_qwen_vl_family( pytest.importorskip("torch") pytest.importorskip("vllm", reason="vllm needed for features serialization") - _pytest.importorskip( + pytest.importorskip( "vllm.entrypoints.serve.disagg.mm_serde", reason="vLLM multimodal serializer is not available", ) - _pytest.importorskip( + pytest.importorskip( "vllm.model_executor.models.qwen2_vl", reason="vLLM Qwen-VL field factory is not available", ) - _pytest.importorskip( + pytest.importorskip( "vllm.multimodal.inputs", reason="vLLM multimodal input wrappers are not available", ) From bad105c03ecb68412d23bec4976384df05c4879c Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Thu, 21 May 2026 11:34:04 +0200 Subject: [PATCH 32/35] Apply Ruff formatting --- renderers/_native_vision.py | 1 + renderers/kimi_k25.py | 8 +++++- tests/test_native_parity.py | 52 +++++++++++++++++++++++++------------ tests/test_native_router.py | 10 +++---- tests/test_renderer_e2e.py | 4 ++- 5 files changed, 51 insertions(+), 24 deletions(-) diff --git a/renderers/_native_vision.py b/renderers/_native_vision.py index cd07b3a..7ab6d9e 100644 --- a/renderers/_native_vision.py +++ b/renderers/_native_vision.py @@ -23,6 +23,7 @@ try: import renderers_native # type: ignore[import-not-found] + _NATIVE = renderers_native except ImportError: _NATIVE = None diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 7483cd3..b0c5443 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -61,10 +61,16 @@ def _messages_have_media(messages: list[Message]) -> bool: c = m.get("content") if isinstance(m, dict) else getattr(m, "content", None) if isinstance(c, list): for p in c: - if isinstance(p, dict) and p.get("type") in ("image", "image_url", "video", "video_url"): + if isinstance(p, dict) and p.get("type") in ( + "image", + "image_url", + "video", + "video_url", + ): return True return False + # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- diff --git a/tests/test_native_parity.py b/tests/test_native_parity.py index f466967..64d566a 100644 --- a/tests/test_native_parity.py +++ b/tests/test_native_parity.py @@ -111,33 +111,43 @@ def _build_python_renderer(family: str, tokenizer, extra): """Return a pure-Python renderer for *family*, or ``None`` if missing.""" if family == "qwen3": from renderers.qwen3 import Qwen3Renderer + return Qwen3Renderer(tokenizer, **extra) if family == "qwen35": from renderers.qwen35 import Qwen35Renderer + return Qwen35Renderer(tokenizer, **extra) if family == "qwen36": from renderers.qwen36 import Qwen36Renderer + return Qwen36Renderer(tokenizer, **extra) if family == "glm5": from renderers.glm5 import GLM5Renderer + return GLM5Renderer(tokenizer, **extra) if family == "glm51": from renderers.glm5 import GLM51Renderer + return GLM51Renderer(tokenizer, **extra) if family == "glm45": from renderers.glm45 import GLM45Renderer + return GLM45Renderer(tokenizer, **extra) if family == "deepseek_v3": from renderers.deepseek_v3 import DeepSeekV3Renderer + return DeepSeekV3Renderer(tokenizer, **extra) if family == "kimi_k2": from renderers.kimi_k2 import KimiK2Renderer + return KimiK2Renderer(tokenizer, **extra) if family == "minimax_m2": from renderers.minimax_m2 import MiniMaxM2Renderer + return MiniMaxM2Renderer(tokenizer, **extra) if family == "nemotron3": from renderers.nemotron3 import Nemotron3Renderer + return Nemotron3Renderer(tokenizer, **extra) return None @@ -145,16 +155,16 @@ def _build_python_renderer(family: str, tokenizer, extra): def _build_native_renderer(native_module, family: str, tok_path: str, extra): """Return a native renderer for *family* via the explicit factory.""" factory = { - "qwen3": native_module.Renderer.qwen3, - "qwen35": native_module.Renderer.qwen35, - "qwen36": native_module.Renderer.qwen36, - "glm5": native_module.Renderer.glm5, - "glm51": native_module.Renderer.glm51, - "glm45": native_module.Renderer.glm45, + "qwen3": native_module.Renderer.qwen3, + "qwen35": native_module.Renderer.qwen35, + "qwen36": native_module.Renderer.qwen36, + "glm5": native_module.Renderer.glm5, + "glm51": native_module.Renderer.glm51, + "glm45": native_module.Renderer.glm45, "deepseek_v3": native_module.Renderer.deepseek_v3, - "kimi_k2": native_module.Renderer.kimi_k2, - "minimax_m2": native_module.Renderer.minimax_m2, - "nemotron3": native_module.Renderer.nemotron3, + "kimi_k2": native_module.Renderer.kimi_k2, + "minimax_m2": native_module.Renderer.minimax_m2, + "nemotron3": native_module.Renderer.nemotron3, }.get(family) if factory is None: return None @@ -266,7 +276,9 @@ def _build_native_renderer(native_module, family: str, tok_path: str, extra): # ── Tests ──────────────────────────────────────────────────────────── -@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) def test_render_ids_parity(native_pair, case, messages): py_renderer, native_renderer, _tok = native_pair py_ids = list(py_renderer.render_ids(messages)) @@ -278,7 +290,9 @@ def test_render_ids_parity(native_pair, case, messages): ) -@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) def test_render_ids_with_gen_prompt_parity(native_pair, case, messages): py_renderer, native_renderer, _tok = native_pair py_ids = list(py_renderer.render_ids(messages, add_generation_prompt=True)) @@ -286,7 +300,9 @@ def test_render_ids_with_gen_prompt_parity(native_pair, case, messages): assert py_ids == rs_ids -@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) def test_render_ids_with_tools_parity(native_pair, case, messages): py_renderer, native_renderer, _tok = native_pair py_ids = list(py_renderer.render_ids(messages, tools=TOOLS)) @@ -321,7 +337,9 @@ def test_qwen35_structured_text_parts_parity(native_pair): assert py_ids == rs_ids -@pytest.mark.parametrize("case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None) +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) def test_message_indices_parity(native_pair, case, messages): """Per-token attribution must match — critical for training loss masks.""" py_renderer, native_renderer, _tok = native_pair @@ -348,12 +366,14 @@ def test_parse_response_no_tool_calls_parity(native_pair): ) # Slice out just the assistant section by re-rendering up to the user. prompt_ids = py_renderer.render_ids(msgs, add_generation_prompt=True) - assistant_ids = completion_ids[len(prompt_ids):] + assistant_ids = completion_ids[len(prompt_ids) :] py_parsed = py_renderer.parse_response(assistant_ids) rs_parsed = native_renderer.parse_response(assistant_ids) assert py_parsed.content == rs_parsed.content - assert (py_parsed.reasoning_content or None) == (rs_parsed.reasoning_content or None) + assert (py_parsed.reasoning_content or None) == ( + rs_parsed.reasoning_content or None + ) assert len(py_parsed.tool_calls) == len(rs_parsed.tool_calls) @@ -364,7 +384,7 @@ def test_bridge_to_next_turn_parity(native_pair): {"role": "assistant", "content": "Hi! How can I help?"}, ] prev_prompt_ids = py_renderer.render_ids(initial[:-1], add_generation_prompt=True) - prev_completion_ids = py_renderer.render_ids(initial)[len(prev_prompt_ids):] + prev_completion_ids = py_renderer.render_ids(initial)[len(prev_prompt_ids) :] new_messages = [{"role": "user", "content": "Tell me about Rust."}] py_b = py_renderer.bridge_to_next_turn( diff --git a/tests/test_native_router.py b/tests/test_native_router.py index cd94fc6..c22cbd8 100644 --- a/tests/test_native_router.py +++ b/tests/test_native_router.py @@ -39,9 +39,7 @@ def test_native_on_global(value): def test_native_csv_specific_families(): - with mock.patch.dict( - os.environ, {"RENDERERS_NATIVE": "qwen3,glm5"}, clear=True - ): + with mock.patch.dict(os.environ, {"RENDERERS_NATIVE": "qwen3,glm5"}, clear=True): assert router.native_enabled("qwen3") assert router.native_enabled("glm5") assert not router.native_enabled("qwen35") @@ -95,9 +93,9 @@ def test_resolve_tokenizer_path_uses_tiktoken_export(monkeypatch, tmp_path): monkeypatch.setattr( router, "_export_tiktoken_tokenizer_json", - lambda repo_id, _loader: str(exported) - if repo_id == "moonshotai/Kimi-K2-Instruct" - else None, + lambda repo_id, _loader: ( + str(exported) if repo_id == "moonshotai/Kimi-K2-Instruct" else None + ), ) assert router.resolve_tokenizer_path(tokenizer) == str(exported) diff --git a/tests/test_renderer_e2e.py b/tests/test_renderer_e2e.py index 11eedf8..0b6406d 100644 --- a/tests/test_renderer_e2e.py +++ b/tests/test_renderer_e2e.py @@ -63,7 +63,9 @@ def test_default_renderer_fallback_keeps_raw_decoded_completion_prefix(): full_ids = renderer.render_ids(messages + [assistant]) completion_ids = full_ids[len(prompt_ids) :] - assert renderer.bridge_to_next_turn(prompt_ids, completion_ids, new_messages) is None + assert ( + renderer.bridge_to_next_turn(prompt_ids, completion_ids, new_messages) is None + ) raw_completion = tokenizer.decode(completion_ids, skip_special_tokens=False) fallback_ids = renderer.render_ids( From d7e8a582f515290fbd0af1fd2a85001116cbc9a8 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Thu, 28 May 2026 14:17:11 +0200 Subject: [PATCH 33/35] Fix native renderer config constructors --- renderers/deepseek_v3.py | 4 ++++ renderers/default.py | 11 +++++++++++ renderers/glm45.py | 8 ++++++++ renderers/glm5.py | 8 ++++++++ renderers/gpt_oss.py | 12 ++++++++++++ renderers/kimi_k2.py | 8 ++++++++ renderers/kimi_k25.py | 11 ++++------- renderers/minimax_m2.py | 15 +++++++-------- renderers/nemotron3.py | 8 ++++++++ renderers/qwen3.py | 8 ++++++++ renderers/qwen35.py | 13 ++++++------- renderers/qwen36.py | 17 ++++++++--------- 12 files changed, 92 insertions(+), 31 deletions(-) diff --git a/renderers/deepseek_v3.py b/renderers/deepseek_v3.py index 39dc5dc..dfdd3ba 100644 --- a/renderers/deepseek_v3.py +++ b/renderers/deepseek_v3.py @@ -60,11 +60,15 @@ class DeepSeekV3Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: DeepSeekV3RendererConfig | None = None, *, enable_thinking: bool = True, preserve_all_thinking: bool = False, preserve_thinking_between_tool_calls: bool = False, ): + if config is not None: + enable_thinking = config.enable_thinking + if native_enabled("deepseek_v3") or native_enabled("deepseek-v3"): native = load_native() if native is not None: diff --git a/renderers/default.py b/renderers/default.py index d3f24c0..7c70f2f 100644 --- a/renderers/default.py +++ b/renderers/default.py @@ -96,6 +96,7 @@ class DefaultRenderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: DefaultRendererConfig | None = None, *, tool_parser=None, reasoning_parser=None, @@ -103,6 +104,15 @@ def __new__( preserve_thinking_between_tool_calls: bool = False, **chat_template_kwargs, ): + if config is not None: + tool_parser = config.tool_parser + reasoning_parser = config.reasoning_parser + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + chat_template_kwargs = dict(config.model_extra or {}) + # Native routing: only when there are no plugged parsers and no # exotic chat_template kwargs — the Rust path uses minijinja and # doesn't know about Python-side parser instances. @@ -112,6 +122,7 @@ def __new__( and reasoning_parser is None and not preserve_all_thinking and not preserve_thinking_between_tool_calls + and not chat_template_kwargs ): native = load_native() if native is not None: diff --git a/renderers/glm45.py b/renderers/glm45.py index f78e82a..33f6e8e 100644 --- a/renderers/glm45.py +++ b/renderers/glm45.py @@ -59,11 +59,19 @@ class GLM45Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: GLM45RendererConfig | None = None, *, enable_thinking: bool = True, preserve_all_thinking: bool = False, preserve_thinking_between_tool_calls: bool = False, ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + if native_enabled("glm45"): native = load_native() if native is not None: diff --git a/renderers/glm5.py b/renderers/glm5.py index 9cc3cd8..ecfb73b 100644 --- a/renderers/glm5.py +++ b/renderers/glm5.py @@ -67,11 +67,19 @@ class GLM5Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: GLM5RendererConfig | GLM51RendererConfig | None = None, *, enable_thinking: bool = True, preserve_all_thinking: bool = False, preserve_thinking_between_tool_calls: bool = False, ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + if native_enabled(cls._NATIVE_KEY): native = load_native() if native is not None: diff --git a/renderers/gpt_oss.py b/renderers/gpt_oss.py index 5311d1e..1d8fb99 100644 --- a/renderers/gpt_oss.py +++ b/renderers/gpt_oss.py @@ -126,6 +126,7 @@ class GptOssRenderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: GptOssRendererConfig | None = None, *, use_system_prompt: bool = True, reasoning_effort: str | None = "medium", @@ -135,6 +136,17 @@ def __new__( preserve_all_thinking: bool = False, preserve_thinking_between_tool_calls: bool = False, ): + if config is not None: + use_system_prompt = config.use_system_prompt + reasoning_effort = config.reasoning_effort + conversation_start_date = config.conversation_start_date + knowledge_cutoff = config.knowledge_cutoff + model_identity = config.model_identity + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + if native_enabled("gpt_oss") or native_enabled("gpt-oss"): native = load_native() if native is not None: diff --git a/renderers/kimi_k2.py b/renderers/kimi_k2.py index c7a760f..7943ae8 100644 --- a/renderers/kimi_k2.py +++ b/renderers/kimi_k2.py @@ -50,11 +50,19 @@ class KimiK2Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: KimiK2RendererConfig | None = None, *, enable_thinking: bool = True, preserve_all_thinking: bool = False, preserve_thinking_between_tool_calls: bool = False, ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + if native_enabled("kimi_k2") or native_enabled("kimi-k2"): native = load_native() if native is not None: diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index b0c5443..95c9451 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -601,12 +601,9 @@ class KimiK25Renderer: def __new__( cls, tokenizer, + config: KimiK25RendererConfig | None = None, *, processor=None, - enable_thinking=True, - preserve_all_thinking=False, - preserve_thinking_between_tool_calls=False, - image_cache_max=256, # Tools / messages are bound to render-time, so native routing # happens inside render() via a cached text-only delegate. ): @@ -630,9 +627,9 @@ def __init__( if path is not None: self._native_renderer = native.Renderer.kimi_k25( path, - enable_thinking=enable_thinking, - preserve_all_thinking=preserve_all_thinking, - preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + enable_thinking=self.config.thinking, + preserve_all_thinking=self.config.preserve_all_thinking, + preserve_thinking_between_tool_calls=self.config.preserve_thinking_between_tool_calls, ) # Core structural tokens — all must be single special tokens in the vocab diff --git a/renderers/minimax_m2.py b/renderers/minimax_m2.py index 601d55c..e477e1b 100644 --- a/renderers/minimax_m2.py +++ b/renderers/minimax_m2.py @@ -61,24 +61,23 @@ class MiniMaxM2Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, - *, - default_system: str = _DEFAULT_SYSTEM, - preserve_all_thinking: bool = False, - preserve_thinking_between_tool_calls: bool = False, + config: MiniMaxM2RendererConfig | None = None, ): # Native routing: only when the caller relies on the default - # system message; a custom default_system isn't wired through to + # system message; a custom model_identity isn't wired through to # the native classmethod yet. + cfg = config or MiniMaxM2RendererConfig() + default_identity = MiniMaxM2RendererConfig().model_identity if ( native_enabled("minimax_m2") or native_enabled("minimax-m2") - ) and default_system == _DEFAULT_SYSTEM: + ) and cfg.model_identity == default_identity: native = load_native() if native is not None: path = resolve_tokenizer_path(tokenizer) return native.Renderer.minimax_m2( path, - preserve_all_thinking=preserve_all_thinking, - preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + preserve_all_thinking=cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls=cfg.preserve_thinking_between_tool_calls, ) return super().__new__(cls) diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index be6723d..49a1b9c 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -85,11 +85,19 @@ class Nemotron3Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: Nemotron3RendererConfig | None = None, *, enable_thinking: bool = True, preserve_all_thinking: bool = False, preserve_thinking_between_tool_calls: bool = False, ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + if native_enabled("nemotron3"): native = load_native() if native is not None: diff --git a/renderers/qwen3.py b/renderers/qwen3.py index d86a180..2b8210d 100644 --- a/renderers/qwen3.py +++ b/renderers/qwen3.py @@ -62,11 +62,19 @@ class Qwen3Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: Qwen3RendererConfig | None = None, *, enable_thinking: bool = True, preserve_all_thinking: bool = False, preserve_thinking_between_tool_calls: bool = False, ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + # Native routing: when ``RENDERERS_NATIVE`` opts qwen3 into the # Rust path and the extension is installed, return the native # instance directly. Otherwise fall through to the pure-Python diff --git a/renderers/qwen35.py b/renderers/qwen35.py index fc706a4..7498cfc 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -117,12 +117,9 @@ class Qwen35Renderer: def __new__( cls, tokenizer: PreTrainedTokenizer, + config: Qwen35RendererConfig | None = None, *, processor: Any = None, - enable_thinking: bool | None = None, - preserve_all_thinking: bool = False, - preserve_thinking_between_tool_calls: bool = False, - image_cache_max: int = 256, ): # Route to native only when: # 1. the user opted in via RENDERERS_NATIVE, @@ -132,14 +129,16 @@ def __new__( if native_enabled("qwen35") and processor is None: native = load_native() if native is not None: + cfg = config or cls._config_cls() + enable_thinking = cfg.enable_thinking if enable_thinking is None: - enable_thinking = _detect_enable_thinking_default(tokenizer) + enable_thinking = _default_enable_thinking(tokenizer) path = resolve_tokenizer_path(tokenizer) return native.Renderer.qwen35( path, enable_thinking=enable_thinking, - preserve_all_thinking=preserve_all_thinking, - preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + preserve_all_thinking=cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls=cfg.preserve_thinking_between_tool_calls, ) return super().__new__(cls) diff --git a/renderers/qwen36.py b/renderers/qwen36.py index 8b959b5..4606273 100644 --- a/renderers/qwen36.py +++ b/renderers/qwen36.py @@ -23,13 +23,13 @@ import json from typing import Any -from renderers.configs import Qwen36RendererConfig from renderers._native_router import ( load_native, native_enabled, resolve_tokenizer_path, ) -from renderers.qwen35 import Qwen35Renderer, _detect_enable_thinking_default +from renderers.configs import Qwen36RendererConfig +from renderers.qwen35 import Qwen35Renderer, _default_enable_thinking class Qwen36Renderer(Qwen35Renderer): @@ -40,12 +40,9 @@ class Qwen36Renderer(Qwen35Renderer): def __new__( cls, tokenizer, + config: Qwen36RendererConfig | None = None, *, processor=None, - enable_thinking=None, - preserve_all_thinking=False, - preserve_thinking_between_tool_calls=False, - image_cache_max=256, ): # Route to native only for Qwen3.6 specifically — never fall # through to the parent's qwen35 router (the renderer flag is @@ -53,14 +50,16 @@ def __new__( if native_enabled("qwen36") and processor is None: native = load_native() if native is not None: + cfg = config or Qwen36RendererConfig() + enable_thinking = cfg.enable_thinking if enable_thinking is None: - enable_thinking = _detect_enable_thinking_default(tokenizer) + enable_thinking = _default_enable_thinking(tokenizer) path = resolve_tokenizer_path(tokenizer) return native.Renderer.qwen36( path, enable_thinking=enable_thinking, - preserve_all_thinking=preserve_all_thinking, - preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + preserve_all_thinking=cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls=cfg.preserve_thinking_between_tool_calls, ) # Skip Qwen35Renderer.__new__ (would also try to route, with the # wrong flag). Go straight to object. From 7be0ed77af3658ad200bce696d80e4f59237df21 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Thu, 28 May 2026 14:26:53 +0200 Subject: [PATCH 34/35] Optimize native binding batch conversions --- crates/renderers-py/src/lib.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index a52a75d..04d9760 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -259,6 +259,17 @@ fn numpy_u32_slice<'py>(array: &'py PyReadonlyArray1<'py, u32>) -> PyResult<&'py .map_err(|e| invalid(format!("expected a contiguous uint32 numpy array: {e}"))) } +fn batch_ids_to_pylist<'py>( + py: Python<'py>, + batch_ids: Vec>, +) -> PyResult> { + let mut rows = Vec::with_capacity(batch_ids.len()); + for ids in batch_ids { + rows.push(PyList::new(py, ids)?); + } + PyList::new(py, rows) +} + #[pyclass( name = "RenderedTokens", module = "renderers_native", @@ -607,8 +618,9 @@ impl PyRendererSession { ) }) .map_err(render_err)?; - self.last_prompt_ids = Some(ids.clone()); - PyList::new(py, ids) + let out = PyList::new(py, ids.iter().copied())?; + self.last_prompt_ids = Some(ids); + Ok(out) } #[pyo3(signature = (*, add_generation_prompt = false))] @@ -1263,11 +1275,7 @@ impl PyRenderer { } }) .map_err(render_err)?; - let out = PyList::empty(py); - for ids in batch_ids { - out.append(PyList::new(py, ids)?)?; - } - Ok(out) + batch_ids_to_pylist(py, batch_ids) } #[pyo3(signature = (messages_batch, *, tools = None, add_generation_prompt = false))] @@ -1307,7 +1315,8 @@ impl PyRenderer { }) .collect::, _>>()? }; - let mut ids = Vec::new(); + let total_len = batch_ids.iter().map(Vec::len).sum(); + let mut ids = Vec::with_capacity(total_len); let mut offsets = Vec::with_capacity(batch_ids.len() + 1); offsets.push(0); for row in batch_ids { From fa47618fedeec3e14995cf7d1b81ca7dc041d165 Mon Sep 17 00:00:00 2001 From: Thomas Aubry Date: Thu, 28 May 2026 14:36:28 +0200 Subject: [PATCH 35/35] Fix renderer compatibility and clippy --- crates/renderers-py/src/lib.rs | 5 +---- renderers/base.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs index 04d9760..38ad004 100644 --- a/crates/renderers-py/src/lib.rs +++ b/crates/renderers-py/src/lib.rs @@ -259,10 +259,7 @@ fn numpy_u32_slice<'py>(array: &'py PyReadonlyArray1<'py, u32>) -> PyResult<&'py .map_err(|e| invalid(format!("expected a contiguous uint32 numpy array: {e}"))) } -fn batch_ids_to_pylist<'py>( - py: Python<'py>, - batch_ids: Vec>, -) -> PyResult> { +fn batch_ids_to_pylist(py: Python<'_>, batch_ids: Vec>) -> PyResult> { let mut rows = Vec::with_capacity(batch_ids.len()); for ids in batch_ids { rows.push(PyList::new(py, ids)?); diff --git a/renderers/base.py b/renderers/base.py index 5bed116..3cae8a6 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1301,6 +1301,8 @@ def factory() -> Renderer: def create_renderer( tokenizer, config: RendererConfig | None = None, + *, + renderer: str | None = None, ) -> Renderer: """Create a Renderer from a typed config. @@ -1316,16 +1318,23 @@ def create_renderer( template-control kwargs (e.g. ``enable_thinking``), pass the specific :class:`Qwen3RendererConfig`, :class:`GLM5RendererConfig` etc. and set those fields. + renderer: Backward-compatible renderer name. Prefer ``config=`` for + new code; ``renderer="auto"`` is equivalent to ``config=None``. Selecting the auto-renderer for a model without a registered renderer falls back to :class:`DefaultRenderer` for text-only models and raises for VLMs (where ``apply_chat_template`` would silently drop images). """ - from renderers.configs import AutoRendererConfig + from renderers.configs import AutoRendererConfig, config_from_name _populate_registry() + if renderer is not None: + if config is not None: + raise TypeError("pass either config= or renderer=, not both") + config = config_from_name(renderer) + if config is None: config = AutoRendererConfig()