diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..bfb6c39 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,81 @@ +name: Rust + +on: + push: + branches: [main] + paths: + - "**.rs" + - "**/Cargo.toml" + - "Cargo.lock" + - "rust-toolchain.toml" + - ".github/workflows/rust.yml" + pull_request: + branches: [main] + paths: + - "**.rs" + - "**/Cargo.toml" + - "Cargo.lock" + - "rust-toolchain.toml" + - ".github/workflows/rust.yml" + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: -D warnings + +jobs: + fmt: + name: Rustfmt + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install stable toolchain + run: rustup show active-toolchain || rustup toolchain install + - name: cargo fmt + run: cargo fmt --all -- --check + + clippy: + name: Clippy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install stable toolchain + run: rustup show active-toolchain || rustup toolchain install + - uses: Swatinem/rust-cache@v2 + - name: cargo clippy + # `clippy::all` is `deny` via workspace lints; pedantic stays at `warn` + # for visibility. `-D warnings` here still escalates any remaining + # rustc warnings to errors. + run: cargo clippy --workspace --all-targets --locked + + test: + name: Test (Rust) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install stable toolchain + run: rustup show active-toolchain || rustup toolchain install + - uses: Swatinem/rust-cache@v2 + - name: cargo test + # Skip the PyO3 crate: it links libpython and is exercised via pytest. + run: cargo test --workspace --exclude renderers-py --locked + + miri: + name: Miri (renderers-core) + runs-on: ubuntu-latest + env: + # Miri's stricter UB detection. 
+ MIRIFLAGS: -Zmiri-strict-provenance -Zmiri-symbolic-alignment-check + steps: + - uses: actions/checkout@v4 + - name: Install nightly with miri + run: | + rustup toolchain install nightly --component miri + rustup +nightly component add rust-src + - uses: Swatinem/rust-cache@v2 + with: + key: miri + - name: cargo miri setup + run: cargo +nightly miri setup + - name: cargo miri test + # renderers-core only — PyO3 / FFI crates can't run under Miri. + run: cargo +nightly miri test -p renderers-core --lib --tests diff --git a/.gitignore b/.gitignore index cca70b8..4db25f1 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,6 @@ coverage.xml # agent harness state .claude/ + +# rust +target/ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..e8286e1 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,3625 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "serde", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "aligned" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" +dependencies = [ + "as-slice", +] + +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + +[[package]] +name = "alloca" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4" +dependencies = [ + "cc", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" + +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "as-slice" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "av-scenechange" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394" +dependencies = [ + "aligned", + "anyhow", + "arg_enum_proc_macro", + "arrayvec", + "log", + "num-rational", + "num-traits", + 
"pastey", + "rayon", + "thiserror", + "v_frame", + "y4m", +] + +[[package]] +name = "av1-grain" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom 8.0.0", + "num-rational", + "v_frame", +] + +[[package]] +name = "avif-serialize" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7178fe5f7d460b13895ebb9dcb28a3a6216d2df2574a0806cb51b555d297f38" +dependencies = [ + "arrayvec", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bit_field" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "bitstream-io" +version = "4.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7eff00be299a18769011411c9def0d827e8f2d7bf0c3dbf53633147a8867fd1f" +dependencies = [ + "no_std_io2", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-buffer" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +dependencies = [ + "hybrid-array", +] + +[[package]] +name = "bs58" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf88ba1141d185c399bee5288d850d63b8369520c1eafc32a0430b5b6c287bf4" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "built" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4ad8f11f288f48ca24471bbd51ac257aaeaaa07adae295591266b792902ae64" + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + +[[package]] +name = "cc" +version = "1.2.62" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "num-traits", + "serde", + "windows-link", +] + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "compact_str" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + +[[package]] +name = "const-oid" +version = "0.10.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "criterion" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "950046b2aa2492f9a536f5f4f9a3de7b9e2476e575e05bd6c333371add4d98f3" +dependencies = [ + "alloca", + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "itertools 0.13.0", + "num-traits", + "oorandom", + "page_size", + "plotters", + "rayon", + "regex", + "serde", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8d80a2f4f5b554395e47b5d8305bc3d27813bacb73493eb1001e8f76dae29ea" +dependencies = [ + "cast", + "itertools 0.13.0", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", 
+ "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "crypto-common" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710" +dependencies = [ + "hybrid-array", +] + +[[package]] +name = "daachorse" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f55d7153ba3b507595872a3874803f07a8a81d1e888abed8e5db7da0597d6e2" + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core 0.23.0", + "darling_macro 0.23.0", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core 0.23.0", + "quote", + "syn", +] + +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer 0.10.4", + "crypto-common 0.1.7", +] + +[[package]] +name = "digest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" +dependencies = [ + "block-buffer 0.12.0", + "const-oid", + "crypto-common 0.2.1", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + +[[package]] +name = "exr" +version = "1.74.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be" +dependencies = [ + "bit_field", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + "zune-inflate", +] + +[[package]] +name = "fancy-regex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "fax" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf1079563223d5d59d83c85886a56e586cfd5c1a26292e971a0fa266531ac5a" + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = 
"flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = 
"futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "gif" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159" +dependencies = [ + "color_quant", + "weezl", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "hybrid-array" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" +dependencies = [ + "typenum", +] + +[[package]] +name = "hyper" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "image" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "imgref" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fac9d56ed6437b198fddba683305e8e2d651aa42647f00f5ae542e7f5c94a2" + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" 
+dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.1", + "serde", + "serde_core", +] + +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.98" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08" +dependencies = [ + "cfg-if", + "futures-util", + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lebe" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12a681b7dd8ce12bff52488013ba614b869148d54dd79836ab85aafdd53f08d" +dependencies = [ + "arbitrary", + "cc", +] + +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = 
"macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if", + "rayon", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memo-map" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "minijinja" +version = "2.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "805bfd7352166bae857ee569628b52bcd85a1cecf7810861ebceb1686b72b75d" +dependencies = [ + "memo-map", + "serde", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] 
+name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mio" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "moxcms" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b" +dependencies = [ + "num-traits", + "pxfm", +] + +[[package]] +name = "ndarray" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "no_std_io2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-conv" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" + +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = 
"num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "numpy" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "778da78c64ddc928ebf5ad9df5edf0789410ff3bdbf3619aed51cd789a6af1e2" +dependencies = [ + "libc", + "ndarray", + "num-complex", + "num-integer", + "num-traits", + "pyo3", + "pyo3-build-config", + "rustc-hash 2.1.2", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "onig" +version = "6.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" +dependencies = [ + "bitflags", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "openai-harmony" +version = 
"0.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e77e82af451fc95deeb728a40b84db8ee82d341e136c268de415123a560b9b72" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bstr", + "clap", + "fancy-regex", + "futures", + "image", + "regex", + "reqwest", + "rustc-hash 1.1.0", + "serde", + "serde_json", + "serde_with", + "sha1", + "sha2 0.10.9", + "thiserror", +] + +[[package]] +name = "page_size" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros", + "phf_shared", + "serde", +] + +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator", + 
"phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" 
+version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "profiling" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d595e54a326bc53c1c197b32d295e14b169e3cfeaa8dc82b529f947fba6bcf5" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4488a4a36b9a4ba6b9334a32a39971f77c1436ec82c38707bce707699cc3bbcb" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "pxfm" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" + +[[package]] +name = "pyo3" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"91fd8e38a3b50ed1167fb981cd6fd60147e091784c427b8f7183a7ee32c31c12" +dependencies = [ + "libc", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", +] + +[[package]] +name = "pyo3-build-config" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e368e7ddfdeb98c9bca7f8383be1648fd84ab466bf2bc015e94008db6d35611e" +dependencies = [ + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f29e10af80b1f7ccaf7f69eace800a03ecd13e883acfacc1e5d0988605f651e" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df6e520eff47c45997d2fc7dd8214b25dd1310918bbb2642156ef66a67f29813" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.28.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4cdc218d835738f81c2338f822078af45b4afdf8b2e33cbb5916f108b813acb" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "pythonize" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b79f670c9626c8b651c0581011b57b6ba6970bb69faf01a7c4c0cfc81c43f95" +dependencies = [ + "pyo3", + "serde", +] + +[[package]] +name = "qoi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "quinn" +version = "0.11.9" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash 2.1.2", + "rustls", + "socket2", + "thiserror", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand", + "ring", + "rustc-hash 2.1.2", + "rustls", + "rustls-pki-types", + "slab", + "thiserror", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", 
+] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rav1e" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b" +dependencies = [ + "aligned-vec", + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av-scenechange", + "av1-grain", + "bitstream-io", + "built", + "cfg-if", + "interpolate_name", + "itertools 0.14.0", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "paste", + "profiling", + "rand", + "rand_chacha", + "simd_helpers", + "thiserror", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error", + "rav1e", + "rayon", + "rgb", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools 0.14.0", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "renderers-cli" +version = "0.1.0" +dependencies = [ + "clap", + "criterion", + "renderers-core", + "serde", + "serde_json", +] + +[[package]] +name = "renderers-core" +version = "0.1.0" +dependencies = [ + "bumpalo", + "image", + "minijinja", + "ndarray", + "openai-harmony", + "phf", + "regex", + "serde", + "serde_json", + "sha2 0.11.0", + "smallvec", + "thiserror", + "tokenizers", +] + +[[package]] +name = "renderers-py" +version = "0.1.0" +dependencies = [ + "ndarray", + "numpy", + "pyo3", + "pythonize", + 
"rayon", + "renderers-core", + "serde", + "serde_json", +] + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "mime_guess", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "rgb" +version = "0.8.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4" + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "once_cell", + "ring", + 
"rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + 
"serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap 2.14.0", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +dependencies = [ + "base64 0.22.1", + "bs58", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.14.0", + "schemars 0.9.0", + "schemars 1.2.1", + "serde_core", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +dependencies = [ + "darling 0.23.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.3", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.6.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom 7.1.3", + "serde", + "unicode-segmentation", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tiff" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b63feaf3343d35b6ca4d50483f94843803b0f51634937cc2ec519fc32232bc52" +dependencies = [ + "fax", + "flate2", + "half", + "quick-error", + "weezl", + "zune-jpeg", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinystr" +version = "0.8.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokenizers" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44e5bea67576e04b6ff8564c5d9e09c2ef0cf476502245f2f120e497769d3112" +dependencies = [ + "ahash", + "compact_str", + "daachorse", + "dary_heap", + "derive_builder", + "esaxx-rs", + "getrandom 0.3.4", + "itertools 0.14.0", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + +[[package]] +name = "tokio" +version = "1.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +dependencies = [ + "bytes", + "libc", + "mio", + "pin-project-lite", + "socket2", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68d6fdd9f81c2819c9a8b0e0cd91660e7746a8e6ea2ba7c6b2b057985f6bcb51" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", + "url", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies 
= [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" + +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = 
"utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790" 
+dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96492d0d3ffba25305a7dc88720d250b1401d7edca02cc3bcd50633b424673b8" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e68e6f4afd367a562002c05637acb8578ff2dea1943df76afb9e83d177c8578" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a9ec35c64b2a7cb35d3fead40c4238d0940c86d107136999567a4703259f2" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.121" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4e0100b01e9f0d03189a92b96772a1fb998639d981193d7dbab487302513441" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b572dff8bcf38bad0fa19729c89bb5748b2b9b1d8be70cf90df697e3a8f32aa" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + 
"js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + 
"windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + 
+[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "y4m" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" + +[[package]] +name = "yoke" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" +dependencies = [ + 
"zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.5.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296" +dependencies = [ + "zune-core", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..a7d827c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,56 @@ +[workspace] +resolver = "2" +members = [ + "crates/renderers-core", + "crates/renderers-py", + "crates/renderers-cli", +] + +[workspace.package] +edition = "2024" +license = "Apache-2.0" +repository = "https://github.com/thomaub/renderers" +rust-version = "1.85" + +[workspace.dependencies] +tokenizers = { version = "0.23", default-features = false, features = ["onig", "esaxx_fast"] } +serde = { version = "1", features = ["derive"] } +serde_json = { version = "1", features = ["preserve_order"] } +regex = "1" +thiserror = "2" +smallvec = { version = "1", features = ["union", "const_generics"] } +bumpalo = { version = "3", features = ["collections"] } +phf = { version = "0.13", features = ["macros"] } +clap = { version = "4.6", features = ["derive"] } + +[workspace.lints.rust] +unsafe_op_in_unsafe_fn = "deny" +rust_2018_idioms = { level = "warn", priority = -1 } + +[workspace.lints.clippy] +# clippy::all = correctness + suspicious + style + complexity + perf. +# Already clean today; keep it that way. +all = { level = "deny", priority = -1 } +# Pedantic is informational only — flags style improvements without breaking CI. +pedantic = { level = "warn", priority = -1 } +# Stylistic pedantic lints we deliberately tolerate. 
+must_use_candidate = "allow" +cast_possible_truncation = "allow" +cast_possible_wrap = "allow" +missing_errors_doc = "allow" +return_self_not_must_use = "allow" +module_name_repetitions = "allow" + +[profile.release] +opt-level = 3 +lto = "thin" +codegen-units = 1 +debug = false +overflow-checks = false +panic = "abort" + +[profile.bench] +opt-level = 3 +lto = "thin" +codegen-units = 1 +debug = true diff --git a/IDEAS.md b/IDEAS.md new file mode 100644 index 0000000..ad28625 --- /dev/null +++ b/IDEAS.md @@ -0,0 +1,684 @@ +# Native Runtime Performance Ideas + +This document is the working plan for making the Rust/PyO3 renderers faster while +keeping parity visible at every step. The goal is not to guess where the speedup +comes from. Each change should land with a benchmark artifact that compares the +new commit against the previous baseline. + +## Current Shape + +The benchmark entry point is: + +```bash +uv run maturin develop --manifest-path crates/renderers-py/Cargo.toml --release +uv run python benchmarks/native_vs_python_qwen3.py --families all --min-time 0.35 --repeats 7 --memory-loops 1000 +``` + +The script already compares: + +- Python renderer public APIs. +- Native list-returning APIs. +- Native NumPy-returning APIs where available. +- `render_ids`, `parse_response`, and `bridge_to_next_turn`. +- Multiple families: Qwen, GLM, DeepSeek, Kimi, MiniMax, and Nemotron. + +The script now has progress and reproducibility support, so it can be used as +the optimization scoreboard before and after each runtime commit. + +## Benchmark Harness First + +Before optimizing runtime code, make the benchmark produce stable artifacts. +This lets every commit answer the same question: what got faster, what got +slower, and by how much? + +### 1. 
Structured Output + +Implemented flags in `benchmarks/native_vs_python_qwen3.py`: + +```bash +--json-out benchmark-results/native-runtime/latest.json +--markdown-out benchmark-results/native-runtime/latest.md +--baseline benchmark-results/native-runtime/baseline.json +``` + +The JSON includes: + +- Git commit SHA and dirty state. +- Python version, Rust version, platform, CPU model if available. +- Native extension build mode. +- Benchmark args: families, repeats, min time, memory loops. +- One row per family, operation, scenario, and API path. +- Median, min, max, loop count, token count, and memory peak. +- Per-family geomean and overall geomean. + +The Markdown includes: + +- A short summary table with overall list and NumPy geomean speedups. +- A per-family table. +- A worst regressions table versus baseline. +- A best improvements table versus baseline. +- Skipped cases and why they were skipped. + +Raw terminal tables are still printed, but the JSON is the source of truth. + +### 2. Live Progress + +The full all-family benchmark is long enough that it renders progress as it +runs. Progress output goes to stderr. + +Suggested progress lines: + +```text +[1/120] qwen3 render_ids medium_gen_prompt: python +[1/120] qwen3 render_ids medium_gen_prompt: native list +[1/120] qwen3 render_ids medium_gen_prompt: native np +[1/120] qwen3 render_ids medium_gen_prompt: memory +``` + +The script also prints a compact family summary after each family finishes: + +```text +family=qwen3 rows=12 list_geomean=1.81x np_geomean=2.03x elapsed=31.2s +``` + +This matters because performance work can fail halfway through a full +matrix. Partial progress should still be useful. + +### 3. 
Compare Mode + +Implemented comparison mode: + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families all \ + --baseline benchmark-results/native-runtime/baseline.json \ + --json-out benchmark-results/native-runtime/$SHA.json \ + --markdown-out benchmark-results/native-runtime/$SHA.md +``` + +Comparison rules: + +- Compare matching `family + operation + scenario + path`. +- Report ratios against the baseline medians. +- Treat missing baseline rows as new coverage, not wins. +- Treat missing current rows as failures unless explicitly skipped. +- Flag any row slower than baseline by more than 5 percent. +- Flag any row faster than baseline by more than 5 percent. + +The script exits non-zero only with an explicit flag such as: + +```bash +--fail-on-regression 5 +``` + +That keeps exploratory runs flexible while making CI or local gates strict when +we want them strict. + +### 4. Add a Small/Fast Profile + +Use a sub-minute profile before every larger run: + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families qwen3,qwen35,kimi_k2 \ + --min-time 0.02 \ + --repeats 3 \ + --memory-loops 20 \ + --json-out benchmark-results/native-runtime/smoke.json +``` + +The smoke profile catches broken benchmark plumbing and obvious parity failures. +Only after it passes should we run the full profile: + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families all \ + --min-time 0.35 \ + --repeats 7 \ + --memory-loops 1000 \ + --json-out benchmark-results/native-runtime/$SHA.json \ + --markdown-out benchmark-results/native-runtime/$SHA.md +``` + +## Commit Measurement Loop + +Every runtime optimization commit should follow this loop: + +1. Build native extension in release mode. + +```bash +uv run maturin develop --manifest-path crates/renderers-py/Cargo.toml --release +``` + +2. Run correctness checks. 
+ +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --locked +cargo test --workspace +uv run pytest -m parity tests/test_native_parity.py -q -rs +env RENDERERS_NATIVE=all uv run pytest \ + tests/test_render_ids.py \ + tests/test_bridge.py \ + tests/test_roundtrip.py \ + tests/test_message_indices.py \ + tests/test_native_router.py \ + tests/test_native_vision.py \ + tests/test_native_numpy.py \ + -q -rs +``` + +3. Run benchmark smoke. + +```bash +uv run python benchmarks/native_vs_python_qwen3.py \ + --families qwen3,qwen35,kimi_k2 \ + --min-time 0.02 \ + --repeats 3 \ + --memory-loops 20 +``` + +4. Run full benchmark and save artifacts. + +```bash +SHA=$(git rev-parse --short HEAD) +uv run python benchmarks/native_vs_python_qwen3.py \ + --families all \ + --min-time 0.35 \ + --repeats 7 \ + --memory-loops 1000 \ + --baseline benchmark-results/native-runtime/baseline.json \ + --json-out benchmark-results/native-runtime/$SHA.json \ + --markdown-out benchmark-results/native-runtime/$SHA.md +``` + +5. Commit code and benchmark artifact together when the benchmark is part of the +claim. If artifacts are too noisy for git, commit the code and paste the saved +Markdown summary into the commit message body or PR description. + +## Performance Work Queue + +The highest-value work is reducing repeated Python object parsing, repeated tool +formatting, and repeated token list materialization. Single fresh calls still pay +for Python input objects and tokenizer work, so the 8x to 10x target is most +realistic for prepared, batched, or multiturn workloads. + +### A. Prepared Tools + +Problem: + +The examples and benchmarks pass the same tool schema repeatedly. Today the +native path still receives Python objects, converts them to Rust structures, and +formats schema text for each render. 
+ +Idea: + +Add a Python-visible prepared tool handle: + +```python +prepared_tools = renderer.prepare_tools(TOOLS) +ids = renderer.render_ids(messages, tools=prepared_tools, add_generation_prompt=True) +``` + +Native side: + +- Parse tool specs once. +- Normalize provider-specific tool shape once. +- Pre-render static tool instruction text once. +- Pre-tokenize static tool blocks where the family template allows it. +- Keep the original public `tools=list[dict]` path as fallback. + +Benchmark cases: + +- Existing `large_tools_gen_prompt`. +- Existing `tool_cycle_large_schema`. +- New repeated-tools scenario that renders the same tools across many short + prompts. + +Expected proof: + +- `render_ids` with tools gets faster. +- No regression for no-tools scenarios. +- SGLang and vLLM examples can use it directly because they already reuse + `TOOLS`. + +Status: + +- Implemented a Python-visible native `PreparedTools` handle. +- `Renderer.prepare_tools(TOOLS)` parses Python tool dictionaries once and can + be passed to `render_ids`, `render_ids_np`, `render_batch_ids`, and + `render_batch_ids_np_packed`. +- Benchmark rows now include `render_ids_prepared_tools`. +- Added a native `ToolTextCache` for repeated prepared-tool prompts. Qwen3, + Qwen3.5/Qwen3.6, GLM, Nemotron 3, MiniMax M2, and Kimi K2 now cache the fully + rendered and pre-tokenized system/tool text block keyed by the prepared tools + and dynamic system text. Repeated prepared-tool renders skip both tool + formatting and tokenization. + +### B. Prepared Conversation or Session + +Problem: + +Multiturn examples repeatedly pass Python message lists. For bridge paths, we +also repeatedly pass prompt IDs, completion IDs, and new messages across PyO3. + +Idea: + +Add a native session object that owns parsed messages and token buffers: + +```python +session = renderer.new_session(messages, tools=prepared_tools) +prompt_ids = session.render_ids(add_generation_prompt=True) +completion = engine_completion_ids(...) 
+bridged_ids = session.bridge_to_next_turn(completion, new_messages) +``` + +Native side: + +- Store parsed messages in Rust. +- Store prepared tools by reference or shared handle. +- Store previous prompt and completion buffers. +- Append new messages without reparsing the whole conversation. +- Return list IDs for existing engine APIs, and NumPy IDs for callers that can + keep arrays. + +Benchmark cases: + +- Existing `bridge_to_next_turn`. +- New `session_bridge_to_next_turn`. +- Long history plus one new user message. +- Tool response extension. + +Expected proof: + +- Big gains on `bridge_to_next_turn`. +- Lower memory pressure on Python heap. +- Minimal Python-side example change: replace renderer calls with a session. + +Status: + +- Implemented a Python-visible native `RendererSession`. +- `Renderer.new_session(messages, tools=prepared_tools)` stores parsed messages + and prepared tools in Rust. +- `session.render_ids()`, `session.render_ids_np()`, + `session.bridge_to_next_turn()`, and `session.bridge_to_next_turn_np()` are + available. +- Session messages are stored behind a shared `Arc` handle, so repeated + `session.render_ids()` calls clone only a pointer before releasing the GIL. +- Benchmark rows now include `session_render_ids`. +- Bridge implementations that only need token IDs now use token-id-only render + buffers, avoiding per-token message-index allocation on the extension path. +- `RendererSession.bridge_to_next_turn(..., update=False)` and + `bridge_to_next_turn_np(..., update=False)` allow repeatable measurement of + an initialized session bridge without mutating the stored prompt between + benchmark iterations. +- Benchmark rows now include `session_bridge_to_next_turn`. +- Implemented `RendererSession.fork()` so benchmarks and callers can cheaply + reset an initialized session state without reparsing messages or tools.
+- Benchmark rows now include `session_bridge_loop`, a multi-step bridge loop + that advances the same session through several generated turns. + +### C. Batched Render APIs + +Problem: + +Serving systems rarely render one prompt in isolation. Even if SGLang or vLLM +does the model batching, the renderer can batch preprocessing before requests +reach the engine. + +Idea: + +Add: + +```python +batch = renderer.render_batch_ids(messages_batch, tools=prepared_tools) +batch_np = renderer.render_batch_ids_np(messages_batch, tools=prepared_tools) +``` + +Native side: + +- Parse one Python outer list. +- Reuse prepared tools across the batch. +- Use Rayon only after measuring thread overhead. +- Return `list[list[int]]` for current SGLang/vLLM compatibility. +- Return a packed NumPy representation for internal pipelines: + `ids: np.ndarray[uint32]` plus `offsets: np.ndarray[int64]`. + +Benchmark cases: + +- 8, 32, and 128 prompt batches. +- Short prompts with large tools. +- Long histories without tools. +- Mixed prompt lengths. + +Expected proof: + +- Batch throughput in prompts per second improves. +- Per-prompt median latency improves for realistic batch sizes. +- No change required at SGLang/vLLM engine boundary if we return lists. + +Status: + +- Implemented `Renderer.render_batch_ids(...)`. +- The native batch path uses Rayon for batches of 8 or more prompts. +- Benchmark rows now include `render_batch_ids`. + +### D. Packed NumPy Token Buffers + +Problem: + +Returning Python lists creates one Python integer object per token. NumPy avoids +that, but current SGLang/vLLM HTTP-style boundaries usually still need lists. 
+ +Idea: + +Keep NumPy for renderer-internal and client-side intermediate steps: + +```python +prompt_np = renderer.render_ids_np(messages, tools=prepared_tools) +parsed = renderer.parse_response_np(completion_np) +bridged_np = renderer.bridge_to_next_turn_np(prompt_np, completion_np, new_messages) +``` + +Native side: + +- Return `uint32` arrays for token IDs. +- Accept contiguous `uint32` arrays without copying. +- Add packed batch arrays with offsets. +- Avoid list conversion until the exact engine call that requires it. + +SGLang/vLLM applicability: + +- Useful before and after engine generation. +- Not true end-to-end zero-copy for JSON or APIs requiring `list[int]`. +- Still useful for offline pipelines, metrics, masks, and bridge-heavy loops. + +Benchmark cases: + +- Existing NumPy rows. +- Add explicit `.tolist()` boundary rows: + `render_ids_np_then_tolist`. +- Add packed batch rows: + `render_batch_ids_np_packed`. + +Expected proof: + +- NumPy path stays faster than list path inside renderer. +- `.tolist()` boundary cost is visible instead of hidden. +- We can decide which examples should use NumPy and which should stay list-only. + +Status: + +- Existing single-prompt NumPy paths remain covered. +- Implemented `Renderer.render_batch_ids_np_packed(...)`, returning + `(ids: np.ndarray[uint32], offsets: np.ndarray[int64])`. +- Benchmark rows now use the packed batch path as the native NumPy batch path. +- Benchmark rows now include `render_ids_np_then_tolist` so the cost of crossing + back to engine-compatible Python lists is visible instead of hidden. + +### E. Template Constant Token Caches + +Problem: + +Family templates contain repeated literal tokens: role tags, separators, +generation prompts, reasoning delimiters, tool delimiters, image sentinels, and +end markers. + +Idea: + +Pre-tokenize constant fragments when constructing each native renderer. + +Native side: + +- Store static token slices per family. 
+- Append cached token slices instead of repeatedly encoding literals. +- Keep text-render parity tests strict because whitespace and delimiter changes + are easy to miss. + +Benchmark cases: + +- No-tools short prompts. +- Long histories. +- Reasoning histories. +- Structured text parts. + +Expected proof: + +- Broad `render_ids` improvement across families. +- Stronger gains on many-turn conversations. + +### F. Dynamic Text Encode Batching + +Problem: + +Rendering many message parts can call the tokenizer repeatedly. Tokenizer call +overhead can dominate short fragments. + +Idea: + +Batch dynamic text segments where the tokenizer supports it, then interleave the +encoded pieces with cached template tokens. + +Native side: + +- Collect dynamic text fragments during render planning. +- Encode them in one tokenizer batch. +- Reassemble tokens in original order. +- Preserve message index accounting. + +Benchmark cases: + +- Long history. +- Structured text parts. +- Many short user/assistant turns. + +Expected proof: + +- Long history render improves. +- Message indices remain identical. +- No parse or bridge regressions. + +Status: + +- Added `Tokenizer::encode_batch_no_special(...)`, backed by the tokenizer + crate's batch-fast encoder. +- Added a token-only `TokenPlanBuf` that records literal-token and dynamic-text + operations, batch-encodes text fragments, then materializes the final token + stream in order. +- Qwen3 `render_ids` uses the planned batch-encode path only for long no-tool + histories. Short prompts, tool-heavy prompts, attributed `render()`, and + bridge paths stay on the lower-overhead direct buffer. +- Benchmark rows now expose the targeted long-history render gain while keeping + short-prompt and tool-response bridge regressions visible. +- Remaining rollout work: apply the same conservative dispatch to additional + families only when a family-specific benchmark shows a gain. + +### G. 
Fast Input Shape + +Problem: + +OpenAI-style dict messages are flexible but expensive to parse. Hot callers can +use a stricter shape if it is optional. + +Idea: + +Add a compact input API without replacing existing public APIs: + +```python +renderer.render_fast( + roles=["system", "user", "assistant"], + contents=["...", "...", "..."], + tools=prepared_tools, +) +``` + +Native side: + +- Validate parallel arrays once. +- Avoid generic `dict` and `Content` traversal. +- Keep support for structured parts in the generic path. + +Benchmark cases: + +- Short chat. +- Long chat. +- Tool-heavy prompt. + +Expected proof: + +- Fast shape wins when the caller can provide it. +- Existing API behavior is unchanged. + +Status: + +- Implemented `Renderer.render_fast_ids(roles, contents, ...)`. +- Implemented `Renderer.render_fast_ids_np(roles, contents, ...)`. +- Benchmark rows now include `render_fast_ids` where the scenario is compatible + with plain string roles and contents. + +### I. Cached Template Literal Tokens + +Problem: + +Several family renderers still encoded fixed literal fragments on every render: +newlines, role prefixes, generation prompts, and XML close fragments. + +Status: + +- Qwen3 already cached the highest-frequency literal fragments. +- Qwen3.5/Qwen3.6 now cache common literal tokens at construction time: + newline, double newline, role prefixes, assistant generation prefix, and + `\n`. +- Text render, bridge, and multimodal user rendering use the cached token + slices. +- Nemotron 3 now caches common standalone literal tokens at construction time: + newline, role prefixes, assistant generation prefix, and `\n`. +- GLM now caches standalone newline tokens used in GLM-4.5 generation prompts + and tool-call separators. +- MiniMax M2 now caches standalone newline, `ai\n`, and `tool` tokens. +- Kimi K2 now caches standalone newline and `assistant` tokens. 
+- Kimi K2.5 now caches standalone newline, `assistant`, ``, and + `` tokens for text, bridge, and multimodal paths. +- Prepared tool text blocks are now pre-rendered and pre-tokenized for Qwen3, + Qwen3.5/Qwen3.6, GLM, Nemotron 3, MiniMax M2, and Kimi K2 through the shared + native `ToolTextCache`. + +### H. Parse Response Fast Path + +Problem: + +Parse can be sub-microsecond in simple cases, but tool calls and reasoning blocks +still require scanning and allocation. + +Idea: + +Optimize parsing around byte/token markers: + +- Search token IDs for known delimiter IDs before decoding full text. +- Decode only spans that become content, reasoning, or JSON arguments. +- Avoid JSON parsing unless a tool call delimiter exists. +- Return borrowed or compact Python objects where PyO3 allows it. + +Benchmark cases: + +- Plain content. +- Reasoning plus content. +- Multi-tool call. +- Long content. + +Expected proof: + +- Parse geomean improves. +- Multi-tool parse improves without slowing plain content. + +Status: + +- Qwen3.5/Qwen3.6 no longer allocate a copied `Vec` for the no-thinking + parse path. Plain content and tool-call parse now borrow the stripped token + slice directly. +- GLM no longer allocates a copied token vector for the no-thinking parse path. +- Qwen3 now moves plain decoded content through the no-thinking split path + instead of cloning it into a second `String`. +- Remaining deeper work: token-delimiter partial decode for more families, and + avoiding regex/string work inside XML tool-call spans where possible. + +## SGLang and vLLM Compatibility + +The examples currently pass renderer-owned token IDs to engines: + +- SGLang offline uses `engine.generate(input_ids=prompt_ids, ...)`. +- SGLang online sends `"input_ids": prompt_ids` over JSON. +- vLLM offline uses `{"prompt_token_ids": prompt_ids}`. + +That means: + +- Prepared tools are directly usable. +- Session rendering and session bridge are directly usable. 
+- Batched list output is directly usable. +- NumPy buffers are useful inside the renderer/client pipeline, but many engine + calls still need `list[int]`. +- True zero-copy across HTTP JSON is not realistic without changing the server + protocol. + +The best PR path is to preserve the existing list APIs and add opt-in fast paths. +Examples can adopt fast paths only where the call site remains clear. + +The SGLang and vLLM multiturn examples now keep that shape: + +- Native runs call `prepare_tools(TOOLS)` once when the renderer exposes it. +- Native runs use `new_session(messages, tools=prepared_tools)` for the first + render and the next-turn bridge, so repeated serving-loop calls do not parse + the same prompt/tool dictionaries again. +- `render_fast_ids(...)` remains the lighter API for local loops that already + hold parallel role/content arrays and do not need structured content parts. + +## Native/PyO3 API Map + +This is the concrete mapping from the performance ideas above to the current +native extension surface and verification hooks. 
+ +| Idea | PyO3/native API | Benchmark row | Test coverage | +|---|---|---|---| +| Prepared tools | `Renderer.prepare_tools(...)`, `PreparedTools` | `render_ids_prepared_tools`, `render_batch_ids:short_batch_prepared_tools` | `tests/test_native_numpy.py::test_prepared_tools_match_raw_tools`, parity tool rows | +| Native session | `Renderer.new_session(...)`, `RendererSession.render_ids(...)`, `RendererSession.render_ids_np(...)` | `session_render_ids` | `tests/test_native_numpy.py::test_session_render_and_bridge_match_renderer`, parity rows | +| Session bridge | `RendererSession.bridge_to_next_turn(...)`, `RendererSession.bridge_to_next_turn_np(...)` | `session_bridge_to_next_turn` | `tests/test_native_numpy.py::test_session_render_and_bridge_match_renderer`, `test_session_numpy_bridge_match_renderer` | +| Repeatable session loop | `RendererSession.fork()` plus `bridge_to_next_turn(update=True)` | `session_bridge_loop` | `tests/test_native_numpy.py::test_session_fork_preserves_prompt_state`, benchmark parity precheck | +| Batched render | `Renderer.render_batch_ids(...)` | `render_batch_ids` | `tests/test_native_numpy.py::test_render_batch_ids_matches_single_calls` | +| Packed NumPy batch | `Renderer.render_batch_ids_np_packed(...)` | `render_batch_ids` native NumPy path | `tests/test_native_numpy.py::test_render_batch_ids_np_packed_matches_single_calls` | +| Single-prompt NumPy | `render_ids_np(...)`, `parse_response_np(...)`, `bridge_to_next_turn_np(...)` | native NumPy path, `render_ids_np_then_tolist` | `test_render_ids_np_matches_list_api`, `test_parse_response_np_borrows_uint32_completion`, `test_bridge_to_next_turn_np_matches_list_api` | +| Fast input shape | `Renderer.render_fast_ids(...)`, `Renderer.render_fast_ids_np(...)` | `render_fast_ids` | `tests/test_native_numpy.py::test_render_fast_ids_matches_dict_messages` | +| Dynamic text batching | `Tokenizer::encode_batch_no_special(...)`, `TokenPlanBuf`, Qwen3, Qwen3.5/Qwen3.6, DeepSeek V3, MiniMax M2, and 
GLM long no-tool `render_ids` dispatch | long-history `render_ids` and `render_fast_ids` rows | full parity, native-forced render tests, benchmark parity precheck | +| Template literal caches | family constructors store pre-tokenized literals | normal render and bridge rows across families | full parity and native-forced render/bridge tests | +| Prepared tool text cache | `ToolTextCache` in core family renderers | prepared-tools rows across supported families | full parity and native-forced render/bridge tests | +| Parse fast paths | borrowed stripped slices in Qwen3.5/Qwen3.6 and GLM, moved decoded content in Qwen3 | `parse_response` rows | full parity parse rows and native-forced roundtrip tests | + +## PR Implementation Order + +1. Benchmark artifact and progress support. +2. Baseline benchmark artifact from the current branch. +3. Prepared tools. +4. Session object for multiturn render and bridge. +5. Packed NumPy batch output. +6. Template constant token caches. +7. Dynamic text encode batching. +8. Optional fast input shape. +9. Parse response fast paths. + +Each item should have: + +- A parity test. +- A benchmark row or scenario that isolates it. +- A benchmark summary against the previous baseline. +- No broad Python-side rewrite unless the benchmark shows the API is worth it. + +## Success Criteria + +Runtime work is ready for the PR when: + +- Full parity passes. +- Full native-forced Python test subset passes. +- Full benchmark artifacts exist for baseline and final commits. +- The PR description shows per-family and overall geomean speedup. +- Any regression over 5 percent is explained or fixed. +- SGLang and vLLM examples still show the simple list-based path. +- Optional fast paths are documented by example, not required for normal use. 
diff --git a/benchmarks/native_vs_python_qwen3.py b/benchmarks/native_vs_python_qwen3.py new file mode 100644 index 0000000..e97762e --- /dev/null +++ b/benchmarks/native_vs_python_qwen3.py @@ -0,0 +1,2249 @@ +#!/usr/bin/env python +# /// script +# requires-python = ">=3.10,<3.14" +# dependencies = [ +# "transformers>=4.50.0", +# ] +# /// +"""Compare pure-Python renderer latency with native PyO3 renderer latency. + +Run from a checkout after building the native extension: + + uv run maturin develop --manifest-path crates/renderers-py/Cargo.toml --release + uv run python benchmarks/native_vs_python_qwen3.py --families all + +The benchmark intentionally uses the public Python APIs on both sides. Native +timings include PyO3 boundary and Python object conversion costs, which is the +relevant number for Python callers. Use the Criterion bench for pure Rust +hot-path timings. +""" + +from __future__ import annotations + +import argparse +import contextlib +import gc +import io +import json +import logging +import os +import platform +import statistics +import subprocess +import sys +import time +import tracemalloc +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any, cast + +from renderers import _native_router as router +from renderers.base import Message, ToolSpec, load_tokenizer + + +TOOLS = cast( + list[ToolSpec], + [ + { + "name": "get_weather", + "description": "Get current weather for a city.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"}, + "units": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["city"], + }, + }, + { + "name": "search_places", + "description": "Find places matching a set of constraints.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + "query": {"type": "string"}, + "filters": { + "type": "object", + "properties": { + 
@dataclass(frozen=True)
class Timing:
    """Immutable latency summary for one benchmarked callable.

    Durations are held in nanoseconds (median, fastest, slowest) alongside
    the loop count recorded with the measurement.
    """

    loops: int
    median_ns: float
    min_ns: float
    max_ns: float

    @property
    def median_us(self) -> float:
        """Return the median duration expressed in microseconds."""
        nanoseconds_per_microsecond = 1_000.0
        return self.median_ns / nanoseconds_per_microsecond
list_speedup(self) -> float: + return self.py_timing.median_ns / self.native_timing.median_ns + + @property + def np_speedup(self) -> float | None: + if self.native_np_timing is None: + return None + return self.py_timing.median_ns / self.native_np_timing.median_ns + + +@dataclass(frozen=True) +class BaselineDiff: + family: str + operation: str + scenario: str + path: str + current_median_ns: float | None + baseline_median_ns: float | None + ratio: float | None + + @property + def percent_change(self) -> float | None: + if self.ratio is None: + return None + return (self.ratio - 1.0) * 100.0 + + +DEFAULT_FAMILIES: tuple[FamilySpec, ...] = ( + FamilySpec("qwen3", "Qwen/Qwen3-8B"), + FamilySpec("qwen35", "Qwen/Qwen3.5-9B"), + FamilySpec("qwen36", "Qwen/Qwen3.6-35B-A3B"), + FamilySpec("glm5", "zai-org/GLM-5"), + FamilySpec("glm51", "zai-org/GLM-5.1"), + FamilySpec("glm45", "THUDM/GLM-4.5-Air"), + FamilySpec("deepseek_v3", "deepseek-ai/DeepSeek-V3"), + FamilySpec("kimi_k2", "moonshotai/Kimi-K2-Instruct"), + FamilySpec("minimax_m2", "MiniMaxAI/MiniMax-M2.5"), + FamilySpec("nemotron3", "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"), +) + + +FAMILY_BY_NAME = {spec.family: spec for spec in DEFAULT_FAMILIES} + + +def _medium_messages() -> list[Message]: + return [ + { + "role": "system", + "content": "You are a helpful assistant that calls tools when needed.", + }, + { + "role": "user", + "content": "Plan a weekend trip to Lisbon for two; we like food and walking.", + }, + { + "role": "assistant", + "content": ( + "I'll help. First, let me check the weather and find some restaurants." + ), + }, + {"role": "user", "content": "Sounds good - go ahead."}, + { + "role": "assistant", + "content": ( + "Here's a plan: Friday evening tapas at Time Out Market, Saturday " + "morning walk through Alfama, Saturday lunch at Ramiro, Saturday " + "afternoon Belem pasteis, Sunday morning Sao Jorge castle, Sunday " + "lunch at Cervejaria Trindade." 
+ ), + }, + ] + + +def _long_history(rounds: int = 18) -> list[Message]: + messages: list[Message] = [ + { + "role": "system", + "content": ( + "You are an itinerary planner. Preserve constraints, cite tradeoffs, " + "and keep tool observations separate from recommendations." + ), + } + ] + for idx in range(rounds): + messages.append( + { + "role": "user", + "content": ( + f"Leg {idx}: compare museum, food, and walking options. " + f"We have budget band {idx % 4}, transit pass {idx % 3}, " + "and one traveler who avoids late dinners." + ), + } + ) + messages.append( + { + "role": "assistant", + "content": ( + f"For leg {idx}, start with a walkable cluster, keep the meal " + "close to transit, and leave a fallback indoor option. " + "The strongest tradeoff is time certainty versus variety." + ), + } + ) + messages.append( + {"role": "user", "content": "Now produce the final plan with the best swaps."} + ) + return messages + + +def _reasoning_history(rounds: int = 10) -> list[Message]: + messages: list[Message] = [ + {"role": "system", "content": "You are concise but keep prior reasoning."} + ] + for idx in range(rounds): + messages.append({"role": "user", "content": f"Score option {idx}."}) + messages.append( + { + "role": "assistant", + "reasoning_content": ( + f"Option {idx} has a distance score of {idx % 5}, a food score " + f"of {(idx + 2) % 5}, and a weather risk score of {(idx + 3) % 5}." + ), + "content": f"Option {idx}: viable with one caveat.", + } + ) + return messages + + +def _structured_text_messages() -> list[Message]: + return [ + {"role": "system", "content": "You preserve structured text parts."}, + { + "role": "user", + "content": [ + {"type": "text", "text": "Compare two plans. 
"}, + {"type": "text", "text": "Prefer the one with fewer transfers."}, + ], + }, + {"role": "assistant", "content": "The lower-transfer plan is better."}, + ] + + +def _tool_cycle_messages() -> list[Message]: + return [ + {"role": "user", "content": "Weather?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + }, + }, + ], + }, + {"role": "tool", "content": "sunny, 22 C"}, + { + "role": "assistant", + "content": "It's sunny and 22 C in Paris.", + }, + ] + + +def _large_tool_only_messages() -> list[Message]: + return [ + {"role": "system", "content": "You are a travel operations assistant."}, + { + "role": "user", + "content": ( + "Use the available tools to build a food-first morning plan, " + "but only call tools if missing information blocks the answer." + ), + }, + ] + + +def _batch_messages() -> list[list[Message]]: + return [ + [ + {"role": "system", "content": "You are concise."}, + {"role": "user", "content": f"Write option {idx} in one sentence."}, + ] + for idx in range(16) + ] + + +def render_scenarios(family: str) -> list[RenderScenario]: + scenarios = [ + RenderScenario( + "medium_gen_prompt", _medium_messages(), add_generation_prompt=True + ), + RenderScenario( + "long_history_gen_prompt", + _long_history(), + add_generation_prompt=True, + ), + RenderScenario("reasoning_history", _reasoning_history()), + RenderScenario("tool_cycle_large_schema", _tool_cycle_messages(), tools=TOOLS), + RenderScenario( + "large_tools_gen_prompt", + _large_tool_only_messages(), + tools=TOOLS, + add_generation_prompt=True, + ), + ] + if family in {"qwen35", "qwen36"}: + scenarios.insert( + 3, RenderScenario("structured_text_parts", _structured_text_messages()) + ) + return scenarios + + +def parse_scenarios() -> list[ParseScenario]: + prompt: list[Message] = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Answer with the 
needed structure."}, + ] + return [ + ParseScenario( + "plain_content", + prompt, + {"role": "assistant", "content": "The answer is four."}, + ), + ParseScenario( + "reasoning_and_content", + prompt, + { + "role": "assistant", + "reasoning_content": ( + "The user asks for arithmetic, so compute two plus two." + ), + "content": "The answer is four.", + }, + ), + ParseScenario( + "multi_tool_call", + prompt, + { + "role": "assistant", + "content": "I will inspect the required details.", + "tool_calls": [ + { + "id": "call_weather", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"city":"Lisbon","units":"celsius"}', + }, + }, + { + "id": "call_places", + "type": "function", + "function": { + "name": "search_places", + "arguments": json.dumps( + { + "city": "Lisbon", + "query": "kid friendly Sunday morning", + "filters": { + "kid_friendly": True, + "max_walk_minutes": 20, + "tags": ["parks", "pastries", "views"], + }, + }, + separators=(",", ":"), + ), + }, + }, + ], + }, + tools=TOOLS, + ), + ParseScenario( + "long_content", + prompt, + { + "role": "assistant", + "content": " ".join( + f"Recommendation {idx}: keep the plan walkable and reversible." + for idx in range(80) + ), + }, + ), + ] + + +def bridge_scenarios() -> list[BridgeScenario]: + medium = _medium_messages() + tool_cycle = _tool_cycle_messages() + return [ + BridgeScenario( + "medium_extend_user", + medium[:-1], + medium[-1], + [ + { + "role": "user", + "content": "Add a kid-friendly option for Sunday morning.", + } + ], + ), + BridgeScenario( + "long_history_extend_user", + _long_history(14)[:-1], + { + "role": "assistant", + "content": ( + "Here is the compressed plan: keep mornings flexible, cluster " + "food stops near transit, and reserve one indoor fallback." 
+ ), + }, + [ + { + "role": "user", + "content": "Add one backup if rain starts before lunch.", + } + ], + ), + BridgeScenario( + "tool_response_extension", + tool_cycle[:-1], + tool_cycle[-1], + [ + { + "role": "tool", + "name": "book_table", + "content": '{"status": "waitlist", "eta_minutes": 15}', + }, + {"role": "user", "content": "Adjust if the restaurant is waitlisted."}, + ], + tools=TOOLS, + ), + ] + + +def build_python_renderer(family: str, tokenizer: Any) -> Any: + saved = os.environ.pop("RENDERERS_NATIVE", None) + try: + if family == "qwen3": + from renderers.qwen3 import Qwen3Renderer + + return Qwen3Renderer(tokenizer) + if family == "qwen35": + from renderers.qwen35 import Qwen35Renderer + + return Qwen35Renderer(tokenizer) + if family == "qwen36": + from renderers.qwen36 import Qwen36Renderer + + return Qwen36Renderer(tokenizer) + if family == "glm5": + from renderers.glm5 import GLM5Renderer + + return GLM5Renderer(tokenizer) + if family == "glm51": + from renderers.glm5 import GLM51Renderer + + return GLM51Renderer(tokenizer) + if family == "glm45": + from renderers.glm45 import GLM45Renderer + + return GLM45Renderer(tokenizer) + if family == "deepseek_v3": + from renderers.deepseek_v3 import DeepSeekV3Renderer + + return DeepSeekV3Renderer(tokenizer) + if family == "kimi_k2": + from renderers.kimi_k2 import KimiK2Renderer + + return KimiK2Renderer(tokenizer) + if family == "minimax_m2": + from renderers.minimax_m2 import MiniMaxM2Renderer + + return MiniMaxM2Renderer(tokenizer) + if family == "nemotron3": + from renderers.nemotron3 import Nemotron3Renderer + + return Nemotron3Renderer(tokenizer) + finally: + if saved is not None: + os.environ["RENDERERS_NATIVE"] = saved + raise ValueError(f"unknown family: {family}") + + +def build_native_renderer(native_module: Any, family: str, tokenizer_path: str) -> Any: + factory = { + "qwen3": native_module.Renderer.qwen3, + "qwen35": native_module.Renderer.qwen35, + "qwen36": native_module.Renderer.qwen36, 
+ "glm5": native_module.Renderer.glm5, + "glm51": native_module.Renderer.glm51, + "glm45": native_module.Renderer.glm45, + "deepseek_v3": native_module.Renderer.deepseek_v3, + "kimi_k2": native_module.Renderer.kimi_k2, + "minimax_m2": native_module.Renderer.minimax_m2, + "nemotron3": native_module.Renderer.nemotron3, + }.get(family) + if factory is None: + raise ValueError(f"unknown family: {family}") + return factory(tokenizer_path) + + +def parse_families(raw: str) -> list[FamilySpec]: + if raw in {"all", "native"}: + return list(DEFAULT_FAMILIES) + selected: list[FamilySpec] = [] + for item in raw.split(","): + family = item.strip() + if not family: + continue + try: + selected.append(FAMILY_BY_NAME[family]) + except KeyError as exc: + known = ", ".join(sorted(FAMILY_BY_NAME)) + raise SystemExit(f"unknown family {family!r}; known: {known}") from exc + if not selected: + raise SystemExit("--families resolved to an empty set") + return selected + + +def apply_model_overrides( + specs: Sequence[FamilySpec], overrides: Sequence[str] +) -> list[FamilySpec]: + by_family = {spec.family: spec for spec in specs} + for override in overrides: + if "=" not in override: + if len(specs) != 1: + raise SystemExit( + "--model without FAMILY=MODEL is only valid with one family" + ) + family, model = specs[0].family, override + else: + family, model = override.split("=", 1) + family = family.strip() + model = model.strip() + if family not in by_family: + raise SystemExit( + f"--model override references unselected family {family!r}" + ) + by_family[family] = FamilySpec(family, model) + return [by_family[spec.family] for spec in specs] + + +def time_case( + fn: Callable[[], object], + *, + min_time_s: float, + repeats: int, +) -> Timing: + loops = 1 + while True: + start = time.perf_counter_ns() + for _ in range(loops): + fn() + elapsed_s = (time.perf_counter_ns() - start) / 1_000_000_000 + if elapsed_s >= min_time_s: + break + loops *= 2 + + samples: list[float] = [] + for _ in 
range(repeats): + start = time.perf_counter_ns() + for _ in range(loops): + fn() + samples.append((time.perf_counter_ns() - start) / loops) + + return Timing( + loops=loops, + median_ns=statistics.median(samples), + min_ns=min(samples), + max_ns=max(samples), + ) + + +def memory_case(fn: Callable[[], object], *, loops: int) -> Memory: + gc.collect() + tracemalloc.start() + try: + for _ in range(loops): + fn() + _current, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + return Memory(loops=loops, peak_bytes=peak) + + +def _as_ids(value: Any) -> list[int]: + if hasattr(value, "token_ids"): + return list(value.token_ids) + return list(value) + + +def _packed_batch_to_lists(value: Any) -> list[list[int]]: + ids, offsets = value + return [ + ids[offsets[idx] : offsets[idx + 1]].tolist() for idx in range(len(offsets) - 1) + ] + + +def _sum_token_count(batch: Sequence[Sequence[int]]) -> int: + return sum(len(ids) for ids in batch) + + +def _roles_and_contents( + messages: Sequence[Message], +) -> tuple[list[str], list[str]] | None: + roles: list[str] = [] + contents: list[str] = [] + for message in messages: + if message.get("tool_calls") or message.get("reasoning_content"): + return None + content = message.get("content", "") + if not isinstance(content, str): + return None + roles.append(str(message["role"])) + contents.append(content) + return roles, contents + + +def _assert_parsed_equal(py_value: Any, native_value: Any) -> None: + if py_value.content != native_value.content: + raise AssertionError("parse_response content parity failed before benchmarking") + if (py_value.reasoning_content or None) != (native_value.reasoning_content or None): + raise AssertionError( + "parse_response reasoning parity failed before benchmarking" + ) + if len(py_value.tool_calls) != len(native_value.tool_calls): + raise AssertionError("parse_response tool-call count parity failed") + for py_call, native_call in zip( + py_value.tool_calls, native_value.tool_calls, 
strict=True + ): + if ( + py_call.raw, + py_call.name, + py_call.arguments, + py_call.status, + ) != ( + native_call.raw, + native_call.name, + native_call.arguments, + native_call.status, + ): + raise AssertionError("parse_response tool-call parity failed") + + +def _completion_ids(renderer: Any, scenario: ParseScenario) -> list[int]: + prompt_ids = renderer.render_ids( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + full_ids = renderer.render_ids( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + completion = list(full_ids)[len(prompt_ids) :] + if not completion: + raise AssertionError(f"{scenario.name} produced an empty completion") + return completion + + +def _bridge_inputs( + renderer: Any, scenario: BridgeScenario +) -> tuple[list[int], list[int]]: + previous_prompt_ids = renderer.render_ids( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + full_ids = renderer.render_ids( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + previous_completion_ids = list(full_ids)[len(previous_prompt_ids) :] + if not previous_completion_ids: + raise AssertionError(f"{scenario.name} produced an empty completion") + return list(previous_prompt_ids), previous_completion_ids + + +def _session_bridge_to_next_turn( + session: Any, + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], +) -> Any: + return session.bridge_to_next_turn( + previous_completion_ids, new_messages, update=False + ) + + +def _session_bridge_to_next_turn_np( + session: Any, + previous_completion_ids: Any, + new_messages: Sequence[Message], +) -> Any: + return session.bridge_to_next_turn_np( + previous_completion_ids, new_messages, update=False + ) + + +def _bridge_loop( + renderer: Any, + previous_prompt_ids: Sequence[int], + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], + tools: list[ToolSpec] | None, + *, + steps: int, +) -> Any: + prompt_ids = 
list(previous_prompt_ids) + bridged = None + for _ in range(steps): + bridged = renderer.bridge_to_next_turn( + prompt_ids, + previous_completion_ids, + new_messages, + tools=tools, + ) + if bridged is None: + raise AssertionError("bridge loop returned None") + prompt_ids = list(bridged.token_ids) + return bridged + + +def _session_bridge_loop( + session: Any, + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + bridged = None + for _ in range(steps): + bridged = session.bridge_to_next_turn( + previous_completion_ids, new_messages, update=True + ) + if bridged is None: + raise AssertionError("session bridge loop returned None") + return bridged + + +def _session_bridge_loop_np( + session: Any, + previous_completion_ids: Any, + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + bridged = None + for _ in range(steps): + bridged = session.bridge_to_next_turn_np( + previous_completion_ids, new_messages, update=True + ) + if bridged is None: + raise AssertionError("session numpy bridge loop returned None") + return bridged + + +def _new_session_bridge_loop( + renderer: Any, + prompt: Sequence[Message], + tools: Any, + previous_completion_ids: Sequence[int], + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + session = renderer.new_session(prompt, tools=tools) + session.render_ids(add_generation_prompt=True) + return _session_bridge_loop( + session, + previous_completion_ids, + new_messages, + steps=steps, + ) + + +def _new_session_bridge_loop_np( + renderer: Any, + prompt: Sequence[Message], + tools: Any, + previous_completion_ids: Any, + new_messages: Sequence[Message], + *, + steps: int, +) -> Any: + session = renderer.new_session(prompt, tools=tools) + session.render_ids_np(add_generation_prompt=True) + return _session_bridge_loop_np( + session, + previous_completion_ids, + new_messages, + steps=steps, + ) + + +def _add_render_cases( + cases: list[BenchCase], + skipped: list[str], + 
*, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: + for scenario in render_scenarios(spec.family): + try: + py_ids = _as_ids( + py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + native_ids = _as_ids( + native_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + if py_ids != native_ids: + raise AssertionError("render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_ids:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_ids:{scenario.name}: {exc}") + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "render_ids", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids_np( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + ) + ) + cases.append( + BenchCase( + spec.family, + spec.model, + "render_ids_np_then_tolist", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario: native_renderer.render_ids_np( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ).tolist(), + ) + ) + if scenario.tools: + 
prepared_tools = native_renderer.prepare_tools(scenario.tools) + try: + native_prepared_ids = _as_ids( + native_renderer.render_ids( + scenario.messages, + tools=prepared_tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + if py_ids != native_prepared_ids: + raise AssertionError("prepared tools render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_ids_prepared_tools:{scenario.name}: {exc}" + ) from exc + skipped.append( + f"{spec.family}:render_ids_prepared_tools:{scenario.name}: {exc}" + ) + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "render_ids_prepared_tools", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda scenario=scenario, prepared_tools=prepared_tools: ( + native_renderer.render_ids( + scenario.messages, + tools=prepared_tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ), + lambda scenario=scenario, prepared_tools=prepared_tools: ( + native_renderer.render_ids_np( + scenario.messages, + tools=prepared_tools, + add_generation_prompt=scenario.add_generation_prompt, + ) + ), + ) + ) + fast_input = _roles_and_contents(scenario.messages) + if fast_input is not None and scenario.tools is None: + roles, contents = fast_input + try: + native_fast_ids = _as_ids( + native_renderer.render_fast_ids( + roles, + contents, + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + if py_ids != native_fast_ids: + raise AssertionError("fast input render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_fast_ids:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_fast_ids:{scenario.name}: {exc}") + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "render_fast_ids", + scenario.name, + len(py_ids), 
+ lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda roles=roles, contents=contents, add_generation_prompt=scenario.add_generation_prompt: ( + native_renderer.render_fast_ids( + roles, + contents, + add_generation_prompt=add_generation_prompt, + ) + ), + lambda roles=roles, contents=contents, add_generation_prompt=scenario.add_generation_prompt: ( + native_renderer.render_fast_ids_np( + roles, + contents, + add_generation_prompt=add_generation_prompt, + ) + ), + ) + ) + try: + prepared_tools = ( + native_renderer.prepare_tools(scenario.tools) + if scenario.tools is not None + else None + ) + session = native_renderer.new_session( + scenario.messages, + tools=prepared_tools, + ) + session_np = native_renderer.new_session( + scenario.messages, + tools=prepared_tools, + ) + session_ids = _as_ids( + session.render_ids( + add_generation_prompt=scenario.add_generation_prompt, + ) + ) + session_np_ids = session_np.render_ids_np( + add_generation_prompt=scenario.add_generation_prompt, + ).tolist() + if py_ids != session_ids: + raise AssertionError("session render_ids parity failed") + if py_ids != session_np_ids: + raise AssertionError("session numpy render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:session_render_ids:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:session_render_ids:{scenario.name}: {exc}") + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "session_render_ids", + scenario.name, + len(py_ids), + lambda scenario=scenario: py_renderer.render_ids( + scenario.messages, + tools=scenario.tools, + add_generation_prompt=scenario.add_generation_prompt, + ), + lambda session=session, add_generation_prompt=scenario.add_generation_prompt: ( + session.render_ids(add_generation_prompt=add_generation_prompt) + ), + lambda session=session_np, 
add_generation_prompt=scenario.add_generation_prompt: ( + session.render_ids_np(add_generation_prompt=add_generation_prompt) + ), + ) + ) + + +def _add_parse_cases( + cases: list[BenchCase], + skipped: list[str], + *, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: + for scenario in parse_scenarios(): + try: + py_completion_ids = _completion_ids(py_renderer, scenario) + native_completion_ids = _completion_ids(native_renderer, scenario) + native_prompt_np = native_renderer.render_ids_np( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + native_full_np = native_renderer.render_ids_np( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + native_completion_np = native_full_np[len(native_prompt_np) :] + if py_completion_ids != native_completion_ids: + raise AssertionError("completion parity failed") + _assert_parsed_equal( + py_renderer.parse_response(py_completion_ids), + native_renderer.parse_response(py_completion_ids), + ) + _assert_parsed_equal( + py_renderer.parse_response(py_completion_ids), + native_renderer.parse_response_np(native_completion_np), + ) + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:parse_response:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:parse_response:{scenario.name}: {exc}") + continue + cases.append( + BenchCase( + spec.family, + spec.model, + "parse_response", + scenario.name, + len(py_completion_ids), + lambda ids=py_completion_ids: py_renderer.parse_response(ids), + lambda ids=py_completion_ids: native_renderer.parse_response(ids), + lambda ids=native_completion_np: native_renderer.parse_response_np(ids), + ) + ) + + +def _add_bridge_cases( + cases: list[BenchCase], + skipped: list[str], + *, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: + for scenario in bridge_scenarios(): + try: + prev_prompt, prev_completion = _bridge_inputs(py_renderer, 
scenario) + native_prev_prompt, native_prev_completion = _bridge_inputs( + native_renderer, scenario + ) + if ( + prev_prompt != native_prev_prompt + or prev_completion != native_prev_completion + ): + raise AssertionError("bridge input parity failed") + + py_bridge = py_renderer.bridge_to_next_turn( + prev_prompt, + prev_completion, + scenario.new_messages, + tools=scenario.tools, + ) + native_bridge = native_renderer.bridge_to_next_turn( + prev_prompt, + prev_completion, + scenario.new_messages, + tools=scenario.tools, + ) + if py_bridge is None and native_bridge is None: + continue + if py_bridge is None or native_bridge is None: + raise AssertionError("bridge None parity failed") + if list(py_bridge.token_ids) != list(native_bridge.token_ids): + raise AssertionError("bridge parity failed") + + native_prev_prompt_np = native_renderer.render_ids_np( + scenario.prompt, + tools=scenario.tools, + add_generation_prompt=True, + ) + native_full_np = native_renderer.render_ids_np( + scenario.prompt + [scenario.assistant], + tools=scenario.tools, + ) + native_prev_completion_np = native_full_np[len(native_prev_prompt_np) :] + native_bridge_np = native_renderer.bridge_to_next_turn_np( + native_prev_prompt_np, + native_prev_completion_np, + scenario.new_messages, + tools=scenario.tools, + ) + if native_bridge_np is None: + raise AssertionError("numpy bridge returned None") + if list(py_bridge.token_ids) != native_bridge_np.tolist(): + raise AssertionError("numpy bridge parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:bridge_to_next_turn:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:bridge_to_next_turn:{scenario.name}: {exc}") + continue + + cases.append( + BenchCase( + spec.family, + spec.model, + "bridge_to_next_turn", + scenario.name, + len(py_bridge.token_ids), + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + py_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + 
tools=scenario.tools, + ) + ), + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + native_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) + ), + lambda scenario=scenario, pp=native_prev_prompt_np, pc=native_prev_completion_np: ( + native_renderer.bridge_to_next_turn_np( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) + ), + ) + ) + + try: + native_tools = ( + native_renderer.prepare_tools(scenario.tools) + if scenario.tools is not None + else None + ) + session = native_renderer.new_session(scenario.prompt, tools=native_tools) + session_prompt = list(session.render_ids(add_generation_prompt=True)) + if session_prompt != native_prev_prompt: + raise AssertionError("session prompt parity failed") + session_bridge = session.bridge_to_next_turn( + prev_completion, + scenario.new_messages, + update=False, + ) + if session_bridge is None: + raise AssertionError("session bridge returned None") + if list(py_bridge.token_ids) != list(session_bridge.token_ids): + raise AssertionError("session bridge parity failed") + + session_np = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + session_prompt_np = session_np.render_ids_np(add_generation_prompt=True) + if session_prompt_np.tolist() != native_prev_prompt: + raise AssertionError("session numpy prompt parity failed") + session_bridge_np = session_np.bridge_to_next_turn_np( + native_prev_completion_np, + scenario.new_messages, + update=False, + ) + if session_bridge_np is None: + raise AssertionError("session numpy bridge returned None") + if list(py_bridge.token_ids) != session_bridge_np.tolist(): + raise AssertionError("session numpy bridge parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:session_bridge_to_next_turn:{scenario.name}: {exc}" + ) from exc + skipped.append( + f"{spec.family}:session_bridge_to_next_turn:{scenario.name}: {exc}" + ) + continue + + bench_session = 
native_renderer.new_session(scenario.prompt, tools=native_tools) + bench_session.render_ids(add_generation_prompt=True) + bench_session_np = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + bench_session_np.render_ids_np(add_generation_prompt=True) + + cases.append( + BenchCase( + spec.family, + spec.model, + "session_bridge_to_next_turn", + scenario.name, + len(py_bridge.token_ids), + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + py_renderer.bridge_to_next_turn( + pp, + pc, + scenario.new_messages, + tools=scenario.tools, + ) + ), + lambda scenario=scenario, pc=prev_completion, session=bench_session: ( + _session_bridge_to_next_turn( + session, + pc, + scenario.new_messages, + ) + ), + lambda scenario=scenario, pc=native_prev_completion_np, session=bench_session_np: ( + _session_bridge_to_next_turn_np( + session, + pc, + scenario.new_messages, + ) + ), + ) + ) + + loop_steps = 4 + try: + py_loop = _bridge_loop( + py_renderer, + prev_prompt, + prev_completion, + scenario.new_messages, + scenario.tools, + steps=loop_steps, + ) + native_loop = _bridge_loop( + native_renderer, + prev_prompt, + prev_completion, + scenario.new_messages, + scenario.tools, + steps=loop_steps, + ) + if list(py_loop.token_ids) != list(native_loop.token_ids): + raise AssertionError("bridge loop parity failed") + + session_loop = _new_session_bridge_loop( + native_renderer, + scenario.prompt, + native_tools, + prev_completion, + scenario.new_messages, + steps=loop_steps, + ) + if list(py_loop.token_ids) != list(session_loop.token_ids): + raise AssertionError("session bridge loop parity failed") + + session_loop_np = _new_session_bridge_loop_np( + native_renderer, + scenario.prompt, + native_tools, + native_prev_completion_np, + scenario.new_messages, + steps=loop_steps, + ) + if list(py_loop.token_ids) != session_loop_np.tolist(): + raise AssertionError("session numpy bridge loop parity failed") + except Exception as exc: + if strict: + raise 
RuntimeError( + f"{spec.family}:session_bridge_loop:{scenario.name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:session_bridge_loop:{scenario.name}: {exc}") + continue + + bench_loop_session = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + bench_loop_session.render_ids(add_generation_prompt=True) + bench_loop_session_np = native_renderer.new_session( + scenario.prompt, tools=native_tools + ) + bench_loop_session_np.render_ids_np(add_generation_prompt=True) + + cases.append( + BenchCase( + spec.family, + spec.model, + "session_bridge_loop", + f"{scenario.name}_{loop_steps}_steps", + len(py_loop.token_ids), + lambda scenario=scenario, pp=prev_prompt, pc=prev_completion: ( + _bridge_loop( + py_renderer, + pp, + pc, + scenario.new_messages, + scenario.tools, + steps=loop_steps, + ) + ), + lambda scenario=scenario, pc=prev_completion, session=bench_loop_session: ( + _session_bridge_loop( + session.fork(), + pc, + scenario.new_messages, + steps=loop_steps, + ) + ), + lambda scenario=scenario, pc=native_prev_completion_np, session=bench_loop_session_np: ( + _session_bridge_loop_np( + session.fork(), + pc, + scenario.new_messages, + steps=loop_steps, + ) + ), + ) + ) + + +def _add_batch_cases( + cases: list[BenchCase], + skipped: list[str], + *, + spec: FamilySpec, + py_renderer: Any, + native_renderer: Any, + strict: bool, +) -> None: + batch = _batch_messages() + batch_scenarios: list[tuple[str, list[ToolSpec] | None, Any]] = [ + ("short_batch", None, None) + ] + try: + prepared_tools = native_renderer.prepare_tools(TOOLS) + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_batch_ids:prepare_tools: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_batch_ids:prepare_tools: {exc}") + prepared_tools = None + batch_scenarios.append(("short_batch_prepared_tools", TOOLS, prepared_tools)) + + for scenario_name, tools, prepared_tools in batch_scenarios: + if tools is not None and prepared_tools is 
None: + continue + native_tools = prepared_tools if prepared_tools is not None else None + try: + py_batch = [ + list( + py_renderer.render_ids( + messages, tools=tools, add_generation_prompt=True + ) + ) + for messages in batch + ] + native_batch = [ + list(ids) + for ids in native_renderer.render_batch_ids( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ] + native_packed_batch = _packed_batch_to_lists( + native_renderer.render_batch_ids_np_packed( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ) + if py_batch != native_batch: + raise AssertionError("batch render_ids parity failed") + if py_batch != native_packed_batch: + raise AssertionError("packed numpy batch render_ids parity failed") + except Exception as exc: + if strict: + raise RuntimeError( + f"{spec.family}:render_batch_ids:{scenario_name}: {exc}" + ) from exc + skipped.append(f"{spec.family}:render_batch_ids:{scenario_name}: {exc}") + continue + + cases.append( + BenchCase( + spec.family, + spec.model, + "render_batch_ids", + scenario_name, + _sum_token_count(py_batch), + lambda batch=batch, tools=tools: [ + py_renderer.render_ids( + messages, + tools=tools, + add_generation_prompt=True, + ) + for messages in batch + ], + lambda batch=batch, native_tools=native_tools: ( + native_renderer.render_batch_ids( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ), + lambda batch=batch, native_tools=native_tools: ( + native_renderer.render_batch_ids_np_packed( + batch, + tools=native_tools, + add_generation_prompt=True, + ) + ), + ) + ) + + +def build_cases( + *, + specs: Sequence[FamilySpec], + native_module: Any, + strict: bool, +) -> tuple[list[BenchCase], list[str]]: + cases: list[BenchCase] = [] + skipped: list[str] = [] + for spec in specs: + try: + with contextlib.redirect_stdout(io.StringIO()): + tokenizer = load_tokenizer(spec.model) + tokenizer_path = router.resolve_tokenizer_path(tokenizer) + if not os.path.exists(tokenizer_path): + raise 
FileNotFoundError(tokenizer_path) + py_renderer = build_python_renderer(spec.family, tokenizer) + native_renderer = build_native_renderer( + native_module, spec.family, tokenizer_path + ) + family_cases: list[BenchCase] = [] + _add_render_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) + _add_parse_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) + _add_bridge_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) + _add_batch_cases( + family_cases, + skipped, + spec=spec, + py_renderer=py_renderer, + native_renderer=native_renderer, + strict=strict, + ) + cases.extend(family_cases) + print( + f"prepared family={spec.family} model={spec.model} " + f"tokenizer_path={tokenizer_path}", + file=sys.stderr, + ) + except Exception as exc: + message = f"{spec.family} ({spec.model}): {exc}" + if strict: + raise RuntimeError(message) from exc + skipped.append(message) + print(f"skipped {message}", file=sys.stderr) + return cases, skipped + + +def run_cases( + cases: Sequence[BenchCase], + *, + min_time_s: float, + repeats: int, + memory_loops: int, +) -> list[BenchRow]: + gc.collect() + gc.disable() + try: + rows: list[BenchRow] = [] + current_family: str | None = None + family_started_ns = time.perf_counter_ns() + total_steps = len(cases) * 4 + step = 0 + + def progress(case: BenchCase, label: str) -> None: + nonlocal step + step += 1 + print( + f"[{step}/{total_steps}] {case.family} {case.operation} " + f"{case.scenario}: {label}", + file=sys.stderr, + ) + + def finish_family(family: str) -> None: + family_rows = [row for row in rows if row.family == family] + if not family_rows: + return + elapsed_s = (time.perf_counter_ns() - family_started_ns) / 1_000_000_000 + list_speedup = geometric_mean([row.list_speedup for row in family_rows]) + 
np_speedup = geometric_mean( + [row.np_speedup for row in family_rows if row.np_speedup is not None] + ) + print( + f"family={family} rows={len(family_rows)} " + f"list_geomean={list_speedup:.2f}x " + f"np_geomean={np_speedup:.2f}x elapsed={elapsed_s:.1f}s", + file=sys.stderr, + ) + + for case in cases: + if current_family is None: + current_family = case.family + family_started_ns = time.perf_counter_ns() + elif case.family != current_family: + finish_family(current_family) + current_family = case.family + family_started_ns = time.perf_counter_ns() + + progress(case, "python") + py_timing = time_case(case.py_fn, min_time_s=min_time_s, repeats=repeats) + progress(case, "native list") + native_timing = time_case( + case.native_fn, min_time_s=min_time_s, repeats=repeats + ) + progress(case, "native np") + native_np_timing = ( + time_case(case.native_np_fn, min_time_s=min_time_s, repeats=repeats) + if case.native_np_fn is not None + else None + ) + progress(case, "memory") + py_memory = memory_case(case.py_fn, loops=memory_loops) + native_memory = memory_case(case.native_fn, loops=memory_loops) + native_np_memory = ( + memory_case(case.native_np_fn, loops=memory_loops) + if case.native_np_fn is not None + else None + ) + rows.append( + BenchRow( + case.family, + case.model, + case.operation, + case.scenario, + case.token_count, + py_timing, + native_timing, + native_np_timing, + py_memory, + native_memory, + native_np_memory, + ) + ) + if current_family is not None: + finish_family(current_family) + finally: + gc.enable() + return rows + + +def geometric_mean(values: Sequence[float]) -> float: + if not values: + return 0.0 + product = 1.0 + for value in values: + product *= value + return product ** (1.0 / len(values)) + + +def _run_text(args: Sequence[str]) -> str | None: + try: + result = subprocess.run( + args, + check=True, + capture_output=True, + text=True, + ) + except (OSError, subprocess.CalledProcessError): + return None + return result.stdout.strip() or 
None + + +def _git_metadata() -> dict[str, Any]: + return { + "commit": _run_text(["git", "rev-parse", "HEAD"]), + "short_commit": _run_text(["git", "rev-parse", "--short", "HEAD"]), + "dirty": bool(_run_text(["git", "status", "--porcelain"])), + "branch": _run_text(["git", "branch", "--show-current"]), + } + + +def _cpu_model() -> str | None: + if sys.platform == "darwin": + return _run_text(["sysctl", "-n", "machdep.cpu.brand_string"]) + if sys.platform.startswith("linux"): + try: + cpuinfo = Path("/proc/cpuinfo").read_text(encoding="utf-8") + except OSError: + return None + for line in cpuinfo.splitlines(): + if line.startswith("model name"): + return line.split(":", 1)[1].strip() + return platform.processor() or None + + +def _timing_dict(timing: Timing) -> dict[str, Any]: + return { + "loops": timing.loops, + "median_ns": timing.median_ns, + "median_us": timing.median_us, + "min_ns": timing.min_ns, + "max_ns": timing.max_ns, + } + + +def _memory_dict(memory: Memory) -> dict[str, Any]: + return { + "loops": memory.loops, + "peak_bytes": memory.peak_bytes, + "peak_kib": memory.peak_kib, + } + + +def _result_rows(rows: Sequence[BenchRow]) -> list[dict[str, Any]]: + result: list[dict[str, Any]] = [] + for row in rows: + base = { + "family": row.family, + "model": row.model, + "operation": row.operation, + "scenario": row.scenario, + "token_count": row.token_count, + } + result.append( + { + **base, + "path": "python", + "timing": _timing_dict(row.py_timing), + "memory": _memory_dict(row.py_memory), + "speedup_vs_python": 1.0, + } + ) + result.append( + { + **base, + "path": "native_list", + "timing": _timing_dict(row.native_timing), + "memory": _memory_dict(row.native_memory), + "speedup_vs_python": row.list_speedup, + } + ) + if row.native_np_timing is not None and row.native_np_memory is not None: + result.append( + { + **base, + "path": "native_np", + "timing": _timing_dict(row.native_np_timing), + "memory": _memory_dict(row.native_np_memory), + 
"speedup_vs_python": row.np_speedup, + } + ) + return result + + +def _family_summaries(rows: Sequence[BenchRow]) -> list[dict[str, Any]]: + summaries: list[dict[str, Any]] = [] + for family in sorted({row.family for row in rows}): + family_rows = [row for row in rows if row.family == family] + summaries.append( + { + "family": family, + "rows": len(family_rows), + "list_geomean_speedup": geometric_mean( + [row.list_speedup for row in family_rows] + ), + "np_geomean_speedup": geometric_mean( + [ + row.np_speedup + for row in family_rows + if row.np_speedup is not None + ] + ), + } + ) + return summaries + + +def _overall_summary(rows: Sequence[BenchRow]) -> dict[str, Any]: + return { + "rows": len(rows), + "list_geomean_speedup": geometric_mean([row.list_speedup for row in rows]), + "np_geomean_speedup": geometric_mean( + [row.np_speedup for row in rows if row.np_speedup is not None] + ), + } + + +def build_result_document( + *, + rows: Sequence[BenchRow], + skipped: Sequence[str], + args: argparse.Namespace, + native_module: Any, +) -> dict[str, Any]: + return { + "schema_version": 1, + "metadata": { + "git": _git_metadata(), + "python": { + "version": sys.version, + "executable": sys.executable, + }, + "rust": { + "rustc": _run_text(["rustc", "--version"]), + }, + "platform": { + "platform": platform.platform(), + "machine": platform.machine(), + "processor": platform.processor(), + "cpu_model": _cpu_model(), + }, + "native_extension": { + "module_file": getattr(native_module, "__file__", None), + "build_mode": "unknown", + }, + }, + "args": { + "families": args.families, + "model": args.model, + "min_time": args.min_time, + "repeats": args.repeats, + "memory_loops": args.memory_loops, + "strict": args.strict, + }, + "summary": _overall_summary(rows), + "families": _family_summaries(rows), + "rows": _result_rows(rows), + "skipped": list(skipped), + } + + +def write_json(path: str, document: dict[str, Any]) -> None: + output = Path(path) + 
output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(json.dumps(document, indent=2, sort_keys=True), encoding="utf-8") + + +def _row_key(row: dict[str, Any]) -> tuple[str, str, str, str]: + return ( + str(row["family"]), + str(row["operation"]), + str(row["scenario"]), + str(row["path"]), + ) + + +def compare_to_baseline( + current: dict[str, Any], baseline: dict[str, Any] +) -> list[BaselineDiff]: + current_by_key = {_row_key(row): row for row in current.get("rows", [])} + baseline_by_key = {_row_key(row): row for row in baseline.get("rows", [])} + diffs: list[BaselineDiff] = [] + for key in sorted(set(current_by_key) | set(baseline_by_key)): + current_row = current_by_key.get(key) + baseline_row = baseline_by_key.get(key) + current_median = ( + current_row["timing"]["median_ns"] if current_row is not None else None + ) + baseline_median = ( + baseline_row["timing"]["median_ns"] if baseline_row is not None else None + ) + ratio = ( + current_median / baseline_median + if current_median is not None and baseline_median is not None + else None + ) + family, operation, scenario, path = key + diffs.append( + BaselineDiff( + family=family, + operation=operation, + scenario=scenario, + path=path, + current_median_ns=current_median, + baseline_median_ns=baseline_median, + ratio=ratio, + ) + ) + return diffs + + +def _load_baseline(path: str | None) -> dict[str, Any] | None: + if path is None: + return None + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def _format_us(ns: float | None) -> str: + if ns is None: + return "-" + return f"{ns / 1000.0:.3f}" + + +def _format_change(diff: BaselineDiff) -> str: + if diff.percent_change is None: + return "-" + return f"{diff.percent_change:+.1f}%" + + +def _diff_label(diff: BaselineDiff) -> str: + return f"{diff.family}/{diff.operation}/{diff.scenario}/{diff.path}" + + +def write_markdown( + path: str, + document: dict[str, Any], + baseline_diffs: Sequence[BaselineDiff], +) -> None: + output = 
Path(path) + output.parent.mkdir(parents=True, exist_ok=True) + summary = document["summary"] + lines = [ + "# Native Runtime Benchmark", + "", + "## Summary", + "", + "| rows | list geomean | np geomean | commit | dirty |", + "|---:|---:|---:|---|---|", + ( + f"| {summary['rows']} | {summary['list_geomean_speedup']:.2f}x | " + f"{summary['np_geomean_speedup']:.2f}x | " + f"`{document['metadata']['git']['short_commit']}` | " + f"{document['metadata']['git']['dirty']} |" + ), + "", + "## Families", + "", + "| family | rows | list geomean | np geomean |", + "|---|---:|---:|---:|", + ] + for item in document["families"]: + lines.append( + f"| `{item['family']}` | {item['rows']} | " + f"{item['list_geomean_speedup']:.2f}x | " + f"{item['np_geomean_speedup']:.2f}x |" + ) + + if baseline_diffs: + comparable = [diff for diff in baseline_diffs if diff.ratio is not None] + regressions = sorted( + [diff for diff in comparable if diff.ratio and diff.ratio > 1.05], + key=lambda diff: diff.ratio or 0.0, + reverse=True, + )[:10] + improvements = sorted( + [diff for diff in comparable if diff.ratio and diff.ratio < 0.95], + key=lambda diff: diff.ratio or 1.0, + )[:10] + missing_current = [ + diff for diff in baseline_diffs if diff.current_median_ns is None + ] + new_rows = [diff for diff in baseline_diffs if diff.baseline_median_ns is None] + + lines.extend( + [ + "", + "## Worst Regressions", + "", + "| case | current us | baseline us | change |", + "|---|---:|---:|---:|", + ] + ) + if regressions: + for diff in regressions: + lines.append( + f"| `{_diff_label(diff)}` | {_format_us(diff.current_median_ns)} | " + f"{_format_us(diff.baseline_median_ns)} | {_format_change(diff)} |" + ) + else: + lines.append("| none | - | - | - |") + + lines.extend( + [ + "", + "## Best Improvements", + "", + "| case | current us | baseline us | change |", + "|---|---:|---:|---:|", + ] + ) + if improvements: + for diff in improvements: + lines.append( + f"| `{_diff_label(diff)}` | 
{_format_us(diff.current_median_ns)} | " + f"{_format_us(diff.baseline_median_ns)} | {_format_change(diff)} |" + ) + else: + lines.append("| none | - | - | - |") + + if missing_current or new_rows: + lines.extend(["", "## Coverage Changes", ""]) + if missing_current: + lines.append("Missing current rows:") + lines.extend(f"- `{_diff_label(diff)}`" for diff in missing_current) + if new_rows: + lines.append("New rows:") + lines.extend(f"- `{_diff_label(diff)}`" for diff in new_rows) + + lines.extend(["", "## Skipped Cases", ""]) + if document["skipped"]: + lines.extend(f"- {item}" for item in document["skipped"]) + else: + lines.append("None.") + lines.append("") + output.write_text("\n".join(lines), encoding="utf-8") + + +def print_results( + rows: Sequence[BenchRow], skipped: Sequence[str], memory_loops: int +) -> None: + print( + "| family | operation | scenario | tokens | python us | native list us | " + "native np us | list speedup | np speedup | python peak KiB | " + "native list peak KiB | native np peak KiB |" + ) + print("|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|") + for row in rows: + np_us = ( + f"{row.native_np_timing.median_us:.3f}" + if row.native_np_timing is not None + else "-" + ) + np_speedup = f"{row.np_speedup:.2f}x" if row.np_speedup is not None else "-" + np_peak = ( + f"{row.native_np_memory.peak_kib:.1f}" + if row.native_np_memory is not None + else "-" + ) + print( + f"| `{row.family}` | `{row.operation}` | `{row.scenario}` | " + f"{row.token_count} | {row.py_timing.median_us:.3f} | " + f"{row.native_timing.median_us:.3f} | {np_us} | " + f"{row.list_speedup:.2f}x | {np_speedup} | " + f"{row.py_memory.peak_kib:.1f} | {row.native_memory.peak_kib:.1f} | " + f"{np_peak} |" + ) + + print() + print("| family | rows | list geomean speedup | np geomean speedup |") + print("|---|---:|---:|---:|") + families = sorted({row.family for row in rows}) + for family in families: + family_rows = [row for row in rows if row.family == family] + 
list_speedup = geometric_mean([row.list_speedup for row in family_rows]) + np_speedup = geometric_mean( + [row.np_speedup for row in family_rows if row.np_speedup is not None] + ) + print( + f"| `{family}` | {len(family_rows)} | {list_speedup:.2f}x | " + f"{np_speedup:.2f}x |" + ) + + print() + print( + "memory note: peak KiB uses Python tracemalloc over " + f"{memory_loops} calls; Rust allocator and NumPy native data buffers " + "are not included." + ) + if skipped: + print() + print("Skipped cases:") + for item in skipped: + print(f"- {item}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--families", + default="all", + help=( + "Comma-separated family keys or 'all'. Known keys: " + + ", ".join(sorted(FAMILY_BY_NAME)) + ), + ) + parser.add_argument( + "--model", + action="append", + default=[], + help=( + "Override model id. Use MODEL when one family is selected, or " + "FAMILY=MODEL for multi-family runs. May be repeated." + ), + ) + parser.add_argument("--min-time", type=float, default=0.35) + parser.add_argument("--repeats", type=int, default=7) + parser.add_argument( + "--memory-loops", + type=int, + default=1000, + help=( + "Iterations for tracemalloc peak measurement. This tracks Python " + "heap allocations, including PyO3 boundary objects, not Rust malloc " + "or NumPy native data buffers." 
+ ), + ) + parser.add_argument( + "--strict", + action="store_true", + help="Fail instead of skipping families whose tokenizer is unavailable.", + ) + parser.add_argument( + "--json-out", + help="Write structured benchmark results to this JSON file.", + ) + parser.add_argument( + "--markdown-out", + help="Write a Markdown benchmark summary to this file.", + ) + parser.add_argument( + "--baseline", + help="Compare current results against a previous JSON benchmark artifact.", + ) + parser.add_argument( + "--fail-on-regression", + type=float, + help=( + "Exit non-zero when a baseline row regresses by more than this " + "percentage. Missing current baseline rows also fail this gate." + ), + ) + args = parser.parse_args() + + os.environ.pop("RENDERERS_NATIVE", None) + logging.getLogger("transformers_modules").setLevel(logging.ERROR) + logging.getLogger("transformers").setLevel(logging.ERROR) + native = router.load_native() + if native is None: + raise RuntimeError( + "renderers_native is not built; run `uv run maturin develop " + "--manifest-path crates/renderers-py/Cargo.toml --release`" + ) + + specs = apply_model_overrides(parse_families(args.families), args.model) + cases, skipped = build_cases(specs=specs, native_module=native, strict=args.strict) + if not cases: + raise RuntimeError("no benchmark cases were prepared") + rows = run_cases( + cases, + min_time_s=args.min_time, + repeats=args.repeats, + memory_loops=args.memory_loops, + ) + document = build_result_document( + rows=rows, + skipped=skipped, + args=args, + native_module=native, + ) + baseline = _load_baseline(args.baseline) + baseline_diffs = compare_to_baseline(document, baseline) if baseline else [] + print_results(rows, skipped, args.memory_loops) + if args.json_out: + if baseline_diffs: + document["baseline"] = { + "path": args.baseline, + "diffs": [ + { + "family": diff.family, + "operation": diff.operation, + "scenario": diff.scenario, + "path": diff.path, + "current_median_ns": 
diff.current_median_ns, + "baseline_median_ns": diff.baseline_median_ns, + "ratio": diff.ratio, + "percent_change": diff.percent_change, + } + for diff in baseline_diffs + ], + } + write_json(args.json_out, document) + print(f"wrote json={args.json_out}", file=sys.stderr) + if args.markdown_out: + write_markdown(args.markdown_out, document, baseline_diffs) + print(f"wrote markdown={args.markdown_out}", file=sys.stderr) + + if args.fail_on_regression is not None and baseline_diffs: + threshold = args.fail_on_regression / 100.0 + regressions = [ + diff + for diff in baseline_diffs + if diff.ratio is not None and diff.ratio > 1.0 + threshold + ] + missing_current = [ + diff for diff in baseline_diffs if diff.current_median_ns is None + ] + if regressions or missing_current: + details = ", ".join( + _diff_label(diff) for diff in [*regressions, *missing_current][:5] + ) + raise SystemExit( + f"benchmark regression gate failed: {len(regressions)} " + f"regressions, {len(missing_current)} missing current rows; {details}" + ) + + +if __name__ == "__main__": + main() diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..916ec78 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,10 @@ +# Renderer functions can grow long: the family-specific render/parse +# paths string together many small steps and reading them top-to-bottom +# is the point. Bumped from the default 100 to accommodate our longest +# real function (parse_minimax at 121 lines) without giving up the lint. +too-many-lines-threshold = 130 + +# Config builders carry several independent bool toggles (enable_thinking, +# preserve_all_thinking, preserve_thinking_between_tool_calls, ...). Four +# is the natural shape; five+ would warrant a flags struct. 
+max-struct-bools = 4 diff --git a/crates/README.md b/crates/README.md new file mode 100644 index 0000000..cd8f7ea --- /dev/null +++ b/crates/README.md @@ -0,0 +1,116 @@ +# `renderers` Rust port + +Pure-Rust port of the `renderers` library, with a thin PyO3 wrapper so +existing Python callers can opt into the native path without code +changes. + +## Workspace layout + +| Crate | Role | +| ---------------- | ------------------------------------------------------------------------------------------ | +| `renderers-core` | Pure-Rust crate. Public `Renderer` / `MultimodalRenderer` traits, family implementations. | +| `renderers-py` | PyO3 wrapper. Builds the `renderers_native` extension module via maturin. | + +The Rust crate is usable standalone (e.g. from an sglang-rs / vllm-rs +integration); the Python wrapper exists only to bridge into the +existing `renderers` package. + +## Building the native extension + +For development (editable install into the active venv): + +```bash +maturin develop --manifest-path crates/renderers-py/Cargo.toml --release +``` + +This installs `renderers_native.<abi-tag>.so` into the venv's +`site-packages` so `import renderers_native` resolves. It's kept as a +top-level module (rather than `renderers._native`) so the maturin-built +wheel doesn't collide with the hatchling-built `renderers` wheel at +install time. + +## Opting into the native path at runtime + +The Python shims keep the pure-Python implementation as the default and +only route to the native module when `RENDERERS_NATIVE` selects the +family: + +```bash +RENDERERS_NATIVE=qwen3 pytest tests/test_render_ids.py +RENDERERS_NATIVE=all pytest tests/ # everything ported +``` + +## Parity testing + +Two complementary suites validate the port: + +1. **`tests/test_render_ids.py` (and siblings)** — Python (or, when the + env var routes, native) vs HuggingFace's `apply_chat_template`. + Catches drift from the upstream reference.
Run under the native + path with `RENDERERS_NATIVE=qwen3 pytest tests/test_render_ids.py`. +2. **`tests/test_native_parity.py`** — Python vs native, holding the + reference fixed. Catches drift between the two implementations even + if HuggingFace changes its template. Cheaper because the HF call + isn't on the path. Marker: `-m parity`. + +The parity suite skips cleanly when the tokenizer.json isn't on disk +or the extension isn't built, so it's safe to import in CI without +gating on either. + +Recognised `RENDERERS_NATIVE` values: + +| `RENDERERS_NATIVE` | Behaviour | +| ------------------ | -------------------------------------------------------- | +| unset / `0` | Pure Python (default) | +| `1` / `all` | Route every supported family to the native module | +| `qwen3` | Route only Qwen3 | +| `qwen3,qwen35,...` | Route a comma-separated list of families | + +If `RENDERERS_NATIVE` is set but the extension isn't installed, the +shim logs a one-shot info message and falls back to Python. + +## Family coverage + +| Family | Status | +| ------------ | ----------------------------------------------- | +| Qwen3 | ✅ ported (Phase 2) | +| Qwen3.5 | ✅ ported text-only (Phase 3) — multimodal Phase 5 | +| GLM 4.5 / 5 | ✅ ported (Phase 3) — GLM-5, GLM-5.1, GLM-4.5 | +| DeepSeek V3 | ✅ ported (Phase 3) | +| Nemotron3 | ✅ ported (Phase 3) | +| Kimi K2 | ✅ ported (Phase 4) | +| Kimi K2.5 | planned (Phase 4 — text; multimodal Phase 5) | +| MiniMax M2 | ✅ ported (Phase 4) | +| Qwen3.6 | ✅ ported (Phase 4) | +| Qwen3-VL | planned (Phase 5 — multimodal incl.
processor) | +| Qwen3.5 mm | planned (Phase 5) | +| GPT-OSS | planned (Phase 6 — via `openai-harmony` crate) | +| Default | planned (Phase 7 — via `minijinja`) | + +## Performance targets + +Single-call latency (Qwen3.5, 1500-token prompt, 512-token completion): + +| Phase | Python (current) | Rust (target) | Speedup | +| -------------------- | ---------------: | ------------: | ------: | +| `render_ids` | 0.5–1.0 ms | 0.15–0.3 ms | 3–5×| +| `parse_response` | 0.05–0.15 ms | 0.05–0.15 ms¹ | 5–10×| +| `bridge_to_next_turn`| 0.3–0.6 ms | 0.05–0.15 ms | 4–6×| + +¹ Speedup vs Python including FFI overhead, which is the actual gap; +absolute numbers depend on completion content shape. + +Throughput on an 8-thread caller is expected to gain another ~5–8× +because every method releases the GIL (`py.allow_threads`) — the Python +pool model is obsolete in Rust. + +## Crate-level invariants + +- `#![forbid(unsafe_code)]` at the crate root of `renderers-core`. +- All hot-path scans use bounded `&[u32]` slices; no allocation in + `find` / `find_from` / `find_any`. +- `RenderBuf` reserves capacity once based on `messages.len() * 256`. +- Special-token ids are resolved at renderer construction and cached + on the struct. +- Tokenizer is held behind `Arc<...>` so a single instance serves any + number of concurrent callers. diff --git a/crates/renderers-cli/Cargo.toml b/crates/renderers-cli/Cargo.toml new file mode 100644 index 0000000..f972537 --- /dev/null +++ b/crates/renderers-cli/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "renderers-cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "CLI runner for renderers-core — golden-test fixtures and ad-hoc rendering." 
+publish = false + +[[bin]] +name = "renderers-cli" +path = "src/main.rs" + +[dependencies] +renderers-core = { path = "../renderers-core" } +serde = { workspace = true } +serde_json = { workspace = true } +clap = { workspace = true } + +[dev-dependencies] +criterion = { version = "0.8", features = ["html_reports"] } + +[[bench]] +name = "qwen3" +harness = false + +[lints] +workspace = true diff --git a/crates/renderers-cli/benches/qwen3.rs b/crates/renderers-cli/benches/qwen3.rs new file mode 100644 index 0000000..73b23a5 --- /dev/null +++ b/crates/renderers-cli/benches/qwen3.rs @@ -0,0 +1,149 @@ +//! Qwen3 throughput benchmarks for `renderers-core`. +//! +//! Needs a real tokenizer.json on disk because the benchmarks measure +//! end-to-end render/parse latency (the tokenizer is on the hot path). +//! Set `BENCH_TOKENIZER=/path/to/tokenizer.json` before running: +//! +//! ```bash +//! BENCH_TOKENIZER=/path/to/qwen3-8b/tokenizer.json \ +//! cargo bench -p renderers-cli +//! ``` +//! +//! When `BENCH_TOKENIZER` is unset the benches return early without +//! failing — they're informational, not a CI gate. 
+ +use std::hint::black_box; +use std::time::Duration; + +use criterion::{Criterion, criterion_group, criterion_main}; + +use renderers_core::Renderer; +use renderers_core::families::Qwen3Renderer; +use renderers_core::tokenizer::Tokenizer; +use renderers_core::types::{Content, Message}; + +fn tokenizer() -> Option { + let path = std::env::var("BENCH_TOKENIZER").ok()?; + match Tokenizer::from_file(&path) { + Ok(t) => Some(t), + Err(e) => { + eprintln!("bench skipped — couldn't load tokenizer at {path}: {e}"); + None + } + } +} + +fn text_msg(role: &str, content: &str) -> Message { + Message { + role: role.to_string(), + content: Content::Text(content.to_string()), + ..Default::default() + } +} + +fn typical_conversation() -> Vec { + vec![ + text_msg( + "system", + "You are a helpful assistant that calls tools when needed.", + ), + text_msg( + "user", + "Plan a weekend trip to Lisbon for two; we like food and walking.", + ), + text_msg( + "assistant", + "I'll help. First, let me check the weather and find some restaurants.", + ), + text_msg("user", "Sounds good — go ahead."), + text_msg( + "assistant", + "Here's a plan: Friday evening tapas at Time Out Market, \ + Saturday morning walk through Alfama, Saturday lunch at \ + Ramiro (seafood), Saturday afternoon Belém pastéis, \ + Sunday morning São Jorge castle, Sunday lunch at Cervejaria \ + Trindade.", + ), + ] +} + +fn bench_render_ids(c: &mut Criterion) { + let Some(tok) = tokenizer() else { + return; + }; + let renderer = Qwen3Renderer::new(tok).expect("build Qwen3 renderer"); + let messages = typical_conversation(); + let mut group = c.benchmark_group("qwen3"); + group.measurement_time(Duration::from_secs(5)); + group.bench_function("render_ids/5_turn_text", |b| { + b.iter(|| { + let ids = renderer + .render_ids(black_box(&messages), None, true) + .expect("render_ids"); + black_box(ids); + }); + }); + group.finish(); +} + +fn bench_parse_response(c: &mut Criterion) { + let Some(tok) = tokenizer() else { + 
return; + }; + let renderer = Qwen3Renderer::new(tok).expect("build Qwen3 renderer"); + let messages = typical_conversation(); + // Render once to get a realistic completion-ish prefix; treat it + // as a "completion" for the parse benchmark. + let output = renderer.render(&messages, None, true).expect("render"); + let ids = output.token_ids; + + let mut group = c.benchmark_group("qwen3"); + group.measurement_time(Duration::from_secs(5)); + group.bench_function("parse_response/no_tool_calls", |b| { + b.iter(|| { + let parsed = renderer.parse_response(black_box(&ids)); + black_box(parsed); + }); + }); + group.finish(); +} + +fn bench_bridge(c: &mut Criterion) { + let Some(tok) = tokenizer() else { + return; + }; + let renderer = Qwen3Renderer::new(tok).expect("build Qwen3 renderer"); + let messages = typical_conversation(); + let output = renderer.render(&messages, None, true).expect("render"); + let prev_prompt_ids = output.token_ids.clone(); + let prev_completion_ids: Vec = vec![]; + let new_messages = vec![text_msg( + "user", + "Add a kid-friendly option for Sunday morning.", + )]; + + let mut group = c.benchmark_group("qwen3"); + group.measurement_time(Duration::from_secs(5)); + group.bench_function("bridge_to_next_turn/short_user_turn", |b| { + b.iter(|| { + let bridged = renderer + .bridge_to_next_turn( + black_box(&prev_prompt_ids), + black_box(&prev_completion_ids), + black_box(&new_messages), + None, + ) + .expect("bridge"); + black_box(bridged); + }); + }); + group.finish(); +} + +criterion_group!( + benches, + bench_render_ids, + bench_parse_response, + bench_bridge +); +criterion_main!(benches); diff --git a/crates/renderers-cli/src/main.rs b/crates/renderers-cli/src/main.rs new file mode 100644 index 0000000..6d54843 --- /dev/null +++ b/crates/renderers-cli/src/main.rs @@ -0,0 +1,214 @@ +//! `renderers-cli` — small dev tool that drives `renderers-core` +//! without going through Python. +//! +//! Designed for two use cases: +//! +//! 1. 
**Golden parity checking**: render a fixture JSON of messages +//! against a tokenizer.json, emit the result as JSON, and `diff` +//! against the Python reference output. The exit code is non-zero +//! if the run fails — the comparison is left to the caller (the +//! pytest harness does the actual diffing). +//! 2. **Manual prototyping**: try out new families / config changes +//! without spinning up the `PyO3` wheel. + +use std::path::PathBuf; +use std::process::ExitCode; + +use clap::{Parser, Subcommand, ValueEnum}; +use renderers_core::Renderer; +use renderers_core::families::Qwen3Renderer; +use renderers_core::tokenizer::Tokenizer; +use renderers_core::types::{Message, ParsedToolCall, RenderedTokens, ToolArguments, ToolSpec}; +use serde::Serialize; + +/// Render and parse messages via `renderers-core`. Output is line-by-line +/// JSON on stdout for easy diffing. +#[derive(Debug, Parser)] +#[command(name = "renderers-cli", version, about, long_about = None)] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + /// Render a conversation to token ids + per-token message indices. + Render(RenderArgs), + + /// Parse a completion's token ids into a structured response. + Parse(ParseArgs), +} + +/// Renderer families wired through to `renderers-core`. New families +/// land here as they're ported. +#[derive(Debug, Clone, Copy, ValueEnum)] +enum Family { + Qwen3, +} + +#[derive(Debug, Parser)] +struct RenderArgs { + /// Renderer family to instantiate. + #[arg(long, value_enum, default_value_t = Family::Qwen3)] + family: Family, + + /// Path to a `tokenizer.json` file. + #[arg(long)] + tokenizer: PathBuf, + + /// Path to a JSON file containing a list of messages. + #[arg(long)] + messages: PathBuf, + + /// Path to a JSON file containing a list of tool specs. + #[arg(long)] + tools: Option, + + /// Emit a trailing generation prompt (`<|im_start|>assistant\n` for + /// Qwen3). 
+ #[arg(long)] + gen_prompt: bool, +} + +#[derive(Debug, Parser)] +struct ParseArgs { + /// Renderer family to instantiate. + #[arg(long, value_enum, default_value_t = Family::Qwen3)] + family: Family, + + /// Path to a `tokenizer.json` file. + #[arg(long)] + tokenizer: PathBuf, + + /// JSON-encoded list of integer token ids + /// (e.g. `'[151644, 8948, 198, ...]'`). + #[arg(long)] + token_ids: String, +} + +#[derive(Serialize)] +struct RenderedJson { + token_ids: Vec, + message_indices: Vec, +} + +impl From for RenderedJson { + fn from(r: RenderedTokens) -> Self { + Self { + token_ids: r.token_ids, + message_indices: r.message_indices, + } + } +} + +#[derive(Serialize)] +struct ParsedToolCallJson<'a> { + raw: &'a str, + name: Option<&'a str>, + arguments: serde_json::Value, + status: &'static str, + token_span: Option<(usize, usize)>, + id: Option<&'a str>, +} + +impl<'a> From<&'a ParsedToolCall> for ParsedToolCallJson<'a> { + fn from(p: &'a ParsedToolCall) -> Self { + let args = match &p.arguments { + None => serde_json::Value::Null, + Some(ToolArguments::Object(v)) => v.clone(), + Some(ToolArguments::Raw(s)) => serde_json::Value::String(s.clone()), + }; + Self { + raw: &p.raw, + name: p.name.as_deref(), + arguments: args, + status: p.status.as_wire(), + token_span: p.token_span.as_ref().map(|r| (r.start, r.end)), + id: p.id.as_deref(), + } + } +} + +#[derive(Serialize)] +struct ParsedJson<'a> { + content: &'a str, + reasoning_content: Option<&'a str>, + tool_calls: Vec>, +} + +fn build_renderer(family: Family, tokenizer: Tokenizer) -> Result, String> { + match family { + Family::Qwen3 => Qwen3Renderer::new(tokenizer) + .map(|r| Box::new(r) as Box) + .map_err(|e| e.to_string()), + } +} + +fn load_messages(path: &PathBuf) -> Result, String> { + let bytes = std::fs::read(path).map_err(|e| format!("read {}: {e}", path.display()))?; + serde_json::from_slice(&bytes).map_err(|e| format!("messages JSON: {e}")) +} + +fn load_tools(path: &PathBuf) -> Result, String> { + 
let bytes = std::fs::read(path).map_err(|e| format!("read {}: {e}", path.display()))?; + serde_json::from_slice(&bytes).map_err(|e| format!("tools JSON: {e}")) +} + +fn parse_token_ids(s: &str) -> Result, String> { + let v: Vec = serde_json::from_str(s).map_err(|e| format!("token-ids JSON: {e}"))?; + v.into_iter() + .map(|t| u32::try_from(t).map_err(|_| format!("token id out of range: {t}"))) + .collect() +} + +fn run_render(args: &RenderArgs) -> Result<(), String> { + let tok = Tokenizer::from_file(&args.tokenizer) + .map_err(|e| format!("load tokenizer {}: {e}", args.tokenizer.display()))?; + let renderer = build_renderer(args.family, tok)?; + let messages = load_messages(&args.messages)?; + let tools = match args.tools.as_ref() { + Some(p) => Some(load_tools(p)?), + None => None, + }; + let output = renderer + .render(&messages, tools.as_deref(), args.gen_prompt) + .map_err(|e| e.to_string())?; + let json: RenderedJson = output.into(); + println!("{}", serde_json::to_string(&json).unwrap()); + Ok(()) +} + +fn run_parse(args: &ParseArgs) -> Result<(), String> { + let tok = Tokenizer::from_file(&args.tokenizer) + .map_err(|e| format!("load tokenizer {}: {e}", args.tokenizer.display()))?; + let renderer = build_renderer(args.family, tok)?; + let ids = parse_token_ids(&args.token_ids)?; + let parsed = renderer.parse_response(&ids); + let tool_calls: Vec> = parsed + .tool_calls + .iter() + .map(ParsedToolCallJson::from) + .collect(); + let json = ParsedJson { + content: &parsed.content, + reasoning_content: parsed.reasoning_content.as_deref(), + tool_calls, + }; + println!("{}", serde_json::to_string(&json).unwrap()); + Ok(()) +} + +fn main() -> ExitCode { + let cli = Cli::parse(); + let result = match cli.command { + Command::Render(args) => run_render(&args), + Command::Parse(args) => run_parse(&args), + }; + match result { + Ok(()) => ExitCode::SUCCESS, + Err(msg) => { + eprintln!("error: {msg}"); + ExitCode::FAILURE + } + } +} diff --git 
a/crates/renderers-core/Cargo.toml b/crates/renderers-core/Cargo.toml new file mode 100644 index 0000000..4d24b65 --- /dev/null +++ b/crates/renderers-core/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "renderers-core" +version = "0.1.0" +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "Deterministic message-to-token rendering for LLM training/inference (Rust core)." + +[lib] +path = "src/lib.rs" + +[dependencies] +tokenizers = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +regex = { workspace = true } +thiserror = { workspace = true } +smallvec = { workspace = true } +bumpalo = { workspace = true } +phf = { workspace = true } +openai-harmony = { version = "0.0.8", default-features = false } +minijinja = { version = "2", default-features = false, features = ["builtins", "serde"] } +image = { version = "0.25", default-features = false, features = ["jpeg", "png", "webp"] } +ndarray = "0.17" +sha2 = "0.11" + +[dev-dependencies] +serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/crates/renderers-core/src/bridge.rs b/crates/renderers-core/src/bridge.rs new file mode 100644 index 0000000..42b6627 --- /dev/null +++ b/crates/renderers-core/src/bridge.rs @@ -0,0 +1,103 @@ +//! Helpers shared across renderers' `bridge_to_next_turn` implementations. + +use crate::types::Message; + +/// Returns `true` if any message in `new_messages` carries the assistant +/// role. Bridges refuse to retokenize assistant content because it would +/// replace model-sampled tokens with canonical template text, violating +/// the byte-for-byte contract. 
+#[inline] +pub fn reject_assistant_in_extension(new_messages: &[Message]) -> bool { + new_messages.iter().any(|m| m.role == "assistant") +} + +/// Return the longest prefix of `prev_prompt + prev_completion` that ends +/// at a turn-close token, or `None` if none exists and `synthesize_close` +/// is `None`. +/// +/// Scans only within the completion segment — close tokens inside the +/// prompt are structural scaffolding, not turn boundaries the current +/// step produced. +/// +/// Returns an owned `Vec` that the caller can mutate; the inputs are +/// borrowed. +pub fn trim_to_turn_close( + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + close_token_ids: &[u32], + synthesize_close: Option, +) -> Option> { + let prompt_len = previous_prompt_ids.len(); + let total_len = prompt_len + previous_completion_ids.len(); + + // Walk the completion section backwards looking for a close token. + for offset in (0..previous_completion_ids.len()).rev() { + if close_token_ids.contains(&previous_completion_ids[offset]) { + let mut out = Vec::with_capacity(prompt_len + offset + 1); + out.extend_from_slice(previous_prompt_ids); + out.extend_from_slice(&previous_completion_ids[..=offset]); + return Some(out); + } + } + + let close = synthesize_close?; + let mut out = Vec::with_capacity(total_len + 1); + out.extend_from_slice(previous_prompt_ids); + out.extend_from_slice(previous_completion_ids); + out.push(close); + Some(out) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn msg(role: &str) -> Message { + Message { + role: role.to_string(), + ..Default::default() + } + } + + #[test] + fn rejects_assistant_in_extension() { + assert!(reject_assistant_in_extension(&[msg("assistant")])); + assert!(!reject_assistant_in_extension(&[msg("user"), msg("tool")])); + assert!(!reject_assistant_in_extension(&[])); + } + + #[test] + fn trim_to_close_keeps_prefix() { + let prompt = vec![1, 2, 3]; + let completion = vec![4, 5, 9, 6, 9]; + let close = [9u32]; + let trimmed = 
trim_to_turn_close(&prompt, &completion, &close, None).unwrap(); + assert_eq!(trimmed, vec![1, 2, 3, 4, 5, 9, 6, 9]); + } + + #[test] + fn trim_to_close_finds_last_close() { + let prompt = vec![1, 2]; + let completion = vec![9, 3, 4]; + let close = [9u32]; + let trimmed = trim_to_turn_close(&prompt, &completion, &close, None).unwrap(); + assert_eq!(trimmed, vec![1, 2, 9]); + } + + #[test] + fn trim_to_close_ignores_prompt_close() { + let prompt = vec![9, 1, 2]; + let completion = vec![3, 4]; + let close = [9u32]; + assert!(trim_to_turn_close(&prompt, &completion, &close, None).is_none()); + } + + #[test] + fn trim_to_close_synthesises_when_truncated() { + let prompt = vec![1, 2]; + let completion = vec![3, 4]; + let close = [9u32]; + let trimmed = trim_to_turn_close(&prompt, &completion, &close, Some(9)).unwrap(); + assert_eq!(trimmed, vec![1, 2, 3, 4, 9]); + } +} diff --git a/crates/renderers-core/src/emit.rs b/crates/renderers-core/src/emit.rs new file mode 100644 index 0000000..ae95268 --- /dev/null +++ b/crates/renderers-core/src/emit.rs @@ -0,0 +1,246 @@ +//! Render-buffer helpers used by every hand-coded family. +//! +//! The pattern is the same everywhere: pre-allocated `Vec` for tokens +//! and `Vec` for per-token message attribution, with three primitives +//! to fill them. Centralising the primitives lets each family stay focused +//! on its own template logic without re-deriving the bookkeeping. 
+ +use crate::tokenizer::Tokenizer; +use crate::types::{RenderError, RenderedTokens, SCAFFOLD_IDX}; + +pub trait TokenSink { + fn special(&mut self, token_id: u32, msg_idx: i32); + fn ids(&mut self, token_ids: &[u32], msg_idx: i32); + fn text(&mut self, text: &str, msg_idx: i32) -> Result<(), RenderError>; + + #[inline] + fn scaffold_special(&mut self, token_id: u32) { + self.special(token_id, SCAFFOLD_IDX); + } + + #[inline] + fn scaffold_text(&mut self, text: &str) -> Result<(), RenderError> { + self.text(text, SCAFFOLD_IDX) + } +} + +/// Mutable render-time buffer paired with a tokenizer reference. +/// +/// Holds both the token stream and the parallel `message_indices` array. +/// All emits are O(1) amortised against the pre-allocated capacity. +pub struct RenderBuf<'tok> { + tokens: Vec, + indices: Option>, + tokenizer: &'tok Tokenizer, + /// Scratch `Vec` reused across `encode` calls so each text segment + /// doesn't allocate. The tokenizer's `encode` API returns its own + /// `Encoding`, so the saving is at the buffer-extension layer, not + /// at encode itself. + scratch_offsets: Vec, +} + +impl std::fmt::Debug for RenderBuf<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RenderBuf") + .field("tokens_len", &self.tokens.len()) + .field("indices_len", &self.indices.as_ref().map(Vec::len)) + .finish() + } +} + +impl<'tok> RenderBuf<'tok> { + pub fn new(tokenizer: &'tok Tokenizer, hint: usize) -> Self { + Self { + tokens: Vec::with_capacity(hint), + indices: Some(Vec::with_capacity(hint)), + tokenizer, + scratch_offsets: Vec::new(), + } + } + + pub fn new_token_ids_only(tokenizer: &'tok Tokenizer, hint: usize) -> Self { + Self { + tokens: Vec::with_capacity(hint), + indices: None, + tokenizer, + scratch_offsets: Vec::new(), + } + } + + #[inline] + pub fn tokenizer(&self) -> &Tokenizer { + self.tokenizer + } + + /// Append a single special token id to the buffer. 
+ #[inline] + pub fn special(&mut self, token_id: u32, msg_idx: i32) { + self.tokens.push(token_id); + if let Some(indices) = &mut self.indices { + indices.push(msg_idx); + } + } + + /// Append a span of token ids to the buffer, all attributed to the + /// same message index. + #[inline] + pub fn ids(&mut self, token_ids: &[u32], msg_idx: i32) { + self.tokens.extend_from_slice(token_ids); + // `resize` with a Copy fill is the cheapest way to extend the + // indices vector by N elements of the same value. + if let Some(indices) = &mut self.indices { + let new_len = indices.len() + token_ids.len(); + indices.resize(new_len, msg_idx); + } + } + + /// Encode `text` and append the resulting tokens, attributing all of + /// them to `msg_idx`. Empty strings are a no-op (saves a tokenizer + /// call on the common "no content here" path). + #[inline] + pub fn text(&mut self, text: &str, msg_idx: i32) -> Result<(), RenderError> { + if text.is_empty() { + return Ok(()); + } + let encoded = self.tokenizer.encode_no_special(text)?; + self.ids(encoded.as_slice(), msg_idx); + Ok(()) + } + + /// Append a scaffold token (one whose attribution is "structural, + /// not from any message" — uses [`SCAFFOLD_IDX`]). + #[inline] + pub fn scaffold_special(&mut self, token_id: u32) { + self.special(token_id, SCAFFOLD_IDX); + } + + /// Encode `text` and append as scaffolding (attribution [`SCAFFOLD_IDX`]). + #[inline] + pub fn scaffold_text(&mut self, text: &str) -> Result<(), RenderError> { + self.text(text, SCAFFOLD_IDX) + } + + /// Consume the buffer and return a [`RenderedTokens`]. + pub fn into_rendered(self) -> RenderedTokens { + let indices = self.indices.unwrap_or_default(); + debug_assert_eq!(self.tokens.len(), indices.len()); + let _ = self.scratch_offsets; // keep the field but ignore + RenderedTokens { + token_ids: self.tokens, + message_indices: indices, + multi_modal_data: None, + } + } + + /// Take the token ids only, dropping per-token attribution. 
Used by + /// `render_ids` callers that don't need the indices array. + pub fn into_token_ids(self) -> Vec { + self.tokens + } + + #[inline] + pub fn len(&self) -> usize { + self.tokens.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.tokens.is_empty() + } +} + +impl TokenSink for RenderBuf<'_> { + #[inline] + fn special(&mut self, token_id: u32, msg_idx: i32) { + RenderBuf::special(self, token_id, msg_idx); + } + + #[inline] + fn ids(&mut self, token_ids: &[u32], msg_idx: i32) { + RenderBuf::ids(self, token_ids, msg_idx); + } + + #[inline] + fn text(&mut self, text: &str, msg_idx: i32) -> Result<(), RenderError> { + RenderBuf::text(self, text, msg_idx) + } +} + +#[derive(Debug)] +enum TokenPlanOp { + Ids(Vec), + Special(u32), + Text(String), +} + +#[derive(Debug)] +pub struct TokenPlanBuf<'tok> { + ops: Vec, + tokenizer: &'tok Tokenizer, + cap_hint: usize, + text_count: usize, +} + +impl<'tok> TokenPlanBuf<'tok> { + pub fn new(tokenizer: &'tok Tokenizer, hint: usize) -> Self { + Self { + ops: Vec::with_capacity(hint.min(256)), + tokenizer, + cap_hint: hint, + text_count: 0, + } + } + + pub fn into_token_ids(self) -> Result, RenderError> { + let encoded_texts = if self.text_count == 0 { + Vec::new() + } else { + let texts: Vec<&str> = self + .ops + .iter() + .filter_map(|op| match op { + TokenPlanOp::Text(text) => Some(text.as_str()), + _ => None, + }) + .collect(); + self.tokenizer.encode_batch_no_special(texts)? 
+ }; + + let mut text_idx = 0; + let mut tokens = Vec::with_capacity(self.cap_hint); + for op in self.ops { + match op { + TokenPlanOp::Ids(ids) => tokens.extend_from_slice(&ids), + TokenPlanOp::Special(id) => tokens.push(id), + TokenPlanOp::Text(_) => { + tokens.extend_from_slice(encoded_texts[text_idx].as_slice()); + text_idx += 1; + } + } + } + Ok(tokens) + } +} + +impl TokenSink for TokenPlanBuf<'_> { + #[inline] + fn special(&mut self, token_id: u32, _msg_idx: i32) { + self.ops.push(TokenPlanOp::Special(token_id)); + } + + #[inline] + fn ids(&mut self, token_ids: &[u32], _msg_idx: i32) { + if !token_ids.is_empty() { + self.ops.push(TokenPlanOp::Ids(token_ids.to_vec())); + } + } + + #[inline] + fn text(&mut self, text: &str, _msg_idx: i32) -> Result<(), RenderError> { + if !text.is_empty() { + self.ops.push(TokenPlanOp::Text(text.to_string())); + self.text_count += 1; + } + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/deepseek_v3.rs b/crates/renderers-core/src/families/deepseek_v3.rs new file mode 100644 index 0000000..260257a --- /dev/null +++ b/crates/renderers-core/src/families/deepseek_v3.rs @@ -0,0 +1,448 @@ +//! `DeepSeek` V3 renderer. Port of `renderers/deepseek_v3.py`. +//! +//! Key differences from the Qwen-family renderers: +//! +//! - Special tokens use **fullwidth Unicode** delimiters (`|` = U+FF5C, +//! `▁` = U+2581). Token names are e.g. `<|begin▁of▁sentence|>`. +//! - **Implicit role markers** — `<|User|>` and `<|Assistant|>` carry the +//! role themselves; there's no role-name text after the marker the way +//! Qwen has `<|im_start|>user\n`. +//! - **All leading system messages are concatenated** with `\n\n` and +//! emitted as plain text *before* the first non-system role token (no +//! marker for the system block). +//! - Thinking is plain text `...` tags, not special tokens. +//! - Tool calls live in `<|tool▁calls▁begin|>...<|tool▁calls▁end|>` with +//! each call as `<|tool▁call▁begin|>function<|tool▁sep|>name\n +//! 
` ```json\n{args}\n``` `<|tool▁call▁end|>`. + +use serde_json::Value as JsonValue; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; +use crate::parsing::deepseek_v3::parse_deepseek_v3; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{Message, ParsedResponse, RenderError, RenderedTokens, ToolArguments, ToolSpec}; + +const SEP: char = '\u{FF5C}'; // | +const US: char = '\u{2581}'; // ▁ + +fn ds_token(name: &str) -> String { + let mut s = String::with_capacity(name.len() + 4); + s.push('<'); + s.push(SEP); + s.push_str(name); + s.push(SEP); + s.push('>'); + s +} + +#[derive(Debug, Clone)] +pub struct DeepSeekV3RendererBuilder { + enable_thinking: bool, +} + +impl Default for DeepSeekV3RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + } + } +} + +impl DeepSeekV3RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + DeepSeekV3Renderer::new_with(tokenizer, &self) + } +} + +#[derive(Debug, Clone)] +pub struct DeepSeekV3Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + + bos: u32, + eos: u32, + user_token: u32, + assistant_token: u32, + tool_calls_begin: u32, + tool_calls_end: u32, + tool_call_begin: u32, + tool_call_end: u32, + tool_sep: u32, + tool_outputs_begin: u32, + tool_outputs_end: u32, + tool_output_begin: u32, + tool_output_end: u32, + + stop_tokens: Vec, +} + +impl DeepSeekV3Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + DeepSeekV3RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> DeepSeekV3RendererBuilder { + DeepSeekV3RendererBuilder::default() + } + + /// Encode a `DeepSeek` special token via the tokenizer's encode path and + /// assert it maps to exactly one id. 
Matches the Python + /// `_get_special_token` helper — required because the tokenizer + /// doesn't expose these by `token_to_id` directly (the fullwidth + /// characters are part of the BPE vocab as a single piece). + fn resolve(tokenizer: &Tokenizer, name: &str) -> Result { + let token_str = ds_token(name); + let encoded = tokenizer.encode_no_special(&token_str)?; + let ids = encoded.as_slice(); + if ids.len() != 1 { + return Err(RenderError::MissingSpecialToken(token_str)); + } + Ok(ids[0]) + } + + // Paired begin/end token ids share semantic prefixes (tool_call, + // tool_calls, tool_output, tool_outputs); the similarity is the + // structural relationship, so renaming would lose information. + #[allow(clippy::similar_names)] + fn new_with( + tokenizer: Tokenizer, + cfg: &DeepSeekV3RendererBuilder, + ) -> Result { + let bos = Self::resolve(&tokenizer, &format!("begin{US}of{US}sentence"))?; + let eos = Self::resolve(&tokenizer, &format!("end{US}of{US}sentence"))?; + let user_token = Self::resolve(&tokenizer, "User")?; + let assistant_token = Self::resolve(&tokenizer, "Assistant")?; + let tool_calls_begin = Self::resolve(&tokenizer, &format!("tool{US}calls{US}begin"))?; + let tool_calls_end = Self::resolve(&tokenizer, &format!("tool{US}calls{US}end"))?; + let tool_call_begin = Self::resolve(&tokenizer, &format!("tool{US}call{US}begin"))?; + let tool_call_end = Self::resolve(&tokenizer, &format!("tool{US}call{US}end"))?; + let tool_sep = Self::resolve(&tokenizer, &format!("tool{US}sep"))?; + let tool_outputs_begin = Self::resolve(&tokenizer, &format!("tool{US}outputs{US}begin"))?; + let tool_outputs_end = Self::resolve(&tokenizer, &format!("tool{US}outputs{US}end"))?; + let tool_output_begin = Self::resolve(&tokenizer, &format!("tool{US}output{US}begin"))?; + let tool_output_end = Self::resolve(&tokenizer, &format!("tool{US}output{US}end"))?; + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + bos, + eos, + user_token, + assistant_token, + 
tool_calls_begin, + tool_calls_end, + tool_call_begin, + tool_call_end, + tool_sep, + tool_outputs_begin, + tool_outputs_end, + tool_output_begin, + tool_output_end, + stop_tokens: vec![eos], + }) + } + + fn args_to_json_string(args: &ToolArguments) -> String { + match args { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => python_json_dumps(v), + } + } + + fn estimate_capacity(messages: &[Message]) -> usize { + messages.len().max(1) * 256 + 64 + } + + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + + fn render_into_buf( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + add_generation_prompt: bool, + ) -> Result<(), RenderError> { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + + buf.scaffold_special(self.bos); + + let mut first_non_sys = 0usize; + let mut sys_parts: Vec<&str> = Vec::new(); + for msg in messages { + if msg.role != "system" { + break; + } + sys_parts.push(msg.text_content()); + first_non_sys += 1; + } + if !sys_parts.is_empty() { + let joined = sys_parts.join("\n\n"); + buf.text(&joined, 0)?; + } + + for (i, msg) in messages.iter().enumerate().skip(first_non_sys) { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "system" | "user" => { + buf.special(self.user_token, idx); + buf.text(content, idx)?; + } + "assistant" => self.emit_assistant(buf, msg, i, messages)?, + "tool" => self.emit_tool(buf, messages, i)?, + _ => {} + } + } + + if add_generation_prompt { + let last_role = messages.last().map_or("", |m| m.role.as_str()); + if last_role != "tool" { + buf.scaffold_special(self.assistant_token); + } + if self.enable_thinking { + buf.scaffold_text("\n")?; + } + } + + Ok(()) + } +} + +fn python_json_dumps(value: &JsonValue) -> String { + match value { + JsonValue::Null => "null".to_string(), + JsonValue::Bool(v) => v.to_string(), + 
JsonValue::Number(v) => v.to_string(), + JsonValue::String(v) => serde_json::to_string(v).unwrap_or_else(|_| "\"\"".to_string()), + JsonValue::Array(items) => { + let mut out = String::from("["); + for (i, item) in items.iter().enumerate() { + if i > 0 { + out.push_str(", "); + } + out.push_str(&python_json_dumps(item)); + } + out.push(']'); + out + } + JsonValue::Object(map) => { + let mut out = String::from("{"); + for (i, (key, item)) in map.iter().enumerate() { + if i > 0 { + out.push_str(", "); + } + out.push_str(&serde_json::to_string(key).unwrap_or_else(|_| "\"\"".to_string())); + out.push_str(": "); + out.push_str(&python_json_dumps(item)); + } + out.push('}'); + out + } + } +} + +impl Renderer for DeepSeekV3Renderer { + fn render( + &self, + messages: &[Message], + _tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages)); + self.render_into_buf(&mut buf, messages, add_generation_prompt)?; + Ok(buf.into_rendered()) + } + + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_deepseek_v3( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_calls_begin, + self.tool_calls_end, + self.tool_call_begin, + self.tool_call_end, + self.tool_sep, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + 
previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.eos), + ) else { + return Ok(None); + }; + + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, Self::estimate_capacity(new_messages)); + + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "user" | "system" => { + buf.special(self.user_token, idx); + buf.text(content, idx)?; + } + "tool" => { + let prev_is_tool = i > 0 && new_messages[i - 1].role == "tool"; + let next_is_tool = + i + 1 < new_messages.len() && new_messages[i + 1].role == "tool"; + if !prev_is_tool { + buf.special(self.tool_outputs_begin, idx); + } + buf.special(self.tool_output_begin, idx); + buf.text(content, idx)?; + buf.special(self.tool_output_end, idx); + if !next_is_tool { + buf.special(self.tool_outputs_end, idx); + } + } + _ => return Ok(None), + } + } + + let last_role = new_messages.last().map_or("", |m| m.role.as_str()); + if last_role != "tool" { + buf.scaffold_special(self.assistant_token); + } + if self.enable_thinking { + buf.scaffold_text("\n")?; + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl DeepSeekV3Renderer { + fn emit_assistant( + &self, + buf: &mut impl TokenSink, + msg: &Message, + msg_idx: usize, + messages: &[Message], + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 
1].role == "tool"; + let idx = msg_idx as i32; + + // Build the content text, with reasoning_content wrapped in if present + let mut content = msg.text_content().to_string(); + if let Some(reasoning) = msg.reasoning_content.as_deref() { + if !reasoning.is_empty() { + let mut wrapped = String::with_capacity(reasoning.len() + content.len() + 16); + wrapped.push_str(""); + wrapped.push_str(reasoning); + wrapped.push_str(""); + wrapped.push_str(&content); + content = wrapped; + } + } + + if !prev_is_tool { + buf.special(self.assistant_token, idx); + } + + // Pre-tool-call content + buf.text(&content, idx)?; + + if !msg.tool_calls.is_empty() { + buf.special(self.tool_calls_begin, idx); + for tc in &msg.tool_calls { + let name = tc.function.name.as_str(); + let args_str = Self::args_to_json_string(&tc.function.arguments); + + buf.special(self.tool_call_begin, idx); + buf.text("function", idx)?; + buf.special(self.tool_sep, idx); + let mut payload = String::with_capacity(name.len() + args_str.len() + 16); + payload.push_str(name); + payload.push_str("\n```json\n"); + payload.push_str(&args_str); + payload.push_str("\n```"); + buf.text(&payload, idx)?; + buf.special(self.tool_call_end, idx); + } + buf.special(self.tool_calls_end, idx); + } + + buf.special(self.eos, idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + msg_idx: usize, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let idx = msg_idx as i32; + let content = messages[msg_idx].text_content(); + + if !prev_is_tool { + buf.special(self.tool_outputs_begin, idx); + } + buf.special(self.tool_output_begin, idx); + buf.text(content, idx)?; + buf.special(self.tool_output_end, idx); + if !next_is_tool { + buf.special(self.tool_outputs_end, idx); + } + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/default.rs 
b/crates/renderers-core/src/families/default.rs new file mode 100644 index 0000000..44c8e11 --- /dev/null +++ b/crates/renderers-core/src/families/default.rs @@ -0,0 +1,364 @@ +//! `DefaultRenderer` — Jinja-template fallback for models without a +//! hand-coded family. +//! +//! Port of `renderers/default.py`. Two key differences from the Python +//! implementation: +//! +//! - Renders the template with [`minijinja`] (vs HF's Python Jinja). The +//! `chat_template` string is loaded from the model's +//! `tokenizer_config.json` and rendered against a context built from +//! the messages + tools. minijinja covers the Jinja2 subset HF +//! templates actually use (`for`, `if`, `set`, filters like `tojson`, +//! `length`, `trim`); anything more exotic will return a render error +//! instead of silently miscompiling. +//! - Per-token attribution is incremental: render the conversation +//! prefix-by-prefix and attribute the delta to each message index. +//! Same algorithm as the Python class, but driven by minijinja +//! instead of HF's `apply_chat_template`. +//! +//! `parse_response` is intentionally basic: strip stop tokens, decode, +//! split on `` if present. Models with structured tool calls +//! need a hand-coded family — `DefaultRenderer` doesn't try to guess. +//! +//! `bridge_to_next_turn` returns `None` unconditionally: without +//! template-specific knowledge of the turn-close token, the bridge +//! contract can't be proven, so the caller falls back to a full +//! re-render. + +use std::sync::Arc; + +use minijinja::Environment; +use minijinja::value::Value as MjValue; +use serde_json::Value as JsonValue; + +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, +}; + +/// Builder for [`DefaultRenderer`]. 
+pub struct DefaultRendererBuilder { + chat_template: String, + stop_token_ids: Vec, + extra_context: Vec<(String, JsonValue)>, +} + +impl DefaultRendererBuilder { + pub fn new(chat_template: impl Into) -> Self { + Self { + chat_template: chat_template.into(), + stop_token_ids: Vec::new(), + extra_context: Vec::new(), + } + } + /// Stop tokens — typically `[eos_token_id]`. The caller decides; the + /// renderer doesn't probe the tokenizer for `eos_token` since the + /// canonical id varies per model. + pub fn stop_token_ids(mut self, ids: Vec) -> Self { + self.stop_token_ids = ids; + self + } + /// Add a `key=value` context variable for the Jinja template. + /// Common entries: `bos_token`, `eos_token`, `add_generation_prompt`. + pub fn add_context(mut self, key: impl Into, value: JsonValue) -> Self { + self.extra_context.push((key.into(), value)); + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + DefaultRenderer::new_with(tokenizer, self) + } +} + +impl std::fmt::Debug for DefaultRendererBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DefaultRendererBuilder") + .field("chat_template_len", &self.chat_template.len()) + .field("stop_token_ids", &self.stop_token_ids) + .field("extra_context_keys", &self.extra_context.len()) + .finish() + } +} + +pub struct DefaultRenderer { + tokenizer: Tokenizer, + env: Arc>, + extra_context: Vec<(String, JsonValue)>, + stop_token_ids: Vec, +} + +impl std::fmt::Debug for DefaultRenderer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DefaultRenderer") + .field("stop_token_ids", &self.stop_token_ids) + .field("extra_context_keys", &self.extra_context.len()) + .finish_non_exhaustive() + } +} + +impl Clone for DefaultRenderer { + fn clone(&self) -> Self { + Self { + tokenizer: self.tokenizer.clone(), + env: self.env.clone(), + extra_context: self.extra_context.clone(), + stop_token_ids: self.stop_token_ids.clone(), + } + } +} 
+ +impl DefaultRenderer { + fn new_with(tokenizer: Tokenizer, cfg: DefaultRendererBuilder) -> Result { + let mut env = Environment::new(); + // HF chat templates use whitespace-stripped markers freely + // (e.g. `{%- if foo -%}`); minijinja respects that via the + // `lstrip_blocks` / `trim_blocks` knobs below. + env.set_lstrip_blocks(true); + env.set_trim_blocks(true); + env.add_template_owned("chat", cfg.chat_template) + .map_err(|e| RenderError::Invalid(format!("chat_template parse: {e}")))?; + Ok(Self { + tokenizer, + env: Arc::new(env), + extra_context: cfg.extra_context, + stop_token_ids: cfg.stop_token_ids, + }) + } + + pub fn builder(chat_template: impl Into) -> DefaultRendererBuilder { + DefaultRendererBuilder::new(chat_template) + } + + /// Render the template up to `messages[..end]` (exclusive). When + /// `add_generation_prompt` is true the template's gen-prompt branch + /// fires. + fn render_jinja( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + // Build a single flat context map up front. minijinja's + // `context!` macro and `Value::from_object` produce equivalent + // results, but a single dict keeps the per-render allocation + // count constant regardless of how many extra context keys the + // caller passes (vs the wrapped-Object chain previously used). 
+ let mut ctx_map = serde_json::Map::new(); + ctx_map.insert( + "messages".into(), + serde_json::to_value(messages_to_value(messages)?).unwrap_or(JsonValue::Null), + ); + let tools_value: MjValue = match tools { + Some(t) => tools_to_value(t), + None => MjValue::from(Vec::::new()), + }; + ctx_map.insert( + "tools".into(), + serde_json::to_value(tools_value).unwrap_or(JsonValue::Null), + ); + ctx_map.insert( + "add_generation_prompt".into(), + JsonValue::Bool(add_generation_prompt), + ); + for (k, v) in &self.extra_context { + ctx_map.insert(k.clone(), v.clone()); + } + let ctx = MjValue::from_serialize(JsonValue::Object(ctx_map)); + + let tmpl = self + .env + .get_template("chat") + .map_err(|e| RenderError::Invalid(format!("chat_template lookup: {e}")))?; + tmpl.render(ctx) + .map_err(|e| RenderError::Invalid(format!("chat_template render: {e}"))) + } + + fn encode_full(&self, text: &str) -> Result, RenderError> { + Ok(self.tokenizer.encode_no_special(text)?.as_slice().to_vec()) + } +} + +impl Renderer for DefaultRenderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + // Incremental render: tokenise prefix-by-prefix, attribute the + // delta to each message index. Same approach as the Python class. + let mut token_ids: Vec = Vec::new(); + let mut message_indices: Vec = Vec::new(); + let mut prev_len = 0usize; + + for (i, _) in messages.iter().enumerate() { + let text = self.render_jinja(&messages[..=i], tools, false)?; + let ids = self.encode_full(&text)?; + if ids.len() < prev_len { + // Template didn't extend prefix-monotonically — fall back to + // a single full render attributed entirely to scaffolding. 
+ let all = self.encode_full(&self.render_jinja( + messages, + tools, + add_generation_prompt, + )?)?; + return Ok(RenderedTokens { + token_ids: all.clone(), + message_indices: vec![SCAFFOLD_IDX; all.len()], + multi_modal_data: None, + }); + } + let new_count = ids.len() - prev_len; + message_indices.extend(std::iter::repeat_n(i as i32, new_count)); + token_ids = ids; + prev_len = token_ids.len(); + } + + if add_generation_prompt { + let full = self.render_jinja(messages, tools, true)?; + let full_ids = self.encode_full(&full)?; + if full_ids.len() >= prev_len { + let gen_count = full_ids.len() - prev_len; + message_indices.extend(std::iter::repeat_n(SCAFFOLD_IDX, gen_count)); + token_ids = full_ids; + } else { + token_ids = full_ids; + message_indices.truncate(token_ids.len()); + } + } + + Ok(RenderedTokens { + token_ids, + message_indices, + multi_modal_data: None, + }) + } + + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + // Fast path: one full render instead of N prefix renders. Used by + // callers that don't need per-token attribution. + let text = self.render_jinja(messages, tools, add_generation_prompt)?; + self.encode_full(&text) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + // Truncate at the first stop token. + let end = token_ids + .iter() + .position(|t| self.stop_token_ids.contains(t)) + .unwrap_or(token_ids.len()); + let text = self.tokenizer.decode(&token_ids[..end]).unwrap_or_default(); + + // Split out a `...` block if present. Same logic + // as the Python fallback. 
+ let (reasoning_content, content) = match text.split_once("") { + Some((before, after)) => { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner.to_string() + } else { + before.to_string() + }; + (Some(r).filter(|s| !s.is_empty()), after.to_string()) + } + None => (None, text.clone()), + }; + + ParsedResponse { + content, + reasoning_content, + tool_calls: Vec::new(), + } + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_token_ids + } + + fn bridge_to_next_turn( + &self, + _previous_prompt_ids: &[u32], + _previous_completion_ids: &[u32], + _new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + // Same contract as the Python DefaultRenderer: without family + // knowledge of the turn-close token, the bridge can't be proven. + Ok(None) + } +} + +// ── Jinja context conversion ────────────────────────────────────────── + +fn messages_to_value(messages: &[Message]) -> Result { + let mut out: Vec = Vec::with_capacity(messages.len()); + for m in messages { + let mut map = serde_json::Map::new(); + map.insert("role".into(), JsonValue::String(m.role.clone())); + // Content: string fast-path, structured parts pass through as JSON + let content_value = match &m.content { + crate::types::Content::Null => JsonValue::Null, + crate::types::Content::Text(s) => JsonValue::String(s.clone()), + crate::types::Content::Parts(parts) => serde_json::to_value(parts) + .map_err(|e| RenderError::Invalid(format!("content serialisation: {e}")))?, + }; + map.insert("content".into(), content_value); + if let Some(name) = &m.name { + map.insert("name".into(), JsonValue::String(name.clone())); + } + if let Some(tcid) = &m.tool_call_id { + map.insert("tool_call_id".into(), JsonValue::String(tcid.clone())); + } + if let Some(r) = &m.reasoning_content { + map.insert("reasoning_content".into(), JsonValue::String(r.clone())); + } + if !m.tool_calls.is_empty() { + let tcs: Vec = m + .tool_calls + .iter() + .map(|tc| { + let args = match 
&tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::String(s.clone())) + } + }; + serde_json::json!({ + "type": tc.kind, + "id": tc.id, + "function": { + "name": tc.function.name, + "arguments": args, + }, + }) + }) + .collect(); + map.insert("tool_calls".into(), JsonValue::Array(tcs)); + } + out.push(MjValue::from_serialize(JsonValue::Object(map))); + } + Ok(MjValue::from(out)) +} + +fn tools_to_value(tools: &[ToolSpec]) -> MjValue { + let mut out: Vec = Vec::with_capacity(tools.len()); + for t in tools { + let v = serde_json::json!({ + "type": "function", + "function": { + "name": t.name, + "description": t.description, + "parameters": t.parameters, + }, + }); + out.push(MjValue::from_serialize(v)); + } + MjValue::from(out) +} diff --git a/crates/renderers-core/src/families/glm.rs b/crates/renderers-core/src/families/glm.rs new file mode 100644 index 0000000..d1190c5 --- /dev/null +++ b/crates/renderers-core/src/families/glm.rs @@ -0,0 +1,743 @@ +//! GLM family renderers — covers GLM-5, GLM-5.1, and GLM-4.5 Air. +//! +//! Port of `renderers/glm5.py` (+ `GLM51Renderer`) and `renderers/glm45.py`. +//! +//! Shared template shape: +//! +//! - Prefix: `[gMASK]` before all content +//! - Role markers: `<|system|>`, `<|user|>`, `<|assistant|>`, +//! `<|observation|>`. No role-name text follows the marker. +//! - **No close token** — turns end when the next role marker appears. +//! `bridge_to_next_turn` exploits this: the prior turn's tail +//! contains one of `{<|endoftext|>, <|user|>, <|observation|>}` +//! (the stop ids), so the bridge synthesises `<|endoftext|>` only on +//! truncation. +//! - Tool calls: `namekv...` +//! +//! Variants in this module: +//! +//! | Flag | GLM-5 | GLM-5.1 | GLM-4.5 | +//! | ----------------------------- | ----- | ------- | ------- | +//! | newlines after role markers | no | no | yes | +//! | newlines inside tool-call | no | no | yes | +//! 
| `/nothink` user suffix | no | no | yes | +//! | empty `` wrap | no | yes | no | +//! | unwrap `OpenAI` tool envelope | no | yes | no | +//! +//! The flags are surfaced on the builder; the three variants pick +//! their own combination at construction time. + +use serde_json::Value as JsonValue; + +use crate::bridge::reject_assistant_in_extension; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; +use crate::json::{to_string_python, tool_spec_inner_value, tool_spec_template_value}; +use crate::parsing::glm::parse_glm; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, +}; + +const TOOLS_HEADER_GLM5: &str = "\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n"; +const TOOLS_FOOTER_GLM5: &str = "\n\nFor each function call, output the function name and arguments within the following XML format:\n{function-name}{arg-key-1}{arg-value-1}{arg-key-2}{arg-value-2}..."; + +const TOOLS_FOOTER_GLM45: &str = "\n\nFor each function call, output the function name and arguments within the following XML format:\n{function-name}\n{arg-key-1}\n{arg-value-1}\n{arg-key-2}\n{arg-value-2}\n...\n"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Variant { + Glm5, + Glm51, + Glm45, +} + +#[derive(Debug, Clone)] +pub struct GlmRendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + variant: Variant, +} + +impl GlmRendererBuilder { + pub fn glm5() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + variant: Variant::Glm5, + } + } + pub fn glm51() -> Self { + Self { + variant: Variant::Glm51, + ..Self::glm5() + } + } + pub fn glm45() 
-> Self { + Self { + variant: Variant::Glm45, + ..Self::glm5() + } + } + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + GlmRenderer::new_with(tokenizer, &self) + } +} + +#[derive(Debug, Clone)] +pub struct GlmRenderer { + tokenizer: Tokenizer, + variant: Variant, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + gmask: u32, + sop: u32, + system: u32, + user: u32, + assistant: u32, + observation: u32, + endoftext: u32, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + arg_key: u32, + arg_key_end: u32, + arg_value: u32, + arg_value_end: u32, + // GLM-5 also exposes tokens; GLM-4.5 emits them as text. 
+ tool_response: Option, + tool_response_end: Option, + + newline_tokens: Vec, + tool_text_cache: ToolTextCache, + stop_tokens: Vec, +} + +impl GlmRenderer { + pub fn glm5(tokenizer: Tokenizer) -> Result { + GlmRendererBuilder::glm5().build(tokenizer) + } + pub fn glm51(tokenizer: Tokenizer) -> Result { + GlmRendererBuilder::glm51().build(tokenizer) + } + pub fn glm45(tokenizer: Tokenizer) -> Result { + GlmRendererBuilder::glm45().build(tokenizer) + } + + fn new_with(tokenizer: Tokenizer, cfg: &GlmRendererBuilder) -> Result { + let gmask = tokenizer.token_to_id_strict("[gMASK]")?; + let sop = tokenizer.token_to_id_strict("")?; + let system = tokenizer.token_to_id_strict("<|system|>")?; + let user = tokenizer.token_to_id_strict("<|user|>")?; + let assistant = tokenizer.token_to_id_strict("<|assistant|>")?; + let observation = tokenizer.token_to_id_strict("<|observation|>")?; + let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; + let think = tokenizer.token_to_id_strict("")?; + let think_end = tokenizer.token_to_id_strict("")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let arg_key = tokenizer.token_to_id_strict("")?; + let arg_key_end = tokenizer.token_to_id_strict("")?; + let arg_value = tokenizer.token_to_id_strict("")?; + let arg_value_end = tokenizer.token_to_id_strict("")?; + + // GLM-5 uses special tokens; GLM-4.5 emits them + // as plain text. Resolve optionally so the same struct serves + // both variants. 
+ let (tool_response, tool_response_end) = if cfg.variant == Variant::Glm45 { + (None, None) + } else { + ( + Some(tokenizer.token_to_id_strict("")?), + Some(tokenizer.token_to_id_strict("")?), + ) + }; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + + Ok(Self { + tokenizer, + variant: cfg.variant, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + gmask, + sop, + system, + user, + assistant, + observation, + endoftext, + think, + think_end, + tool_call, + tool_call_end, + arg_key, + arg_key_end, + arg_value, + arg_value_end, + tool_response, + tool_response_end, + newline_tokens, + tool_text_cache: ToolTextCache::default(), + stop_tokens: vec![endoftext, user, observation], + }) + } + + fn nl_after_role(&self) -> &'static str { + if self.variant == Variant::Glm45 { + "\n" + } else { + "" + } + } + + fn empty_think_on_last_assistant(&self) -> bool { + self.variant == Variant::Glm51 + } + + fn last_user_index(messages: &[Message]) -> i32 { + for (i, m) in messages.iter().enumerate().rev() { + if m.role == "user" { + return i as i32; + } + } + -1 + } + + fn format_tool_spec_for_variant( + variant: Variant, + tool: &ToolSpec, + ) -> Result { + let spec = if variant == Variant::Glm51 { + tool_spec_inner_value(tool) + } else { + tool_spec_template_value(tool) + }; + to_string_python(&spec) + .map_err(|e| RenderError::Invalid(format!("tool spec serialisation: {e}"))) + } + + fn render_arg_value(arg_value: &JsonValue) -> String { + match arg_value { + JsonValue::String(s) => s.clone(), + _ => serde_json::to_string(arg_value).unwrap_or_default(), + } + } + + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + messages.len().max(1) * 256 + tools.map_or(0, |t| t.len() * 256 + 256) + } + + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + 
messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + + fn render_into_buf( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result<(), RenderError> { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let nl = self.nl_after_role(); + + // Prefix + buf.scaffold_special(self.gmask); + buf.scaffold_special(self.sop); + + // Tools system block + if let Some(t) = tools { + if !t.is_empty() { + buf.scaffold_special(self.system); + let variant = self.variant; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + t, + match variant { + Variant::Glm45 => 45, + Variant::Glm5 => 50, + Variant::Glm51 => 51, + }, + "", + || { + let mut s = String::with_capacity(512); + s.push_str(TOOLS_HEADER_GLM5); + for tool in t { + s.push_str(&Self::format_tool_spec_for_variant(variant, tool)?); + s.push('\n'); + } + s.push_str(if variant == Variant::Glm45 { + TOOLS_FOOTER_GLM45 + } else { + TOOLS_FOOTER_GLM5 + }); + Ok(s) + }, + )?; + buf.ids(tool_tokens.as_slice(), SCAFFOLD_IDX); + } + } + + let last_ui = Self::last_user_index(messages); + + for (i, msg) in messages.iter().enumerate() { + let content = msg.visible_text_content(); + let idx = i as i32; + match msg.role.as_str() { + "system" => { + buf.special(self.system, idx); + let mut s = String::with_capacity(content.len() + 2); + s.push_str(nl); + s.push_str(content); + buf.text(&s, idx)?; + } + "user" => { + buf.special(self.user, idx); + let mut s = String::with_capacity(content.len() + 12); + s.push_str(nl); + s.push_str(content); + if self.variant == Variant::Glm45 + && !self.enable_thinking + && !content.ends_with("/nothink") + { + s.push_str("/nothink"); + } + buf.text(&s, idx)?; + } + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(buf, 
msg, idx, last_ui, preserve_thinking)?; + } + "tool" => self.emit_tool(buf, messages, i, content, idx)?, + _ => {} // mirror Python: silent skip + } + } + + if add_generation_prompt { + buf.scaffold_special(self.assistant); + if self.variant == Variant::Glm45 { + if !self.enable_thinking { + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.think); + buf.scaffold_special(self.think_end); + } + // GLM-4.5 enable_thinking=True: just <|assistant|>, nothing else + } else if self.enable_thinking { + buf.scaffold_special(self.think); + } else { + buf.scaffold_special(self.think_end); + } + } + + Ok(()) + } +} + +impl Renderer for GlmRenderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_rendered()) + } + + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_glm( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + self.arg_key, + self.arg_key_end, + self.arg_value, + self.arg_value_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: 
&[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + // GLM has no per-turn close token. Build the combined prefix and + // synthesise <|endoftext|> when the model's completion ran past + // max_tokens (no stop-id at the tail). + let mut combined: Vec = + Vec::with_capacity(previous_prompt_ids.len() + previous_completion_ids.len() + 1); + combined.extend_from_slice(previous_prompt_ids); + combined.extend_from_slice(previous_completion_ids); + + let need_synth = match combined.last() { + None => true, + Some(&t) if !self.stop_tokens.contains(&t) => true, + _ => previous_completion_ids.is_empty(), + }; + if need_synth { + combined.push(self.endoftext); + } + let last_prev = *combined.last().expect("non-empty"); + + let nl = self.nl_after_role(); + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); + + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.visible_text_content(); + match msg.role.as_str() { + "user" => { + if !(i == 0 && last_prev == self.user) { + buf.special(self.user, idx); + } + let mut s = String::with_capacity(content.len() + 12); + s.push_str(nl); + s.push_str(content); + if self.variant == Variant::Glm45 + && !self.enable_thinking + && !content.ends_with("/nothink") + { + s.push_str("/nothink"); + } + buf.text(&s, idx)?; + } + "system" => { + buf.special(self.system, idx); + let mut s = String::with_capacity(content.len() + 2); + s.push_str(nl); + s.push_str(content); + buf.text(&s, idx)?; + } + "tool" => { + let prev_is_tool = i > 0 && new_messages[i - 1].role == "tool"; + if i == 0 && last_prev == self.observation { + // model already emitted the marker; don't repeat + } else if !prev_is_tool { + buf.special(self.observation, idx); + } + 
self.emit_tool_response(&mut buf, content, idx)?; + } + _ => return Ok(None), + } + } + + // Generation prompt + buf.scaffold_special(self.assistant); + if self.variant == Variant::Glm45 { + if !self.enable_thinking { + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.think); + buf.scaffold_special(self.think_end); + } + } else if self.enable_thinking { + buf.scaffold_special(self.think); + } else { + buf.scaffold_special(self.think_end); + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(combined.len() + ext.len()); + out.extend_from_slice(&combined); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl GlmRenderer { + fn emit_assistant( + &self, + buf: &mut impl TokenSink, + msg: &Message, + msg_idx: i32, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let raw_content = msg.visible_text_content(); + let (reasoning_content, content) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + } else { + before + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + }; + (r, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + let reasoning_content = reasoning_content.trim().to_string(); + let content = content.trim().to_string(); + + buf.special(self.assistant, msg_idx); + + if self.variant == Variant::Glm45 { + self.emit_assistant_glm45( + buf, + msg, + msg_idx, + &reasoning_content, + &content, + last_user_index, + preserve_thinking, + ) + } else { + self.emit_assistant_glm5_family( + buf, + msg, + msg_idx, + &reasoning_content, + &content, + last_user_index, + 
preserve_thinking, + ) + } + } + + #[allow(clippy::too_many_arguments)] + fn emit_assistant_glm5_family( + &self, + buf: &mut impl TokenSink, + msg: &Message, + msg_idx: i32, + reasoning_content: &str, + content: &str, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let include_thinking = + (msg_idx > last_user_index || preserve_thinking) && !reasoning_content.is_empty(); + if include_thinking { + buf.special(self.think, msg_idx); + buf.text(reasoning_content.trim(), msg_idx)?; + buf.special(self.think_end, msg_idx); + } else if self.empty_think_on_last_assistant() && msg_idx > last_user_index { + // GLM-5.1: wrap the last assistant with empty + buf.special(self.think, msg_idx); + buf.special(self.think_end, msg_idx); + } else { + buf.special(self.think_end, msg_idx); + } + + if !content.trim().is_empty() { + buf.text(content.trim(), msg_idx)?; + } + + for tc in &msg.tool_calls { + let name = tc.function.name.as_str(); + buf.special(self.tool_call, msg_idx); + buf.text(name, msg_idx)?; + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::Object(serde_json::Map::new())) + } + }; + if let Some(obj) = args_value.as_object() { + for (k, v) in obj { + buf.special(self.arg_key, msg_idx); + buf.text(k, msg_idx)?; + buf.special(self.arg_key_end, msg_idx); + buf.special(self.arg_value, msg_idx); + buf.text(&Self::render_arg_value(v), msg_idx)?; + buf.special(self.arg_value_end, msg_idx); + } + } + buf.special(self.tool_call_end, msg_idx); + } + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + fn emit_assistant_glm45( + &self, + buf: &mut impl TokenSink, + msg: &Message, + msg_idx: i32, + reasoning_content: &str, + content: &str, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + if (msg_idx > last_user_index || preserve_thinking) && !reasoning_content.is_empty() { + 
buf.ids(&self.newline_tokens, msg_idx); + buf.special(self.think, msg_idx); + buf.text(reasoning_content.trim(), msg_idx)?; + buf.special(self.think_end, msg_idx); + } else { + buf.ids(&self.newline_tokens, msg_idx); + buf.special(self.think, msg_idx); + buf.special(self.think_end, msg_idx); + } + + let tool_calls = &msg.tool_calls; + let trimmed = content.trim(); + if !trimmed.is_empty() && !tool_calls.is_empty() { + let mut s = String::with_capacity(trimmed.len() + 2); + s.push('\n'); + s.push_str(trimmed); + s.push('\n'); + buf.text(&s, msg_idx)?; + } else if !trimmed.is_empty() { + let mut s = String::with_capacity(trimmed.len() + 1); + s.push('\n'); + s.push_str(trimmed); + buf.text(&s, msg_idx)?; + } + + for tc in tool_calls { + let name = tc.function.name.as_str(); + if trimmed.is_empty() { + buf.ids(&self.newline_tokens, msg_idx); + } + buf.special(self.tool_call, msg_idx); + let mut head = String::with_capacity(name.len() + 1); + head.push_str(name); + head.push('\n'); + buf.text(&head, msg_idx)?; + + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::Object(serde_json::Map::new())) + } + }; + if let Some(obj) = args_value.as_object() { + for (k, v) in obj { + buf.special(self.arg_key, msg_idx); + buf.text(k, msg_idx)?; + buf.special(self.arg_key_end, msg_idx); + buf.ids(&self.newline_tokens, msg_idx); + buf.special(self.arg_value, msg_idx); + buf.text(&Self::render_arg_value(v), msg_idx)?; + buf.special(self.arg_value_end, msg_idx); + buf.ids(&self.newline_tokens, msg_idx); + } + } + buf.special(self.tool_call_end, msg_idx); + } + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + msg_idx: usize, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + if !prev_is_tool { + buf.special(self.observation, idx); + } + 
self.emit_tool_response(buf, content, idx) + } + + fn emit_tool_response( + &self, + buf: &mut impl TokenSink, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + if self.variant == Variant::Glm45 { + // GLM-4.5 emits the tool_response wrapper as plain text + let mut s = String::with_capacity(content.len() + 32); + s.push_str("\n\n"); + s.push_str(content); + s.push_str("\n"); + buf.text(&s, idx)?; + } else { + // GLM-5 / GLM-5.1 use special tokens + buf.special(self.tool_response.expect("tool_response token"), idx); + buf.text(content, idx)?; + buf.special( + self.tool_response_end.expect("tool_response_end token"), + idx, + ); + } + Ok(()) + } +} + +// Kept for completeness; GLM-5 doesn't ship the `<|endoftext|>` flag the +// way Nemotron does, so the field is always Some. +#[allow(dead_code)] +fn _glm_invariants() { + let _ = SCAFFOLD_IDX; +} diff --git a/crates/renderers-core/src/families/gpt_oss.rs b/crates/renderers-core/src/families/gpt_oss.rs new file mode 100644 index 0000000..917ea29 --- /dev/null +++ b/crates/renderers-core/src/families/gpt_oss.rs @@ -0,0 +1,694 @@ +//! GPT-OSS (Harmony) renderer. +//! +//! Thin adapter over the `openai-harmony` Rust crate. Wire format is +//! harmony (channel-based, no BOS). The Python implementation goes +//! through the same library, so matching its conversion logic guarantees +//! byte-identical tokens. +//! +//! Architecture: +//! +//! - Holds a [`HarmonyEncoding`] (lazily loaded from +//! [`HarmonyEncodingName::HarmonyGptOss`]) and a cache of the +//! special-token ids it exposes. +//! - `render` builds a prefix conversation (`SystemContent` + `DeveloperContent` +//! when a system message or tools are present) via +//! `render_conversation`, then walks the remaining messages and renders +//! each one individually via `render(msg)` so per-token attribution +//! stays per-source-message. +//! - `parse_response` walks the completion tokens with our own scanner +//! 
(token-id based) — matching what `renderers/parsing.py:parse_gpt_oss` +//! does — so we don't need to manage a `StreamableParser`'s lifetime. +//! +//! This renderer does NOT need a `HuggingFace` `tokenizer.json`; the +//! harmony encoding embeds its own tiktoken-based tokenizer. + +use std::sync::Arc; + +use openai_harmony::chat::{ + Author, ChannelConfig, Conversation, DeveloperContent, Message as HarmonyMessage, + ReasoningEffort, Role as HarmonyRole, SystemContent, ToolDescription, +}; +use openai_harmony::{HarmonyEncoding, HarmonyEncodingName, load_harmony_encoding}; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::thinking::should_preserve_past_thinking; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, ParsedToolCall, RenderError, RenderedTokens, SCAFFOLD_IDX, + ToolArguments, ToolCallParseStatus, ToolSpec, +}; + +fn harmony_err(e: E) -> RenderError { + RenderError::Invalid(format!("harmony: {e}")) +} + +/// Builder for [`GptOssRenderer`]. 
+#[derive(Debug, Clone)] +pub struct GptOssRendererBuilder { + use_system_prompt: bool, + reasoning_effort: ReasoningEffort, + conversation_start_date: Option, + knowledge_cutoff: Option, + model_identity: Option, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for GptOssRendererBuilder { + fn default() -> Self { + Self { + use_system_prompt: true, + reasoning_effort: ReasoningEffort::Medium, + conversation_start_date: None, + knowledge_cutoff: None, + model_identity: None, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl GptOssRendererBuilder { + pub fn use_system_prompt(mut self, on: bool) -> Self { + self.use_system_prompt = on; + self + } + pub fn reasoning_effort(mut self, effort: &str) -> Result { + self.reasoning_effort = match effort.to_ascii_lowercase().as_str() { + "low" => ReasoningEffort::Low, + "medium" => ReasoningEffort::Medium, + "high" => ReasoningEffort::High, + other => { + return Err(RenderError::Invalid(format!( + "unknown reasoning effort: {other}" + ))); + } + }; + Ok(self) + } + pub fn conversation_start_date(mut self, d: impl Into) -> Self { + self.conversation_start_date = Some(d.into()); + self + } + pub fn knowledge_cutoff(mut self, k: impl Into) -> Self { + self.knowledge_cutoff = Some(k.into()); + self + } + pub fn model_identity(mut self, m: impl Into) -> Self { + self.model_identity = Some(m.into()); + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self) -> Result { + GptOssRenderer::new_with(self) + } +} + +#[derive(Debug, Clone)] +pub struct GptOssRenderer { + enc: Arc, + use_system_prompt: bool, + reasoning_effort: ReasoningEffort, + conversation_start_date: String, + knowledge_cutoff: Option, + model_identity: Option, 
+ preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + // Cached special-token ids — used by the parser and the generation prompt. + start: u32, + end: u32, + return_tok: u32, + call: u32, + channel: u32, + message: u32, + #[allow(dead_code)] + constrain: u32, + + stop_tokens: Vec, +} + +impl GptOssRenderer { + pub fn new() -> Result { + GptOssRendererBuilder::default().build() + } + pub fn builder() -> GptOssRendererBuilder { + GptOssRendererBuilder::default() + } + + fn new_with(cfg: GptOssRendererBuilder) -> Result { + let enc = load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss).map_err(harmony_err)?; + + // Resolve special-token ids by encoding their canonical text and + // asserting a single-token round-trip. The harmony encoding + // exposes a `tokenizer()` accessor (tiktoken CoreBPE) so we use + // its public special-token API. Bound to `enc` here directly so + // the rest of the constructor doesn't need to name CoreBPE + // (private outside the harmony crate). + let resolve = |s: &str| -> Result { + let ids = enc.tokenizer().encode_with_special_tokens(s); + if ids.len() != 1 { + return Err(RenderError::MissingSpecialToken(s.to_string())); + } + // `Rank` in tiktoken is `u32`; no conversion needed. 
+ Ok(ids[0]) + }; + let start = resolve("<|start|>")?; + let end = resolve("<|end|>")?; + let return_tok = resolve("<|return|>")?; + let call = resolve("<|call|>")?; + let channel = resolve("<|channel|>")?; + let message = resolve("<|message|>")?; + let constrain = resolve("<|constrain|>")?; + + let start_date = cfg + .conversation_start_date + .clone() + .unwrap_or_else(today_yyyy_mm_dd); + + Ok(Self { + enc: Arc::new(enc), + use_system_prompt: cfg.use_system_prompt, + reasoning_effort: cfg.reasoning_effort, + conversation_start_date: start_date, + knowledge_cutoff: cfg.knowledge_cutoff, + model_identity: cfg.model_identity, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + start, + end, + return_tok, + call, + channel, + message, + constrain, + stop_tokens: vec![return_tok, call], + }) + } + + /// Append rendered ids to `tokens`, attribute each to `msg_idx`. + fn emit_render( + &self, + tokens: &mut Vec, + indices: &mut Vec, + msg_idx: i32, + message: &HarmonyMessage, + ) -> Result<(), RenderError> { + let mut out: Vec = Vec::new(); + self.enc + .render_into(message, &mut out, None) + .map_err(harmony_err)?; + let len = out.len(); + tokens.append(&mut out); + indices.extend(std::iter::repeat_n(msg_idx, len)); + Ok(()) + } + + /// Encode a UTF-8 string via the harmony tokenizer, returning u32 ids. + /// Helper so the call sites don't need to name `CoreBPE` (which is not + /// re-exported from the harmony crate). + fn encode_text(&self, text: &str) -> Vec { + // `Rank` is `u32`; encode_with_special_tokens already returns Vec. + self.enc.tokenizer().encode_with_special_tokens(text) + } + + /// Decode a slice of token ids via the harmony tokenizer. + fn decode_text(&self, ids: &[u32]) -> String { + if ids.is_empty() { + return String::new(); + } + // `Rank` in tiktoken is `u32` — pass ids directly without casting. 
+ self.enc + .tokenizer() + .decode_utf8(ids.iter().copied()) + .unwrap_or_default() + } + + fn render_conversation_tokens( + &self, + messages: Vec, + ) -> Result, RenderError> { + let convo = Conversation::from_messages(messages); + let mut out: Vec = Vec::new(); + self.enc + .render_conversation_into(convo.messages.iter(), &mut out, None) + .map_err(harmony_err)?; + Ok(out) + } + + /// Build the harmony Author for tool messages — needs the function + /// name, which we recover from `msg.name` (set client-side by + /// `_attach_tool_call_names`). + fn tool_author(msg: &Message) -> Author { + let name = msg.name.as_deref().unwrap_or("unknown"); + let qualified: String = if name.starts_with("functions.") { + name.to_string() + } else { + format!("functions.{name}") + }; + Author { + role: HarmonyRole::Tool, + name: Some(qualified), + } + } + + fn message_to_harmony(msg: &Message, preserve_thinking: bool) -> Vec { + match msg.role.as_str() { + "user" => vec![HarmonyMessage::from_role_and_content( + HarmonyRole::User, + msg.text_content().to_string(), + )], + "tool" => { + let m = HarmonyMessage::from_author_and_content( + Self::tool_author(msg), + msg.text_content().to_string(), + ) + .with_recipient("assistant") + .with_channel("commentary"); + vec![m] + } + "assistant" => Self::assistant_to_harmony(msg, preserve_thinking), + // Default branch covers "system", "developer", and any + // unknown role, all of which route to the Developer channel. 
+ _ => { + let dev = DeveloperContent::new().with_instructions(msg.text_content()); + vec![HarmonyMessage::from_role_and_content( + HarmonyRole::Developer, + dev, + )] + } + } + } + + fn assistant_to_harmony(msg: &Message, preserve_thinking: bool) -> Vec { + let mut out: Vec = Vec::new(); + + if preserve_thinking { + if let Some(reasoning) = msg.reasoning_content.as_deref() { + if !reasoning.is_empty() { + let m = HarmonyMessage::from_role_and_content( + HarmonyRole::Assistant, + reasoning.to_string(), + ) + .with_channel("analysis"); + out.push(m); + } + } + } + + // Text content goes on the `final` channel. + let text = msg.text_content(); + if !text.is_empty() { + let m = HarmonyMessage::from_role_and_content(HarmonyRole::Assistant, text.to_string()) + .with_channel("final"); + out.push(m); + } + + // Each tool_call becomes its own assistant message on the + // commentary channel with recipient=functions.. + for tc in &msg.tool_calls { + let name = &tc.function.name; + let args = match &tc.function.arguments { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_default(), + }; + let recipient = if name.starts_with("functions.") { + name.clone() + } else { + format!("functions.{name}") + }; + let m = HarmonyMessage::from_role_and_content(HarmonyRole::Assistant, args) + .with_channel("commentary") + .with_recipient(recipient); + out.push(m); + } + + // Empty assistant with no text and no tool_calls: emit empty + // final-channel message so per-token attribution still produces + // at least one token slot. 
+ if out.is_empty() { + let m = HarmonyMessage::from_role_and_content(HarmonyRole::Assistant, String::new()) + .with_channel("final"); + out.push(m); + } + + out + } + + fn tool_to_description(tool: &ToolSpec) -> ToolDescription { + ToolDescription::new( + tool.name.as_str(), + tool.description.as_str(), + Some(tool.parameters.clone()), + ) + } + + fn build_system_content(&self) -> SystemContent { + let mut s = SystemContent::new().with_reasoning_effort(self.reasoning_effort); + s = s.with_conversation_start_date(self.conversation_start_date.as_str()); + if let Some(k) = &self.knowledge_cutoff { + s = s.with_knowledge_cutoff(k.as_str()); + } + if let Some(m) = &self.model_identity { + s = s.with_model_identity(m.as_str()); + } + s + } + + fn emit_generation_prompt(&self, tokens: &mut Vec, indices: &mut Vec) { + tokens.push(self.start); + indices.push(SCAFFOLD_IDX); + // "assistant" + <|channel|> + "analysis" + <|message|> + for id in self.encode_text("assistant") { + tokens.push(id); + indices.push(SCAFFOLD_IDX); + } + tokens.push(self.channel); + indices.push(SCAFFOLD_IDX); + for id in self.encode_text("analysis") { + tokens.push(id); + indices.push(SCAFFOLD_IDX); + } + tokens.push(self.message); + indices.push(SCAFFOLD_IDX); + } +} + +impl Renderer for GptOssRenderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut tokens: Vec = Vec::with_capacity(messages.len() * 256); + let mut indices: Vec = Vec::with_capacity(messages.len() * 256); + + let first_system_idx = messages.iter().position(|m| m.role == "system"); + + // Prefix: SystemContent + DeveloperContent (when tools or a + // caller-supplied system are present). 
+ let mut prefix_msgs: Vec = Vec::new(); + if self.use_system_prompt { + let sys = self.build_system_content(); + let sys = match tools { + Some(t) if !t.is_empty() => { + sys.with_channel_config(ChannelConfig::require_channels([ + "analysis", + "commentary", + "final", + ])) + } + _ => sys, + }; + prefix_msgs.push(HarmonyMessage::from_role_and_content( + HarmonyRole::System, + sys, + )); + } + let has_dev = first_system_idx.is_some() || tools.is_some_and(|t| !t.is_empty()); + if has_dev { + let mut dev = DeveloperContent::new(); + if let Some(idx) = first_system_idx { + let instr = messages[idx].text_content(); + if !instr.is_empty() { + dev = dev.with_instructions(instr); + } + } + if let Some(t) = tools { + if !t.is_empty() { + let descs: Vec = + t.iter().map(Self::tool_to_description).collect(); + dev = dev.with_function_tools(descs); + } + } + prefix_msgs.push(HarmonyMessage::from_role_and_content( + HarmonyRole::Developer, + dev, + )); + } + if !prefix_msgs.is_empty() { + let prefix_tokens = self.render_conversation_tokens(prefix_msgs)?; + let attr_idx: i32 = first_system_idx.map_or(SCAFFOLD_IDX, |i| i as i32); + for id in prefix_tokens { + tokens.push(id); + indices.push(attr_idx); + } + } + + // Body + let last_idx = messages.len() - 1; + for (i, msg) in messages.iter().enumerate() { + if Some(i) == first_system_idx { + continue; + } + let preserve_thinking = msg.role == "assistant" + && should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + for hm in Self::message_to_harmony(msg, preserve_thinking) { + self.emit_render(&mut tokens, &mut indices, i as i32, &hm)?; + } + } + + // Terminal close: if the conversation ends on a plain assistant + // turn (no tool_calls) and we're not asking for a generation + // prompt, swap the trailing <|end|> for <|return|> — matches + // apply_chat_template. 
+ if !add_generation_prompt + && last_idx < messages.len() + && messages[last_idx].role == "assistant" + && messages[last_idx].tool_calls.is_empty() + && tokens.last().copied() == Some(self.end) + { + *tokens.last_mut().expect("non-empty") = self.return_tok; + } + + if add_generation_prompt { + self.emit_generation_prompt(&mut tokens, &mut indices); + } + + Ok(RenderedTokens { + token_ids: tokens, + message_indices: indices, + multi_modal_data: None, + }) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + // Walk tokens block-by-block: `<|start|>{header}<|message|>{body}{terminator}`. + // Terminator is one of `<|start|>` (next block), `<|end|>`, `<|call|>`. + // `<|return|>` truncates the entire response. + let return_pos = token_ids.iter().position(|&t| t == self.return_tok); + let ids: &[u32] = match return_pos { + Some(p) => &token_ids[..p], + None => token_ids, + }; + + let mut reasoning_parts: Vec = Vec::new(); + let mut content_parts: Vec = Vec::new(); + let mut tool_calls: Vec = Vec::new(); + + let mut i = 0usize; + while i < ids.len() { + if ids[i] != self.start { + i += 1; + continue; + } + let block_start = i; + let Some(msg_pos) = ids[i + 1..] + .iter() + .position(|&t| t == self.message) + .map(|p| p + i + 1) + else { + break; + }; + let header_ids = &ids[i + 1..msg_pos]; + let header_text = self.decode_text(header_ids); + + let body_start = msg_pos + 1; + let body_end = ids[body_start..] + .iter() + .position(|&t| t == self.start || t == self.end || t == self.call) + .map_or(ids.len(), |p| p + body_start); + let body_closed = + body_end < ids.len() && (ids[body_end] == self.end || ids[body_end] == self.call); + let body_text = self.decode_text(&ids[body_start..body_end]); + + // Channel: look for <|channel|>NAME in header — NAME is the + // text between the channel token and the next whitespace / + // special token. 
+ let channel = header_ids + .iter() + .position(|&t| t == self.channel) + .map(|p| { + let after = &header_ids[p + 1..]; + // Take tokens until newline/space — but since header + // is short, just decode the rest and split. + self.decode_text(after).trim().to_string() + }) + .unwrap_or_default(); + + // Recipient: header text may contain "to=functions.NAME" + let recipient: Option<&str> = header_text.split("to=").nth(1).map(|s| { + s.split(|c: char| c.is_whitespace() || c == '<') + .next() + .unwrap_or("") + }); + + if let Some(r) = recipient { + if let Some(tool_name) = r.strip_prefix("functions.") { + let block_end = if body_closed { body_end + 1 } else { body_end }; + let span = block_start..block_end; + match serde_json::from_str::(&body_text) { + Ok(v) => { + tool_calls.push(ParsedToolCall { + raw: body_text.clone(), + name: Some(tool_name.to_string()), + arguments: Some(ToolArguments::Object(v)), + token_span: Some(span), + status: ToolCallParseStatus::Ok, + ..Default::default() + }); + } + Err(_) => { + tool_calls.push(ParsedToolCall { + raw: body_text.clone(), + name: Some(tool_name.to_string()), + arguments: Some(ToolArguments::Raw(body_text.clone())), + token_span: Some(span), + status: ToolCallParseStatus::InvalidJson, + ..Default::default() + }); + } + } + i = if body_closed { body_end + 1 } else { body_end }; + continue; + } + } + + // analysis → reasoning_content; everything else (final, + // commentary without a tool recipient, missing channel) + // collapses into the visible content stream. 
+ match channel.split_whitespace().next() { + Some("analysis") => reasoning_parts.push(body_text), + _ => content_parts.push(body_text), + } + + i = if body_closed { body_end + 1 } else { body_end }; + } + + let reasoning_content = if reasoning_parts.is_empty() { + None + } else { + Some(reasoning_parts.join("").trim().to_string()).filter(|s| !s.is_empty()) + }; + + ParsedResponse { + content: content_parts.join("").trim().to_string(), + reasoning_content, + tool_calls, + } + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &[self.return_tok, self.call], + Some(self.end), + ) else { + return Ok(None); + }; + + let mut ext: Vec = Vec::new(); + for msg in new_messages { + match msg.role.as_str() { + "tool" | "user" | "system" | "developer" => {} + _ => return Ok(None), + } + for hm in Self::message_to_harmony(msg, false) { + let mut out: Vec = Vec::new(); + self.enc + .render_into(&hm, &mut out, None) + .map_err(harmony_err)?; + ext.extend(out); + } + } + + // Generation prompt + ext.push(self.start); + ext.extend(self.encode_text("assistant")); + ext.push(self.channel); + ext.extend(self.encode_text("analysis")); + ext.push(self.message); + + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +fn today_yyyy_mm_dd() -> String { + // Avoid pulling chrono — use std::time::SystemTime and a small + // conversion that's good enough 
for "today" in UTC. + use std::time::{SystemTime, UNIX_EPOCH}; + let secs = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_secs()); + let days = secs / 86_400; + // 1970-01-01 + days + let (y, m, d) = civil_from_days(days as i64); + format!("{y:04}-{m:02}-{d:02}") +} + +/// Convert days since 1970-01-01 to (year, month, day) — Howard Hinnant's +/// algorithm, public-domain. +#[allow(clippy::cast_sign_loss)] // remainder mod 146_097 is in [0, 146_097) +fn civil_from_days(z: i64) -> (i32, u32, u32) { + let z = z + 719_468; + let era = if z >= 0 { z } else { z - 146_096 } / 146_097; + let doe = (z - era * 146_097) as u32; + let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; + let y = yoe as i32 + era as i32 * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = doy - (153 * mp + 2) / 5 + 1; + let m = if mp < 10 { mp + 3 } else { mp - 9 }; + let y = if m <= 2 { y + 1 } else { y }; + (y, m, d) +} diff --git a/crates/renderers-core/src/families/kimi_k2.rs b/crates/renderers-core/src/families/kimi_k2.rs new file mode 100644 index 0000000..f2d53ba --- /dev/null +++ b/crates/renderers-core/src/families/kimi_k2.rs @@ -0,0 +1,544 @@ +//! Kimi K2 renderer. Port of `renderers/kimi_k2.py`. +//! +//! Distinctive features: +//! +//! - Per-message framing: `<|im_*|>{role}<|im_middle|>{content}<|im_end|>`. +//! Role tokens vary by role: `<|im_user|>`, `<|im_assistant|>`, +//! `<|im_system|>`. +//! - Tool calls wrapped in +//! `<|tool_calls_section_begin|>` + N × call + `<|tool_calls_section_end|>`, +//! with each call as +//! `<|tool_call_begin|>{id}<|tool_call_argument_begin|>{json}<|tool_call_end|>`. +//! - Tool declarations rendered as a `role="tool_declare"` system-style +//! message with `tojson(separators=(',',':'), sort_keys=True)` JSON. +//! - Tool results: `<|im_system|>{name}<|im_middle|>## Return of {id}\n{content}<|im_end|>`. +//! - Default system message auto-injected if missing +//! 
("You are Kimi, an AI assistant created by Moonshot AI."). +//! - Thinking is plain text `...` (not special tokens). +//! The template doesn't read `reasoning_content` — assistant content +//! renders verbatim, inline `` tags and all. + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::kimi_k2::parse_kimi_k2; +use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, +}; + +const DEFAULT_SYSTEM: &str = "You are Kimi, an AI assistant created by Moonshot AI."; + +#[derive(Debug, Clone)] +pub struct KimiK2RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for KimiK2RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl KimiK2RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + KimiK2Renderer::new_with(tokenizer, &self) + } +} + +#[derive(Debug, Clone)] +pub struct KimiK2Renderer { + tokenizer: Tokenizer, + // Stored for API parity; the Kimi template ignores these flags. 
+ #[allow(dead_code)] + enable_thinking: bool, + #[allow(dead_code)] + preserve_all_thinking: bool, + #[allow(dead_code)] + preserve_thinking_between_tool_calls: bool, + + im_user: u32, + im_assistant: u32, + im_system: u32, + im_middle: u32, + im_end: u32, + tool_calls_section_begin: u32, + tool_calls_section_end: u32, + tool_call_begin: u32, + tool_call_argument_begin: u32, + tool_call_end: u32, + + newline_tokens: Vec, + assistant_tokens: Vec, + tool_text_cache: ToolTextCache, + stop_tokens: Vec, +} + +impl KimiK2Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + KimiK2RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> KimiK2RendererBuilder { + KimiK2RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: &KimiK2RendererBuilder) -> Result { + let im_user = tokenizer.token_to_id_strict("<|im_user|>")?; + let im_assistant = tokenizer.token_to_id_strict("<|im_assistant|>")?; + let im_system = tokenizer.token_to_id_strict("<|im_system|>")?; + let im_middle = tokenizer.token_to_id_strict("<|im_middle|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let tool_calls_section_begin = + tokenizer.token_to_id_strict("<|tool_calls_section_begin|>")?; + let tool_calls_section_end = tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; + let tool_call_begin = tokenizer.token_to_id_strict("<|tool_call_begin|>")?; + let tool_call_argument_begin = + tokenizer.token_to_id_strict("<|tool_call_argument_begin|>")?; + let tool_call_end = tokenizer.token_to_id_strict("<|tool_call_end|>")?; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let assistant_tokens = tokenizer + .encode_no_special("assistant")? 
+ .as_slice() + .to_vec(); + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_user, + im_assistant, + im_system, + im_middle, + im_end, + tool_calls_section_begin, + tool_calls_section_end, + tool_call_begin, + tool_call_argument_begin, + tool_call_end, + newline_tokens, + assistant_tokens, + tool_text_cache: ToolTextCache::default(), + stop_tokens: vec![im_end], + }) + } + + /// Serialise the tools list as compact, key-sorted JSON. The Python + /// template uses `tojson(separators=(',', ':'), sort_keys=True)` — + /// match both options here for byte-identical output. + fn serialize_tools(tools: &[ToolSpec]) -> String { + // Build an ordered map via serde_json::Map (preserves insertion); + // for sort_keys behaviour we use a BTreeMap-backed Value tree. + // serde_json's `serialize` of a BTreeMap sorts keys by Ord. + use std::collections::BTreeMap; + let mut arr: Vec = Vec::with_capacity(tools.len()); + for tool in tools { + let mut m: BTreeMap = BTreeMap::new(); + m.insert("name".into(), serde_json::Value::String(tool.name.clone())); + m.insert( + "description".into(), + serde_json::Value::String(tool.description.clone()), + ); + m.insert("parameters".into(), Self::sort_keys(&tool.parameters)); + if tool.openai_envelope { + let mut envelope: BTreeMap = BTreeMap::new(); + envelope.insert( + "function".into(), + serde_json::to_value(m).unwrap_or_default(), + ); + envelope.insert("type".into(), serde_json::Value::String("function".into())); + arr.push(serde_json::to_value(envelope).unwrap_or_default()); + } else { + arr.push(serde_json::to_value(m).unwrap_or_default()); + } + } + serde_json::to_string(&arr).unwrap_or_else(|_| "[]".to_string()) + } + + fn sort_keys(v: &serde_json::Value) -> serde_json::Value { + use std::collections::BTreeMap; + match v { + serde_json::Value::Object(o) => { + let sorted: BTreeMap = o 
+ .iter() + .map(|(k, v)| (k.clone(), Self::sort_keys(v))) + .collect(); + serde_json::to_value(sorted).unwrap_or(serde_json::Value::Null) + } + serde_json::Value::Array(a) => { + serde_json::Value::Array(a.iter().map(Self::sort_keys).collect()) + } + other => other.clone(), + } + } + + fn args_to_string(args: &ToolArguments) -> String { + match args { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => Self::json_dumps_default(v), + } + } + + fn json_dumps_default(v: &serde_json::Value) -> String { + match v { + serde_json::Value::Null => "null".to_string(), + serde_json::Value::Bool(b) => b.to_string(), + serde_json::Value::Number(n) => n.to_string(), + serde_json::Value::String(s) => serde_json::to_string(s).unwrap_or_default(), + serde_json::Value::Array(values) => { + let inner = values + .iter() + .map(Self::json_dumps_default) + .collect::>() + .join(", "); + format!("[{inner}]") + } + serde_json::Value::Object(values) => { + let inner = values + .iter() + .map(|(key, value)| { + let key = serde_json::to_string(key).unwrap_or_default(); + let value = Self::json_dumps_default(value); + format!("{key}: {value}") + }) + .collect::>() + .join(", "); + format!("{{{inner}}}") + } + } + } + + fn emit_im_role( + &self, + buf: &mut RenderBuf<'_>, + role_token: u32, + role_name: &str, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(role_token, idx); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + buf.text(content, idx)?; + buf.special(self.im_end, idx); + Ok(()) + } + + fn emit_tool_declare_from_tools( + &self, + buf: &mut RenderBuf<'_>, + tools: &[ToolSpec], + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_system, idx); + buf.text("tool_declare", idx)?; + buf.special(self.im_middle, idx); + let tool_tokens = + self.tool_text_cache + .get_or_insert_with(&self.tokenizer, tools, 0, "", || { + Ok(Self::serialize_tools(tools)) + })?; + buf.ids(tool_tokens.as_slice(), idx); + 
buf.special(self.im_end, idx); + Ok(()) + } +} + +impl Renderer for KimiK2Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + + // Inject tool_declare + default system into a working copy, tracking + // which slots are injected so message_indices stays aligned to the + // caller's original list. + let mut working: Vec = Vec::with_capacity(messages.len() + 2); + let mut injected: Vec = Vec::with_capacity(messages.len() + 2); + + // tool_declare goes first if tools were provided and the caller + // didn't already include a tool_declare message. + let tools_pending = tools.is_some_and(|t| !t.is_empty()); + let already_has_tool_declare = !messages.is_empty() && messages[0].role == "tool_declare"; + if tools_pending && !already_has_tool_declare { + working.push(Message { + role: "tool_declare".to_string(), + content: crate::types::Content::Text(String::new()), + ..Default::default() + }); + injected.push(true); + } + + // Then the optional default system message + let auto_system_position: Option = + if !messages.is_empty() && messages[0].role == "tool_declare" { + // tool_declare present in caller's input → if next isn't system, + // inject default system AFTER tool_declare + if messages.len() < 2 || messages[1].role != "system" { + Some(working.len() + 1) // will be inserted between tool_declare and the rest + } else { + None + } + } else if messages.is_empty() || messages[0].role != "system" { + Some(working.len()) + } else { + None + }; + + // Now lay out the rest: + if let Some(pos) = auto_system_position { + // Replicate the Python logic: if caller's first message is + // tool_declare, push it then the default system then the rest. 
+ if !messages.is_empty() && messages[0].role == "tool_declare" { + working.push(messages[0].clone()); + injected.push(false); + working.push(Message { + role: "system".to_string(), + content: crate::types::Content::Text(DEFAULT_SYSTEM.to_string()), + ..Default::default() + }); + injected.push(true); + for m in &messages[1..] { + working.push(m.clone()); + injected.push(false); + } + } else { + working.push(Message { + role: "system".to_string(), + content: crate::types::Content::Text(DEFAULT_SYSTEM.to_string()), + ..Default::default() + }); + injected.push(true); + for m in messages { + working.push(m.clone()); + injected.push(false); + } + } + let _ = pos; + } else { + for m in messages { + working.push(m.clone()); + injected.push(false); + } + } + + // Map normalised index → caller's index (sentinel for injected). + // Precompute as a flat Vec so the lookup is O(1) instead of an + // O(i) filter inside the render loop — saves an O(n²) walk on + // long conversations. + let orig_idx_table: Vec = { + let mut table = Vec::with_capacity(working.len()); + let mut real: i32 = -1; + for &inj in &injected { + if inj { + table.push(SCAFFOLD_IDX); + } else { + real += 1; + table.push(real); + } + } + table + }; + let orig_idx = |i: usize| -> i32 { orig_idx_table[i] }; + + // Index of the auto-injected system message (if any) — emits a + // trailing literal "\n" after its <|im_end|>. 
+ let auto_system_idx: Option = working + .iter() + .enumerate() + .find(|(i, m)| injected[*i] && m.role == "system") + .map(|(i, _)| i); + + let mut buf = RenderBuf::new( + &self.tokenizer, + working.len().max(1) * 256 + tools.map_or(0, |t| 64 * t.len() + 256), + ); + + for (i, msg) in working.iter().enumerate() { + let oi = orig_idx(i); + let content = msg.text_content(); + match msg.role.as_str() { + "system" => { + self.emit_im_role(&mut buf, self.im_system, "system", content, oi)?; + if Some(i) == auto_system_idx { + buf.ids(&self.newline_tokens, oi); + } + } + "tool_declare" => { + if injected[i] { + self.emit_tool_declare_from_tools( + &mut buf, + tools.expect("injected tool_declare requires tools"), + oi, + )?; + } else { + self.emit_im_role(&mut buf, self.im_system, "tool_declare", content, oi)?; + } + } + "user" => { + self.emit_im_role(&mut buf, self.im_user, "user", content, oi)?; + } + "assistant" => self.emit_assistant(&mut buf, msg, oi)?, + "tool" => self.emit_tool(&mut buf, msg, content, oi)?, + other => { + // Unknown role: render system-style + self.emit_im_role(&mut buf, self.im_system, other, content, oi)?; + } + } + } + + if add_generation_prompt { + buf.scaffold_special(self.im_assistant); + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.im_middle); + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_kimi_k2( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_calls_section_begin, + self.tool_calls_section_end, + self.tool_call_begin, + self.tool_call_argument_begin, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || 
reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.text_content(); + match msg.role.as_str() { + "user" => self.emit_im_role(&mut buf, self.im_user, "user", content, idx)?, + "system" => self.emit_im_role(&mut buf, self.im_system, "system", content, idx)?, + "tool" => self.emit_tool(&mut buf, msg, content, idx)?, + _ => return Ok(None), + } + } + + buf.scaffold_special(self.im_assistant); + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.im_middle); + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl KimiK2Renderer { + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_assistant, idx); + buf.ids(&self.assistant_tokens, idx); + buf.special(self.im_middle, idx); + + // Kimi's template renders content verbatim; reasoning_content is + // ignored (not read by the Jinja). 
+ buf.text(msg.text_content(), idx)?; + + if !msg.tool_calls.is_empty() { + buf.special(self.tool_calls_section_begin, idx); + for tc in &msg.tool_calls { + let args_str = Self::args_to_string(&tc.function.arguments); + let tc_id = tc.id.clone().unwrap_or_default(); + buf.special(self.tool_call_begin, idx); + buf.text(&tc_id, idx)?; + buf.special(self.tool_call_argument_begin, idx); + buf.text(&args_str, idx)?; + buf.special(self.tool_call_end, idx); + } + buf.special(self.tool_calls_section_end, idx); + } + buf.special(self.im_end, idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + let name = msg.name.as_deref().unwrap_or("tool"); + let tool_call_id = msg.tool_call_id.as_deref().unwrap_or(""); + buf.special(self.im_system, idx); + buf.text(name, idx)?; + buf.special(self.im_middle, idx); + let mut header = String::with_capacity(tool_call_id.len() + 16); + header.push_str("## Return of "); + header.push_str(tool_call_id); + header.push('\n'); + buf.text(&header, idx)?; + buf.text(content, idx)?; + buf.special(self.im_end, idx); + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/kimi_k25.rs b/crates/renderers-core/src/families/kimi_k25.rs new file mode 100644 index 0000000..56ce339 --- /dev/null +++ b/crates/renderers-core/src/families/kimi_k25.rs @@ -0,0 +1,670 @@ +//! Kimi K2.5 renderer (text-only path, no tools). +//! +//! Port of `renderers/kimi_k25.py` covering the most common call shape: +//! chat without function-calling tools and without images. The path with +//! TypeScript-style tool declarations and the multimodal path are +//! deferred to Phase 5 (the Python shim keeps those on the pure-Python +//! implementation for now). +//! +//! Distinctive features vs Kimi K2: +//! +//! - Generation prompt prefills `` (`enable_thinking=True`) or the +//! empty block `` (`enable_thinking=False`) to control +//! thinking mode at sample time. 
`` and `` may be +//! multi-token; the renderer encodes them as text. +//! - Assistant body uses the hist/suffix split: the last non-tool-call +//! assistant + all later assistants keep `reasoning_content`; +//! historical assistants collapse to a literal ``. +//! - Default system message is the same as K2 +//! ("You are Kimi, an AI assistant created by Moonshot AI.") but the +//! Python class doesn't auto-inject it — neither does this port. + +use crate::SCAFFOLD_IDX; +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::kimi_k2::parse_kimi_k2; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::traits::{MultimodalRenderer, Renderer}; +use crate::types::{ + MediaBundle, MediaItem, Message, Modality, MultiModalData, ParsedResponse, PlaceholderRange, + RenderError, RenderedTokens, ToolArguments, ToolSpec, +}; + +#[derive(Debug, Clone)] +pub struct KimiK25RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for KimiK25RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl KimiK25RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + KimiK25Renderer::new_with(tokenizer, &self) + } +} + +#[derive(Debug, Clone)] +pub struct KimiK25Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + im_user: u32, + im_assistant: 
u32, + im_system: u32, + im_middle: u32, + im_end: u32, + tool_calls_section_begin: u32, + tool_calls_section_end: u32, + tool_call_begin: u32, + tool_call_argument_begin: u32, + tool_call_end: u32, + + // Media tokens — present on K2.5 tokenizers, absent on K2 proper. + // When absent, as_multimodal() returns None. + media_begin: Option, + media_content: Option, + media_pad: Option, + media_end: Option, + mm_token_type_ids: Vec<(u32, u8)>, + + newline_tokens: Vec, + assistant_tokens: Vec, + think_tokens: Vec, + empty_think_tokens: Vec, + stop_tokens: Vec, +} + +impl KimiK25Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + KimiK25RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> KimiK25RendererBuilder { + KimiK25RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: &KimiK25RendererBuilder) -> Result { + let im_user = tokenizer.token_to_id_strict("<|im_user|>")?; + let im_assistant = tokenizer.token_to_id_strict("<|im_assistant|>")?; + let im_system = tokenizer.token_to_id_strict("<|im_system|>")?; + let im_middle = tokenizer.token_to_id_strict("<|im_middle|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let tool_calls_section_begin = + tokenizer.token_to_id_strict("<|tool_calls_section_begin|>")?; + let tool_calls_section_end = tokenizer.token_to_id_strict("<|tool_calls_section_end|>")?; + let tool_call_begin = tokenizer.token_to_id_strict("<|tool_call_begin|>")?; + let tool_call_argument_begin = + tokenizer.token_to_id_strict("<|tool_call_argument_begin|>")?; + let tool_call_end = tokenizer.token_to_id_strict("<|tool_call_end|>")?; + + // Media tokens optional — K2 proper doesn't ship them. 
+ let media_begin = tokenizer.token_to_id("<|media_begin|>"); + let media_content = tokenizer.token_to_id("<|media_content|>"); + let media_pad = tokenizer.token_to_id("<|media_pad|>"); + let media_end = tokenizer.token_to_id("<|media_end|>"); + let mut mm_token_type_ids: Vec<(u32, u8)> = Vec::new(); + if let Some(p) = media_pad { + mm_token_type_ids.push((p, 1)); // image marker; K2.5 handles video via the same pad + } + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let assistant_tokens = tokenizer + .encode_no_special("assistant")? + .as_slice() + .to_vec(); + let think_tokens = tokenizer.encode_no_special("")?.as_slice().to_vec(); + let empty_think_tokens = tokenizer + .encode_no_special("")? + .as_slice() + .to_vec(); + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_user, + im_assistant, + im_system, + im_middle, + im_end, + tool_calls_section_begin, + tool_calls_section_end, + tool_call_begin, + tool_call_argument_begin, + tool_call_end, + media_begin, + media_content, + media_pad, + media_end, + mm_token_type_ids, + newline_tokens, + assistant_tokens, + think_tokens, + empty_think_tokens, + stop_tokens: vec![im_end], + }) + } + + /// True when the loaded tokenizer ships the K2.5 media tokens. 
+ pub fn supports_multimodal(&self) -> bool { + self.media_begin.is_some() + && self.media_content.is_some() + && self.media_pad.is_some() + && self.media_end.is_some() + } + + fn args_to_string(args: &ToolArguments) -> String { + match args { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => serde_json::to_string(v).unwrap_or_else(|_| "{}".into()), + } + } + + fn role_token(&self, role: &str) -> u32 { + match role { + "user" => self.im_user, + "assistant" => self.im_assistant, + _ => self.im_system, + } + } + + /// Extract `(reasoning_content, text_content)` from a message, + /// honouring the explicit `reasoning_content` field and the inline + /// `...` tag fallback. Mirrors the Python K2.5 + /// `_render_assistant_body` extraction. + fn extract_reasoning(msg: &Message) -> (String, String) { + if let Some(r) = &msg.reasoning_content { + return (r.clone(), msg.text_content().to_string()); + } + let content = msg.text_content(); + if let Some((before, after)) = content.split_once("") { + let reasoning = if let Some((_, inner)) = before.rsplit_once("") { + inner.to_string() + } else { + before.to_string() + }; + return (reasoning, after.trim_start_matches('\n').to_string()); + } + (String::new(), content.to_string()) + } + + fn emit_assistant_body( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + is_suffix: bool, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let (reasoning_content, text_content) = Self::extract_reasoning(msg); + + // hist/suffix split: hist drops reasoning, suffix preserves it. 
+ if is_suffix || (preserve_thinking && !reasoning_content.is_empty()) { + let mut s = String::with_capacity(reasoning_content.len() + 16); + s.push_str(""); + s.push_str(&reasoning_content); + s.push_str(""); + buf.text(&s, msg_idx)?; + } else { + buf.ids(&self.empty_think_tokens, msg_idx); + } + buf.text(&text_content, msg_idx)?; + + if !msg.tool_calls.is_empty() { + buf.special(self.tool_calls_section_begin, msg_idx); + for tc in &msg.tool_calls { + let args_str = Self::args_to_string(&tc.function.arguments); + let tool_id = tc.id.clone().unwrap_or_default(); + buf.special(self.tool_call_begin, msg_idx); + buf.text(&tool_id, msg_idx)?; + buf.special(self.tool_call_argument_begin, msg_idx); + buf.text(&args_str, msg_idx)?; + buf.special(self.tool_call_end, msg_idx); + } + buf.special(self.tool_calls_section_end, msg_idx); + } + Ok(()) + } + + fn emit_tool_body( + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + ) -> Result<(), RenderError> { + let tool_call_id = msg.tool_call_id.as_deref().unwrap_or(""); + let mut header = String::with_capacity(tool_call_id.len() + 16); + header.push_str("## Return of "); + header.push_str(tool_call_id); + header.push('\n'); + buf.text(&header, msg_idx)?; + let content = msg.text_content(); + if !content.is_empty() { + buf.text(content, msg_idx)?; + } + Ok(()) + } +} + +impl Renderer for KimiK25Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + // Tools route to Python — the TS-style declaration formatter + // (~270 lines) isn't ported yet. The Python shim avoids native + // routing when tools are present, so this is a hard error if we + // got here with tools. 
+ if tools.is_some_and(|t| !t.is_empty()) { + return Err(RenderError::Invalid( + "Kimi K2.5 with tools not supported on the native path yet; the Python shim should route to pure Python in this case".into(), + )); + } + + let mut buf = RenderBuf::new(&self.tokenizer, messages.len().max(1) * 256); + + // Find last non-tool-call assistant for the hist/suffix split + let mut last_non_tc_assistant: i32 = -1; + for (i, m) in messages.iter().enumerate().rev() { + if m.role == "assistant" && m.tool_calls.is_empty() { + last_non_tc_assistant = i as i32; + break; + } + } + + for (i, msg) in messages.iter().enumerate() { + let idx = i as i32; + buf.special(self.role_token(&msg.role), idx); + // K2.5 uses `msg.name or role` as the role-name literal + let role_name = msg.name.as_deref().unwrap_or(&msg.role); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + + match msg.role.as_str() { + "assistant" => { + let is_suffix = idx > last_non_tc_assistant; + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant_body(&mut buf, msg, idx, is_suffix, preserve_thinking)?; + } + "tool" => Self::emit_tool_body(&mut buf, msg, idx)?, + _ => { + let content = msg.text_content(); + if !content.is_empty() { + buf.text(content, idx)?; + } + } + } + buf.special(self.im_end, idx); + } + + // Generation prompt + if add_generation_prompt { + buf.scaffold_special(self.im_assistant); + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.im_middle); + if self.enable_thinking { + buf.ids(&self.think_tokens, SCAFFOLD_IDX); + } else { + buf.ids(&self.empty_think_tokens, SCAFFOLD_IDX); + } + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + // K2.5 reuses the K2 parser shape; only differences are the + // thinking-tag handling, which the K2 parser already does via the + // decoded-text branch. 
+ parse_kimi_k2( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_calls_section_begin, + self.tool_calls_section_end, + self.tool_call_begin, + self.tool_call_argument_begin, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + buf.special(self.role_token(&msg.role), idx); + let role_name = msg.name.as_deref().unwrap_or(&msg.role); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + match msg.role.as_str() { + "user" | "system" => { + let content = msg.text_content(); + if !content.is_empty() { + buf.text(content, idx)?; + } + } + "tool" => Self::emit_tool_body(&mut buf, msg, idx)?, + _ => return Ok(None), + } + buf.special(self.im_end, idx); + } + + // Generation prompt + buf.scaffold_special(self.im_assistant); + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.im_middle); + if self.enable_thinking { + buf.ids(&self.think_tokens, SCAFFOLD_IDX); + } else { + buf.ids(&self.empty_think_tokens, SCAFFOLD_IDX); + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } + + 
fn as_multimodal(&self) -> Option<&dyn MultimodalRenderer> { + if self.supports_multimodal() { + Some(self) + } else { + None + } + } +} + +// ── Multimodal implementation ───────────────────────────────────────── +// +// Kimi K2.5's placeholder shape diverges from Qwen-VL: each image gets +// exactly ONE `<|media_pad|>` token in the input stream, regardless of +// image size. The model's vision encoder expands per-patch attention +// internally from `pixel_values` + `grid_thws`. The renderer's job is +// just to emit the per-image wrapper: +// +// <|media_begin|>image<|media_content|><|media_pad|><|media_end|>\n +// +// and accumulate the corresponding placeholder ranges + opaque payloads. + +impl KimiK25Renderer { + fn emit_media_item( + &self, + buf: &mut RenderBuf<'_>, + idx: i32, + item: &MediaItem, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + let begin = self + .media_begin + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_begin|>".into()))?; + let content = self + .media_content + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_content|>".into()))?; + let pad = self + .media_pad + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_pad|>".into()))?; + let end = self + .media_end + .ok_or_else(|| RenderError::MissingSpecialToken("<|media_end|>".into()))?; + + let label = match item.modality { + Modality::Image => "image", + Modality::Video => "video", + }; + + buf.special(begin, idx); + buf.text(label, idx)?; + buf.special(content, idx); + let offset = buf.len(); + buf.special(pad, idx); + buf.special(end, idx); + buf.ids(&self.newline_tokens, idx); + + // Always exactly 1 placeholder in the stream, regardless of + // image size — that's the K2.5 convention. 
+ let key = item.modality.as_str().to_string(); + mm.mm_hashes + .entry(key.clone()) + .or_default() + .push(item.hash.clone()); + mm.mm_placeholders + .entry(key.clone()) + .or_default() + .push(PlaceholderRange { offset, length: 1 }); + mm.mm_items + .entry(key) + .or_default() + .push(item.hf_payload.clone()); + Ok(()) + } + + fn emit_user_body_with_media<'m>( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: i32, + media_iter: &mut impl Iterator, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + match &msg.content { + crate::types::Content::Null => { + for item in media_iter.by_ref() { + self.emit_media_item(buf, msg_idx, item, mm)?; + } + } + crate::types::Content::Text(s) => { + // Plain-text + attached media: emit images first, then + // text. Same convention as Qwen-VL when the caller + // doesn't pass a structured content list. + for item in media_iter.by_ref() { + self.emit_media_item(buf, msg_idx, item, mm)?; + } + if !s.is_empty() { + buf.text(s, msg_idx)?; + } + } + crate::types::Content::Parts(parts) => { + use crate::types::ContentPart; + for part in parts { + match part { + ContentPart::Text { text } => { + if !text.is_empty() { + buf.text(text, msg_idx)?; + } + } + ContentPart::Thinking { .. 
} => {} + ContentPart::Image(_) | ContentPart::Video(_) => { + let item = media_iter.next().ok_or_else(|| { + RenderError::Invalid( + "K2.5 message content lists more media parts than the MediaBundle provides".into(), + ) + })?; + self.emit_media_item(buf, msg_idx, item, mm)?; + } + } + } + } + } + Ok(()) + } +} + +impl MultimodalRenderer for KimiK25Renderer { + fn mm_token_type_id_map(&self) -> &[(u32, u8)] { + &self.mm_token_type_ids + } + + fn render_with_media( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + media: &MediaBundle, + add_generation_prompt: bool, + ) -> Result { + if media.is_empty() { + return self.render(messages, tools, add_generation_prompt); + } + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + if tools.is_some_and(|t| !t.is_empty()) { + return Err(RenderError::Invalid( + "Kimi K2.5 with tools not supported on the native path yet".into(), + )); + } + + // Per-message media iterator. The bundle is flat (message_idx, + // item), and K2.5 doesn't auto-inject system messages, so the + // indices align directly with the caller's input. 
+ let mut buf = RenderBuf::new(&self.tokenizer, messages.len().max(1) * 256); + let mut mm = MultiModalData::default(); + + let mut last_non_tc_assistant: i32 = -1; + for (i, m) in messages.iter().enumerate().rev() { + if m.role == "assistant" && m.tool_calls.is_empty() { + last_non_tc_assistant = i as i32; + break; + } + } + + for (i, msg) in messages.iter().enumerate() { + let idx = i as i32; + buf.special(self.role_token(&msg.role), idx); + let role_name = msg.name.as_deref().unwrap_or(&msg.role); + buf.text(role_name, idx)?; + buf.special(self.im_middle, idx); + + match msg.role.as_str() { + "assistant" => { + let is_suffix = idx > last_non_tc_assistant; + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant_body(&mut buf, msg, idx, is_suffix, preserve_thinking)?; + } + "tool" => Self::emit_tool_body(&mut buf, msg, idx)?, + _ => { + // user / system / other — interleave media inline + let mut media_iter = media + .items + .iter() + .filter_map(|(m, it)| (*m == i).then_some(it)); + self.emit_user_body_with_media(&mut buf, msg, idx, &mut media_iter, &mut mm)?; + if media_iter.next().is_some() { + return Err(RenderError::Invalid(format!( + "MediaBundle has more items for message {i} than the content's media parts" + ))); + } + } + } + buf.special(self.im_end, idx); + } + + if add_generation_prompt { + buf.scaffold_special(self.im_assistant); + buf.ids(&self.assistant_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.im_middle); + if self.enable_thinking { + buf.ids(&self.think_tokens, SCAFFOLD_IDX); + } else { + buf.ids(&self.empty_think_tokens, SCAFFOLD_IDX); + } + } + + let mut out = buf.into_rendered(); + if !mm.is_empty() { + out.multi_modal_data = Some(mm); + } + Ok(out) + } + + fn bridge_to_next_turn_with_media( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: 
Option<&[ToolSpec]>, + new_media: &MediaBundle, + _previous_multi_modal_data: Option<&MultiModalData>, + ) -> Result, RenderError> { + if !new_media.is_empty() { + // Same Phase 5a caveat as Qwen3.5: bridging media-bearing + // new turns is unsafe under truncation. Fall back to a full + // re-render. + return Ok(None); + } + self.bridge_to_next_turn( + previous_prompt_ids, + previous_completion_ids, + new_messages, + tools, + ) + } +} diff --git a/crates/renderers-core/src/families/minimax_m2.rs b/crates/renderers-core/src/families/minimax_m2.rs new file mode 100644 index 0000000..63fe3d2 --- /dev/null +++ b/crates/renderers-core/src/families/minimax_m2.rs @@ -0,0 +1,532 @@ +//! `MiniMax` M2.5 renderer. Port of `renderers/minimax_m2.py`. +//! +//! Unique characteristics: +//! +//! - Token format: `]~!b[` (BOS), `]~b]` (role prefix), `[e~[` (EOS). +//! Role "assistant" is rendered as "ai". +//! - System block always present — default system message +//! ("You are a helpful assistant. Your name is MiniMax-M2.5 and is +//! built by `MiniMax`.") auto-injected if missing. +//! - Tools, when supplied, are appended to the system message as +//! `{json}` lines inside a `...` block, +//! followed by a verbose instructions block. +//! - Tool calls use XML wrapper + nested invokes: +//! `v... +//! ` +//! - Tool responses wrapped in literal `...` +//! (plain text, no special token). +//! - Thinking emitted only for assistants after the last user turn +//! (or when `preserve_all_thinking` is on). 
+ +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; +use crate::json::to_string_python; +use crate::parsing::minimax::parse_minimax; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, +}; + +const DEFAULT_SYSTEM: &str = + "You are a helpful assistant. Your name is MiniMax-M2.5 and is built by MiniMax."; + +const TOOLS_HEADER: &str = "\n\n# Tools\nYou may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:\n\n\n"; +const TOOLS_FOOTER_PREFIX: &str = "\n\n"; +const TOOLS_INSTRUCTIONS: &str = "When making tool calls, use XML format to invoke tools and pass parameters:\n\n\n\nparam-value-1\nparam-value-2\n...\n\n"; + +#[derive(Debug, Clone)] +pub struct MiniMaxM2RendererBuilder { + default_system: String, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for MiniMaxM2RendererBuilder { + fn default() -> Self { + Self { + default_system: DEFAULT_SYSTEM.to_string(), + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl MiniMaxM2RendererBuilder { + pub fn default_system(mut self, s: impl Into) -> Self { + self.default_system = s.into(); + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + MiniMaxM2Renderer::new_with(tokenizer, self) + } +} + +#[derive(Debug, Clone)] +pub struct MiniMaxM2Renderer { + tokenizer: Tokenizer, + default_system: String, + preserve_all_thinking: bool, + 
preserve_thinking_between_tool_calls: bool, + + bos: u32, + role: u32, + eos: u32, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + + newline_tokens: Vec, + ai_newline_tokens: Vec, + tool_tokens: Vec, + tool_text_cache: ToolTextCache, + stop_tokens: Vec, +} + +impl MiniMaxM2Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + MiniMaxM2RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> MiniMaxM2RendererBuilder { + MiniMaxM2RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: MiniMaxM2RendererBuilder) -> Result { + let bos = tokenizer.token_to_id_strict("]~!b[")?; + let role = tokenizer.token_to_id_strict("]~b]")?; + let eos = tokenizer.token_to_id_strict("[e~[")?; + let think = tokenizer.token_to_id_strict("")?; + let think_end = tokenizer.token_to_id_strict("")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let ai_newline_tokens = tokenizer.encode_no_special("ai\n")?.as_slice().to_vec(); + let tool_tokens = tokenizer.encode_no_special("tool")?.as_slice().to_vec(); + + Ok(Self { + tokenizer, + default_system: cfg.default_system, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + bos, + role, + eos, + think, + think_end, + tool_call, + tool_call_end, + newline_tokens, + ai_newline_tokens, + tool_tokens, + tool_text_cache: ToolTextCache::default(), + stop_tokens: vec![eos], + }) + } + + fn build_system_text(&self, sys_content: &str, tools: Option<&[ToolSpec]>) -> String { + Self::build_system_text_from(&self.default_system, sys_content, tools) + } + + fn build_system_text_from( + default_system: &str, + sys_content: &str, + tools: Option<&[ToolSpec]>, + ) -> String { + let mut s = String::with_capacity(512); + s.push_str("system\n"); + if sys_content.is_empty() { + 
/// Render `messages` (and optional `tools`) into `buf`.
///
/// Shared by `render` and `render_ids`, which differ only in the sink they
/// pass in.
///
/// Layout produced:
/// 1. A system block that is always present: BOS token, role token, the
///    system text, EOS token, newline. When the first message is not a
///    system message the block is attributed to `SCAFFOLD_IDX` and built
///    from empty system content.
/// 2. One turn per remaining message. `user` turns are emitted inline,
///    `assistant` and `tool` turns are delegated to `emit_assistant` /
///    `emit_tool`, and any other role is skipped without output.
/// 3. When `add_generation_prompt` is set: role token, the cached `ai\n`
///    tokens, the think token and a newline, all attributed to
///    `SCAFFOLD_IDX`.
///
/// Errors: `RenderError::EmptyMessages` for an empty slice, plus any error
/// raised by text encoding or by the tool-text cache.
fn render_into_buf(
    &self,
    buf: &mut impl TokenSink,
    messages: &[Message],
    tools: Option<&[ToolSpec]>,
    add_generation_prompt: bool,
) -> Result<(), RenderError> {
    if messages.is_empty() {
        return Err(RenderError::EmptyMessages);
    }

    // Only a system message in first position feeds the system block.
    let first_is_system = messages[0].role == "system";
    let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX };

    buf.special(self.bos, sys_idx);
    buf.special(self.role, sys_idx);
    let sys_content = if first_is_system {
        messages[0].visible_text_content().to_string()
    } else {
        String::new()
    };
    if let Some(t) = tools.filter(|t| !t.is_empty()) {
        // With tools the system text is looked up in (or inserted into) the
        // tool-text cache, which hands back ready-made token ids; the
        // closure builds the text only when the cache needs it.
        let default_system = self.default_system.clone();
        let system_tokens = self.tool_text_cache.get_or_insert_with(
            &self.tokenizer,
            t,
            u64::from(first_is_system),
            &sys_content,
            || {
                Ok(Self::build_system_text_from(
                    &default_system,
                    &sys_content,
                    Some(t),
                ))
            },
        )?;
        buf.ids(system_tokens.as_slice(), sys_idx);
    } else {
        // No tools (absent or empty list): build and encode the text directly.
        let system_text = self.build_system_text(&sys_content, tools);
        buf.text(&system_text, sys_idx)?;
    }
    buf.special(self.eos, sys_idx);
    buf.ids(&self.newline_tokens, sys_idx);

    // Everything after the leading system message (if any) is the
    // conversation proper; `ci` below indexes into this slice.
    let conversation_start = usize::from(first_is_system);
    let conversation = &messages[conversation_start..];

    // Conversation-relative index of the last user message, -1 if none.
    let mut last_ui: i32 = -1;
    for (ci, m) in conversation.iter().enumerate() {
        if m.role == "user" {
            last_ui = ci as i32;
        }
    }

    for (ci, msg) in conversation.iter().enumerate() {
        // Index into the caller's original `messages` slice, used for
        // per-token attribution.
        let orig_idx = (ci + conversation_start) as i32;
        let content = msg.visible_text_content();
        match msg.role.as_str() {
            "user" => {
                buf.special(self.role, orig_idx);
                let mut s = String::with_capacity(content.len() + 8);
                s.push_str("user\n");
                s.push_str(content);
                buf.text(&s, orig_idx)?;
                buf.special(self.eos, orig_idx);
                buf.ids(&self.newline_tokens, orig_idx);
            }
            "assistant" => {
                // `orig_idx` is derived from a usize sum, so it is never
                // negative here.
                #[allow(clippy::cast_sign_loss)]
                let preserve_thinking = should_preserve_past_thinking(
                    messages,
                    orig_idx as usize,
                    self.preserve_all_thinking,
                    self.preserve_thinking_between_tool_calls,
                );
                self.emit_assistant(buf, msg, orig_idx, ci as i32, last_ui, preserve_thinking)?;
            }
            // `emit_tool` receives the conversation slice and `ci` so it can
            // inspect the neighbouring messages.
            "tool" => self.emit_tool(buf, conversation, ci, orig_idx)?,
            // Any other role (including a system message that is not first)
            // produces no tokens.
            _ => {}
        }
    }

    if add_generation_prompt {
        buf.scaffold_special(self.role);
        buf.ids(&self.ai_newline_tokens, SCAFFOLD_IDX);
        buf.scaffold_special(self.think);
        buf.ids(&self.newline_tokens, SCAFFOLD_IDX);
    }

    Ok(())
}
add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_minimax( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.eos), + ) else { + return Ok(None); + }; + + let mut buf = + RenderBuf::new_token_ids_only(&self.tokenizer, new_messages.len().max(1) * 256); + // Trailing \n after the prior turn's [e~[ + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + + for (i, msg) in new_messages.iter().enumerate() { + let idx = i as i32; + let content = msg.visible_text_content(); + match msg.role.as_str() { + "user" => { + buf.special(self.role, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.eos, idx); + buf.ids(&self.newline_tokens, idx); + } + "system" => { + buf.special(self.role, idx); + let mut s = String::with_capacity(content.len() + 8); + 
s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.eos, idx); + buf.ids(&self.newline_tokens, idx); + } + "tool" => self.emit_tool(&mut buf, new_messages, i, idx)?, + _ => return Ok(None), + } + } + + buf.scaffold_special(self.role); + buf.ids(&self.ai_newline_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.think); + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +impl MiniMaxM2Renderer { + fn emit_assistant( + &self, + buf: &mut impl TokenSink, + msg: &Message, + orig_idx: i32, + conv_idx: i32, + last_user_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let raw_content = msg.visible_text_content(); + let (reasoning_content, content_text) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner.trim_matches('\n').to_string() + } else { + before.trim_matches('\n').to_string() + }; + (r, after.trim_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + + buf.special(self.role, orig_idx); + + let tool_calls = &msg.tool_calls; + let emit_think = + !reasoning_content.is_empty() && (conv_idx > last_user_index || preserve_thinking); + + let after_think: String = if emit_think { + buf.ids(&self.ai_newline_tokens, orig_idx); + buf.special(self.think, orig_idx); + let mut head = String::with_capacity(reasoning_content.len() + 2); + head.push('\n'); + head.push_str(&reasoning_content); + head.push('\n'); + buf.text(&head, orig_idx)?; + buf.special(self.think_end, orig_idx); + // After , the rest is "\n\n" + content (or 
just "\n\n") + if content_text.is_empty() { + "\n\n".to_string() + } else { + let mut s = String::with_capacity(content_text.len() + 2); + s.push_str("\n\n"); + s.push_str(&content_text); + s + } + } else if content_text.is_empty() { + "ai\n".to_string() + } else { + let mut s = String::with_capacity(content_text.len() + 4); + s.push_str("ai\n"); + s.push_str(&content_text); + s + }; + + if tool_calls.is_empty() { + buf.text(&after_think, orig_idx)?; + } else { + // \n before contiguous with preceding text + let mut head = after_think; + head.push('\n'); + buf.text(&head, orig_idx)?; + buf.special(self.tool_call, orig_idx); + + let mut invoke_block = String::from("\n"); + for tc in tool_calls { + let name = tc.function.name.as_str(); + invoke_block.push_str("\n"); + let args_value = Self::args_to_value(&tc.function.arguments); + if let Some(obj) = args_value.as_object() { + for (arg_name, arg_value) in obj { + let val_str = match arg_value { + serde_json::Value::String(s) => s.clone(), + _ => serde_json::to_string(arg_value).unwrap_or_default(), + }; + invoke_block.push_str(""); + invoke_block.push_str(&val_str); + invoke_block.push_str("\n"); + } + } + invoke_block.push_str("\n"); + } + buf.text(&invoke_block, orig_idx)?; + buf.special(self.tool_call_end, orig_idx); + } + + buf.special(self.eos, orig_idx); + buf.ids(&self.newline_tokens, orig_idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut impl TokenSink, + conversation: &[Message], + conv_idx: usize, + orig_idx: i32, + ) -> Result<(), RenderError> { + let prev_is_tool = conv_idx > 0 && conversation[conv_idx - 1].role == "tool"; + let next_is_tool = + conv_idx + 1 < conversation.len() && conversation[conv_idx + 1].role == "tool"; + + if !prev_is_tool { + buf.special(self.role, orig_idx); + buf.ids(&self.tool_tokens, orig_idx); + } + let prefix = if prev_is_tool { "" } else { "\n" }; + let suffix = if next_is_tool { "\n" } else { "" }; + let content = conversation[conv_idx].visible_text_content(); + let 
mut s = String::with_capacity(content.len() + 32); + s.push_str(prefix); + s.push_str(""); + s.push_str(content); + s.push_str(""); + s.push_str(suffix); + buf.text(&s, orig_idx)?; + + if !next_is_tool { + buf.special(self.eos, orig_idx); + buf.ids(&self.newline_tokens, orig_idx); + } + Ok(()) + } +} diff --git a/crates/renderers-core/src/families/mod.rs b/crates/renderers-core/src/families/mod.rs new file mode 100644 index 0000000..4113797 --- /dev/null +++ b/crates/renderers-core/src/families/mod.rs @@ -0,0 +1,29 @@ +//! Per-family renderer implementations. +//! +//! Each family lives in its own module so the hand-coded template logic +//! stays focused. New families slot in by adding a module here and a +//! registry entry in [`crate::registry`]. + +pub mod deepseek_v3; +pub mod default; +pub mod glm; +pub mod gpt_oss; +pub mod kimi_k2; +pub mod kimi_k25; +pub mod minimax_m2; +pub mod nemotron3; +pub mod qwen3; +pub mod qwen35; +pub mod qwen36; + +pub use deepseek_v3::{DeepSeekV3Renderer, DeepSeekV3RendererBuilder}; +pub use default::{DefaultRenderer, DefaultRendererBuilder}; +pub use glm::{GlmRenderer, GlmRendererBuilder}; +pub use gpt_oss::{GptOssRenderer, GptOssRendererBuilder}; +pub use kimi_k2::{KimiK2Renderer, KimiK2RendererBuilder}; +pub use kimi_k25::{KimiK25Renderer, KimiK25RendererBuilder}; +pub use minimax_m2::{MiniMaxM2Renderer, MiniMaxM2RendererBuilder}; +pub use nemotron3::{Nemotron3Renderer, Nemotron3RendererBuilder}; +pub use qwen3::{Qwen3Renderer, Qwen3RendererBuilder}; +pub use qwen35::{Qwen35Renderer, Qwen35RendererBuilder}; +pub use qwen36::{Qwen36Renderer, Qwen36RendererBuilder}; diff --git a/crates/renderers-core/src/families/nemotron3.rs b/crates/renderers-core/src/families/nemotron3.rs new file mode 100644 index 0000000..4fb76e2 --- /dev/null +++ b/crates/renderers-core/src/families/nemotron3.rs @@ -0,0 +1,717 @@ +//! Nemotron 3 renderer. Port of `renderers/nemotron3.py`. +//! +//! 
Same `<|im_start|>/<|im_end|>` framing as Qwen3.5, but with several +//! template-specific quirks: +//! +//! - Tool declarations use XML (`...` with nested +//! `` blocks), not JSON-per-line. +//! - System prompt is emitted BEFORE the tools block (Qwen3.5 puts +//! tools first). +//! - An empty system message is auto-injected if none is present. +//! - `` is emitted on EVERY assistant message, even +//! those without reasoning content (collapses to empty block). +//! - Single `\n` after `` (Qwen3.5 uses `\n\n`). +//! - Disable-thinking generation suffix is `` with no +//! trailing newlines. +//! - Trailing `\n` after ``. +//! - `<|endoftext|>` is *optional* — Nemotron-3 Nano / Super ship with +//! only `<|im_end|>` as EOS; larger variants additionally include +//! `<|endoftext|>`. + +use serde_json::Value as JsonValue; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::RenderBuf; +use crate::parsing::qwen35::parse_qwen35; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, +}; + +const TOOLS_HEADER: &str = "# Tools\n\nYou have access to the following functions:\n\n"; +const TOOLS_FOOTER: &str = "\n"; +const TOOLS_INSTRUCTIONS: &str = "\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell 
the user about function calls\n"; + +#[derive(Debug, Clone)] +pub struct Nemotron3RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for Nemotron3RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl Nemotron3RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result { + Nemotron3Renderer::new_with(tokenizer, &self) + } +} + +#[derive(Debug, Clone)] +pub struct Nemotron3Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + im_start: u32, + im_end: u32, + /// `<|endoftext|>` is optional — Nemotron-3 Nano / Super tokenizers + /// don't ship it. 
+ endoftext: Option, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + tool_response: u32, + tool_response_end: u32, + + stop_tokens: Vec, + newline_tokens: Vec, + system_newline_tokens: Vec, + user_newline_tokens: Vec, + assistant_newline_tokens: Vec, + function_close_newline_tokens: Vec, + tool_text_cache: ToolTextCache, +} + +impl Nemotron3Renderer { + pub fn new(tokenizer: Tokenizer) -> Result { + Nemotron3RendererBuilder::default().build(tokenizer) + } + pub fn builder() -> Nemotron3RendererBuilder { + Nemotron3RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: &Nemotron3RendererBuilder) -> Result { + let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let endoftext = tokenizer.token_to_id("<|endoftext|>"); + let think = tokenizer.token_to_id_strict("")?; + let think_end = tokenizer.token_to_id_strict("")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let tool_response = tokenizer.token_to_id_strict("")?; + let tool_response_end = tokenizer.token_to_id_strict("")?; + + let mut stop_tokens = vec![im_end]; + if let Some(eot) = endoftext { + stop_tokens.push(eot); + } + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let system_newline_tokens = tokenizer.encode_no_special("system\n")?.as_slice().to_vec(); + let user_newline_tokens = tokenizer.encode_no_special("user\n")?.as_slice().to_vec(); + let assistant_newline_tokens = tokenizer + .encode_no_special("assistant\n")? + .as_slice() + .to_vec(); + let function_close_newline_tokens = tokenizer + .encode_no_special("\n")? 
+ .as_slice() + .to_vec(); + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_start, + im_end, + endoftext, + think, + think_end, + tool_call, + tool_call_end, + tool_response, + tool_response_end, + stop_tokens, + newline_tokens, + system_newline_tokens, + user_newline_tokens, + assistant_newline_tokens, + function_close_newline_tokens, + tool_text_cache: ToolTextCache::default(), + }) + } + + /// Render a single tool declaration in Nemotron 3's XML format. + /// Mirrors `_format_tool_declaration` in the Python impl. + fn format_tool_declaration(tool: &ToolSpec) -> String { + let mut out = String::with_capacity(256); + out.push_str("\n"); + out.push_str(&tool.name); + out.push_str(""); + let desc = tool.description.trim(); + if !desc.is_empty() { + out.push_str("\n"); + out.push_str(desc); + out.push_str(""); + } + out.push_str("\n"); + + if let Some(props) = tool + .parameters + .get("properties") + .and_then(|v| v.as_object()) + { + for (param_name, param_fields) in props { + out.push_str("\n\n"); + out.push_str(param_name); + out.push_str(""); + if let Some(t) = param_fields.get("type") { + out.push_str("\n"); + Self::write_value_as_text(&mut out, t); + out.push_str(""); + } + if let Some(d) = param_fields.get("description").and_then(|v| v.as_str()) { + out.push_str("\n"); + out.push_str(d.trim()); + out.push_str(""); + } + if let Some(e) = param_fields.get("enum") { + out.push_str("\n"); + out.push_str(&serde_json::to_string(e).unwrap_or_default()); + out.push_str(""); + } + if let Some(obj) = param_fields.as_object() { + Self::render_extra_keys( + &mut out, + obj, + &["name", "type", "description", "enum"], + ); + } + out.push_str("\n"); + } + } + if let Some(obj) = tool.parameters.as_object() { + Self::render_extra_keys(&mut out, obj, &["type", "properties", "required"]); + } + if let Some(req) = 
tool.parameters.get("required") { + out.push_str("\n"); + out.push_str(&serde_json::to_string(req).unwrap_or_default()); + out.push_str(""); + } + out.push_str("\n"); + out.push_str("\n"); + out + } + + /// Mirror Python's `str(value)` for non-string JSON values + /// (used inside `{value}` tags). + fn write_value_as_text(out: &mut String, value: &JsonValue) { + match value { + JsonValue::String(s) => out.push_str(s), + JsonValue::Bool(true) => out.push_str("True"), + JsonValue::Bool(false) => out.push_str("False"), + JsonValue::Null => out.push_str("None"), + JsonValue::Number(n) => out.push_str(&n.to_string()), + _ => out.push_str(&serde_json::to_string(value).unwrap_or_default()), + } + } + + /// Mirror Python's `_render_extra_keys` — emit `value` + /// for every key not already handled. + fn render_extra_keys( + out: &mut String, + obj: &serde_json::Map, + handled: &[&str], + ) { + for (k, v) in obj { + if handled.contains(&k.as_str()) { + continue; + } + out.push_str("\n<"); + out.push_str(k); + out.push('>'); + match v { + JsonValue::Object(_) | JsonValue::Array(_) => { + out.push_str(&serde_json::to_string(v).unwrap_or_default()); + } + _ => Self::write_value_as_text(out, v), + } + out.push_str("'); + } + } + + fn emit_system_with_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + tools: &[ToolSpec], + first_is_system: bool, + ) -> Result<(), RenderError> { + let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; + buf.special(self.im_start, sys_idx); + buf.ids(&self.system_newline_tokens, sys_idx); + + let system_content = if first_is_system { + messages[0].text_content().trim().to_string() + } else { + String::new() + }; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + tools, + u64::from(first_is_system), + &system_content, + || { + let mut full_sys = String::with_capacity(512); + full_sys.push_str(&system_content); + let mut tools_block = String::with_capacity(512); + 
tools_block.push_str(TOOLS_HEADER); + tools_block.push('\n'); + let mut first = true; + for t in tools { + if !first { + tools_block.push('\n'); + } + tools_block.push_str(&Self::format_tool_declaration(t)); + first = false; + } + tools_block.push_str(TOOLS_FOOTER); + tools_block.push_str(TOOLS_INSTRUCTIONS); + + if !full_sys.is_empty() { + full_sys.push_str("\n\n"); + } + full_sys.push_str(&tools_block); + Ok(full_sys) + }, + )?; + buf.ids(tool_tokens.as_slice(), sys_idx); + buf.special(self.im_end, sys_idx); + buf.ids(&self.newline_tokens, sys_idx); + Ok(()) + } + + fn emit_system_no_tools( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + sys_idx: i32, + ) -> Result<(), RenderError> { + let content = messages[0].text_content().trim(); + buf.special(self.im_start, sys_idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, sys_idx)?; + buf.special(self.im_end, sys_idx); + buf.ids(&self.newline_tokens, sys_idx); + Ok(()) + } + + fn emit_user( + &self, + buf: &mut RenderBuf<'_>, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut RenderBuf<'_>, + messages: &[Message], + msg_idx: usize, + content: &str, + msg_orig_idx: i32, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + + if !prev_is_tool { + buf.special(self.im_start, msg_orig_idx); + buf.ids(&self.user_newline_tokens, msg_orig_idx); + } + buf.special(self.tool_response, msg_orig_idx); + let mut wrapped = String::with_capacity(content.len() + 2); + wrapped.push('\n'); + wrapped.push_str(content); 
+ wrapped.push('\n'); + buf.text(&wrapped, msg_orig_idx)?; + buf.special(self.tool_response_end, msg_orig_idx); + // Nemotron 3: trailing \n after + buf.ids(&self.newline_tokens, msg_orig_idx); + + if !next_is_tool { + buf.special(self.im_end, msg_orig_idx); + buf.ids(&self.newline_tokens, msg_orig_idx); + } + Ok(()) + } + + fn emit_assistant( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_orig_idx: i32, + is_last_turn: bool, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + // Recover reasoning_content either from the field or from inline tags. + let raw_content = msg.text_content().trim(); + let (reasoning_content, content) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let r = if let Some((_, inner)) = before.rsplit_once("") { + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + } else { + before + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + }; + (r, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + let reasoning_content = reasoning_content.trim().to_string(); + + buf.special(self.im_start, msg_orig_idx); + buf.ids(&self.assistant_newline_tokens, msg_orig_idx); + + let tool_calls = &msg.tool_calls; + let content_suffix = if tool_calls.is_empty() { "" } else { "\n" }; + + if !reasoning_content.is_empty() && (is_last_turn || preserve_thinking) { + buf.special(self.think, msg_orig_idx); + let mut s = String::with_capacity(reasoning_content.len() + 2); + s.push('\n'); + s.push_str(&reasoning_content); + s.push('\n'); + buf.text(&s, msg_orig_idx)?; + buf.special(self.think_end, msg_orig_idx); + // Single \n separator (not \n\n like Qwen3.5) + let mut tail = String::with_capacity(content.len() + 2); + tail.push('\n'); + tail.push_str(&content); + tail.push_str(content_suffix); + buf.text(&tail, msg_orig_idx)?; + } else if 
!reasoning_content.is_empty() { + // Historical assistant whose reasoning got stripped — collapsed + // + single \n + content. + buf.special(self.think, msg_orig_idx); + buf.special(self.think_end, msg_orig_idx); + let mut tail = String::with_capacity(content.len() + 2); + tail.push('\n'); + tail.push_str(&content); + tail.push_str(content_suffix); + buf.text(&tail, msg_orig_idx)?; + } else { + // No reasoning ever — glued directly to content. + buf.special(self.think, msg_orig_idx); + buf.special(self.think_end, msg_orig_idx); + let mut tail = String::with_capacity(content.len() + 2); + tail.push_str(&content); + tail.push_str(content_suffix); + buf.text(&tail, msg_orig_idx)?; + } + + for tc in tool_calls { + let name = tc.function.name.as_str(); + buf.special(self.tool_call, msg_orig_idx); + let mut head = String::with_capacity(name.len() + 16); + head.push_str("\n\n"); + buf.text(&head, msg_orig_idx)?; + + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => { + serde_json::from_str(s).unwrap_or(JsonValue::Object(serde_json::Map::new())) + } + }; + if let Some(obj) = args_value.as_object() { + for (arg_name, arg_value) in obj { + let val_str = match arg_value { + JsonValue::Object(_) | JsonValue::Array(_) => { + serde_json::to_string(arg_value).unwrap_or_default() + } + JsonValue::String(s) => s.clone(), + JsonValue::Bool(b) => { + if *b { + "True".into() + } else { + "False".into() + } + } + JsonValue::Null => "None".into(), + JsonValue::Number(n) => n.to_string(), + }; + let mut param = String::with_capacity(arg_name.len() + val_str.len() + 24); + param.push_str("\n"); + param.push_str(&val_str); + param.push_str("\n\n"); + buf.text(¶m, msg_orig_idx)?; + } + } + + buf.ids(&self.function_close_newline_tokens, msg_orig_idx); + buf.special(self.tool_call_end, msg_orig_idx); + // Nemotron 3: trailing \n after + buf.ids(&self.newline_tokens, msg_orig_idx); + } + + buf.special(self.im_end, msg_orig_idx); 
+ buf.ids(&self.newline_tokens, msg_orig_idx); + Ok(()) + } + + fn emit_generation_prompt(&self, buf: &mut RenderBuf<'_>) { + buf.scaffold_special(self.im_start); + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); + if self.enable_thinking { + buf.scaffold_special(self.think); + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + } else { + // Disable-thinking suffix: with no trailing newlines + buf.scaffold_special(self.think); + buf.scaffold_special(self.think_end); + } + } + + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + let base = messages.len().max(1) * 256; + let tools_bonus = tools.map_or(0, |t| 384 * t.len().max(1) + 512); + base + tools_bonus + } +} + +impl Renderer for Nemotron3Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + + // Normalise: prepend empty system message if none is present. + let mut normalised: Vec; + let auto_system_injected: bool; + let messages_ref: &[Message] = if messages[0].role == "system" { + auto_system_injected = false; + messages + } else { + auto_system_injected = true; + normalised = Vec::with_capacity(messages.len() + 1); + normalised.push(Message { + role: "system".to_string(), + content: crate::types::Content::Text(String::new()), + ..Default::default() + }); + normalised.extend_from_slice(messages); + &normalised + }; + + // Map normalised index back to caller's original index. Injected + // system uses SCAFFOLD_IDX (-1) so build_training_sample can't + // dereference past the caller's input. 
+ let orig_idx = |i: usize| -> i32 { + if auto_system_injected { + if i == 0 { SCAFFOLD_IDX } else { (i - 1) as i32 } + } else { + i as i32 + } + }; + + let first_is_system = messages_ref[0].role == "system"; + + match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(&mut buf, messages_ref, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(&mut buf, messages_ref, orig_idx(0))?; + } + } + } + + // Find the most-recent plain (non-tool-call) assistant — reasoning + // is preserved on it and on later turns; earlier assistants + // collapse to . + let last_plain_assistant_idx: i32 = { + let mut found: i32 = -1; + for (j, m) in messages_ref.iter().enumerate().rev() { + if m.role == "assistant" && m.tool_calls.is_empty() { + found = j as i32; + break; + } + } + found + }; + + for (i, msg) in messages_ref.iter().enumerate() { + let content = msg.text_content().trim(); + let oi = orig_idx(i); + match msg.role.as_str() { + "system" => { + if i != 0 { + return Err(RenderError::Invalid( + "system message must be at the beginning".into(), + )); + } + // Already handled above + } + "user" => self.emit_user(&mut buf, content, oi)?, + "assistant" => { + let is_last_turn = (i as i32) >= last_plain_assistant_idx; + // oi >= 0 guard above makes the usize cast safe. 
+ #[allow(clippy::cast_sign_loss)] + let preserve_thinking = oi >= 0 + && should_preserve_past_thinking( + messages, + oi as usize, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(&mut buf, msg, oi, is_last_turn, preserve_thinking)?; + } + "tool" => self.emit_tool(&mut buf, messages_ref, i, content, oi)?, + _ => { + return Err(RenderError::Invalid(format!( + "unexpected message role: {}", + msg.role + ))); + } + } + } + + if add_generation_prompt { + self.emit_generation_prompt(&mut buf); + } + + Ok(buf.into_rendered()) + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_qwen35( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new_token_ids_only( + &self.tokenizer, + Self::estimate_capacity(new_messages, None), + ); + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + + for (i, msg) in new_messages.iter().enumerate() { + let content = msg.text_content().trim(); + let idx = i as i32; + match msg.role.as_str() { + "user" => self.emit_user(&mut buf, content, idx)?, + "system" => { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + } + 
"tool" => self.emit_tool(&mut buf, new_messages, i, content, idx)?, + _ => return Ok(None), + } + } + + self.emit_generation_prompt(&mut buf); + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} + +// Keep the field readable; suppresses dead-code warning since we only use it via the Option arm above. +#[allow(dead_code)] +impl Nemotron3Renderer { + pub fn has_endoftext(&self) -> bool { + self.endoftext.is_some() + } +} diff --git a/crates/renderers-core/src/families/qwen3.rs b/crates/renderers-core/src/families/qwen3.rs new file mode 100644 index 0000000..3b25a27 --- /dev/null +++ b/crates/renderers-core/src/families/qwen3.rs @@ -0,0 +1,588 @@ +//! Qwen3 renderer. Port of `renderers/qwen3.py`. +//! +//! Byte-for-byte identical output to the Python version — the +//! `test_render_ids` / `test_bridge` / `test_roundtrip` golden suites are +//! the contract. +//! +//! # Performance notes +//! +//! - Special-token ids are resolved once at construction and cached on +//! the struct. Zero per-call lookup cost. +//! - The render buffer is sized to `messages.len() * 256` up front; this +//! covers ~99% of multi-turn conversations with no realloc. +//! - The tools header / footer are static `&str` constants — no +//! per-call allocation. +//! - Tool-call argument serialisation goes through `serde_json` directly, +//! ~5–10× faster than Python's `json.dumps` for the JSON sizes typical +//! here. 
+ +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; +use crate::json::{to_string_python, tool_spec_template_value}; +use crate::parsing::qwen3::parse_qwen3; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; +use crate::traits::Renderer; +use crate::types::{ + Message, ParsedResponse, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, ToolSpec, +}; + +const TOOLS_HEADER: &str = "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n"; + +const TOOLS_FOOTER: &str = "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n"; + +const GEN_PROMPT_NO_THINKING_SUFFIX: &str = "\n\n\n\n"; + +/// Builder for [`Qwen3Renderer`]. Use this to surface the rare optional +/// flags without polluting the most common constructor. +#[derive(Debug, Clone)] +pub struct Qwen3RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +} + +impl Default for Qwen3RendererBuilder { + fn default() -> Self { + Self { + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + } + } +} + +impl Qwen3RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + + pub fn build(self, tokenizer: Tokenizer) -> Result { + Qwen3Renderer::new_with(tokenizer, &self) + } +} + +/// Deterministic Qwen3 renderer. 
+#[derive(Debug, Clone)] +pub struct Qwen3Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + + im_start: u32, + im_end: u32, + /// Cached for parity with the Python `_endoftext` field; the + /// stop-token set already encodes the same id, so this is unused + /// directly but kept for debug parity. + #[allow(dead_code)] + endoftext: u32, + tool_call: u32, + tool_call_end: u32, + tool_response: u32, + tool_response_end: u32, + + /// Cached stop tokens (`im_end`, `endoftext`) for `stop_token_ids` + /// and bridge close-token sets. Two-element vector held by-value + /// per renderer instance. + stop_tokens: Vec, + newline_tokens: Vec, + user_tokens: Vec, + assistant_newline_tokens: Vec, + gen_prompt_no_thinking_suffix_tokens: Vec, + tool_text_cache: ToolTextCache, +} + +impl Qwen3Renderer { + /// Convenience constructor with all defaults. + pub fn new(tokenizer: Tokenizer) -> Result { + Qwen3RendererBuilder::default().build(tokenizer) + } + + pub fn builder() -> Qwen3RendererBuilder { + Qwen3RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: &Qwen3RendererBuilder) -> Result { + let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; + let tool_call = tokenizer.token_to_id_strict("")?; + let tool_call_end = tokenizer.token_to_id_strict("")?; + let tool_response = tokenizer.token_to_id_strict("")?; + let tool_response_end = tokenizer.token_to_id_strict("")?; + + let stop_tokens = vec![im_end, endoftext]; + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let user_tokens = tokenizer.encode_no_special("user")?.as_slice().to_vec(); + let assistant_newline_tokens = tokenizer + .encode_no_special("assistant\n")? 
+ .as_slice() + .to_vec(); + let gen_prompt_no_thinking_suffix_tokens = tokenizer + .encode_no_special(GEN_PROMPT_NO_THINKING_SUFFIX)? + .as_slice() + .to_vec(); + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + im_start, + im_end, + endoftext, + tool_call, + tool_call_end, + tool_response, + tool_response_end, + stop_tokens, + newline_tokens, + user_tokens, + assistant_newline_tokens, + gen_prompt_no_thinking_suffix_tokens, + tool_text_cache: ToolTextCache::default(), + }) + } + + /// Index of the most recent user message whose content is *not* a + /// `...` placeholder. Defaults to + /// `len - 1` when no real user message is present. + fn last_query_index(messages: &[Message]) -> i32 { + for (i, msg) in messages.iter().enumerate().rev() { + if msg.role != "user" { + continue; + } + let content = msg.text_content(); + if !(content.starts_with("") && content.ends_with("")) { + return i as i32; + } + } + (messages.len() as i32).saturating_sub(1) + } + + fn emit_system_with_tools( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + tools: &[ToolSpec], + first_is_system: bool, + ) -> Result<(), RenderError> { + let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; + buf.special(self.im_start, sys_idx); + let system_content = if first_is_system { + messages[0].text_content().to_string() + } else { + String::new() + }; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + tools, + u64::from(first_is_system), + &system_content, + || { + let mut tool_text = String::from("system\n"); + if first_is_system { + tool_text.push_str(&system_content); + tool_text.push_str("\n\n"); + } + tool_text.push_str(TOOLS_HEADER); + for tool in tools { + tool_text.push('\n'); + let spec = tool_spec_template_value(tool); + tool_text.push_str(&to_string_python(&spec).map_err(|e| { + 
RenderError::Invalid(format!("tool spec serialisation failed: {e}")) + })?); + } + tool_text.push_str(TOOLS_FOOTER); + Ok(tool_text) + }, + )?; + buf.ids(tool_tokens.as_slice(), sys_idx); + buf.special(self.im_end, sys_idx); + buf.ids(&self.newline_tokens, sys_idx); + Ok(()) + } + + fn emit_system_no_tools( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + ) -> Result<(), RenderError> { + buf.special(self.im_start, 0); + let mut s = String::with_capacity(messages[0].text_content().len() + 8); + s.push_str("system\n"); + s.push_str(messages[0].text_content()); + buf.text(&s, 0)?; + buf.special(self.im_end, 0); + buf.ids(&self.newline_tokens, 0); + Ok(()) + } + + fn emit_user( + &self, + buf: &mut impl TokenSink, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + Ok(()) + } + + fn emit_non_initial_system( + &self, + buf: &mut impl TokenSink, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + msg_idx: usize, + content: &str, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let idx = msg_idx as i32; + + if !prev_is_tool { + buf.special(self.im_start, idx); + buf.ids(&self.user_tokens, idx); + } + buf.ids(&self.newline_tokens, idx); + buf.special(self.tool_response, idx); + let mut wrapped = String::with_capacity(content.len() + 2); + 
wrapped.push('\n'); + wrapped.push_str(content); + wrapped.push('\n'); + buf.text(&wrapped, idx)?; + buf.special(self.tool_response_end, idx); + if !next_is_tool { + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + } + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + fn emit_assistant( + &self, + buf: &mut impl TokenSink, + msg: &Message, + msg_idx: usize, + last_query_index: i32, + is_last: bool, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + // Recover reasoning content either from the explicit field or + // from inline `...` text. Match the Python + // implementation's split semantics exactly. + let raw_content = msg.text_content(); + let (reasoning_content, content_after_think) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let reasoning = if let Some((_, inner)) = before.rsplit_once("") { + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + } else { + before + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + }; + (reasoning, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + + let idx = msg_idx as i32; + buf.special(self.im_start, idx); + + let tool_calls = &msg.tool_calls; + let emit_in_template_window = + (msg_idx as i32) > last_query_index && (is_last || !reasoning_content.is_empty()); + let emit_via_override = preserve_thinking && !reasoning_content.is_empty(); + + let prefix = if emit_in_template_window || emit_via_override { + let mut s = + String::with_capacity(reasoning_content.len() + content_after_think.len() + 32); + s.push_str("assistant\n\n"); + s.push_str(reasoning_content.trim_matches('\n')); + s.push_str("\n\n\n"); + s.push_str(content_after_think.trim_start_matches('\n')); + s + } else { + let mut s = String::with_capacity(content_after_think.len() + 10); + s.push_str("assistant\n"); + 
s.push_str(&content_after_think); + s + }; + + if tool_calls.is_empty() { + buf.text(&prefix, idx)?; + } else { + for (tc_idx, tc) in tool_calls.iter().enumerate() { + let name = tc.function.name.as_str(); + let args_str = match &tc.function.arguments { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => to_string_python(v).map_err(|e| { + RenderError::Invalid(format!("tool args serialisation failed: {e}")) + })?, + }; + if tc_idx == 0 { + let mut s = prefix.clone(); + if !content_after_think.is_empty() { + s.push('\n'); + } + buf.text(&s, idx)?; + } else { + buf.text("\n", idx)?; + } + buf.special(self.tool_call, idx); + let mut payload = String::with_capacity(args_str.len() + name.len() + 24); + payload.push_str("\n{\"name\": \""); + payload.push_str(name); + payload.push_str("\", \"arguments\": "); + payload.push_str(&args_str); + payload.push_str("}\n"); + buf.text(&payload, idx)?; + buf.special(self.tool_call_end, idx); + } + } + + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + Ok(()) + } + + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + // Heuristic: ~256 tokens / message, plus a flat surcharge for the + // tools block (it can be substantial). Realloc once if we + // underestimate; the cost of over-allocating is a few KB. + let base = messages.len().max(1) * 256; + let tools_bonus = tools.map_or(0, |t| 256 * t.len().max(1)); + base + tools_bonus + } + + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + + fn render_into_buf( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result<(), RenderError> { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + + let first_is_system = messages[0].role == "system"; + + // 1. System + tools header. 
+ match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(buf, messages, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(buf, messages)?; + } + } + } + + // 2. Last-query index. + let last_qi = Self::last_query_index(messages); + let num_messages = messages.len(); + + // 3. Body. + for (i, msg) in messages.iter().enumerate() { + let content = msg.text_content(); + match msg.role.as_str() { + "system" => { + if i == 0 { + continue; + } + self.emit_non_initial_system(buf, content, i as i32)?; + } + "user" => { + self.emit_user(buf, content, i as i32)?; + } + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant( + buf, + msg, + i, + last_qi, + i + 1 == num_messages, + preserve_thinking, + )?; + } + "tool" => { + self.emit_tool(buf, messages, i, content)?; + } + _ => { + // Unknown role: skip silently (matches Python which + // simply has no branch for it). + } + } + } + + // 4. Generation prompt. 
+ if add_generation_prompt { + buf.scaffold_special(self.im_start); + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); + if !self.enable_thinking { + buf.ids(&self.gen_prompt_no_thinking_suffix_tokens, SCAFFOLD_IDX); + } + } + + Ok(()) + } +} + +impl Renderer for Qwen3Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result<RenderedTokens, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + let mut buf = RenderBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_rendered()) + } + + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result<Vec<u32>, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_qwen3( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + _tools: Option<&[ToolSpec]>, + ) -> Result<Option<RenderedTokens>, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let cap = 
Self::estimate_capacity(new_messages, None); + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + + // Trailing `\n` after the prior turn's close token. + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + + for (i, msg) in new_messages.iter().enumerate() { + let content = msg.text_content(); + let idx = i as i32; + match msg.role.as_str() { + "user" => self.emit_user(&mut buf, content, idx)?, + "system" => self.emit_non_initial_system(&mut buf, content, idx)?, + "tool" => self.emit_tool(&mut buf, new_messages, i, content)?, + _ => return Ok(None), + } + } + + buf.scaffold_special(self.im_start); + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); + if !self.enable_thinking { + buf.ids(&self.gen_prompt_no_thinking_suffix_tokens, SCAFFOLD_IDX); + } + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } +} diff --git a/crates/renderers-core/src/families/qwen35.rs b/crates/renderers-core/src/families/qwen35.rs new file mode 100644 index 0000000..01aecc8 --- /dev/null +++ b/crates/renderers-core/src/families/qwen35.rs @@ -0,0 +1,975 @@ +//! Qwen3.5 renderer (text-only). Port of `renderers/qwen35.py` minus the +//! multimodal path; multimodal lands in Phase 5 with the vision processor. +//! +//! Differences from Qwen3: +//! +//! - `` / `` are **special tokens**, not text tags. +//! - Tool calls use XML format with `` and +//! `` blocks. +//! - System prompt includes a verbose tool-instructions block. +//! - Generation prompt prefills `\n` (or the empty-think block +//! when `enable_thinking` is false), with polarity defaulting to +//! `enable_thinking=true` for big-size models. +//! +//! `enable_thinking` polarity detection (which the Python implementation +//! 
probes via a one-shot `apply_chat_template` call) is **not** done in +//! Rust — the caller passes it explicitly through the builder. The +//! Python shim handles the polarity probe and forwards the result. + +use std::borrow::Cow; + +use crate::bridge::{reject_assistant_in_extension, trim_to_turn_close}; +use crate::emit::{RenderBuf, TokenPlanBuf, TokenSink}; +use crate::json::{to_string_python, tool_spec_template_value}; +use crate::parsing::qwen35::parse_qwen35; +use crate::thinking::should_preserve_past_thinking; +use crate::tokenizer::Tokenizer; +use crate::tool_cache::ToolTextCache; +use crate::traits::{MultimodalRenderer, Renderer}; +use crate::types::{ + Content, ContentPart, MediaBundle, MediaItem, Message, Modality, MultiModalData, + ParsedResponse, PlaceholderRange, RenderError, RenderedTokens, SCAFFOLD_IDX, ToolArguments, + ToolSpec, +}; + +const TOOLS_HEADER: &str = "# Tools\n\nYou have access to the following functions:\n\n"; +const TOOLS_FOOTER: &str = "\n"; +const TOOLS_INSTRUCTIONS: &str = "\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n"; + +#[derive(Debug, Clone)] +pub struct Qwen35RendererBuilder { + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + /// When `true`, every non-string tool-call argument is serialised via + /// `serde_json::to_string` instead of Python's `str(...)` rules. 
This + /// is the only behavioural change Qwen3.6 introduces vs Qwen3.5 — + /// kept as a flag here so Qwen3.6 is a config delta, not a code + /// duplicate. + args_as_json: bool, +} + +impl Default for Qwen35RendererBuilder { + fn default() -> Self { + Self { + // Big-size model default. The Python shim probes the tokenizer's + // Jinja template to discover the per-model polarity; callers can + // pass an explicit override here. + enable_thinking: true, + preserve_all_thinking: false, + preserve_thinking_between_tool_calls: false, + args_as_json: false, + } + } +} + +impl Qwen35RendererBuilder { + pub fn enable_thinking(mut self, on: bool) -> Self { + self.enable_thinking = on; + self + } + pub fn preserve_all_thinking(mut self, on: bool) -> Self { + self.preserve_all_thinking = on; + self + } + pub fn preserve_thinking_between_tool_calls(mut self, on: bool) -> Self { + self.preserve_thinking_between_tool_calls = on; + self + } + /// Qwen3.6 flag — JSON-serialise every non-string tool argument. + pub fn args_as_json(mut self, on: bool) -> Self { + self.args_as_json = on; + self + } + pub fn build(self, tokenizer: Tokenizer) -> Result<Qwen35Renderer, RenderError> { + Qwen35Renderer::new_with(tokenizer, &self) + } +} + +#[derive(Debug, Clone)] +pub struct Qwen35Renderer { + tokenizer: Tokenizer, + enable_thinking: bool, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, + args_as_json: bool, + + im_start: u32, + im_end: u32, + #[allow(dead_code)] + endoftext: u32, + think: u32, + think_end: u32, + tool_call: u32, + tool_call_end: u32, + tool_response: u32, + tool_response_end: u32, + + // Multimodal placeholder tokens — resolved as optional so the + // text-only Qwen3.5 tokenizers (which don't ship the vision + // specials) still construct cleanly. `as_multimodal()` returns None + // when these are absent. + vision_start: Option<u32>, + vision_end: Option<u32>, + image_pad: Option<u32>, + video_pad: Option<u32>, + /// `[(token_id, modality_marker)]` — 1 = image, 2 = video. 
Empty + /// when this tokenizer doesn't have the vision specials. + mm_token_type_ids: Vec<(u32, u8)>, + + stop_tokens: Vec<u32>, + newline_tokens: Vec<u32>, + double_newline_tokens: Vec<u32>, + user_tokens: Vec<u32>, + user_newline_tokens: Vec<u32>, + system_newline_tokens: Vec<u32>, + assistant_newline_tokens: Vec<u32>, + function_close_newline_tokens: Vec<u32>, + tool_text_cache: ToolTextCache, +} + +impl Qwen35Renderer { + pub fn new(tokenizer: Tokenizer) -> Result<Self, RenderError> { + Qwen35RendererBuilder::default().build(tokenizer) + } + + pub fn builder() -> Qwen35RendererBuilder { + Qwen35RendererBuilder::default() + } + + fn new_with(tokenizer: Tokenizer, cfg: &Qwen35RendererBuilder) -> Result<Self, RenderError> { + let im_start = tokenizer.token_to_id_strict("<|im_start|>")?; + let im_end = tokenizer.token_to_id_strict("<|im_end|>")?; + let endoftext = tokenizer.token_to_id_strict("<|endoftext|>")?; + let think = tokenizer.token_to_id_strict("<think>")?; + let think_end = tokenizer.token_to_id_strict("</think>")?; + let tool_call = tokenizer.token_to_id_strict("<tool_call>")?; + let tool_call_end = tokenizer.token_to_id_strict("</tool_call>")?; + let tool_response = tokenizer.token_to_id_strict("<tool_response>")?; + let tool_response_end = tokenizer.token_to_id_strict("</tool_response>")?; + + // Multimodal tokens are optional — text-only tokenizers (e.g. + // Qwen3.5-9B, no `-VL` suffix) don't ship them. Resolve via + // `token_to_id` (non-strict) so the renderer constructs in both + // cases. 
+ let vision_start = tokenizer.token_to_id("<|vision_start|>"); + let vision_end = tokenizer.token_to_id("<|vision_end|>"); + let image_pad = tokenizer.token_to_id("<|image_pad|>"); + let video_pad = tokenizer.token_to_id("<|video_pad|>"); + let mut mm_token_type_ids: Vec<(u32, u8)> = Vec::new(); + if let Some(p) = image_pad { + mm_token_type_ids.push((p, 1)); + } + if let Some(p) = video_pad { + mm_token_type_ids.push((p, 2)); + } + let newline_tokens = tokenizer.encode_no_special("\n")?.as_slice().to_vec(); + let double_newline_tokens = tokenizer.encode_no_special("\n\n")?.as_slice().to_vec(); + let user_tokens = tokenizer.encode_no_special("user")?.as_slice().to_vec(); + let user_newline_tokens = tokenizer.encode_no_special("user\n")?.as_slice().to_vec(); + let system_newline_tokens = tokenizer.encode_no_special("system\n")?.as_slice().to_vec(); + let assistant_newline_tokens = tokenizer + .encode_no_special("assistant\n")? + .as_slice() + .to_vec(); + let function_close_newline_tokens = tokenizer + .encode_no_special("\n")? + .as_slice() + .to_vec(); + + Ok(Self { + tokenizer, + enable_thinking: cfg.enable_thinking, + preserve_all_thinking: cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls: cfg.preserve_thinking_between_tool_calls, + args_as_json: cfg.args_as_json, + im_start, + im_end, + endoftext, + think, + think_end, + tool_call, + tool_call_end, + tool_response, + tool_response_end, + vision_start, + vision_end, + image_pad, + video_pad, + mm_token_type_ids, + stop_tokens: vec![im_end, endoftext], + newline_tokens, + double_newline_tokens, + user_tokens, + user_newline_tokens, + system_newline_tokens, + assistant_newline_tokens, + function_close_newline_tokens, + tool_text_cache: ToolTextCache::default(), + }) + } + + /// True when the underlying tokenizer ships the vision special + /// tokens. Used by [`Renderer::as_multimodal`]. 
+ pub fn supports_multimodal(&self) -> bool { + self.vision_start.is_some() && self.vision_end.is_some() && self.image_pad.is_some() + } + + /// Text view of message content, matching the Python + /// `Qwen35Renderer._render_content` helper: join text parts and skip + /// media / thinking parts. This is used by the text-only native path + /// so OpenAI-style structured text content is not silently dropped. + fn render_content_text(content: &Content) -> Cow<'_, str> { + match content { + Content::Null => Cow::Borrowed(""), + Content::Text(s) => Cow::Borrowed(s.as_str()), + Content::Parts(parts) => { + let mut out = String::new(); + for part in parts { + if let ContentPart::Text { text } = part { + out.push_str(text); + } + } + Cow::Owned(out) + } + } + } + + /// Index of the most recent non-tool-response user message; + /// `messages.len()` when none — that out-of-range value makes + /// `msg_idx > last_query_index` uniformly false, matching the + /// Python contract. + fn last_query_index(messages: &[Message]) -> i32 { + for (i, msg) in messages.iter().enumerate().rev() { + if msg.role != "user" { + continue; + } + let content = Self::render_content_text(&msg.content); + let content = content.trim(); + if !(content.starts_with("") && content.ends_with("")) { + return i as i32; + } + } + messages.len() as i32 + } + + fn emit_system_with_tools( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + tools: &[ToolSpec], + first_is_system: bool, + ) -> Result<(), RenderError> { + let sys_idx: i32 = if first_is_system { 0 } else { SCAFFOLD_IDX }; + buf.special(self.im_start, sys_idx); + buf.ids(&self.system_newline_tokens, sys_idx); + + let system_content = if first_is_system { + let sys_content = Self::render_content_text(&messages[0].content); + let sys_content = sys_content.trim(); + sys_content.to_string() + } else { + String::new() + }; + let tool_tokens = self.tool_text_cache.get_or_insert_with( + &self.tokenizer, + tools, + u64::from(first_is_system), + 
&system_content, + || { + let mut tool_text = + String::with_capacity(TOOLS_HEADER.len() + TOOLS_INSTRUCTIONS.len() + 256); + tool_text.push_str(TOOLS_HEADER); + for tool in tools { + tool_text.push('\n'); + let spec = tool_spec_template_value(tool); + tool_text.push_str(&to_string_python(&spec).map_err(|e| { + RenderError::Invalid(format!("tool spec serialisation failed: {e}")) + })?); + } + tool_text.push_str(TOOLS_FOOTER); + tool_text.push_str(TOOLS_INSTRUCTIONS); + + if !system_content.is_empty() { + tool_text.push_str("\n\n"); + tool_text.push_str(&system_content); + } + Ok(tool_text) + }, + )?; + buf.ids(tool_tokens.as_slice(), sys_idx); + buf.special(self.im_end, sys_idx); + buf.ids(&self.newline_tokens, sys_idx); + Ok(()) + } + + fn emit_system_no_tools( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + ) -> Result<(), RenderError> { + let content = Self::render_content_text(&messages[0].content); + let content = content.trim(); + buf.special(self.im_start, 0); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, 0)?; + buf.special(self.im_end, 0); + buf.ids(&self.newline_tokens, 0); + Ok(()) + } + + fn emit_user( + &self, + buf: &mut impl TokenSink, + content: &str, + idx: i32, + ) -> Result<(), RenderError> { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("user\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + Ok(()) + } + + fn emit_tool( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + msg_idx: usize, + content: &str, + ) -> Result<(), RenderError> { + let prev_is_tool = msg_idx > 0 && messages[msg_idx - 1].role == "tool"; + let next_is_tool = msg_idx + 1 < messages.len() && messages[msg_idx + 1].role == "tool"; + let idx = msg_idx as i32; + + if !prev_is_tool { + buf.special(self.im_start, idx); + buf.ids(&self.user_tokens, idx); + } 
+ buf.ids(&self.newline_tokens, idx); + buf.special(self.tool_response, idx); + let mut wrapped = String::with_capacity(content.len() + 2); + wrapped.push('\n'); + wrapped.push_str(content); + wrapped.push('\n'); + buf.text(&wrapped, idx)?; + buf.special(self.tool_response_end, idx); + if !next_is_tool { + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + } + Ok(()) + } + + fn render_arg_value(arg_value: &serde_json::Value, args_as_json: bool) -> String { + if args_as_json { + // Qwen3.6: every non-string serialises via serde_json (bools + // become "true"/"false", None becomes "null"). Strings still + // render verbatim — JSON would re-quote them. + match arg_value { + serde_json::Value::String(s) => s.clone(), + _ => serde_json::to_string(arg_value).unwrap_or_default(), + } + } else { + // Qwen3.5: Python's str() rules — dict/list go through JSON, + // bools become "True"/"False", None becomes "None", numbers + // and strings render verbatim. + match arg_value { + serde_json::Value::Object(_) | serde_json::Value::Array(_) => { + serde_json::to_string(arg_value).unwrap_or_default() + } + serde_json::Value::String(s) => s.clone(), + serde_json::Value::Bool(b) => { + if *b { + "True".to_string() + } else { + "False".to_string() + } + } + serde_json::Value::Null => "None".to_string(), + serde_json::Value::Number(n) => n.to_string(), + } + } + } + + fn emit_assistant( + &self, + buf: &mut impl TokenSink, + msg: &Message, + msg_idx: usize, + last_query_index: i32, + preserve_thinking: bool, + ) -> Result<(), RenderError> { + let raw_content = Self::render_content_text(&msg.content); + let (reasoning_content, content_after) = match &msg.reasoning_content { + Some(s) => (s.clone(), raw_content.to_string()), + None => { + if let Some((before, after)) = raw_content.split_once("") { + let reasoning = if let Some((_, inner)) = before.rsplit_once("") { + inner + .trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + } else { + before + 
.trim_start_matches('\n') + .trim_end_matches('\n') + .to_string() + }; + (reasoning, after.trim_start_matches('\n').to_string()) + } else { + (String::new(), raw_content.to_string()) + } + } + }; + let reasoning_content = reasoning_content.trim().to_string(); + let content = content_after.trim().to_string(); + + let idx = msg_idx as i32; + buf.special(self.im_start, idx); + + let emit_thinking = (msg_idx as i32) > last_query_index + || (preserve_thinking && !reasoning_content.is_empty()); + + if emit_thinking { + buf.ids(&self.assistant_newline_tokens, idx); + buf.special(self.think, idx); + let mut s = String::with_capacity(reasoning_content.len() + 2); + s.push('\n'); + s.push_str(&reasoning_content); + s.push('\n'); + buf.text(&s, idx)?; + buf.special(self.think_end, idx); + let mut tail = String::with_capacity(content.len() + 2); + tail.push_str("\n\n"); + tail.push_str(&content); + buf.text(&tail, idx)?; + } else { + let mut s = String::with_capacity(content.len() + 10); + s.push_str("assistant\n"); + s.push_str(&content); + buf.text(&s, idx)?; + } + + for (tc_idx, tc) in msg.tool_calls.iter().enumerate() { + let name = tc.function.name.as_str(); + // Separator before this tool call + if tc_idx == 0 { + if !content.is_empty() { + buf.ids(&self.double_newline_tokens, idx); + } + } else { + buf.ids(&self.newline_tokens, idx); + } + + buf.special(self.tool_call, idx); + let mut payload = String::with_capacity(name.len() + 32); + payload.push_str("\n\n"); + buf.text(&payload, idx)?; + + // Arguments — accept JSON string (decode first) or object + let args_value = match &tc.function.arguments { + ToolArguments::Object(v) => v.clone(), + ToolArguments::Raw(s) => serde_json::from_str(s) + .unwrap_or(serde_json::Value::Object(serde_json::Map::new())), + }; + if let Some(obj) = args_value.as_object() { + for (arg_name, arg_value) in obj { + let value_str = Self::render_arg_value(arg_value, self.args_as_json); + let mut param = String::with_capacity(arg_name.len() + 
value_str.len() + 24); + param.push_str("<parameter="); + param.push_str(arg_name); + param.push_str(">\n"); + param.push_str(&value_str); + param.push_str("\n</parameter>\n"); + buf.text(&param, idx)?; + } + } + + buf.ids(&self.function_close_newline_tokens, idx); + buf.special(self.tool_call_end, idx); + } + + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + Ok(()) + } + + fn emit_generation_prompt(&self, buf: &mut impl TokenSink) { + buf.scaffold_special(self.im_start); + buf.ids(&self.assistant_newline_tokens, SCAFFOLD_IDX); + if self.enable_thinking { + buf.scaffold_special(self.think); + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + } else { + buf.scaffold_special(self.think); + buf.ids(&self.double_newline_tokens, SCAFFOLD_IDX); + buf.scaffold_special(self.think_end); + buf.ids(&self.double_newline_tokens, SCAFFOLD_IDX); + } + } + + fn estimate_capacity(messages: &[Message], tools: Option<&[ToolSpec]>) -> usize { + let base = messages.len().max(1) * 256; + let tools_bonus = tools.map_or(0, |t| 256 * t.len().max(1) + 512); + base + tools_bonus + } + + fn should_batch_encode_text(messages: &[Message], tools: Option<&[ToolSpec]>) -> bool { + messages.len() >= 8 && tools.is_none_or(<[ToolSpec]>::is_empty) + } + + fn render_text_into_buf( + &self, + buf: &mut impl TokenSink, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result<(), RenderError> { + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + + let first_is_system = messages[0].role == "system"; + + match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(buf, messages, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(buf, messages)?; + } + } + } + + let last_qi = Self::last_query_index(messages); + + for (i, msg) in messages.iter().enumerate() { + let content = Self::render_content_text(&msg.content); + let content = content.trim(); + match msg.role.as_str() { + "system" => { + if i != 0 { + return Err(RenderError::Invalid( + "system 
message must be at the beginning".into(), + )); + } + } + "user" => self.emit_user(buf, content, i as i32)?, + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(buf, msg, i, last_qi, preserve_thinking)?; + } + "tool" => self.emit_tool(buf, messages, i, content)?, + _ => { + return Err(RenderError::Invalid(format!( + "unexpected message role: {}", + msg.role + ))); + } + } + } + + if add_generation_prompt { + self.emit_generation_prompt(buf); + } + + Ok(()) + } +} + +impl Renderer for Qwen35Renderer { + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result { + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + self.render_text_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_rendered()) + } + + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + let cap = Self::estimate_capacity(messages, tools); + if Self::should_batch_encode_text(messages, tools) { + let mut buf = TokenPlanBuf::new(&self.tokenizer, cap); + self.render_text_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + buf.into_token_ids() + } else { + let mut buf = RenderBuf::new_token_ids_only(&self.tokenizer, cap); + self.render_text_into_buf(&mut buf, messages, tools, add_generation_prompt)?; + Ok(buf.into_token_ids()) + } + } + + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse { + parse_qwen35( + &self.tokenizer, + token_ids, + &self.stop_tokens, + self.think, + self.think_end, + self.tool_call, + self.tool_call_end, + ) + } + + fn stop_token_ids(&self) -> &[u32] { + &self.stop_tokens + } + + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + 
_tools: Option<&[ToolSpec]>, + ) -> Result, RenderError> { + if previous_prompt_ids.is_empty() + || new_messages.is_empty() + || reject_assistant_in_extension(new_messages) + { + return Ok(None); + } + + let Some(previous_ids) = trim_to_turn_close( + previous_prompt_ids, + previous_completion_ids, + &self.stop_tokens, + Some(self.im_end), + ) else { + return Ok(None); + }; + + let mut buf = RenderBuf::new_token_ids_only( + &self.tokenizer, + Self::estimate_capacity(new_messages, None), + ); + // Trailing newline that the prior render emitted but vLLM stopped on + buf.ids(&self.newline_tokens, SCAFFOLD_IDX); + + for (i, msg) in new_messages.iter().enumerate() { + let content = Self::render_content_text(&msg.content); + let content = content.trim(); + let idx = i as i32; + match msg.role.as_str() { + "user" => self.emit_user(&mut buf, content, idx)?, + "system" => { + buf.special(self.im_start, idx); + let mut s = String::with_capacity(content.len() + 8); + s.push_str("system\n"); + s.push_str(content); + buf.text(&s, idx)?; + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + } + "tool" => self.emit_tool(&mut buf, new_messages, i, content)?, + _ => return Ok(None), + } + } + + self.emit_generation_prompt(&mut buf); + + let ext = buf.into_token_ids(); + let mut out = Vec::with_capacity(previous_ids.len() + ext.len()); + out.extend_from_slice(&previous_ids); + out.extend_from_slice(&ext); + Ok(Some(RenderedTokens { + token_ids: out, + message_indices: Vec::new(), + multi_modal_data: None, + })) + } + + fn as_multimodal(&self) -> Option<&dyn MultimodalRenderer> { + if self.supports_multimodal() { + Some(self) + } else { + None + } + } +} + +// ── Multimodal implementation ───────────────────────────────────────── +// +// Qwen3.5-VL emits the canonical Qwen-style placeholder block per image: +// <|vision_start|> + num_tokens × <|image_pad|> + <|vision_end|> +// +// where `num_tokens` is the pre-computed expansion the caller obtained +// from the HF 
processor (image_grid_thw.prod() / merge_size²). The +// renderer never touches pixel data; `MediaItem::hf_payload` rides +// through as opaque JSON into `MultiModalData::mm_items`. + +impl Qwen35Renderer { + /// Walk the user-message content parts in order, interleaving + /// placeholder spans where Image / Video parts appear. Mirrors the + /// HF chat template's behaviour: text and images appear in the + /// same order the caller listed them in `Content::Parts`. + /// + /// `media` items are consumed positionally — the N-th media item + /// for this message matches the N-th Image/Video part in the + /// content. Mismatched counts return an `Invalid` error. + fn emit_user_with_media( + &self, + buf: &mut RenderBuf<'_>, + msg: &Message, + msg_idx: usize, + media: &MediaBundle, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + let idx = msg_idx as i32; + buf.special(self.im_start, idx); + buf.ids(&self.user_newline_tokens, idx); + + // Gather this message's media items in render order. + let mut media_iter = media + .items + .iter() + .filter_map(|(m, item)| (*m == msg_idx).then_some(item)); + + match &msg.content { + crate::types::Content::Null => { + for item in media_iter.by_ref() { + self.emit_media_item(buf, idx, item, mm)?; + } + } + crate::types::Content::Text(s) => { + // Plain-text user message with attached media: emit + // images first (canonical Qwen-VL shape: + // <|vision_start|>...<|vision_end|>{text}), then text. + for item in media_iter.by_ref() { + self.emit_media_item(buf, idx, item, mm)?; + } + if !s.is_empty() { + buf.text(s.trim(), idx)?; + } + } + crate::types::Content::Parts(parts) => { + use crate::types::ContentPart; + for part in parts { + match part { + ContentPart::Text { text } => { + if !text.is_empty() { + buf.text(text, idx)?; + } + } + ContentPart::Thinking { .. } => { + // Thinking parts shouldn't appear in user + // content — silently skip to match the + // Python implementation's behaviour. 
+ } + ContentPart::Image(_) | ContentPart::Video(_) => { + let item = media_iter.next().ok_or_else(|| { + RenderError::Invalid(format!( + "message {msg_idx} content lists more media parts than the MediaBundle provides" + )) + })?; + self.emit_media_item(buf, idx, item, mm)?; + } + } + } + } + } + + // Reject extra media items in the bundle that didn't get used — + // catches off-by-one errors in caller's bundle construction. + if media_iter.next().is_some() { + return Err(RenderError::Invalid(format!( + "MediaBundle has more items for message {msg_idx} than the content's media parts" + ))); + } + + buf.special(self.im_end, idx); + buf.ids(&self.newline_tokens, idx); + Ok(()) + } + + fn emit_media_item( + &self, + buf: &mut RenderBuf<'_>, + idx: i32, + item: &MediaItem, + mm: &mut MultiModalData, + ) -> Result<(), RenderError> { + let pad = match item.modality { + Modality::Image => self.image_pad, + Modality::Video => self.video_pad, + } + .ok_or_else(|| { + RenderError::MissingSpecialToken(match item.modality { + Modality::Image => "<|image_pad|>".into(), + Modality::Video => "<|video_pad|>".into(), + }) + })?; + let vs = self + .vision_start + .ok_or_else(|| RenderError::MissingSpecialToken("<|vision_start|>".into()))?; + let ve = self + .vision_end + .ok_or_else(|| RenderError::MissingSpecialToken("<|vision_end|>".into()))?; + + buf.special(vs, idx); + let offset = buf.len(); + for _ in 0..item.num_tokens { + buf.special(pad, idx); + } + buf.special(ve, idx); + + // Update MultiModalData. Key by modality string ("image" / + // "video") so the inference engine glue can route per-key. 
+ let key = item.modality.as_str().to_string(); + mm.mm_hashes + .entry(key.clone()) + .or_default() + .push(item.hash.clone()); + mm.mm_placeholders + .entry(key.clone()) + .or_default() + .push(PlaceholderRange { + offset, + length: item.num_tokens, + }); + mm.mm_items + .entry(key) + .or_default() + .push(item.hf_payload.clone()); + Ok(()) + } +} + +impl MultimodalRenderer for Qwen35Renderer { + fn mm_token_type_id_map(&self) -> &[(u32, u8)] { + &self.mm_token_type_ids + } + + fn render_with_media( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + media: &MediaBundle, + add_generation_prompt: bool, + ) -> Result { + // Fast path: no media → defer to the text-only render. + if media.is_empty() { + return self.render(messages, tools, add_generation_prompt); + } + if messages.is_empty() { + return Err(RenderError::EmptyMessages); + } + let mut buf = RenderBuf::new(&self.tokenizer, Self::estimate_capacity(messages, tools)); + let first_is_system = messages[0].role == "system"; + + match tools { + Some(t) if !t.is_empty() => { + self.emit_system_with_tools(&mut buf, messages, t, first_is_system)?; + } + _ => { + if first_is_system { + self.emit_system_no_tools(&mut buf, messages)?; + } + } + } + + let last_qi = Self::last_query_index(messages); + let mut mm = MultiModalData::default(); + + for (i, msg) in messages.iter().enumerate() { + let content = Self::render_content_text(&msg.content); + let content = content.trim(); + match msg.role.as_str() { + "system" => { + if i != 0 { + return Err(RenderError::Invalid( + "system message must be at the beginning".into(), + )); + } + } + "user" => { + // If this message has attached media OR the caller + // provided structured content with image/video parts, + // walk the parts inline so order matches the caller's + // list. Pure text paths bypass the heavier walk. 
+ let has_media = media.items.iter().any(|(idx, _)| *idx == i); + let has_structured = matches!(msg.content, crate::types::Content::Parts(_)); + if has_media || has_structured { + self.emit_user_with_media(&mut buf, msg, i, media, &mut mm)?; + } else { + self.emit_user(&mut buf, content, i as i32)?; + } + } + "assistant" => { + let preserve_thinking = should_preserve_past_thinking( + messages, + i, + self.preserve_all_thinking, + self.preserve_thinking_between_tool_calls, + ); + self.emit_assistant(&mut buf, msg, i, last_qi, preserve_thinking)?; + } + "tool" => self.emit_tool(&mut buf, messages, i, content)?, + _ => { + return Err(RenderError::Invalid(format!( + "unexpected message role: {}", + msg.role + ))); + } + } + } + + if add_generation_prompt { + self.emit_generation_prompt(&mut buf); + } + + let mut out = buf.into_rendered(); + if !mm.is_empty() { + out.multi_modal_data = Some(mm); + } + Ok(out) + } + + fn bridge_to_next_turn_with_media( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: Option<&[ToolSpec]>, + new_media: &MediaBundle, + _previous_multi_modal_data: Option<&MultiModalData>, + ) -> Result, RenderError> { + // Phase 5a scope: bridge ignores media on the new-turn side + // (the prior turn's mm_data is carried forward by the caller's + // glue layer, not by this function). When new_media is + // non-empty, fall back to a full re-render — bridging + // image-bearing turns through a verbatim prefix is fragile + // because placeholder offsets shift if the prior turn was + // truncated mid-image. Phase 5b can revisit. 
+ if !new_media.is_empty() { + return Ok(None); + } + self.bridge_to_next_turn( + previous_prompt_ids, + previous_completion_ids, + new_messages, + tools, + ) + } +} diff --git a/crates/renderers-core/src/families/qwen36.rs b/crates/renderers-core/src/families/qwen36.rs new file mode 100644 index 0000000..de3888c --- /dev/null +++ b/crates/renderers-core/src/families/qwen36.rs @@ -0,0 +1,45 @@ +//! Qwen3.6 renderer. Delta vs Qwen3.5: tool-call arguments serialise +//! through JSON (bools → `true`/`false`, None → `null`, etc.) instead of +//! Python `str()`. Everything else — template structure, parser, +//! tool-call XML, thinking markers, bridge logic — is identical to +//! Qwen3.5, so this is a one-line config delta on the Qwen3.5 builder. +//! +//! Mirrors `renderers/qwen36.py`. + +use crate::families::Qwen35RendererBuilder; + +/// Build a Qwen3.6 renderer. +/// +/// Type alias preserved as a re-export of [`Qwen35Renderer`](crate::families::Qwen35Renderer) +/// — the type system doesn't distinguish them at runtime; they differ +/// only in the `args_as_json` flag. The builder below is the right +/// public surface. +pub use crate::families::Qwen35Renderer as Qwen36Renderer; + +/// Builder for [`Qwen36Renderer`] (a Qwen3.5 with JSON-flavoured tool +/// arguments). 
+#[derive(Debug, Clone, Default)]
+pub struct Qwen36RendererBuilder {
+    // Wrapped Qwen3.5 builder; every option below is forwarded to it.
+    inner: Qwen35RendererBuilder,
+}
+
+impl Qwen36RendererBuilder {
+    /// Forward the `enable_thinking` option to the wrapped Qwen3.5 builder.
+    pub fn enable_thinking(self, on: bool) -> Self {
+        Self {
+            inner: self.inner.enable_thinking(on),
+        }
+    }
+
+    /// Forward the `preserve_all_thinking` option to the wrapped builder.
+    pub fn preserve_all_thinking(self, on: bool) -> Self {
+        Self {
+            inner: self.inner.preserve_all_thinking(on),
+        }
+    }
+
+    /// Forward the `preserve_thinking_between_tool_calls` option to the
+    /// wrapped builder.
+    pub fn preserve_thinking_between_tool_calls(self, on: bool) -> Self {
+        Self {
+            inner: self.inner.preserve_thinking_between_tool_calls(on),
+        }
+    }
+
+    /// Finish the builder. `args_as_json(true)` is always applied before
+    /// delegating to the Qwen3.5 `build`, which is the whole Qwen3.6 delta.
+    ///
+    /// NOTE(review): the generic arguments of the return type were lost in
+    /// this copy of the patch; they are reconstructed here from the
+    /// `Qwen36Renderer` alias above and the crate's `RenderError` — confirm
+    /// against `Qwen35RendererBuilder::build`.
+    pub fn build(
+        self,
+        tokenizer: crate::tokenizer::Tokenizer,
+    ) -> Result<Qwen36Renderer, crate::types::RenderError> {
+        self.inner.args_as_json(true).build(tokenizer)
+    }
+}
diff --git a/crates/renderers-core/src/json.rs b/crates/renderers-core/src/json.rs
new file mode 100644
index 0000000..60f1545
--- /dev/null
+++ b/crates/renderers-core/src/json.rs
@@ -0,0 +1,78 @@
+use std::io;
+
+use serde::Serialize;
+use serde_json::json;
+use serde_json::ser::Formatter;
+
+use crate::types::ToolSpec;
+
+/// Serialize JSON with Python's default `json.dumps(..., ensure_ascii=False)`
+/// separators: `", "` between values and `": "` between keys and values.
+///
+/// Returns the serializer's error if `value` fails to serialize.
+pub(crate) fn to_string_python<T>(value: &T) -> Result<String, serde_json::Error>
+where
+    T: Serialize + ?Sized,
+{
+    let mut out = Vec::new();
+    {
+        // Scope the serializer so its mutable borrow of `out` ends before
+        // the buffer is converted into a `String`.
+        let mut serializer = serde_json::Serializer::with_formatter(&mut out, PythonJsonFormatter);
+        value.serialize(&mut serializer)?;
+    }
+    Ok(String::from_utf8(out).expect("serde_json only writes valid UTF-8"))
+}
+
+/// Build the bare `{name, description, parameters}` object for a tool.
+pub(crate) fn tool_spec_inner_value(tool: &ToolSpec) -> serde_json::Value {
+    json!({
+        "name": &tool.name,
+        "description": &tool.description,
+        "parameters": &tool.parameters,
+    })
+}
+
+/// Wrap the bare tool object in the `{"type": "function", "function": ...}`
+/// envelope.
+pub(crate) fn tool_spec_openai_value(tool: &ToolSpec) -> serde_json::Value {
+    json!({
+        "type": "function",
+        "function": tool_spec_inner_value(tool),
+    })
+}
+
+/// Pick the enveloped or bare form according to `tool.openai_envelope`.
+pub(crate) fn tool_spec_template_value(tool: &ToolSpec) -> serde_json::Value {
+    if tool.openai_envelope {
+        tool_spec_openai_value(tool)
+    } else {
+        tool_spec_inner_value(tool)
+    }
+}
+
+/// `serde_json` formatter that emits `", "` between items and `": "`
+/// between a key and its value; everything else uses the trait defaults.
+#[derive(Debug, Default)]
+struct PythonJsonFormatter;
+
+/// Write the `", "` item separator before every element except the first.
+fn write_item_separator<W>(writer: &mut W, first: bool) -> io::Result<()>
+where
+    W: ?Sized + io::Write,
+{
+    if first {
+        Ok(())
+    } else {
+        writer.write_all(b", ")
+    }
+}
+
+impl Formatter for PythonJsonFormatter {
+    fn begin_array_value<W>(&mut self, writer: &mut W, first: bool) -> io::Result<()>
+    where
+        W: ?Sized + io::Write,
+    {
+        write_item_separator(writer, first)
+    }
+
+    fn begin_object_key<W>(&mut self, writer: &mut W, first: bool) -> io::Result<()>
+    where
+        W: ?Sized + io::Write,
+    {
+        write_item_separator(writer, first)
+    }
+
+    fn begin_object_value<W>(&mut self, writer: &mut W) -> io::Result<()>
+    where
+        W: ?Sized + io::Write,
+    {
+        writer.write_all(b": ")
+    }
+}
diff --git a/crates/renderers-core/src/lib.rs b/crates/renderers-core/src/lib.rs
new file mode 100644
index 0000000..372482b
--- /dev/null
+++ b/crates/renderers-core/src/lib.rs
@@ -0,0 +1,46 @@
+//! `renderers-core` — deterministic message → token rendering for LLM
+//! training and inference.
+//!
+//! This crate is the pure-Rust foundation: data types, the [`Renderer`]
+//! trait, parsing primitives, and per-family renderer implementations.
+//! The Python wrapper lives in `renderers-py`.
+//!
+//! 
# Design at a glance +//! +//! - Messages flow into a [`Renderer`] which emits [`RenderedTokens`] +//! (token ids plus per-token message attribution). +//! - Completion token ids flow back into [`Renderer::parse_response`], +//! which returns a [`ParsedResponse`] with content, optional reasoning, +//! and per-attempt [`ParsedToolCall`] records (success and malformed +//! both surface, distinguished by [`ToolCallParseStatus`]). +//! - Multi-turn rollouts use [`Renderer::bridge_to_next_turn`] to extend +//! the prior turn's token stream byte-for-byte, avoiding re-tokenization +//! drift. +//! +//! The crate is `#![forbid(unsafe_code)]` and aims to keep allocation off +//! the hot path: render buffers grow once, parsing uses a per-call arena, +//! and concrete renderers cache resolved special-token ids at construction. + +#![forbid(unsafe_code)] +#![warn(missing_debug_implementations)] +#![warn(rust_2018_idioms)] + +pub mod bridge; +pub mod emit; +pub mod families; +pub(crate) mod json; +pub mod parsing; +pub mod processing; +pub mod registry; +pub mod thinking; +pub mod tokenizer; +pub(crate) mod tool_cache; +pub mod traits; +pub mod types; + +pub use traits::{MediaResolver, MediaSource, MultimodalRenderer, Renderer}; +pub use types::{ + Content, ContentPart, ImageRef, MediaBundle, MediaItem, Message, Modality, MultiModalData, + ParsedResponse, ParsedToolCall, PlaceholderRange, RenderError, RenderedTokens, SCAFFOLD_IDX, + ToolArguments, ToolCall, ToolCallFunction, ToolCallParseStatus, ToolSpec, VideoRef, +}; diff --git a/crates/renderers-core/src/parsing/deepseek_v3.rs b/crates/renderers-core/src/parsing/deepseek_v3.rs new file mode 100644 index 0000000..e06c9b0 --- /dev/null +++ b/crates/renderers-core/src/parsing/deepseek_v3.rs @@ -0,0 +1,194 @@ +//! `DeepSeek` V3 tool-call parser. Port of +//! `renderers/parsing.py:parse_deepseek_v3` + `_parse_deepseek_tool_calls`. +//! +//! Structural shape: +//! +//! ```text +//! ...content... +//! ...reasoning... +//! 
<|tool▁calls▁begin|>
+//! <|tool▁call▁begin|>function<|tool▁sep|>{name}
+//! ```json
+//! {args}
+//! ```<|tool▁call▁end|>
+//! <|tool▁calls▁end|>
+//! ```
+//!
+//! Thinking is **text tags** (not special tokens) — `DeepSeek` emits
+//! `<think>...</think>` as decoded text. Tool calls are special-token
+//! delimited. The fenced JSON inside is parsed with a small anchored regex.
+
+use std::ops::Range;
+use std::sync::LazyLock;
+
+use regex::Regex;
+
+use crate::parsing::{decode, find, find_from, strip_stop_tokens};
+use crate::tokenizer::Tokenizer;
+use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus};
+
+static JSON_FENCE_RE: LazyLock<Regex> = LazyLock::new(|| {
+    // Anchored at both ends: the whole (trimmed) input must be a single
+    // ```json ... ``` or bare ``` ... ``` fence; group 1 is the fenced body.
+    Regex::new(r"(?s)^```(?:json)?\s*(.*?)\s*```$").expect("json-fence regex")
+});
+
+/// Parse a `DeepSeek` V3 completion into content, reasoning and tool calls.
+///
+/// Tokens from the first stop id onward are dropped. Everything before the
+/// first `tool_calls_begin_id` is decoded as text and split around the
+/// `</think>` text tag; the rest is handed to the tool-call scanner.
+// Paired begin/end token ids (tool_call vs tool_calls, with matching
+// _end suffixes) carry distinct meaning — the singular/plural distinction
+// is the actual semantic. Renaming would obscure the structure.
+#[allow(clippy::too_many_arguments, clippy::similar_names)]
+pub fn parse_deepseek_v3(
+    tokenizer: &Tokenizer,
+    token_ids: &[u32],
+    stop_ids: &[u32],
+    tool_calls_begin_id: u32,
+    tool_calls_end_id: u32,
+    tool_call_begin_id: u32,
+    tool_call_end_id: u32,
+    tool_sep_id: u32,
+) -> ParsedResponse {
+    let ids = strip_stop_tokens(token_ids, stop_ids);
+
+    let (content_ids, tool_calls) = match find(ids, tool_calls_begin_id) {
+        Some(section_start) => {
+            let content = &ids[..section_start];
+            let tcs = parse_deepseek_tool_calls(
+                tokenizer,
+                &ids[section_start..],
+                tool_calls_begin_id,
+                tool_calls_end_id,
+                tool_call_begin_id,
+                tool_call_end_id,
+                tool_sep_id,
+                section_start,
+            );
+            (content, tcs)
+        }
+        None => (ids, Vec::new()),
+    };
+
+    let text = decode(tokenizer, content_ids).unwrap_or_default();
+
+    // Split out `<think>...</think>` from the decoded content. Plain text
+    // tags here (no special tokens — that's the DeepSeek convention).
+    let (reasoning, content) = match text.split_once("</think>") {
+        Some((before, after)) => {
+            let r = before
+                .replace("<think>", "")
+                .trim_matches('\n')
+                .trim()
+                .to_string();
+            let c = after.trim_start_matches('\n').trim().to_string();
+            (Some(r), c)
+        }
+        None => (None, text.trim().to_string()),
+    };
+
+    ParsedResponse {
+        content,
+        reasoning_content: reasoning.filter(|s| !s.is_empty()),
+        tool_calls,
+    }
+}
+
+/// Scan the tool-calls section and emit one record per call block.
+///
+/// `ids` starts at the section opener; `section_offset` is the index of
+/// `ids[0]` in the stop-stripped completion, so every `token_span` is
+/// expressed in completion coordinates.
+#[allow(clippy::too_many_arguments)]
+fn parse_deepseek_tool_calls(
+    tokenizer: &Tokenizer,
+    ids: &[u32],
+    tc_begin_id: u32,
+    tc_end_id: u32,
+    call_begin_id: u32,
+    call_end_id: u32,
+    sep_id: u32,
+    section_offset: usize,
+) -> Vec<ParsedToolCall> {
+    let mut out: Vec<ParsedToolCall> = Vec::new();
+
+    let Some(section_start) = find(ids, tc_begin_id) else {
+        return out;
+    };
+    // A missing section terminator means the section runs to the end.
+    let section_end = find_from(ids, tc_end_id, section_start + 1).unwrap_or(ids.len());
+    let inner_offset = section_offset + section_start + 1;
+    let section_ids = &ids[section_start + 1..section_end];
+
+    let mut i = 0usize;
+    while i < section_ids.len() {
+        if section_ids[i] != call_begin_id {
+            i += 1;
+            continue;
+        }
+        let (end, unclosed) = match find_from(section_ids, call_end_id, i + 1) {
+            Some(end) => (end, false),
+            None => (section_ids.len(), true),
+        };
+        let call_ids = &section_ids[i + 1..end];
+        let block_text = decode(tokenizer, call_ids).unwrap_or_default();
+        // The span includes the closing token only when one was found.
+        let span = Range {
+            start: inner_offset + i,
+            end: inner_offset + end + usize::from(!unclosed),
+        };
+
+        let Some(sep_pos) = find(call_ids, sep_id) else {
+            out.push(ParsedToolCall {
+                raw: block_text,
+                token_span: Some(span),
+                status: ToolCallParseStatus::MalformedStructure,
+                ..Default::default()
+            });
+            i = end + 1;
+            continue;
+        };
+
+        let after_sep = decode(tokenizer, &call_ids[sep_pos + 1..])
+            .unwrap_or_default()
+            .trim()
+            .to_string();
+
+        // First line is the function name; the remainder is the fenced
+        // JSON (or raw text when it is not fenced).
+        let (name, args_str) = match after_sep.find('\n') {
+            Some(nl) => {
+                let n = after_sep[..nl].trim().to_string();
+                let rest = after_sep[nl + 1..].trim();
+                let args = match JSON_FENCE_RE.captures(rest) {
+                    Some(c) => c.get(1).map_or("", |m| m.as_str().trim()).to_string(),
+                    None => rest.to_string(),
+                };
+                (n, args)
+            }
+            None => (after_sep.clone(), String::new()),
+        };
+
+        let mut invalid_json = false;
+        let arguments = if args_str.is_empty() {
+            ToolArguments::Object(serde_json::Value::Object(serde_json::Map::new()))
+        } else if let Ok(v) = serde_json::from_str::<serde_json::Value>(&args_str) {
+            ToolArguments::Object(v)
+        } else {
+            invalid_json = true;
+            ToolArguments::Raw(args_str.clone())
+        };
+
+        let status = if unclosed {
+            ToolCallParseStatus::UnclosedBlock
+        } else if name.is_empty() {
+            ToolCallParseStatus::MissingName
+        } else if invalid_json {
+            ToolCallParseStatus::InvalidJson
+        } else {
+            ToolCallParseStatus::Ok
+        };
+
+        out.push(ParsedToolCall {
+            raw: block_text,
+            name: if name.is_empty() { None } else { Some(name) },
+            arguments: Some(arguments),
+            token_span: Some(span),
+            status,
+            ..Default::default()
+        });
+        i = end + 1;
+        if unclosed {
+            break;
+        }
+    }
+
+    out
+}
diff --git a/crates/renderers-core/src/parsing/glm.rs b/crates/renderers-core/src/parsing/glm.rs
new file mode 100644
index 0000000..152c2dd
--- /dev/null
+++ b/crates/renderers-core/src/parsing/glm.rs
@@ -0,0 +1,223 @@
+//! GLM tool-call parser — covers GLM-5 / GLM-5.1 / GLM-4.5.
+//!
+//! Port of `renderers/parsing.py:parse_glm` + `_parse_glm_tool_calls`.
+//!
+//! Structural shape:
+//!
+//! ```text
+//! <|assistant|>...content...
+//! <think>...reasoning...</think>
+//! <tool_call>fn_name
+//! <arg_key>k1</arg_key><arg_value>v1</arg_value>
+//! <arg_key>k2</arg_key><arg_value>v2</arg_value>
+//! </tool_call>
+//! ```
+//!
+//! Thinking is special-token (`<think>` / `</think>`). Each argument is
+//! a pair of special-token-delimited spans inside the tool-call block.
+//! All scanning is token-id based — no decoded-text regex.
+
+use std::ops::Range;
+
+use crate::parsing::{decode, find, find_from, strip_stop_tokens};
+use crate::tokenizer::Tokenizer;
+use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus};
+
+/// Parse a GLM completion into content, reasoning and tool calls.
+///
+/// Tokens from the first stop id onward are dropped. Reasoning is whatever
+/// precedes the first `think_end_id` (with `think_id` tokens removed); if
+/// only `think_id` is present the reasoning is treated as truncated and
+/// returned with empty content and no tool calls.
+#[allow(clippy::too_many_arguments)]
+pub fn parse_glm(
+    tokenizer: &Tokenizer,
+    token_ids: &[u32],
+    stop_ids: &[u32],
+    think_id: u32,
+    think_end_id: u32,
+    tool_call_id: u32,
+    tool_call_end_id: u32,
+    arg_key_id: u32,
+    arg_key_end_id: u32,
+    arg_value_id: u32,
+    arg_value_end_id: u32,
+) -> ParsedResponse {
+    let stripped = strip_stop_tokens(token_ids, stop_ids);
+
+    // Thinking — find by token id.
+    let mut reasoning: Option<String> = None;
+    // Index in `stripped` where `ids` begins, so spans stay in completion
+    // coordinates.
+    let mut parse_offset = 0usize;
+    let ids: &[u32] = if let Some(think_end) = find(stripped, think_end_id) {
+        let reasoning_ids: Vec<u32> = stripped[..think_end]
+            .iter()
+            .copied()
+            .filter(|&t| t != think_id)
+            .collect();
+        let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default();
+        reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty());
+        parse_offset = think_end + 1;
+        &stripped[think_end + 1..]
+    } else {
+        // Truncated reasoning — `<think>` without `</think>`.
+        if let Some(think_start) = find(stripped, think_id) {
+            let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default();
+            return ParsedResponse {
+                content: String::new(),
+                reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()),
+                tool_calls: Vec::new(),
+            };
+        }
+        stripped
+    };
+
+    let (content_text, tool_calls) = match find(ids, tool_call_id) {
+        Some(tc_start) => {
+            let content = decode(tokenizer, &ids[..tc_start])
+                .unwrap_or_default()
+                .trim()
+                .to_string();
+            let tcs = parse_glm_tool_calls(
+                tokenizer,
+                &ids[tc_start..],
+                tool_call_id,
+                tool_call_end_id,
+                arg_key_id,
+                arg_key_end_id,
+                arg_value_id,
+                arg_value_end_id,
+                parse_offset + tc_start,
+            );
+            (content, tcs)
+        }
+        None => (
+            decode(tokenizer, ids)
+                .unwrap_or_default()
+                .trim()
+                .to_string(),
+            Vec::new(),
+        ),
+    };
+
+    ParsedResponse {
+        content: content_text,
+        reasoning_content: reasoning,
+        tool_calls,
+    }
+}
+
+// Abbreviated arg-key/arg-value begin/end ids (ak/ake/av/ave) are tight
+// pairs by design — the abbreviations keep call sites readable, and the
+// surface fn (parse_glm) uses full names.
+//
+// Scans `ids` for tool-call blocks and emits one record per block.
+// `section_offset` is the index of `ids[0]` in the stop-stripped
+// completion; every `token_span` is reported in those coordinates.
+#[allow(clippy::too_many_arguments, clippy::similar_names)]
+fn parse_glm_tool_calls(
+    tokenizer: &Tokenizer,
+    ids: &[u32],
+    tc_id: u32,
+    tc_end_id: u32,
+    ak_id: u32,
+    ake_id: u32,
+    av_id: u32,
+    ave_id: u32,
+    section_offset: usize,
+) -> Vec<ParsedToolCall> {
+    let mut out: Vec<ParsedToolCall> = Vec::new();
+    let mut i = 0usize;
+
+    while i < ids.len() {
+        if ids[i] != tc_id {
+            i += 1;
+            continue;
+        }
+        let span_start = section_offset + i;
+
+        // No closing token: record the remainder as unclosed and stop.
+        let Some(end) = find_from(ids, tc_end_id, i + 1) else {
+            let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default();
+            out.push(ParsedToolCall {
+                raw,
+                token_span: Some(Range {
+                    start: span_start,
+                    end: section_offset + ids.len(),
+                }),
+                status: ToolCallParseStatus::UnclosedBlock,
+                ..Default::default()
+            });
+            break;
+        };
+
+        let block = &ids[i + 1..end];
+        let block_text = decode(tokenizer, block).unwrap_or_default();
+        let span = Range {
+            start: span_start,
+            end: section_offset + end + 1,
+        };
+
+        let first_ak = find(block, ak_id);
+        let mut arguments = serde_json::Map::new();
+        let mut any_json_fallback = false;
+        let mut structure_broke = false;
+        let name = match first_ak {
+            // No arguments: the whole block is the name. `block_text` is
+            // already the decode of `block`, so reuse it.
+            None => block_text.trim().to_string(),
+            Some(first) => {
+                let n = decode(tokenizer, &block[..first])
+                    .unwrap_or_default()
+                    .trim()
+                    .to_string();
+                let mut j = first;
+                while j < block.len() {
+                    if block[j] != ak_id {
+                        j += 1;
+                        continue;
+                    }
+                    // Each argument needs key-end, value-begin and
+                    // value-end, in that order; any gap aborts the block.
+                    let Some(ake) = find_from(block, ake_id, j + 1) else {
+                        structure_broke = true;
+                        break;
+                    };
+                    let key = decode(tokenizer, &block[j + 1..ake])
+                        .unwrap_or_default()
+                        .trim()
+                        .to_string();
+                    let Some(av) = find_from(block, av_id, ake + 1) else {
+                        structure_broke = true;
+                        break;
+                    };
+                    let Some(ave) = find_from(block, ave_id, av + 1) else {
+                        structure_broke = true;
+                        break;
+                    };
+                    let val_text = decode(tokenizer, &block[av + 1..ave])
+                        .unwrap_or_default()
+                        .trim()
+                        .to_string();
+                    // Values that are not valid JSON are kept as strings.
+                    let val = if let Ok(v) = serde_json::from_str::<serde_json::Value>(&val_text) {
+                        v
+                    } else {
+                        any_json_fallback = true;
+                        serde_json::Value::String(val_text)
+                    };
+                    arguments.insert(key, val);
+                    j = ave + 1;
+                }
+                n
+            }
+        };
+
+        let status = if name.is_empty() {
+            ToolCallParseStatus::MissingName
+        } else if structure_broke {
+            ToolCallParseStatus::MalformedStructure
+        } else if any_json_fallback {
+            ToolCallParseStatus::InvalidJson
+        } else {
+            ToolCallParseStatus::Ok
+        };
+
+        out.push(ParsedToolCall {
+            raw: block_text,
+            name: if name.is_empty() { None } else { Some(name) },
+            arguments: Some(ToolArguments::Object(serde_json::Value::Object(arguments))),
+            token_span: Some(span),
+            status,
+            ..Default::default()
+        });
+        i = end + 1;
+    }
+    out
+}
diff --git a/crates/renderers-core/src/parsing/kimi_k2.rs b/crates/renderers-core/src/parsing/kimi_k2.rs
new file mode 100644
index 0000000..2deca32
--- /dev/null
+++ b/crates/renderers-core/src/parsing/kimi_k2.rs
@@ -0,0 +1,179 @@
+//! Kimi K2 tool-call parser. Port of
+//! `renderers/parsing.py:parse_kimi_k2` + `_parse_kimi_k2_tool_calls`.
+//!
+//! Structural shape:
+//!
+//! ```text
+//! ...content with optional <think>...</think> text tags...
+//! <|tool_calls_section_begin|>
+//! <|tool_call_begin|>{id}<|tool_call_argument_begin|>{json_args}<|tool_call_end|>
+//! ...
+//! <|tool_calls_section_end|>
+//! ```
+//!
+//! `{id}` is `functions.{name}:{index}`. The parser strips the
+//! `functions.` prefix and `:index` suffix to recover the function name.
+
+use std::ops::Range;
+
+use crate::parsing::{decode, find, find_from, strip_stop_tokens};
+use crate::tokenizer::Tokenizer;
+use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus};
+
+/// Parse a Kimi K2 completion into content, reasoning and tool calls.
+///
+/// Tokens from the first stop id onward are dropped. Everything before the
+/// tool-calls section is decoded and split around the `</think>` text tag.
+/// A `<think>` with no closing tag is treated as truncated reasoning and
+/// returned with empty content and no tool calls.
+#[allow(clippy::too_many_arguments)]
+pub fn parse_kimi_k2(
+    tokenizer: &Tokenizer,
+    token_ids: &[u32],
+    stop_ids: &[u32],
+    tool_calls_section_begin_id: u32,
+    tool_calls_section_end_id: u32,
+    tool_call_begin_id: u32,
+    tool_call_argument_begin_id: u32,
+    tool_call_end_id: u32,
+) -> ParsedResponse {
+    let ids = strip_stop_tokens(token_ids, stop_ids);
+
+    let (content_ids, tool_calls) = match find(ids, tool_calls_section_begin_id) {
+        Some(section_start) => {
+            let content = &ids[..section_start];
+            // A missing section terminator means the section runs to the end.
+            let section_end =
+                find_from(ids, tool_calls_section_end_id, section_start + 1).unwrap_or(ids.len());
+            let section_ids = &ids[section_start + 1..section_end];
+            let tcs = parse_kimi_k2_calls(
+                tokenizer,
+                section_ids,
+                tool_call_begin_id,
+                tool_call_argument_begin_id,
+                tool_call_end_id,
+                section_start + 1,
+            );
+            (content, tcs)
+        }
+        None => (ids, Vec::new()),
+    };
+
+    let text = decode(tokenizer, content_ids).unwrap_or_default();
+    let (reasoning, content) = if let Some((before, after)) = text.split_once("</think>") {
+        let raw = before.replacen("<think>", "", 1);
+        let r = raw.trim_matches('\n').trim().to_string();
+        let c = after.trim_matches('\n').to_string();
+        (Some(r).filter(|s| !s.is_empty()), c)
+    } else {
+        if let Some(think_at) = text.find("<think>") {
+            // Truncated thinking — `<think>` with no closing tag.
+            let raw = &text[think_at + "<think>".len()..];
+            let r = raw.trim_matches('\n').trim().to_string();
+            return ParsedResponse {
+                content: String::new(),
+                reasoning_content: Some(r).filter(|s| !s.is_empty()),
+                tool_calls: Vec::new(),
+            };
+        }
+        (None, text)
+    };
+
+    ParsedResponse {
+        content: content.trim().to_string(),
+        reasoning_content: reasoning,
+        tool_calls,
+    }
+}
+
+/// Scan the inside of the tool-calls section and emit one record per call.
+///
+/// `section_offset` is the index of `ids[0]` in the stop-stripped
+/// completion; every `token_span` is reported in those coordinates.
+fn parse_kimi_k2_calls(
+    tokenizer: &Tokenizer,
+    ids: &[u32],
+    tc_begin_id: u32,
+    tc_arg_begin_id: u32,
+    tc_end_id: u32,
+    section_offset: usize,
+) -> Vec<ParsedToolCall> {
+    let mut out: Vec<ParsedToolCall> = Vec::new();
+    let mut i = 0usize;
+
+    while i < ids.len() {
+        if ids[i] != tc_begin_id {
+            i += 1;
+            continue;
+        }
+        // Without an argument-begin marker there is no id/args boundary.
+        let Some(arg_begin) = find_from(ids, tc_arg_begin_id, i + 1) else {
+            let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default();
+            out.push(ParsedToolCall {
+                raw,
+                token_span: Some(Range {
+                    start: section_offset + i,
+                    end: section_offset + ids.len(),
+                }),
+                status: ToolCallParseStatus::MalformedStructure,
+                ..Default::default()
+            });
+            break;
+        };
+
+        let (tc_end, unclosed) = match find_from(ids, tc_end_id, arg_begin + 1) {
+            Some(v) => (v, false),
+            None => (ids.len(), true),
+        };
+
+        let raw_id = decode(tokenizer, &ids[i + 1..arg_begin])
+            .unwrap_or_default()
+            .trim()
+            .to_string();
+        let args_str = decode(tokenizer, &ids[arg_begin + 1..tc_end])
+            .unwrap_or_default()
+            .trim()
+            .to_string();
+        let block_text = decode(tokenizer, &ids[i + 1..tc_end]).unwrap_or_default();
+        // The span includes the closing token only when one was found.
+        let span = Range {
+            start: section_offset + i,
+            end: section_offset + tc_end + usize::from(!unclosed),
+        };
+
+        // Extract function name from "functions.{name}:{index}"
+        let name_part = raw_id.split(':').next().unwrap_or("");
+        let func_name = if let Some((_, n)) = name_part.split_once('.') {
+            n.to_string()
+        } else {
+            name_part.to_string()
+        };
+
+        let mut invalid_json = false;
+        let arguments = if let Ok(v) = serde_json::from_str::<serde_json::Value>(&args_str) {
+            ToolArguments::Object(v)
+        } else {
+            invalid_json = true;
+            ToolArguments::Raw(args_str.clone())
+        };
+
+        let status = if unclosed {
+            ToolCallParseStatus::UnclosedBlock
+        } else if func_name.is_empty() {
+            ToolCallParseStatus::MissingName
+        } else if invalid_json {
+            ToolCallParseStatus::InvalidJson
+        } else {
+            ToolCallParseStatus::Ok
+        };
+
+        out.push(ParsedToolCall {
+            raw: block_text,
+            name: if func_name.is_empty() {
+                None
+            } else {
+                Some(func_name)
+            },
+            arguments: Some(arguments),
+            token_span: Some(span),
+            status,
+            id: if raw_id.is_empty() {
+                None
+            } else {
+                Some(raw_id)
+            },
+        });
+        i = tc_end + 1;
+        if unclosed {
+            break;
+        }
+    }
+    out
+}
diff --git a/crates/renderers-core/src/parsing/minimax.rs b/crates/renderers-core/src/parsing/minimax.rs
new file mode 100644
index 0000000..0cac581
--- /dev/null
+++ b/crates/renderers-core/src/parsing/minimax.rs
@@ -0,0 +1,169 @@
+//! `MiniMax` M2 tool-call parser. Port of
+//! `renderers/parsing.py:parse_minimax`.
+//!
+//! Structural shape:
+//!
+//! ```text
+//! ...content...
+//! <think>...reasoning...</think> (special tokens)
+//! <minimax:tool_call>
+//! <invoke name="fn">
+//! <parameter name="k1">value1</parameter>
+//! <parameter name="k2">value2</parameter>
+//! </invoke>
+//! ...possibly more <invoke> blocks in one wrapper...
+//! </minimax:tool_call>
+//! ```
+//!
+//! Thinking is special-token (`<think>` / `</think>`); the
+//! tool-call block is bounded by special tokens but the inner
+//! `<invoke>` / `<parameter>` structure is parsed by regex on the
+//! decoded span.
+
+use std::ops::Range;
+use std::sync::LazyLock;
+
+use regex::Regex;
+
+use crate::parsing::{decode, find, find_from, strip_stop_tokens};
+use crate::tokenizer::Tokenizer;
+use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus};
+
+// NOTE(review): the tag text of both patterns was lost in this copy of the
+// patch and is reconstructed from how the captures are used below (group 1
+// = name, group 2 = body) — verify against `renderers/parsing.py`.
+static INVOKE_RE: LazyLock<Regex> = LazyLock::new(|| {
+    Regex::new(r#"(?s)<invoke name="([^"]+)">(.*?)</invoke>"#).expect("invoke regex")
+});
+static PARAMETER_RE: LazyLock<Regex> = LazyLock::new(|| {
+    Regex::new(r#"(?s)<parameter name="([^"]+)">(.*?)</parameter>"#).expect("parameter regex")
+});
+
+/// Parse a `MiniMax` M2 completion into content, reasoning and tool calls.
+///
+/// Tokens from the first stop id onward are dropped. Reasoning is whatever
+/// precedes the first `think_end_id`; if only `think_id` is present the
+/// reasoning is treated as truncated and returned on its own. Each
+/// tool-call wrapper may hold several invoke blocks; every invoke becomes
+/// its own record sharing the wrapper's `raw` text and `token_span`.
+#[allow(clippy::too_many_arguments)]
+pub fn parse_minimax(
+    tokenizer: &Tokenizer,
+    token_ids: &[u32],
+    stop_ids: &[u32],
+    think_id: u32,
+    think_end_id: u32,
+    tool_call_id: u32,
+    tool_call_end_id: u32,
+) -> ParsedResponse {
+    let stripped = strip_stop_tokens(token_ids, stop_ids);
+
+    // Thinking
+    let mut reasoning: Option<String> = None;
+    // Index in `stripped` where `ids` begins, so spans stay in completion
+    // coordinates.
+    let mut parse_offset = 0usize;
+    let ids: &[u32] = if let Some(think_end) = find(stripped, think_end_id) {
+        let reasoning_ids: Vec<u32> = stripped[..think_end]
+            .iter()
+            .copied()
+            .filter(|&t| t != think_id)
+            .collect();
+        let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default();
+        reasoning = Some(txt.trim().to_string()).filter(|s| !s.is_empty());
+        parse_offset = think_end + 1;
+        &stripped[think_end + 1..]
+    } else {
+        if let Some(think_start) = find(stripped, think_id) {
+            let txt = decode(tokenizer, &stripped[think_start + 1..]).unwrap_or_default();
+            return ParsedResponse {
+                content: String::new(),
+                reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()),
+                tool_calls: Vec::new(),
+            };
+        }
+        // Borrow the stripped slice directly; no copy is needed.
+        stripped
+    };
+
+    let mut tool_calls: Vec<ParsedToolCall> = Vec::new();
+    let content_text = match find(ids, tool_call_id) {
+        None => decode(tokenizer, ids)
+            .unwrap_or_default()
+            .trim()
+            .to_string(),
+        Some(tc_start) => {
+            let content = decode(tokenizer, &ids[..tc_start])
+                .unwrap_or_default()
+                .trim()
+                .to_string();
+            let mut i = tc_start;
+            while i < ids.len() {
+                if ids[i] != tool_call_id {
+                    i += 1;
+                    continue;
+                }
+                let span_start = parse_offset + i;
+
+                let Some(end) = find_from(ids, tool_call_end_id, i + 1) else {
+                    let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default();
+                    tool_calls.push(ParsedToolCall {
+                        raw,
+                        token_span: Some(Range {
+                            start: span_start,
+                            end: parse_offset + ids.len(),
+                        }),
+                        status: ToolCallParseStatus::UnclosedBlock,
+                        ..Default::default()
+                    });
+                    break;
+                };
+                let block_text = decode(tokenizer, &ids[i + 1..end]).unwrap_or_default();
+                let span = Range {
+                    start: span_start,
+                    end: parse_offset + end + 1,
+                };
+
+                let invokes: Vec<_> = INVOKE_RE.captures_iter(&block_text).collect();
+                if invokes.is_empty() {
+                    tool_calls.push(ParsedToolCall {
+                        raw: block_text,
+                        token_span: Some(span),
+                        status: ToolCallParseStatus::MalformedStructure,
+                        ..Default::default()
+                    });
+                } else {
+                    for inv in invokes {
+                        let name = inv.get(1).map_or("", |m| m.as_str());
+                        let body = inv.get(2).map_or("", |m| m.as_str());
+                        let mut arguments = serde_json::Map::new();
+                        let mut any_json_fallback = false;
+                        for pm in PARAMETER_RE.captures_iter(body) {
+                            let pname = pm.get(1).map_or("", |m| m.as_str());
+                            let pval = pm.get(2).map_or("", |m| m.as_str().trim());
+                            // Values that are not valid JSON are kept as strings.
+                            let v = if let Ok(v) = serde_json::from_str::<serde_json::Value>(pval) {
+                                v
+                            } else {
+                                any_json_fallback = true;
+                                serde_json::Value::String(pval.to_string())
+                            };
+                            arguments.insert(pname.to_string(), v);
+                        }
+                        let status = if any_json_fallback {
+                            ToolCallParseStatus::InvalidJson
+                        } else {
+                            ToolCallParseStatus::Ok
+                        };
+                        tool_calls.push(ParsedToolCall {
+                            raw: block_text.clone(),
+                            name: Some(name.to_string()),
+                            arguments: Some(ToolArguments::Object(serde_json::Value::Object(
+                                arguments,
+                            ))),
+                            token_span: Some(span.clone()),
+                            status,
+                            ..Default::default()
+                        });
+                    }
+                }
+                i = end + 1;
+            }
+            content
+        }
+    };
+
+    ParsedResponse {
+        content: content_text,
+        reasoning_content: reasoning,
+        tool_calls,
+    }
+}
diff --git a/crates/renderers-core/src/parsing/mod.rs b/crates/renderers-core/src/parsing/mod.rs
new file mode 100644
index 0000000..a2fa815
--- /dev/null
+++ b/crates/renderers-core/src/parsing/mod.rs
@@ -0,0 +1,64 @@
+//! Token-level parsing primitives shared across family-specific parsers.
+//!
+//! The strategy matches the Python implementation: scan token ids for
+//! special-token boundaries (no decoded-text regex on the full stream),
+//! then decode only inside the bounded segments. This is the only way to
+//! avoid false positives from content that happens to look like a
+//! special token.
+//!
+//! All helpers operate on `&[u32]` slices and are `#[inline]`-marked so
+//! they vanish into the family parsers at -O.
+
+pub mod deepseek_v3;
+pub mod glm;
+pub mod kimi_k2;
+pub mod minimax;
+pub mod qwen3;
+pub mod qwen35;
+
+use crate::tokenizer::Tokenizer;
+use crate::types::RenderError;
+
+/// Find the first index of `target` in `ids`, or `None`.
+#[inline]
+pub fn find(ids: &[u32], target: u32) -> Option<usize> {
+    ids.iter().position(|&x| x == target)
+}
+
+/// Find the first index of `target` in `ids[start..]`, or `None`.
+/// The returned index is relative to `ids`, not to `start`. A `start`
+/// past the end of `ids` yields `None` instead of panicking.
+#[inline]
+pub fn find_from(ids: &[u32], target: u32, start: usize) -> Option<usize> {
+    // `get` returns `None` for an out-of-range start, where direct
+    // indexing (`ids[start..]`) would panic.
+    ids.get(start..)?
+        .iter()
+        .position(|&x| x == target)
+        .map(|i| i + start)
+}
+
+/// Find the first index of any token in `targets`, or `None`. `targets`
+/// is small (≤ a few) for every renderer, so a linear contains-check is
+/// faster than a `HashSet`.
+#[inline]
+pub fn find_any(ids: &[u32], targets: &[u32]) -> Option<usize> {
+    ids.iter().position(|x| targets.contains(x))
+}
+
+/// Truncate `ids` at the first stop token. Returns the prefix as a
+/// borrowed slice — no allocation. The stop token itself is excluded.
+#[inline]
+pub fn strip_stop_tokens<'a>(ids: &'a [u32], stop_ids: &[u32]) -> &'a [u32] {
+    match find_any(ids, stop_ids) {
+        Some(i) => &ids[..i],
+        None => ids,
+    }
+}
+
+/// Decode `ids` via `tokenizer.decode(ids, skip_special_tokens=False)`.
+/// Returns an empty string for empty input without calling the
+/// tokenizer (saves an FFI-free but still measurable ~µs per call).
+#[inline]
+pub fn decode(tokenizer: &Tokenizer, ids: &[u32]) -> Result<String, RenderError> {
+    if ids.is_empty() {
+        return Ok(String::new());
+    }
+    tokenizer.decode(ids)
+}
diff --git a/crates/renderers-core/src/parsing/qwen3.rs b/crates/renderers-core/src/parsing/qwen3.rs
new file mode 100644
index 0000000..8af0d00
--- /dev/null
+++ b/crates/renderers-core/src/parsing/qwen3.rs
@@ -0,0 +1,150 @@
+//! Qwen3 tool-call parser — Hermes-style JSON tool calls.
+//!
+//! Port of `renderers/parsing.py:parse_qwen3`. The structural shape is:
+//!
+//! ```text
+//! ...content tokens...
+//! <tool_call>
+//! { "name": "fn", "arguments": { ... } }
+//! </tool_call>
+//! ...possibly more blocks...
+//! ```
+//!
+//! Reasoning (`<think>...</think>`) is emitted as plain text by Qwen3
+//! (not special tokens), so it falls out from the decoded content.
+
+use crate::parsing::{decode, find, find_from, strip_stop_tokens};
+use crate::tokenizer::Tokenizer;
+use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus};
+
+/// Parse Qwen3 completion tokens.
`stop_ids` is consulted only to
+/// truncate runaway content past EOS; the parser itself walks the
+/// truncated prefix.
+///
+/// Content is everything before the first `tool_call_id`; each tool-call
+/// block after it yields one record, whether or not it parsed cleanly.
+pub fn parse_qwen3(
+    tokenizer: &Tokenizer,
+    token_ids: &[u32],
+    stop_ids: &[u32],
+    tool_call_id: u32,
+    tool_call_end_id: u32,
+) -> ParsedResponse {
+    let ids = strip_stop_tokens(token_ids, stop_ids);
+
+    let mut tool_calls: Vec<ParsedToolCall> = Vec::new();
+    let first_call = find(ids, tool_call_id);
+    let content_ids = first_call.map_or(ids, |start| &ids[..start]);
+
+    // With no opener the cursor starts at the end and the loop is skipped.
+    let mut i = first_call.unwrap_or(ids.len());
+    while i < ids.len() {
+        if ids[i] != tool_call_id {
+            i += 1;
+            continue;
+        }
+        let Some(end) = find_from(ids, tool_call_end_id, i + 1) else {
+            // No closing delim — runs to end of stripped ids.
+            let raw = decode(tokenizer, &ids[i + 1..])
+                .unwrap_or_default()
+                .trim()
+                .to_string();
+            tool_calls.push(ParsedToolCall {
+                raw,
+                token_span: Some(i..ids.len()),
+                status: ToolCallParseStatus::UnclosedBlock,
+                ..Default::default()
+            });
+            break;
+        };
+
+        let tc_text = decode(tokenizer, &ids[i + 1..end])
+            .unwrap_or_default()
+            .trim()
+            .to_string();
+        // The span covers both delimiters.
+        let span = i..(end + 1);
+        let call = match serde_json::from_str::<serde_json::Value>(&tc_text) {
+            Err(_) => ParsedToolCall {
+                raw: tc_text,
+                token_span: Some(span),
+                status: ToolCallParseStatus::InvalidJson,
+                ..Default::default()
+            },
+            Ok(value) => {
+                let (name, args) = extract_name_and_args(&value);
+                let status = if name.is_empty() {
+                    ToolCallParseStatus::MissingName
+                } else {
+                    ToolCallParseStatus::Ok
+                };
+                ParsedToolCall {
+                    raw: tc_text,
+                    name: Some(name).filter(|n| !n.is_empty()),
+                    arguments: Some(args),
+                    token_span: Some(span),
+                    status,
+                    ..Default::default()
+                }
+            }
+        };
+        tool_calls.push(call);
+        i = end + 1;
+    }
+
+    let text = decode(tokenizer, content_ids).unwrap_or_default();
+    let (reasoning, content) = split_thinking(text);
+
+    ParsedResponse {
+        content: content.trim().to_string(),
+        reasoning_content: reasoning.filter(|s| !s.is_empty()),
+        tool_calls,
+    }
+}
+
+/// Pull `name` (string) and `arguments` (object or whatever the model
+/// emitted) out of a parsed tool-call JSON value. Matches the Python
+/// `parsed.get("name", "")` / `parsed.get("arguments", {})` semantics.
+///
+/// A non-object value yields an empty name and default arguments; a string
+/// `arguments` is kept verbatim as `ToolArguments::Raw`.
+fn extract_name_and_args(value: &serde_json::Value) -> (String, ToolArguments) {
+    let Some(obj) = value.as_object() else {
+        return (String::new(), ToolArguments::default());
+    };
+    let name = obj
+        .get("name")
+        .and_then(|v| v.as_str())
+        .unwrap_or("")
+        .to_string();
+    let args = match obj.get("arguments") {
+        None => ToolArguments::default(),
+        Some(serde_json::Value::String(s)) => ToolArguments::Raw(s.clone()),
+        Some(v) => ToolArguments::Object(v.clone()),
+    };
+    (name, args)
+}
+
+/// Split a decoded text segment around `</think>`. Mirrors the inline
+/// logic at `renderers/parsing.py` for Qwen3 (which has no `<think>` as
+/// special token — reasoning lives in the decoded text).
+///
+/// Returns `(reasoning, content)`; `reasoning` is `None` when the text
+/// holds no `</think>`, in which case the text is returned untouched.
+fn split_thinking(text: String) -> (Option<String>, String) {
+    if let Some((before, after)) = text.split_once("</think>") {
+        let reasoning = before
+            .replace("<think>", "")
+            .trim_matches('\n')
+            .trim()
+            .to_string();
+        let content = after.trim_matches('\n').to_string();
+        (Some(reasoning), content)
+    } else {
+        (None, text)
+    }
+}
diff --git a/crates/renderers-core/src/parsing/qwen35.rs b/crates/renderers-core/src/parsing/qwen35.rs
new file mode 100644
index 0000000..cee6378
--- /dev/null
+++ b/crates/renderers-core/src/parsing/qwen35.rs
@@ -0,0 +1,199 @@
+//! Qwen3.5 tool-call parser — XML-style tool calls with special-token thinking.
+//!
+//! Port of `renderers/parsing.py:parse_qwen35` + `_parse_xml_tool_calls`.
+//!
+//! Structural shape:
+//!
+//! ```text
+//! <think>
+//! ...reasoning text...
+//! </think>
+//!
+//! ...content text...
+//!
+//! <tool_call>
+//! <function=name>
+//! <parameter=key1>
+//! value1
+//! </parameter>
+//! <parameter=key2>
+//! value2
+//! </parameter>
+//! </function>
+//! </tool_call>
+//! ```
+//!
+//! 
`` and `` are special tokens. Tool-call block contents are +//! parsed by regex on the decoded text — but the regex only runs inside the +//! bounded `...` span, never on the full completion. + +use std::ops::Range; +use std::sync::LazyLock; + +use regex::Regex; + +use crate::parsing::{decode, find, find_from, strip_stop_tokens}; +use crate::tokenizer::Tokenizer; +use crate::types::{ParsedResponse, ParsedToolCall, ToolArguments, ToolCallParseStatus}; + +static FUNCTION_NAME_RE: LazyLock = + LazyLock::new(|| Regex::new(r"]+)>").expect("function-name regex")); + +static PARAMETER_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"(?s)]+)>\n?(.*?)\n?").expect("parameter regex") +}); + +#[allow(clippy::too_many_arguments)] +pub fn parse_qwen35( + tokenizer: &Tokenizer, + token_ids: &[u32], + stop_ids: &[u32], + think_id: u32, + think_end_id: u32, + tool_call_id: u32, + tool_call_end_id: u32, +) -> ParsedResponse { + let ids = strip_stop_tokens(token_ids, stop_ids); + + // ── Thinking: find by token ID ───────────────────────── + let mut reasoning: Option = None; + let mut parse_offset: usize = 0; + let ids_after_think: &[u32] = if let Some(think_end) = find(ids, think_end_id) { + // Filter out think_id tokens from the reasoning span so the + // decoded text doesn't include the opening marker. + let reasoning_ids: Vec = ids[..think_end] + .iter() + .copied() + .filter(|&t| t != think_id) + .collect(); + let txt = decode(tokenizer, &reasoning_ids).unwrap_or_default(); + reasoning = Some(txt.trim().to_string()); + parse_offset = think_end + 1; + &ids[think_end + 1..] + } else { + // present but no — truncated reasoning; + // return early with reasoning-only response. 
+ if let Some(think_start) = find(ids, think_id) { + let txt = decode(tokenizer, &ids[think_start + 1..]).unwrap_or_default(); + return ParsedResponse { + content: String::new(), + reasoning_content: Some(txt.trim().to_string()).filter(|s| !s.is_empty()), + tool_calls: Vec::new(), + }; + } + ids + }; + + // ── Tool calls (token-bounded, regex-on-decoded-span) ─────────── + let (content_text, tool_calls) = if let Some(tc_start) = find(ids_after_think, tool_call_id) { + let content = decode(tokenizer, &ids_after_think[..tc_start]) + .unwrap_or_default() + .trim() + .to_string(); + let tcs = parse_xml_tool_calls( + tokenizer, + &ids_after_think[tc_start..], + tool_call_id, + tool_call_end_id, + parse_offset + tc_start, + ); + (content, tcs) + } else { + let content = decode(tokenizer, ids_after_think) + .unwrap_or_default() + .trim() + .to_string(); + (content, Vec::new()) + }; + + ParsedResponse { + content: content_text, + reasoning_content: reasoning.filter(|s| !s.is_empty()), + tool_calls, + } +} + +fn parse_xml_tool_calls( + tokenizer: &Tokenizer, + ids: &[u32], + tc_id: u32, + tc_end_id: u32, + section_offset: usize, +) -> Vec { + let mut out: Vec = Vec::new(); + let mut i = 0usize; + + while i < ids.len() { + if ids[i] != tc_id { + i += 1; + continue; + } + let span_start = section_offset + i; + + let Some(end) = find_from(ids, tc_end_id, i + 1) else { + let raw = decode(tokenizer, &ids[i + 1..]).unwrap_or_default(); + out.push(ParsedToolCall { + raw, + token_span: Some(Range { + start: span_start, + end: section_offset + ids.len(), + }), + status: ToolCallParseStatus::UnclosedBlock, + ..Default::default() + }); + break; + }; + + let block_text = decode(tokenizer, &ids[i + 1..end]).unwrap_or_default(); + let span = Range { + start: span_start, + end: section_offset + end + 1, + }; + + let Some(name_match) = FUNCTION_NAME_RE.captures(&block_text) else { + out.push(ParsedToolCall { + raw: block_text, + token_span: Some(span), + status: 
ToolCallParseStatus::MalformedStructure, + ..Default::default() + }); + i = end + 1; + continue; + }; + let name = name_match + .get(1) + .map(|m| m.as_str().to_string()) + .unwrap_or_default(); + + let mut arguments = serde_json::Map::new(); + let mut any_json_fallback = false; + for pm in PARAMETER_RE.captures_iter(&block_text) { + let arg_name = pm.get(1).map_or("", |m| m.as_str()).to_string(); + let arg_value = pm.get(2).map_or("", |m| m.as_str().trim()); + if let Ok(v) = serde_json::from_str::(arg_value) { + arguments.insert(arg_name, v); + } else { + arguments.insert(arg_name, serde_json::Value::String(arg_value.to_string())); + any_json_fallback = true; + } + } + + let status = if any_json_fallback { + ToolCallParseStatus::InvalidJson + } else { + ToolCallParseStatus::Ok + }; + + out.push(ParsedToolCall { + raw: block_text, + name: if name.is_empty() { None } else { Some(name) }, + arguments: Some(ToolArguments::Object(serde_json::Value::Object(arguments))), + token_span: Some(span), + status, + ..Default::default() + }); + i = end + 1; + } + + out +} diff --git a/crates/renderers-core/src/processing/mod.rs b/crates/renderers-core/src/processing/mod.rs new file mode 100644 index 0000000..7f5f4d8 --- /dev/null +++ b/crates/renderers-core/src/processing/mod.rs @@ -0,0 +1,23 @@ +//! Vision processors — port of the `HuggingFace` image processor pipelines. +//! +//! Phase 5b: actual pixel-data preprocessing in Rust. Decode image bytes, +//! smart-resize, normalise, patch-extract, and produce the tensors the +//! vision encoder consumes — same shape as HF's processors, without +//! crossing back to Python. +//! +//! Currently shipped: +//! +//! - [`qwen3_vl::Qwen3VlImageProcessor`] — covers Qwen2-VL, Qwen3-VL, +//! and Qwen3.5-VL (they share the processor). +//! +//! Future: +//! +//! - Kimi K2.5 — different `smart_resize` defaults and a single-pad +//! placeholder convention (Phase 5b follow-up). +//! 
- Video frame sampling — needs `video-rs` or `ffmpeg-next` (Phase 5c). + +pub mod qwen3_vl; +pub mod resolver; + +pub use qwen3_vl::{CLIP_MEAN, CLIP_STD, ProcessedImage, Qwen3VlImageProcessor}; +pub use resolver::Qwen3VlResolver; diff --git a/crates/renderers-core/src/processing/qwen3_vl.rs b/crates/renderers-core/src/processing/qwen3_vl.rs new file mode 100644 index 0000000..5e9f311 --- /dev/null +++ b/crates/renderers-core/src/processing/qwen3_vl.rs @@ -0,0 +1,313 @@ +//! Vision image processing for Qwen-VL family models (Qwen2-VL, +//! Qwen3-VL, Qwen3.5-VL). +//! +//! Port of the `HuggingFace` `Qwen2VLImageProcessor` / `Qwen3VLImageProcessor` +//! pipeline. Given an image (bytes or decoded RGB), produces: +//! +//! - `pixel_values`: `ndarray::Array2` of shape +//! `(grid_h * grid_w, 3 * temporal_patch_size * patch_size * patch_size)`. +//! This is what the vision encoder consumes. +//! - `image_grid_thw`: `[1, grid_h, grid_w]` — the temporal × height × width +//! patch count. +//! - `num_tokens`: `grid_h * grid_w / (merge_size * merge_size)` — the +//! placeholder count the renderer emits between +//! `<|vision_start|>` and `<|vision_end|>`. +//! +//! # Parity caveat +//! +//! The grid dimensions, `num_tokens`, and tensor shape match HF exactly. +//! The pixel values themselves use the `image` crate's bicubic +//! (`CatmullRom`) resize, which differs from PIL's bicubic in the last +//! few decimals — typical RMS difference ≈ 1e-3 on normalized pixels. +//! Downstream models tolerate this level of noise (it's far below the +//! quantization floor of vision encoders); but if exact pixel parity +//! is required (e.g. for regression tests against PIL-rendered +//! fixtures) keep the Python processor on the path. + +use std::fmt::Write as _; +use std::io::Cursor; + +use ndarray::{Array2, Array3}; +use sha2::{Digest, Sha256}; + +use crate::types::RenderError; + +/// `OpenAI` CLIP normalisation constants — Qwen-VL inherits these. 
+pub const CLIP_MEAN: [f32; 3] = [0.481_454_66, 0.457_827_5, 0.408_210_73]; +pub const CLIP_STD: [f32; 3] = [0.268_629_54, 0.261_302_6, 0.275_777_1]; + +/// Configuration for the Qwen-VL image processor pipeline. +#[derive(Debug, Clone)] +pub struct Qwen3VlImageProcessor { + /// Lower bound on resized pixel count. Default for Qwen2-VL / Qwen3-VL: + /// `56 * 56 = 3136`. Resized images smaller than this get scaled up. + pub min_pixels: u32, + /// Upper bound on resized pixel count. Default: `28*28*1280 = 1_003_520`. + pub max_pixels: u32, + /// Patch size in pixels. Default: 14. + pub patch_size: u32, + /// Temporal patch size — `pixel_values` is duplicated across this + /// axis for static images so the same tensor shape serves images + /// and video frames. Default: 2. + pub temporal_patch_size: u32, + /// Spatial merge factor between vision encoder output and the + /// model's input — placeholders count divides by `merge²`. Default: 2. + pub merge_size: u32, + /// Rescale factor applied before normalisation. Default: 1/255. + pub rescale_factor: f32, + /// Per-channel mean / std for normalisation (after rescale). + pub image_mean: [f32; 3], + pub image_std: [f32; 3], +} + +impl Default for Qwen3VlImageProcessor { + fn default() -> Self { + Self { + min_pixels: 56 * 56, + max_pixels: 28 * 28 * 1280, + patch_size: 14, + temporal_patch_size: 2, + merge_size: 2, + rescale_factor: 1.0 / 255.0, + image_mean: CLIP_MEAN, + image_std: CLIP_STD, + } + } +} + +/// Output of one image's processing run. +#[derive(Debug, Clone)] +pub struct ProcessedImage { + /// Flattened patches: shape (`grid_h` * `grid_w`, channel * temporal * patch²). + pub pixel_values: Array2, + /// `[1, grid_h, grid_w]` — temporal × height × width patch count. + pub image_grid_thw: [u32; 3], + /// `grid_h * grid_w / merge²` — count of placeholder tokens to emit. + pub num_tokens: usize, + /// Stable SHA-256 prefix of the resolved RGB bytes — useful as a + /// cache key. 
+ pub hash: String, +} + +impl Qwen3VlImageProcessor { + /// Compute the resized (height, width) for an input image. Mirrors + /// `transformers.models.qwen2_vl.image_processing_qwen2_vl.smart_resize`. + /// + /// `factor = patch_size * merge_size` (28 by default). + pub fn smart_resize(&self, height: u32, width: u32) -> Result<(u32, u32), RenderError> { + let factor = self.patch_size * self.merge_size; + let (h, w) = (f64::from(height), f64::from(width)); + let max_dim = h.max(w); + let min_dim = h.min(w); + if min_dim == 0.0 { + return Err(RenderError::Invalid("image dimension is zero".into())); + } + if max_dim / min_dim > 200.0 { + return Err(RenderError::Invalid(format!( + "absolute aspect ratio must be smaller than 200, got {:.2}", + max_dim / min_dim + ))); + } + let f = f64::from(factor); + let mut h_bar = (h / f).round() * f; + let mut w_bar = (w / f).round() * f; + + let max_pixels = f64::from(self.max_pixels); + let min_pixels = f64::from(self.min_pixels); + + if h_bar * w_bar > max_pixels { + let beta = ((h * w) / max_pixels).sqrt(); + h_bar = ((h / beta) / f).floor() * f; + w_bar = ((w / beta) / f).floor() * f; + h_bar = h_bar.max(f); + w_bar = w_bar.max(f); + } else if h_bar * w_bar < min_pixels { + let beta = (min_pixels / (h * w)).sqrt(); + h_bar = ((h * beta) / f).ceil() * f; + w_bar = ((w * beta) / f).ceil() * f; + } + // smart_resize math keeps h_bar/w_bar positive (clamped to `f`). + #[allow(clippy::cast_sign_loss)] + Ok((h_bar as u32, w_bar as u32)) + } + + /// Decode arbitrary image bytes (PNG/JPEG/WebP via the `image` + /// crate's auto-detect) to RGB pixel arrays. 
+ pub fn decode(bytes: &[u8]) -> Result { + let reader = image::ImageReader::new(Cursor::new(bytes)) + .with_guessed_format() + .map_err(|e| RenderError::Invalid(format!("image format detection: {e}")))?; + let dynamic = reader + .decode() + .map_err(|e| RenderError::Invalid(format!("image decode: {e}")))?; + Ok(dynamic.to_rgb8()) + } + + /// Hash the resolved RGB bytes — same shape as the Python + /// `_image_hash` so the cache key is comparable. + pub fn hash_rgb(rgb: &image::RgbImage) -> String { + let mut h = Sha256::new(); + h.update(rgb.as_raw()); + h.update(format!("({}, {})", rgb.width(), rgb.height()).as_bytes()); + let digest = h.finalize(); + // Trim to 32 hex chars to match the Python implementation. + let mut hex = String::with_capacity(digest.len() * 2); + for b in &digest { + write!(&mut hex, "{b:02x}").expect("writing to String never fails"); + } + hex[..32].to_string() + } + + /// Process a single decoded RGB image end-to-end. + pub fn process_rgb(&self, rgb: &image::RgbImage) -> Result { + let (orig_w, orig_h) = (rgb.width(), rgb.height()); + let (new_h, new_w) = self.smart_resize(orig_h, orig_w)?; + + // Resize: image crate's CatmullRom is the closest match to PIL's + // bicubic. See module-level docs for the parity caveat. + let resized = + image::imageops::resize(rgb, new_w, new_h, image::imageops::FilterType::CatmullRom); + + // Build a (C=3, H, W) f32 array, normalised. + let (h, w) = (new_h as usize, new_w as usize); + let mut chw = Array3::::zeros((3, h, w)); + for y in 0..h { + for x in 0..w { + let p = resized.get_pixel(x as u32, y as u32); + for c in 0..3 { + let v = f32::from(p[c]) * self.rescale_factor; + chw[(c, y, x)] = (v - self.image_mean[c]) / self.image_std[c]; + } + } + } + + // Patch layout. 
The HF pipeline reshapes to: + // (C, grid_h/merge, merge, patch, grid_w/merge, merge, patch) + // then permutes to: + // (grid_h/merge, grid_w/merge, merge, merge, C, patch, patch) + // then unsqueezes a temporal axis and expands to temporal_patch_size, + // finally flattening to (grid_h*grid_w, C*temporal*patch*patch). + // + // The output layout is (token_idx, feature) where token_idx + // iterates in row-major order over the merged grid: + // token_idx = (m_row * grid_w/merge + m_col) * merge² + mi*merge + mj + // and the feature vector packs (C, temporal, patch, patch) in + // row-major order. + let ps = self.patch_size as usize; + let merge = self.merge_size as usize; + let temporal = self.temporal_patch_size as usize; + let grid_h = h / ps; + let grid_w = w / ps; + if grid_h % merge != 0 || grid_w % merge != 0 { + return Err(RenderError::Invalid(format!( + "resized grid ({grid_h}x{grid_w}) not divisible by merge_size {merge}" + ))); + } + let token_count = grid_h * grid_w; + let feature_len = 3 * temporal * ps * ps; + let mut pixel_values = Array2::::zeros((token_count, feature_len)); + + // Fill: for each token (m_row, m_col, mi, mj), copy the corresponding + // (patch_size × patch_size × 3) sub-block, replicated across the + // temporal axis. 
+ let merged_grid_h = grid_h / merge; + let merged_grid_w = grid_w / merge; + for m_row in 0..merged_grid_h { + for m_col in 0..merged_grid_w { + for mi in 0..merge { + for mj in 0..merge { + let token_idx = ((m_row * merged_grid_w + m_col) * merge + mi) * merge + mj; + // Patch top-left in pixel coordinates: + let py = (m_row * merge + mi) * ps; + let px = (m_col * merge + mj) * ps; + let mut feature_idx = 0usize; + for c in 0..3 { + for _t in 0..temporal { + for dy in 0..ps { + for dx in 0..ps { + pixel_values[(token_idx, feature_idx)] = + chw[(c, py + dy, px + dx)]; + feature_idx += 1; + } + } + } + } + } + } + } + } + + let num_tokens = (grid_h * grid_w) / (merge * merge); + let hash = Self::hash_rgb(rgb); + + Ok(ProcessedImage { + pixel_values, + image_grid_thw: [1, grid_h as u32, grid_w as u32], + num_tokens, + hash, + }) + } + + /// Convenience: decode bytes then process. + pub fn process_bytes(&self, bytes: &[u8]) -> Result { + let rgb = Self::decode(bytes)?; + self.process_rgb(&rgb) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn smart_resize_round_trip() { + let p = Qwen3VlImageProcessor::default(); + let (h, w) = p.smart_resize(480, 640).unwrap(); + // 480*640 = 307_200 → under max_pixels, both align to factor 28 + assert_eq!(h % 28, 0); + assert_eq!(w % 28, 0); + } + + #[test] + fn smart_resize_scales_down_oversized() { + let p = Qwen3VlImageProcessor::default(); + // 4000*3000 = 12M pixels — must scale down + let (h, w) = p.smart_resize(4000, 3000).unwrap(); + assert!(h * w <= p.max_pixels); + assert_eq!(h % 28, 0); + assert_eq!(w % 28, 0); + } + + #[test] + fn smart_resize_scales_up_undersized() { + let p = Qwen3VlImageProcessor::default(); + // 16x16 = 256 pixels — below min, must scale up + let (h, w) = p.smart_resize(16, 16).unwrap(); + assert!(h * w >= p.min_pixels); + assert_eq!(h % 28, 0); + assert_eq!(w % 28, 0); + } + + #[test] + fn smart_resize_rejects_extreme_aspect_ratio() { + let p = Qwen3VlImageProcessor::default(); + 
assert!(p.smart_resize(10, 10_000).is_err()); + } + + #[test] + fn process_small_image() { + let p = Qwen3VlImageProcessor::default(); + // Synthesise a 56x56 RGB image + let mut rgb = image::RgbImage::new(56, 56); + for y in 0..56 { + for x in 0..56 { + rgb.put_pixel(x, y, image::Rgb([x as u8, y as u8, 128])); + } + } + let out = p.process_rgb(&rgb).unwrap(); + assert_eq!(out.image_grid_thw, [1, 4, 4]); + assert_eq!(out.num_tokens, 4); // 16 / (2*2) + // pixel_values shape: (16 tokens, 3*2*14*14 = 1176) + assert_eq!(out.pixel_values.shape(), &[16, 1176]); + } +} diff --git a/crates/renderers-core/src/processing/resolver.rs b/crates/renderers-core/src/processing/resolver.rs new file mode 100644 index 0000000..1685e5b --- /dev/null +++ b/crates/renderers-core/src/processing/resolver.rs @@ -0,0 +1,94 @@ +//! [`MediaResolver`] implementations backed by the in-crate vision +//! processors. Lets pure-Rust callers go from "image bytes / URL / +//! path" straight to a [`MediaItem`] without a Python round-trip. + +use std::fs; + +use serde_json::json; + +use crate::processing::qwen3_vl::{ProcessedImage, Qwen3VlImageProcessor}; +use crate::traits::{MediaResolver, MediaSource}; +use crate::types::{MediaItem, Modality, RenderError}; + +/// `MediaResolver` backed by [`Qwen3VlImageProcessor`]. Stores the +/// processed tensor inside `MediaItem.hf_payload` as a JSON object so +/// the inference engine glue can route it through the same path as +/// the Python-resolved case. +/// +/// The serialised payload shape is: +/// +/// ```json +/// { +/// "pixel_values": { "shape": [tokens, features], "data": [f32, ...] }, +/// "image_grid_thw": { "shape": [1, 3], "data": [1, h, w] } +/// } +/// ``` +/// +/// Callers that need zero-copy `numpy`/`torch` arrays should consume +/// the [`ProcessedImage`] struct directly via +/// [`Qwen3VlResolver::process_bytes`] instead of going through the +/// `MediaItem.hf_payload` field. 
+#[derive(Debug, Clone, Default)] +pub struct Qwen3VlResolver { + processor: Qwen3VlImageProcessor, +} + +impl Qwen3VlResolver { + pub fn new(processor: Qwen3VlImageProcessor) -> Self { + Self { processor } + } + + pub fn processor(&self) -> &Qwen3VlImageProcessor { + &self.processor + } + + /// Process raw image bytes into the structured [`ProcessedImage`] + /// — the zero-loss representation. The [`MediaResolver`] impl + /// wraps this and re-serialises into `MediaItem.hf_payload`. + pub fn process_bytes(&self, bytes: &[u8]) -> Result { + self.processor.process_bytes(bytes) + } + + fn to_media_item(processed: ProcessedImage) -> MediaItem { + let shape = processed.pixel_values.shape(); + let pixel_shape = vec![shape[0] as u64, shape[1] as u64]; + let pixel_data: Vec = processed.pixel_values.iter().copied().collect(); + let grid: Vec = processed.image_grid_thw.to_vec(); + + let payload = json!({ + "pixel_values": { + "shape": pixel_shape, + "data": pixel_data, + }, + "image_grid_thw": { + "shape": [1u32, 3u32], + "data": grid, + }, + }); + + MediaItem { + modality: Modality::Image, + hash: processed.hash, + num_tokens: processed.num_tokens, + hf_payload: payload, + } + } +} + +impl MediaResolver for Qwen3VlResolver { + fn resolve_image(&self, source: &MediaSource<'_>) -> Result { + let bytes: Vec = match source { + MediaSource::Bytes(b) => b.to_vec(), + MediaSource::Path(p) => fs::read(p) + .map_err(|e| RenderError::Invalid(format!("read image {}: {e}", p.display())))?, + MediaSource::Url(_) => { + return Err(RenderError::Invalid( + "URL sources require an async fetch — pass already-downloaded bytes instead" + .into(), + )); + } + }; + let processed = self.process_bytes(&bytes)?; + Ok(Self::to_media_item(processed)) + } +} diff --git a/crates/renderers-core/src/registry.rs b/crates/renderers-core/src/registry.rs new file mode 100644 index 0000000..acff732 --- /dev/null +++ b/crates/renderers-core/src/registry.rs @@ -0,0 +1,77 @@ +//! 
Tokenizer-path → renderer factory registry. +//! +//! Mirrors `renderers/base.py:MODEL_RENDERER_MAP` for the subset of +//! families ported to Rust so far. New families slot in by adding a +//! match arm in [`create_renderer`]. + +use crate::families::{DeepSeekV3Renderer, Qwen3Renderer, Qwen35Renderer}; +use crate::tokenizer::Tokenizer; +use crate::traits::Renderer; +use crate::types::RenderError; + +/// Renderer family identifier — closed enum used by [`create_renderer`]. +/// Adding a family means a new variant here plus a match arm. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum RendererKind { + Qwen3, + Qwen35, + DeepSeekV3, +} + +impl RendererKind { + /// Resolve a renderer kind from its registry name. Accepts the + /// canonical lowercase form plus common aliases. Not named + /// `from_str` to avoid the `std::str::FromStr` trait collision — + /// it's an inherent method. + pub fn parse(name: &str) -> Option { + match name { + "qwen3" | "Qwen3" => Some(Self::Qwen3), + "qwen35" | "qwen3.5" | "Qwen3.5" => Some(Self::Qwen35), + "deepseek_v3" | "deepseek-v3" | "DeepSeekV3" => Some(Self::DeepSeekV3), + _ => None, + } + } +} + +/// Configuration passed to [`create_renderer`]. +#[derive(Clone, Debug, Default)] +pub struct RendererConfig { + pub preserve_all_thinking: bool, + pub preserve_thinking_between_tool_calls: bool, + /// `None` keeps the family default; the Qwen3.5 Python shim probes + /// the tokenizer's Jinja template to pick the right polarity and + /// forwards the result here so the Rust side stays template-agnostic. + pub enable_thinking: Option, +} + +/// Build a renderer of the requested kind backed by `tokenizer`. 
+pub fn create_renderer( + kind: RendererKind, + tokenizer: Tokenizer, + cfg: &RendererConfig, +) -> Result, RenderError> { + match kind { + RendererKind::Qwen3 => Ok(Box::new( + Qwen3Renderer::builder() + .preserve_all_thinking(cfg.preserve_all_thinking) + .preserve_thinking_between_tool_calls(cfg.preserve_thinking_between_tool_calls) + .build(tokenizer)?, + )), + RendererKind::Qwen35 => { + let mut b = Qwen35Renderer::builder() + .preserve_all_thinking(cfg.preserve_all_thinking) + .preserve_thinking_between_tool_calls(cfg.preserve_thinking_between_tool_calls); + if let Some(en) = cfg.enable_thinking { + b = b.enable_thinking(en); + } + Ok(Box::new(b.build(tokenizer)?)) + } + RendererKind::DeepSeekV3 => { + let mut b = DeepSeekV3Renderer::builder(); + if let Some(en) = cfg.enable_thinking { + b = b.enable_thinking(en); + } + Ok(Box::new(b.build(tokenizer)?)) + } + } +} diff --git a/crates/renderers-core/src/thinking.rs b/crates/renderers-core/src/thinking.rs new file mode 100644 index 0000000..cd89887 --- /dev/null +++ b/crates/renderers-core/src/thinking.rs @@ -0,0 +1,90 @@ +//! `...` retention rules shared across renderers. + +use crate::types::Message; + +/// Should `messages[msg_idx]`'s reasoning content be re-emitted even when +/// the chat template would normally drop it? +/// +/// Returns `true` only as an override above the template default. Each +/// renderer ORs this into its own "render thinking?" condition. +/// +/// Mirrors `renderers/base.py:should_preserve_past_thinking`. +pub fn should_preserve_past_thinking( + messages: &[Message], + msg_idx: usize, + preserve_all_thinking: bool, + preserve_thinking_between_tool_calls: bool, +) -> bool { + if preserve_all_thinking { + return true; + } + if !preserve_thinking_between_tool_calls { + return false; + } + // Find the most recent user message (or None). 
+ let last_user: Option = messages + .iter() + .enumerate() + .rev() + .find_map(|(j, m)| (m.role == "user").then_some(j)); + + let Some(last_user) = last_user else { + // No user message before us: keep only if there's any tool turn + // anywhere; rare path but matches the Python contract. + return messages.iter().any(|m| m.role == "tool"); + }; + + if msg_idx <= last_user { + return false; + } + // The current segment must contain a tool response for the block to + // count as an in-flight tool cycle. + messages[last_user + 1..].iter().any(|m| m.role == "tool") +} + +#[cfg(test)] +mod tests { + use super::*; + + fn m(role: &str) -> Message { + Message { + role: role.to_string(), + ..Default::default() + } + } + + #[test] + fn preserve_all_wins() { + let msgs = vec![m("user"), m("assistant")]; + assert!(should_preserve_past_thinking(&msgs, 1, true, false)); + } + + #[test] + fn between_tool_calls_keeps_active_cycle() { + // Tool-cycle assistants (after the last user) are kept; the + // current tool block must contain at least one `tool` turn. 
+ let msgs = vec![m("user"), m("assistant"), m("tool"), m("assistant")]; + // both assistants are after last_user=0 and the segment has a tool + assert!(should_preserve_past_thinking(&msgs, 1, false, true)); + assert!(should_preserve_past_thinking(&msgs, 3, false, true)); + // a prior tool cycle (before a later user) is dropped + let msgs2 = vec![ + m("user"), + m("assistant"), + m("tool"), + m("assistant"), + m("user"), + m("assistant"), + ]; + // assistant at idx=1 is before last_user=4 → dropped + assert!(!should_preserve_past_thinking(&msgs2, 1, false, true)); + // assistant at idx=5 is after last_user=4 but segment has no tool → dropped + assert!(!should_preserve_past_thinking(&msgs2, 5, false, true)); + } + + #[test] + fn between_tool_calls_drops_without_tool() { + let msgs = vec![m("user"), m("assistant"), m("assistant")]; + assert!(!should_preserve_past_thinking(&msgs, 2, false, true)); + } +} diff --git a/crates/renderers-core/src/tokenizer.rs b/crates/renderers-core/src/tokenizer.rs new file mode 100644 index 0000000..98d34da --- /dev/null +++ b/crates/renderers-core/src/tokenizer.rs @@ -0,0 +1,140 @@ +//! Thin wrapper around `tokenizers::Tokenizer`. +//! +//! Provides three things the bare crate doesn't: +//! 1. A cached `unk_token_id` lookup so [`Tokenizer::token_to_id_strict`] +//! can match Python's "unk-id-is-missing" convention. +//! 2. An `encode_no_special` that returns `Vec` directly, sized to +//! the encoding length — saves the caller from juggling the +//! `tokenizers::Encoding` struct on every hot-path text segment. +//! 3. `Send + Sync` Arc-friendly storage so renderers can share one +//! instance across threads. + +use std::sync::Arc; + +use crate::types::RenderError; + +/// Owned tokenizer handle. Cloning is cheap (`Arc`); the +/// `tokenizers::Tokenizer` itself is held behind the Arc. 
+#[derive(Clone, Debug)] +pub struct Tokenizer { + inner: Arc, +} + +#[derive(Debug)] +struct Inner { + tok: tokenizers::Tokenizer, + unk_id: Option, +} + +impl Tokenizer { + /// Load a `tokenizer.json` from disk. + pub fn from_file(path: impl AsRef) -> Result { + let tok = tokenizers::Tokenizer::from_file(path) + .map_err(|e| RenderError::Tokenizer(e.to_string()))?; + Ok(Self::wrap(tok)) + } + + /// Wrap an already-loaded `tokenizers::Tokenizer`. + pub fn wrap(tok: tokenizers::Tokenizer) -> Self { + let unk_id = tok.token_to_id(""); + Self { + inner: Arc::new(Inner { tok, unk_id }), + } + } + + /// Returns the token id for `token`, or `None` if missing / + /// resolved to ``. Matches the Python helper at + /// `renderers/parsers.py:_token_id`. + pub fn token_to_id(&self, token: &str) -> Option { + let tid = self.inner.tok.token_to_id(token)?; + if Some(tid) == self.inner.unk_id { + None + } else { + Some(tid) + } + } + + /// Strict variant: returns an error if the token is missing. + pub fn token_to_id_strict(&self, token: &str) -> Result { + self.token_to_id(token) + .ok_or_else(|| RenderError::MissingSpecialToken(token.to_string())) + } + + /// Encode `text` without adding model special tokens, returning the + /// id sequence directly. Hot-path callers should batch text segments + /// where possible, but per-segment encode is still significantly + /// faster than the Python equivalent because there's no FFI hop. + pub fn encode_no_special(&self, text: &str) -> Result { + let enc = self + .inner + .tok + .encode_fast(text, false) + .map_err(|e| RenderError::Tokenizer(e.to_string()))?; + Ok(Encoded { enc }) + } + + /// Encode many text fragments without model special tokens. The + /// tokenizer crate parallelizes this internally, which avoids paying + /// per-fragment call overhead on render paths that can plan the whole + /// prompt before materialising ids. 
+ pub fn encode_batch_no_special<'s, E>(&self, texts: Vec) -> Result, RenderError> + where + E: Into> + Send, + { + let encodings = self + .inner + .tok + .encode_batch_fast(texts, false) + .map_err(|e| RenderError::Tokenizer(e.to_string()))?; + Ok(encodings.into_iter().map(|enc| Encoded { enc }).collect()) + } + + /// Decode `ids` to text, including special tokens (matches the + /// Python `tokenizer.decode(ids, skip_special_tokens=False)` used + /// across the parsing layer). + pub fn decode(&self, ids: &[u32]) -> Result { + self.inner + .tok + .decode(ids, /*skip_special_tokens=*/ false) + .map_err(|e| RenderError::Tokenizer(e.to_string())) + } + + /// Borrow the underlying `tokenizers::Tokenizer` for advanced uses + /// (batch encoding, vocab access, ...). Prefer the wrappers above on + /// the hot path. + pub fn raw(&self) -> &tokenizers::Tokenizer { + &self.inner.tok + } +} + +/// Lightweight wrapper around `tokenizers::Encoding` exposing just the +/// id slice. Holding the encoding (instead of allocating a fresh +/// `Vec`) skips one copy on the way to `RenderBuf::ids`. 
+pub struct Encoded { + enc: tokenizers::Encoding, +} + +impl std::fmt::Debug for Encoded { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Encoded") + .field("len", &self.enc.len()) + .finish() + } +} + +impl Encoded { + #[inline] + pub fn as_slice(&self) -> &[u32] { + self.enc.get_ids() + } + + #[inline] + pub fn len(&self) -> usize { + self.enc.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.enc.is_empty() + } +} diff --git a/crates/renderers-core/src/tool_cache.rs b/crates/renderers-core/src/tool_cache.rs new file mode 100644 index 0000000..87f45a5 --- /dev/null +++ b/crates/renderers-core/src/tool_cache.rs @@ -0,0 +1,86 @@ +use std::collections::HashMap; +use std::hash::{Hash, Hasher}; +use std::sync::{Arc, Mutex}; + +use crate::tokenizer::Tokenizer; +use crate::types::{RenderError, ToolSpec}; + +const MAX_TOOL_TEXT_CACHE_ENTRIES: usize = 64; + +#[derive(Debug, Clone, Default)] +pub(crate) struct ToolTextCache { + inner: Arc>>, +} + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct ToolTextCacheKey { + tools_ptr: usize, + tools_len: usize, + discriminator: u64, + dynamic_hash: u64, +} + +#[derive(Debug, Clone)] +struct CachedToolText { + tools: Vec, + dynamic_text: String, + tokens: Arc>, +} + +impl ToolTextCache { + pub(crate) fn get_or_insert_with( + &self, + tokenizer: &Tokenizer, + tools: &[ToolSpec], + discriminator: u64, + dynamic_text: &str, + build_text: impl FnOnce() -> Result, + ) -> Result>, RenderError> { + let key = ToolTextCacheKey { + tools_ptr: tools.as_ptr() as usize, + tools_len: tools.len(), + discriminator, + dynamic_hash: hash_dynamic_text(dynamic_text), + }; + + { + let cache = self.lock_cache()?; + if let Some(cached) = cache.get(&key) { + if cached.tools == tools && cached.dynamic_text == dynamic_text { + return Ok(cached.tokens.clone()); + } + } + } + + let text = build_text()?; + let tokens = Arc::new(tokenizer.encode_no_special(&text)?.as_slice().to_vec()); + let 
mut cache = self.lock_cache()?; + if cache.len() >= MAX_TOOL_TEXT_CACHE_ENTRIES { + cache.clear(); + } + cache.insert( + key, + CachedToolText { + tools: tools.to_vec(), + dynamic_text: dynamic_text.to_string(), + tokens: tokens.clone(), + }, + ); + Ok(tokens) + } + + fn lock_cache( + &self, + ) -> Result>, RenderError> + { + self.inner + .lock() + .map_err(|_| RenderError::Invalid("tool text cache lock poisoned".into())) + } +} + +fn hash_dynamic_text(text: &str) -> u64 { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + text.hash(&mut hasher); + hasher.finish() +} diff --git a/crates/renderers-core/src/traits.rs b/crates/renderers-core/src/traits.rs new file mode 100644 index 0000000..3ef7e5e --- /dev/null +++ b/crates/renderers-core/src/traits.rs @@ -0,0 +1,158 @@ +//! The [`Renderer`] trait and its multimodal extension. +//! +//! Both are object-safe so a `Box` (or `Arc`) +//! at the public boundary works without extra ceremony. Family-specific +//! configuration lives on the concrete struct that impls these traits. + +use crate::types::{MediaBundle, MultiModalData, ParsedResponse, RenderError, RenderedTokens}; +use crate::types::{Message, ToolSpec}; + +/// Deterministic message → token renderer for a specific model family. +/// +/// Implementors must: +/// +/// - Be `Send + Sync` so a single instance can be shared via `Arc` across +/// threads (the Python `RendererPool` is obsolete in Rust). +/// - Produce byte-for-byte identical output to the corresponding Python +/// renderer for the same inputs — verified by the `test_render_ids`, +/// `test_bridge`, `test_roundtrip`, and `test_parse_response_robustness` +/// golden suites. +pub trait Renderer: Send + Sync + std::fmt::Debug { + /// Render `messages` to tokens with per-token message attribution. + fn render( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result; + + /// Render `messages` to tokens, dropping per-token attribution. 
The + /// default impl delegates to [`Renderer::render`]; family-specific + /// renderers may override with a slimmer path if it shows up in + /// profiling (the saving is one `Vec` allocation). + fn render_ids( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + add_generation_prompt: bool, + ) -> Result, RenderError> { + Ok(self + .render(messages, tools, add_generation_prompt)? + .token_ids) + } + + /// Parse a completion's token ids back into a structured response. + fn parse_response(&self, token_ids: &[u32]) -> ParsedResponse; + + /// Stop token ids the sampler should respect. + fn stop_token_ids(&self) -> &[u32]; + + /// Extend the prior turn's tokens verbatim with `new_messages`. + /// + /// Contract: + /// - The returned token stream starts with + /// `previous_prompt_ids + previous_completion_ids` (byte-for-byte). + /// - Returns `None` if `new_messages` contains an assistant turn + /// (refuses to retokenize sampled output) or if the prior turn was + /// truncated and no canonical close can be synthesised. + fn bridge_to_next_turn( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: Option<&[ToolSpec]>, + ) -> Result, RenderError>; + + /// Downcast to a multimodal renderer if this implementor supports it. + /// Default returns `None`; multimodal families override. + fn as_multimodal(&self) -> Option<&dyn MultimodalRenderer> { + None + } +} + +/// Extension implemented by multimodal-capable renderers. +/// +/// Phase 5 design: the renderer **does not touch raw pixel data**. The +/// caller resolves image/video parts upstream (via the HF processor in +/// the Phase 5a Python shim, or a candle-backed [`MediaResolver`] in +/// Phase 5b) and hands the renderer a [`MediaBundle`] with each item's +/// placeholder count pre-computed. +/// +/// Concrete implementors are added in Phase 5a; this trait surface is +/// frozen now so that diff is purely additive on a stable API. 
+pub trait MultimodalRenderer: Renderer { + /// Placeholder token id → modality marker (1 = image, 2 = video). + /// Used by the trainer to build per-token `mm_type_ids` masks. + fn mm_token_type_id_map(&self) -> &[(u32, u8)]; + + /// Render `messages` with pre-resolved `media`. + /// + /// The renderer walks `messages` and pulls items from `media` in + /// order. Each `MediaItem.num_tokens` is the count of placeholder + /// tokens the renderer must emit between the modality's + /// start/end special tokens. The item's `hf_payload` rides through + /// as opaque data on [`RenderedTokens::multi_modal_data`]. + fn render_with_media( + &self, + messages: &[Message], + tools: Option<&[ToolSpec]>, + media: &MediaBundle, + add_generation_prompt: bool, + ) -> Result; + + /// Multimodal-aware bridge. Same contract as + /// [`Renderer::bridge_to_next_turn`] plus `new_media` for the + /// extension and `previous_multi_modal_data` so prior placeholders + /// (and their hashes / payloads) survive across turns. + fn bridge_to_next_turn_with_media( + &self, + previous_prompt_ids: &[u32], + previous_completion_ids: &[u32], + new_messages: &[Message], + tools: Option<&[ToolSpec]>, + new_media: &MediaBundle, + previous_multi_modal_data: Option<&MultiModalData>, + ) -> Result, RenderError>; +} + +/// Resolves raw image / video sources to processor outputs. +/// +/// Phase 5a uses a Python-side implementation that wraps HF's +/// `Qwen3VLImageProcessor` / `KimiVLImageProcessor` and delivers +/// [`MediaItem`]s pre-sized. Phase 5b will add a Rust-native +/// implementation backed by `candle` (or `ort`) so downstream Rust +/// callers can skip the Python boundary entirely. +/// +/// The trait is deliberately tiny: a single resolve call per item, +/// caller chooses the modality and source. +pub trait MediaResolver: Send + Sync + std::fmt::Debug { + /// Resolve a single source (URL / filesystem path / inline bytes) + /// to a sized [`MediaItem`]. 
Implementations are free to cache by + /// hash; the resolver lives for the lifetime of a renderer pool + /// slot. + fn resolve_image( + &self, + source: &MediaSource<'_>, + ) -> Result; + + /// Resolve a video source — Phase 5b only. The default impl returns + /// an error so Phase 5a callers don't accidentally pass through. + fn resolve_video( + &self, + _source: &MediaSource<'_>, + ) -> Result { + Err(RenderError::Invalid( + "video resolution not implemented in this resolver".into(), + )) + } +} + +/// A source descriptor for a media item the caller wants resolved. +#[derive(Clone, Debug)] +pub enum MediaSource<'a> { + Url(&'a str), + Path(&'a std::path::Path), + /// Inline image bytes (PNG / JPEG / WebP / etc.). The resolver + /// detects the format from the bytes themselves. + Bytes(&'a [u8]), +} diff --git a/crates/renderers-core/src/types.rs b/crates/renderers-core/src/types.rs new file mode 100644 index 0000000..3af3534 --- /dev/null +++ b/crates/renderers-core/src/types.rs @@ -0,0 +1,427 @@ +//! Core data types for renderers. +//! +//! The shapes mirror the Python `renderers.base` types so JSON round-trips +//! and `PyO3` wrapping stay mechanical. Strings are owned (`String`) — `PyO3` +//! always materialises strings on entry, so `Cow<'a, str>` would only +//! propagate lifetimes for no win. The few `&str` borrows that pay off are +//! taken locally inside renderer implementations from `&[Message]` slices. + +use std::ops::Range; + +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +/// Sentinel value for `message_indices` entries that come from structural +/// scaffolding rather than a specific message (e.g. the generation prompt). +/// +/// Kept as a named constant so the `-1` in code is searchable and easy to +/// audit. Matches the Python contract at `renderers/base.py:160`. +pub const SCAFFOLD_IDX: i32 = -1; + +/// A single content part inside a multi-part message body. 
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ContentPart { + /// Plain text. + Text { text: String }, + /// Model chain-of-thought as a content part. + Thinking { thinking: String }, + /// Image reference. Resolution to bytes / processor output happens + /// in the multimodal renderer. + Image(ImageRef), + /// Video reference; mirrors [`ImageRef`]. + Video(VideoRef), +} + +/// Image source variants accepted in [`ContentPart::Image`]. Phase 1 +/// covers text-only families, so only the URL/path discriminators carry +/// data — inline bytes are routed through `serde_json::Value` payload +/// for now and resolved by the (Phase 5) multimodal port. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ImageRef { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub url: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, +} + +/// Video source variants accepted in [`ContentPart::Video`]. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct VideoRef { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub url: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, +} + +/// Message body. Either a plain string or a list of structured parts. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(untagged)] +pub enum Content { + Null, + Text(String), + Parts(Vec), +} + +impl Default for Content { + fn default() -> Self { + Content::Text(String::new()) + } +} + +impl Content { + /// Borrow the body as a `&str` if it is a plain string; returns + /// `""` for `Parts` variants (Qwen3 ignores list content entirely). 
+ pub fn as_text(&self) -> &str { + match self { + Content::Text(s) => s.as_str(), + Content::Null | Content::Parts(_) => "", + } + } + + pub fn as_text_or_none_literal(&self) -> &str { + match self { + Content::Null => "None", + Content::Text(s) => s.as_str(), + Content::Parts(_) => "", + } + } + + pub fn is_empty(&self) -> bool { + match self { + Content::Null => true, + Content::Text(s) => s.is_empty(), + Content::Parts(p) => p.is_empty(), + } + } +} + +/// Function body inside a [`ToolCall`]. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ToolCallFunction { + #[serde(default)] + pub name: String, + /// Arguments may arrive as a JSON object or as a pre-serialised JSON + /// string (some OpenAI-format clients do this); preserve the + /// distinction. + #[serde(default)] + pub arguments: ToolArguments, +} + +/// Structured tool invocation in `OpenAI` function-calling format. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ToolCall { + #[serde(default = "default_tool_type", rename = "type")] + pub kind: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub id: Option, + pub function: ToolCallFunction, +} + +fn default_tool_type() -> String { + "function".to_string() +} + +/// Tool specification (`OpenAI` function-calling format) passed to +/// [`Renderer::render`](crate::Renderer::render). +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct ToolSpec { + pub name: String, + #[serde(default)] + pub description: String, + #[serde(default)] + pub parameters: serde_json::Value, + #[serde(default, skip)] + pub openai_envelope: bool, +} + +/// A single turn in a multi-turn conversation. 
+#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct Message { + pub role: String, + #[serde(default)] + pub content: Content, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub tool_calls: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub tool_call_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub name: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub reasoning_content: Option, +} + +impl Message { + /// Borrow `content` as a `&str` only when it is a plain string. Many + /// hand-coded renderers (Qwen3, GLM5, ...) drop list-content entirely + /// for non-multimodal text paths; this helper makes that explicit. + #[inline] + pub fn text_content(&self) -> &str { + self.content.as_text() + } + + #[inline] + pub fn visible_text_content(&self) -> &str { + self.content.as_text_or_none_literal() + } +} + +/// Tool-call argument payload. The JSON-object case is the common path; +/// the raw-string case preserves the `OpenAI` quirk where some clients +/// pre-serialise arguments to a string. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(untagged)] +pub enum ToolArguments { + Raw(String), + Object(serde_json::Value), +} + +impl Default for ToolArguments { + fn default() -> Self { + ToolArguments::Object(serde_json::Value::Object(serde_json::Map::new())) + } +} + +impl ToolArguments { + /// Render arguments as a JSON string suitable for inserting verbatim + /// into a tool-call payload (matches Python's + /// `json.dumps(arguments, ensure_ascii=False)`). + pub fn to_json_string(&self) -> String { + match self { + ToolArguments::Raw(s) => s.clone(), + ToolArguments::Object(v) => { + serde_json::to_string(v).unwrap_or_else(|_| "{}".to_string()) + } + } + } +} + +/// Where a single multimodal item's placeholder tokens sit in the stream. 
+#[derive(Clone, Copy, Debug, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct PlaceholderRange { + pub offset: usize, + pub length: usize, +} + +/// Multimodal sidecar emitted alongside the token stream. The shape +/// mirrors vLLM's `mm_*` payload without depending on vLLM types. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct MultiModalData { + #[serde(default)] + pub mm_hashes: std::collections::BTreeMap>, + #[serde(default)] + pub mm_placeholders: std::collections::BTreeMap>, + /// Per-item processor outputs. The values are passed through as opaque + /// JSON to keep this crate framework-agnostic; vision processors live + /// behind the `PyO3` boundary in the current Phase 1 design. + #[serde(default)] + pub mm_items: std::collections::BTreeMap>, +} + +impl MultiModalData { + pub fn is_empty(&self) -> bool { + self.mm_hashes.is_empty() && self.mm_placeholders.is_empty() && self.mm_items.is_empty() + } +} + +/// Modality marker for a multimodal item. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Modality { + Image, + Video, +} + +impl Modality { + /// Wire string matching the keys used in [`MultiModalData::mm_hashes`] + /// and friends ("image" / "video"). + pub fn as_str(&self) -> &'static str { + match self { + Self::Image => "image", + Self::Video => "video", + } + } + + /// Numeric marker used by per-token modality masks + /// (1 = image, 2 = video). Matches the `mm_token_type_id_map` + /// convention in the Python protocol. + pub fn type_id(&self) -> u8 { + match self { + Self::Image => 1, + Self::Video => 2, + } + } +} + +/// A single media item — image or video — that the caller has already +/// resolved through a vision processor. 
The renderer never touches raw +/// pixel data; it only needs [`MediaItem::num_tokens`] to emit the right +/// placeholder count and the opaque [`MediaItem::hf_payload`] to splice +/// into the [`MultiModalData::mm_items`] map for the inference engine. +#[derive(Clone, Debug)] +pub struct MediaItem { + pub modality: Modality, + /// Cache key for this item — typically a SHA256 of the resolved + /// bytes. The renderer pushes it into + /// [`MultiModalData::mm_hashes`] under the modality key. + pub hash: String, + /// How many placeholder tokens this item expands into. For + /// Qwen3-VL this is `image_grid_thw.prod() / merge_size²`; for + /// Kimi K2.5 this is always 1 (the model expands per-patch + /// internally). + pub num_tokens: usize, + /// Opaque payload that travels alongside the placeholders to the + /// inference engine. In Phase 5a this is the HF + /// `image_processor(...)` output (`pixel_values`, `image_grid_thw`, + /// ...) — `serde_json::Value` keeps the crate framework-agnostic + /// without dragging numpy / torch into the dependency graph. + pub hf_payload: serde_json::Value, +} + +/// Bundle of pre-resolved media items keyed by the message index they +/// belong to. The renderer pops items in walk order; one bundle covers +/// the full call. +#[derive(Clone, Debug, Default)] +pub struct MediaBundle { + /// `(message_idx, item)` pairs in render order. Multiple items per + /// message are supported — the bundle stays a flat `Vec` so the + /// renderer can iterate with a single cursor. + pub items: Vec<(usize, MediaItem)>, +} + +impl MediaBundle { + pub fn new() -> Self { + Self::default() + } + + pub fn is_empty(&self) -> bool { + self.items.is_empty() + } + + pub fn push(&mut self, message_idx: usize, item: MediaItem) { + self.items.push((message_idx, item)); + } +} + +/// Result of rendering messages to tokens. 
+/// +/// `token_ids` and `message_indices` are parallel: `message_indices[i]` is +/// the index into the input `messages` slice of the message that produced +/// `token_ids[i]`, or [`SCAFFOLD_IDX`] for structural scaffolding tokens. +/// +/// Both vectors are sized once during render — see +/// [`RenderedTokens::with_capacity`]. +#[derive(Clone, Debug, Default)] +pub struct RenderedTokens { + pub token_ids: Vec, + pub message_indices: Vec, + pub multi_modal_data: Option, +} + +impl RenderedTokens { + pub fn new() -> Self { + Self::default() + } + + /// Pre-allocate both buffers to the same capacity. Renderers pass an + /// estimate based on `messages.len() * 256` to keep the hot path + /// realloc-free for typical conversations. + pub fn with_capacity(cap: usize) -> Self { + Self { + token_ids: Vec::with_capacity(cap), + message_indices: Vec::with_capacity(cap), + multi_modal_data: None, + } + } + + #[inline] + pub fn len(&self) -> usize { + self.token_ids.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.token_ids.is_empty() + } +} + +/// Per-attempt outcome of parsing a single tool-call block. Matches the +/// Python `ToolCallParseStatus` semantics — every parse attempt surfaces +/// (success and malformed alike), distinguished by this status. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ToolCallParseStatus { + Ok, + InvalidJson, + UnclosedBlock, + MissingName, + MalformedStructure, +} + +impl ToolCallParseStatus { + /// Wire string matching the Python enum values + /// (`"ok" | "invalid_json" | ...`) so `PyO3` can round-trip them. + pub fn as_wire(&self) -> &'static str { + match self { + Self::Ok => "ok", + Self::InvalidJson => "invalid_json", + Self::UnclosedBlock => "unclosed_block", + Self::MissingName => "missing_name", + Self::MalformedStructure => "malformed_structure", + } + } +} + +/// A single tool-call block as the parser saw it. 
+/// +/// `arguments` carries `None` only when the block was so malformed that +/// nothing could be recovered; successful parses produce +/// [`ToolArguments::Object`], pre-serialised string arguments produce +/// [`ToolArguments::Raw`]. +#[derive(Clone, Debug)] +pub struct ParsedToolCall { + pub raw: String, + pub name: Option, + pub arguments: Option, + /// Half-open `[start, end)` slice into the stop-stripped completion + /// token stream. `None` for text-based parsers that can't cheaply + /// recover offsets. + pub token_span: Option>, + pub status: ToolCallParseStatus, + /// Native id when the format carries one (Kimi K2). + pub id: Option, +} + +impl Default for ParsedToolCall { + fn default() -> Self { + Self { + raw: String::new(), + name: None, + arguments: None, + token_span: None, + status: ToolCallParseStatus::Ok, + id: None, + } + } +} + +/// Result of parsing completion tokens back into a structured message. +#[derive(Clone, Debug, Default)] +pub struct ParsedResponse { + pub content: String, + pub reasoning_content: Option, + pub tool_calls: Vec, +} + +/// Errors surfaced by rendering. +#[derive(Debug, Error)] +pub enum RenderError { + #[error("no messages provided")] + EmptyMessages, + #[error("special token {0:?} not found in tokenizer vocabulary")] + MissingSpecialToken(String), + #[error("tokenizer error: {0}")] + Tokenizer(String), + #[error("invalid input: {0}")] + Invalid(String), +} diff --git a/crates/renderers-py/Cargo.toml b/crates/renderers-py/Cargo.toml new file mode 100644 index 0000000..c95d6da --- /dev/null +++ b/crates/renderers-py/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "renderers-py" +version = "0.1.0" +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "PyO3 bindings for renderers-core." 
+ +[lib] +name = "renderers_native" +crate-type = ["cdylib", "rlib"] +path = "src/lib.rs" + +[dependencies] +renderers-core = { path = "../renderers-core" } +pyo3 = { version = "0.28", features = ["abi3-py310"] } +serde = { workspace = true } +serde_json = { workspace = true } +pythonize = "0.28" +numpy = "0.28" +ndarray = "0.17" +rayon = "1" + +[lints] +workspace = true diff --git a/crates/renderers-py/pyproject.toml b/crates/renderers-py/pyproject.toml new file mode 100644 index 0000000..6d28eda --- /dev/null +++ b/crates/renderers-py/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["maturin>=1.4"] +build-backend = "maturin" + +[project] +name = "renderers-native" +version = "0.1.0" +requires-python = ">=3.10,<3.14" +description = "Native (Rust) extension module for the `renderers` package." +license = { text = "Apache-2.0" } + +# Maturin installs the cdylib as a top-level module. The pure-Python +# `renderers` package imports it as ``import renderers_native``, kept +# separate from the `renderers/` hatchling-built wheel so the two +# distributions don't collide at install time. +[tool.maturin] +module-name = "renderers_native" +features = ["pyo3/extension-module"] +strip = true diff --git a/crates/renderers-py/src/lib.rs b/crates/renderers-py/src/lib.rs new file mode 100644 index 0000000..38ad004 --- /dev/null +++ b/crates/renderers-py/src/lib.rs @@ -0,0 +1,1730 @@ +//! Python bindings for `renderers-core`. +//! +//! The boundary is intentionally thin: one polymorphic `Renderer` +//! pyclass holds an `Arc`; small result +//! pyclasses wrap `RenderedTokens` / `ParsedResponse` / `ParsedToolCall` +//! with `#[getter]` accessors. Argument unpacking is done by +//! `pythonize` so callers can pass plain dicts / lists for messages and +//! tools without per-field `PyO3` conversion. 
+ +use std::sync::Arc; + +use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1}; +use pyo3::exceptions::{PyRuntimeError, PyValueError}; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList, PyTuple, PyType}; +use rayon::prelude::*; + +use renderers_core::Renderer as CoreRenderer; +use renderers_core::families::{ + DeepSeekV3RendererBuilder, DefaultRendererBuilder, GlmRendererBuilder, GptOssRendererBuilder, + KimiK2RendererBuilder, KimiK25RendererBuilder, MiniMaxM2RendererBuilder, + Nemotron3RendererBuilder, Qwen3RendererBuilder, Qwen35RendererBuilder, Qwen36RendererBuilder, +}; +use renderers_core::processing::{ProcessedImage, Qwen3VlImageProcessor}; +use renderers_core::tokenizer::Tokenizer; +use renderers_core::types::{ + Content, Message, ParsedResponse, ParsedToolCall, RenderedTokens, ToolArguments, + ToolCallParseStatus, ToolSpec, +}; +use renderers_core::types::{MediaBundle, MediaItem, Modality}; + +// Kept by-value so call sites can use the bare fn pointer +// `.map_err(render_err)` (closures would be needed for `&E`). +#[allow(clippy::needless_pass_by_value)] +fn render_err(e: renderers_core::types::RenderError) -> PyErr { + PyRuntimeError::new_err(e.to_string()) +} + +fn invalid(msg: impl Into) -> PyErr { + PyValueError::new_err(msg.into()) +} + +/// Decode a Python `list[dict]` of messages. +/// +/// The hot path is plain OpenAI-style dictionaries with string fields. +/// Hand-parsing that shape avoids routing every render through generic +/// serde conversion while still falling back to `pythonize` for structured +/// content parts and tool-call lists. +fn parse_messages(obj: &Bound<'_, PyAny>) -> PyResult> { + let list = obj + .cast::() + .map_err(|_| invalid("messages must be a list of dicts"))?; + let mut parsed = Vec::with_capacity(list.len()); + for item in list.iter() { + let dict = item + .cast::() + .map_err(|_| invalid("messages must be a list of dicts"))?; + let role = dict + .get_item("role")? 
+ .ok_or_else(|| invalid("message missing role"))? + .extract::()?; + + let content = match dict.get_item("content")? { + None => Content::default(), + Some(value) if value.is_none() => Content::Null, + Some(value) => match value.extract::() { + Ok(text) => Content::Text(text), + Err(_) => pythonize::depythonize(&value) + .map_err(|e| invalid(format!("message content decode failed: {e}")))?, + }, + }; + + let tool_calls = match dict.get_item("tool_calls")? { + None => Vec::new(), + Some(value) if value.is_none() => Vec::new(), + Some(value) => pythonize::depythonize(&value) + .map_err(|e| invalid(format!("message tool_calls decode failed: {e}")))?, + }; + let tool_call_id = optional_string(dict, "tool_call_id")?; + let name = optional_string(dict, "name")?; + let reasoning_content = optional_string(dict, "reasoning_content")?; + + parsed.push(Message { + role, + content, + tool_calls, + tool_call_id, + name, + reasoning_content, + }); + } + Ok(parsed) +} + +fn optional_string(dict: &Bound<'_, PyDict>, key: &str) -> PyResult> { + match dict.get_item(key)? { + None => Ok(None), + Some(value) if value.is_none() => Ok(None), + Some(value) => value.extract::().map(Some), + } +} + +fn parse_tools(obj: Option<&Bound<'_, PyAny>>) -> PyResult>>> { + let Some(obj) = obj else { return Ok(None) }; + if obj.is_none() { + return Ok(None); + } + if let Ok(prepared) = obj.extract::>() { + return Ok(Some(prepared.inner.clone())); + } + let list = obj + .cast::() + .map_err(|_| invalid("tools must be a list of dicts or PreparedTools"))?; + let mut parsed = Vec::with_capacity(list.len()); + for item in list.iter() { + let dict = item + .cast::() + .map_err(|_| invalid("tools must be a list of dicts"))?; + let mut openai_envelope = false; + let spec = if let Some(function) = dict.get_item("function")? 
{ + if let Ok(function_dict) = function.cast::() { + openai_envelope = true; + function_dict.clone() + } else { + dict.clone() + } + } else { + dict.clone() + }; + let name = spec + .get_item("name")? + .ok_or_else(|| invalid("tool spec missing name"))? + .extract::()?; + let description = match spec.get_item("description")? { + Some(value) => value.extract::()?, + None => String::new(), + }; + let parameters = match spec.get_item("parameters")? { + Some(value) => pythonize::depythonize(&value) + .map_err(|e| invalid(format!("tool parameters decode failed: {e}")))?, + None => serde_json::Value::Object(serde_json::Map::new()), + }; + parsed.push(ToolSpec { + name, + description, + parameters, + openai_envelope, + }); + } + Ok(Some(Arc::new(parsed))) +} + +#[inline] +fn tools_slice(tools: Option<&Arc>>) -> Option<&[ToolSpec]> { + tools.map(|tools| tools.as_slice()) +} + +fn parse_message_batch(obj: &Bound<'_, PyAny>) -> PyResult>> { + let list = obj + .cast::() + .map_err(|_| invalid("messages_batch must be a list of message lists"))?; + let mut parsed = Vec::with_capacity(list.len()); + for item in list.iter() { + parsed.push(parse_messages(&item)?); + } + Ok(parsed) +} + +fn parse_fast_messages( + roles: &Bound<'_, PyAny>, + contents: &Bound<'_, PyAny>, +) -> PyResult> { + let roles = roles + .cast::() + .map_err(|_| invalid("roles must be a list[str]"))?; + let contents = contents + .cast::() + .map_err(|_| invalid("contents must be a list[str]"))?; + if roles.len() != contents.len() { + return Err(invalid("roles and contents must have the same length")); + } + let mut parsed = Vec::with_capacity(roles.len()); + for (role, content) in roles.iter().zip(contents.iter()) { + parsed.push(Message { + role: role.extract::()?, + content: Content::Text(content.extract::()?), + ..Default::default() + }); + } + Ok(parsed) +} + +/// Decode a Python list of media-item dicts into a [`MediaBundle`]. 
+fn parse_media_bundle(obj: &Bound<'_, PyAny>) -> PyResult { + let value: serde_json::Value = pythonize::depythonize(obj) + .map_err(|e| invalid(format!("media must be a list of dicts: {e}")))?; + let serde_json::Value::Array(arr) = value else { + return Err(invalid("media must be a list")); + }; + let mut bundle = MediaBundle::new(); + for item in arr { + let obj = item + .as_object() + .ok_or_else(|| invalid("media item must be a dict"))?; + let message_idx = + obj.get("message_idx") + .and_then(serde_json::Value::as_u64) + .ok_or_else(|| invalid("media item missing message_idx"))? as usize; + let modality_str = obj + .get("modality") + .and_then(|v| v.as_str()) + .ok_or_else(|| invalid("media item missing modality"))?; + let modality = match modality_str { + "image" => Modality::Image, + "video" => Modality::Video, + other => return Err(invalid(format!("unknown modality: {other}"))), + }; + let num_tokens = + obj.get("num_tokens") + .and_then(serde_json::Value::as_u64) + .ok_or_else(|| invalid("media item missing num_tokens"))? as usize; + let hash = obj + .get("hash") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_default(); + let hf_payload = obj + .get("hf_payload") + .cloned() + .unwrap_or(serde_json::Value::Null); + bundle.push( + message_idx, + MediaItem { + modality, + hash, + num_tokens, + hf_payload, + }, + ); + } + Ok(bundle) +} + +fn parse_u32_list(obj: &Bound<'_, PyAny>) -> PyResult> { + // Accept either a Python list of ints or a numpy-style sequence. 
+    let list = obj
+        .cast::()
+        .map_err(|_| invalid("expected list[int]"))?;
+    let mut out = Vec::with_capacity(list.len());
+    for item in list.iter() {
+        out.push(item.extract::()?);
+    }
+    Ok(out)
+}
+
+/// Borrow the elements of a one-dimensional `u32` NumPy array as a slice.
+///
+/// Returns the `invalid(...)` error when `as_slice` fails, i.e. when the
+/// array cannot be viewed as one contiguous slice.
+fn numpy_u32_slice<'py>(array: &'py PyReadonlyArray1<'py, u32>) -> PyResult<&'py [u32]> {
+    array
+        .as_slice()
+        .map_err(|e| invalid(format!("expected a contiguous uint32 numpy array: {e}")))
+}
+
+/// Convert a batch of id rows into a Python list of lists (one inner
+/// list per row, in the same order).
+fn batch_ids_to_pylist(py: Python<'_>, batch_ids: Vec>) -> PyResult> {
+    let mut rows = Vec::with_capacity(batch_ids.len());
+    for ids in batch_ids {
+        rows.push(PyList::new(py, ids)?);
+    }
+    PyList::new(py, rows)
+}
+
+/// Python class `RenderedTokens`: a wrapper around a `RenderedTokens`
+/// value held in `inner`.
+#[pyclass(
+    name = "RenderedTokens",
+    module = "renderers_native",
+    skip_from_py_object
+)]
+#[derive(Clone)]
+struct PyRenderedTokens {
+    inner: RenderedTokens,
+}
+
+#[pymethods]
+impl PyRenderedTokens {
+    /// Token ids as a freshly built Python list.
+    #[getter]
+    fn token_ids<'py>(&self, py: Python<'py>) -> PyResult> {
+        PyList::new(py, &self.inner.token_ids)
+    }
+
+    /// Per-token message indices as a freshly built Python list.
+    #[getter]
+    fn message_indices<'py>(&self, py: Python<'py>) -> PyResult> {
+        PyList::new(py, self.inner.message_indices.iter().copied())
+    }
+
+    /// Always an empty list; this binding does not populate it.
+    #[getter]
+    #[allow(clippy::unused_self)]
+    fn sampled_mask<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> {
+        PyList::empty(py)
+    }
+
+    /// Always an empty list; this binding does not populate it.
+    #[getter]
+    #[allow(clippy::unused_self)]
+    fn is_content<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> {
+        PyList::empty(py)
+    }
+
+    /// Always an empty list; this binding does not populate it.
+    #[getter]
+    #[allow(clippy::unused_self)]
+    fn message_roles<'py>(&self, py: Python<'py>) -> Bound<'py, PyList> {
+        PyList::empty(py)
+    }
+
+    /// The multimodal payload converted with `pythonize`, or `None` when
+    /// the render carries no multimodal data.
+    #[getter]
+    fn multi_modal_data<'py>(&self, py: Python<'py>) -> PyResult> {
+        match &self.inner.multi_modal_data {
+            Some(mm) => pythonize::pythonize(py, mm)
+                .map_err(|e| invalid(format!("mm serialisation failed: {e}"))),
+            None => Ok(py.None().into_bound(py)),
+        }
+    }
+
+    /// Count tokens per message index.
+    ///
+    /// The result has `n_messages` entries. When `n_messages` is omitted
+    /// it is treated as 0, so the result is an empty list. With
+    /// `sampled_only=True` every count is zero. Negative message indices
+    /// and indices `>= n_messages` are ignored.
+    #[pyo3(signature = (n_messages = None, *, sampled_only = false))]
+    fn tokens_per_message<'py>(
+        &self,
+        py: Python<'py>,
+        n_messages: Option,
+        sampled_only: bool,
+    ) -> PyResult> {
+        let n_messages = n_messages.unwrap_or(0);
+        let out = if sampled_only {
+            vec![0usize; n_messages]
+        } else {
+            let mut counts = vec![0usize; n_messages];
+            for idx in &self.inner.message_indices {
+                // Negative indices fail the conversion and are skipped.
+                let Ok(msg_idx) = usize::try_from(*idx) else {
+                    continue;
+                };
+                if msg_idx < n_messages {
+                    counts[msg_idx] += 1;
+                }
+            }
+            counts
+        };
+        PyList::new(py, out)
+    }
+
+    /// Return one entry per message index from 0 to the largest
+    /// non-negative index present: a `(start, end)` tuple where `start`
+    /// is the first token position carrying that index and `end` is one
+    /// past the last such position, or `None` for an index that never
+    /// occurs.
+    fn message_token_spans<'py>(&self, py: Python<'py>) -> PyResult> {
+        // Number of slots = largest non-negative index + 1 (0 when there is none).
+        let n_messages = self
+            .inner
+            .message_indices
+            .iter()
+            .copied()
+            .filter(|idx| *idx >= 0)
+            .max()
+            .map_or(0usize, |idx| usize::try_from(idx).map_or(0, |idx| idx + 1));
+        let mut firsts = vec![None::; n_messages];
+        let mut lasts = vec![None::; n_messages];
+        for (pos, idx) in self.inner.message_indices.iter().copied().enumerate() {
+            let Ok(msg_idx) = usize::try_from(idx) else {
+                continue;
+            };
+            if msg_idx >= n_messages {
+                continue;
+            }
+            if firsts[msg_idx].is_none() {
+                firsts[msg_idx] = Some(pos);
+            }
+            lasts[msg_idx] = Some(pos);
+        }
+
+        let out = PyList::empty(py);
+        for (first, last) in firsts.into_iter().zip(lasts) {
+            match (first, last) {
+                (Some(start), Some(end)) => {
+                    // `end + 1` makes the span half-open.
+                    out.append(PyTuple::new(py, [start, end + 1])?)?;
+                }
+                _ => out.append(py.None())?,
+            }
+        }
+        Ok(out)
+    }
+
+    /// Always an empty dict; this binding does not populate it.
+    #[allow(clippy::unused_self)]
+    fn role_token_spans<'py>(&self, py: Python<'py>) -> Bound<'py, PyDict> {
+        PyDict::new(py)
+    }
+
+    /// Always an empty dict; `sampled_only` is accepted but unused.
+    #[pyo3(signature = (*, sampled_only = false))]
+    #[allow(clippy::unused_self)]
+    fn tokens_by_role<'py>(
+        &self,
+        py: Python<'py>,
+        #[allow(unused_variables)] sampled_only: bool,
+    ) -> Bound<'py, PyDict> {
+        PyDict::new(py)
+    }
+
+    /// Always an empty dict; this binding does not populate it.
+    #[allow(clippy::unused_self)]
+    fn content_token_spans_by_role<'py>(&self, py: Python<'py>) -> Bound<'py, PyDict> {
+        PyDict::new(py)
+    }
+
+    /// Return a list of `False`, one entry per token id; `roles` is
+    /// accepted but unused.
+    fn content_mask_for_roles<'py>(
+        &self,
+        py: Python<'py>,
+        #[allow(unused_variables)] roles: &Bound<'_, PyAny>,
+    ) -> PyResult> {
+        PyList::new(py, vec![false; self.inner.token_ids.len()])
+    }
+
+    /// Summary with the token and index counts and whether multimodal
+    /// data is present.
+    fn __repr__(&self) -> String {
+        format!(
+            "RenderedTokens(token_ids=<{} tokens>, message_indices=<{} entries>, multi_modal_data={})",
+            self.inner.token_ids.len(),
+            self.inner.message_indices.len(),
+            if self.inner.multi_modal_data.is_some() {
+                "Some(...)"
+            } else {
+                "None"
+            },
+        )
+    }
+}
+
+/// Python class `ParsedToolCall`: a wrapper around a `ParsedToolCall`
+/// value held in `inner`.
+#[pyclass(
+    name = "ParsedToolCall",
+    module = "renderers_native",
+    skip_from_py_object
+)]
+#[derive(Clone)]
+struct PyParsedToolCall {
+    inner: ParsedToolCall,
+}
+
+#[pymethods]
+impl PyParsedToolCall {
+    /// The `raw` text of the tool call.
+    #[getter]
+    fn raw(&self) -> &str {
+        &self.inner.raw
+    }
+
+    /// The tool name, or `None` when absent.
+    #[getter]
+    fn name(&self) -> Option<&str> {
+        self.inner.name.as_deref()
+    }
+
+    /// The arguments: `None` when absent, a pythonized object for
+    /// `ToolArguments::Object`, or a `str` for `ToolArguments::Raw`.
+    #[getter]
+    fn arguments<'py>(&self, py: Python<'py>) -> PyResult> {
+        match &self.inner.arguments {
+            None => Ok(py.None().into_bound(py)),
+            Some(ToolArguments::Object(v)) => {
+                pythonize::pythonize(py, v).map_err(|e| invalid(format!("args serialisation: {e}")))
+            }
+            Some(ToolArguments::Raw(s)) => Ok(s
+                .as_str()
+                .into_pyobject(py)
+                .map_err(|e| invalid(format!("string into pyobject: {e}")))?
+                .into_any()),
+        }
+    }
+
+    /// The token span as a `(start, end)` tuple, or `None` when absent.
+    #[getter]
+    fn token_span(&self) -> Option<(usize, usize)> {
+        self.inner.token_span.as_ref().map(|r| (r.start, r.end))
+    }
+
+    /// The parse status as its wire string (`as_wire`).
+    #[getter]
+    fn status(&self) -> &'static str {
+        self.inner.status.as_wire()
+    }
+
+    /// The tool-call id, or `None` when absent.
+    #[getter]
+    fn id(&self) -> Option<&str> {
+        self.inner.id.as_deref()
+    }
+
+    /// Summary with the name, status and whether arguments are present.
+    fn __repr__(&self) -> String {
+        format!(
+            "ParsedToolCall(name={:?}, status={:?}, has_args={})",
+            self.inner.name,
+            self.inner.status,
+            self.inner.arguments.is_some(),
+        )
+    }
+}
+
+/// Python class `ParsedResponse`: a wrapper around a `ParsedResponse`
+/// value held in `inner`.
+#[pyclass(
+    name = "ParsedResponse",
+    module = "renderers_native",
+    skip_from_py_object
+)]
+#[derive(Clone)]
+struct PyParsedResponse {
+    inner: ParsedResponse,
+}
+
+#[pymethods]
+impl PyParsedResponse {
+    /// The response content text.
+    #[getter]
+    fn content(&self) -> &str {
+        &self.inner.content
+    }
+
+    /// The reasoning text, or `None` when absent.
+    #[getter]
+    fn reasoning_content(&self) -> Option<&str> {
+        self.inner.reasoning_content.as_deref()
+    }
+
+    /// The tool calls, each cloned into its own `ParsedToolCall` wrapper.
+    #[getter]
+    fn tool_calls(&self) -> Vec {
+        self.inner
+            .tool_calls
+            .iter()
+            .cloned()
+            .map(|c| PyParsedToolCall { inner: c })
+            .collect()
+    }
+
+    /// Summary with the content length, whether reasoning is present and
+    /// the number of tool calls.
+    fn __repr__(&self) -> String {
+        format!(
+            "ParsedResponse(content_len={}, reasoning_content={}, tool_calls={})",
+            self.inner.content.len(),
+            self.inner.reasoning_content.is_some(),
+            self.inner.tool_calls.len(),
+        )
+    }
+}
+
+/// Wire enum mirror — matches the Python `ToolCallParseStatus` string
+/// values so existing code reading `tc.status == "ok"` keeps working.
+#[pyclass(
+    name = "ToolCallParseStatus",
+    module = "renderers_native",
+    skip_from_py_object
+)]
+#[derive(Clone, Copy)]
+struct PyToolCallParseStatus {
+    inner: ToolCallParseStatus,
+}
+
+#[pymethods]
+impl PyToolCallParseStatus {
+    // Class-level string constants, one per status value.
+    #[classattr]
+    const OK: &'static str = "ok";
+    #[classattr]
+    const INVALID_JSON: &'static str = "invalid_json";
+    #[classattr]
+    const UNCLOSED_BLOCK: &'static str = "unclosed_block";
+    #[classattr]
+    const MISSING_NAME: &'static str = "missing_name";
+    #[classattr]
+    const MALFORMED_STRUCTURE: &'static str = "malformed_structure";
+
+    // PyO3 #[getter] requires `&self`; the Copy enum is 1 byte so clippy
+    // suggests by-value, but the macro shape is fixed.
+    /// The wire string of the wrapped status (`as_wire`).
+    #[getter]
+    #[allow(clippy::trivially_copy_pass_by_ref)]
+    fn value(&self) -> &'static str {
+        self.inner.as_wire()
+    }
+}
+
+/// Python class `PreparedTools`: tool definitions already parsed by
+/// `Renderer.prepare_tools`, held behind an `Arc`.
+#[pyclass(
+    name = "PreparedTools",
+    module = "renderers_native",
+    skip_from_py_object
+)]
+#[derive(Clone)]
+struct PyPreparedTools {
+    inner: Arc>,
+}
+
+#[pymethods]
+impl PyPreparedTools {
+    /// Number of prepared tools.
+    fn __len__(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// Summary with the number of prepared tools.
+    fn __repr__(&self) -> String {
+        format!("PreparedTools(<{} tools>)", self.inner.len())
+    }
+}
+
+/// Python class `RendererSession`: a renderer plus pre-parsed messages
+/// and tools, remembering the ids of the most recent render so a later
+/// bridge call does not need them passed in again.
+#[pyclass(name = "RendererSession", module = "renderers_native")]
+struct PyRendererSession {
+    renderer: Arc,
+    messages: Arc>,
+    tools: Option>>,
+    // Ids stored by the last `render_ids*` call or updating bridge call;
+    // `None` until one of them has succeeded.
+    last_prompt_ids: Option>,
+}
+
+#[pymethods]
+impl PyRendererSession {
+    /// Return a new session with clones of every field of this one.
+    fn fork(&self) -> Self {
+        Self {
+            renderer: self.renderer.clone(),
+            messages: self.messages.clone(),
+            tools: self.tools.clone(),
+            last_prompt_ids: self.last_prompt_ids.clone(),
+        }
+    }
+
+    /// Render the session's messages to a Python list of token ids and
+    /// remember the ids in `last_prompt_ids`.
+    #[pyo3(signature = (*, add_generation_prompt = false))]
+    fn render_ids<'py>(
+        &mut self,
+        py: Python<'py>,
+        add_generation_prompt: bool,
+    ) -> PyResult> {
+        // Clone the shared handles so the closure passed to `py.detach`
+        // owns everything it touches.
+        let renderer = self.renderer.clone();
+        let messages = self.messages.clone();
+        let tools = self.tools.clone();
+        let ids = py
+            .detach(move || {
+                renderer.render_ids(
+                    messages.as_slice(),
+                    tools_slice(tools.as_ref()),
+                    add_generation_prompt,
+                )
+            })
+            .map_err(render_err)?;
+        let out = PyList::new(py, ids.iter().copied())?;
+        self.last_prompt_ids = Some(ids);
+        Ok(out)
+    }
+
+    /// Same as `render_ids`, but returns the ids as a NumPy array. A copy
+    /// of the ids is kept in `last_prompt_ids`.
+    #[pyo3(signature = (*, add_generation_prompt = false))]
+    fn render_ids_np<'py>(
+        &mut self,
+        py: Python<'py>,
+        add_generation_prompt: bool,
+    ) -> PyResult>> {
+        let renderer = self.renderer.clone();
+        let messages = self.messages.clone();
+        let tools = self.tools.clone();
+        let ids = py
+            .detach(move || {
+                renderer.render_ids(
+                    messages.as_slice(),
+                    tools_slice(tools.as_ref()),
+                    add_generation_prompt,
+                )
+            })
+            .map_err(render_err)?;
+        self.last_prompt_ids = Some(ids.clone());
+        Ok(ids.into_pyarray(py))
+    }
+
+    /// Bridge from the remembered prompt ids plus `previous_completion_ids`
+    /// to a prompt that also covers `new_messages`.
+    ///
+    /// Fails with "render_ids must be called before session bridge" when
+    /// no prompt ids have been stored yet. Returns `None` when the
+    /// renderer returns no bridged result. When `update` is true and a
+    /// result was produced, its token ids replace the remembered ids.
+    #[pyo3(signature = (previous_completion_ids, new_messages, *, update = true))]
+    fn bridge_to_next_turn(
+        &mut self,
+        py: Python<'_>,
+        previous_completion_ids: &Bound<'_, PyAny>,
+        new_messages: &Bound<'_, PyAny>,
+        update: bool,
+    ) -> PyResult> {
+        let prev_p = self
+            .last_prompt_ids
+            .clone()
+            .ok_or_else(|| invalid("render_ids must be called before session bridge"))?;
+        let prev_c = parse_u32_list(previous_completion_ids)?;
+        let msgs = parse_messages(new_messages)?;
+        let renderer = self.renderer.clone();
+        let tools = self.tools.clone();
+        let bridged = py
+            .detach(move || {
+                renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools_slice(tools.as_ref()))
+            })
+            .map_err(render_err)?;
+        if update && let Some(rendered) = &bridged {
+            self.last_prompt_ids = Some(rendered.token_ids.clone());
+        }
+        Ok(bridged.map(|rt| PyRenderedTokens { inner: rt }))
+    }
+
+    /// NumPy variant of `bridge_to_next_turn`: the completion ids are
+    /// borrowed from a `uint32` array and the bridged ids are returned as
+    /// a NumPy array (or `None`).
+    ///
+    /// Unlike `bridge_to_next_turn`, the renderer is called directly
+    /// here, not inside `py.detach`.
+    #[allow(clippy::needless_pass_by_value)]
+    #[pyo3(signature = (previous_completion_ids, new_messages, *, update = true))]
+    fn bridge_to_next_turn_np<'py>(
+        &mut self,
+        py: Python<'py>,
+        previous_completion_ids: PyReadonlyArray1<'_, u32>,
+        new_messages: &Bound<'_, PyAny>,
+        update: bool,
+    ) -> PyResult>>> {
+        let prev_p = self
+            .last_prompt_ids
+            .as_deref()
+            .ok_or_else(|| invalid("render_ids must be called before session bridge"))?;
+        let prev_c = numpy_u32_slice(&previous_completion_ids)?;
+        let msgs = parse_messages(new_messages)?;
+        let bridged = self
+            .renderer
+            .bridge_to_next_turn(prev_p, prev_c, &msgs, tools_slice(self.tools.as_ref()))
+            .map_err(render_err)?;
+        if let Some(rendered) = bridged {
+            if update {
+                self.last_prompt_ids = Some(rendered.token_ids.clone());
+            }
+            Ok(Some(rendered.token_ids.into_pyarray(py)))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Summary with the message count, tool count (0 when no tools) and
+    /// whether prompt ids are stored.
+    fn __repr__(&self) -> String {
+        format!(
+            "RendererSession(messages={}, tools={}, has_prompt={})",
+            self.messages.len(),
+            self.tools.as_ref().map_or(0, |t| t.len()),
+            self.last_prompt_ids.is_some(),
+        )
+    }
+}
+
+/// Polymorphic Python-facing renderer.
+#[pyclass(name = "Renderer", module = "renderers_native")]
+struct PyRenderer {
+    inner: Arc,
+}
+
+#[pymethods]
+impl PyRenderer {
+    /// Construct a Qwen3 renderer from a tokenizer.json on disk.
+    ///
+    /// Kept as an explicit classmethod (rather than `__new__`) so the
+    /// type signature stays unambiguous from Python and future families
+    /// can add their own classmethods.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn qwen3(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult {
+        // NOTE(review): the tokenizer file is loaded before `py.detach`,
+        // i.e. while still attached to the interpreter; only the builder
+        // step below runs detached.
+        let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                Qwen3RendererBuilder::default()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a Qwen3-VL renderer — alias for [`Renderer.qwen35`].
+    ///
+    /// Qwen3-VL and Qwen3.5-VL share the same chat template and the
+    /// same set of special tokens, so the renderer implementation is
+    /// identical. The factory is exposed separately so callers reading
+    /// from a registry can spell the family name directly.
+    ///
+    /// All arguments are forwarded unchanged to `qwen35`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn qwen3_vl(
+        cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        Self::qwen35(
+            cls,
+            py,
+            tokenizer_path,
+            enable_thinking,
+            preserve_all_thinking,
+            preserve_thinking_between_tool_calls,
+        )
+    }
+
+    /// Build a Qwen3.5 renderer (text-only path) from a tokenizer.json.
+    ///
+    /// `enable_thinking` defaults to `True` (big-size variant). The Python
+    /// shim is expected to probe the tokenizer's Jinja template to pick
+    /// the right polarity for 0.8B / 2B models and forward it explicitly.
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn qwen35(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Load the tokenizer file detached from the interpreter as well,
+        // so other Python threads are not blocked while it is read and
+        // parsed (the build step below was already detached).
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                Qwen35RendererBuilder::default()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a Qwen3.6 renderer (Qwen3.5 + JSON-flavoured tool args).
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn qwen36(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Load the tokenizer file detached from the interpreter as well,
+        // so other Python threads are not blocked while it is read and
+        // parsed (the build step below was already detached).
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                Qwen36RendererBuilder::default()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a GLM-5 renderer from a tokenizer.json.
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn glm5(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Tokenizer load runs detached, like the build step.
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                GlmRendererBuilder::glm5()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a GLM-5.1 renderer (GLM-5 + empty on last assistant).
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn glm51(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Load the tokenizer file detached from the interpreter as well,
+        // so other Python threads are not blocked while it is read and
+        // parsed (the build step below was already detached).
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                GlmRendererBuilder::glm51()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a GLM-4.5 Air renderer from a tokenizer.json.
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn glm45(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Tokenizer load runs detached, like the build step.
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                GlmRendererBuilder::glm45()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a `MiniMax` M2 / M2.5 renderer from a tokenizer.json.
+    ///
+    /// This family takes no `enable_thinking` option. A failure to load
+    /// the tokenizer file or to build the renderer is converted to a
+    /// Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn minimax_m2(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Load the tokenizer file detached from the interpreter as well,
+        // so other Python threads are not blocked while it is read and
+        // parsed (the build step below was already detached).
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                MiniMaxM2RendererBuilder::default()
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a `DefaultRenderer` (Jinja fallback via minijinja).
+    ///
+    /// `chat_template` is the model's Jinja chat template (usually the
+    /// `chat_template` field of `tokenizer_config.json` or the contents
+    /// of `chat_template.jinja`). `stop_token_ids` is typically
+    /// `[eos_token_id]`; pass `None` to leave it empty.
+    ///
+    /// `extra_context`, when given and not `None`, must convert to a JSON
+    /// object (a dict); each key/value pair is added to the builder with
+    /// `add_context`. Anything else fails with "extra_context must be a
+    /// dict". A failure to load the tokenizer file or to build the
+    /// renderer is converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (tokenizer_path, chat_template, *, stop_token_ids = None, extra_context = None))]
+    fn default_renderer(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        chat_template: &str,
+        stop_token_ids: Option<&Bound<'_, PyAny>>,
+        extra_context: Option<&Bound<'_, PyAny>>,
+    ) -> PyResult<Self> {
+        // Load the tokenizer file detached from the interpreter as well,
+        // so other Python threads are not blocked while it is read and
+        // parsed (the build step below was already detached). It still
+        // runs first, so error precedence is unchanged.
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        // An omitted argument and an explicit Python `None` both mean "empty".
+        let stop_ids: Vec<u32> = match stop_token_ids {
+            None => Vec::new(),
+            Some(obj) if obj.is_none() => Vec::new(),
+            Some(obj) => parse_u32_list(obj)?,
+        };
+        let extras: Vec<(String, serde_json::Value)> = match extra_context {
+            None => Vec::new(),
+            Some(obj) if obj.is_none() => Vec::new(),
+            Some(obj) => {
+                let v: serde_json::Value = pythonize::depythonize(obj)
+                    .map_err(|e| invalid(format!("extra_context: {e}")))?;
+                match v {
+                    serde_json::Value::Object(m) => m.into_iter().collect(),
+                    _ => return Err(invalid("extra_context must be a dict")),
+                }
+            }
+        };
+        // Owned copy so the `move` closure does not borrow the Python string.
+        let ct = chat_template.to_string();
+        let renderer = py
+            .detach(move || {
+                let mut b = DefaultRendererBuilder::new(ct).stop_token_ids(stop_ids);
+                for (k, v) in extras {
+                    b = b.add_context(k, v);
+                }
+                b.build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a GPT-OSS (Harmony) renderer.
+    ///
+    /// Unlike the other families, GPT-OSS doesn't need a `HuggingFace`
+    /// `tokenizer.json` — the harmony encoding embeds its own
+    /// tiktoken-based tokenizer. The `tokenizer_path` argument is
+    /// ignored on this path but kept for API uniformity with the other
+    /// classmethods (callers can pass an empty string).
+    ///
+    /// `reasoning_effort` defaults to `"medium"` when omitted; the
+    /// builder may reject the value, in which case the error is converted
+    /// by `render_err`. The three optional strings are only passed to the
+    /// builder when provided.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        use_system_prompt = true,
+        reasoning_effort = None,
+        conversation_start_date = None,
+        knowledge_cutoff = None,
+        model_identity = None,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    #[allow(clippy::too_many_arguments)]
+    fn gpt_oss(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        use_system_prompt: bool,
+        reasoning_effort: Option<&str>,
+        conversation_start_date: Option<&str>,
+        knowledge_cutoff: Option<&str>,
+        model_identity: Option<&str>,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult {
+        let _ = tokenizer_path; // not needed for harmony
+        let effort = reasoning_effort.unwrap_or("medium").to_string();
+        let renderer = py
+            .detach(move || -> Result<_, renderers_core::types::RenderError> {
+                let mut b = GptOssRendererBuilder::default()
+                    .use_system_prompt(use_system_prompt)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls);
+                // The only fallible builder setter; `?` returns early on error.
+                b = b.reasoning_effort(&effort)?;
+                if let Some(d) = conversation_start_date {
+                    b = b.conversation_start_date(d);
+                }
+                if let Some(k) = knowledge_cutoff {
+                    b = b.knowledge_cutoff(k);
+                }
+                if let Some(m) = model_identity {
+                    b = b.model_identity(m);
+                }
+                b.build()
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a Kimi K2.5 renderer (text-only, no tools).
+    ///
+    /// The Python shim is expected to route Kimi K2.5 to native ONLY
+    /// when there are no tools and no image / video content — the
+    /// TypeScript-style tool declaration formatter and the vision
+    /// processor are still pure-Python in this phase.
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn kimi_k25(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Load the tokenizer file detached from the interpreter as well,
+        // so other Python threads are not blocked while it is read and
+        // parsed (the build step below was already detached).
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                KimiK25RendererBuilder::default()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a Kimi K2 renderer from a tokenizer.json.
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn kimi_k2(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Tokenizer load runs detached, like the build step.
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                KimiK2RendererBuilder::default()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a Nemotron 3 renderer from a tokenizer.json.
+    ///
+    /// `<|endoftext|>` is auto-detected: Nemotron-3 Nano / Super ship
+    /// with only `<|im_end|>` as EOS; larger variants add `<|endoftext|>`.
+    ///
+    /// A failure to load the tokenizer file or to build the renderer is
+    /// converted to a Python exception by `render_err`.
+    #[classmethod]
+    #[pyo3(signature = (
+        tokenizer_path,
+        *,
+        enable_thinking = true,
+        preserve_all_thinking = false,
+        preserve_thinking_between_tool_calls = false,
+    ))]
+    fn nemotron3(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+        preserve_all_thinking: bool,
+        preserve_thinking_between_tool_calls: bool,
+    ) -> PyResult<Self> {
+        // Load the tokenizer file detached from the interpreter as well,
+        // so other Python threads are not blocked while it is read and
+        // parsed (the build step below was already detached).
+        let tok = py
+            .detach(|| Tokenizer::from_file(tokenizer_path))
+            .map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                Nemotron3RendererBuilder::default()
+                    .enable_thinking(enable_thinking)
+                    .preserve_all_thinking(preserve_all_thinking)
+                    .preserve_thinking_between_tool_calls(preserve_thinking_between_tool_calls)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Build a `DeepSeek` V3 renderer from a tokenizer.json.
+    ///
+    /// `enable_thinking=True` (default) prefills the generation prompt
+    /// with `\n` to trigger reasoning. The Python shim mirrors
+    /// the upstream class signature.
+    #[classmethod]
+    #[pyo3(signature = (tokenizer_path, *, enable_thinking = true))]
+    fn deepseek_v3(
+        _cls: &Bound<'_, PyType>,
+        py: Python<'_>,
+        tokenizer_path: &str,
+        enable_thinking: bool,
+    ) -> PyResult {
+        // NOTE(review): the tokenizer file is loaded before `py.detach`,
+        // i.e. while still attached to the interpreter; only the builder
+        // step below runs detached.
+        let tok = Tokenizer::from_file(tokenizer_path).map_err(render_err)?;
+        let renderer = py
+            .detach(|| {
+                DeepSeekV3RendererBuilder::default()
+                    .enable_thinking(enable_thinking)
+                    .build(tok)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderer {
+            inner: Arc::new(renderer),
+        })
+    }
+
+    /// Parse `tools` once and return them as a `PreparedTools` object.
+    /// When `parse_tools` yields nothing, the result wraps an empty list.
+    #[allow(clippy::unused_self)]
+    fn prepare_tools(&self, tools: &Bound<'_, PyAny>) -> PyResult {
+        let parsed = parse_tools(Some(tools))?.unwrap_or_else(|| Arc::new(Vec::new()));
+        Ok(PyPreparedTools { inner: parsed })
+    }
+
+    /// Create a `RendererSession` that shares this renderer and holds the
+    /// parsed `messages` and `tools`; no prompt ids are stored yet.
+    #[pyo3(signature = (messages, *, tools = None))]
+    fn new_session(
+        &self,
+        messages: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+    ) -> PyResult {
+        Ok(PyRendererSession {
+            renderer: self.inner.clone(),
+            messages: Arc::new(parse_messages(messages)?),
+            tools: parse_tools(tools)?,
+            last_prompt_ids: None,
+        })
+    }
+
+    /// Render every conversation in `messages_batch` and return a Python
+    /// list of id lists, in batch order.
+    ///
+    /// Batches of 8 or more conversations are rendered through
+    /// `par_iter`; smaller batches through a plain iterator. The first
+    /// render error aborts the whole call.
+    #[pyo3(signature = (messages_batch, *, tools = None, add_generation_prompt = false))]
+    fn render_batch_ids<'py>(
+        &self,
+        py: Python<'py>,
+        messages_batch: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult> {
+        let batch = parse_message_batch(messages_batch)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let batch_ids = py
+            .detach(move || {
+                if batch.len() >= 8 {
+                    batch
+                        .par_iter()
+                        .map(|messages| {
+                            renderer.render_ids(
+                                messages,
+                                tools_slice(tools.as_ref()),
+                                add_generation_prompt,
+                            )
+                        })
+                        .collect::, _>>()
+                } else {
+                    batch
+                        .iter()
+                        .map(|messages| {
+                            renderer.render_ids(
+                                messages,
+                                tools_slice(tools.as_ref()),
+                                add_generation_prompt,
+                            )
+                        })
+                        .collect::, _>>()
+                }
+            })
+            .map_err(render_err)?;
+        batch_ids_to_pylist(py, batch_ids)
+    }
+
+    /// Render a batch like `render_batch_ids`, but return a tuple
+    /// `(ids, offsets)` of two NumPy arrays.
+    ///
+    /// `ids` is every row concatenated in batch order. `offsets` has one
+    /// more entry than the batch: it starts at 0 and each following entry
+    /// is the running length of `ids`, so row `i` is
+    /// `ids[offsets[i]:offsets[i + 1]]`.
+    #[pyo3(signature = (messages_batch, *, tools = None, add_generation_prompt = false))]
+    fn render_batch_ids_np_packed<'py>(
+        &self,
+        py: Python<'py>,
+        messages_batch: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult> {
+        let batch = parse_message_batch(messages_batch)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let (ids, offsets) = py
+            .detach(
+                move || -> Result<(Vec, Vec), renderers_core::types::RenderError> {
+                    // Same batch-size threshold as `render_batch_ids`.
+                    let batch_ids = if batch.len() >= 8 {
+                        batch
+                            .par_iter()
+                            .map(|messages| {
+                                renderer.render_ids(
+                                    messages,
+                                    tools_slice(tools.as_ref()),
+                                    add_generation_prompt,
+                                )
+                            })
+                            .collect::, _>>()?
+                    } else {
+                        batch
+                            .iter()
+                            .map(|messages| {
+                                renderer.render_ids(
+                                    messages,
+                                    tools_slice(tools.as_ref()),
+                                    add_generation_prompt,
+                                )
+                            })
+                            .collect::, _>>()?
+                    };
+                    // Size the flat buffer up front from the summed row lengths.
+                    let total_len = batch_ids.iter().map(Vec::len).sum();
+                    let mut ids = Vec::with_capacity(total_len);
+                    let mut offsets = Vec::with_capacity(batch_ids.len() + 1);
+                    offsets.push(0);
+                    for row in batch_ids {
+                        ids.extend_from_slice(&row);
+                        offsets.push(ids.len() as i64);
+                    }
+                    Ok((ids, offsets))
+                },
+            )
+            .map_err(render_err)?;
+        let ids = ids.into_pyarray(py).into_any();
+        let offsets = offsets.into_pyarray(py).into_any();
+        PyTuple::new(py, [ids, offsets])
+    }
+
+    /// Render a conversation given as separate `roles` and `contents`
+    /// arguments (parsed by `parse_fast_messages`) and return the ids as
+    /// a Python list.
+    #[pyo3(signature = (roles, contents, *, tools = None, add_generation_prompt = false))]
+    fn render_fast_ids<'py>(
+        &self,
+        py: Python<'py>,
+        roles: &Bound<'_, PyAny>,
+        contents: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult> {
+        let messages = parse_fast_messages(roles, contents)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let ids = py
+            .detach(move || {
+                renderer.render_ids(
+                    &messages,
+                    tools_slice(tools.as_ref()),
+                    add_generation_prompt,
+                )
+            })
+            .map_err(render_err)?;
+        PyList::new(py, ids)
+    }
+
+    /// Same as `render_fast_ids`, but returns the ids as a NumPy array.
+    #[pyo3(signature = (roles, contents, *, tools = None, add_generation_prompt = false))]
+    fn render_fast_ids_np<'py>(
+        &self,
+        py: Python<'py>,
+        roles: &Bound<'_, PyAny>,
+        contents: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult>> {
+        let messages = parse_fast_messages(roles, contents)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let ids = py
+            .detach(move || {
+                renderer.render_ids(
+                    &messages,
+                    tools_slice(tools.as_ref()),
+                    add_generation_prompt,
+                )
+            })
+            .map_err(render_err)?;
+        Ok(ids.into_pyarray(py))
+    }
+
+    /// Render `messages` and return the full result as a `RenderedTokens`
+    /// object.
+    #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))]
+    fn render(
+        &self,
+        py: Python<'_>,
+        messages: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult {
+        let msgs = parse_messages(messages)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let out = py
+            .detach(move || {
+                renderer.render(&msgs, tools_slice(tools.as_ref()), add_generation_prompt)
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderedTokens { inner: out })
+    }
+
+    /// Render `messages` and return only the token ids, as a Python list.
+    #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))]
+    fn render_ids<'py>(
+        &self,
+        py: Python<'py>,
+        messages: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult> {
+        let msgs = parse_messages(messages)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let ids = py
+            .detach(move || {
+                renderer.render_ids(&msgs, tools_slice(tools.as_ref()), add_generation_prompt)
+            })
+            .map_err(render_err)?;
+        PyList::new(py, ids)
+    }
+
+    /// Render token ids as a `numpy.ndarray[np.uint32]`.
+    ///
+    /// This transfers the Rust `Vec` allocation into `NumPy` instead of
+    /// materialising a Python `list[int]`, which is the preferred hot-path
+    /// API for benchmark loops and inference clients that already operate on
+    /// array buffers.
+    #[pyo3(signature = (messages, *, tools = None, add_generation_prompt = false))]
+    fn render_ids_np<'py>(
+        &self,
+        py: Python<'py>,
+        messages: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult>> {
+        let msgs = parse_messages(messages)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let ids = py
+            .detach(move || {
+                renderer.render_ids(&msgs, tools_slice(tools.as_ref()), add_generation_prompt)
+            })
+            .map_err(render_err)?;
+        Ok(ids.into_pyarray(py))
+    }
+
+    /// Parse completion token ids (a Python list of ints) into a
+    /// `ParsedResponse`. `tools` is accepted but unused.
+    #[pyo3(signature = (token_ids, *, tools = None))]
+    fn parse_response(
+        &self,
+        py: Python<'_>,
+        token_ids: &Bound<'_, PyAny>,
+        #[allow(unused_variables)] tools: Option<&Bound<'_, PyAny>>,
+    ) -> PyResult {
+        let ids = parse_u32_list(token_ids)?;
+        let renderer = self.inner.clone();
+        let parsed = py.detach(move || renderer.parse_response(&ids));
+        Ok(PyParsedResponse { inner: parsed })
+    }
+
+    /// Parse completion ids from a contiguous `numpy.ndarray[np.uint32]`.
+    ///
+    /// The input buffer is borrowed directly, avoiding the Python-list scan and
+    /// temporary Rust `Vec` used by `parse_response`.
+    ///
+    /// `tools` is accepted but unused. Unlike `parse_response`, the
+    /// renderer is called directly here, not inside `py.detach`.
+    #[allow(clippy::needless_pass_by_value)]
+    #[pyo3(signature = (token_ids, *, tools = None))]
+    fn parse_response_np(
+        &self,
+        token_ids: PyReadonlyArray1<'_, u32>,
+        #[allow(unused_variables)] tools: Option<&Bound<'_, PyAny>>,
+    ) -> PyResult {
+        let ids = numpy_u32_slice(&token_ids)?;
+        let parsed = self.inner.parse_response(ids);
+        Ok(PyParsedResponse { inner: parsed })
+    }
+
+    /// The renderer's stop token ids as a Python list.
+    fn get_stop_token_ids<'py>(&self, py: Python<'py>) -> PyResult> {
+        PyList::new(py, self.inner.stop_token_ids())
+    }
+
+    /// Render with pre-resolved multimodal media items.
+    ///
+    /// `media` is a list of dicts each shaped like
+    /// ``{"message_idx": int, "modality": "image" | "video",
+    /// "num_tokens": int, "hash": str, "hf_payload": <opaque JSON>}``.
+    /// `num_tokens` is the placeholder expansion count pre-computed by
+    /// the caller's vision processor (HF
+    /// ``image_grid_thw.prod()/merge_size**2`` for Qwen-VL). The Rust
+    /// renderer never touches pixel data — `hf_payload` rides through
+    /// as opaque JSON into `multi_modal_data.mm_items`.
+    ///
+    /// Raises ``RuntimeError`` when the underlying family doesn't
+    /// support multimodal (e.g. a Qwen3.5 text-only tokenizer that
+    /// doesn't ship the ``<|vision_start|>`` token).
+    #[pyo3(signature = (messages, media, *, tools = None, add_generation_prompt = false))]
+    fn render_with_media(
+        &self,
+        py: Python<'_>,
+        messages: &Bound<'_, PyAny>,
+        media: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+        add_generation_prompt: bool,
+    ) -> PyResult {
+        let msgs = parse_messages(messages)?;
+        let tools = parse_tools(tools)?;
+        let bundle = parse_media_bundle(media)?;
+        let renderer = self.inner.clone();
+        let out = py
+            .detach(move || -> Result<_, renderers_core::types::RenderError> {
+                // `as_multimodal` yields nothing for renderers without
+                // multimodal support; that becomes an `Invalid` error.
+                let mm = renderer
+                    .as_multimodal()
+                    .ok_or_else(|| renderers_core::types::RenderError::Invalid(
+                        "this renderer does not support multimodal — use a -VL tokenizer or check supports_multimodal()".into(),
+                    ))?;
+                mm.render_with_media(
+                    &msgs,
+                    tools_slice(tools.as_ref()),
+                    &bundle,
+                    add_generation_prompt,
+                )
+            })
+            .map_err(render_err)?;
+        Ok(PyRenderedTokens { inner: out })
+    }
+
+    /// True when the underlying family supports the multimodal trait
+    /// AND the loaded tokenizer ships the modality special tokens.
+    fn supports_multimodal(&self) -> bool {
+        // Same `as_multimodal` probe that `render_with_media` relies on.
+        self.inner.as_multimodal().is_some()
+    }
+
+    /// Bridge from a previous prompt and completion (both Python lists of
+    /// token ids) to a prompt that also covers `new_messages`.
+    ///
+    /// Returns a `RenderedTokens` object, or `None` when the renderer
+    /// returns no bridged result. Render errors are converted by
+    /// `render_err`.
+    #[pyo3(signature = (previous_prompt_ids, previous_completion_ids, new_messages, *, tools = None))]
+    fn bridge_to_next_turn(
+        &self,
+        py: Python<'_>,
+        previous_prompt_ids: &Bound<'_, PyAny>,
+        previous_completion_ids: &Bound<'_, PyAny>,
+        new_messages: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+    ) -> PyResult> {
+        let prev_p = parse_u32_list(previous_prompt_ids)?;
+        let prev_c = parse_u32_list(previous_completion_ids)?;
+        let msgs = parse_messages(new_messages)?;
+        let tools = parse_tools(tools)?;
+        let renderer = self.inner.clone();
+        let bridged = py
+            .detach(move || {
+                renderer.bridge_to_next_turn(&prev_p, &prev_c, &msgs, tools_slice(tools.as_ref()))
+            })
+            .map_err(render_err)?;
+        Ok(bridged.map(|rt| PyRenderedTokens { inner: rt }))
+    }
+
+    /// Bridge using `NumPy` token buffers and return a `NumPy` token buffer.
+    ///
+    /// Previous prompt/completion ids are borrowed directly from contiguous
+    /// `uint32` arrays, and the bridged Rust `Vec` is transferred into
+    /// `NumPy` on output. This is the lowest-overhead Python-facing bridge path.
+    ///
+    /// Returns `None` when the renderer returns no bridged result. The
+    /// renderer is called directly here, not inside `py.detach`.
+    #[allow(clippy::needless_pass_by_value)]
+    #[pyo3(signature = (previous_prompt_ids, previous_completion_ids, new_messages, *, tools = None))]
+    fn bridge_to_next_turn_np<'py>(
+        &self,
+        py: Python<'py>,
+        previous_prompt_ids: PyReadonlyArray1<'_, u32>,
+        previous_completion_ids: PyReadonlyArray1<'_, u32>,
+        new_messages: &Bound<'_, PyAny>,
+        tools: Option<&Bound<'_, PyAny>>,
+    ) -> PyResult>>> {
+        let prev_p = numpy_u32_slice(&previous_prompt_ids)?;
+        let prev_c = numpy_u32_slice(&previous_completion_ids)?;
+        let msgs = parse_messages(new_messages)?;
+        let tools = parse_tools(tools)?;
+        let bridged = self
+            .inner
+            .bridge_to_next_turn(prev_p, prev_c, &msgs, tools_slice(tools.as_ref()))
+            .map_err(render_err)?;
+        // Only the token ids are returned; the rest of the result is dropped.
+        Ok(bridged.map(|rt| rt.token_ids.into_pyarray(py)))
+    }
+}
+
+// ── Vision: Qwen3-VL image processor ──────────────────────────────────
+
+/// Rust port of HF's `Qwen3VLImageProcessor` / `Qwen2VLImageProcessor`.
+///
+/// Decodes image bytes, smart-resizes, normalises with the `OpenAI` CLIP
+/// mean / std, and produces `pixel_values` + `image_grid_thw` tensors
+/// in the exact shape the model expects. Equivalent to the Python
+/// processor end-to-end; pixel-byte parity is approximate (`CatmullRom`
+/// vs PIL bicubic), but grid dims, `num_tokens`, and tensor shape match
+/// exactly.
+#[pyclass(name = "Qwen3VlImageProcessor", module = "renderers_native")] +struct PyQwen3VlImageProcessor { + inner: Qwen3VlImageProcessor, +} + +#[pymethods] +impl PyQwen3VlImageProcessor { + #[new] + #[pyo3(signature = ( + *, + min_pixels = None, + max_pixels = None, + patch_size = None, + temporal_patch_size = None, + merge_size = None, + ))] + fn new( + min_pixels: Option, + max_pixels: Option, + patch_size: Option, + temporal_patch_size: Option, + merge_size: Option, + ) -> Self { + let mut p = Qwen3VlImageProcessor::default(); + if let Some(v) = min_pixels { + p.min_pixels = v; + } + if let Some(v) = max_pixels { + p.max_pixels = v; + } + if let Some(v) = patch_size { + p.patch_size = v; + } + if let Some(v) = temporal_patch_size { + p.temporal_patch_size = v; + } + if let Some(v) = merge_size { + p.merge_size = v; + } + Self { inner: p } + } + + /// Compute the resized `(height, width)` for an input image + /// without doing any actual pixel work — useful for placeholder + /// counting in test harnesses. + fn smart_resize(&self, height: u32, width: u32) -> PyResult<(u32, u32)> { + self.inner.smart_resize(height, width).map_err(render_err) + } + + /// Process raw image bytes (PNG / JPEG / WebP) into a dict shaped + /// for direct consumption by `Renderer.render_with_media`: + /// + /// ```python + /// { + /// "modality": "image", + /// "num_tokens": int, + /// "hash": str, + /// "hf_payload": { + /// "pixel_values": {"shape": [tokens, features], "data": [...]}, + /// "image_grid_thw": {"shape": [1, 3], "data": [1, h, w]}, + /// }, + /// } + /// ``` + /// + /// `message_idx` is up to the caller — it's not added here. + fn process_bytes<'py>(&self, py: Python<'py>, bytes: &[u8]) -> PyResult> { + // Clone so the move into detach is straightforward + let processed: ProcessedImage = py + .detach(|| self.inner.process_bytes(bytes)) + .map_err(render_err)?; + processed_to_pyobject(py, processed) + } + + /// Convenience: read a file and process it. 
+ fn process_path<'py>(&self, py: Python<'py>, path: &str) -> PyResult> { + let bytes = + std::fs::read(path).map_err(|e| invalid(format!("read image {path:?}: {e}")))?; + let processed: ProcessedImage = py + .detach(|| self.inner.process_bytes(&bytes)) + .map_err(render_err)?; + processed_to_pyobject(py, processed) + } + + #[getter] + fn patch_size(&self) -> u32 { + self.inner.patch_size + } + #[getter] + fn merge_size(&self) -> u32 { + self.inner.merge_size + } + #[getter] + fn temporal_patch_size(&self) -> u32 { + self.inner.temporal_patch_size + } + #[getter] + fn min_pixels(&self) -> u32 { + self.inner.min_pixels + } + #[getter] + fn max_pixels(&self) -> u32 { + self.inner.max_pixels + } +} + +fn processed_to_pyobject<'py>(py: Python<'py>, p: ProcessedImage) -> PyResult> { + // Zero-copy: hand numpy the Vec directly. The numpy array + // takes ownership of the buffer, so this avoids the per-element + // PyFloat allocation that the previous nested-list path triggered. + // Shape: (num_tokens × merge², 3 × temporal × patch²). 
+ let shape = (p.pixel_values.shape()[0], p.pixel_values.shape()[1]); + let pixel_array: Bound<'py, PyArray2> = p.pixel_values.into_pyarray(py); + let grid_array: Bound<'py, PyArray2> = ndarray::Array2::from_shape_vec( + (1, 3), + p.image_grid_thw.iter().copied().map(i64::from).collect(), + ) + .expect("image_grid_thw is always shape [1,3]") + .into_pyarray(py); + + let hf_payload = PyDict::new(py); + hf_payload.set_item("pixel_values", pixel_array)?; + hf_payload.set_item("image_grid_thw", grid_array)?; + + let out = PyDict::new(py); + out.set_item("modality", "image")?; + out.set_item("num_tokens", p.num_tokens)?; + out.set_item("hash", p.hash)?; + out.set_item("hf_payload", hf_payload)?; + let _ = shape; // shape captured in the numpy array's own metadata + Ok(out.into_any()) +} + +#[pymodule] +fn renderers_native(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + let _ = py; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) +} diff --git a/examples/README.md b/examples/README.md index 08d79a6..2d9ba1f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,6 +13,16 @@ backend: The scripts use PEP 723 `uv` headers, so backend dependencies stay local to the recipe and do not touch the repo `uv.lock`. +When `RENDERERS_NATIVE` selects a native renderer, the vLLM and SGLang multiturn +recipes prepare tool schemas once and use a renderer session for first render +plus bridge. The engine-facing contract stays the same: vLLM receives +`prompt_token_ids`, and SGLang receives `input_ids`. + +For local serving loops that already hold parallel role/content arrays, native +renderers also expose `render_fast_ids(roles, contents, tools=prepared_tools)`. +Use the regular message-dict path when structured content parts or multimodal +items are needed. 
+ ## vLLM Multi-Turn Recipe ```bash diff --git a/examples/sglang/multiturn_generate_sglang.py b/examples/sglang/multiturn_generate_sglang.py index bd67fbb..4f352f2 100755 --- a/examples/sglang/multiturn_generate_sglang.py +++ b/examples/sglang/multiturn_generate_sglang.py @@ -122,6 +122,11 @@ def main() -> None: "skip_special_tokens": False, "no_stop_trim": True, } + renderer_tools = ( + renderer.prepare_tools(TOOLS) + if hasattr(renderer, "prepare_tools") + else TOOLS + ) messages = [ {"role": "system", "content": "You are a concise tool-using assistant."}, @@ -133,8 +138,17 @@ def main() -> None: # Turn 1: render locally and pass token IDs to SGLang. SGLang never # sees messages and never applies a chat template. - prompt_ids = renderer.render_ids( - messages, tools=TOOLS, add_generation_prompt=True + session = ( + renderer.new_session(messages, tools=renderer_tools) + if hasattr(renderer, "new_session") + else None + ) + prompt_ids = ( + session.render_ids(add_generation_prompt=True) + if session is not None + else renderer.render_ids( + messages, tools=renderer_tools, add_generation_prompt=True + ) ) output1 = engine.generate(input_ids=prompt_ids, sampling_params=sampling) completion1 = completion_ids(output1, prompt_ids) @@ -185,8 +199,12 @@ def main() -> None: # Turn 2: bridge extends prompt_ids + completion1 exactly. # ``bridge_to_next_turn`` returns a ``RenderedTokens`` (or None); the # extended id stream is on ``.token_ids``. 
- bridged = renderer.bridge_to_next_turn( - prompt_ids, completion1, new_messages, tools=TOOLS + bridged = ( + session.bridge_to_next_turn(completion1, new_messages) + if session is not None + else renderer.bridge_to_next_turn( + prompt_ids, completion1, new_messages, tools=renderer_tools + ) ) if bridged is None: raise RuntimeError("bridge_to_next_turn returned None") diff --git a/examples/sglang/online_multiturn_sglang.py b/examples/sglang/online_multiturn_sglang.py index 4c17278..1ea2d34 100644 --- a/examples/sglang/online_multiturn_sglang.py +++ b/examples/sglang/online_multiturn_sglang.py @@ -142,6 +142,9 @@ async def run_one( print(f"\n=== {label} ===") renderer = make_renderer(model, enable_thinking) + renderer_tools = ( + renderer.prepare_tools(TOOLS) if hasattr(renderer, "prepare_tools") else TOOLS + ) messages: list[dict[str, Any]] = [ {"role": "system", "content": "You are a concise tool-using assistant."}, @@ -152,7 +155,18 @@ async def run_one( ] # Turn 1: render locally, send token IDs. SGLang never sees messages. - prompt_ids = renderer.render_ids(messages, tools=TOOLS, add_generation_prompt=True) + session = ( + renderer.new_session(messages, tools=renderer_tools) + if hasattr(renderer, "new_session") + else None + ) + prompt_ids = ( + session.render_ids(add_generation_prompt=True) + if session is not None + else renderer.render_ids( + messages, tools=renderer_tools, add_generation_prompt=True + ) + ) output1 = await generate_sglang( client=client, base_url=base_url, @@ -208,8 +222,12 @@ async def run_one( # Turn 2: bridge extends prompt_ids + completion1 exactly. # ``bridge_to_next_turn`` returns a ``RenderedTokens`` (or None); the # extended id stream is on ``.token_ids``. 
- bridged = renderer.bridge_to_next_turn( - prompt_ids, completion1, new_messages, tools=TOOLS + bridged = ( + session.bridge_to_next_turn(completion1, new_messages) + if session is not None + else renderer.bridge_to_next_turn( + prompt_ids, completion1, new_messages, tools=renderer_tools + ) ) if bridged is None: raise RuntimeError("bridge_to_next_turn returned None") diff --git a/examples/vllm/multiturn_generate_vllm.py b/examples/vllm/multiturn_generate_vllm.py index 0eafd4d..58f74bc 100755 --- a/examples/vllm/multiturn_generate_vllm.py +++ b/examples/vllm/multiturn_generate_vllm.py @@ -111,6 +111,11 @@ def main() -> None: stop_token_ids=renderer.get_stop_token_ids(), skip_special_tokens=False, ) + renderer_tools = ( + renderer.prepare_tools(TOOLS) + if hasattr(renderer, "prepare_tools") + else TOOLS + ) messages = [ {"role": "system", "content": "You are a concise tool-using assistant."}, @@ -122,8 +127,17 @@ def main() -> None: # Turn 1: render locally and pass token IDs to vLLM. vLLM never sees # messages and never applies a chat template. - prompt_ids = renderer.render_ids( - messages, tools=TOOLS, add_generation_prompt=True + session = ( + renderer.new_session(messages, tools=renderer_tools) + if hasattr(renderer, "new_session") + else None + ) + prompt_ids = ( + session.render_ids(add_generation_prompt=True) + if session is not None + else renderer.render_ids( + messages, tools=renderer_tools, add_generation_prompt=True + ) ) output1 = llm.generate( [{"prompt_token_ids": prompt_ids}], @@ -178,8 +192,12 @@ def main() -> None: # Turn 2: bridge extends prompt_ids + completion1 exactly. # ``bridge_to_next_turn`` returns a ``RenderedTokens`` (or None); the # extended id stream is on ``.token_ids``. 
- bridged = renderer.bridge_to_next_turn( - prompt_ids, completion1, new_messages, tools=TOOLS + bridged = ( + session.bridge_to_next_turn(completion1, new_messages) + if session is not None + else renderer.bridge_to_next_turn( + prompt_ids, completion1, new_messages, tools=renderer_tools + ) ) if bridged is None: raise RuntimeError("bridge_to_next_turn returned None") diff --git a/pyproject.toml b/pyproject.toml index 389870f..63f6cc6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,13 @@ exclude-newer = "7 days" # while the rest of the dependency graph stays gated. exclude-newer-package = { fastokens = false, "prime-pydantic-config" = false } +[tool.pytest.ini_options] +# Registered markers — `parity` gates the Python<->Rust native parity +# suite. Skip with `-m 'not parity'` if the native module isn't built. +markers = [ + "parity: Python<->Rust native parity tests (require maturin develop + a tokenizer.json)", +] + [tool.ty.environment] python-version = "3.13" diff --git a/renderers/_native_router.py b/renderers/_native_router.py new file mode 100644 index 0000000..726af70 --- /dev/null +++ b/renderers/_native_router.py @@ -0,0 +1,216 @@ +"""Routing layer between the pure-Python renderers and the Rust port. + +Loaded by each family shim (currently ``renderers.qwen3``). Resolves +whether the native module is available and, if so, whether the caller +opted into it for this family via the ``RENDERERS_NATIVE`` env var. + +The env-var accepts: + +- ``0`` / empty / unset — use the pure-Python implementation (default). +- ``1`` / ``all`` — route every supported family to the native module. +- comma-separated list of family names, e.g. ``qwen3`` or + ``qwen3,qwen35`` — route only those families. + +Family detection is opt-in per family so callers can roll out the +native path one model at a time; everything else falls back to Python +verbatim. 
logger = logging.getLogger("renderers._native_router")

# Cached outcome of the one-shot ``renderers_native`` import. ``None`` both
# before the first attempt and after a failed one; the flag below tells the
# two apart.
_NATIVE_MODULE: Any | None = None
_NATIVE_LOAD_ATTEMPTED = False
# Families that ``RENDERERS_NATIVE=1`` / ``all`` does NOT switch on; they are
# only routed when named explicitly in the comma-separated form.
_ALL_EXCLUDED = {"default"}
# Pre-tokenisation pattern passed to ``TikTokenConverter`` when a tiktoken
# vocabulary is exported by ``_export_tiktoken_tokenizer_json``. It is also
# mixed into the cache digest, so editing it invalidates cached exports.
_KIMI_TIKTOKEN_PATTERN = "|".join(
    [
        r"""[\p{Han}]+""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""\p{N}{1,3}""",
        r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
        r"""\s*[\r\n]+""",
        r"""\s+(?!\S)""",
        r"""\s+""",
    ]
)


def native_enabled(family: str) -> bool:
    """Return True if *family* should route to the native module.

    Reads ``RENDERERS_NATIVE`` on every call:

    - unset / empty / ``0`` -> always False;
    - ``1`` / ``all`` -> True for every family except those listed in
      ``_ALL_EXCLUDED``;
    - anything else is treated as a comma-separated list of family names
      (surrounding whitespace and empty items are ignored).
    """
    raw = os.environ.get("RENDERERS_NATIVE", "").strip()
    if not raw or raw == "0":
        return False
    if raw in {"1", "all"}:
        return family not in _ALL_EXCLUDED
    return family in {part.strip() for part in raw.split(",") if part.strip()}


def load_native() -> Any | None:
    """Import ``renderers_native`` lazily. Returns ``None`` if the
    extension module is not installed (caller falls back to Python).

    The import is attempted at most once per process; later calls return
    the cached module (or the cached ``None``).

    Kept as a top-level distribution (rather than `renderers._native`)
    so the maturin-built wheel doesn't collide with the hatchling-built
    `renderers` wheel at install time.
    """
    global _NATIVE_MODULE, _NATIVE_LOAD_ATTEMPTED
    if _NATIVE_LOAD_ATTEMPTED:
        return _NATIVE_MODULE
    _NATIVE_LOAD_ATTEMPTED = True
    try:
        import renderers_native  # type: ignore[import-not-found]

        _NATIVE_MODULE = renderers_native
    except ImportError as exc:
        logger.info(
            "RENDERERS_NATIVE is set but the native extension is not "
            "available (%s); falling back to pure Python. Build it with "
            "`maturin develop --manifest-path crates/renderers-py/Cargo.toml`.",
            exc,
        )
        _NATIVE_MODULE = None
    return _NATIVE_MODULE


def _tokenizer_cache_dir() -> Path:
    """Return (creating it if needed) the temp-dir cache for tokenizer files."""
    cache_dir = Path(tempfile.gettempdir()) / "renderers-tokenizers"
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir


def _scratch_path(final: Path) -> Path:
    """Return a per-writer scratch path next to *final*.

    A fixed ``<digest>.tmp`` name lets two concurrent writers share one
    scratch file: whoever renames first wins and the other's ``replace``
    fails with ``FileNotFoundError``. A pid + random suffix keeps every
    writer on its own file.
    """
    return final.with_name(f"{final.stem}.{os.getpid()}.{os.urandom(4).hex()}.tmp")


def resolve_tokenizer_path(tokenizer: Any) -> str:
    """Return a filesystem path to ``tokenizer.json`` for *tokenizer*.

    Resolution order:

    1. A string / ``os.PathLike``: a directory resolves to
       ``<dir>/tokenizer.json``; anything else is returned unchanged (the
       caller is responsible for snapshotting the model first if it's a
       remote id).
    2. An object whose ``backend_tokenizer`` has ``to_str()``: the
       serialised tokenizer is written to a content-addressed file under
       ``<tempdir>/renderers-tokenizers`` and that path is returned.
    3. An object with ``name_or_path``: ``tokenizer.json`` inside that
       directory, then the local HF cache, then a tiktoken export via
       ``_export_tiktoken_tokenizer_json``.

    Raises:
        ValueError: if no path can be determined (no ``name_or_path``,
            ``huggingface_hub`` missing, or nothing in the local cache).
    """
    if isinstance(tokenizer, (str, os.PathLike)):
        path = Path(tokenizer)
        if path.is_dir():
            return str(path / "tokenizer.json")
        return str(path)

    backend = getattr(tokenizer, "backend_tokenizer", None)
    if backend is not None and hasattr(backend, "to_str"):
        data = backend.to_str()
        digest = hashlib.sha256(data.encode("utf-8")).hexdigest()
        path = _tokenizer_cache_dir() / f"{digest}.json"
        if not path.exists():
            # Write-then-rename so readers never observe a partial file.
            tmp = _scratch_path(path)
            try:
                tmp.write_text(data, encoding="utf-8")
                tmp.replace(path)
            finally:
                # No-op after a successful rename; cleans up on failure.
                tmp.unlink(missing_ok=True)
        return str(path)

    name_or_path = getattr(tokenizer, "name_or_path", None)
    if not name_or_path:
        raise ValueError(
            "Cannot determine tokenizer.json path: tokenizer has no "
            "name_or_path attribute. Pass an explicit path string instead."
        )

    candidate = Path(name_or_path)
    if candidate.is_dir():
        path = candidate / "tokenizer.json"
        if path.exists():
            return str(path)

    # HF cache fallback: /models--name--with--slashes/snapshots//tokenizer.json
    try:
        from huggingface_hub import try_to_load_from_cache  # type: ignore
    except ImportError as exc:
        # Still a ValueError for callers; chained so the cause is visible.
        raise ValueError(
            f"tokenizer.json not found near {name_or_path}; install "
            "huggingface_hub or pass an explicit path."
        ) from exc

    cached = try_to_load_from_cache(repo_id=name_or_path, filename="tokenizer.json")
    if isinstance(cached, (str, os.PathLike)):
        return str(cached)

    exported = _export_tiktoken_tokenizer_json(name_or_path, try_to_load_from_cache)
    if exported is not None:
        return exported

    raise ValueError(
        f"tokenizer.json not available in the local HF cache for {name_or_path}. "
        "Run `snapshot_download` first or pass an explicit path."
    )


def _export_tiktoken_tokenizer_json(
    repo_id: str,
    try_to_load_from_cache: Any,
) -> str | None:
    """Export Kimi's tiktoken tokenizer to a native-loadable tokenizer.json.

    Looks up ``tiktoken.model`` and ``tokenizer_config.json`` for *repo_id*
    through *try_to_load_from_cache* (a callable accepting ``repo_id=`` and
    ``filename=``). Returns ``None`` when either file is not cached, the
    config's ``tokenizer_class`` is not ``TikTokenTokenizer``, or the config
    declares no ``added_tokens_decoder`` entries. Otherwise returns the path
    of the converted file, which is cached under a digest of both inputs
    plus the pre-tokenisation pattern.
    """
    tiktoken_model = try_to_load_from_cache(repo_id=repo_id, filename="tiktoken.model")
    tokenizer_config = try_to_load_from_cache(
        repo_id=repo_id, filename="tokenizer_config.json"
    )
    if not isinstance(tiktoken_model, (str, os.PathLike)) or not isinstance(
        tokenizer_config, (str, os.PathLike)
    ):
        return None

    config_path = Path(tokenizer_config)
    model_path = Path(tiktoken_model)
    config = json.loads(config_path.read_text(encoding="utf-8"))
    if config.get("tokenizer_class") != "TikTokenTokenizer":
        return None

    added = {
        int(idx): value["content"]
        for idx, value in config.get("added_tokens_decoder", {}).items()
    }
    if not added:
        return None

    # 256 consecutive special-token slots starting at the lowest declared
    # id; ids missing from the config get a placeholder name.
    base_id = min(added)
    special_tokens = [
        added.get(idx, f"<|reserved_token_{idx}|>")
        for idx in range(base_id, base_id + 256)
    ]
    digest = hashlib.sha256()
    digest.update(model_path.read_bytes())
    digest.update(config_path.read_bytes())
    digest.update(_KIMI_TIKTOKEN_PATTERN.encode("utf-8"))
    out = _tokenizer_cache_dir() / f"tiktoken-{digest.hexdigest()}.json"
    if out.exists():
        return str(out)

    from transformers.convert_slow_tokenizer import TikTokenConverter

    converted = TikTokenConverter(
        vocab_file=str(model_path),
        pattern=_KIMI_TIKTOKEN_PATTERN,
        extra_special_tokens=special_tokens,
    ).converted()
    tmp = _scratch_path(out)
    try:
        converted.save(str(tmp))
        tmp.replace(out)
    finally:
        tmp.unlink(missing_ok=True)
    return str(out)


def try_resolve_tokenizer_path(tokenizer: Any, family: str) -> str | None:
    """Best-effort tokenizer resolution for optional native routing.

    Returns the resolved path, or ``None`` (after logging at INFO) when
    ``resolve_tokenizer_path`` cannot produce one. Filesystem failures
    while reading or writing the tokenizer cache are treated like
    "not available" so an opt-in fast path never breaks construction of
    the pure-Python renderer.
    """
    try:
        return resolve_tokenizer_path(tokenizer)
    except (ValueError, OSError) as exc:
        logger.info(
            "RENDERERS_NATIVE selected %s but no native tokenizer path was "
            "available (%s); falling back to pure Python.",
            family,
            exc,
        )
        return None
try:
    import renderers_native  # type: ignore[import-not-found]

    _NATIVE = renderers_native
except ImportError:
    # Extension not built; get_qwen_vl_processor raises on first use.
    _NATIVE = None


# Processors keyed by (min_pixels, max_pixels, patch_size,
# temporal_patch_size, merge_size) with defaults already substituted.
_PROCESSOR_CACHE: dict[tuple[int, int, int, int, int], Any] = {}


def get_qwen_vl_processor(
    *,
    min_pixels: int | None = None,
    max_pixels: int | None = None,
    patch_size: int = 14,
    temporal_patch_size: int = 2,
    merge_size: int = 2,
):
    """Return a cached ``Qwen3VlImageProcessor`` with the given config.

    ``min_pixels`` / ``max_pixels`` left as ``None`` are replaced with
    ``56 * 56`` and ``28 * 28 * 1280`` before the cache lookup, so passing
    those values explicitly hits the same cache entry.

    Raises ``RuntimeError`` if the native extension isn't built. The
    processor itself is cheap to construct (no model weights) so the
    cache here is just a courtesy — repeated calls with the same kwargs
    return the same handle.
    """
    if _NATIVE is None:
        raise RuntimeError(
            "renderers_native is not installed; build it with "
            "`maturin develop --manifest-path crates/renderers-py/Cargo.toml --release`"
        )
    key = (
        min_pixels if min_pixels is not None else 56 * 56,
        max_pixels if max_pixels is not None else 28 * 28 * 1280,
        patch_size,
        temporal_patch_size,
        merge_size,
    )
    cached = _PROCESSOR_CACHE.get(key)
    if cached is None:
        cached = _NATIVE.Qwen3VlImageProcessor(
            min_pixels=key[0],
            max_pixels=key[1],
            patch_size=key[2],
            temporal_patch_size=key[3],
            merge_size=key[4],
        )
        _PROCESSOR_CACHE[key] = cached
    return cached


def process_image_for_qwen_vl(
    image: Any,
    *,
    message_idx: int,
    return_numpy: bool = True,
    **processor_kwargs,
) -> dict[str, Any]:
    """Process a single image into the dict shape
    ``Renderer.render_with_media`` expects.

    Args:
        image: Either ``bytes``-like raw image data, a filesystem path
            (``str`` or any ``os.PathLike`` such as ``pathlib.Path``), or
            a PIL ``Image.Image`` instance.
        message_idx: Index of the user message this image is attached
            to. Threaded into the returned dict so the caller can
            ``[*items]`` straight into ``render_with_media``.
        return_numpy: When True (default), ``pixel_values`` and
            ``image_grid_thw`` are returned as numpy arrays. Set
            False to get ``{"shape": [...], "data": [...]}`` dicts of
            plain Python lists instead (useful for JSON serialisation).
        **processor_kwargs: Forwarded to
            ``get_qwen_vl_processor`` (``min_pixels`` / ``max_pixels`` /
            ``patch_size`` / ``temporal_patch_size`` / ``merge_size``).

    Returns:
        A dict shaped as
        ``{"message_idx", "modality", "num_tokens", "hash", "hf_payload"}``.

    Raises:
        RuntimeError: if the native extension is not installed.
    """
    import os  # local: the module's top-level imports do not include os

    proc = get_qwen_vl_processor(**processor_kwargs)

    if isinstance(image, (bytes, bytearray, memoryview)):
        raw = bytes(image)
        out = proc.process_bytes(raw)
    elif isinstance(image, (str, os.PathLike)):
        # ``pathlib.Path`` used to fall through to the PIL branch and fail
        # on ``.convert``; normalise every path-like to ``str`` here.
        out = proc.process_path(os.fsdecode(image))
    else:
        # Treat as PIL Image — re-encode to PNG bytes.
        buf = io.BytesIO()
        image.convert("RGB").save(buf, format="PNG")
        out = proc.process_bytes(buf.getvalue())

    import numpy as np  # local to keep import cost off the hot path

    pv = out["hf_payload"]["pixel_values"]
    gt = out["hf_payload"]["image_grid_thw"]

    def _as_array(value, dtype):
        # Accept both the {"shape", "data"} dict form and array-likes.
        if isinstance(value, dict):
            return np.asarray(value["data"], dtype=dtype).reshape(tuple(value["shape"]))
        return np.asarray(value, dtype=dtype)

    pixel_values = _as_array(pv, np.float32)
    image_grid_thw = _as_array(gt, np.int64)

    if return_numpy:
        out["hf_payload"] = {
            "pixel_values": pixel_values,
            "image_grid_thw": image_grid_thw,
        }
    else:
        out["hf_payload"] = {
            "pixel_values": {
                "shape": list(pixel_values.shape),
                "data": pixel_values.reshape(-1).tolist(),
            },
            "image_grid_thw": {
                "shape": list(image_grid_thw.shape),
                "data": image_grid_thw.reshape(-1).tolist(),
            },
        }

    out["message_idx"] = message_idx
    return out
None, ) -> Renderer: """Create a Renderer from a typed config. @@ -1316,16 +1318,23 @@ def create_renderer( template-control kwargs (e.g. ``enable_thinking``), pass the specific :class:`Qwen3RendererConfig`, :class:`GLM5RendererConfig` etc. and set those fields. + renderer: Backward-compatible renderer name. Prefer ``config=`` for + new code; ``renderer="auto"`` is equivalent to ``config=None``. Selecting the auto-renderer for a model without a registered renderer falls back to :class:`DefaultRenderer` for text-only models and raises for VLMs (where ``apply_chat_template`` would silently drop images). """ - from renderers.configs import AutoRendererConfig + from renderers.configs import AutoRendererConfig, config_from_name _populate_registry() + if renderer is not None: + if config is not None: + raise TypeError("pass either config= or renderer=, not both") + config = config_from_name(renderer) + if config is None: config = AutoRendererConfig() diff --git a/renderers/deepseek_v3.py b/renderers/deepseek_v3.py index 4efe3ef..dfdd3ba 100644 --- a/renderers/deepseek_v3.py +++ b/renderers/deepseek_v3.py @@ -16,6 +16,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -52,6 +57,28 @@ class DeepSeekV3Renderer: no-ops here too; stored for protocol uniformity. 
""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: DeepSeekV3RendererConfig | None = None, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if config is not None: + enable_thinking = config.enable_thinking + + if native_enabled("deepseek_v3") or native_enabled("deepseek-v3"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.deepseek_v3( + path, + enable_thinking=enable_thinking, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, diff --git a/renderers/default.py b/renderers/default.py index e969421..7c70f2f 100644 --- a/renderers/default.py +++ b/renderers/default.py @@ -13,6 +13,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -88,6 +93,59 @@ class DefaultRenderer: :class:`renderers.DefaultRendererConfig`). """ + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: DefaultRendererConfig | None = None, + *, + tool_parser=None, + reasoning_parser=None, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + **chat_template_kwargs, + ): + if config is not None: + tool_parser = config.tool_parser + reasoning_parser = config.reasoning_parser + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + chat_template_kwargs = dict(config.model_extra or {}) + + # Native routing: only when there are no plugged parsers and no + # exotic chat_template kwargs — the Rust path uses minijinja and + # doesn't know about Python-side parser instances. 
+ if ( + native_enabled("default") + and tool_parser is None + and reasoning_parser is None + and not preserve_all_thinking + and not preserve_thinking_between_tool_calls + and not chat_template_kwargs + ): + native = load_native() + if native is not None: + ct = getattr(tokenizer, "chat_template", None) + if isinstance(ct, str) and ct: + path = resolve_tokenizer_path(tokenizer) + stop = ( + [tokenizer.eos_token_id] + if getattr(tokenizer, "eos_token_id", None) is not None + else None + ) + extras = { + "bos_token": getattr(tokenizer, "bos_token", None) or "", + "eos_token": getattr(tokenizer, "eos_token", None) or "", + } + return native.Renderer.default_renderer( + path, + ct, + stop_token_ids=stop, + extra_context=extras, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, diff --git a/renderers/glm45.py b/renderers/glm45.py index efea47b..33f6e8e 100644 --- a/renderers/glm45.py +++ b/renderers/glm45.py @@ -15,6 +15,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -51,6 +56,34 @@ class GLM45Renderer: """Deterministic message → token renderer for GLM-4.5 Air models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: GLM45RendererConfig | None = None, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + + if native_enabled("glm45"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.glm45( + path, + enable_thinking=enable_thinking, + 
# Family name matched against ``RENDERERS_NATIVE`` and the name of the
# ``native.Renderer`` constructor to call. Overridden in GLM51Renderer.
_NATIVE_KEY = "glm5"
_NATIVE_METHOD = "glm5"

def __new__(
    cls,
    tokenizer: PreTrainedTokenizer,
    config: GLM5RendererConfig | GLM51RendererConfig | None = None,
    *,
    enable_thinking: bool = True,
    preserve_all_thinking: bool = False,
    preserve_thinking_between_tool_calls: bool = False,
):
    """Build a native renderer when opted in, else a plain instance.

    Values on *config*, when given, take precedence over the keyword
    arguments. If ``RENDERERS_NATIVE`` enables ``cls._NATIVE_KEY`` and the
    extension loads, the constructor named ``cls._NATIVE_METHOD`` on
    ``native.Renderer`` is called and its result returned in place of an
    instance of this class.
    """
    if config is not None:
        enable_thinking = config.enable_thinking
        preserve_all_thinking = config.preserve_all_thinking
        preserve_thinking_between_tool_calls = (
            config.preserve_thinking_between_tool_calls
        )

    native = load_native() if native_enabled(cls._NATIVE_KEY) else None
    if native is None:
        # Not opted in, or the extension is missing: pure-Python path.
        return super().__new__(cls)

    tokenizer_path = resolve_tokenizer_path(tokenizer)
    build = getattr(native.Renderer, cls._NATIVE_METHOD)
    return build(
        tokenizer_path,
        enable_thinking=enable_thinking,
        preserve_all_thinking=preserve_all_thinking,
        preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
    )
@@ -646,6 +684,8 @@ class GLM51Renderer(GLM5Renderer): empty_think_on_last_assistant = True _config_cls = GLM51RendererConfig + _NATIVE_KEY = "glm51" + _NATIVE_METHOD = "glm51" @staticmethod def _format_tool_spec(tool: ToolSpec) -> str: diff --git a/renderers/gpt_oss.py b/renderers/gpt_oss.py index f1bb04a..1d8fb99 100644 --- a/renderers/gpt_oss.py +++ b/renderers/gpt_oss.py @@ -51,6 +51,10 @@ ) from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, +) from renderers.base import ( Message, ParsedResponse, @@ -119,6 +123,47 @@ def _arguments_to_str(arguments: Any) -> str: class GptOssRenderer: """Deterministic message → token renderer for OpenAI gpt-oss (harmony).""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: GptOssRendererConfig | None = None, + *, + use_system_prompt: bool = True, + reasoning_effort: str | None = "medium", + conversation_start_date: str | None = None, + knowledge_cutoff: str | None = None, + model_identity: str | None = None, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if config is not None: + use_system_prompt = config.use_system_prompt + reasoning_effort = config.reasoning_effort + conversation_start_date = config.conversation_start_date + knowledge_cutoff = config.knowledge_cutoff + model_identity = config.model_identity + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + + if native_enabled("gpt_oss") or native_enabled("gpt-oss"): + native = load_native() + if native is not None: + # GPT-OSS embeds its own tokenizer; the tokenizer_path + # argument is ignored on the native side. 
def __new__(
    cls,
    tokenizer: PreTrainedTokenizer,
    config: KimiK2RendererConfig | None = None,
    *,
    enable_thinking: bool = True,
    preserve_all_thinking: bool = False,
    preserve_thinking_between_tool_calls: bool = False,
):
    """Build a native renderer when opted in, else a plain instance.

    Values on *config*, when given, take precedence over the keyword
    arguments. The native path is taken only if ``RENDERERS_NATIVE``
    enables ``kimi_k2`` (or ``kimi-k2``), the extension loads, and
    ``try_resolve_tokenizer_path`` yields a path; in every other case a
    regular instance of this class is created.
    """
    if config is not None:
        enable_thinking = config.enable_thinking
        preserve_all_thinking = config.preserve_all_thinking
        preserve_thinking_between_tool_calls = (
            config.preserve_thinking_between_tool_calls
        )

    wants_native = native_enabled("kimi_k2") or native_enabled("kimi-k2")
    native = load_native() if wants_native else None
    tokenizer_path = (
        try_resolve_tokenizer_path(tokenizer, "kimi_k2")
        if native is not None
        else None
    )
    if tokenizer_path is None:
        # Not opted in, extension missing, or no tokenizer file available.
        return super().__new__(cls)

    return native.Renderer.kimi_k2(
        tokenizer_path,
        enable_thinking=enable_thinking,
        preserve_all_thinking=preserve_all_thinking,
        preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls,
    )
@@ -27,6 +27,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + try_resolve_tokenizer_path, +) from renderers.base import ( Message, MultiModalData, @@ -49,6 +54,23 @@ _load_pil_image, ) + +def _messages_have_media(messages: list[Message]) -> bool: + """Return True if any message carries image / video content parts.""" + for m in messages: + c = m.get("content") if isinstance(m, dict) else getattr(m, "content", None) + if isinstance(c, list): + for p in c: + if isinstance(p, dict) and p.get("type") in ( + "image", + "image_url", + "video", + "video_url", + ): + return True + return False + + # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -576,6 +598,17 @@ class KimiK25Renderer: The tokenizer should be ``moonshotai/Kimi-K2-Instruct`` (same as K2). """ + def __new__( + cls, + tokenizer, + config: KimiK25RendererConfig | None = None, + *, + processor=None, + # Tools / messages are bound to render-time, so native routing + # happens inside render() via a cached text-only delegate. 
+ ): + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, @@ -586,6 +619,18 @@ def __init__( self._tokenizer = tokenizer self._processor = processor self.config = config or KimiK25RendererConfig() + self._native_renderer = None + if native_enabled("kimi_k25") and processor is None: + native = load_native() + if native is not None: + path = try_resolve_tokenizer_path(tokenizer, "kimi_k25") + if path is not None: + self._native_renderer = native.Renderer.kimi_k25( + path, + enable_thinking=self.config.thinking, + preserve_all_thinking=self.config.preserve_all_thinking, + preserve_thinking_between_tool_calls=self.config.preserve_thinking_between_tool_calls, + ) # Core structural tokens — all must be single special tokens in the vocab self._im_user = self._token_id("<|im_user|>") @@ -627,6 +672,22 @@ def __init__( # consistency / debugging. self._image_cache: dict[str, tuple[Any, int]] = {} + @staticmethod + def _content_has_media(content: Any) -> bool: + if not isinstance(content, list): + return False + return any( + isinstance(part, dict) and (_is_image_part(part) or _is_video_part(part)) + for part in content + ) + + def _can_use_native( + self, messages: list[Message], tools: list[ToolSpec] | None + ) -> bool: + if self._native_renderer is None or tools: + return False + return not any(self._content_has_media(msg.get("content")) for msg in messages) + @property def mm_token_type_id_map(self) -> dict[int, int]: """Token-id → modality marker. 
For Kimi K2.5 only ``<|media_pad|>`` @@ -729,6 +790,13 @@ def render( - Generation prompt: ``<|im_assistant|>assistant<|im_middle|>`` + ```` (or ```` when thinking off) """ + if self._can_use_native(messages, tools): + return self._native_renderer.render( + messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + ) + if not messages: raise ValueError("No messages provided.") @@ -956,6 +1024,14 @@ def render_ids( tools: list[ToolSpec] | None = None, add_generation_prompt: bool = False, ) -> list[int]: + if self._can_use_native(messages, tools): + return list( + self._native_renderer.render_ids( + messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + ) + ) return self.render( messages, tools=tools, diff --git a/renderers/minimax_m2.py b/renderers/minimax_m2.py index 39c12fa..e477e1b 100644 --- a/renderers/minimax_m2.py +++ b/renderers/minimax_m2.py @@ -16,6 +16,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -53,6 +58,29 @@ class MiniMaxM2Renderer: """Deterministic message → token renderer for MiniMax M2 / M2.5 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: MiniMaxM2RendererConfig | None = None, + ): + # Native routing: only when the caller relies on the default + # system message; a custom model_identity isn't wired through to + # the native classmethod yet. 
+ cfg = config or MiniMaxM2RendererConfig() + default_identity = MiniMaxM2RendererConfig().model_identity + if ( + native_enabled("minimax_m2") or native_enabled("minimax-m2") + ) and cfg.model_identity == default_identity: + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.minimax_m2( + path, + preserve_all_thinking=cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls=cfg.preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index 06d9d4d..49a1b9c 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -19,6 +19,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -77,6 +82,34 @@ def _render_extra_keys(obj: dict[str, Any], handled_keys: set[str]) -> list[str] class Nemotron3Renderer: """Deterministic message → token renderer for Nemotron 3 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: Nemotron3RendererConfig | None = None, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + + if native_enabled("nemotron3"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.nemotron3( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def 
__init__( self, tokenizer: PreTrainedTokenizer, diff --git a/renderers/qwen3.py b/renderers/qwen3.py index fe97561..2b8210d 100644 --- a/renderers/qwen3.py +++ b/renderers/qwen3.py @@ -5,6 +5,14 @@ - Tool calls use JSON format: {"name": "...", "arguments": ...} - Thinking blocks only inserted when loop.last OR reasoning_content present - Generation prompt does NOT add by default + +# Native (Rust) routing + +When ``RENDERERS_NATIVE`` selects ``qwen3`` (see +``renderers._native_router``) and the native extension is available, +``Qwen3Renderer(...)`` returns an instance of the Rust implementation +instead of this Python class. The returned object satisfies the same +duck-typed Renderer protocol, so callers don't need to special-case it. """ from __future__ import annotations @@ -13,6 +21,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, ParsedResponse, @@ -46,11 +59,46 @@ class Qwen3Renderer: """Deterministic message → token renderer for Qwen3 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: Qwen3RendererConfig | None = None, + *, + enable_thinking: bool = True, + preserve_all_thinking: bool = False, + preserve_thinking_between_tool_calls: bool = False, + ): + if config is not None: + enable_thinking = config.enable_thinking + preserve_all_thinking = config.preserve_all_thinking + preserve_thinking_between_tool_calls = ( + config.preserve_thinking_between_tool_calls + ) + + # Native routing: when ``RENDERERS_NATIVE`` opts qwen3 into the + # Rust path and the extension is installed, return the native + # instance directly. Otherwise fall through to the pure-Python + # constructor below. 
+ if native_enabled("qwen3"): + native = load_native() + if native is not None: + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.qwen3( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=preserve_all_thinking, + preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + def __init__( self, tokenizer: PreTrainedTokenizer, config: Qwen3RendererConfig | None = None, ): + # If __new__ returned a native instance, Python won't call this + # __init__ (different type). For the pure-Python instance, do + # the normal setup. self._tokenizer = tokenizer self.config = config or Qwen3RendererConfig() diff --git a/renderers/qwen35.py b/renderers/qwen35.py index b3c6af7..7498cfc 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -19,6 +19,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.base import ( Message, MultiModalData, @@ -109,6 +114,34 @@ def _default_enable_thinking(tokenizer) -> bool: class Qwen35Renderer: """Deterministic message → token renderer for Qwen3.5 models.""" + def __new__( + cls, + tokenizer: PreTrainedTokenizer, + config: Qwen35RendererConfig | None = None, + *, + processor: Any = None, + ): + # Route to native only when: + # 1. the user opted in via RENDERERS_NATIVE, + # 2. the wheel is installed, + # 3. the message stream is text-only (no processor / images). + # Phase 5 will lift restriction 3. 
+ if native_enabled("qwen35") and processor is None: + native = load_native() + if native is not None: + cfg = config or cls._config_cls() + enable_thinking = cfg.enable_thinking + if enable_thinking is None: + enable_thinking = _default_enable_thinking(tokenizer) + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.qwen35( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls=cfg.preserve_thinking_between_tool_calls, + ) + return super().__new__(cls) + _config_cls: type = Qwen35RendererConfig def __init__( diff --git a/renderers/qwen36.py b/renderers/qwen36.py index 6adf867..4606273 100644 --- a/renderers/qwen36.py +++ b/renderers/qwen36.py @@ -23,8 +23,13 @@ import json from typing import Any +from renderers._native_router import ( + load_native, + native_enabled, + resolve_tokenizer_path, +) from renderers.configs import Qwen36RendererConfig -from renderers.qwen35 import Qwen35Renderer +from renderers.qwen35 import Qwen35Renderer, _default_enable_thinking class Qwen36Renderer(Qwen35Renderer): @@ -32,6 +37,34 @@ class Qwen36Renderer(Qwen35Renderer): _config_cls = Qwen36RendererConfig + def __new__( + cls, + tokenizer, + config: Qwen36RendererConfig | None = None, + *, + processor=None, + ): + # Route to native only for Qwen3.6 specifically — never fall + # through to the parent's qwen35 router (the renderer flag is + # different). 
+ if native_enabled("qwen36") and processor is None: + native = load_native() + if native is not None: + cfg = config or Qwen36RendererConfig() + enable_thinking = cfg.enable_thinking + if enable_thinking is None: + enable_thinking = _default_enable_thinking(tokenizer) + path = resolve_tokenizer_path(tokenizer) + return native.Renderer.qwen36( + path, + enable_thinking=enable_thinking, + preserve_all_thinking=cfg.preserve_all_thinking, + preserve_thinking_between_tool_calls=cfg.preserve_thinking_between_tool_calls, + ) + # Skip Qwen35Renderer.__new__ (would also try to route, with the + # wrong flag). Go straight to object. + return object.__new__(cls) + @staticmethod def _render_arg_value(arg_value: Any) -> str: if isinstance(arg_value, str): diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..85f3606 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,4 @@ +[toolchain] +channel = "stable" +components = ["rustfmt", "clippy"] +profile = "minimal" diff --git a/tests/test_client.py b/tests/test_client.py index 1cc1000..a543c38 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -299,6 +299,18 @@ def test_generate_serializes_multimodal_features_for_qwen_vl_family( pytest.importorskip("torch") pytest.importorskip("vllm", reason="vllm needed for features serialization") + pytest.importorskip( + "vllm.entrypoints.serve.disagg.mm_serde", + reason="vLLM multimodal serializer is not available", + ) + pytest.importorskip( + "vllm.model_executor.models.qwen2_vl", + reason="vLLM Qwen-VL field factory is not available", + ) + pytest.importorskip( + "vllm.multimodal.inputs", + reason="vLLM multimodal input wrappers are not available", + ) import torch as _torch from renderers.base import ( diff --git a/tests/test_native_numpy.py b/tests/test_native_numpy.py new file mode 100644 index 0000000..5b669a4 --- /dev/null +++ b/tests/test_native_numpy.py @@ -0,0 +1,252 @@ +"""NumPy fast-path coverage for the native PyO3 module.""" + 
+from __future__ import annotations + +import os + +import numpy as np +import pytest + +from renderers import _native_router as router + +TOOLS = [ + { + "name": "get_weather", + "description": "Get current weather.", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + } +] + + +@pytest.fixture(scope="module") +def qwen3_native(): + native = router.load_native() + if native is None: + pytest.skip("renderers_native not built; run `maturin develop`") + + try: + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer("Qwen/Qwen3-8B") + tok_path = router.resolve_tokenizer_path(tokenizer) + except Exception as exc: + pytest.skip(f"could not resolve Qwen3 tokenizer: {exc}") + if not os.path.exists(tok_path): + pytest.skip(f"tokenizer.json missing on disk at {tok_path}") + + return native.Renderer.qwen3(tok_path) + + +def test_render_ids_np_matches_list_api(qwen3_native): + messages = [ + {"role": "system", "content": "You are concise."}, + {"role": "user", "content": "Say hi."}, + ] + + ids = qwen3_native.render_ids_np(messages, add_generation_prompt=True) + + assert ids.dtype == np.uint32 + assert ids.tolist() == qwen3_native.render_ids( + messages, + add_generation_prompt=True, + ) + + +def test_parse_response_np_borrows_uint32_completion(qwen3_native): + prompt = [{"role": "user", "content": "What is 2+2?"}] + assistant = {"role": "assistant", "content": "4"} + prompt_ids = qwen3_native.render_ids_np(prompt, add_generation_prompt=True) + full_ids = qwen3_native.render_ids_np(prompt + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + parsed = qwen3_native.parse_response_np(completion_ids) + + assert parsed.content == "4" + + +def test_bridge_to_next_turn_np_matches_list_api(qwen3_native): + prompt = [{"role": "user", "content": "Plan Saturday."}] + assistant = {"role": "assistant", "content": "Start with breakfast."} + new_messages = [{"role": "user", "content": "Add one 
museum."}] + + prompt_ids = qwen3_native.render_ids_np(prompt, add_generation_prompt=True) + full_ids = qwen3_native.render_ids_np(prompt + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + bridged_np = qwen3_native.bridge_to_next_turn_np( + prompt_ids, + completion_ids, + new_messages, + ) + bridged_list = qwen3_native.bridge_to_next_turn( + prompt_ids.tolist(), + completion_ids.tolist(), + new_messages, + ) + + assert bridged_np is not None + assert bridged_list is not None + assert bridged_np.dtype == np.uint32 + assert bridged_np.tolist() == bridged_list.token_ids + + +def test_prepared_tools_match_raw_tools(qwen3_native): + messages = [ + {"role": "system", "content": "You call tools when useful."}, + {"role": "user", "content": "Weather in Paris?"}, + ] + prepared = qwen3_native.prepare_tools(TOOLS) + + raw_ids = qwen3_native.render_ids( + messages, + tools=TOOLS, + add_generation_prompt=True, + ) + prepared_ids = qwen3_native.render_ids( + messages, + tools=prepared, + add_generation_prompt=True, + ) + + assert len(prepared) == 1 + assert prepared_ids == raw_ids + + +def test_render_batch_ids_matches_single_calls(qwen3_native): + batch = [ + [{"role": "user", "content": "Say hi."}], + [{"role": "user", "content": "Say bye."}], + ] + + batch_ids = qwen3_native.render_batch_ids(batch, add_generation_prompt=True) + + assert batch_ids == [ + qwen3_native.render_ids(messages, add_generation_prompt=True) + for messages in batch + ] + + +def test_render_batch_ids_np_packed_matches_single_calls(qwen3_native): + batch = [ + [{"role": "user", "content": "A"}], + [{"role": "user", "content": "B"}], + [{"role": "user", "content": "C"}], + ] + + ids, offsets = qwen3_native.render_batch_ids_np_packed( + batch, + add_generation_prompt=True, + ) + + assert ids.dtype == np.uint32 + assert offsets.dtype == np.int64 + assert offsets.tolist()[0] == 0 + assert len(offsets) == len(batch) + 1 + unpacked = [ + ids[offsets[idx] : offsets[idx + 1]].tolist() for idx in 
range(len(batch)) + ] + assert unpacked == [ + qwen3_native.render_ids(messages, add_generation_prompt=True) + for messages in batch + ] + + +def test_render_fast_ids_matches_dict_messages(qwen3_native): + roles = ["system", "user", "assistant"] + contents = ["You are concise.", "Say hi.", "Hi."] + messages = [ + {"role": role, "content": content} + for role, content in zip(roles, contents, strict=True) + ] + + fast_ids = qwen3_native.render_fast_ids( + roles, + contents, + add_generation_prompt=True, + ) + fast_np = qwen3_native.render_fast_ids_np( + roles, + contents, + add_generation_prompt=True, + ) + regular_ids = qwen3_native.render_ids( + messages, + add_generation_prompt=True, + ) + + assert fast_ids == regular_ids + assert fast_np.dtype == np.uint32 + assert fast_np.tolist() == regular_ids + + +def test_session_render_and_bridge_match_renderer(qwen3_native): + prompt = [{"role": "user", "content": "Plan Saturday."}] + assistant = {"role": "assistant", "content": "Start with breakfast."} + new_messages = [{"role": "user", "content": "Add one museum."}] + session = qwen3_native.new_session(prompt) + + session_prompt = session.render_ids(add_generation_prompt=True) + full_ids = qwen3_native.render_ids(prompt + [assistant]) + completion_ids = full_ids[len(session_prompt) :] + session_bridge = session.bridge_to_next_turn(completion_ids, new_messages) + direct_bridge = qwen3_native.bridge_to_next_turn( + session_prompt, + completion_ids, + new_messages, + ) + + assert session_prompt == qwen3_native.render_ids( + prompt, + add_generation_prompt=True, + ) + assert session_bridge is not None + assert direct_bridge is not None + assert session_bridge.token_ids == direct_bridge.token_ids + + +def test_session_fork_preserves_prompt_state(qwen3_native): + prompt = [{"role": "user", "content": "Plan Monday."}] + assistant = {"role": "assistant", "content": "Start with tea."} + new_messages = [{"role": "user", "content": "Add one errand."}] + session = 
qwen3_native.new_session(prompt) + session_prompt = session.render_ids(add_generation_prompt=True) + forked = session.fork() + + full_ids = qwen3_native.render_ids(prompt + [assistant]) + completion_ids = full_ids[len(session_prompt) :] + forked_bridge = forked.bridge_to_next_turn(completion_ids, new_messages) + direct_bridge = qwen3_native.bridge_to_next_turn( + session_prompt, + completion_ids, + new_messages, + ) + + assert forked_bridge is not None + assert direct_bridge is not None + assert forked_bridge.token_ids == direct_bridge.token_ids + + +def test_session_numpy_bridge_match_renderer(qwen3_native): + prompt = [{"role": "user", "content": "Plan Sunday."}] + assistant = {"role": "assistant", "content": "Start with a walk."} + new_messages = [{"role": "user", "content": "Add coffee."}] + session = qwen3_native.new_session(prompt) + + session_prompt = session.render_ids_np(add_generation_prompt=True) + full_ids = qwen3_native.render_ids_np(prompt + [assistant]) + completion_ids = full_ids[len(session_prompt) :] + session_bridge = session.bridge_to_next_turn_np(completion_ids, new_messages) + direct_bridge = qwen3_native.bridge_to_next_turn_np( + session_prompt, + completion_ids, + new_messages, + ) + + assert session_bridge is not None + assert direct_bridge is not None + assert session_bridge.dtype == np.uint32 + assert session_bridge.tolist() == direct_bridge.tolist() diff --git a/tests/test_native_parity.py b/tests/test_native_parity.py new file mode 100644 index 0000000..64d566a --- /dev/null +++ b/tests/test_native_parity.py @@ -0,0 +1,427 @@ +"""Byte-for-byte parity: native (Rust) vs pure-Python. + +For every family that has been ported to Rust, build *both* a +pure-Python renderer and a native renderer from the same tokenizer and +assert their outputs are identical across a representative set of +conversation shapes. 
+ +This complements two existing parity gates: + +- ``tests/test_render_ids.py`` — Python (or, when the env var routes, + native) vs HuggingFace's ``apply_chat_template``. Catches drift from + the upstream reference. Run the suite with + ``RENDERERS_NATIVE=qwen3 pytest tests/test_render_ids.py`` to exercise + the native path through that gate. +- This file — Python vs native, holding the reference fixed. Catches + drift between the two implementations even if HF changes its + template. Cheaper because the HF call isn't on the path. + +Both tests require a real ``tokenizer.json`` on disk. The fixtures here +skip with a clear message when the tokenizer can't be located or the +native extension isn't built — so the test file is safe to import in +sandboxed CI where neither is available. +""" + +from __future__ import annotations + +import os +from typing import Any + +import pytest + +from renderers import _native_router as router + +pytestmark = pytest.mark.parity + + +# ── Test matrix ────────────────────────────────────────────────────── + + +# (model_id, family-key, extra-kwargs) +NATIVE_PARITY_FAMILIES = [ + ("Qwen/Qwen3-8B", "qwen3", {}), + ("Qwen/Qwen3.5-9B", "qwen35", {}), + ("Qwen/Qwen3.6-35B-A3B", "qwen36", {}), + ("zai-org/GLM-5", "glm5", {}), + ("zai-org/GLM-5.1", "glm51", {}), + ("THUDM/GLM-4.5-Air", "glm45", {}), + ("deepseek-ai/DeepSeek-V3", "deepseek_v3", {}), + ("moonshotai/Kimi-K2-Instruct", "kimi_k2", {}), + ("MiniMaxAI/MiniMax-M2.5", "minimax_m2", {}), + ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "nemotron3", {}), +] + + +# ── Fixtures ───────────────────────────────────────────────────────── + + +@pytest.fixture(scope="module") +def native_module(): + mod = router.load_native() + if mod is None: + pytest.skip("renderers_native not built; run `maturin develop`") + return mod + + +@pytest.fixture(scope="module", params=NATIVE_PARITY_FAMILIES, ids=lambda p: p[1]) +def native_pair(request, native_module): + """Return ``(py_renderer, 
native_renderer, tokenizer)`` for one family.""" + model_id, family, extra = request.param + + # Locate tokenizer.json on disk. Skip cleanly if not in HF cache — + # this test is most useful locally with a real model snapshot. + try: + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer(model_id) + except Exception as exc: + pytest.skip(f"could not load tokenizer for {model_id}: {exc}") + + try: + tok_path = router.resolve_tokenizer_path(tokenizer) + except Exception as exc: + pytest.skip(f"could not resolve tokenizer.json for {model_id}: {exc}") + if not os.path.exists(tok_path): + pytest.skip(f"tokenizer.json missing on disk at {tok_path}") + + # Build the pure-Python renderer with the env var explicitly off so + # the ``__new__`` routing doesn't return a native instance. + saved = os.environ.pop("RENDERERS_NATIVE", None) + try: + py_renderer = _build_python_renderer(family, tokenizer, extra) + if py_renderer is None: + pytest.skip(f"no python builder wired for {family}") + finally: + if saved is not None: + os.environ["RENDERERS_NATIVE"] = saved + + # Build the native renderer directly through the module surface — + # bypasses the env-var routing entirely. 
+ native_renderer = _build_native_renderer(native_module, family, tok_path, extra) + if native_renderer is None: + pytest.skip(f"no native builder wired for {family}") + + return py_renderer, native_renderer, tokenizer + + +# ── Family-specific builder dispatch ───────────────────────────────── + + +def _build_python_renderer(family: str, tokenizer, extra): + """Return a pure-Python renderer for *family*, or ``None`` if missing.""" + if family == "qwen3": + from renderers.qwen3 import Qwen3Renderer + + return Qwen3Renderer(tokenizer, **extra) + if family == "qwen35": + from renderers.qwen35 import Qwen35Renderer + + return Qwen35Renderer(tokenizer, **extra) + if family == "qwen36": + from renderers.qwen36 import Qwen36Renderer + + return Qwen36Renderer(tokenizer, **extra) + if family == "glm5": + from renderers.glm5 import GLM5Renderer + + return GLM5Renderer(tokenizer, **extra) + if family == "glm51": + from renderers.glm5 import GLM51Renderer + + return GLM51Renderer(tokenizer, **extra) + if family == "glm45": + from renderers.glm45 import GLM45Renderer + + return GLM45Renderer(tokenizer, **extra) + if family == "deepseek_v3": + from renderers.deepseek_v3 import DeepSeekV3Renderer + + return DeepSeekV3Renderer(tokenizer, **extra) + if family == "kimi_k2": + from renderers.kimi_k2 import KimiK2Renderer + + return KimiK2Renderer(tokenizer, **extra) + if family == "minimax_m2": + from renderers.minimax_m2 import MiniMaxM2Renderer + + return MiniMaxM2Renderer(tokenizer, **extra) + if family == "nemotron3": + from renderers.nemotron3 import Nemotron3Renderer + + return Nemotron3Renderer(tokenizer, **extra) + return None + + +def _build_native_renderer(native_module, family: str, tok_path: str, extra): + """Return a native renderer for *family* via the explicit factory.""" + factory = { + "qwen3": native_module.Renderer.qwen3, + "qwen35": native_module.Renderer.qwen35, + "qwen36": native_module.Renderer.qwen36, + "glm5": native_module.Renderer.glm5, + "glm51": 
native_module.Renderer.glm51, + "glm45": native_module.Renderer.glm45, + "deepseek_v3": native_module.Renderer.deepseek_v3, + "kimi_k2": native_module.Renderer.kimi_k2, + "minimax_m2": native_module.Renderer.minimax_m2, + "nemotron3": native_module.Renderer.nemotron3, + }.get(family) + if factory is None: + return None + return factory(tok_path, **extra) + + +# ── Conversation fixtures (a representative cross-section) ─────────── + + +CONVERSATIONS: list[tuple[str, list[dict[str, Any]]]] = [ + ( + "system_and_user", + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello!"}, + ], + ), + ( + "single_turn", + [ + {"role": "system", "content": "You are a math tutor."}, + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "4"}, + ], + ), + ( + "no_system_message", + [ + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Hi there!"}, + ], + ), + ( + "multi_turn", + [ + {"role": "user", "content": "A"}, + {"role": "assistant", "content": "B"}, + {"role": "user", "content": "C"}, + {"role": "assistant", "content": "D"}, + ], + ), + ( + "reasoning_content_field", + [ + {"role": "user", "content": "What is 2+2?"}, + { + "role": "assistant", + "reasoning_content": "Simple arithmetic", + "content": "4", + }, + ], + ), + ( + "tool_call_single", + [ + {"role": "user", "content": "What's the weather in Paris?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + }, + } + ], + }, + ], + ), + ( + "tool_call_with_response", + [ + {"role": "user", "content": "Weather?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_weather", + "arguments": {"city": "Paris"}, + }, + } + ], + }, + {"role": "tool", "content": "sunny, 22°C"}, + {"role": "assistant", "content": "It's sunny and 22°C in Paris."}, + ], + ), +] + + 
+TOOLS = [ + { + "name": "get_weather", + "description": "Get current weather for a city.", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + } +] + + +# ── Tests ──────────────────────────────────────────────────────────── + + +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) +def test_render_ids_parity(native_pair, case, messages): + py_renderer, native_renderer, _tok = native_pair + py_ids = list(py_renderer.render_ids(messages)) + rs_ids = list(native_renderer.render_ids(messages)) + assert py_ids == rs_ids, ( + f"render_ids mismatch for {case}:\n" + f" python: {py_ids[:30]}... (len={len(py_ids)})\n" + f" native: {rs_ids[:30]}... (len={len(rs_ids)})" + ) + + +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) +def test_render_ids_with_gen_prompt_parity(native_pair, case, messages): + py_renderer, native_renderer, _tok = native_pair + py_ids = list(py_renderer.render_ids(messages, add_generation_prompt=True)) + rs_ids = list(native_renderer.render_ids(messages, add_generation_prompt=True)) + assert py_ids == rs_ids + + +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) +def test_render_ids_with_tools_parity(native_pair, case, messages): + py_renderer, native_renderer, _tok = native_pair + py_ids = list(py_renderer.render_ids(messages, tools=TOOLS)) + rs_ids = list(native_renderer.render_ids(messages, tools=TOOLS)) + assert py_ids == rs_ids + + +def test_qwen35_structured_text_parts_parity(native_pair): + py_renderer, native_renderer, _tok = native_pair + if type(py_renderer).__name__ not in {"Qwen35Renderer", "Qwen36Renderer"}: + pytest.skip("structured text part coverage is specific to Qwen3.5/Qwen3.6") + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Hello"}, + {"type": "text", 
"text": " from structured parts"}, + ], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Structured"}, + {"type": "text", "text": " reply"}, + ], + }, + ] + + py_ids = list(py_renderer.render_ids(messages)) + rs_ids = list(native_renderer.render_ids(messages)) + assert py_ids == rs_ids + + +@pytest.mark.parametrize( + "case,messages", CONVERSATIONS, ids=lambda x: x if isinstance(x, str) else None +) +def test_message_indices_parity(native_pair, case, messages): + """Per-token attribution must match — critical for training loss masks.""" + py_renderer, native_renderer, _tok = native_pair + py_out = py_renderer.render(messages) + rs_out = native_renderer.render(messages) + assert list(py_out.token_ids) == list(rs_out.token_ids) + assert list(py_out.message_indices) == list(rs_out.message_indices) + + +def test_stop_token_ids_parity(native_pair): + py_renderer, native_renderer, _tok = native_pair + assert list(py_renderer.get_stop_token_ids()) == list( + native_renderer.get_stop_token_ids() + ) + + +def test_parse_response_no_tool_calls_parity(native_pair): + """Parse a simple text completion through both.""" + py_renderer, native_renderer, _tok = native_pair + # Render a small assistant turn, take the assistant tokens, parse. + msgs = [{"role": "user", "content": "say hi"}] + completion_ids = py_renderer.render_ids( + msgs + [{"role": "assistant", "content": "Hello there!"}] + ) + # Slice out just the assistant section by re-rendering up to the user. 
+ prompt_ids = py_renderer.render_ids(msgs, add_generation_prompt=True) + assistant_ids = completion_ids[len(prompt_ids) :] + + py_parsed = py_renderer.parse_response(assistant_ids) + rs_parsed = native_renderer.parse_response(assistant_ids) + assert py_parsed.content == rs_parsed.content + assert (py_parsed.reasoning_content or None) == ( + rs_parsed.reasoning_content or None + ) + assert len(py_parsed.tool_calls) == len(rs_parsed.tool_calls) + + +def test_bridge_to_next_turn_parity(native_pair): + py_renderer, native_renderer, _tok = native_pair + initial = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi! How can I help?"}, + ] + prev_prompt_ids = py_renderer.render_ids(initial[:-1], add_generation_prompt=True) + prev_completion_ids = py_renderer.render_ids(initial)[len(prev_prompt_ids) :] + new_messages = [{"role": "user", "content": "Tell me about Rust."}] + + py_b = py_renderer.bridge_to_next_turn( + prev_prompt_ids, prev_completion_ids, new_messages + ) + rs_b = native_renderer.bridge_to_next_turn( + prev_prompt_ids, prev_completion_ids, new_messages + ) + + # Either both return None (refused) or both produce identical tokens. + if py_b is None: + assert rs_b is None + return + assert rs_b is not None + assert list(py_b.token_ids) == list(rs_b.token_ids) + + +def test_bridge_refuses_assistant_in_extension(native_pair): + py_renderer, native_renderer, _tok = native_pair + initial = [{"role": "user", "content": "Hi"}] + prompt_ids = py_renderer.render_ids(initial, add_generation_prompt=True) + completion_ids = list(py_renderer.get_stop_token_ids())[:1] + + # Assistant in the extension → both must return None. 
+ assert ( + py_renderer.bridge_to_next_turn( + prompt_ids, + completion_ids, + [{"role": "assistant", "content": "x"}], + ) + is None + ) + assert ( + native_renderer.bridge_to_next_turn( + prompt_ids, + completion_ids, + [{"role": "assistant", "content": "x"}], + ) + is None + ) diff --git a/tests/test_native_router.py b/tests/test_native_router.py new file mode 100644 index 0000000..c22cbd8 --- /dev/null +++ b/tests/test_native_router.py @@ -0,0 +1,207 @@ +"""Unit tests for the Python/native routing layer. + +These are isolated from the inference engines and don't require a +network connection — they exercise just the env-var parsing, the +lazy import, and (where the wheel is built) the native module's +class surface. +""" + +from __future__ import annotations + +import inspect +import os +import sys +from types import SimpleNamespace +from unittest import mock + +import pytest + +from renderers import _native_router as router + + +def test_native_disabled_by_default(): + with mock.patch.dict(os.environ, {}, clear=True): + assert not router.native_enabled("qwen3") + + +@pytest.mark.parametrize("value", ["", "0"]) +def test_native_off_values(value): + with mock.patch.dict(os.environ, {"RENDERERS_NATIVE": value}, clear=True): + assert not router.native_enabled("qwen3") + + +@pytest.mark.parametrize("value", ["1", "all"]) +def test_native_on_global(value): + with mock.patch.dict(os.environ, {"RENDERERS_NATIVE": value}, clear=True): + assert router.native_enabled("qwen3") + assert router.native_enabled("qwen35") + assert router.native_enabled("glm5") + + +def test_native_csv_specific_families(): + with mock.patch.dict(os.environ, {"RENDERERS_NATIVE": "qwen3,glm5"}, clear=True): + assert router.native_enabled("qwen3") + assert router.native_enabled("glm5") + assert not router.native_enabled("qwen35") + + +def test_native_csv_whitespace_tolerant(): + with mock.patch.dict( + os.environ, {"RENDERERS_NATIVE": " qwen3 , glm5 "}, clear=True + ): + assert 
router.native_enabled("qwen3") + assert router.native_enabled("glm5") + + +def test_load_native_caches_result(): + # Reset the loader cache for the test. + router._NATIVE_MODULE = None + router._NATIVE_LOAD_ATTEMPTED = False + first = router.load_native() + second = router.load_native() + assert first is second # cached + + +def test_resolve_tokenizer_path_from_string(tmp_path): + # Pass a directory containing tokenizer.json — get the file path back. + (tmp_path / "tokenizer.json").write_text("{}") + assert router.resolve_tokenizer_path(str(tmp_path)).endswith("tokenizer.json") + + +def test_resolve_tokenizer_path_from_exact_file(tmp_path): + f = tmp_path / "tokenizer.json" + f.write_text("{}") + # Pass a file path directly — return as-is. + assert router.resolve_tokenizer_path(str(f)) == str(f) + + +def test_resolve_tokenizer_path_rejects_hf_missing_sentinel(monkeypatch): + tokenizer = SimpleNamespace(name_or_path="org/custom-tokenizer") + fake_hf = SimpleNamespace(try_to_load_from_cache=lambda **_kwargs: object()) + monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hf) + + with pytest.raises(ValueError, match="tokenizer.json not available"): + router.resolve_tokenizer_path(tokenizer) + + +def test_resolve_tokenizer_path_uses_tiktoken_export(monkeypatch, tmp_path): + tokenizer = SimpleNamespace(name_or_path="moonshotai/Kimi-K2-Instruct") + fake_hf = SimpleNamespace(try_to_load_from_cache=lambda **_kwargs: object()) + exported = tmp_path / "tokenizer.json" + exported.write_text("{}") + monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hf) + monkeypatch.setattr( + router, + "_export_tiktoken_tokenizer_json", + lambda repo_id, _loader: ( + str(exported) if repo_id == "moonshotai/Kimi-K2-Instruct" else None + ), + ) + + assert router.resolve_tokenizer_path(tokenizer) == str(exported) + + +def test_kimi_k2_constructor_falls_back_without_tokenizer_path(monkeypatch): + from renderers.kimi_k2 import KimiK2Renderer + + fake_native = mock.Mock() + 
monkeypatch.setattr("renderers.kimi_k2.native_enabled", lambda _family: True) + monkeypatch.setattr("renderers.kimi_k2.load_native", lambda: fake_native) + monkeypatch.setattr( + "renderers.kimi_k2.try_resolve_tokenizer_path", + lambda _tokenizer, _family: None, + ) + + inst = KimiK2Renderer.__new__(KimiK2Renderer, object()) + + assert isinstance(inst, KimiK2Renderer) + fake_native.Renderer.kimi_k2.assert_not_called() + + +def test_kimi_k25_constructor_does_not_route_eagerly(monkeypatch): + from renderers.kimi_k25 import KimiK25Renderer + + fake_native = mock.Mock() + monkeypatch.setattr("renderers.kimi_k25.native_enabled", lambda _family: True) + monkeypatch.setattr("renderers.kimi_k25.load_native", lambda: fake_native) + + inst = KimiK25Renderer.__new__(KimiK25Renderer, object(), processor=None) + + assert isinstance(inst, KimiK25Renderer) + fake_native.Renderer.kimi_k25.assert_not_called() + + +def test_kimi_k25_native_delegate_rejects_render_time_tools(): + from renderers.kimi_k25 import KimiK25Renderer + + inst = object.__new__(KimiK25Renderer) + inst._native_renderer = object() + + assert inst._can_use_native([{"role": "user", "content": "hi"}], tools=None) + assert not inst._can_use_native( + [{"role": "user", "content": "hi"}], + tools=[{"name": "echo", "parameters": {}}], + ) + + +# ── Native module surface (only runs when the wheel is built) ──────── + + +@pytest.fixture +def native(): + mod = router.load_native() + if mod is None: + pytest.skip("renderers_native not built; run `maturin develop`") + return mod + + +def test_native_exports(native): + # The five classes the Python shim relies on. 
+ for name in ( + "Renderer", + "RenderedTokens", + "ParsedResponse", + "ParsedToolCall", + "ToolCallParseStatus", + ): + assert hasattr(native, name), f"missing {name}" + + +def test_native_status_constants(native): + s = native.ToolCallParseStatus + assert s.OK == "ok" + assert s.INVALID_JSON == "invalid_json" + assert s.UNCLOSED_BLOCK == "unclosed_block" + assert s.MISSING_NAME == "missing_name" + assert s.MALFORMED_STRUCTURE == "malformed_structure" + + +def test_native_base_api_surface(native): + renderer_methods = [ + "render", + "render_ids", + "parse_response", + "get_stop_token_ids", + "bridge_to_next_turn", + ] + rendered_tokens_attrs = [ + "token_ids", + "message_indices", + "sampled_mask", + "is_content", + "message_roles", + "multi_modal_data", + "tokens_per_message", + "message_token_spans", + "role_token_spans", + "tokens_by_role", + "content_token_spans_by_role", + "content_mask_for_roles", + ] + + for name in renderer_methods: + assert hasattr(native.Renderer, name), f"missing Renderer.{name}" + for name in rendered_tokens_attrs: + assert hasattr(native.RenderedTokens, name), f"missing RenderedTokens.{name}" + + assert "tools" in inspect.signature(native.Renderer.parse_response).parameters diff --git a/tests/test_native_vision.py b/tests/test_native_vision.py new file mode 100644 index 0000000..61f6c29 --- /dev/null +++ b/tests/test_native_vision.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import numpy as np + +import renderers._native_vision as _native_vision + + +class _FakeProcessor: + def process_bytes(self, _raw: bytes): + return { + "modality": "image", + "num_tokens": 2, + "hash": "abc", + "hf_payload": { + "pixel_values": np.arange(6, dtype=np.float32).reshape(2, 3), + "image_grid_thw": np.array([[1, 2, 4]], dtype=np.int64), + }, + } + + +def test_process_image_for_qwen_vl_accepts_native_numpy_payload(monkeypatch): + monkeypatch.setattr( + _native_vision, "get_qwen_vl_processor", lambda **_kwargs: _FakeProcessor() + ) + + out 
= _native_vision.process_image_for_qwen_vl(b"image", message_idx=3) + + assert out["message_idx"] == 3 + assert out["hf_payload"]["pixel_values"].shape == (2, 3) + assert out["hf_payload"]["image_grid_thw"].shape == (1, 3) + + +def test_process_image_for_qwen_vl_return_numpy_false_converts_to_dict(monkeypatch): + monkeypatch.setattr( + _native_vision, "get_qwen_vl_processor", lambda **_kwargs: _FakeProcessor() + ) + + out = _native_vision.process_image_for_qwen_vl( + b"image", message_idx=3, return_numpy=False + ) + + assert out["hf_payload"]["pixel_values"] == { + "shape": [2, 3], + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0], + } + assert out["hf_payload"]["image_grid_thw"] == { + "shape": [1, 3], + "data": [1, 2, 4], + } diff --git a/tests/test_renderer_e2e.py b/tests/test_renderer_e2e.py new file mode 100644 index 0000000..0b6406d --- /dev/null +++ b/tests/test_renderer_e2e.py @@ -0,0 +1,76 @@ +"""Backend-free end-to-end renderer flow tests. + +These tests simulate the token-in/token-out control loop without launching +vLLM, SGLang, Transformers generation, or Tinker. They cover the glue between +``render_ids``, ``parse_response``, and ``bridge_to_next_turn`` so the examples +have a local parity check for the renderer-owned part of the stack. 
+""" + +from __future__ import annotations + + +def test_renderer_owned_two_turn_flow_preserves_sampled_prefix(): + from renderers import create_renderer + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer("Qwen/Qwen3.5-9B") + renderer = create_renderer(tokenizer, renderer="auto") + + messages = [ + {"role": "system", "content": "You are concise."}, + {"role": "user", "content": "Say hello."}, + ] + assistant = {"role": "assistant", "content": "Hello."} + + prompt_ids = renderer.render_ids(messages, add_generation_prompt=True) + full_ids = renderer.render_ids(messages + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + parsed = renderer.parse_response(completion_ids) + assert "Hello" in parsed.content + + bridged = renderer.bridge_to_next_turn( + prompt_ids, + completion_ids, + [{"role": "user", "content": "Now say bye."}], + ) + assert bridged is not None + bridged_ids = list(bridged.token_ids) + expected_prefix = prompt_ids + completion_ids + assert bridged_ids[: len(expected_prefix)] == expected_prefix + + +def test_default_renderer_fallback_keeps_raw_decoded_completion_prefix(): + """DefaultRenderer cannot bridge, so callers fall back to a full render. + + The fallback must use raw decoded completion bytes, not parse-normalized + assistant structure. For round-tripping tokenizers, that preserves the + sampled assistant prefix even though the bridge API correctly returns + ``None``. 
+ """ + + from renderers import create_renderer + from renderers.base import load_tokenizer + + tokenizer = load_tokenizer("Qwen/Qwen2.5-0.5B-Instruct") + renderer = create_renderer(tokenizer, renderer="default") + + messages = [{"role": "user", "content": "Say hello."}] + assistant = {"role": "assistant", "content": "HELLO_SENTINEL"} + new_messages = [{"role": "user", "content": "Now say bye."}] + + prompt_ids = renderer.render_ids(messages, add_generation_prompt=True) + full_ids = renderer.render_ids(messages + [assistant]) + completion_ids = full_ids[len(prompt_ids) :] + + assert ( + renderer.bridge_to_next_turn(prompt_ids, completion_ids, new_messages) is None + ) + + raw_completion = tokenizer.decode(completion_ids, skip_special_tokens=False) + fallback_ids = renderer.render_ids( + messages + [{"role": "assistant", "content": raw_completion}] + new_messages, + add_generation_prompt=True, + ) + expected_prefix = prompt_ids + completion_ids + assert fallback_ids[: len(expected_prefix)] == expected_prefix diff --git a/uv.lock b/uv.lock index 8096df3..6b22371 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-05-18T21:42:54.18041997Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "P7D" [options.exclude-newer-package] @@ -1409,7 +1409,7 @@ requires-dist = [ { name = "jinja2" }, { name = "numpy" }, { name = "openai", specifier = ">=1.108.1" }, - { name = "openai-harmony", specifier = ">=0.0.8" }, + { name = "openai-harmony", specifier = ">=0.0.4" }, { name = "prime-pydantic-config", specifier = ">=0.3.0.dev83" }, { name = "tiktoken" }, { name = "transformers", specifier = ">=4.50.0" },