From 4c7e9893c4ca8eec35d032726d14b74fae76cff8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 2 Nov 2025 10:44:06 +0200 Subject: [PATCH 1/2] benches : add folder with benchmarks --- benches/dgx-spark.md | 275 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 benches/dgx-spark.md diff --git a/benches/dgx-spark.md b/benches/dgx-spark.md new file mode 100644 index 0000000000000..3c23738904132 --- /dev/null +++ b/benches/dgx-spark.md @@ -0,0 +1,275 @@ +## System info + +```bash +uname --all +Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux + +g++ --version +g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 + +nvidia-smi +Sun Nov 2 10:43:25 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | ++-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A | +| N/A 35C P8 4W / N/A | Not Supported | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ +``` + +## ggml-org/gpt-oss-20b-GGUF + +Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.390 | 1311.72 | 0.395 | 81.11 | 0.785 | 693.12 | +| 512 | 32 | 2 | 1088 | 0.273 | 3754.20 | 0.674 | 95.00 | 0.946 | 1149.52 | +| 512 | 32 | 4 | 2176 | 0.517 | 3965.01 | 0.843 | 151.90 | 1.359 | 1601.00 | +| 512 | 32 | 8 | 4352 | 1.025 | 3994.63 | 1.008 | 254.09 | 2.033 | 2140.79 | +| 512 | 32 | 16 | 8704 | 2.054 | 3988.71 | 1.277 | 400.94 | 3.331 | 2613.19 | +| 512 | 32 | 32 | 17408 | 4.110 | 3986.29 | 1.666 | 614.82 | 5.776 | 3014.04 | +| 4096 | 32 | 1 | 4128 | 1.166 | 3513.60 | 0.410 | 77.97 | 1.576 | 2619.03 | +| 4096 | 32 | 2 | 8256 | 2.316 | 3537.64 | 0.734 | 87.18 | 3.050 | 2707.09 | +| 4096 | 32 | 4 | 16512 | 4.653 | 3521.53 | 0.915 | 139.90 | 5.567 | 2965.81 | +| 4096 | 32 | 8 | 33024 | 9.277 | 3532.17 | 1.181 | 216.70 | 10.458 | 3157.66 | +| 4096 | 32 | 16 | 66048 | 18.624 | 3518.87 | 1.649 | 310.48 | 20.273 | 3257.89 | +| 4096 | 32 | 32 | 132096 | 37.217 | 3521.82 | 2.424 | 422.45 | 39.641 | 3332.30 | +| 8192 | 32 | 1 | 8224 | 2.473 | 3313.11 | 0.444 | 72.07 | 2.917 | 2819.71 | +| 8192 | 32 | 2 | 16448 | 4.926 | 3326.23 | 0.771 | 82.98 | 5.697 | 2887.14 | +| 8192 | 32 | 4 | 32896 | 9.851 | 3326.52 | 1.006 | 127.28 | 10.856 | 3030.15 | +| 8192 | 32 | 8 | 65792 | 19.635 | 3337.67 | 1.338 | 191.33 | 20.973 | 3136.95 | +| 8192 | 32 | 16 | 131584 | 39.290 | 3336.01 | 1.952 | 262.36 | 41.242 | 3190.56 | +| 8192 | 32 | 32 | 263168 | 78.571 | 3336.40 | 3.002 | 341.16 | 81.572 | 3226.19 | + + + + +- `llama-bench` + +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3782.76 ± 11.95 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 84.10 ± 0.52 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 3030.29 ± 8.25 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 77.36 ± 0.13 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2852.09 ± 11.44 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.00 ± 0.64 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 2470.31 ± 13.96 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 68.85 ± 0.11 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1899.37 ± 11.28 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 59.85 ± 0.12 | + +build: 2f68ce7cf (6918) + +## ggml-org/gpt-oss-120b-GGUF + +Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.605 | 846.15 | 0.559 | 57.28 | 1.164 | 467.46 | +| 512 | 32 | 2 | 1088 | 0.585 | 1749.57 | 1.061 | 60.35 | 1.646 | 661.06 | +| 512 | 32 | 4 | 2176 | 1.026 | 1995.16 | 1.346 | 95.08 | 2.373 | 917.12 | +| 512 | 32 | 8 | 4352 | 2.051 | 1997.01 | 1.746 | 146.65 | 3.797 | 1146.26 | +| 512 | 32 | 16 | 8704 | 4.122 | 1987.35 | 2.327 | 220.02 | 6.449 | 1349.63 | +| 512 | 32 | 32 | 17408 | 8.262 | 1983.16 | 3.276 | 312.56 | 11.538 | 1508.79 | +| 4096 | 32 | 1 | 4128 | 2.245 | 1824.57 | 0.591 | 54.18 | 2.835 | 1455.83 | +| 4096 | 32 | 2 | 8256 | 4.484 | 1827.03 | 1.122 | 57.02 | 5.606 | 1472.68 | +| 4096 | 32 | 4 | 16512 | 8.967 | 1827.22 | 1.474 | 86.83 | 10.441 | 1581.50 | +| 4096 | 32 | 8 | 33024 | 17.902 | 1830.43 | 1.997 | 128.22 | 19.898 | 1659.64 | +| 4096 | 32 | 16 | 66048 | 35.801 | 1830.56 | 2.901 | 176.49 | 38.702 | 1706.58 | +| 4096 | 32 | 32 | 132096 | 71.607 | 1830.43 | 4.433 | 230.99 | 76.040 | 1737.18 | +| 8192 | 32 | 1 | 8224 | 4.688 | 1747.28 | 0.627 | 51.01 | 5.316 | 1547.11 | +| 8192 | 32 | 2 | 16448 | 9.342 | 1753.75 | 1.182 | 54.13 | 10.525 | 1562.81 | +| 8192 | 32 | 4 | 32896 | 18.697 | 1752.56 | 1.605 | 79.74 | 20.302 | 1620.30 | +| 8192 | 32 | 8 | 65792 | 37.442 | 1750.33 | 2.228 | 114.91 | 39.670 | 1658.48 | +| 8192 | 32 | 16 | 131584 | 74.863 | 1750.82 | 3.419 | 149.74 | 78.283 | 1680.89 | +| 8192 | 32 | 32 | 263168 | 149.598 | 1752.33 | 5.209 | 196.60 | 154.806 | 1699.98 | + + + + +- `llama-bench` + +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 1937.54 ± 7.91 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 59.02 ± 0.17 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 1653.89 ± 4.68 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 53.95 ± 0.52 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1580.60 ± 3.10 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 51.85 ± 0.12 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1411.42 ± 3.75 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 48.03 ± 0.11 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1155.40 ± 2.98 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 41.35 ± 0.09 | + +build: 2f68ce7cf (6918) + +## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF + +Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.401 | 1275.89 | 0.555 | 57.63 | 0.957 | 568.70 | +| 512 | 32 | 2 | 1088 | 0.355 | 2884.04 | 0.998 | 64.11 | 1.353 | 803.94 | +| 512 | 32 | 4 | 2176 | 0.614 | 3333.11 | 1.255 | 102.01 | 1.869 | 1164.11 | +| 512 | 32 | 8 | 4352 | 1.215 | 3371.40 | 1.653 | 154.91 | 2.868 | 1517.67 | +| 512 | 32 | 16 | 8704 | 2.424 | 3379.01 | 2.198 | 232.98 | 4.622 | 1883.15 | +| 512 | 32 | 32 | 17408 | 4.845 | 3381.63 | 2.968 | 345.01 | 7.813 | 2228.08 | +| 4096 | 32 | 1 | 4128 | 1.319 | 3104.74 | 0.657 | 48.67 | 1.977 | 2088.30 | +| 4096 | 32 | 2 | 8256 | 2.636 | 3107.30 | 1.133 | 56.48 | 3.770 | 2190.15 | +| 4096 | 32 | 4 | 16512 | 5.290 | 3097.27 | 1.494 | 85.69 | 6.784 | 2434.13 | +| 4096 | 32 | 8 | 33024 | 10.554 | 3104.75 | 2.136 | 119.87 | 12.690 | 2602.40 | +| 4096 | 32 | 16 | 66048 | 21.162 | 3096.80 | 3.150 | 162.55 | 24.312 | 2716.66 | +| 4096 | 32 | 32 | 132096 | 42.380 | 3092.81 | 4.928 | 207.80 | 47.307 | 2792.29 | +| 8192 | 32 | 1 | 8224 | 2.838 | 2886.60 | 0.752 | 42.56 | 3.590 | 2290.94 | +| 8192 | 32 | 2 | 16448 | 5.678 | 2885.69 | 1.267 | 50.51 | 6.945 | 2368.42 | +| 8192 | 32 | 4 | 32896 | 11.342 | 2889.18 | 1.763 | 72.59 | 13.105 | 2510.17 | +| 8192 | 32 | 8 | 65792 | 22.650 | 2893.42 | 2.656 | 96.39 | 25.306 | 2599.86 | +| 8192 | 32 | 16 | 131584 | 45.366 | 2889.21 | 4.209 | 121.64 | 49.575 | 2654.23 | +| 8192 | 32 | 32 | 263168 | 90.690 | 2890.56 | 7.158 | 143.06 | 97.847 | 2689.58 | + + + + +- `llama-bench` + +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3175.62 ± 15.26 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 60.92 ± 0.17 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2704.06 ± 6.21 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 53.52 ± 0.35 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2381.88 ± 8.79 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 48.58 ± 0.11 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1900.91 ± 8.51 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 40.81 ± 0.06 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1292.19 ± 7.09 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 30.80 ± 0.02 | + +build: 2f68ce7cf (6918) + +## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF + +Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.215 | 2379.83 | 1.095 | 29.24 | 1.310 | 415.37 | +| 512 | 32 | 2 | 1088 | 0.424 | 2416.31 | 1.175 | 54.46 | 1.599 | 680.46 | +| 512 | 32 | 4 | 2176 | 0.880 | 2328.41 | 1.227 | 104.33 | 2.106 | 1033.02 | +| 512 | 32 | 8 | 4352 | 1.738 | 2356.86 | 1.308 | 195.67 | 3.046 | 1428.66 | +| 512 | 32 | 16 | 8704 | 3.472 | 2359.65 | 1.528 | 335.01 | 5.000 | 1740.80 | +| 512 | 32 | 32 | 17408 | 6.956 | 2355.21 | 1.726 | 593.27 | 8.683 | 2004.95 | +| 4096 | 32 | 1 | 4128 | 1.813 | 2259.55 | 1.126 | 28.42 | 2.939 | 1404.78 | +| 4096 | 32 | 2 | 8256 | 3.601 | 2274.88 | 1.246 | 51.35 | 4.848 | 1703.13 | +| 4096 | 32 | 4 | 16512 | 7.203 | 2274.48 | 1.364 | 93.86 | 8.567 | 1927.37 | +| 4096 | 32 | 8 | 33024 | 14.403 | 2275.04 | 1.570 | 163.10 | 15.973 | 2067.50 | +| 4096 | 32 | 16 | 66048 | 28.797 | 2275.78 | 2.025 | 252.80 | 30.822 | 2142.85 | +| 4096 | 32 | 32 | 132096 | 57.630 | 2274.38 | 2.716 | 377.05 | 60.346 | 2188.99 | +| 8192 | 32 | 1 | 8224 | 3.746 | 2186.60 | 1.164 | 27.50 | 4.910 | 1674.94 | +| 8192 | 32 | 2 | 16448 | 7.473 | 2192.42 | 1.323 | 48.39 | 8.796 | 1870.03 | +| 8192 | 32 | 4 | 32896 | 14.905 | 2198.45 | 1.511 | 84.71 | 16.416 | 2003.89 | +| 8192 | 32 | 8 | 65792 | 29.837 | 2196.49 | 1.843 | 138.88 | 31.680 | 2076.77 | +| 8192 | 32 | 16 | 131584 | 59.700 | 2195.53 | 2.609 | 196.21 | 62.309 | 2111.80 | +| 8192 | 32 | 32 | 263168 | 119.441 | 2194.75 | 3.818 | 268.20 | 123.259 | 2135.08 | + + + + +- `llama-bench` + +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2276.28 ± 4.92 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 29.30 ± 0.03 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2090.96 ± 6.11 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 28.27 ± 0.03 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1913.47 ± 5.86 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 27.42 ± 0.03 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1643.88 ± 8.45 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 25.93 ± 0.01 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1283.36 ± 1.00 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 22.70 ± 0.02 | + +build: 2f68ce7cf (6918) + +## ggml-org/gemma-3-4b-it-qat-GGUF + +Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.098 | 5236.19 | 0.408 | 78.38 | 0.506 | 1074.95 | +| 512 | 32 | 2 | 1088 | 0.171 | 5984.52 | 0.507 | 126.21 | 0.678 | 1604.23 | +| 512 | 32 | 4 | 2176 | 0.342 | 5981.45 | 0.554 | 231.11 | 0.896 | 2427.90 | +| 512 | 32 | 8 | 4352 | 0.674 | 6077.05 | 0.692 | 369.93 | 1.366 | 3185.88 | +| 512 | 32 | 16 | 8704 | 1.340 | 6115.09 | 0.930 | 550.47 | 2.270 | 3834.78 | +| 512 | 32 | 32 | 17408 | 2.676 | 6123.63 | 1.277 | 801.81 | 3.953 | 4404.13 | +| 4096 | 32 | 1 | 4128 | 0.711 | 5756.92 | 0.454 | 70.44 | 1.166 | 3541.03 | +| 4096 | 32 | 2 | 8256 | 1.404 | 5835.14 | 0.571 | 112.12 | 1.975 | 4180.78 | +| 4096 | 32 | 4 | 16512 | 2.786 | 5880.66 | 0.668 | 191.68 | 3.454 | 4780.71 | +| 4096 | 32 | 8 | 33024 | 5.547 | 5907.20 | 0.891 | 287.46 | 6.438 | 5129.79 | +| 4096 | 32 | 16 | 66048 | 11.097 | 5905.68 | 1.315 | 389.48 | 12.412 | 5321.43 | +| 4096 | 32 | 32 | 132096 | 22.178 | 5910.07 | 2.055 | 498.18 | 24.233 | 5451.03 | +| 8192 | 32 | 1 | 8224 | 1.424 | 5752.81 | 0.467 | 68.51 | 1.891 | 4348.89 | +| 8192 | 32 | 2 | 16448 | 2.819 | 5812.28 | 0.630 | 101.53 | 3.449 | 4768.59 | +| 8192 | 32 | 4 | 32896 | 5.626 | 5824.22 | 0.791 | 161.74 | 6.418 | 5125.96 | +| 8192 | 32 | 8 | 65792 | 11.238 | 5831.66 | 1.163 | 220.10 | 12.401 | 5305.36 | +| 8192 | 32 | 16 | 131584 | 22.426 | 5844.66 | 1.857 | 275.70 | 24.283 | 5418.77 | +| 8192 | 32 | 32 | 263168 | 44.834 | 5847.04 | 3.146 | 325.48 | 47.980 | 5484.98 | + + + + +- `llama-bench` + +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 5713.05 ± 19.74 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 78.97 ± 0.15 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 5165.17 ± 27.83 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 68.94 ± 0.99 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 4885.85 ± 46.27 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 67.81 ± 0.84 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 4417.88 ± 52.14 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 64.35 ± 0.66 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 3667.74 ± 50.02 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 58.38 ± 0.05 | + +build: 2f68ce7cf (6918) + + From 9ea450394127e1a0c1f6272eab745c107840efa0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 9 Nov 2025 12:50:45 +0200 Subject: [PATCH 2/2] benches : update dgx-spark bench --- benches/dgx-spark.md | 311 +++++++++++++++++++++---------------------- 1 file changed, 150 insertions(+), 161 deletions(-) diff --git a/benches/dgx-spark.md b/benches/dgx-spark.md index 3c23738904132..ec6c20d8a0595 100644 --- a/benches/dgx-spark.md +++ b/benches/dgx-spark.md @@ -33,44 +33,42 @@ main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.390 | 1311.72 | 0.395 | 81.11 | 0.785 | 693.12 | -| 512 | 32 | 2 | 1088 | 0.273 | 3754.20 | 0.674 | 95.00 | 0.946 | 1149.52 | -| 512 | 32 | 4 | 2176 | 0.517 | 3965.01 | 0.843 | 151.90 | 1.359 | 1601.00 | -| 512 | 32 | 8 | 4352 | 1.025 | 3994.63 | 1.008 | 254.09 | 2.033 | 2140.79 | -| 512 | 32 | 16 | 8704 | 2.054 | 3988.71 | 1.277 | 400.94 | 3.331 | 2613.19 | -| 512 | 32 | 32 | 17408 | 4.110 | 3986.29 | 1.666 | 614.82 | 5.776 | 3014.04 | -| 4096 | 32 | 1 | 4128 | 1.166 | 3513.60 | 0.410 | 77.97 | 1.576 | 2619.03 | -| 4096 | 32 | 2 | 8256 | 2.316 | 3537.64 | 0.734 | 87.18 | 3.050 | 2707.09 | -| 4096 | 32 | 4 | 16512 | 4.653 | 3521.53 | 0.915 | 139.90 | 5.567 | 2965.81 | -| 4096 | 32 | 8 | 33024 | 9.277 | 3532.17 | 1.181 | 216.70 | 10.458 | 3157.66 | -| 4096 | 32 | 16 | 66048 | 18.624 | 3518.87 | 1.649 | 310.48 | 20.273 | 3257.89 | -| 4096 | 32 | 32 | 132096 | 37.217 | 3521.82 | 2.424 | 422.45 | 39.641 | 3332.30 | -| 8192 | 32 | 1 | 8224 | 2.473 | 3313.11 | 0.444 | 72.07 | 2.917 | 2819.71 | -| 8192 | 32 | 2 | 16448 | 4.926 | 3326.23 | 0.771 | 82.98 | 5.697 | 2887.14 | -| 8192 | 32 | 4 | 32896 | 9.851 | 3326.52 | 1.006 | 127.28 | 10.856 | 3030.15 | -| 8192 | 32 | 8 | 65792 | 19.635 | 3337.67 | 1.338 | 191.33 | 20.973 | 3136.95 | -| 8192 | 32 | 16 | 131584 | 39.290 | 3336.01 | 1.952 | 262.36 | 41.242 | 3190.56 | -| 8192 | 32 | 32 | 263168 | 78.571 | 3336.40 | 3.002 | 341.16 | 81.572 | 3226.19 | - - +| 512 | 32 | 1 | 544 | 0.374 | 1369.01 | 0.383 | 83.64 | 0.757 | 719.01 | +| 512 | 32 | 2 | 1088 | 0.274 | 3741.35 | 0.659 | 97.14 | 0.933 | 1166.66 | +| 512 | 32 | 4 | 2176 | 0.526 | 3896.47 | 0.817 | 156.73 | 1.342 | 1621.08 | +| 512 | 32 | 8 | 4352 | 1.044 | 3925.10 | 0.987 | 259.44 | 2.030 | 2143.56 | +| 512 | 32 | 16 | 8704 | 2.076 | 3945.84 | 1.248 | 410.32 | 3.324 | 2618.60 | +| 512 | 32 | 32 | 17408 | 4.170 | 3929.28 | 1.630 | 628.40 | 5.799 | 3001.76 | +| 4096 | 32 | 1 | 4128 | 1.083 | 3782.66 | 0.394 | 81.21 | 1.477 | 2795.13 | +| 4096 | 32 | 2 | 8256 | 2.166 | 3782.72 | 0.725 | 88.28 | 2.891 | 2856.14 | +| 4096 | 32 | 4 | 16512 | 4.333 | 3780.88 | 0.896 | 142.82 | 5.230 | 3157.38 | +| 4096 | 32 | 8 | 33024 | 8.618 | 3802.14 | 1.155 | 221.69 | 9.773 | 3379.08 | +| 4096 | 32 | 16 | 66048 | 17.330 | 3781.73 | 1.598 | 320.34 | 18.928 | 3489.45 | +| 4096 | 32 | 32 | 132096 | 34.671 | 3780.48 | 2.336 | 438.35 | 37.007 | 3569.51 | +| 8192 | 32 | 1 | 8224 | 2.233 | 3668.56 | 0.438 | 72.98 | 2.671 | 3078.44 | +| 8192 | 32 | 2 | 16448 | 4.425 | 3702.95 | 0.756 | 84.66 | 5.181 | 3174.95 | +| 8192 | 32 | 4 | 32896 | 8.859 | 3698.64 | 0.967 | 132.38 | 9.826 | 3347.72 | +| 8192 | 32 | 8 | 65792 | 17.714 | 3699.57 | 1.277 | 200.52 | 18.991 | 3464.35 | +| 8192 | 32 | 16 | 131584 | 35.494 | 3692.84 | 1.841 | 278.12 | 37.335 | 3524.46 | +| 8192 | 32 | 32 | 263168 | 70.949 | 3694.82 | 2.798 | 365.99 | 73.747 | 3568.53 | - `llama-bench` | model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3782.76 ± 11.95 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 84.10 ± 0.52 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 3030.29 ± 8.25 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 77.36 ± 0.13 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2852.09 ± 11.44 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.00 ± 0.64 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 2470.31 ± 13.96 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 68.85 ± 0.11 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1899.37 ± 11.28 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 59.85 ± 0.12 | - -build: 2f68ce7cf (6918) +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3714.25 ± 20.36 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 86.58 ± 0.43 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 3445.17 ± 17.85 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 81.72 ± 0.53 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 3218.78 ± 11.34 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.86 ± 0.64 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 2732.83 ± 7.17 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 71.57 ± 0.51 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 2119.75 ± 12.81 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 62.33 ± 0.24 | + +build: eeee367de (6989) ## ggml-org/gpt-oss-120b-GGUF @@ -83,44 +81,42 @@ main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.605 | 846.15 | 0.559 | 57.28 | 1.164 | 467.46 | -| 512 | 32 | 2 | 1088 | 0.585 | 1749.57 | 1.061 | 60.35 | 1.646 | 661.06 | -| 512 | 32 | 4 | 2176 | 1.026 | 1995.16 | 1.346 | 95.08 | 2.373 | 917.12 | -| 512 | 32 | 8 | 4352 | 2.051 | 1997.01 | 1.746 | 146.65 | 3.797 | 1146.26 | -| 512 | 32 | 16 | 8704 | 4.122 | 1987.35 | 2.327 | 220.02 | 6.449 | 1349.63 | -| 512 | 32 | 32 | 17408 | 8.262 | 1983.16 | 3.276 | 312.56 | 11.538 | 1508.79 | -| 4096 | 32 | 1 | 4128 | 2.245 | 1824.57 | 0.591 | 54.18 | 2.835 | 1455.83 | -| 4096 | 32 | 2 | 8256 | 4.484 | 1827.03 | 1.122 | 57.02 | 5.606 | 1472.68 | -| 4096 | 32 | 4 | 16512 | 8.967 | 1827.22 | 1.474 | 86.83 | 10.441 | 1581.50 | -| 4096 | 32 | 8 | 33024 | 17.902 | 1830.43 | 1.997 | 128.22 | 19.898 | 1659.64 | -| 4096 | 32 | 16 | 66048 | 35.801 | 1830.56 | 2.901 | 176.49 | 38.702 | 1706.58 | -| 4096 | 32 | 32 | 132096 | 71.607 | 1830.43 | 4.433 | 230.99 | 76.040 | 1737.18 | -| 8192 | 32 | 1 | 8224 | 4.688 | 1747.28 | 0.627 | 51.01 | 5.316 | 1547.11 | -| 8192 | 32 | 2 | 16448 | 9.342 | 1753.75 | 1.182 | 54.13 | 10.525 | 1562.81 | -| 8192 | 32 | 4 | 32896 | 18.697 | 1752.56 | 1.605 | 79.74 | 20.302 | 1620.30 | -| 8192 | 32 | 8 | 65792 | 37.442 | 1750.33 | 2.228 | 114.91 | 39.670 | 1658.48 | -| 8192 | 32 | 16 | 131584 | 74.863 | 1750.82 | 3.419 | 149.74 | 78.283 | 1680.89 | -| 8192 | 32 | 32 | 263168 | 149.598 | 1752.33 | 5.209 | 196.60 | 154.806 | 1699.98 | - - +| 512 | 32 | 1 | 544 | 0.571 | 897.18 | 0.543 | 58.96 | 1.113 | 488.60 | +| 512 | 32 | 2 | 1088 | 0.593 | 1725.37 | 1.041 | 61.45 | 1.635 | 665.48 | +| 512 | 32 | 4 | 2176 | 1.043 | 1963.15 | 1.334 | 95.95 | 2.377 | 915.36 | +| 512 | 32 | 8 | 4352 | 2.099 | 1951.63 | 1.717 | 149.07 | 3.816 | 1140.45 | +| 512 | 32 | 16 | 8704 | 4.207 | 1947.12 | 2.311 | 221.56 | 6.518 | 1335.35 | +| 512 | 32 | 32 | 17408 | 8.422 | 1945.36 | 3.298 | 310.46 | 11.720 | 1485.27 | +| 4096 | 32 | 1 | 4128 | 2.138 | 1915.88 | 0.571 | 56.09 | 2.708 | 1524.12 | +| 4096 | 32 | 2 | 8256 | 4.266 | 1920.25 | 1.137 | 56.27 | 5.404 | 1527.90 | +| 4096 | 32 | 4 | 16512 | 8.564 | 1913.02 | 1.471 | 86.99 | 10.036 | 1645.29 | +| 4096 | 32 | 8 | 33024 | 17.092 | 1917.19 | 1.979 | 129.33 | 19.071 | 1731.63 | +| 4096 | 32 | 16 | 66048 | 34.211 | 1915.65 | 2.850 | 179.66 | 37.061 | 1782.15 | +| 4096 | 32 | 32 | 132096 | 68.394 | 1916.44 | 4.381 | 233.72 | 72.775 | 1815.13 | +| 8192 | 32 | 1 | 8224 | 4.349 | 1883.45 | 0.620 | 51.65 | 4.969 | 1655.04 | +| 8192 | 32 | 2 | 16448 | 8.674 | 1888.83 | 1.178 | 54.33 | 9.852 | 1669.48 | +| 8192 | 32 | 4 | 32896 | 17.351 | 1888.55 | 1.580 | 81.01 | 18.931 | 1737.68 | +| 8192 | 32 | 8 | 65792 | 34.743 | 1886.31 | 2.173 | 117.80 | 36.916 | 1782.20 | +| 8192 | 32 | 16 | 131584 | 69.413 | 1888.29 | 3.297 | 155.28 | 72.710 | 1809.70 | +| 8192 | 32 | 32 | 263168 | 138.903 | 1887.24 | 5.004 | 204.63 | 143.907 | 1828.73 | - `llama-bench` | model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 1937.54 ± 7.91 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 59.02 ± 0.17 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 1653.89 ± 4.68 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 53.95 ± 0.52 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1580.60 ± 3.10 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 51.85 ± 0.12 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1411.42 ± 3.75 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 48.03 ± 0.11 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1155.40 ± 2.98 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 41.35 ± 0.09 | - -build: 2f68ce7cf (6918) +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 1919.36 ± 5.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 60.40 ± 0.30 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 1825.30 ± 6.37 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 56.94 ± 0.29 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1739.19 ± 6.00 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 52.51 ± 0.42 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1536.75 ± 4.27 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 49.33 ± 0.27 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1255.85 ± 3.26 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 42.99 ± 0.18 | + +build: eeee367de (6989) ## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF @@ -133,44 +129,42 @@ main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.401 | 1275.89 | 0.555 | 57.63 | 0.957 | 568.70 | -| 512 | 32 | 2 | 1088 | 0.355 | 2884.04 | 0.998 | 64.11 | 1.353 | 803.94 | -| 512 | 32 | 4 | 2176 | 0.614 | 3333.11 | 1.255 | 102.01 | 1.869 | 1164.11 | -| 512 | 32 | 8 | 4352 | 1.215 | 3371.40 | 1.653 | 154.91 | 2.868 | 1517.67 | -| 512 | 32 | 16 | 8704 | 2.424 | 3379.01 | 2.198 | 232.98 | 4.622 | 1883.15 | -| 512 | 32 | 32 | 17408 | 4.845 | 3381.63 | 2.968 | 345.01 | 7.813 | 2228.08 | -| 4096 | 32 | 1 | 4128 | 1.319 | 3104.74 | 0.657 | 48.67 | 1.977 | 2088.30 | -| 4096 | 32 | 2 | 8256 | 2.636 | 3107.30 | 1.133 | 56.48 | 3.770 | 2190.15 | -| 4096 | 32 | 4 | 16512 | 5.290 | 3097.27 | 1.494 | 85.69 | 6.784 | 2434.13 | -| 4096 | 32 | 8 | 33024 | 10.554 | 3104.75 | 2.136 | 119.87 | 12.690 | 2602.40 | -| 4096 | 32 | 16 | 66048 | 21.162 | 3096.80 | 3.150 | 162.55 | 24.312 | 2716.66 | -| 4096 | 32 | 32 | 132096 | 42.380 | 3092.81 | 4.928 | 207.80 | 47.307 | 2792.29 | -| 8192 | 32 | 1 | 8224 | 2.838 | 2886.60 | 0.752 | 42.56 | 3.590 | 2290.94 | -| 8192 | 32 | 2 | 16448 | 5.678 | 2885.69 | 1.267 | 50.51 | 6.945 | 2368.42 | -| 8192 | 32 | 4 | 32896 | 11.342 | 2889.18 | 1.763 | 72.59 | 13.105 | 2510.17 | -| 8192 | 32 | 8 | 65792 | 22.650 | 2893.42 | 2.656 | 96.39 | 25.306 | 2599.86 | -| 8192 | 32 | 16 | 131584 | 45.366 | 2889.21 | 4.209 | 121.64 | 49.575 | 2654.23 | -| 8192 | 32 | 32 | 263168 | 90.690 | 2890.56 | 7.158 | 143.06 | 97.847 | 2689.58 | - - +| 512 | 32 | 1 | 544 | 0.398 | 1285.90 | 0.530 | 60.41 | 0.928 | 586.27 | +| 512 | 32 | 2 | 1088 | 0.386 | 2651.65 | 0.948 | 67.50 | 1.334 | 815.38 | +| 512 | 32 | 4 | 2176 | 0.666 | 3076.37 | 1.209 | 105.87 | 1.875 | 1160.71 | +| 512 | 32 | 8 | 4352 | 1.325 | 3091.39 | 1.610 | 158.98 | 2.935 | 1482.65 | +| 512 | 32 | 16 | 8704 | 2.664 | 3075.58 | 2.150 | 238.19 | 4.813 | 1808.39 | +| 512 | 32 | 32 | 17408 | 5.336 | 3070.31 | 2.904 | 352.59 | 8.240 | 2112.50 | +| 4096 | 32 | 1 | 4128 | 1.444 | 2836.81 | 0.581 | 55.09 | 2.025 | 2038.81 | +| 4096 | 32 | 2 | 8256 | 2.872 | 2852.14 | 1.084 | 59.06 | 3.956 | 2086.99 | +| 4096 | 32 | 4 | 16512 | 5.744 | 2852.32 | 1.440 | 88.90 | 7.184 | 2298.47 | +| 4096 | 32 | 8 | 33024 | 11.463 | 2858.68 | 2.068 | 123.78 | 13.531 | 2440.65 | +| 4096 | 32 | 16 | 66048 | 22.915 | 2859.95 | 3.018 | 169.67 | 25.933 | 2546.90 | +| 4096 | 32 | 32 | 132096 | 45.956 | 2852.10 | 4.609 | 222.18 | 50.565 | 2612.39 | +| 8192 | 32 | 1 | 8224 | 3.063 | 2674.72 | 0.693 | 46.20 | 3.755 | 2189.92 | +| 8192 | 32 | 2 | 16448 | 6.109 | 2681.87 | 1.214 | 52.71 | 7.323 | 2245.98 | +| 8192 | 32 | 4 | 32896 | 12.197 | 2686.63 | 1.682 | 76.11 | 13.878 | 2370.30 | +| 8192 | 32 | 8 | 65792 | 24.409 | 2684.94 | 2.556 | 100.17 | 26.965 | 2439.95 | +| 8192 | 32 | 16 | 131584 | 48.753 | 2688.50 | 3.994 | 128.20 | 52.747 | 2494.64 | +| 8192 | 32 | 32 | 263168 | 97.508 | 2688.42 | 6.528 | 156.86 | 104.037 | 2529.57 | - `llama-bench` | model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3175.62 ± 15.26 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 60.92 ± 0.17 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2704.06 ± 6.21 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 53.52 ± 0.35 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2381.88 ± 8.79 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 48.58 ± 0.11 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1900.91 ± 8.51 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 40.81 ± 0.06 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1292.19 ± 7.09 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 30.80 ± 0.02 | - -build: 2f68ce7cf (6918) +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2925.55 ± 4.25 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 62.80 ± 0.27 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2531.01 ± 6.79 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 55.86 ± 0.33 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2244.39 ± 5.33 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 45.95 ± 0.33 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1783.17 ± 3.68 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 39.07 ± 0.10 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1241.90 ± 3.13 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 29.92 ± 0.06 | + +build: eeee367de (6989) ## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF @@ -183,44 +177,42 @@ main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.215 | 2379.83 | 1.095 | 29.24 | 1.310 | 415.37 | -| 512 | 32 | 2 | 1088 | 0.424 | 2416.31 | 1.175 | 54.46 | 1.599 | 680.46 | -| 512 | 32 | 4 | 2176 | 0.880 | 2328.41 | 1.227 | 104.33 | 2.106 | 1033.02 | -| 512 | 32 | 8 | 4352 | 1.738 | 2356.86 | 1.308 | 195.67 | 3.046 | 1428.66 | -| 512 | 32 | 16 | 8704 | 3.472 | 2359.65 | 1.528 | 335.01 | 5.000 | 1740.80 | -| 512 | 32 | 32 | 17408 | 6.956 | 2355.21 | 1.726 | 593.27 | 8.683 | 2004.95 | -| 4096 | 32 | 1 | 4128 | 1.813 | 2259.55 | 1.126 | 28.42 | 2.939 | 1404.78 | -| 4096 | 32 | 2 | 8256 | 3.601 | 2274.88 | 1.246 | 51.35 | 4.848 | 1703.13 | -| 4096 | 32 | 4 | 16512 | 7.203 | 2274.48 | 1.364 | 93.86 | 8.567 | 1927.37 | -| 4096 | 32 | 8 | 33024 | 14.403 | 2275.04 | 1.570 | 163.10 | 15.973 | 2067.50 | -| 4096 | 32 | 16 | 66048 | 28.797 | 2275.78 | 2.025 | 252.80 | 30.822 | 2142.85 | -| 4096 | 32 | 32 | 132096 | 57.630 | 2274.38 | 2.716 | 377.05 | 60.346 | 2188.99 | -| 8192 | 32 | 1 | 8224 | 3.746 | 2186.60 | 1.164 | 27.50 | 4.910 | 1674.94 | -| 8192 | 32 | 2 | 16448 | 7.473 | 2192.42 | 1.323 | 48.39 | 8.796 | 1870.03 | -| 8192 | 32 | 4 | 32896 | 14.905 | 2198.45 | 1.511 | 84.71 | 16.416 | 2003.89 | -| 8192 | 32 | 8 | 65792 | 29.837 | 2196.49 | 1.843 | 138.88 | 31.680 | 2076.77 | -| 8192 | 32 | 16 | 131584 | 59.700 | 2195.53 | 2.609 | 196.21 | 62.309 | 2111.80 | -| 8192 | 32 | 32 | 263168 | 119.441 | 2194.75 | 3.818 | 268.20 | 123.259 | 2135.08 | - - +| 512 | 32 | 1 | 544 | 0.211 | 2421.57 | 1.055 | 30.33 | 1.266 | 429.57 | +| 512 | 32 | 2 | 1088 | 0.419 | 2441.34 | 1.130 | 56.65 | 1.549 | 702.32 | +| 512 | 32 | 4 | 2176 | 0.873 | 2345.54 | 1.174 | 108.99 | 2.048 | 1062.74 | +| 512 | 32 | 8 | 4352 | 1.727 | 2371.85 | 1.254 | 204.22 | 2.980 | 1460.19 | +| 512 | 32 | 16 | 8704 | 3.452 | 2373.22 | 1.492 | 343.16 | 4.944 | 1760.56 | +| 512 | 32 | 32 | 17408 | 6.916 | 2368.93 | 1.675 | 611.51 | 8.591 | 2026.36 | +| 4096 | 32 | 1 | 4128 | 1.799 | 2277.26 | 1.084 | 29.51 | 2.883 | 1431.91 | +| 4096 | 32 | 2 | 8256 | 3.577 | 2290.01 | 1.196 | 53.50 | 4.774 | 1729.51 | +| 4096 | 32 | 4 | 16512 | 7.172 | 2284.36 | 1.313 | 97.50 | 8.485 | 1946.00 | +| 4096 | 32 | 8 | 33024 | 14.341 | 2284.96 | 1.520 | 168.46 | 15.860 | 2082.18 | +| 4096 | 32 | 16 | 66048 | 28.675 | 2285.44 | 1.983 | 258.21 | 30.658 | 2154.33 | +| 4096 | 32 | 32 | 132096 | 57.354 | 2285.32 | 2.640 | 387.87 | 59.994 | 2201.82 | +| 8192 | 32 | 1 | 8224 | 3.701 | 2213.75 | 1.119 | 28.59 | 4.820 | 1706.34 | +| 8192 | 32 | 2 | 16448 | 7.410 | 2211.19 | 1.272 | 50.31 | 8.682 | 1894.56 | +| 8192 | 32 | 4 | 32896 | 14.802 | 2213.83 | 1.460 | 87.68 | 16.261 | 2022.96 | +| 8192 | 32 | 8 | 65792 | 29.609 | 2213.35 | 1.781 | 143.74 | 31.390 | 2095.93 | +| 8192 | 32 | 16 | 131584 | 59.229 | 2212.96 | 2.495 | 205.17 | 61.725 | 2131.79 | +| 8192 | 32 | 32 | 263168 | 118.449 | 2213.15 | 3.714 | 275.75 | 122.162 | 2154.25 | - `llama-bench` | model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2276.28 ± 4.92 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 29.30 ± 0.03 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2090.96 ± 6.11 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 28.27 ± 0.03 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1913.47 ± 5.86 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 27.42 ± 0.03 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1643.88 ± 8.45 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 25.93 ± 0.01 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1283.36 ± 1.00 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 22.70 ± 0.02 | - -build: 2f68ce7cf (6918) +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2272.74 ± 4.68 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 30.66 ± 0.02 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2107.80 ± 9.55 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 29.71 ± 0.05 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1937.80 ± 6.75 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 28.86 ± 0.04 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1641.12 ± 1.78 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 27.24 ± 0.04 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1296.02 ± 2.67 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 23.78 ± 0.03 | + +build: eeee367de (6989) ## ggml-org/gemma-3-4b-it-qat-GGUF @@ -233,43 +225,40 @@ main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_ | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.098 | 5236.19 | 0.408 | 78.38 | 0.506 | 1074.95 | -| 512 | 32 | 2 | 1088 | 0.171 | 5984.52 | 0.507 | 126.21 | 0.678 | 1604.23 | -| 512 | 32 | 4 | 2176 | 0.342 | 5981.45 | 0.554 | 231.11 | 0.896 | 2427.90 | -| 512 | 32 | 8 | 4352 | 0.674 | 6077.05 | 0.692 | 369.93 | 1.366 | 3185.88 | -| 512 | 32 | 16 | 8704 | 1.340 | 6115.09 | 0.930 | 550.47 | 2.270 | 3834.78 | -| 512 | 32 | 32 | 17408 | 2.676 | 6123.63 | 1.277 | 801.81 | 3.953 | 4404.13 | -| 4096 | 32 | 1 | 4128 | 0.711 | 5756.92 | 0.454 | 70.44 | 1.166 | 3541.03 | -| 4096 | 32 | 2 | 8256 | 1.404 | 5835.14 | 0.571 | 112.12 | 1.975 | 4180.78 | -| 4096 | 32 | 4 | 16512 | 2.786 | 5880.66 | 0.668 | 191.68 | 3.454 | 4780.71 | -| 4096 | 32 | 8 | 33024 | 5.547 | 5907.20 | 0.891 | 287.46 | 6.438 | 5129.79 | -| 4096 | 32 | 16 | 66048 | 11.097 | 5905.68 | 1.315 | 389.48 | 12.412 | 5321.43 | -| 4096 | 32 | 32 | 132096 | 22.178 | 5910.07 | 2.055 | 498.18 | 24.233 | 5451.03 | -| 8192 | 32 | 1 | 8224 | 1.424 | 5752.81 | 0.467 | 68.51 | 1.891 | 4348.89 | -| 8192 | 32 | 2 | 16448 | 2.819 | 5812.28 | 0.630 | 101.53 | 3.449 | 4768.59 | -| 8192 | 32 | 4 | 32896 | 5.626 | 5824.22 | 0.791 | 161.74 | 6.418 | 5125.96 | -| 8192 | 32 | 8 | 65792 | 11.238 | 5831.66 | 1.163 | 220.10 | 12.401 | 5305.36 | -| 8192 | 32 | 16 | 131584 | 22.426 | 5844.66 | 1.857 | 275.70 | 24.283 | 5418.77 | -| 8192 | 32 | 32 | 263168 | 44.834 | 5847.04 | 3.146 | 325.48 | 47.980 | 5484.98 | - - +| 512 | 32 | 1 | 544 | 0.094 | 5434.73 | 0.394 | 81.21 | 0.488 | 1114.15 | +| 512 | 32 | 2 | 1088 | 0.168 | 6091.68 | 0.498 | 128.52 | 0.666 | 1633.41 | +| 512 | 32 | 4 | 2176 | 0.341 | 6010.68 | 0.542 | 236.37 | 0.882 | 2466.43 | +| 512 | 32 | 8 | 4352 | 0.665 | 6161.46 | 0.678 | 377.74 | 1.342 | 3241.72 | +| 512 | 32 | 16 | 8704 | 1.323 | 6193.19 | 0.902 | 567.41 | 2.225 | 3911.74 | +| 512 | 32 | 32 | 17408 | 2.642 | 6202.03 | 1.231 | 832.03 | 3.872 | 4495.36 | +| 4096 | 32 | 1 | 4128 | 0.701 | 5840.49 | 0.439 | 72.95 | 1.140 | 3621.23 | +| 4096 | 32 | 2 | 8256 | 1.387 | 5906.82 | 0.574 | 111.48 | 1.961 | 4210.12 | +| 4096 | 32 | 4 | 16512 | 2.758 | 5940.33 | 0.651 | 196.58 | 3.409 | 4843.33 | +| 4096 | 32 | 8 | 33024 | 5.491 | 5967.56 | 0.876 | 292.40 | 6.367 | 5187.12 | +| 4096 | 32 | 16 | 66048 | 10.978 | 5969.58 | 1.275 | 401.69 | 12.253 | 5390.38 | +| 4096 | 32 | 32 | 132096 | 21.944 | 5972.93 | 1.992 | 514.16 | 23.936 | 5518.73 | +| 8192 | 32 | 1 | 8224 | 1.402 | 5841.91 | 0.452 | 70.73 | 1.855 | 4434.12 | +| 8192 | 32 | 2 | 16448 | 2.793 | 5865.34 | 0.637 | 100.55 | 3.430 | 4795.51 | +| 8192 | 32 | 4 | 32896 | 5.564 | 5889.64 | 0.770 | 166.26 | 6.334 | 5193.95 | +| 8192 | 32 | 8 | 65792 | 11.114 | 5896.44 | 1.122 | 228.07 | 12.237 | 5376.51 | +| 8192 | 32 | 16 | 131584 | 22.210 | 5901.38 | 1.789 | 286.15 | 24.000 | 5482.74 | +| 8192 | 32 | 32 | 263168 | 44.382 | 5906.56 | 3.044 | 336.38 | 47.426 | 5549.02 | - `llama-bench` | model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 5713.05 ± 19.74 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 78.97 ± 0.15 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 5165.17 ± 27.83 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 68.94 ± 0.99 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 4885.85 ± 46.27 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 67.81 ± 0.84 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 4417.88 ± 52.14 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 64.35 ± 0.66 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 3667.74 ± 50.02 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 58.38 ± 0.05 | - -build: 2f68ce7cf (6918) - +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 5810.04 ± 21.71 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 84.54 ± 0.18 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 5288.04 ± 3.54 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 78.82 ± 1.37 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 4960.43 ± 16.64 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.13 ± 0.30 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 4495.92 ± 31.11 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 72.37 ± 0.29 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 3746.90 ± 40.01 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 63.02 ± 0.20 | + +build: eeee367de (6989)