From 9cd402cbe1744f191b48e1ed943c37c210f83ab6 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 11 Jun 2025 17:53:55 -0400
Subject: [PATCH 01/13] tests : add test-model-random

This generates random models and then tests different concurrencies of
batches to check if the output is consistent.

This can detect when e.g. the recurrent cache has been broken, or
anything else which would affect the consistency of the output when
inferencing multiple distinct sequences.

More architectures will be added, but for now this starts with Mamba.
Eventually, consistency of pooled embeddings will also be tested.

The goal is to reduce accidental regressions by making it easy to
quickly test a lot of edge cases on the supported architectures,
without having to download any model.
---
 tests/CMakeLists.txt        |   1 +
 tests/test-model-random.cpp | 920 ++++++++++++++++++++++++++++++++++++
 2 files changed, 921 insertions(+)
 create mode 100644 tests/test-model-random.cpp

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index db4b2cf65cc43..14cb9aee8c29d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -193,6 +193,7 @@ endif()
 # llama_build_and_test(test-opt.cpp) # SLOW
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
+llama_build_and_test(test-model-random.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp       LABEL "model")
diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
new file mode 100644
index 0000000000000..95a03bc2f1c17
--- /dev/null
+++ b/tests/test-model-random.cpp
@@ -0,0 +1,920 @@
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+
+#include <algorithm>
+#include <cmath>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+// NOTE: the llm_arch enum is in the private API
+#include "../src/llama-model.h"
+#include "ggml.h"
+#include "gguf.h"
+// For gguf_type_size
+#include "../ggml/src/ggml-impl.h"
+
+struct random_tensor {
+    const std::string          name;
+    const std::vector<int64_t> shape;
+    const ggml_type type = GGML_TYPE_F32; // TODO: maybe make this configurable?
+    const std::function<float(std::mt19937 &, int64_t)> distribution;
+
+    random_tensor(std::string name, std::vector<int64_t> shape,
+                  const std::function<float(std::mt19937 &)> & distribution = std::normal_distribution<float>()) :
+        name(std::move(name)),
+        shape(std::move(shape)),
+        distribution([distribution](std::mt19937 & rng, int64_t i) {
+            GGML_UNUSED(i);
+            return distribution(rng);
+        }) {}
+
+    random_tensor(std::string name, std::vector<int64_t> shape, const std::function<float(int64_t)> & distribution) :
+        name(std::move(name)),
+        shape(std::move(shape)),
+        distribution([distribution](std::mt19937 & rng, int64_t i) {
+            GGML_UNUSED(rng);
+            return distribution(i);
+        }) {}
+
+    random_tensor(std::string name, std::vector<int64_t> shape, const std::function<float()> & distribution) :
+        name(std::move(name)),
+        shape(std::move(shape)),
+        distribution([distribution](std::mt19937 & rng, int64_t i) {
+            GGML_UNUSED(rng);
+            GGML_UNUSED(i);
+            return distribution();
+        }) {}
+
+    random_tensor(std::string name, std::vector<int64_t> shape,
+                  std::function<float(std::mt19937 &, int64_t)> distribution) :
+        name(std::move(name)),
+        shape(std::move(shape)),
+        distribution(std::move(distribution)) {}
+
+    random_tensor(const random_tensor & other) :
+        name(other.name),
+        shape(other.shape),
+        type(other.type),
+        distribution(other.distribution) {}
+
+    size_t n_bytes() const {
+        int64_t prod = 1;
+        for (int64_t d : shape) {
+            prod *= d;
+        }
+        return ggml_row_size(type, prod);
+    }
+
+    ggml_tensor * to_ggml_tensor(ggml_context * ctx, std::mt19937 & rng) const {
+        ggml_tensor * tensor = ggml_new_tensor(ctx, type, shape.size(), shape.data());
+        GGML_ASSERT(tensor->data != nullptr);
+
+        int64_t n_elems = 1;
+        for (size_t i = 0; i < shape.size(); ++i) {
+            n_elems *= shape[i];
+        }
+
+        for (int64_t i = 0; i < n_elems; ++i) {
+            ((float *) tensor->data)[i] = distribution(rng, i);
+        }
+
+        return tensor;
+    }
+};
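// [editor's sketch, not part of the patch] The overloads above let variants
// pass initializers of varying arity; e.g. a tensor whose values depend only
// on the flat element index could be declared as:
//
//     random_tensor t("blk.0.ssm_a", { 16, 64 },
//                     [](int64_t i) { return -(float) ((i % 16) + 1); });
//
// while omitting the argument falls back to a standard normal distribution.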
+// TODO: move this to the gguf library?
+
+struct gguf_value {
+    const gguf_type type;
+    union {
+        uint8_t                  uint8;
+        int8_t                   int8;
+        uint16_t                 uint16;
+        int16_t                  int16;
+        uint32_t                 uint32;
+        int32_t                  int32;
+        float                    float32;
+        bool                     boolean;
+        std::string *            string;
+        std::vector<gguf_value> * array;
+        uint64_t                 uint64;
+        int64_t                  int64;
+        double                   float64;
+    } value;
+
+    ~gguf_value() {
+        switch (type) {
+            case GGUF_TYPE_STRING:
+                delete value.string;
+                break;
+            case GGUF_TYPE_ARRAY:
+                delete value.array;
+                break;
+            default:
+                break;
+        }
+    }
+
+    gguf_value(const gguf_value & other) : type(other.type) {
+        switch (type) {
+            case GGUF_TYPE_UINT8:
+            case GGUF_TYPE_INT8:
+            case GGUF_TYPE_UINT16:
+            case GGUF_TYPE_INT16:
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:
+            case GGUF_TYPE_FLOAT32:
+            case GGUF_TYPE_BOOL:
+            case GGUF_TYPE_UINT64:
+            case GGUF_TYPE_INT64:
+            case GGUF_TYPE_FLOAT64:
+            case GGUF_TYPE_COUNT:
+                value = other.value;
+                break;
+            case GGUF_TYPE_STRING:
+                value.string = new std::string(*other.value.string);
+                break;
+            case GGUF_TYPE_ARRAY:
+                value.array = new std::vector<gguf_value>(*other.value.array);
+                break;
+        }
+    }
+
+    gguf_value(int8_t val) : type(GGUF_TYPE_INT8) { value.int8 = val; }
+
+    gguf_value(uint8_t val) : type(GGUF_TYPE_UINT8) { value.uint8 = val; }
+
+    gguf_value(int16_t val) : type(GGUF_TYPE_INT16) { value.int16 = val; }
+
+    gguf_value(uint16_t val) : type(GGUF_TYPE_UINT16) { value.uint16 = val; }
+
+    gguf_value(int32_t val) : type(GGUF_TYPE_INT32) { value.int32 = val; }
+
+    gguf_value(uint32_t val) : type(GGUF_TYPE_UINT32) { value.uint32 = val; }
+
+    gguf_value(float val) : type(GGUF_TYPE_FLOAT32) { value.float32 = val; }
+
+    gguf_value(bool val) : type(GGUF_TYPE_BOOL) { value.boolean = val; }
+
+    gguf_value(uint64_t val) : type(GGUF_TYPE_UINT64) { value.uint64 = val; }
+
+    gguf_value(int64_t val) : type(GGUF_TYPE_INT64) { value.int64 = val; }
+
+    gguf_value(double val) : type(GGUF_TYPE_FLOAT64) { value.float64 = val; }
+
+    gguf_value(std::string val) : type(GGUF_TYPE_STRING) { value.string = new std::string(std::move(val)); }
+
+    gguf_value(const char * val) : type(GGUF_TYPE_STRING) { value.string = new std::string(val); }
+
+    gguf_value(std::vector<gguf_value> val) : type(GGUF_TYPE_ARRAY) { value.array = new std::vector<gguf_value>(std::move(val)); }
+
+    gguf_value(const std::vector<gguf_value> & val) : type(GGUF_TYPE_ARRAY) { value.array = new std::vector<gguf_value>(val); }
+
+    template <typename T> gguf_value(const std::vector<T> & val) : type(GGUF_TYPE_ARRAY) {
+        value.array = new std::vector<gguf_value>();
+        for (const auto & v : val) {
+            value.array->push_back(gguf_value(v));
+        }
+    }
+
+    void set(gguf_context * ctx, const std::string & key) const {
+        const char * k = key.c_str();
+        switch (type) {
+            case GGUF_TYPE_UINT8:
+                gguf_set_val_u8(ctx, k, value.uint8);
+                break;
+            case GGUF_TYPE_INT8:
+                gguf_set_val_i8(ctx, k, value.int8);
+                break;
+            case GGUF_TYPE_UINT16:
+                gguf_set_val_u16(ctx, k, value.uint16);
+                break;
+            case GGUF_TYPE_INT16:
+                gguf_set_val_i16(ctx, k, value.int16);
+                break;
+            case GGUF_TYPE_UINT32:
+                gguf_set_val_u32(ctx, k, value.uint32);
+                break;
+            case GGUF_TYPE_INT32:
+                gguf_set_val_i32(ctx, k, value.int32);
+                break;
+            case GGUF_TYPE_FLOAT32:
+                gguf_set_val_f32(ctx, k, value.float32);
+                break;
+            case GGUF_TYPE_BOOL:
+                gguf_set_val_bool(ctx, k, value.boolean);
+                break;
+            case GGUF_TYPE_STRING:
+                gguf_set_val_str(ctx, k, value.string->c_str());
+                break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    const size_t arr_size = value.array->size();
+                    if (arr_size > 0) {
+                        const gguf_type arr_type = (*value.array)[0].type;
+                        if (arr_type == GGUF_TYPE_STRING) {
+                            std::vector<const char *> strings(arr_size);
+                            for (size_t i = 0; i < arr_size; ++i) {
+                                strings[i] = (*value.array)[i].value.string->c_str();
+                            }
+                            gguf_set_arr_str(ctx, k, strings.data(), strings.size());
+                        } else {
+                            const size_t type_size = gguf_type_size(arr_type);
+                            std::vector<uint8_t> data(type_size * arr_size);
+                            for (size_t i = 0; i < arr_size; ++i) {
+                                memcpy(data.data() + type_size * i, &(*value.array)[i].value, type_size);
+                            }
+                            gguf_set_arr_data(ctx, k, arr_type, data.data(), data.size());
+                        }
+                        // TODO: handle nested arrays
+                    }
+                    break;
+                }
+            case GGUF_TYPE_UINT64:
+                gguf_set_val_u64(ctx, k, value.uint64);
+                break;
+            case GGUF_TYPE_INT64:
+                gguf_set_val_i64(ctx, k, value.int64);
+                break;
+            case GGUF_TYPE_FLOAT64:
+                gguf_set_val_f64(ctx, k, value.float64);
+                break;
+            case GGUF_TYPE_COUNT:
+                break;
+        }
+    }
+};
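// [illustrative sketch, not from the patch] Construction picks the GGUF type
// from the C++ argument type, and set() writes the value under a key, e.g.:
//
//     gguf_value((uint32_t) 32).set(ctx_gguf, "example.embedding_length");
//     gguf_value("my-model").set(ctx_gguf, "general.name");
//
// (the key names here are made up for illustration)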
+struct model_variant {
+    llm_arch    arch;
+    std::string name;
+    std::vector<random_tensor> tensors;
+    std::vector<std::pair<llm_kv, gguf_value>> metadata;
+
+    model_variant(llm_arch arch, const std::string & name) : arch(arch), name(name) {
+        add_kv(LLM_KV_GENERAL_TYPE, "model");
+        add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
+    }
+
+    model_variant(const model_variant & other) :
+        arch(other.arch),
+        name(other.name),
+        tensors(other.tensors),
+        metadata(other.metadata) {}
+
+    void add_tensor(const std::string & name, const std::vector<int64_t> & shape,
+                    const std::function<float(std::mt19937 &)> & distribution = std::normal_distribution<float>()) {
+        tensors.push_back(random_tensor(name, shape, distribution));
+    }
+
+    void add_tensor(const std::string & name, const std::vector<int64_t> & shape,
+                    const std::function<float(int64_t)> & distribution) {
+        tensors.push_back(random_tensor(name, shape, distribution));
+    }
+
+    void add_tensor(const std::string & name, const std::vector<int64_t> & shape,
+                    const std::function<float()> & distribution) {
+        tensors.push_back(random_tensor(name, shape, distribution));
+    }
+
+    void add_tensor(const std::string & name, const std::vector<int64_t> & shape,
+                    const std::function<float(std::mt19937 &, int64_t)> & distribution) {
+        tensors.push_back(random_tensor(name, shape, distribution));
+    }
+
+    void add_kv(llm_kv kv, const gguf_value & value) { metadata.push_back(std::pair(kv, value)); }
+
+    bool write_to_file(const char * fname, std::mt19937 & rng) const {
+        gguf_context * ctx_gguf = gguf_init_empty();
+
+        auto kv = LLM_KV(arch);
+
+        for (const auto & m : metadata) {
+            m.second.set(ctx_gguf, kv(m.first));
+        }
+
+        size_t total_size = 0;
+        for (const auto & t : tensors) {
+            total_size += t.n_bytes() + ggml_tensor_overhead();
+        }
+
+        ggml_init_params init_params = {
+            total_size,
+            nullptr, // allocate internally
+            false,   // do allocate memory when creating tensors
+        };
+
+        ggml_context * ctx = ggml_init(init_params);
+
+        // initialize the tensors and add to GGUF
+        for (const auto & t : tensors) {
+            ggml_tensor * tensor = t.to_ggml_tensor(ctx, rng);
+            ggml_set_name(tensor, t.name.c_str());
+
+            gguf_add_tensor(ctx_gguf, tensor);
+        }
+
+        return gguf_write_to_file(ctx_gguf, fname, false);
+    }
+
+    static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch) {
+        uint32_t n_vocab = 256;
+        uint32_t n_embd  = 32;
+        uint32_t n_ff    = 3 * n_embd;
+
+        uint32_t n_layer = 2;
+
+        auto tn = LLM_TN(arch);
+
+        // random vocab
+        const auto add_tokenizer = [](model_variant & m, uint32_t n_vocab) {
+            std::vector<std::string> vocab_tokens(n_vocab);
+            std::vector<float>       vocab_scores(n_vocab);
+            std::vector<int32_t>     vocab_types(n_vocab);
+
+            char buf[32];
+            for (size_t i = 0; i < n_vocab; ++i) {
+                sprintf(buf, "<%zu>", i);
+                vocab_tokens[i] = std::string(buf);
+                vocab_scores[i] = -1000.0f;
+                vocab_types[i]  = 4; // USER_DEFINED type
+            }
+
+            m.add_kv(LLM_KV_TOKENIZER_MODEL, "llama");
+            m.add_kv(LLM_KV_TOKENIZER_PRE, "default");
+            m.add_kv(LLM_KV_TOKENIZER_LIST, vocab_tokens);
+            m.add_kv(LLM_KV_TOKENIZER_SCORES, vocab_scores);
+            m.add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, vocab_types);
+        };
+
+        // TODO: fill the variants
+        // TODO: how to make the variants more modular?
+        switch (arch) {
+            case LLM_ARCH_LLAMA:
+            case LLM_ARCH_LLAMA4:
+            case LLM_ARCH_DECI:
+            case LLM_ARCH_FALCON:
+            case LLM_ARCH_BAICHUAN:
+            case LLM_ARCH_GROK:
+            case LLM_ARCH_GPT2:
+            case LLM_ARCH_GPTJ:
+            case LLM_ARCH_GPTNEOX:
+            case LLM_ARCH_MPT:
+            case LLM_ARCH_STARCODER:
+            case LLM_ARCH_REFACT:
+            case LLM_ARCH_BERT:
+            case LLM_ARCH_NOMIC_BERT:
+            case LLM_ARCH_NOMIC_BERT_MOE:
+            case LLM_ARCH_JINA_BERT_V2:
+            case LLM_ARCH_BLOOM:
+            case LLM_ARCH_STABLELM:
+            case LLM_ARCH_QWEN:
+            case LLM_ARCH_QWEN2:
+            case LLM_ARCH_QWEN2MOE:
+            case LLM_ARCH_QWEN2VL:
+            case LLM_ARCH_QWEN3:
+            case LLM_ARCH_QWEN3MOE:
+            case LLM_ARCH_PHI2:
+            case LLM_ARCH_PHI3:
+            case LLM_ARCH_PHIMOE:
+            case LLM_ARCH_PLAMO:
+            case LLM_ARCH_CODESHELL:
+            case LLM_ARCH_ORION:
+            case LLM_ARCH_INTERNLM2:
+            case LLM_ARCH_MINICPM:
+            case LLM_ARCH_MINICPM3:
+            case LLM_ARCH_GEMMA:
+            case LLM_ARCH_GEMMA2:
+            case LLM_ARCH_GEMMA3:
+            case LLM_ARCH_STARCODER2:
+                break;
+            case LLM_ARCH_MAMBA:
+                {
+                    variants.push_back(model_variant(arch, "Mamba"));
+                    model_variant & cur = variants.back();
+
+                    const uint32_t d_inner = 2 * n_embd;
+                    const uint32_t d_conv  = 4;
+                    const uint32_t d_state = 16;
+                    const uint32_t dt_rank = (n_embd + 15) / 16;
+
+                    const auto init_A_S4D = [](int64_t i) {
+                        return -((i % d_state) + 1);
+                    };
+
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 1024 * 1024);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, (uint32_t) 0);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, (uint32_t) 0);
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_SSM_CONV_KERNEL, d_conv);
+                    cur.add_kv(LLM_KV_SSM_INNER_SIZE, d_inner);
+                    cur.add_kv(LLM_KV_SSM_STATE_SIZE, d_state);
+                    cur.add_kv(LLM_KV_SSM_TIME_STEP_RANK, dt_rank);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, 2 * d_inner });
+
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { d_conv, d_inner });
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), { d_inner });
+
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), { d_inner, dt_rank + 2 * d_state });
+
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), { dt_rank, d_inner });
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { d_inner });
+
+                        // no "weight" suffix for these
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_A, i), { d_state, d_inner }, init_A_S4D);
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_D, i), { d_inner }, []() { return 1.0f; });
+
+                        // out_proj
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { d_inner, n_embd });
+                    }
+                }
+                break;
+            case LLM_ARCH_XVERSE:
+            case LLM_ARCH_COMMAND_R:
+            case LLM_ARCH_COHERE2:
+            case LLM_ARCH_DBRX:
+            case LLM_ARCH_OLMO:
+            case LLM_ARCH_OLMO2:
+            case LLM_ARCH_OLMOE:
+            case LLM_ARCH_OPENELM:
+            case LLM_ARCH_ARCTIC:
+            case LLM_ARCH_DEEPSEEK:
+            case LLM_ARCH_DEEPSEEK2:
+            case LLM_ARCH_CHATGLM:
+            case LLM_ARCH_GLM4:
+            case LLM_ARCH_BITNET:
+            case LLM_ARCH_T5:
+            case LLM_ARCH_T5ENCODER:
+            case LLM_ARCH_JAIS:
+            case LLM_ARCH_NEMOTRON:
+            case LLM_ARCH_EXAONE:
+            case LLM_ARCH_RWKV6:
+            case LLM_ARCH_RWKV6QWEN2:
+                break;
+            case LLM_ARCH_RWKV7:
+                break;
+                // TODO: proper initialization of the tensors
+                // ref: https://github.com/BlinkDL/RWKV-LM/blob/247ce631e372b743ff496908d3e29df710506661/RWKV-v7/train_temp/src/model.py
+                {
+                    variants.push_back(model_variant(arch, "RWKV7"));
+                    model_variant & cur = variants.back();
+
+                    // TODO: use more realistic hyperparams
+
+                    n_embd = 128; // TODO: why does this need to be bigger than head_size?
+
+                    const uint32_t head_size = 64; // TODO: is this assumed by the ops?
+
+                    const auto calc_lora_rank = [](uint32_t hidden_size, float exponent, float multiplier) {
+                        return std::max((uint32_t) 1,
+                                        (uint32_t) std::round(std::pow(hidden_size, exponent) * multiplier / 32.0f)) *
+                               (uint32_t) 32;
+                    };
+
+                    const uint32_t n_lora_decay         = calc_lora_rank(n_embd, 0.5, 1.8);
+                    const uint32_t n_lora_iclr          = calc_lora_rank(n_embd, 0.5, 1.8);
+                    const uint32_t n_lora_value_res_mix = calc_lora_rank(n_embd, 0.5, 1.3);
+                    const uint32_t n_lora_gate          = calc_lora_rank(n_embd, 0.8, 0.6);
+                    const uint32_t attn_hidden_size     = n_embd;
+                    const uint32_t ffn_size             = n_ff;
+
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 1024 * 1024);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_WKV_HEAD_SIZE, head_size);
+                    cur.add_kv(LLM_KV_ATTENTION_DECAY_LORA_RANK, n_lora_decay);
+                    cur.add_kv(LLM_KV_ATTENTION_ICLR_LORA_RANK, n_lora_iclr);
+                    cur.add_kv(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, n_lora_value_res_mix);
+                    cur.add_kv(LLM_KV_ATTENTION_GATE_LORA_RANK, n_lora_gate);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, (uint32_t) 0);
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // Block 0, LN0
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd});
+
+                        if (i == 0) {
+                            // actually not used
+                            cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd});
+                            cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr});
+                            cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd});
+                        } else {
+                            cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd});
+                            cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix});
+                            cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd});
+                        }
+
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6});
+
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size});
+
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
+
+                        cur.add_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
+
+                        cur.add_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
+                        cur.add_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
+                    }
+                } break;
+            case LLM_ARCH_ARWKV7:
+            case LLM_ARCH_GRANITE:
+            case LLM_ARCH_GRANITE_MOE:
+            case LLM_ARCH_CHAMELEON:
+            case LLM_ARCH_WAVTOKENIZER_DEC:
+            case LLM_ARCH_PLM:
+            case LLM_ARCH_BAILINGMOE:
+            case LLM_ARCH_UNKNOWN:
+                break;
+        }
+    }
+};
+
+struct reference_logits {
+    int32_t n_vocab;
+    int32_t prompt_len;
+    std::vector<llama_token> inputs;
+    std::vector<float>       outputs;
+
+    reference_logits(llama_context * ctx, int32_t seq_len, std::mt19937 & rng) {
+        n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(llama_get_model(ctx)));
+        std::uniform_int_distribution<llama_token> rand_token(0, n_vocab - 1);
+        std::uniform_int_distribution<int32_t> rand_prompt_len(seq_len / 4, 3 * seq_len / 4);
+
+        llama_batch batch = llama_batch_init(seq_len, 0, 1);
+
+        outputs.reserve(n_vocab * seq_len);
+
+        prompt_len = rand_prompt_len(rng);
+
+        for (int32_t i = 0; i < prompt_len; ++i) {
+            const llama_token token = rand_token(rng);
+            inputs.push_back(token);
+
+            common_batch_add(batch, token, i, { 0 }, true);
+        }
+
+        const int status_prompt = llama_decode(ctx, batch);
+        GGML_ASSERT(status_prompt == 0);
+
+        const float * output_prompt = llama_get_logits(ctx);
+        GGML_ASSERT(output_prompt);
+        outputs.insert(outputs.end(), output_prompt, output_prompt + n_vocab * prompt_len);
+
+        for (int32_t i = prompt_len; i < seq_len; ++i) {
+            common_batch_clear(batch);
+
+            const llama_token token = rand_token(rng); // no real need to sample
+            inputs.push_back(token);
+            common_batch_add(batch, token, i, { 0 }, true);
+
+            const int status = llama_decode(ctx, batch);
+            GGML_ASSERT(status == 0);
+
+            const float * output = llama_get_logits_ith(ctx, -1);
+            GGML_ASSERT(output);
+            outputs.insert(outputs.end(), output, output + n_vocab);
+        }
+
+        llama_batch_free(batch);
+    }
+
+    // TODO: unlink pos from index into inputs
+    // TODO: randomize which outputs are enabled
+    void add_to_batch(llama_batch & batch, llama_pos initial_pos, int32_t seq_len, llama_seq_id seq_id) {
+        for (int32_t i = 0; i < seq_len; ++i) {
+            llama_pos pos = initial_pos + i;
+            common_batch_add(batch, inputs[pos], pos, { seq_id }, i == seq_len - 1);
+        }
+    }
+
+    // returns normalized squared error of the output for the seq_id
+    float validate_batch(llama_context * ctx, const llama_batch & batch, llama_seq_id seq_id) {
+        float sumr2 = 0.0f;
+        float sumo2 = 0.0f;
+
+        float sumerr2 = 0.0f;
+
+        llama_pos first_pos_error = -1;
+
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            if (batch.seq_id[i][0] == seq_id && batch.logits[i]) {
+                const llama_pos pos = batch.pos[i];
+
+                const float * out           = llama_get_logits_ith(ctx, i);
+                const float * reference_out = &outputs[pos * n_vocab];
+
+                GGML_ASSERT(out);
+
+                for (int j = 0; j < n_vocab; ++j) {
+                    sumo2 += out[j] * out[j];
+                    sumr2 += reference_out[j] * reference_out[j];
+
+                    const float err = (out[j] - reference_out[j]);
+                    if (err > 0.0f) {
+                        if (first_pos_error < 0 || pos < first_pos_error) {
+                            first_pos_error = pos;
+                        }
+                    }
+
+                    sumerr2 += err * err;
+                }
+            }
+        }
+        if (first_pos_error >= 0) {
+            // fprintf(stderr, "Potential error in seq_id %i starting from pos %i\n", seq_id, first_pos_error);
+        }
+
+        const float denom = std::sqrt(sumr2 * sumo2);
+
+        return sumerr2 / (denom > 0.0f ? denom : 1.0f);
+    }
+};
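// [illustrative note, not from the patch] The consistency metric computed by
// validate_batch above is a normalized squared error; for an output vector
// out and a reference vector ref of size n, it corresponds to:
//
//     nmse = sum_j (out[j] - ref[j])^2 / sqrt(sum_j ref[j]^2 * sum_j out[j]^2)
//
// and a sequence is flagged as invalid in main() once this exceeds 1/1024.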
+
+struct reference_embd {
+    size_t n_embd;
+    std::vector<float> inputs;
+    std::vector<float> outputs;
+
+    // TODO: constructor
+};
+
+static void batch_add_embd(struct llama_batch & batch, const std::vector<float> & embd, llama_pos pos, llama_seq_id seq_id, bool logits) {
+    GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
+
+    memcpy(batch.embd + batch.n_tokens * embd.size(), embd.data(), sizeof(float) * embd.size());
+    batch.pos[batch.n_tokens] = pos;
+    batch.n_seq_id[batch.n_tokens] = 1;
+    batch.seq_id[batch.n_tokens][0] = seq_id;
+    batch.logits[batch.n_tokens] = logits;
+
+    batch.n_tokens++;
+}
+
+static void permute_from_ids(uint8_t * array, size_t elem_size, const std::vector<int32_t> & ids) {
+    std::vector<uint8_t> tmp(elem_size * ids.size(), 0);
+
+    for (size_t i = 0; i < ids.size(); ++i) {
+        memcpy(tmp.data() + i * elem_size, array + ids[i] * elem_size, elem_size);
+    }
+
+    memcpy(array, tmp.data(), ids.size() * elem_size);
+}
+
+static void shuffle_batch(struct llama_batch & batch, std::mt19937 & rng) {
+    std::vector<int32_t> ids(batch.n_tokens);
+    for (int32_t i = 0; i < batch.n_tokens; ++i) {
+        ids[i] = i;
+    }
+
+    std::shuffle(ids.begin(), ids.end(), rng);
+
+    if (batch.token) {
+        permute_from_ids((uint8_t *) batch.token, sizeof(*batch.token), ids);
+    }
+    if (batch.embd) {
+        permute_from_ids((uint8_t *) batch.embd, sizeof(*batch.embd), ids);
+    }
+    permute_from_ids((uint8_t *) batch.pos, sizeof(*batch.pos), ids);
+    permute_from_ids((uint8_t *) batch.n_seq_id, sizeof(*batch.n_seq_id), ids);
+    permute_from_ids((uint8_t *) batch.seq_id, sizeof(*batch.seq_id), ids);
+    permute_from_ids((uint8_t *) batch.logits, sizeof(*batch.logits), ids);
+}
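// [illustrative note, not from the patch] permute_from_ids gathers rather
// than swaps: tmp[i] = array[ids[i]], so after the call, batch slot i holds
// whatever was previously at position ids[i]. shuffle_batch relies on this
// to reorder all per-token arrays (token, embd, pos, seq_id, logits) with
// the same permutation.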
+
+// TODO: use more args
+int main(int argc, char ** argv) {
+
+    std::string tmp_fname = "test-model-random-tmp.gguf";
+
+    if (argc > 1) {
+        // TODO: ensure it ends with .gguf
+        std::string arg(argv[1]);
+        const std::string suffix = ".gguf";
+
+        if (suffix == arg.substr(arg.size() - suffix.size())) {
+            tmp_fname = std::move(arg);
+        } else {
+            // TODO: print usage
+            exit(1);
+        }
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // TODO: use specific backends
+    llama_backend_init();
+
+    // TODO: maybe use a faster rng algorithm
+    std::mt19937 rng(42);
+
+    // TODO: multiple sequences per token
+    const int64_t n_batch   = 2048;
+    const int64_t n_seq_len = 1024;
+    std::uniform_int_distribution<int64_t> rand_seq_init_len(n_seq_len / 4, 3 * n_seq_len / 4);
+
+    llama_batch batch = llama_batch_init(n_batch, 0, 1);
+    // TODO: batch with embeddings
+
+    std::vector<model_variant> model_variants;
+
+    for (int i = 0; i < LLM_ARCH_UNKNOWN; ++i) {
+        llm_arch arch = (llm_arch) i;
+        model_variant::insert_from_arch(model_variants, arch);
+    }
+
+    // TODO: concurrent tests?
+
+    for (const model_variant & variant : model_variants) {
+        std::vector<std::string> splits = {};
+
+        variant.write_to_file(tmp_fname.c_str(), rng);
+
+        llama_model_params model_params = llama_model_default_params();
+
+        model_params.check_tensors = true;
+
+        llama_model * model = llama_model_load_from_file(tmp_fname.c_str(), model_params);
+
+        GGML_ASSERT(model);
+
+        // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+        // const auto n_embd  = llama_model_n_embd(model);
+
+        for (int64_t n_seq_max : { 1, 2, 13 } ) {
+
+            // TODO(later): context shift testing
+            for (int64_t n_ctx : { n_seq_len * n_seq_max }) {
+
+                std::vector<reference_logits> ref_outputs;
+
+                {
+                    llama_context_params ref_params = llama_context_default_params();
+                    ref_params.n_batch   = n_seq_len;
+                    ref_params.n_ubatch  = 1;
+                    ref_params.n_ctx     = n_seq_len;
+                    ref_params.n_seq_max = 1;
+                    ref_params.n_threads = 1;
+                    ref_params.n_threads_batch = 1;
+
+                    llama_context * ref_ctx = llama_init_from_model(model, ref_params);
+
+                    llama_memory_t mem = llama_get_memory(ref_ctx);
+
+                    for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
+                        llama_memory_clear(mem, true);
+                        ref_outputs.push_back(reference_logits(ref_ctx, n_seq_len, rng));
+                    }
+
+                    llama_free(ref_ctx);
+                }
+
+                for (bool shuffle : { false, true }) {
+
+                    for (int64_t n_ubatch : { 1, 2, 512 } ) {
+
+                        std::vector<bool> valid(n_seq_max, true);
+
+                        llama_context_params ctx_params = llama_context_default_params();
+                        ctx_params.n_ctx     = n_ctx;
+                        ctx_params.n_seq_max = n_seq_max;
+                        ctx_params.n_ubatch  = n_ubatch;
+                        ctx_params.n_batch   = n_batch;
+
+                        llama_context * ctx = llama_init_from_model(model, ctx_params);
+
+                        common_batch_clear(batch);
+
+                        std::set<llama_seq_id> seq_ids_in_batch;
+                        std::vector<llama_pos> seq_id_n_past(n_seq_max, 0);
+
+                        // start filling the batch with prompts
+                        while (std::any_of(seq_id_n_past.begin(), seq_id_n_past.end(),
+                                           [](llama_pos p) { return p < n_seq_len; })) {
+                            for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
+                                if (seq_id_n_past[seq_id] >= ref_outputs[seq_id].prompt_len) {
+                                    continue;
+                                }
+
+                                if (batch.n_tokens < n_batch) {
+                                    const int64_t seq_len =
+                                        std::min(n_batch - batch.n_tokens,
+                                                 (int64_t) ref_outputs[seq_id].prompt_len - seq_id_n_past[seq_id]);
+
+                                    ref_outputs[seq_id].add_to_batch(batch, seq_id_n_past[seq_id], seq_len, seq_id);
+                                    seq_ids_in_batch.insert(seq_id);
+                                    seq_id_n_past[seq_id] += seq_len;
+                                }
+                            }
+                            if (shuffle) {
+                                shuffle_batch(batch, rng);
+                            }
+
+                            llama_decode(ctx, batch);
+
+                            for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
+                                float err = ref_outputs[seq_id].validate_batch(ctx, batch, seq_id);
+                                if (!isfinite(err) || err > 1.0f / 1024.0f) {
+                                    fprintf(stderr, "Error for seq_id %i is %f\n", seq_id, err);
+                                    valid[seq_id] = false;
+                                }
+                            }
+
+                            common_batch_clear(batch);
+
+                            GGML_ASSERT(n_seq_max <= n_batch); // not handling splitting this across batches here
+
+                            // TODO: use seq_rm, seq_cp, etc. to test if they work properly
+
+                            // cont batching
+                            for (llama_seq_id s : seq_ids_in_batch) {
+                                llama_pos & pos = seq_id_n_past[s];
+                                if (pos >= n_seq_len) {
+                                    continue;
+                                }
+                                ref_outputs[s].add_to_batch(batch, pos, 1, s);
+                                pos += 1;
+                            }
+                        }
+
+                        fprintf(stdout,
+                                "Comparing output for '%s', with shuffle=%i, n_seq_max=%li, n_ctx=%li, n_ubatch=%li: ",
+                                variant.name.c_str(), shuffle, n_seq_max, n_ctx, n_ubatch);
+                        if (std::all_of(valid.begin(), valid.end(), [](bool v) { return v; })) {
+                            fprintf(stdout, "\033[1;32mOK\033[0m\n");
+                        } else {
+                            fprintf(stdout, "(%zu%%) \033[1;31mFAILED\033[0m\n",
+                                    std::count_if(valid.begin(), valid.end(), [](bool v) { return v == false; }) * 100 / valid.size());
+                            // cleanup and exit on first failure
+                            llama_free(ctx);
+                            llama_model_free(model);
+                            llama_batch_free(batch);
+                            exit(1);
+                        }
+
+                        llama_free(ctx);
+                    }
+                }
+            }
+        }
+
+        llama_model_free(model);
+    }
+
+    llama_batch_free(batch);
+
+    return 0;
+}
From 7657835b337f8487fa55d11435885608271a6b8f Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Thu, 12 Jun 2025 02:23:44 -0400
Subject: [PATCH 02/13] tests : fix overflow and memory leaks in
 test-model-random

* tests : fix integer types in test-model-random
---
 src/llama-model.cpp         |  7 +++----
 tests/test-model-random.cpp | 24 ++++++++++++++----------
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c64bf9de939f4..ed491721d49e9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -8847,9 +8847,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
 };
 
 struct llm_build_mamba : public llm_graph_context {
-    const llama_model & model;
-
-    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         ggml_tensor * cur;
         ggml_tensor * inpL;
 
@@ -8865,7 +8863,7 @@ struct llm_build_mamba : public llm_graph_context {
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
-            cur = build_mamba_layer(gf, cur, state_copy, ubatch, il);
+            cur = build_mamba_layer(gf, cur, state_copy, model, ubatch, il);
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -8906,6 +8904,7 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * state_copy,
+            const llama_model & model,
             const llama_ubatch & ubatch,
             int il) const {
         const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index 95a03bc2f1c17..8e959686703a1 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -227,7 +227,7 @@ struct gguf_value {
                             for (size_t i = 0; i < arr_size; ++i) {
                                 memcpy(data.data() + type_size * i, &(*value.array)[i].value, type_size);
                             }
-                            gguf_set_arr_data(ctx, k, arr_type, data.data(), data.size());
+                            gguf_set_arr_data(ctx, k, arr_type, data.data(), data.size() / type_size);
                         }
                         // TODO: handle nested arrays
@@ -317,7 +317,12 @@ struct model_variant {
             gguf_add_tensor(ctx_gguf, tensor);
         }
 
-        return gguf_write_to_file(ctx_gguf, fname, false);
+        bool status = gguf_write_to_file(ctx_gguf, fname, false);
+
+        ggml_free(ctx);
+        gguf_free(ctx_gguf);
+
+        return status;
     }
 
     static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch) {
@@ -762,9 +767,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(42);
 
     // TODO: multiple sequences per token
-    const int64_t n_batch   = 2048;
-    const int64_t n_seq_len = 1024;
-    std::uniform_int_distribution<int64_t> rand_seq_init_len(n_seq_len / 4, 3 * n_seq_len / 4);
+    const int32_t n_batch   = 2048;
+    const int32_t n_seq_len = 1024;
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
@@ -794,10 +798,10 @@ int main(int argc, char ** argv) {
         // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
         // const auto n_embd  = llama_model_n_embd(model);
 
-        for (int64_t n_seq_max : { 1, 2, 13 } ) {
+        for (int32_t n_seq_max : { 1, 2, 13 } ) {
 
             // TODO(later): context shift testing
-            for (int64_t n_ctx : { n_seq_len * n_seq_max }) {
+            for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
 
                 std::vector<reference_logits> ref_outputs;
 
@@ -824,7 +828,7 @@ int main(int argc, char ** argv) {
 
                 for (bool shuffle : { false, true }) {
 
-                    for (int64_t n_ubatch : { 1, 2, 512 } ) {
+                    for (int32_t n_ubatch : { 1, 2, 512 } ) {
 
                         std::vector<bool> valid(n_seq_max, true);
 
@@ -852,7 +856,7 @@ int main(int argc, char ** argv) {
                                 if (batch.n_tokens < n_batch) {
                                     const int64_t seq_len =
                                         std::min(n_batch - batch.n_tokens,
-                                                 (int64_t) ref_outputs[seq_id].prompt_len - seq_id_n_past[seq_id]);
+                                                 ref_outputs[seq_id].prompt_len - seq_id_n_past[seq_id]);
 
                                     ref_outputs[seq_id].add_to_batch(batch, seq_id_n_past[seq_id], seq_len, seq_id);
                                     seq_ids_in_batch.insert(seq_id);
@@ -891,7 +895,7 @@ int main(int argc, char ** argv) {
                         }
 
                         fprintf(stdout,
-                                "Comparing output for '%s', with shuffle=%i, n_seq_max=%li, n_ctx=%li, n_ubatch=%li: ",
+                                "Comparing output for '%s', with shuffle=%i, n_seq_max=%i, n_ctx=%i, n_ubatch=%i: ",
                                 variant.name.c_str(), shuffle, n_seq_max, n_ctx, n_ubatch);
                         if (std::all_of(valid.begin(), valid.end(), [](bool v) { return v; })) {
                             fprintf(stdout, "\033[1;32mOK\033[0m\n");

From 8fe213af7634eb75686fa7f6b0dcdbedf98539da Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Thu, 12 Jun 2025 02:48:11 -0400
Subject: [PATCH 03/13] tests : avoid sprintf in test-model-random
---
 tests/test-model-random.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index 8e959686703a1..d80044e43f6e9 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -342,7 +342,7 @@ struct model_variant {
 
             char buf[32];
             for (size_t i = 0; i < n_vocab; ++i) {
-                sprintf(buf, "<%zu>", i);
+                snprintf(buf, sizeof(buf), "<%zu>", i);
                 vocab_tokens[i] = std::string(buf);
                 vocab_scores[i] = -1000.0f;
                 vocab_types[i]  = 4; // USER_DEFINED type
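A note on the overflow fixed in PATCH 02 above: gguf_set_arr_data takes the
number of array elements, not a byte count. Reduced to a standalone sketch
(the key name is made up for illustration):

    std::vector<uint8_t> data(sizeof(float) * n); // packed float array
    gguf_set_arr_data(ctx, "example.key", GGUF_TYPE_FLOAT32,
                      data.data(), data.size() / sizeof(float)); // element count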
From dfa3c182667b3028b845f7db8c6b046a514e5406 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Fri, 13 Jun 2025 20:02:29 -0400
Subject: [PATCH 04/13] tests : add LLAMA, LLAMA4, and GEMMA2 to
 test-model-random
---
 tests/CMakeLists.txt        |   2 +-
 tests/test-model-random.cpp | 186 ++++++++++++++++++++++++++++++++++--
 2 files changed, 177 insertions(+), 11 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 14cb9aee8c29d..f3530bea993b7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -145,6 +145,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build_and_test(test-grammar-integration.cpp)
     llama_build_and_test(test-llama-grammar.cpp)
     llama_build_and_test(test-chat.cpp)
+    llama_build_and_test(test-model-random.cpp)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
@@ -193,7 +194,6 @@ endif()
 # llama_build_and_test(test-opt.cpp) # SLOW
 llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
-llama_build_and_test(test-model-random.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp       LABEL "model")
diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index d80044e43f6e9..a03ef1a4c4dab 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -257,6 +257,7 @@ struct model_variant {
     model_variant(llm_arch arch, const std::string & name) : arch(arch), name(name) {
         add_kv(LLM_KV_GENERAL_TYPE, "model");
         add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
+        add_kv(LLM_KV_GENERAL_NAME, name);
     }
 
     model_variant(const model_variant & other) :
@@ -359,7 +360,118 @@ struct model_variant {
         // TODO: how to make the variants more modular?
         switch (arch) {
             case LLM_ARCH_LLAMA:
-            case LLM_ARCH_LLAMA4:
+                {
+                    variants.push_back(model_variant(arch, "Llama2"));
+                    model_variant & cur = variants.back();
+
+                    n_embd = 16;
+                    const uint32_t n_head        = 4;
+                    const uint32_t n_head_kv     = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa  = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa  = n_embd_k_gqa;
+
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    // omitting the actual output tensor to leave it use tok_embd
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i),   {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i),   {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i),   {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
+                    }
+                } break;
+            case LLM_ARCH_LLAMA4: // has chunked interleaved sliding-window
+                {
+                    variants.push_back(model_variant(arch, "Llama4"));
+                    model_variant & cur = variants.back();
+
+                    n_layer = 4; // for the swa pattern
+                    n_embd = 16;
+                    const uint32_t n_head           = 4;
+                    const uint32_t n_head_kv        = n_head / 2;
+                    const uint32_t n_embd_head_k    = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa     = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa     = n_embd_k_gqa;
+                    const uint32_t n_moe_layer_step = 2;
+                    const uint32_t n_ff_exp         = n_embd * 2;
+                    const uint32_t n_expert         = 4;
+
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_EXPERT_COUNT, n_expert);
+                    cur.add_kv(LLM_KV_EXPERT_USED_COUNT, (uint32_t) 2);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
+                    cur.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, n_moe_layer_step);
+                    cur.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff_exp);
+                    // FIXME: this isn't used because the default is 8192
+                    cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    // omitting the actual output tensor to leave it use tok_embd
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        bool is_moe_layer = (i + 1) % n_moe_layer_step == 0;
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i),   {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i),   {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i),   {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        if (is_moe_layer) {
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),  {n_embd, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff_exp, n_embd, n_expert});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i),   {n_embd, n_ff_exp, n_expert});
+
+                            // Shared expert
+                            const int64_t n_ff_shexp = n_ff_exp;
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd    });
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i),   {    n_embd, n_ff_shexp});
+                        } else {
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                            cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
+                        }
+                    }
+                } break;
             case LLM_ARCH_DECI:
             case LLM_ARCH_FALCON:
             case LLM_ARCH_BAICHUAN:
@@ -392,7 +504,54 @@ struct model_variant {
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_MINICPM3:
             case LLM_ARCH_GEMMA:
-            case LLM_ARCH_GEMMA2:
+                break;
+            case LLM_ARCH_GEMMA2: // has standard interleaved sliding-window
+                {
+                    variants.push_back(model_variant(arch, "Gemma2"));
+                    model_variant & cur = variants.back();
+
+                    n_layer = 2; // minimum for the swa pattern
+                    n_embd = 16;
+                    const uint32_t n_head        = 4;
+                    const uint32_t n_head_kv     = n_head / 2;
+                    const uint32_t n_embd_head_k = n_embd / n_head;
+                    const uint32_t n_embd_k_gqa  = n_embd_head_k * n_head_kv;
+                    const uint32_t n_embd_v_gqa  = n_embd_k_gqa;
+
+                    cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                    cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                    cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                    cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+                    cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_kv);
+                    cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                    cur.add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, 50.0f);
+                    cur.add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, 30.0f);
+                    cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+
+                    add_tokenizer(cur, n_vocab);
+
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i),   {n_embd, n_embd_head_k * n_head});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i),   {n_embd, n_embd_k_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i),   {n_embd, n_embd_v_gqa});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i),   {n_embd, n_ff});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                    }
+                } break;
             case LLM_ARCH_GEMMA3:
             case LLM_ARCH_STARCODER2:
                 break;
@@ -679,7 +838,7 @@ struct reference_logits {
             // fprintf(stderr, "Potential error in seq_id %i starting from pos %i\n", seq_id, first_pos_error);
         }
 
-        const float denom = std::sqrt(sumr2 * sumo2);
+        const float denom = std::sqrt(sumr2) * std::sqrt(sumo2);
 
         return sumerr2 / (denom > 0.0f ? denom : 1.0f);
     }
@@ -767,8 +926,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(42);
 
     // TODO: multiple sequences per token
-    const int32_t n_batch   = 2048;
-    const int32_t n_seq_len = 1024;
+    const int32_t n_batch   = 3 * 512;
+    const int32_t n_seq_len = 643; // prime number
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
@@ -798,7 +957,7 @@ int main(int argc, char ** argv) {
         // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
        // const auto n_embd  = llama_model_n_embd(model);
 
-        for (int32_t n_seq_max : { 1, 2, 13 } ) {
+        for (int32_t n_seq_max : { 1, 2, 5 } ) {
 
             // TODO(later): context shift testing
             for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
@@ -811,8 +970,6 @@ int main(int argc, char ** argv) {
                     ref_params.n_ubatch  = 1;
                     ref_params.n_ctx     = n_seq_len;
                     ref_params.n_seq_max = 1;
-                    ref_params.n_threads = 1;
-                    ref_params.n_threads_batch = 1;
 
                    llama_context * ref_ctx = llama_init_from_model(model, ref_params);
 
@@ -828,6 +985,13 @@ int main(int argc, char ** argv) {
 
                 for (bool shuffle : { false, true }) {
 
+                    // skip shuffling the batch for non-recurrent models
+                    // (simple splits don't handle shuffled batches correctly)
+                    // FIXME: remove this
+                    if (shuffle && !llama_model_is_recurrent(model)) {
+                        continue;
+                    }
+
                     for (int32_t n_ubatch : { 1, 2, 512 } ) {
 
                         std::vector<bool> valid(n_seq_max, true);
@@ -881,8 +1045,6 @@ int main(int argc, char ** argv) {
 
                             GGML_ASSERT(n_seq_max <= n_batch); // not handling splitting this across batches here
 
-                            // TODO: use seq_rm, seq_cp, etc. to test if they work properly
-
                             // cont batching
                             for (llama_seq_id s : seq_ids_in_batch) {
                                 llama_pos & pos = seq_id_n_past[s];
@@ -909,6 +1071,10 @@ int main(int argc, char ** argv) {
                             exit(1);
                         }
 
+                        // TODO: use seq_rm, seq_cp, etc. to test if they work properly
+
+                        // TODO: test pooled embeddings
+
                         llama_free(ctx);
                     }
                 }
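For the Llama2 variant added above, the grouped-query attention shapes work
out to small concrete numbers (worked here for clarity, not in the patch):
n_embd_head_k = n_embd / n_head = 16 / 4 = 4, and
n_embd_k_gqa = n_embd_head_k * n_head_kv = 4 * 2 = 8, so attn_q is {16, 16}
while attn_k and attn_v are {16, 8} — half the KV heads of the query heads.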
From 352703b08ba9e9d3a0b843ff6e1914c7286f9bca Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Mon, 16 Jun 2025 21:30:21 -0400
Subject: [PATCH 05/13] test-model-random : better default tensor
 initialization distribution
---
 tests/test-model-random.cpp | 38 +++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index a03ef1a4c4dab..a8e6847000dd2 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -65,6 +65,7 @@ struct random_tensor {
         for (int64_t d : shape) {
             prod *= d;
         }
+        GGML_ASSERT(prod != 0);
         return ggml_row_size(type, prod);
     }
 
@@ -266,8 +267,20 @@ struct model_variant {
         tensors(other.tensors),
         metadata(other.metadata) {}
 
+    void add_tensor(const std::string & name, const std::vector<int64_t> & shape, float gain = 1.0f) {
+        // ref: https://github.com/pytorch/pytorch/blob/134179474539648ba7dee1317959529fbd0e7f89/torch/nn/init.py#L515-L516
+        const auto init_kaiming_uniform = [gain](uint32_t fan_in) {
+            const float std   = gain * std::sqrt(1.0f / fan_in);
+            const float bound = std::sqrt(3.0f) * std;
+
+            return std::uniform_real_distribution<float>(-bound, bound);
+        };
+
+        tensors.push_back(random_tensor(name, shape, init_kaiming_uniform(shape[0])));
+    }
+
     void add_tensor(const std::string & name, const std::vector<int64_t> & shape,
-                    const std::function<float(std::mt19937 &)> & distribution = std::normal_distribution<float>()) {
+                    const std::function<float(std::mt19937 &)> & distribution) {
         tensors.push_back(random_tensor(name, shape, distribution));
     }
 
@@ -299,7 +312,7 @@ struct model_variant {
 
         size_t total_size = 0;
         for (const auto & t : tensors) {
-            total_size += t.n_bytes() + ggml_tensor_overhead();
+            total_size += GGML_PAD(t.n_bytes() + ggml_tensor_overhead(), GGML_MEM_ALIGN);
         }
 
         ggml_init_params init_params = {
@@ -356,6 +369,11 @@ struct model_variant {
             m.add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, vocab_types);
         };
 
+        // don't actually use bias
+        const auto init_bias = []() {
+            return 0.0f;
+        };
+
         // TODO: fill the variants
         // TODO: how to make the variants more modular?
         switch (arch) {
@@ -591,12 +609,12 @@ struct model_variant {
                         cur.add_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, 2 * d_inner });
 
                         cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { d_conv, d_inner });
-                        cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), { d_inner });
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), { d_inner }, init_bias);
 
                         cur.add_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), { d_inner, dt_rank + 2 * d_state });
 
                         cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), { dt_rank, d_inner });
-                        cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { d_inner });
+                        cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { d_inner }, init_bias);
 
                         // no "weight" suffix for these
                         cur.add_tensor(tn(LLM_TENSOR_SSM_A, i), { d_state, d_inner }, init_A_S4D);
                         cur.add_tensor(tn(LLM_TENSOR_SSM_D, i), { d_inner }, []() { return 1.0f; });
@@ -674,19 +692,19 @@ struct model_variant {
 
                     // Block 0, LN0
                     cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, init_bias);
 
                     // output
                     cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, init_bias);
                     cur.add_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 
                     for (uint32_t i = 0; i < n_layer; ++i) {
                         cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, init_bias);
 
                         cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
-                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, init_bias);
 
                         cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd});
                         cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay});
@@ -721,7 +739,7 @@ struct model_variant {
                         cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
 
                         cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
-                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
+                        cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, init_bias);
                         cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
 
                         cur.add_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
@@ -1036,7 +1054,7 @@ int main(int argc, char ** argv) {
                             for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
                                 float err = ref_outputs[seq_id].validate_batch(ctx, batch, seq_id);
                                 if (!isfinite(err) || err > 1.0f / 1024.0f) {
-                                    fprintf(stderr, "Error for seq_id %i is %f\n", seq_id, err);
+                                    fprintf(stderr, "Error for seq_id %i is %f at n_past=%i\n", seq_id, err, seq_id_n_past[seq_id]);
                                     valid[seq_id] = false;
                                 }
                             }
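With the defaults in the patch above (gain = 1, fan_in = n_embd = 32), the
Kaiming-uniform init works out to std = 1/sqrt(32) ≈ 0.177 and
bound = sqrt(3) * 0.177 ≈ 0.306, i.e. weights drawn uniformly from roughly
[-0.31, 0.31] (worked here for clarity, not part of the patch).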
From 9d873d75431118b72af79dc28cea1383ef190403 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 18 Jun 2025 15:07:24 -0400
Subject: [PATCH 06/13] test-model-random : shuffle across sequences but not
 within

There isn't really a use-case for fully-shuffled batches

* test-model-random : use F32 as the KV cache type

Temporary until F16 is fixed on ARM when using FP16_VECTOR_ARITHMETIC
---
 tests/test-model-random.cpp | 73 +++++++++++++++++++++++++++++++++----
 1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index 218cfcb82bc8a..3b636039b4796 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -7,6 +7,7 @@
 #include <cmath>
 #include <functional>
 #include <random>
+#include <queue>
 #include <set>
 #include <vector>
 // NOTE: the llm_arch enum is in the private API
@@ -895,13 +896,68 @@ static void permute_from_ids(uint8_t * array, size_t elem_size, const std::vecto
     memcpy(array, tmp.data(), ids.size() * elem_size);
 }
 
+static std::vector<int32_t> random_merge_ids(std::vector<std::queue<int32_t>> & ids_per_seq, std::mt19937 & rng) {
+    size_t total_size = 0;
+    for (const auto & v : ids_per_seq) {
+        total_size += v.size();
+    }
+
+    std::vector<int32_t> ids;
+    ids.reserve(total_size);
+
+    for (size_t i = 1; i <= total_size; ++i) {
+        // need weighted random selection
+        std::uniform_int_distribution<int32_t> rand(0, total_size - i);
+        int32_t rand_id = rand(rng);
+
+        // find out in which seq set this would belong
+        for (size_t j = 0; j < ids_per_seq.size(); ++j) {
+            if (rand_id < (int32_t) ids_per_seq[j].size()) {
+                ids.push_back(ids_per_seq[j].front());
+                ids_per_seq[j].pop();
+                break;
+            }
+            rand_id -= ids_per_seq[j].size();
+        }
+    }
+
+    return ids;
+}
+
+// shuffle across sequences but not within sequences
 static void shuffle_batch(struct llama_batch & batch, std::mt19937 & rng) {
-    std::vector<int32_t> ids(batch.n_tokens);
+    std::vector<std::set<llama_seq_id>> seq_sets;
+    std::vector<std::queue<int32_t>> ids_per_seq;
+
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
-        ids[i] = i;
+        int32_t seq_set_id = -1;
+        for (size_t s = 0; s < seq_sets.size(); ++s) {
+            for (int j = 0; j < batch.n_seq_id[i]; ++j) {
+                if (seq_sets[s].find(batch.seq_id[i][j]) != seq_sets[s].end()) {
+                    // any match, to avoid shuffling between dependent sets
+                    seq_set_id = s;
+                    break;
+                }
+            }
+        }
+
+        if (seq_set_id < 0) {
+            seq_sets.push_back({});
+            ids_per_seq.push_back({});
+            seq_set_id = seq_sets.size() - 1;
+        }
+        std::set<llama_seq_id> & seq_set = seq_sets[seq_set_id];
+        for (int j = 0; j < batch.n_seq_id[i]; ++j) {
+            // make sure the set contains all relevant seq_ids
+            seq_set.insert(batch.seq_id[i][j]);
+        }
+
+        ids_per_seq[seq_set_id].push(i);
     }
 
-    std::shuffle(ids.begin(), ids.end(), rng);
+    std::vector<int32_t> ids = random_merge_ids(ids_per_seq, rng);
+
+    GGML_ASSERT(ids.size() == (size_t) batch.n_tokens);
 
     if (batch.token) {
         permute_from_ids((uint8_t *) batch.token, sizeof(*batch.token), ids);
@@ -991,6 +1047,8 @@ int main(int argc, char ** argv) {
                     ref_params.n_ubatch  = 1;
                     ref_params.n_ctx     = n_seq_len;
                     ref_params.n_seq_max = 1;
+                    ref_params.type_k    = GGML_TYPE_F32;
+                    ref_params.type_v    = GGML_TYPE_F32;
 
                     llama_context * ref_ctx = llama_init_from_model(model, ref_params);
 
@@ -1006,10 +1064,8 @@ int main(int argc, char ** argv) {
 
                 for (bool shuffle : { false, true }) {
 
-                    // skip shuffling the batch for non-recurrent models
-                    // (simple splits don't handle shuffled batches correctly)
-                    // FIXME: remove this
-                    if (shuffle && !llama_model_is_recurrent(model)) {
+                    // can't really shuffle a single sequence with itself
+                    if (shuffle && n_seq_max == 1) {
                         continue;
                     }
 
@@ -1022,6 +1078,9 @@ int main(int argc, char ** argv) {
                         ctx_params.n_seq_max = n_seq_max;
                         ctx_params.n_ubatch  = n_ubatch;
                         ctx_params.n_batch   = n_batch;
+                        // TODO: remove once F16 is fixed on ARM
+                        ctx_params.type_k = GGML_TYPE_F32;
+                        ctx_params.type_v = GGML_TYPE_F32;
 
                         llama_context * ctx = llama_init_from_model(model, ctx_params);
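The merge in random_merge_ids above is a weighted interleave: each draw picks
a source queue with probability proportional to its remaining length, so the
relative order within a queue is always preserved. A standalone sketch of the
behavior (not from the patch):

    std::vector<std::queue<int32_t>> qs(2);
    qs[0].push(0); qs[0].push(1); // tokens of one sequence set
    qs[1].push(2); qs[1].push(3); // tokens of another
    std::mt19937 rng(0);
    // possible results: {0,1,2,3}, {0,2,1,3}, {2,3,0,1}, ...
    // never {1,0,...} or {...,3,2}: within-queue order is kept
    std::vector<int32_t> merged = random_merge_ids(qs, rng);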
From ccb2bb9988a419fe8429021d17941a5199abee06 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 18 Jun 2025 15:11:23 -0400
Subject: [PATCH 07/13] test-model-random : show max error
---
 tests/test-model-random.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index 3b636039b4796..d3739f1a8f32a 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -1089,6 +1089,8 @@ int main(int argc, char ** argv) {
                         std::set<llama_seq_id> seq_ids_in_batch;
                         std::vector<llama_pos> seq_id_n_past(n_seq_max, 0);
 
+                        float max_err = 0.0f;
+
                         // start filling the batch with prompts
                         while (std::any_of(seq_id_n_past.begin(), seq_id_n_past.end(),
                                            [](llama_pos p) { return p < n_seq_len; })) {
@@ -1119,6 +1121,7 @@ int main(int argc, char ** argv) {
                                     fprintf(stderr, "Error for seq_id %i is %f at n_past=%i\n", seq_id, err, seq_id_n_past[seq_id]);
                                     valid[seq_id] = false;
                                 }
+                                max_err = std::max(err, max_err);
                             }
 
                             common_batch_clear(batch);
@@ -1140,10 +1143,11 @@ int main(int argc, char ** argv) {
                                 "Comparing output for '%s', with shuffle=%i, n_seq_max=%i, n_ctx=%i, n_ubatch=%i: ",
                                 variant.name.c_str(), shuffle, n_seq_max, n_ctx, n_ubatch);
                         if (std::all_of(valid.begin(), valid.end(), [](bool v) { return v; })) {
-                            fprintf(stdout, "\033[1;32mOK\033[0m\n");
+                            fprintf(stdout, "\033[1;32mOK\033[0m (max err: %.2g)\n", max_err);
                         } else {
-                            fprintf(stdout, "(%zu%%) \033[1;31mFAILED\033[0m\n",
-                                    std::count_if(valid.begin(), valid.end(), [](bool v) { return v == false; }) * 100 / valid.size());
+                            fprintf(stdout, "(%zu%%) \033[1;31mFAILED\033[0m (max err: %.4g)\n",
+                                    std::count_if(valid.begin(), valid.end(), [](bool v) { return v == false; }) * 100 / valid.size(),
+                                    max_err);
                             // cleanup and exit on first failure
                             llama_free(ctx);
                             llama_model_free(model);
From 985cda6c7bdbf3c6e2fa6cfdb338214d3283878d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Mon, 7 Jul 2025 21:07:46 -0400
Subject: [PATCH 08/13] test-model-random : add Mamba2

---
 tests/test-model-random.cpp | 68 +++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index 796ba7292d221..ade11cfa8c854 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -628,6 +628,64 @@ struct model_variant {
                 }
             }
             break;
+        case LLM_ARCH_MAMBA2:
+            {
+                variants.push_back(model_variant(arch, "Mamba2"));
+                model_variant & cur = variants.back();
+
+                n_embd = 64;
+
+                const uint32_t d_inner = 2 * n_embd;
+                const uint32_t d_conv = 4;
+                const uint32_t d_state = 128;
+                const uint32_t n_group = 2;
+                const uint32_t head_dim = 64;
+                const uint32_t n_head = d_inner / head_dim;
+                const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;
+
+                const auto init_A = [](std::mt19937 & rng) {
+                    return -std::uniform_real_distribution<float>(1, 16)(rng);
+                };
+
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 1024 * 1024);
+                cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+                cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, (uint32_t) 0);
+                cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, (uint32_t) 0);
+                cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+                cur.add_kv(LLM_KV_SSM_CONV_KERNEL, d_conv);
+                cur.add_kv(LLM_KV_SSM_INNER_SIZE, d_inner);
+                cur.add_kv(LLM_KV_SSM_STATE_SIZE, d_state);
+                cur.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
+                cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+                cur.add_kv(LLM_KV_SSM_GROUP_COUNT, n_group);
+
+                add_tokenizer(cur, n_vocab);
+
+                cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
+                cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
+                cur.add_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj});
+
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state});
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, init_bias);
+
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, init_bias);
+
+                    // no "weight" suffix for these
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, init_A);
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, []() { return 1.0f; });
+
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group});
+
+                    // out_proj
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
+                }
+            }
+            break;
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_COHERE2:
@@ -760,6 +818,7 @@ struct model_variant {
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_DOTS1:
        case LLM_ARCH_ARCEE:
+        case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_UNKNOWN:
             break;
     }
@@ -1042,6 +1101,8 @@ int main(int argc, char ** argv) {
 
             std::vector<reference_logits> ref_outputs;
 
+            fprintf(stdout, "Generating reference outputs for '%s', n_seq_max=%i...\n", variant.name.c_str(), n_seq_max);
+
             {
                 llama_context_params ref_params = llama_context_default_params();
                 ref_params.n_batch = n_seq_len;
@@ -1092,6 +1153,10 @@ int main(int argc, char ** argv) {
 
                         float max_err = 0.0f;
 
+                        fprintf(stdout,
+                                "Comparing output for '%s', with shuffle=%i, n_seq_max=%i, n_ctx=%i, n_ubatch=%i: ",
+                                variant.name.c_str(), shuffle, n_seq_max, n_ctx, n_ubatch);
+
                         // start filling the batch with prompts
                         while (std::any_of(seq_id_n_past.begin(), seq_id_n_past.end(),
                                            [](llama_pos p) { return p < n_seq_len; })) {
@@ -1140,9 +1205,6 @@ int main(int argc, char ** argv) {
                             }
                         }
 
-                        fprintf(stdout,
-                                "Comparing output for '%s', with shuffle=%i, n_seq_max=%i, n_ctx=%i, n_ubatch=%i: ",
-                                variant.name.c_str(), shuffle, n_seq_max, n_ctx, n_ubatch);
                         if (std::all_of(valid.begin(), valid.end(), [](bool v) { return v; })) {
                             fprintf(stdout, "\033[1;32mOK\033[0m (max err: %.2g)\n", max_err);
                         } else {
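To make the Mamba2 shapes above concrete: with these constants, d_inner = 2*64 = 128, n_head = 128/64 = 2, and the input projection width comes out to d_in_proj = 2*128 + 2*2*128 + 2 = 770. A compile-time restatement of that arithmetic (the z/x/B/C/dt naming in the comments follows the usual Mamba2 convention and is an assumption, since the projection split itself is not shown in this patch):

```cpp
#include <cstdint>

// Worked example of the Mamba2 projection width used by the test variant.
constexpr uint32_t n_embd    = 64;
constexpr uint32_t d_inner   = 2 * n_embd;         // 128
constexpr uint32_t d_state   = 128;
constexpr uint32_t n_group   = 2;
constexpr uint32_t head_dim  = 64;
constexpr uint32_t n_head    = d_inner / head_dim; // 128 / 64 = 2
constexpr int64_t  d_in_proj = 2*d_inner + 2*n_group*d_state + n_head;

static_assert(n_head == 2, "two SSM heads");
static_assert(d_in_proj == 256 + 512 + 2, "z,x (2*128) + B,C (2*2*128) + dt (2)");
static_assert(d_in_proj == 770, "total projection width");

int main() { return 0; }
```
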
From 6b38c7a04cf9382232f7d214007e1ac0dd51b528 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Mon, 7 Jul 2025 21:39:54 -0400
Subject: [PATCH 09/13] memory : fix broken batch splits for recurrent cache

---
 src/llama-memory-recurrent.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index 4b90dac7a327c..a1b5b1a272cc0 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -377,14 +377,18 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
                 ubatch = balloc.split_equal(n_ubatch, false);
             }
 
-            if (balloc.get_n_used() < balloc.get_n_tokens()) {
-                // failed to find a suitable split
+            if (ubatch.n_tokens == 0) {
                 break;
             }
 
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
         if (!prepare(ubatches)) {
             break;
         }
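The fix above changes when the "all tokens used" check runs: splitting now stops on an empty ubatch, and only after the split loop finishes is the batch rejected if tokens were left over, instead of aborting mid-loop. A minimal sketch of that control-flow pattern with a hypothetical splitter type (not the llama.cpp API):

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the recurrent cache's batch splitter.
struct ubatch { int n_tokens; };

struct splitter {
    std::vector<int> sizes;      // ubatch sizes the splitter can produce
    size_t pos = 0, used = 0, total = 0;

    ubatch next() {
        if (pos == sizes.size()) { return {0}; } // no suitable split left
        int n = sizes[pos++];
        used += n;
        return {n};
    }
};

int main() {
    splitter s{{3, 2}, 0, 0, 7}; // 7 tokens, but only 5 are splittable
    std::vector<ubatch> ubatches;

    while (true) {
        ubatch u = s.next();
        if (u.n_tokens == 0) {
            break;               // stop on an empty ubatch first...
        }
        ubatches.push_back(u);
    }

    if (s.used < s.total) {      // ...then check once for leftover tokens
        fprintf(stderr, "failed to find a suitable split\n");
        return 1;
    }
    return 0;
}
```
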
From 362cf5429cf066eca5f682cba47eded39a8d7ef6 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 8 Jul 2025 16:34:44 -0400
Subject: [PATCH 10/13] test-model-random : configurable model n_ctx, and
 smaller seq lengths

---
 src/llama-model.cpp         |  4 +++-
 tests/test-model-random.cpp | 21 ++++++++++-----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 29ded0aff96a1..fc478887004f1 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -598,9 +598,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
 
                 hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
+                hparams.n_swa = 8192; // currently it's the same for Scout and Maverick
                 hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
 
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); // for overrides in tests
+
                 switch (hparams.n_expert) {
                     case 16:  type = LLM_TYPE_17B_16E; break;
                     case 128: type = LLM_TYPE_17B_128E; break;
diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index ade11cfa8c854..8c8f83fe58ba9 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -340,7 +340,7 @@ struct model_variant {
         return status;
     }
 
-    static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch) {
+    static void insert_from_arch(std::vector<model_variant> & variants, llm_arch arch, uint32_t n_ctx) {
         uint32_t n_vocab = 256;
         uint32_t n_embd = 32;
         uint32_t n_ff = 3 * n_embd;
@@ -391,7 +391,7 @@ struct model_variant {
                 const uint32_t n_embd_v_gqa = n_embd_k_gqa;
 
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
                 cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
@@ -439,7 +439,7 @@ struct model_variant {
                 const uint32_t n_expert = 4;
 
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
                 cur.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
@@ -450,8 +450,7 @@ struct model_variant {
                 cur.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, n_embd / n_head);
                 cur.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, n_moe_layer_step);
                 cur.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff_exp);
-                // FIXME: this isn't used because the default is 8192
-                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx / 2); // TODO: use a prime number
 
                 add_tokenizer(cur, n_vocab);
 
@@ -538,7 +537,7 @@ struct model_variant {
                 const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
                 const uint32_t n_embd_v_gqa = n_embd_k_gqa;
 
-                cur.add_kv(LLM_KV_CONTEXT_LENGTH, (uint32_t) 4096);
+                cur.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
                 cur.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
                 cur.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
                 cur.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
@@ -547,7 +546,7 @@ struct model_variant {
                 cur.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
                 cur.add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, 50.0f);
                 cur.add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, 30.0f);
-                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, (uint32_t) 389); // prime number
+                cur.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx / 2); // TODO: use a prime number
 
                 add_tokenizer(cur, n_vocab);
 
@@ -1063,8 +1062,8 @@ int main(int argc, char ** argv) {
     std::mt19937 rng(42);
 
     // TODO: multiple sequences per token
-    const int32_t n_batch = 3 * 512;
-    const int32_t n_seq_len = 643; // prime number
+    const int32_t n_batch = 509; // prime number
+    const int32_t n_seq_len = 127; // prime number
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
@@ -1073,7 +1072,7 @@ int main(int argc, char ** argv) {
 
     for (int i = 0; i < LLM_ARCH_UNKNOWN; ++i) {
         llm_arch arch = (llm_arch) i;
-        model_variant::insert_from_arch(model_variants, arch);
+        model_variant::insert_from_arch(model_variants, arch, n_seq_len);
     }
 
     // TODO: concurrent tests?
@@ -1094,7 +1093,7 @@ int main(int argc, char ** argv) {
             // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
             // const auto n_embd = llama_model_n_embd(model);
 
-            for (int32_t n_seq_max : { 1, 2, 5 } ) {
+            for (int32_t n_seq_max : { 1, 2, 5, 13 } ) {
 
                 // TODO(later): context shift testing
                 for (int32_t n_ctx : { n_seq_len * n_seq_max }) {

From 4e58ca46df44ee5493ed625bb3553fa480f46a13 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 8 Jul 2025 16:47:18 -0400
Subject: [PATCH 11/13] test-model-random : avoid testing too many sequences
 for now

It makes the test longer for little benefit.
---
 tests/test-model-random.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index 8c8f83fe58ba9..f65444c5525cb 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -818,6 +818,8 @@ struct model_variant {
         case LLM_ARCH_DOTS1:
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
+        case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_UNKNOWN:
             break;
     }
@@ -1093,7 +1095,7 @@ int main(int argc, char ** argv) {
             // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
             // const auto n_embd = llama_model_n_embd(model);
 
-            for (int32_t n_seq_max : { 1, 2, 5, 13 } ) {
+            for (int32_t n_seq_max : { 1, 2, 5 }) {
 
                 // TODO(later): context shift testing
                 for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
From a17c4f7d75f3dfdcf2de04ef5e85c8764313d3d4 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 8 Jul 2025 17:48:04 -0400
Subject: [PATCH 12/13] test-model-random : add shared prompt test variant

---
 tests/test-model-random.cpp | 201 ++++++++++++++++++++++--------------
 1 file changed, 123 insertions(+), 78 deletions(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index f65444c5525cb..51d3bfdf29047 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -832,9 +832,11 @@ struct reference_logits {
     std::vector<llama_token> inputs;
     std::vector<float> outputs;
 
-    reference_logits(llama_context * ctx, int32_t seq_len, std::mt19937 & rng) {
+    reference_logits(llama_context * ctx, int32_t seq_len, std::mt19937 & rng,
+                     const std::vector<llama_token> & shared_prompt) {
         n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(llama_get_model(ctx)));
         std::uniform_int_distribution<llama_token> rand_token(0, n_vocab - 1);
+        GGML_ASSERT(shared_prompt.size() < (size_t) (seq_len / 4));
         std::uniform_int_distribution<int32_t> rand_prompt_len(seq_len / 4, 3 * seq_len / 4);
 
         llama_batch batch = llama_batch_init(seq_len, 0, 1);
@@ -843,7 +845,14 @@ struct reference_logits {
 
         prompt_len = rand_prompt_len(rng);
 
-        for (int32_t i = 0; i < prompt_len; ++i) {
+        for (int32_t i = 0; i < (int32_t) shared_prompt.size(); ++i) {
+            const llama_token token = shared_prompt[i];
+            inputs.push_back(token);
+
+            common_batch_add(batch, token, i, { 0 }, true);
+        }
+
+        for (int32_t i = shared_prompt.size(); i < prompt_len; ++i) {
             const llama_token token = rand_token(rng);
             inputs.push_back(token);
 
@@ -1065,6 +1074,7 @@ int main(int argc, char ** argv) {
 
     // TODO: multiple sequences per token
     const int32_t n_batch = 509; // prime number
+    const int32_t n_shared_len = 13; // prime number, shared prompt length
     const int32_t n_seq_len = 127; // prime number
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
@@ -1092,9 +1102,21 @@ int main(int argc, char ** argv) {
 
             GGML_ASSERT(model);
 
-            // const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+            const auto n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
             // const auto n_embd = llama_model_n_embd(model);
 
+            std::vector<llama_token> shared_prompt;
+            // populate shared prompt
+            {
+                std::uniform_int_distribution<llama_token> rand_token(0, n_vocab - 1);
+                shared_prompt.reserve(n_shared_len);
+
+                for (int32_t i = 0; i < n_shared_len; ++i) {
+                    shared_prompt.push_back(rand_token(rng));
+                }
+            }
+
+            // TODO: avoid re-creating reference outputs
             for (int32_t n_seq_max : { 1, 2, 5 }) {
 
                 // TODO(later): context shift testing
                 for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
 
                     for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
                         llama_memory_clear(mem, true);
-                        ref_outputs.push_back(reference_logits(ref_ctx, n_seq_len, rng));
+                        ref_outputs.push_back(reference_logits(ref_ctx, n_seq_len, rng, shared_prompt));
                     }
 
                     llama_free(ref_ctx);
                 }
 
-                for (bool shuffle : { false, true }) {
+                for (bool use_shared_prompt : { false, true }) {
+                    for (bool shuffle : { false, true }) {
 
-                    // can't really shuffle a single sequence with itself
-                    if (shuffle && n_seq_max == 1) {
-                        continue;
-                    }
+                        // can't really shuffle a single sequence with itself
+                        if (shuffle && n_seq_max == 1) {
+                            continue;
+                        }
+                        // can't really share a prompt with only one sequence
+                        if (use_shared_prompt && n_seq_max == 1) {
+                            continue;
+                        }
 
-                    for (int32_t n_ubatch : { 1, 2, 512 } ) {
+                        for (int32_t n_ubatch : { 1, 2, 512 } ) {
 
-                        std::vector<bool> valid(n_seq_max, true);
+                            std::vector<bool> valid(n_seq_max, true);
 
-                        llama_context_params ctx_params = llama_context_default_params();
-                        ctx_params.n_ctx = n_ctx;
-                        ctx_params.n_seq_max = n_seq_max;
-                        ctx_params.n_ubatch = n_ubatch;
-                        ctx_params.n_batch = n_batch;
-                        // TODO: remove once F16 is fixed on ARM
-                        ctx_params.type_k = GGML_TYPE_F32;
-                        ctx_params.type_v = GGML_TYPE_F32;
+                            llama_context_params ctx_params = llama_context_default_params();
+                            ctx_params.n_ctx = n_ctx;
+                            ctx_params.n_seq_max = n_seq_max;
+                            ctx_params.n_ubatch = n_ubatch;
+                            ctx_params.n_batch = n_batch;
+                            // TODO: remove once F16 is fixed on ARM
+                            ctx_params.type_k = GGML_TYPE_F32;
+                            ctx_params.type_v = GGML_TYPE_F32;
 
-                        llama_context * ctx = llama_init_from_model(model, ctx_params);
+                            llama_context * ctx = llama_init_from_model(model, ctx_params);
 
-                        common_batch_clear(batch);
+                            common_batch_clear(batch);
 
-                        std::set<llama_seq_id> seq_ids_in_batch;
-                        std::vector<llama_pos> seq_id_n_past(n_seq_max, 0);
+                            std::set<llama_seq_id> seq_ids_in_batch;
+                            std::vector<llama_pos> seq_id_n_past(n_seq_max, 0);
 
-                        float max_err = 0.0f;
+                            float max_err = 0.0f;
 
-                        fprintf(stdout,
-                                "Comparing output for '%s', with shuffle=%i, n_seq_max=%i, n_ctx=%i, n_ubatch=%i: ",
-                                variant.name.c_str(), shuffle, n_seq_max, n_ctx, n_ubatch);
+                            fprintf(stdout,
+                                    "Comparing output for '%s', with shared=%i, shuffle=%i, n_seq_max=%i, n_ctx=%i, n_ubatch=%i: ",
+                                    variant.name.c_str(), use_shared_prompt, shuffle, n_seq_max, n_ctx, n_ubatch);
+
+                            if (use_shared_prompt) {
+                                // TODO: also test multiple distinct shared prompts in the same batch
+                                std::vector<llama_seq_id> seq_id_group;
+                                seq_id_group.reserve(n_seq_max);
+
+                                GGML_ASSERT(shared_prompt.size() < n_batch);
+
+                                for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
+                                    seq_id_group.push_back(seq_id);
+                                    seq_id_n_past[seq_id] += shared_prompt.size();
+                                };
+
+                                for (size_t i = 0; i < shared_prompt.size(); ++i) {
+                                    common_batch_add(batch, shared_prompt[i], i, seq_id_group, true);
+                                };
+                            }
 
-                        // start filling the batch with prompts
-                        while (std::any_of(seq_id_n_past.begin(), seq_id_n_past.end(),
-                                           [](llama_pos p) { return p < n_seq_len; })) {
-                            for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
-                                if (seq_id_n_past[seq_id] >= ref_outputs[seq_id].prompt_len) {
-                                    continue;
-                                }
-
-                                if (batch.n_tokens < n_batch) {
-                                    const int64_t seq_len =
-                                        std::min(n_batch - batch.n_tokens,
-                                                 ref_outputs[seq_id].prompt_len - seq_id_n_past[seq_id]);
-
-                                    ref_outputs[seq_id].add_to_batch(batch, seq_id_n_past[seq_id], seq_len, seq_id);
-                                    seq_ids_in_batch.insert(seq_id);
-                                    seq_id_n_past[seq_id] += seq_len;
-                                }
-                            }
-                            if (shuffle) {
-                                shuffle_batch(batch, rng);
-                            }
+                            // start filling the batch with prompts
+                            while (std::any_of(seq_id_n_past.begin(), seq_id_n_past.end(),
+                                               [](llama_pos p) { return p < n_seq_len; })) {
+                                for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
+                                    if (seq_id_n_past[seq_id] >= ref_outputs[seq_id].prompt_len) {
+                                        continue;
+                                    }
+
+                                    if (batch.n_tokens < n_batch) {
+                                        const int64_t seq_len =
+                                            std::min(n_batch - batch.n_tokens,
+                                                     ref_outputs[seq_id].prompt_len - seq_id_n_past[seq_id]);
+
+                                        ref_outputs[seq_id].add_to_batch(batch, seq_id_n_past[seq_id], seq_len, seq_id);
+                                        seq_ids_in_batch.insert(seq_id);
+                                        seq_id_n_past[seq_id] += seq_len;
+                                    }
+                                }
+                                if (shuffle) {
+                                    shuffle_batch(batch, rng);
+                                }
 
-                            llama_decode(ctx, batch);
+                                llama_decode(ctx, batch);
 
-                            for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
-                                float err = ref_outputs[seq_id].validate_batch(ctx, batch, seq_id);
-                                if (!isfinite(err) || err > 1.0f / 1024.0f) {
-                                    fprintf(stderr, "Error for seq_id %i is %f at n_past=%i\n", seq_id, err, seq_id_n_past[seq_id]);
-                                    valid[seq_id] = false;
-                                }
-                                max_err = std::max(err, max_err);
-                            }
+                                for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
+                                    float err = ref_outputs[seq_id].validate_batch(ctx, batch, seq_id);
+                                    if (!isfinite(err) || err > 1.0f / 1024.0f) {
+                                        fprintf(stderr, "Error for seq_id %i is %f at n_past=%i\n", seq_id, err, seq_id_n_past[seq_id]);
+                                        valid[seq_id] = false;
+                                    }
+                                    max_err = std::max(err, max_err);
+                                }
 
-                            common_batch_clear(batch);
+                                common_batch_clear(batch);
 
-                            GGML_ASSERT(n_seq_max <= n_batch); // not handling splitting this across batches here
+                                GGML_ASSERT(n_seq_max <= n_batch); // not handling splitting this across batches here
 
-                            // cont batching
-                            for (llama_seq_id s : seq_ids_in_batch) {
-                                llama_pos & pos = seq_id_n_past[s];
-                                if (pos >= n_seq_len) {
-                                    continue;
-                                }
-                                ref_outputs[s].add_to_batch(batch, pos, 1, s);
-                                pos += 1;
+                                // cont batching
+                                for (llama_seq_id s : seq_ids_in_batch) {
+                                    llama_pos & pos = seq_id_n_past[s];
+                                    if (pos >= n_seq_len) {
+                                        continue;
+                                    }
+                                    ref_outputs[s].add_to_batch(batch, pos, 1, s);
+                                    pos += 1;
+                                }
                             }
-                        }
 
-                        if (std::all_of(valid.begin(), valid.end(), [](bool v) { return v; })) {
-                            fprintf(stdout, "\033[1;32mOK\033[0m (max err: %.2g)\n", max_err);
-                        } else {
-                            fprintf(stdout, "(%zu%%) \033[1;31mFAILED\033[0m (max err: %.4g)\n",
-                                    std::count_if(valid.begin(), valid.end(), [](bool v) { return v == false; }) * 100 / valid.size(),
-                                    max_err);
-                            // cleanup and exit on first failure
-                            llama_free(ctx);
-                            llama_model_free(model);
-                            llama_batch_free(batch);
-                            exit(1);
-                        }
+                            if (std::all_of(valid.begin(), valid.end(), [](bool v) { return v; })) {
+                                fprintf(stdout, "\033[1;32mOK\033[0m (max err: %.2g)\n", max_err);
+                            } else {
+                                fprintf(stdout, "(%zu%%) \033[1;31mFAILED\033[0m (max err: %.4g)\n",
+                                        std::count_if(valid.begin(), valid.end(), [](bool v) { return v == false; }) * 100 / valid.size(),
+                                        max_err);
+                                // cleanup and exit on first failure
+                                llama_free(ctx);
+                                llama_model_free(model);
+                                llama_batch_free(batch);
+                                exit(1);
+                            }
 
-                        // TODO: use seq_rm, seq_cp, etc. to test if they work properly
+                            // TODO: use seq_rm, seq_cp, etc. to test if they work properly
 
-                        // TODO: test pooled embeddings
+                            // TODO: test pooled embeddings
 
-                        llama_free(ctx);
+                            llama_free(ctx);
+                        }
                     }
                 }
             }
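The shared-prompt variant leans on the fact that common_batch_add takes a vector of seq_ids, so a single token position can be charged to every sequence at once; each sequence then continues from position shared_prompt.size() with its own random suffix. A trimmed sketch of just that batch-construction step, assuming llama.h and the common.h batch helpers (the helper function and its arguments are illustrative, not part of the test):

```cpp
#include <vector>

#include "llama.h"
#include "common.h"

// Decode one shared prefix into all sequences at once, so every sequence
// starts from the same prompt without recomputing it per sequence.
static void decode_shared_prefix(llama_context * ctx, const std::vector<llama_token> & tokens, int32_t n_seq_max) {
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, n_seq_max);

    // one group containing every sequence id
    std::vector<llama_seq_id> seq_id_group;
    for (llama_seq_id s = 0; s < n_seq_max; ++s) {
        seq_id_group.push_back(s);
    }

    for (size_t i = 0; i < tokens.size(); ++i) {
        // a single token position, attributed to all sequences in the group;
        // only the last position needs logits
        common_batch_add(batch, tokens[i], (llama_pos) i, seq_id_group, i + 1 == tokens.size());
    }

    llama_decode(ctx, batch);
    llama_batch_free(batch);
}
```
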
From 7634d14d7a492cf4ed88ca1fe392b8997eaa70ec Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Tue, 8 Jul 2025 18:23:58 -0400
Subject: [PATCH 13/13] test-model-random : fix seq_id buffer overflow

---
 tests/test-model-random.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/test-model-random.cpp b/tests/test-model-random.cpp
index 51d3bfdf29047..3645c00b5f9a8 100644
--- a/tests/test-model-random.cpp
+++ b/tests/test-model-random.cpp
@@ -1077,7 +1077,6 @@ int main(int argc, char ** argv) {
     const int32_t n_shared_len = 13; // prime number, shared prompt length
     const int32_t n_seq_len = 127; // prime number
 
-    llama_batch batch = llama_batch_init(n_batch, 0, 1);
     // TODO: batch with embeddings
 
     std::vector<model_variant> model_variants;
@@ -1119,6 +1118,8 @@ int main(int argc, char ** argv) {
             // TODO: avoid re-creating reference outputs
             for (int32_t n_seq_max : { 1, 2, 5 }) {
 
+                llama_batch batch = llama_batch_init(n_batch, 0, n_seq_max);
+
                 // TODO(later): context shift testing
                 for (int32_t n_ctx : { n_seq_len * n_seq_max }) {
 
@@ -1195,6 +1196,7 @@ int main(int argc, char ** argv) {
                                 for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
                                     seq_id_group.push_back(seq_id);
                                     seq_id_n_past[seq_id] += shared_prompt.size();
+                                    seq_ids_in_batch.insert(seq_id);
                                 };
 
                                 for (size_t i = 0; i < shared_prompt.size(); ++i) {
@@ -1272,12 +1274,12 @@ int main(int argc, char ** argv) {
                     }
                 }
             }
+
+            llama_batch_free(batch);
         }
 
         llama_model_free(model);
     }
 
-    llama_batch_free(batch);
-
     return 0;
 }
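The final patch works because the third argument of llama_batch_init sizes the per-token seq_id arrays; a batch initialized with n_seq_max = 1 overflows as soon as a token is attributed to a group of several sequences, which is exactly what the shared-prompt path does. A minimal illustration of the sizing rule (assuming only llama.h):

```cpp
#include "llama.h"

int main() {
    // The third argument of llama_batch_init sizes each per-token seq_id array.
    // With n_seq_max = 5, batch.seq_id[i] has room for 5 ids, so a shared-prompt
    // token attributed to sequences {0,1,2,3,4} fits; a batch created with
    // n_seq_max = 1 would make the same common_batch_add call write out of bounds.
    llama_batch batch = llama_batch_init(/*n_tokens*/ 512, /*embd*/ 0, /*n_seq_max*/ 5);

    llama_batch_free(batch);
    return 0;
}
```
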