namespace dflash::common {

// ─── SHA-256 (RFC 6234) ─────────────────────────────────────────────────
//
// Self-contained mini-implementation so we don't pull in OpenSSL just for
// one hash. Performance is "fine" — hashing a 17 GB GGUF takes ~30s on a
// fast NVMe, which is comparable to the per-file numbers `sha256sum` gets.
// We sidecar the result so this only happens on the first server start
// after a model is downloaded.

namespace {

// Streaming SHA-256 state.
struct Sha256Ctx {
    uint32_t state[8];  // running hash words H0..H7
    uint64_t bit_len;   // total number of message bits absorbed so far
    uint8_t  buf[64];   // bytes of the current, not yet compressed, block
    size_t   buf_len;   // number of valid bytes in `buf` (always < 64 between calls)
};

// Rotate `x` right by `n` bits. Callers must pass 0 < n < 32: n == 0 would
// shift left by 32, which is undefined for a 32-bit operand.
inline uint32_t rotr32(uint32_t x, uint32_t n) {
    return (x >> n) | (x << (32 - n));
}

// Reset `c` to the SHA-256 initial hash value with an empty message.
void sha256_init(Sha256Ctx & c) {
    c.state[0] = 0x6a09e667u;
    c.state[1] = 0xbb67ae85u;
    c.state[2] = 0x3c6ef372u;
    c.state[3] = 0xa54ff53au;
    c.state[4] = 0x510e527fu;
    c.state[5] = 0x9b05688cu;
    c.state[6] = 0x1f83d9abu;
    c.state[7] = 0x5be0cd19u;
    c.bit_len = 0;
    c.buf_len = 0;
}

// Fold one 64-byte block into c.state. `block` must point at 64 readable bytes.
void sha256_compress(Sha256Ctx & c, const uint8_t * block) {
    static const uint32_t K[64] = {
        0x428a2f98u,0x71374491u,0xb5c0fbcfu,0xe9b5dba5u,0x3956c25bu,0x59f111f1u,0x923f82a4u,0xab1c5ed5u,
        0xd807aa98u,0x12835b01u,0x243185beu,0x550c7dc3u,0x72be5d74u,0x80deb1feu,0x9bdc06a7u,0xc19bf174u,
        0xe49b69c1u,0xefbe4786u,0x0fc19dc6u,0x240ca1ccu,0x2de92c6fu,0x4a7484aau,0x5cb0a9dcu,0x76f988dau,
        0x983e5152u,0xa831c66du,0xb00327c8u,0xbf597fc7u,0xc6e00bf3u,0xd5a79147u,0x06ca6351u,0x14292967u,
        0x27b70a85u,0x2e1b2138u,0x4d2c6dfcu,0x53380d13u,0x650a7354u,0x766a0abbu,0x81c2c92eu,0x92722c85u,
        0xa2bfe8a1u,0xa81a664bu,0xc24b8b70u,0xc76c51a3u,0xd192e819u,0xd6990624u,0xf40e3585u,0x106aa070u,
        0x19a4c116u,0x1e376c08u,0x2748774cu,0x34b0bcb5u,0x391c0cb3u,0x4ed8aa4au,0x5b9cca4fu,0x682e6ff3u,
        0x748f82eeu,0x78a5636fu,0x84c87814u,0x8cc70208u,0x90befffau,0xa4506cebu,0xbef9a3f7u,0xc67178f2u
    };

    // Message schedule: 16 big-endian words from the block, then 48 derived words.
    uint32_t w[64];
    for (int i = 0; i < 16; ++i) {
        w[i] = (uint32_t(block[i*4+0]) << 24) | (uint32_t(block[i*4+1]) << 16) |
               (uint32_t(block[i*4+2]) << 8 ) |  uint32_t(block[i*4+3]);
    }
    for (int i = 16; i < 64; ++i) {
        const uint32_t s0 = rotr32(w[i-15], 7)  ^ rotr32(w[i-15], 18) ^ (w[i-15] >> 3);
        const uint32_t s1 = rotr32(w[i-2],  17) ^ rotr32(w[i-2],  19) ^ (w[i-2]  >> 10);
        w[i] = w[i-16] + s0 + w[i-7] + s1;
    }

    // 64 rounds over the working variables (`cc` because `c` is the context).
    uint32_t a = c.state[0], b = c.state[1], cc = c.state[2], d = c.state[3];
    uint32_t e = c.state[4], f = c.state[5], g  = c.state[6], h = c.state[7];
    for (int i = 0; i < 64; ++i) {
        const uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
        const uint32_t ch = (e & f) ^ ((~e) & g);
        const uint32_t t1 = h + S1 + ch + K[i] + w[i];
        const uint32_t S0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
        const uint32_t mj = (a & b) ^ (a & cc) ^ (b & cc);
        const uint32_t t2 = S0 + mj;
        h = g;  g  = f; f = e; e = d + t1;
        d = cc; cc = b; b = a; a = t1 + t2;
    }

    c.state[0] += a; c.state[1] += b; c.state[2] += cc; c.state[3] += d;
    c.state[4] += e; c.state[5] += f; c.state[6] += g;  c.state[7] += h;
}

// Absorb `len` bytes from `data`. May be called any number of times between
// sha256_init() and sha256_final(); chunk boundaries do not affect the digest.
void sha256_update(Sha256Ctx & c, const uint8_t * data, size_t len) {
    // Nothing to absorb. Returning here also keeps a (nullptr, 0) call away
    // from memcpy, which must not be handed a null pointer even for 0 bytes.
    if (len == 0) return;

    c.bit_len += uint64_t(len) * 8;

    // Top up a partially filled block first.
    if (c.buf_len) {
        const size_t take = std::min(size_t(64) - c.buf_len, len);
        std::memcpy(c.buf + c.buf_len, data, take);
        c.buf_len += take;
        data      += take;
        len       -= take;
        if (c.buf_len == 64) {
            sha256_compress(c, c.buf);
            c.buf_len = 0;
        }
    }

    // Whole blocks are compressed straight from the caller's buffer.
    while (len >= 64) {
        sha256_compress(c, data);
        data += 64;
        len  -= 64;
    }

    // Stash the tail for the next update/final.
    if (len) {
        std::memcpy(c.buf, data, len);
        c.buf_len = len;
    }
}

// Apply the SHA-256 padding and return the digest as 64 lowercase hex
// characters. The context is consumed: call sha256_init() before reusing it.
std::string sha256_final(Sha256Ctx & c) {
    uint64_t bits = c.bit_len;

    // Mandatory 0x80 terminator (buf_len < 64 here, so this is in bounds).
    c.buf[c.buf_len++] = 0x80;

    // No room left for the 8-byte length field: flush a zero-padded block.
    if (c.buf_len > 56) {
        std::memset(c.buf + c.buf_len, 0, 64 - c.buf_len);
        sha256_compress(c, c.buf);
        c.buf_len = 0;
    }
    std::memset(c.buf + c.buf_len, 0, 56 - c.buf_len);

    // Message length in bits, big-endian, in the last 8 bytes.
    for (int i = 7; i >= 0; --i) {
        c.buf[56 + i] = uint8_t(bits & 0xff);
        bits >>= 8;
    }
    sha256_compress(c, c.buf);

    static const char * hex = "0123456789abcdef";
    std::string out;
    out.resize(64);
    for (int i = 0; i < 8; ++i) {
        const uint32_t v = c.state[i];
        for (int j = 0; j < 4; ++j) {
            const uint8_t byte = uint8_t((v >> (24 - j * 8)) & 0xff);
            out[i*8 + j*2 + 0] = hex[byte >> 4];
            out[i*8 + j*2 + 1] = hex[byte & 0x0f];
        }
    }
    return out;
}

// Hash the file at `path`. Returns the lowercase hex digest, or an empty
// string if the file cannot be opened or a read fails before end-of-file.
std::string sha256_of_file(const std::string & path) {
    std::ifstream f(path, std::ios::binary);
    if (!f) return {};

    Sha256Ctx c;
    sha256_init(c);

    // 4 MiB read buffer: empirically best throughput on NVMe without
    // gulping the page cache. std::vector heap-allocates so we don't
    // blow the C++ thread stack.
    constexpr size_t BUF = 4 * 1024 * 1024;
    std::vector<uint8_t> buf(BUF);
    while (f) {
        f.read(reinterpret_cast<char *>(buf.data()), static_cast<std::streamsize>(BUF));
        const std::streamsize got = f.gcount();  // the final read is usually short
        if (got > 0) sha256_update(c, buf.data(), size_t(got));
    }

    // If the loop exited on anything other than clean EOF (disk error, etc.),
    // bail rather than return a finalized hash over a partial read — caching
    // that as the model's SHA-256 would silently misidentify the file.
    if (f.bad() || (f.fail() && !f.eof())) return {};
    return sha256_final(c);
}

} // namespace

} // namespace dflash::common

// Map LLAMA_FTYPE_* int → operator-friendly tag (Q4_K_M, IQ4_XS, BF16, …).
// Kept inline so we don't pull in llama.h here — those enum values are part
// of the GGUF on-disk format and won't change without a format bump.
namespace dflash::common {
namespace {

// Translate a "general.file_type" value (LLAMA_FTYPE_* integer) into its
// short quantization tag, e.g. 15 → "Q4_K_M". Ids that are not in the table
// yield an empty string, never a null pointer. The numbering is spelled out
// here instead of including llama.h.
const char * llama_ftype_name(int32_t v) {
    struct FtypeTag {
        int32_t      id;
        const char * tag;
    };
    static constexpr FtypeTag kTags[] = {
        {    0, "F32"       },
        {    1, "F16"       },
        {    2, "Q4_0"      },
        {    3, "Q4_1"      },
        {    7, "Q8_0"      },
        {    8, "Q5_0"      },
        {    9, "Q5_1"      },
        {   10, "Q2_K"      },
        {   11, "Q3_K_S"    },
        {   12, "Q3_K_M"    },
        {   13, "Q3_K_L"    },
        {   14, "Q4_K_S"    },
        {   15, "Q4_K_M"    },
        {   16, "Q5_K_S"    },
        {   17, "Q5_K_M"    },
        {   18, "Q6_K"      },
        {   19, "IQ2_XXS"   },
        {   20, "IQ2_XS"    },
        {   21, "Q2_K_S"    },
        {   22, "IQ3_XS"    },
        {   23, "IQ3_XXS"   },
        {   24, "IQ1_S"     },
        {   25, "IQ4_NL"    },
        {   26, "IQ3_S"     },
        {   27, "IQ3_M"     },
        {   28, "IQ2_S"     },
        {   29, "IQ2_M"     },
        {   30, "IQ4_XS"    },
        {   31, "IQ1_M"     },
        {   32, "BF16"      },
        {   36, "TQ1_0"     },
        {   37, "TQ2_0"     },
        {   38, "MXFP4_MOE" },
        {   39, "NVFP4"     },
        {   40, "Q1_0"      },
        { 1024, "GUESSED"   },
    };
    for (const FtypeTag & entry : kTags) {
        if (entry.id == v) return entry.tag;
    }
    return "";
}

} // namespace
} // namespace dflash::common

// Sidecar layout (extends standard sha256sum format with a validation hint):
//   line 1: "<64-hex> <basename>\n"   (sha256sum-compatible)
//   line 2: "# size=<bytes>\n"        (our extension; required to trust line 1)
//
// The size guard is what protects us from a stale sidecar after the GGUF was
// replaced/edited in place without the sidecar being updated. We deliberately
// don't trust legacy/external sidecars that lack the size hint — silently
// reporting the wrong model identity at /props is worse than re-hashing once.
+bool read_sidecar_sha(const std::string & path, int64_t expected_size, std::string & out) { + if (expected_size < 0) return false; // can't validate without a known size + std::ifstream f(path + ".sha256"); + if (!f) return false; + std::string hex; + f >> hex; // tolerate ` filename\n` (sha256sum format) — we only want the first token + if (hex.size() != 64) return false; + for (char c : hex) { + bool is_hex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'); + if (!is_hex) return false; + } + // Scan the rest of the file for a `# size=` directive. Refuse to + // trust the cached hash if it's missing or doesn't match — that + // indicates either a legacy sidecar (pre-validation-guard) or that the + // underlying GGUF has been replaced since the hash was written. + std::string line; + std::getline(f, line); // consume rest of line 1 + bool size_matches = false; + while (std::getline(f, line)) { + // Strip leading whitespace, then look for "# size=" prefix. + size_t i = 0; + while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) ++i; + const std::string prefix = "# size="; + if (line.compare(i, prefix.size(), prefix) != 0) continue; + const char * num = line.c_str() + i + prefix.size(); + char * end = nullptr; + long long n = std::strtoll(num, &end, 10); + if (end == num) continue; + if (n == (long long)expected_size) size_matches = true; + break; + } + if (!size_matches) return false; + out = std::move(hex); + return true; +} + +void write_sidecar_sha(const std::string & path, const std::string & sha, int64_t size_bytes) { + // Best-effort. If the directory isn't writable (read-only mount, model + // dir owned by another user), we just skip — the in-memory hash is + // already what /props will report this run. + std::ofstream f(path + ".sha256"); + if (!f) return; + // Emit sha256sum-compatible line + our size guard. The basename keeps + // `sha256sum -c` happy if a human ever runs it against the sidecar. 
+ std::string base = path; + auto slash = base.find_last_of('/'); + if (slash != std::string::npos) base = base.substr(slash + 1); + f << sha << " " << base << "\n"; + if (size_bytes >= 0) f << "# size=" << size_bytes << "\n"; +} + +} // namespace + +GgufMetadata read_gguf_metadata(const std::string & path, + bool compute_sha256) { + GgufMetadata m; + m.path = path; + + struct stat st{}; + if (::stat(path.c_str(), &st) == 0) { + m.size_bytes = int64_t(st.st_size); + } + + gguf_init_params gip{}; + gip.no_alloc = true; + gip.ctx = nullptr; + gguf_context * gctx = gguf_init_from_file(path.c_str(), gip); + if (!gctx) { + // No GGUF header → bail. Still report path/size if we got them. + return m; + } + m.ok = true; + + auto get_str = [&](const char * key, std::string & out) { + int64_t id = gguf_find_key(gctx, key); + if (id < 0) return; + const char * v = gguf_get_val_str(gctx, id); + if (v) out = v; + }; + auto get_u32 = [&](const char * key, int32_t & out) { + int64_t id = gguf_find_key(gctx, key); + if (id < 0) return; + out = int32_t(gguf_get_val_u32(gctx, id)); + }; + + get_str("general.architecture", m.general_architecture); + get_str("general.name", m.general_name); + get_u32("general.file_type", m.file_type); + get_u32("general.quantization_version", m.quantization_version); + if (m.file_type >= 0) { + const char * name = llama_ftype_name(m.file_type); + if (name) m.file_type_name = name; + } + + if (!m.general_architecture.empty()) { + const std::string a = m.general_architecture; + get_u32((a + ".block_count").c_str(), m.block_count); + get_u32((a + ".embedding_length").c_str(), m.embedding_length); + get_u32((a + ".context_length").c_str(), m.context_length); + // vocab_size: prefer the explicit .vocab_size key. Fall back + // to the tokenizer token array length (the canonical source on + // models that don't write the redundant key). 
+ get_u32((a + ".vocab_size").c_str(), m.vocab_size); + } + if (m.vocab_size < 0) { + int64_t toks_id = gguf_find_key(gctx, "tokenizer.ggml.tokens"); + if (toks_id >= 0) { + m.vocab_size = int32_t(gguf_get_arr_n(gctx, toks_id)); + } + } + + gguf_free(gctx); + + if (compute_sha256) { + std::string cached; + if (read_sidecar_sha(path, m.size_bytes, cached)) { + m.sha256 = std::move(cached); + } else { + std::string hash = sha256_of_file(path); + if (!hash.empty()) { + m.sha256 = hash; + write_sidecar_sha(path, hash, m.size_bytes); + } + } + } + + return m; +} + } // namespace dflash::common diff --git a/server/src/common/gguf_inspect.h b/server/src/common/gguf_inspect.h index 11c11379e..29ffb91a5 100644 --- a/server/src/common/gguf_inspect.h +++ b/server/src/common/gguf_inspect.h @@ -5,6 +5,7 @@ #pragma once +#include #include namespace dflash::common { @@ -18,4 +19,47 @@ struct GgufModelInfo { // Returns info with arch="" and n_layer=-1 on failure. GgufModelInfo inspect_gguf_model_info(const char * path); +// Richer GGUF identity captured at server startup and re-emitted at /props. +// All header values are best-effort: missing keys leave the corresponding +// field at the listed default (empty string or -1). `ok` is false only if +// the file itself couldn't be opened (path missing, not a GGUF, etc.). +// +// The intent is "exactly what binary + GGUF + quant + sha256 is loaded"; +// any field the file doesn't carry stays at the default so consumers can +// distinguish "not in GGUF" (-1) from "0" (legitimately zero). +struct GgufMetadata { + bool ok = false; // false: open failed, all other fields ignorable + std::string path; // absolute filesystem path passed in + int64_t size_bytes = -1; // file size (-1 if stat failed) + std::string sha256; // lowercase hex sha256 (empty if not computed) + + // Header fields (`general.*` + `.*`). All optional. 
+ std::string general_architecture; // raw value of "general.architecture" + std::string general_name; // "general.name" (display string) + int32_t file_type = -1; // "general.file_type" (LLAMA_FTYPE_* int) + std::string file_type_name; // decoded LLAMA_FTYPE_* (e.g. "Q4_K_M", "IQ4_XS") + int32_t quantization_version = -1; // "general.quantization_version" + + int32_t block_count = -1; // ".block_count" + int32_t embedding_length = -1; // ".embedding_length" + int32_t context_length = -1; // ".context_length" + int32_t vocab_size = -1; // ".vocab_size" (or tokenizer.ggml.tokens length) +}; + +// Read GGUF identity for /props. Set `compute_sha256` to hash the file (slow, +// O(size) — multi-GB GGUFs take ~30s on a fast SSD). When false, `sha256` +// stays empty. The header read is cheap (no weight load). +// +// When `compute_sha256` is true and a sidecar file `.sha256` exists, +// its cached sha256 is trusted only when it carries a `# size=` guard +// matching the current GGUF file size; otherwise (legacy sidecar, size +// mismatch, or missing guard) the file is rehashed and the sidecar rewritten. +// This protects against a stale sidecar reporting the wrong identity after +// the GGUF was edited or replaced in place. After a successful hash, the +// result is written to the sidecar with the size guard so subsequent +// restarts skip the rehash. Sidecar I/O failures are non-fatal — the +// in-memory hash still gets returned. +GgufMetadata read_gguf_metadata(const std::string & path, + bool compute_sha256); + } // namespace dflash::common