Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
334 changes: 334 additions & 0 deletions server/src/common/gguf_inspect.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
#include "gguf_inspect.h"
#include "gguf.h"

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <string>
#include <sys/stat.h>
#include <vector>

namespace dflash::common {

Expand Down Expand Up @@ -36,4 +42,332 @@ GgufModelInfo inspect_gguf_model_info(const char * path) {
return info;
}

// ─── SHA-256 (RFC 6234) ─────────────────────────────────────────────────
//
// Self-contained mini-implementation so we don't pull in OpenSSL just for
// one hash. Performance is "fine" — hashing a 17 GB GGUF takes ~30s on a
// fast NVMe, which is comparable to the per-file numbers `sha256sum` gets.
// We sidecar the result so this only happens on the first server start
// after a model is downloaded.

namespace {

struct Sha256Ctx {
uint32_t state[8];
uint64_t bit_len;
uint8_t buf[64];
size_t buf_len;
};

inline uint32_t rotr32(uint32_t x, uint32_t n) {
return (x >> n) | (x << (32 - n));
}

void sha256_init(Sha256Ctx & c) {
c.state[0] = 0x6a09e667u;
c.state[1] = 0xbb67ae85u;
c.state[2] = 0x3c6ef372u;
c.state[3] = 0xa54ff53au;
c.state[4] = 0x510e527fu;
c.state[5] = 0x9b05688cu;
c.state[6] = 0x1f83d9abu;
c.state[7] = 0x5be0cd19u;
c.bit_len = 0;
c.buf_len = 0;
}

void sha256_compress(Sha256Ctx & c, const uint8_t * block) {
static const uint32_t K[64] = {
0x428a2f98u,0x71374491u,0xb5c0fbcfu,0xe9b5dba5u,0x3956c25bu,0x59f111f1u,0x923f82a4u,0xab1c5ed5u,
0xd807aa98u,0x12835b01u,0x243185beu,0x550c7dc3u,0x72be5d74u,0x80deb1feu,0x9bdc06a7u,0xc19bf174u,
0xe49b69c1u,0xefbe4786u,0x0fc19dc6u,0x240ca1ccu,0x2de92c6fu,0x4a7484aau,0x5cb0a9dcu,0x76f988dau,
0x983e5152u,0xa831c66du,0xb00327c8u,0xbf597fc7u,0xc6e00bf3u,0xd5a79147u,0x06ca6351u,0x14292967u,
0x27b70a85u,0x2e1b2138u,0x4d2c6dfcu,0x53380d13u,0x650a7354u,0x766a0abbu,0x81c2c92eu,0x92722c85u,
0xa2bfe8a1u,0xa81a664bu,0xc24b8b70u,0xc76c51a3u,0xd192e819u,0xd6990624u,0xf40e3585u,0x106aa070u,
0x19a4c116u,0x1e376c08u,0x2748774cu,0x34b0bcb5u,0x391c0cb3u,0x4ed8aa4au,0x5b9cca4fu,0x682e6ff3u,
0x748f82eeu,0x78a5636fu,0x84c87814u,0x8cc70208u,0x90befffau,0xa4506cebu,0xbef9a3f7u,0xc67178f2u
};
uint32_t w[64];
for (int i = 0; i < 16; ++i) {
w[i] = (uint32_t(block[i*4+0]) << 24) | (uint32_t(block[i*4+1]) << 16) |
(uint32_t(block[i*4+2]) << 8 ) | uint32_t(block[i*4+3]);
}
for (int i = 16; i < 64; ++i) {
uint32_t s0 = rotr32(w[i-15], 7) ^ rotr32(w[i-15], 18) ^ (w[i-15] >> 3);
uint32_t s1 = rotr32(w[i-2], 17) ^ rotr32(w[i-2], 19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
uint32_t a = c.state[0], b = c.state[1], cc = c.state[2], d = c.state[3];
uint32_t e = c.state[4], f = c.state[5], g = c.state[6], h = c.state[7];
for (int i = 0; i < 64; ++i) {
uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
uint32_t ch = (e & f) ^ ((~e) & g);
uint32_t t1 = h + S1 + ch + K[i] + w[i];
uint32_t S0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
uint32_t mj = (a & b) ^ (a & cc) ^ (b & cc);
uint32_t t2 = S0 + mj;
h = g; g = f; f = e; e = d + t1;
d = cc; cc = b; b = a; a = t1 + t2;
}
c.state[0] += a; c.state[1] += b; c.state[2] += cc; c.state[3] += d;
c.state[4] += e; c.state[5] += f; c.state[6] += g; c.state[7] += h;
}

void sha256_update(Sha256Ctx & c, const uint8_t * data, size_t len) {
c.bit_len += uint64_t(len) * 8;
if (c.buf_len) {
size_t take = std::min(size_t(64) - c.buf_len, len);
std::memcpy(c.buf + c.buf_len, data, take);
c.buf_len += take;
data += take;
len -= take;
if (c.buf_len == 64) {
sha256_compress(c, c.buf);
c.buf_len = 0;
}
}
while (len >= 64) {
sha256_compress(c, data);
data += 64;
len -= 64;
}
if (len) {
std::memcpy(c.buf, data, len);
c.buf_len = len;
}
}

std::string sha256_final(Sha256Ctx & c) {
uint64_t bits = c.bit_len;
c.buf[c.buf_len++] = 0x80;
if (c.buf_len > 56) {
std::memset(c.buf + c.buf_len, 0, 64 - c.buf_len);
sha256_compress(c, c.buf);
c.buf_len = 0;
}
std::memset(c.buf + c.buf_len, 0, 56 - c.buf_len);
for (int i = 7; i >= 0; --i) {
c.buf[56 + i] = uint8_t(bits & 0xff);
bits >>= 8;
}
sha256_compress(c, c.buf);

static const char * hex = "0123456789abcdef";
std::string out;
out.resize(64);
for (int i = 0; i < 8; ++i) {
uint32_t v = c.state[i];
for (int j = 0; j < 4; ++j) {
uint8_t byte = uint8_t((v >> (24 - j * 8)) & 0xff);
out[i*8 + j*2 + 0] = hex[byte >> 4];
out[i*8 + j*2 + 1] = hex[byte & 0x0f];
}
}
return out;
}

std::string sha256_of_file(const std::string & path) {
std::ifstream f(path, std::ios::binary);
if (!f) return {};
Sha256Ctx c;
sha256_init(c);
// 4 MiB read buffer: empirically best throughput on NVMe without
// gulping the page cache. std::vector heap-allocates so we don't
// blow the C++ thread stack.
constexpr size_t BUF = 4 * 1024 * 1024;
std::vector<uint8_t> buf(BUF);
while (f) {
f.read(reinterpret_cast<char*>(buf.data()), BUF);
std::streamsize got = f.gcount();
if (got > 0) sha256_update(c, buf.data(), size_t(got));
}
// If the loop exited on anything other than clean EOF (disk error, etc.),
// bail rather than return a finalized hash over a partial read — caching
// that as the model's SHA-256 would silently misidentify the file.
if (f.bad() || (f.fail() && !f.eof())) return {};
return sha256_final(c);
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
}

// Map LLAMA_FTYPE_* int → operator-friendly tag (Q4_K_M, IQ4_XS, BF16, …).
// Kept inline so we don't pull in llama.h here — those enum values are part
// of the GGUF on-disk format and won't change without a format bump.
const char * llama_ftype_name(int32_t v) {
switch (v) {
case 0: return "F32";
case 1: return "F16";
case 2: return "Q4_0";
case 3: return "Q4_1";
case 7: return "Q8_0";
case 8: return "Q5_0";
case 9: return "Q5_1";
case 10: return "Q2_K";
case 11: return "Q3_K_S";
case 12: return "Q3_K_M";
case 13: return "Q3_K_L";
case 14: return "Q4_K_S";
case 15: return "Q4_K_M";
case 16: return "Q5_K_S";
case 17: return "Q5_K_M";
case 18: return "Q6_K";
case 19: return "IQ2_XXS";
case 20: return "IQ2_XS";
case 21: return "Q2_K_S";
case 22: return "IQ3_XS";
case 23: return "IQ3_XXS";
case 24: return "IQ1_S";
case 25: return "IQ4_NL";
case 26: return "IQ3_S";
case 27: return "IQ3_M";
case 28: return "IQ2_S";
case 29: return "IQ2_M";
case 30: return "IQ4_XS";
case 31: return "IQ1_M";
case 32: return "BF16";
case 36: return "TQ1_0";
case 37: return "TQ2_0";
case 38: return "MXFP4_MOE";
case 39: return "NVFP4";
case 40: return "Q1_0";
case 1024: return "GUESSED";
default: return "";
}
}

// Sidecar layout (extends standard sha256sum format with a validation hint):
// line 1: "<64-hex> <basename>\n" (sha256sum-compatible)
// line 2: "# size=<bytes>\n" (our extension; required to trust line 1)
//
// The size guard is what protects us from a stale sidecar after the GGUF was
// replaced/edited in place without the sidecar being updated. We deliberately
// don't trust legacy/external sidecars that lack the size hint — silently
// reporting the wrong model identity at /props is worse than re-hashing once.
bool read_sidecar_sha(const std::string & path, int64_t expected_size, std::string & out) {
if (expected_size < 0) return false; // can't validate without a known size
std::ifstream f(path + ".sha256");
if (!f) return false;
std::string hex;
f >> hex; // tolerate `<hex> filename\n` (sha256sum format) — we only want the first token
if (hex.size() != 64) return false;
for (char c : hex) {
bool is_hex = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
if (!is_hex) return false;
}
// Scan the rest of the file for a `# size=<n>` directive. Refuse to
// trust the cached hash if it's missing or doesn't match — that
// indicates either a legacy sidecar (pre-validation-guard) or that the
// underlying GGUF has been replaced since the hash was written.
std::string line;
std::getline(f, line); // consume rest of line 1
bool size_matches = false;
while (std::getline(f, line)) {
// Strip leading whitespace, then look for "# size=" prefix.
size_t i = 0;
while (i < line.size() && (line[i] == ' ' || line[i] == '\t')) ++i;
const std::string prefix = "# size=";
if (line.compare(i, prefix.size(), prefix) != 0) continue;
const char * num = line.c_str() + i + prefix.size();
char * end = nullptr;
long long n = std::strtoll(num, &end, 10);
if (end == num) continue;
if (n == (long long)expected_size) size_matches = true;
break;
}
if (!size_matches) return false;
out = std::move(hex);
return true;
}

void write_sidecar_sha(const std::string & path, const std::string & sha, int64_t size_bytes) {
// Best-effort. If the directory isn't writable (read-only mount, model
// dir owned by another user), we just skip — the in-memory hash is
// already what /props will report this run.
std::ofstream f(path + ".sha256");
if (!f) return;
// Emit sha256sum-compatible line + our size guard. The basename keeps
// `sha256sum -c` happy if a human ever runs it against the sidecar.
std::string base = path;
auto slash = base.find_last_of('/');
if (slash != std::string::npos) base = base.substr(slash + 1);
f << sha << " " << base << "\n";
if (size_bytes >= 0) f << "# size=" << size_bytes << "\n";
}

} // namespace

GgufMetadata read_gguf_metadata(const std::string & path,
bool compute_sha256) {
GgufMetadata m;
m.path = path;

struct stat st{};
if (::stat(path.c_str(), &st) == 0) {
m.size_bytes = int64_t(st.st_size);
}

gguf_init_params gip{};
gip.no_alloc = true;
gip.ctx = nullptr;
gguf_context * gctx = gguf_init_from_file(path.c_str(), gip);
if (!gctx) {
// No GGUF header → bail. Still report path/size if we got them.
return m;
}
m.ok = true;

auto get_str = [&](const char * key, std::string & out) {
int64_t id = gguf_find_key(gctx, key);
if (id < 0) return;
const char * v = gguf_get_val_str(gctx, id);
if (v) out = v;
};
auto get_u32 = [&](const char * key, int32_t & out) {
int64_t id = gguf_find_key(gctx, key);
if (id < 0) return;
out = int32_t(gguf_get_val_u32(gctx, id));
};

get_str("general.architecture", m.general_architecture);
get_str("general.name", m.general_name);
get_u32("general.file_type", m.file_type);
get_u32("general.quantization_version", m.quantization_version);
if (m.file_type >= 0) {
const char * name = llama_ftype_name(m.file_type);
if (name) m.file_type_name = name;
}

if (!m.general_architecture.empty()) {
const std::string a = m.general_architecture;
get_u32((a + ".block_count").c_str(), m.block_count);
get_u32((a + ".embedding_length").c_str(), m.embedding_length);
get_u32((a + ".context_length").c_str(), m.context_length);
// vocab_size: prefer the explicit <arch>.vocab_size key. Fall back
// to the tokenizer token array length (the canonical source on
// models that don't write the redundant key).
get_u32((a + ".vocab_size").c_str(), m.vocab_size);
}
if (m.vocab_size < 0) {
int64_t toks_id = gguf_find_key(gctx, "tokenizer.ggml.tokens");
if (toks_id >= 0) {
m.vocab_size = int32_t(gguf_get_arr_n(gctx, toks_id));
}
}

gguf_free(gctx);

if (compute_sha256) {
std::string cached;
if (read_sidecar_sha(path, m.size_bytes, cached)) {
m.sha256 = std::move(cached);
} else {
std::string hash = sha256_of_file(path);
if (!hash.empty()) {
m.sha256 = hash;
write_sidecar_sha(path, hash, m.size_bytes);
}
}
}

return m;
}

} // namespace dflash::common
Loading
Loading