diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 24e41bcd3..849de1be9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -34,6 +34,35 @@ jobs:
       - name: Lint Python surfaces touched by lucebox tooling
         run: uv run --frozen --extra dev ruff check .
 
+      - name: Install shellcheck (for bash test runner)
+        # ubuntu-latest typically ships shellcheck pre-installed, but install
+        # it explicitly when missing so the bash test runner can always rely
+        # on `command -v shellcheck` succeeding. (No version is pinned.)
+        run: |
+          if ! command -v shellcheck >/dev/null 2>&1; then
+            sudo apt-get update
+            sudo apt-get install -y shellcheck
+          fi
+          shellcheck --version | head -3
+
+      - name: Typecheck lucebox CLI
+        run: uv run --frozen --extra dev python -m mypy --package lucebox
+
+      - name: Unit-test lucebox CLI
+        # The fast workspace sync above is enough: the suite mocks the
+        # docker / HTTP surfaces, so no torch wheel or GPU is needed.
+        # Keeps the lucebox Python honest on every push.
+        run: uv run --frozen --extra dev pytest lucebox -q
+
+      - name: Smoke-test lucebox.sh wrapper
+        # Catches `set -u` regressions, syntax errors, and stale dispatch
+        # handlers in the host-side wrapper + the in-container entrypoint.
+        # Runs shellcheck --severity=error across every shipped .sh file,
+        # exercises every subcommand dispatch under `set -u`, and drives the
+        # entrypoint's draft-resolution block through every family-glob
+        # branch — all on the bare runner without docker/nvidia/systemd.
+        run: bash scripts/test_lucebox_sh.sh
+
   build:
     name: Build (cmake + uv sync --extra megakernel)
     runs-on: ubuntu-latest
diff --git a/install.sh b/install.sh
new file mode 100755
index 000000000..cff54a02e
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# install.sh — Bootstrap installer for the lucebox host wrapper.
+#
+# Canonical install (Luce-Org main, stable channel):
+#
+#   curl -fsSL https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/install.sh | bash
+#
+# Install from a different fork / branch (dev channel). Note the env var
+# is on the `bash` side of the pipe — `VAR=val curl … | bash` would attach
+# it to the `curl` process, leaving `bash` with the canonical default:
+#
+#   curl -fsSL https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/install.sh | \
+#     LUCEBOX_INSTALL_URL=https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh bash
+#
+# The installer bakes the source URL into the installed `lucebox.sh` as
+# `LUCEBOX_INSTALLED_FROM=...`, so `lucebox update` later re-pulls from the
+# same channel without the user having to remember which fork they used.
+#
+# Override the install destination via $LUCEBOX_INSTALL_DEST (default
+# $HOME/.local/bin/lucebox). This is what `lucebox update` uses to replace
+# the file in place.
+
+set -euo pipefail
+
+LUCEBOX_INSTALL_URL="${LUCEBOX_INSTALL_URL:-https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/lucebox.sh}"
+DEST="${LUCEBOX_INSTALL_DEST:-$HOME/.local/bin/lucebox}"
+
+# ── helpers ───────────────────────────────────────────────────────────────
+C_OK="" ; C_ERR="" ; C_DIM="" ; C_RST=""
+# Colour only when stdout is a terminal and NO_COLOR is unset or empty.
+if [ -t 1 ] && [ -z "${NO_COLOR:-}" ]; then C_OK=$'\033[1;32m' ; C_ERR=$'\033[1;31m' ; C_DIM=$'\033[2m' ; C_RST=$'\033[0m' ; fi
+_say() { printf '%s[install]%s%s %s\n' "$1" "$2" "$C_RST" "$3"; }  # colour, tag suffix, message
+info() { _say "$C_DIM" ""   "$*"; }
+ok()   { _say "$C_OK"  " ✓" "$*"; }
+die()  { _say "$C_ERR" " ✗" "$*" >&2; exit 1; }
+
+command -v curl >/dev/null 2>&1 || die "curl is required (apt-get install curl)"
+
+# ── fetch ─────────────────────────────────────────────────────────────────
+tmp=$(mktemp -t lucebox.XXXXXX) || die "couldn't create temp file"
+# shellcheck disable=SC2064  # we want $tmp expanded now, not at trap time
+trap "rm -f '$tmp' '$tmp.bak'" EXIT
+info "fetching $LUCEBOX_INSTALL_URL"
+curl -fsSL "$LUCEBOX_INSTALL_URL" -o "$tmp" \
+    || die "download failed from $LUCEBOX_INSTALL_URL"
+
+# ── sanity check ──────────────────────────────────────────────────────────
+# Refuse to install something that isn't recognizably lucebox.sh. Catches
+# 404 pages, redirects to HTML, and accidental URL typos.
+first_line=$(head -1 "$tmp")
+[ "$first_line" = '#!/usr/bin/env bash' ] \
+    || die "downloaded file does not look like a bash script (got: $first_line)"
+grep -q '^VERSION=' "$tmp" || die "downloaded file is missing VERSION marker — not lucebox.sh?"
+
+# ── decide what gets baked in as the persisted channel ───────────────────
+# `lucebox update` reads LUCEBOX_INSTALLED_FROM from the installed copy and
+# re-fetches from it. Persisting a SHA-pinned URL is a footgun — every
+# future update would re-install the same frozen SHA forever, defeating
+# the point of `update`. So:
+#
+#   1. If $LUCEBOX_INSTALL_CHANNEL is set, that's the persisted URL
+#      (caller takes responsibility for picking a real branch URL).
+#   2. Else if LUCEBOX_INSTALL_URL has a 40-char hex SHA segment, refuse
+#      to persist it — tell the user to set LUCEBOX_INSTALL_CHANNEL.
+#      Common case: someone curl'd from /raw/<sha>/ to bypass a stale CDN
+#      cache during dev; they meant for updates to track the branch.
+#   3. Else persist LUCEBOX_INSTALL_URL as-is (branch or canonical main).
+channel_url="${LUCEBOX_INSTALL_CHANNEL:-}"
+if [ -z "$channel_url" ]; then
+    # Only an exact 40-hex-character path segment directly before the
+    # final `*.sh` component is treated as a SHA pin. Shorter hex runs
+    # (7-39 chars) are deliberately not matched: they are more likely a
+    # branch name shaped like a short SHA (e.g. `feat/abc1234-hotfix`).
+    # 40 characters is what `git rev-parse HEAD` emits and what
+    # `/raw/<sha>/` URLs from GitHub's CDN actually carry.
+    if [[ "$LUCEBOX_INSTALL_URL" =~ /[0-9a-fA-F]{40}/[^/]+\.sh$ ]]; then
+        die "$(cat <<EOM
+LUCEBOX_INSTALL_URL is SHA-pinned ($LUCEBOX_INSTALL_URL).
+Persisting that as LUCEBOX_INSTALLED_FROM would freeze \`lucebox update\`
+to that specific commit forever. Set LUCEBOX_INSTALL_CHANNEL to the
+branch URL you want \`update\` to track, e.g.:
+
+  curl -fsSL <sha-pinned>/install.sh | \\
+    LUCEBOX_INSTALL_URL=<sha-pinned>/lucebox.sh \\
+    LUCEBOX_INSTALL_CHANNEL=https://raw.githubusercontent.com/<org>/<repo>/<branch>/lucebox.sh \\
+    bash
+EOM
+)"
+    fi
+    channel_url="$LUCEBOX_INSTALL_URL"
+fi
+
+# Bake the channel URL into the file. Use a `|` delimiter since URLs
+# contain `/`. The line is expected to exist in lucebox.sh with a `:-`
+# default; we rewrite the whole assignment.
+#
+# The URL ends up inside a bash double-quoted literal in the installed
+# script, so any of $ ` " \ in `channel_url` would break the installed
+# file (or worse, allow command substitution to run the next time it is
+# executed). Reject URLs containing any of those four characters; we
+# don't expect arbitrary content here, only a raw.githubusercontent.com
+# URL (or a forked equivalent). Escape the sed metachars (\ & |)
+# separately so the substitution itself round-trips.
+case "$channel_url" in
+    *['"$`\']*) die "channel URL contains unsafe characters: $channel_url" ;;
+esac
+escaped_url=$(printf '%s' "$channel_url" | sed 's/[\\&|]/\\&/g')
+sed "s|^LUCEBOX_INSTALLED_FROM=.*|LUCEBOX_INSTALLED_FROM=\"$escaped_url\"|" "$tmp" > "$tmp.baked"
+mv "$tmp.baked" "$tmp"   # verified below with a literal (-F), whole-line (-x) match on the raw, unescaped URL
+grep -qxF -- "LUCEBOX_INSTALLED_FROM=\"$channel_url\"" "$tmp" \
+    || die "failed to bake install source into the downloaded script"
+
+# ── install ───────────────────────────────────────────────────────────────
+dest_dir=$(dirname "$DEST")
+mkdir -p "$dest_dir"
+chmod +x "$tmp"
+mv "$tmp" "$DEST"
+trap - EXIT   # the temp file now lives at $DEST; nothing left to clean up
+ok "installed lucebox → $DEST"
+info "  fetched from:    $LUCEBOX_INSTALL_URL"
+info "  update channel:  $channel_url"
+[ "$LUCEBOX_INSTALL_URL" = "$channel_url" ] \
+    || info "  (lucebox update will track the channel URL, not the fetch URL)"
+
+# ── PATH hint ─────────────────────────────────────────────────────────────
+case ":${PATH:-}:" in
+    *":$(dirname "$DEST"):"*) ;;
+    *) info "  hint: add $(dirname "$DEST") to PATH so 'lucebox' is on the path" ;;
+esac
+
+cat <<EOF
+
+Next:
+  ${C_DIM}lucebox check${C_RST}            verify host prereqs (docker + NVIDIA CTK + driver)
+  ${C_DIM}lucebox install${C_RST}          install the user systemd unit
+  ${C_DIM}lucebox start${C_RST}            start the server
+  ${C_DIM}lucebox update${C_RST}           re-run this installer to fetch the latest lucebox.sh
+EOF
diff --git a/lefthook.yml b/lefthook.yml
new file mode 100644
index 000000000..a57c6535a
--- /dev/null
+++ b/lefthook.yml
@@ -0,0 +1,59 @@
+# lefthook.yml — git hook config for the lucebox-hub monorepo.
+#
+# Install once per checkout:
+#
+#     # Option A: standalone binary (recommended)
+#     curl -sSfL https://raw.githubusercontent.com/evilmartians/lefthook/master/install.sh | sh
+#     lefthook install
+#
+#     # Option B: via npm
+#     npm install -g lefthook && lefthook install
+#
+# After `lefthook install`, git's hook dir is wired so the pre-commit
+# block below fires on every `git commit`. Skip a one-off with
+# `LEFTHOOK=0 git commit ...`.
+#
+# These hooks enforce the same checks CI runs, so a commit cannot break
+# the bash scripts that ship with the image. The specific goal is to
+# stop another `unbound variable` regression (like the earlier
+# DRAFT_FAMILY_GLOB / LUCEBOX_HOST_HAS_SYSTEMD ones) from landing.
+
+pre-commit:
+  parallel: true
+  commands:
+    shellcheck:
+      # Run shellcheck on every staged *.sh file with the same
+      # `--severity=error` gate the CI test runner uses
+      # (scripts/test_lucebox_sh.sh); warnings stay informational.
+      # `--external-sources` lets shellcheck follow `source` directives
+      # — needed for harness/clients/common.sh consumers. The exclude
+      # skips vendored llama.cpp scripts under server/deps/.
+      # Two globs on purpose: depending on lefthook's glob matcher,
+      # `**/` can require at least one directory, in which case
+      # `**/*.sh` alone would skip root-level lucebox.sh / install.sh.
+      glob: ["*.sh", "**/*.sh"]
+      exclude:
+        - "server/deps/**"
+      run: shellcheck --severity=error --external-sources {staged_files}
+
+    bash-parse:
+      # `bash -n` catches syntax errors that shellcheck can miss when
+      # configured to error-only. Cheap second pass — runs in parallel.
+      glob: ["*.sh", "**/*.sh"]
+      exclude:
+        - "server/deps/**"
+      run: |
+        rc=0
+        for f in {staged_files}; do
+          bash -n "$f" || rc=1
+        done
+        exit "$rc"
+
+# pre-push runs the full bash smoke test — more thorough, slower than
+# pre-commit. Covers every subcommand dispatch under `set -u` plus the
+# entrypoint's draft-resolution branches. Skip with `LEFTHOOK=0 git push`
+# if you really need to push without local validation (CI still catches it).
+pre-push:
+  commands:
+    bash-smoke:
+      run: bash scripts/test_lucebox_sh.sh
diff --git a/lucebox.sh b/lucebox.sh
new file mode 100755
index 000000000..7ee5d1f2d
--- /dev/null
+++ b/lucebox.sh
@@ -0,0 +1,1278 @@
+#!/usr/bin/env bash
+# lucebox.sh — host-side wrapper for the lucebox-hub container.
+#
+# Two jobs:
+#
+#   1) Probe the host (driver, docker, NVIDIA Container Toolkit, VRAM, RAM,
+#      systemd), select the CUDA 12 image, and
+#      dispatch into the in-container Python CLI via `docker run`. The
+#      Python CLI lives at /opt/lucebox-hub/lucebox/ inside the image and is
+#      the single source of truth for orchestration logic — TOML config,
+#      autotune rules + sweep, server bring-up, smoke tests, model downloads.
+#
+#   2) Manage a user-level systemd unit (~/.config/systemd/user/lucebox.service)
+#      so the server can run as a long-lived service without keeping a shell
+#      open. install/uninstall/start/stop/enable/disable/status/logs all
+#      delegate to systemctl --user / journalctl --user.
+#
+# Install:
+#   curl -fsSL https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/install.sh | bash
+#
+# The installer bakes the source URL into the installed copy as
+# `LUCEBOX_INSTALLED_FROM=`, so `lucebox update` later re-pulls from the
+# same channel (canonical, dev fork, branch — whatever you originally
+# installed from).
+#
+# Then: lucebox check && lucebox install && lucebox start
+#
+# The runtime works whether the file is installed as `lucebox` (preferred)
+# or `lucebox.sh` — all self-referencing hints use the actual basename.
+#
+# No root is ever taken automatically. Anything that needs sudo (package
+# install, loginctl enable-linger) is printed for the user to run.
+
+set -euo pipefail
+
+VERSION="0.2.0"
+SCRIPT_PATH="$(readlink -f "$0" 2>/dev/null || realpath "$0" 2>/dev/null || echo "$0")"
+SCRIPT_NAME="$(basename "$SCRIPT_PATH")"
+
+# ── tunables / env overrides ───────────────────────────────────────────────
+# Host-side scalars (image registry+variant, port, container name, models
+# dir). Resolution order, applied uniformly via _lucebox_resolve below:
+#   1. $LUCEBOX_<NAME>            per-invocation env override
+#   2. config.toml <section>.<key>  persisted user choice (system of record)
+#   3. derived / canonical default
+# This keeps the wrapper and the in-container Python CLI agreeing on
+# effective values — config.toml is the single source of truth, both
+# sides read it.
+UNIT_NAME="lucebox.service"
+UNIT_PATH="${XDG_CONFIG_HOME:-$HOME/.config}/systemd/user/$UNIT_NAME"
+
+# CUDA driver floor for the prebuilt CUDA 12 image.
+# shellcheck disable=SC2034
+MIN_DRIVER_CUDA12=525
+
+# Canonical source of `lucebox.sh`. The bootstrap installer (`install.sh`)
+# rewrites this line at install time to record which URL the user actually
+# installed from — `lucebox update` then re-pulls from the same channel
+# without losing track of forks. Falls back to the Luce-Org main branch
+# when nothing was baked in (e.g. someone curl'd the script directly).
+LUCEBOX_INSTALLED_FROM="${LUCEBOX_INSTALLED_FROM:-https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/lucebox.sh}"
+
+# Path to the persisted config.toml. Mirrors
+# lucebox.config.default_config_path: $LUCEBOX_HOME/config.toml if set,
+# else $HOME/.lucebox/config.toml. Read-only from this wrapper — the
+# Python CLI is the writer.
+_lucebox_config_path() {
+    if [ -n "${LUCEBOX_HOME:-}" ]; then
+        printf '%s/config.toml' "$LUCEBOX_HOME"
+        return
+    fi
+    printf '%s/.lucebox/config.toml' "$HOME"
+}
+
+# Read a `<section>.<key>` value from config.toml. Returns empty if the
+# file is missing, the section/key is absent, or the value is empty.
+# Handles the subset of TOML that lucebox writes:
+#   [section]
+#   key = "string"      # surrounding double-quotes are stripped
+#   key = 8080          # bare scalars passed through verbatim
+#   key = true          # same
+# Inline `# comment` is honored. Arrays / inline tables / multi-line
+# strings aren't written by the Python persister, so we don't parse them.
+_lucebox_config_get() {
+    # $1 = "<section>.<key>" (split at the last dot); prints the first match or nothing.
+    local dotted="$1" cfg
+    cfg="$(_lucebox_config_path)"
+    [ -f "$cfg" ] || return 0
+    local section="${dotted%.*}"
+    local key="${dotted##*.}"
+    [ "$section" = "$dotted" ] && section=""
+    awk -v want_section="$section" -v want_key="$key" '
+        function trim(s) { gsub(/^[[:space:]]+|[[:space:]]+$/, "", s); return s }
+        BEGIN { current = "" }
+        /^[[:space:]]*\[/ {
+            t = $0
+            sub(/^[[:space:]]*\[[[:space:]]*/, "", t)
+            sub(/[[:space:]]*\].*$/, "", t)      # also drops a trailing comment
+            current = t
+            next
+        }
+        /^[[:space:]]*#/ { next }
+        /=/ {
+            if (current != want_section) next
+            eq = index($0, "=")
+            if (trim(substr($0, 1, eq - 1)) != want_key) next
+            v = trim(substr($0, eq + 1))
+            if (match(v, /^"([^"\\]|\\.)*"/))
+                v = substr(v, 2, RLENGTH - 2)    # quoted: a "#" inside is data
+            else {
+                sub(/#.*$/, "", v)               # bare scalar: strip the comment
+                v = trim(v)
+            }
+            print v
+            exit
+        }
+    ' "$cfg"
+}
+
+# Resolve a scalar through the precedence ladder. env_value comes from
+# the caller (typically `"${LUCEBOX_FOO:-}"` — the `:-` matters under
+# `set -u`).
+_lucebox_resolve() {
+    # $1 env value (may be empty), $2 dotted config.toml key, $3 default.
+    # Prints the first non-empty candidate, without a trailing newline.
+    local env_value="$1" toml_key="$2" default="$3"
+    local chosen="$env_value"
+    if [ -z "$chosen" ]; then
+        chosen="$(_lucebox_config_get "$toml_key")"
+    fi
+    if [ -z "$chosen" ]; then
+        chosen="$default"
+    fi
+    printf '%s' "$chosen"
+}
+
+# Derive the default image URL from the install source so a fork install
+# (e.g. easel/lucebox-hub) gets the fork's GHCR image automatically when
+# config.toml hasn't pinned one yet. Pattern:
+#   https://raw.githubusercontent.com/<org>/<repo>/<ref>/lucebox.sh
+#   → ghcr.io/<org-lowercase>/<repo-lowercase>
+# Docker image references must be entirely lowercase, so both the org and
+# the repo segment are lowercased. Falls back to the canonical Luce-Org
+# image when the URL doesn't match the raw.githubusercontent.com pattern.
+_lucebox_derive_image() {
+    # The ref segment can contain slashes (e.g. `feat/lucebox-docker`), so
+    # the middle `.+` greedily eats everything up to the trailing
+    # `/lucebox.sh`. The first two `[^/]+` capture org + repo, which are
+    # never slash-containing on GitHub.
+    local url="$1" org repo
+    if [[ "$url" =~ ^https?://raw\.githubusercontent\.com/([^/]+)/([^/]+)/.+/lucebox\.sh$ ]]; then
+        org="${BASH_REMATCH[1],,}"      # ${var,,} = lowercase (bash 4+)
+        repo="${BASH_REMATCH[2],,}"
+        printf 'ghcr.io/%s/%s' "$org" "$repo"
+        return
+    fi
+    printf 'ghcr.io/luce-org/lucebox-hub'
+}
+
+# Effective scalars, env > config.toml > default.
+CONTAINER_NAME=$(_lucebox_resolve "${LUCEBOX_CONTAINER:-}" runtime.container_name "lucebox")
+DEFAULT_PORT=$(_lucebox_resolve "${LUCEBOX_PORT:-}" runtime.port "8080")
+DEFAULT_MODELS_DIR=$(_lucebox_resolve "${LUCEBOX_MODELS:-}" paths.models "${XDG_DATA_HOME:-$HOME/.local/share}/lucebox/models")
+IMAGE_BASE=$(_lucebox_resolve "${LUCEBOX_IMAGE:-}" image.registry "$(_lucebox_derive_image "$LUCEBOX_INSTALLED_FROM")")
+
+# ── LUCEBOX_HOST_* safe defaults (belt-and-suspenders) ────────────────────
+# `set -u` makes any unbound LUCEBOX_HOST_* read fatal. Historically this has
+# been the #1 source of regressions in this wrapper: someone adds a code path
+# that touches a LUCEBOX_HOST_* var before probe_host has run, the call sites
+# that DO pre-probe still work, and the bug ships. To make the bug literally
+# unrepresentable we seed every LUCEBOX_HOST_* with an explicit safe default
+# at script-load time (these mirror probe_host's "nothing detected" state).
+# probe_host then overwrites them with real values. Any future read — pre- or
+# post-probe — is now well-defined.
+: "${LUCEBOX_HOST_NPROC:=1}"
+: "${LUCEBOX_HOST_RAM_GB:=0}"
+: "${LUCEBOX_HOST_GPU_VENDOR:=none}"
+: "${LUCEBOX_HOST_GPU_NAME:=}"
+: "${LUCEBOX_HOST_GPU_COUNT:=0}"
+: "${LUCEBOX_HOST_VRAM_GB:=0}"
+: "${LUCEBOX_HOST_GPU_SM:=}"
+: "${LUCEBOX_HOST_DRIVER_VERSION:=}"
+: "${LUCEBOX_HOST_DRIVER_MAJOR:=0}"
+: "${LUCEBOX_HOST_HAS_SYSTEMD:=0}"
+: "${LUCEBOX_HOST_IS_WSL:=0}"
+: "${LUCEBOX_HOST_HAS_DOCKER:=0}"
+: "${LUCEBOX_HOST_DOCKER_VERSION:=}"
+: "${LUCEBOX_HOST_HAS_CTK:=none}"
+# Host-identity facts (item 1 — host-identity capture). These ride along
+# the existing LUCEBOX_HOST_* convoy into the container so /opt/lucebox-hub/
+# HOST_INFO can be written without re-probing inside the container (where
+# /proc and nvidia-smi see the container's view, not the rig's).
+: "${LUCEBOX_HOST_OS_PRETTY:=}"
+: "${LUCEBOX_HOST_KERNEL:=}"
+: "${LUCEBOX_HOST_WSL_VERSION:=}"
+: "${LUCEBOX_HOST_NVIDIA_CTK_VERSION:=}"
+: "${LUCEBOX_HOST_CPU_MODEL:=}"
+: "${LUCEBOX_HOST_GPU_LIST_CSV:=}"
+: "${LUCEBOX_HOST_CUDA_VISIBLE_DEVICES:=}"
+# Tracks whether probe_host has actually run; pieces of the code that need
+# fresh host facts (e.g. cmd_check, cmd_serve) gate on this. Default 0.
+: "${_LUCEBOX_HOST_PROBED:=0}"
+
+# ── output helpers ────────────────────────────────────────────────────────
+C_INFO=''; C_OK=''; C_WARN=''; C_ERR=''; C_DIM=''; C_RST=''   # plain unless stdout is a tty and NO_COLOR is empty
+if [ -t 1 ] && [ -z "${NO_COLOR:-}" ]; then
+    C_INFO='\033[1;34m'; C_OK='\033[1;32m'; C_WARN='\033[1;33m'
+    C_ERR='\033[1;31m'; C_DIM='\033[2m'; C_RST='\033[0m'
+fi
+
+_tagged() { printf '%b%s%b%s%s\n' "$1" "$2" "$C_RST" "$3" "$4"; }  # colour, tag, padding, message
+info()  { _tagged "$C_INFO" '[INFO]'  '  '   "$*"; }
+ok()    { _tagged "$C_OK"   '[OK]'    '    ' "$*"; }
+warn()  { _tagged "$C_WARN" '[WARN]'  '  '   "$*"; }
+err()   { _tagged "$C_ERR"  '[ERROR]' ' '    "$*" >&2; }
+hint()  { printf '       %b%s%b\n'  "$C_DIM"  "$*"     "$C_RST"; }
+die()   { err "$*"; exit 1; }
+
+# ── host probing ──────────────────────────────────────────────────────────
+# Sets the LUCEBOX_HOST_* variables consumed by the in-container Python CLI
+# (passed through with -e). The Python side trusts these and doesn't reprobe
+# — it can't see the host's /proc anyway, only the container's.
+
+probe_host() {
+    local mem_bytes q mem_mib cc   # scratch values; keep them out of the global scope
+    LUCEBOX_HOST_NPROC=$(nproc 2>/dev/null || echo 1)
+    # RAM: try Linux /proc/meminfo first, then macOS/BSD sysctl, else 0.
+    LUCEBOX_HOST_RAM_GB=0
+    if [ -r /proc/meminfo ]; then
+        LUCEBOX_HOST_RAM_GB=$(awk '/MemTotal/{printf "%.0f", $2/1024/1024}' /proc/meminfo 2>/dev/null || echo 0)
+    elif command -v sysctl &>/dev/null; then
+        mem_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
+        LUCEBOX_HOST_RAM_GB=$(( mem_bytes / 1024 / 1024 / 1024 ))
+    fi
+    LUCEBOX_HOST_GPU_VENDOR="none"
+    LUCEBOX_HOST_GPU_NAME=""
+    LUCEBOX_HOST_GPU_COUNT=0
+    LUCEBOX_HOST_VRAM_GB=0
+    LUCEBOX_HOST_GPU_SM=""
+    LUCEBOX_HOST_DRIVER_VERSION=""
+    LUCEBOX_HOST_DRIVER_MAJOR=0
+
+    if command -v nvidia-smi &>/dev/null; then
+        if q=$(nvidia-smi --query-gpu=name,memory.total,driver_version,compute_cap \
+                          --format=csv,noheader,nounits 2>/dev/null) && [ -n "$q" ]; then
+            LUCEBOX_HOST_GPU_VENDOR="nvidia"
+            LUCEBOX_HOST_GPU_NAME=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $1}')
+            mem_mib=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $2}')
+            # A non-numeric memory.total (e.g. "[N/A]") would be a fatal arithmetic error below; use 0.
+            case "$mem_mib" in ''|*[!0-9]*) mem_mib=0 ;; esac
+            LUCEBOX_HOST_VRAM_GB=$((mem_mib / 1024))
+            LUCEBOX_HOST_DRIVER_VERSION=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $3}')
+            LUCEBOX_HOST_DRIVER_MAJOR=${LUCEBOX_HOST_DRIVER_VERSION%%.*}
+            cc=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $4}')
+            LUCEBOX_HOST_GPU_SM="${cc//./}"
+            LUCEBOX_HOST_GPU_COUNT=$(printf '%s\n' "$q" | wc -l)
+        fi
+        # Multi-GPU enumeration for /props.host. The single-GPU vars
+        # above (GPU_NAME / GPU_SM / VRAM_GB / DRIVER_VERSION) keep
+        # describing GPU 0 for back-compat with cmd_check + autotune;
+        # the full per-GPU CSV rides along separately so HOST_INFO can
+        # emit the whole array.
+        LUCEBOX_HOST_GPU_LIST_CSV=$(nvidia-smi \
+            --query-gpu=index,uuid,pci.bus_id,name,compute_cap,memory.total,power.limit \
+            --format=csv,noheader 2>/dev/null || echo "")
+    fi
+    # CUDA_VISIBLE_DEVICES from the caller's env (empty default = "all GPUs").
+    LUCEBOX_HOST_CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-}"
+
+    # OS / kernel identity. /etc/os-release is the freedesktop spec for
+    # "what distro is this?" and we keep PRETTY_NAME verbatim (it already
+    # includes the version, e.g. "Ubuntu 22.04.3 LTS").
+    LUCEBOX_HOST_OS_PRETTY=""
+    if [ -r /etc/os-release ]; then
+        # shellcheck source=/dev/null
+        LUCEBOX_HOST_OS_PRETTY=$(. /etc/os-release 2>/dev/null && printf '%s' "${PRETTY_NAME:-}") || true
+    fi
+    LUCEBOX_HOST_KERNEL=$(uname -r 2>/dev/null || echo "")
+
+    # WSL version detection. "wsl2" matches the kernel-side string the
+    # MS-shipped WSL2 kernel embeds; "wsl1" is what the legacy translation
+    # layer writes. Anything else stays empty (= not WSL).
+    LUCEBOX_HOST_WSL_VERSION=""
+    if [ -r /proc/version ]; then
+        if grep -q "microsoft-standard-WSL2" /proc/version 2>/dev/null; then
+            LUCEBOX_HOST_WSL_VERSION="wsl2"
+        elif grep -qi "Microsoft" /proc/version 2>/dev/null; then
+            LUCEBOX_HOST_WSL_VERSION="wsl1"
+        fi
+    fi
+
+    # CPU model — first "model name" hit in /proc/cpuinfo. Cheaper than
+    # lscpu and keeps the bash side dep-free.
+    LUCEBOX_HOST_CPU_MODEL=""
+    if [ -r /proc/cpuinfo ]; then
+        LUCEBOX_HOST_CPU_MODEL=$(awk -F': ' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || echo "")
+    fi
+
+    LUCEBOX_HOST_HAS_SYSTEMD=0
+    if command -v systemctl &>/dev/null && systemctl --user show-environment &>/dev/null; then
+        LUCEBOX_HOST_HAS_SYSTEMD=1
+    fi
+
+    LUCEBOX_HOST_IS_WSL=0
+    if grep -qi microsoft /proc/version 2>/dev/null \
+       || [ -e /proc/sys/fs/binfmt_misc/WSLInterop ]; then
+        LUCEBOX_HOST_IS_WSL=1
+    fi
+
+    LUCEBOX_HOST_HAS_DOCKER=0
+    LUCEBOX_HOST_DOCKER_VERSION=""
+    if command -v docker &>/dev/null && docker ps &>/dev/null; then
+        LUCEBOX_HOST_HAS_DOCKER=1
+        LUCEBOX_HOST_DOCKER_VERSION=$(timeout 5 docker version --format '{{.Server.Version}}' 2>/dev/null || echo "")
+    fi
+
+    LUCEBOX_HOST_HAS_CTK="none"
+    if [ "$LUCEBOX_HOST_HAS_DOCKER" = "1" ]; then
+        if command -v nvidia-container-runtime &>/dev/null; then
+            LUCEBOX_HOST_HAS_CTK="runtime"
+        elif command -v nvidia-ctk &>/dev/null \
+             && nvidia-ctk cdi list 2>/dev/null | grep -q 'nvidia.com/gpu'; then
+            LUCEBOX_HOST_HAS_CTK="cdi"
+        elif command -v nvidia-ctk &>/dev/null; then
+            LUCEBOX_HOST_HAS_CTK="installed-unwired"
+        fi
+    fi
+
+    # NVIDIA Container Toolkit version (best-effort; empty when nvidia-ctk
+    # is not installed). nvidia-ctk --version prints "NVIDIA Container
+    # Toolkit CLI version 1.16.2" on a single line — extract the trailing
+    # token so the host-info JSON carries just the version, not the banner.
+    LUCEBOX_HOST_NVIDIA_CTK_VERSION=""
+    if command -v nvidia-ctk &>/dev/null; then
+        LUCEBOX_HOST_NVIDIA_CTK_VERSION=$(nvidia-ctk --version 2>/dev/null \
+            | awk '/version/{print $NF; exit}' \
+            || echo "")
+    fi
+
+    export LUCEBOX_HOST_NPROC LUCEBOX_HOST_RAM_GB LUCEBOX_HOST_GPU_VENDOR
+    export LUCEBOX_HOST_GPU_NAME LUCEBOX_HOST_GPU_COUNT LUCEBOX_HOST_VRAM_GB
+    export LUCEBOX_HOST_GPU_SM LUCEBOX_HOST_DRIVER_VERSION LUCEBOX_HOST_DRIVER_MAJOR
+    export LUCEBOX_HOST_HAS_SYSTEMD LUCEBOX_HOST_IS_WSL
+    export LUCEBOX_HOST_HAS_DOCKER LUCEBOX_HOST_DOCKER_VERSION
+    export LUCEBOX_HOST_HAS_CTK
+    export LUCEBOX_HOST_OS_PRETTY LUCEBOX_HOST_KERNEL LUCEBOX_HOST_WSL_VERSION
+    export LUCEBOX_HOST_NVIDIA_CTK_VERSION LUCEBOX_HOST_CPU_MODEL
+    export LUCEBOX_HOST_GPU_LIST_CSV LUCEBOX_HOST_CUDA_VISIBLE_DEVICES
+    _LUCEBOX_HOST_PROBED=1
+}
+
+# Cheap idempotency wrapper. Anything that needs real host facts (vs the safe
+# defaults seeded at script-load) calls this. Subcommands that go straight to
+# `systemctl`/`journalctl` no longer need to remember to call probe_host.
+ensure_probed() {
+    [ "$_LUCEBOX_HOST_PROBED" = "1" ] || probe_host
+}
+
+pick_variant() {
+    # Prints the image variant. "cuda12" (CUDA 12.8) is the only variant
+    # supported on this branch, but the value still goes through the env >
+    # config.toml > default ladder so `config set image.variant=...` propagates.
+    _lucebox_resolve "${LUCEBOX_VARIANT:-}" image.variant "cuda12"
+}
+
+# ── prereq checks (host-only) ─────────────────────────────────────────────
+# Print-and-exit on anything that needs root to install. The Python CLI does
+# the richer reporting; this is the bare minimum to make `docker run` viable.
+
+require_host_prereqs() {
+    local failed=0
+    if ! command -v docker >/dev/null 2>&1; then
+        err "docker is not installed"
+        hint "Install: https://docs.docker.com/engine/install/"
+        failed=1
+    elif ! docker ps >/dev/null 2>&1; then
+        err "docker daemon not reachable"
+        hint "sudo systemctl start docker   (or: add your user to the 'docker' group, then re-login)"
+        failed=1
+    fi
+    # The driver check always runs too, so one pass reports every missing prereq.
+    if ! command -v nvidia-smi >/dev/null 2>&1; then
+        err "nvidia-smi not found — no NVIDIA driver detected"
+        hint "Install the NVIDIA driver: https://www.nvidia.com/Download/index.aspx"
+        failed=1
+    elif ! nvidia-smi --query-gpu=name --format=csv,noheader >/dev/null 2>&1; then
+        err "nvidia-smi present but NVML calls fail — likely a driver/library mismatch"
+        hint "Reboot, or reinstall the matching NVIDIA driver package"
+        failed=1
+    fi
+    # Exit only after both checks have printed their findings.
+    if [ "$failed" != "0" ]; then exit 1; fi
+}
+
+require_ctk() {
+    ensure_probed   # as require_systemd does: never judge the seeded "none" placeholder
+    case "$LUCEBOX_HOST_HAS_CTK" in
+        runtime|cdi) return 0 ;;
+        installed-unwired)
+            err "NVIDIA Container Toolkit installed but not wired into docker"
+            hint "sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker"
+            hint "  or generate a CDI spec: sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml" ;;
+        *)
+            err "NVIDIA Container Toolkit not installed"
+            hint "Install: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html"
+            hint "Then register with docker:"
+            hint "  sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker" ;;
+    esac
+    exit 1   # every state other than runtime|cdi is fatal
+}
+
+require_systemd() {
+    # Earlier versions of this wrapper had `start`/`stop`/`logs`/etc. drop
+    # straight into cmd_systemctl_passthrough without probing first, which
+    # tripped `set -u` on the reference below. Two layers of defence now:
+    #   1) top-of-script seeds LUCEBOX_HOST_HAS_SYSTEMD=0 unconditionally, so
+    #      no read can be unbound even if probe_host is bypassed entirely.
+    #   2) ensure_probed runs probe_host on first call so we still get the
+    #      real answer for the require_systemd error path.
+    ensure_probed
+    if [ "$LUCEBOX_HOST_HAS_SYSTEMD" != "1" ]; then
+        err "user systemd is not available — required for $1"
+        hint "On WSL: set 'systemd=true' under [boot] in /etc/wsl.conf, then 'wsl --shutdown'."
+        hint "Otherwise: install systemd, or run '$SCRIPT_NAME serve' to run in the foreground without systemd."
+        exit 1
+    fi
+}
+
+# ── docker run construction ───────────────────────────────────────────────
+# All the Python-CLI subcommands share the same docker run incantation:
+# mount the host docker socket (so the in-container CLI can spawn server /
+# bench containers on the host daemon), mount $HOME at the same path (so
+# paths look identical in and out), and pass host facts via env. When an
+# NVIDIA GPU is detected we also pass --gpus all so the orchestrator can
+# call nvidia-smi during profile snapshot export; without it nvidia_smi_csv (and
+# any downstream power/utilization fields) come back empty.
+
+DOCKER_SOCK_PATH="${DOCKER_HOST:-/var/run/docker.sock}"
+DOCKER_SOCK_PATH="${DOCKER_SOCK_PATH#unix://}"
+
+# Append `-e LUCEBOX_HOST_<x>=<val>` for every exported host fact onto the
+# named docker-argv array (bash 4.3+ nameref). The Python side reads these
+# instead of reprobing — see build_orchestrator_argv / cmd_exec_in_container.
+_append_host_env() {
+    # shellcheck disable=SC2178  # nameref to a caller's array, not a string
+    local -n _arr="$1"
+    local _host_var
+    while IFS= read -r _host_var; do
+        _arr+=(-e "$_host_var=${!_host_var}")
+    done < <(compgen -e | grep '^LUCEBOX_HOST_' || true)   # exported names only
+}
+
+# Append the LUCEBOX_* scalar overrides (image/variant/port/container/models)
+# plus the optional HF_TOKEN guard onto the named docker-argv array. Shared
+# by the docker-run (build_orchestrator_argv) and docker-exec
+# (cmd_exec_in_container) paths so both forward an identical env subset.
+_append_scalar_env() {
+    # $1 — NAME of the caller's docker-argv array (extended in place).
+    # $2 — image variant/tag, forwarded as LUCEBOX_VARIANT.
+    # shellcheck disable=SC2178  # nameref to a caller's array, not a string
+    local -n _arr="$1"
+    local variant="$2"
+    _arr+=(
+        -e "LUCEBOX_IMAGE=$IMAGE_BASE"
+        -e "LUCEBOX_VARIANT=$variant"
+        -e "LUCEBOX_PORT=$DEFAULT_PORT"
+        -e "LUCEBOX_CONTAINER=$CONTAINER_NAME"
+        -e "LUCEBOX_MODELS=$DEFAULT_MODELS_DIR"
+    )
+    # HF_TOKEN is forwarded only when it is set and non-empty.
+    if [ -n "${HF_TOKEN:-}" ]; then
+        _arr+=(-e "HF_TOKEN=$HF_TOKEN")
+    fi
+    return 0
+}
+
+# Pick docker's interactive flags: -it on a real tty, -i otherwise.
+# Writes into a caller-supplied array via nameref. This MUST run in the
+# caller's scope (not a subshell or `< <(...)` process substitution): the
+# `[ -t 1 ]` test inspects fd 1, and inside a process substitution fd 1 is
+# the pipe to the consumer, not the terminal — which would force -i even on
+# a real tty and break the interactive client TUIs (lucebox claude, etc.).
+_set_tty_flags() {  # usage: _set_tty_flags arrayname
+    # shellcheck disable=SC2178
+    local -n _a="$1"
+    if [ -t 0 ] && [ -t 1 ]; then
+        _a=(-it)
+    else
+        _a=(-i)
+    fi
+}
+
+build_orchestrator_argv() {
+    # Print, one element per line on stdout, the `docker run` argv for an
+    # ephemeral orchestrator container.
+    #   $1   — image variant (tag)
+    #   $2…  — arguments handed to the in-container `lucebox` subcommand
+    # Callers read the result back with `mapfile -t`, so an element that
+    # itself contains a newline would be split into two.
+    # NOTE(review): the call sites visible in this file invoke this function
+    # inside `< <(...)`, where fd 1 is the pipe — the tty probe below then
+    # yields -i even when the user is on a real terminal.
+    local variant="$1"; shift
+    local tty=()
+    _set_tty_flags tty
+    local argv=(docker run --rm "${tty[@]}")
+    # GPU passthrough only when the host probe reported an NVIDIA GPU.
+    [ "${LUCEBOX_HOST_GPU_VENDOR:-none}" != "nvidia" ] || argv+=(--gpus all)
+    argv+=(
+        --name "${CONTAINER_NAME}-cli-$$"
+        --user "$(id -u):$(id -g)"
+    )
+    # Bind-mount the docker socket only when DOCKER_SOCK_PATH really is a
+    # unix socket on this host. With DOCKER_HOST=tcp://… or ssh://… the
+    # variable still holds that URL, the -S test fails, and the mount (which
+    # docker would reject as invalid) is skipped. The orchestrator only
+    # needs docker access for some subcommands, so dropping the mount when
+    # the host talks to docker over TCP/SSH is fine.
+    if [ -S "$DOCKER_SOCK_PATH" ]; then
+        argv+=(
+            --group-add "$(stat -c '%g' "$DOCKER_SOCK_PATH")"
+            -v "$DOCKER_SOCK_PATH:/var/run/docker.sock"
+        )
+    fi
+    argv+=(-v "$HOME:$HOME")
+    # The models dir is mounted at the same path inside and out. The $HOME
+    # mount already covers it when XDG_DATA_HOME is unset; the explicit -v
+    # matters when the user points XDG_DATA_HOME outside $HOME. Create it
+    # first so the mount source exists.
+    mkdir -p "$DEFAULT_MODELS_DIR"
+    argv+=(
+        -v "$DEFAULT_MODELS_DIR:$DEFAULT_MODELS_DIR"
+        -w "$PWD"
+        -e "HOME=$HOME"
+    )
+    # Host facts — the Python side reads these instead of reprobing.
+    _append_host_env argv
+    # Image/port/container/models scalars + HF_TOKEN. The resolved models
+    # dir is always exported so the in-container CLI sees the same path the
+    # wrapper mounts.
+    _append_scalar_env argv "$variant"
+
+    # Image reference, then the entrypoint subcommand: `lucebox` is handled
+    # by server/scripts/entrypoint.sh, which execs `python -m lucebox` with
+    # whatever args we pass on.
+    argv+=("${IMAGE_BASE}:${variant}" lucebox "$@")
+    printf '%s\n' "${argv[@]}"
+}
+
+# ── subcommand implementations ────────────────────────────────────────────
+
+cmd_serve() {
+    # Long-running foreground server. Also what systemd's ExecStart= calls.
+    #
+    # Two-stage so config.toml takes effect:
+    #   1. Run an ephemeral orchestrator container that emits the canonical
+    #      server docker-run argv from .lucebox/config.toml (one arg per
+    #      line on stdout).
+    #   2. Exec that argv.
+    #
+    # If stage 1 fails (image not pulled yet, no config), fall back to a
+    # conservative docker run — the container's own VRAM-tiered autotune
+    # picks reasonable defaults from there.
+    #
+    # Returns the exit status of the tracked `docker run`; exits 1 when the
+    # slot is already taken by a systemd unit or a running container.
+    require_host_prereqs
+    ensure_probed
+    require_ctk
+    local variant
+    variant=$(pick_variant)
+
+    # Pre-flight: refuse to stomp on something that's already serving this
+    # slot. Three states to distinguish, because silently `docker rm -f`-ing
+    # whatever is there hides real bugs (e.g. the user forgot they had a
+    # systemd unit up, and we'd happily race two servers on the same port):
+    #
+    #   1. systemd unit active           → refuse, redirect to `logs`/`stop`
+    #   2. container running (no systemd)→ refuse, redirect to `docker logs`
+    #   3. container present but stopped → orphan from a SIGKILLed previous
+    #      run (docker run --rm only cleans up on clean exit). Remove it,
+    #      but TELL the user — they need to know their last run died dirty.
+    # CRITICAL: when systemd invokes US as the unit's ExecStart, is-active
+    # returns true *because of us* — refusing here would deadlock the unit
+    # in a restart loop (and historically did — commit a30dbe5 shipped this
+    # bug). systemd sets $INVOCATION_ID in every service exec, so its
+    # presence is the unambiguous "I am running as the systemd ExecStart"
+    # signal. Skip the unit-active check in that case; the container-state
+    # check below still catches a stale container holding the slot.
+    if [ -z "${INVOCATION_ID:-}" ] \
+       && systemctl --user is-active --quiet "$UNIT_NAME" 2>/dev/null; then
+        err "${UNIT_NAME} is already running under systemd."
+        hint "  $SCRIPT_NAME logs            # follow the journal"
+        hint "  $SCRIPT_NAME restart         # bounce the service"
+        hint "  $SCRIPT_NAME stop            # stop the service"
+        exit 1
+    fi
+    # Branch on docker inspect's exit status rather than appending a
+    # fallback word to its output: some docker CLI versions write a blank
+    # line to stdout even when the object does not exist, which would turn
+    # the captured value into "<newline>absent" and send a perfectly clean
+    # start down the "unexpected state" branch below.
+    local container_state
+    if ! container_state=$(docker inspect --format '{{.State.Status}}' "$CONTAINER_NAME" 2>/dev/null); then
+        container_state=absent
+    fi
+    case "$container_state" in
+        absent)
+            ;;
+        running|restarting)
+            err "Container '$CONTAINER_NAME' is already running (outside systemd)."
+            hint "  docker logs -f $CONTAINER_NAME    # follow output"
+            hint "  $SCRIPT_NAME stop                # stop it"
+            exit 1
+            ;;
+        exited|created|paused|dead)
+            info "Removing stale '$CONTAINER_NAME' container (state=$container_state, likely from a previous unclean exit)"
+            docker rm -f "$CONTAINER_NAME" >/dev/null
+            ;;
+        *)
+            warn "Container '$CONTAINER_NAME' is in unexpected state '$container_state' — removing"
+            docker rm -f "$CONTAINER_NAME" >/dev/null
+            ;;
+    esac
+
+    local orch_argv server_argv
+    mapfile -t orch_argv < <(build_orchestrator_argv "$variant" print-serve-argv)
+
+    # mapfile's own status says nothing about the orchestrator's exit code,
+    # so the result is validated by shape: non-empty and starting with
+    # `docker`.
+    if mapfile -t server_argv < <("${orch_argv[@]}" 2>/dev/null) \
+       && [ "${#server_argv[@]}" -gt 0 ] \
+       && [ "${server_argv[0]}" = "docker" ]; then
+        info "Starting lucebox server (variant=$variant, from config.toml)"
+        _serve_and_track "${server_argv[@]}"
+        return $?
+    fi
+
+    warn "Couldn't fetch server argv from container (image not pulled?) — using fallback"
+    info "Starting lucebox server (variant=$variant, port=$DEFAULT_PORT, defaults only)"
+    local fallback_models="$DEFAULT_MODELS_DIR"
+    mkdir -p "$fallback_models"
+    # Forward host facts even on the fallback path so the in-container
+    # entrypoint can still write /opt/lucebox-hub/HOST_INFO from the host's
+    # view of the rig. Matches the orchestrator path (see
+    # build_orchestrator_argv) — without it, HOST_INFO would be written
+    # with "source: unknown" any time print-serve-argv fails.
+    local fallback_argv=(docker run --rm
+        --name "$CONTAINER_NAME"
+        --gpus all
+        -p "$DEFAULT_PORT:8080"
+        -v "$HOME:$HOME"
+        -v "$fallback_models:/opt/lucebox-hub/server/models")
+    _append_host_env fallback_argv
+    fallback_argv+=("${IMAGE_BASE}:${variant}")
+    _serve_and_track "${fallback_argv[@]}"
+}
+
+# Foreground server runner with controlling-process lifetime semantics:
+# the docker daemon owns containers independently of the CLI, so a bare
+# `exec docker run` leaves the container alive after the wrapper's parent
+# (a terminal, a systemd unit, anything) goes away. `docker run --rm` only
+# cleans up on the container's own clean exit, not on our death.
+#
+# Fix: run docker as a child, install signal traps that issue `docker stop`
+# before exiting. Now `lucebox serve` behaves like a normal foreground
+# program — close the terminal, kill the wrapper, send SIGTERM from
+# systemd, the container goes down with it.
+#
+# Stops also from EXIT so even a `set -e` propagation cleans up.
+_serve_and_track() {
+    # "$@" — the complete docker argv to run (first element is `docker`).
+    # Returns whatever `wait` reports for that child: the docker exit
+    # status, or a value above 128 when a trapped signal interrupted the
+    # wait.
+    "$@" &
+    local docker_pid=$!
+    # Handler shared by the signal and EXIT traps. It is defined after
+    # docker_pid is set and reads it through bash's dynamic scoping.
+    # shellcheck disable=SC2317  # called via trap, not "unreachable"
+    _serve_stop() {
+        # Disarm first so the handler cannot re-enter itself (e.g. EXIT
+        # firing after a signal already ran it).
+        trap - HUP INT TERM EXIT
+        # Best-effort: container may already be exiting / never started.
+        # `docker stop` blocks up to -t seconds for graceful shutdown
+        # (server handles SIGTERM), then SIGKILLs. 10s is enough for the
+        # in-flight request to finish on a typical decode.
+        docker stop -t 10 "$CONTAINER_NAME" >/dev/null 2>&1 || true
+        # Reap the docker client so it is not left behind as a zombie.
+        wait "$docker_pid" 2>/dev/null || true
+    }
+    trap _serve_stop HUP INT TERM EXIT
+    # A trapped signal makes this wait return early; the handler above then
+    # runs before execution continues on the next line.
+    wait "$docker_pid"
+    local rc=$?
+    # Normal completion: clear the traps so they do not outlive this call.
+    trap - HUP INT TERM EXIT
+    return $rc
+}
+
+cmd_systemd_install() {
+    # Write the user systemd unit to $UNIT_PATH, reload the user manager,
+    # and print follow-up hints. Takes no arguments.
+    require_host_prereqs
+    ensure_probed
+    require_systemd "service install"
+    local docker_bin login_user
+    docker_bin=$(command -v docker)
+    # $USER is not guaranteed to be set in every session, and a bare
+    # reference would abort under `set -u`; fall back to the effective
+    # user name.
+    login_user="${USER:-$(id -un)}"
+
+    mkdir -p "$(dirname "$UNIT_PATH")"
+    # Capture the user's resolved env at install time so the unit launches
+    # with the same image/variant/port/models the user expected when they
+    # ran `lucebox install`. Systemd's user-session env is sparse — without
+    # this block, the wrapper inside the unit would fall back to the
+    # in-script defaults and silently pick a different image or models
+    # directory than the user's interactive session uses.
+    #
+    # Each captured assignment is wrapped in double quotes: systemd splits
+    # an unquoted Environment= value on whitespace, so a models directory
+    # (or image override) containing a space would otherwise be truncated
+    # and the remainder parsed as a separate, malformed assignment.
+    #
+    # ExecStartPre cleans up any orphaned container with the target name
+    # left behind by a previous crash (docker's `--rm` only fires on clean
+    # exit — a SIGKILL or daemon restart leaves the name claimed, and the
+    # next ExecStart would die with "name already in use" while systemd
+    # reports a useless "exit code 125").
+    cat > "$UNIT_PATH" <<EOF
+[Unit]
+Description=Lucebox hub LLM inference server
+Documentation=https://github.com/Luce-Org/lucebox-hub
+After=network-online.target docker.service
+Wants=network-online.target docker.service
+
+[Service]
+Type=exec
+Restart=on-failure
+RestartSec=10
+Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+Environment="LUCEBOX_IMAGE=$IMAGE_BASE"
+Environment="LUCEBOX_VARIANT=$(pick_variant)"
+Environment="LUCEBOX_PORT=$DEFAULT_PORT"
+Environment="LUCEBOX_CONTAINER=$CONTAINER_NAME"
+Environment="LUCEBOX_MODELS=$DEFAULT_MODELS_DIR"
+ExecStartPre=-$docker_bin rm -f $CONTAINER_NAME
+ExecStart=$SCRIPT_PATH serve
+ExecStop=$docker_bin stop -t 30 $CONTAINER_NAME
+TimeoutStopSec=45
+
+[Install]
+WantedBy=default.target
+EOF
+    systemctl --user daemon-reload
+    ok "Installed $UNIT_PATH"
+
+    # Check for linger — without it, the unit dies when the user logs out.
+    local linger
+    linger=$(loginctl show-user "$login_user" 2>/dev/null | awk -F= '/^Linger=/{print $2}')
+    if [ "$linger" != "yes" ]; then
+        warn "Linger is off for $login_user — the service will stop when you log out"
+        hint "To enable (requires sudo): sudo loginctl enable-linger \"$login_user\""
+    fi
+
+    printf '\nNext:\n'
+    hint "  $SCRIPT_NAME start            # start now"
+    hint "  $SCRIPT_NAME enable           # start at every login"
+    hint "  $SCRIPT_NAME logs             # follow the journal"
+}
+
+cmd_systemd_uninstall() {
+    # Tear the user unit down in order: stop it if active, disable it if
+    # enabled, delete the unit file, then reload the user manager. Config
+    # and models are deliberately left alone.
+    require_systemd "service uninstall"
+    systemctl --user is-active --quiet "$UNIT_NAME" 2>/dev/null && {
+        info "Stopping $UNIT_NAME"
+        systemctl --user stop "$UNIT_NAME" || true
+    }
+    systemctl --user is-enabled --quiet "$UNIT_NAME" 2>/dev/null && {
+        info "Disabling $UNIT_NAME"
+        systemctl --user disable "$UNIT_NAME" || true
+    }
+    if [ ! -f "$UNIT_PATH" ]; then
+        info "No unit at $UNIT_PATH — nothing to remove"
+    else
+        rm -f "$UNIT_PATH"
+        ok "Removed $UNIT_PATH"
+    fi
+    systemctl --user daemon-reload
+    hint "Config and models are left in place. Remove them by hand if you want."
+}
+
+cmd_systemctl_passthrough() {
+    # Forward one systemctl verb to the user unit.
+    #   $1 — action: start | restart | stop | enable | disable | status
+    # start/restart run systemctl and then verify the unit reached `active`,
+    # exiting 1 with status + journal output when it did not. The other
+    # verbs replace this shell with systemctl (exec), so they do not return.
+    local action="$1"
+    require_systemd "$action"
+    # Every verb needs the unit file; point at `install` when it is missing.
+    if [ ! -f "$UNIT_PATH" ]; then
+        err "$UNIT_NAME is not installed — run '$SCRIPT_NAME install' first"
+        exit 1
+    fi
+    case "$action" in
+        start|restart)
+            # `systemctl start` is fire-and-forget for Type=exec: it returns
+            # success as soon as execve() completes, even if the wrapper
+            # exits 1 a millisecond later. That gave us the worst possible
+            # UX — `lucebox start` reports no error but no container ever
+            # binds port 8080. Poll is-active for a few seconds and dump
+            # status + recent journal lines so the user sees the real cause.
+            local current
+            # is-active exits non-zero for any state other than active;
+            # `|| true` keeps the printed state word without failing.
+            current=$(systemctl --user is-active "$UNIT_NAME" 2>/dev/null || true)
+            # `start` against an already-active unit: systemctl returns 0
+            # silently. That's polite for scripts but confusing for humans
+            # — say so explicitly. For `restart` always run through.
+            if [ "$action" = "start" ] && [ "$current" = "active" ]; then
+                ok "$UNIT_NAME is already active"
+                hint "logs:    $SCRIPT_NAME logs"
+                hint "smoke:   curl -s http://localhost:$DEFAULT_PORT/v1/models"
+                hint "(use \`$SCRIPT_NAME restart\` to bounce, \`$SCRIPT_NAME stop\` to halt)"
+                return 0
+            fi
+            # `start` against a unit stuck in restart-loop ("activating") is
+            # the symptom of a broken ExecStart — calling start would just
+            # block waiting for active that never comes. Surface this
+            # specifically so the user goes to `lucebox logs` to find the
+            # exit reason rather than waiting for the poll to give up.
+            if [ "$action" = "start" ] && [ "$current" = "activating" ]; then
+                err "$UNIT_NAME is in restart-loop (state=activating)"
+                hint "the unit is failing and being auto-restarted by systemd"
+                hint "  $SCRIPT_NAME stop          # halt the loop first"
+                hint "  $SCRIPT_NAME logs          # find the exit reason"
+                exit 1
+            fi
+            info "$action $UNIT_NAME"
+            if ! systemctl --user "$action" "$UNIT_NAME"; then
+                err "systemctl --user $action $UNIT_NAME failed"
+                systemctl --user status "$UNIT_NAME" --no-pager -n 30 || true
+                exit 1
+            fi
+            # Poll up to ten times, one second apart, while the unit reports
+            # `activating`.
+            # NOTE(review): the loop stops on the first `active` reading, so
+            # a wrapper that dies shortly after that reading is not caught
+            # here.
+            local i state
+            for i in 1 2 3 4 5 6 7 8 9 10; do
+                state=$(systemctl --user is-active "$UNIT_NAME" 2>/dev/null || true)
+                case "$state" in
+                    active) break ;;     # already up — no need to keep polling
+                    activating) ;;       # still booting; keep waiting
+                    *) break ;;          # failed / inactive — fall through to error path
+                esac
+                sleep 1
+            done
+            # Re-read once after the loop so the verdict reflects the state
+            # now, including the case where all ten polls saw `activating`.
+            state=$(systemctl --user is-active "$UNIT_NAME" 2>/dev/null || true)
+            if [ "$state" != "active" ]; then
+                err "$UNIT_NAME did not reach active state (current: ${state:-unknown})"
+                if [ "$state" = "activating" ]; then
+                    hint "the unit is in a restart loop — \`$SCRIPT_NAME stop\` to halt it"
+                fi
+                hint "status:"
+                systemctl --user status "$UNIT_NAME" --no-pager -n 30 || true
+                hint "recent journal:"
+                journalctl --user -u "$UNIT_NAME" -n 30 --no-pager || true
+                exit 1
+            fi
+            ok "$UNIT_NAME is active"
+            hint "logs:    $SCRIPT_NAME logs"
+            hint "smoke:   curl -s http://localhost:$DEFAULT_PORT/v1/models"
+            ;;
+        stop|enable|disable)
+            # Plain passthrough: systemctl's own output and exit status
+            # become ours.
+            exec systemctl --user "$action" "$UNIT_NAME" ;;
+        status)
+            exec systemctl --user status "$UNIT_NAME" --no-pager ;;
+        *)
+            die "unknown systemctl passthrough: $action" ;;
+    esac
+}
+
+cmd_logs() {
+    require_systemd "logs"
+    # Pure passthrough: any flags the user wants (-f, -n, --since, ...) go
+    # straight to journalctl. Default is follow.
+    if [ $# -eq 0 ]; then
+        exec journalctl --user -u "$UNIT_NAME" -f
+    fi
+    exec journalctl --user -u "$UNIT_NAME" "$@"
+}
+
+cmd_pull() {
+    # The pull runs on the host on purpose: doing it from inside a container
+    # risks docker starting an old local tag before the fresh one has been
+    # fetched. Replaces this shell with `docker pull` (does not return).
+    require_host_prereqs
+    local variant image_ref
+    variant=$(pick_variant)
+    image_ref="${IMAGE_BASE}:${variant}"
+    info "Pulling $image_ref"
+    exec docker pull "$image_ref"
+}
+
+cmd_update() {
+    # Re-run the bootstrap installer against the channel we were installed
+    # from. The installer is the source of truth for "how do you install
+    # lucebox correctly" — chmod, atomic mv, validation, baking the source
+    # URL back into the new copy so the channel is preserved across
+    # upgrades. Keeping the logic in install.sh means it can evolve
+    # independently (sha verify, signature check, etc.) and the installed
+    # `lucebox update` picks those changes up on the next run.
+    #
+    # The installer URL is derived from LUCEBOX_INSTALLED_FROM by swapping
+    # `lucebox.sh` → `install.sh` in the same directory, so forks don't
+    # need a separate registration. Override the source channel via
+    # $LUCEBOX_INSTALL_URL (e.g. to switch from canonical to a dev fork).
+    #
+    # Dies (via `die`) when the source URL has the wrong shape, when the
+    # installer cannot be downloaded, or when the installer exits non-zero.
+    local source_url installer_url target installer_script
+    source_url="${LUCEBOX_INSTALL_URL:-$LUCEBOX_INSTALLED_FROM}"
+    if [[ "$source_url" != */lucebox.sh ]]; then
+        die "LUCEBOX_INSTALLED_FROM doesn't end in /lucebox.sh: $source_url"
+    fi
+    installer_url="${source_url%/lucebox.sh}/install.sh"
+    target=$(realpath "$SCRIPT_PATH")
+
+    info "Updating lucebox via $installer_url"
+    info "  source: $source_url"
+    info "  target: $target"
+
+    # Download first, run second. With the fetch inlined as
+    # `bash -c "$(curl ...)"`, a failed download expands to an empty
+    # script, `bash -c ""` exits 0, and the update would report success
+    # without having done anything.
+    installer_script=$(curl -fsSL "$installer_url") \
+        || die "update failed (could not download $installer_url)"
+    [ -n "$installer_script" ] \
+        || die "update failed (installer at $installer_url is empty)"
+
+    # Pass the URLs through to install.sh via env. The installer reads
+    # $LUCEBOX_INSTALL_URL (which we set to source_url) and
+    # $LUCEBOX_INSTALL_DEST (the realpath of *this* file, so a symlinked
+    # install replaces the actual file behind the link).
+    LUCEBOX_INSTALL_URL="$source_url" \
+    LUCEBOX_INSTALL_DEST="$target" \
+        bash -c "$installer_script" \
+            || die "update failed (installer exited non-zero)"
+}
+
+cmd_completion() {
+    # Print shell completion script for bash / zsh / fish. Usage:
+    #
+    #   # bash  (in ~/.bashrc):
+    #   source <(lucebox completion bash)
+    #
+    #   # zsh  (in ~/.zshrc; the emitted script runs compinit and
+    #   #       bashcompinit itself):
+    #   source <(lucebox completion zsh)
+    #
+    #   # fish:
+    #   lucebox completion fish | source
+    #
+    # Keep this in sync with the dispatch table in main() and the sub-app
+    # verbs (config get/set/unset, models list/download). Adding a new
+    # top-level command means adding it here too.
+    #
+    #   $1 — target shell: bash | zsh | fish. Empty, --help or -h prints a
+    #        short usage text; anything else dies.
+    local shell="${1:-}"
+    case "$shell" in
+        bash)
+            # Quoted delimiter: the script below is emitted verbatim, with
+            # no expansion by this wrapper.
+            cat <<'BASH'
+# lucebox bash completion. Source from ~/.bashrc:
+#   source <(lucebox completion bash)
+_lucebox_complete() {
+    local cur prev cmds config_verbs models_verbs completion_shells
+    COMPREPLY=()
+    cur="${COMP_WORDS[COMP_CWORD]}"
+    prev="${COMP_WORDS[COMP_CWORD-1]}"
+    cmds="install uninstall start stop restart enable disable status logs \
+          serve pull update check completion config models \
+          print-run help version"
+    config_verbs="get set unset"
+    models_verbs="list download"
+    completion_shells="bash zsh fish"
+
+    # Sub-app verbs / shell args.
+    case "$prev" in
+        config)     COMPREPLY=( $(compgen -W "$config_verbs"     -- "$cur") ); return ;;
+        models)     COMPREPLY=( $(compgen -W "$models_verbs"     -- "$cur") ); return ;;
+        completion) COMPREPLY=( $(compgen -W "$completion_shells" -- "$cur") ); return ;;
+    esac
+
+    # Top-level command.
+    if [ "$COMP_CWORD" = 1 ]; then
+        COMPREPLY=( $(compgen -W "$cmds" -- "$cur") )
+        return
+    fi
+}
+complete -F _lucebox_complete lucebox lucebox.sh
+BASH
+            ;;
+        zsh)
+            # Bash-compat shim: zsh sources our bash completion through
+            # bashcompinit. Users who prefer native zsh _arguments-style
+            # completion can write their own; this gets `<TAB>` working
+            # in two lines for free.
+            cat <<'ZSH'
+# lucebox zsh completion. Source from ~/.zshrc (after compinit):
+#   source <(lucebox completion zsh)
+autoload -Uz compinit bashcompinit
+compinit
+bashcompinit
+ZSH
+            # The zsh output is this preamble followed by the bash script.
+            cmd_completion bash
+            ;;
+        fish)
+            cat <<'FISH'
+# lucebox fish completion. Source from ~/.config/fish/config.fish:
+#   lucebox completion fish | source
+complete -c lucebox -f
+set -l __lucebox_cmds install uninstall start stop restart enable disable \
+    status logs serve pull update check completion config models \
+    print-run help version
+for cmd in $__lucebox_cmds
+    complete -c lucebox -n "not __fish_seen_subcommand_from $__lucebox_cmds" -a $cmd
+end
+complete -c lucebox -n "__fish_seen_subcommand_from config" -a "get set unset"
+complete -c lucebox -n "__fish_seen_subcommand_from models" -a "list download"
+complete -c lucebox -n "__fish_seen_subcommand_from completion" -a "bash zsh fish"
+FISH
+            ;;
+        ""|--help|-h)
+            # Unquoted delimiter: $SCRIPT_NAME is expanded here, while the
+            # backslash-free `<(` sequences are printed literally.
+            cat <<EOF
+$SCRIPT_NAME completion {bash|zsh|fish}
+
+Emits a shell completion script. Source it from your shell's rc file:
+
+  bash:  source <($SCRIPT_NAME completion bash)
+  zsh:   source <($SCRIPT_NAME completion zsh)
+  fish:  $SCRIPT_NAME completion fish | source
+EOF
+            ;;
+        *)
+            die "unknown shell: $shell — want bash, zsh, or fish" ;;
+    esac
+}
+
+cmd_check() {
+    # Host-only readiness report. Pure shell — never enters the container,
+    # since the point is to verify the host can run the container in the
+    # first place. Reuses probe_host (LUCEBOX_HOST_* env vars) for the
+    # actual detection so the formatting is the only thing here.
+    #
+    # Takes no arguments and prints one row per check to stdout. Failing
+    # rows are only reported; this function contains no explicit exit.
+    ensure_probed
+
+    local variant
+    variant=$(pick_variant)
+
+    # Two-column grid: "  name        ✓  detail" — matches the visual
+    # style of the lucebench preflight output.
+    #
+    # _row STATUS NAME DETAIL — STATUS "1" renders ✓, "warn" renders !, and
+    # anything else renders ✗. It writes the `mark` local declared below.
+    local mark
+    _row() {
+        # Brace every var ref so multi-byte glyphs (✓ ✗) don't get parsed
+        # as part of the identifier — some bash builds with permissive
+        # locales count them as identifier characters and `set -u` then
+        # errors out on the resulting "C_OK✓" / "C_ERR✗" names.
+        if [ "$1" = "1" ]; then mark="${C_OK}✓${C_RST}"
+        elif [ "$1" = "warn" ]; then mark="${C_WARN}!${C_RST}"
+        else mark="${C_ERR}✗${C_RST}"; fi
+        printf '  %-22s %b  %s\n' "$2" "$mark" "$3"
+    }
+
+    echo "[lucebox] host readiness report"
+
+    # docker
+    if [ "$LUCEBOX_HOST_HAS_DOCKER" = "1" ]; then
+        _row 1 "docker daemon" "reachable (server ${LUCEBOX_HOST_DOCKER_VERSION:-?})"
+    elif command -v docker &>/dev/null; then
+        _row 0 "docker daemon" "installed but unreachable — start the daemon or add user to 'docker' group"
+    else
+        _row 0 "docker daemon" "not installed — https://docs.docker.com/engine/install/"
+    fi
+
+    # nvidia container toolkit
+    # (`none|*` is the catch-all: any value not listed above it lands here.)
+    case "$LUCEBOX_HOST_HAS_CTK" in
+        runtime)            _row 1    "nvidia ctk"     "wired into docker (runtime)" ;;
+        cdi)                _row 1    "nvidia ctk"     "wired via CDI (nvidia.com/gpu)" ;;
+        installed-unwired)  _row warn "nvidia ctk"     "installed but not registered with docker — sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker" ;;
+        none|*)             _row 0    "nvidia ctk"     "not installed — https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html" ;;
+    esac
+
+    # nvidia-smi + driver
+    if [ "$LUCEBOX_HOST_GPU_VENDOR" = "nvidia" ]; then
+        # NOTE(review): -ge needs both operands to be integers; presumably
+        # the probe always sets a numeric driver major when the vendor is
+        # nvidia — confirm, since an empty value would make `[` error out
+        # and fall into the else branch.
+        if [ "$LUCEBOX_HOST_DRIVER_MAJOR" -ge "$MIN_DRIVER_CUDA12" ]; then
+            _row 1 "nvidia driver" "$LUCEBOX_HOST_DRIVER_VERSION (≥ $MIN_DRIVER_CUDA12 required for cuda12)"
+        else
+            _row 0 "nvidia driver" "$LUCEBOX_HOST_DRIVER_VERSION (< $MIN_DRIVER_CUDA12 — cuda12 image will fail)"
+        fi
+    elif command -v nvidia-smi &>/dev/null; then
+        _row 0 "nvidia driver" "nvidia-smi present but NVML calls fail — driver/library mismatch, try reboot"
+    else
+        _row 0 "nvidia driver" "nvidia-smi not found — install the NVIDIA driver"
+    fi
+
+    # GPU detail
+    if [ "$LUCEBOX_HOST_GPU_VENDOR" = "nvidia" ]; then
+        _row 1 "gpu" "$LUCEBOX_HOST_GPU_NAME × $LUCEBOX_HOST_GPU_COUNT (sm_$LUCEBOX_HOST_GPU_SM, ${LUCEBOX_HOST_VRAM_GB} GB VRAM)"
+        # cuda12 image arch coverage: sm_75;80;86;89;90;120 (see docker-bake.hcl)
+        case "$LUCEBOX_HOST_GPU_SM" in
+            75|80|86|89|90|120) _row 1    "cuda12 arch" "sm_$LUCEBOX_HOST_GPU_SM covered by image" ;;
+            "")                 _row warn "cuda12 arch" "compute_cap not detected" ;;
+            *)                  _row warn "cuda12 arch" "sm_$LUCEBOX_HOST_GPU_SM not in image arch list (75;80;86;89;90;120)" ;;
+        esac
+    fi
+
+    # systemd
+    if [ "$LUCEBOX_HOST_HAS_SYSTEMD" = "1" ]; then
+        _row 1 "user systemd" "available (needed for '$SCRIPT_NAME install')"
+    elif [ "$LUCEBOX_HOST_IS_WSL" = "1" ]; then
+        _row warn "user systemd" "WSL detected — set 'systemd=true' under [boot] in /etc/wsl.conf, then 'wsl --shutdown'"
+    else
+        _row warn "user systemd" "not available — '$SCRIPT_NAME install' (service unit) won't work; '$SCRIPT_NAME serve' (foreground) will"
+    fi
+
+    # image we'd pull — marked ✗ when the host clearly can't run cuda12
+    # (no nvidia driver, or no CTK wired into docker). It's still useful
+    # to print the line so the user knows what would be pulled, but a
+    # green ✓ would be misleading.
+    if [ "$LUCEBOX_HOST_GPU_VENDOR" != "nvidia" ]; then
+        _row 0 "image" "${IMAGE_BASE}:${variant} — requires NVIDIA driver"
+    elif [ "$LUCEBOX_HOST_HAS_CTK" = "none" ] || [ "$LUCEBOX_HOST_HAS_CTK" = "installed-unwired" ]; then
+        _row 0 "image" "${IMAGE_BASE}:${variant} — needs NVIDIA Container Toolkit wired into docker"
+    else
+        _row 1 "image" "${IMAGE_BASE}:${variant}"
+    fi
+    # RAM / cores (informational)
+    _row 1 "host" "${LUCEBOX_HOST_NPROC} cpus, ${LUCEBOX_HOST_RAM_GB} GB RAM"
+}
+
+cmd_in_container() {
+    # Generic dispatcher: anything that isn't a systemd action goes here.
+    # Runs the in-container Python CLI with the supplied argv.
+    require_host_prereqs
+    ensure_probed
+    # CTK isn't strictly required for every subcommand (e.g. `config get`
+    # or `autotune` only touch local files), but the server-spawning
+    # subcommands need it.
+    # Letting docker error its own way is fine for the no-CTK case.
+    local variant
+    variant=$(pick_variant)
+    local argv
+    mapfile -t argv < <(build_orchestrator_argv "$variant" "$@")
+    exec "${argv[@]}"
+}
+
+# Is the long-running lucebox container currently up? Used by the dispatcher
+# to decide between `docker exec` into it (cheap, shares the running server's
+# network namespace so localhost:8080 reaches the server) vs. `docker run`
+# (cold start, isolated network — can't reach the live server).
+#
+# `docker ps -q -f name=^<CONTAINER>$` prints the container id when running,
+# empty otherwise. The anchored regex avoids matching `lucebox-cli-12345`
+# style ephemeral siblings.
+_lucebox_container_running() {
+    # No docker on PATH → definitely not running. Don't even probe.
+    command -v docker >/dev/null 2>&1 || return 1
+    local id
+    id=$(docker ps -q -f "name=^${CONTAINER_NAME}\$" 2>/dev/null || true)
+    [ -n "$id" ]
+}
+
+# `docker exec` variant of cmd_in_container. Same calling convention, but:
+#   - shares the running container's network namespace (localhost:8080 → the
+#     server), filesystem, and mounts — no bind mounts needed.
+#   - skips the ~1-3s cold-start cost of a fresh `docker run --rm`.
+#   - only safe for steady-state / read-only / config-only subcommands. Any
+#     command that restarts the lucebox service (autotune --sweep, serve)
+#     would kill the very container the exec is in — caller must route those
+#     to cmd_in_container instead.
+#
+# Pass through the same env-var subset the run path uses so the in-container
+# CLI sees consistent overrides whichever route it took: HOME, every
+# LUCEBOX_HOST_*, the image/port/container/models scalars, and HF_TOKEN.
+cmd_exec_in_container() {
+    # Run the in-container CLI through `docker exec` into the live
+    # $CONTAINER_NAME container. Replaces this shell (exec); does not
+    # return.
+    #   "$@" — subcommand and arguments for the in-container `lucebox` CLI
+    require_host_prereqs
+    ensure_probed
+    local variant
+    variant=$(pick_variant)
+    local tty=()
+    _set_tty_flags tty
+    local argv=(
+        docker exec "${tty[@]}"
+        --user "$(id -u):$(id -g)"
+        -w "$PWD"
+        -e "HOME=$HOME"
+    )
+    _append_host_env argv
+    _append_scalar_env argv "$variant"
+    # The image ships no top-level `lucebox` binary on PATH; that word only
+    # means something as the first argument to the entrypoint script, which
+    # then `exec uv run ... python -m lucebox`s. `docker exec` bypasses the
+    # image's ENTRYPOINT, so the shim is invoked explicitly with `lucebox`
+    # as its SUBCMD followed by the user's argv tail — the same thing
+    # docker run does on the SUBCMD=lucebox branch.
+    argv+=("$CONTAINER_NAME" /opt/lucebox-hub/server/scripts/entrypoint.sh lucebox "$@")
+    exec "${argv[@]}"
+}
+
+# Decide whether a given (subcommand, argv) pair is safe to run via
+# `docker exec` into the live container. Returns 0 (yes, prefer exec) or 1
+# (no, must use docker run / host-side).
+#
+# The safe-to-exec set is exactly the steady-state / read-only / hits-the-
+# running-server subcommands. Anything that restarts the service, mutates
+# images, or is itself the long-running service must stay on cmd_in_container.
+#
+_lucebox_prefer_exec() {
+    local cmd="$1"; shift
+    case "$cmd" in
+        config|models|check|print-run|print-serve-argv)
+            return 0
+            ;;
+        *)
+            return 1
+            ;;
+    esac
+}
+
+# Top-level routing for the in-container Python CLI. Picks between exec
+# (cheap, shares the live server's namespace) and run (cold start, isolated).
+#
+# Decision tree:
+#   1. LUCEBOX_NO_EXEC=1 / --no-exec was set → always run, never exec.
+#      Useful for debugging the wrapper or when the in-container Python is
+#      stale relative to the image.
+#   2. cmd is not in the prefer-exec list → run (sweep, service mutators).
+#   3. container is running → exec (the fast path, hits the live server).
+#   4. container is not running → run (fall back so first-run / pre-install
+#      flows still work without a live service).
+cmd_route_to_container() {
+    # $1 — subcommand; remaining args are passed through untouched.
+    local cmd="$1"; shift
+    # Exec into the live container only when all three hold: exec has not
+    # been disabled via LUCEBOX_NO_EXEC=1, the subcommand is on the
+    # prefer-exec list, and the container is actually running.
+    if [ "${LUCEBOX_NO_EXEC:-0}" != "1" ] \
+       && _lucebox_prefer_exec "$cmd" "$@" \
+       && _lucebox_container_running; then
+        cmd_exec_in_container "$cmd" "$@"
+        return
+    fi
+    # Everything else takes the cold-start `docker run` path.
+    cmd_in_container "$cmd" "$@"
+}
+
+# Print the top-level help text on stdout. The heredoc delimiter is
+# unquoted, so $SCRIPT_NAME, $VERSION and $CONTAINER_NAME expand at call
+# time; dollars and backticks that must appear literally are
+# backslash-escaped.
+usage() {
+    cat <<EOF
+$SCRIPT_NAME $VERSION — host-side wrapper for the lucebox-hub container
+
+Service management (via user systemd):
+  install               install user systemd unit
+  uninstall             stop, disable, remove the unit (keeps config + models)
+  start | stop          systemctl --user start|stop lucebox
+  restart               systemctl --user restart lucebox
+  enable | disable      systemctl --user enable|disable lucebox
+  status                systemctl --user status lucebox
+  logs [args]           journalctl --user -u lucebox  (default: -f)
+
+Direct server invocation (foreground, no systemd):
+  serve                 docker run the server in the foreground
+
+Provisioning + workloads (delegated to the in-container Python CLI):
+  check                 host + docker readiness report
+  pull                  docker pull the cuda12 image
+  update                re-run the bootstrap installer to upgrade this script
+  completion <shell>    print shell completion script (bash / zsh / fish)
+  models                list / download / activate model presets
+  config                read / write keys in .lucebox/config.toml
+  print-run             print the docker-run command for the server
+
+Misc:
+  help, --help, -h      this message
+  version, --version    print version
+
+Environment overrides:
+  LUCEBOX_IMAGE         image name without tag (default: ghcr.io/luce-org/lucebox-hub)
+  LUCEBOX_VARIANT       image tag to pull/run (default: cuda12)
+  LUCEBOX_PORT          host port for the server (default: 8080)
+  LUCEBOX_CONTAINER     server container name (default: lucebox)
+  LUCEBOX_MODELS        host model directory (default: \$XDG_DATA_HOME/lucebox/models)
+  LUCEBOX_NO_EXEC=1     force docker-run for in-container subcommands even
+                        when the container is up (equivalent to --no-exec)
+  HF_TOKEN              propagated to \`models download\` for gated HF repos
+
+Container routing:
+  When the long-running '$CONTAINER_NAME' container is up, steady-state
+  subcommands (config, models, check, print-run, print-serve-argv)
+  'docker exec' into it instead of starting a fresh container. This avoids
+  the ~1-3s docker-run cold-start AND shares the live server's network
+  namespace so localhost:\$LUCEBOX_PORT reaches the server. Service-restarting
+  commands (serve, pull, update, install, etc.) stay on the host-side /
+  docker-run path. Pass --no-exec (or LUCEBOX_NO_EXEC=1) to force docker-run.
+EOF
+}
+
+# ── dispatch ──────────────────────────────────────────────────────────────
+
+# Entry point: strip the global --no-exec flag, then dispatch on the first
+# remaining argument (defaulting to `help` when there is none).
+main() {
+    # Global flag pass: `--no-exec` is removed wherever it appears in argv —
+    # before or after the subcommand — and forces the docker-run path even
+    # if the container is up. Equivalent to `LUCEBOX_NO_EXEC=1 lucebox ...`.
+    # We pop it out of argv up-front so the rest of dispatch doesn't have to
+    # know about it.
+    local args=()
+    while [ $# -gt 0 ]; do
+        case "$1" in
+            --no-exec) export LUCEBOX_NO_EXEC=1; shift ;;
+            *) args+=("$1"); shift ;;
+        esac
+    done
+    # Guarded expansion: on bash < 4.4 a plain "${args[@]}" of an EMPTY
+    # array is an "unbound variable" error when the script runs under
+    # `set -u`, which would kill a bare `lucebox` / `lucebox --no-exec`
+    # before it could fall through to `help`.
+    set -- ${args[@]+"${args[@]}"}
+
+    local cmd="${1:-help}"
+    [ $# -gt 0 ] && shift
+    case "$cmd" in
+        # Systemd surface
+        install)          cmd_systemd_install "$@" ;;
+        uninstall)        cmd_systemd_uninstall "$@" ;;
+        start|stop|restart|enable|disable|status)
+                          cmd_systemctl_passthrough "$cmd" "$@" ;;
+        logs)             cmd_logs "$@" ;;
+
+        # Direct server
+        serve)            cmd_serve "$@" ;;
+        pull)             cmd_pull "$@" ;;
+
+        # Self-update — re-runs the bootstrap installer against the channel
+        # this script was installed from (LUCEBOX_INSTALLED_FROM).
+        update)           cmd_update "$@" ;;
+
+        # Host-only readiness check — pure shell, never enters the container.
+        check)            cmd_check "$@" ;;
+
+        # Shell completion — print a script the user sources into their rc
+        # file. Bash and zsh share the bash-style emitter (zsh users add a
+        # `bashcompinit; complete` shim); fish is native.
+        completion)       cmd_completion "$@" ;;
+
+        # Help / version
+        help|--help|-h)   usage ;;
+        version|--version) printf '%s\n' "$VERSION" ;;
+
+        # Everything else → in-container Python CLI. cmd_route_to_container
+        # picks between `docker exec` into the live container (cheap, shares
+        # the running server's network namespace) and `docker run` (cold,
+        # isolated) based on container state + the safe-to-exec command set.
+        *)                cmd_route_to_container "$cmd" "$@" ;;
+    esac
+}
+
+main "$@"
diff --git a/lucebox/.gitignore b/lucebox/.gitignore
new file mode 100644
index 000000000..15f95f0e7
--- /dev/null
+++ b/lucebox/.gitignore
@@ -0,0 +1,3 @@
+
+# Generated by hatch-vcs at build time from git tags.
+src/lucebox/_version.py
diff --git a/lucebox/README.md b/lucebox/README.md
new file mode 100644
index 000000000..747a49eef
--- /dev/null
+++ b/lucebox/README.md
@@ -0,0 +1,18 @@
+# lucebox — host CLI for the lucebox-hub container
+
+This package ships *inside* the `ghcr.io/luce-org/lucebox-hub` Docker image
+and is invoked from the host via the [`lucebox.sh`](../lucebox.sh) wrapper:
+
+    lucebox.sh check          # `docker run … lucebox check`
+    lucebox.sh config get
+    lucebox.sh print-run
+
+The wrapper is the only thing that runs on the host; everything else (host
+checks, TOML config, docker daemon calls, model download) is Python in the
+container. Host facts (driver, GPU, RAM, VRAM, systemd availability) are
+passed in via `LUCEBOX_HOST_*` environment variables so the Python side
+doesn't reprobe. The autotune sweep, profiling, and agent-client launchers
+land in follow-up PRs.
+
+Subcommands are defined in [`src/lucebox/cli.py`](src/lucebox/cli.py). See the
+top-level [README.md](../README.md) for the user-facing flow.
diff --git a/lucebox/pyproject.toml b/lucebox/pyproject.toml
new file mode 100644
index 000000000..5277b268d
--- /dev/null
+++ b/lucebox/pyproject.toml
@@ -0,0 +1,54 @@
+[project]
+name = "lucebox"
+# Version is derived from git tags via hatch-vcs (see [tool.hatch.version]
+# below). Tag `lucebox-v0.2.1` → release version `0.2.1`. Commits past a
+# tag get a `.devN+g<sha>` suffix so dev installs are visibly distinct
+# from releases. Single source of truth: the git tag.
+dynamic = ["version"]
+description = "Host-side CLI for the lucebox-hub container: launch, config, model download"
+readme = "README.md"
+requires-python = ">=3.11"
+authors = [{ name = "Lucebox" }]
+license = { text = "Apache-2.0" }
+
+# Kept intentionally narrow. typer pulls click+rich; tomli-w gives us TOML
+# writes (stdlib tomllib only reads). httpx for the smoke + readiness probes.
+# huggingface_hub for `lucebox models download` — used directly (not via subprocess)
+# so we can drive a Rich progress bar + verify sha256 against the repo
+# metadata before re-fetching multi-GB GGUFs.
+dependencies = [
+    "typer>=0.12",
+    "rich>=13",
+    "httpx>=0.27",
+    "tomli-w>=1.0",
+    "huggingface_hub>=0.27",
+    # luce-bench is consumed lazily by the autotune sweep scorer
+    # (agent_replay_pass_rate in sweep.py does a function-local
+    # `from lucebench.areas.agent_recorded import ...` wrapped in try/except).
+    # It's deliberately NOT a hard dep here because the workspace can't lock
+    # against it until #337 (luce-bench in-tree) lands. Install with
+    # `uv pip install luce-bench` on the host running the scorer.
+]
+
+[project.scripts]
+lucebox = "lucebox.cli:app"
+
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[tool.hatch.version]
+source = "vcs"
+# Untagged checkouts (e.g. fresh clone before tagging lucebox-v0.2.1)
+# resolve to this rather than 0.0.0.dev0.
+fallback-version = "0.2.1.dev0"
+raw-options.tag_regex = '''^lucebox-v(?P<version>\d+\.\d+\.\d+)$'''
+
+[tool.hatch.build.hooks.vcs]
+# Build hook writes the resolved version into src/lucebox/_version.py
+# so `__init__.py` can `from lucebox._version import __version__`.
+# Generated file — see lucebox/.gitignore.
+version-file = "src/lucebox/_version.py"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/lucebox"]
diff --git a/lucebox/src/lucebox/__init__.py b/lucebox/src/lucebox/__init__.py
new file mode 100644
index 000000000..8ca821024
--- /dev/null
+++ b/lucebox/src/lucebox/__init__.py
@@ -0,0 +1,16 @@
+"""lucebox — host-side CLI for the lucebox-hub container.
+
+Runs inside the container; the host wrapper at ../lucebox.sh handles `docker
+run` plumbing and systemd integration. This package owns: TOML config, the
+host-derived DFLASH_* serve heuristic, docker daemon calls (via the mounted
+socket), and model download. The empirical autotune sweep, profiling, and
+agent-client launchers land in follow-up PRs.
+"""
+
+# Version is generated by hatch-vcs at build time into _version.py.
+# Fresh source-tree checkouts before any build will not yet have the
+# file — fall back to a dev marker so imports don't break.
+try:
+    from lucebox._version import __version__
+except ImportError:
+    __version__ = "0.0.0.dev0+unbuilt"
diff --git a/lucebox/src/lucebox/__main__.py b/lucebox/src/lucebox/__main__.py
new file mode 100644
index 000000000..128e2ca87
--- /dev/null
+++ b/lucebox/src/lucebox/__main__.py
@@ -0,0 +1,6 @@
+"""Entry point for `python -m lucebox`."""
+
+from lucebox.cli import app
+
+if __name__ == "__main__":
+    app()
diff --git a/lucebox/src/lucebox/autotune.py b/lucebox/src/lucebox/autotune.py
new file mode 100644
index 000000000..51b4f8424
--- /dev/null
+++ b/lucebox/src/lucebox/autotune.py
@@ -0,0 +1,80 @@
+"""Heuristic autotune: VRAM tier → DflashRuntime defaults.
+
+The recommended runtime is computed from HostFacts (VRAM, is_wsl) — stateless:
+it takes HostFacts in and returns a fresh DflashRuntime. ``config.live_config``
+applies it so ``lucebox print-serve-argv`` / ``docker_run`` bake conservative
+DFLASH_* defaults into the serve command for the detected VRAM tier.
+
+The empirical sweep + per-workload profiles (``lucebox autotune --sweep``)
+live in a follow-up PR; this module keeps only the host-derived heuristic
+that the serve path depends on.
+"""
+
+from __future__ import annotations
+
+from lucebox.types import DflashRuntime, HostFacts
+
+
+def runtime_from_host(host: HostFacts) -> DflashRuntime:
+    """Pick a conservative DflashRuntime that 'should work' on this VRAM tier.
+
+    Tiers (NVIDIA, baseline = Qwen3.6-27B Q4_K_M ~18 GB total):
+        <12 GB  — too small for 27B; pick min ctx as a floor so a fallback
+                  start at least gets an error from the daemon rather than
+                  a silent OOM.
+        12-21   — fits but tight; cap ctx.
+        22-31   — 24 GB-class consumer flagships (3090/4090/5090/5090-Laptop).
+                  98 K with tq3_0 KV (~2 GB KV + ~18 GB model ≈ 20 GB).
+                  Confirmed on bragi (RTX 5090 Laptop, 23 GB VRAM) 2026-05-31.
+        32-47   — RTX 6000 Ada / A100 40 GB. Full 128 K.
+        ≥48     — A100 80 GB / H100 / RTX 6000 Pro. Full 128 K.
+
+    Prefix cache remains an explicit sweep tunable, but the automatic
+    baseline keeps it off because tool prompts currently exercise a daemon
+    snapshot path that is not reliable with prefix slots enabled.
+    Empirically confirmed on bragi 2026-05-31: prefix_cache_slots=32
+    caused -19pp regression on agent_recorded (23.1% vs 42.3% baseline).
+    5 previously-passing cases regressed; 0 new cases unlocked. See
+    docs/experiments/qwen3.6-27b-prefix-cache-regression-bragi-2026-05-31.md.
+
+    On `lazy`: the C++ server requires `--prefill-drafter` (and `--draft`)
+    to be set for `--lazy-draft` to take effect, and silently ignores it
+    otherwise (`--lazy-draft ignored: requires both --prefill-drafter and
+    --draft`). Since the heuristic path does NOT set `prefill_drafter`,
+    we default `lazy=False` here — "what we say" matches "what runs".
+    Users who explicitly opt in via config.toml will be warned at server
+    startup that the flag is being dropped (see entrypoint.sh).
+    """
+    if host.vram_gb <= 0:
+        return DflashRuntime()  # no VRAM signal — stick with class defaults
+
+    if host.vram_gb < 12:
+        return DflashRuntime(max_ctx=4096)
+    if host.vram_gb < 22:
+        return DflashRuntime(max_ctx=32768)
+    if host.vram_gb < 32:
+        # 22-31 GB cards. tq3_0 KV is required at 98K: model (~18-19 GB) +
+        # q8_0 KV at 98K (~5-6 GB) = 24-25 GB → OOM, while tq3_0 KV (~2 GB)
+        # leaves ~3 GB headroom. Confirmed on bragi (RTX 5090 Laptop, 23 GB
+        # VRAM) 2026-05-30 — q8_0 timed out on every 98K cell; all tq3_0 cells
+        # passed. Preset-size-aware capping (large models → 32K) lives with the
+        # autotune sweep in a follow-up PR.
+        if host.is_wsl:
+            # Bumped from max_ctx=65536 → 98304 on 2026-05-30 after the
+            # coding-agent-loop sweep on sindri proved 98K serves real
+            # 90K-token agentic prompts with ~3 GB VRAM headroom and no
+            # CUDA VMM failures. See
+            # docs/experiments/gemma4-26b-coding-agent-loop-sweep-2026-05-30.md.
+            # The original 65K cap cited unverified VMM failures —
+            # bisect history showed no commit reproducing them.
+            return DflashRuntime(
+                budget=16, max_ctx=98304,
+                cache_type_k="tq3_0", cache_type_v="tq3_0",
+            )
+        return DflashRuntime(
+            max_ctx=98304,
+            cache_type_k="tq3_0", cache_type_v="tq3_0",
+        )
+    if host.vram_gb < 48:
+        return DflashRuntime(max_ctx=131072)
+    return DflashRuntime(max_ctx=131072)
diff --git a/lucebox/src/lucebox/cli.py b/lucebox/src/lucebox/cli.py
new file mode 100644
index 000000000..f661d62cd
--- /dev/null
+++ b/lucebox/src/lucebox/cli.py
@@ -0,0 +1,346 @@
+"""Typer app — the user-facing subcommands.
+
+Layout follows the host wrapper's dispatch table. Anything `lucebox`
+doesn't intercept (everything outside the systemd surface) ends up here.
+
+Subcommand inventory:
+    check                  — readiness report
+    config get/set/unset   — read / write a single key in config.toml
+    pull                   — docker pull the cuda12 image
+    print-run              — emit the docker-run command for the server
+    print-serve-argv       — same, raw argv lines (consumed by `lucebox serve`)
+    models                 — list / download presets, activate one
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from dataclasses import replace
+from pathlib import Path
+from typing import Annotated
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+import lucebox.config as config_mod
+import lucebox.docker_run as docker_run
+import lucebox.download as download_mod
+import lucebox.host_check as host_check
+from lucebox import __version__
+from lucebox.config import config_get, config_set, config_unset, live_config
+from lucebox.host_facts import from_env
+
+app = typer.Typer(
+    name="lucebox",
+    help="Host CLI for the lucebox-hub container. Invoked by lucebox.sh.",
+    no_args_is_help=True,
+    add_completion=False,
+)
+console = Console()
+
+
+# ── helpers ────────────────────────────────────────────────────────────────
+
+
+def _load_or_build() -> config_mod.Config:  # type: ignore[name-defined]
+    """env > config.toml > dataclass defaults — the canonical precedence.
+
+    Without the env-overlay step below, `config_mod.load()` returned the
+    persisted config verbatim and `LUCEBOX_IMAGE` / `LUCEBOX_VARIANT` /
+    `LUCEBOX_PORT` / `LUCEBOX_CONTAINER` / `LUCEBOX_MODELS` from the
+    systemd unit's `Environment=` (or any one-shot shell export) were
+    silently dropped. That contradicted the precedence lucebox.sh
+    documents and applies — and bit sindri when its config.toml had
+    `[image]` without `registry`, so the dataclass default
+    `ghcr.io/luce-org/lucebox-hub` won over the unit's
+    `LUCEBOX_IMAGE=ghcr.io/easel/lucebox-hub`.
+
+    Fix: overlay env on top of the loaded config (or the live_config
+    fallback when config.toml is absent). Only the five top-level
+    scalars have env hooks — dflash/host/model don't, by design.
+    """
+    cfg = config_mod.load()
+    if cfg is None:
+        cfg = live_config()
+    # Overlay live host facts. When ``config.toml`` exists without a
+    # ``[host]`` block (the common case — operators don't hand-edit
+    # host facts), ``cfg.host`` defaults to a zero-filled ``HostFacts``
+    # and the DFLASH_* serve heuristic silently falls through to the
+    # "no VRAM signal" path. Re-probe from env so the wrapper-exported
+    # LUCEBOX_HOST_* facts always win over the persisted (possibly
+    # absent) snapshot.
+    live_host = from_env()
+    host = live_host if live_host.vram_gb > 0 or live_host.nproc > 0 else cfg.host
+    return replace(
+        cfg,
+        variant=os.environ.get("LUCEBOX_VARIANT", cfg.variant),
+        image=os.environ.get("LUCEBOX_IMAGE", cfg.image),
+        container_name=os.environ.get("LUCEBOX_CONTAINER", cfg.container_name),
+        port=int(os.environ.get("LUCEBOX_PORT", str(cfg.port))),
+        models_dir=Path(os.environ.get("LUCEBOX_MODELS", str(cfg.models_dir))),
+        host=host,
+    )
+
+
+# ── subcommands ────────────────────────────────────────────────────────────
+
+
+@app.command()
+def check() -> None:
+    """Print a readiness report (driver, docker, CTK, RAM, VRAM, systemd)."""
+    host = from_env()
+    results = host_check.run_checks(host)
+    worst = host_check.render(console, host, results)
+    if worst == "fail":
+        raise typer.Exit(code=1)
+
+
+@app.command()
+def pull() -> None:
+    """`docker pull` the image variant from config.toml."""
+    cfg = _load_or_build()
+    tag = f"{cfg.image}:{cfg.variant}"
+    console.print(f"[bold]Pulling {tag}[/bold] (~14 GB; takes a while)…")
+    rc = docker_run.docker_pull(tag)
+    if rc != 0:
+        raise typer.Exit(code=rc)
+
+
+@app.command("print-run")
+def print_run() -> None:
+    """Print the docker-run command for the server (copy-pasteable)."""
+    cfg = _load_or_build()
+    spec = docker_run.server_run_spec(cfg)
+    print(spec.printable())
+
+
+@app.command("print-serve-argv")
+def print_serve_argv() -> None:
+    """Emit the server docker-run argv, one token per line.
+
+    Consumed by lucebox.sh's `serve` subcommand and the systemd unit. Kept as
+    a separate command from `print-run` so the bash side has a guaranteed
+    machine-readable contract that's independent of the pretty formatter.
+    """
+    cfg = _load_or_build()
+    spec = docker_run.server_run_spec(cfg)
+    for tok in spec.argv():
+        print(tok)
+
+
+# ── config sub-app ─────────────────────────────────────────────────────────
+
+
+config_app = typer.Typer(no_args_is_help=True, help="Read/write keys in config.toml.")
+app.add_typer(config_app, name="config")
+
+
+@config_app.command("get")
+def config_get_cmd(
+    key: Annotated[str, typer.Argument(help="Dotted key (omit to list every key).")] = "",
+) -> None:
+    """Print a single key (or every reachable key) with its origin annotation."""
+    try:
+        entries = config_get(key or None)
+    except KeyError as exc:
+        console.print(f"[red]{exc}[/red]")
+        raise typer.Exit(code=2) from exc
+    for k, (value, origin) in entries.items():
+        console.print(f"{k} = {value!r} ([dim]from {origin}[/dim])")
+
+
+@config_app.command("set")
+def config_set_cmd(
+    kv: Annotated[str, typer.Argument(help='"key=value" pair (e.g. "model.preset=qwen3.6-27b")')],
+) -> None:
+    """Set one dotted key. Auto-creates config.toml when missing.
+
+    Only the named key is written — other on-disk keys are preserved
+    untouched, unset keys stay implicit. Use `lucebox config unset` to
+    remove a key (next read falls back to the live default).
+    """
+    if "=" not in kv:
+        console.print("[red]argument must be key=value[/red]")
+        raise typer.Exit(code=2)
+    key, _, value = kv.partition("=")
+    key = key.strip()
+    value = value.strip()
+    try:
+        config_set(key, value)
+    except (KeyError, ValueError) as exc:
+        console.print(f"[red]{exc}[/red]")
+        raise typer.Exit(code=2) from exc
+    console.print(f"[green]Set[/green] {key} = {value}")
+
+
+@config_app.command("unset")
+def config_unset_cmd(
+    key: Annotated[str, typer.Argument(help="Dotted key to remove from config.toml.")],
+) -> None:
+    """Remove a key from config.toml. Next read uses the live default."""
+    try:
+        changed = config_unset(key)
+    except KeyError as exc:
+        console.print(f"[red]{exc}[/red]")
+        raise typer.Exit(code=2) from exc
+    if changed:
+        console.print(f"[green]Unset[/green] {key}")
+    else:
+        console.print(f"[dim]{key} was not in config.toml; nothing to do[/dim]")
+
+
+# ── models sub-app ─────────────────────────────────────────────────────────
+
+
+models_app = typer.Typer(
+    no_args_is_help=False, help="Manage local model presets (list, download, activate)."
+)
+app.add_typer(models_app, name="models")
+
+
+def _print_installed_presets() -> None:
+    cfg = _load_or_build()
+    installed = download_mod.installed_presets(cfg)
+    active = cfg.model.preset
+    console.print(f"Models dir: [bold]{cfg.models_dir}[/bold]")
+    if not installed:
+        console.print("[dim]No presets installed yet — try `lucebox models download`.[/dim]")
+        return
+    table = Table()
+    table.add_column("preset")
+    table.add_column("status")
+    table.add_column("size (GB)")
+    for pres in installed:
+        marker = "* " if pres.name == active else "  "
+        size_gb = download_mod.installed_size_gb(cfg, pres)
+        table.add_row(f"{marker}{pres.name}", "installed", f"{size_gb:.1f}")
+    console.print(table)
+    total = sum(download_mod.installed_size_gb(cfg, p) for p in installed)
+    console.print(f"[dim]Total disk usage: {total:.1f} GB[/dim]")
+
+
+@models_app.callback(invoke_without_command=True)
+def models_default(ctx: typer.Context) -> None:
+    """Default action: list installed presets, mark active with `*`."""
+    if ctx.invoked_subcommand is None:
+        _print_installed_presets()
+
+
+@models_app.command("list")
+def models_list() -> None:
+    """Show every registered preset (installed or not) with status + size."""
+    cfg = _load_or_build()
+    active = cfg.model.preset
+    table = Table()
+    table.add_column("preset")
+    table.add_column("status")
+    table.add_column("size (GB)")
+    table.add_column("description")
+    for name in sorted(download_mod.PRESETS):
+        pres = download_mod.PRESETS[name]
+        marker = "* " if name == active else "  "
+        status = download_mod.installed_status(cfg, pres)
+        size = download_mod.installed_size_gb(cfg, pres)
+        size_text = f"{size:.1f}" if size > 0 else f"~{pres.approx_total_gb}*"
+        table.add_row(f"{marker}{name}", status, size_text, pres.description or "")
+    console.print(table)
+
+
+@models_app.command("download")
+def models_download(
+    preset: Annotated[str, typer.Argument(help="Preset name (empty = recommend)")] = "",
+    activate: Annotated[
+        bool, typer.Option("--activate", help="Also set as active preset (model.preset).")
+    ] = False,
+) -> None:
+    """Fetch a preset's GGUFs into the models dir.
+
+    With no argument and no preset configured, recommends one for this
+    host's VRAM tier and auto-activates it (the first-install path).
+    Otherwise the named preset is downloaded; pass ``--activate`` to
+    also flip `model.preset` to it.
+    """
+    # Exit codes: 2 for every user-facing refusal below (ambiguous request,
+    # no recommendation, unknown preset); a failed download propagates the
+    # downloader's own return code.
+    cfg = _load_or_build()
+    if not preset:
+        # No name given. Refuse when a preset is already active, so a bare
+        # `models download` never silently switches models.
+        if cfg.model.preset:
+            console.print(
+                "[yellow]No preset specified and one is already active. "
+                "Pass an explicit preset name (or use --activate to switch).[/yellow]"
+            )
+            raise typer.Exit(code=2)
+        recommended = download_mod.recommend_preset(cfg.host)
+        if recommended is None:
+            console.print(
+                "[red]Cannot recommend a preset for this host. "
+                "Run `lucebox models list` and pick one explicitly.[/red]"
+            )
+            raise typer.Exit(code=2)
+        # First-install path: adopt the recommendation and force activation.
+        preset = recommended
+        activate = True
+        console.print(
+            f"[bold]Recommended preset: {preset}[/bold] "
+            "(no preset configured; auto-activating after download)"
+        )
+
+    try:
+        pres = download_mod.resolve_preset(preset)
+    except KeyError as exc:
+        console.print(f"[red]{exc}[/red]")
+        raise typer.Exit(code=2) from exc
+
+    # Report what is already on disk before deciding whether to download.
+    current = download_mod.status(cfg, pres)
+    console.print(f"Models dir: [bold]{cfg.models_dir}[/bold]")
+    console.print(f"Preset:     [bold]{pres.name}[/bold]")
+    console.print(
+        f"  target ({pres.target_repo}/{pres.target_file}):"
+        f"  {'present' if current['target_present'] else 'will download'}"
+    )
+    if pres.has_draft:
+        console.print(
+            f"  draft  ({pres.draft_repo}/{pres.draft_file}):"
+            f"  {'present' if current['draft_present'] else 'will download'}"
+        )
+    else:
+        console.print("  draft  [dim](none — target-only preset)[/dim]")
+
+    # NOTE(review): for target-only presets this shortcut relies on
+    # download_mod.status() reporting draft_present as truthy; if it reports
+    # False there, the download branch runs on every invocation — confirm.
+    if current["target_present"] and current["draft_present"]:
+        console.print("[green]Already present.[/green]")
+    else:
+        console.print(f"[bold]Downloading[/bold] (~{pres.approx_total_gb} GB total)…")
+        rc = download_mod.download_preset(cfg, pres)
+        if rc != 0:
+            raise typer.Exit(code=rc)
+        console.print("[green]Done.[/green]")
+
+    # Activation happens only after the files are in place (or were already).
+    if activate:
+        # NOTE(review): this persists the name as typed (`preset`), not
+        # `pres.name`. `_print_installed_presets` matches the active marker
+        # against `pres.name`, so confirm resolve_preset() never maps an
+        # alias to a different canonical name.
+        config_set("model.preset", preset)
+        if pres.target_file:
+            config_set("model.target_file", pres.target_file)
+        if pres.has_draft and pres.draft_file:
+            config_set("model.draft_file", pres.draft_file)
+        else:
+            # Drop any stale draft_file from a previous activation; the
+            # active preset has no draft.
+            config_unset("model.draft_file")
+        console.print(f"[green]Activated:[/green] model.preset = {preset}")
+
+
+@app.command()
+def version() -> None:
+    """Print lucebox version."""
+    # __version__ is imported from the lucebox package: the hatch-vcs
+    # generated value, or the "0.0.0.dev0+unbuilt" fallback when
+    # _version.py has not been generated yet.
+    print(__version__)
+
+
+def main() -> None:
+    """Run the Typer app, mapping Ctrl-C to exit status 130.
+
+    Invoked by the ``if __name__ == "__main__"`` guard at the bottom of this
+    module. Note that ``lucebox/__main__.py`` (``python -m lucebox``) and the
+    ``lucebox`` console script both call ``app`` directly, so neither goes
+    through this wrapper.
+    """
+    try:
+        app()
+    except KeyboardInterrupt:
+        # 130 = 128 + SIGINT, the conventional shell status for Ctrl-C.
+        console.print("\n[dim]interrupted[/dim]")
+        sys.exit(130)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lucebox/src/lucebox/config.py b/lucebox/src/lucebox/config.py
new file mode 100644
index 000000000..2b5bd140e
--- /dev/null
+++ b/lucebox/src/lucebox/config.py
@@ -0,0 +1,448 @@
+"""Sparse TOML persistence for .lucebox/config.toml.
+
+Single source of truth for user-overridden configuration. We track which
+dotted keys were explicitly set by the user (or by commands acting on
+their behalf) and serialize ONLY those keys back to disk — defaults
+stay implicit, so `config.toml` reads like a diff against live defaults
+and upgrades that add new fields don't gratuitously rewrite every file.
+
+The dotted-key surface area is small and flat:
+  model.preset, model.target_file, model.draft_file
+  port, models_dir, variant, image, container_name
+  dflash.<field>  one per DflashRuntime knob registered in KEY_REGISTRY below
+
+Load resolves the TOML file → ``Config`` object, with anything absent
+filled from ``Config()`` defaults. Save writes back only the keys that
+appear in the TOML doc (tracked on ``Config._user_set``). The TOML doc
+itself is a plain ``dict[str, Any]`` carrying only the set keys.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import tomllib
+from collections.abc import Callable
+from dataclasses import replace
+from pathlib import Path
+from typing import Any
+
+import tomli_w
+
+from lucebox.types import (
+    Config,
+    DflashRuntime,
+    HostFacts,
+    ModelMeta,
+    Variant,
+    default_models_dir,
+)
+
+
+def default_config_path() -> Path:
+    """Return the location of ``config.toml``.
+
+    ``$LUCEBOX_HOME/config.toml`` when that variable is set and non-empty,
+    otherwise ``$HOME/.lucebox/config.toml``. The directory sits in the
+    bind-mounted host home, so the file outlives container teardown and
+    stays editable from the host.
+    """
+    override = os.environ.get("LUCEBOX_HOME")
+    root = Path(override) if override else Path.home() / ".lucebox"
+    return root / "config.toml"
+
+
+# ── dotted-key registry ────────────────────────────────────────────────────
+
+def _cast_prefill_mode(v: Any) -> str:
+    """Stringify ``v``; raise ``ValueError`` unless the result is off/auto/always."""
+    if (mode := str(v)) in {"off", "auto", "always"}:
+        return mode
+    raise ValueError(f"prefill_mode must be off/auto/always, got {mode!r}")
+
+
+def _cast_bool(v: Any) -> bool:
+    """Coerce a config value to ``bool`` without guessing.
+
+    - ``bool`` values are returned unchanged.
+    - ``str`` values are stripped and lower-cased: 1/true/yes/on → True;
+      0/false/no/off/"" → False; any other text raises ``ValueError``.
+    - ``int`` values go through ``bool()`` (zero → False, non-zero → True).
+    - Every other type raises ``ValueError`` instead of being coerced —
+      the built-in ``bool`` caster turned ``"false"`` into ``True``, which
+      is what bit ``dflash.debug_thinking_logits``.
+    """
+    if isinstance(v, str):
+        token = v.strip().lower()
+        if token in ("1", "true", "yes", "on"):
+            return True
+        if token in ("0", "false", "no", "off", ""):
+            return False
+    elif isinstance(v, int):
+        # ``bool`` subclasses ``int``, so native booleans land here too.
+        return bool(v)
+    raise ValueError(f"cannot parse boolean: {v!r}")
+
+
+# Each entry: dotted-key → (toml_path, type_caster).
+# ``toml_path`` is the (section, field) pair on disk; ``"_root"`` means the
+# key lives at the top level (no [section]). In-memory defaults are not stored
+# here: ``_live_default`` derives them so ``config get`` can annotate origin.
+KEY_REGISTRY: dict[str, tuple[tuple[str, str], Callable[[Any], Any]]] = {
+    "variant": (("image", "variant"), str),
+    "image": (("image", "registry"), str),
+    "container_name": (("runtime", "container_name"), str),
+    "port": (("runtime", "port"), int),
+    "models_dir": (("paths", "models"), str),
+    "model.preset": (("model", "preset"), str),
+    "model.target_file": (("model", "target_file"), str),
+    "model.draft_file": (("model", "draft_file"), str),
+    "dflash.budget": (("dflash", "budget"), int),
+    "dflash.max_ctx": (("dflash", "max_ctx"), int),
+    "dflash.lazy": (("dflash", "lazy"), _cast_bool),
+    "dflash.prefix_cache_slots": (("dflash", "prefix_cache_slots"), int),
+    "dflash.prefill_cache_slots": (("dflash", "prefill_cache_slots"), int),
+    "dflash.cache_type_k": (("dflash", "cache_type_k"), str),
+    "dflash.cache_type_v": (("dflash", "cache_type_v"), str),
+    "dflash.prefill_mode": (("dflash", "prefill_mode"), _cast_prefill_mode),
+    "dflash.prefill_keep_ratio": (("dflash", "prefill_keep_ratio"), float),
+    "dflash.prefill_threshold": (("dflash", "prefill_threshold"), int),
+    "dflash.prefill_drafter": (("dflash", "prefill_drafter"), str),
+    "dflash.think_max": (("dflash", "think_max"), int),
+    "dflash.fa_window": (("dflash", "fa_window"), int),
+    "dflash.think_soft_close_min_ratio": (
+        ("dflash", "think_soft_close_min_ratio"), float),
+    "dflash.debug_thinking_logits": (
+        ("dflash", "debug_thinking_logits"), _cast_bool),
+}
+
+
+def _doc_get(doc: dict[str, Any], section: str, field: str) -> Any:
+    """Fetch (section, field) from ``doc``; None when the key or its table is absent."""
+    if section == "_root":
+        return doc.get(field)
+    table = doc.get(section)
+    # A section holding a non-table value is treated as if it were missing.
+    return table.get(field) if isinstance(table, dict) else None
+
+
+def _doc_set(doc: dict[str, Any], section: str, field: str, value: Any) -> None:
+    """Write ``value`` at (section, field), creating the section table on demand."""
+    # ``"_root"`` addresses the top level of the doc rather than a [section].
+    table = doc if section == "_root" else doc.setdefault(section, {})
+    table[field] = value
+
+
+def _doc_unset(doc: dict[str, Any], section: str, field: str) -> bool:
+    """Delete (section, field) from ``doc``; True iff a key was actually removed.
+
+    A section table left empty by the removal is dropped from ``doc`` too.
+    """
+    at_root = section == "_root"
+    table = doc if at_root else doc.get(section)
+    if not isinstance(table, dict) or field not in table:
+        return False
+    del table[field]
+    # Prune the [section] table once its last key is gone.
+    if not at_root and not table:
+        del doc[section]
+    return True
+
+
+# ── load ───────────────────────────────────────────────────────────────────
+
+
+def load(path: Path | None = None) -> Config | None:
+    """Read the config at ``path`` (default: ``default_config_path()``).
+
+    An existing TOML file always wins. Failing that, a sibling legacy
+    ``.env`` file (same stem) is migrated: parsed, written back as TOML at
+    ``path``, and returned. Returns None when neither file exists.
+    """
+    toml_path = path or default_config_path()
+    if toml_path.exists():
+        return _load_toml(toml_path)
+    env_path = toml_path.with_suffix(".env")
+    if not env_path.exists():
+        return None
+
+    migrated, sparse_doc = _load_legacy_env(env_path)
+    save(migrated, toml_path, doc=sparse_doc)
+    return migrated
+
+
+def _load_toml(path: Path) -> Config:
+    """Parse the TOML file at ``path`` (always decoded as UTF-8) into a ``Config``."""
+    return _from_dict(tomllib.loads(path.read_text(encoding="utf-8")))
+
+
+def load_doc(path: Path | None = None) -> dict[str, Any]:
+    """Return the raw TOML doc (a dict). Empty when no file or empty file."""
+    path = path or default_config_path()
+    if not path.exists():
+        return {}
+    return tomllib.loads(path.read_text(encoding="utf-8"))  # TOML is always UTF-8
+
+
+# Bash-era variable name → (TOML section, field, caster applied to the raw string).
+_LEGACY_KEY_MAP: dict[str, tuple[str, str, Callable[[str], Any]]] = {
+    "DFLASH_BUDGET": ("dflash", "budget", int),
+    "DFLASH_MAX_CTX": ("dflash", "max_ctx", int),
+    "DFLASH_LAZY": ("dflash", "lazy",
+                    lambda v: str(v).strip().lower() in ("1", "true", "yes", "on")),
+    "DFLASH_PREFIX_CACHE_SLOTS": ("dflash", "prefix_cache_slots", int),
+    "DFLASH_PORT": ("runtime", "port", int),
+    "LUCEBOX_VARIANT": ("image", "variant", str),
+    "LUCEBOX_IMAGE": ("image", "registry", str),
+    "LUCEBOX_MODELS": ("paths", "models", str),
+}
+
+
+def _load_legacy_env(path: Path) -> tuple[Config, dict[str, Any]]:
+    """Best-effort migration of a bash-era ``config.env`` → (Config, sparse doc).
+
+    Comments, malformed lines, unknown keys and uncastable values are skipped.
+    """
+    assignment = re.compile(r"^([A-Z_][A-Z0-9_]*)=(.*)$")
+    doc: dict[str, Any] = {}
+    for text in map(str.strip, path.read_text().splitlines()):
+        # Blank lines cannot match the pattern; comment lines are skipped outright.
+        hit = None if text.startswith("#") else assignment.match(text)
+        if hit is None or hit.group(1) not in _LEGACY_KEY_MAP:
+            continue
+        section, field, cast_fn = _LEGACY_KEY_MAP[hit.group(1)]
+        try:
+            doc.setdefault(section, {})[field] = cast_fn(
+                hit.group(2).strip().strip('"').strip("'"))
+        except (TypeError, ValueError):
+            continue
+    return _from_dict(doc), doc
+
+
+def _from_dict(raw: dict[str, Any]) -> Config:
+    """Build a ``Config`` from a parsed TOML mapping; absent keys take defaults.
+
+    The ``[dflash]`` booleans are read with ``_cast_bool`` instead of
+    ``bool()``: a hand-edited ``lazy = "false"`` must load as False, whereas
+    ``bool("false")`` is True. Unparseable text raises ``ValueError``.
+    """
+    img = raw.get("image", {})
+    variant: Variant = str(img.get("variant", "cuda12"))
+    registry = img.get("registry", "ghcr.io/luce-org/lucebox-hub")
+
+    runtime = raw.get("runtime", {})
+    port = int(runtime.get("port", 8080))
+    container_name = str(runtime.get("container_name", "lucebox"))
+
+    paths = raw.get("paths", {})
+    models_dir = Path(paths.get("models", str(default_models_dir())))
+
+    df = raw.get("dflash", {})
+    dflash = DflashRuntime(
+        budget=int(df.get("budget", 22)),
+        max_ctx=int(df.get("max_ctx", 16384)),
+        lazy=_cast_bool(df.get("lazy", False)),
+        prefix_cache_slots=int(df.get("prefix_cache_slots", 0)),
+        prefill_cache_slots=int(df.get("prefill_cache_slots", 0)),
+        cache_type_k=str(df.get("cache_type_k", "")),
+        cache_type_v=str(df.get("cache_type_v", "")),
+        prefill_mode=df.get("prefill_mode", "off"),
+        prefill_keep_ratio=float(df.get("prefill_keep_ratio", 0.05)),
+        prefill_threshold=int(df.get("prefill_threshold", 32000)),
+        prefill_drafter=str(df.get("prefill_drafter", "")),
+        think_max=int(df.get("think_max", 15488)),
+        fa_window=int(df.get("fa_window", 0)),
+        think_soft_close_min_ratio=float(df.get("think_soft_close_min_ratio", 0.0)),
+        debug_thinking_logits=_cast_bool(df.get("debug_thinking_logits", False)),
+    )
+
+    host_raw = raw.get("host", {})
+    host = HostFacts(
+        nproc=int(host_raw.get("nproc", 0)),
+        ram_gb=int(host_raw.get("ram_gb", 0)),
+        gpu_vendor=host_raw.get("gpu_vendor", "none"),
+        gpu_name=str(host_raw.get("gpu_name", "")),
+        gpu_count=int(host_raw.get("gpu_count", 0)),
+        vram_gb=int(host_raw.get("vram_gb", 0)),
+        gpu_sm=str(host_raw.get("gpu_sm", "")),
+        driver_version=str(host_raw.get("driver_version", "")),
+        driver_major=int(host_raw.get("driver_major", 0)),
+        has_systemd=bool(host_raw.get("has_systemd", False)),
+        is_wsl=bool(host_raw.get("is_wsl", False)),
+        has_docker=bool(host_raw.get("has_docker", False)),
+        docker_version=str(host_raw.get("docker_version", "")),
+        ctk=host_raw.get("ctk", "none"),
+    )
+
+    # `[model]` is optional — legacy configs (pre-multi-model) carry no
+    # such section and we want them to keep working unchanged. If
+    # `preset` is set but `target_file` / `draft_file` isn't, derive
+    # them from the registry so users only have to write one key.
+    mdl = raw.get("model", {})
+    preset_name = str(mdl.get("preset", ""))
+    target_file = str(mdl.get("target_file", ""))
+    draft_file = str(mdl.get("draft_file", ""))
+    if preset_name and (not target_file or not draft_file):
+        from lucebox.download import PRESETS
+
+        if preset_name in PRESETS:
+            pres = PRESETS[preset_name]
+            if not target_file:
+                target_file = pres.target_file
+            if not draft_file and pres.has_draft and pres.draft_file:
+                draft_file = pres.draft_file
+    model = ModelMeta(preset=preset_name, target_file=target_file, draft_file=draft_file)
+
+    return Config(
+        variant=variant, image=registry, container_name=container_name,
+        port=port, models_dir=models_dir,
+        dflash=dflash, host=host, model=model,
+    )
+
+
+# ── save ───────────────────────────────────────────────────────────────────
+
+
+def _atomic_write_doc(path: Path, doc: dict[str, Any]) -> None:
+    """Write ``doc`` as TOML to ``path`` without ever exposing a truncated file.
+
+    Staged in a sibling ``.toml.tmp``, then ``replace``d; ``path.parent`` must exist.
+    """
+    staging = path.with_suffix(".toml.tmp")
+    payload = tomli_w.dumps(doc).encode("utf-8")
+    staging.write_bytes(payload)
+    staging.replace(path)
+
+
+def save(cfg: Config, path: Path | None = None, *, doc: dict[str, Any] | None = None) -> Path:
+    """Write the sparse TOML ``doc`` to ``path`` and return the path written.
+
+    ``doc`` is the raw mapping returned by ``load_doc`` — exactly the keys
+    the user (or a command on their behalf) has set. With ``doc=None`` the
+    on-disk doc is re-used, which means an empty file when none exists yet.
+
+    ``cfg`` is accepted so callers can round-trip through a ``Config``
+    object, but the sparse write never re-derives keys from it.
+    """
+    # Deliberately unused; dropping the name keeps unused-arg lint quiet.
+    del cfg
+    target = path or default_config_path()
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # No doc supplied → rewrite whatever is on disk ({} if nothing is).
+    sparse = load_doc(target) if doc is None else doc
+    _atomic_write_doc(target, sparse)
+    return target
+
+
+# ── dotted-key API ─────────────────────────────────────────────────────────
+
+
+def _value_to_toml(value: Any) -> Any:
+    """Prepare ``value`` for tomli_w: ``Path`` → ``str``, anything else unchanged."""
+    # NOTE(review): None also passes through unchanged; TOML has no null, so
+    # callers presumably never hand one in — confirm.
+    return str(value) if isinstance(value, Path) else value
+
+
+def _live_default(key: str) -> Any:
+    """Default for ``key`` read off a fresh ``Config()``; None for an unmapped section."""
+    cfg = Config()
+    section, field = KEY_REGISTRY[key][0]
+    match section:
+        case "image":
+            return {"variant": cfg.variant, "registry": cfg.image}[field]
+        case "runtime":
+            return {"port": cfg.port, "container_name": cfg.container_name}[field]
+        case "paths":
+            return str(cfg.models_dir) if field == "models" else None
+        case "dflash":
+            return getattr(cfg.dflash, field)
+        case "model":
+            return getattr(cfg.model, field)
+    return None
+
+
+def config_set(key: str, value: Any, *, path: Path | None = None) -> None:
+    """Coerce and persist one dotted key (KeyError if unknown, ValueError if uncastable)."""
+    if key not in KEY_REGISTRY:
+        raise KeyError(f"unknown config key {key!r}; known: {sorted(KEY_REGISTRY)}")
+    (section, field), caster = KEY_REGISTRY[key]
+    try:
+        cast_value = caster(value)
+    except (TypeError, ValueError) as exc:
+        raise ValueError(f"cannot coerce {value!r} for {key}: {exc}") from exc
+    target = path or default_config_path()
+    # load_doc yields {} for a missing file, so the first `set` creates it.
+    doc = load_doc(target)
+    _doc_set(doc, section, field, _value_to_toml(cast_value))
+    target.parent.mkdir(parents=True, exist_ok=True)
+    _atomic_write_doc(target, doc)
+
+
+def config_unset(key: str, *, path: Path | None = None) -> bool:
+    """Remove one dotted key from the file; True iff it was there (KeyError if unknown)."""
+    if key not in KEY_REGISTRY:
+        raise KeyError(f"unknown config key {key!r}; known: {sorted(KEY_REGISTRY)}")
+    (section, field), _caster = KEY_REGISTRY[key]
+
+    target = path or default_config_path()
+    if not target.exists():
+        return False
+    doc = load_doc(target)
+    if not _doc_unset(doc, section, field):
+        return False
+    # The file stays in place even when it ends up empty — `config set`
+    # will repopulate it, and deleting would surprise users who expect
+    # their config dir to exist.
+    _atomic_write_doc(target, doc)
+    return True
+
+
+def config_get(key: str | None = None, *, path: Path | None = None) -> dict[str, tuple[Any, str]]:
+    """Map each requested key to ``(value, origin)``; origin is ``"file"`` or ``"default"``.
+
+    A None/empty ``key`` selects every registered key. A named key still
+    comes back as a single-item dict, so callers handle both shapes alike.
+    Raises ``KeyError`` for an unregistered key.
+    """
+    target = path or default_config_path()
+    doc = load_doc(target)  # {} when the file does not exist
+    wanted = [key] if key else list(KEY_REGISTRY)
+    result: dict[str, tuple[Any, str]] = {}
+    for name in wanted:
+        if name not in KEY_REGISTRY:
+            raise KeyError(f"unknown config key {name!r}; known: {sorted(KEY_REGISTRY)}")
+        section, field = KEY_REGISTRY[name][0]
+        stored = _doc_get(doc, section, field)
+        if stored is None:
+            result[name] = (_live_default(name), "default")
+        else:
+            result[name] = (stored, "file")
+    return result
+
+
+def live_config() -> Config:
+    """Assemble a ``Config`` from probed host facts, env overrides and autotune.
+
+    This is the no-config fallback of ``cli._load_or_build`` and is shared
+    with the ``models`` sub-app, which keeps the host probe, the DFLASH_*
+    heuristic and the env-override logic in a single place.
+    """
+    # Lazy imports: module scope would hit the autotune ↔ config import cycle.
+    import lucebox.autotune as autotune_mod
+    from lucebox.host_facts import from_env
+
+    host = from_env()
+    default = Config()
+    env = os.environ
+    return replace(
+        default,
+        variant=env.get("LUCEBOX_VARIANT", "cuda12"),
+        image=env.get("LUCEBOX_IMAGE", default.image),
+        container_name=env.get("LUCEBOX_CONTAINER", default.container_name),
+        port=int(env.get("LUCEBOX_PORT", str(default.port))),
+        models_dir=Path(env.get("LUCEBOX_MODELS", str(default.models_dir))),
+        dflash=autotune_mod.runtime_from_host(host),
+        host=host,
+    )
diff --git a/lucebox/src/lucebox/docker_run.py b/lucebox/src/lucebox/docker_run.py
new file mode 100644
index 000000000..ff3615b26
--- /dev/null
+++ b/lucebox/src/lucebox/docker_run.py
@@ -0,0 +1,232 @@
+"""Build and execute `docker run` argv for the server and download containers.
+
+We shell out to the `docker` CLI rather than using the docker SDK because
+(a) the CLI is the user-visible contract — errors look the same whether
+issued by lucebox or the user; (b) zero import cost; (c) trivially mockable
+via subprocess in tests. Wrap everything in one module so swapping to the
+SDK later is a single-file change.
+"""
+
+from __future__ import annotations
+
+import os
+import shlex
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+
+from lucebox.types import Config
+
+
+def _host_facts_env() -> list[tuple[str, str]]:
+    """Return every ``LUCEBOX_HOST_*`` variable as (name, value), sorted by name.
+
+    lucebox.sh's probe_host() exports each host-identity fact (OS, kernel,
+    GPU list CSV, CTK version, …) before it invokes ``docker run`` on the
+    orchestrator. The orchestrator inherits those variables, and passing
+    them through verbatim lets the server entrypoint write
+    /opt/lucebox-hub/HOST_INFO without re-probing inside the container,
+    where /proc and nvidia-smi would show the container's view rather than
+    the rig's. See entrypoint.sh::write_host_info and http_server.cpp's
+    /props.host block.
+    """
+    return [
+        (name, value)
+        for name, value in sorted(os.environ.items())
+        if name.startswith("LUCEBOX_HOST_")
+    ]
+
+
+def _resolve_model_files(cfg: Config) -> tuple[str, str, str]:
+    """Return (target_file, draft_file, draft_dir) for DFLASH_TARGET / DFLASH_DRAFT.
+
+    Per field, the first non-empty source wins:
+        1. cfg.model.target_file / draft_file (explicit override in config.toml)
+        2. PRESETS[cfg.model.preset].target_file / draft_file / speculator_dir (registry)
+        3. "" (entrypoint autodetect path runs unchanged).
+
+    ``draft_dir`` names a directory under ``models/draft/`` that holds a
+    safetensors speculator (e.g. ``laguna-xs2-speculator``). It is filled in
+    only when no draft file was resolved, the preset declares a speculator
+    dir, AND that directory exists on disk; otherwise it stays empty.
+    server_run_spec then points DFLASH_DRAFT at the directory instead of a
+    GGUF file.
+
+    PRESETS is imported lazily to avoid the lucebox.types ↔ lucebox.download
+    circular import that surfaces when this module is imported from ``__init__``.
+    """
+    target, draft, draft_dir = cfg.model.target_file, cfg.model.draft_file, ""
+    # Nothing to consult when no preset is configured or both files are explicit.
+    if not cfg.model.preset or (target and draft):
+        return target, draft, draft_dir
+    from lucebox.download import PRESETS
+
+    pres = PRESETS.get(cfg.model.preset)
+    if pres is None:
+        return target, draft, draft_dir
+    target = target or pres.target_file
+    if not draft and pres.has_draft and pres.draft_file:
+        draft = pres.draft_file
+    # Still no draft file: fall back to an on-disk speculator directory.
+    if not draft and pres.speculator_dir:
+        if (cfg.models_dir / "draft" / pres.speculator_dir).is_dir():
+            draft_dir = pres.speculator_dir
+    return target, draft, draft_dir
+
+
+def _runtime_volumes(cfg: Config) -> tuple[tuple[str, str], ...]:
+    """Bind mounts: the models dir, plus $HOME at its own path unless they coincide.
+
+    Mirroring $HOME keeps absolute symlink targets valid inside the container.
+    """
+    home, models = str(Path.home()), str(cfg.models_dir)
+    models_mount = (models, "/opt/lucebox-hub/server/models")
+    return (models_mount,) if home == models else (models_mount, (home, home))
+
+
+@dataclass(frozen=True, slots=True)
+class DockerRunSpec:
+    """Pre-render of a docker-run command. Render via `argv()` or `printable()`."""
+
+    image: str
+    name: str
+    gpus: bool = True
+    detach: bool = False
+    remove: bool = True
+    port_publish: tuple[int, int] | None = None  # (host, container)
+    volumes: tuple[tuple[str, str], ...] = ()  # (host path, container path)
+    env: tuple[tuple[str, str], ...] = ()  # (name, value)
+    entrypoint_args: tuple[str, ...] = ()  # placed after the image
+    extra: tuple[str, ...] = ()  # raw docker flags, placed just before the image
+
+    def argv(self) -> list[str]:
+        """Return the full command as an argv list, ready for ``subprocess``."""
+        out = ["docker", "run"]
+        if self.remove:
+            out.append("--rm")
+        if self.detach:
+            out.append("-d")
+        out += ["--name", self.name]
+        if self.gpus:
+            out += ["--gpus", "all"]
+        if self.port_publish is not None:
+            host, container = self.port_publish
+            out += ["-p", f"{host}:{container}"]
+        for host_path, container_path in self.volumes:
+            out += ["-v", f"{host_path}:{container_path}"]
+        for k, v in self.env:
+            out += ["-e", f"{k}={v}"]
+        out += list(self.extra)
+        # Everything after the image is handed to the container's entrypoint.
+        out.append(self.image)
+        out += list(self.entrypoint_args)
+        return out
+
+    def printable(self) -> str:
+        """Human-readable, one-flag-per-line docker run. Copy-pasteable.
+
+        Every token goes through ``shlex.quote`` — not only the values of
+        value-taking flags but also the image, ``extra`` flags and entrypoint
+        args — so pasting the text into a POSIX shell reproduces ``argv()``
+        even when a token contains spaces or shell metacharacters.
+        """
+        value_flags = {
+            "-p", "-v", "-e", "--name", "--gpus",
+            "--env", "--volume", "--publish", "--entrypoint",
+        }
+        tokens = iter(self.argv())
+        lines = [shlex.quote(next(tokens))]  # argv() always starts with "docker"
+        for tok in tokens:
+            line = shlex.quote(tok)
+            if tok in value_flags:
+                # Glue the flag's value onto the same line.
+                value = next(tokens, None)
+                if value is not None:
+                    line += " " + shlex.quote(value)
+            lines.append(line)
+        # Backslash-newline joins keep the whole listing a single shell command.
+        return " \\\n    ".join(lines)
+
+
+# ── server argv from Config ────────────────────────────────────────────────
+
+
+def server_run_spec(cfg: Config) -> DockerRunSpec:
+    """Spec for the long-running OpenAI-compatible server container.
+
+    Foreground (systemd manages lifecycle), --gpus all, models bind-mounted,
+    DFLASH_* propagated.
+    """
+    knobs = cfg.dflash
+    # LUCEBOX_HOST_* first so they ride out front in the rendered argv,
+    # making it obvious in `print-run` output what host facts get forwarded.
+    env: list[tuple[str, str]] = [
+        *_host_facts_env(),
+        ("DFLASH_BUDGET", str(knobs.budget)),
+        ("DFLASH_MAX_CTX", str(knobs.max_ctx)),
+        ("DFLASH_PREFIX_CACHE_SLOTS", str(knobs.prefix_cache_slots)),
+        ("DFLASH_PREFILL_CACHE_SLOTS", str(knobs.prefill_cache_slots)),
+        ("DFLASH_THINK_MAX", str(knobs.think_max)),
+        ("DFLASH_PORT", "8080"),
+    ]
+    # Target/draft GGUFs are resolved by _resolve_model_files, in priority order:
+    #   1. cfg.model.target_file / draft_file (explicit override in config.toml)
+    #   2. PRESETS[cfg.model.preset].target_file / draft_file / speculator_dir (registry)
+    #   3. unset — entrypoint's autodetect path runs unchanged.
+    # The container sees the models dir at /opt/lucebox-hub/server/models
+    # (see _runtime_volumes); the entrypoint reads DFLASH_TARGET / DFLASH_DRAFT.
+    # draft_dir is a subdirectory of models/draft/ holding a safetensors speculator;
+    # it is used only when draft_file is empty and the directory exists on disk.
+    target_file, draft_file, draft_dir = _resolve_model_files(cfg)
+    if target_file:
+        env.append(("DFLASH_TARGET", f"/opt/lucebox-hub/server/models/{target_file}"))
+    draft_entry = draft_file or draft_dir
+    if draft_entry:
+        env.append(("DFLASH_DRAFT", f"/opt/lucebox-hub/server/models/draft/{draft_entry}"))
+    if knobs.lazy:
+        env.append(("DFLASH_LAZY", "1"))
+    for env_name, cache_type in (
+        ("DFLASH_CACHE_TYPE_K", knobs.cache_type_k),
+        ("DFLASH_CACHE_TYPE_V", knobs.cache_type_v),
+    ):
+        if cache_type:
+            env.append((env_name, cache_type))
+    if knobs.prefill_mode != "off":
+        env.append(("DFLASH_PREFILL_MODE", knobs.prefill_mode))
+        env.append(("DFLASH_PREFILL_KEEP", str(knobs.prefill_keep_ratio)))
+        env.append(("DFLASH_PREFILL_THRESHOLD", str(knobs.prefill_threshold)))
+        if knobs.prefill_drafter:
+            env.append(("DFLASH_PREFILL_DRAFTER", knobs.prefill_drafter))
+    # fa_window=0 is the server's own default (full attention), so the env
+    # is emitted only when the operator has selected a sparse decode
+    # window. The entrypoint mirrors this guard, so an unset env
+    # reproduces the server's stock behavior.
+    if knobs.fa_window > 0:
+        env.append(("DFLASH_FA_WINDOW", str(knobs.fa_window)))
+    # Soft-close ratio: 0.0 is server-side disabled (byte-identical
+    # to pre-PR-#326 behavior). Emit only when nonzero to keep the
+    # docker env minimal and mirror the entrypoint's `case` guard.
+    soft_close = knobs.think_soft_close_min_ratio
+    if soft_close > 0.0:
+        env.append(("DFLASH_THINK_SOFT_CLOSE_MIN_RATIO", f"{soft_close:g}"))
+    if knobs.debug_thinking_logits:
+        env.append(("DFLASH_DEBUG_THINKING_LOGITS", "1"))
+
+    return DockerRunSpec(
+        image=f"{cfg.image}:{cfg.variant}",
+        name=cfg.container_name,
+        gpus=True,
+        detach=False,
+        remove=True,
+        port_publish=(cfg.port, 8080),
+        volumes=_runtime_volumes(cfg),
+        env=tuple(env),
+    )
+
+
+# ── subprocess helpers ─────────────────────────────────────────────────────
+
+
+def docker_pull(image_tag: str) -> int:
+    """Run ``docker pull`` for ``image_tag`` with inherited stdio; return its exit code."""
+    return subprocess.call(["docker", "pull", image_tag])
diff --git a/lucebox/src/lucebox/download.py b/lucebox/src/lucebox/download.py
new file mode 100644
index 000000000..df1d5c96f
--- /dev/null
+++ b/lucebox/src/lucebox/download.py
@@ -0,0 +1,515 @@
+"""Model download orchestration.
+
+Runs *inside* the orchestrator container. Uses `huggingface_hub` directly
+(no subprocess) so we can:
+
+  * drive a Rich progress bar based on real byte counts (the previous
+    `uvx hf download` subprocess produced no visible progress inside the
+    container — hf-xet's TTY detection misfires there),
+  * verify each candidate file's size and sha256 against the repo
+    metadata BEFORE downloading, so a re-run on a host that already has
+    the target GGUF (e.g. previous download into the same models_dir)
+    skips the multi-GB fetch entirely.
+
+The :data:`PRESETS` registry encodes the canonical (target_repo,
+target_file, draft_repo, draft_file) tuple per model — selectable via
+``lucebox models download <name>``. ``DEFAULT_PRESET`` stays pinned to
+Qwen3.6-27B for back-compat with callers that pre-date the registry.
+Drafts are optional: presets that have no published DFlash draft
+(e.g. Laguna's speculator is safetensors, not GGUF) carry
+``draft_repo=None`` and run target-only.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import os
+import threading
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+# hf-xet (huggingface_hub ≥ 1.16) streams the entire file in one final
+# burst — the polling-based progress bar sits at 0% for ~14 minutes
+# then snaps to 100% on a 17 GB GGUF. Force the chunked Python
+# downloader instead so bytes grow continuously and the Rich bar tracks
+# reality. Set before importing hf_hub_download so the import picks
+# the env up. `setdefault` lets a user override on the command line.
+os.environ.setdefault("HF_HUB_DISABLE_XET", "1")
+
+from huggingface_hub import HfApi, hf_hub_download  # noqa: E402
+from huggingface_hub._local_folder import get_local_download_paths  # noqa: E402
+from rich.console import Console
+from rich.progress import (
+    BarColumn,
+    DownloadColumn,
+    Progress,
+    TextColumn,
+    TimeRemainingColumn,
+    TransferSpeedColumn,
+)
+
+from lucebox.types import Config, HostFacts
+
+
+@dataclass(frozen=True, slots=True)
+class ModelPreset:
+    """Repo/filename coordinates for one supported model and its draft.
+
+    ``draft_repo`` and ``draft_file`` may both be ``None`` when no GGUF
+    DFlash draft is published for the model (e.g. Laguna, whose
+    speculator is safetensors). Such presets run target-only: DFlash
+    speculative decoding is off, but the server still works.
+
+    ``speculator_dir`` names a directory under ``models/draft/`` holding
+    a safetensors-format speculator (e.g. ``model.safetensors``). If it
+    exists on disk the server launch points ``DFLASH_DRAFT`` at it; if
+    not, the server runs target-only. A missing ``draft_file`` marks the
+    preset incomplete, whereas ``speculator_dir`` is an optional extra
+    artifact that never affects ``installed_status``.
+    """
+
+    name: str
+    target_repo: str
+    target_file: str
+    draft_repo: str | None
+    draft_file: str | None
+    approx_total_gb: int
+    description: str = ""
+    speculator_dir: str | None = None
+
+    @property
+    def has_draft(self) -> bool:
+        return bool(self.draft_repo) and bool(self.draft_file)
+
+
+# Registry of supported models, keyed by preset name (each key repeats
+# the preset's own ``name``). The CLI exposes these through
+# `lucebox models download <name>` and the `lucebox models list` table.
+# Values mirror the model cards under share/model_cards/ — keep in sync.
+PRESETS: dict[str, ModelPreset] = {
+    "qwen3.6-27b": ModelPreset(
+        name="qwen3.6-27b",
+        target_repo="unsloth/Qwen3.6-27B-GGUF",
+        target_file="Qwen3.6-27B-Q4_K_M.gguf",
+        draft_repo="spiritbuun/Qwen3.6-27B-DFlash-GGUF",
+        draft_file="dflash-draft-3.6-q4_k_m.gguf",
+        approx_total_gb=17,
+        description="Qwen3.6 27B dense (Q4_K_M) + Qwen3.6 DFlash draft. Lucebox default.",
+    ),
+    "gemma-4-26b": ModelPreset(
+        name="gemma-4-26b",
+        target_repo="bartowski/google_gemma-4-26B-A4B-it-GGUF",
+        target_file="google_gemma-4-26B-A4B-it-Q4_K_M.gguf",
+        draft_repo="Lucebox/gemma-4-26B-A4B-it-DFlash-GGUF",
+        draft_file="gemma-4-26B-A4B-it-DFlash-q8_0.gguf",
+        approx_total_gb=18,
+        description="Gemma 4 26B-A4B IT MoE (Q4_K_M) + Lucebox DFlash q8_0 draft.",
+    ),
+    "gemma-4-31b": ModelPreset(
+        name="gemma-4-31b",
+        target_repo="bartowski/google_gemma-4-31B-it-GGUF",
+        target_file="google_gemma-4-31B-it-Q4_K_M.gguf",
+        draft_repo="Lucebox/gemma-4-31B-it-DFlash-GGUF",
+        draft_file="gemma-4-31B-it-DFlash-q8_0.gguf",
+        approx_total_gb=21,
+        description="Gemma 4 31B IT dense (Q4_K_M) + Lucebox DFlash q8_0 draft.",
+    ),
+    "laguna-xs.2": ModelPreset(
+        name="laguna-xs.2",
+        target_repo="Lucebox/Laguna-XS.2-GGUF",
+        target_file="laguna-xs2-Q4_K_M.gguf",
+        # Laguna's DFlash speculator is safetensors-format
+        # (poolside/Laguna-XS.2-speculator.dflash), downloaded manually
+        # into models/draft/laguna-xs2-speculator/. The download command
+        # doesn't fetch it automatically — it's opt-in. When present,
+        # speculator_dir wires it into DFLASH_DRAFT at server launch.
+        draft_repo=None,
+        draft_file=None,
+        speculator_dir="laguna-xs2-speculator",
+        approx_total_gb=20,
+        description=(
+            "Laguna-XS.2 MoE code model (Q4_K_M). "
+            "DFlash safetensors speculator in draft/laguna-xs2-speculator/ "
+            "is used automatically when present."
+        ),
+    ),
+    "qwen3.6-moe": ModelPreset(
+        name="qwen3.6-moe",
+        target_repo="unsloth/Qwen3.6-35B-A3B-GGUF",
+        # Unsloth's MoE repo publishes both a "UD" (dynamic) and a plain
+        # Q4_K_M family. Verified 2026-05-28 via HfApi.repo_info: the
+        # `-UD-Q4_K_M.gguf` variant (22.1 GB) is the canonical Q4_K_M
+        # release — there is no plain `Q4_K_M.gguf` on the MoE repo.
+        target_file="Qwen3.6-35B-A3B-UD-Q4_K_M.gguf",
+        # No DFlash draft GGUF has been published for the MoE variant
+        # (probed Lucebox/* and spiritbuun/* repos 2026-05-28 — none
+        # exist). Target-only, mirroring laguna-xs.2's wiring. The
+        # lucebox C++ server speaks the `qwen35moe` arch natively
+        # (server/src/qwen35moe/) so this runs without a draft.
+        draft_repo=None,
+        draft_file=None,
+        approx_total_gb=22,
+        description=(
+            "Qwen3.6 35B-A3B MoE (3B active per token), Q4_K_M unsloth "
+            "dynamic quant. Target-only — no DFlash MoE draft published "
+            "yet. Uses lucebox's qwen35moe arch backend."
+        ),
+    ),
+}
+# Fallback for resolve_preset / download_preset / status when no preset is given.
+DEFAULT_PRESET = PRESETS["qwen3.6-27b"]
+
+
+def resolve_preset(name: str | None) -> ModelPreset:
+    """Map a preset name to its :class:`ModelPreset`.
+
+    A falsy ``name`` (``None`` or ``""``) yields :data:`DEFAULT_PRESET`.
+    An unregistered name raises :class:`KeyError` whose message lists
+    every known preset in sorted order.
+    """
+    if not name:
+        return DEFAULT_PRESET
+    preset = PRESETS.get(name)
+    if preset is not None:
+        return preset
+    # The registry is small, so list all names rather than fuzzy-match.
+    listing = ", ".join(sorted(PRESETS))
+    raise KeyError(f"unknown preset {name!r}. Known presets: {listing}")
+
+
+def _file_meta(api: HfApi, repo_id: str, filename: str) -> tuple[int, str | None]:
+    """Look up ``filename`` in ``repo_id``; return ``(size, lfs_sha256 or None)``."""
+    siblings = api.model_info(repo_id, files_metadata=True).siblings or []
+    entry = next((s for s in siblings if s.rfilename == filename), None)
+    if entry is None:
+        raise FileNotFoundError(f"{filename} not present in repo {repo_id}")
+    lfs = entry.lfs
+    return int(entry.size or 0), (getattr(lfs, "sha256", None) if lfs else None)
+
+
+def _sha256(path: Path, chunk_mb: int = 16) -> str:
+    """Hex SHA-256 of the file at ``path``, read ``chunk_mb`` MiB at a time."""
+    digest = hashlib.sha256()
+    with path.open("rb") as stream:
+        for block in iter(lambda: stream.read(chunk_mb * 1024 * 1024), b""):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def _local_matches(path: Path, size: int, sha256: str | None, console: Console) -> bool:
+    """Report whether ``path`` is already a valid copy of the wanted file.
+
+    Cheap checks run first: existence, then byte size. The sha256 pass
+    (roughly 30 s for 17 GB on a fast SSD) runs only when the repo
+    published a hash; without one, a size match is accepted. Each
+    mismatch is reported on ``console`` before returning ``False``.
+    """
+    if not path.exists():
+        return False
+    on_disk = path.stat().st_size
+    if on_disk != size:
+        console.print(
+            f"  [yellow]✗[/yellow] {path.name} present but size {on_disk:,} != "
+            f"expected {size:,} — will re-download"
+        )
+        return False
+    if not sha256:
+        return True
+    console.print(f"  [dim]verifying sha256 of {path.name} ({on_disk / 1e9:.1f} GB)…[/dim]")
+    digest = _sha256(path)
+    if digest != sha256:
+        console.print(
+            f"  [yellow]✗[/yellow] {path.name} sha256 {digest[:12]}… != "
+            f"expected {sha256[:12]}… — will re-download"
+        )
+    return digest == sha256
+
+
+def _incomplete_path_candidates(local_dir: Path, filename: str, etag: str | None) -> list[Path]:
+    """List the places where the in-flight partial download may live.
+
+    With ``local_dir`` set, huggingface_hub 1.x stages a download under
+    ``{local_dir}/.cache/huggingface/download/`` using a *hashed* name,
+    ``{short_hash(metadata_filename)}.{etag}.incomplete``. Polling a
+    naive ``{filename}.incomplete`` therefore never observes growth and
+    the progress bar would sit at 0 % for the whole transfer.
+
+    Returned, in order:
+
+    1. the exact staging file from ``incomplete_path(etag)`` — only when
+       ``etag`` is given (callers pass the LFS sha256 for this purpose);
+    2. the staging *directory* itself, as a sentinel telling
+       ``_current_bytes`` to glob it for ``*.incomplete`` files. This
+       covers files with no known sha256 and any future change in how
+       hf-hub derives the etag.
+    """
+    staging = get_local_download_paths(local_dir, filename)
+    # Directory entry acts as a sentinel: ``_current_bytes`` globs it for
+    # ``*.incomplete`` instead of stat-ing it.
+    glob_dir = staging.metadata_path.parent
+    if not etag:
+        return [glob_dir]
+    # Exact hashed staging file first, directory fallback second.
+    exact = staging.incomplete_path(etag)
+    return [exact, glob_dir]
+
+
+def _current_bytes(target: Path, candidates: list[Path]) -> int:
+    """Best-effort size in bytes of the file being written; 0 if unknown."""
+    if target.exists():
+        try:
+            return target.stat().st_size
+        except OSError:
+            pass
+    for cand in candidates:
+        # Plain path: report its own size as soon as it exists.
+        if not cand.is_dir():
+            try:
+                if cand.exists():
+                    return cand.stat().st_size
+            except OSError:
+                pass
+            continue
+        # Directory: report the largest ``*.incomplete`` inside, if non-zero.
+        biggest = 0
+        try:
+            for part in cand.glob("*.incomplete"):
+                try:
+                    biggest = max(biggest, part.stat().st_size)
+                except OSError:
+                    pass
+        except OSError:
+            continue
+        if biggest:
+            return biggest
+    return 0
+
+
+def _download_with_progress(
+    repo_id: str,
+    filename: str,
+    local_dir: Path,
+    expected_size: int,
+    console: Console,
+    etag: str | None = None,
+) -> Path:
+    """Download one HF file while drawing a Rich progress bar.
+
+    ``hf_hub_download`` runs in a daemon worker thread; this thread polls
+    ``_current_bytes`` every 0.5 s and feeds the result to the bar, capped
+    at ``expected_size``. ``etag`` only helps locate the staging file.
+    Any exception raised in the worker is re-raised here once it exits;
+    returns the path reported by ``hf_hub_download``.
+    """
+    local_dir.mkdir(parents=True, exist_ok=True)
+    target = local_dir / filename
+    candidates = _incomplete_path_candidates(local_dir, filename, etag)
+
+    result: list[str | None] = [None]
+    error: list[BaseException | None] = [None]
+
+    def _worker() -> None:
+        try:
+            result[0] = hf_hub_download(
+                repo_id=repo_id,
+                filename=filename,
+                local_dir=str(local_dir),
+            )
+        except BaseException as exc:  # stored here, re-raised by the caller thread
+            error[0] = exc
+
+    t = threading.Thread(target=_worker, daemon=True)
+    t.start()
+
+    with Progress(
+        TextColumn("[cyan]{task.description}"),
+        BarColumn(bar_width=40),
+        DownloadColumn(),
+        TransferSpeedColumn(),
+        TimeRemainingColumn(),
+        console=console,
+        transient=False,
+    ) as progress:
+        task = progress.add_task(filename, total=expected_size or 1)
+        while t.is_alive():
+            current = _current_bytes(target, candidates)
+            # Update on every poll, even at 0 bytes, so Rich keeps
+            # repainting the speed/ETA columns and the UI looks alive from
+            # the first tick. `or current or 1` covers an unknown size (0).
+            progress.update(task, completed=min(current, expected_size or current or 1))
+            time.sleep(0.5)
+        # Final tick once the worker exits: report the finished file's real size.
+        if target.exists():
+            progress.update(task, completed=target.stat().st_size)
+
+    t.join(timeout=5)
+    if error[0] is not None:
+        raise error[0]
+    if result[0] is None:
+        raise RuntimeError(f"hf_hub_download returned no path for {filename}")
+    return Path(result[0])
+
+
+def _fetch(api: HfApi, repo_id: str, filename: str, local_dir: Path, console: Console) -> Path:
+    """Ensure ``local_dir / filename`` holds a verified copy of the repo file.
+
+    Repo metadata (size, LFS sha256) is fetched first. A local file that
+    already matches is reused as-is; anything else triggers a download
+    with a progress bar. Returns the path of the usable file.
+    """
+    expected_size, lfs_sha = _file_meta(api, repo_id, filename)
+    existing = local_dir / filename
+    if not _local_matches(existing, expected_size, lfs_sha, console):
+        # The LFS sha256 is forwarded as ``etag``: it names the staging
+        # file ({local_dir}/.cache/huggingface/download/{hash}.{etag}.incomplete)
+        # that the progress poller watches for real byte counts.
+        return _download_with_progress(
+            repo_id, filename, local_dir, expected_size, console, etag=lfs_sha
+        )
+    console.print(f"  [green]✓[/green] {filename} already present (size + sha256 match)")
+    return existing
+
+
+def download_preset(cfg: Config, preset: ModelPreset | None = None) -> int:
+    """Download a preset's target GGUF and, if it has one, its DFlash draft.
+
+    The target goes to ``cfg.models_dir`` and the draft to its ``draft``
+    subdirectory; both directories are created up front. Each file is
+    checked against the repo's size and LFS sha256 first, so a re-run
+    with everything already on disk only re-verifies.
+
+    ``preset=None`` means :data:`DEFAULT_PRESET`; presets without a draft
+    fetch the target only. Returns 0 on success, 1 if a fetch raised.
+    """
+    chosen = preset or DEFAULT_PRESET
+    console = Console()
+    api = HfApi()
+    target_dir = cfg.models_dir
+    target_dir.mkdir(parents=True, exist_ok=True)
+    draft_dir = target_dir / "draft"
+    draft_dir.mkdir(exist_ok=True)
+
+    try:
+        _fetch(api, chosen.target_repo, chosen.target_file, target_dir, console)
+        if not chosen.has_draft:
+            console.print(
+                f"  [dim]no DFlash draft published for {chosen.name} — running target-only[/dim]"
+            )
+            return 0
+        # has_draft guarantees both fields are set; the assert narrows
+        # the optionals for the type-checker.
+        assert chosen.draft_repo is not None and chosen.draft_file is not None
+        _fetch(api, chosen.draft_repo, chosen.draft_file, draft_dir, console)
+    except Exception as exc:
+        console.print(f"[red]download failed:[/red] {exc}")
+        return 1
+    return 0
+
+
+def _local_target_path(cfg: Config, preset: ModelPreset) -> Path:
+    return cfg.models_dir.joinpath(preset.target_file)
+
+
+def _local_draft_path(cfg: Config, preset: ModelPreset) -> Path | None:
+    """Expected on-disk location of the preset's draft, or ``None`` if it has none."""
+    name = preset.draft_file if preset.has_draft else None
+    return cfg.models_dir / "draft" / name if name else None
+
+
+def installed_status(cfg: Config, preset: ModelPreset) -> str:
+    """Classify a preset as ``"installed"``, ``"partial"`` or ``"absent"``.
+
+    Existence-only: neither file size nor hash is checked. A preset with
+    no draft is ``"installed"`` as soon as its target exists; otherwise
+    both files are required and having exactly one of them yields
+    ``"partial"``.
+    """
+    target_ok = _local_target_path(cfg, preset).exists()
+    draft_path = _local_draft_path(cfg, preset)
+    # No draft expected: mirror the target so it alone decides.
+    draft_ok = target_ok if draft_path is None else draft_path.exists()
+    if target_ok and draft_ok:
+        return "installed"
+    if target_ok or draft_ok:
+        return "partial"
+    return "absent"
+
+
+def installed_size_gb(cfg: Config, preset: ModelPreset) -> float:
+    """Total on-disk size of the preset's files in decimal GB (bytes / 1e9).
+
+    Files that are missing, or whose ``stat`` fails, count as 0 bytes.
+    """
+    # The draft entry is None for presets without a draft.
+    wanted = (_local_target_path(cfg, preset), _local_draft_path(cfg, preset))
+    byte_total = 0
+    for path in wanted:
+        if path is None or not path.exists():
+            continue
+        try:
+            byte_total += path.stat().st_size
+        except OSError:
+            pass
+    return byte_total / 1e9
+
+
+def installed_presets(cfg: Config) -> list[ModelPreset]:
+    """List the fully installed presets, ordered by preset name.
+
+    A preset qualifies only when ``installed_status`` reports
+    ``"installed"``; partial installs (e.g. target without its draft)
+    are left out so the default ``lucebox models`` view stays
+    uncluttered.
+    """
+    return [
+        PRESETS[key]
+        for key in sorted(PRESETS)
+        if installed_status(cfg, PRESETS[key]) == "installed"
+    ]
+
+
+def status(cfg: Config, preset: ModelPreset | None = None) -> dict[str, bool]:
+    """Report which of the preset's files are already on disk.
+
+    A file is "present" when it exists locally with exactly the size the
+    repo reports (no sha256); any error during the lookup counts as not
+    present. Presets without a draft report ``draft_present=True`` —
+    nothing to fetch means nothing is missing.
+    """
+    chosen = preset or DEFAULT_PRESET
+    api = HfApi()
+
+    def _present(repo: str, filename: str, in_draft: bool) -> bool:
+        try:
+            # Everything, path building included, stays inside the try.
+            expected, _ = _file_meta(api, repo, filename)
+            base = cfg.models_dir / "draft" if in_draft else cfg.models_dir
+            local = base / filename
+            return local.exists() and local.stat().st_size == expected
+        except Exception:
+            return False
+
+    report = {"target_present": _present(chosen.target_repo, chosen.target_file, False)}
+    if chosen.has_draft:
+        # has_draft guarantees both draft fields; the assert narrows types.
+        assert chosen.draft_repo is not None and chosen.draft_file is not None
+        report["draft_present"] = _present(chosen.draft_repo, chosen.draft_file, True)
+    else:
+        report["draft_present"] = True
+    return report
+
+
+def recommend_preset(host: HostFacts) -> str | None:
+    """Suggest a first-run preset name from the host's VRAM.
+
+    * 22 GB or more  → ``"qwen3.6-27b"`` (the Lucebox default)
+    * 16 GB to 21 GB → ``"laguna-xs.2"`` (small, target-only)
+    * under 16 GB    → ``None``: no automatic pick, the user chooses.
+    """
+    # (preset name, minimum VRAM in GB), highest floor first.
+    tiers = (("qwen3.6-27b", 22), ("laguna-xs.2", 16))
+    for preset_name, min_vram_gb in tiers:
+        if host.vram_gb >= min_vram_gb:
+            return preset_name
+    return None
diff --git a/lucebox/src/lucebox/host_check.py b/lucebox/src/lucebox/host_check.py
new file mode 100644
index 000000000..2ce8d3889
--- /dev/null
+++ b/lucebox/src/lucebox/host_check.py
@@ -0,0 +1,232 @@
+"""Readiness check: aggregate HostFacts (provided by lucebox.sh) with the
+docker-daemon checks we can do from inside the container via the mounted
+socket. Prints a status report and returns an aggregate severity.
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Literal
+
+from rich.console import Console
+
+from lucebox.types import HostFacts
+
+Severity = Literal["ok", "warn", "fail"]
+_SEVERITY_ORDER: dict[Severity, int] = {"ok": 0, "warn": 1, "fail": 2}
+
+
+@dataclass(frozen=True, slots=True)
+class CheckResult:
+    name: str  # short check label printed in the report, e.g. "docker"
+    severity: Severity  # "ok", "warn" or "fail"
+    message: str  # one-line finding printed next to the label
+    hint: str | None = None  # optional remediation text, printed dimmed below
+
+
+def run_checks(host: HostFacts) -> list[CheckResult]:
+    """Run every host readiness check against ``host``.
+
+    Results come back in report order: docker, GPU/driver, container
+    toolkit, RAM, VRAM, systemd.
+    """
+    checks = [_check_docker, _check_nvidia_driver, _check_ctk]
+    checks += [_check_ram, _check_vram, _check_systemd]
+    return [check(host) for check in checks]
+
+
+def _check_docker(host: HostFacts) -> CheckResult:
+    """Check that the docker daemon is reachable.
+
+    Fails with a start/permissions hint when it is not.
+    """
+    if host.has_docker:
+        return CheckResult("docker", "ok", f"daemon reachable ({host.docker_version})")
+    hint = "sudo systemctl start docker, or add your user to the 'docker' group"
+    return CheckResult("docker", "fail", "docker daemon unreachable", hint)
+
+
+def _check_nvidia_driver(host: HostFacts) -> CheckResult:
+    """Check the GPU vendor and the NVIDIA driver version.
+
+    Non-NVIDIA hosts fail under the ``gpu`` label; AMD also gets a hint
+    to build from source. On NVIDIA, an empty driver version warns and a
+    major version below 525 fails, both under the ``driver`` label.
+    """
+    if host.gpu_vendor == "amd":
+        return CheckResult(
+            "gpu",
+            "fail",
+            "AMD GPU detected — prebuilt images are NVIDIA-only",
+            "Build dflash from source with HIP; see dflash/README.md",
+        )
+    if host.gpu_vendor != "nvidia":
+        return CheckResult("gpu", "fail", "no NVIDIA GPU detected")
+    major = host.driver_major
+    if not host.driver_version:
+        problem = "nvidia-smi present but NVML query failed (likely driver/library mismatch)"
+        fix = "reboot, or reinstall the matching NVIDIA driver"
+        return CheckResult("driver", "warn", problem, fix)
+    if major < 525:
+        too_old = f"driver r{major} too old (need r525+ for cuda12)"
+        return CheckResult("driver", "fail", too_old, "upgrade the NVIDIA driver")
+    return CheckResult("driver", "ok", f"nvidia r{major} ({host.driver_version})")
+
+
+def _check_ctk(host: HostFacts) -> CheckResult:
+    """Report how the NVIDIA Container Toolkit is exposed to docker.
+
+    ``runtime``/``cdi`` pass, ``installed-unwired`` warns with the wiring
+    command, anything else fails with a link to the install guide.
+    """
+    state = host.ctk
+    if state == "runtime":
+        return CheckResult("ctk", "ok", "NVIDIA Container Toolkit registered as docker runtime")
+    if state == "cdi":
+        return CheckResult("ctk", "ok", "NVIDIA Container Toolkit available via CDI")
+    if state == "installed-unwired":
+        return CheckResult(
+            "ctk",
+            "warn",
+            "NVIDIA Container Toolkit installed but not wired into docker",
+            "sudo nvidia-ctk runtime configure --runtime=docker && "
+            "sudo systemctl restart docker",
+        )
+    guide = "https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html"
+    return CheckResult("ctk", "fail", "NVIDIA Container Toolkit not installed", guide)
+
+
+def _check_ram(host: HostFacts) -> CheckResult:
+    ram = host.ram_gb  # 0 is reported as "unknown"
+    if ram >= 16:
+        return CheckResult("ram", "ok", f"{ram} GB RAM")
+    note = "RAM unknown" if ram == 0 else f"{ram} GB RAM — model load may swap"
+    return CheckResult("ram", "warn", note)
+
+
+def _check_vram(host: HostFacts) -> CheckResult:
+    """Grade VRAM: unknown (0) warns, <12 GB fails, <22 GB warns, else OK."""
+    vram = host.vram_gb
+    if vram == 0:
+        return CheckResult("vram", "warn", "VRAM unknown")
+    if vram >= 22:
+        return CheckResult("vram", "ok", f"VRAM {vram} GB ({host.gpu_name})")
+    if vram >= 12:
+        capped = f"VRAM {vram} GB — 27B fits but max_ctx will be capped near 32K"
+        return CheckResult("vram", "warn", capped)
+    # Anything left is a known amount below the 12 GB floor.
+    return CheckResult(
+        "vram",
+        "fail",
+        f"VRAM {vram} GB < 12 GB — 27B target won't fit",
+        "use a smaller model preset or larger GPU",
+    )
+
+
+def _check_systemd(host: HostFacts) -> CheckResult:
+    """Warn, never fail, when user systemd is unavailable."""
+    if host.has_systemd:
+        return CheckResult("systemd", "ok", "user systemd available")
+    # Missing systemd only warns; the hint names the foreground fallback.
+    workaround = (
+        "WSL: enable systemd in /etc/wsl.conf; otherwise 'lucebox serve' "
+        "still works in the foreground"
+    )
+    return CheckResult("systemd", "warn", "user systemd not available", workaround)
+
+
+def aggregate(results: list[CheckResult]) -> Severity:
+    """Return the most severe level among ``results`` (``"ok"`` when empty)."""
+    worst: Severity = "ok"
+    for level in (r.severity for r in results):
+        worst = level if _SEVERITY_ORDER[level] > _SEVERITY_ORDER[worst] else worst
+    return worst
+
+
+def render(console: Console, host: HostFacts, results: list[CheckResult]) -> Severity:
+    """Print the full status report and return the worst check severity."""
+    summary = f"[bold]Host:[/bold] {host.nproc} CPUs · {host.ram_gb} GB RAM"
+    if host.gpu_vendor == "nvidia" and host.gpu_name:
+        summary += f" · {host.gpu_name} · {host.vram_gb} GB VRAM" + (
+            f" (sm_{host.gpu_sm})" if host.gpu_sm else ""
+        )
+    if host.is_wsl:
+        summary += " · WSL2"
+    console.print(summary)
+    console.print()
+
+    # Pad the visible label, not the tagged string: Rich strips the markup
+    # when printing, so padding "[green]OK[/green]" to a fixed width gives
+    # each severity a different visible column.
+    sev_style = {"ok": ("green", "OK"), "warn": ("yellow", "WARN"), "fail": ("red", "FAIL")}
+    for r in results:
+        color, label = sev_style[r.severity]
+        console.print(f"  [{color}]{label:<4}[/{color}] {r.name:<8} {r.message}")
+        if r.hint:
+            console.print(f"  {'':<4} {'':<8} [dim]{r.hint}[/dim]")
+
+    render_host_facts(console)
+
+    worst = aggregate(results)
+    console.print()
+    if worst == "ok":
+        console.print("[green]All checks passed.[/green]")
+    elif worst == "warn":
+        console.print("[yellow]Checks passed with warnings.[/yellow]")
+    else:
+        console.print(
+            "[red]Critical checks failed — fix the issues above before 'lucebox start'.[/red]"
+        )
+    return worst
+
+
+def render_host_facts(console: Console) -> None:
+    """Print the informational "Host facts" section from ``LUCEBOX_HOST_*``.
+
+    Purely informational: nothing here feeds the pass/fail checks, so the
+    CI exit-code gate stays independent of these values. It shows the
+    same data that ends up in /opt/lucebox-hub/HOST_INFO inside the
+    container, letting the operator sanity-check the rig classification
+    before starting a long bench run.
+
+    The env vars are the ones the host wrapper exports (see
+    lucebox.sh::probe_host). The section header and every row are always
+    printed — unset values show as "(unset)" — because "no host facts
+    probed at all" is itself a useful signal.
+    """
+    console.print()
+    console.print("[bold]Host facts[/bold] (LUCEBOX_HOST_*, surfaced as /props.host)")
+    # (row label, env var) pairs in display order.
+    rows = (
+        ("os", "LUCEBOX_HOST_OS_PRETTY"),
+        ("kernel", "LUCEBOX_HOST_KERNEL"),
+        ("wsl_version", "LUCEBOX_HOST_WSL_VERSION"),
+        ("docker", "LUCEBOX_HOST_DOCKER_VERSION"),
+        ("nvidia_driver", "LUCEBOX_HOST_DRIVER_VERSION"),
+        ("nvidia_ctk", "LUCEBOX_HOST_NVIDIA_CTK_VERSION"),
+        ("cpu", "LUCEBOX_HOST_CPU_MODEL"),
+        ("cuda_visible_devices", "LUCEBOX_HOST_CUDA_VISIBLE_DEVICES"),
+    )
+    for label, env_key in rows:
+        value = os.environ.get(env_key, "")
+        shown = value or "[dim](unset)[/dim]"
+        console.print(f"  {label:<22} {shown}")
+
+    # One line per GPU. LUCEBOX_HOST_GPU_LIST_CSV carries the nvidia-smi
+    # CSV rows exactly as the host wrapper captured them.
+    gpu_csv = os.environ.get("LUCEBOX_HOST_GPU_LIST_CSV", "")
+    # Empty or unset CSV: print a single placeholder row and stop.
+    if not gpu_csv:
+        console.print("  gpus                  [dim](none — nvidia-smi unavailable)[/dim]")
+        return
+    console.print("  gpus:")
+    for row in filter(None, map(str.strip, gpu_csv.splitlines())):
+        cols = [cell.strip() for cell in row.split(",")]
+        # Rows with fewer than 7 columns are echoed verbatim (stripped).
+        if len(cols) < 7:
+            console.print(f"    {row}")
+            continue
+        # Only idx, name, sm, mem and plimit are shown; uuid/pci are dropped.
+        idx, _uuid, _pci, name, sm, mem, plimit = cols[:7]
+        console.print(f"    [{idx}] {name} (sm_{sm}, {mem}, {plimit})")
diff --git a/lucebox/src/lucebox/host_facts.py b/lucebox/src/lucebox/host_facts.py
new file mode 100644
index 000000000..5deb6721a
--- /dev/null
+++ b/lucebox/src/lucebox/host_facts.py
@@ -0,0 +1,58 @@
+"""Read HostFacts from the LUCEBOX_HOST_* env vars that lucebox.sh exports.
+
+We deliberately don't try to detect anything ourselves on the Python side —
+inside the container, /proc/meminfo reports the container's view, not the
+host's, and nvidia-smi may or may not be available depending on how the
+caller invoked us. The host wrapper is the only thing that can see the
+truth, and it's already paid for the probe.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import cast
+
+from lucebox.types import CtkStatus, GpuVendor, HostFacts
+
+
+def _env_int(key: str, default: int = 0) -> int:
+    """Read env var ``key`` as an int; ``default`` if unset, blank or not an integer."""
+    text = os.environ.get(key, "").strip()
+    try:
+        # int("") also raises ValueError, covering the unset/blank case.
+        return int(text)
+    except ValueError:
+        return default
+
+
+def _env_bool(key: str) -> bool:  # truthy spellings are matched case-insensitively
+    return os.environ.get(key, "").strip().lower() in {"1", "true", "yes", "on"}
+
+
+def from_env() -> HostFacts:
+    """Build :class:`HostFacts` from the ``LUCEBOX_HOST_*`` environment.
+
+    Missing or unparseable values fall back to 0 / "" / False, and an
+    unrecognized GPU vendor or CTK status is treated as "none".
+    """
+    env = os.environ.get
+    raw_vendor = env("LUCEBOX_HOST_GPU_VENDOR", "none")
+    raw_ctk = env("LUCEBOX_HOST_HAS_CTK", "none")
+    vendor_ok = raw_vendor in {"nvidia", "amd", "none"}
+    ctk_ok = raw_ctk in {"runtime", "cdi", "installed-unwired", "none"}
+    return HostFacts(
+        nproc=_env_int("LUCEBOX_HOST_NPROC"),
+        ram_gb=_env_int("LUCEBOX_HOST_RAM_GB"),
+        gpu_vendor=cast(GpuVendor, raw_vendor) if vendor_ok else "none",
+        gpu_name=env("LUCEBOX_HOST_GPU_NAME", ""),
+        gpu_count=_env_int("LUCEBOX_HOST_GPU_COUNT"),
+        vram_gb=_env_int("LUCEBOX_HOST_VRAM_GB"),
+        gpu_sm=env("LUCEBOX_HOST_GPU_SM", ""),
+        driver_version=env("LUCEBOX_HOST_DRIVER_VERSION", ""),
+        driver_major=_env_int("LUCEBOX_HOST_DRIVER_MAJOR"),
+        has_systemd=_env_bool("LUCEBOX_HOST_HAS_SYSTEMD"),
+        is_wsl=_env_bool("LUCEBOX_HOST_IS_WSL"),
+        has_docker=_env_bool("LUCEBOX_HOST_HAS_DOCKER"),
+        docker_version=env("LUCEBOX_HOST_DOCKER_VERSION", ""),
+        ctk=cast(CtkStatus, raw_ctk) if ctk_ok else "none",
+    )
diff --git a/lucebox/src/lucebox/py.typed b/lucebox/src/lucebox/py.typed
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/lucebox/src/lucebox/py.typed
@@ -0,0 +1 @@
+
diff --git a/lucebox/src/lucebox/types.py b/lucebox/src/lucebox/types.py
new file mode 100644
index 000000000..e1d3620d7
--- /dev/null
+++ b/lucebox/src/lucebox/types.py
@@ -0,0 +1,140 @@
+"""Shared dataclasses passed between modules.
+
+HostFacts is populated from the LUCEBOX_HOST_* env vars set by lucebox.sh.
+Config is what we serialize to/from .lucebox/config.toml. Both are frozen so
+mistakes (e.g. mutating a config after autotune wrote it) fail loudly.
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+Variant = str
+CtkStatus = Literal["runtime", "cdi", "installed-unwired", "none"]
+
+
+def default_models_dir() -> Path:
+    """Return the default model store: ``$XDG_DATA_HOME/lucebox/models``.
+
+    Follows the XDG Base Directory convention for per-user data; an unset
+    or empty $XDG_DATA_HOME falls back to ~/.local/share, which keeps
+    downloads out of $HOME itself. The host wrapper bind-mounts this path
+    into the container so paths line up in and out of the image.
+    """
+    data_home = os.environ.get("XDG_DATA_HOME")
+    root = Path(data_home) if data_home else Path.home() / ".local" / "share"
+    return root / "lucebox" / "models"
+
+
+GpuVendor = Literal["nvidia", "amd", "none"]
+
+
+@dataclass(frozen=True, slots=True)
+class HostFacts:
+    """Probed once by lucebox.sh, passed in via LUCEBOX_HOST_* env vars. Single
+    source of truth on the Python side — never reprobed (we can't see host /proc)."""
+
+    nproc: int = 0  # LUCEBOX_HOST_NPROC
+    ram_gb: int = 0  # LUCEBOX_HOST_RAM_GB
+    gpu_vendor: GpuVendor = "none"  # LUCEBOX_HOST_GPU_VENDOR
+    gpu_name: str = ""  # LUCEBOX_HOST_GPU_NAME
+    gpu_count: int = 0  # LUCEBOX_HOST_GPU_COUNT
+    vram_gb: int = 0  # LUCEBOX_HOST_VRAM_GB
+    gpu_sm: str = ""  # e.g. "120" — matches docker-bake arch lists
+    driver_version: str = ""  # e.g. "595.71.05"
+    driver_major: int = 0  # LUCEBOX_HOST_DRIVER_MAJOR
+    has_systemd: bool = False  # LUCEBOX_HOST_HAS_SYSTEMD
+    is_wsl: bool = False  # LUCEBOX_HOST_IS_WSL
+    has_docker: bool = False  # LUCEBOX_HOST_HAS_DOCKER
+    docker_version: str = ""  # LUCEBOX_HOST_DOCKER_VERSION
+    ctk: CtkStatus = "none"  # LUCEBOX_HOST_HAS_CTK (note: not ..._CTK)
+
+
+@dataclass(frozen=True, slots=True)
+class DflashRuntime:
+    """The DFLASH_* knobs as typed values. Serialized under [dflash] in TOML
+    and emitted as -e DFLASH_FOO=bar args to docker run.
+
+    The 11 fields below (budget through prefill_drafter) form the strict
+    allowlist mirrored by lucebench's snapshot config.json — keep both
+    in lockstep. ``think_max`` is a separate phase-1 thinking cap that
+    isn't part of the runtime snapshot allowlist (it's per-request, not
+    per-server).
+    """
+
+    budget: int = 22
+    max_ctx: int = 16384
+    lazy: bool = False
+    prefix_cache_slots: int = 0
+    prefill_cache_slots: int = 0
+    cache_type_k: str = ""
+    cache_type_v: str = ""
+    prefill_mode: Literal["off", "auto", "always"] = "off"
+    prefill_keep_ratio: float = 0.05
+    prefill_threshold: int = 32000
+    prefill_drafter: str = ""
+    # Phase-1 (thinking) cap when a request opts into thinking. Default mirrors
+    # antirez/ds4 ds4_eval.c: think_max_tokens = max_tokens - hard_limit_reply
+    # budget = 16000 - 512 = 15488. The server's own hardcoded default is 10000.
+    think_max: int = 15488
+    # Flash-attention sliding-window on full-attention layers. 0 = full
+    # attention (server default). On gemma4's hybrid iSWA the full-attn
+    # layers grow KV linearly with max_ctx; a sparse fa_window keeps
+    # decode compute bounded on long prompts without changing the KV
+    # footprint. Passed through to the server's `--fa-window <N>`
+    # flag (see server/src/server/server_main.cpp).
+    fa_window: int = 0
+    # Soft-close thinking termination dial (PR #326 in lucebox-hub).
+    # Lets the AR loop force </think> early when the close-token logit
+    # comes within this probability ratio of the chosen-token logit.
+    # Range [0.0, 1.0]; 0.0 = disabled (byte-identical to pre-change
+    # behaviour). 0.5 = close when close-token prob >= 0.5 * chosen-token
+    # prob; 0.9 = aggressive. Qwen3.5/3.6 AR path only in v1. Surfaced
+    # to the server via DFLASH_THINK_SOFT_CLOSE_MIN_RATIO →
+    # --think-soft-close-min-ratio.
+    think_soft_close_min_ratio: float = 0.0
+    # Diagnostic: when True, surface --debug-thinking-logits to the
+    # server CLI via DFLASH_DEBUG_THINKING_LOGITS=1, producing one
+    # stderr line per thinking AR step recording the close-vs-chosen
+    # logit gap. Used to fit a sliding-ratio curve from real trajectory
+    # data. Heavy stderr (one line per thinking token across all
+    # in-flight requests); leave off in production.
+    debug_thinking_logits: bool = False
+
+
+@dataclass(frozen=True, slots=True)
+class ModelMeta:
+    """Which preset the operator picked at configure/download time.
+
+    Persisted under ``[model]`` in config.toml so `lucebox serve` can
+    pass ``DFLASH_TARGET=/opt/lucebox-hub/server/models/<file>`` and
+    ``DFLASH_DRAFT`` for the draft GGUF (when one is published for the
+    preset). The entrypoint's "multiple candidate GGUFs" branch never
+    has to guess which one to load.
+
+    ``target_file`` and ``draft_file`` are advanced overrides — when set
+    they win over the preset's registry default. Empty strings mean
+    "fall back to the registry value for [model] preset, then to the
+    entrypoint's autodetect".
+    """
+
+    preset: str = ""  # registry preset name; "" when none was picked
+    target_file: str = ""  # "" → fall back to the preset's registry value
+    draft_file: str = ""  # "" → fall back to the preset's registry value
+
+
+@dataclass(frozen=True, slots=True)
+class Config:
+    """The whole config.toml, materialized (frozen: build a new one to change it)."""
+
+    variant: Variant = "cuda12"
+    image: str = "ghcr.io/luce-org/lucebox-hub"
+    container_name: str = "lucebox"
+    port: int = 8080
+    models_dir: Path = field(default_factory=default_models_dir)  # resolved per instance
+    dflash: DflashRuntime = field(default_factory=DflashRuntime)
+    host: HostFacts = field(default_factory=HostFacts)
+    model: ModelMeta = field(default_factory=ModelMeta)
diff --git a/lucebox/tests/test_autotune.py b/lucebox/tests/test_autotune.py
new file mode 100644
index 000000000..c47c17c9e
--- /dev/null
+++ b/lucebox/tests/test_autotune.py
@@ -0,0 +1,48 @@
+from lucebox.autotune import runtime_from_host
+from lucebox.types import HostFacts
+
+
+def test_wsl_24gb_defaults_leave_cuda_headroom() -> None:
+    """A 24 GB WSL host gets budget 16 and a 98304-token context."""
+    tuned = runtime_from_host(HostFacts(vram_gb=24, is_wsl=True))
+    assert tuned.budget == 16
+    # Bumped 65536 → 98304 on 2026-05-30 after the gemma4-26b coding-
+    # agent-loop sweep proved 98K serves 90K-token agentic prompts
+    # with ~3 GB VRAM headroom and no CUDA VMM failures on the 3090 Ti
+    # WSL configuration (see
+    # docs/experiments/gemma4-26b-coding-agent-loop-sweep-2026-05-30.md).
+    assert tuned.max_ctx == 98304
+    # lazy is False because the heuristic path does NOT set prefill_drafter,
+    # and the C++ server silently ignores --lazy-draft without it. Flipping
+    # to False makes the host config match runtime behaviour. See the
+    # `entrypoint.sh` warning emitted when the two are out-of-sync.
+    assert tuned.lazy is False
+    assert tuned.prefix_cache_slots == 0
+
+
+def test_native_24gb_caps_context_below_vmm_failure_boundary() -> None:
+    """Native (non-WSL) 24 GB host: budget 22, 98304-token context, lazy off."""
+    tuned = runtime_from_host(HostFacts(vram_gb=24, is_wsl=False))
+    assert tuned.budget == 22
+    assert tuned.max_ctx == 98304
+    assert tuned.lazy is False  # heuristic path never sets prefill_drafter
+    assert tuned.prefix_cache_slots == 0
+
+
+def test_no_heuristic_tier_sets_lazy_without_prefill_drafter() -> None:
+    """Regression for the `--lazy-draft ignored` silent no-op.
+
+    The C++ dflash_server drops `--lazy-draft` unless `--prefill-drafter`
+    is passed as well. The heuristic never sets `prefill_drafter`, so a
+    tier with `lazy=True` would yield a host config that differs from
+    what actually ran — the mismatch the sindri decode sweep tripped
+    over (every docker.stderr contained the warning).
+    """
+    for vram in (0, 8, 16, 24, 40, 80):
+        for is_wsl in (False, True):
+            tuned = runtime_from_host(HostFacts(vram_gb=vram, is_wsl=is_wsl))
+            # lazy may only be on when a prefill drafter accompanies it.
+            assert not tuned.lazy or tuned.prefill_drafter, (
+                f"vram={vram} is_wsl={is_wsl}: lazy=True without "
+                f"prefill_drafter → silent no-op on the C++ server"
+            )
diff --git a/lucebox/tests/test_check.py b/lucebox/tests/test_check.py
new file mode 100644
index 000000000..3fdd469d9
--- /dev/null
+++ b/lucebox/tests/test_check.py
@@ -0,0 +1,118 @@
+"""Tests for ``lucebox check`` — readiness report.
+
+The check command has two surfaces that must stay independent:
+
+  * pass/fail checks → drive the exit code, so the command is usable
+    as a CI exit-code gate;
+  * Host facts section → informational, prints the LUCEBOX_HOST_*
+    convoy that gets baked into /opt/lucebox-hub/HOST_INFO inside
+    the container.
+"""
+
+from __future__ import annotations
+
+import pytest
+from lucebox.cli import app
+from lucebox.types import HostFacts
+from rich.console import Console
+from typer.testing import CliRunner
+
+from lucebox import host_check
+
+
+def test_check_prints_host_facts_section(monkeypatch: pytest.MonkeyPatch) -> None:
+    """`lucebox check` includes a Host facts block sourced from LUCEBOX_HOST_*."""
+    monkeypatch.setenv("LUCEBOX_HOST_OS_PRETTY", "Ubuntu 22.04.3 LTS")
+    monkeypatch.setenv("LUCEBOX_HOST_KERNEL", "6.6.87.2-microsoft-standard-WSL2")
+    monkeypatch.setenv("LUCEBOX_HOST_WSL_VERSION", "wsl2")
+    monkeypatch.setenv("LUCEBOX_HOST_DOCKER_VERSION", "29.1.3")
+    monkeypatch.setenv("LUCEBOX_HOST_DRIVER_VERSION", "596.36")
+    monkeypatch.setenv("LUCEBOX_HOST_NVIDIA_CTK_VERSION", "1.16.2")
+    monkeypatch.setenv("LUCEBOX_HOST_CPU_MODEL", "Intel Test CPU")
+    monkeypatch.setenv(
+        "LUCEBOX_HOST_GPU_LIST_CSV",
+        "0, GPU-abc, 00000000:01:00.0, NVIDIA RTX 5090, 12.0, 24576 MiB, 175.00 W",
+    )
+    # Stub HostFacts with a healthy host so the pass/fail checks succeed.
+    # `cli.check` imports `from_env` into its module namespace, so both
+    # the defining module and the `lucebox.cli` binding are patched.
+    def stub() -> HostFacts:
+        return HostFacts(
+            nproc=24,
+            ram_gb=64,
+            gpu_vendor="nvidia",
+            gpu_name="NVIDIA RTX 5090",
+            gpu_count=1,
+            vram_gb=24,
+            gpu_sm="120",
+            driver_version="596.36",
+            driver_major=596,
+            has_systemd=True,
+            is_wsl=True,
+            has_docker=True,
+            docker_version="29.1.3",
+            ctk="runtime",
+        )
+    monkeypatch.setattr("lucebox.host_facts.from_env", stub)
+    monkeypatch.setattr("lucebox.cli.from_env", stub)
+    result = CliRunner().invoke(app, ["check"])
+    # The pass/fail half of `check` should still exit 0 on this stubbed host.
+    assert result.exit_code == 0, result.stdout
+    assert "Host facts" in result.stdout
+    assert "Ubuntu 22.04.3 LTS" in result.stdout
+    assert "wsl2" in result.stdout
+    assert "1.16.2" in result.stdout
+    assert "Intel Test CPU" in result.stdout
+    # GPU name — present in both the stub and LUCEBOX_HOST_GPU_LIST_CSV.
+    assert "NVIDIA RTX 5090" in result.stdout
+
+
+def test_render_host_facts_unset_env_shows_placeholders(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """All LUCEBOX_HOST_* unset → section still renders with explicit (unset) markers."""
+    import os  # function-local: this module has no top-level `os` import
+    for name in [k for k in os.environ if k.startswith("LUCEBOX_HOST_")]:
+        monkeypatch.delenv(name, raising=False)
+    console = Console(force_terminal=False, no_color=True, record=True)
+    host_check.render_host_facts(console)
+    text = console.export_text()
+    assert "Host facts" in text
+    # Multi-line section renders even when no env was passed in.
+    assert "(unset)" in text
+    assert "gpus" in text
+
+
+def test_check_exit_code_independent_of_host_facts(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """The Host facts section must leave `check`'s exit-code semantics alone.
+
+    Feeds the pass/fail logic a known-failing HostFacts (no docker) and
+    expects exit code 1 no matter what the Host facts block prints;
+    the block itself must still be rendered.
+    """
+    monkeypatch.setenv("LUCEBOX_HOST_OS_PRETTY", "Bare Linux")
+    # Frozen dataclass, so one shared instance can serve every call.
+    broken = HostFacts(
+        nproc=8,
+        ram_gb=16,
+        gpu_vendor="nvidia",
+        gpu_name="X",
+        gpu_count=1,
+        vram_gb=24,
+        gpu_sm="86",
+        driver_version="555.00",
+        driver_major=555,
+        has_systemd=False,
+        is_wsl=False,
+        has_docker=False,  # → fail
+        docker_version="",
+        ctk="none",  # also fail
+    )
+    for target in ("lucebox.host_facts.from_env", "lucebox.cli.from_env"):
+        monkeypatch.setattr(target, lambda: broken)
+    outcome = CliRunner().invoke(app, ["check"])
+    assert outcome.exit_code == 1
+    # Host facts block still printed despite the failure.
+    assert "Host facts" in outcome.stdout
diff --git a/lucebox/tests/test_cli.py b/lucebox/tests/test_cli.py
new file mode 100644
index 000000000..f7628e8b7
--- /dev/null
+++ b/lucebox/tests/test_cli.py
@@ -0,0 +1,102 @@
+"""Tests for the top-level Typer surface."""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+from lucebox.cli import app
+from typer.testing import CliRunner
+
+
+def test_config_subcommand_is_registered() -> None:
+    """`config --help` succeeds and advertises get / set / unset."""
+    helped = CliRunner().invoke(app, ["config", "--help"])
+    assert helped.exit_code == 0
+    for verb in ("get", "set", "unset"):
+        assert verb in helped.output
+
+
+def test_models_subcommand_is_registered() -> None:
+    """`models --help` succeeds and advertises list / download."""
+    helped = CliRunner().invoke(app, ["models", "--help"])
+    assert helped.exit_code == 0
+    assert all(verb in helped.output for verb in ("list", "download"))
+
+
+@pytest.mark.parametrize(
+    "verb",
+    [
+        "autotune",
+        "sweep",
+        "profile",
+        "smoke",
+        "claude",
+        "codex",
+        "opencode",
+        "hermes",
+        "pi",
+        "openclaw",
+    ],
+)
+def test_deferred_verbs_are_not_registered(verb: str) -> None:
+    """Each deferred verb (autotune/sweep, profile/smoke, client launchers)
+    is left to follow-up PRs, so `<verb> --help` must exit non-zero on this
+    core CLI (launch / serve / install / download)."""
+    result = CliRunner().invoke(app, [verb, "--help"])
+    assert result.exit_code != 0
+
+
+def test_core_verbs_present_in_app() -> None:
+    """Every core verb is still present in the Typer command table."""
+    names: set[str] = set()
+    # A command without an explicit name is known by its callback's name.
+    for cmd in app.registered_commands:
+        names.add(cmd.name or (cmd.callback.__name__ if cmd.callback else ""))
+    wanted = {"check", "pull", "print-run", "print-serve-argv", "version"}
+    assert wanted <= names, wanted - names
+
+
+def test_legacy_subcommands_are_removed() -> None:
+    """`configure` and `download-models` were folded into config/models."""
+    # Neither legacy verb may resolve any more, so `--help` has to fail.
+    for legacy in ("configure", "download-models"):
+        outcome = CliRunner().invoke(app, [legacy, "--help"])
+        assert outcome.exit_code != 0
+
+
+def test_server_run_spec_forwards_lucebox_host_env(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """server_run_spec carries LUCEBOX_HOST_* from the orchestrator into the server.
+
+    lucebox.sh exports the LUCEBOX_HOST_* convoy before `docker run` on the
+    orchestrator; the orchestrator inherits them and we forward each one
+    as ``-e KEY=VALUE`` to the server container so entrypoint.sh's
+    write_host_info() can populate /opt/lucebox-hub/HOST_INFO.
+    """
+    import lucebox.docker_run as docker_run
+    from lucebox.config import live_config
+
+    # Scrub any pre-existing LUCEBOX_HOST_* env so the test sees only what we set.
+    for stale in [k for k in os.environ if k.startswith("LUCEBOX_HOST_")]:
+        monkeypatch.delenv(stale, raising=False)
+    monkeypatch.setenv("LUCEBOX_HOST_OS_PRETTY", "Ubuntu 22.04.3 LTS")
+    monkeypatch.setenv("LUCEBOX_HOST_KERNEL", "6.6.87.2-microsoft-standard-WSL2")
+    monkeypatch.setenv("LUCEBOX_HOST_WSL_VERSION", "wsl2")
+    monkeypatch.setenv(
+        "LUCEBOX_HOST_GPU_LIST_CSV",
+        "0, GPU-x, 00000000:01:00.0, NVIDIA RTX 5090, 12.0, 24576 MiB, 175.00 W",
+    )
+    cfg = live_config()
+    spec = docker_run.server_run_spec(cfg)
+    env_keys = {k for k, _ in spec.env}
+    assert "LUCEBOX_HOST_OS_PRETTY" in env_keys
+    assert "LUCEBOX_HOST_KERNEL" in env_keys
+    assert "LUCEBOX_HOST_WSL_VERSION" in env_keys
+    assert "LUCEBOX_HOST_GPU_LIST_CSV" in env_keys
+    # DFLASH_* still present.
+    assert "DFLASH_BUDGET" in env_keys
+    # Values surface verbatim.
+    env_map = dict(spec.env)
+    assert env_map["LUCEBOX_HOST_OS_PRETTY"] == "Ubuntu 22.04.3 LTS"
diff --git a/lucebox/tests/test_config.py b/lucebox/tests/test_config.py
new file mode 100644
index 000000000..d60f3d882
--- /dev/null
+++ b/lucebox/tests/test_config.py
@@ -0,0 +1,176 @@
+"""Tests for the sparse TOML config persistence layer."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from lucebox.config import config_get, config_set, config_unset
+
+from lucebox import config
+
+
+def test_legacy_env_migration_skips_invalid_values(tmp_path: Path) -> None:
+    """An unparsable legacy value keeps its default; valid neighbours still load."""
+    env_file = tmp_path / "config.env"
+    env_file.write_text("DFLASH_BUDGET=not-an-int\nDFLASH_MAX_CTX=65536\nDFLASH_LAZY=true\n")
+    loaded, _doc = config._load_legacy_env(env_file)
+
+    assert loaded.dflash.budget == 22
+    assert loaded.dflash.max_ctx == 65536
+    assert loaded.dflash.lazy is True
+
+
+def test_image_variant_round_trips_from_toml(tmp_path: Path) -> None:
+    """[image] registry / variant load into Config.image / Config.variant."""
+    toml = tmp_path / "config.toml"
+    toml.write_text(
+        "[image]\n"
+        'registry = "ghcr.io/luce-org/lucebox-hub"\n'
+        'variant = "integration-props-uv-squared-clean-cuda12"\n'
+    )
+    loaded = config._load_toml(toml)
+
+    assert loaded.image == "ghcr.io/luce-org/lucebox-hub"
+    assert loaded.variant == "integration-props-uv-squared-clean-cuda12"
+
+
+def test_model_preset_round_trips_through_set_and_load(tmp_path: Path) -> None:
+    """model.* keys written by config_set come back unchanged from _load_toml."""
+    toml = tmp_path / "config.toml"
+    config_set("model.preset", "gemma-4-26b", path=toml)
+    config_set("model.target_file", "google_gemma-4-26B-A4B-it-Q4_K_M.gguf", path=toml)
+
+    model = config._load_toml(toml).model
+    assert model.preset == "gemma-4-26b"
+    assert model.target_file == "google_gemma-4-26B-A4B-it-Q4_K_M.gguf"
+
+
+def test_legacy_config_without_model_section_stays_unpinned(tmp_path: Path) -> None:
+    """Legacy configs (no [model] section) must NOT silently pin to qwen."""
+    toml = tmp_path / "config.toml"
+    toml.write_text('[image]\nvariant = "cuda12"\n')
+
+    model = config._load_toml(toml).model
+
+    assert model.preset == ""
+    assert model.target_file == ""
+    assert model.draft_file == ""
+
+
+def test_model_section_picks_target_file_from_registry(tmp_path: Path) -> None:
+    """A bare [model] preset="..." entry pulls target_file from the registry."""
+    toml = tmp_path / "config.toml"
+    toml.write_text('[model]\npreset = "gemma-4-31b"\n')
+
+    model = config._load_toml(toml).model
+
+    assert model.preset == "gemma-4-31b"
+    assert model.target_file == "google_gemma-4-31B-it-Q4_K_M.gguf"
+
+
+def test_model_section_picks_draft_file_from_registry(tmp_path: Path) -> None:
+    """A preset with a published draft GGUF fills in draft_file as well."""
+    toml = tmp_path / "config.toml"
+    toml.write_text('[model]\npreset = "qwen3.6-27b"\n')
+
+    model = config._load_toml(toml).model
+    assert model.preset == "qwen3.6-27b"
+    assert model.draft_file == "dflash-draft-3.6-q4_k_m.gguf"
+
+
+def test_config_set_writes_only_named_key(tmp_path: Path) -> None:
+    """Sparse persistence: setting one key does NOT serialize every default."""
+    toml = tmp_path / "config.toml"
+    config_set("dflash.budget", 16, path=toml)
+    written = toml.read_text()
+    # budget is the only [dflash] field that may appear.
+    assert "[dflash]" in written
+    assert "budget = 16" in written
+    # Nothing the user did not set is serialized: no sibling dflash keys
+    # and no untouched [host] / [image] sections.
+    for absent in ("max_ctx", "lazy", "[host]", "[image]"):
+        assert absent not in written
+
+
+def test_config_set_preserves_existing_keys(tmp_path: Path) -> None:
+    """A second config_set call keeps the key written by the first."""
+    toml = tmp_path / "config.toml"
+    config_set("dflash.budget", 16, path=toml)
+    config_set("model.preset", "qwen3.6-27b", path=toml)
+    written = toml.read_text()
+    assert "budget = 16" in written
+    assert 'preset = "qwen3.6-27b"' in written
+
+
+def test_config_unset_removes_one_key(tmp_path: Path) -> None:
+    """Unset removes the named key and leaves siblings alone."""
+    toml = tmp_path / "config.toml"
+    config_set("dflash.budget", 16, path=toml)
+    config_set("dflash.max_ctx", 65536, path=toml)
+    # config_unset reports True when it actually removed something.
+    assert config_unset("dflash.budget", path=toml) is True
+    written = toml.read_text()
+    assert "budget" not in written
+    assert "max_ctx = 65536" in written
+
+
+def test_config_unset_drops_empty_section(tmp_path: Path) -> None:
+    """Unsetting the last key in a section drops the empty section."""
+    toml = tmp_path / "config.toml"
+    config_set("dflash.budget", 16, path=toml)
+    config_unset("dflash.budget", path=toml)
+    written = toml.read_text()
+    # No `[dflash]` header may remain once its last key is gone.
+    assert "[dflash]" not in written
+
+
+def test_config_get_reports_origin(tmp_path: Path) -> None:
+    """Each key carries an origin label — `file` when overridden, `default` otherwise."""
+    toml = tmp_path / "config.toml"
+    config_set("dflash.budget", 9, path=toml)
+    found = config_get(path=toml)
+    assert found["dflash.budget"] == (9, "file")
+    # max_ctx was never written, so it must come back as the live default.
+    default_value, source = found["dflash.max_ctx"]
+    assert source == "default"
+    assert default_value == 16384  # DflashRuntime.max_ctx default
+
+
+def test_config_get_rejects_unknown_key(tmp_path: Path) -> None:
+    toml = tmp_path / "config.toml"
+    with pytest.raises(KeyError):  # unregistered dotted key
+        config_get("not.a.key", path=toml)
+
+
+def test_config_set_rejects_unknown_key(tmp_path: Path) -> None:
+    toml = tmp_path / "config.toml"
+    with pytest.raises(KeyError):  # unregistered dotted key
+        config_set("not.a.key", 1, path=toml)
+
+
+def test_config_set_auto_creates_file(tmp_path: Path) -> None:
+    """The first `config_set` call creates config.toml when it is missing."""
+    toml = tmp_path / "config.toml"
+    assert not toml.exists()
+    config_set("port", 9090, path=toml)
+    assert toml.exists()
+    assert "port = 9090" in toml.read_text()
+
+
+def test_save_writes_sparse_doc(tmp_path: Path) -> None:
+    """`save` serializes exactly the doc it is given — defaults stay out."""
+    toml = tmp_path / "config.toml"
+    empty_cfg = config._from_dict({})
+    config.save(empty_cfg, toml, doc={"dflash": {"budget": 9}})
+    written = toml.read_text()
+    assert "budget = 9" in written
+    assert "max_ctx" not in written
+
+
+def test_live_config_uses_recommend_preset_indirectly(tmp_path: Path) -> None:
+    """With no preset argument, ``live_config()`` leaves ``model.preset`` empty."""
+    # HostFacts come from the environment here; an implicit preset would
+    # silently re-pin legacy installs, so none may be chosen.
+    live = config.live_config()
+    assert live.model.preset == ""
diff --git a/lucebox/tests/test_config_cli.py b/lucebox/tests/test_config_cli.py
new file mode 100644
index 000000000..446ab41b6
--- /dev/null
+++ b/lucebox/tests/test_config_cli.py
@@ -0,0 +1,127 @@
+"""Tests for the ``lucebox config`` sub-app CLI."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from lucebox.cli import app
+from typer.testing import CliRunner
+
+
+def _set_config_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
+    monkeypatch.setenv("LUCEBOX_HOME", str(tmp_path))  # redirect LUCEBOX_HOME to tmp
+    return tmp_path / "config.toml"  # where these tests expect the CLI's config file
+
+
+def test_config_set_then_get_round_trip(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    toml = _set_config_path(tmp_path, monkeypatch)
+    wrote = CliRunner().invoke(app, ["config", "set", "dflash.budget=12"])
+    assert wrote.exit_code == 0
+    assert toml.exists()
+    read_back = CliRunner().invoke(app, ["config", "get", "dflash.budget"])
+    assert read_back.exit_code == 0
+    assert "12" in read_back.stdout
+    assert "from file" in read_back.stdout
+
+
+def test_config_get_with_no_key_lists_every_registered_key(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    _set_config_path(tmp_path, monkeypatch)
+    listing = CliRunner().invoke(app, ["config", "get"])
+    assert listing.exit_code == 0
+    # Spot-check a few registered dotted keys in the full listing.
+    for dotted in ("model.preset", "dflash.budget", "port"):
+        assert dotted in listing.stdout
+
+
+def test_config_unset_drops_key(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    cfg_path = _set_config_path(tmp_path, monkeypatch)
+    assert CliRunner().invoke(app, ["config", "set", "dflash.budget=9"]).exit_code == 0
+    assert "budget = 9" in cfg_path.read_text()  # setup really wrote the key
+    unset_result = CliRunner().invoke(app, ["config", "unset", "dflash.budget"])
+    assert unset_result.exit_code == 0
+    body = cfg_path.read_text()
+    assert "budget" not in body
+
+
+def test_config_set_unknown_key_errors(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    _set_config_path(tmp_path, monkeypatch)
+    outcome = CliRunner().invoke(app, ["config", "set", "totally.unknown=1"])
+    assert outcome.exit_code == 2  # unregistered key is rejected with status 2
+
+
+def test_config_set_rejects_missing_equals(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """`config set` without a `key=value` argument exits with status 2."""
+    _set_config_path(tmp_path, monkeypatch)
+    assert CliRunner().invoke(app, ["config", "set", "dflash.budget"]).exit_code == 2
+
+
+def test_config_set_creates_file_when_missing(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    toml = _set_config_path(tmp_path, monkeypatch)
+    assert not toml.exists()
+    CliRunner().invoke(app, ["config", "set", "port=9090"])
+    assert toml.exists()  # first write created the file
+    assert "port = 9090" in toml.read_text()
+
+
+def test_load_or_build_env_overrides_persisted_config(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """LUCEBOX_* env vars must win over config.toml.
+
+    Regression test for the precedence bug fixed in this commit: prior
+    to the fix, `_load_or_build()` returned `config_mod.load()`'s result
+    verbatim when config.toml existed, so the systemd unit's
+    `Environment=LUCEBOX_IMAGE=...` was silently ignored. Sindri's
+    config.toml had `[image]` without `registry`, which made the
+    dataclass default `ghcr.io/luce-org/lucebox-hub` win over the
+    intended easel image.
+    """
+    from lucebox.cli import _load_or_build
+
+    toml = _set_config_path(tmp_path, monkeypatch)
+    # Bug-trigger shape seen on sindri: a config.toml whose [image]
+    # table carries no `registry` line.
+    toml.write_text(
+        '[image]\nvariant = "cuda12"\n[runtime]\nport = 9090\n'
+        '[dflash]\nbudget = 22\n'
+    )
+    # The environment has to beat both the persisted values and the
+    # dataclass defaults that fill in for keys the file omits.
+    monkeypatch.setenv("LUCEBOX_IMAGE", "ghcr.io/myfork/lucebox-hub")
+    monkeypatch.setenv("LUCEBOX_PORT", "7777")
+    monkeypatch.setenv("LUCEBOX_CONTAINER", "lucebox-test")
+    resolved = _load_or_build()
+    assert resolved.image == "ghcr.io/myfork/lucebox-hub"  # env beats default
+    assert resolved.port == 7777  # env beats config.toml
+    assert resolved.container_name == "lucebox-test"  # env applied
+    # variant has no env override here, so the config.toml value stands.
+    assert resolved.variant == "cuda12"
+    # dflash is persisted in config.toml and env leaves it alone (no
+    # DFLASH_* env hooks at this layer).
+    assert resolved.dflash.budget == 22
+
+
+def test_load_or_build_no_toml_env_overrides_defaults(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """With no config.toml on disk, env vars still override the defaults."""
+    from lucebox.cli import _load_or_build
+
+    _set_config_path(tmp_path, monkeypatch)
+    # No config.toml is written, so the live_config() fallback is exercised.
+    monkeypatch.setenv("LUCEBOX_IMAGE", "ghcr.io/myfork/lucebox-hub")
+    resolved = _load_or_build()
+    assert resolved.image == "ghcr.io/myfork/lucebox-hub"
diff --git a/lucebox/tests/test_docker_run.py b/lucebox/tests/test_docker_run.py
new file mode 100644
index 000000000..bb888514f
--- /dev/null
+++ b/lucebox/tests/test_docker_run.py
@@ -0,0 +1,254 @@
+"""Tests for the docker-run serve-argv builder.
+
+This is the core's whole job: turn a Config into the exact `docker run`
+command (and DFLASH_* env) that launches the server. The argv contract is
+what `lucebox serve` / the systemd unit / `print-run` all consume, so it is
+pinned field-by-field here rather than only smoke-tested.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from lucebox.download import PRESETS
+from lucebox.types import Config, DflashRuntime, ModelMeta
+
+from lucebox import docker_run
+
+
+def _env(spec) -> dict[str, str]:  # spec.env as a plain dict, for key lookups in asserts
+    return dict(spec.env)
+
+
+# ── DockerRunSpec.argv ───────────────────────────────────────────────────────
+
+
+def test_argv_minimal_defaults() -> None:
+    """A spec built from only image + name yields the default flag set."""
+    cmd = docker_run.DockerRunSpec(image="img:tag", name="box").argv()
+    name_at, gpus_at = cmd.index("--name"), cmd.index("--gpus")
+    assert cmd[:2] == ["docker", "run"]
+    assert "--rm" in cmd and "-d" not in cmd  # remove defaults True, detach False
+    assert cmd[name_at : name_at + 2] == ["--name", "box"]
+    assert cmd[gpus_at : gpus_at + 2] == ["--gpus", "all"]
+    # With no entrypoint_args the image is the final positional.
+    assert cmd[-1] == "img:tag"
+
+
+def test_argv_flags_and_ordering() -> None:
+    """Each optional field maps to its flag; extras sit before the image."""
+    argv = docker_run.DockerRunSpec(
+        image="img:tag",
+        name="box",
+        gpus=False,
+        detach=True,
+        remove=False,
+        port_publish=(8080, 8080),
+        volumes=(("/host/models", "/opt/lucebox-hub/server/models"),),
+        env=(("DFLASH_BUDGET", "22"),),
+        entrypoint_args=("serve",),
+        extra=("--shm-size", "1g"),
+    ).argv()
+
+    p_at, v_at, e_at = (argv.index(flag) for flag in ("-p", "-v", "-e"))
+    # remove=False and gpus=False suppress their flags; detach=True adds -d.
+    assert "--rm" not in argv and "--gpus" not in argv
+    assert "-d" in argv
+    # Value-taking flags are immediately followed by their value.
+    assert argv[p_at : p_at + 2] == ["-p", "8080:8080"]
+    assert argv[v_at : v_at + 2] == ["-v", "/host/models:/opt/lucebox-hub/server/models"]
+    assert argv[e_at : e_at + 2] == ["-e", "DFLASH_BUDGET=22"]
+    # extra flags precede the image; entrypoint_args follow it.
+    assert argv[-2:] == ["img:tag", "serve"]
+    assert argv.index("--shm-size") < argv.index("img:tag")
+
+
+def test_printable_glues_value_taking_flags() -> None:
+    """printable() breaks lines per flag but never splits a flag from its value."""
+    rendered = docker_run.DockerRunSpec(
+        image="img:tag",
+        name="box",
+        port_publish=(8080, 8080),
+        env=(("K", "v"),),
+    ).printable()
+
+    # One flag per line, joined by backslash-newline continuations.
+    assert rendered.startswith("docker \\\n    run")
+    # A flag that takes a value stays on one line with that value.
+    glued_pairs = ("--name box", "--gpus all", "-p 8080:8080", "-e K=v")
+    for glued in glued_pairs:
+        assert glued in rendered, glued
+
+
+# ── _runtime_volumes ─────────────────────────────────────────────────────────
+
+
+def test_runtime_volumes_mounts_models_and_home(tmp_path: Path) -> None:
+    models = tmp_path / "models"
+    mounts = docker_run._runtime_volumes(Config(models_dir=models))
+    assert (str(models), "/opt/lucebox-hub/server/models") in mounts
+    # $HOME is mounted too, so absolute symlink targets resolve in-container.
+    assert str(Path.home()) in [host_side for host_side, _ in mounts]
+
+
+def test_runtime_volumes_dedupes_when_models_is_home(monkeypatch, tmp_path: Path) -> None:
+    # Make Path.home() report tmp_path, so models_dir and home coincide.
+    monkeypatch.setattr(Path, "home", staticmethod(lambda: tmp_path))
+    mounts = docker_run._runtime_volumes(Config(models_dir=tmp_path))
+    # models_dir == home → a single models mount, without a second home mount.
+    assert len(mounts) == 1
+
+
+# ── _resolve_model_files ─────────────────────────────────────────────────────
+
+
+def test_resolve_model_files_explicit_override_wins(tmp_path: Path) -> None:
+    """Explicit target/draft file names take priority over the preset's."""
+    override = ModelMeta(preset="qwen3.6-27b", target_file="custom.gguf", draft_file="d.gguf")
+    resolved = docker_run._resolve_model_files(Config(models_dir=tmp_path, model=override))
+    # The resolver hands back a (target, draft, draft_dir) triple.
+    target_name, draft_name, draft_dir_name = resolved
+    assert target_name == "custom.gguf"
+    assert draft_name == "d.gguf"
+    assert draft_dir_name == ""
+
+
+def test_resolve_model_files_falls_back_to_preset_registry(tmp_path: Path) -> None:
+    preset_name = "qwen3.6-27b"
+    registered = PRESETS[preset_name]
+    cfg = Config(models_dir=tmp_path, model=ModelMeta(preset=preset_name))
+    # tmp_path holds no speculator dir, so the draft-dir slot stays empty.
+    expected = (registered.target_file, registered.draft_file or "", "")
+    assert docker_run._resolve_model_files(cfg) == expected
+
+
+def test_resolve_model_files_no_preset_no_override(tmp_path: Path) -> None:
+    resolved = docker_run._resolve_model_files(Config(models_dir=tmp_path))
+    assert resolved == ("", "", "")  # default ModelMeta(): every field empty
+
+
+# ── server_run_spec ──────────────────────────────────────────────────────────
+
+
+def test_server_run_spec_top_level_shape(tmp_path: Path) -> None:
+    """Config fields map onto the image tag, name, port and models mount."""
+    spec = docker_run.server_run_spec(
+        Config(
+            image="ghcr.io/x/lucebox-hub",
+            variant="cuda12",
+            container_name="lucebox",
+            port=9000,
+            models_dir=tmp_path,
+        )
+    )
+    assert (spec.image, spec.name) == ("ghcr.io/x/lucebox-hub:cuda12", "lucebox")
+    # GPU-enabled, auto-removed, and not detached.
+    assert spec.gpus is True and spec.remove is True and spec.detach is False
+    assert spec.port_publish == (9000, 8080)
+    assert (str(tmp_path), "/opt/lucebox-hub/server/models") in spec.volumes
+
+
+def test_server_run_spec_always_emits_core_dflash_env(tmp_path: Path) -> None:
+    runtime = DflashRuntime(budget=22, max_ctx=32768)
+    env = _env(docker_run.server_run_spec(Config(models_dir=tmp_path, dflash=runtime)))
+    # Explicitly-set values pass through; the other keys appear though unset here.
+    assert env["DFLASH_BUDGET"] == "22"
+    assert env["DFLASH_MAX_CTX"] == "32768"
+    assert env["DFLASH_PREFIX_CACHE_SLOTS"] == env["DFLASH_PREFILL_CACHE_SLOTS"] == "0"
+    assert env["DFLASH_THINK_MAX"] == "15488"
+    assert env["DFLASH_PORT"] == "8080"
+
+
+def test_server_run_spec_optional_env_off_by_default(tmp_path: Path) -> None:
+    optional_keys = {
+        "DFLASH_LAZY",
+        "DFLASH_CACHE_TYPE_K",
+        "DFLASH_CACHE_TYPE_V",
+        "DFLASH_PREFILL_MODE",
+        "DFLASH_FA_WINDOW",
+        "DFLASH_THINK_SOFT_CLOSE_MIN_RATIO",
+        "DFLASH_DEBUG_THINKING_LOGITS",
+        "DFLASH_TARGET",
+        "DFLASH_DRAFT",
+    }
+    env = _env(docker_run.server_run_spec(Config(models_dir=tmp_path)))
+    assert not optional_keys & env.keys()
+
+
+def test_server_run_spec_optional_env_emitted_when_set(tmp_path: Path) -> None:
+    runtime = DflashRuntime(
+        lazy=True,
+        cache_type_k="tq3_0",
+        cache_type_v="tq3_0",
+        prefill_mode="auto",
+        prefill_keep_ratio=0.1,
+        prefill_threshold=20000,
+        prefill_drafter="drafter.gguf",
+        fa_window=512,
+        think_soft_close_min_ratio=0.5,
+        debug_thinking_logits=True,
+    )
+    env = _env(docker_run.server_run_spec(Config(models_dir=tmp_path, dflash=runtime)))
+    expected = {
+        "DFLASH_LAZY": "1",
+        "DFLASH_CACHE_TYPE_K": "tq3_0",
+        "DFLASH_CACHE_TYPE_V": "tq3_0",
+        "DFLASH_PREFILL_MODE": "auto",
+        "DFLASH_PREFILL_KEEP": "0.1",
+        "DFLASH_PREFILL_THRESHOLD": "20000",
+        "DFLASH_PREFILL_DRAFTER": "drafter.gguf",
+        "DFLASH_FA_WINDOW": "512",
+        "DFLASH_THINK_SOFT_CLOSE_MIN_RATIO": "0.5",
+        "DFLASH_DEBUG_THINKING_LOGITS": "1",
+    }
+    assert {key: env[key] for key in expected} == expected
+
+
+def test_server_run_spec_resolves_target_and_draft_paths(tmp_path: Path) -> None:
+    preset = PRESETS["qwen3.6-27b"]
+    spec = docker_run.server_run_spec(
+        Config(models_dir=tmp_path, model=ModelMeta(preset="qwen3.6-27b"))
+    )
+    env = _env(spec)
+    assert env["DFLASH_TARGET"] == f"/opt/lucebox-hub/server/models/{preset.target_file}"
+    draft_path = f"/opt/lucebox-hub/server/models/draft/{preset.draft_file}"
+    assert not preset.draft_file or env["DFLASH_DRAFT"] == draft_path
+
+
+def test_server_run_spec_forwards_host_env(monkeypatch, tmp_path: Path) -> None:
+    host_facts = {"LUCEBOX_HOST_OS_PRETTY": "Ubuntu 22.04", "LUCEBOX_HOST_GPU_NAME": "RTX 5090"}
+    for key, value in host_facts.items():
+        monkeypatch.setenv(key, value)
+    env = _env(docker_run.server_run_spec(Config(models_dir=tmp_path)))
+    assert {key: env[key] for key in host_facts} == host_facts
+
+
+def test_large_preset_serves_at_safe_default_ctx(tmp_path: Path) -> None:
+    """Regression guard for the preset-cap analysis (#5).
+
+    Activating a preset only writes [model] and leaves [dflash] alone, so a
+    loaded Config still carries the conservative DflashRuntime() floor
+    (max_ctx=16384). The higher caps from the VRAM-tier heuristic are only
+    applied by `autotune --apply` (which threads cfg.model.preset and is a
+    separate PR). This pins that the default serve path never silently
+    serves a large preset at a high, OOM-prone ctx.
+    """
+    large = ModelMeta(preset="qwen3.6-27b")
+    spec = docker_run.server_run_spec(Config(models_dir=tmp_path, model=large))
+    assert _env(spec)["DFLASH_MAX_CTX"] == "16384"
+
+
+# ── docker_pull ──────────────────────────────────────────────────────────────
+
+
+def test_docker_pull_shells_out_and_returns_code(monkeypatch) -> None:
+    calls: list[list[str]] = []
+
+    def fake_call(argv: list[str]) -> int:
+        calls.append(argv)
+        return 7  # arbitrary non-zero code, so a hard-coded 0 cannot pass
+
+    monkeypatch.setattr(docker_run.subprocess, "call", fake_call)
+    # The stubbed exit status has to come back to the caller unchanged.
+    assert docker_run.docker_pull("img:tag") == 7
+    assert calls[-1] == ["docker", "pull", "img:tag"]
diff --git a/lucebox/tests/test_download.py b/lucebox/tests/test_download.py
new file mode 100644
index 000000000..8b69e96b8
--- /dev/null
+++ b/lucebox/tests/test_download.py
@@ -0,0 +1,323 @@
+"""Tests for the model-download orchestration.
+
+The downloader now drives `huggingface_hub.hf_hub_download` directly
+(no subprocess) and verifies size + sha256 against the repo metadata
+before re-fetching. The tests stub out the network calls so the
+behavior contract — what gets requested, when downloads are skipped —
+stays pinned without actually talking to the Hub.
+"""
+
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+from lucebox.download import (
+    DEFAULT_PRESET,
+    PRESETS,
+    recommend_preset,
+    resolve_preset,
+    status,
+)
+from lucebox.types import HostFacts
+
+from lucebox import download
+
+
+def test_default_preset_uses_quantized_gguf_draft():
+    draft = (DEFAULT_PRESET.draft_repo, DEFAULT_PRESET.draft_file)
+    assert draft == ("spiritbuun/Qwen3.6-27B-DFlash-GGUF", "dflash-draft-3.6-q4_k_m.gguf")
+
+
+def test_default_preset_is_registered_under_qwen_name():
+    registered = PRESETS["qwen3.6-27b"]
+    assert registered is DEFAULT_PRESET and registered.name == "qwen3.6-27b"
+
+
+def test_resolve_preset_returns_default_on_none():
+    for unset in (None, ""):  # both spellings of "no preset given"
+        assert resolve_preset(unset) is DEFAULT_PRESET
+
+
+def test_resolve_preset_picks_gemma_target_and_draft():
+    gemma = resolve_preset("gemma-4-26b")
+    # Target comes from bartowski's repo, the DFlash draft from Lucebox's.
+    assert gemma.name == "gemma-4-26b" and gemma.has_draft
+    assert gemma.target_repo == "bartowski/google_gemma-4-26B-A4B-it-GGUF"
+    assert gemma.target_file == "google_gemma-4-26B-A4B-it-Q4_K_M.gguf"
+    assert gemma.draft_repo == "Lucebox/gemma-4-26B-A4B-it-DFlash-GGUF"
+    assert gemma.draft_file == "gemma-4-26B-A4B-it-DFlash-q8_0.gguf"
+
+
+def test_resolve_preset_supports_target_only_laguna():
+    laguna = resolve_preset("laguna-xs.2")
+    assert laguna.target_repo == "Lucebox/Laguna-XS.2-GGUF"
+    # Target-only preset: no draft repo, and has_draft is falsy.
+    assert laguna.draft_repo is None and not laguna.has_draft
+
+
+def test_resolve_preset_picks_qwen36_moe_target_only():
+    """The Qwen3.6 MoE preset points at unsloth's UD-Q4_K_M file and has no draft.
+
+    No DFlash draft GGUF is published for the MoE variant (verified against
+    HfApi.repo_info 2026-05-28), so it runs target-only, like Laguna. The
+    file stem is `Qwen3.6-35B-A3B-UD-Q4_K_M.gguf` because the unsloth repo
+    publishes only the UD ("unsloth dynamic") family at Q4_K_M — there is
+    no plain `Q4_K_M.gguf`.
+    """
+    moe = resolve_preset("qwen3.6-moe")
+    assert moe.name == "qwen3.6-moe"
+    assert moe.target_repo == "unsloth/Qwen3.6-35B-A3B-GGUF"
+    assert moe.target_file == "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
+    # Target-only: neither draft field is populated.
+    assert moe.draft_repo is None and moe.draft_file is None
+    assert not moe.has_draft
+
+
+def test_download_preset_target_only_qwen36_moe_skips_draft(tmp_path, monkeypatch):
+    """qwen3.6-moe mirrors laguna-xs.2: the target is fetched, a draft never is."""
+    moe = resolve_preset("qwen3.6-moe")
+    assert not moe.has_draft
+    requested: list[tuple[str, str]] = []
+
+    def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]:
+        return 10, None
+
+    def _stub_fetch(api, repo_id, filename, local_dir, console):  # noqa: ARG001
+        requested.append((repo_id, filename))
+        fetched = local_dir / filename
+        fetched.parent.mkdir(parents=True, exist_ok=True)
+        with fetched.open("wb") as fh:
+            fh.truncate(10)  # 10-byte file, matching the size _meta reports
+        return fetched
+
+    monkeypatch.setattr(download, "_fetch", _stub_fetch)
+    monkeypatch.setattr(download, "_file_meta", _meta)
+    cfg = SimpleNamespace(models_dir=tmp_path)
+
+    assert download.download_preset(cfg, moe) == 0
+    # Exactly one fetch — the target — and no attempt at a draft.
+    assert requested == [(moe.target_repo, moe.target_file)]
+
+
+def test_status_qwen36_moe_reports_draft_present_when_target_only(tmp_path, monkeypatch):
+    """A preset without a draft counts the draft as present: nothing to fetch."""
+
+    def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]:
+        return 22 * 10**9, None
+
+    monkeypatch.setattr(download, "_file_meta", _meta)
+    moe = resolve_preset("qwen3.6-moe")
+    report = status(SimpleNamespace(models_dir=tmp_path), moe)
+    # tmp_path is empty, so the target is missing; the absent draft is "present".
+    assert report == {"target_present": False, "draft_present": True}
+
+
+def test_resolve_preset_unknown_name_lists_known_options():
+    with pytest.raises(KeyError) as raised:
+        resolve_preset("qwen-99b")
+    # The error text has to name every registered preset, so the user can
+    # copy-paste a valid one.
+    error_text = str(raised.value)
+    unlisted = [name for name in PRESETS if name not in error_text]
+    assert not unlisted
+
+
+def _stub_file_meta(target_size: int, draft_size: int):
+    """Return a `_file_meta` stand-in yielding (size, None) for the default preset.
+
+    sha256 stays None (no real hashes needed); a live `models download` covers real metadata.
+    """
+    sizes = {
+        (DEFAULT_PRESET.target_repo, DEFAULT_PRESET.target_file): target_size,
+        (DEFAULT_PRESET.draft_repo, DEFAULT_PRESET.draft_file): draft_size,
+    }
+
+    def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]:
+        if (repo_id, filename) not in sizes:
+            raise FileNotFoundError(f"unexpected ({repo_id}, {filename})")
+        return sizes[(repo_id, filename)], None
+
+    return _meta
+
+
+def test_status_checks_default_draft_gguf(tmp_path, monkeypatch):
+    cfg = SimpleNamespace(models_dir=tmp_path)
+    (tmp_path / "draft").mkdir()
+    monkeypatch.setattr(download, "_file_meta", _stub_file_meta(target_size=1024, draft_size=512))
+
+    # Nothing on disk yet → both files are reported missing.
+    assert status(cfg) == {"target_present": False, "draft_present": False}
+
+    # Create both files at exactly the sizes the stubbed metadata reports.
+    expected_sizes = {
+        tmp_path / DEFAULT_PRESET.target_file: 1024,
+        tmp_path / "draft" / DEFAULT_PRESET.draft_file: 512,
+    }
+    for model_file, size in expected_sizes.items():
+        with model_file.open("wb") as fh:
+            fh.truncate(size)
+
+    assert status(cfg) == {"target_present": True, "draft_present": True}
+
+
+def test_status_rejects_partial_model_files(tmp_path, monkeypatch):
+    """Files far smaller than the repo-reported size do not count as present."""
+    # The repo claims 1 GB / 1 MB, but only 7-byte stubs exist on disk.
+    full_sizes = _stub_file_meta(target_size=10**9, draft_size=10**6)
+    monkeypatch.setattr(download, "_file_meta", full_sizes)
+    cfg = SimpleNamespace(models_dir=tmp_path)
+    (tmp_path / "draft").mkdir()
+    stubs = (
+        tmp_path / DEFAULT_PRESET.target_file,
+        tmp_path / "draft" / DEFAULT_PRESET.draft_file,
+    )
+    for stub in stubs:
+        stub.write_bytes(b"partial")
+    assert status(cfg) == {"target_present": False, "draft_present": False}
+
+
+def test_current_bytes_reads_xet_staging_path(tmp_path):
+    """Regression: progress polling has to see hf-xet's hashed staging file.
+
+    huggingface_hub 1.x stages partial Xet downloads at
+    ``{local_dir}/.cache/huggingface/download/{short_hash}.{etag}.incomplete``
+    rather than ``{local_dir}/{filename}.incomplete``. The polling code used
+    to check only the latter, which never appears, so the Rich progress bar
+    stayed at 0 bytes for the whole transfer.
+    """
+    filename, etag = "model.gguf", "abc123"
+    staging_dir = tmp_path / ".cache" / "huggingface" / "download"
+    candidates = download._incomplete_path_candidates(tmp_path, filename, etag)
+
+    # The hf-xet staging path has to be the first candidate.
+    xet_path: Path = candidates[0]
+    assert xet_path.parent == staging_dir
+    assert xet_path.name.endswith(f".{etag}.incomplete")
+
+    # Bytes written to that staging file must show up in _current_bytes.
+    staging_dir.mkdir(parents=True, exist_ok=True)
+    xet_path.write_bytes(b"x" * 4096)
+    assert download._current_bytes(tmp_path / filename, candidates) == 4096
+
+
+def test_current_bytes_falls_back_to_glob_without_etag(tmp_path):
+    """With etag=None, a growing .incomplete staging file is still picked up."""
+    candidates = download._incomplete_path_candidates(tmp_path, "model.gguf", etag=None)
+
+    # Drop an .incomplete file with an arbitrary hashed name into the staging dir.
+    staging_dir = tmp_path / ".cache" / "huggingface" / "download"
+    staging_dir.mkdir(parents=True, exist_ok=True)
+    partial = staging_dir / "deadbeef.deadbeef.incomplete"
+    partial.write_bytes(b"x" * 8192)
+    assert download._current_bytes(tmp_path / "model.gguf", candidates) == 8192
+
+
+def test_current_bytes_prefers_final_target_when_complete(tmp_path):
+    final = tmp_path / "model.gguf"
+    candidates = download._incomplete_path_candidates(tmp_path, final.name, etag="abc")
+    # Only the finished file is written; no staging file is created here.
+    final.write_bytes(b"x" * 1234)
+    assert download._current_bytes(final, candidates) == 1234
+
+
+def test_download_preset_fetches_exact_draft_file(tmp_path, monkeypatch):
+    requested: list[tuple[str, str, str]] = []
+
+    # Stand-in for the real download: log the request, then leave a file of
+    # the advertised size behind so `_local_matches` would pass on a re-run.
+    def _stub_fetch(api, repo_id, filename, local_dir, console):  # noqa: ARG001
+        requested.append((repo_id, filename, str(local_dir)))
+        fetched = local_dir / filename
+        fetched.parent.mkdir(parents=True, exist_ok=True)
+        with fetched.open("wb") as fh:
+            fh.truncate(10)
+        return fetched
+
+    monkeypatch.setattr(download, "_file_meta", _stub_file_meta(target_size=10, draft_size=10))
+    monkeypatch.setattr(download, "_fetch", _stub_fetch)
+
+    assert download.download_preset(SimpleNamespace(models_dir=tmp_path)) == 0
+    want_target = (DEFAULT_PRESET.target_repo, DEFAULT_PRESET.target_file, str(tmp_path))
+    want_draft = (
+        DEFAULT_PRESET.draft_repo,
+        DEFAULT_PRESET.draft_file,
+        str(tmp_path / "draft"),
+    )
+    # The target lands in models_dir, the draft in its draft/ subdirectory.
+    assert want_target in requested and want_draft in requested
+
+
+def test_download_preset_routes_gemma_preset_to_correct_repos(tmp_path, monkeypatch):
+    gemma = resolve_preset("gemma-4-26b")
+    requested: list[tuple[str, str, str]] = []
+
+    def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]:
+        return 10, None
+
+    def _stub_fetch(api, repo_id, filename, local_dir, console):  # noqa: ARG001
+        requested.append((repo_id, filename, str(local_dir)))
+        fetched = local_dir / filename
+        fetched.parent.mkdir(parents=True, exist_ok=True)
+        with fetched.open("wb") as fh:
+            fh.truncate(10)
+        return fetched
+
+    monkeypatch.setattr(download, "_fetch", _stub_fetch)
+    monkeypatch.setattr(download, "_file_meta", _meta)
+
+    assert download.download_preset(SimpleNamespace(models_dir=tmp_path), gemma) == 0
+    # Target goes to models_dir, draft to models_dir/draft, each from gemma's repos.
+    assert (gemma.target_repo, gemma.target_file, str(tmp_path)) in requested
+    assert (gemma.draft_repo, gemma.draft_file, str(tmp_path / "draft")) in requested
+
+
+def test_download_preset_target_only_skips_draft_fetch(tmp_path, monkeypatch):
+    laguna = resolve_preset("laguna-xs.2")
+    assert not laguna.has_draft
+    requested: list[tuple[str, str]] = []
+
+    def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]:
+        return 10, None
+
+    def _stub_fetch(api, repo_id, filename, local_dir, console):  # noqa: ARG001
+        requested.append((repo_id, filename))
+        fetched = local_dir / filename
+        fetched.parent.mkdir(parents=True, exist_ok=True)
+        with fetched.open("wb") as fh:
+            fh.truncate(10)
+        return fetched
+
+    monkeypatch.setattr(download, "_fetch", _stub_fetch)
+    monkeypatch.setattr(download, "_file_meta", _meta)
+    cfg = SimpleNamespace(models_dir=tmp_path)
+
+    assert download.download_preset(cfg, laguna) == 0
+    # The target is the only thing fetched; a draft is never requested.
+    assert requested == [(laguna.target_repo, laguna.target_file)]
+
+
+def test_status_target_only_preset_reports_draft_as_present(tmp_path, monkeypatch):
+    def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]:
+        return 1024, None
+
+    monkeypatch.setattr(download, "_file_meta", _meta)
+    laguna = resolve_preset("laguna-xs.2")
+    report = status(SimpleNamespace(models_dir=tmp_path), laguna)
+    # Empty models dir → target missing; with no draft to download, the
+    # draft slot is reported as present.
+    assert report == {"target_present": False, "draft_present": True}
+
+
+def test_recommend_preset_tiers() -> None:
+    """The first-run preset recommendation depends on nothing but the VRAM tier.
+
+    22 GB and up → the Lucebox default (qwen3.6-27b); 16-21 GB → laguna-xs.2;
+    under 16 GB → None (every registered preset needs ≥16 GB, so we ask for
+    an explicit choice instead of recommending something that cannot run).
+    """
+    tiers = {24: "qwen3.6-27b", 22: "qwen3.6-27b", 20: "laguna-xs.2", 16: "laguna-xs.2"}
+    for vram_gb, expected in tiers.items():
+        assert recommend_preset(HostFacts(vram_gb=vram_gb)) == expected
+    # Below the 16 GB floor there is nothing to recommend.
+    for vram_gb in (12, 0):
+        assert recommend_preset(HostFacts(vram_gb=vram_gb)) is None
diff --git a/lucebox/tests/test_models_cli.py b/lucebox/tests/test_models_cli.py
new file mode 100644
index 000000000..f44583044
--- /dev/null
+++ b/lucebox/tests/test_models_cli.py
@@ -0,0 +1,142 @@
+"""Tests for the ``lucebox models`` sub-app."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from lucebox.cli import app
+from lucebox.download import PRESETS
+from lucebox.types import HostFacts
+from typer.testing import CliRunner
+
+from lucebox import config as config_mod
+from lucebox import download as download_mod
+
+
+def _set_config_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
+    for var, location in (("LUCEBOX_HOME", tmp_path), ("LUCEBOX_MODELS", tmp_path / "models")):
+        monkeypatch.setenv(var, str(location))
+    return tmp_path / "config.toml"
+
+
+def _stub_host(monkeypatch: pytest.MonkeyPatch, vram_gb: int) -> None:
+    for dotted in ("lucebox.host_facts.from_env", "lucebox.cli.from_env"):
+        monkeypatch.setattr(dotted, lambda: HostFacts(vram_gb=vram_gb))
+
+
+def test_models_list_shows_every_registered_preset(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    _stub_host(monkeypatch, vram_gb=24)
+    _set_config_path(tmp_path, monkeypatch)
+    listing = CliRunner().invoke(app, ["models", "list"])
+    assert listing.exit_code == 0
+    unlisted = [name for name in PRESETS if name not in listing.stdout]
+    assert not unlisted
+
+
+def test_models_default_view_lists_only_installed(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    _stub_host(monkeypatch, vram_gb=24)
+    _set_config_path(tmp_path, monkeypatch)
+    # tmp_path holds no models, so the default view has nothing installed to list.
+    shown = CliRunner().invoke(app, ["models"])
+    assert shown.exit_code == 0
+    assert any(marker in shown.stdout for marker in ("No presets installed", "Models dir"))
+
+
+def test_models_download_recommends_when_empty(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """With no active preset and no argument, download recommends and activates one."""
+    _stub_host(monkeypatch, vram_gb=24)
+    cfg_path = _set_config_path(tmp_path, monkeypatch)
+
+    # Replace the Hub-facing helpers so nothing here touches the network.
+    monkeypatch.setattr(download_mod, "download_preset", lambda cfg, pres: 0)
+    monkeypatch.setattr(
+        download_mod,
+        "status",
+        lambda cfg, pres: {"target_present": True, "draft_present": True},
+    )
+
+    outcome = CliRunner().invoke(app, ["models", "download"])
+    assert outcome.exit_code == 0
+    assert "Recommended preset" in outcome.stdout
+    assert cfg_path.exists()
+    # The recommendation is persisted as model.preset in config.toml.
+    active = config_mod.config_get(path=cfg_path)["model.preset"]
+    assert active == ("qwen3.6-27b", "file")
+
+
+def test_models_download_refuses_silent_switch(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """A bare `download` must refuse while another preset is already active."""
+    _stub_host(monkeypatch, vram_gb=24)
+    # Pre-activate a preset in the config.toml under tmp_path.
+    active_cfg = _set_config_path(tmp_path, monkeypatch)
+    config_mod.config_set("model.preset", "qwen3.6-27b", path=active_cfg)
+    refusal = CliRunner().invoke(app, ["models", "download"])
+    assert refusal.exit_code == 2
+    assert "already active" in refusal.stdout.lower()
+
+
+def test_models_download_explicit_preset_no_activate(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """An explicit preset without --activate is downloaded but model.preset stays unset."""
+    _stub_host(monkeypatch, vram_gb=24)
+    cfg_path = _set_config_path(tmp_path, monkeypatch)
+    monkeypatch.setattr(
+        download_mod,
+        "status",
+        lambda cfg, pres: {"target_present": False, "draft_present": False},
+    )
+    monkeypatch.setattr(download_mod, "download_preset", lambda cfg, pres: 0)
+
+    outcome = CliRunner().invoke(app, ["models", "download", "gemma-4-26b"])
+    assert outcome.exit_code == 0
+    # A config.toml may or may not get written; if it was, the preset is still default.
+    if cfg_path.exists():
+        assert config_mod.config_get(path=cfg_path)["model.preset"] == ("", "default")
+
+
+def test_models_download_explicit_preset_with_activate(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    _stub_host(monkeypatch, vram_gb=24)
+    cfg_path = _set_config_path(tmp_path, monkeypatch)
+    monkeypatch.setattr(
+        download_mod,
+        "status",
+        lambda cfg, pres: {"target_present": False, "draft_present": False},
+    )
+    monkeypatch.setattr(download_mod, "download_preset", lambda cfg, pres: 0)
+
+    argv = ["models", "download", "gemma-4-26b", "--activate"]
+    assert CliRunner().invoke(app, argv).exit_code == 0
+    # --activate persists the requested preset into config.toml.
+    assert config_mod.config_get(path=cfg_path)["model.preset"] == ("gemma-4-26b", "file")
+
+
+def test_installed_helpers_track_presence(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """``installed_status`` / ``installed_size_gb`` reflect on-disk byte counts."""
+    _set_config_path(tmp_path, monkeypatch)
+    _stub_host(monkeypatch, vram_gb=24)
+    cfg = config_mod.live_config()
+    cfg.models_dir.mkdir(parents=True, exist_ok=True)
+    laguna = PRESETS["laguna-xs.2"]
+    assert download_mod.installed_status(cfg, laguna) == "absent"
+
+    target = cfg.models_dir / laguna.target_file
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # truncate() sizes the file at 5 GB without building a 5 GB bytes object.
+    with target.open("wb") as f:
+        f.truncate(5 * 10**9)
+    assert download_mod.installed_status(cfg, laguna) == "installed"
+    assert download_mod.installed_size_gb(cfg, laguna) == pytest.approx(5.0, rel=0.01)
diff --git a/pyproject.toml b/pyproject.toml
index 56ae2bf4f..520838041 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,7 @@ license = { text = "Apache-2.0" }
 authors = [{ name = "Lucebox" }]
 
 dependencies = [
+    "lucebox",
     "lucebox-dflash",
     "pflash",
 ]
@@ -23,7 +24,7 @@ line-length = 100
 # server-internal and optimization Python (server/scripts, optimizations/*)
 # carries pre-existing style debt and is added to `include` as it is cleaned
 # up. Vendored deps stay excluded permanently (extend-exclude below).
-include = ["harness/**/*.py", "scripts/**/*.py"]
+include = ["harness/**/*.py", "scripts/**/*.py", "lucebox/**/*.py"]
 extend-exclude = [
     "dflash/deps",
     "megakernel",
@@ -51,11 +52,12 @@ package = false
 no-build-isolation-package = ["qwen35-megakernel-bf16"]
 
 [tool.uv.workspace]
-# Workspace members. Keeping the list to the packages that live in this
-# repo lets `uv lock --check` / `uv sync --frozen` pass.
-members = ["server", "optimizations/megakernel", "optimizations/pflash"]
+# Workspace members: the lucebox CLI package alongside the server,
+# megakernel, and pflash packages that already live in this repo.
+members = ["lucebox", "server", "optimizations/megakernel", "optimizations/pflash"]
 
 [tool.uv.sources]
+lucebox = { workspace = true }
 lucebox-dflash = { workspace = true }
 pflash = { workspace = true }
 qwen35-megakernel-bf16 = { workspace = true }
diff --git a/scripts/check_lucebox_wrapper_sandbox.sh b/scripts/check_lucebox_wrapper_sandbox.sh
new file mode 100755
index 000000000..df2b2b9bc
--- /dev/null
+++ b/scripts/check_lucebox_wrapper_sandbox.sh
@@ -0,0 +1,242 @@
+#!/usr/bin/env bash
+# Exercise the host-side lucebox.sh installer/wrapper from an isolated prefix.
+#
+# The script intentionally runs from a throwaway HOME, XDG_CONFIG_HOME,
+# LUCEBOX_HOME, model directory, and working directory. That catches accidental
+# dependencies on the checkout or the user's real ~/.lucebox while keeping the
+# test reproducible enough to paste into a bug report.
+
+set -euo pipefail
+
+IMAGE="${LUCEBOX_TEST_IMAGE:-ghcr.io/easel/lucebox-hub}"
+VARIANT="${LUCEBOX_TEST_VARIANT:-integration-props-uv-squared-clean-cuda12}"
+WRAPPER_SOURCE="${LUCEBOX_TEST_WRAPPER_SOURCE:-local}"
+RUN_PULL="${LUCEBOX_TEST_RUN_PULL:-1}"
+RUN_CONTAINER_CLI="${LUCEBOX_TEST_RUN_CONTAINER_CLI:-1}"
+KEEP_SANDBOX="${LUCEBOX_TEST_KEEP_SANDBOX:-0}"
+
+ROOT=""
+LOG=""
+
+usage() {
+    cat <<EOF
+Usage: $0 [--source local|URL] [--image IMAGE] [--variant TAG] [--no-pull] [--no-container-cli] [--keep]
+
+Defaults:
+  --source        local
+  --image         $IMAGE
+  --variant       $VARIANT
+
+Environment aliases:
+  LUCEBOX_TEST_WRAPPER_SOURCE, LUCEBOX_TEST_IMAGE, LUCEBOX_TEST_VARIANT,
+  LUCEBOX_TEST_RUN_PULL=0, LUCEBOX_TEST_RUN_CONTAINER_CLI=0,
+  LUCEBOX_TEST_KEEP_SANDBOX=1
+EOF
+}
+
+while [ $# -gt 0 ]; do  # flags override env defaults; ${2:?…} names the flag whose value is missing
+    case "$1" in
+        --source) WRAPPER_SOURCE="${2:?--source requires a value}"; shift 2 ;;
+        --image) IMAGE="${2:?--image requires a value}"; shift 2 ;;
+        --variant) VARIANT="${2:?--variant requires a value}"; shift 2 ;;
+        --no-pull) RUN_PULL=0; shift ;;
+        --no-container-cli) RUN_CONTAINER_CLI=0; shift ;;
+        --keep) KEEP_SANDBOX=1; shift ;;
+        -h|--help) usage; exit 0 ;;
+        *) echo "unknown argument: $1" >&2; usage >&2; exit 2 ;;
+    esac
+done
+
+# Print a [FAIL] line on stderr (plus the transcript path, if any); exit 1.
+die() {
+    printf '[FAIL] %s\n' "$*" >&2
+    # LOG is "" until the sandbox exists, so only mention a real transcript.
+    [ -n "$LOG" ] && [ -f "$LOG" ] && printf '[FAIL] transcript: %s\n' "$LOG" >&2
+    exit 1
+}
+
+note() {  # progress line on stdout, tagged [INFO]
+    printf '%s %s\n' '[INFO]' "$*"
+}
+
+pass() {  # successful-check line on stdout, tagged [PASS]
+    printf '%s %s\n' '[PASS]' "$*"
+}
+
+assert_file() {  # die unless $1 is an existing regular file
+    if [ ! -f "$1" ]; then die "missing file: $1"; fi
+    pass "file exists: $1"
+}
+
+assert_contains() {  # die (after dumping the file) unless file $1 contains fixed string $2
+    local file="$1" pattern="$2"
+    # `--` ends option parsing so a pattern starting with `-` stays a pattern.
+    if ! grep -Fq -- "$pattern" "$file"; then
+        echo "----- $file -----" >&2
+        sed -n '1,220p' "$file" >&2 || true
+        echo "-----------------" >&2
+        die "expected '$pattern' in $file"
+    fi
+    pass "$file contains: $pattern"
+}
+
+run_logged() {  # run "$@"; show its output, append it to $LOG, return its status (via pipefail)
+    note "run: $*"
+    {
+        printf '\n===== %s =====\n' "$*"
+        local rc=0; "$@" || rc=$?  # a bare failure would trip `set -e` before the marker
+        printf '===== exit=%s =====\n' "$rc"; return "$rc"
+    } 2>&1 | tee -a "$LOG"
+}
+
+# Run "$@"; tee stdout+stderr to file $1 and append to $LOG; return its status.
+run_logged_capture() {
+    local out="$1"; shift
+    note "run: $* > $out"
+    {
+        printf '\n===== %s > %s =====\n' "$*" "$out"
+        local rc=0
+        "$@" || rc=$?  # a bare failure would trip `set -e` before the marker
+        printf '===== exit=%s =====\n' "$rc"
+        return "$rc"
+    } 2>&1 | tee "$out" | tee -a "$LOG" >/dev/null
+}
+
+cleanup() {  # EXIT trap: remove the sandbox unless KEEP_SANDBOX=1 (--keep)
+    [ -n "$ROOT" ] || return 0  # sandbox was never created
+    if [ "$KEEP_SANDBOX" = "1" ]; then
+        note "kept sandbox: $ROOT"; note "transcript: $LOG"
+    else
+        rm -rf "$ROOT"
+    fi
+}
+trap cleanup EXIT
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+ROOT="$(mktemp -d "${TMPDIR:-/tmp}/lucebox-wrapper-sandbox.XXXXXX")"
+LOG="$ROOT/transcript.log"
+
+HOME_DIR="$ROOT/home"
+BIN_DIR="$ROOT/bin"
+XDG_DIR="$ROOT/xdg"
+MODELS_DIR="$ROOT/models"
+WORK_DIR="$ROOT/work"
+mkdir -p "$HOME_DIR" "$BIN_DIR" "$XDG_DIR" "$MODELS_DIR" "$WORK_DIR"
+
+note "sandbox: $ROOT"
+note "transcript: $LOG"
+
+case "$WRAPPER_SOURCE" in
+    local)
+        cp "$REPO_ROOT/lucebox.sh" "$BIN_DIR/lucebox"
+        ;;
+    http://*|https://*)
+        curl -fsSL "$WRAPPER_SOURCE" -o "$BIN_DIR/lucebox"
+        ;;
+    *)
+        cp "$WRAPPER_SOURCE" "$BIN_DIR/lucebox"
+        ;;
+esac
+chmod +x "$BIN_DIR/lucebox"
+
+FIRST_LINE="$(head -n 1 "$BIN_DIR/lucebox")"
+[ "$FIRST_LINE" = "#!/usr/bin/env bash" ] || die "unexpected shebang: $FIRST_LINE"
+pass "wrapper has expected shebang"
+
+export HOME="$HOME_DIR"
+export XDG_CONFIG_HOME="$XDG_DIR"
+export LUCEBOX_HOME="$HOME_DIR/.lucebox"
+export LUCEBOX_MODELS="$MODELS_DIR"
+export LUCEBOX_IMAGE="$IMAGE"
+export LUCEBOX_VARIANT="$VARIANT"
+export LUCEBOX_CONTAINER="lucebox-sandbox"
+export LUCEBOX_PORT="18080"
+export PATH="$BIN_DIR:$PATH"
+
+cd "$WORK_DIR"
+[ "$PWD" = "$WORK_DIR" ] || die "failed to enter sandbox workdir"
+pass "working directory isolated: $PWD"
+
+run_logged_capture "$ROOT/version.out" lucebox version
+assert_contains "$ROOT/version.out" "0.2.0"
+
+run_logged_capture "$ROOT/help.out" lucebox help
+assert_contains "$ROOT/help.out" "LUCEBOX_VARIANT"
+assert_contains "$ROOT/help.out" "LUCEBOX_IMAGE"
+
+docker manifest inspect "${IMAGE}:${VARIANT}" >/dev/null || die "image manifest not found: ${IMAGE}:${VARIANT}"
+pass "image manifest exists: ${IMAGE}:${VARIANT}"  # reached only when the registry lookup succeeded
+
+if [ "$RUN_PULL" = "1" ]; then
+    run_logged_capture "$ROOT/pull.out" lucebox pull
+    assert_contains "$ROOT/pull.out" "${IMAGE}:${VARIANT}"
+fi
+
+if [ "$RUN_CONTAINER_CLI" = "1" ]; then
+    run_logged_capture "$ROOT/check.out" lucebox check
+    # Sparse persistence: `config set` creates config.toml with only the
+    # named key. Replaces the old `configure --overwrite` path.
+    run_logged_capture "$ROOT/config-image.out" lucebox config set "image=$IMAGE"
+    run_logged_capture "$ROOT/config-variant.out" lucebox config set "variant=$VARIANT"
+    assert_file "$LUCEBOX_HOME/config.toml"
+    [ "$(stat -c '%u' "$LUCEBOX_HOME/config.toml")" = "$(id -u)" ] \
+        || die "config.toml is not owned by the invoking user"
+    pass "config.toml ownership matches invoking user"
+    assert_contains "$LUCEBOX_HOME/config.toml" "registry = \"$IMAGE\""
+    assert_contains "$LUCEBOX_HOME/config.toml" "variant = \"$VARIANT\""
+
+    run_logged_capture "$ROOT/print-run.out" lucebox print-run
+    assert_contains "$ROOT/print-run.out" "${IMAGE}:${VARIANT}"
+    assert_contains "$ROOT/print-run.out" "$MODELS_DIR:/opt/lucebox-hub/dflash/models"
+    if grep -Fq "$REPO_ROOT" "$ROOT/print-run.out"; then
+        die "print-run leaked repository path: $REPO_ROOT"
+    fi
+    pass "print-run did not reference repository checkout"
+fi
+
+# Exercise `lucebox install` without allowing it to call real systemctl,
+# loginctl, docker, or nvidia-smi. The generated user unit must land under the
+# sandbox XDG_CONFIG_HOME and point ExecStart at the sandbox-installed wrapper.
+SHIM_DIR="$ROOT/shims"
+mkdir -p "$SHIM_DIR"
+cat > "$SHIM_DIR/docker" <<'EOF'
+#!/usr/bin/env bash
+case "${1:-}" in
+  info) exit 0 ;;
+  version) echo "25.0.0"; exit 0 ;;
+  stop) exit 0 ;;
+  *) echo "docker shim: $*" >&2; exit 0 ;;
+esac
+EOF
+cat > "$SHIM_DIR/nvidia-smi" <<'EOF'
+#!/usr/bin/env bash
+case "$*" in
+  *"--query-gpu=name,memory.total,driver_version,compute_cap"*)
+    echo "Fake GPU, 24576, 555.42.01, 8.6"; exit 0 ;;
+  *"--query-gpu=name"*)
+    echo "Fake GPU"; exit 0 ;;
+  *) echo "Fake GPU"; exit 0 ;;
+esac
+EOF
+cat > "$SHIM_DIR/systemctl" <<'EOF'
+#!/usr/bin/env bash
+if [ "$1" = "--user" ] && [ "$2" = "show-environment" ]; then exit 0; fi
+if [ "$1" = "--user" ] && [ "$2" = "daemon-reload" ]; then exit 0; fi
+echo "systemctl shim: $*" >&2
+exit 0
+EOF
+cat > "$SHIM_DIR/loginctl" <<'EOF'
+#!/usr/bin/env bash
+echo "Linger=no"
+EOF
+chmod +x "$SHIM_DIR/docker" "$SHIM_DIR/nvidia-smi" "$SHIM_DIR/systemctl" "$SHIM_DIR/loginctl"
+
+PATH="$SHIM_DIR:$BIN_DIR:$PATH" run_logged_capture "$ROOT/install.out" lucebox install
+UNIT="$XDG_CONFIG_HOME/systemd/user/lucebox.service"
+assert_file "$UNIT"
+assert_contains "$UNIT" "ExecStart=$BIN_DIR/lucebox serve"
+assert_contains "$UNIT" "ExecStop=$SHIM_DIR/docker stop -t 30 lucebox-sandbox"
+assert_contains "$ROOT/install.out" "Installed $UNIT"
+
+pass "sandbox wrapper check completed"
+note "summary: image=${IMAGE}:${VARIANT} wrapper_source=${WRAPPER_SOURCE}"
diff --git a/scripts/test_lucebox_sh.sh b/scripts/test_lucebox_sh.sh
new file mode 100755
index 000000000..464f552b2
--- /dev/null
+++ b/scripts/test_lucebox_sh.sh
@@ -0,0 +1,1127 @@
+#!/usr/bin/env bash
+# scripts/test_lucebox_sh.sh — smoke tests for the host-side wrapper +
+# every other bash script we ship.
+#
+# Catches regressions like:
+#   * syntax errors (bash -n)
+#   * shellcheck error-level findings across every shipped bash script
+#   * `set -u` violations in command paths that don't need docker/nvidia —
+#     each subcommand dispatch is exercised in isolation to verify no
+#     LUCEBOX_HOST_* or DFLASH_* read fires before the helper that should
+#     populate it has run.
+#   * missing dispatch handlers (help, version, check, usage)
+#   * stale references to subcommands removed from main's case
+#
+# The wrapper is shell + has zero non-coreutils deps for the host-only
+# commands, so this script doesn't need docker/nvidia/systemd present —
+# probe_host degrades cleanly when those aren't found, and the
+# formatter must render fine for the "everything is missing" case too.
+#
+# Run from anywhere:  scripts/test_lucebox_sh.sh
+
+set -euo pipefail
+
+# Resolve repo root (from this script's location, not the cwd) + script under test.
+ROOT="$(git -C "$(dirname "$0")" rev-parse --show-toplevel 2>/dev/null || (cd "$(dirname "$0")/.." && pwd))"
+SCRIPT="$ROOT/lucebox.sh"
+ENTRYPOINT="$ROOT/server/scripts/entrypoint.sh"
+INSTALLER="$ROOT/install.sh"
+
+if [ ! -f "$SCRIPT" ]; then
+    echo "FAIL: lucebox.sh not found at $SCRIPT" >&2
+    exit 1
+fi
+
+# entrypoint.sh ships with the docker-stack PR (#334). When it's absent
+# (e.g. on the lucebox-cli branch in isolation), skip the entire suite —
+# every section below either references $ENTRYPOINT in shellcheck targets,
+# parses it with `bash -n`, or sources/dispatches into it directly. The
+# host-only lucebox.sh wrapper itself is covered by lucebox.sh's own unit
+# tests; this script's value is the wrapper↔entrypoint contract.
+if [ ! -f "$ENTRYPOINT" ]; then
+    echo "Skipping entrypoint tests: server/scripts/entrypoint.sh not present (provided by #334 docker-stack)"
+    exit 0
+fi
+
+fail=0
+pass=0
+report() {  # report ok|fail LABEL [DETAIL]: print one result line and bump the tally
+    local verdict="$1" label="$2" detail="${3:-}"
+    case "$verdict" in
+        ok)
+            printf '  \033[1;32m✓\033[0m %s\n' "$label"
+            pass=$((pass + 1)) ;;
+        *)
+            printf '  \033[1;31m✗\033[0m %s\n' "$label"
+            [ -z "$detail" ] || printf '    %s\n' "$detail"
+            fail=$((fail + 1)) ;;
+    esac
+}
+
+# Helper: run CMD via `bash -c` with NO_COLOR=1 (no colour codes in matches),
+# capture stdout+stderr, and report ok only on exit 0 + optional substring hit.
+# The status is collected with `|| rc=$?` so `set -e` cannot abort the suite.
+assert_runs() {
+    local label="$1" cmd="$2" expect="${3:-}"
+    local out rc=0
+    # A bare failing `out=$(…)` would exit the script here under `set -e`.
+    out=$(NO_COLOR=1 bash -c "$cmd" 2>&1) || rc=$?
+    if [ "$rc" -ne 0 ]; then
+        report fail "$label" "exit $rc; output: $(printf '%s' "$out" | head -3)"
+        return
+    fi
+    if [ -n "$expect" ] && ! grep -qF "$expect" <<<"$out"; then
+        report fail "$label" "missing expected substring '$expect'; got: $(printf '%s' "$out" | head -3)"
+        return
+    fi
+    report ok "$label"
+}
+
+# Helper: run a subcommand whose successful completion would normally need
+# docker / nvidia / systemd. We only care that the bash dispatch up to the
+# point of the missing dependency does NOT trip `set -u`. Exit code is
+# allowed to be non-zero; what we forbid is a raw "unbound variable" /
+# "syntax error" / "line N:" leak in the captured output.
+#
+# Wrapped in `timeout` so subcommands that exec into a follow-style binary
+# (logs → journalctl -f, status when systemd is healthy, etc.) don't hang
+# the test runner on a dev box where the underlying tools succeed.
+assert_no_set_u_leak() {
+    local label="$1" out
+    shift
+    # `|| true`: a non-zero exit (or the 5 s timeout firing) is acceptable here.
+    out=$(NO_COLOR=1 timeout 5 bash "$@" 2>&1 || true)
+    # Only three shapes count as a leak. The "line N:" one is tied to a
+    # ".sh:" filename prefix so journalctl lines such as
+    # "systemd[1385106]:" are not mistaken for bash diagnostics; bash
+    # always prints the source file before the line number, e.g.
+    #   /tmp/lbh-flat/lucebox.sh: line 200: VAR: unbound variable
+    if ! grep -qE 'unbound variable|syntax error|\.sh: line [0-9]+:' <<<"$out"; then
+        report ok "$label"
+    else
+        report fail "$label" "raw bash error leaked: $(head -3 <<<"$out")"
+    fi
+}
+
+echo "[test_lucebox_sh] running against $SCRIPT"
+
+# ── 1. shellcheck ─────────────────────────────────────────────────────────
+# Run shellcheck across every bash script we ship (the wrapper, the
+# in-container entrypoint, and every helper under scripts/). Error-level
+# findings fail the build; warnings are informational only — those have
+# been triaged and the SC2034/SC2155/SC2164 hits in sweep_ds4_2case.sh
+# aren't user-visible bugs.
+SHELLCHECK_TARGETS=(
+    "$SCRIPT"
+    "$ENTRYPOINT"
+    "$INSTALLER"
+)
+# Add every scripts/*.sh. This file is skipped here (-ef compares inodes, as
+# find's absolute path need not string-equal BASH_SOURCE) and added once below.
+while IFS= read -r -d '' f; do
+    [ "$f" -ef "${BASH_SOURCE[0]}" ] || SHELLCHECK_TARGETS+=("$f")
+done < <(find "$ROOT/scripts" -maxdepth 1 -name '*.sh' -type f -print0 2>/dev/null)
+SHELLCHECK_TARGETS+=("${BASH_SOURCE[0]}")
+
+if ! command -v shellcheck >/dev/null 2>&1; then
+    report fail "shellcheck not installed" "install via 'apt-get install -y shellcheck' (Ubuntu) or 'brew install shellcheck'"
+else
+    sc_out=$(shellcheck --severity=error "${SHELLCHECK_TARGETS[@]}" 2>&1) || sc_rc=$?
+    sc_rc="${sc_rc:-0}"  # still unset here means shellcheck exited 0
+    if [ "$sc_rc" -ne 0 ]; then
+        report fail "shellcheck --severity=error" "$(printf '%s' "$sc_out" | head -10)"
+    else
+        report ok "shellcheck --severity=error (${#SHELLCHECK_TARGETS[@]} files)"
+    fi
+fi
+
+# ── 2. Syntax / parse ─────────────────────────────────────────────────────
+if bash -n "$SCRIPT"; then report ok "bash -n lucebox.sh parses cleanly"
+else report fail "bash -n lucebox.sh"; fi
+if bash -n "$ENTRYPOINT"; then report ok "bash -n entrypoint.sh parses cleanly"
+else report fail "bash -n entrypoint.sh"; fi
+
+# ── 3. Trivial subcommands (zero-exit expected) ───────────────────────────
+assert_runs "help"     "bash '$SCRIPT' help"     "host-side wrapper"
+assert_runs "--help"   "bash '$SCRIPT' --help"   "host-side wrapper"
+assert_runs "-h"       "bash '$SCRIPT' -h"       "host-side wrapper"
+assert_runs "version"  "bash '$SCRIPT' version"  ""
+assert_runs "--version" "bash '$SCRIPT' --version" ""
+
+# ── 4. check — host-only, must run to completion even without docker/nvidia.
+#    This is the path that broke last time (multi-byte glyph + set -u).
+assert_runs "check"    "bash '$SCRIPT' check"    "host readiness report"
+
+# ── 5. systemd-surface subcommands — every one of these used to crash with
+# `LUCEBOX_HOST_HAS_SYSTEMD: unbound variable` because cmd_systemctl_passthrough
+# / cmd_logs / cmd_systemd_uninstall reached require_systemd without first
+# calling probe_host. The fix routes through require_systemd → probe_host
+# when the var is unset; these tests pin that invariant.
+#
+# On the bare runner there is no user systemd, no installed unit, and no
+# docker — so every command is expected to exit non-zero with a CLEAN error
+# message. What we forbid is a raw bash "unbound variable" leak.
+for sub in start stop restart enable disable status install uninstall; do
+    assert_no_set_u_leak "$sub dispatch (no set -u leak)" "$SCRIPT" "$sub"
+done
+# `logs` is special: it execs `journalctl -f` which streams every historical
+# journal record for the unit. On a dev box where the lucebox service has
+# actually run, that stream contains every past error — including the very
+# bugs this test exists to prevent — and we'd false-positive on them. Pass
+# `-n 0 --no-pager` so we only see new entries (none, in the test window).
+assert_no_set_u_leak "logs dispatch (no set -u leak)" "$SCRIPT" logs -n 0 --no-pager
+
+# ── 6. server-spawning subcommands — exercise the dispatch up to where
+# the missing docker daemon stops them. `serve` is intentionally skipped
+# because on a host with a working docker + the cuda12 image already
+# pulled, it would actually exec into the container — at which point
+# we'd be testing the image's entrypoint, not the wrapper. `pull` just
+# execs `docker pull`, so we still smoke its host-side dispatch.
+assert_no_set_u_leak "pull dispatch (no set -u leak)" "$SCRIPT" pull
+
+# ── 7. Unknown subcommand → cmd_in_container fallback path. Same rule:
+# clean error, no raw bash leak.
+assert_no_set_u_leak "unknown subcommand dispatch" "$SCRIPT" no-such-subcommand
+
+# ── 8. Pre-populated LUCEBOX_HOST_* env (simulates an already-probed host
+# whose vars are passed in from a parent process). Useful in CI matrices
+# where we want to mock a "good host" without nvidia-smi/docker on PATH.
+out=$(
+    NO_COLOR=1 \
+    LUCEBOX_HOST_HAS_SYSTEMD=0 \
+    LUCEBOX_HOST_HAS_DOCKER=0 \
+    LUCEBOX_HOST_HAS_CTK=none \
+    LUCEBOX_HOST_GPU_VENDOR=none \
+    LUCEBOX_HOST_GPU_NAME="" \
+    LUCEBOX_HOST_GPU_COUNT=0 \
+    LUCEBOX_HOST_VRAM_GB=0 \
+    LUCEBOX_HOST_GPU_SM="" \
+    LUCEBOX_HOST_DRIVER_VERSION="" \
+    LUCEBOX_HOST_DRIVER_MAJOR=0 \
+    LUCEBOX_HOST_NPROC=1 \
+    LUCEBOX_HOST_RAM_GB=0 \
+    LUCEBOX_HOST_IS_WSL=0 \
+    LUCEBOX_HOST_DOCKER_VERSION="" \
+    timeout 5 bash "$SCRIPT" start 2>&1 || true
+)
+if grep -qE 'unbound variable|syntax error' <<<"$out"; then
+    report fail "start with pre-populated LUCEBOX_HOST_* env" "leak: $(head -3 <<<"$out")"
+else
+    report ok "start with pre-populated LUCEBOX_HOST_* env"
+fi
+
+# ── 8b. PIN the top-of-script LUCEBOX_HOST_* safe-default seeds. Even with
+# probe_host short-circuited to a no-op (the worst-case bug recurrence: a
+# future refactor accidentally deletes the call from a dispatch path) the
+# wrapper must not leak `unbound variable` on `start`. We achieve "probe_host
+# is a no-op" by exporting `_LUCEBOX_HOST_PROBED=1` so ensure_probed skips
+# the real probe — equivalent to a future refactor that calls ensure_probed
+# but mis-implements the gate.
+out=$(
+    NO_COLOR=1 \
+    _LUCEBOX_HOST_PROBED=1 \
+    timeout 5 bash "$SCRIPT" start 2>&1 || true
+)
+if grep -qE 'unbound variable|syntax error' <<<"$out"; then
+    report fail "start with probe_host bypassed (seed defaults must catch this)" "leak: $(head -3 <<<"$out")"
+else
+    report ok "start with probe_host bypassed (seed defaults intact)"
+fi
+
+# Same for every other systemd-surface subcommand, since the seed defaults
+# are the only thing keeping these safe under `set -u` if probe_host is ever
+# bypassed.
+for sub in stop restart enable disable status install uninstall logs; do  # NOTE(review): `-n 0 --no-pager` below reaches every subcommand, not just logs — confirm intended
+    out=$(
+        NO_COLOR=1 \
+        _LUCEBOX_HOST_PROBED=1 \
+        timeout 5 bash "$SCRIPT" "$sub" -n 0 --no-pager 2>&1 || true
+    )
+    if grep -qE 'unbound variable|syntax error' <<<"$out"; then
+        report fail "$sub with probe_host bypassed" "leak: $(head -3 <<<"$out")"
+    else
+        report ok "$sub with probe_host bypassed"
+    fi
+done
+
+# ── 8c. Install path writes a robust unit file. Use a sandbox HOME so we
+# don't clobber the developer's real ~/.config/systemd/user/lucebox.service,
+# and verify the generated unit contains the Environment= / ExecStartPre=
+# hardening that Bug 2 ("systemctl start succeeds but no container") added.
+# The install runs in a host with no real systemd (the sandbox doesn't have
+# `systemctl --user`), so we pre-seed LUCEBOX_HOST_HAS_SYSTEMD=1 to slip past
+# the require_systemd gate, then stub out the `systemctl` binary itself so
+# daemon-reload is a no-op.
+test_install_writes_robust_unit() {  # run `install` in a throwaway HOME with stubbed tools; check the unit's directives
+    local label="install writes hardened unit file"
+    local sandbox shim_dir binname needle  # loop variables stay function-local
+    sandbox=$(mktemp -d)
+    shim_dir="$sandbox/bin"
+    mkdir -p "$shim_dir"
+    # Stub systemctl + docker + nvidia-smi + loginctl so the install's
+    # require_host_prereqs and daemon-reload calls all succeed.
+    for binname in systemctl docker nvidia-smi loginctl; do
+        cat > "$shim_dir/$binname" <<'STUB'
+#!/usr/bin/env bash
+case "$1" in
+  ps|version) exit 0 ;;
+  show-user) echo "Linger=no" ;;
+  --query-gpu=*) echo "Fake, 24576, 550.00, 8.9" ;;
+esac
+exit 0
+STUB
+        chmod +x "$shim_dir/$binname"
+    done
+    local out rc unit_path
+    unit_path="$sandbox/.config/systemd/user/lucebox.service"
+    out=$(
+        set +e
+        HOME="$sandbox" \
+        XDG_CONFIG_HOME="$sandbox/.config" \
+        XDG_DATA_HOME="$sandbox/.local/share" \
+        PATH="$shim_dir:$PATH" \
+        LUCEBOX_HOST_HAS_SYSTEMD=1 \
+        LUCEBOX_HOST_HAS_DOCKER=1 \
+        LUCEBOX_HOST_HAS_CTK=runtime \
+        LUCEBOX_HOST_GPU_VENDOR=nvidia \
+        _LUCEBOX_HOST_PROBED=1 \
+        NO_COLOR=1 \
+        timeout 10 bash "$SCRIPT" install 2>&1
+        echo "RC=$?"
+    )
+    rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//')
+    rc="${rc:-99}"
+    if [ "$rc" != "0" ]; then
+        report fail "$label" "exit $rc; output: $(head -10 <<<"$out")"
+        rm -rf "$sandbox"
+        return
+    fi
+    if [ ! -f "$unit_path" ]; then
+        report fail "$label" "unit file not written at $unit_path"
+        rm -rf "$sandbox"
+        return
+    fi
+    # Required hardening — each line is a Bug-2 root-cause defence:
+    #   ExecStartPre=…docker rm -f …   → clear orphaned container name
+    #   Environment=PATH=…             → systemd user-session PATH is sparse
+    #   Environment=LUCEBOX_IMAGE=…    → pin the image the user installed against
+    local missing=""
+    for needle in \
+        "ExecStartPre=" \
+        "Environment=PATH=" \
+        "Environment=LUCEBOX_IMAGE=" \
+        "Environment=LUCEBOX_VARIANT=" \
+        "Environment=LUCEBOX_PORT=" \
+        "Environment=LUCEBOX_MODELS=" \
+        ; do
+        grep -qF "$needle" "$unit_path" || missing="$missing $needle"
+    done
+    if [ -n "$missing" ]; then
+        report fail "$label" "unit missing required directives:$missing"
+        rm -rf "$sandbox"
+        return
+    fi
+    report ok "$label"
+    rm -rf "$sandbox"
+}
+test_install_writes_robust_unit
+
+# ── 9. entrypoint.sh dispatch — confirm the in-container dispatch routes
+# trivial subcommands (shell, an unknown passthrough) without firing
+# `set -u` on DFLASH_* / DRAFT_* vars that only get assigned on the
+# serve path. We can't fully exec the serve path here (it needs nvidia
+# and the compiled binary) but we can confirm the early dispatch is clean.
+#
+# Each `exec` would actually try to run the underlying binary, which we
+# don't have — so we shim it by overriding `exec` via a wrapper script.
+# Easier: just confirm `bash -n` parses and run a tiny subset.
+out=$(NO_COLOR=1 SUBCMD=help bash -c "
+    cd '$ROOT'
+    # Simulate 'docker run ... lucebox-hub:cuda12 shell ...': the entrypoint is
+    # handed the shell subcommand plus argv, with a stub uv first on PATH.
+    # ROOT/ENTRYPOINT are single-quoted so checkout paths with spaces survive.
+    tmpdir=\$(mktemp -d)
+    trap 'rm -rf \$tmpdir' EXIT
+    cat > \$tmpdir/uv <<'STUB'
+#!/usr/bin/env bash
+echo \"uv stub: \$*\"
+exit 0
+STUB
+    chmod +x \$tmpdir/uv
+    PATH=\$tmpdir:\$PATH bash '$ENTRYPOINT' shell -c 'echo entrypoint-shell-dispatched'
+" 2>&1 || true)
+if grep -qE 'unbound variable|syntax error' <<<"$out"; then
+    report fail "entrypoint shell dispatch (no set -u leak)" "leak: $(head -5 <<<"$out")"
+else
+    report ok "entrypoint shell dispatch (no set -u leak)"
+fi
+
+# ── 10. entrypoint.sh serve-path under `set -u` — drive the REAL
+# server/scripts/entrypoint.sh through its full draft-resolution block by
+# sandboxing it with a synthetic DFLASH_DIR layout and a `dflash_server`
+# shim that captures argv instead of execing the native binary. The
+# `DRAFT_FAMILY_GLOB: unbound variable` bug fired precisely here — the
+# previous version of this test inlined the block instead of sourcing
+# the real file, and silently passed even when the shipped script was
+# broken. So this test invokes server/scripts/entrypoint.sh directly.
+# Build the shared entrypoint-serve sandbox: a synthetic DFLASH_DIR layout
+# plus the `dflash_server` + `nvidia-smi` shims used by the three serve-path
+# tests below. Assigns sandbox/models_dir/draft_dir/bin_dir/shim_dir into the
+# CALLER'S scope (bash dynamic scoping) — the caller must `local`-declare
+# them first. Mirrors the _make_docker_shim factoring above.
+_make_entrypoint_sandbox() {  # assigns sandbox/models_dir/draft_dir/bin_dir/shim_dir without `local`
+    sandbox=$(mktemp -d)  # not removed here; the visible callers rm -rf "$sandbox" themselves
+    models_dir="$sandbox/models"
+    draft_dir="$models_dir/draft"
+    bin_dir="$sandbox/build"
+    shim_dir="$sandbox/bin"
+    mkdir -p "$draft_dir" "$bin_dir" "$shim_dir"  # draft_dir is nested under models_dir, so -p creates both
+    # `dflash_server` shim — print argv and exit 0 instead of running.
+    cat > "$bin_dir/dflash_server" <<'STUB'
+#!/usr/bin/env bash
+printf '[shim] dflash_server'
+for a in "$@"; do printf ' %q' "$a"; done
+printf '\n'
+exit 0
+STUB
+    chmod +x "$bin_dir/dflash_server"
+    # `nvidia-smi` shim — pretend we have a 24 GB GPU so the autotune
+    # block runs but doesn't pick the under-12-GB warn tier.
+    cat > "$shim_dir/nvidia-smi" <<'STUB'
+#!/usr/bin/env bash
+case "$*" in
+  *"--query-gpu=memory.total"*) echo 24576 ;;
+  -L|*-L*) echo "GPU 0: Fake (UUID: 0)" ;;
+  *) echo "ok" ;;
+esac
+exit 0
+STUB
+    chmod +x "$shim_dir/nvidia-smi"
+}
+
+test_entrypoint_serve_path() {
+    local label="$1" target_name="$2" draft_file="$3"
+    local sandbox draft_dir models_dir bin_dir shim_dir out rc problem=""
+    _make_entrypoint_sandbox
+    # Empty placeholder files: DFLASH_TARGET is passed explicitly below, so
+    # the target does not have to clear the >= 5 GB auto-detect filter.
+    touch "$models_dir/$target_name"
+    touch "$draft_dir/$draft_file"
+
+    out=$(
+        set +e
+        PATH="$shim_dir:$PATH" \
+        DFLASH_DIR="$sandbox" \
+        DFLASH_SERVER_BIN="$bin_dir/dflash_server" \
+        DFLASH_TARGET="$models_dir/$target_name" \
+        DFLASH_DRAFT="$draft_dir" \
+            timeout 10 bash "$ENTRYPOINT" serve 2>&1
+        echo "RC=$?"
+    )
+    rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//')
+    : "${rc:=99}"
+    rm -rf "$sandbox"
+    # Checks run in priority order: leak, then exit status, then shim marker.
+    # The first one that trips fills $problem; empty means a clean run.
+    if grep -qE 'unbound variable|syntax error' <<<"$out"; then
+        problem="leak: $(head -5 <<<"$out")"
+    elif [ "$rc" != "0" ]; then
+        problem="exit $rc; output: $(head -5 <<<"$out")"
+    elif ! grep -qF "[shim] dflash_server" <<<"$out"; then
+        problem="shim never executed; output: $(head -5 <<<"$out")"
+    fi
+    if [ -n "$problem" ]; then report fail "$label" "$problem"; else report ok "$label"; fi
+}
+
+# Exercise three branches of the family-glob logic: qwen3.6 + gemma-4 (the
+# two families with family-specific globs) and an unknown target that
+# triggers the empty-FAMILY_GLOBS fallback to the generic glob list.
+test_entrypoint_serve_path "entrypoint serve: qwen3.6 family match" \
+    "Qwen3.6-27B-Q4_K_M.gguf" "dflash-draft-3.6-test.gguf"
+test_entrypoint_serve_path "entrypoint serve: gemma-4-31b family match" \
+    "gemma-4-31B-it-Q8_0.gguf" "gemma-4-31b-dflash-q8.gguf"
+test_entrypoint_serve_path "entrypoint serve: generic fallback" \
+    "Mystery-Model-7B.gguf" "model.gguf"
+
+# ── 11. entrypoint.sh serve-path with MULTIPLE target-sized GGUFs in
+# models/. The single-candidate fixture in test 10 doesn't exercise the
+# auto-detect path that picks "first alphabetically" when more than one
+# target ≥5 GB lives in the models dir — that path is what the sindri
+# decode sweep tripped over after the user added the qwen3.6-moe preset
+# (commit 4b6bced) alongside the existing Qwen3.6-27B target. The crash
+# manifested as `DRAFT_FAMILY_GLOB: unbound variable`, and the partial
+# fix in a87bb93 didn't survive a recurrence.
+#
+# Uses sparse files (`truncate -s 6G`) so the test stays cheap on disk —
+# the 6 GB virtual size is enough to clear the find ... -size +5G filter
+# without consuming actual blocks. Skip if truncate is missing (e.g.
+# minimal busybox CI image).
+test_entrypoint_multi_target() {  # $1 = label; expects the entrypoint to refuse two target-sized GGUFs
+    local label="$1"
+    shift  # no later code reads the remaining arguments
+    if ! command -v truncate &>/dev/null; then
+        report ok "$label (skipped: truncate not available)"
+        return
+    fi
+    local sandbox draft_dir models_dir bin_dir shim_dir
+    _make_entrypoint_sandbox
+    # Two qwen3.6-shaped targets ≥5 GB each — exactly the layout that
+    # broke on sindri (Qwen3.6-27B + Qwen3.6-35B-A3B-UD-Q4_K_M).
+    truncate -s 6G "$models_dir/Qwen3.6-27B-Q4_K_M.gguf"
+    truncate -s 6G "$models_dir/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
+    touch "$draft_dir/dflash-draft-3.6-test.gguf"
+
+    local out rc
+    out=$(
+        set +e
+        # NOTE: deliberately NOT setting DFLASH_TARGET — the test must
+        # exercise the auto-detect block (line ~151). The explicit-config
+        # workaround from the bug report would skip the bug entirely.
+        PATH="$shim_dir:$PATH" \
+        DFLASH_DIR="$sandbox" \
+        DFLASH_SERVER_BIN="$bin_dir/dflash_server" \
+        DFLASH_DRAFT="$draft_dir" \
+            timeout 10 bash "$ENTRYPOINT" serve 2>&1
+        echo "RC=$?"
+    )
+    rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//')
+    rc="${rc:-99}"  # 99 = no RC= marker found in the captured output
+    rm -rf "$sandbox"
+    # The auto-detect block is entered (so any `set -u` regression on
+    # DRAFT_FAMILY_GLOB will trip) and then the entrypoint refuses to
+    # auto-pick — the deliberate safety added in PR #334's cubic round.
+    # We require: no set-u leak, the refuse warn fired, a non-zero exit
+    # (so a future regression that logs the warning but still returns 0
+    # cannot slip past — the container MUST fail to start, not silently
+    # auto-pick a stale GGUF), and the shim was NOT exec'd.
+    if grep -qE 'unbound variable|syntax error' <<<"$out"; then
+        report fail "$label" "leak: $(grep -E 'unbound variable|syntax error' <<<"$out" | head -3)"
+    elif ! grep -qF "Refusing to auto-select" <<<"$out"; then
+        report fail "$label" "refuse-to-auto-pick warn missing — did the auto-detect block fire?  rc=$rc output: $(head -5 <<<"$out")"
+    elif [ "$rc" = "0" ]; then
+        report fail "$label" "refuse warn fired but rc=0 — entrypoint must exit non-zero on multi-target refuse"
+    elif grep -qF "[shim] dflash_server" <<<"$out"; then
+        report fail "$label" "shim was exec'd despite multi-target refuse"
+    else
+        report ok "$label"
+    fi
+}
+
+# Drive the regression: the sindri layout that broke (post-moe-preset).
+test_entrypoint_multi_target "entrypoint serve: multi-target auto-detect (no DRAFT_FAMILY_GLOB leak)"
+
+# Also drive the DFLASH_DRAFT-is-a-file path. The init at entrypoint.sh:257
+# sits inside `if [ -d "$DFLASH_DRAFT" ]; then` — when DRAFT is a file the
+# block is skipped, and any future read of DRAFT_FAMILY_GLOB outside the
+# block would trip set -u. The defensive `:-` guard at the read site is
+# meant to survive that refactor; this test guarantees it.
+test_entrypoint_draft_is_file() {  # $1 = label; serve with DFLASH_DRAFT naming a file must exit 0 without a leak
+    local label="$1"
+    local sandbox draft_dir models_dir bin_dir shim_dir
+    _make_entrypoint_sandbox
+    touch "$models_dir/Qwen3.6-27B-Q4_K_M.gguf"
+    # DFLASH_DRAFT points at a FILE (not a directory).
+    touch "$draft_dir/dflash-draft-3.6-test.gguf"
+
+    local out rc
+    out=$(
+        set +e
+        PATH="$shim_dir:$PATH" \
+        DFLASH_DIR="$sandbox" \
+        DFLASH_SERVER_BIN="$bin_dir/dflash_server" \
+        DFLASH_TARGET="$models_dir/Qwen3.6-27B-Q4_K_M.gguf" \
+        DFLASH_DRAFT="$draft_dir/dflash-draft-3.6-test.gguf" \
+            timeout 10 bash "$ENTRYPOINT" serve 2>&1
+        echo "RC=$?"
+    )
+    rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//')
+    rc="${rc:-99}"  # 99 = no RC= marker found in the captured output
+    rm -rf "$sandbox"
+    if grep -qE 'unbound variable|syntax error' <<<"$out"; then
+        report fail "$label" "leak: $(grep -E 'unbound variable|syntax error' <<<"$out" | head -3)"
+    elif [ "$rc" != "0" ]; then
+        report fail "$label" "exit $rc; output: $(head -5 <<<"$out")"
+    else
+        report ok "$label"  # unlike the serve-path test, the shim marker is not checked here
+    fi
+}
+test_entrypoint_draft_is_file "entrypoint serve: DFLASH_DRAFT is a file (no DRAFT_FAMILY_GLOB leak)"
+
+# ── 12. entrypoint.sh writes HOST_INFO atomically on the serve path. The
+# C++ server reads /opt/lucebox-hub/HOST_INFO into ServerConfig.host_info
+# and surfaces it under /props.host. We can't write to /opt/lucebox-hub
+# from the test runner, so override the path by sourcing the helpers and
+# calling _build_host_info_json directly. The full entrypoint runs in
+# test 10/11 already; this test pins the JSON shape independently.
+test_entrypoint_host_info_json() {
+    local label="$1" out
+    # Pull the JSON helpers (incl. _trim, which _emit_gpu_array calls) out of the real entrypoint.sh.
+    # shellcheck disable=SC1090
+    source <(awk '/^_json_escape\(\) \{/,/^\}/' "$ENTRYPOINT")
+    # shellcheck disable=SC1090
+    source <(awk '/^_json_str_or_null\(\) \{/,/^\}/' "$ENTRYPOINT")
+    # shellcheck disable=SC1090
+    source <(awk '/^_json_int_or_null\(\) \{/,/^\}/' "$ENTRYPOINT")
+    # shellcheck disable=SC1090
+    source <(awk '/^_trim\(\) \{/,/^\}/' "$ENTRYPOINT")
+    # shellcheck disable=SC1090
+    source <(awk '/^_emit_gpu_array\(\) \{/,/^\}/' "$ENTRYPOINT")
+    # shellcheck disable=SC1090
+    source <(awk '/^_build_host_info_json\(\) \{/,/^\}/' "$ENTRYPOINT")
+
+    # Populated path: the fixtures sit INSIDE $(...) so they never persist in the runner's shell.
+    out=$(LUCEBOX_HOST_OS_PRETTY="Ubuntu 22.04.3 LTS" \
+        LUCEBOX_HOST_KERNEL="6.6.87.2-microsoft-standard-WSL2" \
+        LUCEBOX_HOST_WSL_VERSION="wsl2" \
+        LUCEBOX_HOST_DOCKER_VERSION="29.1.3" \
+        LUCEBOX_HOST_DRIVER_VERSION="596.36" \
+        LUCEBOX_HOST_NVIDIA_CTK_VERSION="1.16.2" \
+        LUCEBOX_HOST_CPU_MODEL='Intel(R) Core(TM) Ultra 9 275HX' \
+        LUCEBOX_HOST_NPROC=24 \
+        LUCEBOX_HOST_RAM_GB=64 \
+        LUCEBOX_HOST_GPU_LIST_CSV="0, GPU-abc, 00000000:01:00.0, NVIDIA RTX 5090, 12.0, 24576 MiB, 175.00 W" \
+        LUCEBOX_HOST_CUDA_VISIBLE_DEVICES="0" \
+        _build_host_info_json "lucebox.sh" "lucebox.sh" "2026-05-28T20:31:42Z")
+    if ! python3 -c "import json,sys; d=json.loads(sys.argv[1]); assert d['os_pretty']=='Ubuntu 22.04.3 LTS'; assert d['wsl_version']=='wsl2'; assert d['nvidia_ctk_version']=='1.16.2'; assert d['source']=='lucebox.sh'; assert d['gpus'][0]['vram_gb']==24; assert d['gpus'][0]['name']=='NVIDIA RTX 5090'" "$out" >/dev/null 2>&1; then
+        report fail "$label (populated)" "JSON shape mismatch: $out"
+        return
+    fi
+    # Unknown path: empty environment (env -i), so every LUCEBOX_HOST_* is unset → nulls and source=unknown.
+    out=$(env -i bash -c "
+        set -u
+        $(declare -f _json_escape _json_str_or_null _json_int_or_null _trim _emit_gpu_array _build_host_info_json)
+        _build_host_info_json 'unknown' 'entrypoint.sh' '2026-05-28T20:31:42Z'
+    ")
+    if ! python3 -c "import json,sys; d=json.loads(sys.argv[1]); assert d['source']=='unknown'; assert d['gpus']==[]; assert d['os_pretty'] is None" "$out" >/dev/null 2>&1; then
+        report fail "$label (unknown)" "JSON shape mismatch: $out"
+        return
+    fi
+    report ok "$label"
+}
+test_entrypoint_host_info_json "entrypoint HOST_INFO JSON shape (populated + unknown)"
+
+# ── install.sh end-to-end ─────────────────────────────────────────────────
+# Drive install.sh against a file:// URL pointing at a fixture lucebox.sh,
+# verify the installed copy has LUCEBOX_INSTALLED_FROM rewritten to the
+# fetched URL — that's the contract that `lucebox update` depends on to
+# preserve the user's channel across upgrades.
+test_install_sh_bakes_source_url() {
+    local label="$1"
+    local tmp dest_dir dest_path src_url out rc
+    tmp=$(mktemp -d -t lucebox-install.XXXXXX)
+    # The real lucebox.sh stands in for the "remote" file: curl handles
+    # file:// out of the box, so install.sh takes the same path as an https fetch.
+    src_url="file://$SCRIPT"
+    dest_dir="$tmp/bin"
+    dest_path="$dest_dir/lucebox"
+    out=$(LUCEBOX_INSTALL_URL="$src_url" LUCEBOX_INSTALL_DEST="$dest_path" \
+        NO_COLOR=1 bash "$INSTALLER" 2>&1) || rc=$?
+    rc="${rc:-0}"
+    if [ "$rc" -ne 0 ]; then
+        rm -rf "$tmp"
+        report fail "$label" "installer exited $rc; output: $(printf '%s' "$out" | head -3)"
+        return
+    fi
+    if [ ! -x "$dest_path" ]; then
+        rm -rf "$tmp"
+        report fail "$label" "installed file missing or not executable at $dest_path"
+        return
+    fi
+    # Fixed-string, whole-line match (-xF): $src_url is a path, not a regex.
+    if ! grep -qxF "LUCEBOX_INSTALLED_FROM=\"$src_url\"" "$dest_path"; then
+        rm -rf "$tmp"
+        report fail "$label" "LUCEBOX_INSTALLED_FROM not rewritten in installed copy"
+        return
+    fi
+    rm -rf "$tmp"
+    report ok "$label"
+}
+test_install_sh_bakes_source_url "install.sh bakes LUCEBOX_INSTALLED_FROM into installed copy"
+
+# ── update dispatch ───────────────────────────────────────────────────────
+# `lucebox update` must dispatch to cmd_update — verify it's wired in the
+# main case statement and appears in --help. We can't actually run the
+# update (it'd curl + replace this very script) so the test is parse-level.
+test_update_subcommand_wired() {
+    local label="$1" help_text
+    # Two parse-level probes: the --help listing, then the dispatch arm in the source.
+    help_text=$(LUCEBOX_HOST_HAS_SYSTEMD=0 "$SCRIPT" --help 2>&1)
+    grep -q '^  update ' <<<"$help_text" || {
+        report fail "$label" "update command missing from --help output"
+        return
+    }
+    grep -q '^[[:space:]]*update)[[:space:]]*cmd_update' "$SCRIPT" || {
+        report fail "$label" "update) → cmd_update dispatch not wired"
+        return
+    }
+    report ok "$label"
+}
+test_update_subcommand_wired "lucebox update subcommand is wired"
+
+# ── IMAGE_BASE derived from install source ────────────────────────────────
+# Extract _lucebox_derive_image from lucebox.sh and call it in a child bash
+# with various install URLs, then check that the derived image base comes
+# out right. Only the function is extracted, so the wrapper's main() never runs.
+test_image_base_derives_from_install_url() {
+    # `case` is local too, so the loop variable cannot leak into the runner's globals.
+    local label="$1" case url expected got derive_fn
+    # Extract only the derivation function and run it in isolation —
+    # sourcing the whole script triggers main() and side effects we don't
+    # want under a test harness. Extracted once: it does not vary per case.
+    derive_fn=$(sed -n "/^_lucebox_derive_image()/,/^}/p" "$SCRIPT")
+    for case in \
+        "https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh|ghcr.io/easel/lucebox-hub" \
+        "https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/lucebox.sh|ghcr.io/luce-org/lucebox-hub" \
+        "https://raw.githubusercontent.com/easel/lucebox-hub/601ab52/lucebox.sh|ghcr.io/easel/lucebox-hub" \
+        "https://example.com/bogus|ghcr.io/luce-org/lucebox-hub"
+    do
+        # Each case is "<install URL>|<expected image base>".
+        url="${case%%|*}"
+        expected="${case##*|}"
+        got=$(bash -c "$derive_fn"$'\n''_lucebox_derive_image "$1"' bash "$url")
+        if [ "$got" != "$expected" ]; then
+            report fail "$label" "url=$url expected=$expected got=$got"
+            return
+        fi
+    done
+    report ok "$label"
+}
+test_image_base_derives_from_install_url "IMAGE_BASE derived from LUCEBOX_INSTALLED_FROM (4 URL shapes)"
+
+# ── config.toml reader + resolver ─────────────────────────────────────────
+# Drive _lucebox_config_get + _lucebox_resolve against a fixture
+# config.toml in a tmp $LUCEBOX_HOME. Verifies the wrapper agrees with
+# the Python CLI on every scalar that lives in [image]/[runtime]/[paths].
+test_config_toml_reader_and_resolve() {
+    local label="$1" tmp got
+    tmp=$(mktemp -d -t lucebox-cfg.XXXXXX)
+    cat > "$tmp/config.toml" <<'TOML'
+[image]
+variant = "cuda13"
+registry = "ghcr.io/myorg/forkedhub"
+
+[runtime]
+port = 9090
+container_name = "luce-test"
+
+[paths]
+models = "/srv/models"
+
+[dflash]
+budget = 22
+lazy = false
+TOML
+
+    # Exercise both helpers + the resolver in a child bash that is fed the
+    # relevant snippets out of lucebox.sh. Each case has four |-separated fields:
+    # env_value | toml_key | default | expected
+    local cases=(
+        "|image.registry|ghcr.io/luce-org/lucebox-hub|ghcr.io/myorg/forkedhub"
+        "|image.variant|cuda12|cuda13"
+        "|runtime.port|8080|9090"
+        "|runtime.container_name|lucebox|luce-test"
+        "|paths.models|/var/lib/lucebox|/srv/models"
+        "OVERRIDE|image.registry|ghcr.io/luce-org/lucebox-hub|OVERRIDE"
+        "|missing.key|fallback-default|fallback-default"
+    )
+    local case env_value toml_key default expected
+    for case in "${cases[@]}"; do
+        IFS='|' read -r env_value toml_key default expected <<<"$case"
+        got=$(LUCEBOX_HOME="$tmp" bash -c '
+            '"$(sed -n "/^_lucebox_config_path()/,/^}/p" "$SCRIPT")"'
+            '"$(sed -n "/^_lucebox_config_get()/,/^}/p" "$SCRIPT")"'
+            '"$(sed -n "/^_lucebox_resolve()/,/^}/p" "$SCRIPT")"'
+            _lucebox_resolve "$1" "$2" "$3"
+        ' bash "$env_value" "$toml_key" "$default")
+        if [ "$got" != "$expected" ]; then
+            rm -rf "$tmp"
+            report fail "$label" "env=$env_value key=$toml_key default=$default expected=$expected got=$got"
+            return
+        fi
+    done
+    rm -rf "$tmp"
+    report ok "$label"
+}
+test_config_toml_reader_and_resolve "config.toml reader + env > toml > default resolution (7 cases)"
+
+# ── cmd_serve under systemd: INVOCATION_ID short-circuits is-active ──────
+# When systemd invokes the wrapper as a unit's ExecStart, it sets
+# $INVOCATION_ID. The wrapper must NOT then refuse "already running under
+# systemd" — that's a self-defeating check that turns into a restart loop.
+# Verify the guard is present in the source (the actual behavior requires
+# a running systemd unit to test end-to-end, which the harness can't do).
+test_cmd_serve_invocation_id_guard() {
+    local label="$1"
+    grep -q 'INVOCATION_ID' "$SCRIPT" || {
+        report fail "$label" "INVOCATION_ID guard missing from cmd_serve preflight"
+        return
+    }
+    # Proximity probe: an is-active line only counts when the most recent
+    # INVOCATION_ID mention is on the same line or at most three lines
+    # above it — i.e. the guard is the AND-condition gating that check.
+    awk '
+        /INVOCATION_ID/ { guard_line = NR }
+        /is-active --quiet "\$UNIT_NAME"/ && guard_line && NR - guard_line <= 3 {
+            wired = 1
+        }
+        END { exit !wired }
+    ' "$SCRIPT" || {
+        report fail "$label" "INVOCATION_ID not adjacent to is-active check (guard not wired)"
+        return
+    }
+    report ok "$label"
+}
+test_cmd_serve_invocation_id_guard "cmd_serve has INVOCATION_ID guard on systemd is-active check"
+
+# ── cmd_systemctl_passthrough: smart start ───────────────────────────────
+# Verify the source has the "already active" + "restart loop" short
+# circuits for the start action. Behavior-level testing requires a real
+# unit; this is a source-level guarantee that the branches exist.
+test_cmd_start_already_active_shortcircuit() {
+    local label="$1" probe
+    # Each probe is "<marker text in lucebox.sh>|<short-circuit name>"; the
+    # first marker that is absent from the source fails the test.
+    for probe in 'is already active|already-active' 'is in restart-loop|restart-loop'; do
+        if ! grep -q "${probe%%|*}" "$SCRIPT"; then
+            report fail "$label" "${probe##*|} short-circuit missing"
+            return
+        fi
+    done
+    report ok "$label"
+}
+test_cmd_start_already_active_shortcircuit "lucebox start has already-active + restart-loop short-circuits"
+
+# ── install.sh SHA-pin refusal + CHANNEL override ────────────────────────
+# A SHA-pinned LUCEBOX_INSTALL_URL with no LUCEBOX_INSTALL_CHANNEL must
+# refuse — otherwise `lucebox update` would re-fetch that frozen SHA
+# forever. With CHANNEL set, the bake-in uses the channel URL, not the
+# fetch URL.
+test_install_sha_pin_refusal_and_channel_override() {
+    local label="$1" tmp got rc
+    tmp=$(mktemp -d -t lucebox-sha.XXXXXX)
+
+    # Case 1: SHA-pinned URL without CHANNEL → must refuse and write no file
+    LUCEBOX_INSTALL_URL="https://raw.githubusercontent.com/easel/lucebox-hub/abc1234567/lucebox.sh" \
+    LUCEBOX_INSTALL_DEST="$tmp/lucebox1" \
+    NO_COLOR=1 \
+        bash "$INSTALLER" >/dev/null 2>&1 && rc=0 || rc=$?
+    if [ "$rc" -eq 0 ]; then
+        rm -rf "$tmp"
+        report fail "$label" "SHA-pinned URL without CHANNEL should have refused (rc=$rc, got success)"
+        return
+    fi
+    if [ -f "$tmp/lucebox1" ]; then
+        rm -rf "$tmp"
+        report fail "$label" "SHA-pinned URL refusal still wrote $tmp/lucebox1"
+        return
+    fi
+
+    # Case 2: CHANNEL set (fetch URL is a local file://) → installs, bakes CHANNEL
+    LUCEBOX_INSTALL_URL="file://$SCRIPT" \
+    LUCEBOX_INSTALL_CHANNEL="https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh" \
+    LUCEBOX_INSTALL_DEST="$tmp/lucebox2" \
+    NO_COLOR=1 \
+        bash "$INSTALLER" >/dev/null 2>&1 || rc=$?
+    got=$(grep '^LUCEBOX_INSTALLED_FROM=' "$tmp/lucebox2" 2>/dev/null || echo missing)
+    if [ "$got" != 'LUCEBOX_INSTALLED_FROM="https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh"' ]; then
+        rm -rf "$tmp"
+        report fail "$label" "CHANNEL not baked; got: $got"
+        return
+    fi
+
+    rm -rf "$tmp"
+    report ok "$label"
+}
+test_install_sha_pin_refusal_and_channel_override "install.sh refuses SHA-pin without CHANNEL + honors CHANNEL override"
+
+# ── lucebox completion ───────────────────────────────────────────────────
+# Source the generated bash completion in a child shell and complete "conf".
+test_completion_bash() {
+    local label="$1" candidates
+    candidates=$(LUCEBOX_HOST_HAS_SYSTEMD=0 bash -c '
+        source <("$1" completion bash 2>/dev/null)
+        COMP_WORDS=(lucebox conf)
+        COMP_CWORD=1
+        _lucebox_complete
+        printf "%s\n" "${COMPREPLY[@]}"
+    ' bash "$SCRIPT")
+    grep -qx 'config' <<<"$candidates" || {
+        report fail "$label" "completion didn't suggest 'config' for prefix 'conf'; got: $(printf '%s' "$candidates" | tr '\n' ' ')"
+        return
+    }
+    report ok "$label"
+}
+test_completion_bash "lucebox completion bash completes a known prefix"
+
+# ── docker exec routing ───────────────────────────────────────────────────
+# When the lucebox container is running, steady-state subcommands must
+# `docker exec` into it (cheap + shares the live server's net namespace) and
+# service-restarting subcommands (serve, pull, ...) must stay on
+# `docker run`. We mock docker via a PATH shim that:
+#   - on `docker ps -q -f name=^lucebox$` prints a fake container id
+#     (signals "container is running") iff the shim was built with running=1.
+#   - on any other call (run, exec, pull, ...) echoes its argv on stdout and
+#     exits 0. The test then asserts on the captured first-token (run vs exec)
+#     and trailing argv.
+#
+# nvidia-smi is stubbed too so probe_host doesn't barf, but the captured argv
+# we care about is the docker invocation downstream of dispatch.
+_make_docker_shim() {
+    local sandbox="$1" running="$2"
+    local shim_dir="$sandbox/bin"
+    mkdir -p "$shim_dir"
+    # docker shim: dispatch on first arg. `ps` prints a fake container id when
+    # this function's 2nd argument is 1 (the value is baked into the stub by the
+    # unquoted heredoc) and nothing otherwise; `version`/`inspect` print canned
+    # output. Everything else (run, exec, pull) prints "DOCKER_INVOKED <argv>".
+    cat > "$shim_dir/docker" <<STUB
+#!/usr/bin/env bash
+case "\$1" in
+    ps)
+        # The wrapper calls: docker ps -q -f name=^lucebox\$
+        if [ "$running" = "1" ]; then
+            echo "deadbeefcafe"
+        fi
+        exit 0
+        ;;
+    version)
+        echo "29.1.3"
+        exit 0
+        ;;
+    inspect)
+        # cmd_serve probes container state — only relevant for the serve
+        # path which our tests don't drive. Return "absent" defensively.
+        echo "absent"
+        exit 0
+        ;;
+    *)
+        printf 'DOCKER_INVOKED'
+        for a in "\$@"; do printf ' %q' "\$a"; done
+        printf '\n'
+        exit 0
+        ;;
+esac
+STUB
+    chmod +x "$shim_dir/docker"
+    # nvidia-smi stub: canned CSV row for --query-gpu=, "ok" otherwise, always exit 0.
+    cat > "$shim_dir/nvidia-smi" <<'STUB'
+#!/usr/bin/env bash
+case "$*" in
+    *"--query-gpu="*) echo "Fake GPU, 24576, 550.00, 8.9" ;;
+    *) echo "ok" ;;
+esac
+exit 0
+STUB
+    chmod +x "$shim_dir/nvidia-smi"
+}
+
+# Drive the wrapper through the dispatch case under test and capture the
+# docker invocation it would have exec'd. Because `cmd_in_container` /
+# `cmd_exec_in_container` call `exec docker ...` we replace `exec` semantics
+# by running the wrapper in a subshell — the docker shim prints what it was
+# called with and the captured stdout is the proof.
+_run_wrapper_capture_docker() {
+    local root="$1"; shift
+    # errexit is off for the run and unconditionally switched back on afterwards.
+    set +e
+    PATH="$root/bin:$PATH" \
+    HOME="$root" \
+    XDG_CONFIG_HOME="$root/.config" \
+    XDG_DATA_HOME="$root/.local/share" \
+    LUCEBOX_HOME="$root/.lucebox" \
+    NO_COLOR=1 \
+    _LUCEBOX_HOST_PROBED=1 \
+    LUCEBOX_HOST_HAS_DOCKER=1 \
+    LUCEBOX_HOST_HAS_CTK=runtime \
+    LUCEBOX_HOST_HAS_SYSTEMD=0 \
+    LUCEBOX_HOST_IS_WSL=0 \
+    LUCEBOX_HOST_DOCKER_VERSION="29.1.3" \
+    LUCEBOX_HOST_GPU_VENDOR=nvidia \
+    LUCEBOX_HOST_GPU_NAME="Fake GPU" \
+    LUCEBOX_HOST_GPU_COUNT=1 \
+    LUCEBOX_HOST_GPU_SM="89" \
+    LUCEBOX_HOST_VRAM_GB=24 \
+    LUCEBOX_HOST_DRIVER_MAJOR=550 \
+    LUCEBOX_HOST_DRIVER_VERSION="550.00" \
+    LUCEBOX_HOST_NPROC=8 \
+    LUCEBOX_HOST_RAM_GB=64 \
+        timeout 10 bash "$SCRIPT" "$@" 2>&1
+    set -e
+}
+
+test_routes_to_exec_when_running() {
+    local label="$1" sandbox captured
+    sandbox=$(mktemp -d -t lucebox-route.XXXXXX)
+    _make_docker_shim "$sandbox" 1
+    captured=$(_run_wrapper_capture_docker "$sandbox" config get model.preset || true)
+    rm -rf "$sandbox"
+    grep -q '^DOCKER_INVOKED exec' <<<"$captured" || {
+        report fail "$label" "expected 'docker exec' invocation; got: $(head -3 <<<"$captured")"
+        return
+    }
+    if grep -q '^DOCKER_INVOKED run' <<<"$captured"; then
+        report fail "$label" "got 'docker run' when container is up — should have exec'd"
+        return
+    fi
+    # Sanity: the captured output contains `lucebox config get model.preset`.
+    grep -qE 'lucebox config get model.preset' <<<"$captured" || {
+        report fail "$label" "exec argv missing tail 'lucebox config get model.preset'; got: $(head -3 <<<"$captured")"
+        return
+    }
+    # The exec path is expected to forward the LUCEBOX_* scalar env subset
+    # (shared with the docker-run path via _append_scalar_env). Pinning
+    # LUCEBOX_IMAGE= here catches a regression in that helper.
+    grep -q 'LUCEBOX_IMAGE=' <<<"$captured" || {
+        report fail "$label" "exec argv missing 'LUCEBOX_IMAGE=' scalar env; got: $(head -3 <<<"$captured")"
+        return
+    }
+    report ok "$label"
+}
+test_routes_to_exec_when_running "config get routes to docker exec when container running"
+
+test_routes_to_run_when_not_running() {
+    local label="$1" sandbox captured
+    sandbox=$(mktemp -d -t lucebox-route.XXXXXX)
+    _make_docker_shim "$sandbox" 0
+    captured=$(_run_wrapper_capture_docker "$sandbox" config get model.preset || true)
+    rm -rf "$sandbox"
+    grep -q '^DOCKER_INVOKED run' <<<"$captured" || {
+        report fail "$label" "expected 'docker run' invocation (container not running); got: $(head -3 <<<"$captured")"
+        return
+    }
+    if grep -q '^DOCKER_INVOKED exec' <<<"$captured"; then
+        report fail "$label" "got 'docker exec' but container is not running — should fall back to run"
+        return
+    fi
+    report ok "$label"
+}
+test_routes_to_run_when_not_running "config get falls back to docker run when container not running"
+
+test_no_exec_flag_forces_run() {
+    local label="$1" sandbox captured
+    sandbox=$(mktemp -d -t lucebox-route.XXXXXX)
+    _make_docker_shim "$sandbox" 1
+    # The shim reports a running container, yet --no-exec must still win.
+    captured=$(_run_wrapper_capture_docker "$sandbox" --no-exec config get model.preset || true)
+    rm -rf "$sandbox"
+    if grep -q '^DOCKER_INVOKED exec' <<<"$captured"; then
+        report fail "$label" "--no-exec failed to force run path; got exec"
+        return
+    fi
+    grep -q '^DOCKER_INVOKED run' <<<"$captured" || {
+        report fail "$label" "expected 'docker run' under --no-exec; got: $(head -3 <<<"$captured")"
+        return
+    }
+    report ok "$label"
+}
+test_no_exec_flag_forces_run "--no-exec flag forces docker run even when container is up"
+
+test_no_exec_env_forces_run() {
+    local label="$1" sandbox captured
+    sandbox=$(mktemp -d -t lucebox-route.XXXXXX)
+    _make_docker_shim "$sandbox" 1
+    # The shim reports a running container; here the override arrives through
+    # the environment (LUCEBOX_NO_EXEC=1) rather than as a command-line flag.
+    captured=$(LUCEBOX_NO_EXEC=1 _run_wrapper_capture_docker "$sandbox" config get model.preset || true)
+    rm -rf "$sandbox"
+    if grep -q '^DOCKER_INVOKED exec' <<<"$captured"; then
+        report fail "$label" "LUCEBOX_NO_EXEC=1 failed to force run path; got exec"
+        return
+    fi
+    grep -q '^DOCKER_INVOKED run' <<<"$captured" || {
+        report fail "$label" "expected 'docker run' under LUCEBOX_NO_EXEC=1; got: $(head -3 <<<"$captured")"
+        return
+    }
+    report ok "$label"
+}
+test_no_exec_env_forces_run "LUCEBOX_NO_EXEC=1 env override forces docker run"
+
+test_models_routes_to_exec() {
+    local label="$1" sandbox captured
+    sandbox=$(mktemp -d -t lucebox-route.XXXXXX)
+    _make_docker_shim "$sandbox" 1
+    captured=$(_run_wrapper_capture_docker "$sandbox" models list || true)
+    rm -rf "$sandbox"
+    grep -q '^DOCKER_INVOKED exec' <<<"$captured" || {
+        report fail "$label" "expected 'docker exec' for models when running; got: $(head -3 <<<"$captured")"
+        return
+    }
+    # The captured output must contain `lucebox models list`, i.e. the
+    # subcommand words reach the in-container CLI together and in order.
+    grep -qE 'lucebox models list' <<<"$captured" || {
+        report fail "$label" "exec'd argv missing 'lucebox models list' tail"
+        return
+    }
+    report ok "$label"
+}
+test_models_routes_to_exec "models list routes to docker exec when container running"
+
+# ── usage mentions exec-when-running (ERE `|`: `\|` in a BRE is not POSIX) ──
+test_usage_mentions_exec_routing() {
+    local label="$1" out
+    out=$(NO_COLOR=1 bash "$SCRIPT" --help 2>&1)
+    if ! grep -qiE 'docker exec|--no-exec' <<<"$out"; then
+        report fail "$label" "usage doesn't mention the exec routing / --no-exec flag"
+        return
+    fi
+    report ok "$label"
+}
+test_usage_mentions_exec_routing "usage documents docker exec routing + --no-exec flag"
+
+# ── TTY flag selection. Regression guard for the process-substitution bug:
+# _set_tty_flags must run in the CALLER's scope so `[ -t 1 ]` inspects the
+# real terminal. If it is ever moved back behind `< <(...)` or `$(...)`,
+# fd 1 becomes a pipe and it emits -i even on a real tty, silently dropping
+# docker's -t and breaking the interactive client TUIs (lucebox claude …).
+# The rest of this suite runs non-tty, so only this test exercises the -it
+# branch — via a real PTY allocated by python's pty.fork.
+test_tty_flags_selection() {
+    local label="$1" fn out
+    fn=$(awk '/^_set_tty_flags\(\) \{/,/^\}/' "$SCRIPT")
+
+    # (a) non-tty: stdin is /dev/null and stdout is the $(...) capture pipe → -i
+    out=$(bash -c "$fn"$'\n''f=(); _set_tty_flags f; printf "%s" "${f[*]}"' </dev/null 2>/dev/null)
+    if [ "$out" != "-i" ]; then
+        report fail "$label" "non-tty expected -i, got '$out'"
+        return
+    fi
+
+    # (b) real tty on fd0+fd1 (python pty.fork) → -it; the parent scrapes the TTYFLAG= marker
+    out=$(python3 - "$SCRIPT" <<'PY' 2>/dev/null
+import os, pty, re, sys
+src = open(sys.argv[1]).read()
+fn = re.search(r'^_set_tty_flags\(\) \{.*?^\}', src, re.S | re.M).group(0)
+script = fn + '\nf=(); _set_tty_flags f; printf "TTYFLAG=%s\\n" "${f[*]}"\n'
+pid, fd = pty.fork()
+if pid == 0:
+    os.execvp("bash", ["bash", "-c", script])
+buf = b""
+try:
+    while True:
+        chunk = os.read(fd, 1024)
+        if not chunk:
+            break
+        buf += chunk
+except OSError:
+    pass
+os.waitpid(pid, 0)
+m = re.search(rb"TTYFLAG=(\S+)", buf)
+sys.stdout.write(m.group(1).decode() if m else "NONE")
+PY
+)
+    if [ "$out" != "-it" ]; then
+        report fail "$label" "real tty expected -it, got '$out'"
+        return
+    fi
+    report ok "$label"
+}
+test_tty_flags_selection "_set_tty_flags: -it on a real tty, -i otherwise"
+
+echo
+# Final tally: a non-zero failure count goes to stderr and fails the runner.
+if ! [ "$fail" -eq 0 ]; then
+    echo "[test_lucebox_sh] $pass passed, $fail failed" >&2
+    exit 1
+fi
+echo "[test_lucebox_sh] $pass passed, 0 failed"
+exit 0
diff --git a/server/scripts/entrypoint.sh b/server/scripts/entrypoint.sh
index f35e295c4..fe37b3cbe 100755
--- a/server/scripts/entrypoint.sh
+++ b/server/scripts/entrypoint.sh
@@ -73,6 +73,15 @@ esac
 # write-failure (read-only FS, etc.) gets a warning and we continue.
 write_host_info() {
     local target="/opt/lucebox-hub/HOST_INFO"
+    # If the target dir doesn't exist (e.g. running the entrypoint outside
+    # the canonical container layout: unit tests, plain `docker run` without
+    # a bind mount), don't try to write — bash's own "No such file or
+    # directory" complaint on the `> "$tmp"` redirect below would leak to
+    # stderr regardless of `2>/dev/null` (that suppresses the command's
+    # stderr, not the redirect itself). HOST_INFO is informational.
+    if [ ! -d "$(dirname "$target")" ]; then
+        return 0
+    fi
     local tmp="${target}.tmp.$$"
     local collected_at
     collected_at=$(date -u +%FT%TZ 2>/dev/null || echo "")
@@ -158,6 +167,15 @@ _json_int_or_null() {
 # `nvidia-smi --query-gpu=index,uuid,pci.bus_id,name,compute_cap,memory.total,power.limit
 #               --format=csv,noheader` produced on the host) into a JSON
 # array. Empty CSV → "[]". Each row becomes one object.
+# Strip leading/trailing whitespace from a string. Pure bash (no sed fork):
+# each expansion isolates the whitespace-only prefix/suffix and removes it.
+_trim() {
+    local s="$1"
+    s="${s#"${s%%[![:space:]]*}"}"   # drop the leading [:space:] run
+    s="${s%"${s##*[![:space:]]}"}"   # drop the trailing [:space:] run
+    printf '%s' "$s"
+}
+
 _emit_gpu_array() {
     local csv="${LUCEBOX_HOST_GPU_LIST_CSV:-}"
     if [ -z "$csv" ]; then
@@ -173,13 +191,13 @@ _emit_gpu_array() {
         # split on `,` alone and trim whitespace per field so both forms parse.
         local idx uuid pci name cc mem plimit
         IFS=',' read -r idx uuid pci name cc mem plimit <<<"$line"
-        idx=$(printf '%s' "$idx" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
-        uuid=$(printf '%s' "$uuid" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
-        pci=$(printf '%s' "$pci" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
-        name=$(printf '%s' "$name" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
-        cc=$(printf '%s' "$cc" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
-        mem=$(printf '%s' "$mem" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
-        plimit=$(printf '%s' "$plimit" | sed 's/^[[:space:]]*//; s/[[:space:]]*$//')
+        idx=$(_trim "$idx")
+        uuid=$(_trim "$uuid")
+        pci=$(_trim "$pci")
+        name=$(_trim "$name")
+        cc=$(_trim "$cc")
+        mem=$(_trim "$mem")
+        plimit=$(_trim "$plimit")
         # Strip units. "24576 MiB" → 24576; "175.00 W" → 175 (truncate).
         local mem_mib vram_gb power_w
         mem_mib=$(printf '%s' "$mem" | awk '{print $1+0}')
diff --git a/uv.lock b/uv.lock
index fee8de0df..ba16922d8 100644
--- a/uv.lock
+++ b/uv.lock
@@ -9,6 +9,7 @@ resolution-markers = [
 
 [manifest]
 members = [
+    "lucebox",
     "lucebox-dflash",
     "lucebox-hub",
     "pflash",
@@ -429,6 +430,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/59/67/a6739ac96e28b7855808bdb0370e250606104a859750d209e5a0716fe7ab/librt-0.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:2f10cf143e4a9bb0f4f5af568a00df94a2d69ef41c2579584454bb0fe5cc642c", size = 103470, upload-time = "2026-05-10T18:16:10.369Z" },
 ]
 
+[[package]]
+name = "lucebox"
+source = { editable = "lucebox" }
+dependencies = [
+    { name = "httpx" },
+    { name = "huggingface-hub" },
+    { name = "rich" },
+    { name = "tomli-w" },
+    { name = "typer" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "httpx", specifier = ">=0.27" },
+    { name = "huggingface-hub", specifier = ">=0.27" },
+    { name = "rich", specifier = ">=13" },
+    { name = "tomli-w", specifier = ">=1.0" },
+    { name = "typer", specifier = ">=0.12" },
+]
+
 [[package]]
 name = "lucebox-dflash"
 version = "0.1.0"
@@ -466,6 +487,7 @@ name = "lucebox-hub"
 version = "0.0.0"
 source = { virtual = "." }
 dependencies = [
+    { name = "lucebox" },
     { name = "lucebox-dflash" },
     { name = "pflash" },
 ]
@@ -482,6 +504,7 @@ megakernel = [
 
 [package.metadata]
 requires-dist = [
+    { name = "lucebox", editable = "lucebox" },
     { name = "lucebox-dflash", virtual = "server" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.10,<2" },
     { name = "pflash", editable = "optimizations/pflash" },
@@ -1124,6 +1147,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
 ]
 
+[[package]]
+name = "tomli-w"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/19/75/241269d1da26b624c0d5e110e8149093c759b7a286138f4efd61a60e75fe/tomli_w-1.2.0.tar.gz", hash = "sha256:2dd14fac5a47c27be9cd4c976af5a12d87fb1f0b4512f81d69cce3b35ae25021", size = 7184, upload-time = "2025-01-15T12:07:24.262Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/18/c86eb8e0202e32dd3df50d43d7ff9854f8e0603945ff398974c1d91ac1ef/tomli_w-1.2.0-py3-none-any.whl", hash = "sha256:188306098d013b691fcadc011abd66727d3c414c571bb01b1a174ba8c983cf90", size = 6675, upload-time = "2025-01-15T12:07:22.074Z" },
+]
+
 [[package]]
 name = "torch"
 version = "2.11.0+cu128"